diff --git "a/sft/moce/checkpoint-16632/trainer_state.json" "b/sft/moce/checkpoint-16632/trainer_state.json" deleted file mode 100644--- "a/sft/moce/checkpoint-16632/trainer_state.json" +++ /dev/null @@ -1,249513 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.999969938373666, - "eval_steps": 500, - "global_step": 16632, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "auxiliary_loss_clip": 0.05028445, - "auxiliary_loss_mlp": 0.02215396, - "balance_loss_clip": 2.43573999, - "balance_loss_mlp": 1.76983953, - "epoch": 6.012325266796934e-05, - "flos": 24456507091200.0, - "grad_norm": 55.00561300220404, - "language_loss": 2.85272503, - "learning_rate": 0.0, - "loss": 1.94613922, - "num_input_tokens_seen": 19155, - "step": 1, - "time_per_iteration": 18.059409618377686 - }, - { - "auxiliary_loss_clip": 0.03380539, - "auxiliary_loss_mlp": 0.01459449, - "balance_loss_clip": 1.62786555, - "balance_loss_mlp": 1.18936849, - "epoch": 0.00012024650533593868, - "flos": 20225931246720.0, - "grad_norm": 34.93149751452764, - "language_loss": 1.82606053, - "learning_rate": 4.4628432569317594e-07, - "loss": 1.87446034, - "num_input_tokens_seen": 36175, - "step": 2, - "time_per_iteration": 2.6318798065185547 - }, - { - "auxiliary_loss_clip": 0.03320229, - "auxiliary_loss_mlp": 0.01440978, - "balance_loss_clip": 1.62577581, - "balance_loss_mlp": 1.18882656, - "epoch": 0.000180369758003908, - "flos": 22309935454080.0, - "grad_norm": 32.71870482280511, - "language_loss": 1.57573509, - "learning_rate": 7.073439208833112e-07, - "loss": 1.62334716, - "num_input_tokens_seen": 54870, - "step": 3, - "time_per_iteration": 2.6362481117248535 - }, - { - "auxiliary_loss_clip": 0.03361497, - "auxiliary_loss_mlp": 0.01451404, - "balance_loss_clip": 1.62418985, - "balance_loss_mlp": 1.15500188, - "epoch": 0.00024049301067187735, - "flos": 22414650577920.0, - "grad_norm": 51.2387172839747, - "language_loss": 1.67362881, - "learning_rate": 8.925686513863519e-07, - "loss": 1.72175777, - "num_input_tokens_seen": 74575, - "step": 4, - "time_per_iteration": 2.7070822715759277 - }, - { - "auxiliary_loss_clip": 0.03402497, - "auxiliary_loss_mlp": 0.01505358, - "balance_loss_clip": 1.62493396, - "balance_loss_mlp": 1.21715808, - "epoch": 0.0003006162633398467, - "flos": 21396978449280.0, - "grad_norm": 56.088721215944275, - "language_loss": 1.91627169, - "learning_rate": 1.0362401141348472e-06, - "loss": 1.96535027, - "num_input_tokens_seen": 92580, - "step": 5, - "time_per_iteration": 2.91436767578125 - }, - { - "auxiliary_loss_clip": 0.03370454, - "auxiliary_loss_mlp": 0.01515599, - "balance_loss_clip": 1.61556244, - "balance_loss_mlp": 1.22110426, - "epoch": 0.000360739516007816, - "flos": 21652375127040.0, - "grad_norm": 33.397169652317885, - "language_loss": 1.60591149, - "learning_rate": 1.153628246576487e-06, - "loss": 1.65477204, - "num_input_tokens_seen": 109705, - "step": 6, - "time_per_iteration": 2.994969367980957 - }, - { - "auxiliary_loss_clip": 0.03354239, - "auxiliary_loss_mlp": 0.01486417, - "balance_loss_clip": 1.61577415, - "balance_loss_mlp": 1.20336628, - "epoch": 0.0004208627686757854, - "flos": 27159742897920.0, - "grad_norm": 24.6270766983672, - "language_loss": 1.53276002, - "learning_rate": 1.2528784983718962e-06, - "loss": 1.58116663, - "num_input_tokens_seen": 129425, - "step": 7, - "time_per_iteration": 3.0675876140594482 - }, - { - "auxiliary_loss_clip": 0.03321216, - "auxiliary_loss_mlp": 0.0144328, - "balance_loss_clip": 1.61205018, - "balance_loss_mlp": 1.16499734, - "epoch": 0.0004809860213437547, - "flos": 31319096135040.0, - "grad_norm": 31.71613063643349, - "language_loss": 1.43881059, - "learning_rate": 1.338852977079528e-06, - "loss": 1.48645568, - "num_input_tokens_seen": 149210, - "step": 8, - "time_per_iteration": 3.172358751296997 - }, - { - "auxiliary_loss_clip": 0.03368839, - "auxiliary_loss_mlp": 0.01496105, - "balance_loss_clip": 1.6120348, - "balance_loss_mlp": 1.21229148, - "epoch": 0.000541109274011724, - "flos": 32160411463680.0, - "grad_norm": 28.204490849684397, - "language_loss": 1.4969244, - "learning_rate": 1.4146878417666224e-06, - "loss": 1.54557395, - "num_input_tokens_seen": 169055, - "step": 9, - "time_per_iteration": 3.112215280532837 - }, - { - "auxiliary_loss_clip": 0.03308365, - "auxiliary_loss_mlp": 0.01475035, - "balance_loss_clip": 1.61541438, - "balance_loss_mlp": 1.20647991, - "epoch": 0.0006012325266796934, - "flos": 18916808163840.0, - "grad_norm": 23.420774723604698, - "language_loss": 1.44714785, - "learning_rate": 1.4825244398280232e-06, - "loss": 1.49498188, - "num_input_tokens_seen": 188045, - "step": 10, - "time_per_iteration": 2.9495606422424316 - }, - { - "auxiliary_loss_clip": 0.03364194, - "auxiliary_loss_mlp": 0.01494262, - "balance_loss_clip": 1.62042511, - "balance_loss_mlp": 1.22036684, - "epoch": 0.0006613557793476627, - "flos": 20774861867520.0, - "grad_norm": 18.353281468858004, - "language_loss": 1.4520936, - "learning_rate": 1.5438901072051983e-06, - "loss": 1.50067806, - "num_input_tokens_seen": 207035, - "step": 11, - "time_per_iteration": 3.0797431468963623 - }, - { - "auxiliary_loss_clip": 0.03292683, - "auxiliary_loss_mlp": 0.0145154, - "balance_loss_clip": 1.60771322, - "balance_loss_mlp": 1.17554641, - "epoch": 0.000721479032015632, - "flos": 16581680997120.0, - "grad_norm": 16.61869254675767, - "language_loss": 1.45121813, - "learning_rate": 1.5999125722696629e-06, - "loss": 1.49866033, - "num_input_tokens_seen": 223225, - "step": 12, - "time_per_iteration": 2.9887659549713135 - }, - { - "auxiliary_loss_clip": 0.03321669, - "auxiliary_loss_mlp": 0.01405912, - "balance_loss_clip": 1.61740756, - "balance_loss_mlp": 1.14765704, - "epoch": 0.0007816022846836014, - "flos": 23805471144960.0, - "grad_norm": 14.02187318243825, - "language_loss": 1.23759985, - "learning_rate": 1.6514482443788434e-06, - "loss": 1.28487587, - "num_input_tokens_seen": 242570, - "step": 13, - "time_per_iteration": 3.032742977142334 - }, - { - "auxiliary_loss_clip": 0.03287474, - "auxiliary_loss_mlp": 0.01470749, - "balance_loss_clip": 1.61299658, - "balance_loss_mlp": 1.20257616, - "epoch": 0.0008417255373515708, - "flos": 19172204841600.0, - "grad_norm": 5.790568956401358, - "language_loss": 1.20684385, - "learning_rate": 1.6991628240650723e-06, - "loss": 1.254426, - "num_input_tokens_seen": 261215, - "step": 14, - "time_per_iteration": 3.002887487411499 - }, - { - "auxiliary_loss_clip": 0.03272826, - "auxiliary_loss_mlp": 0.01431255, - "balance_loss_clip": 1.6181426, - "balance_loss_mlp": 1.16804111, - "epoch": 0.00090184879001954, - "flos": 26395564026240.0, - "grad_norm": 6.353887091300461, - "language_loss": 1.12925518, - "learning_rate": 1.7435840350181584e-06, - "loss": 1.176296, - "num_input_tokens_seen": 280035, - "step": 15, - "time_per_iteration": 3.0238780975341797 - }, - { - "auxiliary_loss_clip": 0.03238489, - "auxiliary_loss_mlp": 0.01411651, - "balance_loss_clip": 1.60288334, - "balance_loss_mlp": 1.16197944, - "epoch": 0.0009619720426875094, - "flos": 24679500785280.0, - "grad_norm": 4.670310144758637, - "language_loss": 1.11125767, - "learning_rate": 1.7851373027727038e-06, - "loss": 1.15775907, - "num_input_tokens_seen": 300265, - "step": 16, - "time_per_iteration": 4.605847120285034 - }, - { - "auxiliary_loss_clip": 0.03223993, - "auxiliary_loss_mlp": 0.01417304, - "balance_loss_clip": 1.60910368, - "balance_loss_mlp": 1.17774093, - "epoch": 0.0010220952953554788, - "flos": 18624531196800.0, - "grad_norm": 8.838429022323517, - "language_loss": 1.12645221, - "learning_rate": 1.8241705979033208e-06, - "loss": 1.17286515, - "num_input_tokens_seen": 317375, - "step": 17, - "time_per_iteration": 4.579033851623535 - }, - { - "auxiliary_loss_clip": 0.03161492, - "auxiliary_loss_mlp": 0.01379312, - "balance_loss_clip": 1.60685277, - "balance_loss_mlp": 1.1475693, - "epoch": 0.001082218548023448, - "flos": 26142537646080.0, - "grad_norm": 3.823557061532633, - "language_loss": 1.08069181, - "learning_rate": 1.860972167459798e-06, - "loss": 1.12609982, - "num_input_tokens_seen": 337975, - "step": 18, - "time_per_iteration": 3.0132579803466797 - }, - { - "auxiliary_loss_clip": 0.0318761, - "auxiliary_loss_mlp": 0.01403306, - "balance_loss_clip": 1.60585093, - "balance_loss_mlp": 1.13799417, - "epoch": 0.0011423418006914173, - "flos": 19609776322560.0, - "grad_norm": 4.403621106373983, - "language_loss": 1.02445412, - "learning_rate": 1.89578346593066e-06, - "loss": 1.07036328, - "num_input_tokens_seen": 356635, - "step": 19, - "time_per_iteration": 3.016176462173462 - }, - { - "auxiliary_loss_clip": 0.0313029, - "auxiliary_loss_mlp": 0.01342049, - "balance_loss_clip": 1.60759044, - "balance_loss_mlp": 1.12155962, - "epoch": 0.0012024650533593868, - "flos": 17895365107200.0, - "grad_norm": 3.958333686933058, - "language_loss": 1.16706228, - "learning_rate": 1.928808765521199e-06, - "loss": 1.21178555, - "num_input_tokens_seen": 375625, - "step": 20, - "time_per_iteration": 3.0274486541748047 - }, - { - "auxiliary_loss_clip": 0.03118109, - "auxiliary_loss_mlp": 0.01378536, - "balance_loss_clip": 1.58886433, - "balance_loss_mlp": 1.1298182, - "epoch": 0.001262588306027356, - "flos": 21252043071360.0, - "grad_norm": 4.333519066420982, - "language_loss": 1.06129968, - "learning_rate": 1.9602224192552076e-06, - "loss": 1.10626626, - "num_input_tokens_seen": 394350, - "step": 21, - "time_per_iteration": 2.9418578147888184 - }, - { - "auxiliary_loss_clip": 0.03013912, - "auxiliary_loss_mlp": 0.0137937, - "balance_loss_clip": 1.57028937, - "balance_loss_mlp": 1.14552903, - "epoch": 0.0013227115586953253, - "flos": 26104077158400.0, - "grad_norm": 3.63841390311849, - "language_loss": 1.05861485, - "learning_rate": 1.9901744328983746e-06, - "loss": 1.10254765, - "num_input_tokens_seen": 413255, - "step": 22, - "time_per_iteration": 2.9651288986206055 - }, - { - "auxiliary_loss_clip": 0.02966296, - "auxiliary_loss_mlp": 0.01334065, - "balance_loss_clip": 1.57175612, - "balance_loss_mlp": 1.12377954, - "epoch": 0.0013828348113632948, - "flos": 23951376190080.0, - "grad_norm": 2.8746130742538347, - "language_loss": 0.9177655, - "learning_rate": 2.018794797290208e-06, - "loss": 0.96076906, - "num_input_tokens_seen": 433065, - "step": 23, - "time_per_iteration": 3.049853563308716 - }, - { - "auxiliary_loss_clip": 0.02932793, - "auxiliary_loss_mlp": 0.01362183, - "balance_loss_clip": 1.56404662, - "balance_loss_mlp": 1.14236116, - "epoch": 0.001442958064031264, - "flos": 15959851724160.0, - "grad_norm": 3.0897201135857735, - "language_loss": 1.08192635, - "learning_rate": 2.046196897962839e-06, - "loss": 1.12487614, - "num_input_tokens_seen": 451175, - "step": 24, - "time_per_iteration": 3.0543172359466553 - }, - { - "auxiliary_loss_clip": 0.02823838, - "auxiliary_loss_mlp": 0.01329007, - "balance_loss_clip": 1.55692792, - "balance_loss_mlp": 1.11853111, - "epoch": 0.0015030813166992333, - "flos": 18108350801280.0, - "grad_norm": 4.111246686692462, - "language_loss": 1.01367807, - "learning_rate": 2.0724802282696944e-06, - "loss": 1.05520654, - "num_input_tokens_seen": 468775, - "step": 25, - "time_per_iteration": 3.0059614181518555 - }, - { - "auxiliary_loss_clip": 0.02818207, - "auxiliary_loss_mlp": 0.01309454, - "balance_loss_clip": 1.55974329, - "balance_loss_mlp": 1.10012197, - "epoch": 0.0015632045693672028, - "flos": 22234558763520.0, - "grad_norm": 2.7163042439620018, - "language_loss": 1.0669204, - "learning_rate": 2.0977325700720194e-06, - "loss": 1.10819697, - "num_input_tokens_seen": 488530, - "step": 26, - "time_per_iteration": 3.1159534454345703 - }, - { - "auxiliary_loss_clip": 0.0276047, - "auxiliary_loss_mlp": 0.01325034, - "balance_loss_clip": 1.54973662, - "balance_loss_mlp": 1.12533486, - "epoch": 0.001623327822035172, - "flos": 23991955580160.0, - "grad_norm": 2.562596284241794, - "language_loss": 0.95537072, - "learning_rate": 2.122031762649933e-06, - "loss": 0.99622583, - "num_input_tokens_seen": 510495, - "step": 27, - "time_per_iteration": 3.018643617630005 - }, - { - "auxiliary_loss_clip": 0.02736222, - "auxiliary_loss_mlp": 0.01311707, - "balance_loss_clip": 1.55399776, - "balance_loss_mlp": 1.13089037, - "epoch": 0.0016834510747031415, - "flos": 19677647070720.0, - "grad_norm": 2.42975125432869, - "language_loss": 1.06393945, - "learning_rate": 2.1454471497582483e-06, - "loss": 1.10441875, - "num_input_tokens_seen": 528605, - "step": 28, - "time_per_iteration": 2.9263083934783936 - }, - { - "auxiliary_loss_clip": 0.0270011, - "auxiliary_loss_mlp": 0.0131913, - "balance_loss_clip": 1.53841436, - "balance_loss_mlp": 1.13297284, - "epoch": 0.0017435743273711108, - "flos": 20923819568640.0, - "grad_norm": 4.42090805909513, - "language_loss": 1.02493238, - "learning_rate": 2.1680407726407727e-06, - "loss": 1.06512475, - "num_input_tokens_seen": 548515, - "step": 29, - "time_per_iteration": 3.0062997341156006 - }, - { - "auxiliary_loss_clip": 0.0269246, - "auxiliary_loss_mlp": 0.01312758, - "balance_loss_clip": 1.53459728, - "balance_loss_mlp": 1.12631428, - "epoch": 0.00180369758003908, - "flos": 19528976678400.0, - "grad_norm": 3.1534114534186446, - "language_loss": 1.19265521, - "learning_rate": 2.189868360711334e-06, - "loss": 1.23270726, - "num_input_tokens_seen": 564025, - "step": 30, - "time_per_iteration": 2.931145429611206 - }, - { - "auxiliary_loss_clip": 0.02610377, - "auxiliary_loss_mlp": 0.01337183, - "balance_loss_clip": 1.52116311, - "balance_loss_mlp": 1.15665221, - "epoch": 0.0018638208327070496, - "flos": 27453169100160.0, - "grad_norm": 2.735994596991484, - "language_loss": 1.02616811, - "learning_rate": 2.2109801597326265e-06, - "loss": 1.06564379, - "num_input_tokens_seen": 583345, - "step": 31, - "time_per_iteration": 2.993251085281372 - }, - { - "auxiliary_loss_clip": 0.02582044, - "auxiliary_loss_mlp": 0.01331305, - "balance_loss_clip": 1.522609, - "balance_loss_mlp": 1.15163302, - "epoch": 0.0019239440853750188, - "flos": 13589460380160.0, - "grad_norm": 3.9056907796043654, - "language_loss": 0.95266509, - "learning_rate": 2.2314216284658796e-06, - "loss": 0.99179864, - "num_input_tokens_seen": 600010, - "step": 32, - "time_per_iteration": 2.9459571838378906 - }, - { - "auxiliary_loss_clip": 0.02564836, - "auxiliary_loss_mlp": 0.01302659, - "balance_loss_clip": 1.51811624, - "balance_loss_mlp": 1.13586164, - "epoch": 0.001984067338042988, - "flos": 11253866336640.0, - "grad_norm": 3.226486022987097, - "language_loss": 0.95143497, - "learning_rate": 2.2512340280885094e-06, - "loss": 0.99010992, - "num_input_tokens_seen": 616295, - "step": 33, - "time_per_iteration": 2.9855570793151855 - }, - { - "auxiliary_loss_clip": 0.02421202, - "auxiliary_loss_mlp": 0.01304214, - "balance_loss_clip": 1.48474145, - "balance_loss_mlp": 1.14676213, - "epoch": 0.0020441905907109576, - "flos": 22386245898240.0, - "grad_norm": 2.1714659525821247, - "language_loss": 0.91547924, - "learning_rate": 2.270454923596497e-06, - "loss": 0.9527334, - "num_input_tokens_seen": 637640, - "step": 34, - "time_per_iteration": 2.981541872024536 - }, - { - "auxiliary_loss_clip": 0.02375249, - "auxiliary_loss_mlp": 0.01271963, - "balance_loss_clip": 1.45095515, - "balance_loss_mlp": 1.11689591, - "epoch": 0.0021043138433789266, - "flos": 49778580337920.0, - "grad_norm": 2.2635429103650386, - "language_loss": 0.76603377, - "learning_rate": 2.2891186125067434e-06, - "loss": 0.80250585, - "num_input_tokens_seen": 659710, - "step": 35, - "time_per_iteration": 3.2267208099365234 - }, - { - "auxiliary_loss_clip": 0.02347187, - "auxiliary_loss_mlp": 0.01276388, - "balance_loss_clip": 1.46356034, - "balance_loss_mlp": 1.13238275, - "epoch": 0.002164437096046896, - "flos": 20557961591040.0, - "grad_norm": 2.3605884715298506, - "language_loss": 0.88713098, - "learning_rate": 2.307256493152974e-06, - "loss": 0.92336679, - "num_input_tokens_seen": 679670, - "step": 36, - "time_per_iteration": 2.948162078857422 - }, - { - "auxiliary_loss_clip": 0.02289192, - "auxiliary_loss_mlp": 0.01338204, - "balance_loss_clip": 1.45043015, - "balance_loss_mlp": 1.19105196, - "epoch": 0.0022245603487148656, - "flos": 26542295084160.0, - "grad_norm": 2.4929063351918166, - "language_loss": 0.93038809, - "learning_rate": 2.3248973825097614e-06, - "loss": 0.96666199, - "num_input_tokens_seen": 700170, - "step": 37, - "time_per_iteration": 2.9556422233581543 - }, - { - "auxiliary_loss_clip": 0.02249098, - "auxiliary_loss_mlp": 0.01276785, - "balance_loss_clip": 1.44485605, - "balance_loss_mlp": 1.15500069, - "epoch": 0.0022846836013828346, - "flos": 20338188226560.0, - "grad_norm": 2.177909778954084, - "language_loss": 1.03952074, - "learning_rate": 2.3420677916238357e-06, - "loss": 1.07477951, - "num_input_tokens_seen": 718545, - "step": 38, - "time_per_iteration": 2.9959065914154053 - }, - { - "auxiliary_loss_clip": 0.02216028, - "auxiliary_loss_mlp": 0.01260768, - "balance_loss_clip": 1.43807542, - "balance_loss_mlp": 1.13726676, - "epoch": 0.002344806854050804, - "flos": 26247575992320.0, - "grad_norm": 2.22652515093943, - "language_loss": 0.85297108, - "learning_rate": 2.358792165262154e-06, - "loss": 0.887739, - "num_input_tokens_seen": 739865, - "step": 39, - "time_per_iteration": 3.035399913787842 - }, - { - "auxiliary_loss_clip": 0.02192275, - "auxiliary_loss_mlp": 0.01250434, - "balance_loss_clip": 1.4289664, - "balance_loss_mlp": 1.12216496, - "epoch": 0.0024049301067187736, - "flos": 11801539981440.0, - "grad_norm": 3.258228308703562, - "language_loss": 0.90279335, - "learning_rate": 2.3750930912143747e-06, - "loss": 0.93722045, - "num_input_tokens_seen": 755770, - "step": 40, - "time_per_iteration": 3.060368299484253 - }, - { - "auxiliary_loss_clip": 0.02142113, - "auxiliary_loss_mlp": 0.01273783, - "balance_loss_clip": 1.41895449, - "balance_loss_mlp": 1.16086745, - "epoch": 0.0024650533593867426, - "flos": 20631506688000.0, - "grad_norm": 3.245861029799582, - "language_loss": 0.93271625, - "learning_rate": 2.3909914837471044e-06, - "loss": 0.9668752, - "num_input_tokens_seen": 773440, - "step": 41, - "time_per_iteration": 2.9518353939056396 - }, - { - "auxiliary_loss_clip": 0.02105753, - "auxiliary_loss_mlp": 0.01254821, - "balance_loss_clip": 1.41097844, - "balance_loss_mlp": 1.15168142, - "epoch": 0.002525176612054712, - "flos": 18406122549120.0, - "grad_norm": 3.3039479788253536, - "language_loss": 0.97533798, - "learning_rate": 2.4065067449483835e-06, - "loss": 1.0089438, - "num_input_tokens_seen": 790455, - "step": 42, - "time_per_iteration": 2.933177947998047 - }, - { - "auxiliary_loss_clip": 0.020675, - "auxiliary_loss_mlp": 0.01298422, - "balance_loss_clip": 1.41198874, - "balance_loss_mlp": 1.19189644, - "epoch": 0.0025852998647226816, - "flos": 28184023128960.0, - "grad_norm": 3.15071165872949, - "language_loss": 0.97562659, - "learning_rate": 2.4216569070848724e-06, - "loss": 1.00928593, - "num_input_tokens_seen": 810645, - "step": 43, - "time_per_iteration": 2.9760589599609375 - }, - { - "auxiliary_loss_clip": 0.02086351, - "auxiliary_loss_mlp": 0.01314601, - "balance_loss_clip": 1.41042757, - "balance_loss_mlp": 1.20283043, - "epoch": 0.0026454231173906506, - "flos": 14283110897280.0, - "grad_norm": 2.3612650137146574, - "language_loss": 0.93435001, - "learning_rate": 2.4364587585915504e-06, - "loss": 0.96835947, - "num_input_tokens_seen": 827470, - "step": 44, - "time_per_iteration": 2.9239895343780518 - }, - { - "auxiliary_loss_clip": 0.02043996, - "auxiliary_loss_mlp": 0.01272131, - "balance_loss_clip": 1.40557313, - "balance_loss_mlp": 1.17399764, - "epoch": 0.00270554637005862, - "flos": 22419211605120.0, - "grad_norm": 2.1476860292916644, - "language_loss": 0.98677421, - "learning_rate": 2.450927955901469e-06, - "loss": 1.01993537, - "num_input_tokens_seen": 847285, - "step": 45, - "time_per_iteration": 2.9626305103302 - }, - { - "auxiliary_loss_clip": 0.02018804, - "auxiliary_loss_mlp": 0.01228873, - "balance_loss_clip": 1.39126372, - "balance_loss_mlp": 1.14208817, - "epoch": 0.0027656696227265896, - "flos": 23985778440960.0, - "grad_norm": 1.8862192248435494, - "language_loss": 1.02800822, - "learning_rate": 2.465079122983384e-06, - "loss": 1.06048501, - "num_input_tokens_seen": 867545, - "step": 46, - "time_per_iteration": 2.9913573265075684 - }, - { - "auxiliary_loss_clip": 0.0198766, - "auxiliary_loss_mlp": 0.01272862, - "balance_loss_clip": 1.38388658, - "balance_loss_mlp": 1.182549, - "epoch": 0.0028257928753945586, - "flos": 37669503087360.0, - "grad_norm": 2.1076645953887696, - "language_loss": 0.87839413, - "learning_rate": 2.4789259401737868e-06, - "loss": 0.9109993, - "num_input_tokens_seen": 889915, - "step": 47, - "time_per_iteration": 3.0189881324768066 - }, - { - "auxiliary_loss_clip": 0.01949271, - "auxiliary_loss_mlp": 0.01255947, - "balance_loss_clip": 1.37360096, - "balance_loss_mlp": 1.16963911, - "epoch": 0.002885916128062528, - "flos": 22454547609600.0, - "grad_norm": 4.4561049138068, - "language_loss": 0.87809587, - "learning_rate": 2.492481223656015e-06, - "loss": 0.91014802, - "num_input_tokens_seen": 908975, - "step": 48, - "time_per_iteration": 2.863565444946289 - }, - { - "auxiliary_loss_clip": 0.01949016, - "auxiliary_loss_mlp": 0.0124182, - "balance_loss_clip": 1.36337733, - "balance_loss_mlp": 1.15069616, - "epoch": 0.0029460393807304976, - "flos": 27012796358400.0, - "grad_norm": 2.9451035624229855, - "language_loss": 0.89691317, - "learning_rate": 2.5057569967437924e-06, - "loss": 0.9288215, - "num_input_tokens_seen": 929810, - "step": 49, - "time_per_iteration": 2.9967453479766846 - }, - { - "auxiliary_loss_clip": 0.0194038, - "auxiliary_loss_mlp": 0.01234077, - "balance_loss_clip": 1.35742152, - "balance_loss_mlp": 1.14996314, - "epoch": 0.0030061626333984666, - "flos": 15851832549120.0, - "grad_norm": 3.162716210197168, - "language_loss": 0.90914285, - "learning_rate": 2.51876455396287e-06, - "loss": 0.94088745, - "num_input_tokens_seen": 948650, - "step": 50, - "time_per_iteration": 2.8832523822784424 - }, - { - "auxiliary_loss_clip": 0.01938537, - "auxiliary_loss_mlp": 0.01199505, - "balance_loss_clip": 1.36240602, - "balance_loss_mlp": 1.11844242, - "epoch": 0.003066285886066436, - "flos": 31827052316160.0, - "grad_norm": 6.098010360158733, - "language_loss": 0.86977792, - "learning_rate": 2.5315145187866316e-06, - "loss": 0.90115827, - "num_input_tokens_seen": 966455, - "step": 51, - "time_per_iteration": 2.9061717987060547 - }, - { - "auxiliary_loss_clip": 0.01895637, - "auxiliary_loss_mlp": 0.01206588, - "balance_loss_clip": 1.35252357, - "balance_loss_mlp": 1.12829173, - "epoch": 0.0031264091387344056, - "flos": 41427482774400.0, - "grad_norm": 2.043292881862276, - "language_loss": 0.95171362, - "learning_rate": 2.5440168957651953e-06, - "loss": 0.98273587, - "num_input_tokens_seen": 988110, - "step": 52, - "time_per_iteration": 3.0266616344451904 - }, - { - "auxiliary_loss_clip": 0.01893195, - "auxiliary_loss_mlp": 0.01241159, - "balance_loss_clip": 1.34894896, - "balance_loss_mlp": 1.16162264, - "epoch": 0.0031865323914023747, - "flos": 23440941970560.0, - "grad_norm": 4.2358840345824635, - "language_loss": 0.92323011, - "learning_rate": 2.5562811176888872e-06, - "loss": 0.95457363, - "num_input_tokens_seen": 1008550, - "step": 53, - "time_per_iteration": 2.8850226402282715 - }, - { - "auxiliary_loss_clip": 0.01882736, - "auxiliary_loss_mlp": 0.01197045, - "balance_loss_clip": 1.35264134, - "balance_loss_mlp": 1.11669779, - "epoch": 0.003246655644070344, - "flos": 14429195510400.0, - "grad_norm": 2.290226623360683, - "language_loss": 0.8260113, - "learning_rate": 2.5683160883431093e-06, - "loss": 0.85680908, - "num_input_tokens_seen": 1026840, - "step": 54, - "time_per_iteration": 2.9433553218841553 - }, - { - "auxiliary_loss_clip": 0.01880073, - "auxiliary_loss_mlp": 0.01210775, - "balance_loss_clip": 1.34162152, - "balance_loss_mlp": 1.13233542, - "epoch": 0.0033067788967383136, - "flos": 35918247496320.0, - "grad_norm": 2.911577423572303, - "language_loss": 0.81303245, - "learning_rate": 2.580130221340046e-06, - "loss": 0.84394085, - "num_input_tokens_seen": 1048875, - "step": 55, - "time_per_iteration": 3.0040643215179443 - }, - { - "auxiliary_loss_clip": 0.01870075, - "auxiliary_loss_mlp": 0.0120375, - "balance_loss_clip": 1.33644819, - "balance_loss_mlp": 1.12521541, - "epoch": 0.003366902149406283, - "flos": 22958732862720.0, - "grad_norm": 2.639118679342801, - "language_loss": 0.87089968, - "learning_rate": 2.5917314754514246e-06, - "loss": 0.90163803, - "num_input_tokens_seen": 1066435, - "step": 56, - "time_per_iteration": 2.830453395843506 - }, - { - "auxiliary_loss_clip": 0.01869912, - "auxiliary_loss_mlp": 0.01161425, - "balance_loss_clip": 1.32921791, - "balance_loss_mlp": 1.08851671, - "epoch": 0.003427025402074252, - "flos": 26582838560640.0, - "grad_norm": 2.101574700040827, - "language_loss": 0.92890096, - "learning_rate": 2.6031273868139713e-06, - "loss": 0.95921433, - "num_input_tokens_seen": 1090330, - "step": 57, - "time_per_iteration": 7.0071024894714355 - }, - { - "auxiliary_loss_clip": 0.01833802, - "auxiliary_loss_mlp": 0.0121675, - "balance_loss_clip": 1.33333457, - "balance_loss_mlp": 1.14493799, - "epoch": 0.0034871486547422216, - "flos": 23951196622080.0, - "grad_norm": 14.610065921505914, - "language_loss": 0.9972856, - "learning_rate": 2.614325098333948e-06, - "loss": 1.02779114, - "num_input_tokens_seen": 1109840, - "step": 58, - "time_per_iteration": 2.830960273742676 - }, - { - "auxiliary_loss_clip": 0.0181804, - "auxiliary_loss_mlp": 0.01199311, - "balance_loss_clip": 1.32073379, - "balance_loss_mlp": 1.12835753, - "epoch": 0.003547271907410191, - "flos": 21214983214080.0, - "grad_norm": 2.120622270947527, - "language_loss": 0.88172519, - "learning_rate": 2.625331386578098e-06, - "loss": 0.91189873, - "num_input_tokens_seen": 1128415, - "step": 59, - "time_per_iteration": 2.8507089614868164 - }, - { - "auxiliary_loss_clip": 0.01839573, - "auxiliary_loss_mlp": 0.01163328, - "balance_loss_clip": 1.32924581, - "balance_loss_mlp": 1.09075332, - "epoch": 0.00360739516007816, - "flos": 16504903676160.0, - "grad_norm": 2.021991994360373, - "language_loss": 0.93542433, - "learning_rate": 2.63615268640451e-06, - "loss": 0.96545339, - "num_input_tokens_seen": 1146515, - "step": 60, - "time_per_iteration": 2.8517534732818604 - }, - { - "auxiliary_loss_clip": 0.0181893, - "auxiliary_loss_mlp": 0.01176948, - "balance_loss_clip": 1.31414318, - "balance_loss_mlp": 1.10923755, - "epoch": 0.0036675184127461296, - "flos": 19464805031040.0, - "grad_norm": 2.908283338489548, - "language_loss": 0.90021706, - "learning_rate": 2.6467951135575943e-06, - "loss": 0.9301759, - "num_input_tokens_seen": 1166330, - "step": 61, - "time_per_iteration": 2.8853390216827393 - }, - { - "auxiliary_loss_clip": 0.01803943, - "auxiliary_loss_mlp": 0.01142904, - "balance_loss_clip": 1.31131864, - "balance_loss_mlp": 1.07581341, - "epoch": 0.003727641665414099, - "flos": 20957323979520.0, - "grad_norm": 1.8428161811646855, - "language_loss": 0.88479733, - "learning_rate": 2.657264485425803e-06, - "loss": 0.91426575, - "num_input_tokens_seen": 1186010, - "step": 62, - "time_per_iteration": 2.8860812187194824 - }, - { - "auxiliary_loss_clip": 0.01785338, - "auxiliary_loss_mlp": 0.0116457, - "balance_loss_clip": 1.30233741, - "balance_loss_mlp": 1.09504724, - "epoch": 0.003787764918082068, - "flos": 18406050721920.0, - "grad_norm": 2.4385306002926512, - "language_loss": 0.96280968, - "learning_rate": 2.6675663401385186e-06, - "loss": 0.99230874, - "num_input_tokens_seen": 1204985, - "step": 63, - "time_per_iteration": 2.9081404209136963 - }, - { - "auxiliary_loss_clip": 0.01795068, - "auxiliary_loss_mlp": 0.01171321, - "balance_loss_clip": 1.31071985, - "balance_loss_mlp": 1.10499322, - "epoch": 0.0038478881707500376, - "flos": 12459243962880.0, - "grad_norm": 3.0781639748926697, - "language_loss": 0.98840165, - "learning_rate": 2.677705954159056e-06, - "loss": 1.01806557, - "num_input_tokens_seen": 1223545, - "step": 64, - "time_per_iteration": 2.893603801727295 - }, - { - "auxiliary_loss_clip": 0.01801311, - "auxiliary_loss_mlp": 0.01151112, - "balance_loss_clip": 1.30960393, - "balance_loss_mlp": 1.08368695, - "epoch": 0.003908011423418007, - "flos": 13553334276480.0, - "grad_norm": 2.4813676281781554, - "language_loss": 0.85397774, - "learning_rate": 2.6876883585136904e-06, - "loss": 0.88350195, - "num_input_tokens_seen": 1241175, - "step": 65, - "time_per_iteration": 2.8768796920776367 - }, - { - "auxiliary_loss_clip": 0.01777474, - "auxiliary_loss_mlp": 0.01155217, - "balance_loss_clip": 1.29563761, - "balance_loss_mlp": 1.087888, - "epoch": 0.003968134676085976, - "flos": 18333475292160.0, - "grad_norm": 1.8550079005121831, - "language_loss": 0.85281348, - "learning_rate": 2.697518353781685e-06, - "loss": 0.88214046, - "num_input_tokens_seen": 1259315, - "step": 66, - "time_per_iteration": 2.769274950027466 - }, - { - "auxiliary_loss_clip": 0.01779987, - "auxiliary_loss_mlp": 0.01151372, - "balance_loss_clip": 1.29312515, - "balance_loss_mlp": 1.07650828, - "epoch": 0.004028257928753946, - "flos": 20485242506880.0, - "grad_norm": 2.74895944689593, - "language_loss": 0.96567476, - "learning_rate": 2.7072005239581103e-06, - "loss": 0.99498826, - "num_input_tokens_seen": 1277055, - "step": 67, - "time_per_iteration": 2.889369249343872 - }, - { - "auxiliary_loss_clip": 0.01752442, - "auxiliary_loss_mlp": 0.01152779, - "balance_loss_clip": 1.28765118, - "balance_loss_mlp": 1.08120584, - "epoch": 0.004088381181421915, - "flos": 18843837684480.0, - "grad_norm": 2.109359538419204, - "language_loss": 0.94516367, - "learning_rate": 2.7167392492896727e-06, - "loss": 0.97421581, - "num_input_tokens_seen": 1294355, - "step": 68, - "time_per_iteration": 2.8107409477233887 - }, - { - "auxiliary_loss_clip": 0.01747204, - "auxiliary_loss_mlp": 0.0115424, - "balance_loss_clip": 1.28511512, - "balance_loss_mlp": 1.08476448, - "epoch": 0.004148504434089885, - "flos": 19427817000960.0, - "grad_norm": 2.2931216646069092, - "language_loss": 0.96014255, - "learning_rate": 2.7261387181735195e-06, - "loss": 0.98915702, - "num_input_tokens_seen": 1313525, - "step": 69, - "time_per_iteration": 2.8138387203216553 - }, - { - "auxiliary_loss_clip": 0.01741342, - "auxiliary_loss_mlp": 0.01160375, - "balance_loss_clip": 1.28807163, - "balance_loss_mlp": 1.09581161, - "epoch": 0.004208627686757853, - "flos": 20811023884800.0, - "grad_norm": 2.1764096137707494, - "language_loss": 0.98070192, - "learning_rate": 2.7354029381999196e-06, - "loss": 1.00971913, - "num_input_tokens_seen": 1330505, - "step": 70, - "time_per_iteration": 2.8319084644317627 - }, - { - "auxiliary_loss_clip": 0.0174721, - "auxiliary_loss_mlp": 0.01145619, - "balance_loss_clip": 1.27791202, - "balance_loss_mlp": 1.07685876, - "epoch": 0.004268750939425823, - "flos": 19098623831040.0, - "grad_norm": 2.9300158782571324, - "language_loss": 0.94016141, - "learning_rate": 2.7445357464116983e-06, - "loss": 0.96908975, - "num_input_tokens_seen": 1349615, - "step": 71, - "time_per_iteration": 2.8469433784484863 - }, - { - "auxiliary_loss_clip": 0.01815227, - "auxiliary_loss_mlp": 0.01294388, - "balance_loss_clip": 1.43495834, - "balance_loss_mlp": 1.25490558, - "epoch": 0.004328874192093792, - "flos": 52439635514880.0, - "grad_norm": 2.409331683106634, - "language_loss": 0.65682542, - "learning_rate": 2.75354081884615e-06, - "loss": 0.68792164, - "num_input_tokens_seen": 1410275, - "step": 72, - "time_per_iteration": 3.2019593715667725 - }, - { - "auxiliary_loss_clip": 0.01799527, - "auxiliary_loss_mlp": 0.01271558, - "balance_loss_clip": 1.43197393, - "balance_loss_mlp": 1.2316941, - "epoch": 0.004388997444761762, - "flos": 66473239564800.0, - "grad_norm": 2.25068040880696, - "language_loss": 0.63694263, - "learning_rate": 2.7624216794188286e-06, - "loss": 0.66765356, - "num_input_tokens_seen": 1473020, - "step": 73, - "time_per_iteration": 3.3545596599578857 - }, - { - "auxiliary_loss_clip": 0.01720805, - "auxiliary_loss_mlp": 0.01140553, - "balance_loss_clip": 1.26912856, - "balance_loss_mlp": 1.07279444, - "epoch": 0.004449120697429731, - "flos": 18952970181120.0, - "grad_norm": 2.554977860093902, - "language_loss": 0.86212188, - "learning_rate": 2.771181708202938e-06, - "loss": 0.89073551, - "num_input_tokens_seen": 1490385, - "step": 74, - "time_per_iteration": 2.823498487472534 - }, - { - "auxiliary_loss_clip": 0.0172287, - "auxiliary_loss_mlp": 0.01162493, - "balance_loss_clip": 1.26811171, - "balance_loss_mlp": 1.09344697, - "epoch": 0.004509243950097701, - "flos": 21105491581440.0, - "grad_norm": 3.0087618017840105, - "language_loss": 0.97196102, - "learning_rate": 2.779824149153005e-06, - "loss": 1.00081468, - "num_input_tokens_seen": 1509725, - "step": 75, - "time_per_iteration": 2.888415575027466 - }, - { - "auxiliary_loss_clip": 0.0170198, - "auxiliary_loss_mlp": 0.01142315, - "balance_loss_clip": 1.26420689, - "balance_loss_mlp": 1.07608271, - "epoch": 0.004569367202765669, - "flos": 20698730991360.0, - "grad_norm": 2.6610382542709043, - "language_loss": 0.87740695, - "learning_rate": 2.788352117317012e-06, - "loss": 0.90584993, - "num_input_tokens_seen": 1527245, - "step": 76, - "time_per_iteration": 2.9226863384246826 - }, - { - "auxiliary_loss_clip": 0.01702512, - "auxiliary_loss_mlp": 0.01145374, - "balance_loss_clip": 1.26239479, - "balance_loss_mlp": 1.07656646, - "epoch": 0.004629490455433639, - "flos": 28658474899200.0, - "grad_norm": 2.4272090643104574, - "language_loss": 0.91791159, - "learning_rate": 2.796768605577095e-06, - "loss": 0.94639051, - "num_input_tokens_seen": 1548930, - "step": 77, - "time_per_iteration": 2.8720929622650146 - }, - { - "auxiliary_loss_clip": 0.01693018, - "auxiliary_loss_mlp": 0.01165978, - "balance_loss_clip": 1.26398146, - "balance_loss_mlp": 1.09569168, - "epoch": 0.004689613708101608, - "flos": 11072409805440.0, - "grad_norm": 2.2822185142383034, - "language_loss": 0.9211635, - "learning_rate": 2.80507649095533e-06, - "loss": 0.94975346, - "num_input_tokens_seen": 1565695, - "step": 78, - "time_per_iteration": 2.7832391262054443 - }, - { - "auxiliary_loss_clip": 0.01689271, - "auxiliary_loss_mlp": 0.01153255, - "balance_loss_clip": 1.25836253, - "balance_loss_mlp": 1.08482933, - "epoch": 0.004749736960769578, - "flos": 21799106184960.0, - "grad_norm": 2.263191265943929, - "language_loss": 0.82771945, - "learning_rate": 2.813278540517843e-06, - "loss": 0.85614467, - "num_input_tokens_seen": 1582625, - "step": 79, - "time_per_iteration": 2.7723355293273926 - }, - { - "auxiliary_loss_clip": 0.01702468, - "auxiliary_loss_mlp": 0.01130708, - "balance_loss_clip": 1.26147008, - "balance_loss_mlp": 1.0609467, - "epoch": 0.004809860213437547, - "flos": 19792597570560.0, - "grad_norm": 1.9992491725405546, - "language_loss": 0.91272199, - "learning_rate": 2.8213774169075505e-06, - "loss": 0.94105375, - "num_input_tokens_seen": 1601725, - "step": 80, - "time_per_iteration": 2.742046356201172 - }, - { - "auxiliary_loss_clip": 0.01671156, - "auxiliary_loss_mlp": 0.01144048, - "balance_loss_clip": 1.25365841, - "balance_loss_mlp": 1.07371473, - "epoch": 0.004869983466105517, - "flos": 26574327037440.0, - "grad_norm": 2.0371265012476742, - "language_loss": 0.95241439, - "learning_rate": 2.829375683533245e-06, - "loss": 0.9805665, - "num_input_tokens_seen": 1622420, - "step": 81, - "time_per_iteration": 2.8996386528015137 - }, - { - "auxiliary_loss_clip": 0.01686092, - "auxiliary_loss_mlp": 0.01147828, - "balance_loss_clip": 1.25779653, - "balance_loss_mlp": 1.08149946, - "epoch": 0.004930106718773485, - "flos": 12823378087680.0, - "grad_norm": 2.9441337112970296, - "language_loss": 0.96288472, - "learning_rate": 2.8372758094402803e-06, - "loss": 0.99122393, - "num_input_tokens_seen": 1640715, - "step": 82, - "time_per_iteration": 2.819120407104492 - }, - { - "auxiliary_loss_clip": 0.01668255, - "auxiliary_loss_mlp": 0.01156428, - "balance_loss_clip": 1.2461338, - "balance_loss_mlp": 1.08709574, - "epoch": 0.004990229971441455, - "flos": 25774919902080.0, - "grad_norm": 2.6601797838877856, - "language_loss": 0.86762071, - "learning_rate": 2.84508017388607e-06, - "loss": 0.89586747, - "num_input_tokens_seen": 1662210, - "step": 83, - "time_per_iteration": 2.7959344387054443 - }, - { - "auxiliary_loss_clip": 0.01662665, - "auxiliary_loss_mlp": 0.01154043, - "balance_loss_clip": 1.24844718, - "balance_loss_mlp": 1.084234, - "epoch": 0.005050353224109424, - "flos": 17457254922240.0, - "grad_norm": 2.5416281292503986, - "language_loss": 0.92081314, - "learning_rate": 2.852791070641559e-06, - "loss": 0.94898021, - "num_input_tokens_seen": 1681070, - "step": 84, - "time_per_iteration": 2.7176246643066406 - }, - { - "auxiliary_loss_clip": 0.01647627, - "auxiliary_loss_mlp": 0.01154949, - "balance_loss_clip": 1.36429358, - "balance_loss_mlp": 1.11527622, - "epoch": 0.005110476476777394, - "flos": 69805460367360.0, - "grad_norm": 1.4023430227621099, - "language_loss": 0.6252538, - "learning_rate": 2.8604107120381682e-06, - "loss": 0.65327954, - "num_input_tokens_seen": 1747140, - "step": 85, - "time_per_iteration": 3.296835422515869 - }, - { - "auxiliary_loss_clip": 0.01649469, - "auxiliary_loss_mlp": 0.0112642, - "balance_loss_clip": 1.23797417, - "balance_loss_mlp": 1.05642033, - "epoch": 0.005170599729445363, - "flos": 24790105739520.0, - "grad_norm": 1.805253124779358, - "language_loss": 0.90709531, - "learning_rate": 2.8679412327780482e-06, - "loss": 0.93485421, - "num_input_tokens_seen": 1767475, - "step": 86, - "time_per_iteration": 2.761484146118164 - }, - { - "auxiliary_loss_clip": 0.01653351, - "auxiliary_loss_mlp": 0.01158608, - "balance_loss_clip": 1.24437881, - "balance_loss_mlp": 1.08741617, - "epoch": 0.005230722982113333, - "flos": 23258048895360.0, - "grad_norm": 2.3398213465495776, - "language_loss": 0.81961077, - "learning_rate": 2.8753846935240833e-06, - "loss": 0.8477304, - "num_input_tokens_seen": 1784980, - "step": 87, - "time_per_iteration": 2.763185739517212 - }, - { - "auxiliary_loss_clip": 0.01641581, - "auxiliary_loss_mlp": 0.01152623, - "balance_loss_clip": 1.24129367, - "balance_loss_mlp": 1.08457828, - "epoch": 0.005290846234781301, - "flos": 16727909264640.0, - "grad_norm": 3.1951080427559857, - "language_loss": 0.95790672, - "learning_rate": 2.8827430842847267e-06, - "loss": 0.98584872, - "num_input_tokens_seen": 1803030, - "step": 88, - "time_per_iteration": 2.7855517864227295 - }, - { - "auxiliary_loss_clip": 0.01658657, - "auxiliary_loss_mlp": 0.01147064, - "balance_loss_clip": 1.24130976, - "balance_loss_mlp": 1.07978201, - "epoch": 0.005350969487449271, - "flos": 20886077352960.0, - "grad_norm": 3.405407923072192, - "language_loss": 0.86023164, - "learning_rate": 2.8900183276075957e-06, - "loss": 0.88828892, - "num_input_tokens_seen": 1822865, - "step": 89, - "time_per_iteration": 2.7517924308776855 - }, - { - "auxiliary_loss_clip": 0.01647446, - "auxiliary_loss_mlp": 0.01133456, - "balance_loss_clip": 1.23541856, - "balance_loss_mlp": 1.06727123, - "epoch": 0.00541109274011724, - "flos": 26209977431040.0, - "grad_norm": 2.130771496386599, - "language_loss": 0.9150058, - "learning_rate": 2.8972122815946455e-06, - "loss": 0.94281483, - "num_input_tokens_seen": 1842435, - "step": 90, - "time_per_iteration": 2.7526872158050537 - }, - { - "auxiliary_loss_clip": 0.01629409, - "auxiliary_loss_mlp": 0.01133822, - "balance_loss_clip": 1.23219132, - "balance_loss_mlp": 1.06582534, - "epoch": 0.00547121599278521, - "flos": 21178569801600.0, - "grad_norm": 2.6928798867856796, - "language_loss": 0.86073506, - "learning_rate": 2.90432674275074e-06, - "loss": 0.88836741, - "num_input_tokens_seen": 1860065, - "step": 91, - "time_per_iteration": 2.7995588779449463 - }, - { - "auxiliary_loss_clip": 0.01628638, - "auxiliary_loss_mlp": 0.01138916, - "balance_loss_clip": 1.22774827, - "balance_loss_mlp": 1.07335091, - "epoch": 0.005531339245453179, - "flos": 19718801078400.0, - "grad_norm": 5.062847798051961, - "language_loss": 0.87041199, - "learning_rate": 2.91136344867656e-06, - "loss": 0.8980875, - "num_input_tokens_seen": 1878135, - "step": 92, - "time_per_iteration": 2.7813079357147217 - }, - { - "auxiliary_loss_clip": 0.01620799, - "auxiliary_loss_mlp": 0.01174163, - "balance_loss_clip": 1.21933174, - "balance_loss_mlp": 1.10650027, - "epoch": 0.005591462498121149, - "flos": 17636089760640.0, - "grad_norm": 4.340668874696889, - "language_loss": 0.9210887, - "learning_rate": 2.918324080615938e-06, - "loss": 0.94903833, - "num_input_tokens_seen": 1894895, - "step": 93, - "time_per_iteration": 2.7582218647003174 - }, - { - "auxiliary_loss_clip": 0.0163427, - "auxiliary_loss_mlp": 0.01153574, - "balance_loss_clip": 1.22659743, - "balance_loss_mlp": 1.08238208, - "epoch": 0.005651585750789117, - "flos": 20011221699840.0, - "grad_norm": 4.327341326162078, - "language_loss": 0.87578797, - "learning_rate": 2.925210265866963e-06, - "loss": 0.90366644, - "num_input_tokens_seen": 1913220, - "step": 94, - "time_per_iteration": 2.783581256866455 - }, - { - "auxiliary_loss_clip": 0.01570285, - "auxiliary_loss_mlp": 0.01051726, - "balance_loss_clip": 1.31970167, - "balance_loss_mlp": 1.01376939, - "epoch": 0.005711709003457087, - "flos": 59812957981440.0, - "grad_norm": 1.3608185384271176, - "language_loss": 0.68098927, - "learning_rate": 2.932023580065507e-06, - "loss": 0.70720935, - "num_input_tokens_seen": 1970970, - "step": 95, - "time_per_iteration": 3.1328847408294678 - }, - { - "auxiliary_loss_clip": 0.01612519, - "auxiliary_loss_mlp": 0.01150182, - "balance_loss_clip": 1.21488237, - "balance_loss_mlp": 1.08318627, - "epoch": 0.005771832256125056, - "flos": 15559591495680.0, - "grad_norm": 6.736145376327001, - "language_loss": 0.90221369, - "learning_rate": 2.9387655493491906e-06, - "loss": 0.92984068, - "num_input_tokens_seen": 1988930, - "step": 96, - "time_per_iteration": 2.8015241622924805 - }, - { - "auxiliary_loss_clip": 0.01605814, - "auxiliary_loss_mlp": 0.01142022, - "balance_loss_clip": 1.21851277, - "balance_loss_mlp": 1.08003318, - "epoch": 0.005831955508793026, - "flos": 22528380015360.0, - "grad_norm": 3.8307865500968044, - "language_loss": 0.89869905, - "learning_rate": 2.9454376524092147e-06, - "loss": 0.92617744, - "num_input_tokens_seen": 2006285, - "step": 97, - "time_per_iteration": 4.387299060821533 - }, - { - "auxiliary_loss_clip": 0.01593214, - "auxiliary_loss_mlp": 0.01140673, - "balance_loss_clip": 1.2102325, - "balance_loss_mlp": 1.07200789, - "epoch": 0.005892078761460995, - "flos": 22049834094720.0, - "grad_norm": 2.291581893082518, - "language_loss": 0.76274347, - "learning_rate": 2.952041322436969e-06, - "loss": 0.79008234, - "num_input_tokens_seen": 2024905, - "step": 98, - "time_per_iteration": 2.751507043838501 - }, - { - "auxiliary_loss_clip": 0.01533926, - "auxiliary_loss_mlp": 0.01036775, - "balance_loss_clip": 1.29271698, - "balance_loss_mlp": 1.00129879, - "epoch": 0.005952202014128965, - "flos": 68539143317760.0, - "grad_norm": 1.0388395506080574, - "language_loss": 0.65518898, - "learning_rate": 2.9585779489718204e-06, - "loss": 0.68089598, - "num_input_tokens_seen": 2086220, - "step": 99, - "time_per_iteration": 3.3125040531158447 - }, - { - "auxiliary_loss_clip": 0.01595694, - "auxiliary_loss_mlp": 0.01142556, - "balance_loss_clip": 1.21028757, - "balance_loss_mlp": 1.07217503, - "epoch": 0.006012325266796933, - "flos": 22960887678720.0, - "grad_norm": 2.051483688350497, - "language_loss": 0.90885437, - "learning_rate": 2.9650488796560464e-06, - "loss": 0.93623686, - "num_input_tokens_seen": 2103365, - "step": 100, - "time_per_iteration": 2.7632548809051514 - }, - { - "auxiliary_loss_clip": 0.01607235, - "auxiliary_loss_mlp": 0.01150276, - "balance_loss_clip": 1.21294045, - "balance_loss_mlp": 1.08394814, - "epoch": 0.006072448519464903, - "flos": 17347942857600.0, - "grad_norm": 2.0181737234491566, - "language_loss": 0.91081136, - "learning_rate": 2.971455421902446e-06, - "loss": 0.9383865, - "num_input_tokens_seen": 2121995, - "step": 101, - "time_per_iteration": 2.7214279174804688 - }, - { - "auxiliary_loss_clip": 0.015938, - "auxiliary_loss_mlp": 0.01152009, - "balance_loss_clip": 1.21248627, - "balance_loss_mlp": 1.08124638, - "epoch": 0.006132571772132872, - "flos": 24681116897280.0, - "grad_norm": 2.076276442041171, - "language_loss": 0.90774924, - "learning_rate": 2.9777988444798075e-06, - "loss": 0.93520737, - "num_input_tokens_seen": 2141815, - "step": 102, - "time_per_iteration": 2.8389108180999756 - }, - { - "auxiliary_loss_clip": 0.01588155, - "auxiliary_loss_mlp": 0.01133785, - "balance_loss_clip": 1.20914173, - "balance_loss_mlp": 1.06912589, - "epoch": 0.006192695024800842, - "flos": 21465675210240.0, - "grad_norm": 2.3272829989328456, - "language_loss": 0.88006896, - "learning_rate": 2.9840803790210285e-06, - "loss": 0.90728837, - "num_input_tokens_seen": 2161125, - "step": 103, - "time_per_iteration": 2.768784761428833 - }, - { - "auxiliary_loss_clip": 0.01588751, - "auxiliary_loss_mlp": 0.01136216, - "balance_loss_clip": 1.21138883, - "balance_loss_mlp": 1.06998372, - "epoch": 0.006252818277468811, - "flos": 17420410546560.0, - "grad_norm": 1.9182889224259552, - "language_loss": 0.93644351, - "learning_rate": 2.990301221458371e-06, - "loss": 0.96369314, - "num_input_tokens_seen": 2179510, - "step": 104, - "time_per_iteration": 2.7109038829803467 - }, - { - "auxiliary_loss_clip": 0.01579421, - "auxiliary_loss_mlp": 0.01146524, - "balance_loss_clip": 1.20086741, - "balance_loss_mlp": 1.08258009, - "epoch": 0.006312941530136781, - "flos": 19099557584640.0, - "grad_norm": 3.0437899698059367, - "language_loss": 0.96655375, - "learning_rate": 2.9964625333900544e-06, - "loss": 0.99381316, - "num_input_tokens_seen": 2197870, - "step": 105, - "time_per_iteration": 2.7254133224487305 - }, - { - "auxiliary_loss_clip": 0.01578331, - "auxiliary_loss_mlp": 0.01158544, - "balance_loss_clip": 1.20144236, - "balance_loss_mlp": 1.08768642, - "epoch": 0.006373064782804749, - "flos": 24060831909120.0, - "grad_norm": 3.1837681777002302, - "language_loss": 0.87119448, - "learning_rate": 3.002565443382063e-06, - "loss": 0.89856327, - "num_input_tokens_seen": 2217495, - "step": 106, - "time_per_iteration": 2.7705447673797607 - }, - { - "auxiliary_loss_clip": 0.01561845, - "auxiliary_loss_mlp": 0.01143018, - "balance_loss_clip": 1.18746924, - "balance_loss_mlp": 1.0751636, - "epoch": 0.006433188035472719, - "flos": 18332433797760.0, - "grad_norm": 2.228856706842439, - "language_loss": 0.83398581, - "learning_rate": 3.008611048208843e-06, - "loss": 0.86103439, - "num_input_tokens_seen": 2236520, - "step": 107, - "time_per_iteration": 2.6885263919830322 - }, - { - "auxiliary_loss_clip": 0.01469631, - "auxiliary_loss_mlp": 0.0103327, - "balance_loss_clip": 1.25210869, - "balance_loss_mlp": 1.00179863, - "epoch": 0.006493311288140688, - "flos": 62562387594240.0, - "grad_norm": 0.9900995959758047, - "language_loss": 0.64796811, - "learning_rate": 3.014600414036285e-06, - "loss": 0.67299712, - "num_input_tokens_seen": 2300140, - "step": 108, - "time_per_iteration": 3.278621196746826 - }, - { - "auxiliary_loss_clip": 0.01552898, - "auxiliary_loss_mlp": 0.01132858, - "balance_loss_clip": 1.18960094, - "balance_loss_mlp": 1.06424141, - "epoch": 0.006553434540808658, - "flos": 19500141035520.0, - "grad_norm": 2.019247660217844, - "language_loss": 0.97709465, - "learning_rate": 3.0205345775501937e-06, - "loss": 1.00395215, - "num_input_tokens_seen": 2317320, - "step": 109, - "time_per_iteration": 2.750502347946167 - }, - { - "auxiliary_loss_clip": 0.01550996, - "auxiliary_loss_mlp": 0.01140204, - "balance_loss_clip": 1.19136214, - "balance_loss_mlp": 1.07430482, - "epoch": 0.006613557793476627, - "flos": 21105132445440.0, - "grad_norm": 1.9540987754213832, - "language_loss": 0.84243041, - "learning_rate": 3.0264145470332218e-06, - "loss": 0.86934245, - "num_input_tokens_seen": 2337820, - "step": 110, - "time_per_iteration": 2.82443904876709 - }, - { - "auxiliary_loss_clip": 0.01544634, - "auxiliary_loss_mlp": 0.01151549, - "balance_loss_clip": 1.18396342, - "balance_loss_mlp": 1.08493507, - "epoch": 0.006673681046144597, - "flos": 26030747543040.0, - "grad_norm": 2.4580319150483563, - "language_loss": 0.82940048, - "learning_rate": 3.032241303393073e-06, - "loss": 0.85636234, - "num_input_tokens_seen": 2358560, - "step": 111, - "time_per_iteration": 2.8308968544006348 - }, - { - "auxiliary_loss_clip": 0.0154596, - "auxiliary_loss_mlp": 0.01133366, - "balance_loss_clip": 1.18776846, - "balance_loss_mlp": 1.06970847, - "epoch": 0.006733804298812566, - "flos": 23147767163520.0, - "grad_norm": 2.356589096997363, - "language_loss": 0.93989801, - "learning_rate": 3.0380158011446e-06, - "loss": 0.9666912, - "num_input_tokens_seen": 2379005, - "step": 112, - "time_per_iteration": 2.8007922172546387 - }, - { - "auxiliary_loss_clip": 0.01549647, - "auxiliary_loss_mlp": 0.01136979, - "balance_loss_clip": 1.18394601, - "balance_loss_mlp": 1.07322621, - "epoch": 0.006793927551480535, - "flos": 11764444210560.0, - "grad_norm": 2.521639841990545, - "language_loss": 0.79509294, - "learning_rate": 3.0437389693482466e-06, - "loss": 0.82195914, - "num_input_tokens_seen": 2395610, - "step": 113, - "time_per_iteration": 2.7599966526031494 - }, - { - "auxiliary_loss_clip": 0.0153736, - "auxiliary_loss_mlp": 0.01131524, - "balance_loss_clip": 1.18028498, - "balance_loss_mlp": 1.06562555, - "epoch": 0.006854050804148504, - "flos": 19171953446400.0, - "grad_norm": 2.343117351168218, - "language_loss": 0.93439317, - "learning_rate": 3.0494117125071475e-06, - "loss": 0.96108204, - "num_input_tokens_seen": 2415005, - "step": 114, - "time_per_iteration": 2.723540782928467 - }, - { - "auxiliary_loss_clip": 0.01544971, - "auxiliary_loss_mlp": 0.01138932, - "balance_loss_clip": 1.17997146, - "balance_loss_mlp": 1.07918465, - "epoch": 0.006914174056816474, - "flos": 21981891519360.0, - "grad_norm": 1.9509019191057126, - "language_loss": 0.9463321, - "learning_rate": 3.055034911425055e-06, - "loss": 0.97317111, - "num_input_tokens_seen": 2433965, - "step": 115, - "time_per_iteration": 2.7077698707580566 - }, - { - "auxiliary_loss_clip": 0.01537699, - "auxiliary_loss_mlp": 0.01118178, - "balance_loss_clip": 1.17675614, - "balance_loss_mlp": 1.05151677, - "epoch": 0.006974297309484443, - "flos": 16289152634880.0, - "grad_norm": 10.363795807176915, - "language_loss": 0.82148951, - "learning_rate": 3.0606094240271244e-06, - "loss": 0.84804827, - "num_input_tokens_seen": 2451605, - "step": 116, - "time_per_iteration": 2.681190013885498 - }, - { - "auxiliary_loss_clip": 0.01528803, - "auxiliary_loss_mlp": 0.01126189, - "balance_loss_clip": 1.17677391, - "balance_loss_mlp": 1.06219721, - "epoch": 0.007034420562152413, - "flos": 26104005331200.0, - "grad_norm": 2.4150591879391627, - "language_loss": 0.88368428, - "learning_rate": 3.0661360861454656e-06, - "loss": 0.91023421, - "num_input_tokens_seen": 2472035, - "step": 117, - "time_per_iteration": 2.776143789291382 - }, - { - "auxiliary_loss_clip": 0.01527909, - "auxiliary_loss_mlp": 0.01146127, - "balance_loss_clip": 1.17495561, - "balance_loss_mlp": 1.08041906, - "epoch": 0.007094543814820382, - "flos": 14204609723520.0, - "grad_norm": 2.3639764059040265, - "language_loss": 0.8454417, - "learning_rate": 3.071615712271274e-06, - "loss": 0.87218207, - "num_input_tokens_seen": 2489285, - "step": 118, - "time_per_iteration": 2.7110469341278076 - }, - { - "auxiliary_loss_clip": 0.01538161, - "auxiliary_loss_mlp": 0.01163868, - "balance_loss_clip": 1.1759789, - "balance_loss_mlp": 1.0984937, - "epoch": 0.007154667067488351, - "flos": 14976007228800.0, - "grad_norm": 2.231843342078736, - "language_loss": 0.99319011, - "learning_rate": 3.0770490962752172e-06, - "loss": 1.02021039, - "num_input_tokens_seen": 2506460, - "step": 119, - "time_per_iteration": 2.674121856689453 - }, - { - "auxiliary_loss_clip": 0.01540018, - "auxiliary_loss_mlp": 0.01120611, - "balance_loss_clip": 1.17242217, - "balance_loss_mlp": 1.05738258, - "epoch": 0.00721479032015632, - "flos": 20193288762240.0, - "grad_norm": 2.7981733983226764, - "language_loss": 0.8963809, - "learning_rate": 3.082437012097686e-06, - "loss": 0.92298722, - "num_input_tokens_seen": 2525565, - "step": 120, - "time_per_iteration": 2.745962381362915 - }, - { - "auxiliary_loss_clip": 0.01524916, - "auxiliary_loss_mlp": 0.01129465, - "balance_loss_clip": 1.1734432, - "balance_loss_mlp": 1.06513989, - "epoch": 0.00727491357282429, - "flos": 23147228459520.0, - "grad_norm": 1.797716104424251, - "language_loss": 0.93491542, - "learning_rate": 3.0877802144103967e-06, - "loss": 0.96145928, - "num_input_tokens_seen": 2546605, - "step": 121, - "time_per_iteration": 2.7924466133117676 - }, - { - "auxiliary_loss_clip": 0.01526294, - "auxiliary_loss_mlp": 0.0114832, - "balance_loss_clip": 1.17395604, - "balance_loss_mlp": 1.08490098, - "epoch": 0.007335036825492259, - "flos": 15521669712000.0, - "grad_norm": 2.3704869501778285, - "language_loss": 0.90462255, - "learning_rate": 3.09307943925077e-06, - "loss": 0.93136871, - "num_input_tokens_seen": 2560730, - "step": 122, - "time_per_iteration": 2.930413246154785 - }, - { - "auxiliary_loss_clip": 0.01521826, - "auxiliary_loss_mlp": 0.01146566, - "balance_loss_clip": 1.1681807, - "balance_loss_mlp": 1.07861674, - "epoch": 0.007395160078160229, - "flos": 24243365848320.0, - "grad_norm": 2.4867163179710037, - "language_loss": 0.92660481, - "learning_rate": 3.0983354046304154e-06, - "loss": 0.95328873, - "num_input_tokens_seen": 2579550, - "step": 123, - "time_per_iteration": 2.7484309673309326 - }, - { - "auxiliary_loss_clip": 0.01519363, - "auxiliary_loss_mlp": 0.01127611, - "balance_loss_clip": 1.16324139, - "balance_loss_mlp": 1.0651449, - "epoch": 0.007455283330828198, - "flos": 31759792099200.0, - "grad_norm": 2.366639004459226, - "language_loss": 0.71187961, - "learning_rate": 3.103548811118979e-06, - "loss": 0.73834932, - "num_input_tokens_seen": 2600390, - "step": 124, - "time_per_iteration": 2.8419976234436035 - }, - { - "auxiliary_loss_clip": 0.01506936, - "auxiliary_loss_mlp": 0.01125571, - "balance_loss_clip": 1.16464007, - "balance_loss_mlp": 1.06167519, - "epoch": 0.007515406583496167, - "flos": 26615157822720.0, - "grad_norm": 2.1632751269766106, - "language_loss": 0.88450015, - "learning_rate": 3.108720342404542e-06, - "loss": 0.91082525, - "num_input_tokens_seen": 2620770, - "step": 125, - "time_per_iteration": 2.823296308517456 - }, - { - "auxiliary_loss_clip": 0.01522239, - "auxiliary_loss_mlp": 0.01142214, - "balance_loss_clip": 1.16456664, - "balance_loss_mlp": 1.07912827, - "epoch": 0.007575529836164136, - "flos": 18223696350720.0, - "grad_norm": 2.6632616920164067, - "language_loss": 0.82381976, - "learning_rate": 3.1138506658316945e-06, - "loss": 0.85046428, - "num_input_tokens_seen": 2639900, - "step": 126, - "time_per_iteration": 2.7325809001922607 - }, - { - "auxiliary_loss_clip": 0.015153, - "auxiliary_loss_mlp": 0.01142869, - "balance_loss_clip": 1.16330886, - "balance_loss_mlp": 1.08088017, - "epoch": 0.007635653088832106, - "flos": 21580410228480.0, - "grad_norm": 3.925284628341409, - "language_loss": 0.6743899, - "learning_rate": 3.1189404329183404e-06, - "loss": 0.7009716, - "num_input_tokens_seen": 2657450, - "step": 127, - "time_per_iteration": 2.709821939468384 - }, - { - "auxiliary_loss_clip": 0.01503057, - "auxiliary_loss_mlp": 0.01132416, - "balance_loss_clip": 1.165169, - "balance_loss_mlp": 1.06861567, - "epoch": 0.007695776341500075, - "flos": 25375054723200.0, - "grad_norm": 2.0535131533503734, - "language_loss": 0.8819322, - "learning_rate": 3.1239902798522317e-06, - "loss": 0.90828693, - "num_input_tokens_seen": 2678150, - "step": 128, - "time_per_iteration": 2.764707565307617 - }, - { - "auxiliary_loss_clip": 0.01505955, - "auxiliary_loss_mlp": 0.01144223, - "balance_loss_clip": 1.16043079, - "balance_loss_mlp": 1.08042252, - "epoch": 0.007755899594168045, - "flos": 22343906741760.0, - "grad_norm": 2.6427711693827005, - "language_loss": 0.84719259, - "learning_rate": 3.129000827968184e-06, - "loss": 0.87369436, - "num_input_tokens_seen": 2698290, - "step": 129, - "time_per_iteration": 2.7472774982452393 - }, - { - "auxiliary_loss_clip": 0.01497871, - "auxiliary_loss_mlp": 0.01130211, - "balance_loss_clip": 1.15871263, - "balance_loss_mlp": 1.06655347, - "epoch": 0.007816022846836013, - "flos": 22638230784000.0, - "grad_norm": 2.366492959329914, - "language_loss": 0.97564614, - "learning_rate": 3.133972684206866e-06, - "loss": 1.00192702, - "num_input_tokens_seen": 2717630, - "step": 130, - "time_per_iteration": 2.6955018043518066 - }, - { - "auxiliary_loss_clip": 0.01492272, - "auxiliary_loss_mlp": 0.01134965, - "balance_loss_clip": 1.15630865, - "balance_loss_mlp": 1.06987715, - "epoch": 0.007876146099503984, - "flos": 18182901479040.0, - "grad_norm": 2.2164470079204572, - "language_loss": 0.82658112, - "learning_rate": 3.138906441556014e-06, - "loss": 0.85285342, - "num_input_tokens_seen": 2735835, - "step": 131, - "time_per_iteration": 2.722247362136841 - }, - { - "auxiliary_loss_clip": 0.01500937, - "auxiliary_loss_mlp": 0.01128359, - "balance_loss_clip": 1.15885806, - "balance_loss_mlp": 1.06694245, - "epoch": 0.007936269352171952, - "flos": 27119486730240.0, - "grad_norm": 2.7663180664822193, - "language_loss": 0.82781422, - "learning_rate": 3.143802679474861e-06, - "loss": 0.85410714, - "num_input_tokens_seen": 2756335, - "step": 132, - "time_per_iteration": 2.7937612533569336 - }, - { - "auxiliary_loss_clip": 0.01491919, - "auxiliary_loss_mlp": 0.01128624, - "balance_loss_clip": 1.15346444, - "balance_loss_mlp": 1.0664922, - "epoch": 0.007996392604839923, - "flos": 19026335710080.0, - "grad_norm": 2.182366740159355, - "language_loss": 0.95499313, - "learning_rate": 3.1486619643025565e-06, - "loss": 0.98119843, - "num_input_tokens_seen": 2775090, - "step": 133, - "time_per_iteration": 2.7380354404449463 - }, - { - "auxiliary_loss_clip": 0.01487746, - "auxiliary_loss_mlp": 0.0112871, - "balance_loss_clip": 1.16170454, - "balance_loss_mlp": 1.06843781, - "epoch": 0.008056515857507891, - "flos": 25484151306240.0, - "grad_norm": 1.8164116645967854, - "language_loss": 0.73478442, - "learning_rate": 3.153484849651286e-06, - "loss": 0.76094896, - "num_input_tokens_seen": 2795320, - "step": 134, - "time_per_iteration": 2.7483408451080322 - }, - { - "auxiliary_loss_clip": 0.01484621, - "auxiliary_loss_mlp": 0.01132134, - "balance_loss_clip": 1.15115011, - "balance_loss_mlp": 1.06695068, - "epoch": 0.00811663911017586, - "flos": 20557566541440.0, - "grad_norm": 5.027018494085059, - "language_loss": 0.88792509, - "learning_rate": 3.1582718767847806e-06, - "loss": 0.91409266, - "num_input_tokens_seen": 2812815, - "step": 135, - "time_per_iteration": 2.6838128566741943 - }, - { - "auxiliary_loss_clip": 0.01487119, - "auxiliary_loss_mlp": 0.0113257, - "balance_loss_clip": 1.15490174, - "balance_loss_mlp": 1.06714821, - "epoch": 0.00817676236284383, - "flos": 18799738761600.0, - "grad_norm": 1.9282722528396903, - "language_loss": 0.89138198, - "learning_rate": 3.1630235749828485e-06, - "loss": 0.91757882, - "num_input_tokens_seen": 2830445, - "step": 136, - "time_per_iteration": 2.726475238800049 - }, - { - "auxiliary_loss_clip": 0.01483417, - "auxiliary_loss_mlp": 0.01110724, - "balance_loss_clip": 1.1494019, - "balance_loss_mlp": 1.05078554, - "epoch": 0.008236885615511799, - "flos": 23873593288320.0, - "grad_norm": 2.2984339413846078, - "language_loss": 0.84091324, - "learning_rate": 3.1677404618925676e-06, - "loss": 0.86685467, - "num_input_tokens_seen": 2846965, - "step": 137, - "time_per_iteration": 7.4708640575408936 - }, - { - "auxiliary_loss_clip": 0.01481848, - "auxiliary_loss_mlp": 0.01118837, - "balance_loss_clip": 1.1500535, - "balance_loss_mlp": 1.05894589, - "epoch": 0.00829700886817977, - "flos": 24643626076800.0, - "grad_norm": 1.69378413504035, - "language_loss": 0.9018681, - "learning_rate": 3.1724230438666953e-06, - "loss": 0.92787492, - "num_input_tokens_seen": 2867520, - "step": 138, - "time_per_iteration": 4.311830520629883 - }, - { - "auxiliary_loss_clip": 0.01469655, - "auxiliary_loss_mlp": 0.01123604, - "balance_loss_clip": 1.14824438, - "balance_loss_mlp": 1.05904007, - "epoch": 0.008357132120847738, - "flos": 25262007644160.0, - "grad_norm": 2.1515203004813785, - "language_loss": 0.91478992, - "learning_rate": 3.177071816289865e-06, - "loss": 0.94072247, - "num_input_tokens_seen": 2885675, - "step": 139, - "time_per_iteration": 2.7678122520446777 - }, - { - "auxiliary_loss_clip": 0.01486799, - "auxiliary_loss_mlp": 0.01124947, - "balance_loss_clip": 1.15521085, - "balance_loss_mlp": 1.06195688, - "epoch": 0.008417255373515706, - "flos": 27344898529920.0, - "grad_norm": 2.305315677890536, - "language_loss": 0.85667789, - "learning_rate": 3.181687263893095e-06, - "loss": 0.88279533, - "num_input_tokens_seen": 2905960, - "step": 140, - "time_per_iteration": 2.8557639122009277 - }, - { - "auxiliary_loss_clip": 0.01473538, - "auxiliary_loss_mlp": 0.01122701, - "balance_loss_clip": 1.14923954, - "balance_loss_mlp": 1.06166625, - "epoch": 0.008477378626183677, - "flos": 17639070589440.0, - "grad_norm": 2.3443620963590455, - "language_loss": 0.84346074, - "learning_rate": 3.186269861057098e-06, - "loss": 0.86942315, - "num_input_tokens_seen": 2922780, - "step": 141, - "time_per_iteration": 2.7656807899475098 - }, - { - "auxiliary_loss_clip": 0.01477141, - "auxiliary_loss_mlp": 0.01135217, - "balance_loss_clip": 1.14718878, - "balance_loss_mlp": 1.07360983, - "epoch": 0.008537501878851645, - "flos": 13881342297600.0, - "grad_norm": 2.29020652115343, - "language_loss": 0.8105557, - "learning_rate": 3.1908200721048745e-06, - "loss": 0.83667928, - "num_input_tokens_seen": 2938765, - "step": 142, - "time_per_iteration": 2.747598171234131 - }, - { - "auxiliary_loss_clip": 0.01378886, - "auxiliary_loss_mlp": 0.01060004, - "balance_loss_clip": 1.19240355, - "balance_loss_mlp": 1.03406358, - "epoch": 0.008597625131519616, - "flos": 71248101281280.0, - "grad_norm": 1.056887207538052, - "language_loss": 0.66899812, - "learning_rate": 3.195338351584042e-06, - "loss": 0.69338703, - "num_input_tokens_seen": 3006665, - "step": 143, - "time_per_iteration": 3.346982002258301 - }, - { - "auxiliary_loss_clip": 0.01467707, - "auxiliary_loss_mlp": 0.01123721, - "balance_loss_clip": 1.14666772, - "balance_loss_mlp": 1.06273365, - "epoch": 0.008657748384187584, - "flos": 17602836744960.0, - "grad_norm": 2.6467048454978523, - "language_loss": 0.84356761, - "learning_rate": 3.1998251445393258e-06, - "loss": 0.86948192, - "num_input_tokens_seen": 3024335, - "step": 144, - "time_per_iteration": 2.762087345123291 - }, - { - "auxiliary_loss_clip": 0.01455701, - "auxiliary_loss_mlp": 0.01114511, - "balance_loss_clip": 1.14058816, - "balance_loss_mlp": 1.05085373, - "epoch": 0.008717871636855555, - "flos": 19715317459200.0, - "grad_norm": 1.8692883316747366, - "language_loss": 0.88353741, - "learning_rate": 3.204280886775619e-06, - "loss": 0.90923953, - "num_input_tokens_seen": 3043300, - "step": 145, - "time_per_iteration": 2.7050039768218994 - }, - { - "auxiliary_loss_clip": 0.01470385, - "auxiliary_loss_mlp": 0.01121817, - "balance_loss_clip": 1.14247775, - "balance_loss_mlp": 1.05873132, - "epoch": 0.008777994889523523, - "flos": 24717422568960.0, - "grad_norm": 1.860830881508538, - "language_loss": 0.86182559, - "learning_rate": 3.208706005112005e-06, - "loss": 0.88774765, - "num_input_tokens_seen": 3064610, - "step": 146, - "time_per_iteration": 2.741013288497925 - }, - { - "auxiliary_loss_clip": 0.01356998, - "auxiliary_loss_mlp": 0.01029681, - "balance_loss_clip": 1.18072379, - "balance_loss_mlp": 1.00431335, - "epoch": 0.008838118142191492, - "flos": 70132067758080.0, - "grad_norm": 0.8598047517885464, - "language_loss": 0.60122073, - "learning_rate": 3.213100917627104e-06, - "loss": 0.6250875, - "num_input_tokens_seen": 3130385, - "step": 147, - "time_per_iteration": 3.27382230758667 - }, - { - "auxiliary_loss_clip": 0.01463009, - "auxiliary_loss_mlp": 0.01123472, - "balance_loss_clip": 1.14658976, - "balance_loss_mlp": 1.06548882, - "epoch": 0.008898241394859462, - "flos": 20044797937920.0, - "grad_norm": 1.8116070485228748, - "language_loss": 0.84620225, - "learning_rate": 3.2174660338961135e-06, - "loss": 0.87206709, - "num_input_tokens_seen": 3149760, - "step": 148, - "time_per_iteration": 2.72910475730896 - }, - { - "auxiliary_loss_clip": 0.01466623, - "auxiliary_loss_mlp": 0.01144944, - "balance_loss_clip": 1.14777792, - "balance_loss_mlp": 1.07985532, - "epoch": 0.008958364647527431, - "flos": 10743611685120.0, - "grad_norm": 2.5530775415688205, - "language_loss": 0.88680327, - "learning_rate": 3.2218017552198588e-06, - "loss": 0.91291893, - "num_input_tokens_seen": 3164500, - "step": 149, - "time_per_iteration": 2.688528537750244 - }, - { - "auxiliary_loss_clip": 0.01463954, - "auxiliary_loss_mlp": 0.01114885, - "balance_loss_clip": 1.14290714, - "balance_loss_mlp": 1.05728304, - "epoch": 0.009018487900195401, - "flos": 29127467802240.0, - "grad_norm": 2.1996557200804823, - "language_loss": 0.93269086, - "learning_rate": 3.226108474846181e-06, - "loss": 0.95847929, - "num_input_tokens_seen": 3182455, - "step": 150, - "time_per_iteration": 2.7901580333709717 - }, - { - "auxiliary_loss_clip": 0.01450819, - "auxiliary_loss_mlp": 0.01114571, - "balance_loss_clip": 1.13812149, - "balance_loss_mlp": 1.05839944, - "epoch": 0.00907861115286337, - "flos": 32963661354240.0, - "grad_norm": 4.690239135210318, - "language_loss": 0.7421813, - "learning_rate": 3.2303865781839817e-06, - "loss": 0.7678352, - "num_input_tokens_seen": 3203995, - "step": 151, - "time_per_iteration": 2.79590106010437 - }, - { - "auxiliary_loss_clip": 0.01463077, - "auxiliary_loss_mlp": 0.01128244, - "balance_loss_clip": 1.14311624, - "balance_loss_mlp": 1.06954527, - "epoch": 0.009138734405531338, - "flos": 21762441377280.0, - "grad_norm": 4.291097242497492, - "language_loss": 0.88460332, - "learning_rate": 3.234636443010188e-06, - "loss": 0.9105165, - "num_input_tokens_seen": 3222575, - "step": 152, - "time_per_iteration": 2.701775550842285 - }, - { - "auxiliary_loss_clip": 0.01462099, - "auxiliary_loss_mlp": 0.01122264, - "balance_loss_clip": 1.14743185, - "balance_loss_mlp": 1.06275451, - "epoch": 0.009198857658199309, - "flos": 20842517134080.0, - "grad_norm": 3.861411936226758, - "language_loss": 0.83918798, - "learning_rate": 3.238858439669943e-06, - "loss": 0.8650316, - "num_input_tokens_seen": 3240180, - "step": 153, - "time_per_iteration": 2.730654716491699 - }, - { - "auxiliary_loss_clip": 0.01453756, - "auxiliary_loss_mlp": 0.01136244, - "balance_loss_clip": 1.14024806, - "balance_loss_mlp": 1.07554269, - "epoch": 0.009258980910867277, - "flos": 24827381078400.0, - "grad_norm": 1.8788427995178905, - "language_loss": 0.89924759, - "learning_rate": 3.2430529312702712e-06, - "loss": 0.92514759, - "num_input_tokens_seen": 3259800, - "step": 154, - "time_per_iteration": 2.8150386810302734 - }, - { - "auxiliary_loss_clip": 0.01457041, - "auxiliary_loss_mlp": 0.01148182, - "balance_loss_clip": 1.1422174, - "balance_loss_mlp": 1.08934021, - "epoch": 0.009319104163535248, - "flos": 28767786963840.0, - "grad_norm": 2.155148564981828, - "language_loss": 0.89730597, - "learning_rate": 3.2472202738674737e-06, - "loss": 0.9233582, - "num_input_tokens_seen": 3280400, - "step": 155, - "time_per_iteration": 2.7780215740203857 - }, - { - "auxiliary_loss_clip": 0.01462257, - "auxiliary_loss_mlp": 0.01115972, - "balance_loss_clip": 1.14140153, - "balance_loss_mlp": 1.0580368, - "epoch": 0.009379227416203216, - "flos": 16582004219520.0, - "grad_norm": 2.6722626388977986, - "language_loss": 0.86758631, - "learning_rate": 3.2513608166485063e-06, - "loss": 0.8933686, - "num_input_tokens_seen": 3297600, - "step": 156, - "time_per_iteration": 2.7195818424224854 - }, - { - "auxiliary_loss_clip": 0.01460326, - "auxiliary_loss_mlp": 0.01116019, - "balance_loss_clip": 1.14530039, - "balance_loss_mlp": 1.05770147, - "epoch": 0.009439350668871187, - "flos": 18329919845760.0, - "grad_norm": 2.3212743339319926, - "language_loss": 0.99652225, - "learning_rate": 3.2554749021065498e-06, - "loss": 1.0222857, - "num_input_tokens_seen": 3313635, - "step": 157, - "time_per_iteration": 2.7530624866485596 - }, - { - "auxiliary_loss_clip": 0.01445494, - "auxiliary_loss_mlp": 0.01139991, - "balance_loss_clip": 1.14011836, - "balance_loss_mlp": 1.08162606, - "epoch": 0.009499473921539155, - "flos": 24349912565760.0, - "grad_norm": 2.2650385025378834, - "language_loss": 0.88388717, - "learning_rate": 3.2595628662110186e-06, - "loss": 0.90974212, - "num_input_tokens_seen": 3333735, - "step": 158, - "time_per_iteration": 2.744640588760376 - }, - { - "auxiliary_loss_clip": 0.01451838, - "auxiliary_loss_mlp": 0.01122147, - "balance_loss_clip": 1.13977575, - "balance_loss_mlp": 1.0630666, - "epoch": 0.009559597174207124, - "flos": 16399326625920.0, - "grad_norm": 2.1807440045696165, - "language_loss": 0.86407602, - "learning_rate": 3.2636250385721982e-06, - "loss": 0.88981581, - "num_input_tokens_seen": 3348800, - "step": 159, - "time_per_iteration": 2.7330005168914795 - }, - { - "auxiliary_loss_clip": 0.01441743, - "auxiliary_loss_mlp": 0.01137796, - "balance_loss_clip": 1.13474953, - "balance_loss_mlp": 1.07752383, - "epoch": 0.009619720426875094, - "flos": 22856890826880.0, - "grad_norm": 1.7296815250329798, - "language_loss": 0.86756837, - "learning_rate": 3.2676617426007263e-06, - "loss": 0.89336377, - "num_input_tokens_seen": 3368595, - "step": 160, - "time_per_iteration": 2.844817876815796 - }, - { - "auxiliary_loss_clip": 0.01447614, - "auxiliary_loss_mlp": 0.0112266, - "balance_loss_clip": 1.13978457, - "balance_loss_mlp": 1.06725168, - "epoch": 0.009679843679543063, - "flos": 19135001329920.0, - "grad_norm": 2.462408333273543, - "language_loss": 0.91543746, - "learning_rate": 3.2716732956621042e-06, - "loss": 0.94114017, - "num_input_tokens_seen": 3384975, - "step": 161, - "time_per_iteration": 2.667666435241699 - }, - { - "auxiliary_loss_clip": 0.01453392, - "auxiliary_loss_mlp": 0.01111804, - "balance_loss_clip": 1.14104879, - "balance_loss_mlp": 1.05610919, - "epoch": 0.009739966932211033, - "flos": 20302995876480.0, - "grad_norm": 1.7914334411859298, - "language_loss": 0.91582954, - "learning_rate": 3.2756600092264203e-06, - "loss": 0.94148147, - "num_input_tokens_seen": 3404755, - "step": 162, - "time_per_iteration": 2.6779961585998535 - }, - { - "auxiliary_loss_clip": 0.0131522, - "auxiliary_loss_mlp": 0.01056953, - "balance_loss_clip": 1.15019548, - "balance_loss_mlp": 1.03358769, - "epoch": 0.009800090184879002, - "flos": 67034234177280.0, - "grad_norm": 1.183297200633083, - "language_loss": 0.72292268, - "learning_rate": 3.279622189013474e-06, - "loss": 0.74664438, - "num_input_tokens_seen": 3467210, - "step": 163, - "time_per_iteration": 3.226755142211914 - }, - { - "auxiliary_loss_clip": 0.01439788, - "auxiliary_loss_mlp": 0.01116102, - "balance_loss_clip": 1.13873029, - "balance_loss_mlp": 1.05921507, - "epoch": 0.00986021343754697, - "flos": 17164690646400.0, - "grad_norm": 3.3372881081540937, - "language_loss": 0.84684807, - "learning_rate": 3.283560135133457e-06, - "loss": 0.87240696, - "num_input_tokens_seen": 3483220, - "step": 164, - "time_per_iteration": 2.768935203552246 - }, - { - "auxiliary_loss_clip": 0.01430933, - "auxiliary_loss_mlp": 0.0110117, - "balance_loss_clip": 1.13048434, - "balance_loss_mlp": 1.04533219, - "epoch": 0.00992033669021494, - "flos": 17749424148480.0, - "grad_norm": 4.079659732294038, - "language_loss": 0.89080763, - "learning_rate": 3.2874741422233565e-06, - "loss": 0.91612864, - "num_input_tokens_seen": 3501465, - "step": 165, - "time_per_iteration": 2.673292875289917 - }, - { - "auxiliary_loss_clip": 0.01433192, - "auxiliary_loss_mlp": 0.01128138, - "balance_loss_clip": 1.13111067, - "balance_loss_mlp": 1.06819916, - "epoch": 0.00998045994288291, - "flos": 25297164080640.0, - "grad_norm": 1.7359539169577796, - "language_loss": 0.79931343, - "learning_rate": 3.2913644995792465e-06, - "loss": 0.82492673, - "num_input_tokens_seen": 3520480, - "step": 166, - "time_per_iteration": 2.762742757797241 - }, - { - "auxiliary_loss_clip": 0.01438026, - "auxiliary_loss_mlp": 0.01129718, - "balance_loss_clip": 1.13488948, - "balance_loss_mlp": 1.07066131, - "epoch": 0.01004058319555088, - "flos": 32298954220800.0, - "grad_norm": 2.3252666324684585, - "language_loss": 0.92125285, - "learning_rate": 3.2952314912845914e-06, - "loss": 0.94693023, - "num_input_tokens_seen": 3539570, - "step": 167, - "time_per_iteration": 2.970964193344116 - }, - { - "auxiliary_loss_clip": 0.01429698, - "auxiliary_loss_mlp": 0.01133324, - "balance_loss_clip": 1.13294363, - "balance_loss_mlp": 1.07734346, - "epoch": 0.010100706448218848, - "flos": 11319941404800.0, - "grad_norm": 13.512238716069085, - "language_loss": 0.90781063, - "learning_rate": 3.299075396334735e-06, - "loss": 0.93344086, - "num_input_tokens_seen": 3555465, - "step": 168, - "time_per_iteration": 2.8039841651916504 - }, - { - "auxiliary_loss_clip": 0.01424367, - "auxiliary_loss_mlp": 0.01104795, - "balance_loss_clip": 1.12848639, - "balance_loss_mlp": 1.04700291, - "epoch": 0.010160829700886819, - "flos": 29719491765120.0, - "grad_norm": 1.6705351130563955, - "language_loss": 0.87173021, - "learning_rate": 3.3028964887576868e-06, - "loss": 0.89702177, - "num_input_tokens_seen": 3578970, - "step": 169, - "time_per_iteration": 2.8215444087982178 - }, - { - "auxiliary_loss_clip": 0.01425902, - "auxiliary_loss_mlp": 0.01110538, - "balance_loss_clip": 1.13139379, - "balance_loss_mlp": 1.05317438, - "epoch": 0.010220952953554787, - "flos": 20412343854720.0, - "grad_norm": 1.7404257397879006, - "language_loss": 0.84622329, - "learning_rate": 3.306695037731344e-06, - "loss": 0.87158769, - "num_input_tokens_seen": 3597275, - "step": 170, - "time_per_iteration": 2.6759181022644043 - }, - { - "auxiliary_loss_clip": 0.0143612, - "auxiliary_loss_mlp": 0.01137162, - "balance_loss_clip": 1.13149834, - "balance_loss_mlp": 1.07874942, - "epoch": 0.010281076206222756, - "flos": 31285124847360.0, - "grad_norm": 2.174517661608974, - "language_loss": 0.89936447, - "learning_rate": 3.3104713076972827e-06, - "loss": 0.92509729, - "num_input_tokens_seen": 3618905, - "step": 171, - "time_per_iteration": 2.800394058227539 - }, - { - "auxiliary_loss_clip": 0.01430673, - "auxiliary_loss_mlp": 0.01108779, - "balance_loss_clip": 1.1347487, - "balance_loss_mlp": 1.05382347, - "epoch": 0.010341199458890726, - "flos": 21982286568960.0, - "grad_norm": 1.938241860949196, - "language_loss": 0.88895655, - "learning_rate": 3.314225558471224e-06, - "loss": 0.91435111, - "num_input_tokens_seen": 3639610, - "step": 172, - "time_per_iteration": 2.755190849304199 - }, - { - "auxiliary_loss_clip": 0.01418638, - "auxiliary_loss_mlp": 0.01118471, - "balance_loss_clip": 1.12744904, - "balance_loss_mlp": 1.06270456, - "epoch": 0.010401322711558695, - "flos": 30810529422720.0, - "grad_norm": 1.7925778946034159, - "language_loss": 0.80943549, - "learning_rate": 3.317958045350308e-06, - "loss": 0.83480656, - "num_input_tokens_seen": 3664030, - "step": 173, - "time_per_iteration": 2.751945734024048 - }, - { - "auxiliary_loss_clip": 0.01429615, - "auxiliary_loss_mlp": 0.01107965, - "balance_loss_clip": 1.13108575, - "balance_loss_mlp": 1.05534625, - "epoch": 0.010461445964226665, - "flos": 24715124098560.0, - "grad_norm": 2.1644843911099216, - "language_loss": 0.82763064, - "learning_rate": 3.3216690192172596e-06, - "loss": 0.85300648, - "num_input_tokens_seen": 3683615, - "step": 174, - "time_per_iteration": 2.676630735397339 - }, - { - "auxiliary_loss_clip": 0.01423443, - "auxiliary_loss_mlp": 0.01120976, - "balance_loss_clip": 1.12816644, - "balance_loss_mlp": 1.06523335, - "epoch": 0.010521569216894634, - "flos": 27710361457920.0, - "grad_norm": 2.331494685324117, - "language_loss": 0.72837007, - "learning_rate": 3.325358726641591e-06, - "loss": 0.75381434, - "num_input_tokens_seen": 3704540, - "step": 175, - "time_per_iteration": 2.6876866817474365 - }, - { - "auxiliary_loss_clip": 0.01425333, - "auxiliary_loss_mlp": 0.01127215, - "balance_loss_clip": 1.12866652, - "balance_loss_mlp": 1.06980324, - "epoch": 0.010581692469562603, - "flos": 12458346122880.0, - "grad_norm": 4.811985773634618, - "language_loss": 0.97983754, - "learning_rate": 3.329027409977902e-06, - "loss": 1.00536299, - "num_input_tokens_seen": 3721320, - "step": 176, - "time_per_iteration": 2.8159937858581543 - }, - { - "auxiliary_loss_clip": 0.0141033, - "auxiliary_loss_mlp": 0.01130651, - "balance_loss_clip": 1.12546706, - "balance_loss_mlp": 1.07738805, - "epoch": 0.010641815722230573, - "flos": 19427601519360.0, - "grad_norm": 2.8326118759658585, - "language_loss": 0.76926064, - "learning_rate": 3.3326753074614087e-06, - "loss": 0.7946704, - "num_input_tokens_seen": 3739385, - "step": 177, - "time_per_iteration": 5.7707555294036865 - }, - { - "auxiliary_loss_clip": 0.01421858, - "auxiliary_loss_mlp": 0.01104718, - "balance_loss_clip": 1.12455702, - "balance_loss_mlp": 1.05002475, - "epoch": 0.010701938974898541, - "flos": 18332577452160.0, - "grad_norm": 2.6517911185675014, - "language_loss": 0.76942402, - "learning_rate": 3.3363026533007716e-06, - "loss": 0.79468977, - "num_input_tokens_seen": 3756360, - "step": 178, - "time_per_iteration": 4.337082386016846 - }, - { - "auxiliary_loss_clip": 0.01430293, - "auxiliary_loss_mlp": 0.01109414, - "balance_loss_clip": 1.1303575, - "balance_loss_mlp": 1.05252683, - "epoch": 0.010762062227566512, - "flos": 19203985399680.0, - "grad_norm": 2.6843360372821925, - "language_loss": 0.84022826, - "learning_rate": 3.3399096777683303e-06, - "loss": 0.86562538, - "num_input_tokens_seen": 3773930, - "step": 179, - "time_per_iteration": 2.6826629638671875 - }, - { - "auxiliary_loss_clip": 0.01418094, - "auxiliary_loss_mlp": 0.01108667, - "balance_loss_clip": 1.12202275, - "balance_loss_mlp": 1.05158973, - "epoch": 0.01082218548023448, - "flos": 31425427370880.0, - "grad_norm": 2.0256655839140083, - "language_loss": 0.83674574, - "learning_rate": 3.3434966072878213e-06, - "loss": 0.86201334, - "num_input_tokens_seen": 3793630, - "step": 180, - "time_per_iteration": 2.7483785152435303 - }, - { - "auxiliary_loss_clip": 0.01421326, - "auxiliary_loss_mlp": 0.01120347, - "balance_loss_clip": 1.12740374, - "balance_loss_mlp": 1.0646286, - "epoch": 0.01088230873290245, - "flos": 25046436170880.0, - "grad_norm": 3.253139118534122, - "language_loss": 0.77958715, - "learning_rate": 3.3470636645196674e-06, - "loss": 0.80500388, - "num_input_tokens_seen": 3813610, - "step": 181, - "time_per_iteration": 2.698941469192505 - }, - { - "auxiliary_loss_clip": 0.01414948, - "auxiliary_loss_mlp": 0.01130231, - "balance_loss_clip": 1.12188053, - "balance_loss_mlp": 1.07577634, - "epoch": 0.01094243198557042, - "flos": 22893411980160.0, - "grad_norm": 2.56637338396407, - "language_loss": 0.76438594, - "learning_rate": 3.3506110684439156e-06, - "loss": 0.78983772, - "num_input_tokens_seen": 3831390, - "step": 182, - "time_per_iteration": 2.6951375007629395 - }, - { - "auxiliary_loss_clip": 0.01412526, - "auxiliary_loss_mlp": 0.01126665, - "balance_loss_clip": 1.12167537, - "balance_loss_mlp": 1.0702554, - "epoch": 0.011002555238238388, - "flos": 17165049782400.0, - "grad_norm": 2.083158831639218, - "language_loss": 0.87484097, - "learning_rate": 3.3541390344409054e-06, - "loss": 0.90023291, - "num_input_tokens_seen": 3849705, - "step": 183, - "time_per_iteration": 2.733753204345703 - }, - { - "auxiliary_loss_clip": 0.01415922, - "auxiliary_loss_mlp": 0.01110585, - "balance_loss_clip": 1.12529624, - "balance_loss_mlp": 1.05922985, - "epoch": 0.011062678490906358, - "flos": 22310150935680.0, - "grad_norm": 3.105080129831269, - "language_loss": 0.86911464, - "learning_rate": 3.357647774369736e-06, - "loss": 0.89437973, - "num_input_tokens_seen": 3869230, - "step": 184, - "time_per_iteration": 2.6783828735351562 - }, - { - "auxiliary_loss_clip": 0.01410648, - "auxiliary_loss_mlp": 0.01108321, - "balance_loss_clip": 1.12499499, - "balance_loss_mlp": 1.05203021, - "epoch": 0.011122801743574327, - "flos": 24388373053440.0, - "grad_norm": 1.8650514063709744, - "language_loss": 0.83885491, - "learning_rate": 3.3611374966446085e-06, - "loss": 0.86404455, - "num_input_tokens_seen": 3889735, - "step": 185, - "time_per_iteration": 2.6863327026367188 - }, - { - "auxiliary_loss_clip": 0.01419384, - "auxiliary_loss_mlp": 0.01107812, - "balance_loss_clip": 1.12355363, - "balance_loss_mlp": 1.04999495, - "epoch": 0.011182924996242297, - "flos": 18150258994560.0, - "grad_norm": 2.8933407749520743, - "language_loss": 0.71027243, - "learning_rate": 3.3646084063091142e-06, - "loss": 0.73554444, - "num_input_tokens_seen": 3908855, - "step": 186, - "time_per_iteration": 2.819805383682251 - }, - { - "auxiliary_loss_clip": 0.01415699, - "auxiliary_loss_mlp": 0.01108312, - "balance_loss_clip": 1.12262082, - "balance_loss_mlp": 1.05574071, - "epoch": 0.011243048248910266, - "flos": 15486800584320.0, - "grad_norm": 2.4244794785226733, - "language_loss": 1.01999915, - "learning_rate": 3.3680607051085194e-06, - "loss": 1.04523933, - "num_input_tokens_seen": 3923865, - "step": 187, - "time_per_iteration": 2.65875506401062 - }, - { - "auxiliary_loss_clip": 0.01404987, - "auxiliary_loss_mlp": 0.01107995, - "balance_loss_clip": 1.12269068, - "balance_loss_mlp": 1.05253887, - "epoch": 0.011303171501578235, - "flos": 40916868986880.0, - "grad_norm": 2.0089158406542524, - "language_loss": 0.74998611, - "learning_rate": 3.371494591560139e-06, - "loss": 0.77511597, - "num_input_tokens_seen": 3946870, - "step": 188, - "time_per_iteration": 2.8631174564361572 - }, - { - "auxiliary_loss_clip": 0.01298557, - "auxiliary_loss_mlp": 0.01067058, - "balance_loss_clip": 1.14124644, - "balance_loss_mlp": 1.04474187, - "epoch": 0.011363294754246205, - "flos": 66302697790080.0, - "grad_norm": 0.7620731385906954, - "language_loss": 0.56192517, - "learning_rate": 3.3749102610218297e-06, - "loss": 0.5855813, - "num_input_tokens_seen": 4010005, - "step": 189, - "time_per_iteration": 3.2704074382781982 - }, - { - "auxiliary_loss_clip": 0.01402206, - "auxiliary_loss_mlp": 0.011217, - "balance_loss_clip": 1.11730003, - "balance_loss_mlp": 1.06662548, - "epoch": 0.011423418006914174, - "flos": 24900279730560.0, - "grad_norm": 2.640219984380571, - "language_loss": 0.95085573, - "learning_rate": 3.3783079057586833e-06, - "loss": 0.97609472, - "num_input_tokens_seen": 4029035, - "step": 190, - "time_per_iteration": 2.6898255348205566 - }, - { - "auxiliary_loss_clip": 0.01405088, - "auxiliary_loss_mlp": 0.01103893, - "balance_loss_clip": 1.11979234, - "balance_loss_mlp": 1.05167961, - "epoch": 0.011483541259582144, - "flos": 19791879298560.0, - "grad_norm": 4.133813113517846, - "language_loss": 0.8463847, - "learning_rate": 3.3816877150079665e-06, - "loss": 0.8714745, - "num_input_tokens_seen": 4046995, - "step": 191, - "time_per_iteration": 2.71589994430542 - }, - { - "auxiliary_loss_clip": 0.01403196, - "auxiliary_loss_mlp": 0.01118385, - "balance_loss_clip": 1.11570346, - "balance_loss_mlp": 1.06624269, - "epoch": 0.011543664512250112, - "flos": 26176939896960.0, - "grad_norm": 2.0065119945705887, - "language_loss": 0.91894913, - "learning_rate": 3.385049875042367e-06, - "loss": 0.94416493, - "num_input_tokens_seen": 4065865, - "step": 192, - "time_per_iteration": 2.775974988937378 - }, - { - "auxiliary_loss_clip": 0.01398496, - "auxiliary_loss_mlp": 0.01118924, - "balance_loss_clip": 1.11665678, - "balance_loss_mlp": 1.06117916, - "epoch": 0.011603787764918083, - "flos": 23768985905280.0, - "grad_norm": 2.10033302347605, - "language_loss": 0.86923265, - "learning_rate": 3.3883945692315938e-06, - "loss": 0.89440691, - "num_input_tokens_seen": 4085305, - "step": 193, - "time_per_iteration": 2.792947292327881 - }, - { - "auxiliary_loss_clip": 0.01402535, - "auxiliary_loss_mlp": 0.01102276, - "balance_loss_clip": 1.11514282, - "balance_loss_mlp": 1.05061066, - "epoch": 0.011663911017586051, - "flos": 25954688494080.0, - "grad_norm": 2.2253165290939076, - "language_loss": 0.92296255, - "learning_rate": 3.3917219781023906e-06, - "loss": 0.94801068, - "num_input_tokens_seen": 4105185, - "step": 194, - "time_per_iteration": 2.6886558532714844 - }, - { - "auxiliary_loss_clip": 0.01407209, - "auxiliary_loss_mlp": 0.01108641, - "balance_loss_clip": 1.11930478, - "balance_loss_mlp": 1.05630851, - "epoch": 0.01172403427025402, - "flos": 17895149625600.0, - "grad_norm": 2.4241235245311503, - "language_loss": 0.89768875, - "learning_rate": 3.3950322793970014e-06, - "loss": 0.92284721, - "num_input_tokens_seen": 4123160, - "step": 195, - "time_per_iteration": 2.654517889022827 - }, - { - "auxiliary_loss_clip": 0.01400339, - "auxiliary_loss_mlp": 0.01114485, - "balance_loss_clip": 1.11779022, - "balance_loss_mlp": 1.05981565, - "epoch": 0.01178415752292199, - "flos": 17894539094400.0, - "grad_norm": 3.1130999341447385, - "language_loss": 0.86019921, - "learning_rate": 3.3983256481301445e-06, - "loss": 0.88534749, - "num_input_tokens_seen": 4140425, - "step": 196, - "time_per_iteration": 2.643598794937134 - }, - { - "auxiliary_loss_clip": 0.01398067, - "auxiliary_loss_mlp": 0.01107082, - "balance_loss_clip": 1.11464977, - "balance_loss_mlp": 1.05308056, - "epoch": 0.011844280775589959, - "flos": 22893555634560.0, - "grad_norm": 3.666533247373141, - "language_loss": 0.93052697, - "learning_rate": 3.4016022566445335e-06, - "loss": 0.95557845, - "num_input_tokens_seen": 4159555, - "step": 197, - "time_per_iteration": 2.7120354175567627 - }, - { - "auxiliary_loss_clip": 0.01396424, - "auxiliary_loss_mlp": 0.01112388, - "balance_loss_clip": 1.11625624, - "balance_loss_mlp": 1.05943501, - "epoch": 0.01190440402825793, - "flos": 26980333441920.0, - "grad_norm": 1.9614954763997827, - "language_loss": 0.79043806, - "learning_rate": 3.4048622746649966e-06, - "loss": 0.81552619, - "num_input_tokens_seen": 4180480, - "step": 198, - "time_per_iteration": 2.774059772491455 - }, - { - "auxiliary_loss_clip": 0.0139305, - "auxiliary_loss_mlp": 0.01120527, - "balance_loss_clip": 1.11708748, - "balance_loss_mlp": 1.06821764, - "epoch": 0.011964527280925898, - "flos": 20521584092160.0, - "grad_norm": 1.8823459083646328, - "language_loss": 0.88239717, - "learning_rate": 3.4081058693512278e-06, - "loss": 0.90753293, - "num_input_tokens_seen": 4198835, - "step": 199, - "time_per_iteration": 2.6808881759643555 - }, - { - "auxiliary_loss_clip": 0.01403709, - "auxiliary_loss_mlp": 0.0112899, - "balance_loss_clip": 1.11951399, - "balance_loss_mlp": 1.07200766, - "epoch": 0.012024650533593867, - "flos": 27745984771200.0, - "grad_norm": 2.0663906916258497, - "language_loss": 0.81151628, - "learning_rate": 3.411333205349222e-06, - "loss": 0.83684325, - "num_input_tokens_seen": 4219335, - "step": 200, - "time_per_iteration": 2.625380516052246 - }, - { - "auxiliary_loss_clip": 0.0140201, - "auxiliary_loss_mlp": 0.01104413, - "balance_loss_clip": 1.11633158, - "balance_loss_mlp": 1.05048287, - "epoch": 0.012084773786261837, - "flos": 10452017076480.0, - "grad_norm": 2.253120238884594, - "language_loss": 0.87696433, - "learning_rate": 3.4145444448414217e-06, - "loss": 0.90202856, - "num_input_tokens_seen": 4236940, - "step": 201, - "time_per_iteration": 2.6062326431274414 - }, - { - "auxiliary_loss_clip": 0.01399494, - "auxiliary_loss_mlp": 0.01115643, - "balance_loss_clip": 1.11764228, - "balance_loss_mlp": 1.0614028, - "epoch": 0.012144897038929806, - "flos": 23105751229440.0, - "grad_norm": 2.088192664231089, - "language_loss": 0.84052485, - "learning_rate": 3.4177397475956223e-06, - "loss": 0.86567622, - "num_input_tokens_seen": 4256755, - "step": 202, - "time_per_iteration": 2.6981592178344727 - }, - { - "auxiliary_loss_clip": 0.01388741, - "auxiliary_loss_mlp": 0.0111019, - "balance_loss_clip": 1.11006808, - "balance_loss_mlp": 1.05771446, - "epoch": 0.012205020291597776, - "flos": 21033203460480.0, - "grad_norm": 1.7861279575653157, - "language_loss": 0.89964712, - "learning_rate": 3.4209192710126685e-06, - "loss": 0.92463642, - "num_input_tokens_seen": 4276505, - "step": 203, - "time_per_iteration": 2.668757438659668 - }, - { - "auxiliary_loss_clip": 0.01276289, - "auxiliary_loss_mlp": 0.01095021, - "balance_loss_clip": 1.12578154, - "balance_loss_mlp": 1.07470798, - "epoch": 0.012265143544265745, - "flos": 68447785075200.0, - "grad_norm": 1.0265297625980543, - "language_loss": 0.61255801, - "learning_rate": 3.4240831701729837e-06, - "loss": 0.63627112, - "num_input_tokens_seen": 4330965, - "step": 204, - "time_per_iteration": 3.161599636077881 - }, - { - "auxiliary_loss_clip": 0.01396271, - "auxiliary_loss_mlp": 0.01111806, - "balance_loss_clip": 1.11291122, - "balance_loss_mlp": 1.05930579, - "epoch": 0.012325266796933715, - "flos": 17019252478080.0, - "grad_norm": 2.3248674300118184, - "language_loss": 0.91324663, - "learning_rate": 3.4272315978819516e-06, - "loss": 0.93832743, - "num_input_tokens_seen": 4348200, - "step": 205, - "time_per_iteration": 2.6764047145843506 - }, - { - "auxiliary_loss_clip": 0.01404558, - "auxiliary_loss_mlp": 0.0112167, - "balance_loss_clip": 1.11773109, - "balance_loss_mlp": 1.06773925, - "epoch": 0.012385390049601683, - "flos": 20190056538240.0, - "grad_norm": 2.1088315130515207, - "language_loss": 0.89305568, - "learning_rate": 3.4303647047142043e-06, - "loss": 0.91831797, - "num_input_tokens_seen": 4365460, - "step": 206, - "time_per_iteration": 2.7157227993011475 - }, - { - "auxiliary_loss_clip": 0.0139534, - "auxiliary_loss_mlp": 0.01100957, - "balance_loss_clip": 1.11176991, - "balance_loss_mlp": 1.04888678, - "epoch": 0.012445513302269652, - "flos": 16253134272000.0, - "grad_norm": 2.399816031687551, - "language_loss": 0.95542914, - "learning_rate": 3.43348263905683e-06, - "loss": 0.9803921, - "num_input_tokens_seen": 4383650, - "step": 207, - "time_per_iteration": 2.611348867416382 - }, - { - "auxiliary_loss_clip": 0.01393005, - "auxiliary_loss_mlp": 0.01117764, - "balance_loss_clip": 1.11658561, - "balance_loss_mlp": 1.06497812, - "epoch": 0.012505636554937622, - "flos": 23769380954880.0, - "grad_norm": 1.8144323603981871, - "language_loss": 0.75985783, - "learning_rate": 3.436585547151547e-06, - "loss": 0.78496552, - "num_input_tokens_seen": 4403765, - "step": 208, - "time_per_iteration": 2.7184154987335205 - }, - { - "auxiliary_loss_clip": 0.0138146, - "auxiliary_loss_mlp": 0.01108623, - "balance_loss_clip": 1.11071992, - "balance_loss_mlp": 1.05576587, - "epoch": 0.012565759807605591, - "flos": 30591546157440.0, - "grad_norm": 2.2326965650696855, - "language_loss": 0.98386943, - "learning_rate": 3.4396735731358586e-06, - "loss": 1.00877023, - "num_input_tokens_seen": 4421935, - "step": 209, - "time_per_iteration": 2.7354249954223633 - }, - { - "auxiliary_loss_clip": 0.01387012, - "auxiliary_loss_mlp": 0.0111836, - "balance_loss_clip": 1.11136842, - "balance_loss_mlp": 1.06490695, - "epoch": 0.012625883060273561, - "flos": 40113511355520.0, - "grad_norm": 9.084733304650118, - "language_loss": 0.85514843, - "learning_rate": 3.4427468590832302e-06, - "loss": 0.88020217, - "num_input_tokens_seen": 4441470, - "step": 210, - "time_per_iteration": 2.888749122619629 - }, - { - "auxiliary_loss_clip": 0.01384384, - "auxiliary_loss_mlp": 0.01121559, - "balance_loss_clip": 1.11018038, - "balance_loss_mlp": 1.07115781, - "epoch": 0.01268600631294153, - "flos": 27089178629760.0, - "grad_norm": 3.431917100192063, - "language_loss": 0.97194636, - "learning_rate": 3.445805545042314e-06, - "loss": 0.99700582, - "num_input_tokens_seen": 4459950, - "step": 211, - "time_per_iteration": 2.7465193271636963 - }, - { - "auxiliary_loss_clip": 0.01393556, - "auxiliary_loss_mlp": 0.01123542, - "balance_loss_clip": 1.11511767, - "balance_loss_mlp": 1.06999326, - "epoch": 0.012746129565609499, - "flos": 16982767238400.0, - "grad_norm": 2.3992368053115163, - "language_loss": 0.9508543, - "learning_rate": 3.448849769075239e-06, - "loss": 0.97602528, - "num_input_tokens_seen": 4478390, - "step": 212, - "time_per_iteration": 2.6340651512145996 - }, - { - "auxiliary_loss_clip": 0.01381697, - "auxiliary_loss_mlp": 0.01116386, - "balance_loss_clip": 1.112149, - "balance_loss_mlp": 1.06381512, - "epoch": 0.012806252818277469, - "flos": 46533476995200.0, - "grad_norm": 1.701444843398511, - "language_loss": 0.76078421, - "learning_rate": 3.4518796672950093e-06, - "loss": 0.78576505, - "num_input_tokens_seen": 4501665, - "step": 213, - "time_per_iteration": 2.9250640869140625 - }, - { - "auxiliary_loss_clip": 0.01385821, - "auxiliary_loss_mlp": 0.01111776, - "balance_loss_clip": 1.11002433, - "balance_loss_mlp": 1.06056333, - "epoch": 0.012866376070945438, - "flos": 14388616120320.0, - "grad_norm": 3.5300370267625922, - "language_loss": 0.86698866, - "learning_rate": 3.4548953739020187e-06, - "loss": 0.89196461, - "num_input_tokens_seen": 4519055, - "step": 214, - "time_per_iteration": 2.645289659500122 - }, - { - "auxiliary_loss_clip": 0.01383455, - "auxiliary_loss_mlp": 0.01128262, - "balance_loss_clip": 1.1159339, - "balance_loss_mlp": 1.07359219, - "epoch": 0.012926499323613408, - "flos": 26140813793280.0, - "grad_norm": 2.14433888305053, - "language_loss": 0.77582061, - "learning_rate": 3.4578970212197196e-06, - "loss": 0.80093777, - "num_input_tokens_seen": 4540870, - "step": 215, - "time_per_iteration": 2.7315175533294678 - }, - { - "auxiliary_loss_clip": 0.01391951, - "auxiliary_loss_mlp": 0.01115104, - "balance_loss_clip": 1.11440635, - "balance_loss_mlp": 1.0638206, - "epoch": 0.012986622576281377, - "flos": 30117202128000.0, - "grad_norm": 2.2964706747038233, - "language_loss": 0.90423942, - "learning_rate": 3.460884739729461e-06, - "loss": 0.92930996, - "num_input_tokens_seen": 4560395, - "step": 216, - "time_per_iteration": 2.724698781967163 - }, - { - "auxiliary_loss_clip": 0.01384729, - "auxiliary_loss_mlp": 0.01113374, - "balance_loss_clip": 1.10847259, - "balance_loss_mlp": 1.06096959, - "epoch": 0.013046745828949347, - "flos": 13954025468160.0, - "grad_norm": 3.60062834696173, - "language_loss": 0.93473232, - "learning_rate": 3.463858658104523e-06, - "loss": 0.95971346, - "num_input_tokens_seen": 4575785, - "step": 217, - "time_per_iteration": 5.762276649475098 - }, - { - "auxiliary_loss_clip": 0.01377712, - "auxiliary_loss_mlp": 0.0110874, - "balance_loss_clip": 1.10726643, - "balance_loss_mlp": 1.05433273, - "epoch": 0.013106869081617315, - "flos": 17347835116800.0, - "grad_norm": 1.943339896357513, - "language_loss": 0.93811166, - "learning_rate": 3.4668189032433696e-06, - "loss": 0.96297616, - "num_input_tokens_seen": 4594985, - "step": 218, - "time_per_iteration": 5.832701206207275 - }, - { - "auxiliary_loss_clip": 0.01372884, - "auxiliary_loss_mlp": 0.01106717, - "balance_loss_clip": 1.10647273, - "balance_loss_mlp": 1.05552888, - "epoch": 0.013166992334285284, - "flos": 25884914325120.0, - "grad_norm": 2.252873600345955, - "language_loss": 0.86196327, - "learning_rate": 3.46976560030214e-06, - "loss": 0.88675928, - "num_input_tokens_seen": 4616125, - "step": 219, - "time_per_iteration": 2.794581651687622 - }, - { - "auxiliary_loss_clip": 0.0137885, - "auxiliary_loss_mlp": 0.01102953, - "balance_loss_clip": 1.10957599, - "balance_loss_mlp": 1.05188394, - "epoch": 0.013227115586953254, - "flos": 31175956437120.0, - "grad_norm": 1.897987121161891, - "language_loss": 0.8748548, - "learning_rate": 3.4726988727263976e-06, - "loss": 0.89967287, - "num_input_tokens_seen": 4637795, - "step": 220, - "time_per_iteration": 2.799927234649658 - }, - { - "auxiliary_loss_clip": 0.01370688, - "auxiliary_loss_mlp": 0.01115596, - "balance_loss_clip": 1.10440111, - "balance_loss_mlp": 1.0679127, - "epoch": 0.013287238839621223, - "flos": 20409470766720.0, - "grad_norm": 3.2557072980071795, - "language_loss": 0.86437249, - "learning_rate": 3.475618842282164e-06, - "loss": 0.88923532, - "num_input_tokens_seen": 4656835, - "step": 221, - "time_per_iteration": 2.7040672302246094 - }, - { - "auxiliary_loss_clip": 0.01376134, - "auxiliary_loss_mlp": 0.01116397, - "balance_loss_clip": 1.10384834, - "balance_loss_mlp": 1.0637064, - "epoch": 0.013347362092289193, - "flos": 14137134024960.0, - "grad_norm": 2.585706849100757, - "language_loss": 0.92369294, - "learning_rate": 3.4785256290862486e-06, - "loss": 0.94861829, - "num_input_tokens_seen": 4673015, - "step": 222, - "time_per_iteration": 2.6648194789886475 - }, - { - "auxiliary_loss_clip": 0.01373283, - "auxiliary_loss_mlp": 0.01106423, - "balance_loss_clip": 1.10636806, - "balance_loss_mlp": 1.05156267, - "epoch": 0.013407485344957162, - "flos": 21797705554560.0, - "grad_norm": 7.739608779999776, - "language_loss": 0.95708215, - "learning_rate": 3.481419351635897e-06, - "loss": 0.98187923, - "num_input_tokens_seen": 4692355, - "step": 223, - "time_per_iteration": 2.7261807918548584 - }, - { - "auxiliary_loss_clip": 0.01374555, - "auxiliary_loss_mlp": 0.0110963, - "balance_loss_clip": 1.10768425, - "balance_loss_mlp": 1.05870414, - "epoch": 0.013467608597625132, - "flos": 18621622195200.0, - "grad_norm": 2.673591615227502, - "language_loss": 0.88031876, - "learning_rate": 3.484300126837776e-06, - "loss": 0.90516055, - "num_input_tokens_seen": 4710080, - "step": 224, - "time_per_iteration": 2.601686477661133 - }, - { - "auxiliary_loss_clip": 0.01374533, - "auxiliary_loss_mlp": 0.01103, - "balance_loss_clip": 1.10679817, - "balance_loss_mlp": 1.04804444, - "epoch": 0.013527731850293101, - "flos": 18552314903040.0, - "grad_norm": 3.0722216996453535, - "language_loss": 0.89625597, - "learning_rate": 3.487168070036317e-06, - "loss": 0.9210313, - "num_input_tokens_seen": 4728980, - "step": 225, - "time_per_iteration": 2.6677513122558594 - }, - { - "auxiliary_loss_clip": 0.01369955, - "auxiliary_loss_mlp": 0.0112021, - "balance_loss_clip": 1.10561275, - "balance_loss_mlp": 1.06675696, - "epoch": 0.01358785510296107, - "flos": 19165381257600.0, - "grad_norm": 1.9576206039109396, - "language_loss": 0.98980033, - "learning_rate": 3.4900232950414224e-06, - "loss": 1.01470196, - "num_input_tokens_seen": 4747020, - "step": 226, - "time_per_iteration": 2.8320930004119873 - }, - { - "auxiliary_loss_clip": 0.01375268, - "auxiliary_loss_mlp": 0.01110039, - "balance_loss_clip": 1.10837173, - "balance_loss_mlp": 1.05572701, - "epoch": 0.01364797835562904, - "flos": 23329941966720.0, - "grad_norm": 2.3303410550109245, - "language_loss": 0.90965348, - "learning_rate": 3.4928659141555727e-06, - "loss": 0.93450654, - "num_input_tokens_seen": 4765000, - "step": 227, - "time_per_iteration": 2.648606061935425 - }, - { - "auxiliary_loss_clip": 0.01255161, - "auxiliary_loss_mlp": 0.01079249, - "balance_loss_clip": 1.11229861, - "balance_loss_mlp": 1.06017554, - "epoch": 0.013708101608297009, - "flos": 70993746097920.0, - "grad_norm": 0.9472069433514878, - "language_loss": 0.57650995, - "learning_rate": 3.4956960382003234e-06, - "loss": 0.59985405, - "num_input_tokens_seen": 4833210, - "step": 228, - "time_per_iteration": 3.246328592300415 - }, - { - "auxiliary_loss_clip": 0.01366835, - "auxiliary_loss_mlp": 0.01117377, - "balance_loss_clip": 1.10507822, - "balance_loss_mlp": 1.06711841, - "epoch": 0.013768224860964979, - "flos": 16325170997760.0, - "grad_norm": 2.957038430634678, - "language_loss": 0.87773621, - "learning_rate": 3.4985137765422354e-06, - "loss": 0.90257835, - "num_input_tokens_seen": 4850120, - "step": 229, - "time_per_iteration": 2.6319024562835693 - }, - { - "auxiliary_loss_clip": 0.01375278, - "auxiliary_loss_mlp": 0.01098609, - "balance_loss_clip": 1.10567176, - "balance_loss_mlp": 1.04873204, - "epoch": 0.013828348113632948, - "flos": 20193037367040.0, - "grad_norm": 4.72663824849547, - "language_loss": 0.83937395, - "learning_rate": 3.501319237118231e-06, - "loss": 0.86411285, - "num_input_tokens_seen": 4866215, - "step": 230, - "time_per_iteration": 2.7026398181915283 - }, - { - "auxiliary_loss_clip": 0.01373544, - "auxiliary_loss_mlp": 0.01113683, - "balance_loss_clip": 1.10701275, - "balance_loss_mlp": 1.06361556, - "epoch": 0.013888471366300916, - "flos": 20741070147840.0, - "grad_norm": 2.2562202151287867, - "language_loss": 0.904212, - "learning_rate": 3.5041125264604056e-06, - "loss": 0.9290843, - "num_input_tokens_seen": 4885630, - "step": 231, - "time_per_iteration": 2.6424474716186523 - }, - { - "auxiliary_loss_clip": 0.01377759, - "auxiliary_loss_mlp": 0.01110232, - "balance_loss_clip": 1.11118639, - "balance_loss_mlp": 1.06030726, - "epoch": 0.013948594618968886, - "flos": 22090628966400.0, - "grad_norm": 2.0229562700819215, - "language_loss": 0.83624899, - "learning_rate": 3.5068937497203002e-06, - "loss": 0.86112887, - "num_input_tokens_seen": 4905570, - "step": 232, - "time_per_iteration": 2.621704339981079 - }, - { - "auxiliary_loss_clip": 0.01377798, - "auxiliary_loss_mlp": 0.01094369, - "balance_loss_clip": 1.10229027, - "balance_loss_mlp": 1.04253721, - "epoch": 0.014008717871636855, - "flos": 19063108258560.0, - "grad_norm": 5.516695444379509, - "language_loss": 0.74727643, - "learning_rate": 3.509663010692652e-06, - "loss": 0.77199805, - "num_input_tokens_seen": 4923535, - "step": 233, - "time_per_iteration": 2.659188747406006 - }, - { - "auxiliary_loss_clip": 0.01382744, - "auxiliary_loss_mlp": 0.01125121, - "balance_loss_clip": 1.1099937, - "balance_loss_mlp": 1.0723356, - "epoch": 0.014068841124304825, - "flos": 14530822064640.0, - "grad_norm": 2.5763093382937483, - "language_loss": 0.85633421, - "learning_rate": 3.512420411838642e-06, - "loss": 0.88141286, - "num_input_tokens_seen": 4939200, - "step": 234, - "time_per_iteration": 2.610635757446289 - }, - { - "auxiliary_loss_clip": 0.01374562, - "auxiliary_loss_mlp": 0.01114672, - "balance_loss_clip": 1.10890436, - "balance_loss_mlp": 1.06467605, - "epoch": 0.014128964376972794, - "flos": 18077396256000.0, - "grad_norm": 2.467487286445388, - "language_loss": 0.89192498, - "learning_rate": 3.515166054308634e-06, - "loss": 0.91681731, - "num_input_tokens_seen": 4956620, - "step": 235, - "time_per_iteration": 2.668769359588623 - }, - { - "auxiliary_loss_clip": 0.01373018, - "auxiliary_loss_mlp": 0.01131641, - "balance_loss_clip": 1.11011076, - "balance_loss_mlp": 1.08073914, - "epoch": 0.014189087629640764, - "flos": 25334331678720.0, - "grad_norm": 2.143165146200321, - "language_loss": 0.85535377, - "learning_rate": 3.5179000379644498e-06, - "loss": 0.88040036, - "num_input_tokens_seen": 4975650, - "step": 236, - "time_per_iteration": 2.7570323944091797 - }, - { - "auxiliary_loss_clip": 0.01369632, - "auxiliary_loss_mlp": 0.01100269, - "balance_loss_clip": 1.10296702, - "balance_loss_mlp": 1.04905629, - "epoch": 0.014249210882308733, - "flos": 36139744713600.0, - "grad_norm": 2.1351980688483136, - "language_loss": 0.82550979, - "learning_rate": 3.520622461401154e-06, - "loss": 0.85020876, - "num_input_tokens_seen": 4997415, - "step": 237, - "time_per_iteration": 2.811617374420166 - }, - { - "auxiliary_loss_clip": 0.01369728, - "auxiliary_loss_mlp": 0.01124352, - "balance_loss_clip": 1.10659075, - "balance_loss_mlp": 1.07085085, - "epoch": 0.014309334134976702, - "flos": 12932977461120.0, - "grad_norm": 2.0241581748099313, - "language_loss": 0.77096599, - "learning_rate": 3.5233334219683935e-06, - "loss": 0.79590684, - "num_input_tokens_seen": 5013905, - "step": 238, - "time_per_iteration": 2.8044662475585938 - }, - { - "auxiliary_loss_clip": 0.01367496, - "auxiliary_loss_mlp": 0.01111406, - "balance_loss_clip": 1.10897434, - "balance_loss_mlp": 1.06343579, - "epoch": 0.014369457387644672, - "flos": 20777519473920.0, - "grad_norm": 1.8300428555870456, - "language_loss": 0.8707583, - "learning_rate": 3.526033015791284e-06, - "loss": 0.89554727, - "num_input_tokens_seen": 5033645, - "step": 239, - "time_per_iteration": 2.681452751159668 - }, - { - "auxiliary_loss_clip": 0.01353036, - "auxiliary_loss_mlp": 0.01103184, - "balance_loss_clip": 1.10036874, - "balance_loss_mlp": 1.05516672, - "epoch": 0.01442958064031264, - "flos": 25848536826240.0, - "grad_norm": 2.109315431148974, - "language_loss": 0.93055749, - "learning_rate": 3.528721337790862e-06, - "loss": 0.95511973, - "num_input_tokens_seen": 5052875, - "step": 240, - "time_per_iteration": 2.679826021194458 - }, - { - "auxiliary_loss_clip": 0.01360794, - "auxiliary_loss_mlp": 0.01103084, - "balance_loss_clip": 1.10475957, - "balance_loss_mlp": 1.05611515, - "epoch": 0.014489703892980611, - "flos": 28219718269440.0, - "grad_norm": 3.7136133710916575, - "language_loss": 0.8482846, - "learning_rate": 3.531398481704111e-06, - "loss": 0.87292337, - "num_input_tokens_seen": 5075005, - "step": 241, - "time_per_iteration": 2.679126262664795 - }, - { - "auxiliary_loss_clip": 0.01359518, - "auxiliary_loss_mlp": 0.01119602, - "balance_loss_clip": 1.11010456, - "balance_loss_mlp": 1.06931913, - "epoch": 0.01454982714564858, - "flos": 22490925108480.0, - "grad_norm": 1.8502491938168453, - "language_loss": 0.88590866, - "learning_rate": 3.534064540103573e-06, - "loss": 0.9106999, - "num_input_tokens_seen": 5091875, - "step": 242, - "time_per_iteration": 2.7366583347320557 - }, - { - "auxiliary_loss_clip": 0.01359534, - "auxiliary_loss_mlp": 0.01104713, - "balance_loss_clip": 1.10356677, - "balance_loss_mlp": 1.05342889, - "epoch": 0.014609950398316548, - "flos": 21653201139840.0, - "grad_norm": 2.261458758817042, - "language_loss": 0.86688942, - "learning_rate": 3.536719604416555e-06, - "loss": 0.89153194, - "num_input_tokens_seen": 5111290, - "step": 243, - "time_per_iteration": 2.764378070831299 - }, - { - "auxiliary_loss_clip": 0.01364897, - "auxiliary_loss_mlp": 0.01106776, - "balance_loss_clip": 1.10636568, - "balance_loss_mlp": 1.05656552, - "epoch": 0.014670073650984519, - "flos": 21869993675520.0, - "grad_norm": 1.6964959858678799, - "language_loss": 0.84256208, - "learning_rate": 3.5393637649439464e-06, - "loss": 0.86727887, - "num_input_tokens_seen": 5132265, - "step": 244, - "time_per_iteration": 2.630441188812256 - }, - { - "auxiliary_loss_clip": 0.01372266, - "auxiliary_loss_mlp": 0.01115072, - "balance_loss_clip": 1.10771632, - "balance_loss_mlp": 1.06328762, - "epoch": 0.014730196903652487, - "flos": 23183713699200.0, - "grad_norm": 8.49550264430495, - "language_loss": 0.78613877, - "learning_rate": 3.54199711087864e-06, - "loss": 0.81101215, - "num_input_tokens_seen": 5148575, - "step": 245, - "time_per_iteration": 2.6991443634033203 - }, - { - "auxiliary_loss_clip": 0.01371598, - "auxiliary_loss_mlp": 0.0110404, - "balance_loss_clip": 1.10405719, - "balance_loss_mlp": 1.05008554, - "epoch": 0.014790320156320457, - "flos": 23222605150080.0, - "grad_norm": 2.2582939339926305, - "language_loss": 0.84165329, - "learning_rate": 3.5446197303235913e-06, - "loss": 0.86640966, - "num_input_tokens_seen": 5170415, - "step": 246, - "time_per_iteration": 2.726743221282959 - }, - { - "auxiliary_loss_clip": 0.01365538, - "auxiliary_loss_mlp": 0.01101456, - "balance_loss_clip": 1.10242295, - "balance_loss_mlp": 1.05062532, - "epoch": 0.014850443408988426, - "flos": 15815490963840.0, - "grad_norm": 1.9870849133800452, - "language_loss": 0.89958012, - "learning_rate": 3.5472317103095034e-06, - "loss": 0.92425001, - "num_input_tokens_seen": 5188565, - "step": 247, - "time_per_iteration": 2.5998406410217285 - }, - { - "auxiliary_loss_clip": 0.01364581, - "auxiliary_loss_mlp": 0.01098108, - "balance_loss_clip": 1.09896278, - "balance_loss_mlp": 1.0489223, - "epoch": 0.014910566661656396, - "flos": 22781657790720.0, - "grad_norm": 2.0527635487774343, - "language_loss": 0.783005, - "learning_rate": 3.549833136812155e-06, - "loss": 0.80763197, - "num_input_tokens_seen": 5207810, - "step": 248, - "time_per_iteration": 2.689784049987793 - }, - { - "auxiliary_loss_clip": 0.01365896, - "auxiliary_loss_mlp": 0.01110511, - "balance_loss_clip": 1.10732806, - "balance_loss_mlp": 1.06044269, - "epoch": 0.014970689914324365, - "flos": 26865023806080.0, - "grad_norm": 1.9405946352322343, - "language_loss": 0.83855766, - "learning_rate": 3.552424094769381e-06, - "loss": 0.86332172, - "num_input_tokens_seen": 5226210, - "step": 249, - "time_per_iteration": 2.8210339546203613 - }, - { - "auxiliary_loss_clip": 0.01358179, - "auxiliary_loss_mlp": 0.01106801, - "balance_loss_clip": 1.10089588, - "balance_loss_mlp": 1.05802023, - "epoch": 0.015030813166992334, - "flos": 13985662371840.0, - "grad_norm": 2.0689026358419786, - "language_loss": 0.93631709, - "learning_rate": 3.5550046680977174e-06, - "loss": 0.96096689, - "num_input_tokens_seen": 5241660, - "step": 250, - "time_per_iteration": 2.7074570655822754 - }, - { - "auxiliary_loss_clip": 0.01368183, - "auxiliary_loss_mlp": 0.01115393, - "balance_loss_clip": 1.1065619, - "balance_loss_mlp": 1.06415713, - "epoch": 0.015090936419660304, - "flos": 24717817618560.0, - "grad_norm": 2.6509740932573127, - "language_loss": 0.9678722, - "learning_rate": 3.5575749397087034e-06, - "loss": 0.99270797, - "num_input_tokens_seen": 5261090, - "step": 251, - "time_per_iteration": 2.6740176677703857 - }, - { - "auxiliary_loss_clip": 0.01361249, - "auxiliary_loss_mlp": 0.01108489, - "balance_loss_clip": 1.10063529, - "balance_loss_mlp": 1.0597558, - "epoch": 0.015151059672328273, - "flos": 25738793798400.0, - "grad_norm": 1.996044018630987, - "language_loss": 0.84516245, - "learning_rate": 3.5601349915248707e-06, - "loss": 0.86985981, - "num_input_tokens_seen": 5279175, - "step": 252, - "time_per_iteration": 2.7198123931884766 - }, - { - "auxiliary_loss_clip": 0.01356789, - "auxiliary_loss_mlp": 0.0111346, - "balance_loss_clip": 1.1023767, - "balance_loss_mlp": 1.06346345, - "epoch": 0.015211182924996243, - "flos": 21871214737920.0, - "grad_norm": 2.3132428526475275, - "language_loss": 0.98516917, - "learning_rate": 3.5626849044954064e-06, - "loss": 1.0098716, - "num_input_tokens_seen": 5296975, - "step": 253, - "time_per_iteration": 2.6751561164855957 - }, - { - "auxiliary_loss_clip": 0.01244193, - "auxiliary_loss_mlp": 0.01100072, - "balance_loss_clip": 1.1058414, - "balance_loss_mlp": 1.08338308, - "epoch": 0.015271306177664212, - "flos": 66895080888960.0, - "grad_norm": 0.8719135194962525, - "language_loss": 0.55628473, - "learning_rate": 3.5652247586115167e-06, - "loss": 0.57972741, - "num_input_tokens_seen": 5358375, - "step": 254, - "time_per_iteration": 3.2305996417999268 - }, - { - "auxiliary_loss_clip": 0.0136146, - "auxiliary_loss_mlp": 0.01119692, - "balance_loss_clip": 1.0985806, - "balance_loss_mlp": 1.06952846, - "epoch": 0.01533142943033218, - "flos": 26834069260800.0, - "grad_norm": 2.113472843461701, - "language_loss": 0.90234184, - "learning_rate": 3.567754632921479e-06, - "loss": 0.92715329, - "num_input_tokens_seen": 5377255, - "step": 255, - "time_per_iteration": 2.7138473987579346 - }, - { - "auxiliary_loss_clip": 0.01357311, - "auxiliary_loss_mlp": 0.01137867, - "balance_loss_clip": 1.1001389, - "balance_loss_mlp": 1.08803785, - "epoch": 0.01539155268300015, - "flos": 20813753318400.0, - "grad_norm": 2.320838285045027, - "language_loss": 0.85392761, - "learning_rate": 3.5702746055454075e-06, - "loss": 0.87887937, - "num_input_tokens_seen": 5395320, - "step": 256, - "time_per_iteration": 2.7135775089263916 - }, - { - "auxiliary_loss_clip": 0.01363873, - "auxiliary_loss_mlp": 0.0112257, - "balance_loss_clip": 1.10053098, - "balance_loss_mlp": 1.07281172, - "epoch": 0.01545167593566812, - "flos": 15961862885760.0, - "grad_norm": 4.480294478847577, - "language_loss": 0.71472508, - "learning_rate": 3.5727847536897254e-06, - "loss": 0.73958945, - "num_input_tokens_seen": 5411970, - "step": 257, - "time_per_iteration": 6.340675592422485 - }, - { - "auxiliary_loss_clip": 0.01355912, - "auxiliary_loss_mlp": 0.01112611, - "balance_loss_clip": 1.10014856, - "balance_loss_mlp": 1.06280565, - "epoch": 0.01551179918833609, - "flos": 22601745544320.0, - "grad_norm": 2.0292888191897673, - "language_loss": 0.94713151, - "learning_rate": 3.5752851536613596e-06, - "loss": 0.97181678, - "num_input_tokens_seen": 5430245, - "step": 258, - "time_per_iteration": 5.674164772033691 - }, - { - "auxiliary_loss_clip": 0.01356656, - "auxiliary_loss_mlp": 0.01113313, - "balance_loss_clip": 1.09867072, - "balance_loss_mlp": 1.0645566, - "epoch": 0.015571922441004058, - "flos": 22816706486400.0, - "grad_norm": 2.3215886633849236, - "language_loss": 0.93037683, - "learning_rate": 3.577775880881658e-06, - "loss": 0.95507646, - "num_input_tokens_seen": 5448905, - "step": 259, - "time_per_iteration": 2.6286497116088867 - }, - { - "auxiliary_loss_clip": 0.01348977, - "auxiliary_loss_mlp": 0.01102171, - "balance_loss_clip": 1.10076857, - "balance_loss_mlp": 1.05625176, - "epoch": 0.015632045693672027, - "flos": 18947439486720.0, - "grad_norm": 1.9575053933526474, - "language_loss": 0.97368109, - "learning_rate": 3.5802570099000424e-06, - "loss": 0.99819261, - "num_input_tokens_seen": 5466405, - "step": 260, - "time_per_iteration": 2.625072717666626 - }, - { - "auxiliary_loss_clip": 0.01362999, - "auxiliary_loss_mlp": 0.01127943, - "balance_loss_clip": 1.1010474, - "balance_loss_mlp": 1.07940137, - "epoch": 0.015692168946339995, - "flos": 29971728046080.0, - "grad_norm": 2.2828802632863305, - "language_loss": 0.87807435, - "learning_rate": 3.5827286144073947e-06, - "loss": 0.90298378, - "num_input_tokens_seen": 5487055, - "step": 261, - "time_per_iteration": 2.6737279891967773 - }, - { - "auxiliary_loss_clip": 0.01357008, - "auxiliary_loss_mlp": 0.01125312, - "balance_loss_clip": 1.09822345, - "balance_loss_mlp": 1.07665133, - "epoch": 0.015752292199007967, - "flos": 19392085946880.0, - "grad_norm": 5.057676675675106, - "language_loss": 0.67100549, - "learning_rate": 3.5851907672491904e-06, - "loss": 0.69582868, - "num_input_tokens_seen": 5506600, - "step": 262, - "time_per_iteration": 2.651690721511841 - }, - { - "auxiliary_loss_clip": 0.01353953, - "auxiliary_loss_mlp": 0.01135541, - "balance_loss_clip": 1.09924924, - "balance_loss_mlp": 1.08499634, - "epoch": 0.015812415451675936, - "flos": 20339804338560.0, - "grad_norm": 3.0820356667611337, - "language_loss": 0.68077701, - "learning_rate": 3.587643540438383e-06, - "loss": 0.70567191, - "num_input_tokens_seen": 5524350, - "step": 263, - "time_per_iteration": 2.6885130405426025 - }, - { - "auxiliary_loss_clip": 0.01355592, - "auxiliary_loss_mlp": 0.01116799, - "balance_loss_clip": 1.09620881, - "balance_loss_mlp": 1.06766081, - "epoch": 0.015872538704343905, - "flos": 17525412979200.0, - "grad_norm": 3.9089218881424674, - "language_loss": 0.85002583, - "learning_rate": 3.590087005168037e-06, - "loss": 0.87474978, - "num_input_tokens_seen": 5542145, - "step": 264, - "time_per_iteration": 2.6557912826538086 - }, - { - "auxiliary_loss_clip": 0.01360388, - "auxiliary_loss_mlp": 0.01102763, - "balance_loss_clip": 1.10088885, - "balance_loss_mlp": 1.056319, - "epoch": 0.015932661957011873, - "flos": 15260490944640.0, - "grad_norm": 2.7020928553211476, - "language_loss": 1.04234743, - "learning_rate": 3.5925212318237344e-06, - "loss": 1.06697881, - "num_input_tokens_seen": 5557920, - "step": 265, - "time_per_iteration": 2.6262216567993164 - }, - { - "auxiliary_loss_clip": 0.01364512, - "auxiliary_loss_mlp": 0.01120309, - "balance_loss_clip": 1.1033864, - "balance_loss_mlp": 1.06835794, - "epoch": 0.015992785209679845, - "flos": 20302528999680.0, - "grad_norm": 3.1220748516520134, - "language_loss": 0.74914098, - "learning_rate": 3.5949462899957323e-06, - "loss": 0.7739892, - "num_input_tokens_seen": 5576290, - "step": 266, - "time_per_iteration": 2.6244583129882812 - }, - { - "auxiliary_loss_clip": 0.01349738, - "auxiliary_loss_mlp": 0.0111189, - "balance_loss_clip": 1.1000762, - "balance_loss_mlp": 1.06206095, - "epoch": 0.016052908462347814, - "flos": 23362368969600.0, - "grad_norm": 1.8166776194063956, - "language_loss": 0.90909529, - "learning_rate": 3.5973622484909068e-06, - "loss": 0.93371153, - "num_input_tokens_seen": 5595205, - "step": 267, - "time_per_iteration": 2.6753580570220947 - }, - { - "auxiliary_loss_clip": 0.01359091, - "auxiliary_loss_mlp": 0.01115968, - "balance_loss_clip": 1.10122573, - "balance_loss_mlp": 1.06797481, - "epoch": 0.016113031715015783, - "flos": 21286588976640.0, - "grad_norm": 2.450608875877181, - "language_loss": 0.85636413, - "learning_rate": 3.599769175344462e-06, - "loss": 0.88111478, - "num_input_tokens_seen": 5612645, - "step": 268, - "time_per_iteration": 2.7161567211151123 - }, - { - "auxiliary_loss_clip": 0.01351132, - "auxiliary_loss_mlp": 0.01102276, - "balance_loss_clip": 1.10226274, - "balance_loss_mlp": 1.05475891, - "epoch": 0.01617315496768375, - "flos": 18914689261440.0, - "grad_norm": 2.1714201716772457, - "language_loss": 0.88080788, - "learning_rate": 3.602167137831432e-06, - "loss": 0.90534198, - "num_input_tokens_seen": 5628345, - "step": 269, - "time_per_iteration": 2.6403756141662598 - }, - { - "auxiliary_loss_clip": 0.01357907, - "auxiliary_loss_mlp": 0.01111574, - "balance_loss_clip": 1.10001528, - "balance_loss_mlp": 1.06021833, - "epoch": 0.01623327822035172, - "flos": 16546488647040.0, - "grad_norm": 2.5848702107942803, - "language_loss": 0.97077739, - "learning_rate": 3.6045562024779565e-06, - "loss": 0.99547219, - "num_input_tokens_seen": 5645940, - "step": 270, - "time_per_iteration": 2.635546922683716 - }, - { - "auxiliary_loss_clip": 0.01356007, - "auxiliary_loss_mlp": 0.01118132, - "balance_loss_clip": 1.10402, - "balance_loss_mlp": 1.06918478, - "epoch": 0.016293401473019692, - "flos": 23513481486720.0, - "grad_norm": 2.1115750591463223, - "language_loss": 0.86112005, - "learning_rate": 3.606936435072361e-06, - "loss": 0.8858614, - "num_input_tokens_seen": 5665690, - "step": 271, - "time_per_iteration": 2.6877286434173584 - }, - { - "auxiliary_loss_clip": 0.013537, - "auxiliary_loss_mlp": 0.01105687, - "balance_loss_clip": 1.0962286, - "balance_loss_mlp": 1.057693, - "epoch": 0.01635352472568766, - "flos": 29016072748800.0, - "grad_norm": 2.5391912683658413, - "language_loss": 0.81550127, - "learning_rate": 3.609307900676025e-06, - "loss": 0.84009504, - "num_input_tokens_seen": 5683190, - "step": 272, - "time_per_iteration": 2.6728365421295166 - }, - { - "auxiliary_loss_clip": 0.01348527, - "auxiliary_loss_mlp": 0.01120864, - "balance_loss_clip": 1.09806561, - "balance_loss_mlp": 1.07368064, - "epoch": 0.01641364797835563, - "flos": 13370513028480.0, - "grad_norm": 2.3613573538590487, - "language_loss": 0.81075382, - "learning_rate": 3.611670663634051e-06, - "loss": 0.83544779, - "num_input_tokens_seen": 5699780, - "step": 273, - "time_per_iteration": 2.595008134841919 - }, - { - "auxiliary_loss_clip": 0.01346135, - "auxiliary_loss_mlp": 0.01105539, - "balance_loss_clip": 1.09398317, - "balance_loss_mlp": 1.05749762, - "epoch": 0.016473771231023598, - "flos": 18878239935360.0, - "grad_norm": 2.1979313648400547, - "language_loss": 0.9131726, - "learning_rate": 3.614024787585744e-06, - "loss": 0.9376893, - "num_input_tokens_seen": 5716980, - "step": 274, - "time_per_iteration": 2.684718132019043 - }, - { - "auxiliary_loss_clip": 0.013432, - "auxiliary_loss_mlp": 0.01108715, - "balance_loss_clip": 1.09515727, - "balance_loss_mlp": 1.06062579, - "epoch": 0.016533894483691566, - "flos": 22601637803520.0, - "grad_norm": 1.9719932168994616, - "language_loss": 0.88054645, - "learning_rate": 3.6163703354748927e-06, - "loss": 0.90506566, - "num_input_tokens_seen": 5737780, - "step": 275, - "time_per_iteration": 2.7204532623291016 - }, - { - "auxiliary_loss_clip": 0.01346726, - "auxiliary_loss_mlp": 0.01102856, - "balance_loss_clip": 1.09623361, - "balance_loss_mlp": 1.05312169, - "epoch": 0.01659401773635954, - "flos": 21507188353920.0, - "grad_norm": 1.7930545784536995, - "language_loss": 0.80726624, - "learning_rate": 3.6187073695598707e-06, - "loss": 0.83176208, - "num_input_tokens_seen": 5758330, - "step": 276, - "time_per_iteration": 3.04716157913208 - }, - { - "auxiliary_loss_clip": 0.0133817, - "auxiliary_loss_mlp": 0.01096103, - "balance_loss_clip": 1.09588337, - "balance_loss_mlp": 1.05220985, - "epoch": 0.016654140989027507, - "flos": 32850973411200.0, - "grad_norm": 1.9196343116615175, - "language_loss": 0.80707026, - "learning_rate": 3.621035951423551e-06, - "loss": 0.83141291, - "num_input_tokens_seen": 5778340, - "step": 277, - "time_per_iteration": 2.809645652770996 - }, - { - "auxiliary_loss_clip": 0.01337061, - "auxiliary_loss_mlp": 0.0109637, - "balance_loss_clip": 1.08979487, - "balance_loss_mlp": 1.04923487, - "epoch": 0.016714264241695476, - "flos": 12306228024960.0, - "grad_norm": 2.3224792061881185, - "language_loss": 0.80508065, - "learning_rate": 3.623356141983041e-06, - "loss": 0.82941496, - "num_input_tokens_seen": 5794295, - "step": 278, - "time_per_iteration": 2.604830741882324 - }, - { - "auxiliary_loss_clip": 0.01341116, - "auxiliary_loss_mlp": 0.01101968, - "balance_loss_clip": 1.09395671, - "balance_loss_mlp": 1.05585837, - "epoch": 0.016774387494363444, - "flos": 27123796362240.0, - "grad_norm": 2.0021377353660057, - "language_loss": 0.90582991, - "learning_rate": 3.6256680014992486e-06, - "loss": 0.93026078, - "num_input_tokens_seen": 5814405, - "step": 279, - "time_per_iteration": 2.7193243503570557 - }, - { - "auxiliary_loss_clip": 0.01346095, - "auxiliary_loss_mlp": 0.01112065, - "balance_loss_clip": 1.09383631, - "balance_loss_mlp": 1.06450009, - "epoch": 0.016834510747031413, - "flos": 20191493082240.0, - "grad_norm": 2.9314445951013988, - "language_loss": 0.94049025, - "learning_rate": 3.6279715895862713e-06, - "loss": 0.96507192, - "num_input_tokens_seen": 5832795, - "step": 280, - "time_per_iteration": 2.680924654006958 - }, - { - "auxiliary_loss_clip": 0.01346658, - "auxiliary_loss_mlp": 0.01109166, - "balance_loss_clip": 1.09285879, - "balance_loss_mlp": 1.06060064, - "epoch": 0.016894633999699385, - "flos": 27274262434560.0, - "grad_norm": 2.6758913403282483, - "language_loss": 0.74425459, - "learning_rate": 3.6302669652206183e-06, - "loss": 0.76881289, - "num_input_tokens_seen": 5855750, - "step": 281, - "time_per_iteration": 2.691152811050415 - }, - { - "auxiliary_loss_clip": 0.01343371, - "auxiliary_loss_mlp": 0.01117708, - "balance_loss_clip": 1.09609079, - "balance_loss_mlp": 1.0724318, - "epoch": 0.016954757252367354, - "flos": 14902964922240.0, - "grad_norm": 3.4878028680462005, - "language_loss": 0.80255079, - "learning_rate": 3.632554186750274e-06, - "loss": 0.82716167, - "num_input_tokens_seen": 5872610, - "step": 282, - "time_per_iteration": 2.592664957046509 - }, - { - "auxiliary_loss_clip": 0.01348082, - "auxiliary_loss_mlp": 0.01118449, - "balance_loss_clip": 1.09700727, - "balance_loss_mlp": 1.07114697, - "epoch": 0.017014880505035322, - "flos": 21358805270400.0, - "grad_norm": 2.296781711700251, - "language_loss": 0.77719986, - "learning_rate": 3.6348333119035937e-06, - "loss": 0.80186516, - "num_input_tokens_seen": 5892985, - "step": 283, - "time_per_iteration": 2.6502227783203125 - }, - { - "auxiliary_loss_clip": 0.01347311, - "auxiliary_loss_mlp": 0.01092934, - "balance_loss_clip": 1.0977478, - "balance_loss_mlp": 1.04804015, - "epoch": 0.01707500375770329, - "flos": 35333154858240.0, - "grad_norm": 2.3467060832193414, - "language_loss": 0.84246969, - "learning_rate": 3.6371043977980503e-06, - "loss": 0.86687213, - "num_input_tokens_seen": 5914060, - "step": 284, - "time_per_iteration": 2.8534958362579346 - }, - { - "auxiliary_loss_clip": 0.01337962, - "auxiliary_loss_mlp": 0.01100399, - "balance_loss_clip": 1.09212708, - "balance_loss_mlp": 1.05297756, - "epoch": 0.01713512701037126, - "flos": 23582070506880.0, - "grad_norm": 2.7335752956200388, - "language_loss": 0.96998906, - "learning_rate": 3.639367500948819e-06, - "loss": 0.99437273, - "num_input_tokens_seen": 5932860, - "step": 285, - "time_per_iteration": 2.6338655948638916 - }, - { - "auxiliary_loss_clip": 0.01341319, - "auxiliary_loss_mlp": 0.01095606, - "balance_loss_clip": 1.09538078, - "balance_loss_mlp": 1.05123687, - "epoch": 0.01719525026303923, - "flos": 27634661544960.0, - "grad_norm": 2.294843469150046, - "language_loss": 0.94079655, - "learning_rate": 3.6416226772772178e-06, - "loss": 0.96516573, - "num_input_tokens_seen": 5952725, - "step": 286, - "time_per_iteration": 2.711087942123413 - }, - { - "auxiliary_loss_clip": 0.01332862, - "auxiliary_loss_mlp": 0.0109035, - "balance_loss_clip": 1.08986938, - "balance_loss_mlp": 1.04409683, - "epoch": 0.0172553735157072, - "flos": 26979722910720.0, - "grad_norm": 1.9277896882465477, - "language_loss": 0.92464817, - "learning_rate": 3.643869982119001e-06, - "loss": 0.94888031, - "num_input_tokens_seen": 5970560, - "step": 287, - "time_per_iteration": 2.640267848968506 - }, - { - "auxiliary_loss_clip": 0.01338192, - "auxiliary_loss_mlp": 0.01092315, - "balance_loss_clip": 1.09039164, - "balance_loss_mlp": 1.04651475, - "epoch": 0.01731549676837517, - "flos": 14056621689600.0, - "grad_norm": 2.7883535936791035, - "language_loss": 1.01873291, - "learning_rate": 3.646109470232502e-06, - "loss": 1.04303789, - "num_input_tokens_seen": 5982980, - "step": 288, - "time_per_iteration": 2.558312177658081 - }, - { - "auxiliary_loss_clip": 0.01225082, - "auxiliary_loss_mlp": 0.01188305, - "balance_loss_clip": 1.09194219, - "balance_loss_mlp": 1.17228377, - "epoch": 0.017375620021043137, - "flos": 66510694471680.0, - "grad_norm": 0.9289960013542303, - "language_loss": 0.63867617, - "learning_rate": 3.6483411958066417e-06, - "loss": 0.66281009, - "num_input_tokens_seen": 6049445, - "step": 289, - "time_per_iteration": 3.386254072189331 - }, - { - "auxiliary_loss_clip": 0.01341215, - "auxiliary_loss_mlp": 0.01107788, - "balance_loss_clip": 1.09622383, - "balance_loss_mlp": 1.06482446, - "epoch": 0.01743574327371111, - "flos": 15225154940160.0, - "grad_norm": 2.368974734045724, - "language_loss": 0.88156199, - "learning_rate": 3.6505652124687957e-06, - "loss": 0.90605205, - "num_input_tokens_seen": 6064150, - "step": 290, - "time_per_iteration": 2.5670948028564453 - }, - { - "auxiliary_loss_clip": 0.0133848, - "auxiliary_loss_mlp": 0.010946, - "balance_loss_clip": 1.09388971, - "balance_loss_mlp": 1.04965782, - "epoch": 0.017495866526379078, - "flos": 25373869574400.0, - "grad_norm": 2.2011772664145504, - "language_loss": 0.84472585, - "learning_rate": 3.6527815732925258e-06, - "loss": 0.8690567, - "num_input_tokens_seen": 6083920, - "step": 291, - "time_per_iteration": 2.648452043533325 - }, - { - "auxiliary_loss_clip": 0.01343563, - "auxiliary_loss_mlp": 0.01115116, - "balance_loss_clip": 1.10129941, - "balance_loss_mlp": 1.06607366, - "epoch": 0.017555989779047047, - "flos": 26359473836160.0, - "grad_norm": 1.7675259544479762, - "language_loss": 0.72679955, - "learning_rate": 3.6549903308051806e-06, - "loss": 0.75138628, - "num_input_tokens_seen": 6105460, - "step": 292, - "time_per_iteration": 2.7239537239074707 - }, - { - "auxiliary_loss_clip": 0.01334066, - "auxiliary_loss_mlp": 0.01107289, - "balance_loss_clip": 1.09397244, - "balance_loss_mlp": 1.06170392, - "epoch": 0.017616113031715015, - "flos": 22338807010560.0, - "grad_norm": 2.419616990787406, - "language_loss": 0.86866581, - "learning_rate": 3.6571915369953646e-06, - "loss": 0.89307928, - "num_input_tokens_seen": 6122890, - "step": 293, - "time_per_iteration": 2.642854690551758 - }, - { - "auxiliary_loss_clip": 0.01333726, - "auxiliary_loss_mlp": 0.0110557, - "balance_loss_clip": 1.09271646, - "balance_loss_mlp": 1.06086659, - "epoch": 0.017676236284382984, - "flos": 20156911263360.0, - "grad_norm": 2.112624444766753, - "language_loss": 0.80896151, - "learning_rate": 3.6593852433202797e-06, - "loss": 0.83335447, - "num_input_tokens_seen": 6142890, - "step": 294, - "time_per_iteration": 2.598176956176758 - }, - { - "auxiliary_loss_clip": 0.01334179, - "auxiliary_loss_mlp": 0.01113433, - "balance_loss_clip": 1.09030747, - "balance_loss_mlp": 1.06892014, - "epoch": 0.017736359537050956, - "flos": 25223331674880.0, - "grad_norm": 2.8289841764142416, - "language_loss": 0.83806521, - "learning_rate": 3.6615715007129453e-06, - "loss": 0.86254132, - "num_input_tokens_seen": 6162030, - "step": 295, - "time_per_iteration": 2.750103712081909 - }, - { - "auxiliary_loss_clip": 0.01339845, - "auxiliary_loss_mlp": 0.01121984, - "balance_loss_clip": 1.09978509, - "balance_loss_mlp": 1.0772326, - "epoch": 0.017796482789718925, - "flos": 20338798757760.0, - "grad_norm": 1.8804378237246864, - "language_loss": 0.84576106, - "learning_rate": 3.6637503595892897e-06, - "loss": 0.87037927, - "num_input_tokens_seen": 6180540, - "step": 296, - "time_per_iteration": 4.154251337051392 - }, - { - "auxiliary_loss_clip": 0.01337678, - "auxiliary_loss_mlp": 0.01105295, - "balance_loss_clip": 1.09463406, - "balance_loss_mlp": 1.06154561, - "epoch": 0.017856606042386893, - "flos": 22379206832640.0, - "grad_norm": 2.055710812588959, - "language_loss": 0.87810111, - "learning_rate": 3.665921869855132e-06, - "loss": 0.90253091, - "num_input_tokens_seen": 6199425, - "step": 297, - "time_per_iteration": 4.379676103591919 - }, - { - "auxiliary_loss_clip": 0.0133717, - "auxiliary_loss_mlp": 0.01103766, - "balance_loss_clip": 1.09343684, - "balance_loss_mlp": 1.06004047, - "epoch": 0.017916729295054862, - "flos": 20230061310720.0, - "grad_norm": 2.689351030321763, - "language_loss": 0.88947791, - "learning_rate": 3.6680860809130346e-06, - "loss": 0.91388726, - "num_input_tokens_seen": 6219170, - "step": 298, - "time_per_iteration": 4.1055779457092285 - }, - { - "auxiliary_loss_clip": 0.01333843, - "auxiliary_loss_mlp": 0.01121179, - "balance_loss_clip": 1.09470236, - "balance_loss_mlp": 1.07499719, - "epoch": 0.01797685254772283, - "flos": 19390972625280.0, - "grad_norm": 1.8935027270905305, - "language_loss": 0.88550889, - "learning_rate": 3.6702430416690516e-06, - "loss": 0.91005915, - "num_input_tokens_seen": 6237930, - "step": 299, - "time_per_iteration": 2.611168622970581 - }, - { - "auxiliary_loss_clip": 0.0133938, - "auxiliary_loss_mlp": 0.0110718, - "balance_loss_clip": 1.09468794, - "balance_loss_mlp": 1.06130886, - "epoch": 0.018036975800390802, - "flos": 24426007528320.0, - "grad_norm": 4.075580609786654, - "language_loss": 0.64664406, - "learning_rate": 3.672392800539357e-06, - "loss": 0.67110968, - "num_input_tokens_seen": 6257170, - "step": 300, - "time_per_iteration": 2.645603656768799 - }, - { - "auxiliary_loss_clip": 0.01338559, - "auxiliary_loss_mlp": 0.01111665, - "balance_loss_clip": 1.09775913, - "balance_loss_mlp": 1.06636548, - "epoch": 0.01809709905305877, - "flos": 15778933896960.0, - "grad_norm": 2.5071418214687515, - "language_loss": 0.87940675, - "learning_rate": 3.6745354054567686e-06, - "loss": 0.90390897, - "num_input_tokens_seen": 6274780, - "step": 301, - "time_per_iteration": 2.6035923957824707 - }, - { - "auxiliary_loss_clip": 0.01238361, - "auxiliary_loss_mlp": 0.01073699, - "balance_loss_clip": 1.1100142, - "balance_loss_mlp": 1.05901265, - "epoch": 0.01815722230572674, - "flos": 67348382526720.0, - "grad_norm": 0.8350739260664176, - "language_loss": 0.62219667, - "learning_rate": 3.676670903877158e-06, - "loss": 0.64531732, - "num_input_tokens_seen": 6340435, - "step": 302, - "time_per_iteration": 3.3307297229766846 - }, - { - "auxiliary_loss_clip": 0.0132981, - "auxiliary_loss_mlp": 0.01110918, - "balance_loss_clip": 1.0910126, - "balance_loss_mlp": 1.06507051, - "epoch": 0.01821734555839471, - "flos": 15485615435520.0, - "grad_norm": 2.115144575016314, - "language_loss": 0.89737153, - "learning_rate": 3.6787993427857567e-06, - "loss": 0.9217788, - "num_input_tokens_seen": 6358160, - "step": 303, - "time_per_iteration": 2.6773293018341064 - }, - { - "auxiliary_loss_clip": 0.01335628, - "auxiliary_loss_mlp": 0.01118481, - "balance_loss_clip": 1.09579217, - "balance_loss_mlp": 1.07237101, - "epoch": 0.018277468811062677, - "flos": 24097424889600.0, - "grad_norm": 1.8670669350935472, - "language_loss": 0.80417514, - "learning_rate": 3.680920768703364e-06, - "loss": 0.82871628, - "num_input_tokens_seen": 6378485, - "step": 304, - "time_per_iteration": 2.691347360610962 - }, - { - "auxiliary_loss_clip": 0.01330802, - "auxiliary_loss_mlp": 0.01091671, - "balance_loss_clip": 1.09832263, - "balance_loss_mlp": 1.04858923, - "epoch": 0.01833759206373065, - "flos": 20959335141120.0, - "grad_norm": 1.6863564291935742, - "language_loss": 0.82761526, - "learning_rate": 3.6830352276924415e-06, - "loss": 0.85184002, - "num_input_tokens_seen": 6397845, - "step": 305, - "time_per_iteration": 2.6883981227874756 - }, - { - "auxiliary_loss_clip": 0.01330759, - "auxiliary_loss_mlp": 0.01093908, - "balance_loss_clip": 1.09012437, - "balance_loss_mlp": 1.05115986, - "epoch": 0.018397715316398618, - "flos": 19390757143680.0, - "grad_norm": 2.1780708917523297, - "language_loss": 0.91148543, - "learning_rate": 3.685142765363119e-06, - "loss": 0.93573213, - "num_input_tokens_seen": 6416475, - "step": 306, - "time_per_iteration": 2.6465187072753906 - }, - { - "auxiliary_loss_clip": 0.01324743, - "auxiliary_loss_mlp": 0.01091696, - "balance_loss_clip": 1.08900762, - "balance_loss_mlp": 1.04882836, - "epoch": 0.018457838569066586, - "flos": 29132531619840.0, - "grad_norm": 3.4680205003751072, - "language_loss": 0.86581063, - "learning_rate": 3.687243426879095e-06, - "loss": 0.88997507, - "num_input_tokens_seen": 6437520, - "step": 307, - "time_per_iteration": 2.7787318229675293 - }, - { - "auxiliary_loss_clip": 0.01326572, - "auxiliary_loss_mlp": 0.01110018, - "balance_loss_clip": 1.09346747, - "balance_loss_mlp": 1.06247783, - "epoch": 0.018517961821734555, - "flos": 19208654167680.0, - "grad_norm": 2.413130156754219, - "language_loss": 0.71650648, - "learning_rate": 3.6893372569634466e-06, - "loss": 0.74087244, - "num_input_tokens_seen": 6455680, - "step": 308, - "time_per_iteration": 2.652973175048828 - }, - { - "auxiliary_loss_clip": 0.01331912, - "auxiliary_loss_mlp": 0.01102766, - "balance_loss_clip": 1.09061241, - "balance_loss_mlp": 1.05911207, - "epoch": 0.018578085074402523, - "flos": 19863018184320.0, - "grad_norm": 2.1869498369051077, - "language_loss": 0.91841364, - "learning_rate": 3.6914242999043395e-06, - "loss": 0.94276047, - "num_input_tokens_seen": 6474880, - "step": 309, - "time_per_iteration": 2.6613030433654785 - }, - { - "auxiliary_loss_clip": 0.01339178, - "auxiliary_loss_mlp": 0.01096668, - "balance_loss_clip": 1.09145641, - "balance_loss_mlp": 1.05084395, - "epoch": 0.018638208327070496, - "flos": 29606947476480.0, - "grad_norm": 2.0400456475786353, - "language_loss": 0.72784412, - "learning_rate": 3.69350459956065e-06, - "loss": 0.75220263, - "num_input_tokens_seen": 6495945, - "step": 310, - "time_per_iteration": 2.705345392227173 - }, - { - "auxiliary_loss_clip": 0.01331019, - "auxiliary_loss_mlp": 0.01113021, - "balance_loss_clip": 1.09560525, - "balance_loss_mlp": 1.06922317, - "epoch": 0.018698331579738464, - "flos": 45731555907840.0, - "grad_norm": 2.1345597100799645, - "language_loss": 0.74162471, - "learning_rate": 3.695578199367497e-06, - "loss": 0.76606506, - "num_input_tokens_seen": 6519930, - "step": 311, - "time_per_iteration": 2.846503496170044 - }, - { - "auxiliary_loss_clip": 0.01338389, - "auxiliary_loss_mlp": 0.01104203, - "balance_loss_clip": 1.09206033, - "balance_loss_mlp": 1.0609777, - "epoch": 0.018758454832406433, - "flos": 20483662308480.0, - "grad_norm": 3.713635021153945, - "language_loss": 0.91668129, - "learning_rate": 3.6976451423416825e-06, - "loss": 0.94110715, - "num_input_tokens_seen": 6535070, - "step": 312, - "time_per_iteration": 2.598400592803955 - }, - { - "auxiliary_loss_clip": 0.01339145, - "auxiliary_loss_mlp": 0.01116197, - "balance_loss_clip": 1.09512305, - "balance_loss_mlp": 1.07034922, - "epoch": 0.0188185780850744, - "flos": 15777784661760.0, - "grad_norm": 4.5530066286460045, - "language_loss": 0.89634913, - "learning_rate": 3.699705471087043e-06, - "loss": 0.92090249, - "num_input_tokens_seen": 6554135, - "step": 313, - "time_per_iteration": 2.6944596767425537 - }, - { - "auxiliary_loss_clip": 0.01340962, - "auxiliary_loss_mlp": 0.0109941, - "balance_loss_clip": 1.09381938, - "balance_loss_mlp": 1.05430174, - "epoch": 0.018878701337742373, - "flos": 22455732758400.0, - "grad_norm": 2.3990870717118455, - "language_loss": 0.7335974, - "learning_rate": 3.7017592277997256e-06, - "loss": 0.75800109, - "num_input_tokens_seen": 6572275, - "step": 314, - "time_per_iteration": 2.6550133228302 - }, - { - "auxiliary_loss_clip": 0.01329658, - "auxiliary_loss_mlp": 0.01105546, - "balance_loss_clip": 1.09075165, - "balance_loss_mlp": 1.06246412, - "epoch": 0.018938824590410342, - "flos": 30993530238720.0, - "grad_norm": 5.81191681220521, - "language_loss": 0.89890182, - "learning_rate": 3.7038064542733654e-06, - "loss": 0.92325383, - "num_input_tokens_seen": 6594520, - "step": 315, - "time_per_iteration": 2.7121222019195557 - }, - { - "auxiliary_loss_clip": 0.0133262, - "auxiliary_loss_mlp": 0.01096177, - "balance_loss_clip": 1.09287357, - "balance_loss_mlp": 1.05209303, - "epoch": 0.01899894784307831, - "flos": 23258910821760.0, - "grad_norm": 2.446494284682687, - "language_loss": 0.80517328, - "learning_rate": 3.7058471919041945e-06, - "loss": 0.82946122, - "num_input_tokens_seen": 6614245, - "step": 316, - "time_per_iteration": 2.640573501586914 - }, - { - "auxiliary_loss_clip": 0.01326654, - "auxiliary_loss_mlp": 0.01094904, - "balance_loss_clip": 1.09036672, - "balance_loss_mlp": 1.05046248, - "epoch": 0.01905907109574628, - "flos": 17457901367040.0, - "grad_norm": 2.3705495670370524, - "language_loss": 0.90161496, - "learning_rate": 3.7078814816960605e-06, - "loss": 0.92583054, - "num_input_tokens_seen": 6632015, - "step": 317, - "time_per_iteration": 2.594388246536255 - }, - { - "auxiliary_loss_clip": 0.01324014, - "auxiliary_loss_mlp": 0.01097498, - "balance_loss_clip": 1.08944559, - "balance_loss_mlp": 1.05281842, - "epoch": 0.019119194348414248, - "flos": 14970225139200.0, - "grad_norm": 7.443622240044352, - "language_loss": 0.90836811, - "learning_rate": 3.709909364265374e-06, - "loss": 0.93258321, - "num_input_tokens_seen": 6649015, - "step": 318, - "time_per_iteration": 2.6647114753723145 - }, - { - "auxiliary_loss_clip": 0.01326579, - "auxiliary_loss_mlp": 0.01092817, - "balance_loss_clip": 1.0886786, - "balance_loss_mlp": 1.05102181, - "epoch": 0.01917931760108222, - "flos": 25482822503040.0, - "grad_norm": 2.232217614618188, - "language_loss": 0.93955356, - "learning_rate": 3.7119308798459706e-06, - "loss": 0.9637475, - "num_input_tokens_seen": 6669225, - "step": 319, - "time_per_iteration": 2.6901800632476807 - }, - { - "auxiliary_loss_clip": 0.01209258, - "auxiliary_loss_mlp": 0.01057567, - "balance_loss_clip": 1.08611965, - "balance_loss_mlp": 1.04288089, - "epoch": 0.01923944085375019, - "flos": 71556967353600.0, - "grad_norm": 1.0009907084180605, - "language_loss": 0.59817195, - "learning_rate": 3.7139460682939026e-06, - "loss": 0.62084019, - "num_input_tokens_seen": 6725775, - "step": 320, - "time_per_iteration": 3.1044812202453613 - }, - { - "auxiliary_loss_clip": 0.01323701, - "auxiliary_loss_mlp": 0.01105882, - "balance_loss_clip": 1.08827436, - "balance_loss_mlp": 1.06291938, - "epoch": 0.019299564106418157, - "flos": 19682495406720.0, - "grad_norm": 3.6735645336458163, - "language_loss": 0.89620435, - "learning_rate": 3.715954969092154e-06, - "loss": 0.92050016, - "num_input_tokens_seen": 6744170, - "step": 321, - "time_per_iteration": 2.650325298309326 - }, - { - "auxiliary_loss_clip": 0.01333523, - "auxiliary_loss_mlp": 0.01118534, - "balance_loss_clip": 1.09200621, - "balance_loss_mlp": 1.07440257, - "epoch": 0.019359687359086126, - "flos": 24387151991040.0, - "grad_norm": 2.289334718991835, - "language_loss": 0.82897186, - "learning_rate": 3.7179576213552805e-06, - "loss": 0.85349244, - "num_input_tokens_seen": 6764565, - "step": 322, - "time_per_iteration": 2.65793514251709 - }, - { - "auxiliary_loss_clip": 0.01332983, - "auxiliary_loss_mlp": 0.01092262, - "balance_loss_clip": 1.09035325, - "balance_loss_mlp": 1.05061018, - "epoch": 0.019419810611754094, - "flos": 23951376190080.0, - "grad_norm": 2.3678949255052912, - "language_loss": 0.72983897, - "learning_rate": 3.719954063833981e-06, - "loss": 0.75409144, - "num_input_tokens_seen": 6785310, - "step": 323, - "time_per_iteration": 2.6827828884124756 - }, - { - "auxiliary_loss_clip": 0.01321298, - "auxiliary_loss_mlp": 0.01092254, - "balance_loss_clip": 1.08474624, - "balance_loss_mlp": 1.04974401, - "epoch": 0.019479933864422067, - "flos": 22160223567360.0, - "grad_norm": 1.9971507164977458, - "language_loss": 0.92358303, - "learning_rate": 3.721944334919596e-06, - "loss": 0.9477185, - "num_input_tokens_seen": 6803290, - "step": 324, - "time_per_iteration": 2.667363405227661 - }, - { - "auxiliary_loss_clip": 0.0133014, - "auxiliary_loss_mlp": 0.01089098, - "balance_loss_clip": 1.09217644, - "balance_loss_mlp": 1.04878139, - "epoch": 0.019540057117090035, - "flos": 22236821320320.0, - "grad_norm": 6.407507213214319, - "language_loss": 0.65127969, - "learning_rate": 3.7239284726485375e-06, - "loss": 0.67547202, - "num_input_tokens_seen": 6822570, - "step": 325, - "time_per_iteration": 2.658700466156006 - }, - { - "auxiliary_loss_clip": 0.01328385, - "auxiliary_loss_mlp": 0.01109788, - "balance_loss_clip": 1.09598839, - "balance_loss_mlp": 1.06675363, - "epoch": 0.019600180369758004, - "flos": 23076771932160.0, - "grad_norm": 1.7177375017641943, - "language_loss": 0.76394802, - "learning_rate": 3.72590651470665e-06, - "loss": 0.78832972, - "num_input_tokens_seen": 6841910, - "step": 326, - "time_per_iteration": 2.6326630115509033 - }, - { - "auxiliary_loss_clip": 0.01322824, - "auxiliary_loss_mlp": 0.01103487, - "balance_loss_clip": 1.09083152, - "balance_loss_mlp": 1.06040514, - "epoch": 0.019660303622425972, - "flos": 25410857604480.0, - "grad_norm": 2.041100065316132, - "language_loss": 0.79262185, - "learning_rate": 3.727878498433505e-06, - "loss": 0.81688493, - "num_input_tokens_seen": 6862480, - "step": 327, - "time_per_iteration": 2.7195518016815186 - }, - { - "auxiliary_loss_clip": 0.0132945, - "auxiliary_loss_mlp": 0.01099712, - "balance_loss_clip": 1.09292865, - "balance_loss_mlp": 1.05832207, - "epoch": 0.01972042687509394, - "flos": 23657519024640.0, - "grad_norm": 2.852301933148325, - "language_loss": 0.80569315, - "learning_rate": 3.7298444608266328e-06, - "loss": 0.82998472, - "num_input_tokens_seen": 6882015, - "step": 328, - "time_per_iteration": 2.6789369583129883 - }, - { - "auxiliary_loss_clip": 0.01327544, - "auxiliary_loss_mlp": 0.01094059, - "balance_loss_clip": 1.08719349, - "balance_loss_mlp": 1.05045235, - "epoch": 0.019780550127761913, - "flos": 18223480869120.0, - "grad_norm": 2.280823996815513, - "language_loss": 0.93599927, - "learning_rate": 3.731804438545683e-06, - "loss": 0.96021533, - "num_input_tokens_seen": 6899785, - "step": 329, - "time_per_iteration": 2.6043548583984375 - }, - { - "auxiliary_loss_clip": 0.0133329, - "auxiliary_loss_mlp": 0.0110952, - "balance_loss_clip": 1.09211767, - "balance_loss_mlp": 1.06629419, - "epoch": 0.01984067338042988, - "flos": 22418780641920.0, - "grad_norm": 2.788704520584699, - "language_loss": 0.7476396, - "learning_rate": 3.7337584679165324e-06, - "loss": 0.77206767, - "num_input_tokens_seen": 6918575, - "step": 330, - "time_per_iteration": 2.706001043319702 - }, - { - "auxiliary_loss_clip": 0.0133006, - "auxiliary_loss_mlp": 0.01115344, - "balance_loss_clip": 1.09077096, - "balance_loss_mlp": 1.07280993, - "epoch": 0.01990079663309785, - "flos": 17055199013760.0, - "grad_norm": 4.201650057157668, - "language_loss": 0.93435889, - "learning_rate": 3.7357065849353186e-06, - "loss": 0.95881295, - "num_input_tokens_seen": 6936965, - "step": 331, - "time_per_iteration": 2.6499180793762207 - }, - { - "auxiliary_loss_clip": 0.01316843, - "auxiliary_loss_mlp": 0.01085812, - "balance_loss_clip": 1.08825564, - "balance_loss_mlp": 1.04563856, - "epoch": 0.01996091988576582, - "flos": 15961791058560.0, - "grad_norm": 2.5475056489813968, - "language_loss": 0.9293468, - "learning_rate": 3.737648825272422e-06, - "loss": 0.95337331, - "num_input_tokens_seen": 6953475, - "step": 332, - "time_per_iteration": 2.5990231037139893 - }, - { - "auxiliary_loss_clip": 0.01325701, - "auxiliary_loss_mlp": 0.01091941, - "balance_loss_clip": 1.09376514, - "balance_loss_mlp": 1.04902601, - "epoch": 0.02002104313843379, - "flos": 23586451966080.0, - "grad_norm": 2.7319388202061106, - "language_loss": 0.75380504, - "learning_rate": 3.739585224276384e-06, - "loss": 0.77798152, - "num_input_tokens_seen": 6971630, - "step": 333, - "time_per_iteration": 2.6225569248199463 - }, - { - "auxiliary_loss_clip": 0.01323488, - "auxiliary_loss_mlp": 0.01083816, - "balance_loss_clip": 1.08822608, - "balance_loss_mlp": 1.04249835, - "epoch": 0.02008116639110176, - "flos": 34094883352320.0, - "grad_norm": 3.3732742696494924, - "language_loss": 0.78797042, - "learning_rate": 3.7415158169777673e-06, - "loss": 0.81204355, - "num_input_tokens_seen": 6992775, - "step": 334, - "time_per_iteration": 2.725562572479248 - }, - { - "auxiliary_loss_clip": 0.01325152, - "auxiliary_loss_mlp": 0.01093257, - "balance_loss_clip": 1.08535278, - "balance_loss_mlp": 1.04867256, - "epoch": 0.020141289643769728, - "flos": 19683716469120.0, - "grad_norm": 1.945115565921162, - "language_loss": 0.83465719, - "learning_rate": 3.7434406380929575e-06, - "loss": 0.8588413, - "num_input_tokens_seen": 7011425, - "step": 335, - "time_per_iteration": 2.638871192932129 - }, - { - "auxiliary_loss_clip": 0.01322365, - "auxiliary_loss_mlp": 0.01085854, - "balance_loss_clip": 1.08842373, - "balance_loss_mlp": 1.04405963, - "epoch": 0.020201412896437697, - "flos": 20740567357440.0, - "grad_norm": 2.3527147371949058, - "language_loss": 0.92432821, - "learning_rate": 3.745359722027911e-06, - "loss": 0.94841033, - "num_input_tokens_seen": 7029450, - "step": 336, - "time_per_iteration": 2.6654980182647705 - }, - { - "auxiliary_loss_clip": 0.01321531, - "auxiliary_loss_mlp": 0.01079695, - "balance_loss_clip": 1.08577883, - "balance_loss_mlp": 1.03818631, - "epoch": 0.020261536149105665, - "flos": 20266510636800.0, - "grad_norm": 1.7223490941555537, - "language_loss": 0.88663971, - "learning_rate": 3.7472731028818428e-06, - "loss": 0.91065204, - "num_input_tokens_seen": 7047555, - "step": 337, - "time_per_iteration": 4.246743440628052 - }, - { - "auxiliary_loss_clip": 0.01312441, - "auxiliary_loss_mlp": 0.01102336, - "balance_loss_clip": 1.08320296, - "balance_loss_mlp": 1.05841899, - "epoch": 0.020321659401773638, - "flos": 25848752307840.0, - "grad_norm": 1.6493597356962735, - "language_loss": 0.89869279, - "learning_rate": 3.7491808144508626e-06, - "loss": 0.92284054, - "num_input_tokens_seen": 7068185, - "step": 338, - "time_per_iteration": 5.869866609573364 - }, - { - "auxiliary_loss_clip": 0.01321566, - "auxiliary_loss_mlp": 0.0109858, - "balance_loss_clip": 1.08546185, - "balance_loss_mlp": 1.05554605, - "epoch": 0.020381782654441606, - "flos": 17495033051520.0, - "grad_norm": 2.1603069065052694, - "language_loss": 0.85168982, - "learning_rate": 3.7510828902315576e-06, - "loss": 0.87589133, - "num_input_tokens_seen": 7085955, - "step": 339, - "time_per_iteration": 2.603130340576172 - }, - { - "auxiliary_loss_clip": 0.01328225, - "auxiliary_loss_mlp": 0.01099064, - "balance_loss_clip": 1.0902226, - "balance_loss_mlp": 1.05524242, - "epoch": 0.020441905907109575, - "flos": 24243940465920.0, - "grad_norm": 2.1746002196087817, - "language_loss": 0.88821882, - "learning_rate": 3.75297936342452e-06, - "loss": 0.91249174, - "num_input_tokens_seen": 7106345, - "step": 340, - "time_per_iteration": 2.7247626781463623 - }, - { - "auxiliary_loss_clip": 0.01322505, - "auxiliary_loss_mlp": 0.01085559, - "balance_loss_clip": 1.08594203, - "balance_loss_mlp": 1.04004502, - "epoch": 0.020502029159777543, - "flos": 22233301787520.0, - "grad_norm": 2.004763613818719, - "language_loss": 0.88489276, - "learning_rate": 3.7548702669378253e-06, - "loss": 0.9089734, - "num_input_tokens_seen": 7125070, - "step": 341, - "time_per_iteration": 2.731411933898926 - }, - { - "auxiliary_loss_clip": 0.01324734, - "auxiliary_loss_mlp": 0.01098572, - "balance_loss_clip": 1.08451748, - "balance_loss_mlp": 1.05479813, - "epoch": 0.020562152412445512, - "flos": 23987861429760.0, - "grad_norm": 2.3638593093640736, - "language_loss": 0.80611861, - "learning_rate": 3.756755633390458e-06, - "loss": 0.83035159, - "num_input_tokens_seen": 7144675, - "step": 342, - "time_per_iteration": 2.6085095405578613 - }, - { - "auxiliary_loss_clip": 0.01313805, - "auxiliary_loss_mlp": 0.01098164, - "balance_loss_clip": 1.08411694, - "balance_loss_mlp": 1.05138612, - "epoch": 0.020622275665113484, - "flos": 26975305537920.0, - "grad_norm": 1.727276092160433, - "language_loss": 0.89612651, - "learning_rate": 3.7586354951156886e-06, - "loss": 0.92024612, - "num_input_tokens_seen": 7165505, - "step": 343, - "time_per_iteration": 2.739912509918213 - }, - { - "auxiliary_loss_clip": 0.01324722, - "auxiliary_loss_mlp": 0.01096954, - "balance_loss_clip": 1.09109879, - "balance_loss_mlp": 1.05518293, - "epoch": 0.020682398917781453, - "flos": 22600704049920.0, - "grad_norm": 2.6902665590614663, - "language_loss": 0.78381217, - "learning_rate": 3.7605098841644e-06, - "loss": 0.80802888, - "num_input_tokens_seen": 7184605, - "step": 344, - "time_per_iteration": 2.638439655303955 - }, - { - "auxiliary_loss_clip": 0.01310552, - "auxiliary_loss_mlp": 0.01103983, - "balance_loss_clip": 1.08375537, - "balance_loss_mlp": 1.05982804, - "epoch": 0.02074252217044942, - "flos": 15013605790080.0, - "grad_norm": 2.2675296623639114, - "language_loss": 0.75051636, - "learning_rate": 3.7623788323083666e-06, - "loss": 0.77466166, - "num_input_tokens_seen": 7203065, - "step": 345, - "time_per_iteration": 2.581258773803711 - }, - { - "auxiliary_loss_clip": 0.01316305, - "auxiliary_loss_mlp": 0.01107937, - "balance_loss_clip": 1.08855689, - "balance_loss_mlp": 1.06447339, - "epoch": 0.02080264542311739, - "flos": 25337958952320.0, - "grad_norm": 2.2144688897761395, - "language_loss": 0.90414572, - "learning_rate": 3.7642423710434837e-06, - "loss": 0.92838824, - "num_input_tokens_seen": 7222995, - "step": 346, - "time_per_iteration": 2.6281676292419434 - }, - { - "auxiliary_loss_clip": 0.01312286, - "auxiliary_loss_mlp": 0.01096576, - "balance_loss_clip": 1.08357453, - "balance_loss_mlp": 1.05621195, - "epoch": 0.02086276867578536, - "flos": 24388804016640.0, - "grad_norm": 3.1106741063140366, - "language_loss": 0.79133296, - "learning_rate": 3.7661005315929563e-06, - "loss": 0.81542158, - "num_input_tokens_seen": 7244625, - "step": 347, - "time_per_iteration": 2.6477038860321045 - }, - { - "auxiliary_loss_clip": 0.01317665, - "auxiliary_loss_mlp": 0.01097416, - "balance_loss_clip": 1.08921003, - "balance_loss_mlp": 1.05328524, - "epoch": 0.02092289192845333, - "flos": 24462205459200.0, - "grad_norm": 3.7065871267995893, - "language_loss": 0.71211165, - "learning_rate": 3.7679533449104354e-06, - "loss": 0.73626244, - "num_input_tokens_seen": 7263255, - "step": 348, - "time_per_iteration": 2.6215686798095703 - }, - { - "auxiliary_loss_clip": 0.01319168, - "auxiliary_loss_mlp": 0.01104109, - "balance_loss_clip": 1.0859139, - "balance_loss_mlp": 1.06066906, - "epoch": 0.0209830151811213, - "flos": 17451185523840.0, - "grad_norm": 2.3976328225512495, - "language_loss": 0.77118891, - "learning_rate": 3.7698008416831116e-06, - "loss": 0.79542166, - "num_input_tokens_seen": 7279275, - "step": 349, - "time_per_iteration": 2.60102915763855 - }, - { - "auxiliary_loss_clip": 0.01304146, - "auxiliary_loss_mlp": 0.01101496, - "balance_loss_clip": 1.08412242, - "balance_loss_mlp": 1.06017756, - "epoch": 0.021043138433789268, - "flos": 24573995562240.0, - "grad_norm": 1.7599420553547571, - "language_loss": 0.85191035, - "learning_rate": 3.7716430523347664e-06, - "loss": 0.87596673, - "num_input_tokens_seen": 7300180, - "step": 350, - "time_per_iteration": 2.7636313438415527 - }, - { - "auxiliary_loss_clip": 0.01310639, - "auxiliary_loss_mlp": 0.01090182, - "balance_loss_clip": 1.08742464, - "balance_loss_mlp": 1.05015147, - "epoch": 0.021103261686457236, - "flos": 24454053072000.0, - "grad_norm": 2.2188224040826956, - "language_loss": 0.7998929, - "learning_rate": 3.773480007028776e-06, - "loss": 0.82390112, - "num_input_tokens_seen": 7317430, - "step": 351, - "time_per_iteration": 2.651803493499756 - }, - { - "auxiliary_loss_clip": 0.01318922, - "auxiliary_loss_mlp": 0.01104903, - "balance_loss_clip": 1.08851838, - "balance_loss_mlp": 1.06093884, - "epoch": 0.021163384939125205, - "flos": 14683083816960.0, - "grad_norm": 2.30399977815629, - "language_loss": 0.8746841, - "learning_rate": 3.775311735671078e-06, - "loss": 0.89892232, - "num_input_tokens_seen": 7334875, - "step": 352, - "time_per_iteration": 2.687080144882202 - }, - { - "auxiliary_loss_clip": 0.01311303, - "auxiliary_loss_mlp": 0.01101912, - "balance_loss_clip": 1.0859803, - "balance_loss_mlp": 1.05861485, - "epoch": 0.021223508191793177, - "flos": 24493195918080.0, - "grad_norm": 2.574621592267882, - "language_loss": 0.8247534, - "learning_rate": 3.7771382679130878e-06, - "loss": 0.84888554, - "num_input_tokens_seen": 7355185, - "step": 353, - "time_per_iteration": 2.7096078395843506 - }, - { - "auxiliary_loss_clip": 0.01308698, - "auxiliary_loss_mlp": 0.01092448, - "balance_loss_clip": 1.08573294, - "balance_loss_mlp": 1.05160654, - "epoch": 0.021283631444461146, - "flos": 24126978804480.0, - "grad_norm": 1.9591973719581535, - "language_loss": 0.8089481, - "learning_rate": 3.7789596331545845e-06, - "loss": 0.83295953, - "num_input_tokens_seen": 7374425, - "step": 354, - "time_per_iteration": 2.658649444580078 - }, - { - "auxiliary_loss_clip": 0.01314249, - "auxiliary_loss_mlp": 0.01095812, - "balance_loss_clip": 1.08369493, - "balance_loss_mlp": 1.05218124, - "epoch": 0.021343754697129114, - "flos": 25192233475200.0, - "grad_norm": 2.22170783568627, - "language_loss": 0.81311834, - "learning_rate": 3.780775860546545e-06, - "loss": 0.837219, - "num_input_tokens_seen": 7394175, - "step": 355, - "time_per_iteration": 2.619551420211792 - }, - { - "auxiliary_loss_clip": 0.01310207, - "auxiliary_loss_mlp": 0.01090401, - "balance_loss_clip": 1.08222032, - "balance_loss_mlp": 1.04851055, - "epoch": 0.021403877949797083, - "flos": 17274182279040.0, - "grad_norm": 2.212340256471132, - "language_loss": 0.89746779, - "learning_rate": 3.7825869789939474e-06, - "loss": 0.92147392, - "num_input_tokens_seen": 7412645, - "step": 356, - "time_per_iteration": 2.5877137184143066 - }, - { - "auxiliary_loss_clip": 0.01308298, - "auxiliary_loss_mlp": 0.0108474, - "balance_loss_clip": 1.08573771, - "balance_loss_mlp": 1.04191971, - "epoch": 0.021464001202465055, - "flos": 30917435276160.0, - "grad_norm": 1.9878508054592678, - "language_loss": 0.79956681, - "learning_rate": 3.784393017158528e-06, - "loss": 0.82349718, - "num_input_tokens_seen": 7432275, - "step": 357, - "time_per_iteration": 2.781755208969116 - }, - { - "auxiliary_loss_clip": 0.0130988, - "auxiliary_loss_mlp": 0.01083565, - "balance_loss_clip": 1.08250284, - "balance_loss_mlp": 1.04417801, - "epoch": 0.021524124455133024, - "flos": 18186385098240.0, - "grad_norm": 2.6679617624252137, - "language_loss": 0.76516652, - "learning_rate": 3.786194003461506e-06, - "loss": 0.78910094, - "num_input_tokens_seen": 7450245, - "step": 358, - "time_per_iteration": 2.63144850730896 - }, - { - "auxiliary_loss_clip": 0.01307251, - "auxiliary_loss_mlp": 0.01092013, - "balance_loss_clip": 1.08083165, - "balance_loss_mlp": 1.04842997, - "epoch": 0.021584247707800992, - "flos": 13805786039040.0, - "grad_norm": 2.344744226979962, - "language_loss": 0.88770491, - "learning_rate": 3.787989966086264e-06, - "loss": 0.91169769, - "num_input_tokens_seen": 7466845, - "step": 359, - "time_per_iteration": 2.641932964324951 - }, - { - "auxiliary_loss_clip": 0.01315087, - "auxiliary_loss_mlp": 0.01090441, - "balance_loss_clip": 1.08486438, - "balance_loss_mlp": 1.05088758, - "epoch": 0.02164437096046896, - "flos": 23294713703040.0, - "grad_norm": 3.6505103877164804, - "language_loss": 0.75853801, - "learning_rate": 3.789780932980997e-06, - "loss": 0.78259325, - "num_input_tokens_seen": 7485450, - "step": 360, - "time_per_iteration": 2.5901477336883545 - }, - { - "auxiliary_loss_clip": 0.01203506, - "auxiliary_loss_mlp": 0.0103078, - "balance_loss_clip": 1.07682121, - "balance_loss_mlp": 1.01781011, - "epoch": 0.02170449421313693, - "flos": 68899578341760.0, - "grad_norm": 0.8439708743577624, - "language_loss": 0.64861441, - "learning_rate": 3.79156693186132e-06, - "loss": 0.67095727, - "num_input_tokens_seen": 7553780, - "step": 361, - "time_per_iteration": 3.278409957885742 - }, - { - "auxiliary_loss_clip": 0.01306068, - "auxiliary_loss_mlp": 0.01086116, - "balance_loss_clip": 1.0792098, - "balance_loss_mlp": 1.04501224, - "epoch": 0.0217646174658049, - "flos": 25228539146880.0, - "grad_norm": 3.144635825096315, - "language_loss": 0.78844237, - "learning_rate": 3.7933479902128433e-06, - "loss": 0.81236422, - "num_input_tokens_seen": 7574155, - "step": 362, - "time_per_iteration": 2.6302051544189453 - }, - { - "auxiliary_loss_clip": 0.01309585, - "auxiliary_loss_mlp": 0.01093258, - "balance_loss_clip": 1.08188891, - "balance_loss_mlp": 1.05244076, - "epoch": 0.02182474071847287, - "flos": 22893124671360.0, - "grad_norm": 2.019833715135914, - "language_loss": 0.92474592, - "learning_rate": 3.7951241352937077e-06, - "loss": 0.94877434, - "num_input_tokens_seen": 7592320, - "step": 363, - "time_per_iteration": 2.6566081047058105 - }, - { - "auxiliary_loss_clip": 0.01305173, - "auxiliary_loss_mlp": 0.01096467, - "balance_loss_clip": 1.0816617, - "balance_loss_mlp": 1.05693769, - "epoch": 0.02188486397114084, - "flos": 23658991482240.0, - "grad_norm": 2.282586403147275, - "language_loss": 0.89844346, - "learning_rate": 3.7968953941370915e-06, - "loss": 0.92245984, - "num_input_tokens_seen": 7611185, - "step": 364, - "time_per_iteration": 2.711911201477051 - }, - { - "auxiliary_loss_clip": 0.01311963, - "auxiliary_loss_mlp": 0.0109247, - "balance_loss_clip": 1.08607888, - "balance_loss_mlp": 1.04955506, - "epoch": 0.021944987223808807, - "flos": 21543637680000.0, - "grad_norm": 1.948927065488749, - "language_loss": 0.79460645, - "learning_rate": 3.798661793553676e-06, - "loss": 0.81865084, - "num_input_tokens_seen": 7631970, - "step": 365, - "time_per_iteration": 2.6396052837371826 - }, - { - "auxiliary_loss_clip": 0.01306043, - "auxiliary_loss_mlp": 0.01100405, - "balance_loss_clip": 1.08267248, - "balance_loss_mlp": 1.05658317, - "epoch": 0.022005110476476776, - "flos": 16070887641600.0, - "grad_norm": 1.85181498507666, - "language_loss": 0.84341359, - "learning_rate": 3.8004233601340808e-06, - "loss": 0.86747801, - "num_input_tokens_seen": 7649745, - "step": 366, - "time_per_iteration": 2.6278867721557617 - }, - { - "auxiliary_loss_clip": 0.01312113, - "auxiliary_loss_mlp": 0.01087574, - "balance_loss_clip": 1.08304918, - "balance_loss_mlp": 1.04859269, - "epoch": 0.022065233729144748, - "flos": 21433715084160.0, - "grad_norm": 1.9326288300300676, - "language_loss": 0.87040466, - "learning_rate": 3.8021801202512694e-06, - "loss": 0.89440155, - "num_input_tokens_seen": 7668830, - "step": 367, - "time_per_iteration": 2.6410560607910156 - }, - { - "auxiliary_loss_clip": 0.01312217, - "auxiliary_loss_mlp": 0.01096053, - "balance_loss_clip": 1.08074582, - "balance_loss_mlp": 1.05335259, - "epoch": 0.022125356981812717, - "flos": 21543709507200.0, - "grad_norm": 2.7247329926128976, - "language_loss": 0.8487373, - "learning_rate": 3.803932100062912e-06, - "loss": 0.87282002, - "num_input_tokens_seen": 7687240, - "step": 368, - "time_per_iteration": 2.652012825012207 - }, - { - "auxiliary_loss_clip": 0.01312089, - "auxiliary_loss_mlp": 0.01079926, - "balance_loss_clip": 1.0801568, - "balance_loss_mlp": 1.04027653, - "epoch": 0.022185480234480685, - "flos": 20704153944960.0, - "grad_norm": 2.4839328990540794, - "language_loss": 0.75997221, - "learning_rate": 3.8056793255137264e-06, - "loss": 0.78389233, - "num_input_tokens_seen": 7704440, - "step": 369, - "time_per_iteration": 2.601384401321411 - }, - { - "auxiliary_loss_clip": 0.01306737, - "auxiliary_loss_mlp": 0.01099274, - "balance_loss_clip": 1.08232927, - "balance_loss_mlp": 1.05836105, - "epoch": 0.022245603487148654, - "flos": 25193203142400.0, - "grad_norm": 2.189428421230448, - "language_loss": 0.82977992, - "learning_rate": 3.8074218223377844e-06, - "loss": 0.85383999, - "num_input_tokens_seen": 7727160, - "step": 370, - "time_per_iteration": 2.6538548469543457 - }, - { - "auxiliary_loss_clip": 0.01306327, - "auxiliary_loss_mlp": 0.01099594, - "balance_loss_clip": 1.08127654, - "balance_loss_mlp": 1.05713177, - "epoch": 0.022305726739816623, - "flos": 21395936954880.0, - "grad_norm": 1.8569755368340455, - "language_loss": 0.81588483, - "learning_rate": 3.8091596160607834e-06, - "loss": 0.83994406, - "num_input_tokens_seen": 7747730, - "step": 371, - "time_per_iteration": 2.6779489517211914 - }, - { - "auxiliary_loss_clip": 0.01311283, - "auxiliary_loss_mlp": 0.01093653, - "balance_loss_clip": 1.08593988, - "balance_loss_mlp": 1.05169153, - "epoch": 0.022365849992484595, - "flos": 22492146170880.0, - "grad_norm": 2.0622769904034817, - "language_loss": 0.83493644, - "learning_rate": 3.8108927320022896e-06, - "loss": 0.85898578, - "num_input_tokens_seen": 7766765, - "step": 372, - "time_per_iteration": 2.676797866821289 - }, - { - "auxiliary_loss_clip": 0.01303906, - "auxiliary_loss_mlp": 0.01091688, - "balance_loss_clip": 1.08125615, - "balance_loss_mlp": 1.05022752, - "epoch": 0.022425973245152563, - "flos": 17856581397120.0, - "grad_norm": 2.8569846697004424, - "language_loss": 0.79004842, - "learning_rate": 3.8126211952779548e-06, - "loss": 0.81400436, - "num_input_tokens_seen": 7784010, - "step": 373, - "time_per_iteration": 2.593186616897583 - }, - { - "auxiliary_loss_clip": 0.01309731, - "auxiliary_loss_mlp": 0.01087409, - "balance_loss_clip": 1.08431911, - "balance_loss_mlp": 1.0448271, - "epoch": 0.022486096497820532, - "flos": 15483029656320.0, - "grad_norm": 2.5442660874947385, - "language_loss": 0.77622557, - "learning_rate": 3.8143450308016952e-06, - "loss": 0.80019701, - "num_input_tokens_seen": 7801305, - "step": 374, - "time_per_iteration": 2.628392457962036 - }, - { - "auxiliary_loss_clip": 0.0129871, - "auxiliary_loss_mlp": 0.01076131, - "balance_loss_clip": 1.07404125, - "balance_loss_mlp": 1.03395462, - "epoch": 0.0225462197504885, - "flos": 27784157950080.0, - "grad_norm": 1.574507922341891, - "language_loss": 0.86032569, - "learning_rate": 3.8160642632878525e-06, - "loss": 0.88407415, - "num_input_tokens_seen": 7823965, - "step": 375, - "time_per_iteration": 2.6783435344696045 - }, - { - "auxiliary_loss_clip": 0.01307026, - "auxiliary_loss_mlp": 0.01102393, - "balance_loss_clip": 1.08340597, - "balance_loss_mlp": 1.0590483, - "epoch": 0.02260634300315647, - "flos": 19975490645760.0, - "grad_norm": 2.1279260859120286, - "language_loss": 0.8901403, - "learning_rate": 3.817778917253314e-06, - "loss": 0.91423446, - "num_input_tokens_seen": 7842115, - "step": 376, - "time_per_iteration": 2.621629476547241 - }, - { - "auxiliary_loss_clip": 0.01306872, - "auxiliary_loss_mlp": 0.01087647, - "balance_loss_clip": 1.07870364, - "balance_loss_mlp": 1.04868913, - "epoch": 0.02266646625582444, - "flos": 16028189349120.0, - "grad_norm": 3.0367767906095917, - "language_loss": 0.75437558, - "learning_rate": 3.8194890170196155e-06, - "loss": 0.77832079, - "num_input_tokens_seen": 7857830, - "step": 377, - "time_per_iteration": 2.5465245246887207 - }, - { - "auxiliary_loss_clip": 0.01298987, - "auxiliary_loss_mlp": 0.01093623, - "balance_loss_clip": 1.08128345, - "balance_loss_mlp": 1.0517087, - "epoch": 0.02272658950849241, - "flos": 20404622430720.0, - "grad_norm": 2.1955644054597374, - "language_loss": 0.99231368, - "learning_rate": 3.8211945867150055e-06, - "loss": 1.01623976, - "num_input_tokens_seen": 7875840, - "step": 378, - "time_per_iteration": 7.184643983840942 - }, - { - "auxiliary_loss_clip": 0.01202133, - "auxiliary_loss_mlp": 0.01040839, - "balance_loss_clip": 1.0828104, - "balance_loss_mlp": 1.0283463, - "epoch": 0.02278671276116038, - "flos": 69847332647040.0, - "grad_norm": 0.9608118941287621, - "language_loss": 0.75395739, - "learning_rate": 3.822895650276492e-06, - "loss": 0.7763871, - "num_input_tokens_seen": 7940190, - "step": 379, - "time_per_iteration": 4.961140394210815 - }, - { - "auxiliary_loss_clip": 0.01308523, - "auxiliary_loss_mlp": 0.01087195, - "balance_loss_clip": 1.07820678, - "balance_loss_mlp": 1.04792738, - "epoch": 0.022846836013828347, - "flos": 38508771340800.0, - "grad_norm": 3.7276648293904375, - "language_loss": 0.78197825, - "learning_rate": 3.824592231451859e-06, - "loss": 0.8059355, - "num_input_tokens_seen": 7960840, - "step": 380, - "time_per_iteration": 2.7892863750457764 - }, - { - "auxiliary_loss_clip": 0.01301718, - "auxiliary_loss_mlp": 0.01088822, - "balance_loss_clip": 1.07955217, - "balance_loss_mlp": 1.04945946, - "epoch": 0.02290695926649632, - "flos": 20959478795520.0, - "grad_norm": 2.0941800643649855, - "language_loss": 0.96743369, - "learning_rate": 3.826284353801652e-06, - "loss": 0.99133915, - "num_input_tokens_seen": 7975500, - "step": 381, - "time_per_iteration": 2.619854688644409 - }, - { - "auxiliary_loss_clip": 0.01311313, - "auxiliary_loss_mlp": 0.01093973, - "balance_loss_clip": 1.08192921, - "balance_loss_mlp": 1.0539186, - "epoch": 0.022967082519164288, - "flos": 24022407335040.0, - "grad_norm": 2.122042453210184, - "language_loss": 0.87664795, - "learning_rate": 3.827972040701142e-06, - "loss": 0.90070075, - "num_input_tokens_seen": 7993880, - "step": 382, - "time_per_iteration": 2.617398500442505 - }, - { - "auxiliary_loss_clip": 0.01304042, - "auxiliary_loss_mlp": 0.01096828, - "balance_loss_clip": 1.0821979, - "balance_loss_mlp": 1.05760849, - "epoch": 0.023027205771832256, - "flos": 20997149184000.0, - "grad_norm": 1.978420170714987, - "language_loss": 0.84990942, - "learning_rate": 3.829655315342268e-06, - "loss": 0.87391812, - "num_input_tokens_seen": 8012730, - "step": 383, - "time_per_iteration": 2.6345314979553223 - }, - { - "auxiliary_loss_clip": 0.01300873, - "auxiliary_loss_mlp": 0.0111136, - "balance_loss_clip": 1.08199024, - "balance_loss_mlp": 1.0716393, - "epoch": 0.023087329024500225, - "flos": 21360816432000.0, - "grad_norm": 2.0575071112917778, - "language_loss": 0.83349717, - "learning_rate": 3.831334200735543e-06, - "loss": 0.8576194, - "num_input_tokens_seen": 8031275, - "step": 384, - "time_per_iteration": 2.6339902877807617 - }, - { - "auxiliary_loss_clip": 0.0129979, - "auxiliary_loss_mlp": 0.010893, - "balance_loss_clip": 1.08362782, - "balance_loss_mlp": 1.05255938, - "epoch": 0.023147452277168194, - "flos": 21872435800320.0, - "grad_norm": 1.7828777740185773, - "language_loss": 0.89289594, - "learning_rate": 3.8330087197119426e-06, - "loss": 0.91678685, - "num_input_tokens_seen": 8051600, - "step": 385, - "time_per_iteration": 2.690460205078125 - }, - { - "auxiliary_loss_clip": 0.01305297, - "auxiliary_loss_mlp": 0.01118129, - "balance_loss_clip": 1.08288455, - "balance_loss_mlp": 1.07926655, - "epoch": 0.023207575529836166, - "flos": 18916700423040.0, - "grad_norm": 1.9487706588237765, - "language_loss": 0.70157433, - "learning_rate": 3.83467889492477e-06, - "loss": 0.72580856, - "num_input_tokens_seen": 8070600, - "step": 386, - "time_per_iteration": 2.681957721710205 - }, - { - "auxiliary_loss_clip": 0.01305989, - "auxiliary_loss_mlp": 0.0109088, - "balance_loss_clip": 1.08441973, - "balance_loss_mlp": 1.05309081, - "epoch": 0.023267698782504134, - "flos": 25046005207680.0, - "grad_norm": 2.354342660334866, - "language_loss": 0.87840039, - "learning_rate": 3.836344748851495e-06, - "loss": 0.90236908, - "num_input_tokens_seen": 8090680, - "step": 387, - "time_per_iteration": 2.6511123180389404 - }, - { - "auxiliary_loss_clip": 0.01304298, - "auxiliary_loss_mlp": 0.01075541, - "balance_loss_clip": 1.08178413, - "balance_loss_mlp": 1.03658366, - "epoch": 0.023327822035172103, - "flos": 28879217930880.0, - "grad_norm": 2.2068948332198643, - "language_loss": 0.8341614, - "learning_rate": 3.838006303795566e-06, - "loss": 0.85795981, - "num_input_tokens_seen": 8114610, - "step": 388, - "time_per_iteration": 2.7062034606933594 - }, - { - "auxiliary_loss_clip": 0.01301997, - "auxiliary_loss_mlp": 0.01089724, - "balance_loss_clip": 1.08110905, - "balance_loss_mlp": 1.05284107, - "epoch": 0.02338794528784007, - "flos": 27121533805440.0, - "grad_norm": 2.1887236217853863, - "language_loss": 0.93710232, - "learning_rate": 3.839663581888206e-06, - "loss": 0.96101958, - "num_input_tokens_seen": 8133975, - "step": 389, - "time_per_iteration": 2.680280923843384 - }, - { - "auxiliary_loss_clip": 0.01296082, - "auxiliary_loss_mlp": 0.01083127, - "balance_loss_clip": 1.0818491, - "balance_loss_mlp": 1.04397893, - "epoch": 0.02344806854050804, - "flos": 21322355944320.0, - "grad_norm": 1.981860280002506, - "language_loss": 0.87747037, - "learning_rate": 3.841316605090178e-06, - "loss": 0.9012624, - "num_input_tokens_seen": 8153570, - "step": 390, - "time_per_iteration": 2.65970516204834 - }, - { - "auxiliary_loss_clip": 0.01301203, - "auxiliary_loss_mlp": 0.01092853, - "balance_loss_clip": 1.08357048, - "balance_loss_mlp": 1.0568521, - "epoch": 0.023508191793176012, - "flos": 24789997998720.0, - "grad_norm": 2.134782100250632, - "language_loss": 0.89370871, - "learning_rate": 3.842965395193529e-06, - "loss": 0.91764927, - "num_input_tokens_seen": 8170075, - "step": 391, - "time_per_iteration": 2.620009660720825 - }, - { - "auxiliary_loss_clip": 0.01296395, - "auxiliary_loss_mlp": 0.01072264, - "balance_loss_clip": 1.07956719, - "balance_loss_mlp": 1.03521371, - "epoch": 0.02356831504584398, - "flos": 25995375624960.0, - "grad_norm": 2.366558958564603, - "language_loss": 0.86076117, - "learning_rate": 3.84460997382332e-06, - "loss": 0.88444775, - "num_input_tokens_seen": 8190420, - "step": 392, - "time_per_iteration": 2.7171695232391357 - }, - { - "auxiliary_loss_clip": 0.01293283, - "auxiliary_loss_mlp": 0.01084283, - "balance_loss_clip": 1.07891107, - "balance_loss_mlp": 1.04763794, - "epoch": 0.02362843829851195, - "flos": 19062461813760.0, - "grad_norm": 2.038818686720474, - "language_loss": 0.89096916, - "learning_rate": 3.8462503624393256e-06, - "loss": 0.91474473, - "num_input_tokens_seen": 8208790, - "step": 393, - "time_per_iteration": 2.632129669189453 - }, - { - "auxiliary_loss_clip": 0.01304158, - "auxiliary_loss_mlp": 0.01102255, - "balance_loss_clip": 1.08471596, - "balance_loss_mlp": 1.06279635, - "epoch": 0.023688561551179918, - "flos": 16071031296000.0, - "grad_norm": 1.7920692319020195, - "language_loss": 0.8156364, - "learning_rate": 3.84788658233771e-06, - "loss": 0.83970058, - "num_input_tokens_seen": 8226885, - "step": 394, - "time_per_iteration": 2.5932936668395996 - }, - { - "auxiliary_loss_clip": 0.01296851, - "auxiliary_loss_mlp": 0.01088191, - "balance_loss_clip": 1.07939875, - "balance_loss_mlp": 1.04920936, - "epoch": 0.023748684803847887, - "flos": 21724375939200.0, - "grad_norm": 4.539737106404062, - "language_loss": 0.85808635, - "learning_rate": 3.84951865465269e-06, - "loss": 0.88193679, - "num_input_tokens_seen": 8246825, - "step": 395, - "time_per_iteration": 2.6112868785858154 - }, - { - "auxiliary_loss_clip": 0.01194704, - "auxiliary_loss_mlp": 0.01034684, - "balance_loss_clip": 1.07210529, - "balance_loss_mlp": 1.02319229, - "epoch": 0.02380880805651586, - "flos": 61926192881280.0, - "grad_norm": 0.9258089920958834, - "language_loss": 0.6380353, - "learning_rate": 3.851146600358172e-06, - "loss": 0.66032922, - "num_input_tokens_seen": 8302835, - "step": 396, - "time_per_iteration": 3.031489133834839 - }, - { - "auxiliary_loss_clip": 0.0129188, - "auxiliary_loss_mlp": 0.01071022, - "balance_loss_clip": 1.07806754, - "balance_loss_mlp": 1.03447223, - "epoch": 0.023868931309183827, - "flos": 20266331068800.0, - "grad_norm": 2.3741099598177624, - "language_loss": 0.83878696, - "learning_rate": 3.852770440269372e-06, - "loss": 0.86241591, - "num_input_tokens_seen": 8320745, - "step": 397, - "time_per_iteration": 2.6049532890319824 - }, - { - "auxiliary_loss_clip": 0.01297108, - "auxiliary_loss_mlp": 0.01087341, - "balance_loss_clip": 1.08104038, - "balance_loss_mlp": 1.04890823, - "epoch": 0.023929054561851796, - "flos": 21139103733120.0, - "grad_norm": 4.6847154905409205, - "language_loss": 0.84066498, - "learning_rate": 3.854390195044404e-06, - "loss": 0.86450952, - "num_input_tokens_seen": 8339540, - "step": 398, - "time_per_iteration": 2.6516692638397217 - }, - { - "auxiliary_loss_clip": 0.01295876, - "auxiliary_loss_mlp": 0.01078722, - "balance_loss_clip": 1.07671928, - "balance_loss_mlp": 1.04007471, - "epoch": 0.023989177814519765, - "flos": 13698521049600.0, - "grad_norm": 2.80358563189936, - "language_loss": 0.86029691, - "learning_rate": 3.856005885185868e-06, - "loss": 0.88404286, - "num_input_tokens_seen": 8354890, - "step": 399, - "time_per_iteration": 2.5452589988708496 - }, - { - "auxiliary_loss_clip": 0.01292698, - "auxiliary_loss_mlp": 0.01090822, - "balance_loss_clip": 1.08074594, - "balance_loss_mlp": 1.05308056, - "epoch": 0.024049301067187733, - "flos": 26322018929280.0, - "grad_norm": 2.021318687641168, - "language_loss": 0.86254489, - "learning_rate": 3.857617531042398e-06, - "loss": 0.88638014, - "num_input_tokens_seen": 8375845, - "step": 400, - "time_per_iteration": 2.6626927852630615 - }, - { - "auxiliary_loss_clip": 0.01299822, - "auxiliary_loss_mlp": 0.01083301, - "balance_loss_clip": 1.08346462, - "balance_loss_mlp": 1.04687035, - "epoch": 0.024109424319855705, - "flos": 24425432910720.0, - "grad_norm": 1.735822397657743, - "language_loss": 0.79276752, - "learning_rate": 3.8592251528102065e-06, - "loss": 0.81659877, - "num_input_tokens_seen": 8395240, - "step": 401, - "time_per_iteration": 2.68418025970459 - }, - { - "auxiliary_loss_clip": 0.0129275, - "auxiliary_loss_mlp": 0.01091389, - "balance_loss_clip": 1.07852793, - "balance_loss_mlp": 1.05493474, - "epoch": 0.024169547572523674, - "flos": 29604397610880.0, - "grad_norm": 3.889755427752258, - "language_loss": 0.78890866, - "learning_rate": 3.8608287705345976e-06, - "loss": 0.81274998, - "num_input_tokens_seen": 8416950, - "step": 402, - "time_per_iteration": 2.7509379386901855 - }, - { - "auxiliary_loss_clip": 0.01296434, - "auxiliary_loss_mlp": 0.01082712, - "balance_loss_clip": 1.07797897, - "balance_loss_mlp": 1.04399323, - "epoch": 0.024229670825191642, - "flos": 22601458235520.0, - "grad_norm": 2.49356632429363, - "language_loss": 0.94936156, - "learning_rate": 3.86242840411147e-06, - "loss": 0.97315305, - "num_input_tokens_seen": 8433660, - "step": 403, - "time_per_iteration": 2.5760560035705566 - }, - { - "auxiliary_loss_clip": 0.0129994, - "auxiliary_loss_mlp": 0.01091893, - "balance_loss_clip": 1.07754242, - "balance_loss_mlp": 1.05315053, - "epoch": 0.02428979407785961, - "flos": 18150258994560.0, - "grad_norm": 2.361656575803209, - "language_loss": 0.99877387, - "learning_rate": 3.864024073288798e-06, - "loss": 1.0226922, - "num_input_tokens_seen": 8450180, - "step": 404, - "time_per_iteration": 2.5966458320617676 - }, - { - "auxiliary_loss_clip": 0.01298911, - "auxiliary_loss_mlp": 0.01100127, - "balance_loss_clip": 1.08096266, - "balance_loss_mlp": 1.06312442, - "epoch": 0.024349917330527583, - "flos": 15304984917120.0, - "grad_norm": 2.3162348618509276, - "language_loss": 0.8802169, - "learning_rate": 3.865615797668091e-06, - "loss": 0.90420723, - "num_input_tokens_seen": 8467775, - "step": 405, - "time_per_iteration": 2.5728275775909424 - }, - { - "auxiliary_loss_clip": 0.01306827, - "auxiliary_loss_mlp": 0.01097881, - "balance_loss_clip": 1.084512, - "balance_loss_mlp": 1.06004393, - "epoch": 0.024410040583195552, - "flos": 20773892200320.0, - "grad_norm": 2.7399607903318275, - "language_loss": 0.93386561, - "learning_rate": 3.867203596705844e-06, - "loss": 0.95791268, - "num_input_tokens_seen": 8486765, - "step": 406, - "time_per_iteration": 2.612668991088867 - }, - { - "auxiliary_loss_clip": 0.01299426, - "auxiliary_loss_mlp": 0.01088378, - "balance_loss_clip": 1.08213782, - "balance_loss_mlp": 1.0500164, - "epoch": 0.02447016383586352, - "flos": 21798854789760.0, - "grad_norm": 2.1742012769968526, - "language_loss": 0.87128031, - "learning_rate": 3.86878748971496e-06, - "loss": 0.89515841, - "num_input_tokens_seen": 8506515, - "step": 407, - "time_per_iteration": 2.5982017517089844 - }, - { - "auxiliary_loss_clip": 0.01298266, - "auxiliary_loss_mlp": 0.01083858, - "balance_loss_clip": 1.08472157, - "balance_loss_mlp": 1.04630709, - "epoch": 0.02453028708853149, - "flos": 33948116380800.0, - "grad_norm": 2.1458430439144234, - "language_loss": 0.74102569, - "learning_rate": 3.8703674958661596e-06, - "loss": 0.76484692, - "num_input_tokens_seen": 8528035, - "step": 408, - "time_per_iteration": 2.708670139312744 - }, - { - "auxiliary_loss_clip": 0.01300128, - "auxiliary_loss_mlp": 0.01089985, - "balance_loss_clip": 1.08222318, - "balance_loss_mlp": 1.05233896, - "epoch": 0.024590410341199458, - "flos": 21793000872960.0, - "grad_norm": 2.4878473813549675, - "language_loss": 0.92509401, - "learning_rate": 3.871943634189376e-06, - "loss": 0.94899511, - "num_input_tokens_seen": 8546455, - "step": 409, - "time_per_iteration": 2.665321111679077 - }, - { - "auxiliary_loss_clip": 0.01296394, - "auxiliary_loss_mlp": 0.01077538, - "balance_loss_clip": 1.08126342, - "balance_loss_mlp": 1.04291987, - "epoch": 0.02465053359386743, - "flos": 35114782124160.0, - "grad_norm": 2.2521095969191722, - "language_loss": 0.82792604, - "learning_rate": 3.873515923575128e-06, - "loss": 0.85166532, - "num_input_tokens_seen": 8568450, - "step": 410, - "time_per_iteration": 2.848928213119507 - }, - { - "auxiliary_loss_clip": 0.01299459, - "auxiliary_loss_mlp": 0.01089133, - "balance_loss_clip": 1.08187068, - "balance_loss_mlp": 1.05284572, - "epoch": 0.0247106568465354, - "flos": 27451409333760.0, - "grad_norm": 2.1393760271628595, - "language_loss": 0.77577484, - "learning_rate": 3.875084382775879e-06, - "loss": 0.79966074, - "num_input_tokens_seen": 8589340, - "step": 411, - "time_per_iteration": 2.6645278930664062 - }, - { - "auxiliary_loss_clip": 0.01298341, - "auxiliary_loss_mlp": 0.0110154, - "balance_loss_clip": 1.07977521, - "balance_loss_mlp": 1.06289268, - "epoch": 0.024770780099203367, - "flos": 20703794808960.0, - "grad_norm": 2.2974658872162665, - "language_loss": 0.86379063, - "learning_rate": 3.87664903040738e-06, - "loss": 0.88778943, - "num_input_tokens_seen": 8607150, - "step": 412, - "time_per_iteration": 2.6091151237487793 - }, - { - "auxiliary_loss_clip": 0.01187014, - "auxiliary_loss_mlp": 0.01031436, - "balance_loss_clip": 1.07387948, - "balance_loss_mlp": 1.02089787, - "epoch": 0.024830903351871336, - "flos": 69551859369600.0, - "grad_norm": 0.8687159185244209, - "language_loss": 0.5852263, - "learning_rate": 3.878209884949994e-06, - "loss": 0.60741079, - "num_input_tokens_seen": 8669865, - "step": 413, - "time_per_iteration": 3.2269625663757324 - }, - { - "auxiliary_loss_clip": 0.0129043, - "auxiliary_loss_mlp": 0.01091958, - "balance_loss_clip": 1.07709181, - "balance_loss_mlp": 1.05249953, - "epoch": 0.024891026604539304, - "flos": 32270477713920.0, - "grad_norm": 1.8280666153990437, - "language_loss": 0.80517173, - "learning_rate": 3.879766964750006e-06, - "loss": 0.82899559, - "num_input_tokens_seen": 8690235, - "step": 414, - "time_per_iteration": 2.720341444015503 - }, - { - "auxiliary_loss_clip": 0.01287097, - "auxiliary_loss_mlp": 0.0109242, - "balance_loss_clip": 1.0756042, - "balance_loss_mlp": 1.0556556, - "epoch": 0.024951149857207276, - "flos": 18840282238080.0, - "grad_norm": 2.1921003994701302, - "language_loss": 0.80227423, - "learning_rate": 3.881320288020917e-06, - "loss": 0.82606936, - "num_input_tokens_seen": 8706295, - "step": 415, - "time_per_iteration": 2.6473400592803955 - }, - { - "auxiliary_loss_clip": 0.01302694, - "auxiliary_loss_mlp": 0.01082455, - "balance_loss_clip": 1.08156919, - "balance_loss_mlp": 1.04497528, - "epoch": 0.025011273109875245, - "flos": 15377201210880.0, - "grad_norm": 2.9318871737289776, - "language_loss": 0.96236515, - "learning_rate": 3.882869872844723e-06, - "loss": 0.9862166, - "num_input_tokens_seen": 8724200, - "step": 416, - "time_per_iteration": 2.596189260482788 - }, - { - "auxiliary_loss_clip": 0.01291636, - "auxiliary_loss_mlp": 0.01074465, - "balance_loss_clip": 1.07628798, - "balance_loss_mlp": 1.0355792, - "epoch": 0.025071396362543213, - "flos": 18915515274240.0, - "grad_norm": 1.741746736079687, - "language_loss": 0.77381694, - "learning_rate": 3.884415737173176e-06, - "loss": 0.79747796, - "num_input_tokens_seen": 8744170, - "step": 417, - "time_per_iteration": 5.610344171524048 - }, - { - "auxiliary_loss_clip": 0.01290746, - "auxiliary_loss_mlp": 0.0109022, - "balance_loss_clip": 1.08072221, - "balance_loss_mlp": 1.05264485, - "epoch": 0.025131519615211182, - "flos": 25337958952320.0, - "grad_norm": 1.554385639735456, - "language_loss": 0.77076226, - "learning_rate": 3.8859578988290344e-06, - "loss": 0.79457194, - "num_input_tokens_seen": 8765120, - "step": 418, - "time_per_iteration": 5.837290525436401 - }, - { - "auxiliary_loss_clip": 0.01297026, - "auxiliary_loss_mlp": 0.01071197, - "balance_loss_clip": 1.08019948, - "balance_loss_mlp": 1.03550553, - "epoch": 0.02519164286787915, - "flos": 18953149749120.0, - "grad_norm": 2.4603268634516207, - "language_loss": 0.81445098, - "learning_rate": 3.887496375507294e-06, - "loss": 0.83813322, - "num_input_tokens_seen": 8783500, - "step": 419, - "time_per_iteration": 2.582590341567993 - }, - { - "auxiliary_loss_clip": 0.01291114, - "auxiliary_loss_mlp": 0.01086736, - "balance_loss_clip": 1.07929599, - "balance_loss_mlp": 1.04708743, - "epoch": 0.025251766120547123, - "flos": 17421092904960.0, - "grad_norm": 1.8078532084212713, - "language_loss": 0.73618573, - "learning_rate": 3.8890311847764065e-06, - "loss": 0.75996423, - "num_input_tokens_seen": 8801175, - "step": 420, - "time_per_iteration": 2.6739418506622314 - }, - { - "auxiliary_loss_clip": 0.01290485, - "auxiliary_loss_mlp": 0.01096292, - "balance_loss_clip": 1.07605243, - "balance_loss_mlp": 1.05924153, - "epoch": 0.02531188937321509, - "flos": 25045430590080.0, - "grad_norm": 1.77336014903074, - "language_loss": 0.79040134, - "learning_rate": 3.890562344079484e-06, - "loss": 0.81426907, - "num_input_tokens_seen": 8820215, - "step": 421, - "time_per_iteration": 2.6928632259368896 - }, - { - "auxiliary_loss_clip": 0.01290689, - "auxiliary_loss_mlp": 0.01088863, - "balance_loss_clip": 1.07922924, - "balance_loss_mlp": 1.04983425, - "epoch": 0.02537201262588306, - "flos": 30592228515840.0, - "grad_norm": 2.2139016136437104, - "language_loss": 0.8203755, - "learning_rate": 3.89208987073549e-06, - "loss": 0.84417105, - "num_input_tokens_seen": 8839660, - "step": 422, - "time_per_iteration": 2.714707851409912 - }, - { - "auxiliary_loss_clip": 0.01293659, - "auxiliary_loss_mlp": 0.01078975, - "balance_loss_clip": 1.07677865, - "balance_loss_mlp": 1.04430926, - "epoch": 0.02543213587855103, - "flos": 26065365275520.0, - "grad_norm": 2.1259138778576356, - "language_loss": 0.83458018, - "learning_rate": 3.893613781940409e-06, - "loss": 0.85830647, - "num_input_tokens_seen": 8859280, - "step": 423, - "time_per_iteration": 2.652757167816162 - }, - { - "auxiliary_loss_clip": 0.01287497, - "auxiliary_loss_mlp": 0.01078335, - "balance_loss_clip": 1.0742569, - "balance_loss_mlp": 1.04221487, - "epoch": 0.025492259131218997, - "flos": 36022818965760.0, - "grad_norm": 2.012741083661608, - "language_loss": 0.74129444, - "learning_rate": 3.895134094768415e-06, - "loss": 0.76495278, - "num_input_tokens_seen": 8880560, - "step": 424, - "time_per_iteration": 2.7724521160125732 - }, - { - "auxiliary_loss_clip": 0.01296446, - "auxiliary_loss_mlp": 0.01093799, - "balance_loss_clip": 1.07987142, - "balance_loss_mlp": 1.05782199, - "epoch": 0.02555238238388697, - "flos": 18588045957120.0, - "grad_norm": 4.623670538116741, - "language_loss": 0.83193713, - "learning_rate": 3.896650826173015e-06, - "loss": 0.85583955, - "num_input_tokens_seen": 8899155, - "step": 425, - "time_per_iteration": 2.608029842376709 - }, - { - "auxiliary_loss_clip": 0.01292462, - "auxiliary_loss_mlp": 0.01092376, - "balance_loss_clip": 1.07259536, - "balance_loss_mlp": 1.0544672, - "epoch": 0.025612505636554938, - "flos": 24243186280320.0, - "grad_norm": 2.5075767706443566, - "language_loss": 0.853073, - "learning_rate": 3.898163992988186e-06, - "loss": 0.87692136, - "num_input_tokens_seen": 8917890, - "step": 426, - "time_per_iteration": 2.6445271968841553 - }, - { - "auxiliary_loss_clip": 0.01175923, - "auxiliary_loss_mlp": 0.01017688, - "balance_loss_clip": 1.06532824, - "balance_loss_mlp": 1.00781715, - "epoch": 0.025672628889222907, - "flos": 60586941265920.0, - "grad_norm": 0.8949637292547264, - "language_loss": 0.57219732, - "learning_rate": 3.899673611929491e-06, - "loss": 0.5941335, - "num_input_tokens_seen": 8978260, - "step": 427, - "time_per_iteration": 3.2690517902374268 - }, - { - "auxiliary_loss_clip": 0.01291989, - "auxiliary_loss_mlp": 0.01092649, - "balance_loss_clip": 1.08155811, - "balance_loss_mlp": 1.05674267, - "epoch": 0.025732752141890875, - "flos": 19573255169280.0, - "grad_norm": 2.4869215225306673, - "language_loss": 0.88130605, - "learning_rate": 3.901179699595194e-06, - "loss": 0.90515244, - "num_input_tokens_seen": 8994460, - "step": 428, - "time_per_iteration": 2.6143813133239746 - }, - { - "auxiliary_loss_clip": 0.01283603, - "auxiliary_loss_mlp": 0.0107531, - "balance_loss_clip": 1.07418942, - "balance_loss_mlp": 1.03735399, - "epoch": 0.025792875394558847, - "flos": 31284262920960.0, - "grad_norm": 2.067247304638145, - "language_loss": 0.85790849, - "learning_rate": 3.902682272467353e-06, - "loss": 0.88149762, - "num_input_tokens_seen": 9016670, - "step": 429, - "time_per_iteration": 2.749328374862671 - }, - { - "auxiliary_loss_clip": 0.01288943, - "auxiliary_loss_mlp": 0.01083888, - "balance_loss_clip": 1.07337689, - "balance_loss_mlp": 1.04590786, - "epoch": 0.025852998647226816, - "flos": 32379610210560.0, - "grad_norm": 2.4411876712444034, - "language_loss": 0.8815223, - "learning_rate": 3.904181346912895e-06, - "loss": 0.90525061, - "num_input_tokens_seen": 9039720, - "step": 430, - "time_per_iteration": 2.7483572959899902 - }, - { - "auxiliary_loss_clip": 0.01290726, - "auxiliary_loss_mlp": 0.01080495, - "balance_loss_clip": 1.0803287, - "balance_loss_mlp": 1.04573333, - "epoch": 0.025913121899894784, - "flos": 20193288762240.0, - "grad_norm": 2.086180078538185, - "language_loss": 0.84249514, - "learning_rate": 3.905676939184698e-06, - "loss": 0.8662073, - "num_input_tokens_seen": 9059850, - "step": 431, - "time_per_iteration": 2.6531126499176025 - }, - { - "auxiliary_loss_clip": 0.01286945, - "auxiliary_loss_mlp": 0.01073345, - "balance_loss_clip": 1.07570636, - "balance_loss_mlp": 1.03951311, - "epoch": 0.025973245152562753, - "flos": 14720430983040.0, - "grad_norm": 2.681931959502968, - "language_loss": 0.86511916, - "learning_rate": 3.907169065422638e-06, - "loss": 0.88872206, - "num_input_tokens_seen": 9077590, - "step": 432, - "time_per_iteration": 2.7582762241363525 - }, - { - "auxiliary_loss_clip": 0.01287429, - "auxiliary_loss_mlp": 0.01072961, - "balance_loss_clip": 1.07632601, - "balance_loss_mlp": 1.03891492, - "epoch": 0.02603336840523072, - "flos": 30992991534720.0, - "grad_norm": 1.95596969308187, - "language_loss": 0.76036298, - "learning_rate": 3.908657741654636e-06, - "loss": 0.7839669, - "num_input_tokens_seen": 9099880, - "step": 433, - "time_per_iteration": 2.707771062850952 - }, - { - "auxiliary_loss_clip": 0.01289436, - "auxiliary_loss_mlp": 0.01088504, - "balance_loss_clip": 1.07470191, - "balance_loss_mlp": 1.04973757, - "epoch": 0.026093491657898694, - "flos": 17674262939520.0, - "grad_norm": 2.157056093147959, - "language_loss": 0.8979522, - "learning_rate": 3.910142983797699e-06, - "loss": 0.92173159, - "num_input_tokens_seen": 9118620, - "step": 434, - "time_per_iteration": 2.5665409564971924 - }, - { - "auxiliary_loss_clip": 0.01289617, - "auxiliary_loss_mlp": 0.01096405, - "balance_loss_clip": 1.07960439, - "balance_loss_mlp": 1.05904448, - "epoch": 0.026153614910566662, - "flos": 17857874286720.0, - "grad_norm": 2.306071945033866, - "language_loss": 0.80187833, - "learning_rate": 3.9116248076589305e-06, - "loss": 0.82573849, - "num_input_tokens_seen": 9135655, - "step": 435, - "time_per_iteration": 2.614440679550171 - }, - { - "auxiliary_loss_clip": 0.01285396, - "auxiliary_loss_mlp": 0.01092207, - "balance_loss_clip": 1.07367229, - "balance_loss_mlp": 1.05503798, - "epoch": 0.02621373816323463, - "flos": 20011113959040.0, - "grad_norm": 3.0257040949539356, - "language_loss": 0.86361396, - "learning_rate": 3.913103228936546e-06, - "loss": 0.88739002, - "num_input_tokens_seen": 9153520, - "step": 436, - "time_per_iteration": 2.635033130645752 - }, - { - "auxiliary_loss_clip": 0.01289558, - "auxiliary_loss_mlp": 0.01096903, - "balance_loss_clip": 1.07716811, - "balance_loss_mlp": 1.06080687, - "epoch": 0.0262738614159026, - "flos": 19281193683840.0, - "grad_norm": 2.4233286399217993, - "language_loss": 0.74725163, - "learning_rate": 3.914578263220868e-06, - "loss": 0.77111626, - "num_input_tokens_seen": 9170750, - "step": 437, - "time_per_iteration": 2.6614880561828613 - }, - { - "auxiliary_loss_clip": 0.01286403, - "auxiliary_loss_mlp": 0.01100399, - "balance_loss_clip": 1.07628679, - "balance_loss_mlp": 1.06220388, - "epoch": 0.026333984668570568, - "flos": 18807208790400.0, - "grad_norm": 2.79370908187484, - "language_loss": 0.9131338, - "learning_rate": 3.916049925995316e-06, - "loss": 0.93700182, - "num_input_tokens_seen": 9188430, - "step": 438, - "time_per_iteration": 2.674877166748047 - }, - { - "auxiliary_loss_clip": 0.01169678, - "auxiliary_loss_mlp": 0.01072518, - "balance_loss_clip": 1.0602653, - "balance_loss_mlp": 1.06250465, - "epoch": 0.02639410792123854, - "flos": 64572020691840.0, - "grad_norm": 0.8871275810137318, - "language_loss": 0.62631273, - "learning_rate": 3.917518232637377e-06, - "loss": 0.64873469, - "num_input_tokens_seen": 9255835, - "step": 439, - "time_per_iteration": 3.2527849674224854 - }, - { - "auxiliary_loss_clip": 0.01296492, - "auxiliary_loss_mlp": 0.01095184, - "balance_loss_clip": 1.08175814, - "balance_loss_mlp": 1.05758572, - "epoch": 0.02645423117390651, - "flos": 28473462921600.0, - "grad_norm": 3.31985956061953, - "language_loss": 0.75982475, - "learning_rate": 3.918983198419573e-06, - "loss": 0.78374153, - "num_input_tokens_seen": 9276835, - "step": 440, - "time_per_iteration": 2.6770262718200684 - }, - { - "auxiliary_loss_clip": 0.01286342, - "auxiliary_loss_mlp": 0.01076505, - "balance_loss_clip": 1.07652593, - "balance_loss_mlp": 1.04048026, - "epoch": 0.026514354426574478, - "flos": 18551237495040.0, - "grad_norm": 3.0236705091068283, - "language_loss": 0.83197021, - "learning_rate": 3.920444838510415e-06, - "loss": 0.85559869, - "num_input_tokens_seen": 9295075, - "step": 441, - "time_per_iteration": 2.591306209564209 - }, - { - "auxiliary_loss_clip": 0.01291817, - "auxiliary_loss_mlp": 0.01086154, - "balance_loss_clip": 1.07703269, - "balance_loss_mlp": 1.04829359, - "epoch": 0.026574477679242446, - "flos": 20667812359680.0, - "grad_norm": 2.202684635319811, - "language_loss": 0.78490162, - "learning_rate": 3.92190316797534e-06, - "loss": 0.80868137, - "num_input_tokens_seen": 9314205, - "step": 442, - "time_per_iteration": 2.633054733276367 - }, - { - "auxiliary_loss_clip": 0.0116251, - "auxiliary_loss_mlp": 0.01015158, - "balance_loss_clip": 1.05336332, - "balance_loss_mlp": 1.0054301, - "epoch": 0.026634600931910415, - "flos": 57956125340160.0, - "grad_norm": 0.9609264438471399, - "language_loss": 0.64459753, - "learning_rate": 3.92335820177765e-06, - "loss": 0.66637421, - "num_input_tokens_seen": 9367395, - "step": 443, - "time_per_iteration": 3.1241400241851807 - }, - { - "auxiliary_loss_clip": 0.01291897, - "auxiliary_loss_mlp": 0.01085882, - "balance_loss_clip": 1.08147204, - "balance_loss_mlp": 1.04906964, - "epoch": 0.026694724184578387, - "flos": 15815131827840.0, - "grad_norm": 2.121488874389134, - "language_loss": 0.82093638, - "learning_rate": 3.924809954779425e-06, - "loss": 0.84471416, - "num_input_tokens_seen": 9385185, - "step": 444, - "time_per_iteration": 2.6202428340911865 - }, - { - "auxiliary_loss_clip": 0.0129406, - "auxiliary_loss_mlp": 0.01082041, - "balance_loss_clip": 1.07940578, - "balance_loss_mlp": 1.04263067, - "epoch": 0.026754847437246355, - "flos": 23440259612160.0, - "grad_norm": 2.2213674770888607, - "language_loss": 0.95689106, - "learning_rate": 3.9262584417424425e-06, - "loss": 0.98065209, - "num_input_tokens_seen": 9403225, - "step": 445, - "time_per_iteration": 2.6071228981018066 - }, - { - "auxiliary_loss_clip": 0.01289866, - "auxiliary_loss_mlp": 0.01094053, - "balance_loss_clip": 1.07953668, - "balance_loss_mlp": 1.05492878, - "epoch": 0.026814970689914324, - "flos": 17341801632000.0, - "grad_norm": 2.775359545549618, - "language_loss": 0.91932094, - "learning_rate": 3.9277036773290725e-06, - "loss": 0.94316012, - "num_input_tokens_seen": 9420540, - "step": 446, - "time_per_iteration": 2.5791916847229004 - }, - { - "auxiliary_loss_clip": 0.01289847, - "auxiliary_loss_mlp": 0.01088114, - "balance_loss_clip": 1.08072042, - "balance_loss_mlp": 1.05092025, - "epoch": 0.026875093942582293, - "flos": 17894718662400.0, - "grad_norm": 2.0562763127679204, - "language_loss": 0.79831308, - "learning_rate": 3.92914567610317e-06, - "loss": 0.82209271, - "num_input_tokens_seen": 9438840, - "step": 447, - "time_per_iteration": 2.6420843601226807 - }, - { - "auxiliary_loss_clip": 0.01289397, - "auxiliary_loss_mlp": 0.01079607, - "balance_loss_clip": 1.07901013, - "balance_loss_mlp": 1.04446411, - "epoch": 0.026935217195250265, - "flos": 21723980889600.0, - "grad_norm": 2.231264914467203, - "language_loss": 0.86402845, - "learning_rate": 3.930584452530952e-06, - "loss": 0.8877185, - "num_input_tokens_seen": 9457215, - "step": 448, - "time_per_iteration": 2.590277910232544 - }, - { - "auxiliary_loss_clip": 0.01282455, - "auxiliary_loss_mlp": 0.01091099, - "balance_loss_clip": 1.07706833, - "balance_loss_mlp": 1.05662322, - "epoch": 0.026995340447918233, - "flos": 23622685810560.0, - "grad_norm": 1.941778256808524, - "language_loss": 0.88581634, - "learning_rate": 3.9320200209818755e-06, - "loss": 0.90955186, - "num_input_tokens_seen": 9475615, - "step": 449, - "time_per_iteration": 2.610065460205078 - }, - { - "auxiliary_loss_clip": 0.01293472, - "auxiliary_loss_mlp": 0.01085576, - "balance_loss_clip": 1.07856452, - "balance_loss_mlp": 1.04814398, - "epoch": 0.027055463700586202, - "flos": 17931275729280.0, - "grad_norm": 2.199007921978797, - "language_loss": 0.80395782, - "learning_rate": 3.933452395729493e-06, - "loss": 0.8277483, - "num_input_tokens_seen": 9493975, - "step": 450, - "time_per_iteration": 2.637465238571167 - }, - { - "auxiliary_loss_clip": 0.01284612, - "auxiliary_loss_mlp": 0.0108001, - "balance_loss_clip": 1.08025336, - "balance_loss_mlp": 1.04384232, - "epoch": 0.02711558695325417, - "flos": 25118903859840.0, - "grad_norm": 1.599374223212879, - "language_loss": 0.81562543, - "learning_rate": 3.934881590952304e-06, - "loss": 0.83927161, - "num_input_tokens_seen": 9514810, - "step": 451, - "time_per_iteration": 2.6506927013397217 - }, - { - "auxiliary_loss_clip": 0.0128567, - "auxiliary_loss_mlp": 0.01090719, - "balance_loss_clip": 1.08126068, - "balance_loss_mlp": 1.0533824, - "epoch": 0.02717571020592214, - "flos": 24239559006720.0, - "grad_norm": 1.9677929562692107, - "language_loss": 0.77019048, - "learning_rate": 3.936307620734599e-06, - "loss": 0.79395437, - "num_input_tokens_seen": 9533635, - "step": 452, - "time_per_iteration": 2.5751442909240723 - }, - { - "auxiliary_loss_clip": 0.01286865, - "auxiliary_loss_mlp": 0.01088287, - "balance_loss_clip": 1.08011293, - "balance_loss_mlp": 1.05135596, - "epoch": 0.02723583345859011, - "flos": 25118939773440.0, - "grad_norm": 1.7205362750177517, - "language_loss": 0.72874546, - "learning_rate": 3.937730499067294e-06, - "loss": 0.75249696, - "num_input_tokens_seen": 9555420, - "step": 453, - "time_per_iteration": 2.668083667755127 - }, - { - "auxiliary_loss_clip": 0.01281405, - "auxiliary_loss_mlp": 0.01083223, - "balance_loss_clip": 1.07714963, - "balance_loss_mlp": 1.04748416, - "epoch": 0.02729595671125808, - "flos": 42741597847680.0, - "grad_norm": 1.8353680194819204, - "language_loss": 0.82419729, - "learning_rate": 3.939150239848748e-06, - "loss": 0.84784359, - "num_input_tokens_seen": 9578950, - "step": 454, - "time_per_iteration": 2.8580126762390137 - }, - { - "auxiliary_loss_clip": 0.01285525, - "auxiliary_loss_mlp": 0.01077241, - "balance_loss_clip": 1.07935429, - "balance_loss_mlp": 1.043648, - "epoch": 0.02735607996392605, - "flos": 21430985650560.0, - "grad_norm": 1.985829769195046, - "language_loss": 0.75404847, - "learning_rate": 3.9405668568855866e-06, - "loss": 0.77767611, - "num_input_tokens_seen": 9598160, - "step": 455, - "time_per_iteration": 2.6593477725982666 - }, - { - "auxiliary_loss_clip": 0.01282853, - "auxiliary_loss_mlp": 0.01094959, - "balance_loss_clip": 1.07477236, - "balance_loss_mlp": 1.0597918, - "epoch": 0.027416203216594017, - "flos": 20851280052480.0, - "grad_norm": 1.92483069519606, - "language_loss": 0.80670613, - "learning_rate": 3.941980363893499e-06, - "loss": 0.83048427, - "num_input_tokens_seen": 9616010, - "step": 456, - "time_per_iteration": 2.6798384189605713 - }, - { - "auxiliary_loss_clip": 0.01280135, - "auxiliary_loss_mlp": 0.01080319, - "balance_loss_clip": 1.07714963, - "balance_loss_mlp": 1.0435549, - "epoch": 0.027476326469261986, - "flos": 13224500242560.0, - "grad_norm": 2.171481572134165, - "language_loss": 0.81587321, - "learning_rate": 3.9433907744980384e-06, - "loss": 0.83947778, - "num_input_tokens_seen": 9634000, - "step": 457, - "time_per_iteration": 5.62308406829834 - }, - { - "auxiliary_loss_clip": 0.01283922, - "auxiliary_loss_mlp": 0.01084055, - "balance_loss_clip": 1.07603848, - "balance_loss_mlp": 1.04891229, - "epoch": 0.027536449721929958, - "flos": 24024526237440.0, - "grad_norm": 2.024184269172234, - "language_loss": 0.94030929, - "learning_rate": 3.944798102235412e-06, - "loss": 0.96398914, - "num_input_tokens_seen": 9653455, - "step": 458, - "time_per_iteration": 5.694372653961182 - }, - { - "auxiliary_loss_clip": 0.01280807, - "auxiliary_loss_mlp": 0.01091426, - "balance_loss_clip": 1.07479525, - "balance_loss_mlp": 1.05666471, - "epoch": 0.027596572974597926, - "flos": 13006055681280.0, - "grad_norm": 2.356061876390436, - "language_loss": 0.79279089, - "learning_rate": 3.9462023605532545e-06, - "loss": 0.81651318, - "num_input_tokens_seen": 9669650, - "step": 459, - "time_per_iteration": 2.626948595046997 - }, - { - "auxiliary_loss_clip": 0.01286253, - "auxiliary_loss_mlp": 0.01081623, - "balance_loss_clip": 1.08119941, - "balance_loss_mlp": 1.04278445, - "epoch": 0.027656696227265895, - "flos": 26143076350080.0, - "grad_norm": 2.0583603779546404, - "language_loss": 0.83362132, - "learning_rate": 3.947603562811407e-06, - "loss": 0.85730016, - "num_input_tokens_seen": 9691415, - "step": 460, - "time_per_iteration": 2.7191598415374756 - }, - { - "auxiliary_loss_clip": 0.01158037, - "auxiliary_loss_mlp": 0.01054463, - "balance_loss_clip": 1.05032754, - "balance_loss_mlp": 1.044402, - "epoch": 0.027716819479933864, - "flos": 60697222997760.0, - "grad_norm": 1.612511499168885, - "language_loss": 0.7351321, - "learning_rate": 3.949001722282675e-06, - "loss": 0.7572571, - "num_input_tokens_seen": 9755605, - "step": 461, - "time_per_iteration": 3.210820436477661 - }, - { - "auxiliary_loss_clip": 0.01284234, - "auxiliary_loss_mlp": 0.01079832, - "balance_loss_clip": 1.08432341, - "balance_loss_mlp": 1.04700136, - "epoch": 0.027776942732601832, - "flos": 31211938886400.0, - "grad_norm": 2.4500038571081073, - "language_loss": 0.81596625, - "learning_rate": 3.950396852153582e-06, - "loss": 0.839607, - "num_input_tokens_seen": 9776270, - "step": 462, - "time_per_iteration": 2.683197021484375 - }, - { - "auxiliary_loss_clip": 0.01280414, - "auxiliary_loss_mlp": 0.0107864, - "balance_loss_clip": 1.07752454, - "balance_loss_mlp": 1.0454762, - "epoch": 0.027837065985269804, - "flos": 22674644196480.0, - "grad_norm": 2.258526594266715, - "language_loss": 0.90062451, - "learning_rate": 3.951788965525118e-06, - "loss": 0.92421508, - "num_input_tokens_seen": 9794465, - "step": 463, - "time_per_iteration": 2.641674757003784 - }, - { - "auxiliary_loss_clip": 0.01151842, - "auxiliary_loss_mlp": 0.01010002, - "balance_loss_clip": 1.04755902, - "balance_loss_mlp": 1.00027454, - "epoch": 0.027897189237937773, - "flos": 62182487399040.0, - "grad_norm": 0.8962796480673014, - "language_loss": 0.59058654, - "learning_rate": 3.953178075413476e-06, - "loss": 0.61220491, - "num_input_tokens_seen": 9849685, - "step": 464, - "time_per_iteration": 3.1129612922668457 - }, - { - "auxiliary_loss_clip": 0.01292933, - "auxiliary_loss_mlp": 0.01100533, - "balance_loss_clip": 1.08296049, - "balance_loss_mlp": 1.06412649, - "epoch": 0.02795731249060574, - "flos": 24493160004480.0, - "grad_norm": 2.3712654859298055, - "language_loss": 0.81454253, - "learning_rate": 3.954564194750784e-06, - "loss": 0.83847719, - "num_input_tokens_seen": 9869505, - "step": 465, - "time_per_iteration": 2.723144769668579 - }, - { - "auxiliary_loss_clip": 0.01279938, - "auxiliary_loss_mlp": 0.01092668, - "balance_loss_clip": 1.07546401, - "balance_loss_mlp": 1.05630863, - "epoch": 0.02801743574327371, - "flos": 23733003456000.0, - "grad_norm": 1.9968224423519798, - "language_loss": 0.78396618, - "learning_rate": 3.955947336385828e-06, - "loss": 0.80769229, - "num_input_tokens_seen": 9890950, - "step": 466, - "time_per_iteration": 2.6278555393218994 - }, - { - "auxiliary_loss_clip": 0.0127853, - "auxiliary_loss_mlp": 0.01091802, - "balance_loss_clip": 1.07703936, - "balance_loss_mlp": 1.05661178, - "epoch": 0.02807755899594168, - "flos": 20629100476800.0, - "grad_norm": 2.010021605622182, - "language_loss": 0.87699366, - "learning_rate": 3.957327513084761e-06, - "loss": 0.90069699, - "num_input_tokens_seen": 9911265, - "step": 467, - "time_per_iteration": 2.6687490940093994 - }, - { - "auxiliary_loss_clip": 0.01285129, - "auxiliary_loss_mlp": 0.01112935, - "balance_loss_clip": 1.07874036, - "balance_loss_mlp": 1.07576585, - "epoch": 0.02813768224860965, - "flos": 19244564789760.0, - "grad_norm": 2.2302958424490416, - "language_loss": 0.86091757, - "learning_rate": 3.958704737531818e-06, - "loss": 0.88489819, - "num_input_tokens_seen": 9929025, - "step": 468, - "time_per_iteration": 2.5745644569396973 - }, - { - "auxiliary_loss_clip": 0.01281128, - "auxiliary_loss_mlp": 0.01085455, - "balance_loss_clip": 1.07529211, - "balance_loss_mlp": 1.04857147, - "epoch": 0.02819780550127762, - "flos": 20813968800000.0, - "grad_norm": 2.1866562002509875, - "language_loss": 0.91690558, - "learning_rate": 3.9600790223300065e-06, - "loss": 0.94057143, - "num_input_tokens_seen": 9945190, - "step": 469, - "time_per_iteration": 2.610821008682251 - }, - { - "auxiliary_loss_clip": 0.0127909, - "auxiliary_loss_mlp": 0.0110095, - "balance_loss_clip": 1.07675052, - "balance_loss_mlp": 1.06482995, - "epoch": 0.028257928753945588, - "flos": 19974125928960.0, - "grad_norm": 2.674428223667968, - "language_loss": 0.81758964, - "learning_rate": 3.96145038000181e-06, - "loss": 0.84139001, - "num_input_tokens_seen": 9962820, - "step": 470, - "time_per_iteration": 2.6004326343536377 - }, - { - "auxiliary_loss_clip": 0.0128074, - "auxiliary_loss_mlp": 0.01086643, - "balance_loss_clip": 1.07482624, - "balance_loss_mlp": 1.04947352, - "epoch": 0.028318052006613557, - "flos": 20484488321280.0, - "grad_norm": 1.788793606991614, - "language_loss": 0.93071401, - "learning_rate": 3.962818822989861e-06, - "loss": 0.95438784, - "num_input_tokens_seen": 9982595, - "step": 471, - "time_per_iteration": 2.556288719177246 - }, - { - "auxiliary_loss_clip": 0.01273697, - "auxiliary_loss_mlp": 0.0110454, - "balance_loss_clip": 1.07223165, - "balance_loss_mlp": 1.06884849, - "epoch": 0.02837817525928153, - "flos": 28514832410880.0, - "grad_norm": 1.8550872135639116, - "language_loss": 0.7613501, - "learning_rate": 3.964184363657625e-06, - "loss": 0.78513247, - "num_input_tokens_seen": 10004645, - "step": 472, - "time_per_iteration": 2.667804002761841 - }, - { - "auxiliary_loss_clip": 0.01280341, - "auxiliary_loss_mlp": 0.01090649, - "balance_loss_clip": 1.07279634, - "balance_loss_mlp": 1.05624473, - "epoch": 0.028438298511949497, - "flos": 18551668458240.0, - "grad_norm": 1.9914661475951314, - "language_loss": 0.93097353, - "learning_rate": 3.965547014290071e-06, - "loss": 0.95468336, - "num_input_tokens_seen": 10022555, - "step": 473, - "time_per_iteration": 2.6402342319488525 - }, - { - "auxiliary_loss_clip": 0.01287339, - "auxiliary_loss_mlp": 0.01124194, - "balance_loss_clip": 1.07773685, - "balance_loss_mlp": 1.08979011, - "epoch": 0.028498421764617466, - "flos": 16910227722240.0, - "grad_norm": 3.2560638787193237, - "language_loss": 0.88488632, - "learning_rate": 3.96690678709433e-06, - "loss": 0.90900171, - "num_input_tokens_seen": 10041025, - "step": 474, - "time_per_iteration": 2.5853888988494873 - }, - { - "auxiliary_loss_clip": 0.0127783, - "auxiliary_loss_mlp": 0.01093132, - "balance_loss_clip": 1.07535374, - "balance_loss_mlp": 1.05620146, - "epoch": 0.028558545017285435, - "flos": 27778699082880.0, - "grad_norm": 3.1427023167402006, - "language_loss": 0.78901398, - "learning_rate": 3.968263694200355e-06, - "loss": 0.81272364, - "num_input_tokens_seen": 10060775, - "step": 475, - "time_per_iteration": 2.654519557952881 - }, - { - "auxiliary_loss_clip": 0.01148107, - "auxiliary_loss_mlp": 0.01095224, - "balance_loss_clip": 1.04505777, - "balance_loss_mlp": 1.08583021, - "epoch": 0.028618668269953403, - "flos": 65654367258240.0, - "grad_norm": 0.9280065830162254, - "language_loss": 0.66926932, - "learning_rate": 3.969617747661569e-06, - "loss": 0.6917026, - "num_input_tokens_seen": 10120225, - "step": 476, - "time_per_iteration": 3.1292569637298584 - }, - { - "auxiliary_loss_clip": 0.01279748, - "auxiliary_loss_mlp": 0.01088794, - "balance_loss_clip": 1.07638311, - "balance_loss_mlp": 1.05188656, - "epoch": 0.028678791522621375, - "flos": 21937074324480.0, - "grad_norm": 2.985672001195028, - "language_loss": 0.83807188, - "learning_rate": 3.970968959455509e-06, - "loss": 0.86175728, - "num_input_tokens_seen": 10137880, - "step": 477, - "time_per_iteration": 2.651493549346924 - }, - { - "auxiliary_loss_clip": 0.01284956, - "auxiliary_loss_mlp": 0.0108711, - "balance_loss_clip": 1.07924342, - "balance_loss_mlp": 1.05089426, - "epoch": 0.028738914775289344, - "flos": 24572128055040.0, - "grad_norm": 2.1929055744411943, - "language_loss": 0.8233152, - "learning_rate": 3.97231734148446e-06, - "loss": 0.84703588, - "num_input_tokens_seen": 10156930, - "step": 478, - "time_per_iteration": 2.6986753940582275 - }, - { - "auxiliary_loss_clip": 0.01277687, - "auxiliary_loss_mlp": 0.01080644, - "balance_loss_clip": 1.07448888, - "balance_loss_mlp": 1.04500043, - "epoch": 0.028799038027957313, - "flos": 23257977068160.0, - "grad_norm": 4.057107988717453, - "language_loss": 0.81195259, - "learning_rate": 3.973662905576082e-06, - "loss": 0.83553594, - "num_input_tokens_seen": 10176295, - "step": 479, - "time_per_iteration": 2.6321041584014893 - }, - { - "auxiliary_loss_clip": 0.01273765, - "auxiliary_loss_mlp": 0.01083313, - "balance_loss_clip": 1.07335579, - "balance_loss_mlp": 1.04552341, - "epoch": 0.02885916128062528, - "flos": 22164102236160.0, - "grad_norm": 2.352225573775279, - "language_loss": 0.7335608, - "learning_rate": 3.975005663484038e-06, - "loss": 0.75713164, - "num_input_tokens_seen": 10195790, - "step": 480, - "time_per_iteration": 2.650696277618408 - }, - { - "auxiliary_loss_clip": 0.01273107, - "auxiliary_loss_mlp": 0.01075586, - "balance_loss_clip": 1.07424879, - "balance_loss_mlp": 1.04277968, - "epoch": 0.02891928453329325, - "flos": 22932842135040.0, - "grad_norm": 1.867890428108999, - "language_loss": 0.87560165, - "learning_rate": 3.976345626888605e-06, - "loss": 0.89908862, - "num_input_tokens_seen": 10218405, - "step": 481, - "time_per_iteration": 2.6585533618927 - }, - { - "auxiliary_loss_clip": 0.01142103, - "auxiliary_loss_mlp": 0.01017301, - "balance_loss_clip": 1.04286921, - "balance_loss_mlp": 1.00895679, - "epoch": 0.028979407785961222, - "flos": 57432941792640.0, - "grad_norm": 0.8486437303263991, - "language_loss": 0.66030192, - "learning_rate": 3.9776828073972864e-06, - "loss": 0.68189597, - "num_input_tokens_seen": 10271005, - "step": 482, - "time_per_iteration": 2.9788918495178223 - }, - { - "auxiliary_loss_clip": 0.01287904, - "auxiliary_loss_mlp": 0.01082416, - "balance_loss_clip": 1.07739437, - "balance_loss_mlp": 1.04868007, - "epoch": 0.02903953103862919, - "flos": 16722737706240.0, - "grad_norm": 2.6473263724689873, - "language_loss": 0.7899214, - "learning_rate": 3.979017216545415e-06, - "loss": 0.81362462, - "num_input_tokens_seen": 10288405, - "step": 483, - "time_per_iteration": 2.5642752647399902 - }, - { - "auxiliary_loss_clip": 0.01283775, - "auxiliary_loss_mlp": 0.01097438, - "balance_loss_clip": 1.07794189, - "balance_loss_mlp": 1.06155562, - "epoch": 0.02909965429129716, - "flos": 16763640318720.0, - "grad_norm": 2.6777328906555766, - "language_loss": 0.75510043, - "learning_rate": 3.980348865796749e-06, - "loss": 0.77891254, - "num_input_tokens_seen": 10306875, - "step": 484, - "time_per_iteration": 2.608337640762329 - }, - { - "auxiliary_loss_clip": 0.0127962, - "auxiliary_loss_mlp": 0.01081582, - "balance_loss_clip": 1.07543373, - "balance_loss_mlp": 1.04760778, - "epoch": 0.029159777543965128, - "flos": 19785343023360.0, - "grad_norm": 2.3457282915841113, - "language_loss": 0.8378315, - "learning_rate": 3.9816777665440615e-06, - "loss": 0.86144352, - "num_input_tokens_seen": 10323965, - "step": 485, - "time_per_iteration": 2.591409921646118 - }, - { - "auxiliary_loss_clip": 0.01282377, - "auxiliary_loss_mlp": 0.01084922, - "balance_loss_clip": 1.08029485, - "balance_loss_mlp": 1.04956484, - "epoch": 0.029219900796633096, - "flos": 19642670202240.0, - "grad_norm": 2.044831141886674, - "language_loss": 0.84432101, - "learning_rate": 3.983003930109732e-06, - "loss": 0.86799401, - "num_input_tokens_seen": 10342620, - "step": 486, - "time_per_iteration": 2.7101452350616455 - }, - { - "auxiliary_loss_clip": 0.01276806, - "auxiliary_loss_mlp": 0.01090739, - "balance_loss_clip": 1.07363296, - "balance_loss_mlp": 1.05476189, - "epoch": 0.02928002404930107, - "flos": 25885704424320.0, - "grad_norm": 12.432525192672303, - "language_loss": 0.88968349, - "learning_rate": 3.984327367746315e-06, - "loss": 0.91335887, - "num_input_tokens_seen": 10364610, - "step": 487, - "time_per_iteration": 2.637910842895508 - }, - { - "auxiliary_loss_clip": 0.01283084, - "auxiliary_loss_mlp": 0.01069223, - "balance_loss_clip": 1.07921362, - "balance_loss_mlp": 1.03677416, - "epoch": 0.029340147301969037, - "flos": 20660234590080.0, - "grad_norm": 2.566388301054309, - "language_loss": 0.88581878, - "learning_rate": 3.985648090637122e-06, - "loss": 0.90934181, - "num_input_tokens_seen": 10380910, - "step": 488, - "time_per_iteration": 2.6569244861602783 - }, - { - "auxiliary_loss_clip": 0.01275613, - "auxiliary_loss_mlp": 0.01081415, - "balance_loss_clip": 1.07419777, - "balance_loss_mlp": 1.04667735, - "epoch": 0.029400270554637006, - "flos": 24428018689920.0, - "grad_norm": 2.0135021623582503, - "language_loss": 0.88869834, - "learning_rate": 3.986966109896785e-06, - "loss": 0.91226858, - "num_input_tokens_seen": 10400665, - "step": 489, - "time_per_iteration": 2.805555582046509 - }, - { - "auxiliary_loss_clip": 0.01271096, - "auxiliary_loss_mlp": 0.01077182, - "balance_loss_clip": 1.0704807, - "balance_loss_mlp": 1.04168141, - "epoch": 0.029460393807304974, - "flos": 20120892900480.0, - "grad_norm": 2.807428314395572, - "language_loss": 0.88554472, - "learning_rate": 3.988281436571815e-06, - "loss": 0.90902752, - "num_input_tokens_seen": 10420150, - "step": 490, - "time_per_iteration": 2.612993001937866 - }, - { - "auxiliary_loss_clip": 0.01276687, - "auxiliary_loss_mlp": 0.01088031, - "balance_loss_clip": 1.0729506, - "balance_loss_mlp": 1.0536747, - "epoch": 0.029520517059972943, - "flos": 17675914965120.0, - "grad_norm": 2.430337539839543, - "language_loss": 0.91496718, - "learning_rate": 3.989594081641164e-06, - "loss": 0.93861437, - "num_input_tokens_seen": 10438210, - "step": 491, - "time_per_iteration": 2.6203627586364746 - }, - { - "auxiliary_loss_clip": 0.01266864, - "auxiliary_loss_mlp": 0.01072939, - "balance_loss_clip": 1.07131863, - "balance_loss_mlp": 1.03984618, - "epoch": 0.029580640312640915, - "flos": 18953185662720.0, - "grad_norm": 1.9753258841331502, - "language_loss": 0.85654163, - "learning_rate": 3.9909040560167675e-06, - "loss": 0.87993968, - "num_input_tokens_seen": 10455125, - "step": 492, - "time_per_iteration": 2.636378288269043 - }, - { - "auxiliary_loss_clip": 0.01279009, - "auxiliary_loss_mlp": 0.01100381, - "balance_loss_clip": 1.07765996, - "balance_loss_mlp": 1.06471384, - "epoch": 0.029640763565308884, - "flos": 18726121837440.0, - "grad_norm": 4.076790847855052, - "language_loss": 0.84615922, - "learning_rate": 3.992211370544093e-06, - "loss": 0.86995316, - "num_input_tokens_seen": 10470990, - "step": 493, - "time_per_iteration": 2.6144914627075195 - }, - { - "auxiliary_loss_clip": 0.01272514, - "auxiliary_loss_mlp": 0.01074657, - "balance_loss_clip": 1.07140934, - "balance_loss_mlp": 1.04042029, - "epoch": 0.029700886817976852, - "flos": 20595308757120.0, - "grad_norm": 1.8084917907335818, - "language_loss": 0.8658669, - "learning_rate": 3.99351603600268e-06, - "loss": 0.88933873, - "num_input_tokens_seen": 10490685, - "step": 494, - "time_per_iteration": 2.7063095569610596 - }, - { - "auxiliary_loss_clip": 0.01281688, - "auxiliary_loss_mlp": 0.01084428, - "balance_loss_clip": 1.07739305, - "balance_loss_mlp": 1.05279028, - "epoch": 0.02976101007064482, - "flos": 22236857233920.0, - "grad_norm": 7.125038043922513, - "language_loss": 0.86841047, - "learning_rate": 3.994818063106668e-06, - "loss": 0.8920716, - "num_input_tokens_seen": 10509435, - "step": 495, - "time_per_iteration": 2.641700267791748 - }, - { - "auxiliary_loss_clip": 0.01268945, - "auxiliary_loss_mlp": 0.01078198, - "balance_loss_clip": 1.07384837, - "balance_loss_mlp": 1.04508162, - "epoch": 0.029821133323312793, - "flos": 23732644320000.0, - "grad_norm": 2.201071528053665, - "language_loss": 0.61988759, - "learning_rate": 3.99611746250533e-06, - "loss": 0.64335901, - "num_input_tokens_seen": 10530050, - "step": 496, - "time_per_iteration": 2.6524407863616943 - }, - { - "auxiliary_loss_clip": 0.01270994, - "auxiliary_loss_mlp": 0.01089922, - "balance_loss_clip": 1.07575428, - "balance_loss_mlp": 1.05680561, - "epoch": 0.02988125657598076, - "flos": 22419498913920.0, - "grad_norm": 1.7538974268426115, - "language_loss": 0.88820887, - "learning_rate": 3.997414244783595e-06, - "loss": 0.91181797, - "num_input_tokens_seen": 10551370, - "step": 497, - "time_per_iteration": 5.648245811462402 - }, - { - "auxiliary_loss_clip": 0.01277289, - "auxiliary_loss_mlp": 0.01079642, - "balance_loss_clip": 1.07670021, - "balance_loss_mlp": 1.04604888, - "epoch": 0.02994137982864873, - "flos": 13845108453120.0, - "grad_norm": 2.8395997319333204, - "language_loss": 0.85091698, - "learning_rate": 3.998708420462557e-06, - "loss": 0.87448633, - "num_input_tokens_seen": 10569225, - "step": 498, - "time_per_iteration": 4.362173080444336 - }, - { - "auxiliary_loss_clip": 0.0127249, - "auxiliary_loss_mlp": 0.01078673, - "balance_loss_clip": 1.07436109, - "balance_loss_mlp": 1.04691589, - "epoch": 0.0300015030813167, - "flos": 23908354675200.0, - "grad_norm": 3.2275044857926605, - "language_loss": 0.77883017, - "learning_rate": 4e-06, - "loss": 0.80234182, - "num_input_tokens_seen": 10586170, - "step": 499, - "time_per_iteration": 2.6029655933380127 - }, - { - "auxiliary_loss_clip": 0.01272525, - "auxiliary_loss_mlp": 0.01082339, - "balance_loss_clip": 1.07433248, - "balance_loss_mlp": 1.04905546, - "epoch": 0.030061626333984667, - "flos": 22016796560640.0, - "grad_norm": 2.244229511477372, - "language_loss": 0.82687509, - "learning_rate": 3.9999999620799e-06, - "loss": 0.85042375, - "num_input_tokens_seen": 10606205, - "step": 500, - "time_per_iteration": 2.6293113231658936 - }, - { - "auxiliary_loss_clip": 0.01266453, - "auxiliary_loss_mlp": 0.0108458, - "balance_loss_clip": 1.07100737, - "balance_loss_mlp": 1.04922247, - "epoch": 0.03012174958665264, - "flos": 23039747988480.0, - "grad_norm": 3.2569274145363356, - "language_loss": 0.88086087, - "learning_rate": 3.9999998483196e-06, - "loss": 0.90437114, - "num_input_tokens_seen": 10625995, - "step": 501, - "time_per_iteration": 2.601081132888794 - }, - { - "auxiliary_loss_clip": 0.01273997, - "auxiliary_loss_mlp": 0.01071746, - "balance_loss_clip": 1.07361674, - "balance_loss_mlp": 1.04025102, - "epoch": 0.030181872839320608, - "flos": 18953257489920.0, - "grad_norm": 3.3627001763511855, - "language_loss": 0.86654103, - "learning_rate": 3.9999996587191065e-06, - "loss": 0.88999844, - "num_input_tokens_seen": 10644105, - "step": 502, - "time_per_iteration": 2.5507659912109375 - }, - { - "auxiliary_loss_clip": 0.01270542, - "auxiliary_loss_mlp": 0.01081534, - "balance_loss_clip": 1.07475543, - "balance_loss_mlp": 1.04827452, - "epoch": 0.030241996091988577, - "flos": 16728017005440.0, - "grad_norm": 2.4572357458963876, - "language_loss": 0.84281206, - "learning_rate": 3.999999393278425e-06, - "loss": 0.86633277, - "num_input_tokens_seen": 10661090, - "step": 503, - "time_per_iteration": 2.618587017059326 - }, - { - "auxiliary_loss_clip": 0.01262547, - "auxiliary_loss_mlp": 0.01091143, - "balance_loss_clip": 1.0710721, - "balance_loss_mlp": 1.05781209, - "epoch": 0.030302119344656545, - "flos": 28621271387520.0, - "grad_norm": 1.6994359255159197, - "language_loss": 0.88137805, - "learning_rate": 3.999999051997567e-06, - "loss": 0.90491492, - "num_input_tokens_seen": 10682380, - "step": 504, - "time_per_iteration": 2.6794183254241943 - }, - { - "auxiliary_loss_clip": 0.01264601, - "auxiliary_loss_mlp": 0.01086749, - "balance_loss_clip": 1.07040262, - "balance_loss_mlp": 1.0541091, - "epoch": 0.030362242597324514, - "flos": 15669334523520.0, - "grad_norm": 2.074855698516145, - "language_loss": 0.786093, - "learning_rate": 3.9999986348765425e-06, - "loss": 0.80960649, - "num_input_tokens_seen": 10699925, - "step": 505, - "time_per_iteration": 2.564960479736328 - }, - { - "auxiliary_loss_clip": 0.01134686, - "auxiliary_loss_mlp": 0.010147, - "balance_loss_clip": 1.03763247, - "balance_loss_mlp": 1.00692737, - "epoch": 0.030422365849992486, - "flos": 72125973676800.0, - "grad_norm": 0.9565689962416369, - "language_loss": 0.54981297, - "learning_rate": 3.999998141915371e-06, - "loss": 0.57130682, - "num_input_tokens_seen": 10766525, - "step": 506, - "time_per_iteration": 3.3345654010772705 - }, - { - "auxiliary_loss_clip": 0.01266577, - "auxiliary_loss_mlp": 0.01090299, - "balance_loss_clip": 1.07119894, - "balance_loss_mlp": 1.05687308, - "epoch": 0.030482489102660455, - "flos": 19427817000960.0, - "grad_norm": 2.2738865373146684, - "language_loss": 0.83377159, - "learning_rate": 3.999997573114069e-06, - "loss": 0.8573404, - "num_input_tokens_seen": 10786725, - "step": 507, - "time_per_iteration": 2.645613670349121 - }, - { - "auxiliary_loss_clip": 0.01269938, - "auxiliary_loss_mlp": 0.01076205, - "balance_loss_clip": 1.07151937, - "balance_loss_mlp": 1.04344678, - "epoch": 0.030542612355328423, - "flos": 20375822701440.0, - "grad_norm": 2.375369924968869, - "language_loss": 0.88842839, - "learning_rate": 3.999996928472659e-06, - "loss": 0.91188985, - "num_input_tokens_seen": 10805390, - "step": 508, - "time_per_iteration": 2.617283344268799 - }, - { - "auxiliary_loss_clip": 0.01272148, - "auxiliary_loss_mlp": 0.01067206, - "balance_loss_clip": 1.07232118, - "balance_loss_mlp": 1.03394616, - "epoch": 0.030602735607996392, - "flos": 34677354297600.0, - "grad_norm": 6.964954749829821, - "language_loss": 0.71807706, - "learning_rate": 3.999996207991165e-06, - "loss": 0.74147063, - "num_input_tokens_seen": 10828030, - "step": 509, - "time_per_iteration": 2.7723498344421387 - }, - { - "auxiliary_loss_clip": 0.01264594, - "auxiliary_loss_mlp": 0.01074377, - "balance_loss_clip": 1.07241154, - "balance_loss_mlp": 1.04333544, - "epoch": 0.03066285886066436, - "flos": 23658668259840.0, - "grad_norm": 1.9285974370038053, - "language_loss": 0.82031929, - "learning_rate": 3.999995411669614e-06, - "loss": 0.84370899, - "num_input_tokens_seen": 10845240, - "step": 510, - "time_per_iteration": 2.6254217624664307 - }, - { - "auxiliary_loss_clip": 0.01268793, - "auxiliary_loss_mlp": 0.01075379, - "balance_loss_clip": 1.07532823, - "balance_loss_mlp": 1.04252458, - "epoch": 0.030722982113332332, - "flos": 23002975440000.0, - "grad_norm": 5.706057095430757, - "language_loss": 0.83572316, - "learning_rate": 3.999994539508036e-06, - "loss": 0.85916495, - "num_input_tokens_seen": 10864325, - "step": 511, - "time_per_iteration": 2.613457441329956 - }, - { - "auxiliary_loss_clip": 0.01269742, - "auxiliary_loss_mlp": 0.01081314, - "balance_loss_clip": 1.07207167, - "balance_loss_mlp": 1.0496521, - "epoch": 0.0307831053660003, - "flos": 24750855152640.0, - "grad_norm": 2.025270681093948, - "language_loss": 0.82109964, - "learning_rate": 3.9999935915064655e-06, - "loss": 0.84461015, - "num_input_tokens_seen": 10883860, - "step": 512, - "time_per_iteration": 2.630404233932495 - }, - { - "auxiliary_loss_clip": 0.01266054, - "auxiliary_loss_mlp": 0.01084436, - "balance_loss_clip": 1.07086158, - "balance_loss_mlp": 1.05070007, - "epoch": 0.03084322861866827, - "flos": 26140885620480.0, - "grad_norm": 2.500363981205655, - "language_loss": 0.86933553, - "learning_rate": 3.9999925676649374e-06, - "loss": 0.89284045, - "num_input_tokens_seen": 10904555, - "step": 513, - "time_per_iteration": 2.671926259994507 - }, - { - "auxiliary_loss_clip": 0.01272542, - "auxiliary_loss_mlp": 0.01080065, - "balance_loss_clip": 1.07461214, - "balance_loss_mlp": 1.04744935, - "epoch": 0.03090335187133624, - "flos": 18771298168320.0, - "grad_norm": 1.704575426690477, - "language_loss": 0.79124331, - "learning_rate": 3.999991467983491e-06, - "loss": 0.81476939, - "num_input_tokens_seen": 10923700, - "step": 514, - "time_per_iteration": 2.6158573627471924 - }, - { - "auxiliary_loss_clip": 0.01265821, - "auxiliary_loss_mlp": 0.01067844, - "balance_loss_clip": 1.07397485, - "balance_loss_mlp": 1.03711247, - "epoch": 0.030963475124004207, - "flos": 23221886878080.0, - "grad_norm": 2.729063628201222, - "language_loss": 0.77758944, - "learning_rate": 3.999990292462167e-06, - "loss": 0.80092615, - "num_input_tokens_seen": 10942730, - "step": 515, - "time_per_iteration": 2.636294364929199 - }, - { - "auxiliary_loss_clip": 0.0126398, - "auxiliary_loss_mlp": 0.01072575, - "balance_loss_clip": 1.06835747, - "balance_loss_mlp": 1.03874326, - "epoch": 0.03102359837667218, - "flos": 42525595411200.0, - "grad_norm": 2.1228851207681503, - "language_loss": 0.82452714, - "learning_rate": 3.999989041101011e-06, - "loss": 0.84789264, - "num_input_tokens_seen": 10967120, - "step": 516, - "time_per_iteration": 2.8078057765960693 - }, - { - "auxiliary_loss_clip": 0.01263726, - "auxiliary_loss_mlp": 0.01073859, - "balance_loss_clip": 1.0712111, - "balance_loss_mlp": 1.04090929, - "epoch": 0.031083721629340148, - "flos": 21176953689600.0, - "grad_norm": 1.9016724574566626, - "language_loss": 0.79088318, - "learning_rate": 3.999987713900071e-06, - "loss": 0.81425899, - "num_input_tokens_seen": 10986775, - "step": 517, - "time_per_iteration": 2.5935981273651123 - }, - { - "auxiliary_loss_clip": 0.0125895, - "auxiliary_loss_mlp": 0.0107836, - "balance_loss_clip": 1.07049131, - "balance_loss_mlp": 1.04629326, - "epoch": 0.031143844882008116, - "flos": 29716187713920.0, - "grad_norm": 1.6829619528007147, - "language_loss": 0.90798068, - "learning_rate": 3.999986310859396e-06, - "loss": 0.93135381, - "num_input_tokens_seen": 11011360, - "step": 518, - "time_per_iteration": 2.6855509281158447 - }, - { - "auxiliary_loss_clip": 0.01272237, - "auxiliary_loss_mlp": 0.01097567, - "balance_loss_clip": 1.07848859, - "balance_loss_mlp": 1.06230497, - "epoch": 0.031203968134676085, - "flos": 23112467072640.0, - "grad_norm": 1.8835331125391583, - "language_loss": 0.86759162, - "learning_rate": 3.999984831979039e-06, - "loss": 0.89128959, - "num_input_tokens_seen": 11030150, - "step": 519, - "time_per_iteration": 2.628380060195923 - }, - { - "auxiliary_loss_clip": 0.01265864, - "auxiliary_loss_mlp": 0.01086943, - "balance_loss_clip": 1.06901193, - "balance_loss_mlp": 1.05578136, - "epoch": 0.03126409138734405, - "flos": 20954379064320.0, - "grad_norm": 3.8823628482318164, - "language_loss": 0.87246573, - "learning_rate": 3.999983277259057e-06, - "loss": 0.89599377, - "num_input_tokens_seen": 11049145, - "step": 520, - "time_per_iteration": 2.5850255489349365 - }, - { - "auxiliary_loss_clip": 0.01269157, - "auxiliary_loss_mlp": 0.01086266, - "balance_loss_clip": 1.07231963, - "balance_loss_mlp": 1.0528394, - "epoch": 0.031324214640012026, - "flos": 21650112570240.0, - "grad_norm": 1.7050130216714323, - "language_loss": 0.89274424, - "learning_rate": 3.999981646699509e-06, - "loss": 0.91629851, - "num_input_tokens_seen": 11068835, - "step": 521, - "time_per_iteration": 2.6412506103515625 - }, - { - "auxiliary_loss_clip": 0.01263772, - "auxiliary_loss_mlp": 0.01082584, - "balance_loss_clip": 1.0717473, - "balance_loss_mlp": 1.04827595, - "epoch": 0.03138433789267999, - "flos": 23441337020160.0, - "grad_norm": 2.085624200373119, - "language_loss": 0.71452564, - "learning_rate": 3.999979940300456e-06, - "loss": 0.73798925, - "num_input_tokens_seen": 11088980, - "step": 522, - "time_per_iteration": 2.6561174392700195 - }, - { - "auxiliary_loss_clip": 0.01265725, - "auxiliary_loss_mlp": 0.01082552, - "balance_loss_clip": 1.06871116, - "balance_loss_mlp": 1.05079484, - "epoch": 0.03144446114534796, - "flos": 18982164960000.0, - "grad_norm": 4.223323698032832, - "language_loss": 0.84758592, - "learning_rate": 3.999978158061963e-06, - "loss": 0.87106872, - "num_input_tokens_seen": 11104300, - "step": 523, - "time_per_iteration": 2.608565330505371 - }, - { - "auxiliary_loss_clip": 0.01271589, - "auxiliary_loss_mlp": 0.01076253, - "balance_loss_clip": 1.07193565, - "balance_loss_mlp": 1.04296994, - "epoch": 0.031504584398015935, - "flos": 22637692080000.0, - "grad_norm": 2.324094801199308, - "language_loss": 0.89989722, - "learning_rate": 3.999976299984099e-06, - "loss": 0.92337573, - "num_input_tokens_seen": 11123335, - "step": 524, - "time_per_iteration": 2.68269944190979 - }, - { - "auxiliary_loss_clip": 0.01273471, - "auxiliary_loss_mlp": 0.0108318, - "balance_loss_clip": 1.07427168, - "balance_loss_mlp": 1.04944324, - "epoch": 0.0315647076506839, - "flos": 25297056339840.0, - "grad_norm": 2.4635323942475766, - "language_loss": 0.80114233, - "learning_rate": 3.999974366066933e-06, - "loss": 0.82470882, - "num_input_tokens_seen": 11140880, - "step": 525, - "time_per_iteration": 2.6396324634552 - }, - { - "auxiliary_loss_clip": 0.01264716, - "auxiliary_loss_mlp": 0.01080959, - "balance_loss_clip": 1.0681529, - "balance_loss_mlp": 1.04798603, - "epoch": 0.03162483090335187, - "flos": 16982839065600.0, - "grad_norm": 2.3553733144031948, - "language_loss": 0.81162, - "learning_rate": 3.999972356310538e-06, - "loss": 0.83507675, - "num_input_tokens_seen": 11158710, - "step": 526, - "time_per_iteration": 2.6167168617248535 - }, - { - "auxiliary_loss_clip": 0.01273987, - "auxiliary_loss_mlp": 0.01072725, - "balance_loss_clip": 1.07507181, - "balance_loss_mlp": 1.03736734, - "epoch": 0.03168495415601984, - "flos": 18734489706240.0, - "grad_norm": 1.9666844995001491, - "language_loss": 0.81491739, - "learning_rate": 3.999970270714991e-06, - "loss": 0.83838451, - "num_input_tokens_seen": 11177550, - "step": 527, - "time_per_iteration": 2.580310821533203 - }, - { - "auxiliary_loss_clip": 0.01261155, - "auxiliary_loss_mlp": 0.01080842, - "balance_loss_clip": 1.06786597, - "balance_loss_mlp": 1.04717755, - "epoch": 0.03174507740868781, - "flos": 21214875473280.0, - "grad_norm": 1.9105688869262756, - "language_loss": 0.93801636, - "learning_rate": 3.999968109280371e-06, - "loss": 0.96143627, - "num_input_tokens_seen": 11196230, - "step": 528, - "time_per_iteration": 2.5901002883911133 - }, - { - "auxiliary_loss_clip": 0.01263275, - "auxiliary_loss_mlp": 0.01071724, - "balance_loss_clip": 1.06776333, - "balance_loss_mlp": 1.0387274, - "epoch": 0.03180520066135578, - "flos": 24787663614720.0, - "grad_norm": 1.8924176613796981, - "language_loss": 0.84130204, - "learning_rate": 3.99996587200676e-06, - "loss": 0.86465204, - "num_input_tokens_seen": 11214935, - "step": 529, - "time_per_iteration": 2.593867063522339 - }, - { - "auxiliary_loss_clip": 0.01266309, - "auxiliary_loss_mlp": 0.01088988, - "balance_loss_clip": 1.07501197, - "balance_loss_mlp": 1.0563724, - "epoch": 0.03186532391402375, - "flos": 24864261367680.0, - "grad_norm": 2.316883777672742, - "language_loss": 0.90458709, - "learning_rate": 3.999963558894243e-06, - "loss": 0.92814004, - "num_input_tokens_seen": 11235310, - "step": 530, - "time_per_iteration": 2.5994982719421387 - }, - { - "auxiliary_loss_clip": 0.01261024, - "auxiliary_loss_mlp": 0.0107627, - "balance_loss_clip": 1.06481552, - "balance_loss_mlp": 1.04188991, - "epoch": 0.03192544716669172, - "flos": 21215055041280.0, - "grad_norm": 2.2744046769674324, - "language_loss": 0.76334512, - "learning_rate": 3.999961169942907e-06, - "loss": 0.78671807, - "num_input_tokens_seen": 11254425, - "step": 531, - "time_per_iteration": 2.618149757385254 - }, - { - "auxiliary_loss_clip": 0.01260981, - "auxiliary_loss_mlp": 0.01064937, - "balance_loss_clip": 1.0669558, - "balance_loss_mlp": 1.03143883, - "epoch": 0.03198557041935969, - "flos": 24353216616960.0, - "grad_norm": 2.467757262816931, - "language_loss": 0.90483695, - "learning_rate": 3.999958705152843e-06, - "loss": 0.92809618, - "num_input_tokens_seen": 11274595, - "step": 532, - "time_per_iteration": 2.647947072982788 - }, - { - "auxiliary_loss_clip": 0.01146464, - "auxiliary_loss_mlp": 0.01012028, - "balance_loss_clip": 1.04988623, - "balance_loss_mlp": 1.00325394, - "epoch": 0.032045693672027656, - "flos": 61827367587840.0, - "grad_norm": 1.9655071928838626, - "language_loss": 0.57953775, - "learning_rate": 3.9999561645241445e-06, - "loss": 0.60112268, - "num_input_tokens_seen": 11336705, - "step": 533, - "time_per_iteration": 3.2502808570861816 - }, - { - "auxiliary_loss_clip": 0.01260941, - "auxiliary_loss_mlp": 0.01084263, - "balance_loss_clip": 1.06724441, - "balance_loss_mlp": 1.0516715, - "epoch": 0.03210581692469563, - "flos": 28401174800640.0, - "grad_norm": 1.7138682169725878, - "language_loss": 0.86666048, - "learning_rate": 3.999953548056907e-06, - "loss": 0.89011252, - "num_input_tokens_seen": 11356820, - "step": 534, - "time_per_iteration": 2.678739070892334 - }, - { - "auxiliary_loss_clip": 0.01259554, - "auxiliary_loss_mlp": 0.01066669, - "balance_loss_clip": 1.06782031, - "balance_loss_mlp": 1.03407741, - "epoch": 0.03216594017736359, - "flos": 24717709877760.0, - "grad_norm": 2.12774196295415, - "language_loss": 0.77627808, - "learning_rate": 3.999950855751232e-06, - "loss": 0.79954034, - "num_input_tokens_seen": 11376645, - "step": 535, - "time_per_iteration": 2.7128217220306396 - }, - { - "auxiliary_loss_clip": 0.01261708, - "auxiliary_loss_mlp": 0.01081378, - "balance_loss_clip": 1.06843078, - "balance_loss_mlp": 1.0485003, - "epoch": 0.032226063430031565, - "flos": 31175453646720.0, - "grad_norm": 3.9913279940153585, - "language_loss": 0.80939913, - "learning_rate": 3.999948087607219e-06, - "loss": 0.83283001, - "num_input_tokens_seen": 11397310, - "step": 536, - "time_per_iteration": 2.7490127086639404 - }, - { - "auxiliary_loss_clip": 0.01262237, - "auxiliary_loss_mlp": 0.01075987, - "balance_loss_clip": 1.06839073, - "balance_loss_mlp": 1.04167831, - "epoch": 0.03228618668269954, - "flos": 32198225506560.0, - "grad_norm": 1.6888601787189168, - "language_loss": 0.7009111, - "learning_rate": 3.999945243624975e-06, - "loss": 0.72429335, - "num_input_tokens_seen": 11418475, - "step": 537, - "time_per_iteration": 5.5609166622161865 - }, - { - "auxiliary_loss_clip": 0.0126357, - "auxiliary_loss_mlp": 0.01084205, - "balance_loss_clip": 1.07331729, - "balance_loss_mlp": 1.05161297, - "epoch": 0.0323463099353675, - "flos": 22670154996480.0, - "grad_norm": 2.146306428033486, - "language_loss": 0.82684958, - "learning_rate": 3.999942323804607e-06, - "loss": 0.85032725, - "num_input_tokens_seen": 11436630, - "step": 538, - "time_per_iteration": 2.5465030670166016 - }, - { - "auxiliary_loss_clip": 0.01269537, - "auxiliary_loss_mlp": 0.01078099, - "balance_loss_clip": 1.06987572, - "balance_loss_mlp": 1.04536414, - "epoch": 0.032406433188035474, - "flos": 26905172232960.0, - "grad_norm": 1.8709064214989917, - "language_loss": 0.79146457, - "learning_rate": 3.999939328146225e-06, - "loss": 0.81494099, - "num_input_tokens_seen": 11457275, - "step": 539, - "time_per_iteration": 4.172123432159424 - }, - { - "auxiliary_loss_clip": 0.0126143, - "auxiliary_loss_mlp": 0.01069528, - "balance_loss_clip": 1.06830835, - "balance_loss_mlp": 1.03567231, - "epoch": 0.03246655644070344, - "flos": 31503928544640.0, - "grad_norm": 35.59051030008172, - "language_loss": 0.77379727, - "learning_rate": 3.999936256649943e-06, - "loss": 0.79710686, - "num_input_tokens_seen": 11476925, - "step": 540, - "time_per_iteration": 2.5633046627044678 - }, - { - "auxiliary_loss_clip": 0.01269863, - "auxiliary_loss_mlp": 0.01073669, - "balance_loss_clip": 1.07271969, - "balance_loss_mlp": 1.04124355, - "epoch": 0.03252667969337141, - "flos": 23218331431680.0, - "grad_norm": 2.0489065110302636, - "language_loss": 0.85458571, - "learning_rate": 3.999933109315878e-06, - "loss": 0.878021, - "num_input_tokens_seen": 11496830, - "step": 541, - "time_per_iteration": 2.6079938411712646 - }, - { - "auxiliary_loss_clip": 0.01258504, - "auxiliary_loss_mlp": 0.01082451, - "balance_loss_clip": 1.06961954, - "balance_loss_mlp": 1.04835749, - "epoch": 0.032586802946039384, - "flos": 14757454926720.0, - "grad_norm": 2.674731240129174, - "language_loss": 0.89234567, - "learning_rate": 3.9999298861441496e-06, - "loss": 0.91575521, - "num_input_tokens_seen": 11515605, - "step": 542, - "time_per_iteration": 2.597036600112915 - }, - { - "auxiliary_loss_clip": 0.0126351, - "auxiliary_loss_mlp": 0.01081041, - "balance_loss_clip": 1.06974792, - "balance_loss_mlp": 1.04792452, - "epoch": 0.03264692619870735, - "flos": 24280677100800.0, - "grad_norm": 2.2714121360014334, - "language_loss": 0.71123677, - "learning_rate": 3.999926587134879e-06, - "loss": 0.73468232, - "num_input_tokens_seen": 11536230, - "step": 543, - "time_per_iteration": 2.634601354598999 - }, - { - "auxiliary_loss_clip": 0.01259994, - "auxiliary_loss_mlp": 0.01088763, - "balance_loss_clip": 1.06379187, - "balance_loss_mlp": 1.05545604, - "epoch": 0.03270704945137532, - "flos": 22893160584960.0, - "grad_norm": 4.777521083182084, - "language_loss": 0.91540575, - "learning_rate": 3.999923212288192e-06, - "loss": 0.93889332, - "num_input_tokens_seen": 11554715, - "step": 544, - "time_per_iteration": 2.6173009872436523 - }, - { - "auxiliary_loss_clip": 0.01264485, - "auxiliary_loss_mlp": 0.01085684, - "balance_loss_clip": 1.06989884, - "balance_loss_mlp": 1.05571437, - "epoch": 0.032767172704043286, - "flos": 18041018757120.0, - "grad_norm": 2.6951315012120025, - "language_loss": 0.65799558, - "learning_rate": 3.999919761604216e-06, - "loss": 0.68149722, - "num_input_tokens_seen": 11571370, - "step": 545, - "time_per_iteration": 2.6500988006591797 - }, - { - "auxiliary_loss_clip": 0.012623, - "auxiliary_loss_mlp": 0.0107161, - "balance_loss_clip": 1.06693912, - "balance_loss_mlp": 1.0393517, - "epoch": 0.03282729595671126, - "flos": 22528739151360.0, - "grad_norm": 2.2564766449723908, - "language_loss": 0.92221987, - "learning_rate": 3.999916235083083e-06, - "loss": 0.94555902, - "num_input_tokens_seen": 11588560, - "step": 546, - "time_per_iteration": 2.673250913619995 - }, - { - "auxiliary_loss_clip": 0.01260258, - "auxiliary_loss_mlp": 0.01077296, - "balance_loss_clip": 1.06488204, - "balance_loss_mlp": 1.04313052, - "epoch": 0.03288741920937923, - "flos": 20410620001920.0, - "grad_norm": 2.1923718908590653, - "language_loss": 0.81706661, - "learning_rate": 3.999912632724925e-06, - "loss": 0.84044212, - "num_input_tokens_seen": 11605685, - "step": 547, - "time_per_iteration": 2.725198745727539 - }, - { - "auxiliary_loss_clip": 0.0126227, - "auxiliary_loss_mlp": 0.0107871, - "balance_loss_clip": 1.06794477, - "balance_loss_mlp": 1.04480648, - "epoch": 0.032947542462047195, - "flos": 20777986350720.0, - "grad_norm": 1.730652582963277, - "language_loss": 0.81227565, - "learning_rate": 3.999908954529881e-06, - "loss": 0.83568549, - "num_input_tokens_seen": 11626290, - "step": 548, - "time_per_iteration": 2.714073419570923 - }, - { - "auxiliary_loss_clip": 0.01264818, - "auxiliary_loss_mlp": 0.01084154, - "balance_loss_clip": 1.06963027, - "balance_loss_mlp": 1.04870164, - "epoch": 0.03300766571471517, - "flos": 19901263190400.0, - "grad_norm": 3.8540092911047603, - "language_loss": 0.67460287, - "learning_rate": 3.999905200498087e-06, - "loss": 0.69809258, - "num_input_tokens_seen": 11643950, - "step": 549, - "time_per_iteration": 2.6747171878814697 - }, - { - "auxiliary_loss_clip": 0.0125805, - "auxiliary_loss_mlp": 0.01076001, - "balance_loss_clip": 1.06968856, - "balance_loss_mlp": 1.04236054, - "epoch": 0.03306778896738313, - "flos": 17967760968960.0, - "grad_norm": 1.933615596136007, - "language_loss": 0.86379111, - "learning_rate": 3.999901370629689e-06, - "loss": 0.88713157, - "num_input_tokens_seen": 11662560, - "step": 550, - "time_per_iteration": 2.553386926651001 - }, - { - "auxiliary_loss_clip": 0.01264951, - "auxiliary_loss_mlp": 0.01095377, - "balance_loss_clip": 1.07279766, - "balance_loss_mlp": 1.06142652, - "epoch": 0.033127912220051105, - "flos": 21653380707840.0, - "grad_norm": 3.1958143211070977, - "language_loss": 0.8127178, - "learning_rate": 3.99989746492483e-06, - "loss": 0.83632112, - "num_input_tokens_seen": 11682265, - "step": 551, - "time_per_iteration": 2.6231682300567627 - }, - { - "auxiliary_loss_clip": 0.01271579, - "auxiliary_loss_mlp": 0.0108998, - "balance_loss_clip": 1.07285261, - "balance_loss_mlp": 1.05626702, - "epoch": 0.03318803547271908, - "flos": 30188376927360.0, - "grad_norm": 2.9473143774727606, - "language_loss": 0.86134821, - "learning_rate": 3.999893483383658e-06, - "loss": 0.88496381, - "num_input_tokens_seen": 11699300, - "step": 552, - "time_per_iteration": 2.7002694606781006 - }, - { - "auxiliary_loss_clip": 0.01267081, - "auxiliary_loss_mlp": 0.01081671, - "balance_loss_clip": 1.07191086, - "balance_loss_mlp": 1.04650474, - "epoch": 0.03324815872538704, - "flos": 20376038183040.0, - "grad_norm": 2.990469903058063, - "language_loss": 0.9301765, - "learning_rate": 3.999889426006326e-06, - "loss": 0.95366406, - "num_input_tokens_seen": 11716955, - "step": 553, - "time_per_iteration": 2.6629648208618164 - }, - { - "auxiliary_loss_clip": 0.01262345, - "auxiliary_loss_mlp": 0.01077186, - "balance_loss_clip": 1.06925786, - "balance_loss_mlp": 1.04149485, - "epoch": 0.033308281978055014, - "flos": 24494560634880.0, - "grad_norm": 2.1924330874053166, - "language_loss": 0.78881586, - "learning_rate": 3.999885292792986e-06, - "loss": 0.8122111, - "num_input_tokens_seen": 11736130, - "step": 554, - "time_per_iteration": 2.668970823287964 - }, - { - "auxiliary_loss_clip": 0.01258048, - "auxiliary_loss_mlp": 0.0108557, - "balance_loss_clip": 1.06745815, - "balance_loss_mlp": 1.05045104, - "epoch": 0.03336840523072298, - "flos": 23400326666880.0, - "grad_norm": 2.2144550089326938, - "language_loss": 0.81971425, - "learning_rate": 3.999881083743795e-06, - "loss": 0.84315038, - "num_input_tokens_seen": 11754425, - "step": 555, - "time_per_iteration": 2.610807418823242 - }, - { - "auxiliary_loss_clip": 0.01264442, - "auxiliary_loss_mlp": 0.0108339, - "balance_loss_clip": 1.06914032, - "balance_loss_mlp": 1.04805672, - "epoch": 0.03342852848339095, - "flos": 30550571717760.0, - "grad_norm": 3.7821745066525487, - "language_loss": 0.88661897, - "learning_rate": 3.999876798858914e-06, - "loss": 0.9100973, - "num_input_tokens_seen": 11772845, - "step": 556, - "time_per_iteration": 2.6288907527923584 - }, - { - "auxiliary_loss_clip": 0.01262553, - "auxiliary_loss_mlp": 0.01084158, - "balance_loss_clip": 1.06896496, - "balance_loss_mlp": 1.04863358, - "epoch": 0.03348865173605892, - "flos": 22893304239360.0, - "grad_norm": 1.974910128087634, - "language_loss": 0.83708388, - "learning_rate": 3.999872438138503e-06, - "loss": 0.860551, - "num_input_tokens_seen": 11792850, - "step": 557, - "time_per_iteration": 2.649401903152466 - }, - { - "auxiliary_loss_clip": 0.01268198, - "auxiliary_loss_mlp": 0.01069057, - "balance_loss_clip": 1.07400489, - "balance_loss_mlp": 1.03684711, - "epoch": 0.03354877498872689, - "flos": 17676022705920.0, - "grad_norm": 3.176542206824637, - "language_loss": 0.94202292, - "learning_rate": 3.999868001582729e-06, - "loss": 0.96539545, - "num_input_tokens_seen": 11809670, - "step": 558, - "time_per_iteration": 2.550515651702881 - }, - { - "auxiliary_loss_clip": 0.01258948, - "auxiliary_loss_mlp": 0.01074291, - "balance_loss_clip": 1.06591845, - "balance_loss_mlp": 1.04036427, - "epoch": 0.03360889824139486, - "flos": 21652985658240.0, - "grad_norm": 2.6619487077732384, - "language_loss": 0.77115649, - "learning_rate": 3.99986348919176e-06, - "loss": 0.79448891, - "num_input_tokens_seen": 11829665, - "step": 559, - "time_per_iteration": 2.729597330093384 - }, - { - "auxiliary_loss_clip": 0.01261947, - "auxiliary_loss_mlp": 0.01080822, - "balance_loss_clip": 1.06835234, - "balance_loss_mlp": 1.04882574, - "epoch": 0.033669021494062826, - "flos": 21795730306560.0, - "grad_norm": 1.945022837871561, - "language_loss": 0.87472397, - "learning_rate": 3.9998589009657675e-06, - "loss": 0.89815164, - "num_input_tokens_seen": 11848190, - "step": 560, - "time_per_iteration": 2.6082279682159424 - }, - { - "auxiliary_loss_clip": 0.01257198, - "auxiliary_loss_mlp": 0.0107356, - "balance_loss_clip": 1.06704283, - "balance_loss_mlp": 1.04199314, - "epoch": 0.0337291447467308, - "flos": 21866222747520.0, - "grad_norm": 2.4061219554407502, - "language_loss": 0.81578708, - "learning_rate": 3.999854236904925e-06, - "loss": 0.83909464, - "num_input_tokens_seen": 11864795, - "step": 561, - "time_per_iteration": 2.602193832397461 - }, - { - "auxiliary_loss_clip": 0.01254722, - "auxiliary_loss_mlp": 0.01076361, - "balance_loss_clip": 1.06685936, - "balance_loss_mlp": 1.04422247, - "epoch": 0.03378926799939877, - "flos": 24245951627520.0, - "grad_norm": 1.683217504050761, - "language_loss": 0.82320511, - "learning_rate": 3.999849497009409e-06, - "loss": 0.84651601, - "num_input_tokens_seen": 11885275, - "step": 562, - "time_per_iteration": 2.675872564315796 - }, - { - "auxiliary_loss_clip": 0.01262146, - "auxiliary_loss_mlp": 0.01084212, - "balance_loss_clip": 1.06894755, - "balance_loss_mlp": 1.0508337, - "epoch": 0.033849391252066735, - "flos": 16507812677760.0, - "grad_norm": 2.262509698135982, - "language_loss": 0.84285647, - "learning_rate": 3.999844681279401e-06, - "loss": 0.86632001, - "num_input_tokens_seen": 11903595, - "step": 563, - "time_per_iteration": 2.586944103240967 - }, - { - "auxiliary_loss_clip": 0.01258135, - "auxiliary_loss_mlp": 0.01083866, - "balance_loss_clip": 1.0675565, - "balance_loss_mlp": 1.05094075, - "epoch": 0.03390951450473471, - "flos": 15669298609920.0, - "grad_norm": 2.115200912185494, - "language_loss": 0.94438875, - "learning_rate": 3.99983978971508e-06, - "loss": 0.96780878, - "num_input_tokens_seen": 11917815, - "step": 564, - "time_per_iteration": 2.5444440841674805 - }, - { - "auxiliary_loss_clip": 0.01259509, - "auxiliary_loss_mlp": 0.01073406, - "balance_loss_clip": 1.06518865, - "balance_loss_mlp": 1.03907406, - "epoch": 0.03396963775740267, - "flos": 22674787850880.0, - "grad_norm": 2.6560391741906924, - "language_loss": 0.94669235, - "learning_rate": 3.999834822316635e-06, - "loss": 0.97002149, - "num_input_tokens_seen": 11936305, - "step": 565, - "time_per_iteration": 2.5614171028137207 - }, - { - "auxiliary_loss_clip": 0.01150452, - "auxiliary_loss_mlp": 0.01081579, - "balance_loss_clip": 1.04835606, - "balance_loss_mlp": 1.07499874, - "epoch": 0.034029761010070644, - "flos": 64392683063040.0, - "grad_norm": 1.0610477485673708, - "language_loss": 0.54800498, - "learning_rate": 3.9998297790842535e-06, - "loss": 0.57032537, - "num_input_tokens_seen": 11998940, - "step": 566, - "time_per_iteration": 3.229137659072876 - }, - { - "auxiliary_loss_clip": 0.0126129, - "auxiliary_loss_mlp": 0.01073482, - "balance_loss_clip": 1.06798041, - "balance_loss_mlp": 1.03793335, - "epoch": 0.034089884262738616, - "flos": 25004204755200.0, - "grad_norm": 3.1955261820278564, - "language_loss": 0.76836932, - "learning_rate": 3.999824660018126e-06, - "loss": 0.79171705, - "num_input_tokens_seen": 12018860, - "step": 567, - "time_per_iteration": 2.632741928100586 - }, - { - "auxiliary_loss_clip": 0.01253596, - "auxiliary_loss_mlp": 0.01083559, - "balance_loss_clip": 1.06611466, - "balance_loss_mlp": 1.05153918, - "epoch": 0.03415000751540658, - "flos": 28439096584320.0, - "grad_norm": 2.115683621050472, - "language_loss": 0.80834144, - "learning_rate": 3.999819465118447e-06, - "loss": 0.83171296, - "num_input_tokens_seen": 12039675, - "step": 568, - "time_per_iteration": 2.7206337451934814 - }, - { - "auxiliary_loss_clip": 0.01254921, - "auxiliary_loss_mlp": 0.01082401, - "balance_loss_clip": 1.06888509, - "balance_loss_mlp": 1.04940367, - "epoch": 0.034210130768074554, - "flos": 21468727866240.0, - "grad_norm": 1.891360159585894, - "language_loss": 0.86560667, - "learning_rate": 3.999814194385413e-06, - "loss": 0.88897985, - "num_input_tokens_seen": 12057680, - "step": 569, - "time_per_iteration": 2.7271673679351807 - }, - { - "auxiliary_loss_clip": 0.01255135, - "auxiliary_loss_mlp": 0.01082251, - "balance_loss_clip": 1.06644094, - "balance_loss_mlp": 1.04922962, - "epoch": 0.03427025402074252, - "flos": 18697501676160.0, - "grad_norm": 1.6888504559193653, - "language_loss": 0.95945716, - "learning_rate": 3.9998088478192255e-06, - "loss": 0.982831, - "num_input_tokens_seen": 12076135, - "step": 570, - "time_per_iteration": 2.5918867588043213 - }, - { - "auxiliary_loss_clip": 0.01255487, - "auxiliary_loss_mlp": 0.0108066, - "balance_loss_clip": 1.06228065, - "balance_loss_mlp": 1.0435617, - "epoch": 0.03433037727341049, - "flos": 20849987162880.0, - "grad_norm": 2.39132447086081, - "language_loss": 0.7964232, - "learning_rate": 3.9998034254200846e-06, - "loss": 0.8197847, - "num_input_tokens_seen": 12094785, - "step": 571, - "time_per_iteration": 2.590184450149536 - }, - { - "auxiliary_loss_clip": 0.01256218, - "auxiliary_loss_mlp": 0.01091484, - "balance_loss_clip": 1.06740785, - "balance_loss_mlp": 1.0565083, - "epoch": 0.03439050052607846, - "flos": 25410282986880.0, - "grad_norm": 2.0738695690993, - "language_loss": 0.80214274, - "learning_rate": 3.999797927188199e-06, - "loss": 0.82561976, - "num_input_tokens_seen": 12114590, - "step": 572, - "time_per_iteration": 2.6862123012542725 - }, - { - "auxiliary_loss_clip": 0.01263024, - "auxiliary_loss_mlp": 0.01074173, - "balance_loss_clip": 1.06995344, - "balance_loss_mlp": 1.04098535, - "epoch": 0.03445062377874643, - "flos": 17640147997440.0, - "grad_norm": 2.2324763929909284, - "language_loss": 0.84548658, - "learning_rate": 3.999792353123774e-06, - "loss": 0.86885858, - "num_input_tokens_seen": 12132390, - "step": 573, - "time_per_iteration": 2.78487229347229 - }, - { - "auxiliary_loss_clip": 0.01256326, - "auxiliary_loss_mlp": 0.01068789, - "balance_loss_clip": 1.0644815, - "balance_loss_mlp": 1.03781831, - "epoch": 0.0345107470314144, - "flos": 16764502245120.0, - "grad_norm": 2.576428901855709, - "language_loss": 0.76602584, - "learning_rate": 3.999786703227023e-06, - "loss": 0.78927696, - "num_input_tokens_seen": 12149035, - "step": 574, - "time_per_iteration": 2.5697100162506104 - }, - { - "auxiliary_loss_clip": 0.01255191, - "auxiliary_loss_mlp": 0.0107671, - "balance_loss_clip": 1.06581593, - "balance_loss_mlp": 1.04502439, - "epoch": 0.03457087028408237, - "flos": 14684448533760.0, - "grad_norm": 2.156110110571344, - "language_loss": 0.83854586, - "learning_rate": 3.9997809774981606e-06, - "loss": 0.86186486, - "num_input_tokens_seen": 12167530, - "step": 575, - "time_per_iteration": 2.596418619155884 - }, - { - "auxiliary_loss_clip": 0.01249695, - "auxiliary_loss_mlp": 0.01076053, - "balance_loss_clip": 1.06684637, - "balance_loss_mlp": 1.04334211, - "epoch": 0.03463099353675034, - "flos": 20011293527040.0, - "grad_norm": 2.350120742735315, - "language_loss": 0.83990753, - "learning_rate": 3.9997751759374025e-06, - "loss": 0.86316502, - "num_input_tokens_seen": 12186340, - "step": 576, - "time_per_iteration": 5.821930646896362 - }, - { - "auxiliary_loss_clip": 0.01257114, - "auxiliary_loss_mlp": 0.01079503, - "balance_loss_clip": 1.07237518, - "balance_loss_mlp": 1.04817426, - "epoch": 0.03469111678941831, - "flos": 25301150490240.0, - "grad_norm": 2.138457686407641, - "language_loss": 0.85803086, - "learning_rate": 3.99976929854497e-06, - "loss": 0.88139701, - "num_input_tokens_seen": 12204090, - "step": 577, - "time_per_iteration": 4.225277423858643 - }, - { - "auxiliary_loss_clip": 0.01253845, - "auxiliary_loss_mlp": 0.01080214, - "balance_loss_clip": 1.06869018, - "balance_loss_mlp": 1.04712176, - "epoch": 0.034751240042086275, - "flos": 23259413612160.0, - "grad_norm": 4.535240156776142, - "language_loss": 0.72226608, - "learning_rate": 3.9997633453210845e-06, - "loss": 0.74560666, - "num_input_tokens_seen": 12224850, - "step": 578, - "time_per_iteration": 4.486239433288574 - }, - { - "auxiliary_loss_clip": 0.01251871, - "auxiliary_loss_mlp": 0.01080519, - "balance_loss_clip": 1.06461096, - "balance_loss_mlp": 1.04663968, - "epoch": 0.03481136329475425, - "flos": 23769237300480.0, - "grad_norm": 1.9496379050984929, - "language_loss": 0.77785492, - "learning_rate": 3.999757316265973e-06, - "loss": 0.80117887, - "num_input_tokens_seen": 12244935, - "step": 579, - "time_per_iteration": 2.6706583499908447 - }, - { - "auxiliary_loss_clip": 0.01251647, - "auxiliary_loss_mlp": 0.01087497, - "balance_loss_clip": 1.06656826, - "balance_loss_mlp": 1.05435717, - "epoch": 0.03487148654742222, - "flos": 20157521794560.0, - "grad_norm": 2.054973215074824, - "language_loss": 0.86841297, - "learning_rate": 3.999751211379863e-06, - "loss": 0.8918044, - "num_input_tokens_seen": 12262140, - "step": 580, - "time_per_iteration": 2.639146566390991 - }, - { - "auxiliary_loss_clip": 0.01256528, - "auxiliary_loss_mlp": 0.01069029, - "balance_loss_clip": 1.06636667, - "balance_loss_mlp": 1.0398469, - "epoch": 0.034931609800090184, - "flos": 15669585918720.0, - "grad_norm": 2.205850105033732, - "language_loss": 0.82570344, - "learning_rate": 3.999745030662987e-06, - "loss": 0.84895897, - "num_input_tokens_seen": 12280930, - "step": 581, - "time_per_iteration": 2.6505649089813232 - }, - { - "auxiliary_loss_clip": 0.01252942, - "auxiliary_loss_mlp": 0.01072317, - "balance_loss_clip": 1.06823969, - "balance_loss_mlp": 1.04168022, - "epoch": 0.034991733052758156, - "flos": 16362374509440.0, - "grad_norm": 2.1922492117358146, - "language_loss": 0.7733047, - "learning_rate": 3.99973877411558e-06, - "loss": 0.79655731, - "num_input_tokens_seen": 12299125, - "step": 582, - "time_per_iteration": 2.7323596477508545 - }, - { - "auxiliary_loss_clip": 0.01250253, - "auxiliary_loss_mlp": 0.01082356, - "balance_loss_clip": 1.06794167, - "balance_loss_mlp": 1.04861939, - "epoch": 0.03505185630542612, - "flos": 19387309438080.0, - "grad_norm": 2.1536178016194327, - "language_loss": 0.87679923, - "learning_rate": 3.999732441737877e-06, - "loss": 0.90012532, - "num_input_tokens_seen": 12316905, - "step": 583, - "time_per_iteration": 2.6049294471740723 - }, - { - "auxiliary_loss_clip": 0.01255473, - "auxiliary_loss_mlp": 0.01092826, - "balance_loss_clip": 1.06699181, - "balance_loss_mlp": 1.06104505, - "epoch": 0.03511197955809409, - "flos": 21323828401920.0, - "grad_norm": 3.7027110169592015, - "language_loss": 0.81196821, - "learning_rate": 3.99972603353012e-06, - "loss": 0.83545119, - "num_input_tokens_seen": 12335070, - "step": 584, - "time_per_iteration": 2.6011815071105957 - }, - { - "auxiliary_loss_clip": 0.01251161, - "auxiliary_loss_mlp": 0.01069463, - "balance_loss_clip": 1.06472683, - "balance_loss_mlp": 1.03832567, - "epoch": 0.035172102810762065, - "flos": 14136595320960.0, - "grad_norm": 3.067717812226321, - "language_loss": 0.92399198, - "learning_rate": 3.999719549492551e-06, - "loss": 0.94719815, - "num_input_tokens_seen": 12350315, - "step": 585, - "time_per_iteration": 2.5592780113220215 - }, - { - "auxiliary_loss_clip": 0.01251271, - "auxiliary_loss_mlp": 0.01077423, - "balance_loss_clip": 1.06562734, - "balance_loss_mlp": 1.04552317, - "epoch": 0.03523222606343003, - "flos": 20296890564480.0, - "grad_norm": 2.196660024103635, - "language_loss": 0.87644351, - "learning_rate": 3.9997129896254165e-06, - "loss": 0.89973044, - "num_input_tokens_seen": 12366030, - "step": 586, - "time_per_iteration": 2.5486221313476562 - }, - { - "auxiliary_loss_clip": 0.01256485, - "auxiliary_loss_mlp": 0.0108018, - "balance_loss_clip": 1.06803596, - "balance_loss_mlp": 1.04918551, - "epoch": 0.035292349316098, - "flos": 20375822701440.0, - "grad_norm": 2.1222089199850878, - "language_loss": 0.76079381, - "learning_rate": 3.999706353928965e-06, - "loss": 0.78416049, - "num_input_tokens_seen": 12384895, - "step": 587, - "time_per_iteration": 2.5923714637756348 - }, - { - "auxiliary_loss_clip": 0.01257125, - "auxiliary_loss_mlp": 0.01068649, - "balance_loss_clip": 1.06683922, - "balance_loss_mlp": 1.03586686, - "epoch": 0.03535247256876597, - "flos": 21468871520640.0, - "grad_norm": 2.212352192395094, - "language_loss": 0.78601038, - "learning_rate": 3.999699642403449e-06, - "loss": 0.80926806, - "num_input_tokens_seen": 12404980, - "step": 588, - "time_per_iteration": 2.579280138015747 - }, - { - "auxiliary_loss_clip": 0.0125398, - "auxiliary_loss_mlp": 0.0107827, - "balance_loss_clip": 1.06582928, - "balance_loss_mlp": 1.04367518, - "epoch": 0.03541259582143394, - "flos": 23623044946560.0, - "grad_norm": 2.153589114745919, - "language_loss": 0.94312829, - "learning_rate": 3.99969285504912e-06, - "loss": 0.96645081, - "num_input_tokens_seen": 12423835, - "step": 589, - "time_per_iteration": 2.5964701175689697 - }, - { - "auxiliary_loss_clip": 0.01256884, - "auxiliary_loss_mlp": 0.01078108, - "balance_loss_clip": 1.06697679, - "balance_loss_mlp": 1.04666042, - "epoch": 0.03547271907410191, - "flos": 33726367768320.0, - "grad_norm": 2.1162556876212695, - "language_loss": 0.84116042, - "learning_rate": 3.99968599186624e-06, - "loss": 0.8645103, - "num_input_tokens_seen": 12443135, - "step": 590, - "time_per_iteration": 2.746436357498169 - }, - { - "auxiliary_loss_clip": 0.01249398, - "auxiliary_loss_mlp": 0.01068452, - "balance_loss_clip": 1.06658125, - "balance_loss_mlp": 1.03893578, - "epoch": 0.03553284232676988, - "flos": 21142695093120.0, - "grad_norm": 1.984522351394552, - "language_loss": 0.8684091, - "learning_rate": 3.999679052855065e-06, - "loss": 0.89158762, - "num_input_tokens_seen": 12462895, - "step": 591, - "time_per_iteration": 2.692303419113159 - }, - { - "auxiliary_loss_clip": 0.01250641, - "auxiliary_loss_mlp": 0.01082122, - "balance_loss_clip": 1.06297326, - "balance_loss_mlp": 1.04883862, - "epoch": 0.03559296557943785, - "flos": 20046593617920.0, - "grad_norm": 2.0873185001780783, - "language_loss": 0.83075488, - "learning_rate": 3.999672038015861e-06, - "loss": 0.85408247, - "num_input_tokens_seen": 12481515, - "step": 592, - "time_per_iteration": 2.7822203636169434 - }, - { - "auxiliary_loss_clip": 0.01146211, - "auxiliary_loss_mlp": 0.01034159, - "balance_loss_clip": 1.05013406, - "balance_loss_mlp": 1.02676773, - "epoch": 0.035653088832105814, - "flos": 60334597244160.0, - "grad_norm": 0.8804992705477848, - "language_loss": 0.59754086, - "learning_rate": 3.999664947348893e-06, - "loss": 0.61934447, - "num_input_tokens_seen": 12548220, - "step": 593, - "time_per_iteration": 3.274080276489258 - }, - { - "auxiliary_loss_clip": 0.01249386, - "auxiliary_loss_mlp": 0.0107742, - "balance_loss_clip": 1.06737614, - "balance_loss_mlp": 1.04473329, - "epoch": 0.035713212084773786, - "flos": 20113135562880.0, - "grad_norm": 1.8086551314359374, - "language_loss": 0.87077361, - "learning_rate": 3.999657780854429e-06, - "loss": 0.89404166, - "num_input_tokens_seen": 12566105, - "step": 594, - "time_per_iteration": 2.682236671447754 - }, - { - "auxiliary_loss_clip": 0.012487, - "auxiliary_loss_mlp": 0.01082358, - "balance_loss_clip": 1.06235993, - "balance_loss_mlp": 1.05057716, - "epoch": 0.03577333533744176, - "flos": 26285785084800.0, - "grad_norm": 5.516524335860627, - "language_loss": 0.83920246, - "learning_rate": 3.999650538532742e-06, - "loss": 0.86251307, - "num_input_tokens_seen": 12586680, - "step": 595, - "time_per_iteration": 2.773669481277466 - }, - { - "auxiliary_loss_clip": 0.01248678, - "auxiliary_loss_mlp": 0.01090544, - "balance_loss_clip": 1.06579614, - "balance_loss_mlp": 1.05850017, - "epoch": 0.035833458590109724, - "flos": 10889732211840.0, - "grad_norm": 2.3448814752825204, - "language_loss": 0.96041518, - "learning_rate": 3.999643220384106e-06, - "loss": 0.98380733, - "num_input_tokens_seen": 12601605, - "step": 596, - "time_per_iteration": 2.6541590690612793 - }, - { - "auxiliary_loss_clip": 0.01252662, - "auxiliary_loss_mlp": 0.01081887, - "balance_loss_clip": 1.0675534, - "balance_loss_mlp": 1.05165553, - "epoch": 0.035893581842777696, - "flos": 22090198003200.0, - "grad_norm": 2.4353221882859004, - "language_loss": 0.82993281, - "learning_rate": 3.999635826408799e-06, - "loss": 0.85327828, - "num_input_tokens_seen": 12620365, - "step": 597, - "time_per_iteration": 2.7023818492889404 - }, - { - "auxiliary_loss_clip": 0.01247839, - "auxiliary_loss_mlp": 0.01079829, - "balance_loss_clip": 1.0668776, - "balance_loss_mlp": 1.04766583, - "epoch": 0.03595370509544566, - "flos": 23038347358080.0, - "grad_norm": 2.374757318483944, - "language_loss": 0.81364304, - "learning_rate": 3.999628356607101e-06, - "loss": 0.83691972, - "num_input_tokens_seen": 12641140, - "step": 598, - "time_per_iteration": 2.731229782104492 - }, - { - "auxiliary_loss_clip": 0.01243692, - "auxiliary_loss_mlp": 0.01077827, - "balance_loss_clip": 1.0663228, - "balance_loss_mlp": 1.04587913, - "epoch": 0.03601382834811363, - "flos": 20777734955520.0, - "grad_norm": 1.817680341814684, - "language_loss": 0.81172699, - "learning_rate": 3.999620810979295e-06, - "loss": 0.83494222, - "num_input_tokens_seen": 12661080, - "step": 599, - "time_per_iteration": 2.710191011428833 - }, - { - "auxiliary_loss_clip": 0.01250419, - "auxiliary_loss_mlp": 0.01074577, - "balance_loss_clip": 1.06356514, - "balance_loss_mlp": 1.045228, - "epoch": 0.036073951600781605, - "flos": 23951627585280.0, - "grad_norm": 2.3963649020429627, - "language_loss": 0.8651731, - "learning_rate": 3.999613189525668e-06, - "loss": 0.88842309, - "num_input_tokens_seen": 12678270, - "step": 600, - "time_per_iteration": 2.682262420654297 - }, - { - "auxiliary_loss_clip": 0.01241882, - "auxiliary_loss_mlp": 0.01084809, - "balance_loss_clip": 1.05918193, - "balance_loss_mlp": 1.05297971, - "epoch": 0.03613407485344957, - "flos": 18912283050240.0, - "grad_norm": 2.0308947613075423, - "language_loss": 0.82355881, - "learning_rate": 3.999605492246508e-06, - "loss": 0.84682572, - "num_input_tokens_seen": 12697295, - "step": 601, - "time_per_iteration": 2.6570894718170166 - }, - { - "auxiliary_loss_clip": 0.01240868, - "auxiliary_loss_mlp": 0.010708, - "balance_loss_clip": 1.06129336, - "balance_loss_mlp": 1.03920949, - "epoch": 0.03619419810611754, - "flos": 23038526926080.0, - "grad_norm": 2.3080142694085555, - "language_loss": 0.7502507, - "learning_rate": 3.999597719142107e-06, - "loss": 0.77336735, - "num_input_tokens_seen": 12716165, - "step": 602, - "time_per_iteration": 2.6434237957000732 - }, - { - "auxiliary_loss_clip": 0.01239543, - "auxiliary_loss_mlp": 0.01066859, - "balance_loss_clip": 1.0604254, - "balance_loss_mlp": 1.03562629, - "epoch": 0.03625432135878551, - "flos": 29457774293760.0, - "grad_norm": 1.9681237382646195, - "language_loss": 0.79599822, - "learning_rate": 3.999589870212761e-06, - "loss": 0.81906223, - "num_input_tokens_seen": 12735475, - "step": 603, - "time_per_iteration": 2.7201666831970215 - }, - { - "auxiliary_loss_clip": 0.01244834, - "auxiliary_loss_mlp": 0.01071177, - "balance_loss_clip": 1.06545615, - "balance_loss_mlp": 1.04130292, - "epoch": 0.03631444461145348, - "flos": 23508525409920.0, - "grad_norm": 1.8363641170913294, - "language_loss": 0.86668456, - "learning_rate": 3.9995819454587664e-06, - "loss": 0.88984472, - "num_input_tokens_seen": 12754540, - "step": 604, - "time_per_iteration": 2.60249924659729 - }, - { - "auxiliary_loss_clip": 0.01248906, - "auxiliary_loss_mlp": 0.01072985, - "balance_loss_clip": 1.0674324, - "balance_loss_mlp": 1.04010737, - "epoch": 0.03637456786412145, - "flos": 16618130323200.0, - "grad_norm": 2.510130211393037, - "language_loss": 0.80746496, - "learning_rate": 3.999573944880424e-06, - "loss": 0.83068383, - "num_input_tokens_seen": 12773050, - "step": 605, - "time_per_iteration": 2.766684055328369 - }, - { - "auxiliary_loss_clip": 0.01244274, - "auxiliary_loss_mlp": 0.0107873, - "balance_loss_clip": 1.0630821, - "balance_loss_mlp": 1.04846251, - "epoch": 0.03643469111678942, - "flos": 15851832549120.0, - "grad_norm": 2.2216143800596835, - "language_loss": 0.85942292, - "learning_rate": 3.9995658684780375e-06, - "loss": 0.882653, - "num_input_tokens_seen": 12791240, - "step": 606, - "time_per_iteration": 2.6133925914764404 - }, - { - "auxiliary_loss_clip": 0.01247732, - "auxiliary_loss_mlp": 0.01077404, - "balance_loss_clip": 1.06413972, - "balance_loss_mlp": 1.04588532, - "epoch": 0.03649481436945739, - "flos": 23620387340160.0, - "grad_norm": 2.0684825764003394, - "language_loss": 0.82179952, - "learning_rate": 3.999557716251912e-06, - "loss": 0.84505081, - "num_input_tokens_seen": 12812245, - "step": 607, - "time_per_iteration": 2.6805856227874756 - }, - { - "auxiliary_loss_clip": 0.01245394, - "auxiliary_loss_mlp": 0.01073743, - "balance_loss_clip": 1.06585169, - "balance_loss_mlp": 1.04317796, - "epoch": 0.036554937622125354, - "flos": 21755581879680.0, - "grad_norm": 2.3717179235904533, - "language_loss": 0.83567071, - "learning_rate": 3.999549488202358e-06, - "loss": 0.8588621, - "num_input_tokens_seen": 12831085, - "step": 608, - "time_per_iteration": 2.6593453884124756 - }, - { - "auxiliary_loss_clip": 0.01251062, - "auxiliary_loss_mlp": 0.01073705, - "balance_loss_clip": 1.06682992, - "balance_loss_mlp": 1.04006422, - "epoch": 0.036615060874793326, - "flos": 17819772935040.0, - "grad_norm": 2.4795108668903305, - "language_loss": 0.8201133, - "learning_rate": 3.999541184329688e-06, - "loss": 0.84336102, - "num_input_tokens_seen": 12849115, - "step": 609, - "time_per_iteration": 2.6299383640289307 - }, - { - "auxiliary_loss_clip": 0.01255655, - "auxiliary_loss_mlp": 0.01091893, - "balance_loss_clip": 1.07322037, - "balance_loss_mlp": 1.06158984, - "epoch": 0.0366751841274613, - "flos": 26753808320640.0, - "grad_norm": 1.992640540297191, - "language_loss": 0.79448462, - "learning_rate": 3.999532804634215e-06, - "loss": 0.81796008, - "num_input_tokens_seen": 12868005, - "step": 610, - "time_per_iteration": 2.65120530128479 - }, - { - "auxiliary_loss_clip": 0.01254423, - "auxiliary_loss_mlp": 0.01088228, - "balance_loss_clip": 1.06914616, - "balance_loss_mlp": 1.05656588, - "epoch": 0.03673530738012926, - "flos": 22196960202240.0, - "grad_norm": 1.9328503999291824, - "language_loss": 0.87282723, - "learning_rate": 3.9995243491162575e-06, - "loss": 0.89625371, - "num_input_tokens_seen": 12886890, - "step": 611, - "time_per_iteration": 2.7398059368133545 - }, - { - "auxiliary_loss_clip": 0.01248885, - "auxiliary_loss_mlp": 0.01097673, - "balance_loss_clip": 1.06917143, - "balance_loss_mlp": 1.06651139, - "epoch": 0.036795430632797235, - "flos": 24681655601280.0, - "grad_norm": 3.7435200854847266, - "language_loss": 0.72589231, - "learning_rate": 3.999515817776136e-06, - "loss": 0.74935788, - "num_input_tokens_seen": 12906130, - "step": 612, - "time_per_iteration": 2.700406551361084 - }, - { - "auxiliary_loss_clip": 0.01249112, - "auxiliary_loss_mlp": 0.01076924, - "balance_loss_clip": 1.06581926, - "balance_loss_mlp": 1.04480934, - "epoch": 0.0368555538854652, - "flos": 17748921358080.0, - "grad_norm": 3.0863603820013434, - "language_loss": 0.79110008, - "learning_rate": 3.999507210614175e-06, - "loss": 0.81436038, - "num_input_tokens_seen": 12925260, - "step": 613, - "time_per_iteration": 2.630472183227539 - }, - { - "auxiliary_loss_clip": 0.01242581, - "auxiliary_loss_mlp": 0.01090278, - "balance_loss_clip": 1.06378841, - "balance_loss_mlp": 1.05961776, - "epoch": 0.03691567713813317, - "flos": 20594554571520.0, - "grad_norm": 2.2015687298668336, - "language_loss": 0.93885028, - "learning_rate": 3.9994985276307e-06, - "loss": 0.96217889, - "num_input_tokens_seen": 12944590, - "step": 614, - "time_per_iteration": 2.6977972984313965 - }, - { - "auxiliary_loss_clip": 0.01254503, - "auxiliary_loss_mlp": 0.01081137, - "balance_loss_clip": 1.07009673, - "balance_loss_mlp": 1.04732919, - "epoch": 0.036975800390801145, - "flos": 33650380546560.0, - "grad_norm": 3.0661216019279576, - "language_loss": 0.72932875, - "learning_rate": 3.999489768826041e-06, - "loss": 0.75268513, - "num_input_tokens_seen": 12964785, - "step": 615, - "time_per_iteration": 2.697291612625122 - }, - { - "auxiliary_loss_clip": 0.01250213, - "auxiliary_loss_mlp": 0.010716, - "balance_loss_clip": 1.06649876, - "balance_loss_mlp": 1.04015231, - "epoch": 0.03703592364346911, - "flos": 28293694329600.0, - "grad_norm": 2.9941392641088695, - "language_loss": 0.81630868, - "learning_rate": 3.999480934200528e-06, - "loss": 0.83952683, - "num_input_tokens_seen": 12986705, - "step": 616, - "time_per_iteration": 4.1762495040893555 - }, - { - "auxiliary_loss_clip": 0.0124999, - "auxiliary_loss_mlp": 0.01076541, - "balance_loss_clip": 1.06807041, - "balance_loss_mlp": 1.0467627, - "epoch": 0.03709604689613708, - "flos": 31504215853440.0, - "grad_norm": 2.320593216419041, - "language_loss": 0.68178958, - "learning_rate": 3.999472023754499e-06, - "loss": 0.70505488, - "num_input_tokens_seen": 13010560, - "step": 617, - "time_per_iteration": 4.224538564682007 - }, - { - "auxiliary_loss_clip": 0.01254259, - "auxiliary_loss_mlp": 0.010771, - "balance_loss_clip": 1.07098567, - "balance_loss_mlp": 1.04415071, - "epoch": 0.03715617014880505, - "flos": 19609381272960.0, - "grad_norm": 2.245411088847763, - "language_loss": 0.80595517, - "learning_rate": 3.99946303748829e-06, - "loss": 0.82926875, - "num_input_tokens_seen": 13028935, - "step": 618, - "time_per_iteration": 4.200341463088989 - }, - { - "auxiliary_loss_clip": 0.01257669, - "auxiliary_loss_mlp": 0.01079294, - "balance_loss_clip": 1.06808555, - "balance_loss_mlp": 1.04605901, - "epoch": 0.03721629340147302, - "flos": 15924192497280.0, - "grad_norm": 10.155035046705617, - "language_loss": 0.91591841, - "learning_rate": 3.999453975402242e-06, - "loss": 0.93928802, - "num_input_tokens_seen": 13046000, - "step": 619, - "time_per_iteration": 2.5787301063537598 - }, - { - "auxiliary_loss_clip": 0.01251145, - "auxiliary_loss_mlp": 0.01083548, - "balance_loss_clip": 1.06999123, - "balance_loss_mlp": 1.05181432, - "epoch": 0.03727641665414099, - "flos": 21104090951040.0, - "grad_norm": 2.0803022158745406, - "language_loss": 0.94071603, - "learning_rate": 3.9994448374967e-06, - "loss": 0.96406299, - "num_input_tokens_seen": 13062995, - "step": 620, - "time_per_iteration": 2.5987205505371094 - }, - { - "auxiliary_loss_clip": 0.01249568, - "auxiliary_loss_mlp": 0.0108317, - "balance_loss_clip": 1.06624317, - "balance_loss_mlp": 1.0502919, - "epoch": 0.037336539906808956, - "flos": 24131683486080.0, - "grad_norm": 1.7431896174296577, - "language_loss": 0.77319217, - "learning_rate": 3.999435623772008e-06, - "loss": 0.79651952, - "num_input_tokens_seen": 13084120, - "step": 621, - "time_per_iteration": 2.68758225440979 - }, - { - "auxiliary_loss_clip": 0.01247252, - "auxiliary_loss_mlp": 0.01071013, - "balance_loss_clip": 1.06894088, - "balance_loss_mlp": 1.03792048, - "epoch": 0.03739666315947693, - "flos": 22346384780160.0, - "grad_norm": 2.3852872810563364, - "language_loss": 0.86546707, - "learning_rate": 3.999426334228518e-06, - "loss": 0.88864976, - "num_input_tokens_seen": 13100035, - "step": 622, - "time_per_iteration": 2.607121467590332 - }, - { - "auxiliary_loss_clip": 0.012499, - "auxiliary_loss_mlp": 0.01072461, - "balance_loss_clip": 1.06715882, - "balance_loss_mlp": 1.04048872, - "epoch": 0.0374567864121449, - "flos": 20449511452800.0, - "grad_norm": 2.2621736327299766, - "language_loss": 0.90008956, - "learning_rate": 3.999416968866581e-06, - "loss": 0.92331314, - "num_input_tokens_seen": 13118070, - "step": 623, - "time_per_iteration": 2.6513512134552 - }, - { - "auxiliary_loss_clip": 0.01251762, - "auxiliary_loss_mlp": 0.01090534, - "balance_loss_clip": 1.07006013, - "balance_loss_mlp": 1.05844235, - "epoch": 0.037516909664812866, - "flos": 19208043636480.0, - "grad_norm": 2.760597076727266, - "language_loss": 0.84095174, - "learning_rate": 3.999407527686551e-06, - "loss": 0.8643747, - "num_input_tokens_seen": 13136355, - "step": 624, - "time_per_iteration": 2.66623592376709 - }, - { - "auxiliary_loss_clip": 0.01252431, - "auxiliary_loss_mlp": 0.01076353, - "balance_loss_clip": 1.06697702, - "balance_loss_mlp": 1.04423809, - "epoch": 0.03757703291748084, - "flos": 35005218664320.0, - "grad_norm": 4.259276014089895, - "language_loss": 0.66778994, - "learning_rate": 3.999398010688788e-06, - "loss": 0.69107783, - "num_input_tokens_seen": 13155435, - "step": 625, - "time_per_iteration": 2.7288877964019775 - }, - { - "auxiliary_loss_clip": 0.01244959, - "auxiliary_loss_mlp": 0.01076274, - "balance_loss_clip": 1.06605244, - "balance_loss_mlp": 1.042943, - "epoch": 0.0376371561701488, - "flos": 25483899911040.0, - "grad_norm": 3.375450269409945, - "language_loss": 0.77496696, - "learning_rate": 3.999388417873652e-06, - "loss": 0.79817927, - "num_input_tokens_seen": 13174295, - "step": 626, - "time_per_iteration": 2.648942470550537 - }, - { - "auxiliary_loss_clip": 0.01249107, - "auxiliary_loss_mlp": 0.0108376, - "balance_loss_clip": 1.06770003, - "balance_loss_mlp": 1.05200303, - "epoch": 0.037697279422816775, - "flos": 18185630912640.0, - "grad_norm": 2.0480468386724766, - "language_loss": 0.81463408, - "learning_rate": 3.999378749241506e-06, - "loss": 0.83796275, - "num_input_tokens_seen": 13192500, - "step": 627, - "time_per_iteration": 2.6209845542907715 - }, - { - "auxiliary_loss_clip": 0.01254363, - "auxiliary_loss_mlp": 0.01084942, - "balance_loss_clip": 1.07041132, - "balance_loss_mlp": 1.05215955, - "epoch": 0.03775740267548475, - "flos": 24644272521600.0, - "grad_norm": 1.6934072791943036, - "language_loss": 0.88809037, - "learning_rate": 3.999369004792719e-06, - "loss": 0.91148341, - "num_input_tokens_seen": 13213470, - "step": 628, - "time_per_iteration": 2.7221415042877197 - }, - { - "auxiliary_loss_clip": 0.01247303, - "auxiliary_loss_mlp": 0.01080197, - "balance_loss_clip": 1.0627017, - "balance_loss_mlp": 1.04765344, - "epoch": 0.03781752592815271, - "flos": 21288205088640.0, - "grad_norm": 2.536151380104699, - "language_loss": 0.79840028, - "learning_rate": 3.999359184527658e-06, - "loss": 0.82167524, - "num_input_tokens_seen": 13232365, - "step": 629, - "time_per_iteration": 2.6535024642944336 - }, - { - "auxiliary_loss_clip": 0.01249218, - "auxiliary_loss_mlp": 0.0106958, - "balance_loss_clip": 1.06675959, - "balance_loss_mlp": 1.03885961, - "epoch": 0.037877649180820684, - "flos": 22089623385600.0, - "grad_norm": 1.6861994278356789, - "language_loss": 0.76824844, - "learning_rate": 3.999349288446696e-06, - "loss": 0.79143643, - "num_input_tokens_seen": 13251920, - "step": 630, - "time_per_iteration": 2.6175966262817383 - }, - { - "auxiliary_loss_clip": 0.01254291, - "auxiliary_loss_mlp": 0.01075963, - "balance_loss_clip": 1.06833327, - "balance_loss_mlp": 1.04504025, - "epoch": 0.03793777243348865, - "flos": 14501339976960.0, - "grad_norm": 3.12435515576561, - "language_loss": 0.91593724, - "learning_rate": 3.99933931655021e-06, - "loss": 0.93923974, - "num_input_tokens_seen": 13267440, - "step": 631, - "time_per_iteration": 2.565293788909912 - }, - { - "auxiliary_loss_clip": 0.01243525, - "auxiliary_loss_mlp": 0.01087901, - "balance_loss_clip": 1.06386209, - "balance_loss_mlp": 1.05356884, - "epoch": 0.03799789568615662, - "flos": 21908418249600.0, - "grad_norm": 1.6822536287963328, - "language_loss": 0.92157543, - "learning_rate": 3.999329268838575e-06, - "loss": 0.94488978, - "num_input_tokens_seen": 13287850, - "step": 632, - "time_per_iteration": 2.6235203742980957 - }, - { - "auxiliary_loss_clip": 0.01248362, - "auxiliary_loss_mlp": 0.01067296, - "balance_loss_clip": 1.06696796, - "balance_loss_mlp": 1.03613472, - "epoch": 0.03805801893882459, - "flos": 24827021942400.0, - "grad_norm": 2.1097171792430456, - "language_loss": 0.83139223, - "learning_rate": 3.999319145312175e-06, - "loss": 0.85454881, - "num_input_tokens_seen": 13307760, - "step": 633, - "time_per_iteration": 2.6461985111236572 - }, - { - "auxiliary_loss_clip": 0.01247735, - "auxiliary_loss_mlp": 0.01079895, - "balance_loss_clip": 1.06473529, - "balance_loss_mlp": 1.04811358, - "epoch": 0.03811814219149256, - "flos": 30482952364800.0, - "grad_norm": 1.599115294194595, - "language_loss": 0.69883299, - "learning_rate": 3.999308945971392e-06, - "loss": 0.72210932, - "num_input_tokens_seen": 13331230, - "step": 634, - "time_per_iteration": 2.709033727645874 - }, - { - "auxiliary_loss_clip": 0.01133204, - "auxiliary_loss_mlp": 0.01009504, - "balance_loss_clip": 1.04124916, - "balance_loss_mlp": 1.00249422, - "epoch": 0.03817826544416053, - "flos": 66992577379200.0, - "grad_norm": 0.893126545279708, - "language_loss": 0.61645919, - "learning_rate": 3.999298670816614e-06, - "loss": 0.63788629, - "num_input_tokens_seen": 13394760, - "step": 635, - "time_per_iteration": 3.2099475860595703 - }, - { - "auxiliary_loss_clip": 0.01244276, - "auxiliary_loss_mlp": 0.01072984, - "balance_loss_clip": 1.06475401, - "balance_loss_mlp": 1.04129851, - "epoch": 0.038238388696828496, - "flos": 20485350247680.0, - "grad_norm": 2.0563589539657205, - "language_loss": 0.83629507, - "learning_rate": 3.9992883198482294e-06, - "loss": 0.85946769, - "num_input_tokens_seen": 13412775, - "step": 636, - "time_per_iteration": 2.6278960704803467 - }, - { - "auxiliary_loss_clip": 0.01248078, - "auxiliary_loss_mlp": 0.01096471, - "balance_loss_clip": 1.06714165, - "balance_loss_mlp": 1.06530952, - "epoch": 0.03829851194949647, - "flos": 17965893461760.0, - "grad_norm": 2.346379148367956, - "language_loss": 0.79578567, - "learning_rate": 3.999277893066632e-06, - "loss": 0.81923115, - "num_input_tokens_seen": 13427835, - "step": 637, - "time_per_iteration": 2.646414279937744 - }, - { - "auxiliary_loss_clip": 0.01247939, - "auxiliary_loss_mlp": 0.01088528, - "balance_loss_clip": 1.06356907, - "balance_loss_mlp": 1.0562222, - "epoch": 0.03835863520216444, - "flos": 22456522857600.0, - "grad_norm": 1.9563283234999833, - "language_loss": 0.83989692, - "learning_rate": 3.999267390472215e-06, - "loss": 0.86326158, - "num_input_tokens_seen": 13447295, - "step": 638, - "time_per_iteration": 2.6416285037994385 - }, - { - "auxiliary_loss_clip": 0.01253172, - "auxiliary_loss_mlp": 0.01074704, - "balance_loss_clip": 1.06563985, - "balance_loss_mlp": 1.04163575, - "epoch": 0.038418758454832405, - "flos": 22164425458560.0, - "grad_norm": 2.5596504471077224, - "language_loss": 0.70109725, - "learning_rate": 3.999256812065381e-06, - "loss": 0.72437602, - "num_input_tokens_seen": 13468455, - "step": 639, - "time_per_iteration": 2.610682487487793 - }, - { - "auxiliary_loss_clip": 0.01248829, - "auxiliary_loss_mlp": 0.01081808, - "balance_loss_clip": 1.06618333, - "balance_loss_mlp": 1.04790449, - "epoch": 0.03847888170750038, - "flos": 22747435107840.0, - "grad_norm": 2.5791624605537082, - "language_loss": 0.85322344, - "learning_rate": 3.999246157846526e-06, - "loss": 0.87652987, - "num_input_tokens_seen": 13489085, - "step": 640, - "time_per_iteration": 2.700456380844116 - }, - { - "auxiliary_loss_clip": 0.01252579, - "auxiliary_loss_mlp": 0.01083722, - "balance_loss_clip": 1.06751871, - "balance_loss_mlp": 1.04934239, - "epoch": 0.03853900496016834, - "flos": 22711201263360.0, - "grad_norm": 2.331268680461456, - "language_loss": 0.82141805, - "learning_rate": 3.9992354278160574e-06, - "loss": 0.84478104, - "num_input_tokens_seen": 13509120, - "step": 641, - "time_per_iteration": 2.6572046279907227 - }, - { - "auxiliary_loss_clip": 0.0112759, - "auxiliary_loss_mlp": 0.01008008, - "balance_loss_clip": 1.03825259, - "balance_loss_mlp": 1.00095105, - "epoch": 0.038599128212836314, - "flos": 70399136355840.0, - "grad_norm": 0.9037629700551453, - "language_loss": 0.65444964, - "learning_rate": 3.999224621974381e-06, - "loss": 0.67580563, - "num_input_tokens_seen": 13562005, - "step": 642, - "time_per_iteration": 3.199925422668457 - }, - { - "auxiliary_loss_clip": 0.01246698, - "auxiliary_loss_mlp": 0.01064563, - "balance_loss_clip": 1.0651319, - "balance_loss_mlp": 1.03453398, - "epoch": 0.03865925146550429, - "flos": 23295144666240.0, - "grad_norm": 1.9113268312481755, - "language_loss": 0.79272145, - "learning_rate": 3.999213740321906e-06, - "loss": 0.81583405, - "num_input_tokens_seen": 13582185, - "step": 643, - "time_per_iteration": 2.641437292098999 - }, - { - "auxiliary_loss_clip": 0.01244786, - "auxiliary_loss_mlp": 0.01076057, - "balance_loss_clip": 1.06219232, - "balance_loss_mlp": 1.04599261, - "epoch": 0.03871937471817225, - "flos": 21430446946560.0, - "grad_norm": 2.2104774200729262, - "language_loss": 0.8294487, - "learning_rate": 3.999202782859046e-06, - "loss": 0.85265714, - "num_input_tokens_seen": 13599555, - "step": 644, - "time_per_iteration": 2.600558280944824 - }, - { - "auxiliary_loss_clip": 0.01247273, - "auxiliary_loss_mlp": 0.01074554, - "balance_loss_clip": 1.06383467, - "balance_loss_mlp": 1.04193854, - "epoch": 0.038779497970840224, - "flos": 34277309550720.0, - "grad_norm": 1.994902925690418, - "language_loss": 0.82286513, - "learning_rate": 3.9991917495862165e-06, - "loss": 0.8460834, - "num_input_tokens_seen": 13621160, - "step": 645, - "time_per_iteration": 2.6751983165740967 - }, - { - "auxiliary_loss_clip": 0.01248631, - "auxiliary_loss_mlp": 0.01070807, - "balance_loss_clip": 1.06525111, - "balance_loss_mlp": 1.03890657, - "epoch": 0.03883962122350819, - "flos": 22748189293440.0, - "grad_norm": 2.290384247239265, - "language_loss": 0.81889713, - "learning_rate": 3.9991806405038345e-06, - "loss": 0.84209144, - "num_input_tokens_seen": 13641915, - "step": 646, - "time_per_iteration": 2.6987667083740234 - }, - { - "auxiliary_loss_clip": 0.01250204, - "auxiliary_loss_mlp": 0.01078836, - "balance_loss_clip": 1.06982899, - "balance_loss_mlp": 1.04791331, - "epoch": 0.03889974447617616, - "flos": 21945837242880.0, - "grad_norm": 1.9171219640425325, - "language_loss": 0.82015383, - "learning_rate": 3.999169455612323e-06, - "loss": 0.84344423, - "num_input_tokens_seen": 13661410, - "step": 647, - "time_per_iteration": 2.590102195739746 - }, - { - "auxiliary_loss_clip": 0.0124696, - "auxiliary_loss_mlp": 0.01072111, - "balance_loss_clip": 1.06628954, - "balance_loss_mlp": 1.04216528, - "epoch": 0.03895986772884413, - "flos": 31504826384640.0, - "grad_norm": 1.9398424653049293, - "language_loss": 0.84477997, - "learning_rate": 3.999158194912106e-06, - "loss": 0.86797059, - "num_input_tokens_seen": 13681705, - "step": 648, - "time_per_iteration": 2.7516121864318848 - }, - { - "auxiliary_loss_clip": 0.01244808, - "auxiliary_loss_mlp": 0.0107293, - "balance_loss_clip": 1.06524062, - "balance_loss_mlp": 1.04210222, - "epoch": 0.0390199909815121, - "flos": 19901011795200.0, - "grad_norm": 2.3870859420748136, - "language_loss": 0.84254295, - "learning_rate": 3.9991468584036086e-06, - "loss": 0.86572027, - "num_input_tokens_seen": 13700400, - "step": 649, - "time_per_iteration": 2.6116180419921875 - }, - { - "auxiliary_loss_clip": 0.01246653, - "auxiliary_loss_mlp": 0.01073574, - "balance_loss_clip": 1.06560743, - "balance_loss_mlp": 1.0416739, - "epoch": 0.03908011423418007, - "flos": 21612478095360.0, - "grad_norm": 2.00775905451926, - "language_loss": 0.79783499, - "learning_rate": 3.999135446087263e-06, - "loss": 0.82103723, - "num_input_tokens_seen": 13720145, - "step": 650, - "time_per_iteration": 2.574939727783203 - }, - { - "auxiliary_loss_clip": 0.01242721, - "auxiliary_loss_mlp": 0.01077536, - "balance_loss_clip": 1.06209707, - "balance_loss_mlp": 1.04534984, - "epoch": 0.039140237486848035, - "flos": 18661411486080.0, - "grad_norm": 2.334811800093409, - "language_loss": 0.78698987, - "learning_rate": 3.9991239579635e-06, - "loss": 0.81019247, - "num_input_tokens_seen": 13737500, - "step": 651, - "time_per_iteration": 2.5930917263031006 - }, - { - "auxiliary_loss_clip": 0.0124425, - "auxiliary_loss_mlp": 0.010838, - "balance_loss_clip": 1.06317663, - "balance_loss_mlp": 1.05087411, - "epoch": 0.03920036073951601, - "flos": 18661124177280.0, - "grad_norm": 3.361008988618244, - "language_loss": 0.87392938, - "learning_rate": 3.999112394032757e-06, - "loss": 0.89720988, - "num_input_tokens_seen": 13754750, - "step": 652, - "time_per_iteration": 2.6072869300842285 - }, - { - "auxiliary_loss_clip": 0.01239638, - "auxiliary_loss_mlp": 0.01073938, - "balance_loss_clip": 1.06362963, - "balance_loss_mlp": 1.0434916, - "epoch": 0.03926048399218398, - "flos": 31354468053120.0, - "grad_norm": 2.6218665998754904, - "language_loss": 0.79297256, - "learning_rate": 3.999100754295471e-06, - "loss": 0.81610829, - "num_input_tokens_seen": 13771990, - "step": 653, - "time_per_iteration": 2.626145362854004 - }, - { - "auxiliary_loss_clip": 0.01250652, - "auxiliary_loss_mlp": 0.01075546, - "balance_loss_clip": 1.06496143, - "balance_loss_mlp": 1.04374111, - "epoch": 0.039320607244851945, - "flos": 29603499770880.0, - "grad_norm": 2.0720296605490094, - "language_loss": 0.85909009, - "learning_rate": 3.999089038752085e-06, - "loss": 0.88235211, - "num_input_tokens_seen": 13792750, - "step": 654, - "time_per_iteration": 2.6775124073028564 - }, - { - "auxiliary_loss_clip": 0.01126661, - "auxiliary_loss_mlp": 0.01016641, - "balance_loss_clip": 1.03977203, - "balance_loss_mlp": 1.01001298, - "epoch": 0.03938073049751992, - "flos": 66534609951360.0, - "grad_norm": 0.7366259780501333, - "language_loss": 0.4997997, - "learning_rate": 3.999077247403041e-06, - "loss": 0.52123272, - "num_input_tokens_seen": 13858570, - "step": 655, - "time_per_iteration": 3.3006510734558105 - }, - { - "auxiliary_loss_clip": 0.01241143, - "auxiliary_loss_mlp": 0.01076374, - "balance_loss_clip": 1.0658412, - "balance_loss_mlp": 1.04680991, - "epoch": 0.03944085375018788, - "flos": 23367827836800.0, - "grad_norm": 4.17474796245144, - "language_loss": 0.80903178, - "learning_rate": 3.9990653802487886e-06, - "loss": 0.83220696, - "num_input_tokens_seen": 13876335, - "step": 656, - "time_per_iteration": 4.228931427001953 - }, - { - "auxiliary_loss_clip": 0.01251519, - "auxiliary_loss_mlp": 0.01093573, - "balance_loss_clip": 1.06740427, - "balance_loss_mlp": 1.05802524, - "epoch": 0.039500977002855854, - "flos": 18548292579840.0, - "grad_norm": 2.068956760077258, - "language_loss": 0.76289558, - "learning_rate": 3.999053437289776e-06, - "loss": 0.7863465, - "num_input_tokens_seen": 13892640, - "step": 657, - "time_per_iteration": 4.218473434448242 - }, - { - "auxiliary_loss_clip": 0.0124824, - "auxiliary_loss_mlp": 0.01076812, - "balance_loss_clip": 1.06641233, - "balance_loss_mlp": 1.04522133, - "epoch": 0.039561100255523826, - "flos": 25338174433920.0, - "grad_norm": 2.07475431213476, - "language_loss": 0.8179062, - "learning_rate": 3.999041418526457e-06, - "loss": 0.84115672, - "num_input_tokens_seen": 13910085, - "step": 658, - "time_per_iteration": 2.671675682067871 - }, - { - "auxiliary_loss_clip": 0.01242678, - "auxiliary_loss_mlp": 0.01077963, - "balance_loss_clip": 1.06347871, - "balance_loss_mlp": 1.0454669, - "epoch": 0.03962122350819179, - "flos": 18219889509120.0, - "grad_norm": 2.2444983110753625, - "language_loss": 0.90790772, - "learning_rate": 3.999029323959287e-06, - "loss": 0.93111408, - "num_input_tokens_seen": 13928800, - "step": 659, - "time_per_iteration": 4.2601988315582275 - }, - { - "auxiliary_loss_clip": 0.01247633, - "auxiliary_loss_mlp": 0.01073069, - "balance_loss_clip": 1.06654835, - "balance_loss_mlp": 1.04215825, - "epoch": 0.03968134676085976, - "flos": 20522230536960.0, - "grad_norm": 2.2083626038373656, - "language_loss": 0.79760063, - "learning_rate": 3.999017153588724e-06, - "loss": 0.82080764, - "num_input_tokens_seen": 13948325, - "step": 660, - "time_per_iteration": 2.62716007232666 - }, - { - "auxiliary_loss_clip": 0.01246027, - "auxiliary_loss_mlp": 0.01077579, - "balance_loss_clip": 1.0675652, - "balance_loss_mlp": 1.0456785, - "epoch": 0.03974147001352773, - "flos": 22422587483520.0, - "grad_norm": 1.6747851381362888, - "language_loss": 0.81757367, - "learning_rate": 3.999004907415231e-06, - "loss": 0.8408097, - "num_input_tokens_seen": 13969090, - "step": 661, - "time_per_iteration": 2.645423412322998 - }, - { - "auxiliary_loss_clip": 0.01119895, - "auxiliary_loss_mlp": 0.01007167, - "balance_loss_clip": 1.03320217, - "balance_loss_mlp": 1.00077713, - "epoch": 0.0398015932661957, - "flos": 71128769322240.0, - "grad_norm": 0.9117564509831767, - "language_loss": 0.69349593, - "learning_rate": 3.998992585439272e-06, - "loss": 0.71476656, - "num_input_tokens_seen": 14037555, - "step": 662, - "time_per_iteration": 3.3032331466674805 - }, - { - "auxiliary_loss_clip": 0.01249217, - "auxiliary_loss_mlp": 0.01074722, - "balance_loss_clip": 1.06995225, - "balance_loss_mlp": 1.04322648, - "epoch": 0.03986171651886367, - "flos": 16800951571200.0, - "grad_norm": 2.160679749799672, - "language_loss": 0.82765651, - "learning_rate": 3.998980187661314e-06, - "loss": 0.85089582, - "num_input_tokens_seen": 14055765, - "step": 663, - "time_per_iteration": 2.6217782497406006 - }, - { - "auxiliary_loss_clip": 0.01252759, - "auxiliary_loss_mlp": 0.01063705, - "balance_loss_clip": 1.06966817, - "balance_loss_mlp": 1.03254378, - "epoch": 0.03992183977153164, - "flos": 24535068197760.0, - "grad_norm": 2.19374813563436, - "language_loss": 0.87302262, - "learning_rate": 3.998967714081826e-06, - "loss": 0.89618725, - "num_input_tokens_seen": 14074195, - "step": 664, - "time_per_iteration": 2.6729183197021484 - }, - { - "auxiliary_loss_clip": 0.01241647, - "auxiliary_loss_mlp": 0.0106515, - "balance_loss_clip": 1.06656313, - "balance_loss_mlp": 1.03346384, - "epoch": 0.03998196302419961, - "flos": 15595897167360.0, - "grad_norm": 2.036983550581997, - "language_loss": 0.84821391, - "learning_rate": 3.998955164701281e-06, - "loss": 0.87128186, - "num_input_tokens_seen": 14090215, - "step": 665, - "time_per_iteration": 2.593832015991211 - }, - { - "auxiliary_loss_clip": 0.012521, - "auxiliary_loss_mlp": 0.01085682, - "balance_loss_clip": 1.06867695, - "balance_loss_mlp": 1.05223155, - "epoch": 0.04004208627686758, - "flos": 25305065072640.0, - "grad_norm": 2.172699570421913, - "language_loss": 0.81745672, - "learning_rate": 3.998942539520158e-06, - "loss": 0.8408345, - "num_input_tokens_seen": 14112150, - "step": 666, - "time_per_iteration": 2.6743290424346924 - }, - { - "auxiliary_loss_clip": 0.01241565, - "auxiliary_loss_mlp": 0.01073617, - "balance_loss_clip": 1.06443083, - "balance_loss_mlp": 1.04007161, - "epoch": 0.04010220952953555, - "flos": 23475847011840.0, - "grad_norm": 2.1003520396389828, - "language_loss": 0.87117827, - "learning_rate": 3.998929838538932e-06, - "loss": 0.89433014, - "num_input_tokens_seen": 14131475, - "step": 667, - "time_per_iteration": 2.6147067546844482 - }, - { - "auxiliary_loss_clip": 0.0124275, - "auxiliary_loss_mlp": 0.01071583, - "balance_loss_clip": 1.07009172, - "balance_loss_mlp": 1.04161382, - "epoch": 0.04016233278220352, - "flos": 18617025254400.0, - "grad_norm": 2.331266403294307, - "language_loss": 0.80641299, - "learning_rate": 3.998917061758087e-06, - "loss": 0.82955635, - "num_input_tokens_seen": 14146165, - "step": 668, - "time_per_iteration": 2.6015820503234863 - }, - { - "auxiliary_loss_clip": 0.01115034, - "auxiliary_loss_mlp": 0.01008949, - "balance_loss_clip": 1.02975297, - "balance_loss_mlp": 1.00317907, - "epoch": 0.040222456034871484, - "flos": 70906194696960.0, - "grad_norm": 0.7870483750596657, - "language_loss": 0.60066259, - "learning_rate": 3.998904209178107e-06, - "loss": 0.62190247, - "num_input_tokens_seen": 14215005, - "step": 669, - "time_per_iteration": 3.2993202209472656 - }, - { - "auxiliary_loss_clip": 0.01242272, - "auxiliary_loss_mlp": 0.01071485, - "balance_loss_clip": 1.06408751, - "balance_loss_mlp": 1.04120564, - "epoch": 0.040282579287539456, - "flos": 23764712186880.0, - "grad_norm": 1.7022357666604506, - "language_loss": 0.86290276, - "learning_rate": 3.9988912807994785e-06, - "loss": 0.88604033, - "num_input_tokens_seen": 14235510, - "step": 670, - "time_per_iteration": 2.700657844543457 - }, - { - "auxiliary_loss_clip": 0.01242087, - "auxiliary_loss_mlp": 0.01080448, - "balance_loss_clip": 1.06647801, - "balance_loss_mlp": 1.05014467, - "epoch": 0.04034270254020743, - "flos": 18478518410880.0, - "grad_norm": 1.8224152334464152, - "language_loss": 0.75569212, - "learning_rate": 3.998878276622692e-06, - "loss": 0.77891749, - "num_input_tokens_seen": 14254565, - "step": 671, - "time_per_iteration": 2.6698572635650635 - }, - { - "auxiliary_loss_clip": 0.01248936, - "auxiliary_loss_mlp": 0.01076667, - "balance_loss_clip": 1.06943047, - "balance_loss_mlp": 1.04605412, - "epoch": 0.040402825792875394, - "flos": 17201858244480.0, - "grad_norm": 1.9730812981627939, - "language_loss": 0.92416775, - "learning_rate": 3.998865196648242e-06, - "loss": 0.94742376, - "num_input_tokens_seen": 14271885, - "step": 672, - "time_per_iteration": 2.567563533782959 - }, - { - "auxiliary_loss_clip": 0.01245231, - "auxiliary_loss_mlp": 0.010776, - "balance_loss_clip": 1.0677104, - "balance_loss_mlp": 1.04422188, - "epoch": 0.040462949045543366, - "flos": 19172168928000.0, - "grad_norm": 1.800141829654062, - "language_loss": 0.90174723, - "learning_rate": 3.998852040876622e-06, - "loss": 0.92497551, - "num_input_tokens_seen": 14289670, - "step": 673, - "time_per_iteration": 2.547154426574707 - }, - { - "auxiliary_loss_clip": 0.01239752, - "auxiliary_loss_mlp": 0.01084248, - "balance_loss_clip": 1.06466973, - "balance_loss_mlp": 1.05184698, - "epoch": 0.04052307229821133, - "flos": 24019821555840.0, - "grad_norm": 2.3989934860433486, - "language_loss": 0.75016737, - "learning_rate": 3.998838809308334e-06, - "loss": 0.7734074, - "num_input_tokens_seen": 14309285, - "step": 674, - "time_per_iteration": 2.681896924972534 - }, - { - "auxiliary_loss_clip": 0.01249861, - "auxiliary_loss_mlp": 0.01064308, - "balance_loss_clip": 1.06744063, - "balance_loss_mlp": 1.03334963, - "epoch": 0.0405831955508793, - "flos": 16436601964800.0, - "grad_norm": 2.55613513039197, - "language_loss": 0.78289407, - "learning_rate": 3.9988255019438766e-06, - "loss": 0.80603576, - "num_input_tokens_seen": 14328300, - "step": 675, - "time_per_iteration": 2.6965043544769287 - }, - { - "auxiliary_loss_clip": 0.01241749, - "auxiliary_loss_mlp": 0.01079652, - "balance_loss_clip": 1.06532836, - "balance_loss_mlp": 1.04648817, - "epoch": 0.040643318803547275, - "flos": 24279922915200.0, - "grad_norm": 2.047384767684118, - "language_loss": 0.76844448, - "learning_rate": 3.998812118783757e-06, - "loss": 0.79165846, - "num_input_tokens_seen": 14346395, - "step": 676, - "time_per_iteration": 2.6216623783111572 - }, - { - "auxiliary_loss_clip": 0.01248147, - "auxiliary_loss_mlp": 0.01079294, - "balance_loss_clip": 1.06811619, - "balance_loss_mlp": 1.04813254, - "epoch": 0.04070344205621524, - "flos": 17712076982400.0, - "grad_norm": 2.318905665785744, - "language_loss": 0.85139382, - "learning_rate": 3.9987986598284804e-06, - "loss": 0.8746683, - "num_input_tokens_seen": 14364605, - "step": 677, - "time_per_iteration": 2.5663015842437744 - }, - { - "auxiliary_loss_clip": 0.01240385, - "auxiliary_loss_mlp": 0.01070741, - "balance_loss_clip": 1.06558609, - "balance_loss_mlp": 1.03901923, - "epoch": 0.04076356530888321, - "flos": 26177658168960.0, - "grad_norm": 2.5041724349122645, - "language_loss": 0.76572061, - "learning_rate": 3.998785125078559e-06, - "loss": 0.78883183, - "num_input_tokens_seen": 14385265, - "step": 678, - "time_per_iteration": 2.624689817428589 - }, - { - "auxiliary_loss_clip": 0.01240972, - "auxiliary_loss_mlp": 0.01072606, - "balance_loss_clip": 1.06374967, - "balance_loss_mlp": 1.04242194, - "epoch": 0.04082368856155118, - "flos": 35773455772800.0, - "grad_norm": 1.7096242150987748, - "language_loss": 0.82139099, - "learning_rate": 3.998771514534505e-06, - "loss": 0.84452677, - "num_input_tokens_seen": 14406090, - "step": 679, - "time_per_iteration": 2.7073023319244385 - }, - { - "auxiliary_loss_clip": 0.01248879, - "auxiliary_loss_mlp": 0.01064116, - "balance_loss_clip": 1.07185793, - "balance_loss_mlp": 1.0340035, - "epoch": 0.04088381181421915, - "flos": 28146640049280.0, - "grad_norm": 1.963288262989073, - "language_loss": 0.76260424, - "learning_rate": 3.998757828196835e-06, - "loss": 0.78573418, - "num_input_tokens_seen": 14425130, - "step": 680, - "time_per_iteration": 2.6767218112945557 - }, - { - "auxiliary_loss_clip": 0.01244441, - "auxiliary_loss_mlp": 0.01071738, - "balance_loss_clip": 1.06458521, - "balance_loss_mlp": 1.03864551, - "epoch": 0.04094393506688712, - "flos": 27597673514880.0, - "grad_norm": 1.713943858995997, - "language_loss": 0.83089912, - "learning_rate": 3.9987440660660685e-06, - "loss": 0.85406095, - "num_input_tokens_seen": 14447355, - "step": 681, - "time_per_iteration": 2.6386382579803467 - }, - { - "auxiliary_loss_clip": 0.01244279, - "auxiliary_loss_mlp": 0.01073303, - "balance_loss_clip": 1.06438065, - "balance_loss_mlp": 1.04127121, - "epoch": 0.04100405831955509, - "flos": 23112036109440.0, - "grad_norm": 1.706698119772261, - "language_loss": 0.71538687, - "learning_rate": 3.998730228142726e-06, - "loss": 0.7385627, - "num_input_tokens_seen": 14466790, - "step": 682, - "time_per_iteration": 2.618792772293091 - }, - { - "auxiliary_loss_clip": 0.01243156, - "auxiliary_loss_mlp": 0.01078429, - "balance_loss_clip": 1.06440282, - "balance_loss_mlp": 1.04781592, - "epoch": 0.04106418157222306, - "flos": 20156731695360.0, - "grad_norm": 1.6947476714586034, - "language_loss": 0.72599399, - "learning_rate": 3.998716314427333e-06, - "loss": 0.74920982, - "num_input_tokens_seen": 14485195, - "step": 683, - "time_per_iteration": 2.676133394241333 - }, - { - "auxiliary_loss_clip": 0.01241071, - "auxiliary_loss_mlp": 0.01079531, - "balance_loss_clip": 1.07077932, - "balance_loss_mlp": 1.04851258, - "epoch": 0.041124304824891024, - "flos": 17420697855360.0, - "grad_norm": 2.098652785935233, - "language_loss": 0.81419414, - "learning_rate": 3.998702324920417e-06, - "loss": 0.8374002, - "num_input_tokens_seen": 14503370, - "step": 684, - "time_per_iteration": 2.6538476943969727 - }, - { - "auxiliary_loss_clip": 0.01242791, - "auxiliary_loss_mlp": 0.0107365, - "balance_loss_clip": 1.06783867, - "balance_loss_mlp": 1.04139185, - "epoch": 0.041184428077558996, - "flos": 25780163287680.0, - "grad_norm": 1.5053911947555274, - "language_loss": 0.90680599, - "learning_rate": 3.9986882596225085e-06, - "loss": 0.92997038, - "num_input_tokens_seen": 14526415, - "step": 685, - "time_per_iteration": 2.6541450023651123 - }, - { - "auxiliary_loss_clip": 0.01244219, - "auxiliary_loss_mlp": 0.01072481, - "balance_loss_clip": 1.06659365, - "balance_loss_mlp": 1.04093838, - "epoch": 0.04124455133022697, - "flos": 22964766347520.0, - "grad_norm": 2.2251875217653185, - "language_loss": 0.87851977, - "learning_rate": 3.998674118534141e-06, - "loss": 0.90168673, - "num_input_tokens_seen": 14546595, - "step": 686, - "time_per_iteration": 2.7298531532287598 - }, - { - "auxiliary_loss_clip": 0.01247476, - "auxiliary_loss_mlp": 0.01073385, - "balance_loss_clip": 1.06586432, - "balance_loss_mlp": 1.04224789, - "epoch": 0.04130467458289493, - "flos": 21289067015040.0, - "grad_norm": 1.8582614005091855, - "language_loss": 0.7152915, - "learning_rate": 3.998659901655851e-06, - "loss": 0.73850012, - "num_input_tokens_seen": 14566590, - "step": 687, - "time_per_iteration": 2.6284232139587402 - }, - { - "auxiliary_loss_clip": 0.01243582, - "auxiliary_loss_mlp": 0.01076448, - "balance_loss_clip": 1.06979251, - "balance_loss_mlp": 1.04756403, - "epoch": 0.041364797835562905, - "flos": 19974233669760.0, - "grad_norm": 2.596672934278983, - "language_loss": 0.86028284, - "learning_rate": 3.998645608988177e-06, - "loss": 0.88348317, - "num_input_tokens_seen": 14585965, - "step": 688, - "time_per_iteration": 2.522634506225586 - }, - { - "auxiliary_loss_clip": 0.01241593, - "auxiliary_loss_mlp": 0.01079647, - "balance_loss_clip": 1.06802177, - "balance_loss_mlp": 1.04908216, - "epoch": 0.04142492108823087, - "flos": 21906227520000.0, - "grad_norm": 2.852238187591699, - "language_loss": 0.83393514, - "learning_rate": 3.998631240531661e-06, - "loss": 0.85714757, - "num_input_tokens_seen": 14606015, - "step": 689, - "time_per_iteration": 2.6140944957733154 - }, - { - "auxiliary_loss_clip": 0.01238254, - "auxiliary_loss_mlp": 0.01085009, - "balance_loss_clip": 1.06293654, - "balance_loss_mlp": 1.05463421, - "epoch": 0.04148504434089884, - "flos": 27639617621760.0, - "grad_norm": 2.870474577544969, - "language_loss": 0.68398476, - "learning_rate": 3.998616796286848e-06, - "loss": 0.70721734, - "num_input_tokens_seen": 14629955, - "step": 690, - "time_per_iteration": 2.658987522125244 - }, - { - "auxiliary_loss_clip": 0.01235903, - "auxiliary_loss_mlp": 0.01075275, - "balance_loss_clip": 1.0625304, - "balance_loss_mlp": 1.04565191, - "epoch": 0.041545167593566815, - "flos": 20518387781760.0, - "grad_norm": 1.634289561889102, - "language_loss": 0.74927461, - "learning_rate": 3.998602276254286e-06, - "loss": 0.77238643, - "num_input_tokens_seen": 14648000, - "step": 691, - "time_per_iteration": 2.599957227706909 - }, - { - "auxiliary_loss_clip": 0.01239089, - "auxiliary_loss_mlp": 0.01081705, - "balance_loss_clip": 1.06458938, - "balance_loss_mlp": 1.04978108, - "epoch": 0.04160529084623478, - "flos": 11868907939200.0, - "grad_norm": 2.123432521314224, - "language_loss": 0.84469771, - "learning_rate": 3.998587680434526e-06, - "loss": 0.86790562, - "num_input_tokens_seen": 14662235, - "step": 692, - "time_per_iteration": 2.5748491287231445 - }, - { - "auxiliary_loss_clip": 0.01242126, - "auxiliary_loss_mlp": 0.01076613, - "balance_loss_clip": 1.06274796, - "balance_loss_mlp": 1.04409313, - "epoch": 0.04166541409890275, - "flos": 14828306503680.0, - "grad_norm": 2.3463094595874665, - "language_loss": 0.88948715, - "learning_rate": 3.99857300882812e-06, - "loss": 0.91267455, - "num_input_tokens_seen": 14676065, - "step": 693, - "time_per_iteration": 2.569277286529541 - }, - { - "auxiliary_loss_clip": 0.01245438, - "auxiliary_loss_mlp": 0.01071471, - "balance_loss_clip": 1.06845784, - "balance_loss_mlp": 1.04123962, - "epoch": 0.04172553735157072, - "flos": 25808137004160.0, - "grad_norm": 5.499777597079252, - "language_loss": 0.81987685, - "learning_rate": 3.998558261435626e-06, - "loss": 0.84304595, - "num_input_tokens_seen": 14694955, - "step": 694, - "time_per_iteration": 2.6798722743988037 - }, - { - "auxiliary_loss_clip": 0.01242101, - "auxiliary_loss_mlp": 0.01073692, - "balance_loss_clip": 1.06179321, - "balance_loss_mlp": 1.04303181, - "epoch": 0.04178566060423869, - "flos": 24279815174400.0, - "grad_norm": 2.051302362473346, - "language_loss": 0.83672506, - "learning_rate": 3.9985434382576015e-06, - "loss": 0.85988301, - "num_input_tokens_seen": 14715510, - "step": 695, - "time_per_iteration": 2.684537649154663 - }, - { - "auxiliary_loss_clip": 0.01242205, - "auxiliary_loss_mlp": 0.01080004, - "balance_loss_clip": 1.06535804, - "balance_loss_mlp": 1.04822254, - "epoch": 0.04184578385690666, - "flos": 18222008411520.0, - "grad_norm": 2.113561459264794, - "language_loss": 0.84351176, - "learning_rate": 3.99852853929461e-06, - "loss": 0.86673379, - "num_input_tokens_seen": 14731755, - "step": 696, - "time_per_iteration": 4.1141321659088135 - }, - { - "auxiliary_loss_clip": 0.01238462, - "auxiliary_loss_mlp": 0.01083207, - "balance_loss_clip": 1.06265593, - "balance_loss_mlp": 1.05099702, - "epoch": 0.041905907109574626, - "flos": 22776342577920.0, - "grad_norm": 6.921460264787684, - "language_loss": 0.93193012, - "learning_rate": 3.998513564547216e-06, - "loss": 0.95514685, - "num_input_tokens_seen": 14750810, - "step": 697, - "time_per_iteration": 5.71666693687439 - }, - { - "auxiliary_loss_clip": 0.01235964, - "auxiliary_loss_mlp": 0.01074448, - "balance_loss_clip": 1.06324339, - "balance_loss_mlp": 1.04495573, - "epoch": 0.0419660303622426, - "flos": 20156947176960.0, - "grad_norm": 2.1002029886241904, - "language_loss": 0.83775562, - "learning_rate": 3.998498514015987e-06, - "loss": 0.86085975, - "num_input_tokens_seen": 14768435, - "step": 698, - "time_per_iteration": 4.194530010223389 - }, - { - "auxiliary_loss_clip": 0.01239177, - "auxiliary_loss_mlp": 0.01093516, - "balance_loss_clip": 1.06274605, - "balance_loss_mlp": 1.06175828, - "epoch": 0.042026153614910564, - "flos": 23076376882560.0, - "grad_norm": 2.1234669437327955, - "language_loss": 0.91715962, - "learning_rate": 3.998483387701495e-06, - "loss": 0.94048655, - "num_input_tokens_seen": 14786690, - "step": 699, - "time_per_iteration": 2.6399078369140625 - }, - { - "auxiliary_loss_clip": 0.01113327, - "auxiliary_loss_mlp": 0.0102038, - "balance_loss_clip": 1.03020263, - "balance_loss_mlp": 1.01403797, - "epoch": 0.042086276867578536, - "flos": 64495243370880.0, - "grad_norm": 0.9035134571641164, - "language_loss": 0.67873394, - "learning_rate": 3.998468185604312e-06, - "loss": 0.70007098, - "num_input_tokens_seen": 14853840, - "step": 700, - "time_per_iteration": 3.192026376724243 - }, - { - "auxiliary_loss_clip": 0.01246765, - "auxiliary_loss_mlp": 0.01082955, - "balance_loss_clip": 1.06717515, - "balance_loss_mlp": 1.05017269, - "epoch": 0.04214640012024651, - "flos": 15487016065920.0, - "grad_norm": 2.2754848646841888, - "language_loss": 0.884673, - "learning_rate": 3.998452907725016e-06, - "loss": 0.90797025, - "num_input_tokens_seen": 14869580, - "step": 701, - "time_per_iteration": 2.5790441036224365 - }, - { - "auxiliary_loss_clip": 0.01242428, - "auxiliary_loss_mlp": 0.01080259, - "balance_loss_clip": 1.06793952, - "balance_loss_mlp": 1.04833448, - "epoch": 0.04220652337291447, - "flos": 23877040993920.0, - "grad_norm": 2.000128536077818, - "language_loss": 0.67100394, - "learning_rate": 3.998437554064184e-06, - "loss": 0.69423079, - "num_input_tokens_seen": 14891065, - "step": 702, - "time_per_iteration": 2.6247870922088623 - }, - { - "auxiliary_loss_clip": 0.01107168, - "auxiliary_loss_mlp": 0.01005563, - "balance_loss_clip": 1.02512407, - "balance_loss_mlp": 0.99922067, - "epoch": 0.042266646625582445, - "flos": 63795451628160.0, - "grad_norm": 0.8439205282656718, - "language_loss": 0.60756463, - "learning_rate": 3.9984221246224006e-06, - "loss": 0.62869191, - "num_input_tokens_seen": 14954815, - "step": 703, - "time_per_iteration": 3.1991655826568604 - }, - { - "auxiliary_loss_clip": 0.01107933, - "auxiliary_loss_mlp": 0.01006502, - "balance_loss_clip": 1.02562141, - "balance_loss_mlp": 0.99973089, - "epoch": 0.04232676987825041, - "flos": 50018863345920.0, - "grad_norm": 1.0471369072250156, - "language_loss": 0.57677412, - "learning_rate": 3.9984066194002494e-06, - "loss": 0.59791845, - "num_input_tokens_seen": 15003050, - "step": 704, - "time_per_iteration": 3.037705659866333 - }, - { - "auxiliary_loss_clip": 0.01241513, - "auxiliary_loss_mlp": 0.01072126, - "balance_loss_clip": 1.06549489, - "balance_loss_mlp": 1.0406549, - "epoch": 0.04238689313091838, - "flos": 21616105368960.0, - "grad_norm": 2.9488804643242488, - "language_loss": 0.87553984, - "learning_rate": 3.998391038398319e-06, - "loss": 0.89867628, - "num_input_tokens_seen": 15021990, - "step": 705, - "time_per_iteration": 2.6233222484588623 - }, - { - "auxiliary_loss_clip": 0.01230342, - "auxiliary_loss_mlp": 0.0107194, - "balance_loss_clip": 1.0605582, - "balance_loss_mlp": 1.04204249, - "epoch": 0.042447016383586354, - "flos": 19135109070720.0, - "grad_norm": 2.556815837902013, - "language_loss": 0.71071029, - "learning_rate": 3.998375381617201e-06, - "loss": 0.73373306, - "num_input_tokens_seen": 15040700, - "step": 706, - "time_per_iteration": 2.560434579849243 - }, - { - "auxiliary_loss_clip": 0.0123412, - "auxiliary_loss_mlp": 0.01070349, - "balance_loss_clip": 1.06249404, - "balance_loss_mlp": 1.03799582, - "epoch": 0.04250713963625432, - "flos": 24426007528320.0, - "grad_norm": 2.0814078632624167, - "language_loss": 0.93418455, - "learning_rate": 3.9983596490574875e-06, - "loss": 0.95722926, - "num_input_tokens_seen": 15056725, - "step": 707, - "time_per_iteration": 2.6130473613739014 - }, - { - "auxiliary_loss_clip": 0.01237541, - "auxiliary_loss_mlp": 0.01067908, - "balance_loss_clip": 1.05994225, - "balance_loss_mlp": 1.03617477, - "epoch": 0.04256726288892229, - "flos": 30367391333760.0, - "grad_norm": 2.424205580643553, - "language_loss": 0.81514043, - "learning_rate": 3.998343840719776e-06, - "loss": 0.83819497, - "num_input_tokens_seen": 15077550, - "step": 708, - "time_per_iteration": 2.656277894973755 - }, - { - "auxiliary_loss_clip": 0.01243932, - "auxiliary_loss_mlp": 0.0108167, - "balance_loss_clip": 1.06461239, - "balance_loss_mlp": 1.04934049, - "epoch": 0.04262738614159026, - "flos": 16362661818240.0, - "grad_norm": 2.0883592727868145, - "language_loss": 0.82027614, - "learning_rate": 3.998327956604666e-06, - "loss": 0.8435322, - "num_input_tokens_seen": 15094955, - "step": 709, - "time_per_iteration": 2.5758891105651855 - }, - { - "auxiliary_loss_clip": 0.01243538, - "auxiliary_loss_mlp": 0.01071217, - "balance_loss_clip": 1.06374872, - "balance_loss_mlp": 1.03960264, - "epoch": 0.04268750939425823, - "flos": 20412379768320.0, - "grad_norm": 2.7686525844665133, - "language_loss": 0.8502059, - "learning_rate": 3.99831199671276e-06, - "loss": 0.87335348, - "num_input_tokens_seen": 15113395, - "step": 710, - "time_per_iteration": 2.571559429168701 - }, - { - "auxiliary_loss_clip": 0.0124498, - "auxiliary_loss_mlp": 0.01072229, - "balance_loss_clip": 1.06788397, - "balance_loss_mlp": 1.04166365, - "epoch": 0.0427476326469262, - "flos": 20302959962880.0, - "grad_norm": 7.911177124524585, - "language_loss": 0.84914303, - "learning_rate": 3.998295961044662e-06, - "loss": 0.87231517, - "num_input_tokens_seen": 15132920, - "step": 711, - "time_per_iteration": 2.569959878921509 - }, - { - "auxiliary_loss_clip": 0.01237769, - "auxiliary_loss_mlp": 0.01074338, - "balance_loss_clip": 1.06188083, - "balance_loss_mlp": 1.04229426, - "epoch": 0.042807755899594166, - "flos": 21650794928640.0, - "grad_norm": 1.7189790042473796, - "language_loss": 0.85439789, - "learning_rate": 3.9982798496009804e-06, - "loss": 0.87751901, - "num_input_tokens_seen": 15153115, - "step": 712, - "time_per_iteration": 2.6200509071350098 - }, - { - "auxiliary_loss_clip": 0.01242397, - "auxiliary_loss_mlp": 0.01069523, - "balance_loss_clip": 1.06085837, - "balance_loss_mlp": 1.03989983, - "epoch": 0.04286787915226214, - "flos": 21435007973760.0, - "grad_norm": 5.490507523621204, - "language_loss": 0.91178697, - "learning_rate": 3.998263662382328e-06, - "loss": 0.93490618, - "num_input_tokens_seen": 15172770, - "step": 713, - "time_per_iteration": 2.6353416442871094 - }, - { - "auxiliary_loss_clip": 0.01104693, - "auxiliary_loss_mlp": 0.01006514, - "balance_loss_clip": 1.02325606, - "balance_loss_mlp": 0.99955195, - "epoch": 0.04292800240493011, - "flos": 66397970615040.0, - "grad_norm": 0.9310328114407391, - "language_loss": 0.63725489, - "learning_rate": 3.9982473993893165e-06, - "loss": 0.65836698, - "num_input_tokens_seen": 15240055, - "step": 714, - "time_per_iteration": 3.2544445991516113 - }, - { - "auxiliary_loss_clip": 0.01239175, - "auxiliary_loss_mlp": 0.01085992, - "balance_loss_clip": 1.06602359, - "balance_loss_mlp": 1.05552244, - "epoch": 0.042988125657598075, - "flos": 31650264552960.0, - "grad_norm": 1.8449858143817996, - "language_loss": 0.75010103, - "learning_rate": 3.998231060622563e-06, - "loss": 0.77335274, - "num_input_tokens_seen": 15261585, - "step": 715, - "time_per_iteration": 2.7048466205596924 - }, - { - "auxiliary_loss_clip": 0.01242734, - "auxiliary_loss_mlp": 0.01074126, - "balance_loss_clip": 1.0666225, - "balance_loss_mlp": 1.04227352, - "epoch": 0.04304824891026605, - "flos": 33248468292480.0, - "grad_norm": 1.9505519101092619, - "language_loss": 0.72289199, - "learning_rate": 3.998214646082688e-06, - "loss": 0.74606061, - "num_input_tokens_seen": 15281160, - "step": 716, - "time_per_iteration": 2.7807397842407227 - }, - { - "auxiliary_loss_clip": 0.01104303, - "auxiliary_loss_mlp": 0.01006894, - "balance_loss_clip": 1.02277207, - "balance_loss_mlp": 0.99997944, - "epoch": 0.04310837216293401, - "flos": 64064782782720.0, - "grad_norm": 0.9245106661639481, - "language_loss": 0.65587437, - "learning_rate": 3.998198155770314e-06, - "loss": 0.67698634, - "num_input_tokens_seen": 15344505, - "step": 717, - "time_per_iteration": 3.250870943069458 - }, - { - "auxiliary_loss_clip": 0.01103971, - "auxiliary_loss_mlp": 0.01009587, - "balance_loss_clip": 1.02238059, - "balance_loss_mlp": 1.00267255, - "epoch": 0.043168495415601985, - "flos": 61343757849600.0, - "grad_norm": 0.9849394627593366, - "language_loss": 0.58785796, - "learning_rate": 3.998181589686065e-06, - "loss": 0.60899353, - "num_input_tokens_seen": 15404050, - "step": 718, - "time_per_iteration": 3.0402464866638184 - }, - { - "auxiliary_loss_clip": 0.0124025, - "auxiliary_loss_mlp": 0.0107507, - "balance_loss_clip": 1.06784248, - "balance_loss_mlp": 1.0424546, - "epoch": 0.04322861866826996, - "flos": 20704261685760.0, - "grad_norm": 1.9557310597444375, - "language_loss": 0.91440111, - "learning_rate": 3.99816494783057e-06, - "loss": 0.9375543, - "num_input_tokens_seen": 15424190, - "step": 719, - "time_per_iteration": 2.6500089168548584 - }, - { - "auxiliary_loss_clip": 0.01235843, - "auxiliary_loss_mlp": 0.01072906, - "balance_loss_clip": 1.06020999, - "balance_loss_mlp": 1.04296041, - "epoch": 0.04328874192093792, - "flos": 30373352991360.0, - "grad_norm": 1.7057721639328365, - "language_loss": 0.66461253, - "learning_rate": 3.99814823020446e-06, - "loss": 0.68770003, - "num_input_tokens_seen": 15446500, - "step": 720, - "time_per_iteration": 2.673184871673584 - }, - { - "auxiliary_loss_clip": 0.01234245, - "auxiliary_loss_mlp": 0.01072069, - "balance_loss_clip": 1.06111717, - "balance_loss_mlp": 1.04131258, - "epoch": 0.043348865173605894, - "flos": 21944795748480.0, - "grad_norm": 1.9491363249287763, - "language_loss": 0.77460182, - "learning_rate": 3.9981314368083684e-06, - "loss": 0.79766488, - "num_input_tokens_seen": 15465830, - "step": 721, - "time_per_iteration": 2.6695611476898193 - }, - { - "auxiliary_loss_clip": 0.01241854, - "auxiliary_loss_mlp": 0.01087169, - "balance_loss_clip": 1.06622314, - "balance_loss_mlp": 1.05719972, - "epoch": 0.04340898842627386, - "flos": 15264225959040.0, - "grad_norm": 2.8383174670702718, - "language_loss": 0.88298881, - "learning_rate": 3.998114567642933e-06, - "loss": 0.90627909, - "num_input_tokens_seen": 15479985, - "step": 722, - "time_per_iteration": 2.661313533782959 - }, - { - "auxiliary_loss_clip": 0.01244836, - "auxiliary_loss_mlp": 0.01076885, - "balance_loss_clip": 1.06665182, - "balance_loss_mlp": 1.0480125, - "epoch": 0.04346911167894183, - "flos": 27965434913280.0, - "grad_norm": 5.515838365549148, - "language_loss": 0.84387141, - "learning_rate": 3.998097622708792e-06, - "loss": 0.86708868, - "num_input_tokens_seen": 15501545, - "step": 723, - "time_per_iteration": 2.6447954177856445 - }, - { - "auxiliary_loss_clip": 0.01245825, - "auxiliary_loss_mlp": 0.01081354, - "balance_loss_clip": 1.06723523, - "balance_loss_mlp": 1.05019248, - "epoch": 0.0435292349316098, - "flos": 29242202820480.0, - "grad_norm": 1.7852936089408447, - "language_loss": 0.82789439, - "learning_rate": 3.99808060200659e-06, - "loss": 0.85116619, - "num_input_tokens_seen": 15521725, - "step": 724, - "time_per_iteration": 2.676985263824463 - }, - { - "auxiliary_loss_clip": 0.0124127, - "auxiliary_loss_mlp": 0.01087491, - "balance_loss_clip": 1.06535757, - "balance_loss_mlp": 1.05609179, - "epoch": 0.04358935818427777, - "flos": 20558356640640.0, - "grad_norm": 2.011685360503238, - "language_loss": 0.79444051, - "learning_rate": 3.998063505536971e-06, - "loss": 0.81772816, - "num_input_tokens_seen": 15540910, - "step": 725, - "time_per_iteration": 2.6241447925567627 - }, - { - "auxiliary_loss_clip": 0.01251777, - "auxiliary_loss_mlp": 0.01074923, - "balance_loss_clip": 1.06783843, - "balance_loss_mlp": 1.04309392, - "epoch": 0.04364948143694574, - "flos": 14464926564480.0, - "grad_norm": 2.2160842755462817, - "language_loss": 0.87175703, - "learning_rate": 3.998046333300584e-06, - "loss": 0.89502406, - "num_input_tokens_seen": 15558640, - "step": 726, - "time_per_iteration": 2.555551052093506 - }, - { - "auxiliary_loss_clip": 0.01100917, - "auxiliary_loss_mlp": 0.01015411, - "balance_loss_clip": 1.02171838, - "balance_loss_mlp": 1.00947404, - "epoch": 0.043709604689613706, - "flos": 50067268922880.0, - "grad_norm": 0.908981905466007, - "language_loss": 0.55868411, - "learning_rate": 3.998029085298079e-06, - "loss": 0.5798474, - "num_input_tokens_seen": 15612975, - "step": 727, - "time_per_iteration": 3.375901699066162 - }, - { - "auxiliary_loss_clip": 0.01245647, - "auxiliary_loss_mlp": 0.0108809, - "balance_loss_clip": 1.06717396, - "balance_loss_mlp": 1.05614173, - "epoch": 0.04376972794228168, - "flos": 13991588115840.0, - "grad_norm": 2.282663852625415, - "language_loss": 0.82326066, - "learning_rate": 3.998011761530112e-06, - "loss": 0.84659809, - "num_input_tokens_seen": 15631070, - "step": 728, - "time_per_iteration": 2.605970621109009 - }, - { - "auxiliary_loss_clip": 0.01237902, - "auxiliary_loss_mlp": 0.01073495, - "balance_loss_clip": 1.06600416, - "balance_loss_mlp": 1.04321551, - "epoch": 0.04382985119494965, - "flos": 22009901149440.0, - "grad_norm": 2.1303486954703152, - "language_loss": 0.76890069, - "learning_rate": 3.997994361997338e-06, - "loss": 0.7920146, - "num_input_tokens_seen": 15647825, - "step": 729, - "time_per_iteration": 2.652466297149658 - }, - { - "auxiliary_loss_clip": 0.01243746, - "auxiliary_loss_mlp": 0.01079207, - "balance_loss_clip": 1.06438255, - "balance_loss_mlp": 1.04859376, - "epoch": 0.043889974447617615, - "flos": 24206521472640.0, - "grad_norm": 2.1385115795714107, - "language_loss": 0.95153189, - "learning_rate": 3.997976886700417e-06, - "loss": 0.97476137, - "num_input_tokens_seen": 15668260, - "step": 730, - "time_per_iteration": 2.734614133834839 - }, - { - "auxiliary_loss_clip": 0.01238581, - "auxiliary_loss_mlp": 0.01074727, - "balance_loss_clip": 1.06093788, - "balance_loss_mlp": 1.04315984, - "epoch": 0.04395009770028559, - "flos": 17274541415040.0, - "grad_norm": 2.333073864238008, - "language_loss": 0.88456279, - "learning_rate": 3.997959335640013e-06, - "loss": 0.90769589, - "num_input_tokens_seen": 15685630, - "step": 731, - "time_per_iteration": 2.5912294387817383 - }, - { - "auxiliary_loss_clip": 0.01242247, - "auxiliary_loss_mlp": 0.01076563, - "balance_loss_clip": 1.06636512, - "balance_loss_mlp": 1.04757094, - "epoch": 0.04401022095295355, - "flos": 12310286261760.0, - "grad_norm": 3.0398759554531254, - "language_loss": 0.88683128, - "learning_rate": 3.997941708816791e-06, - "loss": 0.9100194, - "num_input_tokens_seen": 15698645, - "step": 732, - "time_per_iteration": 2.5897367000579834 - }, - { - "auxiliary_loss_clip": 0.01242736, - "auxiliary_loss_mlp": 0.01087795, - "balance_loss_clip": 1.06544232, - "balance_loss_mlp": 1.05646718, - "epoch": 0.044070344205621524, - "flos": 20959658363520.0, - "grad_norm": 2.304959545118842, - "language_loss": 0.85829747, - "learning_rate": 3.997924006231419e-06, - "loss": 0.88160276, - "num_input_tokens_seen": 15716775, - "step": 733, - "time_per_iteration": 2.650681972503662 - }, - { - "auxiliary_loss_clip": 0.01246603, - "auxiliary_loss_mlp": 0.01088724, - "balance_loss_clip": 1.06722379, - "balance_loss_mlp": 1.05544066, - "epoch": 0.044130467458289496, - "flos": 13845288021120.0, - "grad_norm": 2.207780377909299, - "language_loss": 0.91189414, - "learning_rate": 3.9979062278845685e-06, - "loss": 0.93524742, - "num_input_tokens_seen": 15733320, - "step": 734, - "time_per_iteration": 2.5956180095672607 - }, - { - "auxiliary_loss_clip": 0.01238395, - "auxiliary_loss_mlp": 0.01067579, - "balance_loss_clip": 1.06596422, - "balance_loss_mlp": 1.03781235, - "epoch": 0.04419059071095746, - "flos": 28655063107200.0, - "grad_norm": 1.9297536072777384, - "language_loss": 0.77884138, - "learning_rate": 3.9978883737769125e-06, - "loss": 0.8019011, - "num_input_tokens_seen": 15752705, - "step": 735, - "time_per_iteration": 2.603809118270874 - }, - { - "auxiliary_loss_clip": 0.01234188, - "auxiliary_loss_mlp": 0.01070499, - "balance_loss_clip": 1.06063068, - "balance_loss_mlp": 1.04091144, - "epoch": 0.04425071396362543, - "flos": 28183304856960.0, - "grad_norm": 2.266122200005257, - "language_loss": 0.8832593, - "learning_rate": 3.9978704439091305e-06, - "loss": 0.90630615, - "num_input_tokens_seen": 15772800, - "step": 736, - "time_per_iteration": 5.841086149215698 - }, - { - "auxiliary_loss_clip": 0.01235947, - "auxiliary_loss_mlp": 0.01081098, - "balance_loss_clip": 1.06597185, - "balance_loss_mlp": 1.05165362, - "epoch": 0.0443108372162934, - "flos": 23658452778240.0, - "grad_norm": 1.8984177574034653, - "language_loss": 0.84481263, - "learning_rate": 3.997852438281901e-06, - "loss": 0.8679831, - "num_input_tokens_seen": 15793665, - "step": 737, - "time_per_iteration": 4.1386003494262695 - }, - { - "auxiliary_loss_clip": 0.01240863, - "auxiliary_loss_mlp": 0.01072388, - "balance_loss_clip": 1.0653491, - "balance_loss_mlp": 1.03961766, - "epoch": 0.04437096046896137, - "flos": 33979861025280.0, - "grad_norm": 2.2366199062134706, - "language_loss": 0.84712577, - "learning_rate": 3.997834356895906e-06, - "loss": 0.87025833, - "num_input_tokens_seen": 15813175, - "step": 738, - "time_per_iteration": 4.447159290313721 - }, - { - "auxiliary_loss_clip": 0.01098733, - "auxiliary_loss_mlp": 0.0102196, - "balance_loss_clip": 1.02144337, - "balance_loss_mlp": 1.01685739, - "epoch": 0.04443108372162934, - "flos": 67397506375680.0, - "grad_norm": 0.8779518557387592, - "language_loss": 0.59179878, - "learning_rate": 3.9978161997518324e-06, - "loss": 0.61300576, - "num_input_tokens_seen": 15872050, - "step": 739, - "time_per_iteration": 3.0780396461486816 - }, - { - "auxiliary_loss_clip": 0.012386, - "auxiliary_loss_mlp": 0.01067387, - "balance_loss_clip": 1.06604302, - "balance_loss_mlp": 1.03717899, - "epoch": 0.04449120697429731, - "flos": 29752672953600.0, - "grad_norm": 2.295102845773205, - "language_loss": 0.91329807, - "learning_rate": 3.997797966850369e-06, - "loss": 0.93635798, - "num_input_tokens_seen": 15891085, - "step": 740, - "time_per_iteration": 2.6687562465667725 - }, - { - "auxiliary_loss_clip": 0.01243424, - "auxiliary_loss_mlp": 0.01067832, - "balance_loss_clip": 1.06807768, - "balance_loss_mlp": 1.03929377, - "epoch": 0.04455133022696528, - "flos": 36502119072000.0, - "grad_norm": 2.0543845689042484, - "language_loss": 0.71875739, - "learning_rate": 3.997779658192205e-06, - "loss": 0.74186987, - "num_input_tokens_seen": 15914225, - "step": 741, - "time_per_iteration": 2.707231283187866 - }, - { - "auxiliary_loss_clip": 0.01233192, - "auxiliary_loss_mlp": 0.01084138, - "balance_loss_clip": 1.062482, - "balance_loss_mlp": 1.05476475, - "epoch": 0.044611453479633245, - "flos": 28803661672320.0, - "grad_norm": 1.7086571433899975, - "language_loss": 0.88933527, - "learning_rate": 3.997761273778037e-06, - "loss": 0.91250861, - "num_input_tokens_seen": 15934540, - "step": 742, - "time_per_iteration": 2.6647751331329346 - }, - { - "auxiliary_loss_clip": 0.01237248, - "auxiliary_loss_mlp": 0.0106534, - "balance_loss_clip": 1.06481838, - "balance_loss_mlp": 1.03367805, - "epoch": 0.04467157673230122, - "flos": 20010970304640.0, - "grad_norm": 1.9055071619943689, - "language_loss": 0.83840811, - "learning_rate": 3.997742813608561e-06, - "loss": 0.86143398, - "num_input_tokens_seen": 15952560, - "step": 743, - "time_per_iteration": 2.697864055633545 - }, - { - "auxiliary_loss_clip": 0.01239398, - "auxiliary_loss_mlp": 0.01073846, - "balance_loss_clip": 1.06395566, - "balance_loss_mlp": 1.04373407, - "epoch": 0.04473169998496919, - "flos": 18004964480640.0, - "grad_norm": 2.2041873634107696, - "language_loss": 0.80026019, - "learning_rate": 3.997724277684479e-06, - "loss": 0.82339263, - "num_input_tokens_seen": 15970620, - "step": 744, - "time_per_iteration": 2.6551101207733154 - }, - { - "auxiliary_loss_clip": 0.01236158, - "auxiliary_loss_mlp": 0.01076186, - "balance_loss_clip": 1.06385589, - "balance_loss_mlp": 1.04665816, - "epoch": 0.044791823237637154, - "flos": 20631722169600.0, - "grad_norm": 2.139129927663487, - "language_loss": 0.85502481, - "learning_rate": 3.99770566600649e-06, - "loss": 0.87814826, - "num_input_tokens_seen": 15987325, - "step": 745, - "time_per_iteration": 2.6686010360717773 - }, - { - "auxiliary_loss_clip": 0.01235001, - "auxiliary_loss_mlp": 0.01066107, - "balance_loss_clip": 1.06320596, - "balance_loss_mlp": 1.03594685, - "epoch": 0.04485194649030513, - "flos": 31176171918720.0, - "grad_norm": 1.8251828520192552, - "language_loss": 0.69291008, - "learning_rate": 3.997686978575302e-06, - "loss": 0.71592116, - "num_input_tokens_seen": 16008310, - "step": 746, - "time_per_iteration": 2.6782095432281494 - }, - { - "auxiliary_loss_clip": 0.01244022, - "auxiliary_loss_mlp": 0.01081644, - "balance_loss_clip": 1.07012939, - "balance_loss_mlp": 1.05000615, - "epoch": 0.04491206974297309, - "flos": 26143291831680.0, - "grad_norm": 3.6053643469900982, - "language_loss": 0.68531066, - "learning_rate": 3.997668215391625e-06, - "loss": 0.70856726, - "num_input_tokens_seen": 16029620, - "step": 747, - "time_per_iteration": 2.6589114665985107 - }, - { - "auxiliary_loss_clip": 0.0124018, - "auxiliary_loss_mlp": 0.01083594, - "balance_loss_clip": 1.0652504, - "balance_loss_mlp": 1.05183625, - "epoch": 0.044972192995641064, - "flos": 20667668705280.0, - "grad_norm": 1.8376208182131786, - "language_loss": 0.66778374, - "learning_rate": 3.997649376456168e-06, - "loss": 0.69102144, - "num_input_tokens_seen": 16049065, - "step": 748, - "time_per_iteration": 2.674691677093506 - }, - { - "auxiliary_loss_clip": 0.01243343, - "auxiliary_loss_mlp": 0.01085665, - "balance_loss_clip": 1.07101417, - "balance_loss_mlp": 1.05596995, - "epoch": 0.045032316248309036, - "flos": 16106834177280.0, - "grad_norm": 2.4197486882062322, - "language_loss": 0.76684916, - "learning_rate": 3.997630461769647e-06, - "loss": 0.7901392, - "num_input_tokens_seen": 16066765, - "step": 749, - "time_per_iteration": 2.5940611362457275 - }, - { - "auxiliary_loss_clip": 0.01243381, - "auxiliary_loss_mlp": 0.01083303, - "balance_loss_clip": 1.06892776, - "balance_loss_mlp": 1.05338168, - "epoch": 0.045092439500977, - "flos": 17858843953920.0, - "grad_norm": 1.926675828378473, - "language_loss": 0.88739896, - "learning_rate": 3.997611471332778e-06, - "loss": 0.91066581, - "num_input_tokens_seen": 16085980, - "step": 750, - "time_per_iteration": 2.551717758178711 - }, - { - "auxiliary_loss_clip": 0.01238484, - "auxiliary_loss_mlp": 0.01077419, - "balance_loss_clip": 1.062783, - "balance_loss_mlp": 1.04404092, - "epoch": 0.04515256275364497, - "flos": 24462815990400.0, - "grad_norm": 3.4910287963746116, - "language_loss": 0.74371743, - "learning_rate": 3.9975924051462825e-06, - "loss": 0.76687646, - "num_input_tokens_seen": 16106260, - "step": 751, - "time_per_iteration": 2.6299028396606445 - }, - { - "auxiliary_loss_clip": 0.0123577, - "auxiliary_loss_mlp": 0.01078322, - "balance_loss_clip": 1.06347609, - "balance_loss_mlp": 1.04884171, - "epoch": 0.04521268600631294, - "flos": 20916385453440.0, - "grad_norm": 3.3938056459605583, - "language_loss": 0.69115144, - "learning_rate": 3.997573263210883e-06, - "loss": 0.71429229, - "num_input_tokens_seen": 16123475, - "step": 752, - "time_per_iteration": 2.571223020553589 - }, - { - "auxiliary_loss_clip": 0.01235899, - "auxiliary_loss_mlp": 0.01060876, - "balance_loss_clip": 1.0627141, - "balance_loss_mlp": 1.03212225, - "epoch": 0.04527280925898091, - "flos": 13371374954880.0, - "grad_norm": 2.69328062598792, - "language_loss": 0.92126763, - "learning_rate": 3.997554045527305e-06, - "loss": 0.94423538, - "num_input_tokens_seen": 16138335, - "step": 753, - "time_per_iteration": 2.6100237369537354 - }, - { - "auxiliary_loss_clip": 0.01239023, - "auxiliary_loss_mlp": 0.01080271, - "balance_loss_clip": 1.06628633, - "balance_loss_mlp": 1.05116034, - "epoch": 0.04533293251164888, - "flos": 23254565276160.0, - "grad_norm": 4.138305317267875, - "language_loss": 0.91373456, - "learning_rate": 3.997534752096277e-06, - "loss": 0.93692756, - "num_input_tokens_seen": 16157110, - "step": 754, - "time_per_iteration": 2.642747402191162 - }, - { - "auxiliary_loss_clip": 0.01229195, - "auxiliary_loss_mlp": 0.01078016, - "balance_loss_clip": 1.06402516, - "balance_loss_mlp": 1.04725957, - "epoch": 0.04539305576431685, - "flos": 12422004537600.0, - "grad_norm": 4.559941934311277, - "language_loss": 0.78558046, - "learning_rate": 3.997515382918531e-06, - "loss": 0.80865264, - "num_input_tokens_seen": 16174155, - "step": 755, - "time_per_iteration": 2.6316659450531006 - }, - { - "auxiliary_loss_clip": 0.01240044, - "auxiliary_loss_mlp": 0.01081048, - "balance_loss_clip": 1.06624937, - "balance_loss_mlp": 1.05099559, - "epoch": 0.04545317901698482, - "flos": 16070995382400.0, - "grad_norm": 2.193539224658874, - "language_loss": 0.78473848, - "learning_rate": 3.9974959379948015e-06, - "loss": 0.80794942, - "num_input_tokens_seen": 16192240, - "step": 756, - "time_per_iteration": 2.6390748023986816 - }, - { - "auxiliary_loss_clip": 0.01101224, - "auxiliary_loss_mlp": 0.01013849, - "balance_loss_clip": 1.02455997, - "balance_loss_mlp": 1.0089612, - "epoch": 0.045513302269652785, - "flos": 66396139021440.0, - "grad_norm": 0.8202876780471967, - "language_loss": 0.62756521, - "learning_rate": 3.997476417325827e-06, - "loss": 0.64871597, - "num_input_tokens_seen": 16255775, - "step": 757, - "time_per_iteration": 3.2393198013305664 - }, - { - "auxiliary_loss_clip": 0.01235136, - "auxiliary_loss_mlp": 0.01071767, - "balance_loss_clip": 1.06455243, - "balance_loss_mlp": 1.04346693, - "epoch": 0.04557342552232076, - "flos": 21471169991040.0, - "grad_norm": 1.6528285304744148, - "language_loss": 0.84211069, - "learning_rate": 3.997456820912346e-06, - "loss": 0.86517978, - "num_input_tokens_seen": 16277015, - "step": 758, - "time_per_iteration": 2.6508655548095703 - }, - { - "auxiliary_loss_clip": 0.01228461, - "auxiliary_loss_mlp": 0.01067033, - "balance_loss_clip": 1.05912399, - "balance_loss_mlp": 1.0391618, - "epoch": 0.04563354877498873, - "flos": 23732680233600.0, - "grad_norm": 2.695805662282291, - "language_loss": 0.88150775, - "learning_rate": 3.997437148755101e-06, - "loss": 0.9044627, - "num_input_tokens_seen": 16296005, - "step": 759, - "time_per_iteration": 2.7782890796661377 - }, - { - "auxiliary_loss_clip": 0.01240589, - "auxiliary_loss_mlp": 0.01078815, - "balance_loss_clip": 1.06747675, - "balance_loss_mlp": 1.04846466, - "epoch": 0.045693672027656694, - "flos": 25735741142400.0, - "grad_norm": 2.392455009776849, - "language_loss": 0.73440695, - "learning_rate": 3.9974174008548405e-06, - "loss": 0.75760102, - "num_input_tokens_seen": 16315300, - "step": 760, - "time_per_iteration": 2.7138822078704834 - }, - { - "auxiliary_loss_clip": 0.01240372, - "auxiliary_loss_mlp": 0.01079791, - "balance_loss_clip": 1.07095265, - "balance_loss_mlp": 1.05162191, - "epoch": 0.045753795280324666, - "flos": 19719016560000.0, - "grad_norm": 3.497321311688565, - "language_loss": 0.81781888, - "learning_rate": 3.9973975772123105e-06, - "loss": 0.84102058, - "num_input_tokens_seen": 16333820, - "step": 761, - "time_per_iteration": 2.631303310394287 - }, - { - "auxiliary_loss_clip": 0.01231969, - "auxiliary_loss_mlp": 0.01078623, - "balance_loss_clip": 1.06324267, - "balance_loss_mlp": 1.04922605, - "epoch": 0.04581391853299264, - "flos": 23255786338560.0, - "grad_norm": 2.0632320043111965, - "language_loss": 0.79811668, - "learning_rate": 3.997377677828266e-06, - "loss": 0.82122266, - "num_input_tokens_seen": 16355290, - "step": 762, - "time_per_iteration": 2.646928071975708 - }, - { - "auxiliary_loss_clip": 0.01093869, - "auxiliary_loss_mlp": 0.01027943, - "balance_loss_clip": 1.01857328, - "balance_loss_mlp": 1.02288842, - "epoch": 0.0458740417856606, - "flos": 64231155601920.0, - "grad_norm": 1.0128965743658471, - "language_loss": 0.58723813, - "learning_rate": 3.9973577027034585e-06, - "loss": 0.60845619, - "num_input_tokens_seen": 16415995, - "step": 763, - "time_per_iteration": 3.1712563037872314 - }, - { - "auxiliary_loss_clip": 0.012343, - "auxiliary_loss_mlp": 0.01082461, - "balance_loss_clip": 1.06205368, - "balance_loss_mlp": 1.0531354, - "epoch": 0.045934165038328575, - "flos": 20770121272320.0, - "grad_norm": 4.978761831483118, - "language_loss": 0.87544954, - "learning_rate": 3.9973376518386475e-06, - "loss": 0.89861715, - "num_input_tokens_seen": 16433120, - "step": 764, - "time_per_iteration": 2.5985426902770996 - }, - { - "auxiliary_loss_clip": 0.01236145, - "auxiliary_loss_mlp": 0.01087868, - "balance_loss_clip": 1.06553543, - "balance_loss_mlp": 1.05854285, - "epoch": 0.04599428829099654, - "flos": 30262891691520.0, - "grad_norm": 2.0894169515773067, - "language_loss": 0.85966802, - "learning_rate": 3.997317525234592e-06, - "loss": 0.88290817, - "num_input_tokens_seen": 16453360, - "step": 765, - "time_per_iteration": 2.6572606563568115 - }, - { - "auxiliary_loss_clip": 0.01239644, - "auxiliary_loss_mlp": 0.01077398, - "balance_loss_clip": 1.06530261, - "balance_loss_mlp": 1.04573584, - "epoch": 0.04605441154366451, - "flos": 23038921975680.0, - "grad_norm": 2.628046285830335, - "language_loss": 0.88265938, - "learning_rate": 3.997297322892056e-06, - "loss": 0.90582979, - "num_input_tokens_seen": 16471160, - "step": 766, - "time_per_iteration": 2.673226833343506 - }, - { - "auxiliary_loss_clip": 0.01235506, - "auxiliary_loss_mlp": 0.0107998, - "balance_loss_clip": 1.06371713, - "balance_loss_mlp": 1.05115545, - "epoch": 0.046114534796332485, - "flos": 22017407091840.0, - "grad_norm": 2.343908591401411, - "language_loss": 0.84302223, - "learning_rate": 3.997277044811806e-06, - "loss": 0.86617708, - "num_input_tokens_seen": 16488940, - "step": 767, - "time_per_iteration": 2.683429002761841 - }, - { - "auxiliary_loss_clip": 0.01236229, - "auxiliary_loss_mlp": 0.01067844, - "balance_loss_clip": 1.06769753, - "balance_loss_mlp": 1.03791094, - "epoch": 0.04617465804900045, - "flos": 29862380067840.0, - "grad_norm": 1.9268984031305718, - "language_loss": 0.8669976, - "learning_rate": 3.99725669099461e-06, - "loss": 0.89003831, - "num_input_tokens_seen": 16509505, - "step": 768, - "time_per_iteration": 2.8125200271606445 - }, - { - "auxiliary_loss_clip": 0.01234175, - "auxiliary_loss_mlp": 0.01076069, - "balance_loss_clip": 1.06150854, - "balance_loss_mlp": 1.04738712, - "epoch": 0.04623478130166842, - "flos": 25630056351360.0, - "grad_norm": 2.115272554881108, - "language_loss": 0.75152099, - "learning_rate": 3.9972362614412395e-06, - "loss": 0.77462339, - "num_input_tokens_seen": 16528840, - "step": 769, - "time_per_iteration": 2.7286128997802734 - }, - { - "auxiliary_loss_clip": 0.01229956, - "auxiliary_loss_mlp": 0.01072391, - "balance_loss_clip": 1.06326365, - "balance_loss_mlp": 1.04462695, - "epoch": 0.04629490455433639, - "flos": 20449080489600.0, - "grad_norm": 1.8368669953292174, - "language_loss": 0.86292851, - "learning_rate": 3.997215756152471e-06, - "loss": 0.885952, - "num_input_tokens_seen": 16548335, - "step": 770, - "time_per_iteration": 2.68608021736145 - }, - { - "auxiliary_loss_clip": 0.01239009, - "auxiliary_loss_mlp": 0.01072125, - "balance_loss_clip": 1.06274092, - "balance_loss_mlp": 1.04284704, - "epoch": 0.04635502780700436, - "flos": 23148736830720.0, - "grad_norm": 2.058802627607224, - "language_loss": 0.86842889, - "learning_rate": 3.99719517512908e-06, - "loss": 0.89154023, - "num_input_tokens_seen": 16567725, - "step": 771, - "time_per_iteration": 2.637509822845459 - }, - { - "auxiliary_loss_clip": 0.01239449, - "auxiliary_loss_mlp": 0.01079651, - "balance_loss_clip": 1.06184912, - "balance_loss_mlp": 1.04884768, - "epoch": 0.04641515105967233, - "flos": 23292020183040.0, - "grad_norm": 1.87920888608735, - "language_loss": 0.83691382, - "learning_rate": 3.997174518371848e-06, - "loss": 0.8601048, - "num_input_tokens_seen": 16588175, - "step": 772, - "time_per_iteration": 2.745006561279297 - }, - { - "auxiliary_loss_clip": 0.01236322, - "auxiliary_loss_mlp": 0.0107061, - "balance_loss_clip": 1.06672883, - "balance_loss_mlp": 1.04220271, - "epoch": 0.046475274312340296, - "flos": 25115204759040.0, - "grad_norm": 1.9655107083336736, - "language_loss": 0.73639083, - "learning_rate": 3.997153785881557e-06, - "loss": 0.75946015, - "num_input_tokens_seen": 16607735, - "step": 773, - "time_per_iteration": 2.869290828704834 - }, - { - "auxiliary_loss_clip": 0.01231219, - "auxiliary_loss_mlp": 0.01071681, - "balance_loss_clip": 1.06529772, - "balance_loss_mlp": 1.04054356, - "epoch": 0.04653539756500827, - "flos": 25264916645760.0, - "grad_norm": 2.096431798380756, - "language_loss": 0.78228974, - "learning_rate": 3.997132977658996e-06, - "loss": 0.80531871, - "num_input_tokens_seen": 16627225, - "step": 774, - "time_per_iteration": 2.6967568397521973 - }, - { - "auxiliary_loss_clip": 0.01230587, - "auxiliary_loss_mlp": 0.01069519, - "balance_loss_clip": 1.06347871, - "balance_loss_mlp": 1.04131365, - "epoch": 0.046595520817676234, - "flos": 35404150089600.0, - "grad_norm": 2.018140205527256, - "language_loss": 0.73187691, - "learning_rate": 3.997112093704952e-06, - "loss": 0.75487792, - "num_input_tokens_seen": 16647785, - "step": 775, - "time_per_iteration": 2.737140417098999 - }, - { - "auxiliary_loss_clip": 0.01231996, - "auxiliary_loss_mlp": 0.01066454, - "balance_loss_clip": 1.06187618, - "balance_loss_mlp": 1.03650832, - "epoch": 0.046655644070344206, - "flos": 18112516778880.0, - "grad_norm": 1.668093168561758, - "language_loss": 0.77180624, - "learning_rate": 3.997091134020217e-06, - "loss": 0.7947908, - "num_input_tokens_seen": 16667555, - "step": 776, - "time_per_iteration": 4.154085159301758 - }, - { - "auxiliary_loss_clip": 0.0122577, - "auxiliary_loss_mlp": 0.01071334, - "balance_loss_clip": 1.06031108, - "balance_loss_mlp": 1.04352236, - "epoch": 0.04671576732301218, - "flos": 29205286617600.0, - "grad_norm": 1.9054628166827923, - "language_loss": 0.7087816, - "learning_rate": 3.997070098605585e-06, - "loss": 0.73175263, - "num_input_tokens_seen": 16686875, - "step": 777, - "time_per_iteration": 4.176887512207031 - }, - { - "auxiliary_loss_clip": 0.0122979, - "auxiliary_loss_mlp": 0.01076806, - "balance_loss_clip": 1.06275606, - "balance_loss_mlp": 1.04705119, - "epoch": 0.04677589057568014, - "flos": 30478319510400.0, - "grad_norm": 1.8083238359854679, - "language_loss": 0.77069759, - "learning_rate": 3.997048987461856e-06, - "loss": 0.79376352, - "num_input_tokens_seen": 16706420, - "step": 778, - "time_per_iteration": 5.943394422531128 - }, - { - "auxiliary_loss_clip": 0.01227067, - "auxiliary_loss_mlp": 0.01064982, - "balance_loss_clip": 1.06043744, - "balance_loss_mlp": 1.03563297, - "epoch": 0.046836013828348115, - "flos": 20557674282240.0, - "grad_norm": 2.1737778598926463, - "language_loss": 0.79181123, - "learning_rate": 3.997027800589829e-06, - "loss": 0.81473172, - "num_input_tokens_seen": 16726390, - "step": 779, - "time_per_iteration": 2.611804485321045 - }, - { - "auxiliary_loss_clip": 0.01219629, - "auxiliary_loss_mlp": 0.01070238, - "balance_loss_clip": 1.05842376, - "balance_loss_mlp": 1.04271269, - "epoch": 0.04689613708101608, - "flos": 25447378757760.0, - "grad_norm": 1.888854926622149, - "language_loss": 0.77364886, - "learning_rate": 3.997006537990308e-06, - "loss": 0.79654753, - "num_input_tokens_seen": 16748965, - "step": 780, - "time_per_iteration": 2.668239116668701 - }, - { - "auxiliary_loss_clip": 0.012253, - "auxiliary_loss_mlp": 0.01073321, - "balance_loss_clip": 1.06098521, - "balance_loss_mlp": 1.04605746, - "epoch": 0.04695626033368405, - "flos": 23001395241600.0, - "grad_norm": 1.7616538282563206, - "language_loss": 0.76700419, - "learning_rate": 3.996985199664099e-06, - "loss": 0.78999043, - "num_input_tokens_seen": 16768620, - "step": 781, - "time_per_iteration": 2.5979926586151123 - }, - { - "auxiliary_loss_clip": 0.01236637, - "auxiliary_loss_mlp": 0.01077479, - "balance_loss_clip": 1.0639379, - "balance_loss_mlp": 1.04836786, - "epoch": 0.047016383586352024, - "flos": 29133357632640.0, - "grad_norm": 3.0946494667490856, - "language_loss": 0.73786414, - "learning_rate": 3.99696378561201e-06, - "loss": 0.76100528, - "num_input_tokens_seen": 16789755, - "step": 782, - "time_per_iteration": 2.708855390548706 - }, - { - "auxiliary_loss_clip": 0.0122968, - "auxiliary_loss_mlp": 0.01069368, - "balance_loss_clip": 1.06431556, - "balance_loss_mlp": 1.04253423, - "epoch": 0.04707650683901999, - "flos": 14976330451200.0, - "grad_norm": 2.1459158015790183, - "language_loss": 0.80524659, - "learning_rate": 3.996942295834855e-06, - "loss": 0.82823706, - "num_input_tokens_seen": 16807585, - "step": 783, - "time_per_iteration": 2.6355738639831543 - }, - { - "auxiliary_loss_clip": 0.01222415, - "auxiliary_loss_mlp": 0.01063155, - "balance_loss_clip": 1.06221437, - "balance_loss_mlp": 1.03663135, - "epoch": 0.04713663009168796, - "flos": 21651118151040.0, - "grad_norm": 1.9084512066318515, - "language_loss": 0.81687874, - "learning_rate": 3.996920730333448e-06, - "loss": 0.83973444, - "num_input_tokens_seen": 16827220, - "step": 784, - "time_per_iteration": 2.64365291595459 - }, - { - "auxiliary_loss_clip": 0.01226632, - "auxiliary_loss_mlp": 0.01074549, - "balance_loss_clip": 1.0582943, - "balance_loss_mlp": 1.04719007, - "epoch": 0.04719675334435593, - "flos": 21325408600320.0, - "grad_norm": 3.970707764370453, - "language_loss": 0.80619848, - "learning_rate": 3.996899089108607e-06, - "loss": 0.82921028, - "num_input_tokens_seen": 16846230, - "step": 785, - "time_per_iteration": 2.682971715927124 - }, - { - "auxiliary_loss_clip": 0.01231621, - "auxiliary_loss_mlp": 0.01063774, - "balance_loss_clip": 1.06683421, - "balance_loss_mlp": 1.03784585, - "epoch": 0.0472568765970239, - "flos": 17931383470080.0, - "grad_norm": 2.074448818096939, - "language_loss": 0.89784658, - "learning_rate": 3.996877372161152e-06, - "loss": 0.92080051, - "num_input_tokens_seen": 16865325, - "step": 786, - "time_per_iteration": 2.6072235107421875 - }, - { - "auxiliary_loss_clip": 0.01227201, - "auxiliary_loss_mlp": 0.01069453, - "balance_loss_clip": 1.05475712, - "balance_loss_mlp": 1.03912568, - "epoch": 0.04731699984969187, - "flos": 18077324428800.0, - "grad_norm": 6.783818284100465, - "language_loss": 0.76794451, - "learning_rate": 3.9968555794919065e-06, - "loss": 0.79091108, - "num_input_tokens_seen": 16882930, - "step": 787, - "time_per_iteration": 2.595069646835327 - }, - { - "auxiliary_loss_clip": 0.01233526, - "auxiliary_loss_mlp": 0.01070856, - "balance_loss_clip": 1.06563127, - "balance_loss_mlp": 1.04248405, - "epoch": 0.047377123102359836, - "flos": 23185078416000.0, - "grad_norm": 2.309745026689568, - "language_loss": 0.81301165, - "learning_rate": 3.996833711101698e-06, - "loss": 0.83605546, - "num_input_tokens_seen": 16900710, - "step": 788, - "time_per_iteration": 2.633812427520752 - }, - { - "auxiliary_loss_clip": 0.01225447, - "auxiliary_loss_mlp": 0.01078934, - "balance_loss_clip": 1.06370282, - "balance_loss_mlp": 1.04934621, - "epoch": 0.04743724635502781, - "flos": 22747794243840.0, - "grad_norm": 2.941245147417381, - "language_loss": 0.84428835, - "learning_rate": 3.996811766991355e-06, - "loss": 0.86733222, - "num_input_tokens_seen": 16919210, - "step": 789, - "time_per_iteration": 2.6711082458496094 - }, - { - "auxiliary_loss_clip": 0.01230866, - "auxiliary_loss_mlp": 0.01071483, - "balance_loss_clip": 1.06367648, - "balance_loss_mlp": 1.0441606, - "epoch": 0.04749736960769577, - "flos": 17238702620160.0, - "grad_norm": 2.0289407228390615, - "language_loss": 0.81787878, - "learning_rate": 3.996789747161709e-06, - "loss": 0.84090227, - "num_input_tokens_seen": 16937125, - "step": 790, - "time_per_iteration": 2.6136717796325684 - }, - { - "auxiliary_loss_clip": 0.01224033, - "auxiliary_loss_mlp": 0.01064065, - "balance_loss_clip": 1.05880189, - "balance_loss_mlp": 1.03546715, - "epoch": 0.047557492860363745, - "flos": 40479261592320.0, - "grad_norm": 2.9735437778568965, - "language_loss": 0.88116109, - "learning_rate": 3.996767651613597e-06, - "loss": 0.90404207, - "num_input_tokens_seen": 16958610, - "step": 791, - "time_per_iteration": 2.747586727142334 - }, - { - "auxiliary_loss_clip": 0.01226267, - "auxiliary_loss_mlp": 0.01066471, - "balance_loss_clip": 1.06144643, - "balance_loss_mlp": 1.03743124, - "epoch": 0.04761761611303172, - "flos": 18698004466560.0, - "grad_norm": 2.1239226540804537, - "language_loss": 0.90671498, - "learning_rate": 3.996745480347854e-06, - "loss": 0.92964232, - "num_input_tokens_seen": 16977300, - "step": 792, - "time_per_iteration": 2.591477870941162 - }, - { - "auxiliary_loss_clip": 0.01226882, - "auxiliary_loss_mlp": 0.0107926, - "balance_loss_clip": 1.05968022, - "balance_loss_mlp": 1.05225897, - "epoch": 0.04767773936569968, - "flos": 20921987975040.0, - "grad_norm": 1.9120988315570397, - "language_loss": 0.73246223, - "learning_rate": 3.996723233365324e-06, - "loss": 0.75552362, - "num_input_tokens_seen": 16994950, - "step": 793, - "time_per_iteration": 2.6319899559020996 - }, - { - "auxiliary_loss_clip": 0.01231301, - "auxiliary_loss_mlp": 0.01070716, - "balance_loss_clip": 1.06213653, - "balance_loss_mlp": 1.04146254, - "epoch": 0.047737862618367655, - "flos": 23732680233600.0, - "grad_norm": 1.86347948201136, - "language_loss": 0.86139679, - "learning_rate": 3.996700910666847e-06, - "loss": 0.88441694, - "num_input_tokens_seen": 17014760, - "step": 794, - "time_per_iteration": 2.6835687160491943 - }, - { - "auxiliary_loss_clip": 0.01228204, - "auxiliary_loss_mlp": 0.01077895, - "balance_loss_clip": 1.05969596, - "balance_loss_mlp": 1.04935622, - "epoch": 0.04779798587103562, - "flos": 23695764030720.0, - "grad_norm": 2.370166301863074, - "language_loss": 0.69069195, - "learning_rate": 3.996678512253272e-06, - "loss": 0.71375293, - "num_input_tokens_seen": 17032715, - "step": 795, - "time_per_iteration": 2.669261932373047 - }, - { - "auxiliary_loss_clip": 0.01225748, - "auxiliary_loss_mlp": 0.01076275, - "balance_loss_clip": 1.06129098, - "balance_loss_mlp": 1.04756904, - "epoch": 0.04785810912370359, - "flos": 23183641872000.0, - "grad_norm": 1.744925212230271, - "language_loss": 0.810256, - "learning_rate": 3.996656038125449e-06, - "loss": 0.83327615, - "num_input_tokens_seen": 17052215, - "step": 796, - "time_per_iteration": 2.5800065994262695 - }, - { - "auxiliary_loss_clip": 0.01228235, - "auxiliary_loss_mlp": 0.01065433, - "balance_loss_clip": 1.06224668, - "balance_loss_mlp": 1.03638172, - "epoch": 0.047918232376371564, - "flos": 18040623707520.0, - "grad_norm": 1.979164246440182, - "language_loss": 0.8128069, - "learning_rate": 3.996633488284228e-06, - "loss": 0.83574355, - "num_input_tokens_seen": 17069225, - "step": 797, - "time_per_iteration": 2.58878493309021 - }, - { - "auxiliary_loss_clip": 0.01100259, - "auxiliary_loss_mlp": 0.01007215, - "balance_loss_clip": 1.02779806, - "balance_loss_mlp": 1.00266171, - "epoch": 0.04797835562903953, - "flos": 62442588758400.0, - "grad_norm": 0.912416075283383, - "language_loss": 0.64532876, - "learning_rate": 3.996610862730465e-06, - "loss": 0.66640353, - "num_input_tokens_seen": 17126680, - "step": 798, - "time_per_iteration": 3.0779380798339844 - }, - { - "auxiliary_loss_clip": 0.01229665, - "auxiliary_loss_mlp": 0.01068747, - "balance_loss_clip": 1.05799031, - "balance_loss_mlp": 1.04121017, - "epoch": 0.0480384788817075, - "flos": 21507296094720.0, - "grad_norm": 2.0206600610723333, - "language_loss": 0.91274291, - "learning_rate": 3.996588161465018e-06, - "loss": 0.935727, - "num_input_tokens_seen": 17144835, - "step": 799, - "time_per_iteration": 2.660438299179077 - }, - { - "auxiliary_loss_clip": 0.01230751, - "auxiliary_loss_mlp": 0.010715, - "balance_loss_clip": 1.06640434, - "balance_loss_mlp": 1.04274678, - "epoch": 0.048098602134375466, - "flos": 21726710323200.0, - "grad_norm": 2.0752654205923866, - "language_loss": 0.86825287, - "learning_rate": 3.996565384488748e-06, - "loss": 0.89127541, - "num_input_tokens_seen": 17165030, - "step": 800, - "time_per_iteration": 2.6700456142425537 - }, - { - "auxiliary_loss_clip": 0.01229893, - "auxiliary_loss_mlp": 0.01072058, - "balance_loss_clip": 1.06186771, - "balance_loss_mlp": 1.04618931, - "epoch": 0.04815872538704344, - "flos": 22931082368640.0, - "grad_norm": 2.5310108886746976, - "language_loss": 0.83949852, - "learning_rate": 3.996542531802518e-06, - "loss": 0.86251807, - "num_input_tokens_seen": 17184895, - "step": 801, - "time_per_iteration": 2.7724695205688477 - }, - { - "auxiliary_loss_clip": 0.01227846, - "auxiliary_loss_mlp": 0.010756, - "balance_loss_clip": 1.06226814, - "balance_loss_mlp": 1.04847932, - "epoch": 0.04821884863971141, - "flos": 43174716042240.0, - "grad_norm": 1.9607091513106172, - "language_loss": 0.79818648, - "learning_rate": 3.996519603407196e-06, - "loss": 0.82122099, - "num_input_tokens_seen": 17208225, - "step": 802, - "time_per_iteration": 2.861309766769409 - }, - { - "auxiliary_loss_clip": 0.0122832, - "auxiliary_loss_mlp": 0.01069086, - "balance_loss_clip": 1.06392837, - "balance_loss_mlp": 1.04278886, - "epoch": 0.048278971892379376, - "flos": 18620006083200.0, - "grad_norm": 1.798745906633195, - "language_loss": 0.86600745, - "learning_rate": 3.996496599303649e-06, - "loss": 0.88898146, - "num_input_tokens_seen": 17226305, - "step": 803, - "time_per_iteration": 2.612684965133667 - }, - { - "auxiliary_loss_clip": 0.01222438, - "auxiliary_loss_mlp": 0.01063116, - "balance_loss_clip": 1.06214345, - "balance_loss_mlp": 1.03643703, - "epoch": 0.04833909514504735, - "flos": 20230061310720.0, - "grad_norm": 5.958214069975319, - "language_loss": 0.85139012, - "learning_rate": 3.996473519492753e-06, - "loss": 0.8742457, - "num_input_tokens_seen": 17244545, - "step": 804, - "time_per_iteration": 2.596965789794922 - }, - { - "auxiliary_loss_clip": 0.01225485, - "auxiliary_loss_mlp": 0.0106948, - "balance_loss_clip": 1.06206632, - "balance_loss_mlp": 1.04222918, - "epoch": 0.04839921839771532, - "flos": 24645170361600.0, - "grad_norm": 1.9492340448514227, - "language_loss": 0.85939878, - "learning_rate": 3.99645036397538e-06, - "loss": 0.88234842, - "num_input_tokens_seen": 17265730, - "step": 805, - "time_per_iteration": 2.6773781776428223 - }, - { - "auxiliary_loss_clip": 0.01221339, - "auxiliary_loss_mlp": 0.01071867, - "balance_loss_clip": 1.05968738, - "balance_loss_mlp": 1.04591477, - "epoch": 0.048459341650383285, - "flos": 24827452905600.0, - "grad_norm": 1.8764849579047527, - "language_loss": 0.68025368, - "learning_rate": 3.9964271327524085e-06, - "loss": 0.70318574, - "num_input_tokens_seen": 17284820, - "step": 806, - "time_per_iteration": 2.6270596981048584 - }, - { - "auxiliary_loss_clip": 0.01221043, - "auxiliary_loss_mlp": 0.01060505, - "balance_loss_clip": 1.06064904, - "balance_loss_mlp": 1.03384972, - "epoch": 0.04851946490305126, - "flos": 22163204396160.0, - "grad_norm": 8.586680684018, - "language_loss": 0.76488906, - "learning_rate": 3.9964038258247214e-06, - "loss": 0.78770459, - "num_input_tokens_seen": 17305085, - "step": 807, - "time_per_iteration": 2.6783089637756348 - }, - { - "auxiliary_loss_clip": 0.01218859, - "auxiliary_loss_mlp": 0.01068871, - "balance_loss_clip": 1.05734789, - "balance_loss_mlp": 1.04290676, - "epoch": 0.04857958815571922, - "flos": 19792022952960.0, - "grad_norm": 2.4056749627509157, - "language_loss": 0.86882269, - "learning_rate": 3.9963804431932005e-06, - "loss": 0.89170003, - "num_input_tokens_seen": 17322715, - "step": 808, - "time_per_iteration": 2.6447641849517822 - }, - { - "auxiliary_loss_clip": 0.01227529, - "auxiliary_loss_mlp": 0.01069446, - "balance_loss_clip": 1.06140316, - "balance_loss_mlp": 1.0424329, - "epoch": 0.048639711408387194, - "flos": 18697968552960.0, - "grad_norm": 2.6040733531164424, - "language_loss": 0.89710444, - "learning_rate": 3.996356984858732e-06, - "loss": 0.92007422, - "num_input_tokens_seen": 17341455, - "step": 809, - "time_per_iteration": 2.6679790019989014 - }, - { - "auxiliary_loss_clip": 0.01226608, - "auxiliary_loss_mlp": 0.01067211, - "balance_loss_clip": 1.0643065, - "balance_loss_mlp": 1.04060316, - "epoch": 0.048699834661055166, - "flos": 24863507182080.0, - "grad_norm": 3.0721319202916324, - "language_loss": 0.84918916, - "learning_rate": 3.996333450822208e-06, - "loss": 0.87212729, - "num_input_tokens_seen": 17360765, - "step": 810, - "time_per_iteration": 2.696772575378418 - }, - { - "auxiliary_loss_clip": 0.01227202, - "auxiliary_loss_mlp": 0.01067343, - "balance_loss_clip": 1.0622344, - "balance_loss_mlp": 1.04049683, - "epoch": 0.04875995791372313, - "flos": 20704010290560.0, - "grad_norm": 1.8136675943398954, - "language_loss": 0.80799425, - "learning_rate": 3.99630984108452e-06, - "loss": 0.83093977, - "num_input_tokens_seen": 17380625, - "step": 811, - "time_per_iteration": 2.653808355331421 - }, - { - "auxiliary_loss_clip": 0.01217843, - "auxiliary_loss_mlp": 0.01070621, - "balance_loss_clip": 1.05928314, - "balance_loss_mlp": 1.04466903, - "epoch": 0.048820081166391104, - "flos": 18588297352320.0, - "grad_norm": 1.7193599003225197, - "language_loss": 0.74634516, - "learning_rate": 3.9962861556465615e-06, - "loss": 0.76922977, - "num_input_tokens_seen": 17399355, - "step": 812, - "time_per_iteration": 2.7274649143218994 - }, - { - "auxiliary_loss_clip": 0.01222659, - "auxiliary_loss_mlp": 0.01073562, - "balance_loss_clip": 1.06445217, - "balance_loss_mlp": 1.04862356, - "epoch": 0.04888020441905907, - "flos": 22707322594560.0, - "grad_norm": 1.9311665765462733, - "language_loss": 0.90124279, - "learning_rate": 3.996262394509233e-06, - "loss": 0.92420495, - "num_input_tokens_seen": 17418240, - "step": 813, - "time_per_iteration": 2.654874801635742 - }, - { - "auxiliary_loss_clip": 0.0122, - "auxiliary_loss_mlp": 0.01057827, - "balance_loss_clip": 1.06157589, - "balance_loss_mlp": 1.03248262, - "epoch": 0.04894032767172704, - "flos": 22784351310720.0, - "grad_norm": 1.9238840150723209, - "language_loss": 0.74904704, - "learning_rate": 3.9962385576734335e-06, - "loss": 0.77182531, - "num_input_tokens_seen": 17436250, - "step": 814, - "time_per_iteration": 2.7381603717803955 - }, - { - "auxiliary_loss_clip": 0.01223782, - "auxiliary_loss_mlp": 0.01069686, - "balance_loss_clip": 1.06125045, - "balance_loss_mlp": 1.04289961, - "epoch": 0.04900045092439501, - "flos": 25516147345920.0, - "grad_norm": 2.1966001004582596, - "language_loss": 0.83816808, - "learning_rate": 3.9962146451400675e-06, - "loss": 0.86110282, - "num_input_tokens_seen": 17455750, - "step": 815, - "time_per_iteration": 2.7289621829986572 - }, - { - "auxiliary_loss_clip": 0.01227011, - "auxiliary_loss_mlp": 0.01060571, - "balance_loss_clip": 1.06326818, - "balance_loss_mlp": 1.0344646, - "epoch": 0.04906057417706298, - "flos": 25958136199680.0, - "grad_norm": 2.3329994981275943, - "language_loss": 0.90796101, - "learning_rate": 3.996190656910043e-06, - "loss": 0.93083686, - "num_input_tokens_seen": 17474995, - "step": 816, - "time_per_iteration": 4.174290180206299 - }, - { - "auxiliary_loss_clip": 0.01226278, - "auxiliary_loss_mlp": 0.0105651, - "balance_loss_clip": 1.06172895, - "balance_loss_mlp": 1.03054583, - "epoch": 0.04912069742973095, - "flos": 18624638937600.0, - "grad_norm": 2.2253098946667853, - "language_loss": 0.79834002, - "learning_rate": 3.996166592984268e-06, - "loss": 0.82116789, - "num_input_tokens_seen": 17493395, - "step": 817, - "time_per_iteration": 4.2819907665252686 - }, - { - "auxiliary_loss_clip": 0.01222491, - "auxiliary_loss_mlp": 0.01072358, - "balance_loss_clip": 1.06228495, - "balance_loss_mlp": 1.04563141, - "epoch": 0.049180820682398915, - "flos": 23699786353920.0, - "grad_norm": 1.9292138186207266, - "language_loss": 0.8532303, - "learning_rate": 3.996142453363656e-06, - "loss": 0.8761788, - "num_input_tokens_seen": 17514565, - "step": 818, - "time_per_iteration": 7.687308073043823 - }, - { - "auxiliary_loss_clip": 0.01228571, - "auxiliary_loss_mlp": 0.01064433, - "balance_loss_clip": 1.06170368, - "balance_loss_mlp": 1.0369786, - "epoch": 0.04924094393506689, - "flos": 22420396753920.0, - "grad_norm": 2.1064810754058407, - "language_loss": 0.75623614, - "learning_rate": 3.996118238049124e-06, - "loss": 0.77916616, - "num_input_tokens_seen": 17534590, - "step": 819, - "time_per_iteration": 2.5708072185516357 - }, - { - "auxiliary_loss_clip": 0.01227988, - "auxiliary_loss_mlp": 0.010616, - "balance_loss_clip": 1.06580663, - "balance_loss_mlp": 1.03785336, - "epoch": 0.04930106718773486, - "flos": 15738246766080.0, - "grad_norm": 2.8685299631500487, - "language_loss": 0.85082126, - "learning_rate": 3.996093947041586e-06, - "loss": 0.87371719, - "num_input_tokens_seen": 17551900, - "step": 820, - "time_per_iteration": 2.695204973220825 - }, - { - "auxiliary_loss_clip": 0.01224953, - "auxiliary_loss_mlp": 0.01065985, - "balance_loss_clip": 1.06082845, - "balance_loss_mlp": 1.04037917, - "epoch": 0.049361190440402825, - "flos": 26250628648320.0, - "grad_norm": 1.734636988660555, - "language_loss": 0.90459162, - "learning_rate": 3.996069580341966e-06, - "loss": 0.92750102, - "num_input_tokens_seen": 17571485, - "step": 821, - "time_per_iteration": 2.6284992694854736 - }, - { - "auxiliary_loss_clip": 0.01222526, - "auxiliary_loss_mlp": 0.01080357, - "balance_loss_clip": 1.06015635, - "balance_loss_mlp": 1.05485809, - "epoch": 0.0494213136930708, - "flos": 21252366293760.0, - "grad_norm": 1.7915267676548876, - "language_loss": 0.89795959, - "learning_rate": 3.996045137951188e-06, - "loss": 0.92098844, - "num_input_tokens_seen": 17591410, - "step": 822, - "time_per_iteration": 2.6085855960845947 - }, - { - "auxiliary_loss_clip": 0.0122571, - "auxiliary_loss_mlp": 0.01062887, - "balance_loss_clip": 1.0639379, - "balance_loss_mlp": 1.03472972, - "epoch": 0.04948143694573876, - "flos": 27965506740480.0, - "grad_norm": 2.28747155105076, - "language_loss": 0.67558801, - "learning_rate": 3.996020619870178e-06, - "loss": 0.69847399, - "num_input_tokens_seen": 17612010, - "step": 823, - "time_per_iteration": 2.644277572631836 - }, - { - "auxiliary_loss_clip": 0.01099376, - "auxiliary_loss_mlp": 0.0100741, - "balance_loss_clip": 1.0267303, - "balance_loss_mlp": 1.00266516, - "epoch": 0.049541560198406734, - "flos": 66180995533440.0, - "grad_norm": 1.3456360586087317, - "language_loss": 0.62254131, - "learning_rate": 3.995996026099866e-06, - "loss": 0.64360917, - "num_input_tokens_seen": 17673430, - "step": 824, - "time_per_iteration": 3.230381488800049 - }, - { - "auxiliary_loss_clip": 0.01228758, - "auxiliary_loss_mlp": 0.01066541, - "balance_loss_clip": 1.06346989, - "balance_loss_mlp": 1.03909945, - "epoch": 0.049601683451074706, - "flos": 22892693708160.0, - "grad_norm": 1.8854339538524305, - "language_loss": 0.90479428, - "learning_rate": 3.995971356641185e-06, - "loss": 0.92774737, - "num_input_tokens_seen": 17689545, - "step": 825, - "time_per_iteration": 2.58868670463562 - }, - { - "auxiliary_loss_clip": 0.01227734, - "auxiliary_loss_mlp": 0.01066527, - "balance_loss_clip": 1.06315517, - "balance_loss_mlp": 1.03844118, - "epoch": 0.04966180670374267, - "flos": 21433643256960.0, - "grad_norm": 2.307419213246734, - "language_loss": 0.66851091, - "learning_rate": 3.9959466114950695e-06, - "loss": 0.69145352, - "num_input_tokens_seen": 17705965, - "step": 826, - "time_per_iteration": 2.59468412399292 - }, - { - "auxiliary_loss_clip": 0.01230149, - "auxiliary_loss_mlp": 0.01069061, - "balance_loss_clip": 1.06421614, - "balance_loss_mlp": 1.04216766, - "epoch": 0.04972192995641064, - "flos": 23107367341440.0, - "grad_norm": 1.8316571551414482, - "language_loss": 0.78298402, - "learning_rate": 3.995921790662459e-06, - "loss": 0.80597603, - "num_input_tokens_seen": 17724580, - "step": 827, - "time_per_iteration": 2.7148005962371826 - }, - { - "auxiliary_loss_clip": 0.01230507, - "auxiliary_loss_mlp": 0.01079145, - "balance_loss_clip": 1.06385946, - "balance_loss_mlp": 1.05119085, - "epoch": 0.04978205320907861, - "flos": 40406147458560.0, - "grad_norm": 1.6017511297862308, - "language_loss": 0.78696525, - "learning_rate": 3.995896894144294e-06, - "loss": 0.81006181, - "num_input_tokens_seen": 17747755, - "step": 828, - "time_per_iteration": 2.86991548538208 - }, - { - "auxiliary_loss_clip": 0.0121958, - "auxiliary_loss_mlp": 0.01059689, - "balance_loss_clip": 1.05939984, - "balance_loss_mlp": 1.03390431, - "epoch": 0.04984217646174658, - "flos": 25228539146880.0, - "grad_norm": 2.48577103336206, - "language_loss": 0.83530867, - "learning_rate": 3.995871921941519e-06, - "loss": 0.85810131, - "num_input_tokens_seen": 17768550, - "step": 829, - "time_per_iteration": 2.655895948410034 - }, - { - "auxiliary_loss_clip": 0.01226863, - "auxiliary_loss_mlp": 0.01080723, - "balance_loss_clip": 1.06109536, - "balance_loss_mlp": 1.05068195, - "epoch": 0.04990229971441455, - "flos": 15959636242560.0, - "grad_norm": 2.078538436430036, - "language_loss": 0.74857247, - "learning_rate": 3.99584687405508e-06, - "loss": 0.77164829, - "num_input_tokens_seen": 17786080, - "step": 830, - "time_per_iteration": 2.5820400714874268 - }, - { - "auxiliary_loss_clip": 0.0122584, - "auxiliary_loss_mlp": 0.01074077, - "balance_loss_clip": 1.06154907, - "balance_loss_mlp": 1.04667115, - "epoch": 0.04996242296708252, - "flos": 18405116968320.0, - "grad_norm": 1.8327841960194244, - "language_loss": 0.79279459, - "learning_rate": 3.995821750485929e-06, - "loss": 0.81579381, - "num_input_tokens_seen": 17803635, - "step": 831, - "time_per_iteration": 2.5980231761932373 - }, - { - "auxiliary_loss_clip": 0.01173206, - "auxiliary_loss_mlp": 0.01072743, - "balance_loss_clip": 1.0542444, - "balance_loss_mlp": 1.04725623, - "epoch": 0.05002254621975049, - "flos": 17858053854720.0, - "grad_norm": 3.034319898285603, - "language_loss": 0.91497368, - "learning_rate": 3.995796551235016e-06, - "loss": 0.93743312, - "num_input_tokens_seen": 17822190, - "step": 832, - "time_per_iteration": 2.7498815059661865 - }, - { - "auxiliary_loss_clip": 0.01194428, - "auxiliary_loss_mlp": 0.01081719, - "balance_loss_clip": 1.05826366, - "balance_loss_mlp": 1.05667353, - "epoch": 0.050082669472418455, - "flos": 45660273367680.0, - "grad_norm": 1.887029338258115, - "language_loss": 0.83167893, - "learning_rate": 3.9957712763032974e-06, - "loss": 0.85444039, - "num_input_tokens_seen": 17846915, - "step": 833, - "time_per_iteration": 2.863208770751953 - }, - { - "auxiliary_loss_clip": 0.01199525, - "auxiliary_loss_mlp": 0.01061962, - "balance_loss_clip": 1.05888343, - "balance_loss_mlp": 1.03468657, - "epoch": 0.05014279272508643, - "flos": 37962067363200.0, - "grad_norm": 2.8753922020214033, - "language_loss": 0.82409853, - "learning_rate": 3.995745925691733e-06, - "loss": 0.84671336, - "num_input_tokens_seen": 17867270, - "step": 834, - "time_per_iteration": 2.7868030071258545 - }, - { - "auxiliary_loss_clip": 0.01216246, - "auxiliary_loss_mlp": 0.01064427, - "balance_loss_clip": 1.06272483, - "balance_loss_mlp": 1.03672278, - "epoch": 0.0502029159777544, - "flos": 20996179516800.0, - "grad_norm": 2.2306487397141646, - "language_loss": 0.92186153, - "learning_rate": 3.995720499401282e-06, - "loss": 0.94466823, - "num_input_tokens_seen": 17884880, - "step": 835, - "time_per_iteration": 2.6224496364593506 - }, - { - "auxiliary_loss_clip": 0.01229494, - "auxiliary_loss_mlp": 0.01074922, - "balance_loss_clip": 1.06143415, - "balance_loss_mlp": 1.0464313, - "epoch": 0.050263039230422364, - "flos": 15888066393600.0, - "grad_norm": 2.196832783808158, - "language_loss": 0.76143622, - "learning_rate": 3.995694997432911e-06, - "loss": 0.78448039, - "num_input_tokens_seen": 17903695, - "step": 836, - "time_per_iteration": 2.5648462772369385 - }, - { - "auxiliary_loss_clip": 0.01211162, - "auxiliary_loss_mlp": 0.01075977, - "balance_loss_clip": 1.06259084, - "balance_loss_mlp": 1.04992962, - "epoch": 0.050323162483090336, - "flos": 23732752060800.0, - "grad_norm": 2.100773352560791, - "language_loss": 0.83627856, - "learning_rate": 3.9956694197875855e-06, - "loss": 0.85914999, - "num_input_tokens_seen": 17920745, - "step": 837, - "time_per_iteration": 2.7420156002044678 - }, - { - "auxiliary_loss_clip": 0.01198815, - "auxiliary_loss_mlp": 0.0078439, - "balance_loss_clip": 1.06345344, - "balance_loss_mlp": 1.00053763, - "epoch": 0.0503832857357583, - "flos": 20266223328000.0, - "grad_norm": 2.1353335821274477, - "language_loss": 0.72857559, - "learning_rate": 3.995643766466275e-06, - "loss": 0.7484076, - "num_input_tokens_seen": 17938220, - "step": 838, - "time_per_iteration": 2.679177761077881 - }, - { - "auxiliary_loss_clip": 0.01189223, - "auxiliary_loss_mlp": 0.01071526, - "balance_loss_clip": 1.05415273, - "balance_loss_mlp": 1.04510927, - "epoch": 0.05044340898842627, - "flos": 17785011548160.0, - "grad_norm": 1.8138261016039334, - "language_loss": 0.83462799, - "learning_rate": 3.995618037469953e-06, - "loss": 0.85723549, - "num_input_tokens_seen": 17957325, - "step": 839, - "time_per_iteration": 2.69063663482666 - }, - { - "auxiliary_loss_clip": 0.01220356, - "auxiliary_loss_mlp": 0.01069331, - "balance_loss_clip": 1.05991399, - "balance_loss_mlp": 1.04411805, - "epoch": 0.050503532241094246, - "flos": 22966526113920.0, - "grad_norm": 1.7513762525269907, - "language_loss": 0.85775483, - "learning_rate": 3.995592232799595e-06, - "loss": 0.88065171, - "num_input_tokens_seen": 17975875, - "step": 840, - "time_per_iteration": 2.6477303504943848 - }, - { - "auxiliary_loss_clip": 0.01192112, - "auxiliary_loss_mlp": 0.01064377, - "balance_loss_clip": 1.05451894, - "balance_loss_mlp": 1.036291, - "epoch": 0.05056365549376221, - "flos": 22776989022720.0, - "grad_norm": 1.7956760046069329, - "language_loss": 0.9457823, - "learning_rate": 3.99556635245618e-06, - "loss": 0.96834719, - "num_input_tokens_seen": 17994340, - "step": 841, - "time_per_iteration": 2.8354220390319824 - }, - { - "auxiliary_loss_clip": 0.0122473, - "auxiliary_loss_mlp": 0.01070125, - "balance_loss_clip": 1.06219172, - "balance_loss_mlp": 1.04329097, - "epoch": 0.05062377874643018, - "flos": 30916968399360.0, - "grad_norm": 2.3106044659054104, - "language_loss": 0.77566791, - "learning_rate": 3.995540396440688e-06, - "loss": 0.79861641, - "num_input_tokens_seen": 18015260, - "step": 842, - "time_per_iteration": 2.6909749507904053 - }, - { - "auxiliary_loss_clip": 0.01214637, - "auxiliary_loss_mlp": 0.01071033, - "balance_loss_clip": 1.06270838, - "balance_loss_mlp": 1.04391265, - "epoch": 0.05068390199909815, - "flos": 19647159402240.0, - "grad_norm": 2.8849837971101864, - "language_loss": 0.78126526, - "learning_rate": 3.995514364754105e-06, - "loss": 0.80412203, - "num_input_tokens_seen": 18033960, - "step": 843, - "time_per_iteration": 2.6534156799316406 - }, - { - "auxiliary_loss_clip": 0.01212948, - "auxiliary_loss_mlp": 0.01063612, - "balance_loss_clip": 1.06317043, - "balance_loss_mlp": 1.03894806, - "epoch": 0.05074402525176612, - "flos": 37962103276800.0, - "grad_norm": 1.9320015451631862, - "language_loss": 0.83256191, - "learning_rate": 3.995488257397417e-06, - "loss": 0.85532749, - "num_input_tokens_seen": 18056700, - "step": 844, - "time_per_iteration": 2.7682149410247803 - }, - { - "auxiliary_loss_clip": 0.01216308, - "auxiliary_loss_mlp": 0.01067162, - "balance_loss_clip": 1.06307864, - "balance_loss_mlp": 1.04138875, - "epoch": 0.05080414850443409, - "flos": 22054610603520.0, - "grad_norm": 2.113957107027846, - "language_loss": 0.77108061, - "learning_rate": 3.995462074371614e-06, - "loss": 0.79391527, - "num_input_tokens_seen": 18075815, - "step": 845, - "time_per_iteration": 2.6720399856567383 - }, - { - "auxiliary_loss_clip": 0.01206643, - "auxiliary_loss_mlp": 0.01065522, - "balance_loss_clip": 1.05881417, - "balance_loss_mlp": 1.03885484, - "epoch": 0.05086427175710206, - "flos": 20225787592320.0, - "grad_norm": 1.8497392628450484, - "language_loss": 0.87773871, - "learning_rate": 3.99543581567769e-06, - "loss": 0.90046036, - "num_input_tokens_seen": 18095095, - "step": 846, - "time_per_iteration": 2.696049690246582 - }, - { - "auxiliary_loss_clip": 0.01206291, - "auxiliary_loss_mlp": 0.01069231, - "balance_loss_clip": 1.06204462, - "balance_loss_mlp": 1.04330277, - "epoch": 0.05092439500977003, - "flos": 15159223526400.0, - "grad_norm": 1.695550491545423, - "language_loss": 0.87364423, - "learning_rate": 3.9954094813166394e-06, - "loss": 0.89639944, - "num_input_tokens_seen": 18112675, - "step": 847, - "time_per_iteration": 2.666907548904419 - }, - { - "auxiliary_loss_clip": 0.01175052, - "auxiliary_loss_mlp": 0.01071976, - "balance_loss_clip": 1.06267309, - "balance_loss_mlp": 1.0447005, - "epoch": 0.050984518262437994, - "flos": 22055149307520.0, - "grad_norm": 2.5687168450386637, - "language_loss": 0.81878662, - "learning_rate": 3.995383071289462e-06, - "loss": 0.84125686, - "num_input_tokens_seen": 18130745, - "step": 848, - "time_per_iteration": 2.782135486602783 - }, - { - "auxiliary_loss_clip": 0.0122638, - "auxiliary_loss_mlp": 0.01071388, - "balance_loss_clip": 1.06619906, - "balance_loss_mlp": 1.04544854, - "epoch": 0.05104464151510597, - "flos": 30225329043840.0, - "grad_norm": 1.678404869397893, - "language_loss": 0.87187904, - "learning_rate": 3.995356585597158e-06, - "loss": 0.89485669, - "num_input_tokens_seen": 18152410, - "step": 849, - "time_per_iteration": 2.787992000579834 - }, - { - "auxiliary_loss_clip": 0.01220251, - "auxiliary_loss_mlp": 0.0106131, - "balance_loss_clip": 1.06049275, - "balance_loss_mlp": 1.03545308, - "epoch": 0.05110476476777394, - "flos": 18332900674560.0, - "grad_norm": 2.125711462362114, - "language_loss": 0.8315587, - "learning_rate": 3.995330024240732e-06, - "loss": 0.85437429, - "num_input_tokens_seen": 18170870, - "step": 850, - "time_per_iteration": 2.6548752784729004 - }, - { - "auxiliary_loss_clip": 0.01210598, - "auxiliary_loss_mlp": 0.01063491, - "balance_loss_clip": 1.06061506, - "balance_loss_mlp": 1.0379566, - "epoch": 0.051164888020441904, - "flos": 37998732170880.0, - "grad_norm": 2.2115645013354253, - "language_loss": 0.65423882, - "learning_rate": 3.995303387221192e-06, - "loss": 0.67697972, - "num_input_tokens_seen": 18191555, - "step": 851, - "time_per_iteration": 2.817197322845459 - }, - { - "auxiliary_loss_clip": 0.0120566, - "auxiliary_loss_mlp": 0.01075745, - "balance_loss_clip": 1.05822444, - "balance_loss_mlp": 1.04761147, - "epoch": 0.051225011273109876, - "flos": 23038634666880.0, - "grad_norm": 2.3720786299251073, - "language_loss": 0.83587611, - "learning_rate": 3.995276674539547e-06, - "loss": 0.8586902, - "num_input_tokens_seen": 18208620, - "step": 852, - "time_per_iteration": 2.685727119445801 - }, - { - "auxiliary_loss_clip": 0.01193575, - "auxiliary_loss_mlp": 0.01074152, - "balance_loss_clip": 1.05924761, - "balance_loss_mlp": 1.04737723, - "epoch": 0.05128513452577785, - "flos": 18259822454400.0, - "grad_norm": 2.1832763559951234, - "language_loss": 0.80761266, - "learning_rate": 3.995249886196811e-06, - "loss": 0.8302899, - "num_input_tokens_seen": 18226370, - "step": 853, - "time_per_iteration": 2.6078240871429443 - }, - { - "auxiliary_loss_clip": 0.01222394, - "auxiliary_loss_mlp": 0.01065268, - "balance_loss_clip": 1.06223083, - "balance_loss_mlp": 1.03780222, - "epoch": 0.05134525777844581, - "flos": 27198957571200.0, - "grad_norm": 1.8511550328562763, - "language_loss": 0.75617325, - "learning_rate": 3.995223022193999e-06, - "loss": 0.77904987, - "num_input_tokens_seen": 18247075, - "step": 854, - "time_per_iteration": 2.633543014526367 - }, - { - "auxiliary_loss_clip": 0.01202415, - "auxiliary_loss_mlp": 0.01065973, - "balance_loss_clip": 1.06141627, - "balance_loss_mlp": 1.03828049, - "epoch": 0.051405381031113785, - "flos": 28362247436160.0, - "grad_norm": 2.04057054323539, - "language_loss": 0.81722355, - "learning_rate": 3.99519608253213e-06, - "loss": 0.83990741, - "num_input_tokens_seen": 18265680, - "step": 855, - "time_per_iteration": 2.760880708694458 - }, - { - "auxiliary_loss_clip": 0.01076712, - "auxiliary_loss_mlp": 0.00762392, - "balance_loss_clip": 1.0358243, - "balance_loss_mlp": 1.00074518, - "epoch": 0.05146550428378175, - "flos": 65618169327360.0, - "grad_norm": 0.9894594919315515, - "language_loss": 0.65634769, - "learning_rate": 3.995169067212227e-06, - "loss": 0.67473871, - "num_input_tokens_seen": 18327015, - "step": 856, - "time_per_iteration": 6.271182298660278 - }, - { - "auxiliary_loss_clip": 0.01194232, - "auxiliary_loss_mlp": 0.01056626, - "balance_loss_clip": 1.05972147, - "balance_loss_mlp": 1.02994716, - "epoch": 0.05152562753644972, - "flos": 22054861998720.0, - "grad_norm": 1.8001295724347575, - "language_loss": 0.77139348, - "learning_rate": 3.9951419762353116e-06, - "loss": 0.79390204, - "num_input_tokens_seen": 18345235, - "step": 857, - "time_per_iteration": 4.905239582061768 - }, - { - "auxiliary_loss_clip": 0.01183581, - "auxiliary_loss_mlp": 0.01059685, - "balance_loss_clip": 1.05640614, - "balance_loss_mlp": 1.03291047, - "epoch": 0.051585750789117694, - "flos": 18509544783360.0, - "grad_norm": 2.111656321737554, - "language_loss": 0.89194518, - "learning_rate": 3.995114809602412e-06, - "loss": 0.91437781, - "num_input_tokens_seen": 18362350, - "step": 858, - "time_per_iteration": 2.7349045276641846 - }, - { - "auxiliary_loss_clip": 0.01196113, - "auxiliary_loss_mlp": 0.01060739, - "balance_loss_clip": 1.06114125, - "balance_loss_mlp": 1.03398848, - "epoch": 0.05164587404178566, - "flos": 23730238108800.0, - "grad_norm": 2.030377637624243, - "language_loss": 0.75684321, - "learning_rate": 3.9950875673145605e-06, - "loss": 0.77941179, - "num_input_tokens_seen": 18383390, - "step": 859, - "time_per_iteration": 2.7611751556396484 - }, - { - "auxiliary_loss_clip": 0.01186313, - "auxiliary_loss_mlp": 0.0107269, - "balance_loss_clip": 1.05708003, - "balance_loss_mlp": 1.04354358, - "epoch": 0.05170599729445363, - "flos": 16252882876800.0, - "grad_norm": 2.134655488493178, - "language_loss": 0.91122925, - "learning_rate": 3.995060249372788e-06, - "loss": 0.93381929, - "num_input_tokens_seen": 18399220, - "step": 860, - "time_per_iteration": 2.666740894317627 - }, - { - "auxiliary_loss_clip": 0.0122488, - "auxiliary_loss_mlp": 0.01060586, - "balance_loss_clip": 1.06531346, - "balance_loss_mlp": 1.03536153, - "epoch": 0.0517661205471216, - "flos": 23985922095360.0, - "grad_norm": 1.7954568874114027, - "language_loss": 0.82378531, - "learning_rate": 3.99503285577813e-06, - "loss": 0.84663993, - "num_input_tokens_seen": 18419005, - "step": 861, - "time_per_iteration": 2.6337814331054688 - }, - { - "auxiliary_loss_clip": 0.01198486, - "auxiliary_loss_mlp": 0.01060236, - "balance_loss_clip": 1.06147969, - "balance_loss_mlp": 1.03437924, - "epoch": 0.05182624379978957, - "flos": 29277718392960.0, - "grad_norm": 2.5785699637959776, - "language_loss": 0.78664875, - "learning_rate": 3.995005386531627e-06, - "loss": 0.80923599, - "num_input_tokens_seen": 18440550, - "step": 862, - "time_per_iteration": 2.7570109367370605 - }, - { - "auxiliary_loss_clip": 0.01189664, - "auxiliary_loss_mlp": 0.01070327, - "balance_loss_clip": 1.058797, - "balance_loss_mlp": 1.04547238, - "epoch": 0.05188636705245754, - "flos": 24170826332160.0, - "grad_norm": 1.7880881456146414, - "language_loss": 0.89090264, - "learning_rate": 3.9949778416343195e-06, - "loss": 0.91350257, - "num_input_tokens_seen": 18461950, - "step": 863, - "time_per_iteration": 2.7118866443634033 - }, - { - "auxiliary_loss_clip": 0.01201772, - "auxiliary_loss_mlp": 0.01064316, - "balance_loss_clip": 1.06488204, - "balance_loss_mlp": 1.0369451, - "epoch": 0.051946490305125506, - "flos": 26760703731840.0, - "grad_norm": 2.081656150811602, - "language_loss": 0.76119763, - "learning_rate": 3.9949502210872525e-06, - "loss": 0.78385854, - "num_input_tokens_seen": 18480555, - "step": 864, - "time_per_iteration": 2.6946637630462646 - }, - { - "auxiliary_loss_clip": 0.01186585, - "auxiliary_loss_mlp": 0.01067959, - "balance_loss_clip": 1.05559874, - "balance_loss_mlp": 1.04046965, - "epoch": 0.05200661355779348, - "flos": 21502519585920.0, - "grad_norm": 1.9374308734697678, - "language_loss": 0.7908361, - "learning_rate": 3.994922524891474e-06, - "loss": 0.81338149, - "num_input_tokens_seen": 18499645, - "step": 865, - "time_per_iteration": 2.7700579166412354 - }, - { - "auxiliary_loss_clip": 0.01210067, - "auxiliary_loss_mlp": 0.01067568, - "balance_loss_clip": 1.06164694, - "balance_loss_mlp": 1.04152083, - "epoch": 0.05206673681046144, - "flos": 18114492026880.0, - "grad_norm": 2.269489500676155, - "language_loss": 0.85860598, - "learning_rate": 3.994894753048032e-06, - "loss": 0.88138229, - "num_input_tokens_seen": 18516810, - "step": 866, - "time_per_iteration": 2.659614086151123 - }, - { - "auxiliary_loss_clip": 0.01186536, - "auxiliary_loss_mlp": 0.01070465, - "balance_loss_clip": 1.06327558, - "balance_loss_mlp": 1.04371393, - "epoch": 0.052126860063129415, - "flos": 17524191916800.0, - "grad_norm": 2.1733876112564565, - "language_loss": 0.87495244, - "learning_rate": 3.9948669055579815e-06, - "loss": 0.89752245, - "num_input_tokens_seen": 18532510, - "step": 867, - "time_per_iteration": 2.740238904953003 - }, - { - "auxiliary_loss_clip": 0.01167585, - "auxiliary_loss_mlp": 0.01078445, - "balance_loss_clip": 1.05696058, - "balance_loss_mlp": 1.05437636, - "epoch": 0.05218698331579739, - "flos": 32598054771840.0, - "grad_norm": 1.8498678854952728, - "language_loss": 0.63917863, - "learning_rate": 3.9948389824223785e-06, - "loss": 0.66163892, - "num_input_tokens_seen": 18557380, - "step": 868, - "time_per_iteration": 2.9310383796691895 - }, - { - "auxiliary_loss_clip": 0.01225135, - "auxiliary_loss_mlp": 0.01069894, - "balance_loss_clip": 1.06287289, - "balance_loss_mlp": 1.04173636, - "epoch": 0.05224710656846535, - "flos": 22127293774080.0, - "grad_norm": 2.742912036955754, - "language_loss": 0.83379138, - "learning_rate": 3.994810983642281e-06, - "loss": 0.85674161, - "num_input_tokens_seen": 18575720, - "step": 869, - "time_per_iteration": 2.6453137397766113 - }, - { - "auxiliary_loss_clip": 0.01216406, - "auxiliary_loss_mlp": 0.01056401, - "balance_loss_clip": 1.0645746, - "balance_loss_mlp": 1.03053236, - "epoch": 0.052307229821133325, - "flos": 11145092976000.0, - "grad_norm": 2.188953802542244, - "language_loss": 0.87822217, - "learning_rate": 3.994782909218751e-06, - "loss": 0.90095031, - "num_input_tokens_seen": 18592185, - "step": 870, - "time_per_iteration": 2.7044875621795654 - }, - { - "auxiliary_loss_clip": 0.01226316, - "auxiliary_loss_mlp": 0.01064746, - "balance_loss_clip": 1.06603277, - "balance_loss_mlp": 1.03965199, - "epoch": 0.05236735307380129, - "flos": 19128070005120.0, - "grad_norm": 1.975067156516721, - "language_loss": 0.80651748, - "learning_rate": 3.994754759152854e-06, - "loss": 0.82942802, - "num_input_tokens_seen": 18609560, - "step": 871, - "time_per_iteration": 2.6892175674438477 - }, - { - "auxiliary_loss_clip": 0.0119502, - "auxiliary_loss_mlp": 0.01064309, - "balance_loss_clip": 1.0650804, - "balance_loss_mlp": 1.0396452, - "epoch": 0.05242747632646926, - "flos": 20960663944320.0, - "grad_norm": 1.7402390708810018, - "language_loss": 0.81330585, - "learning_rate": 3.994726533445656e-06, - "loss": 0.83589917, - "num_input_tokens_seen": 18629405, - "step": 872, - "time_per_iteration": 2.8044185638427734 - }, - { - "auxiliary_loss_clip": 0.0107835, - "auxiliary_loss_mlp": 0.01020667, - "balance_loss_clip": 1.03168392, - "balance_loss_mlp": 1.01515913, - "epoch": 0.052487599579137234, - "flos": 65020542842880.0, - "grad_norm": 0.883483589670371, - "language_loss": 0.61589074, - "learning_rate": 3.9946982320982274e-06, - "loss": 0.63688087, - "num_input_tokens_seen": 18681480, - "step": 873, - "time_per_iteration": 3.1711297035217285 - }, - { - "auxiliary_loss_clip": 0.01197438, - "auxiliary_loss_mlp": 0.01056818, - "balance_loss_clip": 1.06202292, - "balance_loss_mlp": 1.03120041, - "epoch": 0.0525477228318052, - "flos": 23288859786240.0, - "grad_norm": 2.1995328011281488, - "language_loss": 0.88965189, - "learning_rate": 3.994669855111643e-06, - "loss": 0.91219449, - "num_input_tokens_seen": 18700390, - "step": 874, - "time_per_iteration": 2.8240153789520264 - }, - { - "auxiliary_loss_clip": 0.01197247, - "auxiliary_loss_mlp": 0.01063458, - "balance_loss_clip": 1.0614326, - "balance_loss_mlp": 1.03682709, - "epoch": 0.05260784608447317, - "flos": 32230221546240.0, - "grad_norm": 1.858649685360537, - "language_loss": 0.74537963, - "learning_rate": 3.994641402486977e-06, - "loss": 0.76798666, - "num_input_tokens_seen": 18721280, - "step": 875, - "time_per_iteration": 2.9111931324005127 - }, - { - "auxiliary_loss_clip": 0.01206205, - "auxiliary_loss_mlp": 0.01058912, - "balance_loss_clip": 1.06306934, - "balance_loss_mlp": 1.03210175, - "epoch": 0.052667969337141136, - "flos": 24463211040000.0, - "grad_norm": 1.7697857141051123, - "language_loss": 0.92843151, - "learning_rate": 3.99461287422531e-06, - "loss": 0.95108265, - "num_input_tokens_seen": 18741545, - "step": 876, - "time_per_iteration": 2.800252676010132 - }, - { - "auxiliary_loss_clip": 0.01100151, - "auxiliary_loss_mlp": 0.01006341, - "balance_loss_clip": 1.02669787, - "balance_loss_mlp": 1.0020256, - "epoch": 0.05272809258980911, - "flos": 57784329567360.0, - "grad_norm": 0.8383495859932864, - "language_loss": 0.62929404, - "learning_rate": 3.994584270327722e-06, - "loss": 0.65035897, - "num_input_tokens_seen": 18801400, - "step": 877, - "time_per_iteration": 3.2090368270874023 - }, - { - "auxiliary_loss_clip": 0.01200578, - "auxiliary_loss_mlp": 0.0106702, - "balance_loss_clip": 1.06150424, - "balance_loss_mlp": 1.03931606, - "epoch": 0.05278821584247708, - "flos": 17420805596160.0, - "grad_norm": 2.042786693643985, - "language_loss": 0.85383844, - "learning_rate": 3.994555590795299e-06, - "loss": 0.87651443, - "num_input_tokens_seen": 18819670, - "step": 878, - "time_per_iteration": 2.823835849761963 - }, - { - "auxiliary_loss_clip": 0.0122514, - "auxiliary_loss_mlp": 0.01061117, - "balance_loss_clip": 1.0635035, - "balance_loss_mlp": 1.03551078, - "epoch": 0.052848339095145046, - "flos": 26137258346880.0, - "grad_norm": 1.7462717669338121, - "language_loss": 0.83076209, - "learning_rate": 3.9945268356291275e-06, - "loss": 0.8536247, - "num_input_tokens_seen": 18840580, - "step": 879, - "time_per_iteration": 2.743673086166382 - }, - { - "auxiliary_loss_clip": 0.0119139, - "auxiliary_loss_mlp": 0.01066471, - "balance_loss_clip": 1.06152987, - "balance_loss_mlp": 1.04013824, - "epoch": 0.05290846234781302, - "flos": 16472081623680.0, - "grad_norm": 1.9601789563010765, - "language_loss": 0.84284604, - "learning_rate": 3.9944980048302985e-06, - "loss": 0.86542469, - "num_input_tokens_seen": 18859295, - "step": 880, - "time_per_iteration": 2.7560529708862305 - }, - { - "auxiliary_loss_clip": 0.01184956, - "auxiliary_loss_mlp": 0.01065063, - "balance_loss_clip": 1.05969453, - "balance_loss_mlp": 1.03887296, - "epoch": 0.05296858560048098, - "flos": 19865173000320.0, - "grad_norm": 2.4477328752698564, - "language_loss": 0.86870736, - "learning_rate": 3.994469098399906e-06, - "loss": 0.89120758, - "num_input_tokens_seen": 18877485, - "step": 881, - "time_per_iteration": 2.855395555496216 - }, - { - "auxiliary_loss_clip": 0.01207858, - "auxiliary_loss_mlp": 0.01070235, - "balance_loss_clip": 1.05984437, - "balance_loss_mlp": 1.04238808, - "epoch": 0.053028708853148955, - "flos": 24388588535040.0, - "grad_norm": 1.7611192020675561, - "language_loss": 0.87967896, - "learning_rate": 3.994440116339046e-06, - "loss": 0.90245986, - "num_input_tokens_seen": 18898275, - "step": 882, - "time_per_iteration": 2.8480119705200195 - }, - { - "auxiliary_loss_clip": 0.01224906, - "auxiliary_loss_mlp": 0.01057944, - "balance_loss_clip": 1.06268644, - "balance_loss_mlp": 1.03059733, - "epoch": 0.05308883210581693, - "flos": 36393166143360.0, - "grad_norm": 2.3555018967788635, - "language_loss": 0.69469339, - "learning_rate": 3.994411058648816e-06, - "loss": 0.71752191, - "num_input_tokens_seen": 18920665, - "step": 883, - "time_per_iteration": 2.8808236122131348 - }, - { - "auxiliary_loss_clip": 0.01166777, - "auxiliary_loss_mlp": 0.01063991, - "balance_loss_clip": 1.05333591, - "balance_loss_mlp": 1.03855157, - "epoch": 0.05314895535848489, - "flos": 22855095146880.0, - "grad_norm": 2.039016812023355, - "language_loss": 0.76100993, - "learning_rate": 3.994381925330319e-06, - "loss": 0.78331757, - "num_input_tokens_seen": 18939835, - "step": 884, - "time_per_iteration": 2.8462212085723877 - }, - { - "auxiliary_loss_clip": 0.01172569, - "auxiliary_loss_mlp": 0.01066856, - "balance_loss_clip": 1.06269383, - "balance_loss_mlp": 1.04147613, - "epoch": 0.053209078611152864, - "flos": 12860330204160.0, - "grad_norm": 1.9865896222141148, - "language_loss": 0.86195529, - "learning_rate": 3.994352716384659e-06, - "loss": 0.88434947, - "num_input_tokens_seen": 18958405, - "step": 885, - "time_per_iteration": 2.7825753688812256 - }, - { - "auxiliary_loss_clip": 0.0118405, - "auxiliary_loss_mlp": 0.01068976, - "balance_loss_clip": 1.05229151, - "balance_loss_mlp": 1.04203486, - "epoch": 0.05326920186382083, - "flos": 12164596698240.0, - "grad_norm": 2.608647457747672, - "language_loss": 0.85971159, - "learning_rate": 3.994323431812945e-06, - "loss": 0.88224185, - "num_input_tokens_seen": 18975445, - "step": 886, - "time_per_iteration": 2.7393639087677 - }, - { - "auxiliary_loss_clip": 0.0117343, - "auxiliary_loss_mlp": 0.01065966, - "balance_loss_clip": 1.05620933, - "balance_loss_mlp": 1.03879774, - "epoch": 0.0533293251164888, - "flos": 22704485420160.0, - "grad_norm": 2.040002880698432, - "language_loss": 0.8961553, - "learning_rate": 3.994294071616286e-06, - "loss": 0.91854936, - "num_input_tokens_seen": 18991930, - "step": 887, - "time_per_iteration": 2.8606581687927246 - }, - { - "auxiliary_loss_clip": 0.01144444, - "auxiliary_loss_mlp": 0.01072438, - "balance_loss_clip": 1.04453194, - "balance_loss_mlp": 1.04411352, - "epoch": 0.053389448369156774, - "flos": 26940939200640.0, - "grad_norm": 2.062562868466936, - "language_loss": 0.74852538, - "learning_rate": 3.994264635795796e-06, - "loss": 0.77069414, - "num_input_tokens_seen": 19009790, - "step": 888, - "time_per_iteration": 2.8675312995910645 - }, - { - "auxiliary_loss_clip": 0.01164085, - "auxiliary_loss_mlp": 0.01072324, - "balance_loss_clip": 1.05659473, - "balance_loss_mlp": 1.04525173, - "epoch": 0.05344957162182474, - "flos": 25556331686400.0, - "grad_norm": 1.7884280759117637, - "language_loss": 0.88440782, - "learning_rate": 3.994235124352592e-06, - "loss": 0.9067719, - "num_input_tokens_seen": 19030170, - "step": 889, - "time_per_iteration": 2.9419636726379395 - }, - { - "auxiliary_loss_clip": 0.0121577, - "auxiliary_loss_mlp": 0.0105125, - "balance_loss_clip": 1.06085157, - "balance_loss_mlp": 1.02607334, - "epoch": 0.05350969487449271, - "flos": 19719591177600.0, - "grad_norm": 1.9333059575084248, - "language_loss": 0.88386381, - "learning_rate": 3.994205537287791e-06, - "loss": 0.90653402, - "num_input_tokens_seen": 19048075, - "step": 890, - "time_per_iteration": 2.7030327320098877 - }, - { - "auxiliary_loss_clip": 0.01195034, - "auxiliary_loss_mlp": 0.01069003, - "balance_loss_clip": 1.05835462, - "balance_loss_mlp": 1.04450595, - "epoch": 0.053569818127160676, - "flos": 27016351804800.0, - "grad_norm": 2.435204176890571, - "language_loss": 0.93450797, - "learning_rate": 3.994175874602517e-06, - "loss": 0.95714831, - "num_input_tokens_seen": 19067465, - "step": 891, - "time_per_iteration": 2.81527042388916 - }, - { - "auxiliary_loss_clip": 0.01190797, - "auxiliary_loss_mlp": 0.01066955, - "balance_loss_clip": 1.05605483, - "balance_loss_mlp": 1.03909576, - "epoch": 0.05362994137982865, - "flos": 13188338225280.0, - "grad_norm": 2.3400199158693087, - "language_loss": 0.71625131, - "learning_rate": 3.994146136297893e-06, - "loss": 0.73882878, - "num_input_tokens_seen": 19085505, - "step": 892, - "time_per_iteration": 2.825984239578247 - }, - { - "auxiliary_loss_clip": 0.01191313, - "auxiliary_loss_mlp": 0.0078394, - "balance_loss_clip": 1.05727172, - "balance_loss_mlp": 1.00024366, - "epoch": 0.05369006463249662, - "flos": 28658008022400.0, - "grad_norm": 1.6058100223173828, - "language_loss": 0.82331586, - "learning_rate": 3.994116322375049e-06, - "loss": 0.84306836, - "num_input_tokens_seen": 19104360, - "step": 893, - "time_per_iteration": 2.8618266582489014 - }, - { - "auxiliary_loss_clip": 0.01192677, - "auxiliary_loss_mlp": 0.01063531, - "balance_loss_clip": 1.0572021, - "balance_loss_mlp": 1.03850877, - "epoch": 0.053750187885164585, - "flos": 28913153304960.0, - "grad_norm": 2.0228714136718122, - "language_loss": 0.82052565, - "learning_rate": 3.994086432835114e-06, - "loss": 0.84308773, - "num_input_tokens_seen": 19124680, - "step": 894, - "time_per_iteration": 2.8347885608673096 - }, - { - "auxiliary_loss_clip": 0.0120111, - "auxiliary_loss_mlp": 0.01065233, - "balance_loss_clip": 1.0570271, - "balance_loss_mlp": 1.03997254, - "epoch": 0.05381031113783256, - "flos": 15158828476800.0, - "grad_norm": 2.260594705980758, - "language_loss": 0.76133072, - "learning_rate": 3.994056467679221e-06, - "loss": 0.78399414, - "num_input_tokens_seen": 19142895, - "step": 895, - "time_per_iteration": 2.7288858890533447 - }, - { - "auxiliary_loss_clip": 0.01200143, - "auxiliary_loss_mlp": 0.01060588, - "balance_loss_clip": 1.06422663, - "balance_loss_mlp": 1.03547084, - "epoch": 0.05387043439050053, - "flos": 21835232288640.0, - "grad_norm": 2.0450623179174974, - "language_loss": 0.86767507, - "learning_rate": 3.9940264269085065e-06, - "loss": 0.89028239, - "num_input_tokens_seen": 19163125, - "step": 896, - "time_per_iteration": 4.404265642166138 - }, - { - "auxiliary_loss_clip": 0.0122203, - "auxiliary_loss_mlp": 0.00782931, - "balance_loss_clip": 1.06062579, - "balance_loss_mlp": 1.0002867, - "epoch": 0.053930557643168495, - "flos": 17310308382720.0, - "grad_norm": 3.0866230440609805, - "language_loss": 0.8797363, - "learning_rate": 3.9939963105241115e-06, - "loss": 0.89978594, - "num_input_tokens_seen": 19179385, - "step": 897, - "time_per_iteration": 4.843130588531494 - }, - { - "auxiliary_loss_clip": 0.01201639, - "auxiliary_loss_mlp": 0.01063724, - "balance_loss_clip": 1.05896854, - "balance_loss_mlp": 1.03658032, - "epoch": 0.05399068089583647, - "flos": 17348481561600.0, - "grad_norm": 1.8270040910241792, - "language_loss": 0.90170419, - "learning_rate": 3.993966118527175e-06, - "loss": 0.92435783, - "num_input_tokens_seen": 19198725, - "step": 898, - "time_per_iteration": 2.695235252380371 - }, - { - "auxiliary_loss_clip": 0.01200189, - "auxiliary_loss_mlp": 0.01076438, - "balance_loss_clip": 1.05787873, - "balance_loss_mlp": 1.05105805, - "epoch": 0.05405080414850443, - "flos": 17486952491520.0, - "grad_norm": 2.793625116693953, - "language_loss": 0.91544139, - "learning_rate": 3.993935850918845e-06, - "loss": 0.93820769, - "num_input_tokens_seen": 19212380, - "step": 899, - "time_per_iteration": 2.7509548664093018 - }, - { - "auxiliary_loss_clip": 0.01186479, - "auxiliary_loss_mlp": 0.01068594, - "balance_loss_clip": 1.05614042, - "balance_loss_mlp": 1.04154527, - "epoch": 0.054110927401172404, - "flos": 24496787278080.0, - "grad_norm": 1.983572968760697, - "language_loss": 0.75742769, - "learning_rate": 3.9939055077002665e-06, - "loss": 0.77997845, - "num_input_tokens_seen": 19232235, - "step": 900, - "time_per_iteration": 2.771371364593506 - }, - { - "auxiliary_loss_clip": 0.01211506, - "auxiliary_loss_mlp": 0.01058176, - "balance_loss_clip": 1.05839145, - "balance_loss_mlp": 1.03401244, - "epoch": 0.054171050653840376, - "flos": 22930040874240.0, - "grad_norm": 2.192527627735503, - "language_loss": 0.74331856, - "learning_rate": 3.993875088872592e-06, - "loss": 0.76601535, - "num_input_tokens_seen": 19251460, - "step": 901, - "time_per_iteration": 2.859912157058716 - }, - { - "auxiliary_loss_clip": 0.01177502, - "auxiliary_loss_mlp": 0.01065445, - "balance_loss_clip": 1.0569309, - "balance_loss_mlp": 1.04166329, - "epoch": 0.05423117390650834, - "flos": 12933192942720.0, - "grad_norm": 2.352700712836257, - "language_loss": 0.85287452, - "learning_rate": 3.9938445944369745e-06, - "loss": 0.87530404, - "num_input_tokens_seen": 19269060, - "step": 902, - "time_per_iteration": 2.7940642833709717 - }, - { - "auxiliary_loss_clip": 0.01161069, - "auxiliary_loss_mlp": 0.01066664, - "balance_loss_clip": 1.04903233, - "balance_loss_mlp": 1.04112983, - "epoch": 0.05429129715917631, - "flos": 19901335017600.0, - "grad_norm": 1.9620711230312637, - "language_loss": 0.86385572, - "learning_rate": 3.993814024394569e-06, - "loss": 0.88613302, - "num_input_tokens_seen": 19288620, - "step": 903, - "time_per_iteration": 2.9258980751037598 - }, - { - "auxiliary_loss_clip": 0.0121005, - "auxiliary_loss_mlp": 0.01059616, - "balance_loss_clip": 1.06094384, - "balance_loss_mlp": 1.03534508, - "epoch": 0.05435142041184428, - "flos": 16908611610240.0, - "grad_norm": 2.175127974944855, - "language_loss": 0.74927866, - "learning_rate": 3.993783378746537e-06, - "loss": 0.7719754, - "num_input_tokens_seen": 19306615, - "step": 904, - "time_per_iteration": 2.7239954471588135 - }, - { - "auxiliary_loss_clip": 0.01208402, - "auxiliary_loss_mlp": 0.01067543, - "balance_loss_clip": 1.06052148, - "balance_loss_mlp": 1.04325962, - "epoch": 0.05441154366451225, - "flos": 23948323534080.0, - "grad_norm": 2.5191963984804535, - "language_loss": 0.85946918, - "learning_rate": 3.993752657494039e-06, - "loss": 0.88222867, - "num_input_tokens_seen": 19321680, - "step": 905, - "time_per_iteration": 2.693896532058716 - }, - { - "auxiliary_loss_clip": 0.01198232, - "auxiliary_loss_mlp": 0.01078072, - "balance_loss_clip": 1.06483209, - "balance_loss_mlp": 1.05400348, - "epoch": 0.05447166691718022, - "flos": 19975382904960.0, - "grad_norm": 1.7753581401878566, - "language_loss": 0.74413162, - "learning_rate": 3.993721860638241e-06, - "loss": 0.7668947, - "num_input_tokens_seen": 19339760, - "step": 906, - "time_per_iteration": 2.6679019927978516 - }, - { - "auxiliary_loss_clip": 0.01192373, - "auxiliary_loss_mlp": 0.01064381, - "balance_loss_clip": 1.05954027, - "balance_loss_mlp": 1.0397284, - "epoch": 0.05453179016984819, - "flos": 24936513575040.0, - "grad_norm": 2.3037248114268896, - "language_loss": 0.87340188, - "learning_rate": 3.993690988180309e-06, - "loss": 0.89596951, - "num_input_tokens_seen": 19359585, - "step": 907, - "time_per_iteration": 2.7363240718841553 - }, - { - "auxiliary_loss_clip": 0.01205519, - "auxiliary_loss_mlp": 0.01068463, - "balance_loss_clip": 1.0616293, - "balance_loss_mlp": 1.04332149, - "epoch": 0.05459191342251616, - "flos": 18115102558080.0, - "grad_norm": 1.6666873589767146, - "language_loss": 0.86928803, - "learning_rate": 3.9936600401214165e-06, - "loss": 0.89202785, - "num_input_tokens_seen": 19378590, - "step": 908, - "time_per_iteration": 2.6266026496887207 - }, - { - "auxiliary_loss_clip": 0.01198848, - "auxiliary_loss_mlp": 0.01067336, - "balance_loss_clip": 1.05974221, - "balance_loss_mlp": 1.04107404, - "epoch": 0.054652036675184125, - "flos": 19208295031680.0, - "grad_norm": 2.1282794409977215, - "language_loss": 0.89792144, - "learning_rate": 3.9936290164627345e-06, - "loss": 0.92058325, - "num_input_tokens_seen": 19397910, - "step": 909, - "time_per_iteration": 2.7163166999816895 - }, - { - "auxiliary_loss_clip": 0.01200393, - "auxiliary_loss_mlp": 0.01073374, - "balance_loss_clip": 1.06157839, - "balance_loss_mlp": 1.04742169, - "epoch": 0.0547121599278521, - "flos": 16325745615360.0, - "grad_norm": 2.095924869989121, - "language_loss": 0.70949811, - "learning_rate": 3.99359791720544e-06, - "loss": 0.73223579, - "num_input_tokens_seen": 19415950, - "step": 910, - "time_per_iteration": 2.6697354316711426 - }, - { - "auxiliary_loss_clip": 0.01187784, - "auxiliary_loss_mlp": 0.01054671, - "balance_loss_clip": 1.05651259, - "balance_loss_mlp": 1.02975583, - "epoch": 0.05477228318052007, - "flos": 20339014239360.0, - "grad_norm": 1.6633724338567386, - "language_loss": 0.83651805, - "learning_rate": 3.993566742350714e-06, - "loss": 0.85894263, - "num_input_tokens_seen": 19435275, - "step": 911, - "time_per_iteration": 2.692798137664795 - }, - { - "auxiliary_loss_clip": 0.01187113, - "auxiliary_loss_mlp": 0.01073028, - "balance_loss_clip": 1.05334687, - "balance_loss_mlp": 1.04719508, - "epoch": 0.054832406433188034, - "flos": 21973092687360.0, - "grad_norm": 2.283907419545301, - "language_loss": 0.76320881, - "learning_rate": 3.993535491899736e-06, - "loss": 0.78581023, - "num_input_tokens_seen": 19452090, - "step": 912, - "time_per_iteration": 2.6653189659118652 - }, - { - "auxiliary_loss_clip": 0.01186313, - "auxiliary_loss_mlp": 0.01051652, - "balance_loss_clip": 1.05707574, - "balance_loss_mlp": 1.0271548, - "epoch": 0.054892529685856006, - "flos": 16398931576320.0, - "grad_norm": 2.366460016615147, - "language_loss": 0.82826668, - "learning_rate": 3.993504165853694e-06, - "loss": 0.85064626, - "num_input_tokens_seen": 19470865, - "step": 913, - "time_per_iteration": 2.6826348304748535 - }, - { - "auxiliary_loss_clip": 0.01194515, - "auxiliary_loss_mlp": 0.01060483, - "balance_loss_clip": 1.0581125, - "balance_loss_mlp": 1.03651023, - "epoch": 0.05495265293852397, - "flos": 23912341084800.0, - "grad_norm": 3.3338391252510586, - "language_loss": 0.8373239, - "learning_rate": 3.993472764213772e-06, - "loss": 0.85987389, - "num_input_tokens_seen": 19492145, - "step": 914, - "time_per_iteration": 2.7358829975128174 - }, - { - "auxiliary_loss_clip": 0.0120705, - "auxiliary_loss_mlp": 0.0078227, - "balance_loss_clip": 1.06039774, - "balance_loss_mlp": 1.00027478, - "epoch": 0.055012776191191944, - "flos": 23586954756480.0, - "grad_norm": 2.520244909384168, - "language_loss": 0.90146536, - "learning_rate": 3.9934412869811655e-06, - "loss": 0.92135859, - "num_input_tokens_seen": 19511015, - "step": 915, - "time_per_iteration": 2.9398341178894043 - }, - { - "auxiliary_loss_clip": 0.01201461, - "auxiliary_loss_mlp": 0.01059252, - "balance_loss_clip": 1.06274199, - "balance_loss_mlp": 1.03558862, - "epoch": 0.055072899443859916, - "flos": 17528501548800.0, - "grad_norm": 2.182721785653499, - "language_loss": 0.89710975, - "learning_rate": 3.993409734157064e-06, - "loss": 0.91971689, - "num_input_tokens_seen": 19529040, - "step": 916, - "time_per_iteration": 2.7210159301757812 - }, - { - "auxiliary_loss_clip": 0.01175226, - "auxiliary_loss_mlp": 0.01066073, - "balance_loss_clip": 1.05741024, - "balance_loss_mlp": 1.04103947, - "epoch": 0.05513302269652788, - "flos": 21687172427520.0, - "grad_norm": 1.7899379897310368, - "language_loss": 0.8016991, - "learning_rate": 3.993378105742666e-06, - "loss": 0.82411212, - "num_input_tokens_seen": 19549540, - "step": 917, - "time_per_iteration": 2.7923104763031006 - }, - { - "auxiliary_loss_clip": 0.01139072, - "auxiliary_loss_mlp": 0.0105947, - "balance_loss_clip": 1.05135942, - "balance_loss_mlp": 1.03414989, - "epoch": 0.05519314594919585, - "flos": 21613340021760.0, - "grad_norm": 2.106744179667805, - "language_loss": 0.79437333, - "learning_rate": 3.9933464017391705e-06, - "loss": 0.81635869, - "num_input_tokens_seen": 19567570, - "step": 918, - "time_per_iteration": 2.8051092624664307 - }, - { - "auxiliary_loss_clip": 0.01196947, - "auxiliary_loss_mlp": 0.01055679, - "balance_loss_clip": 1.05616307, - "balance_loss_mlp": 1.03166997, - "epoch": 0.05525326920186382, - "flos": 21798567480960.0, - "grad_norm": 2.454030193031321, - "language_loss": 0.89019686, - "learning_rate": 3.99331462214778e-06, - "loss": 0.91272312, - "num_input_tokens_seen": 19585330, - "step": 919, - "time_per_iteration": 2.6846773624420166 - }, - { - "auxiliary_loss_clip": 0.01213326, - "auxiliary_loss_mlp": 0.01069349, - "balance_loss_clip": 1.05950904, - "balance_loss_mlp": 1.04417229, - "epoch": 0.05531339245453179, - "flos": 28439635288320.0, - "grad_norm": 2.246354931091656, - "language_loss": 0.8746047, - "learning_rate": 3.993282766969699e-06, - "loss": 0.89743137, - "num_input_tokens_seen": 19604970, - "step": 920, - "time_per_iteration": 2.6699845790863037 - }, - { - "auxiliary_loss_clip": 0.01190424, - "auxiliary_loss_mlp": 0.0106036, - "balance_loss_clip": 1.06023288, - "balance_loss_mlp": 1.03657782, - "epoch": 0.05537351570719976, - "flos": 37375143131520.0, - "grad_norm": 1.975714125194334, - "language_loss": 0.6568011, - "learning_rate": 3.993250836206136e-06, - "loss": 0.67930895, - "num_input_tokens_seen": 19626235, - "step": 921, - "time_per_iteration": 2.833644390106201 - }, - { - "auxiliary_loss_clip": 0.01209678, - "auxiliary_loss_mlp": 0.01065483, - "balance_loss_clip": 1.06060767, - "balance_loss_mlp": 1.03874445, - "epoch": 0.05543363895986773, - "flos": 20084479488000.0, - "grad_norm": 1.7242493696651606, - "language_loss": 0.71861136, - "learning_rate": 3.993218829858301e-06, - "loss": 0.74136293, - "num_input_tokens_seen": 19644305, - "step": 922, - "time_per_iteration": 2.6168808937072754 - }, - { - "auxiliary_loss_clip": 0.01187138, - "auxiliary_loss_mlp": 0.01067213, - "balance_loss_clip": 1.05423355, - "balance_loss_mlp": 1.04223895, - "epoch": 0.0554937622125357, - "flos": 24533200690560.0, - "grad_norm": 2.6848185900705412, - "language_loss": 0.82304025, - "learning_rate": 3.993186747927408e-06, - "loss": 0.8455838, - "num_input_tokens_seen": 19662130, - "step": 923, - "time_per_iteration": 2.7298316955566406 - }, - { - "auxiliary_loss_clip": 0.01202941, - "auxiliary_loss_mlp": 0.01064106, - "balance_loss_clip": 1.05725455, - "balance_loss_mlp": 1.03933442, - "epoch": 0.055553885465203665, - "flos": 14320063013760.0, - "grad_norm": 1.9334372940525173, - "language_loss": 0.78759122, - "learning_rate": 3.993154590414675e-06, - "loss": 0.81026167, - "num_input_tokens_seen": 19680715, - "step": 924, - "time_per_iteration": 2.6869630813598633 - }, - { - "auxiliary_loss_clip": 0.0116422, - "auxiliary_loss_mlp": 0.01053758, - "balance_loss_clip": 1.05395627, - "balance_loss_mlp": 1.02844954, - "epoch": 0.05561400871787164, - "flos": 27381132374400.0, - "grad_norm": 2.005203138116014, - "language_loss": 1.02005315, - "learning_rate": 3.993122357321319e-06, - "loss": 1.04223299, - "num_input_tokens_seen": 19700535, - "step": 925, - "time_per_iteration": 2.716089963912964 - }, - { - "auxiliary_loss_clip": 0.01163201, - "auxiliary_loss_mlp": 0.01052104, - "balance_loss_clip": 1.05070591, - "balance_loss_mlp": 1.02739179, - "epoch": 0.05567413197053961, - "flos": 23221096778880.0, - "grad_norm": 2.0106641835017482, - "language_loss": 0.80939209, - "learning_rate": 3.993090048648564e-06, - "loss": 0.83154511, - "num_input_tokens_seen": 19718825, - "step": 926, - "time_per_iteration": 2.895803451538086 - }, - { - "auxiliary_loss_clip": 0.01207515, - "auxiliary_loss_mlp": 0.01068168, - "balance_loss_clip": 1.05892682, - "balance_loss_mlp": 1.0419066, - "epoch": 0.055734255223207574, - "flos": 25264952559360.0, - "grad_norm": 2.9732625845644045, - "language_loss": 0.73220479, - "learning_rate": 3.993057664397634e-06, - "loss": 0.75496161, - "num_input_tokens_seen": 19739080, - "step": 927, - "time_per_iteration": 2.677725076675415 - }, - { - "auxiliary_loss_clip": 0.01101002, - "auxiliary_loss_mlp": 0.01015011, - "balance_loss_clip": 1.02922702, - "balance_loss_mlp": 1.01014709, - "epoch": 0.055794378475875546, - "flos": 66503116702080.0, - "grad_norm": 0.8406874373244947, - "language_loss": 0.59841412, - "learning_rate": 3.9930252045697585e-06, - "loss": 0.61957431, - "num_input_tokens_seen": 19802960, - "step": 928, - "time_per_iteration": 3.187382221221924 - }, - { - "auxiliary_loss_clip": 0.01202438, - "auxiliary_loss_mlp": 0.01065066, - "balance_loss_clip": 1.05921853, - "balance_loss_mlp": 1.04070008, - "epoch": 0.05585450172854351, - "flos": 25337635729920.0, - "grad_norm": 2.0668361967965994, - "language_loss": 0.95411372, - "learning_rate": 3.992992669166168e-06, - "loss": 0.97678876, - "num_input_tokens_seen": 19822765, - "step": 929, - "time_per_iteration": 2.6930506229400635 - }, - { - "auxiliary_loss_clip": 0.01171806, - "auxiliary_loss_mlp": 0.01068051, - "balance_loss_clip": 1.05343258, - "balance_loss_mlp": 1.04101443, - "epoch": 0.05591462498121148, - "flos": 33911738881920.0, - "grad_norm": 2.1442452677256627, - "language_loss": 0.71756601, - "learning_rate": 3.992960058188094e-06, - "loss": 0.7399646, - "num_input_tokens_seen": 19843590, - "step": 930, - "time_per_iteration": 2.803219795227051 - }, - { - "auxiliary_loss_clip": 0.01188277, - "auxiliary_loss_mlp": 0.01058888, - "balance_loss_clip": 1.05783677, - "balance_loss_mlp": 1.03377056, - "epoch": 0.055974748233879455, - "flos": 17930880679680.0, - "grad_norm": 2.381261552273062, - "language_loss": 0.85279298, - "learning_rate": 3.992927371636776e-06, - "loss": 0.87526459, - "num_input_tokens_seen": 19860230, - "step": 931, - "time_per_iteration": 2.6215872764587402 - }, - { - "auxiliary_loss_clip": 0.01203533, - "auxiliary_loss_mlp": 0.00783076, - "balance_loss_clip": 1.05677414, - "balance_loss_mlp": 1.00025761, - "epoch": 0.05603487148654742, - "flos": 24021976371840.0, - "grad_norm": 2.2861197477099973, - "language_loss": 0.83645165, - "learning_rate": 3.9928946095134525e-06, - "loss": 0.85631776, - "num_input_tokens_seen": 19880795, - "step": 932, - "time_per_iteration": 2.664062261581421 - }, - { - "auxiliary_loss_clip": 0.01200637, - "auxiliary_loss_mlp": 0.0107041, - "balance_loss_clip": 1.05897784, - "balance_loss_mlp": 1.04407716, - "epoch": 0.05609499473921539, - "flos": 17307758517120.0, - "grad_norm": 1.8036739452122519, - "language_loss": 0.73694205, - "learning_rate": 3.992861771819365e-06, - "loss": 0.7596525, - "num_input_tokens_seen": 19897960, - "step": 933, - "time_per_iteration": 2.631620168685913 - }, - { - "auxiliary_loss_clip": 0.01153445, - "auxiliary_loss_mlp": 0.01076903, - "balance_loss_clip": 1.04885209, - "balance_loss_mlp": 1.05060577, - "epoch": 0.05615511799188336, - "flos": 20994742972800.0, - "grad_norm": 2.385249039382274, - "language_loss": 0.86660421, - "learning_rate": 3.99282885855576e-06, - "loss": 0.88890779, - "num_input_tokens_seen": 19913315, - "step": 934, - "time_per_iteration": 2.7739439010620117 - }, - { - "auxiliary_loss_clip": 0.01164295, - "auxiliary_loss_mlp": 0.0108083, - "balance_loss_clip": 1.05509257, - "balance_loss_mlp": 1.0557723, - "epoch": 0.05621524124455133, - "flos": 17273535834240.0, - "grad_norm": 2.2740258482680433, - "language_loss": 0.80388415, - "learning_rate": 3.992795869723885e-06, - "loss": 0.82633543, - "num_input_tokens_seen": 19928790, - "step": 935, - "time_per_iteration": 5.93512487411499 - }, - { - "auxiliary_loss_clip": 0.01093927, - "auxiliary_loss_mlp": 0.01019701, - "balance_loss_clip": 1.02288604, - "balance_loss_mlp": 1.01540911, - "epoch": 0.0562753644972193, - "flos": 58719370458240.0, - "grad_norm": 0.820561718243334, - "language_loss": 0.69191676, - "learning_rate": 3.99276280532499e-06, - "loss": 0.71305299, - "num_input_tokens_seen": 19988785, - "step": 936, - "time_per_iteration": 4.862478733062744 - }, - { - "auxiliary_loss_clip": 0.01213648, - "auxiliary_loss_mlp": 0.01068507, - "balance_loss_clip": 1.05806684, - "balance_loss_mlp": 1.04429567, - "epoch": 0.05633548774988727, - "flos": 17457039440640.0, - "grad_norm": 1.9573264311231433, - "language_loss": 0.7572521, - "learning_rate": 3.992729665360331e-06, - "loss": 0.78007358, - "num_input_tokens_seen": 20007685, - "step": 937, - "time_per_iteration": 4.219425916671753 - }, - { - "auxiliary_loss_clip": 0.01085529, - "auxiliary_loss_mlp": 0.01013805, - "balance_loss_clip": 1.02476001, - "balance_loss_mlp": 1.00944233, - "epoch": 0.05639561100255524, - "flos": 70654928083200.0, - "grad_norm": 0.9053055994078011, - "language_loss": 0.64309287, - "learning_rate": 3.992696449831162e-06, - "loss": 0.66408622, - "num_input_tokens_seen": 20072750, - "step": 938, - "time_per_iteration": 3.1298794746398926 - }, - { - "auxiliary_loss_clip": 0.01171203, - "auxiliary_loss_mlp": 0.01068815, - "balance_loss_clip": 1.05175185, - "balance_loss_mlp": 1.0426966, - "epoch": 0.056455734255223204, - "flos": 20485996692480.0, - "grad_norm": 2.7427540631348832, - "language_loss": 0.79751205, - "learning_rate": 3.992663158738745e-06, - "loss": 0.8199122, - "num_input_tokens_seen": 20089070, - "step": 939, - "time_per_iteration": 2.6863484382629395 - }, - { - "auxiliary_loss_clip": 0.01175528, - "auxiliary_loss_mlp": 0.01068297, - "balance_loss_clip": 1.0509069, - "balance_loss_mlp": 1.04338217, - "epoch": 0.056515857507891176, - "flos": 22053569109120.0, - "grad_norm": 1.8374791395473227, - "language_loss": 0.73919088, - "learning_rate": 3.992629792084341e-06, - "loss": 0.76162916, - "num_input_tokens_seen": 20108790, - "step": 940, - "time_per_iteration": 2.7111120223999023 - }, - { - "auxiliary_loss_clip": 0.01198483, - "auxiliary_loss_mlp": 0.01058511, - "balance_loss_clip": 1.05900669, - "balance_loss_mlp": 1.03252339, - "epoch": 0.05657598076055915, - "flos": 24025316336640.0, - "grad_norm": 2.2993716569389813, - "language_loss": 0.70622003, - "learning_rate": 3.992596349869216e-06, - "loss": 0.72878999, - "num_input_tokens_seen": 20128455, - "step": 941, - "time_per_iteration": 2.657594680786133 - }, - { - "auxiliary_loss_clip": 0.01135396, - "auxiliary_loss_mlp": 0.01059543, - "balance_loss_clip": 1.04961574, - "balance_loss_mlp": 1.03382993, - "epoch": 0.05663610401322711, - "flos": 20480609652480.0, - "grad_norm": 2.0678542992190847, - "language_loss": 0.80921417, - "learning_rate": 3.992562832094637e-06, - "loss": 0.83116359, - "num_input_tokens_seen": 20145775, - "step": 942, - "time_per_iteration": 2.7379891872406006 - }, - { - "auxiliary_loss_clip": 0.01186767, - "auxiliary_loss_mlp": 0.01062055, - "balance_loss_clip": 1.05228579, - "balance_loss_mlp": 1.03554332, - "epoch": 0.056696227265895086, - "flos": 21069042255360.0, - "grad_norm": 2.245249922529115, - "language_loss": 0.88858449, - "learning_rate": 3.9925292387618755e-06, - "loss": 0.91107273, - "num_input_tokens_seen": 20164315, - "step": 943, - "time_per_iteration": 2.6502583026885986 - }, - { - "auxiliary_loss_clip": 0.01199122, - "auxiliary_loss_mlp": 0.0105963, - "balance_loss_clip": 1.05991781, - "balance_loss_mlp": 1.03534663, - "epoch": 0.05675635051856306, - "flos": 17821317219840.0, - "grad_norm": 2.5514256959015995, - "language_loss": 0.74771839, - "learning_rate": 3.992495569872206e-06, - "loss": 0.77030593, - "num_input_tokens_seen": 20182760, - "step": 944, - "time_per_iteration": 2.676079034805298 - }, - { - "auxiliary_loss_clip": 0.01204502, - "auxiliary_loss_mlp": 0.01064591, - "balance_loss_clip": 1.05980551, - "balance_loss_mlp": 1.04085672, - "epoch": 0.05681647377123102, - "flos": 23114945111040.0, - "grad_norm": 1.5959266123312272, - "language_loss": 0.79406166, - "learning_rate": 3.992461825426906e-06, - "loss": 0.81675267, - "num_input_tokens_seen": 20203830, - "step": 945, - "time_per_iteration": 2.734299421310425 - }, - { - "auxiliary_loss_clip": 0.01195984, - "auxiliary_loss_mlp": 0.0105672, - "balance_loss_clip": 1.05686593, - "balance_loss_mlp": 1.03156662, - "epoch": 0.056876597023898995, - "flos": 16070528505600.0, - "grad_norm": 2.5637081249861824, - "language_loss": 0.82651746, - "learning_rate": 3.992428005427252e-06, - "loss": 0.84904456, - "num_input_tokens_seen": 20220365, - "step": 946, - "time_per_iteration": 2.6636929512023926 - }, - { - "auxiliary_loss_clip": 0.0122014, - "auxiliary_loss_mlp": 0.01061449, - "balance_loss_clip": 1.06224144, - "balance_loss_mlp": 1.03524721, - "epoch": 0.05693672027656696, - "flos": 16835641130880.0, - "grad_norm": 1.8433174156507384, - "language_loss": 0.79031301, - "learning_rate": 3.992394109874529e-06, - "loss": 0.81312895, - "num_input_tokens_seen": 20238640, - "step": 947, - "time_per_iteration": 2.623671293258667 - }, - { - "auxiliary_loss_clip": 0.0117587, - "auxiliary_loss_mlp": 0.01061489, - "balance_loss_clip": 1.05605412, - "balance_loss_mlp": 1.03569245, - "epoch": 0.05699684352923493, - "flos": 21389113370880.0, - "grad_norm": 6.8661947111986725, - "language_loss": 0.85425055, - "learning_rate": 3.9923601387700225e-06, - "loss": 0.87662411, - "num_input_tokens_seen": 20251025, - "step": 948, - "time_per_iteration": 2.7410409450531006 - }, - { - "auxiliary_loss_clip": 0.01214005, - "auxiliary_loss_mlp": 0.01063231, - "balance_loss_clip": 1.05969238, - "balance_loss_mlp": 1.03598022, - "epoch": 0.057056966781902904, - "flos": 15560309767680.0, - "grad_norm": 3.649211317819821, - "language_loss": 0.87346625, - "learning_rate": 3.992326092115019e-06, - "loss": 0.89623863, - "num_input_tokens_seen": 20269775, - "step": 949, - "time_per_iteration": 2.6893157958984375 - }, - { - "auxiliary_loss_clip": 0.01194543, - "auxiliary_loss_mlp": 0.0106695, - "balance_loss_clip": 1.05799937, - "balance_loss_mlp": 1.04266715, - "epoch": 0.05711709003457087, - "flos": 19937856170880.0, - "grad_norm": 1.8324883776363103, - "language_loss": 0.7874645, - "learning_rate": 3.992291969910811e-06, - "loss": 0.8100794, - "num_input_tokens_seen": 20287715, - "step": 950, - "time_per_iteration": 2.623924732208252 - }, - { - "auxiliary_loss_clip": 0.01180518, - "auxiliary_loss_mlp": 0.01068771, - "balance_loss_clip": 1.05322623, - "balance_loss_mlp": 1.04384422, - "epoch": 0.05717721328723884, - "flos": 30332701774080.0, - "grad_norm": 3.8045132244795816, - "language_loss": 0.82477522, - "learning_rate": 3.992257772158691e-06, - "loss": 0.8472681, - "num_input_tokens_seen": 20307070, - "step": 951, - "time_per_iteration": 2.697479724884033 - }, - { - "auxiliary_loss_clip": 0.01167302, - "auxiliary_loss_mlp": 0.01061039, - "balance_loss_clip": 1.04906607, - "balance_loss_mlp": 1.03375173, - "epoch": 0.05723733653990681, - "flos": 23654358627840.0, - "grad_norm": 2.4180383362968634, - "language_loss": 0.86899263, - "learning_rate": 3.992223498859958e-06, - "loss": 0.89127606, - "num_input_tokens_seen": 20324945, - "step": 952, - "time_per_iteration": 2.707716226577759 - }, - { - "auxiliary_loss_clip": 0.01191405, - "auxiliary_loss_mlp": 0.01064705, - "balance_loss_clip": 1.05511189, - "balance_loss_mlp": 1.03630924, - "epoch": 0.05729745979257478, - "flos": 22055759838720.0, - "grad_norm": 2.195434645270168, - "language_loss": 0.79087842, - "learning_rate": 3.9921891500159084e-06, - "loss": 0.81343949, - "num_input_tokens_seen": 20346135, - "step": 953, - "time_per_iteration": 2.671255588531494 - }, - { - "auxiliary_loss_clip": 0.01190026, - "auxiliary_loss_mlp": 0.01066447, - "balance_loss_clip": 1.05984342, - "balance_loss_mlp": 1.04056656, - "epoch": 0.05735758304524275, - "flos": 19604353368960.0, - "grad_norm": 2.2066085695914466, - "language_loss": 0.86644447, - "learning_rate": 3.992154725627848e-06, - "loss": 0.88900924, - "num_input_tokens_seen": 20364450, - "step": 954, - "time_per_iteration": 2.671657085418701 - }, - { - "auxiliary_loss_clip": 0.01210569, - "auxiliary_loss_mlp": 0.01062619, - "balance_loss_clip": 1.06119955, - "balance_loss_mlp": 1.03723955, - "epoch": 0.057417706297910716, - "flos": 19099018880640.0, - "grad_norm": 2.2872795023766113, - "language_loss": 0.88071024, - "learning_rate": 3.9921202256970804e-06, - "loss": 0.90344214, - "num_input_tokens_seen": 20383500, - "step": 955, - "time_per_iteration": 2.69960880279541 - }, - { - "auxiliary_loss_clip": 0.01179864, - "auxiliary_loss_mlp": 0.01068889, - "balance_loss_clip": 1.0523231, - "balance_loss_mlp": 1.04209054, - "epoch": 0.05747782955057869, - "flos": 16654507822080.0, - "grad_norm": 1.9113555723128555, - "language_loss": 0.89160776, - "learning_rate": 3.992085650224914e-06, - "loss": 0.91409534, - "num_input_tokens_seen": 20400295, - "step": 956, - "time_per_iteration": 2.667868137359619 - }, - { - "auxiliary_loss_clip": 0.01167867, - "auxiliary_loss_mlp": 0.01060669, - "balance_loss_clip": 1.05720079, - "balance_loss_mlp": 1.03450251, - "epoch": 0.05753795280324665, - "flos": 14502058248960.0, - "grad_norm": 3.2877973901728095, - "language_loss": 0.75473189, - "learning_rate": 3.99205099921266e-06, - "loss": 0.77701724, - "num_input_tokens_seen": 20419085, - "step": 957, - "time_per_iteration": 2.6938796043395996 - }, - { - "auxiliary_loss_clip": 0.0117627, - "auxiliary_loss_mlp": 0.01072849, - "balance_loss_clip": 1.05432248, - "balance_loss_mlp": 1.0448705, - "epoch": 0.057598076055914625, - "flos": 18076318848000.0, - "grad_norm": 2.0004055711005257, - "language_loss": 0.79582155, - "learning_rate": 3.992016272661633e-06, - "loss": 0.81831264, - "num_input_tokens_seen": 20437465, - "step": 958, - "time_per_iteration": 2.6933834552764893 - }, - { - "auxiliary_loss_clip": 0.01186244, - "auxiliary_loss_mlp": 0.01059908, - "balance_loss_clip": 1.05851364, - "balance_loss_mlp": 1.03572011, - "epoch": 0.0576581993085826, - "flos": 22124600254080.0, - "grad_norm": 2.669863855173802, - "language_loss": 0.8840394, - "learning_rate": 3.99198147057315e-06, - "loss": 0.906501, - "num_input_tokens_seen": 20456235, - "step": 959, - "time_per_iteration": 2.7094578742980957 - }, - { - "auxiliary_loss_clip": 0.01169479, - "auxiliary_loss_mlp": 0.01063656, - "balance_loss_clip": 1.05511999, - "balance_loss_mlp": 1.03881276, - "epoch": 0.05771832256125056, - "flos": 33181746779520.0, - "grad_norm": 2.0960373333994764, - "language_loss": 0.78850955, - "learning_rate": 3.991946592948529e-06, - "loss": 0.8108409, - "num_input_tokens_seen": 20476825, - "step": 960, - "time_per_iteration": 2.822922945022583 - }, - { - "auxiliary_loss_clip": 0.0113413, - "auxiliary_loss_mlp": 0.01067189, - "balance_loss_clip": 1.05177355, - "balance_loss_mlp": 1.04020023, - "epoch": 0.057778445813918534, - "flos": 24170143973760.0, - "grad_norm": 2.063464892179025, - "language_loss": 0.92986894, - "learning_rate": 3.991911639789094e-06, - "loss": 0.95188212, - "num_input_tokens_seen": 20496965, - "step": 961, - "time_per_iteration": 2.793952226638794 - }, - { - "auxiliary_loss_clip": 0.01182535, - "auxiliary_loss_mlp": 0.0106764, - "balance_loss_clip": 1.0554297, - "balance_loss_mlp": 1.04091299, - "epoch": 0.0578385690665865, - "flos": 29643037666560.0, - "grad_norm": 2.0649993155313067, - "language_loss": 0.68164188, - "learning_rate": 3.991876611096169e-06, - "loss": 0.70414358, - "num_input_tokens_seen": 20518035, - "step": 962, - "time_per_iteration": 2.8396694660186768 - }, - { - "auxiliary_loss_clip": 0.01159524, - "auxiliary_loss_mlp": 0.01073851, - "balance_loss_clip": 1.05128908, - "balance_loss_mlp": 1.04909074, - "epoch": 0.05789869231925447, - "flos": 20885430908160.0, - "grad_norm": 2.2685465488517074, - "language_loss": 0.8848027, - "learning_rate": 3.991841506871084e-06, - "loss": 0.90713644, - "num_input_tokens_seen": 20534740, - "step": 963, - "time_per_iteration": 2.7077019214630127 - }, - { - "auxiliary_loss_clip": 0.01183778, - "auxiliary_loss_mlp": 0.01061251, - "balance_loss_clip": 1.06018209, - "balance_loss_mlp": 1.03516829, - "epoch": 0.057958815571922444, - "flos": 26031106679040.0, - "grad_norm": 2.392959969035536, - "language_loss": 0.85288298, - "learning_rate": 3.99180632711517e-06, - "loss": 0.87533331, - "num_input_tokens_seen": 20553485, - "step": 964, - "time_per_iteration": 2.7218217849731445 - }, - { - "auxiliary_loss_clip": 0.01188683, - "auxiliary_loss_mlp": 0.01069422, - "balance_loss_clip": 1.05959499, - "balance_loss_mlp": 1.04325557, - "epoch": 0.05801893882459041, - "flos": 18077683564800.0, - "grad_norm": 3.087349735715565, - "language_loss": 0.78159416, - "learning_rate": 3.99177107182976e-06, - "loss": 0.80417526, - "num_input_tokens_seen": 20572155, - "step": 965, - "time_per_iteration": 2.6902661323547363 - }, - { - "auxiliary_loss_clip": 0.01156531, - "auxiliary_loss_mlp": 0.0107109, - "balance_loss_clip": 1.0523715, - "balance_loss_mlp": 1.04462528, - "epoch": 0.05807906207725838, - "flos": 17748885444480.0, - "grad_norm": 1.9742288518319486, - "language_loss": 0.81403655, - "learning_rate": 3.99173574101619e-06, - "loss": 0.83631277, - "num_input_tokens_seen": 20590395, - "step": 966, - "time_per_iteration": 2.7423267364501953 - }, - { - "auxiliary_loss_clip": 0.01198908, - "auxiliary_loss_mlp": 0.01065021, - "balance_loss_clip": 1.058887, - "balance_loss_mlp": 1.04113197, - "epoch": 0.058139185329926346, - "flos": 18040372312320.0, - "grad_norm": 1.8776530142118544, - "language_loss": 0.76480806, - "learning_rate": 3.9917003346758035e-06, - "loss": 0.78744727, - "num_input_tokens_seen": 20608435, - "step": 967, - "time_per_iteration": 2.642885446548462 - }, - { - "auxiliary_loss_clip": 0.01084339, - "auxiliary_loss_mlp": 0.0103139, - "balance_loss_clip": 1.02675521, - "balance_loss_mlp": 1.0269078, - "epoch": 0.05819930858259432, - "flos": 62363297485440.0, - "grad_norm": 0.985564929959949, - "language_loss": 0.57357776, - "learning_rate": 3.991664852809939e-06, - "loss": 0.59473509, - "num_input_tokens_seen": 20668575, - "step": 968, - "time_per_iteration": 3.1017024517059326 - }, - { - "auxiliary_loss_clip": 0.01188824, - "auxiliary_loss_mlp": 0.01057715, - "balance_loss_clip": 1.05784404, - "balance_loss_mlp": 1.03147697, - "epoch": 0.05825943183526229, - "flos": 19135360465920.0, - "grad_norm": 2.1276337565108485, - "language_loss": 0.82286429, - "learning_rate": 3.991629295419945e-06, - "loss": 0.84532964, - "num_input_tokens_seen": 20687355, - "step": 969, - "time_per_iteration": 2.669055461883545 - }, - { - "auxiliary_loss_clip": 0.01206272, - "auxiliary_loss_mlp": 0.00782724, - "balance_loss_clip": 1.06255269, - "balance_loss_mlp": 1.00024962, - "epoch": 0.058319555087930255, - "flos": 29022465369600.0, - "grad_norm": 7.916507288074279, - "language_loss": 0.7803669, - "learning_rate": 3.991593662507167e-06, - "loss": 0.80025685, - "num_input_tokens_seen": 20705710, - "step": 970, - "time_per_iteration": 2.733030080795288 - }, - { - "auxiliary_loss_clip": 0.01181452, - "auxiliary_loss_mlp": 0.01064945, - "balance_loss_clip": 1.05691695, - "balance_loss_mlp": 1.03887415, - "epoch": 0.05837967834059823, - "flos": 18879999701760.0, - "grad_norm": 3.163102883752813, - "language_loss": 0.92229038, - "learning_rate": 3.991557954072958e-06, - "loss": 0.94475436, - "num_input_tokens_seen": 20722405, - "step": 971, - "time_per_iteration": 2.730377435684204 - }, - { - "auxiliary_loss_clip": 0.01180948, - "auxiliary_loss_mlp": 0.01062613, - "balance_loss_clip": 1.05320477, - "balance_loss_mlp": 1.03722143, - "epoch": 0.05843980159326619, - "flos": 25703062744320.0, - "grad_norm": 1.700187330091603, - "language_loss": 0.85959208, - "learning_rate": 3.991522170118673e-06, - "loss": 0.88202775, - "num_input_tokens_seen": 20741480, - "step": 972, - "time_per_iteration": 2.687185049057007 - }, - { - "auxiliary_loss_clip": 0.0116993, - "auxiliary_loss_mlp": 0.01079713, - "balance_loss_clip": 1.05714142, - "balance_loss_mlp": 1.05601454, - "epoch": 0.058499924845934165, - "flos": 25552129795200.0, - "grad_norm": 2.00599255988541, - "language_loss": 0.87503272, - "learning_rate": 3.991486310645667e-06, - "loss": 0.89752913, - "num_input_tokens_seen": 20759685, - "step": 973, - "time_per_iteration": 2.7166664600372314 - }, - { - "auxiliary_loss_clip": 0.01206524, - "auxiliary_loss_mlp": 0.00784111, - "balance_loss_clip": 1.06111121, - "balance_loss_mlp": 1.00026989, - "epoch": 0.05856004809860214, - "flos": 16436171001600.0, - "grad_norm": 1.879365930358842, - "language_loss": 0.74800295, - "learning_rate": 3.991450375655301e-06, - "loss": 0.76790935, - "num_input_tokens_seen": 20778180, - "step": 974, - "time_per_iteration": 2.713594675064087 - }, - { - "auxiliary_loss_clip": 0.01197101, - "auxiliary_loss_mlp": 0.00782207, - "balance_loss_clip": 1.059551, - "balance_loss_mlp": 1.00025892, - "epoch": 0.0586201713512701, - "flos": 39458824116480.0, - "grad_norm": 1.5923993506380014, - "language_loss": 0.76874506, - "learning_rate": 3.991414365148936e-06, - "loss": 0.78853816, - "num_input_tokens_seen": 20802705, - "step": 975, - "time_per_iteration": 7.600914716720581 - }, - { - "auxiliary_loss_clip": 0.01215491, - "auxiliary_loss_mlp": 0.01069506, - "balance_loss_clip": 1.06030774, - "balance_loss_mlp": 1.0444721, - "epoch": 0.058680294603938074, - "flos": 23365170230400.0, - "grad_norm": 3.6132976830219734, - "language_loss": 0.76748288, - "learning_rate": 3.99137827912794e-06, - "loss": 0.79033279, - "num_input_tokens_seen": 20822540, - "step": 976, - "time_per_iteration": 4.324799537658691 - }, - { - "auxiliary_loss_clip": 0.01176132, - "auxiliary_loss_mlp": 0.01077003, - "balance_loss_clip": 1.05271626, - "balance_loss_mlp": 1.04963279, - "epoch": 0.05874041785660604, - "flos": 32232017226240.0, - "grad_norm": 1.943198757110789, - "language_loss": 0.87343585, - "learning_rate": 3.991342117593679e-06, - "loss": 0.89596725, - "num_input_tokens_seen": 20844175, - "step": 977, - "time_per_iteration": 2.7742488384246826 - }, - { - "auxiliary_loss_clip": 0.01187161, - "auxiliary_loss_mlp": 0.01067914, - "balance_loss_clip": 1.06209528, - "balance_loss_mlp": 1.04231977, - "epoch": 0.05880054110927401, - "flos": 22310043194880.0, - "grad_norm": 1.718987046197629, - "language_loss": 0.7969116, - "learning_rate": 3.991305880547527e-06, - "loss": 0.81946236, - "num_input_tokens_seen": 20864730, - "step": 978, - "time_per_iteration": 2.733372926712036 - }, - { - "auxiliary_loss_clip": 0.01136264, - "auxiliary_loss_mlp": 0.01076585, - "balance_loss_clip": 1.05591321, - "balance_loss_mlp": 1.04927468, - "epoch": 0.05886066436194198, - "flos": 27380450016000.0, - "grad_norm": 1.8692877257975375, - "language_loss": 0.80665666, - "learning_rate": 3.991269567990855e-06, - "loss": 0.82878518, - "num_input_tokens_seen": 20885200, - "step": 979, - "time_per_iteration": 3.2624220848083496 - }, - { - "auxiliary_loss_clip": 0.01074686, - "auxiliary_loss_mlp": 0.01029701, - "balance_loss_clip": 1.02640033, - "balance_loss_mlp": 1.02495658, - "epoch": 0.05892078761460995, - "flos": 59584493525760.0, - "grad_norm": 0.9436493040005753, - "language_loss": 0.59004962, - "learning_rate": 3.9912331799250415e-06, - "loss": 0.6110934, - "num_input_tokens_seen": 20940325, - "step": 980, - "time_per_iteration": 3.4688587188720703 - }, - { - "auxiliary_loss_clip": 0.01211665, - "auxiliary_loss_mlp": 0.01078603, - "balance_loss_clip": 1.06178868, - "balance_loss_mlp": 1.05242431, - "epoch": 0.05898091086727792, - "flos": 15414081500160.0, - "grad_norm": 2.2770545408130514, - "language_loss": 0.86436182, - "learning_rate": 3.9911967163514665e-06, - "loss": 0.88726455, - "num_input_tokens_seen": 20958220, - "step": 981, - "time_per_iteration": 2.5824644565582275 - }, - { - "auxiliary_loss_clip": 0.01190085, - "auxiliary_loss_mlp": 0.0106921, - "balance_loss_clip": 1.05943286, - "balance_loss_mlp": 1.04629803, - "epoch": 0.059041034119945886, - "flos": 23655328295040.0, - "grad_norm": 2.1333982175691855, - "language_loss": 0.79293346, - "learning_rate": 3.991160177271513e-06, - "loss": 0.81552643, - "num_input_tokens_seen": 20978920, - "step": 982, - "time_per_iteration": 2.68428897857666 - }, - { - "auxiliary_loss_clip": 0.01192274, - "auxiliary_loss_mlp": 0.01068234, - "balance_loss_clip": 1.05926657, - "balance_loss_mlp": 1.04356933, - "epoch": 0.05910115737261386, - "flos": 24754087376640.0, - "grad_norm": 2.319627739094249, - "language_loss": 0.84413779, - "learning_rate": 3.9911235626865654e-06, - "loss": 0.86674285, - "num_input_tokens_seen": 20999490, - "step": 983, - "time_per_iteration": 2.7006261348724365 - }, - { - "auxiliary_loss_clip": 0.0120015, - "auxiliary_loss_mlp": 0.01072669, - "balance_loss_clip": 1.05969584, - "balance_loss_mlp": 1.04799283, - "epoch": 0.05916128062528183, - "flos": 11728749070080.0, - "grad_norm": 1.8014395118859294, - "language_loss": 0.84510243, - "learning_rate": 3.9910868725980125e-06, - "loss": 0.86783063, - "num_input_tokens_seen": 21017865, - "step": 984, - "time_per_iteration": 2.640246868133545 - }, - { - "auxiliary_loss_clip": 0.01188594, - "auxiliary_loss_mlp": 0.01055296, - "balance_loss_clip": 1.05650342, - "balance_loss_mlp": 1.03171611, - "epoch": 0.059221403877949795, - "flos": 21902995296000.0, - "grad_norm": 2.473231587287368, - "language_loss": 0.77611595, - "learning_rate": 3.9910501070072465e-06, - "loss": 0.7985549, - "num_input_tokens_seen": 21035900, - "step": 985, - "time_per_iteration": 2.626371383666992 - }, - { - "auxiliary_loss_clip": 0.01150113, - "auxiliary_loss_mlp": 0.01060814, - "balance_loss_clip": 1.05341148, - "balance_loss_mlp": 1.03542209, - "epoch": 0.05928152713061777, - "flos": 20514580940160.0, - "grad_norm": 1.9082382068459252, - "language_loss": 0.90593231, - "learning_rate": 3.991013265915661e-06, - "loss": 0.92804158, - "num_input_tokens_seen": 21053235, - "step": 986, - "time_per_iteration": 2.7834935188293457 - }, - { - "auxiliary_loss_clip": 0.01200704, - "auxiliary_loss_mlp": 0.01061312, - "balance_loss_clip": 1.05555892, - "balance_loss_mlp": 1.03425193, - "epoch": 0.05934165038328574, - "flos": 24495135252480.0, - "grad_norm": 2.216017383423336, - "language_loss": 0.75688565, - "learning_rate": 3.9909763493246525e-06, - "loss": 0.77950585, - "num_input_tokens_seen": 21073090, - "step": 987, - "time_per_iteration": 2.6669981479644775 - }, - { - "auxiliary_loss_clip": 0.01203558, - "auxiliary_loss_mlp": 0.01057756, - "balance_loss_clip": 1.06134868, - "balance_loss_mlp": 1.03331852, - "epoch": 0.059401773635953704, - "flos": 38728041914880.0, - "grad_norm": 2.2869993581633827, - "language_loss": 0.71867943, - "learning_rate": 3.990939357235621e-06, - "loss": 0.7412926, - "num_input_tokens_seen": 21094895, - "step": 988, - "time_per_iteration": 2.805851697921753 - }, - { - "auxiliary_loss_clip": 0.0105006, - "auxiliary_loss_mlp": 0.0101134, - "balance_loss_clip": 1.02230322, - "balance_loss_mlp": 1.00688171, - "epoch": 0.059461896888621676, - "flos": 58023565125120.0, - "grad_norm": 0.9416454944601763, - "language_loss": 0.7124939, - "learning_rate": 3.99090228964997e-06, - "loss": 0.73310792, - "num_input_tokens_seen": 21147555, - "step": 989, - "time_per_iteration": 3.100306749343872 - }, - { - "auxiliary_loss_clip": 0.0117797, - "auxiliary_loss_mlp": 0.01072264, - "balance_loss_clip": 1.05793095, - "balance_loss_mlp": 1.04389191, - "epoch": 0.05952202014128964, - "flos": 22127760650880.0, - "grad_norm": 2.0167260155417113, - "language_loss": 0.78245646, - "learning_rate": 3.990865146569105e-06, - "loss": 0.80495882, - "num_input_tokens_seen": 21167845, - "step": 990, - "time_per_iteration": 2.8133904933929443 - }, - { - "auxiliary_loss_clip": 0.01198295, - "auxiliary_loss_mlp": 0.01053485, - "balance_loss_clip": 1.06166339, - "balance_loss_mlp": 1.02761686, - "epoch": 0.059582143393957614, - "flos": 20445776438400.0, - "grad_norm": 2.2411623387553727, - "language_loss": 0.86522102, - "learning_rate": 3.990827927994434e-06, - "loss": 0.88773882, - "num_input_tokens_seen": 21185085, - "step": 991, - "time_per_iteration": 2.6964831352233887 - }, - { - "auxiliary_loss_clip": 0.0121783, - "auxiliary_loss_mlp": 0.01064707, - "balance_loss_clip": 1.0613625, - "balance_loss_mlp": 1.03943431, - "epoch": 0.059642266646625586, - "flos": 20594877793920.0, - "grad_norm": 1.8566945591898132, - "language_loss": 0.76738375, - "learning_rate": 3.9907906339273674e-06, - "loss": 0.79020917, - "num_input_tokens_seen": 21204230, - "step": 992, - "time_per_iteration": 2.646942377090454 - }, - { - "auxiliary_loss_clip": 0.01146457, - "auxiliary_loss_mlp": 0.01062309, - "balance_loss_clip": 1.05571234, - "balance_loss_mlp": 1.03834832, - "epoch": 0.05970238989929355, - "flos": 19352655792000.0, - "grad_norm": 2.3469050968731233, - "language_loss": 0.75117075, - "learning_rate": 3.9907532643693215e-06, - "loss": 0.77325845, - "num_input_tokens_seen": 21222655, - "step": 993, - "time_per_iteration": 2.7642974853515625 - }, - { - "auxiliary_loss_clip": 0.01157785, - "auxiliary_loss_mlp": 0.01075532, - "balance_loss_clip": 1.05397618, - "balance_loss_mlp": 1.04774487, - "epoch": 0.05976251315196152, - "flos": 30264040926720.0, - "grad_norm": 2.725207959052886, - "language_loss": 0.79177904, - "learning_rate": 3.990715819321712e-06, - "loss": 0.81411219, - "num_input_tokens_seen": 21242310, - "step": 994, - "time_per_iteration": 2.8414714336395264 - }, - { - "auxiliary_loss_clip": 0.01214724, - "auxiliary_loss_mlp": 0.01079016, - "balance_loss_clip": 1.06264019, - "balance_loss_mlp": 1.05361295, - "epoch": 0.05982263640462949, - "flos": 23185150243200.0, - "grad_norm": 2.8097993094234983, - "language_loss": 0.79917169, - "learning_rate": 3.99067829878596e-06, - "loss": 0.82210916, - "num_input_tokens_seen": 21261410, - "step": 995, - "time_per_iteration": 2.6524364948272705 - }, - { - "auxiliary_loss_clip": 0.0116696, - "auxiliary_loss_mlp": 0.01068218, - "balance_loss_clip": 1.05704355, - "balance_loss_mlp": 1.04208767, - "epoch": 0.05988275965729746, - "flos": 27850879463040.0, - "grad_norm": 1.902030256537741, - "language_loss": 0.87013257, - "learning_rate": 3.990640702763487e-06, - "loss": 0.89248431, - "num_input_tokens_seen": 21280080, - "step": 996, - "time_per_iteration": 2.7431676387786865 - }, - { - "auxiliary_loss_clip": 0.01177854, - "auxiliary_loss_mlp": 0.01081123, - "balance_loss_clip": 1.05684328, - "balance_loss_mlp": 1.05055761, - "epoch": 0.05994288290996543, - "flos": 24680003575680.0, - "grad_norm": 2.971039758986745, - "language_loss": 0.87273014, - "learning_rate": 3.990603031255718e-06, - "loss": 0.89531994, - "num_input_tokens_seen": 21296765, - "step": 997, - "time_per_iteration": 2.748448371887207 - }, - { - "auxiliary_loss_clip": 0.01069915, - "auxiliary_loss_mlp": 0.01014417, - "balance_loss_clip": 1.02303648, - "balance_loss_mlp": 1.00972033, - "epoch": 0.0600030061626334, - "flos": 69929568835200.0, - "grad_norm": 1.0091092068179202, - "language_loss": 0.75381488, - "learning_rate": 3.990565284264083e-06, - "loss": 0.7746582, - "num_input_tokens_seen": 21363345, - "step": 998, - "time_per_iteration": 3.2950518131256104 - }, - { - "auxiliary_loss_clip": 0.01170062, - "auxiliary_loss_mlp": 0.01065521, - "balance_loss_clip": 1.05893683, - "balance_loss_mlp": 1.03893745, - "epoch": 0.06006312941530137, - "flos": 26540140268160.0, - "grad_norm": 1.8197691299520968, - "language_loss": 0.76053095, - "learning_rate": 3.990527461790013e-06, - "loss": 0.7828868, - "num_input_tokens_seen": 21385290, - "step": 999, - "time_per_iteration": 2.733802556991577 - }, - { - "auxiliary_loss_clip": 0.01197834, - "auxiliary_loss_mlp": 0.01059542, - "balance_loss_clip": 1.05646563, - "balance_loss_mlp": 1.03339899, - "epoch": 0.060123252667969335, - "flos": 27344000689920.0, - "grad_norm": 2.5948629341774874, - "language_loss": 0.82992184, - "learning_rate": 3.990489563834943e-06, - "loss": 0.85249555, - "num_input_tokens_seen": 21407625, - "step": 1000, - "time_per_iteration": 2.710981845855713 - }, - { - "auxiliary_loss_clip": 0.0118571, - "auxiliary_loss_mlp": 0.01062188, - "balance_loss_clip": 1.05856955, - "balance_loss_mlp": 1.03480577, - "epoch": 0.06018337592063731, - "flos": 27016710940800.0, - "grad_norm": 2.111409807940472, - "language_loss": 0.85820085, - "learning_rate": 3.990451590400309e-06, - "loss": 0.88067985, - "num_input_tokens_seen": 21426835, - "step": 1001, - "time_per_iteration": 2.73445463180542 - }, - { - "auxiliary_loss_clip": 0.01191917, - "auxiliary_loss_mlp": 0.01062059, - "balance_loss_clip": 1.06167853, - "balance_loss_mlp": 1.03719211, - "epoch": 0.06024349917330528, - "flos": 25592960580480.0, - "grad_norm": 1.8359711451165206, - "language_loss": 0.74128318, - "learning_rate": 3.990413541487551e-06, - "loss": 0.76382297, - "num_input_tokens_seen": 21444920, - "step": 1002, - "time_per_iteration": 2.8861100673675537 - }, - { - "auxiliary_loss_clip": 0.01214316, - "auxiliary_loss_mlp": 0.01062589, - "balance_loss_clip": 1.06316125, - "balance_loss_mlp": 1.03737664, - "epoch": 0.060303622425973244, - "flos": 26133271937280.0, - "grad_norm": 2.1835040648243997, - "language_loss": 0.75520515, - "learning_rate": 3.990375417098112e-06, - "loss": 0.77797419, - "num_input_tokens_seen": 21463555, - "step": 1003, - "time_per_iteration": 2.632889747619629 - }, - { - "auxiliary_loss_clip": 0.01187709, - "auxiliary_loss_mlp": 0.01064806, - "balance_loss_clip": 1.05934548, - "balance_loss_mlp": 1.03928304, - "epoch": 0.060363745678641216, - "flos": 20377187418240.0, - "grad_norm": 2.3150099602993155, - "language_loss": 0.70349169, - "learning_rate": 3.990337217233437e-06, - "loss": 0.72601682, - "num_input_tokens_seen": 21481990, - "step": 1004, - "time_per_iteration": 2.6947617530822754 - }, - { - "auxiliary_loss_clip": 0.01212815, - "auxiliary_loss_mlp": 0.01077454, - "balance_loss_clip": 1.06629324, - "balance_loss_mlp": 1.05168116, - "epoch": 0.06042386893130918, - "flos": 17749172753280.0, - "grad_norm": 2.276868338025253, - "language_loss": 0.83444524, - "learning_rate": 3.990298941894976e-06, - "loss": 0.85734791, - "num_input_tokens_seen": 21500385, - "step": 1005, - "time_per_iteration": 2.581683397293091 - }, - { - "auxiliary_loss_clip": 0.01077621, - "auxiliary_loss_mlp": 0.01004707, - "balance_loss_clip": 1.02541244, - "balance_loss_mlp": 1.00029612, - "epoch": 0.06048399218397715, - "flos": 68538496872960.0, - "grad_norm": 0.903813421793838, - "language_loss": 0.59018111, - "learning_rate": 3.9902605910841794e-06, - "loss": 0.61100447, - "num_input_tokens_seen": 21561040, - "step": 1006, - "time_per_iteration": 3.222104787826538 - }, - { - "auxiliary_loss_clip": 0.01183553, - "auxiliary_loss_mlp": 0.01059038, - "balance_loss_clip": 1.05334234, - "balance_loss_mlp": 1.03284812, - "epoch": 0.060544115436645125, - "flos": 23258515772160.0, - "grad_norm": 2.1584333764290853, - "language_loss": 0.74229443, - "learning_rate": 3.990222164802503e-06, - "loss": 0.76472032, - "num_input_tokens_seen": 21580655, - "step": 1007, - "time_per_iteration": 2.7130653858184814 - }, - { - "auxiliary_loss_clip": 0.0119408, - "auxiliary_loss_mlp": 0.01060431, - "balance_loss_clip": 1.06008601, - "balance_loss_mlp": 1.03493261, - "epoch": 0.06060423868931309, - "flos": 23878441624320.0, - "grad_norm": 1.7956876298455304, - "language_loss": 0.8081426, - "learning_rate": 3.9901836630514006e-06, - "loss": 0.8306877, - "num_input_tokens_seen": 21599650, - "step": 1008, - "time_per_iteration": 2.7151994705200195 - }, - { - "auxiliary_loss_clip": 0.01175291, - "auxiliary_loss_mlp": 0.01056357, - "balance_loss_clip": 1.05982351, - "balance_loss_mlp": 1.0305717, - "epoch": 0.06066436194198106, - "flos": 18728061171840.0, - "grad_norm": 2.3069524306559837, - "language_loss": 0.78198558, - "learning_rate": 3.990145085832335e-06, - "loss": 0.8043021, - "num_input_tokens_seen": 21617550, - "step": 1009, - "time_per_iteration": 2.7313599586486816 - }, - { - "auxiliary_loss_clip": 0.01194621, - "auxiliary_loss_mlp": 0.01061233, - "balance_loss_clip": 1.06150866, - "balance_loss_mlp": 1.03726041, - "epoch": 0.06072448519464903, - "flos": 24640465680000.0, - "grad_norm": 1.7452257697216769, - "language_loss": 0.93148172, - "learning_rate": 3.990106433146769e-06, - "loss": 0.95404023, - "num_input_tokens_seen": 21635865, - "step": 1010, - "time_per_iteration": 2.7233662605285645 - }, - { - "auxiliary_loss_clip": 0.01148246, - "auxiliary_loss_mlp": 0.00784144, - "balance_loss_clip": 1.05304599, - "balance_loss_mlp": 1.00029802, - "epoch": 0.060784608447317, - "flos": 17378825575680.0, - "grad_norm": 2.9999367504779517, - "language_loss": 0.72022474, - "learning_rate": 3.9900677049961665e-06, - "loss": 0.73954868, - "num_input_tokens_seen": 21653945, - "step": 1011, - "time_per_iteration": 2.804858446121216 - }, - { - "auxiliary_loss_clip": 0.01194231, - "auxiliary_loss_mlp": 0.01077344, - "balance_loss_clip": 1.05968046, - "balance_loss_mlp": 1.04868615, - "epoch": 0.06084473169998497, - "flos": 23692208584320.0, - "grad_norm": 1.9573218215833301, - "language_loss": 0.87526691, - "learning_rate": 3.990028901381999e-06, - "loss": 0.89798272, - "num_input_tokens_seen": 21671230, - "step": 1012, - "time_per_iteration": 2.6466245651245117 - }, - { - "auxiliary_loss_clip": 0.01184459, - "auxiliary_loss_mlp": 0.01064264, - "balance_loss_clip": 1.05652905, - "balance_loss_mlp": 1.03838325, - "epoch": 0.06090485495265294, - "flos": 23546339452800.0, - "grad_norm": 1.9062230938156723, - "language_loss": 0.76947677, - "learning_rate": 3.989990022305734e-06, - "loss": 0.79196405, - "num_input_tokens_seen": 21691155, - "step": 1013, - "time_per_iteration": 4.297588586807251 - }, - { - "auxiliary_loss_clip": 0.01207383, - "auxiliary_loss_mlp": 0.00783488, - "balance_loss_clip": 1.06573224, - "balance_loss_mlp": 1.00034499, - "epoch": 0.06096497820532091, - "flos": 20339301548160.0, - "grad_norm": 2.441711811862119, - "language_loss": 0.86151874, - "learning_rate": 3.98995106776885e-06, - "loss": 0.88142747, - "num_input_tokens_seen": 21707405, - "step": 1014, - "time_per_iteration": 4.301488637924194 - }, - { - "auxiliary_loss_clip": 0.0121503, - "auxiliary_loss_mlp": 0.01072817, - "balance_loss_clip": 1.06605387, - "balance_loss_mlp": 1.04508948, - "epoch": 0.061025101457988874, - "flos": 26939035779840.0, - "grad_norm": 2.4309754772209184, - "language_loss": 0.73197287, - "learning_rate": 3.98991203777282e-06, - "loss": 0.75485134, - "num_input_tokens_seen": 21728090, - "step": 1015, - "time_per_iteration": 4.384514808654785 - }, - { - "auxiliary_loss_clip": 0.01187374, - "auxiliary_loss_mlp": 0.01068593, - "balance_loss_clip": 1.06084347, - "balance_loss_mlp": 1.04228365, - "epoch": 0.061085224710656846, - "flos": 25375054723200.0, - "grad_norm": 1.5896529502124837, - "language_loss": 0.79109907, - "learning_rate": 3.9898729323191275e-06, - "loss": 0.81365877, - "num_input_tokens_seen": 21747950, - "step": 1016, - "time_per_iteration": 4.3249351978302 - }, - { - "auxiliary_loss_clip": 0.01173015, - "auxiliary_loss_mlp": 0.0105746, - "balance_loss_clip": 1.06036103, - "balance_loss_mlp": 1.03249741, - "epoch": 0.06114534796332482, - "flos": 24824759385600.0, - "grad_norm": 1.6772682648410928, - "language_loss": 0.76014191, - "learning_rate": 3.989833751409254e-06, - "loss": 0.78244662, - "num_input_tokens_seen": 21767900, - "step": 1017, - "time_per_iteration": 2.7983243465423584 - }, - { - "auxiliary_loss_clip": 0.01188817, - "auxiliary_loss_mlp": 0.01074603, - "balance_loss_clip": 1.06584609, - "balance_loss_mlp": 1.0483532, - "epoch": 0.061205471215992784, - "flos": 20631434860800.0, - "grad_norm": 2.001716657382839, - "language_loss": 0.85798436, - "learning_rate": 3.989794495044685e-06, - "loss": 0.88061857, - "num_input_tokens_seen": 21787375, - "step": 1018, - "time_per_iteration": 2.702399253845215 - }, - { - "auxiliary_loss_clip": 0.01174344, - "auxiliary_loss_mlp": 0.01069438, - "balance_loss_clip": 1.06325769, - "balance_loss_mlp": 1.04231787, - "epoch": 0.061265594468660756, - "flos": 16508351381760.0, - "grad_norm": 2.9546929267460813, - "language_loss": 0.76985347, - "learning_rate": 3.989755163226909e-06, - "loss": 0.79229128, - "num_input_tokens_seen": 21806275, - "step": 1019, - "time_per_iteration": 2.780104875564575 - }, - { - "auxiliary_loss_clip": 0.01160861, - "auxiliary_loss_mlp": 0.0106141, - "balance_loss_clip": 1.05355084, - "balance_loss_mlp": 1.03511262, - "epoch": 0.06132571772132872, - "flos": 26246211275520.0, - "grad_norm": 2.1848809329980288, - "language_loss": 0.84122044, - "learning_rate": 3.989715755957418e-06, - "loss": 0.86344314, - "num_input_tokens_seen": 21826430, - "step": 1020, - "time_per_iteration": 2.785963535308838 - }, - { - "auxiliary_loss_clip": 0.01198473, - "auxiliary_loss_mlp": 0.01063342, - "balance_loss_clip": 1.06365371, - "balance_loss_mlp": 1.03604269, - "epoch": 0.06138584097399669, - "flos": 37414788768000.0, - "grad_norm": 1.933053672026977, - "language_loss": 0.79114467, - "learning_rate": 3.989676273237705e-06, - "loss": 0.81376278, - "num_input_tokens_seen": 21847800, - "step": 1021, - "time_per_iteration": 2.7968955039978027 - }, - { - "auxiliary_loss_clip": 0.01189659, - "auxiliary_loss_mlp": 0.01064044, - "balance_loss_clip": 1.06159925, - "balance_loss_mlp": 1.04114437, - "epoch": 0.061445964226664665, - "flos": 17420661941760.0, - "grad_norm": 2.089525934673828, - "language_loss": 0.87768298, - "learning_rate": 3.9896367150692705e-06, - "loss": 0.90022004, - "num_input_tokens_seen": 21863385, - "step": 1022, - "time_per_iteration": 2.70906138420105 - }, - { - "auxiliary_loss_clip": 0.01198737, - "auxiliary_loss_mlp": 0.0106635, - "balance_loss_clip": 1.06627858, - "balance_loss_mlp": 1.04079151, - "epoch": 0.06150608747933263, - "flos": 22600021691520.0, - "grad_norm": 1.7284486983379121, - "language_loss": 0.82892007, - "learning_rate": 3.989597081453611e-06, - "loss": 0.85157096, - "num_input_tokens_seen": 21881880, - "step": 1023, - "time_per_iteration": 2.71539568901062 - }, - { - "auxiliary_loss_clip": 0.01100664, - "auxiliary_loss_mlp": 0.01010751, - "balance_loss_clip": 1.03727341, - "balance_loss_mlp": 1.00614953, - "epoch": 0.0615662107320006, - "flos": 56741482005120.0, - "grad_norm": 0.8894752517384502, - "language_loss": 0.6505782, - "learning_rate": 3.989557372392231e-06, - "loss": 0.67169237, - "num_input_tokens_seen": 21940550, - "step": 1024, - "time_per_iteration": 3.175217628479004 - }, - { - "auxiliary_loss_clip": 0.01167458, - "auxiliary_loss_mlp": 0.01073669, - "balance_loss_clip": 1.05906856, - "balance_loss_mlp": 1.04553604, - "epoch": 0.06162633398466857, - "flos": 22564793427840.0, - "grad_norm": 2.320347485789288, - "language_loss": 0.88069236, - "learning_rate": 3.989517587886636e-06, - "loss": 0.90310359, - "num_input_tokens_seen": 21958390, - "step": 1025, - "time_per_iteration": 2.690725564956665 - }, - { - "auxiliary_loss_clip": 0.01197064, - "auxiliary_loss_mlp": 0.01066504, - "balance_loss_clip": 1.06452, - "balance_loss_mlp": 1.04173219, - "epoch": 0.06168645723733654, - "flos": 25593104234880.0, - "grad_norm": 2.5217294712155414, - "language_loss": 0.84536898, - "learning_rate": 3.989477727938335e-06, - "loss": 0.86800468, - "num_input_tokens_seen": 21978625, - "step": 1026, - "time_per_iteration": 2.7420806884765625 - }, - { - "auxiliary_loss_clip": 0.01160797, - "auxiliary_loss_mlp": 0.0107525, - "balance_loss_clip": 1.05669701, - "balance_loss_mlp": 1.04934609, - "epoch": 0.06174658049000451, - "flos": 15997917162240.0, - "grad_norm": 2.354014397396182, - "language_loss": 0.8228389, - "learning_rate": 3.989437792548839e-06, - "loss": 0.84519935, - "num_input_tokens_seen": 21996035, - "step": 1027, - "time_per_iteration": 2.6683874130249023 - }, - { - "auxiliary_loss_clip": 0.01160199, - "auxiliary_loss_mlp": 0.01067253, - "balance_loss_clip": 1.06181073, - "balance_loss_mlp": 1.04232645, - "epoch": 0.06180670374267248, - "flos": 11285970117120.0, - "grad_norm": 4.43492107605727, - "language_loss": 0.83898664, - "learning_rate": 3.989397781719663e-06, - "loss": 0.86126107, - "num_input_tokens_seen": 22011625, - "step": 1028, - "time_per_iteration": 2.705387592315674 - }, - { - "auxiliary_loss_clip": 0.0106503, - "auxiliary_loss_mlp": 0.01008074, - "balance_loss_clip": 1.02410197, - "balance_loss_mlp": 1.00347257, - "epoch": 0.06186682699534045, - "flos": 65130142216320.0, - "grad_norm": 0.9383255649985517, - "language_loss": 0.604738, - "learning_rate": 3.989357695452323e-06, - "loss": 0.62546903, - "num_input_tokens_seen": 22066035, - "step": 1029, - "time_per_iteration": 3.0268616676330566 - }, - { - "auxiliary_loss_clip": 0.01176182, - "auxiliary_loss_mlp": 0.01074173, - "balance_loss_clip": 1.05641246, - "balance_loss_mlp": 1.04737473, - "epoch": 0.061926950248008414, - "flos": 21105742976640.0, - "grad_norm": 4.246634693563946, - "language_loss": 0.82589179, - "learning_rate": 3.98931753374834e-06, - "loss": 0.84839535, - "num_input_tokens_seen": 22085015, - "step": 1030, - "time_per_iteration": 2.7035892009735107 - }, - { - "auxiliary_loss_clip": 0.0122298, - "auxiliary_loss_mlp": 0.01077745, - "balance_loss_clip": 1.06850278, - "balance_loss_mlp": 1.05185235, - "epoch": 0.061987073500676386, - "flos": 17748454481280.0, - "grad_norm": 2.585240230669548, - "language_loss": 0.79576576, - "learning_rate": 3.989277296609237e-06, - "loss": 0.81877303, - "num_input_tokens_seen": 22102775, - "step": 1031, - "time_per_iteration": 2.60622501373291 - }, - { - "auxiliary_loss_clip": 0.01188957, - "auxiliary_loss_mlp": 0.01076754, - "balance_loss_clip": 1.06396544, - "balance_loss_mlp": 1.04982424, - "epoch": 0.06204719675334436, - "flos": 21836237869440.0, - "grad_norm": 1.8815476991595563, - "language_loss": 0.77384412, - "learning_rate": 3.98923698403654e-06, - "loss": 0.79650116, - "num_input_tokens_seen": 22121680, - "step": 1032, - "time_per_iteration": 2.6753971576690674 - }, - { - "auxiliary_loss_clip": 0.01198757, - "auxiliary_loss_mlp": 0.01074736, - "balance_loss_clip": 1.05916619, - "balance_loss_mlp": 1.04848623, - "epoch": 0.06210732000601232, - "flos": 19353697286400.0, - "grad_norm": 3.147941025479245, - "language_loss": 0.89323574, - "learning_rate": 3.989196596031776e-06, - "loss": 0.91597068, - "num_input_tokens_seen": 22138155, - "step": 1033, - "time_per_iteration": 2.7313079833984375 - }, - { - "auxiliary_loss_clip": 0.01209161, - "auxiliary_loss_mlp": 0.01066082, - "balance_loss_clip": 1.06214237, - "balance_loss_mlp": 1.04119134, - "epoch": 0.062167443258680295, - "flos": 24749382695040.0, - "grad_norm": 2.1035343880884145, - "language_loss": 0.8455385, - "learning_rate": 3.989156132596479e-06, - "loss": 0.8682909, - "num_input_tokens_seen": 22157420, - "step": 1034, - "time_per_iteration": 2.7541439533233643 - }, - { - "auxiliary_loss_clip": 0.01180042, - "auxiliary_loss_mlp": 0.01057312, - "balance_loss_clip": 1.05896068, - "balance_loss_mlp": 1.03155136, - "epoch": 0.06222756651134827, - "flos": 34458478773120.0, - "grad_norm": 1.8983498110529735, - "language_loss": 0.8082794, - "learning_rate": 3.989115593732182e-06, - "loss": 0.83065289, - "num_input_tokens_seen": 22178620, - "step": 1035, - "time_per_iteration": 2.7965424060821533 - }, - { - "auxiliary_loss_clip": 0.01158806, - "auxiliary_loss_mlp": 0.01072478, - "balance_loss_clip": 1.05936599, - "balance_loss_mlp": 1.04432034, - "epoch": 0.06228768976401623, - "flos": 25666469763840.0, - "grad_norm": 2.145216314952277, - "language_loss": 0.78365827, - "learning_rate": 3.989074979440421e-06, - "loss": 0.80597103, - "num_input_tokens_seen": 22197125, - "step": 1036, - "time_per_iteration": 2.7858450412750244 - }, - { - "auxiliary_loss_clip": 0.01192097, - "auxiliary_loss_mlp": 0.01071382, - "balance_loss_clip": 1.05977845, - "balance_loss_mlp": 1.04663444, - "epoch": 0.062347813016684205, - "flos": 25295619795840.0, - "grad_norm": 1.9535870339716077, - "language_loss": 0.86544567, - "learning_rate": 3.989034289722739e-06, - "loss": 0.88808048, - "num_input_tokens_seen": 22217575, - "step": 1037, - "time_per_iteration": 2.685373306274414 - }, - { - "auxiliary_loss_clip": 0.01197778, - "auxiliary_loss_mlp": 0.01057095, - "balance_loss_clip": 1.06127763, - "balance_loss_mlp": 1.02966499, - "epoch": 0.06240793626935217, - "flos": 26907039740160.0, - "grad_norm": 2.697396725345887, - "language_loss": 0.8067717, - "learning_rate": 3.988993524580676e-06, - "loss": 0.82932043, - "num_input_tokens_seen": 22236840, - "step": 1038, - "time_per_iteration": 2.7305831909179688 - }, - { - "auxiliary_loss_clip": 0.01145896, - "auxiliary_loss_mlp": 0.01072721, - "balance_loss_clip": 1.05226004, - "balance_loss_mlp": 1.04330015, - "epoch": 0.06246805952202014, - "flos": 21615782146560.0, - "grad_norm": 1.8888526922505675, - "language_loss": 0.85465872, - "learning_rate": 3.98895268401578e-06, - "loss": 0.87684488, - "num_input_tokens_seen": 22256465, - "step": 1039, - "time_per_iteration": 2.7351109981536865 - }, - { - "auxiliary_loss_clip": 0.01188545, - "auxiliary_loss_mlp": 0.01070323, - "balance_loss_clip": 1.05834138, - "balance_loss_mlp": 1.04472923, - "epoch": 0.0625281827746881, - "flos": 19311896833920.0, - "grad_norm": 2.217895985816133, - "language_loss": 0.81172895, - "learning_rate": 3.9889117680296e-06, - "loss": 0.83431756, - "num_input_tokens_seen": 22274025, - "step": 1040, - "time_per_iteration": 2.6532907485961914 - }, - { - "auxiliary_loss_clip": 0.0121654, - "auxiliary_loss_mlp": 0.0106312, - "balance_loss_clip": 1.06718016, - "balance_loss_mlp": 1.03808582, - "epoch": 0.06258830602735609, - "flos": 27745769289600.0, - "grad_norm": 2.1960038080149817, - "language_loss": 0.69304991, - "learning_rate": 3.988870776623685e-06, - "loss": 0.71584648, - "num_input_tokens_seen": 22292245, - "step": 1041, - "time_per_iteration": 2.6445486545562744 - }, - { - "auxiliary_loss_clip": 0.01214659, - "auxiliary_loss_mlp": 0.01057975, - "balance_loss_clip": 1.06247008, - "balance_loss_mlp": 1.03182077, - "epoch": 0.06264842928002405, - "flos": 23222605150080.0, - "grad_norm": 2.7326158002445, - "language_loss": 0.81187552, - "learning_rate": 3.9888297097995905e-06, - "loss": 0.83460188, - "num_input_tokens_seen": 22311455, - "step": 1042, - "time_per_iteration": 2.6111559867858887 - }, - { - "auxiliary_loss_clip": 0.01211653, - "auxiliary_loss_mlp": 0.01052676, - "balance_loss_clip": 1.06253886, - "balance_loss_mlp": 1.02871442, - "epoch": 0.06270855253269202, - "flos": 38399495189760.0, - "grad_norm": 1.7165873820424848, - "language_loss": 0.76349056, - "learning_rate": 3.988788567558874e-06, - "loss": 0.78613389, - "num_input_tokens_seen": 22333750, - "step": 1043, - "time_per_iteration": 2.761768341064453 - }, - { - "auxiliary_loss_clip": 0.0118944, - "auxiliary_loss_mlp": 0.01063189, - "balance_loss_clip": 1.06111181, - "balance_loss_mlp": 1.03912091, - "epoch": 0.06276867578535998, - "flos": 22453542028800.0, - "grad_norm": 8.34017761542712, - "language_loss": 0.92031956, - "learning_rate": 3.988747349903097e-06, - "loss": 0.94284582, - "num_input_tokens_seen": 22351940, - "step": 1044, - "time_per_iteration": 2.636179208755493 - }, - { - "auxiliary_loss_clip": 0.01192566, - "auxiliary_loss_mlp": 0.01070128, - "balance_loss_clip": 1.05862689, - "balance_loss_mlp": 1.0456785, - "epoch": 0.06282879903802796, - "flos": 22930435923840.0, - "grad_norm": 2.3486674311430944, - "language_loss": 0.85913992, - "learning_rate": 3.988706056833821e-06, - "loss": 0.88176692, - "num_input_tokens_seen": 22372085, - "step": 1045, - "time_per_iteration": 2.7749502658843994 - }, - { - "auxiliary_loss_clip": 0.01179197, - "auxiliary_loss_mlp": 0.01065176, - "balance_loss_clip": 1.05804443, - "balance_loss_mlp": 1.04053521, - "epoch": 0.06288892229069593, - "flos": 34819237019520.0, - "grad_norm": 1.9846122850853416, - "language_loss": 0.7796576, - "learning_rate": 3.9886646883526125e-06, - "loss": 0.80210131, - "num_input_tokens_seen": 22392020, - "step": 1046, - "time_per_iteration": 2.803135871887207 - }, - { - "auxiliary_loss_clip": 0.01197344, - "auxiliary_loss_mlp": 0.01069269, - "balance_loss_clip": 1.06361508, - "balance_loss_mlp": 1.04558206, - "epoch": 0.06294904554336389, - "flos": 19427134642560.0, - "grad_norm": 2.174325060947129, - "language_loss": 0.77326387, - "learning_rate": 3.988623244461039e-06, - "loss": 0.79592997, - "num_input_tokens_seen": 22411180, - "step": 1047, - "time_per_iteration": 2.647446632385254 - }, - { - "auxiliary_loss_clip": 0.01200907, - "auxiliary_loss_mlp": 0.0105799, - "balance_loss_clip": 1.06238222, - "balance_loss_mlp": 1.03314662, - "epoch": 0.06300916879603187, - "flos": 40661867358720.0, - "grad_norm": 2.4899372640825046, - "language_loss": 0.77190751, - "learning_rate": 3.988581725160672e-06, - "loss": 0.79449654, - "num_input_tokens_seen": 22435105, - "step": 1048, - "time_per_iteration": 2.8167293071746826 - }, - { - "auxiliary_loss_clip": 0.0118184, - "auxiliary_loss_mlp": 0.01064361, - "balance_loss_clip": 1.0613215, - "balance_loss_mlp": 1.03914821, - "epoch": 0.06306929204869983, - "flos": 23804142341760.0, - "grad_norm": 4.606540291271834, - "language_loss": 0.77258086, - "learning_rate": 3.988540130453087e-06, - "loss": 0.79504287, - "num_input_tokens_seen": 22452710, - "step": 1049, - "time_per_iteration": 2.6908538341522217 - }, - { - "auxiliary_loss_clip": 0.01194538, - "auxiliary_loss_mlp": 0.0105682, - "balance_loss_clip": 1.06043661, - "balance_loss_mlp": 1.03290701, - "epoch": 0.0631294153013678, - "flos": 18915802583040.0, - "grad_norm": 2.515998307474139, - "language_loss": 0.83302009, - "learning_rate": 3.988498460339862e-06, - "loss": 0.85553372, - "num_input_tokens_seen": 22470175, - "step": 1050, - "time_per_iteration": 2.62186861038208 - }, - { - "auxiliary_loss_clip": 0.01210654, - "auxiliary_loss_mlp": 0.01062894, - "balance_loss_clip": 1.06468701, - "balance_loss_mlp": 1.04008913, - "epoch": 0.06318953855403578, - "flos": 24280174310400.0, - "grad_norm": 5.5478202090132065, - "language_loss": 0.76564771, - "learning_rate": 3.988456714822575e-06, - "loss": 0.78838319, - "num_input_tokens_seen": 22490020, - "step": 1051, - "time_per_iteration": 2.732269525527954 - }, - { - "auxiliary_loss_clip": 0.01188416, - "auxiliary_loss_mlp": 0.01069443, - "balance_loss_clip": 1.06340146, - "balance_loss_mlp": 1.04492211, - "epoch": 0.06324966180670374, - "flos": 22528918719360.0, - "grad_norm": 1.9993900469270787, - "language_loss": 0.80410004, - "learning_rate": 3.98841489390281e-06, - "loss": 0.82667863, - "num_input_tokens_seen": 22509685, - "step": 1052, - "time_per_iteration": 2.7683873176574707 - }, - { - "auxiliary_loss_clip": 0.01211333, - "auxiliary_loss_mlp": 0.01058255, - "balance_loss_clip": 1.06324601, - "balance_loss_mlp": 1.03468728, - "epoch": 0.06330978505937171, - "flos": 15778107884160.0, - "grad_norm": 2.370007457349547, - "language_loss": 0.77433288, - "learning_rate": 3.988372997582155e-06, - "loss": 0.79702866, - "num_input_tokens_seen": 22527905, - "step": 1053, - "time_per_iteration": 5.757168531417847 - }, - { - "auxiliary_loss_clip": 0.01190721, - "auxiliary_loss_mlp": 0.00780448, - "balance_loss_clip": 1.06378174, - "balance_loss_mlp": 1.00028598, - "epoch": 0.06336990831203967, - "flos": 21471098163840.0, - "grad_norm": 3.085258828985267, - "language_loss": 0.84931248, - "learning_rate": 3.988331025862195e-06, - "loss": 0.86902416, - "num_input_tokens_seen": 22546335, - "step": 1054, - "time_per_iteration": 2.7733829021453857 - }, - { - "auxiliary_loss_clip": 0.01172281, - "auxiliary_loss_mlp": 0.01061232, - "balance_loss_clip": 1.05722666, - "balance_loss_mlp": 1.03753328, - "epoch": 0.06343003156470765, - "flos": 18478877546880.0, - "grad_norm": 2.0168531459993435, - "language_loss": 0.85884213, - "learning_rate": 3.9882889787445225e-06, - "loss": 0.88117731, - "num_input_tokens_seen": 22563885, - "step": 1055, - "time_per_iteration": 4.490305185317993 - }, - { - "auxiliary_loss_clip": 0.01164237, - "auxiliary_loss_mlp": 0.01069785, - "balance_loss_clip": 1.05727792, - "balance_loss_mlp": 1.04534709, - "epoch": 0.06349015481737562, - "flos": 25154886309120.0, - "grad_norm": 2.4509218988768, - "language_loss": 0.8113938, - "learning_rate": 3.988246856230734e-06, - "loss": 0.83373404, - "num_input_tokens_seen": 22583035, - "step": 1056, - "time_per_iteration": 5.345282793045044 - }, - { - "auxiliary_loss_clip": 0.01144181, - "auxiliary_loss_mlp": 0.01061125, - "balance_loss_clip": 1.04991364, - "balance_loss_mlp": 1.03449368, - "epoch": 0.06355027807004358, - "flos": 26871775562880.0, - "grad_norm": 2.2117272688527128, - "language_loss": 0.81083393, - "learning_rate": 3.988204658322426e-06, - "loss": 0.83288693, - "num_input_tokens_seen": 22605055, - "step": 1057, - "time_per_iteration": 2.866757392883301 - }, - { - "auxiliary_loss_clip": 0.01139076, - "auxiliary_loss_mlp": 0.01061742, - "balance_loss_clip": 1.04970908, - "balance_loss_mlp": 1.03918755, - "epoch": 0.06361040132271156, - "flos": 21396691140480.0, - "grad_norm": 1.9636971972870172, - "language_loss": 0.83353591, - "learning_rate": 3.988162385021196e-06, - "loss": 0.85554409, - "num_input_tokens_seen": 22623760, - "step": 1058, - "time_per_iteration": 2.767024278640747 - }, - { - "auxiliary_loss_clip": 0.0117752, - "auxiliary_loss_mlp": 0.01059639, - "balance_loss_clip": 1.0576936, - "balance_loss_mlp": 1.03408027, - "epoch": 0.06367052457537953, - "flos": 25733765894400.0, - "grad_norm": 2.137077300251244, - "language_loss": 0.87556928, - "learning_rate": 3.988120036328651e-06, - "loss": 0.89794087, - "num_input_tokens_seen": 22643000, - "step": 1059, - "time_per_iteration": 2.794734239578247 - }, - { - "auxiliary_loss_clip": 0.01169658, - "auxiliary_loss_mlp": 0.01063463, - "balance_loss_clip": 1.06196678, - "balance_loss_mlp": 1.0383693, - "epoch": 0.0637306478280475, - "flos": 17631420992640.0, - "grad_norm": 2.543966627588717, - "language_loss": 0.91561133, - "learning_rate": 3.988077612246394e-06, - "loss": 0.93794256, - "num_input_tokens_seen": 22660460, - "step": 1060, - "time_per_iteration": 2.8223626613616943 - }, - { - "auxiliary_loss_clip": 0.01173933, - "auxiliary_loss_mlp": 0.01065151, - "balance_loss_clip": 1.05715585, - "balance_loss_mlp": 1.03981876, - "epoch": 0.06379077108071547, - "flos": 13662610427520.0, - "grad_norm": 1.9401711052692647, - "language_loss": 0.87242293, - "learning_rate": 3.988035112776035e-06, - "loss": 0.89481378, - "num_input_tokens_seen": 22679270, - "step": 1061, - "time_per_iteration": 2.7783865928649902 - }, - { - "auxiliary_loss_clip": 0.01190039, - "auxiliary_loss_mlp": 0.01059971, - "balance_loss_clip": 1.05976009, - "balance_loss_mlp": 1.03388786, - "epoch": 0.06385089433338344, - "flos": 28478849961600.0, - "grad_norm": 5.360593029379932, - "language_loss": 0.77407908, - "learning_rate": 3.987992537919185e-06, - "loss": 0.79657912, - "num_input_tokens_seen": 22699330, - "step": 1062, - "time_per_iteration": 2.872587203979492 - }, - { - "auxiliary_loss_clip": 0.01172912, - "auxiliary_loss_mlp": 0.01061175, - "balance_loss_clip": 1.05884075, - "balance_loss_mlp": 1.03798842, - "epoch": 0.0639110175860514, - "flos": 24311057028480.0, - "grad_norm": 2.2658654128491436, - "language_loss": 0.86522883, - "learning_rate": 3.987949887677459e-06, - "loss": 0.88756967, - "num_input_tokens_seen": 22717945, - "step": 1063, - "time_per_iteration": 2.7915029525756836 - }, - { - "auxiliary_loss_clip": 0.01207773, - "auxiliary_loss_mlp": 0.01062698, - "balance_loss_clip": 1.05969334, - "balance_loss_mlp": 1.03846335, - "epoch": 0.06397114083871938, - "flos": 22090772620800.0, - "grad_norm": 2.302236346678267, - "language_loss": 0.79908657, - "learning_rate": 3.9879071620524744e-06, - "loss": 0.82179129, - "num_input_tokens_seen": 22736790, - "step": 1064, - "time_per_iteration": 2.6880991458892822 - }, - { - "auxiliary_loss_clip": 0.01198826, - "auxiliary_loss_mlp": 0.01066465, - "balance_loss_clip": 1.0603801, - "balance_loss_mlp": 1.04149103, - "epoch": 0.06403126409138735, - "flos": 19572824206080.0, - "grad_norm": 3.1552731138796215, - "language_loss": 0.84327948, - "learning_rate": 3.987864361045851e-06, - "loss": 0.8659324, - "num_input_tokens_seen": 22754745, - "step": 1065, - "time_per_iteration": 2.6956398487091064 - }, - { - "auxiliary_loss_clip": 0.01168098, - "auxiliary_loss_mlp": 0.01054905, - "balance_loss_clip": 1.0597136, - "balance_loss_mlp": 1.03162324, - "epoch": 0.06409138734405531, - "flos": 40807413267840.0, - "grad_norm": 1.52830872536012, - "language_loss": 0.68177885, - "learning_rate": 3.987821484659211e-06, - "loss": 0.70400894, - "num_input_tokens_seen": 22776780, - "step": 1066, - "time_per_iteration": 2.9867773056030273 - }, - { - "auxiliary_loss_clip": 0.01214184, - "auxiliary_loss_mlp": 0.01070649, - "balance_loss_clip": 1.06780005, - "balance_loss_mlp": 1.04609215, - "epoch": 0.06415151059672328, - "flos": 20441610460800.0, - "grad_norm": 1.8546001537284342, - "language_loss": 0.90349269, - "learning_rate": 3.987778532894181e-06, - "loss": 0.926341, - "num_input_tokens_seen": 22793915, - "step": 1067, - "time_per_iteration": 2.685896873474121 - }, - { - "auxiliary_loss_clip": 0.01188134, - "auxiliary_loss_mlp": 0.01063022, - "balance_loss_clip": 1.0623709, - "balance_loss_mlp": 1.03969264, - "epoch": 0.06421163384939126, - "flos": 18072045129600.0, - "grad_norm": 2.189788428445167, - "language_loss": 0.83437371, - "learning_rate": 3.987735505752391e-06, - "loss": 0.85688531, - "num_input_tokens_seen": 22812670, - "step": 1068, - "time_per_iteration": 2.851602554321289 - }, - { - "auxiliary_loss_clip": 0.01178972, - "auxiliary_loss_mlp": 0.01057745, - "balance_loss_clip": 1.05909026, - "balance_loss_mlp": 1.03426039, - "epoch": 0.06427175710205922, - "flos": 25119442563840.0, - "grad_norm": 3.045176948020938, - "language_loss": 0.89311272, - "learning_rate": 3.987692403235471e-06, - "loss": 0.9154799, - "num_input_tokens_seen": 22832440, - "step": 1069, - "time_per_iteration": 2.7825255393981934 - }, - { - "auxiliary_loss_clip": 0.01185672, - "auxiliary_loss_mlp": 0.01071834, - "balance_loss_clip": 1.06158304, - "balance_loss_mlp": 1.04663301, - "epoch": 0.06433188035472719, - "flos": 17380549428480.0, - "grad_norm": 2.7038488706194808, - "language_loss": 0.95759481, - "learning_rate": 3.987649225345056e-06, - "loss": 0.98016989, - "num_input_tokens_seen": 22845495, - "step": 1070, - "time_per_iteration": 2.715296506881714 - }, - { - "auxiliary_loss_clip": 0.01140792, - "auxiliary_loss_mlp": 0.01056718, - "balance_loss_clip": 1.05607581, - "balance_loss_mlp": 1.03027749, - "epoch": 0.06439200360739517, - "flos": 23546267625600.0, - "grad_norm": 1.630790580283393, - "language_loss": 0.8811003, - "learning_rate": 3.987605972082782e-06, - "loss": 0.90307534, - "num_input_tokens_seen": 22865390, - "step": 1071, - "time_per_iteration": 2.8445394039154053 - }, - { - "auxiliary_loss_clip": 0.01155172, - "auxiliary_loss_mlp": 0.01054986, - "balance_loss_clip": 1.05483651, - "balance_loss_mlp": 1.03102481, - "epoch": 0.06445212686006313, - "flos": 21979772616960.0, - "grad_norm": 1.8349443396730127, - "language_loss": 0.76116478, - "learning_rate": 3.987562643450292e-06, - "loss": 0.78326637, - "num_input_tokens_seen": 22885495, - "step": 1072, - "time_per_iteration": 2.8330819606781006 - }, - { - "auxiliary_loss_clip": 0.01172997, - "auxiliary_loss_mlp": 0.01070104, - "balance_loss_clip": 1.05975842, - "balance_loss_mlp": 1.04362798, - "epoch": 0.0645122501127311, - "flos": 25921291824000.0, - "grad_norm": 2.724283900767911, - "language_loss": 0.80849886, - "learning_rate": 3.987519239449226e-06, - "loss": 0.83092993, - "num_input_tokens_seen": 22904845, - "step": 1073, - "time_per_iteration": 2.748286247253418 - }, - { - "auxiliary_loss_clip": 0.01194712, - "auxiliary_loss_mlp": 0.01062452, - "balance_loss_clip": 1.06345201, - "balance_loss_mlp": 1.03825283, - "epoch": 0.06457237336539907, - "flos": 25626034028160.0, - "grad_norm": 5.0538746884234245, - "language_loss": 0.80282539, - "learning_rate": 3.987475760081233e-06, - "loss": 0.82539707, - "num_input_tokens_seen": 22925940, - "step": 1074, - "time_per_iteration": 2.7482337951660156 - }, - { - "auxiliary_loss_clip": 0.01173366, - "auxiliary_loss_mlp": 0.01057774, - "balance_loss_clip": 1.05920076, - "balance_loss_mlp": 1.03256142, - "epoch": 0.06463249661806704, - "flos": 19463979018240.0, - "grad_norm": 2.0209756517373707, - "language_loss": 0.79249811, - "learning_rate": 3.987432205347958e-06, - "loss": 0.8148095, - "num_input_tokens_seen": 22944375, - "step": 1075, - "time_per_iteration": 2.6937224864959717 - }, - { - "auxiliary_loss_clip": 0.01171297, - "auxiliary_loss_mlp": 0.01063569, - "balance_loss_clip": 1.05735481, - "balance_loss_mlp": 1.04025126, - "epoch": 0.064692619870735, - "flos": 24498044254080.0, - "grad_norm": 2.9028991302223357, - "language_loss": 0.88208115, - "learning_rate": 3.987388575251055e-06, - "loss": 0.90442967, - "num_input_tokens_seen": 22959145, - "step": 1076, - "time_per_iteration": 2.878103256225586 - }, - { - "auxiliary_loss_clip": 0.01192915, - "auxiliary_loss_mlp": 0.01052877, - "balance_loss_clip": 1.06164443, - "balance_loss_mlp": 1.0288558, - "epoch": 0.06475274312340297, - "flos": 17018677860480.0, - "grad_norm": 2.225760792581628, - "language_loss": 0.80876106, - "learning_rate": 3.98734486979218e-06, - "loss": 0.83121902, - "num_input_tokens_seen": 22978100, - "step": 1077, - "time_per_iteration": 2.7221076488494873 - }, - { - "auxiliary_loss_clip": 0.01200466, - "auxiliary_loss_mlp": 0.01064019, - "balance_loss_clip": 1.0656153, - "balance_loss_mlp": 1.03866291, - "epoch": 0.06481286637607095, - "flos": 24572379450240.0, - "grad_norm": 2.256787147683815, - "language_loss": 0.91727465, - "learning_rate": 3.987301088972986e-06, - "loss": 0.93991947, - "num_input_tokens_seen": 22997285, - "step": 1078, - "time_per_iteration": 2.862365484237671 - }, - { - "auxiliary_loss_clip": 0.0122435, - "auxiliary_loss_mlp": 0.01060225, - "balance_loss_clip": 1.06826639, - "balance_loss_mlp": 1.03552508, - "epoch": 0.06487298962873891, - "flos": 21105635235840.0, - "grad_norm": 2.080056711608912, - "language_loss": 0.78349572, - "learning_rate": 3.987257232795137e-06, - "loss": 0.80634147, - "num_input_tokens_seen": 23016285, - "step": 1079, - "time_per_iteration": 2.6435368061065674 - }, - { - "auxiliary_loss_clip": 0.01156927, - "auxiliary_loss_mlp": 0.01063794, - "balance_loss_clip": 1.05512071, - "balance_loss_mlp": 1.03899896, - "epoch": 0.06493311288140688, - "flos": 24608182331520.0, - "grad_norm": 2.274862403364013, - "language_loss": 0.68702769, - "learning_rate": 3.987213301260294e-06, - "loss": 0.70923495, - "num_input_tokens_seen": 23036420, - "step": 1080, - "time_per_iteration": 2.7782626152038574 - }, - { - "auxiliary_loss_clip": 0.01175684, - "auxiliary_loss_mlp": 0.01062351, - "balance_loss_clip": 1.06640029, - "balance_loss_mlp": 1.03610086, - "epoch": 0.06499323613407486, - "flos": 25337994865920.0, - "grad_norm": 1.886196453243775, - "language_loss": 0.72291583, - "learning_rate": 3.987169294370123e-06, - "loss": 0.74529618, - "num_input_tokens_seen": 23056945, - "step": 1081, - "time_per_iteration": 2.7983880043029785 - }, - { - "auxiliary_loss_clip": 0.01139671, - "auxiliary_loss_mlp": 0.01066686, - "balance_loss_clip": 1.0504055, - "balance_loss_mlp": 1.04076982, - "epoch": 0.06505335938674282, - "flos": 20375714960640.0, - "grad_norm": 3.3093934650613566, - "language_loss": 0.84059012, - "learning_rate": 3.987125212126294e-06, - "loss": 0.86265367, - "num_input_tokens_seen": 23074940, - "step": 1082, - "time_per_iteration": 2.8351900577545166 - }, - { - "auxiliary_loss_clip": 0.01204185, - "auxiliary_loss_mlp": 0.01063692, - "balance_loss_clip": 1.06306195, - "balance_loss_mlp": 1.03809738, - "epoch": 0.06511348263941079, - "flos": 25337923038720.0, - "grad_norm": 2.894360492506304, - "language_loss": 0.82550305, - "learning_rate": 3.987081054530478e-06, - "loss": 0.84818184, - "num_input_tokens_seen": 23093420, - "step": 1083, - "time_per_iteration": 2.866729974746704 - }, - { - "auxiliary_loss_clip": 0.01168245, - "auxiliary_loss_mlp": 0.01062938, - "balance_loss_clip": 1.06021011, - "balance_loss_mlp": 1.03655696, - "epoch": 0.06517360589207877, - "flos": 20332801186560.0, - "grad_norm": 2.468736383036802, - "language_loss": 0.79289383, - "learning_rate": 3.987036821584348e-06, - "loss": 0.81520569, - "num_input_tokens_seen": 23111550, - "step": 1084, - "time_per_iteration": 2.816601276397705 - }, - { - "auxiliary_loss_clip": 0.01174068, - "auxiliary_loss_mlp": 0.0106167, - "balance_loss_clip": 1.05854714, - "balance_loss_mlp": 1.03667152, - "epoch": 0.06523372914474673, - "flos": 31681650061440.0, - "grad_norm": 2.571590277205686, - "language_loss": 0.66443276, - "learning_rate": 3.986992513289584e-06, - "loss": 0.68679011, - "num_input_tokens_seen": 23130335, - "step": 1085, - "time_per_iteration": 2.8260092735290527 - }, - { - "auxiliary_loss_clip": 0.01170818, - "auxiliary_loss_mlp": 0.01062435, - "balance_loss_clip": 1.0600934, - "balance_loss_mlp": 1.03833067, - "epoch": 0.0652938523974147, - "flos": 20778165918720.0, - "grad_norm": 2.0478791529086977, - "language_loss": 0.76548934, - "learning_rate": 3.9869481296478645e-06, - "loss": 0.78782183, - "num_input_tokens_seen": 23152380, - "step": 1086, - "time_per_iteration": 2.7937023639678955 - }, - { - "auxiliary_loss_clip": 0.01198609, - "auxiliary_loss_mlp": 0.01059288, - "balance_loss_clip": 1.06335294, - "balance_loss_mlp": 1.03519547, - "epoch": 0.06535397565008266, - "flos": 16690993061760.0, - "grad_norm": 2.1629448601391017, - "language_loss": 0.85109925, - "learning_rate": 3.986903670660872e-06, - "loss": 0.87367821, - "num_input_tokens_seen": 23171630, - "step": 1087, - "time_per_iteration": 2.7510013580322266 - }, - { - "auxiliary_loss_clip": 0.01184978, - "auxiliary_loss_mlp": 0.01059017, - "balance_loss_clip": 1.06293821, - "balance_loss_mlp": 1.03510392, - "epoch": 0.06541409890275064, - "flos": 26868220116480.0, - "grad_norm": 1.7886353193129139, - "language_loss": 0.77776635, - "learning_rate": 3.9868591363302945e-06, - "loss": 0.80020636, - "num_input_tokens_seen": 23192520, - "step": 1088, - "time_per_iteration": 2.7792751789093018 - }, - { - "auxiliary_loss_clip": 0.01192707, - "auxiliary_loss_mlp": 0.01067634, - "balance_loss_clip": 1.06569457, - "balance_loss_mlp": 1.04498422, - "epoch": 0.06547422215541861, - "flos": 20521620005760.0, - "grad_norm": 3.0334087154373375, - "language_loss": 0.71050513, - "learning_rate": 3.9868145266578186e-06, - "loss": 0.73310852, - "num_input_tokens_seen": 23210710, - "step": 1089, - "time_per_iteration": 2.8832852840423584 - }, - { - "auxiliary_loss_clip": 0.01173663, - "auxiliary_loss_mlp": 0.00781529, - "balance_loss_clip": 1.06159782, - "balance_loss_mlp": 1.00019014, - "epoch": 0.06553434540808657, - "flos": 22016616992640.0, - "grad_norm": 2.02973275746688, - "language_loss": 0.85650897, - "learning_rate": 3.9867698416451366e-06, - "loss": 0.87606084, - "num_input_tokens_seen": 23230305, - "step": 1090, - "time_per_iteration": 2.7933149337768555 - }, - { - "auxiliary_loss_clip": 0.01214666, - "auxiliary_loss_mlp": 0.0105885, - "balance_loss_clip": 1.06735325, - "balance_loss_mlp": 1.03460288, - "epoch": 0.06559446866075455, - "flos": 24608649208320.0, - "grad_norm": 2.137212216289862, - "language_loss": 0.71829313, - "learning_rate": 3.9867250812939434e-06, - "loss": 0.74102825, - "num_input_tokens_seen": 23249015, - "step": 1091, - "time_per_iteration": 2.646592855453491 - }, - { - "auxiliary_loss_clip": 0.01121055, - "auxiliary_loss_mlp": 0.0106405, - "balance_loss_clip": 1.05242276, - "balance_loss_mlp": 1.03961205, - "epoch": 0.06565459191342252, - "flos": 24274679529600.0, - "grad_norm": 2.2773849385721956, - "language_loss": 0.82839823, - "learning_rate": 3.986680245605936e-06, - "loss": 0.85024923, - "num_input_tokens_seen": 23265105, - "step": 1092, - "time_per_iteration": 4.799649715423584 - }, - { - "auxiliary_loss_clip": 0.01215092, - "auxiliary_loss_mlp": 0.01059151, - "balance_loss_clip": 1.0640471, - "balance_loss_mlp": 1.03352082, - "epoch": 0.06571471516609048, - "flos": 24787124910720.0, - "grad_norm": 2.268968080418226, - "language_loss": 0.71134168, - "learning_rate": 3.986635334582814e-06, - "loss": 0.73408413, - "num_input_tokens_seen": 23283950, - "step": 1093, - "time_per_iteration": 5.3356239795684814 - }, - { - "auxiliary_loss_clip": 0.01190682, - "auxiliary_loss_mlp": 0.01064498, - "balance_loss_clip": 1.06751943, - "balance_loss_mlp": 1.0392611, - "epoch": 0.06577483841875846, - "flos": 26214071581440.0, - "grad_norm": 3.829837904337144, - "language_loss": 0.87996346, - "learning_rate": 3.986590348226282e-06, - "loss": 0.90251523, - "num_input_tokens_seen": 23305005, - "step": 1094, - "time_per_iteration": 2.853489637374878 - }, - { - "auxiliary_loss_clip": 0.01192742, - "auxiliary_loss_mlp": 0.01065068, - "balance_loss_clip": 1.06367433, - "balance_loss_mlp": 1.03843689, - "epoch": 0.06583496167142643, - "flos": 25080802508160.0, - "grad_norm": 1.6736216436017588, - "language_loss": 0.81483954, - "learning_rate": 3.986545286538044e-06, - "loss": 0.8374176, - "num_input_tokens_seen": 23323220, - "step": 1095, - "time_per_iteration": 5.1613922119140625 - }, - { - "auxiliary_loss_clip": 0.01166049, - "auxiliary_loss_mlp": 0.01058945, - "balance_loss_clip": 1.06295943, - "balance_loss_mlp": 1.03598547, - "epoch": 0.06589508492409439, - "flos": 25629804956160.0, - "grad_norm": 2.0200125290673068, - "language_loss": 0.69789279, - "learning_rate": 3.986500149519811e-06, - "loss": 0.72014272, - "num_input_tokens_seen": 23342235, - "step": 1096, - "time_per_iteration": 2.804025173187256 - }, - { - "auxiliary_loss_clip": 0.01201939, - "auxiliary_loss_mlp": 0.01070786, - "balance_loss_clip": 1.06405246, - "balance_loss_mlp": 1.04614568, - "epoch": 0.06595520817676236, - "flos": 23621249266560.0, - "grad_norm": 1.7011375462517908, - "language_loss": 0.77430046, - "learning_rate": 3.986454937173292e-06, - "loss": 0.79702777, - "num_input_tokens_seen": 23363680, - "step": 1097, - "time_per_iteration": 2.7658958435058594 - }, - { - "auxiliary_loss_clip": 0.01215996, - "auxiliary_loss_mlp": 0.01063445, - "balance_loss_clip": 1.06707537, - "balance_loss_mlp": 1.03959155, - "epoch": 0.06601533142943034, - "flos": 33801708545280.0, - "grad_norm": 1.8316558452843608, - "language_loss": 0.78217584, - "learning_rate": 3.986409649500203e-06, - "loss": 0.80497026, - "num_input_tokens_seen": 23385590, - "step": 1098, - "time_per_iteration": 2.865684747695923 - }, - { - "auxiliary_loss_clip": 0.01197349, - "auxiliary_loss_mlp": 0.01069192, - "balance_loss_clip": 1.06328607, - "balance_loss_mlp": 1.04443276, - "epoch": 0.0660754546820983, - "flos": 20259184262400.0, - "grad_norm": 1.9237510259783663, - "language_loss": 0.81525648, - "learning_rate": 3.986364286502261e-06, - "loss": 0.83792192, - "num_input_tokens_seen": 23402945, - "step": 1099, - "time_per_iteration": 2.690377950668335 - }, - { - "auxiliary_loss_clip": 0.01179995, - "auxiliary_loss_mlp": 0.0105819, - "balance_loss_clip": 1.0578239, - "balance_loss_mlp": 1.03428841, - "epoch": 0.06613557793476627, - "flos": 19354164163200.0, - "grad_norm": 1.9906927310803755, - "language_loss": 0.82793295, - "learning_rate": 3.986318848181186e-06, - "loss": 0.8503148, - "num_input_tokens_seen": 23421410, - "step": 1100, - "time_per_iteration": 2.7613909244537354 - }, - { - "auxiliary_loss_clip": 0.01191263, - "auxiliary_loss_mlp": 0.0105903, - "balance_loss_clip": 1.06985724, - "balance_loss_mlp": 1.03529549, - "epoch": 0.06619570118743424, - "flos": 13772568936960.0, - "grad_norm": 2.079994286400427, - "language_loss": 0.73502243, - "learning_rate": 3.986273334538702e-06, - "loss": 0.75752538, - "num_input_tokens_seen": 23438870, - "step": 1101, - "time_per_iteration": 2.7795870304107666 - }, - { - "auxiliary_loss_clip": 0.01199256, - "auxiliary_loss_mlp": 0.01061171, - "balance_loss_clip": 1.06278944, - "balance_loss_mlp": 1.03773487, - "epoch": 0.06625582444010221, - "flos": 17857874286720.0, - "grad_norm": 2.875757629612747, - "language_loss": 0.85861301, - "learning_rate": 3.986227745576533e-06, - "loss": 0.88121736, - "num_input_tokens_seen": 23456975, - "step": 1102, - "time_per_iteration": 2.737269401550293 - }, - { - "auxiliary_loss_clip": 0.01191982, - "auxiliary_loss_mlp": 0.01058639, - "balance_loss_clip": 1.06898165, - "balance_loss_mlp": 1.03410578, - "epoch": 0.06631594769277017, - "flos": 11838707579520.0, - "grad_norm": 2.8924251757501778, - "language_loss": 0.81655926, - "learning_rate": 3.98618208129641e-06, - "loss": 0.83906543, - "num_input_tokens_seen": 23473440, - "step": 1103, - "time_per_iteration": 2.9345293045043945 - }, - { - "auxiliary_loss_clip": 0.01203522, - "auxiliary_loss_mlp": 0.00780451, - "balance_loss_clip": 1.06721628, - "balance_loss_mlp": 1.00042021, - "epoch": 0.06637607094543815, - "flos": 19793351756160.0, - "grad_norm": 5.176370819061919, - "language_loss": 0.81749105, - "learning_rate": 3.986136341700063e-06, - "loss": 0.83733076, - "num_input_tokens_seen": 23493880, - "step": 1104, - "time_per_iteration": 2.753657102584839 - }, - { - "auxiliary_loss_clip": 0.0116508, - "auxiliary_loss_mlp": 0.01050687, - "balance_loss_clip": 1.0576005, - "balance_loss_mlp": 1.02608228, - "epoch": 0.06643619419810612, - "flos": 25485659677440.0, - "grad_norm": 1.5448539486038575, - "language_loss": 0.80422902, - "learning_rate": 3.986090526789227e-06, - "loss": 0.82638663, - "num_input_tokens_seen": 23514920, - "step": 1105, - "time_per_iteration": 2.8904521465301514 - }, - { - "auxiliary_loss_clip": 0.01179397, - "auxiliary_loss_mlp": 0.0106197, - "balance_loss_clip": 1.06348729, - "balance_loss_mlp": 1.0391891, - "epoch": 0.06649631745077408, - "flos": 16946533393920.0, - "grad_norm": 2.7426455725749896, - "language_loss": 0.96762037, - "learning_rate": 3.986044636565639e-06, - "loss": 0.99003398, - "num_input_tokens_seen": 23531635, - "step": 1106, - "time_per_iteration": 2.890073299407959 - }, - { - "auxiliary_loss_clip": 0.01198065, - "auxiliary_loss_mlp": 0.01059975, - "balance_loss_clip": 1.06069684, - "balance_loss_mlp": 1.03511953, - "epoch": 0.06655644070344206, - "flos": 17858592558720.0, - "grad_norm": 1.9297768479693453, - "language_loss": 0.82528949, - "learning_rate": 3.985998671031039e-06, - "loss": 0.84786987, - "num_input_tokens_seen": 23551020, - "step": 1107, - "time_per_iteration": 2.778857469558716 - }, - { - "auxiliary_loss_clip": 0.01104176, - "auxiliary_loss_mlp": 0.01010935, - "balance_loss_clip": 1.04708242, - "balance_loss_mlp": 1.0072155, - "epoch": 0.06661656395611003, - "flos": 61419350021760.0, - "grad_norm": 0.7967940032222198, - "language_loss": 0.56789279, - "learning_rate": 3.9859526301871705e-06, - "loss": 0.58904392, - "num_input_tokens_seen": 23610675, - "step": 1108, - "time_per_iteration": 3.2717819213867188 - }, - { - "auxiliary_loss_clip": 0.0118327, - "auxiliary_loss_mlp": 0.01062625, - "balance_loss_clip": 1.05651307, - "balance_loss_mlp": 1.0376507, - "epoch": 0.066676687208778, - "flos": 20662856282880.0, - "grad_norm": 2.682842555407744, - "language_loss": 0.7287578, - "learning_rate": 3.9859065140357795e-06, - "loss": 0.75121677, - "num_input_tokens_seen": 23628710, - "step": 1109, - "time_per_iteration": 2.829623222351074 - }, - { - "auxiliary_loss_clip": 0.01148971, - "auxiliary_loss_mlp": 0.01071895, - "balance_loss_clip": 1.05459642, - "balance_loss_mlp": 1.04714715, - "epoch": 0.06673681046144596, - "flos": 20923280864640.0, - "grad_norm": 1.7914435942805436, - "language_loss": 0.78140426, - "learning_rate": 3.985860322578614e-06, - "loss": 0.80361295, - "num_input_tokens_seen": 23649160, - "step": 1110, - "time_per_iteration": 2.892786741256714 - }, - { - "auxiliary_loss_clip": 0.01153553, - "auxiliary_loss_mlp": 0.0106147, - "balance_loss_clip": 1.05590594, - "balance_loss_mlp": 1.03700781, - "epoch": 0.06679693371411394, - "flos": 31065818359680.0, - "grad_norm": 2.5260725451831805, - "language_loss": 0.71425366, - "learning_rate": 3.985814055817427e-06, - "loss": 0.73640382, - "num_input_tokens_seen": 23671995, - "step": 1111, - "time_per_iteration": 2.9349052906036377 - }, - { - "auxiliary_loss_clip": 0.01170538, - "auxiliary_loss_mlp": 0.01066103, - "balance_loss_clip": 1.05776191, - "balance_loss_mlp": 1.04199934, - "epoch": 0.0668570569667819, - "flos": 21726135705600.0, - "grad_norm": 1.8396663794990693, - "language_loss": 0.78767776, - "learning_rate": 3.985767713753971e-06, - "loss": 0.81004417, - "num_input_tokens_seen": 23690705, - "step": 1112, - "time_per_iteration": 2.8676345348358154 - }, - { - "auxiliary_loss_clip": 0.01153291, - "auxiliary_loss_mlp": 0.01065421, - "balance_loss_clip": 1.05340791, - "balance_loss_mlp": 1.04163861, - "epoch": 0.06691718021944987, - "flos": 22747255539840.0, - "grad_norm": 2.071048188460824, - "language_loss": 0.78481978, - "learning_rate": 3.985721296390005e-06, - "loss": 0.80700684, - "num_input_tokens_seen": 23709990, - "step": 1113, - "time_per_iteration": 2.8688411712646484 - }, - { - "auxiliary_loss_clip": 0.0114872, - "auxiliary_loss_mlp": 0.01057074, - "balance_loss_clip": 1.05157375, - "balance_loss_mlp": 1.03376842, - "epoch": 0.06697730347211785, - "flos": 16545626720640.0, - "grad_norm": 1.7560007918285245, - "language_loss": 0.82399213, - "learning_rate": 3.985674803727289e-06, - "loss": 0.84605002, - "num_input_tokens_seen": 23728485, - "step": 1114, - "time_per_iteration": 2.832458019256592 - }, - { - "auxiliary_loss_clip": 0.01075626, - "auxiliary_loss_mlp": 0.01006906, - "balance_loss_clip": 1.04995251, - "balance_loss_mlp": 1.00271022, - "epoch": 0.06703742672478581, - "flos": 59782326658560.0, - "grad_norm": 0.8370646888074905, - "language_loss": 0.58147323, - "learning_rate": 3.985628235767584e-06, - "loss": 0.60229862, - "num_input_tokens_seen": 23786650, - "step": 1115, - "time_per_iteration": 3.550837755203247 - }, - { - "auxiliary_loss_clip": 0.01177193, - "auxiliary_loss_mlp": 0.01059174, - "balance_loss_clip": 1.05986214, - "balance_loss_mlp": 1.03381801, - "epoch": 0.06709754997745378, - "flos": 16800197385600.0, - "grad_norm": 2.8944873563712235, - "language_loss": 0.91280693, - "learning_rate": 3.985581592512658e-06, - "loss": 0.93517065, - "num_input_tokens_seen": 23802555, - "step": 1116, - "time_per_iteration": 2.994608163833618 - }, - { - "auxiliary_loss_clip": 0.01169376, - "auxiliary_loss_mlp": 0.0078227, - "balance_loss_clip": 1.05839634, - "balance_loss_mlp": 1.00045347, - "epoch": 0.06715767323012176, - "flos": 22123917895680.0, - "grad_norm": 1.9249158333763592, - "language_loss": 0.87154609, - "learning_rate": 3.985534873964279e-06, - "loss": 0.89106256, - "num_input_tokens_seen": 23822945, - "step": 1117, - "time_per_iteration": 2.794400453567505 - }, - { - "auxiliary_loss_clip": 0.01095782, - "auxiliary_loss_mlp": 0.01003785, - "balance_loss_clip": 1.0387876, - "balance_loss_mlp": 0.99963647, - "epoch": 0.06721779648278972, - "flos": 66618100137600.0, - "grad_norm": 0.8644388721740246, - "language_loss": 0.5981611, - "learning_rate": 3.985488080124218e-06, - "loss": 0.61915678, - "num_input_tokens_seen": 23874075, - "step": 1118, - "time_per_iteration": 3.1695809364318848 - }, - { - "auxiliary_loss_clip": 0.01178972, - "auxiliary_loss_mlp": 0.01051993, - "balance_loss_clip": 1.05301392, - "balance_loss_mlp": 1.02780545, - "epoch": 0.06727791973545769, - "flos": 22382474970240.0, - "grad_norm": 3.6923711141076447, - "language_loss": 0.83045954, - "learning_rate": 3.985441210994251e-06, - "loss": 0.85276914, - "num_input_tokens_seen": 23889720, - "step": 1119, - "time_per_iteration": 2.7538814544677734 - }, - { - "auxiliary_loss_clip": 0.01182384, - "auxiliary_loss_mlp": 0.01058422, - "balance_loss_clip": 1.06102347, - "balance_loss_mlp": 1.03566504, - "epoch": 0.06733804298812565, - "flos": 24280210224000.0, - "grad_norm": 4.541743494234462, - "language_loss": 0.8451674, - "learning_rate": 3.9853942665761545e-06, - "loss": 0.86757541, - "num_input_tokens_seen": 23909385, - "step": 1120, - "time_per_iteration": 2.76581072807312 - }, - { - "auxiliary_loss_clip": 0.0121565, - "auxiliary_loss_mlp": 0.01064916, - "balance_loss_clip": 1.06757379, - "balance_loss_mlp": 1.04028773, - "epoch": 0.06739816624079363, - "flos": 15918230839680.0, - "grad_norm": 2.503866645162978, - "language_loss": 0.78722781, - "learning_rate": 3.985347246871708e-06, - "loss": 0.81003344, - "num_input_tokens_seen": 23926830, - "step": 1121, - "time_per_iteration": 2.651175022125244 - }, - { - "auxiliary_loss_clip": 0.01080914, - "auxiliary_loss_mlp": 0.01011889, - "balance_loss_clip": 1.03108025, - "balance_loss_mlp": 1.00802636, - "epoch": 0.0674582894934616, - "flos": 71398567353600.0, - "grad_norm": 0.7540288133642103, - "language_loss": 0.58320796, - "learning_rate": 3.985300151882694e-06, - "loss": 0.60413599, - "num_input_tokens_seen": 23992640, - "step": 1122, - "time_per_iteration": 3.3794541358947754 - }, - { - "auxiliary_loss_clip": 0.01145486, - "auxiliary_loss_mlp": 0.01066136, - "balance_loss_clip": 1.05581403, - "balance_loss_mlp": 1.04167438, - "epoch": 0.06751841274612956, - "flos": 25264952559360.0, - "grad_norm": 2.3361170394687076, - "language_loss": 0.71965349, - "learning_rate": 3.985252981610901e-06, - "loss": 0.74176967, - "num_input_tokens_seen": 24011135, - "step": 1123, - "time_per_iteration": 2.8049354553222656 - }, - { - "auxiliary_loss_clip": 0.01144994, - "auxiliary_loss_mlp": 0.01064196, - "balance_loss_clip": 1.05373979, - "balance_loss_mlp": 1.03612232, - "epoch": 0.06757853599879754, - "flos": 23802741711360.0, - "grad_norm": 1.7380479869896208, - "language_loss": 0.78987843, - "learning_rate": 3.985205736058114e-06, - "loss": 0.81197035, - "num_input_tokens_seen": 24030695, - "step": 1124, - "time_per_iteration": 2.8595056533813477 - }, - { - "auxiliary_loss_clip": 0.01189686, - "auxiliary_loss_mlp": 0.01055169, - "balance_loss_clip": 1.05663013, - "balance_loss_mlp": 1.03200674, - "epoch": 0.0676386592514655, - "flos": 21033742164480.0, - "grad_norm": 3.1450673626590793, - "language_loss": 0.70999855, - "learning_rate": 3.985158415226128e-06, - "loss": 0.73244709, - "num_input_tokens_seen": 24050680, - "step": 1125, - "time_per_iteration": 2.726163625717163 - }, - { - "auxiliary_loss_clip": 0.01165518, - "auxiliary_loss_mlp": 0.01068918, - "balance_loss_clip": 1.05826426, - "balance_loss_mlp": 1.04290628, - "epoch": 0.06769878250413347, - "flos": 25556331686400.0, - "grad_norm": 3.340323364887528, - "language_loss": 0.81440383, - "learning_rate": 3.985111019116736e-06, - "loss": 0.83674812, - "num_input_tokens_seen": 24067205, - "step": 1126, - "time_per_iteration": 2.7356598377227783 - }, - { - "auxiliary_loss_clip": 0.0107201, - "auxiliary_loss_mlp": 0.01004999, - "balance_loss_clip": 1.0293622, - "balance_loss_mlp": 1.00092208, - "epoch": 0.06775890575680145, - "flos": 70655251305600.0, - "grad_norm": 0.77802311726495, - "language_loss": 0.59720373, - "learning_rate": 3.985063547731735e-06, - "loss": 0.6179738, - "num_input_tokens_seen": 24131320, - "step": 1127, - "time_per_iteration": 3.2627320289611816 - }, - { - "auxiliary_loss_clip": 0.01206438, - "auxiliary_loss_mlp": 0.01055509, - "balance_loss_clip": 1.06308687, - "balance_loss_mlp": 1.03189397, - "epoch": 0.06781902900946941, - "flos": 24235500769920.0, - "grad_norm": 2.2535941175889054, - "language_loss": 0.81097019, - "learning_rate": 3.985016001072925e-06, - "loss": 0.83358967, - "num_input_tokens_seen": 24149930, - "step": 1128, - "time_per_iteration": 2.6652371883392334 - }, - { - "auxiliary_loss_clip": 0.01158345, - "auxiliary_loss_mlp": 0.01052658, - "balance_loss_clip": 1.05360484, - "balance_loss_mlp": 1.02804112, - "epoch": 0.06787915226213738, - "flos": 22417523665920.0, - "grad_norm": 2.24200367657907, - "language_loss": 0.75559127, - "learning_rate": 3.984968379142109e-06, - "loss": 0.77770138, - "num_input_tokens_seen": 24169590, - "step": 1129, - "time_per_iteration": 2.7023732662200928 - }, - { - "auxiliary_loss_clip": 0.01117595, - "auxiliary_loss_mlp": 0.01053995, - "balance_loss_clip": 1.04627228, - "balance_loss_mlp": 1.03006983, - "epoch": 0.06793927551480534, - "flos": 37706922080640.0, - "grad_norm": 1.890559803272908, - "language_loss": 0.71710479, - "learning_rate": 3.984920681941094e-06, - "loss": 0.73882067, - "num_input_tokens_seen": 24189965, - "step": 1130, - "time_per_iteration": 3.0757689476013184 - }, - { - "auxiliary_loss_clip": 0.01158117, - "auxiliary_loss_mlp": 0.010592, - "balance_loss_clip": 1.05734515, - "balance_loss_mlp": 1.03481019, - "epoch": 0.06799939876747332, - "flos": 20631398947200.0, - "grad_norm": 2.24421862356218, - "language_loss": 0.80776262, - "learning_rate": 3.984872909471688e-06, - "loss": 0.82993579, - "num_input_tokens_seen": 24208045, - "step": 1131, - "time_per_iteration": 5.00832724571228 - }, - { - "auxiliary_loss_clip": 0.01195331, - "auxiliary_loss_mlp": 0.01070142, - "balance_loss_clip": 1.06155944, - "balance_loss_mlp": 1.04614532, - "epoch": 0.06805952202014129, - "flos": 14864755829760.0, - "grad_norm": 2.0533244923502463, - "language_loss": 0.80371779, - "learning_rate": 3.984825061735701e-06, - "loss": 0.8263725, - "num_input_tokens_seen": 24223805, - "step": 1132, - "time_per_iteration": 4.487931251525879 - }, - { - "auxiliary_loss_clip": 0.01170581, - "auxiliary_loss_mlp": 0.01061867, - "balance_loss_clip": 1.05438542, - "balance_loss_mlp": 1.03756022, - "epoch": 0.06811964527280925, - "flos": 48909434947200.0, - "grad_norm": 1.7182324226465766, - "language_loss": 0.6341064, - "learning_rate": 3.9847771387349495e-06, - "loss": 0.65643084, - "num_input_tokens_seen": 24249475, - "step": 1133, - "time_per_iteration": 4.48089337348938 - }, - { - "auxiliary_loss_clip": 0.01125599, - "auxiliary_loss_mlp": 0.01055984, - "balance_loss_clip": 1.04700482, - "balance_loss_mlp": 1.02973366, - "epoch": 0.06817976852547723, - "flos": 15377273038080.0, - "grad_norm": 1.9264963116598819, - "language_loss": 0.74771935, - "learning_rate": 3.9847291404712506e-06, - "loss": 0.76953518, - "num_input_tokens_seen": 24267980, - "step": 1134, - "time_per_iteration": 5.287277936935425 - }, - { - "auxiliary_loss_clip": 0.01169269, - "auxiliary_loss_mlp": 0.00782536, - "balance_loss_clip": 1.05878353, - "balance_loss_mlp": 1.00042605, - "epoch": 0.0682398917781452, - "flos": 20155690200960.0, - "grad_norm": 2.151108605399924, - "language_loss": 0.86871451, - "learning_rate": 3.984681066946423e-06, - "loss": 0.88823259, - "num_input_tokens_seen": 24286805, - "step": 1135, - "time_per_iteration": 2.8024110794067383 - }, - { - "auxiliary_loss_clip": 0.0117656, - "auxiliary_loss_mlp": 0.007818, - "balance_loss_clip": 1.0543226, - "balance_loss_mlp": 1.00046515, - "epoch": 0.06830001503081316, - "flos": 23440618748160.0, - "grad_norm": 2.521942237810997, - "language_loss": 0.78131735, - "learning_rate": 3.984632918162291e-06, - "loss": 0.80090094, - "num_input_tokens_seen": 24305855, - "step": 1136, - "time_per_iteration": 2.7595040798187256 - }, - { - "auxiliary_loss_clip": 0.01185832, - "auxiliary_loss_mlp": 0.01063587, - "balance_loss_clip": 1.05952621, - "balance_loss_mlp": 1.03868449, - "epoch": 0.06836013828348114, - "flos": 34349813153280.0, - "grad_norm": 2.275643110468061, - "language_loss": 0.83968467, - "learning_rate": 3.984584694120679e-06, - "loss": 0.86217892, - "num_input_tokens_seen": 24326535, - "step": 1137, - "time_per_iteration": 2.7738285064697266 - }, - { - "auxiliary_loss_clip": 0.01153105, - "auxiliary_loss_mlp": 0.01059471, - "balance_loss_clip": 1.05239427, - "balance_loss_mlp": 1.0348897, - "epoch": 0.06842026153614911, - "flos": 23148844571520.0, - "grad_norm": 2.068206081593879, - "language_loss": 0.788486, - "learning_rate": 3.984536394823418e-06, - "loss": 0.81061178, - "num_input_tokens_seen": 24345810, - "step": 1138, - "time_per_iteration": 2.804537296295166 - }, - { - "auxiliary_loss_clip": 0.01209658, - "auxiliary_loss_mlp": 0.01058353, - "balance_loss_clip": 1.06288362, - "balance_loss_mlp": 1.03415346, - "epoch": 0.06848038478881707, - "flos": 24608972430720.0, - "grad_norm": 2.3335265924104096, - "language_loss": 0.85507643, - "learning_rate": 3.984488020272336e-06, - "loss": 0.87775654, - "num_input_tokens_seen": 24366095, - "step": 1139, - "time_per_iteration": 2.746884822845459 - }, - { - "auxiliary_loss_clip": 0.01153855, - "auxiliary_loss_mlp": 0.01063721, - "balance_loss_clip": 1.05325532, - "balance_loss_mlp": 1.03679228, - "epoch": 0.06854050804148504, - "flos": 40880994278400.0, - "grad_norm": 1.9254794009430078, - "language_loss": 0.74899161, - "learning_rate": 3.984439570469271e-06, - "loss": 0.7711674, - "num_input_tokens_seen": 24388665, - "step": 1140, - "time_per_iteration": 2.938143253326416 - }, - { - "auxiliary_loss_clip": 0.01186218, - "auxiliary_loss_mlp": 0.00782227, - "balance_loss_clip": 1.06101704, - "balance_loss_mlp": 1.00036597, - "epoch": 0.06860063129415302, - "flos": 31686354743040.0, - "grad_norm": 2.1250887020504767, - "language_loss": 0.68258876, - "learning_rate": 3.9843910454160574e-06, - "loss": 0.70227319, - "num_input_tokens_seen": 24407705, - "step": 1141, - "time_per_iteration": 2.8180530071258545 - }, - { - "auxiliary_loss_clip": 0.01197117, - "auxiliary_loss_mlp": 0.01067748, - "balance_loss_clip": 1.05978489, - "balance_loss_mlp": 1.04266596, - "epoch": 0.06866075454682098, - "flos": 26542007775360.0, - "grad_norm": 1.8460768582410394, - "language_loss": 0.78959155, - "learning_rate": 3.984342445114538e-06, - "loss": 0.81224018, - "num_input_tokens_seen": 24428390, - "step": 1142, - "time_per_iteration": 2.712876558303833 - }, - { - "auxiliary_loss_clip": 0.01186915, - "auxiliary_loss_mlp": 0.01060882, - "balance_loss_clip": 1.06245089, - "balance_loss_mlp": 1.03702831, - "epoch": 0.06872087779948895, - "flos": 29789768724480.0, - "grad_norm": 1.7867268614306446, - "language_loss": 0.68287402, - "learning_rate": 3.984293769566553e-06, - "loss": 0.70535195, - "num_input_tokens_seen": 24450810, - "step": 1143, - "time_per_iteration": 2.752659320831299 - }, - { - "auxiliary_loss_clip": 0.01177843, - "auxiliary_loss_mlp": 0.01059894, - "balance_loss_clip": 1.05798244, - "balance_loss_mlp": 1.03773308, - "epoch": 0.06878100105215693, - "flos": 26941118768640.0, - "grad_norm": 1.7582250309313294, - "language_loss": 0.74307454, - "learning_rate": 3.98424501877395e-06, - "loss": 0.76545191, - "num_input_tokens_seen": 24469965, - "step": 1144, - "time_per_iteration": 2.6448662281036377 - }, - { - "auxiliary_loss_clip": 0.01189197, - "auxiliary_loss_mlp": 0.0106544, - "balance_loss_clip": 1.0565474, - "balance_loss_mlp": 1.04039407, - "epoch": 0.06884112430482489, - "flos": 10670748946560.0, - "grad_norm": 2.699041414372958, - "language_loss": 0.91755033, - "learning_rate": 3.984196192738577e-06, - "loss": 0.94009674, - "num_input_tokens_seen": 24486370, - "step": 1145, - "time_per_iteration": 2.6621482372283936 - }, - { - "auxiliary_loss_clip": 0.01212189, - "auxiliary_loss_mlp": 0.0106819, - "balance_loss_clip": 1.06225932, - "balance_loss_mlp": 1.04258406, - "epoch": 0.06890124755749286, - "flos": 20193647898240.0, - "grad_norm": 2.2014676012481487, - "language_loss": 0.81726635, - "learning_rate": 3.984147291462285e-06, - "loss": 0.84007025, - "num_input_tokens_seen": 24503780, - "step": 1146, - "time_per_iteration": 2.623964548110962 - }, - { - "auxiliary_loss_clip": 0.01204602, - "auxiliary_loss_mlp": 0.01065301, - "balance_loss_clip": 1.06215203, - "balance_loss_mlp": 1.04191244, - "epoch": 0.06896137081016084, - "flos": 20449224144000.0, - "grad_norm": 2.1265245828428108, - "language_loss": 0.84968954, - "learning_rate": 3.98409831494693e-06, - "loss": 0.8723886, - "num_input_tokens_seen": 24522320, - "step": 1147, - "time_per_iteration": 2.5898265838623047 - }, - { - "auxiliary_loss_clip": 0.01156886, - "auxiliary_loss_mlp": 0.01064453, - "balance_loss_clip": 1.05563867, - "balance_loss_mlp": 1.03949046, - "epoch": 0.0690214940628288, - "flos": 18368703555840.0, - "grad_norm": 1.7557033260323716, - "language_loss": 0.86094105, - "learning_rate": 3.984049263194367e-06, - "loss": 0.88315445, - "num_input_tokens_seen": 24540445, - "step": 1148, - "time_per_iteration": 2.748782157897949 - }, - { - "auxiliary_loss_clip": 0.01173365, - "auxiliary_loss_mlp": 0.01060047, - "balance_loss_clip": 1.05569541, - "balance_loss_mlp": 1.03370178, - "epoch": 0.06908161731549677, - "flos": 20558033418240.0, - "grad_norm": 2.322434023005448, - "language_loss": 0.69602191, - "learning_rate": 3.9840001362064575e-06, - "loss": 0.71835601, - "num_input_tokens_seen": 24557105, - "step": 1149, - "time_per_iteration": 2.741854429244995 - }, - { - "auxiliary_loss_clip": 0.01207871, - "auxiliary_loss_mlp": 0.01051245, - "balance_loss_clip": 1.06034219, - "balance_loss_mlp": 1.02692604, - "epoch": 0.06914174056816474, - "flos": 27563666313600.0, - "grad_norm": 1.9440351937259064, - "language_loss": 0.8374452, - "learning_rate": 3.983950933985064e-06, - "loss": 0.86003637, - "num_input_tokens_seen": 24578240, - "step": 1150, - "time_per_iteration": 2.6919586658477783 - }, - { - "auxiliary_loss_clip": 0.01181406, - "auxiliary_loss_mlp": 0.01058015, - "balance_loss_clip": 1.06063652, - "balance_loss_mlp": 1.03380394, - "epoch": 0.06920186382083271, - "flos": 15304015249920.0, - "grad_norm": 4.11905785776886, - "language_loss": 0.81464434, - "learning_rate": 3.983901656532052e-06, - "loss": 0.83703858, - "num_input_tokens_seen": 24593585, - "step": 1151, - "time_per_iteration": 2.7979934215545654 - }, - { - "auxiliary_loss_clip": 0.01206831, - "auxiliary_loss_mlp": 0.01058184, - "balance_loss_clip": 1.06409955, - "balance_loss_mlp": 1.03434169, - "epoch": 0.06926198707350067, - "flos": 25191227894400.0, - "grad_norm": 2.0324362571668724, - "language_loss": 0.85408235, - "learning_rate": 3.983852303849291e-06, - "loss": 0.87673247, - "num_input_tokens_seen": 24613110, - "step": 1152, - "time_per_iteration": 2.686021089553833 - }, - { - "auxiliary_loss_clip": 0.01190935, - "auxiliary_loss_mlp": 0.01062076, - "balance_loss_clip": 1.06250155, - "balance_loss_mlp": 1.03866374, - "epoch": 0.06932211032616864, - "flos": 13256137146240.0, - "grad_norm": 2.182544196511779, - "language_loss": 0.90594423, - "learning_rate": 3.983802875938651e-06, - "loss": 0.92847437, - "num_input_tokens_seen": 24628795, - "step": 1153, - "time_per_iteration": 2.58366060256958 - }, - { - "auxiliary_loss_clip": 0.01169877, - "auxiliary_loss_mlp": 0.01055253, - "balance_loss_clip": 1.05681062, - "balance_loss_mlp": 1.03088629, - "epoch": 0.06938223357883662, - "flos": 24827381078400.0, - "grad_norm": 2.1214794624630846, - "language_loss": 0.81526846, - "learning_rate": 3.983753372802008e-06, - "loss": 0.83751976, - "num_input_tokens_seen": 24645480, - "step": 1154, - "time_per_iteration": 2.696794271469116 - }, - { - "auxiliary_loss_clip": 0.01188774, - "auxiliary_loss_mlp": 0.01066335, - "balance_loss_clip": 1.0691216, - "balance_loss_mlp": 1.04200506, - "epoch": 0.06944235683150458, - "flos": 27267977554560.0, - "grad_norm": 2.102018399986892, - "language_loss": 0.75022292, - "learning_rate": 3.983703794441237e-06, - "loss": 0.77277398, - "num_input_tokens_seen": 24664630, - "step": 1155, - "time_per_iteration": 2.7718143463134766 - }, - { - "auxiliary_loss_clip": 0.01180696, - "auxiliary_loss_mlp": 0.00782152, - "balance_loss_clip": 1.05586052, - "balance_loss_mlp": 1.00041056, - "epoch": 0.06950248008417255, - "flos": 25808065176960.0, - "grad_norm": 1.7459449483933205, - "language_loss": 0.7110405, - "learning_rate": 3.98365414085822e-06, - "loss": 0.73066902, - "num_input_tokens_seen": 24684210, - "step": 1156, - "time_per_iteration": 2.7014200687408447 - }, - { - "auxiliary_loss_clip": 0.01179101, - "auxiliary_loss_mlp": 0.00782674, - "balance_loss_clip": 1.0593586, - "balance_loss_mlp": 1.00037348, - "epoch": 0.06956260333684053, - "flos": 22271546793600.0, - "grad_norm": 2.067241397655847, - "language_loss": 0.74882817, - "learning_rate": 3.98360441205484e-06, - "loss": 0.76844591, - "num_input_tokens_seen": 24702490, - "step": 1157, - "time_per_iteration": 2.7571897506713867 - }, - { - "auxiliary_loss_clip": 0.01178249, - "auxiliary_loss_mlp": 0.01061737, - "balance_loss_clip": 1.05653787, - "balance_loss_mlp": 1.03697729, - "epoch": 0.0696227265895085, - "flos": 29681390413440.0, - "grad_norm": 1.9827644507913538, - "language_loss": 0.7165724, - "learning_rate": 3.983554608032982e-06, - "loss": 0.73897225, - "num_input_tokens_seen": 24724340, - "step": 1158, - "time_per_iteration": 2.839745044708252 - }, - { - "auxiliary_loss_clip": 0.01207855, - "auxiliary_loss_mlp": 0.01058558, - "balance_loss_clip": 1.0605582, - "balance_loss_mlp": 1.03370285, - "epoch": 0.06968284984217646, - "flos": 25523545547520.0, - "grad_norm": 1.9692207215605615, - "language_loss": 0.79595017, - "learning_rate": 3.983504728794533e-06, - "loss": 0.8186143, - "num_input_tokens_seen": 24745550, - "step": 1159, - "time_per_iteration": 2.7535817623138428 - }, - { - "auxiliary_loss_clip": 0.01212717, - "auxiliary_loss_mlp": 0.01068535, - "balance_loss_clip": 1.06535673, - "balance_loss_mlp": 1.04094958, - "epoch": 0.06974297309484444, - "flos": 20698192287360.0, - "grad_norm": 3.5530789367722373, - "language_loss": 0.80517769, - "learning_rate": 3.983454774341387e-06, - "loss": 0.82799017, - "num_input_tokens_seen": 24762575, - "step": 1160, - "time_per_iteration": 2.7455785274505615 - }, - { - "auxiliary_loss_clip": 0.0119075, - "auxiliary_loss_mlp": 0.01057887, - "balance_loss_clip": 1.05680609, - "balance_loss_mlp": 1.03294837, - "epoch": 0.0698030963475124, - "flos": 26505199313280.0, - "grad_norm": 1.6303409062485206, - "language_loss": 0.7607069, - "learning_rate": 3.983404744675437e-06, - "loss": 0.78319323, - "num_input_tokens_seen": 24782605, - "step": 1161, - "time_per_iteration": 2.773775100708008 - }, - { - "auxiliary_loss_clip": 0.01175787, - "auxiliary_loss_mlp": 0.01062083, - "balance_loss_clip": 1.05773759, - "balance_loss_mlp": 1.03673923, - "epoch": 0.06986321960018037, - "flos": 23040430346880.0, - "grad_norm": 1.6605796421434038, - "language_loss": 0.82758528, - "learning_rate": 3.9833546397985794e-06, - "loss": 0.84996402, - "num_input_tokens_seen": 24802910, - "step": 1162, - "time_per_iteration": 2.7426044940948486 - }, - { - "auxiliary_loss_clip": 0.01182513, - "auxiliary_loss_mlp": 0.01058124, - "balance_loss_clip": 1.05717576, - "balance_loss_mlp": 1.03092098, - "epoch": 0.06992334285284833, - "flos": 28584822061440.0, - "grad_norm": 1.9523155091610094, - "language_loss": 0.79563475, - "learning_rate": 3.983304459712716e-06, - "loss": 0.81804121, - "num_input_tokens_seen": 24823305, - "step": 1163, - "time_per_iteration": 2.720947742462158 - }, - { - "auxiliary_loss_clip": 0.01190519, - "auxiliary_loss_mlp": 0.01063375, - "balance_loss_clip": 1.05861616, - "balance_loss_mlp": 1.03722012, - "epoch": 0.06998346610551631, - "flos": 20595344670720.0, - "grad_norm": 2.213365660843382, - "language_loss": 0.79187214, - "learning_rate": 3.983254204419749e-06, - "loss": 0.81441104, - "num_input_tokens_seen": 24842155, - "step": 1164, - "time_per_iteration": 2.6554183959960938 - }, - { - "auxiliary_loss_clip": 0.01143916, - "auxiliary_loss_mlp": 0.01067459, - "balance_loss_clip": 1.05240798, - "balance_loss_mlp": 1.03875315, - "epoch": 0.07004358935818428, - "flos": 22528810978560.0, - "grad_norm": 1.421930435008642, - "language_loss": 0.72855628, - "learning_rate": 3.983203873921583e-06, - "loss": 0.75067008, - "num_input_tokens_seen": 24862080, - "step": 1165, - "time_per_iteration": 2.753063440322876 - }, - { - "auxiliary_loss_clip": 0.01183824, - "auxiliary_loss_mlp": 0.01059612, - "balance_loss_clip": 1.06135893, - "balance_loss_mlp": 1.03522193, - "epoch": 0.07010371261085224, - "flos": 28949997680640.0, - "grad_norm": 2.453348821242437, - "language_loss": 0.81136239, - "learning_rate": 3.983153468220128e-06, - "loss": 0.83379674, - "num_input_tokens_seen": 24886165, - "step": 1166, - "time_per_iteration": 2.802016496658325 - }, - { - "auxiliary_loss_clip": 0.011718, - "auxiliary_loss_mlp": 0.01053529, - "balance_loss_clip": 1.05450797, - "balance_loss_mlp": 1.02754176, - "epoch": 0.07016383586352022, - "flos": 23659171050240.0, - "grad_norm": 2.457667377154448, - "language_loss": 0.84640259, - "learning_rate": 3.983102987317295e-06, - "loss": 0.86865586, - "num_input_tokens_seen": 24905775, - "step": 1167, - "time_per_iteration": 2.7066097259521484 - }, - { - "auxiliary_loss_clip": 0.01193446, - "auxiliary_loss_mlp": 0.01064209, - "balance_loss_clip": 1.06136739, - "balance_loss_mlp": 1.03887713, - "epoch": 0.07022395911618819, - "flos": 19792130693760.0, - "grad_norm": 2.6158204436543, - "language_loss": 0.89524722, - "learning_rate": 3.983052431214997e-06, - "loss": 0.91782373, - "num_input_tokens_seen": 24924295, - "step": 1168, - "time_per_iteration": 2.6258392333984375 - }, - { - "auxiliary_loss_clip": 0.01190821, - "auxiliary_loss_mlp": 0.01065905, - "balance_loss_clip": 1.06090224, - "balance_loss_mlp": 1.03705645, - "epoch": 0.07028408236885615, - "flos": 21689147675520.0, - "grad_norm": 2.6445150319591035, - "language_loss": 0.89008862, - "learning_rate": 3.983001799915153e-06, - "loss": 0.91265589, - "num_input_tokens_seen": 24943210, - "step": 1169, - "time_per_iteration": 2.6858527660369873 - }, - { - "auxiliary_loss_clip": 0.01211063, - "auxiliary_loss_mlp": 0.01065533, - "balance_loss_clip": 1.06400895, - "balance_loss_mlp": 1.03950977, - "epoch": 0.07034420562152413, - "flos": 25630271832960.0, - "grad_norm": 1.9672897290124218, - "language_loss": 0.83834457, - "learning_rate": 3.982951093419681e-06, - "loss": 0.86111057, - "num_input_tokens_seen": 24960360, - "step": 1170, - "time_per_iteration": 2.6278069019317627 - }, - { - "auxiliary_loss_clip": 0.01180333, - "auxiliary_loss_mlp": 0.00782328, - "balance_loss_clip": 1.0613637, - "balance_loss_mlp": 1.00041986, - "epoch": 0.0704043288741921, - "flos": 20810449267200.0, - "grad_norm": 1.8542795171503503, - "language_loss": 0.75687242, - "learning_rate": 3.982900311730506e-06, - "loss": 0.77649903, - "num_input_tokens_seen": 24978290, - "step": 1171, - "time_per_iteration": 5.806530475616455 - }, - { - "auxiliary_loss_clip": 0.01179645, - "auxiliary_loss_mlp": 0.0106394, - "balance_loss_clip": 1.06133175, - "balance_loss_mlp": 1.03919196, - "epoch": 0.07046445212686006, - "flos": 25593176062080.0, - "grad_norm": 2.482864122539831, - "language_loss": 0.88865125, - "learning_rate": 3.9828494548495514e-06, - "loss": 0.91108704, - "num_input_tokens_seen": 24997055, - "step": 1172, - "time_per_iteration": 4.371561288833618 - }, - { - "auxiliary_loss_clip": 0.01197698, - "auxiliary_loss_mlp": 0.01054991, - "balance_loss_clip": 1.06532764, - "balance_loss_mlp": 1.02858603, - "epoch": 0.07052457537952803, - "flos": 25556978131200.0, - "grad_norm": 1.6816354314161714, - "language_loss": 0.82075119, - "learning_rate": 3.982798522778748e-06, - "loss": 0.84327805, - "num_input_tokens_seen": 25017490, - "step": 1173, - "time_per_iteration": 4.611542463302612 - }, - { - "auxiliary_loss_clip": 0.01200886, - "auxiliary_loss_mlp": 0.01060851, - "balance_loss_clip": 1.06317592, - "balance_loss_mlp": 1.03503036, - "epoch": 0.070584698632196, - "flos": 17968515154560.0, - "grad_norm": 2.007232853627583, - "language_loss": 0.82071686, - "learning_rate": 3.9827475155200245e-06, - "loss": 0.8433342, - "num_input_tokens_seen": 25035660, - "step": 1174, - "time_per_iteration": 2.6334969997406006 - }, - { - "auxiliary_loss_clip": 0.01180907, - "auxiliary_loss_mlp": 0.01059972, - "balance_loss_clip": 1.05857778, - "balance_loss_mlp": 1.03473568, - "epoch": 0.07064482188486397, - "flos": 25370888745600.0, - "grad_norm": 2.09222115072597, - "language_loss": 0.85013211, - "learning_rate": 3.982696433075317e-06, - "loss": 0.87254095, - "num_input_tokens_seen": 25054785, - "step": 1175, - "time_per_iteration": 2.861591339111328 - }, - { - "auxiliary_loss_clip": 0.01196955, - "auxiliary_loss_mlp": 0.01069941, - "balance_loss_clip": 1.06447482, - "balance_loss_mlp": 1.04605186, - "epoch": 0.07070494513753194, - "flos": 24899848767360.0, - "grad_norm": 1.7270820646539309, - "language_loss": 0.83103871, - "learning_rate": 3.982645275446563e-06, - "loss": 0.85370767, - "num_input_tokens_seen": 25075180, - "step": 1176, - "time_per_iteration": 2.754521608352661 - }, - { - "auxiliary_loss_clip": 0.01152261, - "auxiliary_loss_mlp": 0.01062154, - "balance_loss_clip": 1.05370057, - "balance_loss_mlp": 1.0352838, - "epoch": 0.07076506839019991, - "flos": 22338447874560.0, - "grad_norm": 3.4939498355716996, - "language_loss": 0.74409902, - "learning_rate": 3.982594042635701e-06, - "loss": 0.7662431, - "num_input_tokens_seen": 25093035, - "step": 1177, - "time_per_iteration": 2.692426919937134 - }, - { - "auxiliary_loss_clip": 0.01188551, - "auxiliary_loss_mlp": 0.0106394, - "balance_loss_clip": 1.06080353, - "balance_loss_mlp": 1.03801203, - "epoch": 0.07082519164286788, - "flos": 18660800954880.0, - "grad_norm": 1.8240190288677762, - "language_loss": 0.85965598, - "learning_rate": 3.982542734644673e-06, - "loss": 0.88218087, - "num_input_tokens_seen": 25112520, - "step": 1178, - "time_per_iteration": 2.7197048664093018 - }, - { - "auxiliary_loss_clip": 0.01082521, - "auxiliary_loss_mlp": 0.01013999, - "balance_loss_clip": 1.03661168, - "balance_loss_mlp": 1.01023197, - "epoch": 0.07088531489553584, - "flos": 63654107610240.0, - "grad_norm": 0.8453670789764802, - "language_loss": 0.63256603, - "learning_rate": 3.982491351475427e-06, - "loss": 0.65353125, - "num_input_tokens_seen": 25177760, - "step": 1179, - "time_per_iteration": 3.3419978618621826 - }, - { - "auxiliary_loss_clip": 0.01211274, - "auxiliary_loss_mlp": 0.01073372, - "balance_loss_clip": 1.06935215, - "balance_loss_mlp": 1.04858887, - "epoch": 0.07094543814820382, - "flos": 21572688804480.0, - "grad_norm": 3.2714198066984177, - "language_loss": 0.83388901, - "learning_rate": 3.98243989312991e-06, - "loss": 0.85673553, - "num_input_tokens_seen": 25195260, - "step": 1180, - "time_per_iteration": 2.631992816925049 - }, - { - "auxiliary_loss_clip": 0.01182661, - "auxiliary_loss_mlp": 0.01071326, - "balance_loss_clip": 1.06119037, - "balance_loss_mlp": 1.04624391, - "epoch": 0.07100556140087179, - "flos": 22089946608000.0, - "grad_norm": 2.0409456536886386, - "language_loss": 0.88649988, - "learning_rate": 3.982388359610074e-06, - "loss": 0.90903974, - "num_input_tokens_seen": 25212740, - "step": 1181, - "time_per_iteration": 2.696789264678955 - }, - { - "auxiliary_loss_clip": 0.01180377, - "auxiliary_loss_mlp": 0.01070036, - "balance_loss_clip": 1.06187141, - "balance_loss_mlp": 1.04516935, - "epoch": 0.07106568465353975, - "flos": 47922286400640.0, - "grad_norm": 1.8294049229574356, - "language_loss": 0.83244783, - "learning_rate": 3.9823367509178725e-06, - "loss": 0.85495198, - "num_input_tokens_seen": 25236420, - "step": 1182, - "time_per_iteration": 2.9415605068206787 - }, - { - "auxiliary_loss_clip": 0.01193669, - "auxiliary_loss_mlp": 0.01067019, - "balance_loss_clip": 1.0641923, - "balance_loss_mlp": 1.04150808, - "epoch": 0.07112580790620772, - "flos": 23440798316160.0, - "grad_norm": 3.5892595189310903, - "language_loss": 0.79067838, - "learning_rate": 3.982285067055262e-06, - "loss": 0.81328523, - "num_input_tokens_seen": 25255120, - "step": 1183, - "time_per_iteration": 2.7284862995147705 - }, - { - "auxiliary_loss_clip": 0.01211976, - "auxiliary_loss_mlp": 0.01064792, - "balance_loss_clip": 1.06126475, - "balance_loss_mlp": 1.03866172, - "epoch": 0.0711859311588757, - "flos": 31868888682240.0, - "grad_norm": 2.5463322111759354, - "language_loss": 0.788867, - "learning_rate": 3.982233308024204e-06, - "loss": 0.81163466, - "num_input_tokens_seen": 25275150, - "step": 1184, - "time_per_iteration": 2.7531635761260986 - }, - { - "auxiliary_loss_clip": 0.01152059, - "auxiliary_loss_mlp": 0.01062006, - "balance_loss_clip": 1.05961919, - "balance_loss_mlp": 1.03752065, - "epoch": 0.07124605441154366, - "flos": 19610315026560.0, - "grad_norm": 1.904751850318294, - "language_loss": 0.76806915, - "learning_rate": 3.98218147382666e-06, - "loss": 0.79020983, - "num_input_tokens_seen": 25293680, - "step": 1185, - "time_per_iteration": 2.732539176940918 - }, - { - "auxiliary_loss_clip": 0.01208288, - "auxiliary_loss_mlp": 0.01073792, - "balance_loss_clip": 1.06328642, - "balance_loss_mlp": 1.04903185, - "epoch": 0.07130617766421163, - "flos": 14684448533760.0, - "grad_norm": 2.1301142092644696, - "language_loss": 0.65472758, - "learning_rate": 3.982129564464596e-06, - "loss": 0.67754835, - "num_input_tokens_seen": 25310050, - "step": 1186, - "time_per_iteration": 2.757812261581421 - }, - { - "auxiliary_loss_clip": 0.01195497, - "auxiliary_loss_mlp": 0.01057322, - "balance_loss_clip": 1.06479859, - "balance_loss_mlp": 1.03274107, - "epoch": 0.07136630091687961, - "flos": 26067915141120.0, - "grad_norm": 2.1671481434625894, - "language_loss": 0.69743419, - "learning_rate": 3.98207757993998e-06, - "loss": 0.71996236, - "num_input_tokens_seen": 25331020, - "step": 1187, - "time_per_iteration": 2.746615409851074 - }, - { - "auxiliary_loss_clip": 0.01151827, - "auxiliary_loss_mlp": 0.01067347, - "balance_loss_clip": 1.05412316, - "balance_loss_mlp": 1.04367232, - "epoch": 0.07142642416954757, - "flos": 15669190869120.0, - "grad_norm": 2.8037131445876597, - "language_loss": 0.7861973, - "learning_rate": 3.9820255202547845e-06, - "loss": 0.80838895, - "num_input_tokens_seen": 25347875, - "step": 1188, - "time_per_iteration": 2.738281726837158 - }, - { - "auxiliary_loss_clip": 0.01203626, - "auxiliary_loss_mlp": 0.01059966, - "balance_loss_clip": 1.06304908, - "balance_loss_mlp": 1.03530121, - "epoch": 0.07148654742221554, - "flos": 19755322231680.0, - "grad_norm": 1.8909260147246576, - "language_loss": 0.84754103, - "learning_rate": 3.981973385410981e-06, - "loss": 0.87017697, - "num_input_tokens_seen": 25366715, - "step": 1189, - "time_per_iteration": 2.5770246982574463 - }, - { - "auxiliary_loss_clip": 0.01173135, - "auxiliary_loss_mlp": 0.0078213, - "balance_loss_clip": 1.06234396, - "balance_loss_mlp": 1.00041807, - "epoch": 0.07154667067488352, - "flos": 23471824688640.0, - "grad_norm": 5.212083930118342, - "language_loss": 0.76932275, - "learning_rate": 3.9819211754105494e-06, - "loss": 0.78887534, - "num_input_tokens_seen": 25385450, - "step": 1190, - "time_per_iteration": 2.7057712078094482 - }, - { - "auxiliary_loss_clip": 0.01208346, - "auxiliary_loss_mlp": 0.01074705, - "balance_loss_clip": 1.06283545, - "balance_loss_mlp": 1.04751348, - "epoch": 0.07160679392755148, - "flos": 18332936588160.0, - "grad_norm": 2.5312098602102084, - "language_loss": 0.75201792, - "learning_rate": 3.981868890255468e-06, - "loss": 0.7748484, - "num_input_tokens_seen": 25403940, - "step": 1191, - "time_per_iteration": 2.6071674823760986 - }, - { - "auxiliary_loss_clip": 0.01162268, - "auxiliary_loss_mlp": 0.01063437, - "balance_loss_clip": 1.0519917, - "balance_loss_mlp": 1.03649545, - "epoch": 0.07166691718021945, - "flos": 17747017937280.0, - "grad_norm": 2.470839013019174, - "language_loss": 0.74334443, - "learning_rate": 3.981816529947719e-06, - "loss": 0.76560152, - "num_input_tokens_seen": 25420410, - "step": 1192, - "time_per_iteration": 2.661078453063965 - }, - { - "auxiliary_loss_clip": 0.01202036, - "auxiliary_loss_mlp": 0.01054727, - "balance_loss_clip": 1.05904579, - "balance_loss_mlp": 1.03099298, - "epoch": 0.07172704043288743, - "flos": 22451925916800.0, - "grad_norm": 2.443309122344248, - "language_loss": 0.78010541, - "learning_rate": 3.9817640944892896e-06, - "loss": 0.8026731, - "num_input_tokens_seen": 25439415, - "step": 1193, - "time_per_iteration": 2.5603158473968506 - }, - { - "auxiliary_loss_clip": 0.01186747, - "auxiliary_loss_mlp": 0.01059465, - "balance_loss_clip": 1.06358278, - "balance_loss_mlp": 1.03319085, - "epoch": 0.07178716368555539, - "flos": 23222210100480.0, - "grad_norm": 2.1011663585924585, - "language_loss": 0.85497916, - "learning_rate": 3.981711583882166e-06, - "loss": 0.87744129, - "num_input_tokens_seen": 25458715, - "step": 1194, - "time_per_iteration": 2.6819851398468018 - }, - { - "auxiliary_loss_clip": 0.01184191, - "auxiliary_loss_mlp": 0.01067737, - "balance_loss_clip": 1.05706751, - "balance_loss_mlp": 1.04135609, - "epoch": 0.07184728693822336, - "flos": 25150828072320.0, - "grad_norm": 2.0205668140023185, - "language_loss": 0.8183766, - "learning_rate": 3.981658998128341e-06, - "loss": 0.84089589, - "num_input_tokens_seen": 25477985, - "step": 1195, - "time_per_iteration": 2.6646647453308105 - }, - { - "auxiliary_loss_clip": 0.01165951, - "auxiliary_loss_mlp": 0.01063438, - "balance_loss_clip": 1.0578239, - "balance_loss_mlp": 1.03976321, - "epoch": 0.07190741019089132, - "flos": 22711237176960.0, - "grad_norm": 2.161995064372768, - "language_loss": 0.80093575, - "learning_rate": 3.981606337229808e-06, - "loss": 0.82322967, - "num_input_tokens_seen": 25497110, - "step": 1196, - "time_per_iteration": 2.7217979431152344 - }, - { - "auxiliary_loss_clip": 0.01176131, - "auxiliary_loss_mlp": 0.00784114, - "balance_loss_clip": 1.06106043, - "balance_loss_mlp": 1.00034249, - "epoch": 0.0719675334435593, - "flos": 29349791032320.0, - "grad_norm": 2.5905261146074263, - "language_loss": 0.71339291, - "learning_rate": 3.9815536011885655e-06, - "loss": 0.73299539, - "num_input_tokens_seen": 25516555, - "step": 1197, - "time_per_iteration": 2.7931766510009766 - }, - { - "auxiliary_loss_clip": 0.01157444, - "auxiliary_loss_mlp": 0.01055247, - "balance_loss_clip": 1.06130266, - "balance_loss_mlp": 1.03074968, - "epoch": 0.07202765669622727, - "flos": 17639788861440.0, - "grad_norm": 3.074283933156949, - "language_loss": 0.85951984, - "learning_rate": 3.98150079000661e-06, - "loss": 0.88164675, - "num_input_tokens_seen": 25533895, - "step": 1198, - "time_per_iteration": 2.7241532802581787 - }, - { - "auxiliary_loss_clip": 0.01160083, - "auxiliary_loss_mlp": 0.0106501, - "balance_loss_clip": 1.0597434, - "balance_loss_mlp": 1.03944004, - "epoch": 0.07208777994889523, - "flos": 21434038306560.0, - "grad_norm": 2.052617638295489, - "language_loss": 0.83840948, - "learning_rate": 3.981447903685947e-06, - "loss": 0.86066043, - "num_input_tokens_seen": 25554195, - "step": 1199, - "time_per_iteration": 2.71362566947937 - }, - { - "auxiliary_loss_clip": 0.01212755, - "auxiliary_loss_mlp": 0.01060557, - "balance_loss_clip": 1.06877887, - "balance_loss_mlp": 1.03709614, - "epoch": 0.07214790320156321, - "flos": 26940867373440.0, - "grad_norm": 3.1601590133124837, - "language_loss": 0.7623595, - "learning_rate": 3.981394942228581e-06, - "loss": 0.78509259, - "num_input_tokens_seen": 25574155, - "step": 1200, - "time_per_iteration": 2.6913061141967773 - }, - { - "auxiliary_loss_clip": 0.0119008, - "auxiliary_loss_mlp": 0.010701, - "balance_loss_clip": 1.06442261, - "balance_loss_mlp": 1.04487491, - "epoch": 0.07220802645423118, - "flos": 23879949995520.0, - "grad_norm": 2.2017873087036226, - "language_loss": 0.83013475, - "learning_rate": 3.98134190563652e-06, - "loss": 0.85273659, - "num_input_tokens_seen": 25592735, - "step": 1201, - "time_per_iteration": 2.6983115673065186 - }, - { - "auxiliary_loss_clip": 0.01196941, - "auxiliary_loss_mlp": 0.01065672, - "balance_loss_clip": 1.06197119, - "balance_loss_mlp": 1.03952968, - "epoch": 0.07226814970689914, - "flos": 19243631036160.0, - "grad_norm": 20.835065187143087, - "language_loss": 0.68601412, - "learning_rate": 3.981288793911775e-06, - "loss": 0.70864022, - "num_input_tokens_seen": 25611510, - "step": 1202, - "time_per_iteration": 2.691742420196533 - }, - { - "auxiliary_loss_clip": 0.01182684, - "auxiliary_loss_mlp": 0.00782201, - "balance_loss_clip": 1.06256962, - "balance_loss_mlp": 1.00038218, - "epoch": 0.07232827295956712, - "flos": 19172025273600.0, - "grad_norm": 1.9661831136137597, - "language_loss": 0.87487721, - "learning_rate": 3.98123560705636e-06, - "loss": 0.89452606, - "num_input_tokens_seen": 25629560, - "step": 1203, - "time_per_iteration": 2.7832019329071045 - }, - { - "auxiliary_loss_clip": 0.01154778, - "auxiliary_loss_mlp": 0.01065748, - "balance_loss_clip": 1.05210066, - "balance_loss_mlp": 1.04065442, - "epoch": 0.07238839621223508, - "flos": 17639752947840.0, - "grad_norm": 1.731721557525142, - "language_loss": 0.78053147, - "learning_rate": 3.981182345072293e-06, - "loss": 0.80273676, - "num_input_tokens_seen": 25648330, - "step": 1204, - "time_per_iteration": 2.7754547595977783 - }, - { - "auxiliary_loss_clip": 0.01191832, - "auxiliary_loss_mlp": 0.01065794, - "balance_loss_clip": 1.06211591, - "balance_loss_mlp": 1.04084373, - "epoch": 0.07244851946490305, - "flos": 28292401440000.0, - "grad_norm": 1.5043252978087258, - "language_loss": 0.82094097, - "learning_rate": 3.981129007961593e-06, - "loss": 0.84351724, - "num_input_tokens_seen": 25669470, - "step": 1205, - "time_per_iteration": 2.680457353591919 - }, - { - "auxiliary_loss_clip": 0.01180244, - "auxiliary_loss_mlp": 0.00782807, - "balance_loss_clip": 1.06221068, - "balance_loss_mlp": 1.00036049, - "epoch": 0.07250864271757101, - "flos": 22564829341440.0, - "grad_norm": 1.6438962430217685, - "language_loss": 0.76715982, - "learning_rate": 3.981075595726283e-06, - "loss": 0.78679025, - "num_input_tokens_seen": 25690470, - "step": 1206, - "time_per_iteration": 2.7028439044952393 - }, - { - "auxiliary_loss_clip": 0.01188223, - "auxiliary_loss_mlp": 0.01059861, - "balance_loss_clip": 1.06262684, - "balance_loss_mlp": 1.03442228, - "epoch": 0.072568765970239, - "flos": 21762405463680.0, - "grad_norm": 1.9378198243304647, - "language_loss": 0.77272987, - "learning_rate": 3.981022108368387e-06, - "loss": 0.79521072, - "num_input_tokens_seen": 25709205, - "step": 1207, - "time_per_iteration": 2.779289960861206 - }, - { - "auxiliary_loss_clip": 0.01185538, - "auxiliary_loss_mlp": 0.01053693, - "balance_loss_clip": 1.05844951, - "balance_loss_mlp": 1.03062558, - "epoch": 0.07262888922290696, - "flos": 25519702792320.0, - "grad_norm": 1.8716528383816402, - "language_loss": 0.79480875, - "learning_rate": 3.9809685458899345e-06, - "loss": 0.81720108, - "num_input_tokens_seen": 25728485, - "step": 1208, - "time_per_iteration": 2.682965040206909 - }, - { - "auxiliary_loss_clip": 0.01184899, - "auxiliary_loss_mlp": 0.01054862, - "balance_loss_clip": 1.05801737, - "balance_loss_mlp": 1.03198612, - "epoch": 0.07268901247557492, - "flos": 21246548290560.0, - "grad_norm": 2.5612886109689765, - "language_loss": 0.78537548, - "learning_rate": 3.980914908292955e-06, - "loss": 0.80777311, - "num_input_tokens_seen": 25747730, - "step": 1209, - "time_per_iteration": 2.6582658290863037 - }, - { - "auxiliary_loss_clip": 0.01191905, - "auxiliary_loss_mlp": 0.01067741, - "balance_loss_clip": 1.05931175, - "balance_loss_mlp": 1.04408956, - "epoch": 0.0727491357282429, - "flos": 25479302970240.0, - "grad_norm": 2.351303434522043, - "language_loss": 0.80920583, - "learning_rate": 3.980861195579486e-06, - "loss": 0.83180225, - "num_input_tokens_seen": 25768050, - "step": 1210, - "time_per_iteration": 4.241993427276611 - }, - { - "auxiliary_loss_clip": 0.0117493, - "auxiliary_loss_mlp": 0.01063711, - "balance_loss_clip": 1.06087565, - "balance_loss_mlp": 1.03891551, - "epoch": 0.07280925898091087, - "flos": 24462169545600.0, - "grad_norm": 1.875347829314158, - "language_loss": 0.84302205, - "learning_rate": 3.98080740775156e-06, - "loss": 0.86540848, - "num_input_tokens_seen": 25787985, - "step": 1211, - "time_per_iteration": 4.289919853210449 - }, - { - "auxiliary_loss_clip": 0.01162055, - "auxiliary_loss_mlp": 0.01060218, - "balance_loss_clip": 1.05356658, - "balance_loss_mlp": 1.03629231, - "epoch": 0.07286938223357883, - "flos": 18288191220480.0, - "grad_norm": 2.991110515222773, - "language_loss": 0.90684664, - "learning_rate": 3.98075354481122e-06, - "loss": 0.92906934, - "num_input_tokens_seen": 25803620, - "step": 1212, - "time_per_iteration": 2.660780906677246 - }, - { - "auxiliary_loss_clip": 0.01202443, - "auxiliary_loss_mlp": 0.01058817, - "balance_loss_clip": 1.0623759, - "balance_loss_mlp": 1.03490353, - "epoch": 0.07292950548624681, - "flos": 21214803646080.0, - "grad_norm": 1.7918815842724805, - "language_loss": 0.72358596, - "learning_rate": 3.9806996067605055e-06, - "loss": 0.74619853, - "num_input_tokens_seen": 25823315, - "step": 1213, - "time_per_iteration": 4.303524017333984 - }, - { - "auxiliary_loss_clip": 0.01153662, - "auxiliary_loss_mlp": 0.01055706, - "balance_loss_clip": 1.05658662, - "balance_loss_mlp": 1.03089869, - "epoch": 0.07298962873891478, - "flos": 24642009964800.0, - "grad_norm": 1.8655932637344164, - "language_loss": 0.84356117, - "learning_rate": 3.980645593601465e-06, - "loss": 0.86565483, - "num_input_tokens_seen": 25842605, - "step": 1214, - "time_per_iteration": 2.7505569458007812 - }, - { - "auxiliary_loss_clip": 0.01208881, - "auxiliary_loss_mlp": 0.01062075, - "balance_loss_clip": 1.06484771, - "balance_loss_mlp": 1.03723145, - "epoch": 0.07304975199158274, - "flos": 27052765217280.0, - "grad_norm": 2.025651344907852, - "language_loss": 0.84113681, - "learning_rate": 3.980591505336144e-06, - "loss": 0.86384636, - "num_input_tokens_seen": 25863030, - "step": 1215, - "time_per_iteration": 2.7235965728759766 - }, - { - "auxiliary_loss_clip": 0.01149957, - "auxiliary_loss_mlp": 0.01062992, - "balance_loss_clip": 1.05138278, - "balance_loss_mlp": 1.03744531, - "epoch": 0.07310987524425071, - "flos": 33549544091520.0, - "grad_norm": 1.9312816725096997, - "language_loss": 0.80926049, - "learning_rate": 3.980537341966595e-06, - "loss": 0.83139002, - "num_input_tokens_seen": 25888015, - "step": 1216, - "time_per_iteration": 2.9129130840301514 - }, - { - "auxiliary_loss_clip": 0.01167944, - "auxiliary_loss_mlp": 0.01060276, - "balance_loss_clip": 1.05619049, - "balance_loss_mlp": 1.03680408, - "epoch": 0.07316999849691869, - "flos": 28110944908800.0, - "grad_norm": 3.2846247291101975, - "language_loss": 0.75949144, - "learning_rate": 3.980483103494872e-06, - "loss": 0.78177369, - "num_input_tokens_seen": 25908660, - "step": 1217, - "time_per_iteration": 2.7106521129608154 - }, - { - "auxiliary_loss_clip": 0.01169026, - "auxiliary_loss_mlp": 0.01056631, - "balance_loss_clip": 1.06182647, - "balance_loss_mlp": 1.03477991, - "epoch": 0.07323012174958665, - "flos": 14392602529920.0, - "grad_norm": 1.9658490798069863, - "language_loss": 0.86455309, - "learning_rate": 3.98042878992303e-06, - "loss": 0.88680959, - "num_input_tokens_seen": 25927215, - "step": 1218, - "time_per_iteration": 2.5911786556243896 - }, - { - "auxiliary_loss_clip": 0.01192266, - "auxiliary_loss_mlp": 0.0106258, - "balance_loss_clip": 1.06015348, - "balance_loss_mlp": 1.03916681, - "epoch": 0.07329024500225462, - "flos": 21616428591360.0, - "grad_norm": 2.2310702082820675, - "language_loss": 0.86782354, - "learning_rate": 3.9803744012531305e-06, - "loss": 0.89037204, - "num_input_tokens_seen": 25945500, - "step": 1219, - "time_per_iteration": 2.608562707901001 - }, - { - "auxiliary_loss_clip": 0.01201545, - "auxiliary_loss_mlp": 0.01058282, - "balance_loss_clip": 1.06024373, - "balance_loss_mlp": 1.03539419, - "epoch": 0.0733503682549226, - "flos": 13224141106560.0, - "grad_norm": 2.095886373367052, - "language_loss": 0.84608674, - "learning_rate": 3.980319937487235e-06, - "loss": 0.86868501, - "num_input_tokens_seen": 25963105, - "step": 1220, - "time_per_iteration": 2.469189405441284 - }, - { - "auxiliary_loss_clip": 0.01158855, - "auxiliary_loss_mlp": 0.01063399, - "balance_loss_clip": 1.05358922, - "balance_loss_mlp": 1.03942597, - "epoch": 0.07341049150759056, - "flos": 20886975192960.0, - "grad_norm": 2.648884311755534, - "language_loss": 0.77114344, - "learning_rate": 3.98026539862741e-06, - "loss": 0.79336596, - "num_input_tokens_seen": 25981690, - "step": 1221, - "time_per_iteration": 2.671762466430664 - }, - { - "auxiliary_loss_clip": 0.01158201, - "auxiliary_loss_mlp": 0.01064916, - "balance_loss_clip": 1.05726743, - "balance_loss_mlp": 1.04082406, - "epoch": 0.07347061476025853, - "flos": 15413614623360.0, - "grad_norm": 2.5357389392469942, - "language_loss": 0.91631913, - "learning_rate": 3.980210784675722e-06, - "loss": 0.93855029, - "num_input_tokens_seen": 25999890, - "step": 1222, - "time_per_iteration": 2.6973063945770264 - }, - { - "auxiliary_loss_clip": 0.01135907, - "auxiliary_loss_mlp": 0.01064872, - "balance_loss_clip": 1.05333126, - "balance_loss_mlp": 1.04169726, - "epoch": 0.0735307380129265, - "flos": 11108859131520.0, - "grad_norm": 2.8024324299253047, - "language_loss": 0.90976465, - "learning_rate": 3.980156095634242e-06, - "loss": 0.93177247, - "num_input_tokens_seen": 26016445, - "step": 1223, - "time_per_iteration": 2.8141093254089355 - }, - { - "auxiliary_loss_clip": 0.01202875, - "auxiliary_loss_mlp": 0.01077185, - "balance_loss_clip": 1.06232905, - "balance_loss_mlp": 1.05341494, - "epoch": 0.07359086126559447, - "flos": 23732392924800.0, - "grad_norm": 1.9348534518871447, - "language_loss": 0.82161939, - "learning_rate": 3.980101331505045e-06, - "loss": 0.84442002, - "num_input_tokens_seen": 26036080, - "step": 1224, - "time_per_iteration": 2.640432119369507 - }, - { - "auxiliary_loss_clip": 0.01200329, - "auxiliary_loss_mlp": 0.01057586, - "balance_loss_clip": 1.05987597, - "balance_loss_mlp": 1.03229022, - "epoch": 0.07365098451826244, - "flos": 20993270515200.0, - "grad_norm": 2.31744406237409, - "language_loss": 0.83194047, - "learning_rate": 3.9800464922902076e-06, - "loss": 0.85451961, - "num_input_tokens_seen": 26055805, - "step": 1225, - "time_per_iteration": 2.6159210205078125 - }, - { - "auxiliary_loss_clip": 0.01170115, - "auxiliary_loss_mlp": 0.01056068, - "balance_loss_clip": 1.05743551, - "balance_loss_mlp": 1.03190422, - "epoch": 0.0737111077709304, - "flos": 19933582452480.0, - "grad_norm": 2.2959030425986544, - "language_loss": 0.90388274, - "learning_rate": 3.979991577991808e-06, - "loss": 0.9261446, - "num_input_tokens_seen": 26073905, - "step": 1226, - "time_per_iteration": 2.6527435779571533 - }, - { - "auxiliary_loss_clip": 0.01207799, - "auxiliary_loss_mlp": 0.0104599, - "balance_loss_clip": 1.05913424, - "balance_loss_mlp": 1.02080154, - "epoch": 0.07377123102359838, - "flos": 16581537342720.0, - "grad_norm": 2.579592162134606, - "language_loss": 0.76626784, - "learning_rate": 3.97993658861193e-06, - "loss": 0.78880572, - "num_input_tokens_seen": 26091700, - "step": 1227, - "time_per_iteration": 2.596151351928711 - }, - { - "auxiliary_loss_clip": 0.0118909, - "auxiliary_loss_mlp": 0.01053386, - "balance_loss_clip": 1.06296694, - "balance_loss_mlp": 1.02954459, - "epoch": 0.07383135427626634, - "flos": 28328563457280.0, - "grad_norm": 7.788838200212175, - "language_loss": 0.8555491, - "learning_rate": 3.9798815241526575e-06, - "loss": 0.87797379, - "num_input_tokens_seen": 26114105, - "step": 1228, - "time_per_iteration": 2.6955716609954834 - }, - { - "auxiliary_loss_clip": 0.01191175, - "auxiliary_loss_mlp": 0.01062669, - "balance_loss_clip": 1.05897212, - "balance_loss_mlp": 1.03860044, - "epoch": 0.07389147752893431, - "flos": 20047168235520.0, - "grad_norm": 2.2575099517148898, - "language_loss": 0.79598552, - "learning_rate": 3.97982638461608e-06, - "loss": 0.818524, - "num_input_tokens_seen": 26131165, - "step": 1229, - "time_per_iteration": 2.6544861793518066 - }, - { - "auxiliary_loss_clip": 0.01192886, - "auxiliary_loss_mlp": 0.00782044, - "balance_loss_clip": 1.05966699, - "balance_loss_mlp": 1.00032902, - "epoch": 0.07395160078160229, - "flos": 18114132890880.0, - "grad_norm": 2.2881874382496377, - "language_loss": 0.78209347, - "learning_rate": 3.979771170004287e-06, - "loss": 0.80184281, - "num_input_tokens_seen": 26150040, - "step": 1230, - "time_per_iteration": 2.6001133918762207 - }, - { - "auxiliary_loss_clip": 0.0120142, - "auxiliary_loss_mlp": 0.01052342, - "balance_loss_clip": 1.06209648, - "balance_loss_mlp": 1.02739108, - "epoch": 0.07401172403427025, - "flos": 23586918842880.0, - "grad_norm": 2.038847041772147, - "language_loss": 0.8136946, - "learning_rate": 3.979715880319372e-06, - "loss": 0.83623219, - "num_input_tokens_seen": 26169380, - "step": 1231, - "time_per_iteration": 2.6364073753356934 - }, - { - "auxiliary_loss_clip": 0.01179975, - "auxiliary_loss_mlp": 0.01070917, - "balance_loss_clip": 1.05690873, - "balance_loss_mlp": 1.04599047, - "epoch": 0.07407184728693822, - "flos": 26359904799360.0, - "grad_norm": 2.096832924731062, - "language_loss": 0.95204866, - "learning_rate": 3.979660515563434e-06, - "loss": 0.97455758, - "num_input_tokens_seen": 26189420, - "step": 1232, - "time_per_iteration": 2.7929203510284424 - }, - { - "auxiliary_loss_clip": 0.01187282, - "auxiliary_loss_mlp": 0.01059661, - "balance_loss_clip": 1.06202245, - "balance_loss_mlp": 1.03733301, - "epoch": 0.0741319705396062, - "flos": 22200443821440.0, - "grad_norm": 1.7778448126368063, - "language_loss": 0.80695188, - "learning_rate": 3.979605075738569e-06, - "loss": 0.82942128, - "num_input_tokens_seen": 26209300, - "step": 1233, - "time_per_iteration": 2.7945051193237305 - }, - { - "auxiliary_loss_clip": 0.01209245, - "auxiliary_loss_mlp": 0.0106207, - "balance_loss_clip": 1.06238747, - "balance_loss_mlp": 1.03602231, - "epoch": 0.07419209379227416, - "flos": 39200482523520.0, - "grad_norm": 2.136728864247421, - "language_loss": 0.70708907, - "learning_rate": 3.979549560846883e-06, - "loss": 0.72980225, - "num_input_tokens_seen": 26228110, - "step": 1234, - "time_per_iteration": 2.9646782875061035 - }, - { - "auxiliary_loss_clip": 0.01167486, - "auxiliary_loss_mlp": 0.01068879, - "balance_loss_clip": 1.0542618, - "balance_loss_mlp": 1.04265285, - "epoch": 0.07425221704494213, - "flos": 22781657790720.0, - "grad_norm": 1.7921102377369336, - "language_loss": 0.76852918, - "learning_rate": 3.979493970890478e-06, - "loss": 0.79089284, - "num_input_tokens_seen": 26247020, - "step": 1235, - "time_per_iteration": 2.820577621459961 - }, - { - "auxiliary_loss_clip": 0.01198028, - "auxiliary_loss_mlp": 0.01055883, - "balance_loss_clip": 1.05918813, - "balance_loss_mlp": 1.0321244, - "epoch": 0.0743123402976101, - "flos": 22272983337600.0, - "grad_norm": 2.3018318065058097, - "language_loss": 0.82748145, - "learning_rate": 3.979438305871464e-06, - "loss": 0.85002053, - "num_input_tokens_seen": 26265750, - "step": 1236, - "time_per_iteration": 2.6302287578582764 - }, - { - "auxiliary_loss_clip": 0.01154783, - "auxiliary_loss_mlp": 0.00782014, - "balance_loss_clip": 1.05519629, - "balance_loss_mlp": 1.00039148, - "epoch": 0.07437246355027807, - "flos": 29315029645440.0, - "grad_norm": 1.7985383717833268, - "language_loss": 0.7595011, - "learning_rate": 3.979382565791951e-06, - "loss": 0.77886909, - "num_input_tokens_seen": 26287905, - "step": 1237, - "time_per_iteration": 2.721931219100952 - }, - { - "auxiliary_loss_clip": 0.01135551, - "auxiliary_loss_mlp": 0.00783311, - "balance_loss_clip": 1.0505693, - "balance_loss_mlp": 1.00031757, - "epoch": 0.07443258680294604, - "flos": 31944732249600.0, - "grad_norm": 1.6915170784810407, - "language_loss": 0.77458763, - "learning_rate": 3.979326750654053e-06, - "loss": 0.79377621, - "num_input_tokens_seen": 26311795, - "step": 1238, - "time_per_iteration": 2.831620931625366 - }, - { - "auxiliary_loss_clip": 0.01177529, - "auxiliary_loss_mlp": 0.01057762, - "balance_loss_clip": 1.05673254, - "balance_loss_mlp": 1.03311002, - "epoch": 0.074492710055614, - "flos": 22675290641280.0, - "grad_norm": 1.9053364150897723, - "language_loss": 0.867737, - "learning_rate": 3.9792708604598854e-06, - "loss": 0.89008987, - "num_input_tokens_seen": 26330330, - "step": 1239, - "time_per_iteration": 2.6697263717651367 - }, - { - "auxiliary_loss_clip": 0.01159844, - "auxiliary_loss_mlp": 0.01050954, - "balance_loss_clip": 1.05222142, - "balance_loss_mlp": 1.02532458, - "epoch": 0.07455283330828198, - "flos": 21284901037440.0, - "grad_norm": 26.978042105238785, - "language_loss": 0.89356089, - "learning_rate": 3.979214895211569e-06, - "loss": 0.91566885, - "num_input_tokens_seen": 26348865, - "step": 1240, - "time_per_iteration": 2.846013069152832 - }, - { - "auxiliary_loss_clip": 0.01174117, - "auxiliary_loss_mlp": 0.01063539, - "balance_loss_clip": 1.05857158, - "balance_loss_mlp": 1.03713393, - "epoch": 0.07461295656094995, - "flos": 24388408967040.0, - "grad_norm": 1.9346624045484253, - "language_loss": 0.88873678, - "learning_rate": 3.979158854911225e-06, - "loss": 0.91111326, - "num_input_tokens_seen": 26368210, - "step": 1241, - "time_per_iteration": 2.6926562786102295 - }, - { - "auxiliary_loss_clip": 0.01079637, - "auxiliary_loss_mlp": 0.01009562, - "balance_loss_clip": 1.03489435, - "balance_loss_mlp": 1.00405502, - "epoch": 0.07467307981361791, - "flos": 62109660574080.0, - "grad_norm": 0.8973011136706247, - "language_loss": 0.63067901, - "learning_rate": 3.979102739560979e-06, - "loss": 0.65157104, - "num_input_tokens_seen": 26424890, - "step": 1242, - "time_per_iteration": 3.298609972000122 - }, - { - "auxiliary_loss_clip": 0.01164269, - "auxiliary_loss_mlp": 0.01068833, - "balance_loss_clip": 1.05246222, - "balance_loss_mlp": 1.03819644, - "epoch": 0.07473320306628589, - "flos": 24863148046080.0, - "grad_norm": 3.87499965477456, - "language_loss": 0.62926078, - "learning_rate": 3.9790465491629595e-06, - "loss": 0.65159178, - "num_input_tokens_seen": 26446405, - "step": 1243, - "time_per_iteration": 2.7774572372436523 - }, - { - "auxiliary_loss_clip": 0.01188864, - "auxiliary_loss_mlp": 0.01059918, - "balance_loss_clip": 1.05716145, - "balance_loss_mlp": 1.03499091, - "epoch": 0.07479332631895386, - "flos": 24897442556160.0, - "grad_norm": 1.6252135866538246, - "language_loss": 0.76259589, - "learning_rate": 3.978990283719296e-06, - "loss": 0.78508377, - "num_input_tokens_seen": 26466070, - "step": 1244, - "time_per_iteration": 2.714459180831909 - }, - { - "auxiliary_loss_clip": 0.01184345, - "auxiliary_loss_mlp": 0.00783076, - "balance_loss_clip": 1.0611167, - "balance_loss_mlp": 1.00038469, - "epoch": 0.07485344957162182, - "flos": 17815247821440.0, - "grad_norm": 5.636002853507256, - "language_loss": 0.69419599, - "learning_rate": 3.978933943232123e-06, - "loss": 0.71387023, - "num_input_tokens_seen": 26479350, - "step": 1245, - "time_per_iteration": 2.640895366668701 - }, - { - "auxiliary_loss_clip": 0.01203955, - "auxiliary_loss_mlp": 0.01062684, - "balance_loss_clip": 1.06098139, - "balance_loss_mlp": 1.0372088, - "epoch": 0.0749135728242898, - "flos": 25010202326400.0, - "grad_norm": 2.5525245798098757, - "language_loss": 0.88635457, - "learning_rate": 3.978877527703576e-06, - "loss": 0.90902102, - "num_input_tokens_seen": 26498255, - "step": 1246, - "time_per_iteration": 2.747765302658081 - }, - { - "auxiliary_loss_clip": 0.01212369, - "auxiliary_loss_mlp": 0.01077452, - "balance_loss_clip": 1.06102896, - "balance_loss_mlp": 1.049402, - "epoch": 0.07497369607695777, - "flos": 17822071405440.0, - "grad_norm": 2.675073323546491, - "language_loss": 0.8825295, - "learning_rate": 3.9788210371357945e-06, - "loss": 0.90542769, - "num_input_tokens_seen": 26515375, - "step": 1247, - "time_per_iteration": 2.6810224056243896 - }, - { - "auxiliary_loss_clip": 0.0118495, - "auxiliary_loss_mlp": 0.01069489, - "balance_loss_clip": 1.06058884, - "balance_loss_mlp": 1.04383492, - "epoch": 0.07503381932962573, - "flos": 15121086261120.0, - "grad_norm": 2.620559853720615, - "language_loss": 0.64849806, - "learning_rate": 3.978764471530921e-06, - "loss": 0.67104244, - "num_input_tokens_seen": 26533595, - "step": 1248, - "time_per_iteration": 2.706862449645996 - }, - { - "auxiliary_loss_clip": 0.01181878, - "auxiliary_loss_mlp": 0.00782677, - "balance_loss_clip": 1.0575974, - "balance_loss_mlp": 1.0004611, - "epoch": 0.0750939425822937, - "flos": 12816734071680.0, - "grad_norm": 2.872208543000993, - "language_loss": 0.74216163, - "learning_rate": 3.978707830891102e-06, - "loss": 0.7618072, - "num_input_tokens_seen": 26549405, - "step": 1249, - "time_per_iteration": 4.309665679931641 - }, - { - "auxiliary_loss_clip": 0.01168375, - "auxiliary_loss_mlp": 0.01079691, - "balance_loss_clip": 1.0579834, - "balance_loss_mlp": 1.05296445, - "epoch": 0.07515406583496168, - "flos": 24206844695040.0, - "grad_norm": 2.679176110316805, - "language_loss": 0.82353318, - "learning_rate": 3.978651115218482e-06, - "loss": 0.84601378, - "num_input_tokens_seen": 26567200, - "step": 1250, - "time_per_iteration": 4.367432594299316 - }, - { - "auxiliary_loss_clip": 0.011507, - "auxiliary_loss_mlp": 0.01064103, - "balance_loss_clip": 1.05736125, - "balance_loss_mlp": 1.0380677, - "epoch": 0.07521418908762964, - "flos": 26688164215680.0, - "grad_norm": 2.015636709873133, - "language_loss": 0.6679548, - "learning_rate": 3.978594324515215e-06, - "loss": 0.69010288, - "num_input_tokens_seen": 26586190, - "step": 1251, - "time_per_iteration": 4.339111089706421 - }, - { - "auxiliary_loss_clip": 0.01061099, - "auxiliary_loss_mlp": 0.01007289, - "balance_loss_clip": 1.02992618, - "balance_loss_mlp": 1.00314093, - "epoch": 0.0752743123402976, - "flos": 59095140589440.0, - "grad_norm": 0.9014655793512963, - "language_loss": 0.7038399, - "learning_rate": 3.9785374587834515e-06, - "loss": 0.72452378, - "num_input_tokens_seen": 26650710, - "step": 1252, - "time_per_iteration": 4.984445333480835 - }, - { - "auxiliary_loss_clip": 0.0120348, - "auxiliary_loss_mlp": 0.01071343, - "balance_loss_clip": 1.06016684, - "balance_loss_mlp": 1.04651129, - "epoch": 0.07533443559296558, - "flos": 23477032160640.0, - "grad_norm": 2.2789224049077226, - "language_loss": 0.79936707, - "learning_rate": 3.97848051802535e-06, - "loss": 0.82211524, - "num_input_tokens_seen": 26669000, - "step": 1253, - "time_per_iteration": 2.613696575164795 - }, - { - "auxiliary_loss_clip": 0.01165402, - "auxiliary_loss_mlp": 0.01062493, - "balance_loss_clip": 1.05703712, - "balance_loss_mlp": 1.03758967, - "epoch": 0.07539455884563355, - "flos": 20879110114560.0, - "grad_norm": 3.1057458778243263, - "language_loss": 0.93360364, - "learning_rate": 3.978423502243069e-06, - "loss": 0.95588255, - "num_input_tokens_seen": 26683075, - "step": 1254, - "time_per_iteration": 2.7332606315612793 - }, - { - "auxiliary_loss_clip": 0.011733, - "auxiliary_loss_mlp": 0.01064454, - "balance_loss_clip": 1.06050682, - "balance_loss_mlp": 1.03958726, - "epoch": 0.07545468209830151, - "flos": 27672906551040.0, - "grad_norm": 2.090631066181037, - "language_loss": 0.88087487, - "learning_rate": 3.97836641143877e-06, - "loss": 0.90325236, - "num_input_tokens_seen": 26701875, - "step": 1255, - "time_per_iteration": 2.713636875152588 - }, - { - "auxiliary_loss_clip": 0.01202338, - "auxiliary_loss_mlp": 0.01071467, - "balance_loss_clip": 1.06138325, - "balance_loss_mlp": 1.04531264, - "epoch": 0.0755148053509695, - "flos": 14136990370560.0, - "grad_norm": 1.9772348994273161, - "language_loss": 0.79305708, - "learning_rate": 3.978309245614618e-06, - "loss": 0.81579506, - "num_input_tokens_seen": 26719050, - "step": 1256, - "time_per_iteration": 2.688812255859375 - }, - { - "auxiliary_loss_clip": 0.01064506, - "auxiliary_loss_mlp": 0.01008663, - "balance_loss_clip": 1.0281384, - "balance_loss_mlp": 1.0043, - "epoch": 0.07557492860363746, - "flos": 58235257929600.0, - "grad_norm": 0.7721513084275832, - "language_loss": 0.58031851, - "learning_rate": 3.9782520047727825e-06, - "loss": 0.6010502, - "num_input_tokens_seen": 26780650, - "step": 1257, - "time_per_iteration": 3.290971517562866 - }, - { - "auxiliary_loss_clip": 0.01154091, - "auxiliary_loss_mlp": 0.01065293, - "balance_loss_clip": 1.06175375, - "balance_loss_mlp": 1.04035461, - "epoch": 0.07563505185630542, - "flos": 24644380262400.0, - "grad_norm": 2.5700283098608026, - "language_loss": 0.90029764, - "learning_rate": 3.978194688915432e-06, - "loss": 0.92249143, - "num_input_tokens_seen": 26798725, - "step": 1258, - "time_per_iteration": 2.800297975540161 - }, - { - "auxiliary_loss_clip": 0.01169581, - "auxiliary_loss_mlp": 0.01064585, - "balance_loss_clip": 1.06184185, - "balance_loss_mlp": 1.03797793, - "epoch": 0.07569517510897339, - "flos": 15522998515200.0, - "grad_norm": 2.1868972302346377, - "language_loss": 0.81404132, - "learning_rate": 3.978137298044741e-06, - "loss": 0.83638299, - "num_input_tokens_seen": 26817005, - "step": 1259, - "time_per_iteration": 2.767717123031616 - }, - { - "auxiliary_loss_clip": 0.01194891, - "auxiliary_loss_mlp": 0.01062022, - "balance_loss_clip": 1.06317782, - "balance_loss_mlp": 1.03766739, - "epoch": 0.07575529836164137, - "flos": 22928532503040.0, - "grad_norm": 1.8876128491153832, - "language_loss": 0.7609086, - "learning_rate": 3.978079832162885e-06, - "loss": 0.78347778, - "num_input_tokens_seen": 26836655, - "step": 1260, - "time_per_iteration": 2.859339714050293 - }, - { - "auxiliary_loss_clip": 0.01160098, - "auxiliary_loss_mlp": 0.01068568, - "balance_loss_clip": 1.05432057, - "balance_loss_mlp": 1.04222322, - "epoch": 0.07581542161430933, - "flos": 19500428344320.0, - "grad_norm": 1.7028037437197219, - "language_loss": 0.84734851, - "learning_rate": 3.978022291272044e-06, - "loss": 0.86963522, - "num_input_tokens_seen": 26854925, - "step": 1261, - "time_per_iteration": 2.773087978363037 - }, - { - "auxiliary_loss_clip": 0.01212087, - "auxiliary_loss_mlp": 0.0106726, - "balance_loss_clip": 1.06821966, - "balance_loss_mlp": 1.04273915, - "epoch": 0.0758755448669773, - "flos": 24973465691520.0, - "grad_norm": 1.8668314773439494, - "language_loss": 0.82578814, - "learning_rate": 3.977964675374399e-06, - "loss": 0.84858155, - "num_input_tokens_seen": 26876170, - "step": 1262, - "time_per_iteration": 2.681764841079712 - }, - { - "auxiliary_loss_clip": 0.01206367, - "auxiliary_loss_mlp": 0.0106285, - "balance_loss_clip": 1.06333947, - "balance_loss_mlp": 1.03685009, - "epoch": 0.07593566811964528, - "flos": 22747973811840.0, - "grad_norm": 2.501362251414687, - "language_loss": 0.82448232, - "learning_rate": 3.977906984472136e-06, - "loss": 0.84717447, - "num_input_tokens_seen": 26895005, - "step": 1263, - "time_per_iteration": 2.6262786388397217 - }, - { - "auxiliary_loss_clip": 0.01166059, - "auxiliary_loss_mlp": 0.01068738, - "balance_loss_clip": 1.06484997, - "balance_loss_mlp": 1.04334641, - "epoch": 0.07599579137231324, - "flos": 23112395245440.0, - "grad_norm": 2.171520639750579, - "language_loss": 0.76149648, - "learning_rate": 3.977849218567442e-06, - "loss": 0.78384447, - "num_input_tokens_seen": 26913930, - "step": 1264, - "time_per_iteration": 2.7735466957092285 - }, - { - "auxiliary_loss_clip": 0.01181777, - "auxiliary_loss_mlp": 0.01061673, - "balance_loss_clip": 1.06183577, - "balance_loss_mlp": 1.03704381, - "epoch": 0.07605591462498121, - "flos": 14502058248960.0, - "grad_norm": 2.252731793921747, - "language_loss": 0.80919051, - "learning_rate": 3.977791377662507e-06, - "loss": 0.83162498, - "num_input_tokens_seen": 26931485, - "step": 1265, - "time_per_iteration": 2.6076793670654297 - }, - { - "auxiliary_loss_clip": 0.01143593, - "auxiliary_loss_mlp": 0.01068856, - "balance_loss_clip": 1.05383801, - "balance_loss_mlp": 1.0411638, - "epoch": 0.07611603787764919, - "flos": 23514199758720.0, - "grad_norm": 2.117217065332582, - "language_loss": 0.65244937, - "learning_rate": 3.977733461759524e-06, - "loss": 0.67457378, - "num_input_tokens_seen": 26951670, - "step": 1266, - "time_per_iteration": 2.714848041534424 - }, - { - "auxiliary_loss_clip": 0.0116364, - "auxiliary_loss_mlp": 0.01066982, - "balance_loss_clip": 1.05869627, - "balance_loss_mlp": 1.04194832, - "epoch": 0.07617616113031715, - "flos": 21507188353920.0, - "grad_norm": 2.0157381540709416, - "language_loss": 0.79570109, - "learning_rate": 3.977675470860691e-06, - "loss": 0.81800735, - "num_input_tokens_seen": 26970335, - "step": 1267, - "time_per_iteration": 2.692220687866211 - }, - { - "auxiliary_loss_clip": 0.01186526, - "auxiliary_loss_mlp": 0.01060572, - "balance_loss_clip": 1.06368709, - "balance_loss_mlp": 1.03644359, - "epoch": 0.07623628438298512, - "flos": 14573161221120.0, - "grad_norm": 2.573855585409162, - "language_loss": 0.72936547, - "learning_rate": 3.977617404968205e-06, - "loss": 0.75183642, - "num_input_tokens_seen": 26986025, - "step": 1268, - "time_per_iteration": 2.666487216949463 - }, - { - "auxiliary_loss_clip": 0.01189272, - "auxiliary_loss_mlp": 0.01056943, - "balance_loss_clip": 1.05925119, - "balance_loss_mlp": 1.03146791, - "epoch": 0.07629640763565308, - "flos": 14720395069440.0, - "grad_norm": 2.3531002902867018, - "language_loss": 0.82087409, - "learning_rate": 3.977559264084269e-06, - "loss": 0.84333622, - "num_input_tokens_seen": 27004045, - "step": 1269, - "time_per_iteration": 2.6196024417877197 - }, - { - "auxiliary_loss_clip": 0.01198264, - "auxiliary_loss_mlp": 0.01062408, - "balance_loss_clip": 1.06528163, - "balance_loss_mlp": 1.03656352, - "epoch": 0.07635653088832106, - "flos": 14902929008640.0, - "grad_norm": 2.6660741307472424, - "language_loss": 0.88614184, - "learning_rate": 3.977501048211088e-06, - "loss": 0.90874851, - "num_input_tokens_seen": 27022070, - "step": 1270, - "time_per_iteration": 2.6423919200897217 - }, - { - "auxiliary_loss_clip": 0.01195764, - "auxiliary_loss_mlp": 0.01062092, - "balance_loss_clip": 1.06443572, - "balance_loss_mlp": 1.0371294, - "epoch": 0.07641665414098903, - "flos": 26651571235200.0, - "grad_norm": 2.486841045046768, - "language_loss": 0.7104162, - "learning_rate": 3.977442757350869e-06, - "loss": 0.73299474, - "num_input_tokens_seen": 27041755, - "step": 1271, - "time_per_iteration": 2.6679437160491943 - }, - { - "auxiliary_loss_clip": 0.01157818, - "auxiliary_loss_mlp": 0.01068131, - "balance_loss_clip": 1.05973268, - "balance_loss_mlp": 1.04282308, - "epoch": 0.07647677739365699, - "flos": 25192808092800.0, - "grad_norm": 1.5691807400142836, - "language_loss": 0.82570392, - "learning_rate": 3.977384391505823e-06, - "loss": 0.84796339, - "num_input_tokens_seen": 27061540, - "step": 1272, - "time_per_iteration": 2.7613680362701416 - }, - { - "auxiliary_loss_clip": 0.01176176, - "auxiliary_loss_mlp": 0.00782751, - "balance_loss_clip": 1.05822372, - "balance_loss_mlp": 1.00051665, - "epoch": 0.07653690064632497, - "flos": 20558141159040.0, - "grad_norm": 1.811509476700225, - "language_loss": 0.79854733, - "learning_rate": 3.977325950678162e-06, - "loss": 0.81813657, - "num_input_tokens_seen": 27081395, - "step": 1273, - "time_per_iteration": 2.696317434310913 - }, - { - "auxiliary_loss_clip": 0.01185133, - "auxiliary_loss_mlp": 0.01064308, - "balance_loss_clip": 1.06556833, - "balance_loss_mlp": 1.03910685, - "epoch": 0.07659702389899294, - "flos": 22269320150400.0, - "grad_norm": 1.7399681078894738, - "language_loss": 0.81519866, - "learning_rate": 3.977267434870103e-06, - "loss": 0.83769304, - "num_input_tokens_seen": 27101175, - "step": 1274, - "time_per_iteration": 2.8570950031280518 - }, - { - "auxiliary_loss_clip": 0.0118748, - "auxiliary_loss_mlp": 0.01078696, - "balance_loss_clip": 1.06516898, - "balance_loss_mlp": 1.05164731, - "epoch": 0.0766571471516609, - "flos": 32636120209920.0, - "grad_norm": 2.6845981005996453, - "language_loss": 0.73083639, - "learning_rate": 3.977208844083865e-06, - "loss": 0.75349814, - "num_input_tokens_seen": 27124505, - "step": 1275, - "time_per_iteration": 2.75947904586792 - }, - { - "auxiliary_loss_clip": 0.0121081, - "auxiliary_loss_mlp": 0.01063745, - "balance_loss_clip": 1.06740415, - "balance_loss_mlp": 1.03694642, - "epoch": 0.07671727040432888, - "flos": 15267386355840.0, - "grad_norm": 2.828157953752124, - "language_loss": 0.79507053, - "learning_rate": 3.9771501783216685e-06, - "loss": 0.81781602, - "num_input_tokens_seen": 27140960, - "step": 1276, - "time_per_iteration": 2.626683473587036 - }, - { - "auxiliary_loss_clip": 0.01198279, - "auxiliary_loss_mlp": 0.01058719, - "balance_loss_clip": 1.06486118, - "balance_loss_mlp": 1.03485298, - "epoch": 0.07677739365699685, - "flos": 28184094956160.0, - "grad_norm": 2.406514987231471, - "language_loss": 0.58915478, - "learning_rate": 3.97709143758574e-06, - "loss": 0.61172473, - "num_input_tokens_seen": 27160985, - "step": 1277, - "time_per_iteration": 2.6684958934783936 - }, - { - "auxiliary_loss_clip": 0.01201282, - "auxiliary_loss_mlp": 0.01064396, - "balance_loss_clip": 1.06430948, - "balance_loss_mlp": 1.03919542, - "epoch": 0.07683751690966481, - "flos": 18296128126080.0, - "grad_norm": 2.8024245322836046, - "language_loss": 0.74957907, - "learning_rate": 3.977032621878305e-06, - "loss": 0.77223587, - "num_input_tokens_seen": 27178390, - "step": 1278, - "time_per_iteration": 2.723675012588501 - }, - { - "auxiliary_loss_clip": 0.01160972, - "auxiliary_loss_mlp": 0.01063133, - "balance_loss_clip": 1.0584681, - "balance_loss_mlp": 1.0390408, - "epoch": 0.07689764016233278, - "flos": 21981101420160.0, - "grad_norm": 5.339853944094037, - "language_loss": 0.88594604, - "learning_rate": 3.976973731201596e-06, - "loss": 0.90818715, - "num_input_tokens_seen": 27197505, - "step": 1279, - "time_per_iteration": 2.655036211013794 - }, - { - "auxiliary_loss_clip": 0.01172627, - "auxiliary_loss_mlp": 0.01066586, - "balance_loss_clip": 1.06065845, - "balance_loss_mlp": 1.04077685, - "epoch": 0.07695776341500075, - "flos": 22235995307520.0, - "grad_norm": 2.4937131241937256, - "language_loss": 0.8300451, - "learning_rate": 3.976914765557845e-06, - "loss": 0.85243726, - "num_input_tokens_seen": 27214260, - "step": 1280, - "time_per_iteration": 2.7717065811157227 - }, - { - "auxiliary_loss_clip": 0.01194022, - "auxiliary_loss_mlp": 0.01066533, - "balance_loss_clip": 1.06593037, - "balance_loss_mlp": 1.04104638, - "epoch": 0.07701788666766872, - "flos": 16143750380160.0, - "grad_norm": 2.044864943195716, - "language_loss": 0.7581439, - "learning_rate": 3.9768557249492875e-06, - "loss": 0.78074944, - "num_input_tokens_seen": 27232525, - "step": 1281, - "time_per_iteration": 2.7444865703582764 - }, - { - "auxiliary_loss_clip": 0.01170775, - "auxiliary_loss_mlp": 0.01062526, - "balance_loss_clip": 1.05879402, - "balance_loss_mlp": 1.03669322, - "epoch": 0.07707800992033668, - "flos": 19463045264640.0, - "grad_norm": 1.8925477349429178, - "language_loss": 0.75091648, - "learning_rate": 3.9767966093781634e-06, - "loss": 0.77324951, - "num_input_tokens_seen": 27249800, - "step": 1282, - "time_per_iteration": 2.829145908355713 - }, - { - "auxiliary_loss_clip": 0.01213222, - "auxiliary_loss_mlp": 0.01071082, - "balance_loss_clip": 1.07007408, - "balance_loss_mlp": 1.04549992, - "epoch": 0.07713813317300466, - "flos": 18990281433600.0, - "grad_norm": 2.1558853998977527, - "language_loss": 0.83863324, - "learning_rate": 3.976737418846713e-06, - "loss": 0.8614763, - "num_input_tokens_seen": 27268895, - "step": 1283, - "time_per_iteration": 2.6955173015594482 - }, - { - "auxiliary_loss_clip": 0.0119621, - "auxiliary_loss_mlp": 0.01066889, - "balance_loss_clip": 1.06603825, - "balance_loss_mlp": 1.03925657, - "epoch": 0.07719825642567263, - "flos": 18113953322880.0, - "grad_norm": 2.520477290704422, - "language_loss": 0.75147104, - "learning_rate": 3.976678153357181e-06, - "loss": 0.77410209, - "num_input_tokens_seen": 27288180, - "step": 1284, - "time_per_iteration": 2.6589291095733643 - }, - { - "auxiliary_loss_clip": 0.01182212, - "auxiliary_loss_mlp": 0.01068485, - "balance_loss_clip": 1.06304765, - "balance_loss_mlp": 1.0438329, - "epoch": 0.0772583796783406, - "flos": 42194426993280.0, - "grad_norm": 5.2953301239297295, - "language_loss": 0.76224041, - "learning_rate": 3.976618812911817e-06, - "loss": 0.78474742, - "num_input_tokens_seen": 27311815, - "step": 1285, - "time_per_iteration": 2.847702741622925 - }, - { - "auxiliary_loss_clip": 0.01216302, - "auxiliary_loss_mlp": 0.01071451, - "balance_loss_clip": 1.07193899, - "balance_loss_mlp": 1.04729891, - "epoch": 0.07731850293100857, - "flos": 24753692327040.0, - "grad_norm": 2.0564733507641, - "language_loss": 0.84193194, - "learning_rate": 3.9765593975128685e-06, - "loss": 0.86480945, - "num_input_tokens_seen": 27331890, - "step": 1286, - "time_per_iteration": 2.713963270187378 - }, - { - "auxiliary_loss_clip": 0.01180469, - "auxiliary_loss_mlp": 0.01061062, - "balance_loss_clip": 1.06331325, - "balance_loss_mlp": 1.03646958, - "epoch": 0.07737862618367654, - "flos": 17565884628480.0, - "grad_norm": 2.810253293244863, - "language_loss": 0.76899689, - "learning_rate": 3.97649990716259e-06, - "loss": 0.79141217, - "num_input_tokens_seen": 27348320, - "step": 1287, - "time_per_iteration": 2.669168472290039 - }, - { - "auxiliary_loss_clip": 0.011763, - "auxiliary_loss_mlp": 0.01061108, - "balance_loss_clip": 1.05891848, - "balance_loss_mlp": 1.03696775, - "epoch": 0.0774387494363445, - "flos": 25627147349760.0, - "grad_norm": 1.6525652726351308, - "language_loss": 0.84699571, - "learning_rate": 3.976440341863237e-06, - "loss": 0.86936986, - "num_input_tokens_seen": 27367670, - "step": 1288, - "time_per_iteration": 2.7794599533081055 - }, - { - "auxiliary_loss_clip": 0.01206182, - "auxiliary_loss_mlp": 0.0106604, - "balance_loss_clip": 1.06214797, - "balance_loss_mlp": 1.04203176, - "epoch": 0.07749887268901248, - "flos": 12239865648000.0, - "grad_norm": 2.0424090794957523, - "language_loss": 0.85576034, - "learning_rate": 3.976380701617068e-06, - "loss": 0.87848258, - "num_input_tokens_seen": 27385485, - "step": 1289, - "time_per_iteration": 4.232934236526489 - }, - { - "auxiliary_loss_clip": 0.01207527, - "auxiliary_loss_mlp": 0.01052975, - "balance_loss_clip": 1.06487668, - "balance_loss_mlp": 1.0291574, - "epoch": 0.07755899594168045, - "flos": 25081736261760.0, - "grad_norm": 2.840721047922519, - "language_loss": 0.85548425, - "learning_rate": 3.976320986426344e-06, - "loss": 0.87808931, - "num_input_tokens_seen": 27405110, - "step": 1290, - "time_per_iteration": 4.218302965164185 - }, - { - "auxiliary_loss_clip": 0.0117374, - "auxiliary_loss_mlp": 0.01066698, - "balance_loss_clip": 1.06411862, - "balance_loss_mlp": 1.04041266, - "epoch": 0.07761911919434841, - "flos": 14246410176000.0, - "grad_norm": 2.3756178078405976, - "language_loss": 0.91390574, - "learning_rate": 3.9762611962933315e-06, - "loss": 0.93631011, - "num_input_tokens_seen": 27422855, - "step": 1291, - "time_per_iteration": 4.468304395675659 - }, - { - "auxiliary_loss_clip": 0.01081301, - "auxiliary_loss_mlp": 0.01043026, - "balance_loss_clip": 1.04092944, - "balance_loss_mlp": 1.03894901, - "epoch": 0.07767924244701638, - "flos": 67237202954880.0, - "grad_norm": 0.8973948861970446, - "language_loss": 0.65065891, - "learning_rate": 3.9762013312202955e-06, - "loss": 0.67190224, - "num_input_tokens_seen": 27487190, - "step": 1292, - "time_per_iteration": 3.3142755031585693 - }, - { - "auxiliary_loss_clip": 0.01195822, - "auxiliary_loss_mlp": 0.01062751, - "balance_loss_clip": 1.06527543, - "balance_loss_mlp": 1.03846776, - "epoch": 0.07773936569968436, - "flos": 28550635292160.0, - "grad_norm": 1.7595227960044768, - "language_loss": 0.87530363, - "learning_rate": 3.9761413912095075e-06, - "loss": 0.89788938, - "num_input_tokens_seen": 27510465, - "step": 1293, - "time_per_iteration": 2.801603078842163 - }, - { - "auxiliary_loss_clip": 0.01116633, - "auxiliary_loss_mlp": 0.01078659, - "balance_loss_clip": 1.05041039, - "balance_loss_mlp": 1.05012059, - "epoch": 0.07779948895235232, - "flos": 27490264871040.0, - "grad_norm": 2.2898991349098528, - "language_loss": 0.84518278, - "learning_rate": 3.976081376263239e-06, - "loss": 0.8671357, - "num_input_tokens_seen": 27528645, - "step": 1294, - "time_per_iteration": 2.898597002029419 - }, - { - "auxiliary_loss_clip": 0.01158796, - "auxiliary_loss_mlp": 0.01059505, - "balance_loss_clip": 1.05967593, - "balance_loss_mlp": 1.0342207, - "epoch": 0.07785961220502029, - "flos": 18223301301120.0, - "grad_norm": 2.7292442592472073, - "language_loss": 0.79365373, - "learning_rate": 3.976021286383768e-06, - "loss": 0.81583679, - "num_input_tokens_seen": 27546165, - "step": 1295, - "time_per_iteration": 2.8481552600860596 - }, - { - "auxiliary_loss_clip": 0.01155886, - "auxiliary_loss_mlp": 0.01061351, - "balance_loss_clip": 1.06015158, - "balance_loss_mlp": 1.0356493, - "epoch": 0.07791973545768827, - "flos": 24608218245120.0, - "grad_norm": 3.472740252224496, - "language_loss": 0.88351864, - "learning_rate": 3.975961121573371e-06, - "loss": 0.90569103, - "num_input_tokens_seen": 27566520, - "step": 1296, - "time_per_iteration": 2.697831392288208 - }, - { - "auxiliary_loss_clip": 0.0120756, - "auxiliary_loss_mlp": 0.01074146, - "balance_loss_clip": 1.06552935, - "balance_loss_mlp": 1.04791999, - "epoch": 0.07797985871035623, - "flos": 14282069402880.0, - "grad_norm": 2.384603846473911, - "language_loss": 0.9625901, - "learning_rate": 3.9759008818343305e-06, - "loss": 0.98540717, - "num_input_tokens_seen": 27581960, - "step": 1297, - "time_per_iteration": 2.62660551071167 - }, - { - "auxiliary_loss_clip": 0.01175852, - "auxiliary_loss_mlp": 0.01069298, - "balance_loss_clip": 1.06147313, - "balance_loss_mlp": 1.04517019, - "epoch": 0.0780399819630242, - "flos": 26610453141120.0, - "grad_norm": 2.15152040651991, - "language_loss": 0.7600193, - "learning_rate": 3.97584056716893e-06, - "loss": 0.78247076, - "num_input_tokens_seen": 27601415, - "step": 1298, - "time_per_iteration": 2.8040499687194824 - }, - { - "auxiliary_loss_clip": 0.0114505, - "auxiliary_loss_mlp": 0.00783981, - "balance_loss_clip": 1.05864501, - "balance_loss_mlp": 1.0006063, - "epoch": 0.07810010521569218, - "flos": 21834514016640.0, - "grad_norm": 1.6697657327886877, - "language_loss": 0.8097105, - "learning_rate": 3.9757801775794575e-06, - "loss": 0.82900077, - "num_input_tokens_seen": 27621490, - "step": 1299, - "time_per_iteration": 2.7667653560638428 - }, - { - "auxiliary_loss_clip": 0.01162638, - "auxiliary_loss_mlp": 0.01064395, - "balance_loss_clip": 1.06191885, - "balance_loss_mlp": 1.0393368, - "epoch": 0.07816022846836014, - "flos": 25081233471360.0, - "grad_norm": 1.9748762517467437, - "language_loss": 0.86755943, - "learning_rate": 3.975719713068202e-06, - "loss": 0.8898297, - "num_input_tokens_seen": 27640600, - "step": 1300, - "time_per_iteration": 2.7819204330444336 - }, - { - "auxiliary_loss_clip": 0.0120807, - "auxiliary_loss_mlp": 0.01056805, - "balance_loss_clip": 1.06663537, - "balance_loss_mlp": 1.03180683, - "epoch": 0.0782203517210281, - "flos": 40917515431680.0, - "grad_norm": 3.040560411644486, - "language_loss": 0.71822268, - "learning_rate": 3.975659173637458e-06, - "loss": 0.74087137, - "num_input_tokens_seen": 27663070, - "step": 1301, - "time_per_iteration": 2.845107316970825 - }, - { - "auxiliary_loss_clip": 0.01196566, - "auxiliary_loss_mlp": 0.01075534, - "balance_loss_clip": 1.06426311, - "balance_loss_mlp": 1.05100083, - "epoch": 0.07828047497369607, - "flos": 41172014269440.0, - "grad_norm": 1.6425838754876312, - "language_loss": 0.70782864, - "learning_rate": 3.97559855928952e-06, - "loss": 0.73054957, - "num_input_tokens_seen": 27686425, - "step": 1302, - "time_per_iteration": 2.898069381713867 - }, - { - "auxiliary_loss_clip": 0.01162032, - "auxiliary_loss_mlp": 0.00783256, - "balance_loss_clip": 1.06019354, - "balance_loss_mlp": 1.00062823, - "epoch": 0.07834059822636405, - "flos": 23508130360320.0, - "grad_norm": 2.067506704059933, - "language_loss": 0.82100385, - "learning_rate": 3.9755378700266864e-06, - "loss": 0.84045678, - "num_input_tokens_seen": 27704900, - "step": 1303, - "time_per_iteration": 2.7862839698791504 - }, - { - "auxiliary_loss_clip": 0.01191742, - "auxiliary_loss_mlp": 0.01074585, - "balance_loss_clip": 1.06583321, - "balance_loss_mlp": 1.04908574, - "epoch": 0.07840072147903202, - "flos": 20193899293440.0, - "grad_norm": 1.8830773419754625, - "language_loss": 0.75206572, - "learning_rate": 3.9754771058512585e-06, - "loss": 0.77472901, - "num_input_tokens_seen": 27724890, - "step": 1304, - "time_per_iteration": 2.7380170822143555 - }, - { - "auxiliary_loss_clip": 0.01211207, - "auxiliary_loss_mlp": 0.01074343, - "balance_loss_clip": 1.07114935, - "balance_loss_mlp": 1.04922605, - "epoch": 0.07846084473169998, - "flos": 21360816432000.0, - "grad_norm": 1.6118444643214749, - "language_loss": 0.76141047, - "learning_rate": 3.975416266765542e-06, - "loss": 0.784266, - "num_input_tokens_seen": 27743115, - "step": 1305, - "time_per_iteration": 2.6788928508758545 - }, - { - "auxiliary_loss_clip": 0.01137547, - "auxiliary_loss_mlp": 0.01064795, - "balance_loss_clip": 1.05611205, - "balance_loss_mlp": 1.04021358, - "epoch": 0.07852096798436796, - "flos": 25410965345280.0, - "grad_norm": 1.9541638070229452, - "language_loss": 0.85011744, - "learning_rate": 3.975355352771841e-06, - "loss": 0.87214082, - "num_input_tokens_seen": 27763570, - "step": 1306, - "time_per_iteration": 3.048137903213501 - }, - { - "auxiliary_loss_clip": 0.01194779, - "auxiliary_loss_mlp": 0.01049822, - "balance_loss_clip": 1.06754708, - "balance_loss_mlp": 1.02668333, - "epoch": 0.07858109123703592, - "flos": 24571481610240.0, - "grad_norm": 6.108459548145404, - "language_loss": 0.90882134, - "learning_rate": 3.975294363872468e-06, - "loss": 0.93126732, - "num_input_tokens_seen": 27780030, - "step": 1307, - "time_per_iteration": 3.1597135066986084 - }, - { - "auxiliary_loss_clip": 0.01145989, - "auxiliary_loss_mlp": 0.01060478, - "balance_loss_clip": 1.05529833, - "balance_loss_mlp": 1.034729, - "epoch": 0.07864121448970389, - "flos": 20698874645760.0, - "grad_norm": 3.4991416096159136, - "language_loss": 0.83695096, - "learning_rate": 3.975233300069735e-06, - "loss": 0.85901558, - "num_input_tokens_seen": 27796225, - "step": 1308, - "time_per_iteration": 2.749174118041992 - }, - { - "auxiliary_loss_clip": 0.01151044, - "auxiliary_loss_mlp": 0.01061966, - "balance_loss_clip": 1.05445218, - "balance_loss_mlp": 1.03789735, - "epoch": 0.07870133774237187, - "flos": 22966526113920.0, - "grad_norm": 1.7092634116882437, - "language_loss": 0.77521002, - "learning_rate": 3.975172161365958e-06, - "loss": 0.7973401, - "num_input_tokens_seen": 27815975, - "step": 1309, - "time_per_iteration": 2.752854108810425 - }, - { - "auxiliary_loss_clip": 0.01200102, - "auxiliary_loss_mlp": 0.01070583, - "balance_loss_clip": 1.06396675, - "balance_loss_mlp": 1.04449987, - "epoch": 0.07876146099503983, - "flos": 18842832103680.0, - "grad_norm": 1.8729662604656268, - "language_loss": 0.80561006, - "learning_rate": 3.975110947763453e-06, - "loss": 0.82831693, - "num_input_tokens_seen": 27832255, - "step": 1310, - "time_per_iteration": 2.6966710090637207 - }, - { - "auxiliary_loss_clip": 0.01173381, - "auxiliary_loss_mlp": 0.0078245, - "balance_loss_clip": 1.06193507, - "balance_loss_mlp": 1.00060987, - "epoch": 0.0788215842477078, - "flos": 23805794367360.0, - "grad_norm": 1.796715978968241, - "language_loss": 0.73187977, - "learning_rate": 3.9750496592645435e-06, - "loss": 0.75143808, - "num_input_tokens_seen": 27852180, - "step": 1311, - "time_per_iteration": 2.7588090896606445 - }, - { - "auxiliary_loss_clip": 0.01188438, - "auxiliary_loss_mlp": 0.01078546, - "balance_loss_clip": 1.06358969, - "balance_loss_mlp": 1.05342865, - "epoch": 0.07888170750037576, - "flos": 21579907438080.0, - "grad_norm": 1.7490617386556226, - "language_loss": 0.86002982, - "learning_rate": 3.974988295871553e-06, - "loss": 0.88269973, - "num_input_tokens_seen": 27871435, - "step": 1312, - "time_per_iteration": 2.6969683170318604 - }, - { - "auxiliary_loss_clip": 0.01178338, - "auxiliary_loss_mlp": 0.01059112, - "balance_loss_clip": 1.06324685, - "balance_loss_mlp": 1.03633142, - "epoch": 0.07894183075304374, - "flos": 19864849777920.0, - "grad_norm": 1.825664315845032, - "language_loss": 0.82087892, - "learning_rate": 3.9749268575868085e-06, - "loss": 0.84325337, - "num_input_tokens_seen": 27890625, - "step": 1313, - "time_per_iteration": 2.6936304569244385 - }, - { - "auxiliary_loss_clip": 0.01184798, - "auxiliary_loss_mlp": 0.00783631, - "balance_loss_clip": 1.06229842, - "balance_loss_mlp": 1.00053823, - "epoch": 0.07900195400571171, - "flos": 16143463071360.0, - "grad_norm": 2.837190319075622, - "language_loss": 0.73569417, - "learning_rate": 3.97486534441264e-06, - "loss": 0.75537837, - "num_input_tokens_seen": 27906530, - "step": 1314, - "time_per_iteration": 2.653505325317383 - }, - { - "auxiliary_loss_clip": 0.01154585, - "auxiliary_loss_mlp": 0.00782352, - "balance_loss_clip": 1.05730104, - "balance_loss_mlp": 1.00044668, - "epoch": 0.07906207725837967, - "flos": 23730417676800.0, - "grad_norm": 1.6153694611764058, - "language_loss": 0.79490477, - "learning_rate": 3.974803756351379e-06, - "loss": 0.81427419, - "num_input_tokens_seen": 27926725, - "step": 1315, - "time_per_iteration": 2.797306776046753 - }, - { - "auxiliary_loss_clip": 0.01189107, - "auxiliary_loss_mlp": 0.01060743, - "balance_loss_clip": 1.05841756, - "balance_loss_mlp": 1.03487444, - "epoch": 0.07912220051104765, - "flos": 24315905364480.0, - "grad_norm": 1.6362349035659796, - "language_loss": 0.73546493, - "learning_rate": 3.974742093405362e-06, - "loss": 0.75796348, - "num_input_tokens_seen": 27947875, - "step": 1316, - "time_per_iteration": 2.688997507095337 - }, - { - "auxiliary_loss_clip": 0.01162651, - "auxiliary_loss_mlp": 0.01066617, - "balance_loss_clip": 1.05845332, - "balance_loss_mlp": 1.0418098, - "epoch": 0.07918232376371562, - "flos": 18880035615360.0, - "grad_norm": 2.157376902111077, - "language_loss": 0.65540409, - "learning_rate": 3.974680355576927e-06, - "loss": 0.67769682, - "num_input_tokens_seen": 27965040, - "step": 1317, - "time_per_iteration": 2.6998519897460938 - }, - { - "auxiliary_loss_clip": 0.01177674, - "auxiliary_loss_mlp": 0.01068635, - "balance_loss_clip": 1.06280386, - "balance_loss_mlp": 1.0428021, - "epoch": 0.07924244701638358, - "flos": 27376284038400.0, - "grad_norm": 2.382161374765057, - "language_loss": 0.73105192, - "learning_rate": 3.974618542868415e-06, - "loss": 0.75351495, - "num_input_tokens_seen": 27985330, - "step": 1318, - "time_per_iteration": 2.8350789546966553 - }, - { - "auxiliary_loss_clip": 0.01139638, - "auxiliary_loss_mlp": 0.01058798, - "balance_loss_clip": 1.05582452, - "balance_loss_mlp": 1.03515935, - "epoch": 0.07930257026905156, - "flos": 25120340403840.0, - "grad_norm": 2.635941883481154, - "language_loss": 0.90381306, - "learning_rate": 3.97455665528217e-06, - "loss": 0.92579746, - "num_input_tokens_seen": 28007615, - "step": 1319, - "time_per_iteration": 2.8553895950317383 - }, - { - "auxiliary_loss_clip": 0.01175059, - "auxiliary_loss_mlp": 0.01055333, - "balance_loss_clip": 1.05662942, - "balance_loss_mlp": 1.03122926, - "epoch": 0.07936269352171953, - "flos": 21834478103040.0, - "grad_norm": 1.9449065990449943, - "language_loss": 0.80134505, - "learning_rate": 3.974494692820539e-06, - "loss": 0.82364893, - "num_input_tokens_seen": 28027765, - "step": 1320, - "time_per_iteration": 2.6651997566223145 - }, - { - "auxiliary_loss_clip": 0.01181808, - "auxiliary_loss_mlp": 0.01060151, - "balance_loss_clip": 1.06380332, - "balance_loss_mlp": 1.03657198, - "epoch": 0.07942281677438749, - "flos": 16939889377920.0, - "grad_norm": 2.1078540484546746, - "language_loss": 0.6901226, - "learning_rate": 3.974432655485872e-06, - "loss": 0.71254218, - "num_input_tokens_seen": 28044225, - "step": 1321, - "time_per_iteration": 2.6500401496887207 - }, - { - "auxiliary_loss_clip": 0.01189002, - "auxiliary_loss_mlp": 0.01060598, - "balance_loss_clip": 1.06469131, - "balance_loss_mlp": 1.03688753, - "epoch": 0.07948294002705546, - "flos": 18986941468800.0, - "grad_norm": 1.9310950096267907, - "language_loss": 0.8359012, - "learning_rate": 3.9743705432805195e-06, - "loss": 0.85839725, - "num_input_tokens_seen": 28062915, - "step": 1322, - "time_per_iteration": 2.684978723526001 - }, - { - "auxiliary_loss_clip": 0.01202147, - "auxiliary_loss_mlp": 0.01057117, - "balance_loss_clip": 1.06135976, - "balance_loss_mlp": 1.03304851, - "epoch": 0.07954306327972344, - "flos": 21653452535040.0, - "grad_norm": 2.128262121046283, - "language_loss": 0.90555447, - "learning_rate": 3.974308356206838e-06, - "loss": 0.92814714, - "num_input_tokens_seen": 28082175, - "step": 1323, - "time_per_iteration": 2.6192240715026855 - }, - { - "auxiliary_loss_clip": 0.01164151, - "auxiliary_loss_mlp": 0.01062303, - "balance_loss_clip": 1.06272292, - "balance_loss_mlp": 1.03809166, - "epoch": 0.0796031865323914, - "flos": 23220270766080.0, - "grad_norm": 1.8373443631598505, - "language_loss": 0.82521075, - "learning_rate": 3.974246094267187e-06, - "loss": 0.84747529, - "num_input_tokens_seen": 28102645, - "step": 1324, - "time_per_iteration": 2.8283956050872803 - }, - { - "auxiliary_loss_clip": 0.01180787, - "auxiliary_loss_mlp": 0.01053463, - "balance_loss_clip": 1.06256735, - "balance_loss_mlp": 1.02834535, - "epoch": 0.07966330978505937, - "flos": 23294534135040.0, - "grad_norm": 2.119290865165494, - "language_loss": 0.79162025, - "learning_rate": 3.974183757463925e-06, - "loss": 0.8139627, - "num_input_tokens_seen": 28122805, - "step": 1325, - "time_per_iteration": 2.6996092796325684 - }, - { - "auxiliary_loss_clip": 0.01119286, - "auxiliary_loss_mlp": 0.00785175, - "balance_loss_clip": 1.04844928, - "balance_loss_mlp": 1.00035501, - "epoch": 0.07972343303772735, - "flos": 18363783392640.0, - "grad_norm": 2.2621745256944448, - "language_loss": 0.88038248, - "learning_rate": 3.974121345799418e-06, - "loss": 0.89942712, - "num_input_tokens_seen": 28140530, - "step": 1326, - "time_per_iteration": 2.881410837173462 - }, - { - "auxiliary_loss_clip": 0.012, - "auxiliary_loss_mlp": 0.01056877, - "balance_loss_clip": 1.06257951, - "balance_loss_mlp": 1.03168797, - "epoch": 0.07978355629039531, - "flos": 21762513204480.0, - "grad_norm": 1.8538865301137586, - "language_loss": 0.8328709, - "learning_rate": 3.974058859276032e-06, - "loss": 0.85543966, - "num_input_tokens_seen": 28159640, - "step": 1327, - "time_per_iteration": 2.7277982234954834 - }, - { - "auxiliary_loss_clip": 0.01207207, - "auxiliary_loss_mlp": 0.01056886, - "balance_loss_clip": 1.06532371, - "balance_loss_mlp": 1.03223395, - "epoch": 0.07984367954306328, - "flos": 18551309322240.0, - "grad_norm": 2.3216818645515636, - "language_loss": 0.78599, - "learning_rate": 3.9739962978961354e-06, - "loss": 0.80863088, - "num_input_tokens_seen": 28177050, - "step": 1328, - "time_per_iteration": 4.2137157917022705 - }, - { - "auxiliary_loss_clip": 0.01201442, - "auxiliary_loss_mlp": 0.01052053, - "balance_loss_clip": 1.06778932, - "balance_loss_mlp": 1.02722156, - "epoch": 0.07990380279573125, - "flos": 16904050583040.0, - "grad_norm": 4.209530911932697, - "language_loss": 0.73918134, - "learning_rate": 3.973933661662101e-06, - "loss": 0.76171625, - "num_input_tokens_seen": 28193245, - "step": 1329, - "time_per_iteration": 5.853717565536499 - }, - { - "auxiliary_loss_clip": 0.01169795, - "auxiliary_loss_mlp": 0.01064631, - "balance_loss_clip": 1.06039059, - "balance_loss_mlp": 1.04069376, - "epoch": 0.07996392604839922, - "flos": 24098358643200.0, - "grad_norm": 1.6102544328312476, - "language_loss": 0.81743932, - "learning_rate": 3.973870950576305e-06, - "loss": 0.83978355, - "num_input_tokens_seen": 28213570, - "step": 1330, - "time_per_iteration": 4.307915687561035 - }, - { - "auxiliary_loss_clip": 0.01205148, - "auxiliary_loss_mlp": 0.00780735, - "balance_loss_clip": 1.06445098, - "balance_loss_mlp": 1.00030971, - "epoch": 0.08002404930106718, - "flos": 14278729438080.0, - "grad_norm": 3.0935981151455865, - "language_loss": 0.88962448, - "learning_rate": 3.9738081646411255e-06, - "loss": 0.90948325, - "num_input_tokens_seen": 28229980, - "step": 1331, - "time_per_iteration": 2.645198345184326 - }, - { - "auxiliary_loss_clip": 0.01196019, - "auxiliary_loss_mlp": 0.00781409, - "balance_loss_clip": 1.05950165, - "balance_loss_mlp": 1.00032377, - "epoch": 0.08008417255373516, - "flos": 40406219285760.0, - "grad_norm": 1.8933982437719925, - "language_loss": 0.7335732, - "learning_rate": 3.973745303858942e-06, - "loss": 0.75334752, - "num_input_tokens_seen": 28253840, - "step": 1332, - "time_per_iteration": 2.792128562927246 - }, - { - "auxiliary_loss_clip": 0.01180359, - "auxiliary_loss_mlp": 0.01055118, - "balance_loss_clip": 1.06217384, - "balance_loss_mlp": 1.03216982, - "epoch": 0.08014429580640313, - "flos": 18478913460480.0, - "grad_norm": 1.7464568676953767, - "language_loss": 0.82765031, - "learning_rate": 3.973682368232138e-06, - "loss": 0.85000509, - "num_input_tokens_seen": 28271675, - "step": 1333, - "time_per_iteration": 2.635579824447632 - }, - { - "auxiliary_loss_clip": 0.01160554, - "auxiliary_loss_mlp": 0.01059025, - "balance_loss_clip": 1.05944169, - "balance_loss_mlp": 1.03502798, - "epoch": 0.0802044190590711, - "flos": 22053461368320.0, - "grad_norm": 2.677615191761892, - "language_loss": 0.74862051, - "learning_rate": 3.9736193577631015e-06, - "loss": 0.77081633, - "num_input_tokens_seen": 28291850, - "step": 1334, - "time_per_iteration": 2.8150298595428467 - }, - { - "auxiliary_loss_clip": 0.01176175, - "auxiliary_loss_mlp": 0.01063593, - "balance_loss_clip": 1.06460369, - "balance_loss_mlp": 1.04010868, - "epoch": 0.08026454231173906, - "flos": 24572128055040.0, - "grad_norm": 1.8723728369534094, - "language_loss": 0.79970533, - "learning_rate": 3.973556272454221e-06, - "loss": 0.82210302, - "num_input_tokens_seen": 28310780, - "step": 1335, - "time_per_iteration": 2.6858503818511963 - }, - { - "auxiliary_loss_clip": 0.01068232, - "auxiliary_loss_mlp": 0.01020395, - "balance_loss_clip": 1.04101062, - "balance_loss_mlp": 1.01693749, - "epoch": 0.08032466556440704, - "flos": 52581841459200.0, - "grad_norm": 0.7491611763509133, - "language_loss": 0.56056821, - "learning_rate": 3.973493112307889e-06, - "loss": 0.58145452, - "num_input_tokens_seen": 28369985, - "step": 1336, - "time_per_iteration": 3.324230670928955 - }, - { - "auxiliary_loss_clip": 0.01179495, - "auxiliary_loss_mlp": 0.01064433, - "balance_loss_clip": 1.06005239, - "balance_loss_mlp": 1.04149771, - "epoch": 0.080384788817075, - "flos": 23842602829440.0, - "grad_norm": 2.8990759307469256, - "language_loss": 0.67587668, - "learning_rate": 3.9734298773265005e-06, - "loss": 0.69831598, - "num_input_tokens_seen": 28388670, - "step": 1337, - "time_per_iteration": 2.755451202392578 - }, - { - "auxiliary_loss_clip": 0.01171763, - "auxiliary_loss_mlp": 0.0107788, - "balance_loss_clip": 1.06270492, - "balance_loss_mlp": 1.05304837, - "epoch": 0.08044491206974297, - "flos": 25300719527040.0, - "grad_norm": 1.9421039451316542, - "language_loss": 0.86847901, - "learning_rate": 3.973366567512453e-06, - "loss": 0.89097536, - "num_input_tokens_seen": 28411845, - "step": 1338, - "time_per_iteration": 2.758418560028076 - }, - { - "auxiliary_loss_clip": 0.01136344, - "auxiliary_loss_mlp": 0.01082295, - "balance_loss_clip": 1.04883683, - "balance_loss_mlp": 1.05596161, - "epoch": 0.08050503532241095, - "flos": 22376549226240.0, - "grad_norm": 2.4557709650828157, - "language_loss": 0.87217385, - "learning_rate": 3.973303182868147e-06, - "loss": 0.89436018, - "num_input_tokens_seen": 28427875, - "step": 1339, - "time_per_iteration": 2.72682785987854 - }, - { - "auxiliary_loss_clip": 0.01188632, - "auxiliary_loss_mlp": 0.01055953, - "balance_loss_clip": 1.06334567, - "balance_loss_mlp": 1.03417385, - "epoch": 0.08056515857507891, - "flos": 18369421827840.0, - "grad_norm": 10.603370056653041, - "language_loss": 0.89504963, - "learning_rate": 3.973239723395988e-06, - "loss": 0.91749549, - "num_input_tokens_seen": 28446615, - "step": 1340, - "time_per_iteration": 2.639601469039917 - }, - { - "auxiliary_loss_clip": 0.01080107, - "auxiliary_loss_mlp": 0.01012224, - "balance_loss_clip": 1.02943289, - "balance_loss_mlp": 1.00850451, - "epoch": 0.08062528182774688, - "flos": 51348130980480.0, - "grad_norm": 0.8861598592181924, - "language_loss": 0.64834231, - "learning_rate": 3.97317618909838e-06, - "loss": 0.66926563, - "num_input_tokens_seen": 28505290, - "step": 1341, - "time_per_iteration": 3.0625648498535156 - }, - { - "auxiliary_loss_clip": 0.01197538, - "auxiliary_loss_mlp": 0.01061885, - "balance_loss_clip": 1.0628854, - "balance_loss_mlp": 1.0364095, - "epoch": 0.08068540508041486, - "flos": 17599712261760.0, - "grad_norm": 3.3156125209451286, - "language_loss": 0.89471233, - "learning_rate": 3.973112579977733e-06, - "loss": 0.9173066, - "num_input_tokens_seen": 28522735, - "step": 1342, - "time_per_iteration": 2.6123783588409424 - }, - { - "auxiliary_loss_clip": 0.01177687, - "auxiliary_loss_mlp": 0.01062063, - "balance_loss_clip": 1.0644995, - "balance_loss_mlp": 1.03818512, - "epoch": 0.08074552833308282, - "flos": 10561185486720.0, - "grad_norm": 2.2904075751929365, - "language_loss": 0.76354575, - "learning_rate": 3.973048896036459e-06, - "loss": 0.78594327, - "num_input_tokens_seen": 28539460, - "step": 1343, - "time_per_iteration": 2.7564918994903564 - }, - { - "auxiliary_loss_clip": 0.01064182, - "auxiliary_loss_mlp": 0.01010488, - "balance_loss_clip": 1.02542567, - "balance_loss_mlp": 1.0066731, - "epoch": 0.08080565158575079, - "flos": 60840254954880.0, - "grad_norm": 0.8071281523255156, - "language_loss": 0.57418531, - "learning_rate": 3.972985137276974e-06, - "loss": 0.59493202, - "num_input_tokens_seen": 28599855, - "step": 1344, - "time_per_iteration": 3.170443058013916 - }, - { - "auxiliary_loss_clip": 0.01158029, - "auxiliary_loss_mlp": 0.01063108, - "balance_loss_clip": 1.05839872, - "balance_loss_mlp": 1.03846788, - "epoch": 0.08086577483841875, - "flos": 18332361970560.0, - "grad_norm": 2.5953739346171676, - "language_loss": 0.86569476, - "learning_rate": 3.972921303701695e-06, - "loss": 0.88790607, - "num_input_tokens_seen": 28617585, - "step": 1345, - "time_per_iteration": 2.765254497528076 - }, - { - "auxiliary_loss_clip": 0.01203428, - "auxiliary_loss_mlp": 0.01057879, - "balance_loss_clip": 1.06629944, - "balance_loss_mlp": 1.03603959, - "epoch": 0.08092589809108673, - "flos": 21543601766400.0, - "grad_norm": 1.8653844332842058, - "language_loss": 0.87646407, - "learning_rate": 3.972857395313042e-06, - "loss": 0.89907712, - "num_input_tokens_seen": 28636355, - "step": 1346, - "time_per_iteration": 2.655611991882324 - }, - { - "auxiliary_loss_clip": 0.01191822, - "auxiliary_loss_mlp": 0.0105414, - "balance_loss_clip": 1.06450033, - "balance_loss_mlp": 1.03047693, - "epoch": 0.0809860213437547, - "flos": 22128012046080.0, - "grad_norm": 1.7047476553504466, - "language_loss": 0.9298563, - "learning_rate": 3.972793412113439e-06, - "loss": 0.95231593, - "num_input_tokens_seen": 28656260, - "step": 1347, - "time_per_iteration": 2.718355417251587 - }, - { - "auxiliary_loss_clip": 0.01188696, - "auxiliary_loss_mlp": 0.01066703, - "balance_loss_clip": 1.06260633, - "balance_loss_mlp": 1.04144263, - "epoch": 0.08104614459642266, - "flos": 21725489260800.0, - "grad_norm": 1.9307860049130865, - "language_loss": 0.89506733, - "learning_rate": 3.972729354105312e-06, - "loss": 0.91762137, - "num_input_tokens_seen": 28675865, - "step": 1348, - "time_per_iteration": 2.763735771179199 - }, - { - "auxiliary_loss_clip": 0.01137961, - "auxiliary_loss_mlp": 0.01059733, - "balance_loss_clip": 1.06026649, - "balance_loss_mlp": 1.03730989, - "epoch": 0.08110626784909064, - "flos": 23951878980480.0, - "grad_norm": 1.6214351378274148, - "language_loss": 0.76906884, - "learning_rate": 3.97266522129109e-06, - "loss": 0.79104578, - "num_input_tokens_seen": 28696255, - "step": 1349, - "time_per_iteration": 2.778050661087036 - }, - { - "auxiliary_loss_clip": 0.01202122, - "auxiliary_loss_mlp": 0.01065092, - "balance_loss_clip": 1.06290889, - "balance_loss_mlp": 1.04144049, - "epoch": 0.0811663911017586, - "flos": 19025689265280.0, - "grad_norm": 1.777484449358279, - "language_loss": 0.8877703, - "learning_rate": 3.972601013673205e-06, - "loss": 0.91044247, - "num_input_tokens_seen": 28713905, - "step": 1350, - "time_per_iteration": 2.5871450901031494 - }, - { - "auxiliary_loss_clip": 0.01164889, - "auxiliary_loss_mlp": 0.00780958, - "balance_loss_clip": 1.06011164, - "balance_loss_mlp": 1.00028801, - "epoch": 0.08122651435442657, - "flos": 15341290588800.0, - "grad_norm": 2.7472756845793156, - "language_loss": 0.82298493, - "learning_rate": 3.972536731254092e-06, - "loss": 0.84244347, - "num_input_tokens_seen": 28732075, - "step": 1351, - "time_per_iteration": 2.840271234512329 - }, - { - "auxiliary_loss_clip": 0.01198177, - "auxiliary_loss_mlp": 0.01055773, - "balance_loss_clip": 1.06010592, - "balance_loss_mlp": 1.03090644, - "epoch": 0.08128663760709455, - "flos": 23221563655680.0, - "grad_norm": 2.2808101252466724, - "language_loss": 0.75274944, - "learning_rate": 3.972472374036189e-06, - "loss": 0.775289, - "num_input_tokens_seen": 28751150, - "step": 1352, - "time_per_iteration": 2.733644485473633 - }, - { - "auxiliary_loss_clip": 0.01194643, - "auxiliary_loss_mlp": 0.00783595, - "balance_loss_clip": 1.06613326, - "balance_loss_mlp": 1.00036311, - "epoch": 0.08134676085976252, - "flos": 22965628273920.0, - "grad_norm": 1.678520960707938, - "language_loss": 0.82936156, - "learning_rate": 3.972407942021935e-06, - "loss": 0.84914398, - "num_input_tokens_seen": 28773360, - "step": 1353, - "time_per_iteration": 2.742149829864502 - }, - { - "auxiliary_loss_clip": 0.01068236, - "auxiliary_loss_mlp": 0.01015932, - "balance_loss_clip": 1.02440155, - "balance_loss_mlp": 1.01242769, - "epoch": 0.08140688411243048, - "flos": 64322115816960.0, - "grad_norm": 0.8516312511934722, - "language_loss": 0.59741521, - "learning_rate": 3.972343435213775e-06, - "loss": 0.61825693, - "num_input_tokens_seen": 28833390, - "step": 1354, - "time_per_iteration": 3.1912426948547363 - }, - { - "auxiliary_loss_clip": 0.01150343, - "auxiliary_loss_mlp": 0.01058874, - "balance_loss_clip": 1.0546236, - "balance_loss_mlp": 1.03583086, - "epoch": 0.08146700736509845, - "flos": 22491858862080.0, - "grad_norm": 2.1234068486581643, - "language_loss": 0.82310611, - "learning_rate": 3.972278853614154e-06, - "loss": 0.84519827, - "num_input_tokens_seen": 28852430, - "step": 1355, - "time_per_iteration": 2.782442808151245 - }, - { - "auxiliary_loss_clip": 0.01186948, - "auxiliary_loss_mlp": 0.01062856, - "balance_loss_clip": 1.0600667, - "balance_loss_mlp": 1.03801262, - "epoch": 0.08152713061776642, - "flos": 20447823513600.0, - "grad_norm": 1.8366299277102565, - "language_loss": 0.7135247, - "learning_rate": 3.972214197225521e-06, - "loss": 0.73602271, - "num_input_tokens_seen": 28870685, - "step": 1356, - "time_per_iteration": 2.7777554988861084 - }, - { - "auxiliary_loss_clip": 0.01194666, - "auxiliary_loss_mlp": 0.01056522, - "balance_loss_clip": 1.06462216, - "balance_loss_mlp": 1.03259718, - "epoch": 0.08158725387043439, - "flos": 23550218121600.0, - "grad_norm": 2.050923525150184, - "language_loss": 0.70426142, - "learning_rate": 3.972149466050329e-06, - "loss": 0.72677326, - "num_input_tokens_seen": 28889860, - "step": 1357, - "time_per_iteration": 2.852046012878418 - }, - { - "auxiliary_loss_clip": 0.01186996, - "auxiliary_loss_mlp": 0.01054475, - "balance_loss_clip": 1.06138206, - "balance_loss_mlp": 1.03070426, - "epoch": 0.08164737712310235, - "flos": 22017335264640.0, - "grad_norm": 2.634204556872777, - "language_loss": 0.84203482, - "learning_rate": 3.97208466009103e-06, - "loss": 0.8644495, - "num_input_tokens_seen": 28905865, - "step": 1358, - "time_per_iteration": 2.7127115726470947 - }, - { - "auxiliary_loss_clip": 0.01176629, - "auxiliary_loss_mlp": 0.010566, - "balance_loss_clip": 1.06037402, - "balance_loss_mlp": 1.03154182, - "epoch": 0.08170750037577033, - "flos": 23367827836800.0, - "grad_norm": 2.1726272773281097, - "language_loss": 1.02781308, - "learning_rate": 3.972019779350084e-06, - "loss": 1.05014539, - "num_input_tokens_seen": 28925250, - "step": 1359, - "time_per_iteration": 2.7171826362609863 - }, - { - "auxiliary_loss_clip": 0.01128357, - "auxiliary_loss_mlp": 0.01056774, - "balance_loss_clip": 1.05009234, - "balance_loss_mlp": 1.03263426, - "epoch": 0.0817676236284383, - "flos": 28397978490240.0, - "grad_norm": 2.0494617207464945, - "language_loss": 0.8313604, - "learning_rate": 3.971954823829951e-06, - "loss": 0.85321164, - "num_input_tokens_seen": 28943445, - "step": 1360, - "time_per_iteration": 2.9020919799804688 - }, - { - "auxiliary_loss_clip": 0.01202956, - "auxiliary_loss_mlp": 0.0106887, - "balance_loss_clip": 1.06274688, - "balance_loss_mlp": 1.04469395, - "epoch": 0.08182774688110626, - "flos": 19208905562880.0, - "grad_norm": 5.2377005088202075, - "language_loss": 0.72322488, - "learning_rate": 3.971889793533093e-06, - "loss": 0.74594313, - "num_input_tokens_seen": 28962695, - "step": 1361, - "time_per_iteration": 2.6643178462982178 - }, - { - "auxiliary_loss_clip": 0.01166556, - "auxiliary_loss_mlp": 0.01056311, - "balance_loss_clip": 1.0552367, - "balance_loss_mlp": 1.03184962, - "epoch": 0.08188787013377424, - "flos": 22784099915520.0, - "grad_norm": 28.302545492028134, - "language_loss": 0.76657653, - "learning_rate": 3.971824688461976e-06, - "loss": 0.78880513, - "num_input_tokens_seen": 28982120, - "step": 1362, - "time_per_iteration": 2.7439064979553223 - }, - { - "auxiliary_loss_clip": 0.01199728, - "auxiliary_loss_mlp": 0.01053492, - "balance_loss_clip": 1.06350708, - "balance_loss_mlp": 1.03104496, - "epoch": 0.08194799338644221, - "flos": 16468095214080.0, - "grad_norm": 2.1850191919210338, - "language_loss": 0.72384715, - "learning_rate": 3.971759508619069e-06, - "loss": 0.74637932, - "num_input_tokens_seen": 28998100, - "step": 1363, - "time_per_iteration": 2.7082791328430176 - }, - { - "auxiliary_loss_clip": 0.01202887, - "auxiliary_loss_mlp": 0.01066374, - "balance_loss_clip": 1.06580126, - "balance_loss_mlp": 1.04083955, - "epoch": 0.08200811663911017, - "flos": 23913633974400.0, - "grad_norm": 2.142285699657122, - "language_loss": 0.7726444, - "learning_rate": 3.971694254006844e-06, - "loss": 0.79533696, - "num_input_tokens_seen": 29017095, - "step": 1364, - "time_per_iteration": 2.777156114578247 - }, - { - "auxiliary_loss_clip": 0.01135428, - "auxiliary_loss_mlp": 0.01063854, - "balance_loss_clip": 1.05182433, - "balance_loss_mlp": 1.03645968, - "epoch": 0.08206823989177814, - "flos": 17896550256000.0, - "grad_norm": 1.85589982882842, - "language_loss": 0.82242119, - "learning_rate": 3.971628924627776e-06, - "loss": 0.844414, - "num_input_tokens_seen": 29037240, - "step": 1365, - "time_per_iteration": 2.8192803859710693 - }, - { - "auxiliary_loss_clip": 0.01196582, - "auxiliary_loss_mlp": 0.01059945, - "balance_loss_clip": 1.07006347, - "balance_loss_mlp": 1.03706884, - "epoch": 0.08212836314444612, - "flos": 22088186841600.0, - "grad_norm": 1.7803424706125983, - "language_loss": 0.82062519, - "learning_rate": 3.97156352048434e-06, - "loss": 0.84319043, - "num_input_tokens_seen": 29056250, - "step": 1366, - "time_per_iteration": 2.7482311725616455 - }, - { - "auxiliary_loss_clip": 0.01153262, - "auxiliary_loss_mlp": 0.0107233, - "balance_loss_clip": 1.05320215, - "balance_loss_mlp": 1.04779685, - "epoch": 0.08218848639711408, - "flos": 17597485618560.0, - "grad_norm": 2.010209091244133, - "language_loss": 0.81944495, - "learning_rate": 3.97149804157902e-06, - "loss": 0.84170091, - "num_input_tokens_seen": 29073380, - "step": 1367, - "time_per_iteration": 4.352729797363281 - }, - { - "auxiliary_loss_clip": 0.01206125, - "auxiliary_loss_mlp": 0.01066888, - "balance_loss_clip": 1.06541765, - "balance_loss_mlp": 1.04241478, - "epoch": 0.08224860964978205, - "flos": 17857838373120.0, - "grad_norm": 2.518996379768439, - "language_loss": 0.8331567, - "learning_rate": 3.9714324879142946e-06, - "loss": 0.85588682, - "num_input_tokens_seen": 29091330, - "step": 1368, - "time_per_iteration": 6.077457666397095 - }, - { - "auxiliary_loss_clip": 0.01159992, - "auxiliary_loss_mlp": 0.01049874, - "balance_loss_clip": 1.06314564, - "balance_loss_mlp": 1.02790344, - "epoch": 0.08230873290245003, - "flos": 25227533566080.0, - "grad_norm": 3.198110530618569, - "language_loss": 0.81336468, - "learning_rate": 3.971366859492653e-06, - "loss": 0.8354634, - "num_input_tokens_seen": 29110375, - "step": 1369, - "time_per_iteration": 2.769972085952759 - }, - { - "auxiliary_loss_clip": 0.01137456, - "auxiliary_loss_mlp": 0.00781814, - "balance_loss_clip": 1.05438268, - "balance_loss_mlp": 1.00027657, - "epoch": 0.08236885615511799, - "flos": 31759935753600.0, - "grad_norm": 2.610758273724768, - "language_loss": 0.74818152, - "learning_rate": 3.971301156316582e-06, - "loss": 0.76737428, - "num_input_tokens_seen": 29129395, - "step": 1370, - "time_per_iteration": 4.497304201126099 - }, - { - "auxiliary_loss_clip": 0.0115498, - "auxiliary_loss_mlp": 0.01064278, - "balance_loss_clip": 1.06403351, - "balance_loss_mlp": 1.03987551, - "epoch": 0.08242897940778596, - "flos": 23185832601600.0, - "grad_norm": 1.5246391685186451, - "language_loss": 0.7398203, - "learning_rate": 3.971235378388573e-06, - "loss": 0.76201284, - "num_input_tokens_seen": 29148650, - "step": 1371, - "time_per_iteration": 2.758089065551758 - }, - { - "auxiliary_loss_clip": 0.01097162, - "auxiliary_loss_mlp": 0.0106614, - "balance_loss_clip": 1.05124569, - "balance_loss_mlp": 1.04098701, - "epoch": 0.08248910266045394, - "flos": 34491480393600.0, - "grad_norm": 1.9670948823939327, - "language_loss": 0.70851803, - "learning_rate": 3.971169525711122e-06, - "loss": 0.73015106, - "num_input_tokens_seen": 29170785, - "step": 1372, - "time_per_iteration": 4.069301605224609 - }, - { - "auxiliary_loss_clip": 0.01162292, - "auxiliary_loss_mlp": 0.01056859, - "balance_loss_clip": 1.0571332, - "balance_loss_mlp": 1.03261209, - "epoch": 0.0825492259131219, - "flos": 13436228960640.0, - "grad_norm": 2.750431245604494, - "language_loss": 0.88363653, - "learning_rate": 3.9711035982867246e-06, - "loss": 0.905828, - "num_input_tokens_seen": 29185210, - "step": 1373, - "time_per_iteration": 3.9346964359283447 - }, - { - "auxiliary_loss_clip": 0.01147291, - "auxiliary_loss_mlp": 0.01062343, - "balance_loss_clip": 1.05334187, - "balance_loss_mlp": 1.03878665, - "epoch": 0.08260934916578987, - "flos": 25812446636160.0, - "grad_norm": 2.128923272573014, - "language_loss": 0.82465184, - "learning_rate": 3.971037596117882e-06, - "loss": 0.84674811, - "num_input_tokens_seen": 29205210, - "step": 1374, - "time_per_iteration": 2.933377981185913 - }, - { - "auxiliary_loss_clip": 0.01044322, - "auxiliary_loss_mlp": 0.01017124, - "balance_loss_clip": 1.03154135, - "balance_loss_mlp": 1.0135479, - "epoch": 0.08266947241845783, - "flos": 63460009491840.0, - "grad_norm": 0.8272339650193923, - "language_loss": 0.60641956, - "learning_rate": 3.970971519207095e-06, - "loss": 0.62703401, - "num_input_tokens_seen": 29265350, - "step": 1375, - "time_per_iteration": 3.3287038803100586 - }, - { - "auxiliary_loss_clip": 0.01060461, - "auxiliary_loss_mlp": 0.01013653, - "balance_loss_clip": 1.02398169, - "balance_loss_mlp": 1.01017237, - "epoch": 0.08272959567112581, - "flos": 69993704568960.0, - "grad_norm": 0.9162492148708097, - "language_loss": 0.62171799, - "learning_rate": 3.970905367556871e-06, - "loss": 0.64245915, - "num_input_tokens_seen": 29321475, - "step": 1376, - "time_per_iteration": 3.218834161758423 - }, - { - "auxiliary_loss_clip": 0.01159103, - "auxiliary_loss_mlp": 0.0106347, - "balance_loss_clip": 1.06229186, - "balance_loss_mlp": 1.03942561, - "epoch": 0.08278971892379378, - "flos": 20413205781120.0, - "grad_norm": 1.9191670647860084, - "language_loss": 0.82577401, - "learning_rate": 3.970839141169718e-06, - "loss": 0.84799975, - "num_input_tokens_seen": 29341405, - "step": 1377, - "time_per_iteration": 2.8763558864593506 - }, - { - "auxiliary_loss_clip": 0.01176967, - "auxiliary_loss_mlp": 0.01054072, - "balance_loss_clip": 1.06486619, - "balance_loss_mlp": 1.03011107, - "epoch": 0.08284984217646174, - "flos": 26250233598720.0, - "grad_norm": 1.915539507671093, - "language_loss": 0.84923226, - "learning_rate": 3.970772840048147e-06, - "loss": 0.87154263, - "num_input_tokens_seen": 29361955, - "step": 1378, - "time_per_iteration": 2.8232595920562744 - }, - { - "auxiliary_loss_clip": 0.01185329, - "auxiliary_loss_mlp": 0.01058999, - "balance_loss_clip": 1.06043923, - "balance_loss_mlp": 1.0344305, - "epoch": 0.08290996542912972, - "flos": 27194683852800.0, - "grad_norm": 6.4689921779024795, - "language_loss": 0.87319231, - "learning_rate": 3.970706464194672e-06, - "loss": 0.8956356, - "num_input_tokens_seen": 29382395, - "step": 1379, - "time_per_iteration": 2.756082534790039 - }, - { - "auxiliary_loss_clip": 0.01158173, - "auxiliary_loss_mlp": 0.01061479, - "balance_loss_clip": 1.05779433, - "balance_loss_mlp": 1.03829277, - "epoch": 0.08297008868179769, - "flos": 38618191146240.0, - "grad_norm": 2.078993196749275, - "language_loss": 0.78545237, - "learning_rate": 3.970640013611812e-06, - "loss": 0.8076489, - "num_input_tokens_seen": 29404460, - "step": 1380, - "time_per_iteration": 2.9525601863861084 - }, - { - "auxiliary_loss_clip": 0.01183492, - "auxiliary_loss_mlp": 0.01059448, - "balance_loss_clip": 1.06308961, - "balance_loss_mlp": 1.0344255, - "epoch": 0.08303021193446565, - "flos": 19974736460160.0, - "grad_norm": 2.6608111668609697, - "language_loss": 0.86125714, - "learning_rate": 3.970573488302083e-06, - "loss": 0.88368654, - "num_input_tokens_seen": 29422675, - "step": 1381, - "time_per_iteration": 2.735203742980957 - }, - { - "auxiliary_loss_clip": 0.01197152, - "auxiliary_loss_mlp": 0.00781814, - "balance_loss_clip": 1.06611753, - "balance_loss_mlp": 1.00034571, - "epoch": 0.08309033518713363, - "flos": 13662646341120.0, - "grad_norm": 2.9433398182948203, - "language_loss": 0.87471211, - "learning_rate": 3.970506888268011e-06, - "loss": 0.89450181, - "num_input_tokens_seen": 29439840, - "step": 1382, - "time_per_iteration": 2.6392617225646973 - }, - { - "auxiliary_loss_clip": 0.0115996, - "auxiliary_loss_mlp": 0.01055463, - "balance_loss_clip": 1.06138313, - "balance_loss_mlp": 1.03337312, - "epoch": 0.0831504584398016, - "flos": 17968551068160.0, - "grad_norm": 1.9901989904031434, - "language_loss": 0.77085757, - "learning_rate": 3.970440213512121e-06, - "loss": 0.79301178, - "num_input_tokens_seen": 29457360, - "step": 1383, - "time_per_iteration": 2.756565809249878 - }, - { - "auxiliary_loss_clip": 0.01191549, - "auxiliary_loss_mlp": 0.01058014, - "balance_loss_clip": 1.06211782, - "balance_loss_mlp": 1.03395748, - "epoch": 0.08321058169246956, - "flos": 22601386408320.0, - "grad_norm": 1.818236548161018, - "language_loss": 0.82858944, - "learning_rate": 3.97037346403694e-06, - "loss": 0.85108507, - "num_input_tokens_seen": 29477040, - "step": 1384, - "time_per_iteration": 2.7848587036132812 - }, - { - "auxiliary_loss_clip": 0.01148661, - "auxiliary_loss_mlp": 0.01063605, - "balance_loss_clip": 1.05671442, - "balance_loss_mlp": 1.03610373, - "epoch": 0.08327070494513754, - "flos": 22850426378880.0, - "grad_norm": 3.9982776391866346, - "language_loss": 0.85219657, - "learning_rate": 3.970306639845e-06, - "loss": 0.8743192, - "num_input_tokens_seen": 29492010, - "step": 1385, - "time_per_iteration": 2.803893566131592 - }, - { - "auxiliary_loss_clip": 0.01157001, - "auxiliary_loss_mlp": 0.01061891, - "balance_loss_clip": 1.05823874, - "balance_loss_mlp": 1.03750122, - "epoch": 0.0833308281978055, - "flos": 22782986593920.0, - "grad_norm": 1.7071515381676081, - "language_loss": 0.69195282, - "learning_rate": 3.970239740938835e-06, - "loss": 0.71414173, - "num_input_tokens_seen": 29511850, - "step": 1386, - "time_per_iteration": 3.004786252975464 - }, - { - "auxiliary_loss_clip": 0.01172803, - "auxiliary_loss_mlp": 0.01058809, - "balance_loss_clip": 1.05489016, - "balance_loss_mlp": 1.03483546, - "epoch": 0.08339095145047347, - "flos": 20812604083200.0, - "grad_norm": 1.672791522425571, - "language_loss": 0.81894958, - "learning_rate": 3.97017276732098e-06, - "loss": 0.84126568, - "num_input_tokens_seen": 29531415, - "step": 1387, - "time_per_iteration": 2.7678542137145996 - }, - { - "auxiliary_loss_clip": 0.01179554, - "auxiliary_loss_mlp": 0.01074251, - "balance_loss_clip": 1.06179345, - "balance_loss_mlp": 1.04817975, - "epoch": 0.08345107470314143, - "flos": 18515326872960.0, - "grad_norm": 2.071322011459688, - "language_loss": 0.77205479, - "learning_rate": 3.970105718993978e-06, - "loss": 0.7945928, - "num_input_tokens_seen": 29549525, - "step": 1388, - "time_per_iteration": 2.8246304988861084 - }, - { - "auxiliary_loss_clip": 0.01130856, - "auxiliary_loss_mlp": 0.01062414, - "balance_loss_clip": 1.05684018, - "balance_loss_mlp": 1.03742766, - "epoch": 0.08351119795580941, - "flos": 18807567926400.0, - "grad_norm": 2.0255270252506636, - "language_loss": 0.79527366, - "learning_rate": 3.970038595960369e-06, - "loss": 0.81720638, - "num_input_tokens_seen": 29568705, - "step": 1389, - "time_per_iteration": 2.8606414794921875 - }, - { - "auxiliary_loss_clip": 0.01172785, - "auxiliary_loss_mlp": 0.01064077, - "balance_loss_clip": 1.05787444, - "balance_loss_mlp": 1.03923428, - "epoch": 0.08357132120847738, - "flos": 18441817689600.0, - "grad_norm": 2.546615132743645, - "language_loss": 0.87427586, - "learning_rate": 3.969971398222699e-06, - "loss": 0.89664447, - "num_input_tokens_seen": 29585855, - "step": 1390, - "time_per_iteration": 2.795931577682495 - }, - { - "auxiliary_loss_clip": 0.01160426, - "auxiliary_loss_mlp": 0.01067723, - "balance_loss_clip": 1.05447149, - "balance_loss_mlp": 1.04082966, - "epoch": 0.08363144446114534, - "flos": 25922333318400.0, - "grad_norm": 1.8703157168219726, - "language_loss": 0.86833143, - "learning_rate": 3.969904125783517e-06, - "loss": 0.89061296, - "num_input_tokens_seen": 29607280, - "step": 1391, - "time_per_iteration": 2.811598062515259 - }, - { - "auxiliary_loss_clip": 0.01156119, - "auxiliary_loss_mlp": 0.01076482, - "balance_loss_clip": 1.05575848, - "balance_loss_mlp": 1.05180562, - "epoch": 0.08369156771381332, - "flos": 18041306065920.0, - "grad_norm": 3.7979396758909263, - "language_loss": 0.87688571, - "learning_rate": 3.969836778645371e-06, - "loss": 0.89921176, - "num_input_tokens_seen": 29624130, - "step": 1392, - "time_per_iteration": 2.776819944381714 - }, - { - "auxiliary_loss_clip": 0.01183316, - "auxiliary_loss_mlp": 0.01058545, - "balance_loss_clip": 1.05830503, - "balance_loss_mlp": 1.03500128, - "epoch": 0.08375169096648129, - "flos": 22675111073280.0, - "grad_norm": 8.95243370865895, - "language_loss": 0.80574775, - "learning_rate": 3.969769356810819e-06, - "loss": 0.82816637, - "num_input_tokens_seen": 29643210, - "step": 1393, - "time_per_iteration": 2.735761880874634 - }, - { - "auxiliary_loss_clip": 0.01197686, - "auxiliary_loss_mlp": 0.01058125, - "balance_loss_clip": 1.06329441, - "balance_loss_mlp": 1.03466487, - "epoch": 0.08381181421914925, - "flos": 26103215232000.0, - "grad_norm": 1.7485261130451684, - "language_loss": 0.85064757, - "learning_rate": 3.969701860282415e-06, - "loss": 0.87320572, - "num_input_tokens_seen": 29663920, - "step": 1394, - "time_per_iteration": 2.950211524963379 - }, - { - "auxiliary_loss_clip": 0.01145594, - "auxiliary_loss_mlp": 0.01058123, - "balance_loss_clip": 1.05994248, - "balance_loss_mlp": 1.03432918, - "epoch": 0.08387193747181723, - "flos": 20629782835200.0, - "grad_norm": 1.782466846937859, - "language_loss": 0.82979721, - "learning_rate": 3.969634289062719e-06, - "loss": 0.85183442, - "num_input_tokens_seen": 29683825, - "step": 1395, - "time_per_iteration": 2.883977174758911 - }, - { - "auxiliary_loss_clip": 0.01187279, - "auxiliary_loss_mlp": 0.00782865, - "balance_loss_clip": 1.06065941, - "balance_loss_mlp": 1.00028706, - "epoch": 0.0839320607244852, - "flos": 13443196199040.0, - "grad_norm": 3.330409107955743, - "language_loss": 0.82481396, - "learning_rate": 3.969566643154293e-06, - "loss": 0.84451544, - "num_input_tokens_seen": 29698775, - "step": 1396, - "time_per_iteration": 2.6729378700256348 - }, - { - "auxiliary_loss_clip": 0.0118605, - "auxiliary_loss_mlp": 0.01060468, - "balance_loss_clip": 1.06378388, - "balance_loss_mlp": 1.03475475, - "epoch": 0.08399218397715316, - "flos": 23477247642240.0, - "grad_norm": 1.780410555630689, - "language_loss": 0.76843297, - "learning_rate": 3.969498922559703e-06, - "loss": 0.79089814, - "num_input_tokens_seen": 29719430, - "step": 1397, - "time_per_iteration": 2.64888334274292 - }, - { - "auxiliary_loss_clip": 0.01153742, - "auxiliary_loss_mlp": 0.01050759, - "balance_loss_clip": 1.05790138, - "balance_loss_mlp": 1.02621412, - "epoch": 0.08405230722982113, - "flos": 25920717206400.0, - "grad_norm": 2.1323769932413184, - "language_loss": 0.77941638, - "learning_rate": 3.969431127281516e-06, - "loss": 0.8014614, - "num_input_tokens_seen": 29739685, - "step": 1398, - "time_per_iteration": 2.8302125930786133 - }, - { - "auxiliary_loss_clip": 0.01191086, - "auxiliary_loss_mlp": 0.01052374, - "balance_loss_clip": 1.05962944, - "balance_loss_mlp": 1.02943766, - "epoch": 0.0841124304824891, - "flos": 17967437746560.0, - "grad_norm": 2.150764713624159, - "language_loss": 0.94635069, - "learning_rate": 3.969363257322304e-06, - "loss": 0.96878529, - "num_input_tokens_seen": 29756165, - "step": 1399, - "time_per_iteration": 2.650517702102661 - }, - { - "auxiliary_loss_clip": 0.01172403, - "auxiliary_loss_mlp": 0.0106738, - "balance_loss_clip": 1.0562712, - "balance_loss_mlp": 1.04168999, - "epoch": 0.08417255373515707, - "flos": 25629661301760.0, - "grad_norm": 3.6141849657848137, - "language_loss": 0.81904209, - "learning_rate": 3.96929531268464e-06, - "loss": 0.8414399, - "num_input_tokens_seen": 29776425, - "step": 1400, - "time_per_iteration": 2.777369260787964 - }, - { - "auxiliary_loss_clip": 0.01170173, - "auxiliary_loss_mlp": 0.01064292, - "balance_loss_clip": 1.05968165, - "balance_loss_mlp": 1.03957999, - "epoch": 0.08423267698782504, - "flos": 26249730808320.0, - "grad_norm": 8.998651919840762, - "language_loss": 0.8642807, - "learning_rate": 3.969227293371099e-06, - "loss": 0.88662529, - "num_input_tokens_seen": 29796440, - "step": 1401, - "time_per_iteration": 2.91375732421875 - }, - { - "auxiliary_loss_clip": 0.01196, - "auxiliary_loss_mlp": 0.01066109, - "balance_loss_clip": 1.05935979, - "balance_loss_mlp": 1.04053831, - "epoch": 0.08429280024049302, - "flos": 20119707751680.0, - "grad_norm": 2.9792515680869114, - "language_loss": 0.87500131, - "learning_rate": 3.969159199384263e-06, - "loss": 0.89762247, - "num_input_tokens_seen": 29814755, - "step": 1402, - "time_per_iteration": 2.7827296257019043 - }, - { - "auxiliary_loss_clip": 0.01144907, - "auxiliary_loss_mlp": 0.00781428, - "balance_loss_clip": 1.05105817, - "balance_loss_mlp": 1.00033188, - "epoch": 0.08435292349316098, - "flos": 42924526836480.0, - "grad_norm": 2.1517994230241566, - "language_loss": 0.8905524, - "learning_rate": 3.9690910307267125e-06, - "loss": 0.90981579, - "num_input_tokens_seen": 29834785, - "step": 1403, - "time_per_iteration": 2.931666374206543 - }, - { - "auxiliary_loss_clip": 0.01165276, - "auxiliary_loss_mlp": 0.01061696, - "balance_loss_clip": 1.05570936, - "balance_loss_mlp": 1.03715038, - "epoch": 0.08441304674582895, - "flos": 22857285876480.0, - "grad_norm": 1.790271378285476, - "language_loss": 0.80321431, - "learning_rate": 3.969022787401033e-06, - "loss": 0.82548404, - "num_input_tokens_seen": 29854695, - "step": 1404, - "time_per_iteration": 2.7397725582122803 - }, - { - "auxiliary_loss_clip": 0.01181709, - "auxiliary_loss_mlp": 0.01071408, - "balance_loss_clip": 1.06211567, - "balance_loss_mlp": 1.04649353, - "epoch": 0.08447316999849692, - "flos": 18697501676160.0, - "grad_norm": 2.0849305916509193, - "language_loss": 0.83557045, - "learning_rate": 3.968954469409811e-06, - "loss": 0.85810155, - "num_input_tokens_seen": 29872180, - "step": 1405, - "time_per_iteration": 2.8052847385406494 - }, - { - "auxiliary_loss_clip": 0.0118246, - "auxiliary_loss_mlp": 0.01058347, - "balance_loss_clip": 1.05636072, - "balance_loss_mlp": 1.03588748, - "epoch": 0.08453329325116489, - "flos": 25483971738240.0, - "grad_norm": 1.5225846020503528, - "language_loss": 0.7991904, - "learning_rate": 3.968886076755639e-06, - "loss": 0.82159847, - "num_input_tokens_seen": 29893205, - "step": 1406, - "time_per_iteration": 4.301243305206299 - }, - { - "auxiliary_loss_clip": 0.0117117, - "auxiliary_loss_mlp": 0.01068275, - "balance_loss_clip": 1.05790758, - "balance_loss_mlp": 1.04406369, - "epoch": 0.08459341650383286, - "flos": 20920048640640.0, - "grad_norm": 1.717770739318623, - "language_loss": 0.79441547, - "learning_rate": 3.96881760944111e-06, - "loss": 0.81680995, - "num_input_tokens_seen": 29911970, - "step": 1407, - "time_per_iteration": 2.6535613536834717 - }, - { - "auxiliary_loss_clip": 0.01186501, - "auxiliary_loss_mlp": 0.01057881, - "balance_loss_clip": 1.05982685, - "balance_loss_mlp": 1.03409886, - "epoch": 0.08465353975650082, - "flos": 13043079624960.0, - "grad_norm": 2.191354041218588, - "language_loss": 0.91799384, - "learning_rate": 3.968749067468819e-06, - "loss": 0.94043779, - "num_input_tokens_seen": 29929925, - "step": 1408, - "time_per_iteration": 5.774486064910889 - }, - { - "auxiliary_loss_clip": 0.01058217, - "auxiliary_loss_mlp": 0.01015213, - "balance_loss_clip": 1.0231359, - "balance_loss_mlp": 1.01139832, - "epoch": 0.0847136630091688, - "flos": 60877422552960.0, - "grad_norm": 0.9559717259642487, - "language_loss": 0.61891782, - "learning_rate": 3.968680450841368e-06, - "loss": 0.63965201, - "num_input_tokens_seen": 29985950, - "step": 1409, - "time_per_iteration": 4.9455225467681885 - }, - { - "auxiliary_loss_clip": 0.01188186, - "auxiliary_loss_mlp": 0.01061718, - "balance_loss_clip": 1.05840743, - "balance_loss_mlp": 1.03878236, - "epoch": 0.08477378626183676, - "flos": 22046530043520.0, - "grad_norm": 1.6980375913788566, - "language_loss": 0.86357373, - "learning_rate": 3.968611759561355e-06, - "loss": 0.88607281, - "num_input_tokens_seen": 30004330, - "step": 1410, - "time_per_iteration": 2.640355110168457 - }, - { - "auxiliary_loss_clip": 0.01181512, - "auxiliary_loss_mlp": 0.01053874, - "balance_loss_clip": 1.0583061, - "balance_loss_mlp": 1.02870846, - "epoch": 0.08483390951450473, - "flos": 16690059308160.0, - "grad_norm": 2.248971712939306, - "language_loss": 0.74384397, - "learning_rate": 3.968542993631388e-06, - "loss": 0.7661978, - "num_input_tokens_seen": 30022555, - "step": 1411, - "time_per_iteration": 2.6200830936431885 - }, - { - "auxiliary_loss_clip": 0.01077929, - "auxiliary_loss_mlp": 0.01003535, - "balance_loss_clip": 1.02317524, - "balance_loss_mlp": 0.99991113, - "epoch": 0.08489403276717271, - "flos": 51584640082560.0, - "grad_norm": 0.9014663966204861, - "language_loss": 0.56748837, - "learning_rate": 3.968474153054073e-06, - "loss": 0.58830309, - "num_input_tokens_seen": 30077220, - "step": 1412, - "time_per_iteration": 3.0746512413024902 - }, - { - "auxiliary_loss_clip": 0.01156137, - "auxiliary_loss_mlp": 0.01067795, - "balance_loss_clip": 1.05325568, - "balance_loss_mlp": 1.04265356, - "epoch": 0.08495415601984067, - "flos": 17092330698240.0, - "grad_norm": 2.2757293876932945, - "language_loss": 0.88754624, - "learning_rate": 3.96840523783202e-06, - "loss": 0.90978551, - "num_input_tokens_seen": 30094600, - "step": 1413, - "time_per_iteration": 2.7309420108795166 - }, - { - "auxiliary_loss_clip": 0.01164895, - "auxiliary_loss_mlp": 0.01057479, - "balance_loss_clip": 1.05780244, - "balance_loss_mlp": 1.03295755, - "epoch": 0.08501427927250864, - "flos": 23148413608320.0, - "grad_norm": 1.9781781646219805, - "language_loss": 0.87963474, - "learning_rate": 3.968336247967844e-06, - "loss": 0.90185857, - "num_input_tokens_seen": 30114475, - "step": 1414, - "time_per_iteration": 2.692030668258667 - }, - { - "auxiliary_loss_clip": 0.01168145, - "auxiliary_loss_mlp": 0.01063751, - "balance_loss_clip": 1.05704033, - "balance_loss_mlp": 1.04170966, - "epoch": 0.08507440252517662, - "flos": 19063467394560.0, - "grad_norm": 1.9706021333256292, - "language_loss": 0.77636635, - "learning_rate": 3.96826718346416e-06, - "loss": 0.79868531, - "num_input_tokens_seen": 30133350, - "step": 1415, - "time_per_iteration": 2.8435540199279785 - }, - { - "auxiliary_loss_clip": 0.01182108, - "auxiliary_loss_mlp": 0.01059478, - "balance_loss_clip": 1.0588963, - "balance_loss_mlp": 1.03701878, - "epoch": 0.08513452577784458, - "flos": 60182296600320.0, - "grad_norm": 1.7170282174092708, - "language_loss": 0.70545506, - "learning_rate": 3.968198044323587e-06, - "loss": 0.72787094, - "num_input_tokens_seen": 30159005, - "step": 1416, - "time_per_iteration": 3.021360158920288 - }, - { - "auxiliary_loss_clip": 0.01174166, - "auxiliary_loss_mlp": 0.01066487, - "balance_loss_clip": 1.05930233, - "balance_loss_mlp": 1.04131043, - "epoch": 0.08519464903051255, - "flos": 27308485117440.0, - "grad_norm": 2.8159853289102053, - "language_loss": 0.74938154, - "learning_rate": 3.968128830548748e-06, - "loss": 0.771788, - "num_input_tokens_seen": 30179450, - "step": 1417, - "time_per_iteration": 2.738301992416382 - }, - { - "auxiliary_loss_clip": 0.01171292, - "auxiliary_loss_mlp": 0.01057092, - "balance_loss_clip": 1.05715823, - "balance_loss_mlp": 1.03313112, - "epoch": 0.08525477228318051, - "flos": 20266438809600.0, - "grad_norm": 2.4132423968154635, - "language_loss": 0.8258723, - "learning_rate": 3.968059542142265e-06, - "loss": 0.84815615, - "num_input_tokens_seen": 30197235, - "step": 1418, - "time_per_iteration": 2.671574831008911 - }, - { - "auxiliary_loss_clip": 0.0104499, - "auxiliary_loss_mlp": 0.01004818, - "balance_loss_clip": 1.02242994, - "balance_loss_mlp": 1.0004549, - "epoch": 0.08531489553584849, - "flos": 67615017183360.0, - "grad_norm": 0.8667411864001444, - "language_loss": 0.56638753, - "learning_rate": 3.9679901791067685e-06, - "loss": 0.58688557, - "num_input_tokens_seen": 30257410, - "step": 1419, - "time_per_iteration": 3.199730396270752 - }, - { - "auxiliary_loss_clip": 0.01192231, - "auxiliary_loss_mlp": 0.01067737, - "balance_loss_clip": 1.05757999, - "balance_loss_mlp": 1.04369283, - "epoch": 0.08537501878851646, - "flos": 27526965592320.0, - "grad_norm": 2.2357492693560466, - "language_loss": 0.70111859, - "learning_rate": 3.967920741444886e-06, - "loss": 0.72371829, - "num_input_tokens_seen": 30277865, - "step": 1420, - "time_per_iteration": 2.7176027297973633 - }, - { - "auxiliary_loss_clip": 0.01155207, - "auxiliary_loss_mlp": 0.01050755, - "balance_loss_clip": 1.05377483, - "balance_loss_mlp": 1.02692556, - "epoch": 0.08543514204118442, - "flos": 22784243569920.0, - "grad_norm": 1.5975069204011494, - "language_loss": 0.88011539, - "learning_rate": 3.967851229159252e-06, - "loss": 0.90217495, - "num_input_tokens_seen": 30298545, - "step": 1421, - "time_per_iteration": 2.7552106380462646 - }, - { - "auxiliary_loss_clip": 0.01077473, - "auxiliary_loss_mlp": 0.01013517, - "balance_loss_clip": 1.02364218, - "balance_loss_mlp": 1.01020324, - "epoch": 0.0854952652938524, - "flos": 60990721027200.0, - "grad_norm": 0.9142209544576306, - "language_loss": 0.63506877, - "learning_rate": 3.967781642252502e-06, - "loss": 0.65597868, - "num_input_tokens_seen": 30361725, - "step": 1422, - "time_per_iteration": 3.134183168411255 - }, - { - "auxiliary_loss_clip": 0.01152948, - "auxiliary_loss_mlp": 0.01063847, - "balance_loss_clip": 1.05932307, - "balance_loss_mlp": 1.0406723, - "epoch": 0.08555538854652037, - "flos": 28038046256640.0, - "grad_norm": 1.8757015124159093, - "language_loss": 0.82691669, - "learning_rate": 3.967711980727276e-06, - "loss": 0.84908462, - "num_input_tokens_seen": 30382180, - "step": 1423, - "time_per_iteration": 2.789393424987793 - }, - { - "auxiliary_loss_clip": 0.01153439, - "auxiliary_loss_mlp": 0.01064169, - "balance_loss_clip": 1.0526228, - "balance_loss_mlp": 1.04089928, - "epoch": 0.08561551179918833, - "flos": 23509279595520.0, - "grad_norm": 1.6593534429066656, - "language_loss": 0.75424892, - "learning_rate": 3.967642244586213e-06, - "loss": 0.776425, - "num_input_tokens_seen": 30402980, - "step": 1424, - "time_per_iteration": 2.7805826663970947 - }, - { - "auxiliary_loss_clip": 0.01139579, - "auxiliary_loss_mlp": 0.01060342, - "balance_loss_clip": 1.05769765, - "balance_loss_mlp": 1.03751373, - "epoch": 0.08567563505185631, - "flos": 17926930183680.0, - "grad_norm": 1.7999307606718091, - "language_loss": 0.75948423, - "learning_rate": 3.96757243383196e-06, - "loss": 0.78148341, - "num_input_tokens_seen": 30420800, - "step": 1425, - "time_per_iteration": 2.677889823913574 - }, - { - "auxiliary_loss_clip": 0.0118966, - "auxiliary_loss_mlp": 0.01055231, - "balance_loss_clip": 1.05982256, - "balance_loss_mlp": 1.03230715, - "epoch": 0.08573575830452428, - "flos": 19719519350400.0, - "grad_norm": 2.1792756220437743, - "language_loss": 0.93362999, - "learning_rate": 3.9675025484671624e-06, - "loss": 0.95607889, - "num_input_tokens_seen": 30439620, - "step": 1426, - "time_per_iteration": 2.6270906925201416 - }, - { - "auxiliary_loss_clip": 0.01145994, - "auxiliary_loss_mlp": 0.01066219, - "balance_loss_clip": 1.05707717, - "balance_loss_mlp": 1.0406251, - "epoch": 0.08579588155719224, - "flos": 17931563038080.0, - "grad_norm": 2.3679064075186553, - "language_loss": 0.75424731, - "learning_rate": 3.967432588494471e-06, - "loss": 0.77636945, - "num_input_tokens_seen": 30457300, - "step": 1427, - "time_per_iteration": 2.84614634513855 - }, - { - "auxiliary_loss_clip": 0.01190697, - "auxiliary_loss_mlp": 0.01052992, - "balance_loss_clip": 1.06006169, - "balance_loss_mlp": 1.0305804, - "epoch": 0.08585600480986022, - "flos": 16033324993920.0, - "grad_norm": 3.503048788198607, - "language_loss": 0.82108849, - "learning_rate": 3.96736255391654e-06, - "loss": 0.84352541, - "num_input_tokens_seen": 30471580, - "step": 1428, - "time_per_iteration": 2.5882396697998047 - }, - { - "auxiliary_loss_clip": 0.01173688, - "auxiliary_loss_mlp": 0.0106298, - "balance_loss_clip": 1.05633736, - "balance_loss_mlp": 1.03832793, - "epoch": 0.08591612806252819, - "flos": 28657433404800.0, - "grad_norm": 2.088481658755078, - "language_loss": 0.79929984, - "learning_rate": 3.967292444736023e-06, - "loss": 0.82166648, - "num_input_tokens_seen": 30492720, - "step": 1429, - "time_per_iteration": 2.720500946044922 - }, - { - "auxiliary_loss_clip": 0.01169119, - "auxiliary_loss_mlp": 0.010606, - "balance_loss_clip": 1.05971265, - "balance_loss_mlp": 1.0379504, - "epoch": 0.08597625131519615, - "flos": 20959119659520.0, - "grad_norm": 1.9029222975672677, - "language_loss": 0.87716508, - "learning_rate": 3.967222260955578e-06, - "loss": 0.89946228, - "num_input_tokens_seen": 30509535, - "step": 1430, - "time_per_iteration": 2.6914596557617188 - }, - { - "auxiliary_loss_clip": 0.01144304, - "auxiliary_loss_mlp": 0.01074633, - "balance_loss_clip": 1.05802035, - "balance_loss_mlp": 1.05125606, - "epoch": 0.08603637456786412, - "flos": 23256360956160.0, - "grad_norm": 1.6366623508781384, - "language_loss": 0.81859726, - "learning_rate": 3.96715200257787e-06, - "loss": 0.84078664, - "num_input_tokens_seen": 30529490, - "step": 1431, - "time_per_iteration": 2.834402322769165 - }, - { - "auxiliary_loss_clip": 0.01148362, - "auxiliary_loss_mlp": 0.01054323, - "balance_loss_clip": 1.05620182, - "balance_loss_mlp": 1.03132737, - "epoch": 0.0860964978205321, - "flos": 28694170039680.0, - "grad_norm": 1.5497375505717568, - "language_loss": 0.78109461, - "learning_rate": 3.967081669605559e-06, - "loss": 0.80312145, - "num_input_tokens_seen": 30550205, - "step": 1432, - "time_per_iteration": 2.767860174179077 - }, - { - "auxiliary_loss_clip": 0.01167351, - "auxiliary_loss_mlp": 0.0106333, - "balance_loss_clip": 1.0540905, - "balance_loss_mlp": 1.03914225, - "epoch": 0.08615662107320006, - "flos": 19318397195520.0, - "grad_norm": 1.9631692713893694, - "language_loss": 0.73365706, - "learning_rate": 3.967011262041315e-06, - "loss": 0.75596392, - "num_input_tokens_seen": 30568830, - "step": 1433, - "time_per_iteration": 2.6930699348449707 - }, - { - "auxiliary_loss_clip": 0.01150098, - "auxiliary_loss_mlp": 0.00781967, - "balance_loss_clip": 1.05335927, - "balance_loss_mlp": 1.00044179, - "epoch": 0.08621674432586802, - "flos": 15851688894720.0, - "grad_norm": 2.468588778716135, - "language_loss": 0.85340321, - "learning_rate": 3.9669407798878065e-06, - "loss": 0.87272388, - "num_input_tokens_seen": 30585730, - "step": 1434, - "time_per_iteration": 2.735690116882324 - }, - { - "auxiliary_loss_clip": 0.01170363, - "auxiliary_loss_mlp": 0.01057659, - "balance_loss_clip": 1.05604434, - "balance_loss_mlp": 1.0344249, - "epoch": 0.086276867578536, - "flos": 14100648785280.0, - "grad_norm": 2.160640509122794, - "language_loss": 0.7870298, - "learning_rate": 3.966870223147707e-06, - "loss": 0.80931008, - "num_input_tokens_seen": 30603180, - "step": 1435, - "time_per_iteration": 2.776567220687866 - }, - { - "auxiliary_loss_clip": 0.01047768, - "auxiliary_loss_mlp": 0.01015597, - "balance_loss_clip": 1.023893, - "balance_loss_mlp": 1.01206815, - "epoch": 0.08633699083120397, - "flos": 70184857772160.0, - "grad_norm": 0.8900716332014227, - "language_loss": 0.57975936, - "learning_rate": 3.96679959182369e-06, - "loss": 0.60039294, - "num_input_tokens_seen": 30668895, - "step": 1436, - "time_per_iteration": 3.344207763671875 - }, - { - "auxiliary_loss_clip": 0.0117372, - "auxiliary_loss_mlp": 0.01056829, - "balance_loss_clip": 1.05617976, - "balance_loss_mlp": 1.03153312, - "epoch": 0.08639711408387193, - "flos": 30298874140800.0, - "grad_norm": 2.240343996649645, - "language_loss": 0.69169062, - "learning_rate": 3.966728885918437e-06, - "loss": 0.71399617, - "num_input_tokens_seen": 30688955, - "step": 1437, - "time_per_iteration": 2.7171547412872314 - }, - { - "auxiliary_loss_clip": 0.01121044, - "auxiliary_loss_mlp": 0.01055264, - "balance_loss_clip": 1.05334914, - "balance_loss_mlp": 1.03223276, - "epoch": 0.08645723733653991, - "flos": 20297680663680.0, - "grad_norm": 2.1340571114707245, - "language_loss": 0.72624576, - "learning_rate": 3.966658105434627e-06, - "loss": 0.74800885, - "num_input_tokens_seen": 30706095, - "step": 1438, - "time_per_iteration": 2.7815651893615723 - }, - { - "auxiliary_loss_clip": 0.01179626, - "auxiliary_loss_mlp": 0.01052578, - "balance_loss_clip": 1.06052637, - "balance_loss_mlp": 1.02872419, - "epoch": 0.08651736058920788, - "flos": 32890583134080.0, - "grad_norm": 1.5339762166114281, - "language_loss": 0.64377135, - "learning_rate": 3.966587250374945e-06, - "loss": 0.66609335, - "num_input_tokens_seen": 30729025, - "step": 1439, - "time_per_iteration": 2.8935797214508057 - }, - { - "auxiliary_loss_clip": 0.01153286, - "auxiliary_loss_mlp": 0.01056452, - "balance_loss_clip": 1.05530453, - "balance_loss_mlp": 1.03213322, - "epoch": 0.08657748384187584, - "flos": 22637368857600.0, - "grad_norm": 5.193932354158579, - "language_loss": 0.87521696, - "learning_rate": 3.966516320742077e-06, - "loss": 0.89731431, - "num_input_tokens_seen": 30746155, - "step": 1440, - "time_per_iteration": 2.731531858444214 - }, - { - "auxiliary_loss_clip": 0.01155923, - "auxiliary_loss_mlp": 0.00782787, - "balance_loss_clip": 1.05752945, - "balance_loss_mlp": 1.00043201, - "epoch": 0.08663760709454381, - "flos": 23658380951040.0, - "grad_norm": 2.023462963415533, - "language_loss": 0.83434939, - "learning_rate": 3.9664453165387124e-06, - "loss": 0.85373652, - "num_input_tokens_seen": 30761410, - "step": 1441, - "time_per_iteration": 2.7126500606536865 - }, - { - "auxiliary_loss_clip": 0.01074667, - "auxiliary_loss_mlp": 0.01004602, - "balance_loss_clip": 1.0222367, - "balance_loss_mlp": 1.00100195, - "epoch": 0.08669773034721179, - "flos": 62686564911360.0, - "grad_norm": 0.8541685426878655, - "language_loss": 0.60479522, - "learning_rate": 3.966374237767545e-06, - "loss": 0.62558794, - "num_input_tokens_seen": 30823010, - "step": 1442, - "time_per_iteration": 3.25555157661438 - }, - { - "auxiliary_loss_clip": 0.0116729, - "auxiliary_loss_mlp": 0.01054262, - "balance_loss_clip": 1.05768681, - "balance_loss_mlp": 1.03075421, - "epoch": 0.08675785359987975, - "flos": 20667489137280.0, - "grad_norm": 2.8449103562639073, - "language_loss": 0.79304373, - "learning_rate": 3.96630308443127e-06, - "loss": 0.81525922, - "num_input_tokens_seen": 30841980, - "step": 1443, - "time_per_iteration": 2.7314631938934326 - }, - { - "auxiliary_loss_clip": 0.01180858, - "auxiliary_loss_mlp": 0.01051075, - "balance_loss_clip": 1.05780149, - "balance_loss_mlp": 1.02755547, - "epoch": 0.08681797685254772, - "flos": 26941118768640.0, - "grad_norm": 1.6739262813835734, - "language_loss": 0.82399666, - "learning_rate": 3.966231856532584e-06, - "loss": 0.84631598, - "num_input_tokens_seen": 30863280, - "step": 1444, - "time_per_iteration": 2.7341418266296387 - }, - { - "auxiliary_loss_clip": 0.01196759, - "auxiliary_loss_mlp": 0.01051473, - "balance_loss_clip": 1.06044626, - "balance_loss_mlp": 1.02810788, - "epoch": 0.0868781001052157, - "flos": 17712831168000.0, - "grad_norm": 2.3015461969915747, - "language_loss": 0.87354827, - "learning_rate": 3.966160554074189e-06, - "loss": 0.8960306, - "num_input_tokens_seen": 30881710, - "step": 1445, - "time_per_iteration": 4.25179386138916 - }, - { - "auxiliary_loss_clip": 0.01180784, - "auxiliary_loss_mlp": 0.01055896, - "balance_loss_clip": 1.06094933, - "balance_loss_mlp": 1.03446186, - "epoch": 0.08693822335788366, - "flos": 19896522595200.0, - "grad_norm": 1.8066650797875201, - "language_loss": 0.81863767, - "learning_rate": 3.96608917705879e-06, - "loss": 0.84100449, - "num_input_tokens_seen": 30900225, - "step": 1446, - "time_per_iteration": 4.197181940078735 - }, - { - "auxiliary_loss_clip": 0.01056056, - "auxiliary_loss_mlp": 0.01004371, - "balance_loss_clip": 1.01782191, - "balance_loss_mlp": 1.00031781, - "epoch": 0.08699834661055163, - "flos": 67023747406080.0, - "grad_norm": 0.7255245569613363, - "language_loss": 0.54762936, - "learning_rate": 3.966017725489091e-06, - "loss": 0.56823361, - "num_input_tokens_seen": 30959580, - "step": 1447, - "time_per_iteration": 3.2158126831054688 - }, - { - "auxiliary_loss_clip": 0.0114861, - "auxiliary_loss_mlp": 0.01056824, - "balance_loss_clip": 1.05373001, - "balance_loss_mlp": 1.03518772, - "epoch": 0.0870584698632196, - "flos": 13480507451520.0, - "grad_norm": 2.1586118179593696, - "language_loss": 0.84592307, - "learning_rate": 3.965946199367804e-06, - "loss": 0.86797738, - "num_input_tokens_seen": 30976775, - "step": 1448, - "time_per_iteration": 4.262767314910889 - }, - { - "auxiliary_loss_clip": 0.01194173, - "auxiliary_loss_mlp": 0.01050219, - "balance_loss_clip": 1.05891991, - "balance_loss_mlp": 1.02768826, - "epoch": 0.08711859311588757, - "flos": 16107013745280.0, - "grad_norm": 3.4326906921347096, - "language_loss": 0.80644608, - "learning_rate": 3.965874598697638e-06, - "loss": 0.82888997, - "num_input_tokens_seen": 30990495, - "step": 1449, - "time_per_iteration": 4.553676128387451 - }, - { - "auxiliary_loss_clip": 0.01138548, - "auxiliary_loss_mlp": 0.01052142, - "balance_loss_clip": 1.05437374, - "balance_loss_mlp": 1.02946854, - "epoch": 0.08717871636855554, - "flos": 38472357928320.0, - "grad_norm": 1.5251600336566102, - "language_loss": 0.70971417, - "learning_rate": 3.965802923481313e-06, - "loss": 0.73162109, - "num_input_tokens_seen": 31014080, - "step": 1450, - "time_per_iteration": 2.9082705974578857 - }, - { - "auxiliary_loss_clip": 0.01124466, - "auxiliary_loss_mlp": 0.01054883, - "balance_loss_clip": 1.05164719, - "balance_loss_mlp": 1.03207827, - "epoch": 0.0872388396212235, - "flos": 17600574188160.0, - "grad_norm": 1.9392114767205617, - "language_loss": 0.83684897, - "learning_rate": 3.965731173721542e-06, - "loss": 0.85864246, - "num_input_tokens_seen": 31031210, - "step": 1451, - "time_per_iteration": 2.809880495071411 - }, - { - "auxiliary_loss_clip": 0.01134251, - "auxiliary_loss_mlp": 0.00780873, - "balance_loss_clip": 1.05147851, - "balance_loss_mlp": 1.00039482, - "epoch": 0.08729896287389148, - "flos": 25259385951360.0, - "grad_norm": 2.5160845512367773, - "language_loss": 0.74654591, - "learning_rate": 3.965659349421049e-06, - "loss": 0.76569718, - "num_input_tokens_seen": 31049710, - "step": 1452, - "time_per_iteration": 2.88580060005188 - }, - { - "auxiliary_loss_clip": 0.01157134, - "auxiliary_loss_mlp": 0.01063328, - "balance_loss_clip": 1.05607891, - "balance_loss_mlp": 1.0388428, - "epoch": 0.08735908612655945, - "flos": 15632454234240.0, - "grad_norm": 4.56941406999875, - "language_loss": 0.80543101, - "learning_rate": 3.965587450582556e-06, - "loss": 0.82763565, - "num_input_tokens_seen": 31066160, - "step": 1453, - "time_per_iteration": 2.733632802963257 - }, - { - "auxiliary_loss_clip": 0.01169707, - "auxiliary_loss_mlp": 0.01059533, - "balance_loss_clip": 1.05905569, - "balance_loss_mlp": 1.03625154, - "epoch": 0.08741920937922741, - "flos": 20339660684160.0, - "grad_norm": 2.0102093196988102, - "language_loss": 0.71041977, - "learning_rate": 3.96551547720879e-06, - "loss": 0.73271215, - "num_input_tokens_seen": 31085270, - "step": 1454, - "time_per_iteration": 2.7568745613098145 - }, - { - "auxiliary_loss_clip": 0.0106426, - "auxiliary_loss_mlp": 0.01008112, - "balance_loss_clip": 1.0215131, - "balance_loss_mlp": 1.00463128, - "epoch": 0.08747933263189539, - "flos": 62819795433600.0, - "grad_norm": 0.7713706503543015, - "language_loss": 0.5859946, - "learning_rate": 3.96544342930248e-06, - "loss": 0.6067183, - "num_input_tokens_seen": 31148445, - "step": 1455, - "time_per_iteration": 3.2372186183929443 - }, - { - "auxiliary_loss_clip": 0.01189404, - "auxiliary_loss_mlp": 0.01060742, - "balance_loss_clip": 1.05742884, - "balance_loss_mlp": 1.03688788, - "epoch": 0.08753945588456336, - "flos": 33035877648000.0, - "grad_norm": 1.6485208275358016, - "language_loss": 0.77564865, - "learning_rate": 3.965371306866359e-06, - "loss": 0.79815018, - "num_input_tokens_seen": 31168770, - "step": 1456, - "time_per_iteration": 2.790663003921509 - }, - { - "auxiliary_loss_clip": 0.01127959, - "auxiliary_loss_mlp": 0.01054526, - "balance_loss_clip": 1.04962158, - "balance_loss_mlp": 1.03071976, - "epoch": 0.08759957913723132, - "flos": 35547182046720.0, - "grad_norm": 1.83889407784057, - "language_loss": 0.72420907, - "learning_rate": 3.96529910990316e-06, - "loss": 0.74603397, - "num_input_tokens_seen": 31189270, - "step": 1457, - "time_per_iteration": 2.9099740982055664 - }, - { - "auxiliary_loss_clip": 0.01176549, - "auxiliary_loss_mlp": 0.0104866, - "balance_loss_clip": 1.05627227, - "balance_loss_mlp": 1.02633214, - "epoch": 0.0876597023898993, - "flos": 23911120022400.0, - "grad_norm": 1.5250401870177361, - "language_loss": 0.86412215, - "learning_rate": 3.965226838415622e-06, - "loss": 0.88637424, - "num_input_tokens_seen": 31210385, - "step": 1458, - "time_per_iteration": 2.7517166137695312 - }, - { - "auxiliary_loss_clip": 0.01169535, - "auxiliary_loss_mlp": 0.01061413, - "balance_loss_clip": 1.05884266, - "balance_loss_mlp": 1.03825045, - "epoch": 0.08771982564256726, - "flos": 18114025150080.0, - "grad_norm": 1.7412813512419094, - "language_loss": 0.80268395, - "learning_rate": 3.965154492406486e-06, - "loss": 0.82499349, - "num_input_tokens_seen": 31229745, - "step": 1459, - "time_per_iteration": 2.71455717086792 - }, - { - "auxiliary_loss_clip": 0.01130491, - "auxiliary_loss_mlp": 0.01054334, - "balance_loss_clip": 1.05256546, - "balance_loss_mlp": 1.03018188, - "epoch": 0.08777994889523523, - "flos": 17712005155200.0, - "grad_norm": 2.1450339680450714, - "language_loss": 0.84538847, - "learning_rate": 3.9650820718784945e-06, - "loss": 0.86723673, - "num_input_tokens_seen": 31248280, - "step": 1460, - "time_per_iteration": 2.8737733364105225 - }, - { - "auxiliary_loss_clip": 0.01177787, - "auxiliary_loss_mlp": 0.01057974, - "balance_loss_clip": 1.0572983, - "balance_loss_mlp": 1.03640938, - "epoch": 0.0878400721479032, - "flos": 12819930382080.0, - "grad_norm": 4.917361835698274, - "language_loss": 0.79993135, - "learning_rate": 3.965009576834394e-06, - "loss": 0.82228899, - "num_input_tokens_seen": 31262190, - "step": 1461, - "time_per_iteration": 2.8436062335968018 - }, - { - "auxiliary_loss_clip": 0.01169165, - "auxiliary_loss_mlp": 0.01058947, - "balance_loss_clip": 1.05800629, - "balance_loss_mlp": 1.03704822, - "epoch": 0.08790019540057117, - "flos": 26392690938240.0, - "grad_norm": 1.566202508611165, - "language_loss": 0.76571167, - "learning_rate": 3.964937007276932e-06, - "loss": 0.78799284, - "num_input_tokens_seen": 31283690, - "step": 1462, - "time_per_iteration": 2.7895474433898926 - }, - { - "auxiliary_loss_clip": 0.0117563, - "auxiliary_loss_mlp": 0.01060064, - "balance_loss_clip": 1.05839491, - "balance_loss_mlp": 1.03580475, - "epoch": 0.08796031865323914, - "flos": 19134031662720.0, - "grad_norm": 2.89717114041641, - "language_loss": 0.74710488, - "learning_rate": 3.9648643632088634e-06, - "loss": 0.76946187, - "num_input_tokens_seen": 31302505, - "step": 1463, - "time_per_iteration": 2.760404348373413 - }, - { - "auxiliary_loss_clip": 0.01191543, - "auxiliary_loss_mlp": 0.01061609, - "balance_loss_clip": 1.06145048, - "balance_loss_mlp": 1.03680158, - "epoch": 0.0880204419059071, - "flos": 26064287867520.0, - "grad_norm": 2.431514195311041, - "language_loss": 0.83797103, - "learning_rate": 3.964791644632941e-06, - "loss": 0.8605026, - "num_input_tokens_seen": 31323070, - "step": 1464, - "time_per_iteration": 2.7417759895324707 - }, - { - "auxiliary_loss_clip": 0.011733, - "auxiliary_loss_mlp": 0.01063475, - "balance_loss_clip": 1.05683231, - "balance_loss_mlp": 1.04093289, - "epoch": 0.08808056515857508, - "flos": 22377842115840.0, - "grad_norm": 2.1775753375634963, - "language_loss": 0.78104752, - "learning_rate": 3.964718851551923e-06, - "loss": 0.8034153, - "num_input_tokens_seen": 31341880, - "step": 1465, - "time_per_iteration": 2.6852309703826904 - }, - { - "auxiliary_loss_clip": 0.01199489, - "auxiliary_loss_mlp": 0.01059873, - "balance_loss_clip": 1.0619812, - "balance_loss_mlp": 1.03791499, - "epoch": 0.08814068841124305, - "flos": 23185293897600.0, - "grad_norm": 2.412657222564686, - "language_loss": 0.85187089, - "learning_rate": 3.9646459839685675e-06, - "loss": 0.87446451, - "num_input_tokens_seen": 31361995, - "step": 1466, - "time_per_iteration": 2.706264019012451 - }, - { - "auxiliary_loss_clip": 0.01120627, - "auxiliary_loss_mlp": 0.00782645, - "balance_loss_clip": 1.04989958, - "balance_loss_mlp": 1.00037241, - "epoch": 0.08820081166391101, - "flos": 25155281358720.0, - "grad_norm": 1.9900601596102498, - "language_loss": 0.84168816, - "learning_rate": 3.964573041885641e-06, - "loss": 0.86072087, - "num_input_tokens_seen": 31381515, - "step": 1467, - "time_per_iteration": 2.8636934757232666 - }, - { - "auxiliary_loss_clip": 0.01178935, - "auxiliary_loss_mlp": 0.01055379, - "balance_loss_clip": 1.05910301, - "balance_loss_mlp": 1.03219247, - "epoch": 0.08826093491657899, - "flos": 22231685675520.0, - "grad_norm": 1.660218686828999, - "language_loss": 0.75506544, - "learning_rate": 3.964500025305907e-06, - "loss": 0.77740854, - "num_input_tokens_seen": 31400345, - "step": 1468, - "time_per_iteration": 2.661501884460449 - }, - { - "auxiliary_loss_clip": 0.01181261, - "auxiliary_loss_mlp": 0.01054252, - "balance_loss_clip": 1.0629456, - "balance_loss_mlp": 1.03266358, - "epoch": 0.08832105816924696, - "flos": 22126826897280.0, - "grad_norm": 4.868504388441724, - "language_loss": 0.80322379, - "learning_rate": 3.9644269342321355e-06, - "loss": 0.82557893, - "num_input_tokens_seen": 31419620, - "step": 1469, - "time_per_iteration": 2.7473137378692627 - }, - { - "auxiliary_loss_clip": 0.01198542, - "auxiliary_loss_mlp": 0.01059353, - "balance_loss_clip": 1.0627017, - "balance_loss_mlp": 1.03677487, - "epoch": 0.08838118142191492, - "flos": 17566495159680.0, - "grad_norm": 2.0179242193855806, - "language_loss": 0.77437651, - "learning_rate": 3.9643537686670974e-06, - "loss": 0.79695547, - "num_input_tokens_seen": 31437970, - "step": 1470, - "time_per_iteration": 2.7672410011291504 - }, - { - "auxiliary_loss_clip": 0.01193825, - "auxiliary_loss_mlp": 0.01067102, - "balance_loss_clip": 1.06180143, - "balance_loss_mlp": 1.04281926, - "epoch": 0.0884413046745829, - "flos": 20777196251520.0, - "grad_norm": 1.6812425162011504, - "language_loss": 0.84297001, - "learning_rate": 3.964280528613569e-06, - "loss": 0.86557925, - "num_input_tokens_seen": 31457040, - "step": 1471, - "time_per_iteration": 2.7584216594696045 - }, - { - "auxiliary_loss_clip": 0.01156315, - "auxiliary_loss_mlp": 0.01054307, - "balance_loss_clip": 1.05682266, - "balance_loss_mlp": 1.03342199, - "epoch": 0.08850142792725087, - "flos": 22125462180480.0, - "grad_norm": 1.6938350729430058, - "language_loss": 0.83321345, - "learning_rate": 3.964207214074324e-06, - "loss": 0.85531968, - "num_input_tokens_seen": 31477520, - "step": 1472, - "time_per_iteration": 2.7895469665527344 - }, - { - "auxiliary_loss_clip": 0.01176151, - "auxiliary_loss_mlp": 0.01058616, - "balance_loss_clip": 1.06106544, - "balance_loss_mlp": 1.03529835, - "epoch": 0.08856155117991883, - "flos": 22418744728320.0, - "grad_norm": 2.3638705809965, - "language_loss": 0.82781172, - "learning_rate": 3.964133825052146e-06, - "loss": 0.85015941, - "num_input_tokens_seen": 31495575, - "step": 1473, - "time_per_iteration": 2.7361483573913574 - }, - { - "auxiliary_loss_clip": 0.01129906, - "auxiliary_loss_mlp": 0.01064148, - "balance_loss_clip": 1.05552769, - "balance_loss_mlp": 1.04263091, - "epoch": 0.0886216744325868, - "flos": 29937002572800.0, - "grad_norm": 1.6022277785896435, - "language_loss": 0.78712153, - "learning_rate": 3.964060361549816e-06, - "loss": 0.80906206, - "num_input_tokens_seen": 31520020, - "step": 1474, - "time_per_iteration": 2.894319534301758 - }, - { - "auxiliary_loss_clip": 0.01146238, - "auxiliary_loss_mlp": 0.01068131, - "balance_loss_clip": 1.05575764, - "balance_loss_mlp": 1.04175043, - "epoch": 0.08868179768525478, - "flos": 23982833525760.0, - "grad_norm": 1.6120869011213488, - "language_loss": 0.79030406, - "learning_rate": 3.963986823570121e-06, - "loss": 0.81244779, - "num_input_tokens_seen": 31539265, - "step": 1475, - "time_per_iteration": 2.8806042671203613 - }, - { - "auxiliary_loss_clip": 0.01191986, - "auxiliary_loss_mlp": 0.01047451, - "balance_loss_clip": 1.05980015, - "balance_loss_mlp": 1.02478909, - "epoch": 0.08874192093792274, - "flos": 43177553216640.0, - "grad_norm": 1.4679464237421194, - "language_loss": 0.74202317, - "learning_rate": 3.963913211115848e-06, - "loss": 0.76441753, - "num_input_tokens_seen": 31563425, - "step": 1476, - "time_per_iteration": 2.8381049633026123 - }, - { - "auxiliary_loss_clip": 0.01174628, - "auxiliary_loss_mlp": 0.01059934, - "balance_loss_clip": 1.06217527, - "balance_loss_mlp": 1.03678358, - "epoch": 0.0888020441905907, - "flos": 32852445868800.0, - "grad_norm": 1.712954575149443, - "language_loss": 0.74220836, - "learning_rate": 3.9638395241897895e-06, - "loss": 0.76455402, - "num_input_tokens_seen": 31584525, - "step": 1477, - "time_per_iteration": 2.8452210426330566 - }, - { - "auxiliary_loss_clip": 0.01191865, - "auxiliary_loss_mlp": 0.01051229, - "balance_loss_clip": 1.06062829, - "balance_loss_mlp": 1.0278163, - "epoch": 0.08886216744325869, - "flos": 23149347361920.0, - "grad_norm": 1.95844459768748, - "language_loss": 0.87194049, - "learning_rate": 3.963765762794739e-06, - "loss": 0.89437139, - "num_input_tokens_seen": 31603325, - "step": 1478, - "time_per_iteration": 2.644918203353882 - }, - { - "auxiliary_loss_clip": 0.01176299, - "auxiliary_loss_mlp": 0.01058069, - "balance_loss_clip": 1.0572443, - "balance_loss_mlp": 1.03546739, - "epoch": 0.08892229069592665, - "flos": 23331593992320.0, - "grad_norm": 1.6306868156426517, - "language_loss": 0.77571511, - "learning_rate": 3.963691926933495e-06, - "loss": 0.79805881, - "num_input_tokens_seen": 31624820, - "step": 1479, - "time_per_iteration": 2.738168954849243 - }, - { - "auxiliary_loss_clip": 0.01164179, - "auxiliary_loss_mlp": 0.010526, - "balance_loss_clip": 1.05629039, - "balance_loss_mlp": 1.02801871, - "epoch": 0.08898241394859462, - "flos": 26213784272640.0, - "grad_norm": 2.199164032289915, - "language_loss": 0.77797234, - "learning_rate": 3.9636180166088555e-06, - "loss": 0.80014014, - "num_input_tokens_seen": 31646080, - "step": 1480, - "time_per_iteration": 2.837562322616577 - }, - { - "auxiliary_loss_clip": 0.01180168, - "auxiliary_loss_mlp": 0.01060894, - "balance_loss_clip": 1.05762577, - "balance_loss_mlp": 1.03656292, - "epoch": 0.0890425372012626, - "flos": 23550613171200.0, - "grad_norm": 2.9471668635954273, - "language_loss": 0.66437578, - "learning_rate": 3.963544031823624e-06, - "loss": 0.68678641, - "num_input_tokens_seen": 31665770, - "step": 1481, - "time_per_iteration": 2.742422580718994 - }, - { - "auxiliary_loss_clip": 0.01143445, - "auxiliary_loss_mlp": 0.01055318, - "balance_loss_clip": 1.05510306, - "balance_loss_mlp": 1.03273988, - "epoch": 0.08910266045393056, - "flos": 23002795872000.0, - "grad_norm": 2.124586862599894, - "language_loss": 0.96630967, - "learning_rate": 3.9634699725806065e-06, - "loss": 0.9882974, - "num_input_tokens_seen": 31683805, - "step": 1482, - "time_per_iteration": 2.8150243759155273 - }, - { - "auxiliary_loss_clip": 0.0115336, - "auxiliary_loss_mlp": 0.01057266, - "balance_loss_clip": 1.05521989, - "balance_loss_mlp": 1.03353167, - "epoch": 0.08916278370659853, - "flos": 31936508035200.0, - "grad_norm": 1.7904792435575492, - "language_loss": 0.78683239, - "learning_rate": 3.96339583888261e-06, - "loss": 0.80893862, - "num_input_tokens_seen": 31704630, - "step": 1483, - "time_per_iteration": 2.869084119796753 - }, - { - "auxiliary_loss_clip": 0.0116904, - "auxiliary_loss_mlp": 0.01082082, - "balance_loss_clip": 1.05540919, - "balance_loss_mlp": 1.05829978, - "epoch": 0.08922290695926649, - "flos": 17530404969600.0, - "grad_norm": 2.2229749189835677, - "language_loss": 0.85424453, - "learning_rate": 3.963321630732448e-06, - "loss": 0.87675571, - "num_input_tokens_seen": 31723255, - "step": 1484, - "time_per_iteration": 4.280332326889038 - }, - { - "auxiliary_loss_clip": 0.01199312, - "auxiliary_loss_mlp": 0.01060639, - "balance_loss_clip": 1.06350458, - "balance_loss_mlp": 1.03701186, - "epoch": 0.08928303021193447, - "flos": 32125075459200.0, - "grad_norm": 1.7208139316694195, - "language_loss": 0.80205405, - "learning_rate": 3.963247348132932e-06, - "loss": 0.82465357, - "num_input_tokens_seen": 31747045, - "step": 1485, - "time_per_iteration": 2.761733055114746 - }, - { - "auxiliary_loss_clip": 0.01173167, - "auxiliary_loss_mlp": 0.01056554, - "balance_loss_clip": 1.0563333, - "balance_loss_mlp": 1.03228331, - "epoch": 0.08934315346460243, - "flos": 22125210785280.0, - "grad_norm": 1.8969438127775513, - "language_loss": 0.82859123, - "learning_rate": 3.96317299108688e-06, - "loss": 0.85088843, - "num_input_tokens_seen": 31766615, - "step": 1486, - "time_per_iteration": 4.144649028778076 - }, - { - "auxiliary_loss_clip": 0.01144509, - "auxiliary_loss_mlp": 0.01063805, - "balance_loss_clip": 1.05592823, - "balance_loss_mlp": 1.04021382, - "epoch": 0.0894032767172704, - "flos": 22565583527040.0, - "grad_norm": 2.1520807598980185, - "language_loss": 0.76365155, - "learning_rate": 3.963098559597111e-06, - "loss": 0.78573477, - "num_input_tokens_seen": 31785855, - "step": 1487, - "time_per_iteration": 4.432489395141602 - }, - { - "auxiliary_loss_clip": 0.01157327, - "auxiliary_loss_mlp": 0.01060261, - "balance_loss_clip": 1.05041027, - "balance_loss_mlp": 1.03542995, - "epoch": 0.08946339996993838, - "flos": 20193396503040.0, - "grad_norm": 3.851280697857004, - "language_loss": 0.83030224, - "learning_rate": 3.963024053666449e-06, - "loss": 0.85247803, - "num_input_tokens_seen": 31804210, - "step": 1488, - "time_per_iteration": 2.7262001037597656 - }, - { - "auxiliary_loss_clip": 0.01171869, - "auxiliary_loss_mlp": 0.01051875, - "balance_loss_clip": 1.05546355, - "balance_loss_mlp": 1.02916527, - "epoch": 0.08952352322260634, - "flos": 48360181104000.0, - "grad_norm": 1.7759111472560039, - "language_loss": 0.71783459, - "learning_rate": 3.962949473297718e-06, - "loss": 0.74007201, - "num_input_tokens_seen": 31826150, - "step": 1489, - "time_per_iteration": 4.562536954879761 - }, - { - "auxiliary_loss_clip": 0.01150585, - "auxiliary_loss_mlp": 0.01051382, - "balance_loss_clip": 1.05190349, - "balance_loss_mlp": 1.02830291, - "epoch": 0.08958364647527431, - "flos": 31793081028480.0, - "grad_norm": 1.6999724957706692, - "language_loss": 0.89717221, - "learning_rate": 3.962874818493745e-06, - "loss": 0.91919196, - "num_input_tokens_seen": 31848060, - "step": 1490, - "time_per_iteration": 2.838327646255493 - }, - { - "auxiliary_loss_clip": 0.01184278, - "auxiliary_loss_mlp": 0.01064168, - "balance_loss_clip": 1.05656135, - "balance_loss_mlp": 1.04102957, - "epoch": 0.08964376972794229, - "flos": 23368186972800.0, - "grad_norm": 3.9062133325383126, - "language_loss": 0.73075998, - "learning_rate": 3.9628000892573635e-06, - "loss": 0.7532444, - "num_input_tokens_seen": 31870040, - "step": 1491, - "time_per_iteration": 2.7007367610931396 - }, - { - "auxiliary_loss_clip": 0.01189564, - "auxiliary_loss_mlp": 0.00780167, - "balance_loss_clip": 1.05968356, - "balance_loss_mlp": 1.00023544, - "epoch": 0.08970389298061025, - "flos": 23294785530240.0, - "grad_norm": 1.7021050418948058, - "language_loss": 0.77235049, - "learning_rate": 3.9627252855914055e-06, - "loss": 0.79204774, - "num_input_tokens_seen": 31890400, - "step": 1492, - "time_per_iteration": 2.7799623012542725 - }, - { - "auxiliary_loss_clip": 0.01187114, - "auxiliary_loss_mlp": 0.01057952, - "balance_loss_clip": 1.05902028, - "balance_loss_mlp": 1.03512359, - "epoch": 0.08976401623327822, - "flos": 33761703772800.0, - "grad_norm": 1.9236790530591625, - "language_loss": 0.71429193, - "learning_rate": 3.962650407498707e-06, - "loss": 0.73674262, - "num_input_tokens_seen": 31913435, - "step": 1493, - "time_per_iteration": 2.8479840755462646 - }, - { - "auxiliary_loss_clip": 0.01188796, - "auxiliary_loss_mlp": 0.01057103, - "balance_loss_clip": 1.05757976, - "balance_loss_mlp": 1.03371406, - "epoch": 0.08982413948594618, - "flos": 23911335504000.0, - "grad_norm": 2.6977604073852053, - "language_loss": 0.87175488, - "learning_rate": 3.962575454982109e-06, - "loss": 0.8942138, - "num_input_tokens_seen": 31932435, - "step": 1494, - "time_per_iteration": 2.855658769607544 - }, - { - "auxiliary_loss_clip": 0.0108466, - "auxiliary_loss_mlp": 0.01070478, - "balance_loss_clip": 1.04641223, - "balance_loss_mlp": 1.04551601, - "epoch": 0.08988426273861416, - "flos": 16837544551680.0, - "grad_norm": 1.6162523894431247, - "language_loss": 0.82929438, - "learning_rate": 3.962500428044454e-06, - "loss": 0.85084569, - "num_input_tokens_seen": 31950125, - "step": 1495, - "time_per_iteration": 2.9265449047088623 - }, - { - "auxiliary_loss_clip": 0.01171464, - "auxiliary_loss_mlp": 0.01059756, - "balance_loss_clip": 1.05779243, - "balance_loss_mlp": 1.03682017, - "epoch": 0.08994438599128213, - "flos": 14793365548800.0, - "grad_norm": 9.387255385257733, - "language_loss": 0.70191383, - "learning_rate": 3.962425326688585e-06, - "loss": 0.72422606, - "num_input_tokens_seen": 31968050, - "step": 1496, - "time_per_iteration": 2.773693799972534 - }, - { - "auxiliary_loss_clip": 0.01164171, - "auxiliary_loss_mlp": 0.01049454, - "balance_loss_clip": 1.05397439, - "balance_loss_mlp": 1.02888989, - "epoch": 0.09000450924395009, - "flos": 17384320356480.0, - "grad_norm": 1.6327835891742186, - "language_loss": 0.79752576, - "learning_rate": 3.962350150917351e-06, - "loss": 0.81966203, - "num_input_tokens_seen": 31985675, - "step": 1497, - "time_per_iteration": 2.6850852966308594 - }, - { - "auxiliary_loss_clip": 0.01129609, - "auxiliary_loss_mlp": 0.01054903, - "balance_loss_clip": 1.05307686, - "balance_loss_mlp": 1.03146648, - "epoch": 0.09006463249661807, - "flos": 24280317964800.0, - "grad_norm": 8.517000212139891, - "language_loss": 0.82940567, - "learning_rate": 3.9622749007336035e-06, - "loss": 0.85125089, - "num_input_tokens_seen": 32005180, - "step": 1498, - "time_per_iteration": 2.786205768585205 - }, - { - "auxiliary_loss_clip": 0.01170006, - "auxiliary_loss_mlp": 0.01059397, - "balance_loss_clip": 1.0577898, - "balance_loss_mlp": 1.03718853, - "epoch": 0.09012475574928604, - "flos": 13661928069120.0, - "grad_norm": 2.220597323082783, - "language_loss": 0.78609937, - "learning_rate": 3.962199576140195e-06, - "loss": 0.80839342, - "num_input_tokens_seen": 32022970, - "step": 1499, - "time_per_iteration": 2.71785831451416 - }, - { - "auxiliary_loss_clip": 0.01161539, - "auxiliary_loss_mlp": 0.00780528, - "balance_loss_clip": 1.05444527, - "balance_loss_mlp": 1.00024021, - "epoch": 0.090184879001954, - "flos": 23327751237120.0, - "grad_norm": 2.049001350461653, - "language_loss": 0.93337607, - "learning_rate": 3.962124177139981e-06, - "loss": 0.95279682, - "num_input_tokens_seen": 32043055, - "step": 1500, - "time_per_iteration": 2.7077536582946777 - }, - { - "auxiliary_loss_clip": 0.01148009, - "auxiliary_loss_mlp": 0.01055246, - "balance_loss_clip": 1.05371249, - "balance_loss_mlp": 1.0308435, - "epoch": 0.09024500225462198, - "flos": 23002688131200.0, - "grad_norm": 3.0778515668575492, - "language_loss": 0.74595469, - "learning_rate": 3.962048703735822e-06, - "loss": 0.76798725, - "num_input_tokens_seen": 32061900, - "step": 1501, - "time_per_iteration": 2.7073416709899902 - }, - { - "auxiliary_loss_clip": 0.01056535, - "auxiliary_loss_mlp": 0.01013118, - "balance_loss_clip": 1.03392363, - "balance_loss_mlp": 1.00963676, - "epoch": 0.09030512550728995, - "flos": 62189203242240.0, - "grad_norm": 0.7274487593473578, - "language_loss": 0.58316052, - "learning_rate": 3.96197315593058e-06, - "loss": 0.60385704, - "num_input_tokens_seen": 32122745, - "step": 1502, - "time_per_iteration": 3.274049997329712 - }, - { - "auxiliary_loss_clip": 0.0114469, - "auxiliary_loss_mlp": 0.01062533, - "balance_loss_clip": 1.04626393, - "balance_loss_mlp": 1.03896546, - "epoch": 0.09036524875995791, - "flos": 38800689171840.0, - "grad_norm": 2.1727281711500095, - "language_loss": 0.69501173, - "learning_rate": 3.961897533727119e-06, - "loss": 0.71708393, - "num_input_tokens_seen": 32145125, - "step": 1503, - "time_per_iteration": 2.87554669380188 - }, - { - "auxiliary_loss_clip": 0.01133108, - "auxiliary_loss_mlp": 0.0105903, - "balance_loss_clip": 1.04783726, - "balance_loss_mlp": 1.03660655, - "epoch": 0.09042537201262588, - "flos": 21690081429120.0, - "grad_norm": 2.169205134580129, - "language_loss": 0.86124271, - "learning_rate": 3.961821837128306e-06, - "loss": 0.88316405, - "num_input_tokens_seen": 32166255, - "step": 1504, - "time_per_iteration": 2.844688892364502 - }, - { - "auxiliary_loss_clip": 0.01146301, - "auxiliary_loss_mlp": 0.01069714, - "balance_loss_clip": 1.05341232, - "balance_loss_mlp": 1.04261804, - "epoch": 0.09048549526529386, - "flos": 22267021680000.0, - "grad_norm": 2.178155372989796, - "language_loss": 0.7233696, - "learning_rate": 3.961746066137014e-06, - "loss": 0.74552977, - "num_input_tokens_seen": 32184010, - "step": 1505, - "time_per_iteration": 2.7992677688598633 - }, - { - "auxiliary_loss_clip": 0.01137399, - "auxiliary_loss_mlp": 0.01056414, - "balance_loss_clip": 1.05097985, - "balance_loss_mlp": 1.03302479, - "epoch": 0.09054561851796182, - "flos": 14610939350400.0, - "grad_norm": 2.5107188210784526, - "language_loss": 0.80730999, - "learning_rate": 3.961670220756114e-06, - "loss": 0.82924813, - "num_input_tokens_seen": 32201635, - "step": 1506, - "time_per_iteration": 2.7458760738372803 - }, - { - "auxiliary_loss_clip": 0.01140643, - "auxiliary_loss_mlp": 0.01053315, - "balance_loss_clip": 1.05161858, - "balance_loss_mlp": 1.03197718, - "epoch": 0.09060574177062979, - "flos": 27636169916160.0, - "grad_norm": 2.166956120197676, - "language_loss": 0.75915337, - "learning_rate": 3.961594300988482e-06, - "loss": 0.78109294, - "num_input_tokens_seen": 32221940, - "step": 1507, - "time_per_iteration": 2.873826742172241 - }, - { - "auxiliary_loss_clip": 0.01051873, - "auxiliary_loss_mlp": 0.01005715, - "balance_loss_clip": 1.02043629, - "balance_loss_mlp": 1.00175714, - "epoch": 0.09066586502329776, - "flos": 66085797513600.0, - "grad_norm": 0.7272435825555993, - "language_loss": 0.57699698, - "learning_rate": 3.961518306836998e-06, - "loss": 0.59757286, - "num_input_tokens_seen": 32276495, - "step": 1508, - "time_per_iteration": 3.064926862716675 - }, - { - "auxiliary_loss_clip": 0.01165416, - "auxiliary_loss_mlp": 0.01054804, - "balance_loss_clip": 1.055233, - "balance_loss_mlp": 1.03155804, - "epoch": 0.09072598827596573, - "flos": 18916449027840.0, - "grad_norm": 1.7601330807914457, - "language_loss": 0.85090744, - "learning_rate": 3.961442238304543e-06, - "loss": 0.87310958, - "num_input_tokens_seen": 32294130, - "step": 1509, - "time_per_iteration": 2.6664113998413086 - }, - { - "auxiliary_loss_clip": 0.01168837, - "auxiliary_loss_mlp": 0.01064138, - "balance_loss_clip": 1.05745769, - "balance_loss_mlp": 1.03949761, - "epoch": 0.0907861115286337, - "flos": 24821742643200.0, - "grad_norm": 2.3794507710009203, - "language_loss": 0.84110659, - "learning_rate": 3.961366095394002e-06, - "loss": 0.8634364, - "num_input_tokens_seen": 32313555, - "step": 1510, - "time_per_iteration": 2.783484697341919 - }, - { - "auxiliary_loss_clip": 0.01153141, - "auxiliary_loss_mlp": 0.01058569, - "balance_loss_clip": 1.05423617, - "balance_loss_mlp": 1.03482211, - "epoch": 0.09084623478130167, - "flos": 21652842003840.0, - "grad_norm": 1.8490761573484715, - "language_loss": 0.85247588, - "learning_rate": 3.961289878108262e-06, - "loss": 0.87459302, - "num_input_tokens_seen": 32331430, - "step": 1511, - "time_per_iteration": 2.714620351791382 - }, - { - "auxiliary_loss_clip": 0.01145395, - "auxiliary_loss_mlp": 0.01052919, - "balance_loss_clip": 1.05182219, - "balance_loss_mlp": 1.02983987, - "epoch": 0.09090635803396964, - "flos": 27639258485760.0, - "grad_norm": 1.5734326837562458, - "language_loss": 0.84977764, - "learning_rate": 3.9612135864502135e-06, - "loss": 0.87176073, - "num_input_tokens_seen": 32353705, - "step": 1512, - "time_per_iteration": 2.75361704826355 - }, - { - "auxiliary_loss_clip": 0.01155239, - "auxiliary_loss_mlp": 0.01053669, - "balance_loss_clip": 1.05740952, - "balance_loss_mlp": 1.03185391, - "epoch": 0.0909664812866376, - "flos": 17669127294720.0, - "grad_norm": 3.0235926431973654, - "language_loss": 0.87346804, - "learning_rate": 3.961137220422749e-06, - "loss": 0.89555705, - "num_input_tokens_seen": 32370520, - "step": 1513, - "time_per_iteration": 2.6864211559295654 - }, - { - "auxiliary_loss_clip": 0.01168585, - "auxiliary_loss_mlp": 0.01049408, - "balance_loss_clip": 1.05562937, - "balance_loss_mlp": 1.02841544, - "epoch": 0.09102660453930557, - "flos": 23951448017280.0, - "grad_norm": 1.7883280971870592, - "language_loss": 0.86802679, - "learning_rate": 3.961060780028764e-06, - "loss": 0.89020675, - "num_input_tokens_seen": 32389105, - "step": 1514, - "time_per_iteration": 2.6788065433502197 - }, - { - "auxiliary_loss_clip": 0.01134005, - "auxiliary_loss_mlp": 0.01064386, - "balance_loss_clip": 1.05571628, - "balance_loss_mlp": 1.04252315, - "epoch": 0.09108672779197355, - "flos": 25812949426560.0, - "grad_norm": 1.7666120550996132, - "language_loss": 0.89944756, - "learning_rate": 3.960984265271159e-06, - "loss": 0.92143154, - "num_input_tokens_seen": 32408065, - "step": 1515, - "time_per_iteration": 2.757390022277832 - }, - { - "auxiliary_loss_clip": 0.01162518, - "auxiliary_loss_mlp": 0.01056937, - "balance_loss_clip": 1.05547726, - "balance_loss_mlp": 1.03360808, - "epoch": 0.09114685104464151, - "flos": 29639482220160.0, - "grad_norm": 2.1090985009837646, - "language_loss": 0.85576892, - "learning_rate": 3.9609076761528335e-06, - "loss": 0.87796342, - "num_input_tokens_seen": 32427225, - "step": 1516, - "time_per_iteration": 2.704784870147705 - }, - { - "auxiliary_loss_clip": 0.01158781, - "auxiliary_loss_mlp": 0.01057165, - "balance_loss_clip": 1.05135357, - "balance_loss_mlp": 1.03451526, - "epoch": 0.09120697429730948, - "flos": 33729635905920.0, - "grad_norm": 2.086405156201108, - "language_loss": 0.81167233, - "learning_rate": 3.960831012676692e-06, - "loss": 0.83383185, - "num_input_tokens_seen": 32450510, - "step": 1517, - "time_per_iteration": 2.8586854934692383 - }, - { - "auxiliary_loss_clip": 0.0117857, - "auxiliary_loss_mlp": 0.01065492, - "balance_loss_clip": 1.05741739, - "balance_loss_mlp": 1.04280686, - "epoch": 0.09126709754997746, - "flos": 18401381953920.0, - "grad_norm": 2.104468567304263, - "language_loss": 0.78067243, - "learning_rate": 3.960754274845642e-06, - "loss": 0.80311304, - "num_input_tokens_seen": 32468425, - "step": 1518, - "time_per_iteration": 2.7862088680267334 - }, - { - "auxiliary_loss_clip": 0.01165395, - "auxiliary_loss_mlp": 0.01061371, - "balance_loss_clip": 1.05285823, - "balance_loss_mlp": 1.03900695, - "epoch": 0.09132722080264542, - "flos": 22091957769600.0, - "grad_norm": 1.6816479812467473, - "language_loss": 0.86124098, - "learning_rate": 3.960677462662594e-06, - "loss": 0.88350856, - "num_input_tokens_seen": 32487510, - "step": 1519, - "time_per_iteration": 2.723714828491211 - }, - { - "auxiliary_loss_clip": 0.01163599, - "auxiliary_loss_mlp": 0.01052792, - "balance_loss_clip": 1.05454183, - "balance_loss_mlp": 1.02914131, - "epoch": 0.09138734405531339, - "flos": 21033131633280.0, - "grad_norm": 1.9681293960876167, - "language_loss": 0.73279071, - "learning_rate": 3.96060057613046e-06, - "loss": 0.75495458, - "num_input_tokens_seen": 32507250, - "step": 1520, - "time_per_iteration": 2.8098628520965576 - }, - { - "auxiliary_loss_clip": 0.01161166, - "auxiliary_loss_mlp": 0.01058035, - "balance_loss_clip": 1.05696058, - "balance_loss_mlp": 1.03469419, - "epoch": 0.09144746730798137, - "flos": 20083940784000.0, - "grad_norm": 2.6988457876937066, - "language_loss": 0.85236609, - "learning_rate": 3.960523615252156e-06, - "loss": 0.87455815, - "num_input_tokens_seen": 32526045, - "step": 1521, - "time_per_iteration": 2.7134172916412354 - }, - { - "auxiliary_loss_clip": 0.01120174, - "auxiliary_loss_mlp": 0.01063979, - "balance_loss_clip": 1.05189717, - "balance_loss_mlp": 1.03991079, - "epoch": 0.09150759056064933, - "flos": 22778210085120.0, - "grad_norm": 1.6991603177293335, - "language_loss": 0.83933008, - "learning_rate": 3.960446580030599e-06, - "loss": 0.8611716, - "num_input_tokens_seen": 32546575, - "step": 1522, - "time_per_iteration": 2.93745493888855 - }, - { - "auxiliary_loss_clip": 0.01182362, - "auxiliary_loss_mlp": 0.01064589, - "balance_loss_clip": 1.05630755, - "balance_loss_mlp": 1.04153395, - "epoch": 0.0915677138133173, - "flos": 27564205017600.0, - "grad_norm": 1.647915064434875, - "language_loss": 0.81012994, - "learning_rate": 3.960369470468711e-06, - "loss": 0.8325994, - "num_input_tokens_seen": 32568795, - "step": 1523, - "time_per_iteration": 4.378152847290039 - }, - { - "auxiliary_loss_clip": 0.01157976, - "auxiliary_loss_mlp": 0.00781395, - "balance_loss_clip": 1.05422449, - "balance_loss_mlp": 1.00037968, - "epoch": 0.09162783706598528, - "flos": 17674765729920.0, - "grad_norm": 2.106497620262502, - "language_loss": 0.7460072, - "learning_rate": 3.960292286569418e-06, - "loss": 0.76540089, - "num_input_tokens_seen": 32587010, - "step": 1524, - "time_per_iteration": 2.7146124839782715 - }, - { - "auxiliary_loss_clip": 0.01135228, - "auxiliary_loss_mlp": 0.0106119, - "balance_loss_clip": 1.05092478, - "balance_loss_mlp": 1.03782487, - "epoch": 0.09168796031865324, - "flos": 18478195188480.0, - "grad_norm": 2.0992608845945413, - "language_loss": 0.86498803, - "learning_rate": 3.960215028335644e-06, - "loss": 0.88695222, - "num_input_tokens_seen": 32602375, - "step": 1525, - "time_per_iteration": 4.314826965332031 - }, - { - "auxiliary_loss_clip": 0.01164396, - "auxiliary_loss_mlp": 0.01049506, - "balance_loss_clip": 1.05688822, - "balance_loss_mlp": 1.0263319, - "epoch": 0.0917480835713212, - "flos": 29387605075200.0, - "grad_norm": 2.1146348399758237, - "language_loss": 0.74512708, - "learning_rate": 3.96013769577032e-06, - "loss": 0.76726609, - "num_input_tokens_seen": 32621460, - "step": 1526, - "time_per_iteration": 5.878855466842651 - }, - { - "auxiliary_loss_clip": 0.01186002, - "auxiliary_loss_mlp": 0.01055817, - "balance_loss_clip": 1.05732703, - "balance_loss_mlp": 1.03392982, - "epoch": 0.09180820682398917, - "flos": 19829262378240.0, - "grad_norm": 2.5135282962071215, - "language_loss": 0.77581728, - "learning_rate": 3.960060288876378e-06, - "loss": 0.79823542, - "num_input_tokens_seen": 32640440, - "step": 1527, - "time_per_iteration": 2.693847179412842 - }, - { - "auxiliary_loss_clip": 0.01173605, - "auxiliary_loss_mlp": 0.01052264, - "balance_loss_clip": 1.0534333, - "balance_loss_mlp": 1.02868414, - "epoch": 0.09186833007665715, - "flos": 23841848643840.0, - "grad_norm": 2.655631139677705, - "language_loss": 0.78546697, - "learning_rate": 3.959982807656753e-06, - "loss": 0.80772561, - "num_input_tokens_seen": 32660020, - "step": 1528, - "time_per_iteration": 2.774219512939453 - }, - { - "auxiliary_loss_clip": 0.01146017, - "auxiliary_loss_mlp": 0.01050376, - "balance_loss_clip": 1.0499053, - "balance_loss_mlp": 1.02827477, - "epoch": 0.09192845332932512, - "flos": 12932726065920.0, - "grad_norm": 2.682547324044482, - "language_loss": 0.76732361, - "learning_rate": 3.959905252114384e-06, - "loss": 0.78928751, - "num_input_tokens_seen": 32678170, - "step": 1529, - "time_per_iteration": 4.603156089782715 - }, - { - "auxiliary_loss_clip": 0.01186538, - "auxiliary_loss_mlp": 0.00780856, - "balance_loss_clip": 1.05415928, - "balance_loss_mlp": 1.00045025, - "epoch": 0.09198857658199308, - "flos": 24568177559040.0, - "grad_norm": 1.7410660090049153, - "language_loss": 0.82906747, - "learning_rate": 3.959827622252211e-06, - "loss": 0.84874141, - "num_input_tokens_seen": 32697540, - "step": 1530, - "time_per_iteration": 2.7118582725524902 - }, - { - "auxiliary_loss_clip": 0.01130108, - "auxiliary_loss_mlp": 0.0106509, - "balance_loss_clip": 1.04975331, - "balance_loss_mlp": 1.04220152, - "epoch": 0.09204869983466106, - "flos": 20266941600000.0, - "grad_norm": 2.182960664479704, - "language_loss": 0.84001881, - "learning_rate": 3.959749918073179e-06, - "loss": 0.86197078, - "num_input_tokens_seen": 32716805, - "step": 1531, - "time_per_iteration": 2.791947603225708 - }, - { - "auxiliary_loss_clip": 0.0113655, - "auxiliary_loss_mlp": 0.01051554, - "balance_loss_clip": 1.04906452, - "balance_loss_mlp": 1.02853465, - "epoch": 0.09210882308732903, - "flos": 20885646389760.0, - "grad_norm": 1.7570281394880602, - "language_loss": 0.81253195, - "learning_rate": 3.959672139580233e-06, - "loss": 0.83441293, - "num_input_tokens_seen": 32736385, - "step": 1532, - "time_per_iteration": 2.737739324569702 - }, - { - "auxiliary_loss_clip": 0.01157728, - "auxiliary_loss_mlp": 0.01056753, - "balance_loss_clip": 1.052163, - "balance_loss_mlp": 1.03385305, - "epoch": 0.09216894633999699, - "flos": 30956326727040.0, - "grad_norm": 2.2821036564882182, - "language_loss": 0.84194255, - "learning_rate": 3.9595942867763235e-06, - "loss": 0.86408734, - "num_input_tokens_seen": 32757140, - "step": 1533, - "time_per_iteration": 2.7542598247528076 - }, - { - "auxiliary_loss_clip": 0.01149262, - "auxiliary_loss_mlp": 0.01053623, - "balance_loss_clip": 1.05813503, - "balance_loss_mlp": 1.03190327, - "epoch": 0.09222906959266497, - "flos": 13151565676800.0, - "grad_norm": 1.9396914937933663, - "language_loss": 0.9009546, - "learning_rate": 3.959516359664402e-06, - "loss": 0.92298347, - "num_input_tokens_seen": 32774860, - "step": 1534, - "time_per_iteration": 2.6450984477996826 - }, - { - "auxiliary_loss_clip": 0.01150273, - "auxiliary_loss_mlp": 0.0106298, - "balance_loss_clip": 1.0495038, - "balance_loss_mlp": 1.03849435, - "epoch": 0.09228919284533293, - "flos": 25994477784960.0, - "grad_norm": 5.065477266086046, - "language_loss": 0.75779241, - "learning_rate": 3.959438358247424e-06, - "loss": 0.77992499, - "num_input_tokens_seen": 32795250, - "step": 1535, - "time_per_iteration": 2.730915069580078 - }, - { - "auxiliary_loss_clip": 0.01168283, - "auxiliary_loss_mlp": 0.01045276, - "balance_loss_clip": 1.05278873, - "balance_loss_mlp": 1.02403271, - "epoch": 0.0923493160980009, - "flos": 18660800954880.0, - "grad_norm": 1.8085584532497372, - "language_loss": 0.81631637, - "learning_rate": 3.959360282528346e-06, - "loss": 0.83845198, - "num_input_tokens_seen": 32813805, - "step": 1536, - "time_per_iteration": 2.7326817512512207 - }, - { - "auxiliary_loss_clip": 0.01181977, - "auxiliary_loss_mlp": 0.01053699, - "balance_loss_clip": 1.05431938, - "balance_loss_mlp": 1.03224182, - "epoch": 0.09240943935066886, - "flos": 21140576190720.0, - "grad_norm": 2.0929096884707556, - "language_loss": 0.89092755, - "learning_rate": 3.959282132510131e-06, - "loss": 0.9132843, - "num_input_tokens_seen": 32830960, - "step": 1537, - "time_per_iteration": 2.675771713256836 - }, - { - "auxiliary_loss_clip": 0.01157238, - "auxiliary_loss_mlp": 0.01058647, - "balance_loss_clip": 1.05114293, - "balance_loss_mlp": 1.03605688, - "epoch": 0.09246956260333684, - "flos": 20592435669120.0, - "grad_norm": 1.9480116987165197, - "language_loss": 0.80702311, - "learning_rate": 3.959203908195741e-06, - "loss": 0.82918191, - "num_input_tokens_seen": 32848275, - "step": 1538, - "time_per_iteration": 2.71618390083313 - }, - { - "auxiliary_loss_clip": 0.01060495, - "auxiliary_loss_mlp": 0.0101237, - "balance_loss_clip": 1.03095436, - "balance_loss_mlp": 1.00872231, - "epoch": 0.09252968585600481, - "flos": 67558710614400.0, - "grad_norm": 0.7534074452314953, - "language_loss": 0.57429332, - "learning_rate": 3.959125609588142e-06, - "loss": 0.59502202, - "num_input_tokens_seen": 32917730, - "step": 1539, - "time_per_iteration": 3.3933441638946533 - }, - { - "auxiliary_loss_clip": 0.01159831, - "auxiliary_loss_mlp": 0.01050602, - "balance_loss_clip": 1.05638027, - "balance_loss_mlp": 1.02863121, - "epoch": 0.09258980910867277, - "flos": 17383853479680.0, - "grad_norm": 2.849299216868502, - "language_loss": 0.67554641, - "learning_rate": 3.959047236690304e-06, - "loss": 0.69765073, - "num_input_tokens_seen": 32934910, - "step": 1540, - "time_per_iteration": 2.757084608078003 - }, - { - "auxiliary_loss_clip": 0.01144239, - "auxiliary_loss_mlp": 0.01048444, - "balance_loss_clip": 1.04954028, - "balance_loss_mlp": 1.026438, - "epoch": 0.09264993236134075, - "flos": 19865927185920.0, - "grad_norm": 2.044335478602743, - "language_loss": 0.83917534, - "learning_rate": 3.958968789505198e-06, - "loss": 0.86110216, - "num_input_tokens_seen": 32953840, - "step": 1541, - "time_per_iteration": 2.8497180938720703 - }, - { - "auxiliary_loss_clip": 0.01077839, - "auxiliary_loss_mlp": 0.01013078, - "balance_loss_clip": 1.02602255, - "balance_loss_mlp": 1.0097636, - "epoch": 0.09271005561400872, - "flos": 62284401262080.0, - "grad_norm": 0.8790732834061692, - "language_loss": 0.61881655, - "learning_rate": 3.9588902680358e-06, - "loss": 0.63972563, - "num_input_tokens_seen": 33011410, - "step": 1542, - "time_per_iteration": 3.3079330921173096 - }, - { - "auxiliary_loss_clip": 0.01161232, - "auxiliary_loss_mlp": 0.01059438, - "balance_loss_clip": 1.05441117, - "balance_loss_mlp": 1.03808808, - "epoch": 0.09277017886667668, - "flos": 23329870139520.0, - "grad_norm": 1.6256118826429122, - "language_loss": 0.82802349, - "learning_rate": 3.958811672285086e-06, - "loss": 0.85023022, - "num_input_tokens_seen": 33031675, - "step": 1543, - "time_per_iteration": 2.7408807277679443 - }, - { - "auxiliary_loss_clip": 0.01135873, - "auxiliary_loss_mlp": 0.01060295, - "balance_loss_clip": 1.04848838, - "balance_loss_mlp": 1.03863442, - "epoch": 0.09283030211934466, - "flos": 54745169875200.0, - "grad_norm": 1.706948475246468, - "language_loss": 0.72265279, - "learning_rate": 3.958733002256038e-06, - "loss": 0.74461448, - "num_input_tokens_seen": 33056355, - "step": 1544, - "time_per_iteration": 3.104156255722046 - }, - { - "auxiliary_loss_clip": 0.01166071, - "auxiliary_loss_mlp": 0.01055881, - "balance_loss_clip": 1.05165935, - "balance_loss_mlp": 1.03138375, - "epoch": 0.09289042537201263, - "flos": 30334784762880.0, - "grad_norm": 1.7720844214030114, - "language_loss": 0.77286768, - "learning_rate": 3.958654257951637e-06, - "loss": 0.79508722, - "num_input_tokens_seen": 33079520, - "step": 1545, - "time_per_iteration": 2.808180570602417 - }, - { - "auxiliary_loss_clip": 0.01140161, - "auxiliary_loss_mlp": 0.01050495, - "balance_loss_clip": 1.0526737, - "balance_loss_mlp": 1.02872682, - "epoch": 0.09295054862468059, - "flos": 17746838369280.0, - "grad_norm": 2.7089619481030076, - "language_loss": 0.74396008, - "learning_rate": 3.9585754393748706e-06, - "loss": 0.76586664, - "num_input_tokens_seen": 33096135, - "step": 1546, - "time_per_iteration": 2.7634081840515137 - }, - { - "auxiliary_loss_clip": 0.01163775, - "auxiliary_loss_mlp": 0.0105305, - "balance_loss_clip": 1.05357957, - "balance_loss_mlp": 1.02956545, - "epoch": 0.09301067187734856, - "flos": 23658021815040.0, - "grad_norm": 1.9423225100503794, - "language_loss": 0.84200966, - "learning_rate": 3.9584965465287275e-06, - "loss": 0.86417794, - "num_input_tokens_seen": 33115245, - "step": 1547, - "time_per_iteration": 2.790003776550293 - }, - { - "auxiliary_loss_clip": 0.01141839, - "auxiliary_loss_mlp": 0.01053941, - "balance_loss_clip": 1.04740989, - "balance_loss_mlp": 1.03195918, - "epoch": 0.09307079513001654, - "flos": 27527719777920.0, - "grad_norm": 2.6545433694843488, - "language_loss": 0.67698336, - "learning_rate": 3.958417579416199e-06, - "loss": 0.69894123, - "num_input_tokens_seen": 33136640, - "step": 1548, - "time_per_iteration": 2.8367013931274414 - }, - { - "auxiliary_loss_clip": 0.01123899, - "auxiliary_loss_mlp": 0.01059885, - "balance_loss_clip": 1.04744387, - "balance_loss_mlp": 1.03754544, - "epoch": 0.0931309183826845, - "flos": 20627340710400.0, - "grad_norm": 1.6829727803454704, - "language_loss": 0.8326273, - "learning_rate": 3.9583385380402795e-06, - "loss": 0.85446513, - "num_input_tokens_seen": 33155060, - "step": 1549, - "time_per_iteration": 2.8462016582489014 - }, - { - "auxiliary_loss_clip": 0.01176243, - "auxiliary_loss_mlp": 0.0104617, - "balance_loss_clip": 1.05815506, - "balance_loss_mlp": 1.02473652, - "epoch": 0.09319104163535247, - "flos": 29020921084800.0, - "grad_norm": 1.5528514681372962, - "language_loss": 0.75838119, - "learning_rate": 3.958259422403966e-06, - "loss": 0.78060532, - "num_input_tokens_seen": 33175420, - "step": 1550, - "time_per_iteration": 2.7325351238250732 - }, - { - "auxiliary_loss_clip": 0.01150315, - "auxiliary_loss_mlp": 0.01069257, - "balance_loss_clip": 1.05249369, - "balance_loss_mlp": 1.04483092, - "epoch": 0.09325116488802045, - "flos": 25301545539840.0, - "grad_norm": 2.1922696027472233, - "language_loss": 0.82828665, - "learning_rate": 3.95818023251026e-06, - "loss": 0.85048234, - "num_input_tokens_seen": 33194120, - "step": 1551, - "time_per_iteration": 2.852602481842041 - }, - { - "auxiliary_loss_clip": 0.01064371, - "auxiliary_loss_mlp": 0.00760109, - "balance_loss_clip": 1.02203059, - "balance_loss_mlp": 0.99984246, - "epoch": 0.09331128814068841, - "flos": 61536203942400.0, - "grad_norm": 0.7384225982202158, - "language_loss": 0.61837572, - "learning_rate": 3.958100968362163e-06, - "loss": 0.63662052, - "num_input_tokens_seen": 33261080, - "step": 1552, - "time_per_iteration": 3.3453099727630615 - }, - { - "auxiliary_loss_clip": 0.01059175, - "auxiliary_loss_mlp": 0.01016654, - "balance_loss_clip": 1.02415061, - "balance_loss_mlp": 1.01338792, - "epoch": 0.09337141139335638, - "flos": 53293700171520.0, - "grad_norm": 0.8524917480784928, - "language_loss": 0.58986926, - "learning_rate": 3.958021629962681e-06, - "loss": 0.61062753, - "num_input_tokens_seen": 33330235, - "step": 1553, - "time_per_iteration": 3.37673282623291 - }, - { - "auxiliary_loss_clip": 0.01146955, - "auxiliary_loss_mlp": 0.01056683, - "balance_loss_clip": 1.05026984, - "balance_loss_mlp": 1.03336585, - "epoch": 0.09343153464602436, - "flos": 23476852592640.0, - "grad_norm": 2.3365109182487, - "language_loss": 0.87665397, - "learning_rate": 3.957942217314823e-06, - "loss": 0.8986904, - "num_input_tokens_seen": 33349035, - "step": 1554, - "time_per_iteration": 2.8098127841949463 - }, - { - "auxiliary_loss_clip": 0.01153047, - "auxiliary_loss_mlp": 0.01057257, - "balance_loss_clip": 1.05439448, - "balance_loss_mlp": 1.03393972, - "epoch": 0.09349165789869232, - "flos": 19353481804800.0, - "grad_norm": 4.388884220182432, - "language_loss": 0.81678319, - "learning_rate": 3.957862730421599e-06, - "loss": 0.83888626, - "num_input_tokens_seen": 33368060, - "step": 1555, - "time_per_iteration": 2.726207971572876 - }, - { - "auxiliary_loss_clip": 0.01058869, - "auxiliary_loss_mlp": 0.01003892, - "balance_loss_clip": 1.0202632, - "balance_loss_mlp": 1.00045919, - "epoch": 0.09355178115136029, - "flos": 67502580635520.0, - "grad_norm": 0.8683826280274983, - "language_loss": 0.59606886, - "learning_rate": 3.957783169286024e-06, - "loss": 0.61669648, - "num_input_tokens_seen": 33430825, - "step": 1556, - "time_per_iteration": 3.209326982498169 - }, - { - "auxiliary_loss_clip": 0.01174249, - "auxiliary_loss_mlp": 0.01059741, - "balance_loss_clip": 1.05518138, - "balance_loss_mlp": 1.03727031, - "epoch": 0.09361190440402825, - "flos": 37341638720640.0, - "grad_norm": 1.6803158790244075, - "language_loss": 0.84290808, - "learning_rate": 3.9577035339111155e-06, - "loss": 0.86524796, - "num_input_tokens_seen": 33454855, - "step": 1557, - "time_per_iteration": 2.831650733947754 - }, - { - "auxiliary_loss_clip": 0.01110857, - "auxiliary_loss_mlp": 0.01065156, - "balance_loss_clip": 1.04900038, - "balance_loss_mlp": 1.04112351, - "epoch": 0.09367202765669623, - "flos": 24899705112960.0, - "grad_norm": 1.6725809358966677, - "language_loss": 0.780913, - "learning_rate": 3.957623824299893e-06, - "loss": 0.8026731, - "num_input_tokens_seen": 33476000, - "step": 1558, - "time_per_iteration": 3.0111780166625977 - }, - { - "auxiliary_loss_clip": 0.01164994, - "auxiliary_loss_mlp": 0.01051229, - "balance_loss_clip": 1.0558666, - "balance_loss_mlp": 1.02881753, - "epoch": 0.0937321509093642, - "flos": 15705568368000.0, - "grad_norm": 2.0141986314124414, - "language_loss": 0.80066288, - "learning_rate": 3.957544040455379e-06, - "loss": 0.82282507, - "num_input_tokens_seen": 33493845, - "step": 1559, - "time_per_iteration": 3.024117946624756 - }, - { - "auxiliary_loss_clip": 0.01141277, - "auxiliary_loss_mlp": 0.01061718, - "balance_loss_clip": 1.05060387, - "balance_loss_mlp": 1.04012942, - "epoch": 0.09379227416203216, - "flos": 20483698222080.0, - "grad_norm": 1.8358373674042003, - "language_loss": 0.76418209, - "learning_rate": 3.957464182380599e-06, - "loss": 0.78621197, - "num_input_tokens_seen": 33510850, - "step": 1560, - "time_per_iteration": 2.68558406829834 - }, - { - "auxiliary_loss_clip": 0.01137939, - "auxiliary_loss_mlp": 0.01054925, - "balance_loss_clip": 1.05014277, - "balance_loss_mlp": 1.03213274, - "epoch": 0.09385239741470014, - "flos": 24352498344960.0, - "grad_norm": 3.575155933252121, - "language_loss": 0.80784953, - "learning_rate": 3.95738425007858e-06, - "loss": 0.82977819, - "num_input_tokens_seen": 33530430, - "step": 1561, - "time_per_iteration": 2.759148359298706 - }, - { - "auxiliary_loss_clip": 0.01173652, - "auxiliary_loss_mlp": 0.01052448, - "balance_loss_clip": 1.05276573, - "balance_loss_mlp": 1.02989376, - "epoch": 0.0939125206673681, - "flos": 33291489807360.0, - "grad_norm": 2.448664627367939, - "language_loss": 0.6140722, - "learning_rate": 3.957304243552354e-06, - "loss": 0.63633323, - "num_input_tokens_seen": 33551975, - "step": 1562, - "time_per_iteration": 2.9014978408813477 - }, - { - "auxiliary_loss_clip": 0.01162693, - "auxiliary_loss_mlp": 0.0106374, - "balance_loss_clip": 1.05719543, - "balance_loss_mlp": 1.04213953, - "epoch": 0.09397264392003607, - "flos": 19244923925760.0, - "grad_norm": 3.5098220300578555, - "language_loss": 0.8496151, - "learning_rate": 3.957224162804956e-06, - "loss": 0.87187934, - "num_input_tokens_seen": 33569850, - "step": 1563, - "time_per_iteration": 4.404061555862427 - }, - { - "auxiliary_loss_clip": 0.01164811, - "auxiliary_loss_mlp": 0.01047932, - "balance_loss_clip": 1.05775142, - "balance_loss_mlp": 1.02652228, - "epoch": 0.09403276717270405, - "flos": 19317930318720.0, - "grad_norm": 1.6765528861156813, - "language_loss": 0.76511294, - "learning_rate": 3.9571440078394205e-06, - "loss": 0.78724039, - "num_input_tokens_seen": 33590510, - "step": 1564, - "time_per_iteration": 4.255565166473389 - }, - { - "auxiliary_loss_clip": 0.01151297, - "auxiliary_loss_mlp": 0.01063256, - "balance_loss_clip": 1.05196142, - "balance_loss_mlp": 1.04172707, - "epoch": 0.09409289042537201, - "flos": 23583471137280.0, - "grad_norm": 1.9762038777899962, - "language_loss": 0.80134326, - "learning_rate": 3.9570637786587895e-06, - "loss": 0.82348871, - "num_input_tokens_seen": 33608810, - "step": 1565, - "time_per_iteration": 2.8548545837402344 - }, - { - "auxiliary_loss_clip": 0.01158602, - "auxiliary_loss_mlp": 0.01063767, - "balance_loss_clip": 1.05420566, - "balance_loss_mlp": 1.04233313, - "epoch": 0.09415301367803998, - "flos": 20078446003200.0, - "grad_norm": 1.6810250981626251, - "language_loss": 0.75134379, - "learning_rate": 3.956983475266103e-06, - "loss": 0.77356744, - "num_input_tokens_seen": 33627265, - "step": 1566, - "time_per_iteration": 4.889045715332031 - }, - { - "auxiliary_loss_clip": 0.01145856, - "auxiliary_loss_mlp": 0.00780689, - "balance_loss_clip": 1.05168366, - "balance_loss_mlp": 1.00022864, - "epoch": 0.09421313693070796, - "flos": 21062075016960.0, - "grad_norm": 1.6828919748843199, - "language_loss": 0.77958012, - "learning_rate": 3.956903097664407e-06, - "loss": 0.79884553, - "num_input_tokens_seen": 33644810, - "step": 1567, - "time_per_iteration": 4.445765972137451 - }, - { - "auxiliary_loss_clip": 0.01156815, - "auxiliary_loss_mlp": 0.01056228, - "balance_loss_clip": 1.05256855, - "balance_loss_mlp": 1.03591454, - "epoch": 0.09427326018337592, - "flos": 24316156759680.0, - "grad_norm": 2.008686295040646, - "language_loss": 0.82608044, - "learning_rate": 3.956822645856749e-06, - "loss": 0.84821093, - "num_input_tokens_seen": 33665665, - "step": 1568, - "time_per_iteration": 2.881535768508911 - }, - { - "auxiliary_loss_clip": 0.01187915, - "auxiliary_loss_mlp": 0.01051731, - "balance_loss_clip": 1.05717778, - "balance_loss_mlp": 1.02927184, - "epoch": 0.09433338343604389, - "flos": 20263888944000.0, - "grad_norm": 1.9573151026586577, - "language_loss": 0.76943743, - "learning_rate": 3.9567421198461814e-06, - "loss": 0.79183388, - "num_input_tokens_seen": 33684760, - "step": 1569, - "time_per_iteration": 2.6097726821899414 - }, - { - "auxiliary_loss_clip": 0.01120191, - "auxiliary_loss_mlp": 0.01060805, - "balance_loss_clip": 1.04771852, - "balance_loss_mlp": 1.03625941, - "epoch": 0.09439350668871185, - "flos": 12742973493120.0, - "grad_norm": 3.3813700161908917, - "language_loss": 0.85488856, - "learning_rate": 3.956661519635756e-06, - "loss": 0.87669849, - "num_input_tokens_seen": 33700750, - "step": 1570, - "time_per_iteration": 2.7571377754211426 - }, - { - "auxiliary_loss_clip": 0.01122458, - "auxiliary_loss_mlp": 0.01055939, - "balance_loss_clip": 1.04927301, - "balance_loss_mlp": 1.03183508, - "epoch": 0.09445362994137983, - "flos": 25962266263680.0, - "grad_norm": 1.540414635950846, - "language_loss": 0.76415235, - "learning_rate": 3.95658084522853e-06, - "loss": 0.7859363, - "num_input_tokens_seen": 33724430, - "step": 1571, - "time_per_iteration": 2.913569211959839 - }, - { - "auxiliary_loss_clip": 0.01135683, - "auxiliary_loss_mlp": 0.01057111, - "balance_loss_clip": 1.0490278, - "balance_loss_mlp": 1.0349735, - "epoch": 0.0945137531940478, - "flos": 19715353372800.0, - "grad_norm": 1.6745378641752047, - "language_loss": 0.79397607, - "learning_rate": 3.956500096627561e-06, - "loss": 0.81590402, - "num_input_tokens_seen": 33743455, - "step": 1572, - "time_per_iteration": 2.813410758972168 - }, - { - "auxiliary_loss_clip": 0.01148251, - "auxiliary_loss_mlp": 0.0106927, - "balance_loss_clip": 1.05619979, - "balance_loss_mlp": 1.04524922, - "epoch": 0.09457387644671576, - "flos": 23617047375360.0, - "grad_norm": 1.7559396294879055, - "language_loss": 0.87707287, - "learning_rate": 3.956419273835913e-06, - "loss": 0.89924812, - "num_input_tokens_seen": 33763435, - "step": 1573, - "time_per_iteration": 2.776535987854004 - }, - { - "auxiliary_loss_clip": 0.01161183, - "auxiliary_loss_mlp": 0.01063326, - "balance_loss_clip": 1.05485129, - "balance_loss_mlp": 1.03804219, - "epoch": 0.09463399969938374, - "flos": 26907291135360.0, - "grad_norm": 2.9707854698090097, - "language_loss": 0.81982428, - "learning_rate": 3.95633837685665e-06, - "loss": 0.84206939, - "num_input_tokens_seen": 33784325, - "step": 1574, - "time_per_iteration": 2.7604806423187256 - }, - { - "auxiliary_loss_clip": 0.01156287, - "auxiliary_loss_mlp": 0.01055594, - "balance_loss_clip": 1.05234718, - "balance_loss_mlp": 1.0344342, - "epoch": 0.0946941229520517, - "flos": 23659566099840.0, - "grad_norm": 1.7178511535677499, - "language_loss": 0.80855322, - "learning_rate": 3.95625740569284e-06, - "loss": 0.83067203, - "num_input_tokens_seen": 33802510, - "step": 1575, - "time_per_iteration": 2.713247299194336 - }, - { - "auxiliary_loss_clip": 0.01182326, - "auxiliary_loss_mlp": 0.01068689, - "balance_loss_clip": 1.05578864, - "balance_loss_mlp": 1.04581285, - "epoch": 0.09475424620471967, - "flos": 24134053783680.0, - "grad_norm": 1.9110861379460222, - "language_loss": 0.86483347, - "learning_rate": 3.956176360347553e-06, - "loss": 0.88734365, - "num_input_tokens_seen": 33819980, - "step": 1576, - "time_per_iteration": 2.682644844055176 - }, - { - "auxiliary_loss_clip": 0.01056441, - "auxiliary_loss_mlp": 0.01027284, - "balance_loss_clip": 1.0225811, - "balance_loss_mlp": 1.02344561, - "epoch": 0.09481436945738765, - "flos": 68426168065920.0, - "grad_norm": 0.9789918611127905, - "language_loss": 0.6582402, - "learning_rate": 3.956095240823862e-06, - "loss": 0.67907751, - "num_input_tokens_seen": 33878925, - "step": 1577, - "time_per_iteration": 3.2106685638427734 - }, - { - "auxiliary_loss_clip": 0.01147668, - "auxiliary_loss_mlp": 0.01051958, - "balance_loss_clip": 1.05218005, - "balance_loss_mlp": 1.03098869, - "epoch": 0.09487449271005562, - "flos": 16654076858880.0, - "grad_norm": 1.8223175005615506, - "language_loss": 0.79152733, - "learning_rate": 3.956014047124844e-06, - "loss": 0.81352365, - "num_input_tokens_seen": 33897600, - "step": 1578, - "time_per_iteration": 2.820089340209961 - }, - { - "auxiliary_loss_clip": 0.01185941, - "auxiliary_loss_mlp": 0.01066432, - "balance_loss_clip": 1.05838132, - "balance_loss_mlp": 1.04437804, - "epoch": 0.09493461596272358, - "flos": 24275685110400.0, - "grad_norm": 3.480730999818176, - "language_loss": 0.78161818, - "learning_rate": 3.955932779253578e-06, - "loss": 0.80414188, - "num_input_tokens_seen": 33917365, - "step": 1579, - "time_per_iteration": 2.6518983840942383 - }, - { - "auxiliary_loss_clip": 0.01128319, - "auxiliary_loss_mlp": 0.01065633, - "balance_loss_clip": 1.04771328, - "balance_loss_mlp": 1.04001498, - "epoch": 0.09499473921539155, - "flos": 21870173243520.0, - "grad_norm": 2.0084876987684526, - "language_loss": 0.73410392, - "learning_rate": 3.955851437213144e-06, - "loss": 0.75604343, - "num_input_tokens_seen": 33936680, - "step": 1580, - "time_per_iteration": 2.679461717605591 - }, - { - "auxiliary_loss_clip": 0.01157568, - "auxiliary_loss_mlp": 0.01062628, - "balance_loss_clip": 1.05573344, - "balance_loss_mlp": 1.04095626, - "epoch": 0.09505486246805953, - "flos": 33547137880320.0, - "grad_norm": 14.809542792179553, - "language_loss": 0.77565914, - "learning_rate": 3.955770021006627e-06, - "loss": 0.7978611, - "num_input_tokens_seen": 33960685, - "step": 1581, - "time_per_iteration": 2.765394449234009 - }, - { - "auxiliary_loss_clip": 0.01144835, - "auxiliary_loss_mlp": 0.0106468, - "balance_loss_clip": 1.05426359, - "balance_loss_mlp": 1.04276967, - "epoch": 0.09511498572072749, - "flos": 21215342350080.0, - "grad_norm": 1.8617167187056045, - "language_loss": 0.87230825, - "learning_rate": 3.955688530637116e-06, - "loss": 0.89440346, - "num_input_tokens_seen": 33980015, - "step": 1582, - "time_per_iteration": 2.691364288330078 - }, - { - "auxiliary_loss_clip": 0.01174295, - "auxiliary_loss_mlp": 0.0106431, - "balance_loss_clip": 1.05508888, - "balance_loss_mlp": 1.04039705, - "epoch": 0.09517510897339546, - "flos": 14611262572800.0, - "grad_norm": 1.8512060219658202, - "language_loss": 0.67043924, - "learning_rate": 3.955606966107699e-06, - "loss": 0.69282532, - "num_input_tokens_seen": 33997705, - "step": 1583, - "time_per_iteration": 2.6693732738494873 - }, - { - "auxiliary_loss_clip": 0.01177751, - "auxiliary_loss_mlp": 0.01053743, - "balance_loss_clip": 1.0593859, - "balance_loss_mlp": 1.03035378, - "epoch": 0.09523523222606343, - "flos": 27817339138560.0, - "grad_norm": 2.144216926782962, - "language_loss": 0.70752859, - "learning_rate": 3.95552532742147e-06, - "loss": 0.7298435, - "num_input_tokens_seen": 34017465, - "step": 1584, - "time_per_iteration": 2.7164390087127686 - }, - { - "auxiliary_loss_clip": 0.01138507, - "auxiliary_loss_mlp": 0.0105762, - "balance_loss_clip": 1.05243039, - "balance_loss_mlp": 1.03584039, - "epoch": 0.0952953554787314, - "flos": 20706272847360.0, - "grad_norm": 1.4654737580846544, - "language_loss": 0.8080442, - "learning_rate": 3.955443614581525e-06, - "loss": 0.83000553, - "num_input_tokens_seen": 34038550, - "step": 1585, - "time_per_iteration": 2.879831314086914 - }, - { - "auxiliary_loss_clip": 0.01159374, - "auxiliary_loss_mlp": 0.01057717, - "balance_loss_clip": 1.05387473, - "balance_loss_mlp": 1.03355336, - "epoch": 0.09535547873139937, - "flos": 24787627701120.0, - "grad_norm": 1.638250735795891, - "language_loss": 0.71921158, - "learning_rate": 3.955361827590961e-06, - "loss": 0.74138248, - "num_input_tokens_seen": 34058665, - "step": 1586, - "time_per_iteration": 2.750436544418335 - }, - { - "auxiliary_loss_clip": 0.01048565, - "auxiliary_loss_mlp": 0.01003302, - "balance_loss_clip": 1.03115988, - "balance_loss_mlp": 0.99901009, - "epoch": 0.09541560198406734, - "flos": 71912194905600.0, - "grad_norm": 0.8099482252624973, - "language_loss": 0.55475175, - "learning_rate": 3.955279966452883e-06, - "loss": 0.57527041, - "num_input_tokens_seen": 34109655, - "step": 1587, - "time_per_iteration": 3.0975699424743652 - }, - { - "auxiliary_loss_clip": 0.01128884, - "auxiliary_loss_mlp": 0.0105965, - "balance_loss_clip": 1.04768586, - "balance_loss_mlp": 1.03661847, - "epoch": 0.09547572523673531, - "flos": 28982604251520.0, - "grad_norm": 1.708481785076906, - "language_loss": 0.81062275, - "learning_rate": 3.955198031170391e-06, - "loss": 0.83250809, - "num_input_tokens_seen": 34131115, - "step": 1588, - "time_per_iteration": 2.7718451023101807 - }, - { - "auxiliary_loss_clip": 0.01131602, - "auxiliary_loss_mlp": 0.01056117, - "balance_loss_clip": 1.04894614, - "balance_loss_mlp": 1.03438473, - "epoch": 0.09553584848940327, - "flos": 24133910129280.0, - "grad_norm": 1.5119879232668088, - "language_loss": 0.81481898, - "learning_rate": 3.955116021746594e-06, - "loss": 0.83669615, - "num_input_tokens_seen": 34151925, - "step": 1589, - "time_per_iteration": 2.782468795776367 - }, - { - "auxiliary_loss_clip": 0.0112194, - "auxiliary_loss_mlp": 0.00780573, - "balance_loss_clip": 1.0508883, - "balance_loss_mlp": 1.00013089, - "epoch": 0.09559597174207124, - "flos": 42851376789120.0, - "grad_norm": 1.525287399882202, - "language_loss": 0.64882791, - "learning_rate": 3.955033938184601e-06, - "loss": 0.667853, - "num_input_tokens_seen": 34175395, - "step": 1590, - "time_per_iteration": 3.0783450603485107 - }, - { - "auxiliary_loss_clip": 0.01143501, - "auxiliary_loss_mlp": 0.01058399, - "balance_loss_clip": 1.05087948, - "balance_loss_mlp": 1.0358206, - "epoch": 0.09565609499473922, - "flos": 32670845683200.0, - "grad_norm": 2.0745314237741916, - "language_loss": 0.83290577, - "learning_rate": 3.954951780487526e-06, - "loss": 0.85492468, - "num_input_tokens_seen": 34197760, - "step": 1591, - "time_per_iteration": 2.8393962383270264 - }, - { - "auxiliary_loss_clip": 0.01163486, - "auxiliary_loss_mlp": 0.01065588, - "balance_loss_clip": 1.0522387, - "balance_loss_mlp": 1.04266405, - "epoch": 0.09571621824740718, - "flos": 18478410670080.0, - "grad_norm": 2.825705290827541, - "language_loss": 0.74087322, - "learning_rate": 3.9548695486584835e-06, - "loss": 0.76316392, - "num_input_tokens_seen": 34215330, - "step": 1592, - "time_per_iteration": 2.6828882694244385 - }, - { - "auxiliary_loss_clip": 0.01169239, - "auxiliary_loss_mlp": 0.01055073, - "balance_loss_clip": 1.05161428, - "balance_loss_mlp": 1.03337741, - "epoch": 0.09577634150007515, - "flos": 29387497334400.0, - "grad_norm": 2.18277080043521, - "language_loss": 0.74483889, - "learning_rate": 3.954787242700592e-06, - "loss": 0.76708198, - "num_input_tokens_seen": 34237745, - "step": 1593, - "time_per_iteration": 2.7193498611450195 - }, - { - "auxiliary_loss_clip": 0.01177343, - "auxiliary_loss_mlp": 0.01055096, - "balance_loss_clip": 1.05910873, - "balance_loss_mlp": 1.03307831, - "epoch": 0.09583646475274313, - "flos": 22747830157440.0, - "grad_norm": 1.887493467708827, - "language_loss": 0.69782627, - "learning_rate": 3.954704862616971e-06, - "loss": 0.72015071, - "num_input_tokens_seen": 34256565, - "step": 1594, - "time_per_iteration": 2.635383367538452 - }, - { - "auxiliary_loss_clip": 0.01173222, - "auxiliary_loss_mlp": 0.01051806, - "balance_loss_clip": 1.05618978, - "balance_loss_mlp": 1.03037214, - "epoch": 0.0958965880054111, - "flos": 23218367345280.0, - "grad_norm": 2.1411006117727682, - "language_loss": 0.82780552, - "learning_rate": 3.954622408410747e-06, - "loss": 0.85005581, - "num_input_tokens_seen": 34275970, - "step": 1595, - "time_per_iteration": 2.7158257961273193 - }, - { - "auxiliary_loss_clip": 0.01153253, - "auxiliary_loss_mlp": 0.01054246, - "balance_loss_clip": 1.05143809, - "balance_loss_mlp": 1.0301652, - "epoch": 0.09595671125807906, - "flos": 21324438933120.0, - "grad_norm": 1.7751890788987925, - "language_loss": 0.84513396, - "learning_rate": 3.954539880085045e-06, - "loss": 0.86720896, - "num_input_tokens_seen": 34295490, - "step": 1596, - "time_per_iteration": 2.710228204727173 - }, - { - "auxiliary_loss_clip": 0.01166586, - "auxiliary_loss_mlp": 0.0105804, - "balance_loss_clip": 1.05440903, - "balance_loss_mlp": 1.03376901, - "epoch": 0.09601683451074704, - "flos": 39603472185600.0, - "grad_norm": 1.8335529067237837, - "language_loss": 0.69328064, - "learning_rate": 3.9544572776429945e-06, - "loss": 0.71552688, - "num_input_tokens_seen": 34319990, - "step": 1597, - "time_per_iteration": 2.802959442138672 - }, - { - "auxiliary_loss_clip": 0.01167235, - "auxiliary_loss_mlp": 0.00780978, - "balance_loss_clip": 1.0503217, - "balance_loss_mlp": 1.00010371, - "epoch": 0.096076957763415, - "flos": 23732716147200.0, - "grad_norm": 2.0491570740921885, - "language_loss": 0.7486403, - "learning_rate": 3.954374601087729e-06, - "loss": 0.76812243, - "num_input_tokens_seen": 34339225, - "step": 1598, - "time_per_iteration": 2.6502270698547363 - }, - { - "auxiliary_loss_clip": 0.01176661, - "auxiliary_loss_mlp": 0.01053936, - "balance_loss_clip": 1.05745888, - "balance_loss_mlp": 1.03009462, - "epoch": 0.09613708101608297, - "flos": 34678108483200.0, - "grad_norm": 1.6831440826618358, - "language_loss": 0.68804371, - "learning_rate": 3.954291850422382e-06, - "loss": 0.71034968, - "num_input_tokens_seen": 34361020, - "step": 1599, - "time_per_iteration": 2.74243426322937 - }, - { - "auxiliary_loss_clip": 0.01157322, - "auxiliary_loss_mlp": 0.01059883, - "balance_loss_clip": 1.05754852, - "balance_loss_mlp": 1.0371263, - "epoch": 0.09619720426875093, - "flos": 20740028653440.0, - "grad_norm": 2.9774251326108367, - "language_loss": 0.83950365, - "learning_rate": 3.954209025650093e-06, - "loss": 0.86167574, - "num_input_tokens_seen": 34378630, - "step": 1600, - "time_per_iteration": 2.702907085418701 - }, - { - "auxiliary_loss_clip": 0.01150263, - "auxiliary_loss_mlp": 0.01054168, - "balance_loss_clip": 1.05129707, - "balance_loss_mlp": 1.03093433, - "epoch": 0.09625732752141891, - "flos": 13042720488960.0, - "grad_norm": 2.287254549480118, - "language_loss": 0.80520785, - "learning_rate": 3.954126126774001e-06, - "loss": 0.82725215, - "num_input_tokens_seen": 34397110, - "step": 1601, - "time_per_iteration": 2.693399429321289 - }, - { - "auxiliary_loss_clip": 0.01181247, - "auxiliary_loss_mlp": 0.01054578, - "balance_loss_clip": 1.05711937, - "balance_loss_mlp": 1.03133249, - "epoch": 0.09631745077408688, - "flos": 22273629782400.0, - "grad_norm": 2.4356926646094954, - "language_loss": 0.81959623, - "learning_rate": 3.954043153797251e-06, - "loss": 0.84195447, - "num_input_tokens_seen": 34414165, - "step": 1602, - "time_per_iteration": 2.639479875564575 - }, - { - "auxiliary_loss_clip": 0.01137855, - "auxiliary_loss_mlp": 0.01051495, - "balance_loss_clip": 1.05295444, - "balance_loss_mlp": 1.02681863, - "epoch": 0.09637757402675484, - "flos": 24754266944640.0, - "grad_norm": 3.099164686790191, - "language_loss": 0.62498438, - "learning_rate": 3.953960106722989e-06, - "loss": 0.64687788, - "num_input_tokens_seen": 34434445, - "step": 1603, - "time_per_iteration": 4.341834306716919 - }, - { - "auxiliary_loss_clip": 0.01189954, - "auxiliary_loss_mlp": 0.01054376, - "balance_loss_clip": 1.05902839, - "balance_loss_mlp": 1.02918696, - "epoch": 0.09643769727942282, - "flos": 22525758322560.0, - "grad_norm": 3.121905357886113, - "language_loss": 0.70996022, - "learning_rate": 3.953876985554364e-06, - "loss": 0.73240346, - "num_input_tokens_seen": 34453095, - "step": 1604, - "time_per_iteration": 2.6520893573760986 - }, - { - "auxiliary_loss_clip": 0.01176446, - "auxiliary_loss_mlp": 0.01055314, - "balance_loss_clip": 1.0570209, - "balance_loss_mlp": 1.03358221, - "epoch": 0.09649782053209079, - "flos": 30921026636160.0, - "grad_norm": 2.082890345500055, - "language_loss": 0.7993719, - "learning_rate": 3.953793790294527e-06, - "loss": 0.82168949, - "num_input_tokens_seen": 34473680, - "step": 1605, - "time_per_iteration": 4.5557661056518555 - }, - { - "auxiliary_loss_clip": 0.01161047, - "auxiliary_loss_mlp": 0.01047918, - "balance_loss_clip": 1.05455577, - "balance_loss_mlp": 1.0245893, - "epoch": 0.09655794378475875, - "flos": 25337635729920.0, - "grad_norm": 1.990204665194141, - "language_loss": 0.74550986, - "learning_rate": 3.953710520946634e-06, - "loss": 0.76759952, - "num_input_tokens_seen": 34492610, - "step": 1606, - "time_per_iteration": 2.7172651290893555 - }, - { - "auxiliary_loss_clip": 0.01172416, - "auxiliary_loss_mlp": 0.01046772, - "balance_loss_clip": 1.05834222, - "balance_loss_mlp": 1.02378857, - "epoch": 0.09661806703742673, - "flos": 22346061557760.0, - "grad_norm": 1.6403710807101601, - "language_loss": 0.7571919, - "learning_rate": 3.953627177513843e-06, - "loss": 0.77938372, - "num_input_tokens_seen": 34511855, - "step": 1607, - "time_per_iteration": 4.302686452865601 - }, - { - "auxiliary_loss_clip": 0.01139491, - "auxiliary_loss_mlp": 0.01051546, - "balance_loss_clip": 1.04833579, - "balance_loss_mlp": 1.0289799, - "epoch": 0.0966781902900947, - "flos": 17457578144640.0, - "grad_norm": 1.975850982703557, - "language_loss": 0.86756283, - "learning_rate": 3.953543759999312e-06, - "loss": 0.88947326, - "num_input_tokens_seen": 34528905, - "step": 1608, - "time_per_iteration": 2.6280455589294434 - }, - { - "auxiliary_loss_clip": 0.01126253, - "auxiliary_loss_mlp": 0.01064704, - "balance_loss_clip": 1.05433142, - "balance_loss_mlp": 1.03940821, - "epoch": 0.09673831354276266, - "flos": 36903995412480.0, - "grad_norm": 2.3082762386200266, - "language_loss": 0.71363097, - "learning_rate": 3.953460268406207e-06, - "loss": 0.73554057, - "num_input_tokens_seen": 34548480, - "step": 1609, - "time_per_iteration": 2.9116146564483643 - }, - { - "auxiliary_loss_clip": 0.01149353, - "auxiliary_loss_mlp": 0.01058179, - "balance_loss_clip": 1.0546515, - "balance_loss_mlp": 1.03606534, - "epoch": 0.09679843679543064, - "flos": 20701388597760.0, - "grad_norm": 1.9988414994799784, - "language_loss": 0.84810984, - "learning_rate": 3.953376702737693e-06, - "loss": 0.87018514, - "num_input_tokens_seen": 34565410, - "step": 1610, - "time_per_iteration": 2.8005051612854004 - }, - { - "auxiliary_loss_clip": 0.01161389, - "auxiliary_loss_mlp": 0.01056267, - "balance_loss_clip": 1.05790925, - "balance_loss_mlp": 1.03228188, - "epoch": 0.0968585600480986, - "flos": 23514415240320.0, - "grad_norm": 2.176236379770122, - "language_loss": 0.6696198, - "learning_rate": 3.953293062996939e-06, - "loss": 0.69179636, - "num_input_tokens_seen": 34584840, - "step": 1611, - "time_per_iteration": 2.731931447982788 - }, - { - "auxiliary_loss_clip": 0.01125259, - "auxiliary_loss_mlp": 0.01057116, - "balance_loss_clip": 1.04740572, - "balance_loss_mlp": 1.03385806, - "epoch": 0.09691868330076657, - "flos": 20121072468480.0, - "grad_norm": 1.6508278294088392, - "language_loss": 0.81067657, - "learning_rate": 3.953209349187115e-06, - "loss": 0.83250034, - "num_input_tokens_seen": 34603360, - "step": 1612, - "time_per_iteration": 2.7998390197753906 - }, - { - "auxiliary_loss_clip": 0.01182404, - "auxiliary_loss_mlp": 0.01069551, - "balance_loss_clip": 1.06046534, - "balance_loss_mlp": 1.04600716, - "epoch": 0.09697880655343454, - "flos": 16544692967040.0, - "grad_norm": 3.304939197664143, - "language_loss": 0.80836105, - "learning_rate": 3.953125561311398e-06, - "loss": 0.83088064, - "num_input_tokens_seen": 34620760, - "step": 1613, - "time_per_iteration": 2.624218702316284 - }, - { - "auxiliary_loss_clip": 0.01148565, - "auxiliary_loss_mlp": 0.01054743, - "balance_loss_clip": 1.05542159, - "balance_loss_mlp": 1.03047192, - "epoch": 0.09703892980610251, - "flos": 26104184899200.0, - "grad_norm": 1.7164386274315457, - "language_loss": 0.84289789, - "learning_rate": 3.953041699372964e-06, - "loss": 0.86493099, - "num_input_tokens_seen": 34640695, - "step": 1614, - "time_per_iteration": 2.744340419769287 - }, - { - "auxiliary_loss_clip": 0.01066618, - "auxiliary_loss_mlp": 0.00759744, - "balance_loss_clip": 1.02654934, - "balance_loss_mlp": 1.00008702, - "epoch": 0.09709905305877048, - "flos": 60443622000000.0, - "grad_norm": 0.7127167896900892, - "language_loss": 0.54629624, - "learning_rate": 3.952957763374992e-06, - "loss": 0.56455994, - "num_input_tokens_seen": 34702395, - "step": 1615, - "time_per_iteration": 3.1547679901123047 - }, - { - "auxiliary_loss_clip": 0.01033143, - "auxiliary_loss_mlp": 0.01017555, - "balance_loss_clip": 1.02384067, - "balance_loss_mlp": 1.01381195, - "epoch": 0.09715917631143844, - "flos": 57639932893440.0, - "grad_norm": 0.7689373847786285, - "language_loss": 0.58190405, - "learning_rate": 3.952873753320666e-06, - "loss": 0.60241103, - "num_input_tokens_seen": 34768910, - "step": 1616, - "time_per_iteration": 3.3940556049346924 - }, - { - "auxiliary_loss_clip": 0.01155533, - "auxiliary_loss_mlp": 0.01067983, - "balance_loss_clip": 1.05504358, - "balance_loss_mlp": 1.04205465, - "epoch": 0.09721929956410642, - "flos": 20558212986240.0, - "grad_norm": 1.8932449927934136, - "language_loss": 0.69031835, - "learning_rate": 3.952789669213172e-06, - "loss": 0.7125535, - "num_input_tokens_seen": 34787680, - "step": 1617, - "time_per_iteration": 2.714629888534546 - }, - { - "auxiliary_loss_clip": 0.01152637, - "auxiliary_loss_mlp": 0.01057882, - "balance_loss_clip": 1.05386162, - "balance_loss_mlp": 1.03127456, - "epoch": 0.09727942281677439, - "flos": 27344359825920.0, - "grad_norm": 1.755493071880773, - "language_loss": 0.80910909, - "learning_rate": 3.952705511055698e-06, - "loss": 0.83121431, - "num_input_tokens_seen": 34808330, - "step": 1618, - "time_per_iteration": 2.8081507682800293 - }, - { - "auxiliary_loss_clip": 0.01168356, - "auxiliary_loss_mlp": 0.01058179, - "balance_loss_clip": 1.06048679, - "balance_loss_mlp": 1.03678131, - "epoch": 0.09733954606944235, - "flos": 24900028335360.0, - "grad_norm": 1.667659488760432, - "language_loss": 0.92901695, - "learning_rate": 3.952621278851435e-06, - "loss": 0.95128226, - "num_input_tokens_seen": 34830020, - "step": 1619, - "time_per_iteration": 2.7752275466918945 - }, - { - "auxiliary_loss_clip": 0.01175515, - "auxiliary_loss_mlp": 0.01058252, - "balance_loss_clip": 1.05952573, - "balance_loss_mlp": 1.03512526, - "epoch": 0.09739966932211033, - "flos": 31503928544640.0, - "grad_norm": 2.1973967195348902, - "language_loss": 0.88978708, - "learning_rate": 3.9525369726035784e-06, - "loss": 0.91212475, - "num_input_tokens_seen": 34850330, - "step": 1620, - "time_per_iteration": 2.771176338195801 - }, - { - "auxiliary_loss_clip": 0.01153763, - "auxiliary_loss_mlp": 0.01065329, - "balance_loss_clip": 1.05353975, - "balance_loss_mlp": 1.0397464, - "epoch": 0.0974597925747783, - "flos": 23878764846720.0, - "grad_norm": 2.154793183838835, - "language_loss": 0.77331412, - "learning_rate": 3.952452592315324e-06, - "loss": 0.79550499, - "num_input_tokens_seen": 34871640, - "step": 1621, - "time_per_iteration": 2.6740832328796387 - }, - { - "auxiliary_loss_clip": 0.01131342, - "auxiliary_loss_mlp": 0.01082359, - "balance_loss_clip": 1.04798269, - "balance_loss_mlp": 1.05640674, - "epoch": 0.09751991582744626, - "flos": 17019575700480.0, - "grad_norm": 1.9420195171733425, - "language_loss": 0.77671158, - "learning_rate": 3.952368137989871e-06, - "loss": 0.79884863, - "num_input_tokens_seen": 34888100, - "step": 1622, - "time_per_iteration": 2.7247347831726074 - }, - { - "auxiliary_loss_clip": 0.01150185, - "auxiliary_loss_mlp": 0.01064277, - "balance_loss_clip": 1.05335355, - "balance_loss_mlp": 1.04025626, - "epoch": 0.09758003908011423, - "flos": 28402826826240.0, - "grad_norm": 1.8603109065807166, - "language_loss": 0.85784447, - "learning_rate": 3.9522836096304225e-06, - "loss": 0.87998909, - "num_input_tokens_seen": 34910485, - "step": 1623, - "time_per_iteration": 2.785388469696045 - }, - { - "auxiliary_loss_clip": 0.0117659, - "auxiliary_loss_mlp": 0.01064102, - "balance_loss_clip": 1.05769634, - "balance_loss_mlp": 1.04043913, - "epoch": 0.09764016233278221, - "flos": 18144297336960.0, - "grad_norm": 2.39630116599036, - "language_loss": 0.80534065, - "learning_rate": 3.952199007240184e-06, - "loss": 0.82774758, - "num_input_tokens_seen": 34928615, - "step": 1624, - "time_per_iteration": 2.6818184852600098 - }, - { - "auxiliary_loss_clip": 0.01176335, - "auxiliary_loss_mlp": 0.01056788, - "balance_loss_clip": 1.05616927, - "balance_loss_mlp": 1.03465128, - "epoch": 0.09770028558545017, - "flos": 15265842071040.0, - "grad_norm": 2.44379144971104, - "language_loss": 0.85556966, - "learning_rate": 3.952114330822364e-06, - "loss": 0.8779009, - "num_input_tokens_seen": 34946045, - "step": 1625, - "time_per_iteration": 2.6594324111938477 - }, - { - "auxiliary_loss_clip": 0.01181411, - "auxiliary_loss_mlp": 0.0106682, - "balance_loss_clip": 1.06004012, - "balance_loss_mlp": 1.04411101, - "epoch": 0.09776040883811814, - "flos": 23472435219840.0, - "grad_norm": 2.058269503362464, - "language_loss": 0.85431635, - "learning_rate": 3.952029580380172e-06, - "loss": 0.87679869, - "num_input_tokens_seen": 34962865, - "step": 1626, - "time_per_iteration": 2.7384841442108154 - }, - { - "auxiliary_loss_clip": 0.01165311, - "auxiliary_loss_mlp": 0.007823, - "balance_loss_clip": 1.05467701, - "balance_loss_mlp": 1.000211, - "epoch": 0.09782053209078612, - "flos": 24499480798080.0, - "grad_norm": 2.0701580273163036, - "language_loss": 0.83370024, - "learning_rate": 3.9519447559168234e-06, - "loss": 0.85317636, - "num_input_tokens_seen": 34983505, - "step": 1627, - "time_per_iteration": 2.8269948959350586 - }, - { - "auxiliary_loss_clip": 0.01168188, - "auxiliary_loss_mlp": 0.01065332, - "balance_loss_clip": 1.05557203, - "balance_loss_mlp": 1.04275417, - "epoch": 0.09788065534345408, - "flos": 21580158833280.0, - "grad_norm": 1.8143281262319713, - "language_loss": 0.84674478, - "learning_rate": 3.951859857435534e-06, - "loss": 0.86907995, - "num_input_tokens_seen": 35001825, - "step": 1628, - "time_per_iteration": 2.6151821613311768 - }, - { - "auxiliary_loss_clip": 0.01170257, - "auxiliary_loss_mlp": 0.01058367, - "balance_loss_clip": 1.05374515, - "balance_loss_mlp": 1.03558636, - "epoch": 0.09794077859612205, - "flos": 23842459175040.0, - "grad_norm": 1.5658807312485334, - "language_loss": 0.75531614, - "learning_rate": 3.951774884939523e-06, - "loss": 0.77760237, - "num_input_tokens_seen": 35023075, - "step": 1629, - "time_per_iteration": 2.6794557571411133 - }, - { - "auxiliary_loss_clip": 0.01129604, - "auxiliary_loss_mlp": 0.01056904, - "balance_loss_clip": 1.0577755, - "balance_loss_mlp": 1.03169131, - "epoch": 0.09800090184879003, - "flos": 23659889322240.0, - "grad_norm": 1.6755762488260617, - "language_loss": 0.78487194, - "learning_rate": 3.951689838432013e-06, - "loss": 0.80673707, - "num_input_tokens_seen": 35043480, - "step": 1630, - "time_per_iteration": 2.7986228466033936 - }, - { - "auxiliary_loss_clip": 0.01167766, - "auxiliary_loss_mlp": 0.01063441, - "balance_loss_clip": 1.05938148, - "balance_loss_mlp": 1.03804946, - "epoch": 0.09806102510145799, - "flos": 17055773631360.0, - "grad_norm": 1.8175370389297836, - "language_loss": 0.86677933, - "learning_rate": 3.951604717916228e-06, - "loss": 0.88909143, - "num_input_tokens_seen": 35061490, - "step": 1631, - "time_per_iteration": 2.6350157260894775 - }, - { - "auxiliary_loss_clip": 0.01171369, - "auxiliary_loss_mlp": 0.01058643, - "balance_loss_clip": 1.0610745, - "balance_loss_mlp": 1.03625536, - "epoch": 0.09812114835412596, - "flos": 23878477537920.0, - "grad_norm": 2.2030333753544773, - "language_loss": 0.82996809, - "learning_rate": 3.9515195233953975e-06, - "loss": 0.85226822, - "num_input_tokens_seen": 35079670, - "step": 1632, - "time_per_iteration": 2.7990314960479736 - }, - { - "auxiliary_loss_clip": 0.01148453, - "auxiliary_loss_mlp": 0.01064004, - "balance_loss_clip": 1.05554819, - "balance_loss_mlp": 1.04102039, - "epoch": 0.09818127160679392, - "flos": 20595488325120.0, - "grad_norm": 1.531777801288569, - "language_loss": 0.7882973, - "learning_rate": 3.951434254872751e-06, - "loss": 0.81042188, - "num_input_tokens_seen": 35099205, - "step": 1633, - "time_per_iteration": 2.735353708267212 - }, - { - "auxiliary_loss_clip": 0.01170992, - "auxiliary_loss_mlp": 0.01061681, - "balance_loss_clip": 1.05558002, - "balance_loss_mlp": 1.03731489, - "epoch": 0.0982413948594619, - "flos": 15487339288320.0, - "grad_norm": 2.4037572513069687, - "language_loss": 0.73209554, - "learning_rate": 3.951348912351521e-06, - "loss": 0.75442231, - "num_input_tokens_seen": 35115270, - "step": 1634, - "time_per_iteration": 2.688596248626709 - }, - { - "auxiliary_loss_clip": 0.01162743, - "auxiliary_loss_mlp": 0.01071164, - "balance_loss_clip": 1.05591321, - "balance_loss_mlp": 1.04672611, - "epoch": 0.09830151811212987, - "flos": 24207958016640.0, - "grad_norm": 3.2244021303311405, - "language_loss": 0.72553629, - "learning_rate": 3.951263495834947e-06, - "loss": 0.74787533, - "num_input_tokens_seen": 35134065, - "step": 1635, - "time_per_iteration": 2.720266342163086 - }, - { - "auxiliary_loss_clip": 0.01154765, - "auxiliary_loss_mlp": 0.01068349, - "balance_loss_clip": 1.05526268, - "balance_loss_mlp": 1.04177701, - "epoch": 0.09836164136479783, - "flos": 20594590485120.0, - "grad_norm": 1.7699592352066487, - "language_loss": 0.78026646, - "learning_rate": 3.951178005326264e-06, - "loss": 0.80249763, - "num_input_tokens_seen": 35154870, - "step": 1636, - "time_per_iteration": 2.9618239402770996 - }, - { - "auxiliary_loss_clip": 0.01162744, - "auxiliary_loss_mlp": 0.01060716, - "balance_loss_clip": 1.05561686, - "balance_loss_mlp": 1.0368979, - "epoch": 0.09842176461746581, - "flos": 19934157070080.0, - "grad_norm": 1.8332710343018006, - "language_loss": 0.69524407, - "learning_rate": 3.951092440828715e-06, - "loss": 0.71747863, - "num_input_tokens_seen": 35171850, - "step": 1637, - "time_per_iteration": 2.671178102493286 - }, - { - "auxiliary_loss_clip": 0.01188316, - "auxiliary_loss_mlp": 0.01058851, - "balance_loss_clip": 1.05926394, - "balance_loss_mlp": 1.03500926, - "epoch": 0.09848188787013377, - "flos": 21214659991680.0, - "grad_norm": 2.3775286970935503, - "language_loss": 0.77050996, - "learning_rate": 3.951006802345545e-06, - "loss": 0.79298162, - "num_input_tokens_seen": 35188795, - "step": 1638, - "time_per_iteration": 2.62457537651062 - }, - { - "auxiliary_loss_clip": 0.01140265, - "auxiliary_loss_mlp": 0.01052026, - "balance_loss_clip": 1.05538166, - "balance_loss_mlp": 1.02941203, - "epoch": 0.09854201112280174, - "flos": 30154226071680.0, - "grad_norm": 1.4014263071342075, - "language_loss": 0.72620296, - "learning_rate": 3.950921089880003e-06, - "loss": 0.74812591, - "num_input_tokens_seen": 35212100, - "step": 1639, - "time_per_iteration": 2.7499618530273438 - }, - { - "auxiliary_loss_clip": 0.01173752, - "auxiliary_loss_mlp": 0.01051382, - "balance_loss_clip": 1.0582087, - "balance_loss_mlp": 1.02831531, - "epoch": 0.09860213437546972, - "flos": 21795730306560.0, - "grad_norm": 1.7213189449892274, - "language_loss": 0.88679075, - "learning_rate": 3.950835303435337e-06, - "loss": 0.90904212, - "num_input_tokens_seen": 35230390, - "step": 1640, - "time_per_iteration": 2.664133071899414 - }, - { - "auxiliary_loss_clip": 0.01177786, - "auxiliary_loss_mlp": 0.01044457, - "balance_loss_clip": 1.05981517, - "balance_loss_mlp": 1.02130616, - "epoch": 0.09866225762813768, - "flos": 21835555511040.0, - "grad_norm": 2.0701766566296915, - "language_loss": 0.80567038, - "learning_rate": 3.950749443014801e-06, - "loss": 0.82789278, - "num_input_tokens_seen": 35250405, - "step": 1641, - "time_per_iteration": 2.645353317260742 - }, - { - "auxiliary_loss_clip": 0.011756, - "auxiliary_loss_mlp": 0.01062641, - "balance_loss_clip": 1.05896795, - "balance_loss_mlp": 1.03742838, - "epoch": 0.09872238088080565, - "flos": 17599855916160.0, - "grad_norm": 2.64335263522248, - "language_loss": 0.86117625, - "learning_rate": 3.95066350862165e-06, - "loss": 0.88355863, - "num_input_tokens_seen": 35262820, - "step": 1642, - "time_per_iteration": 5.81004524230957 - }, - { - "auxiliary_loss_clip": 0.01151329, - "auxiliary_loss_mlp": 0.01056693, - "balance_loss_clip": 1.05857074, - "balance_loss_mlp": 1.03404331, - "epoch": 0.09878250413347361, - "flos": 27636134002560.0, - "grad_norm": 2.7092208079201607, - "language_loss": 0.8058275, - "learning_rate": 3.950577500259144e-06, - "loss": 0.82790768, - "num_input_tokens_seen": 35284490, - "step": 1643, - "time_per_iteration": 2.7235090732574463 - }, - { - "auxiliary_loss_clip": 0.01174075, - "auxiliary_loss_mlp": 0.01077435, - "balance_loss_clip": 1.05761337, - "balance_loss_mlp": 1.05470192, - "epoch": 0.0988426273861416, - "flos": 16544728880640.0, - "grad_norm": 2.0561742686210676, - "language_loss": 0.82546467, - "learning_rate": 3.950491417930543e-06, - "loss": 0.84797978, - "num_input_tokens_seen": 35302815, - "step": 1644, - "time_per_iteration": 4.318823575973511 - }, - { - "auxiliary_loss_clip": 0.01163142, - "auxiliary_loss_mlp": 0.00782463, - "balance_loss_clip": 1.05607629, - "balance_loss_mlp": 1.00010633, - "epoch": 0.09890275063880956, - "flos": 21215270522880.0, - "grad_norm": 1.6945489721625269, - "language_loss": 0.68219113, - "learning_rate": 3.9504052616391124e-06, - "loss": 0.70164716, - "num_input_tokens_seen": 35321175, - "step": 1645, - "time_per_iteration": 2.6626670360565186 - }, - { - "auxiliary_loss_clip": 0.01059795, - "auxiliary_loss_mlp": 0.01047617, - "balance_loss_clip": 1.02852345, - "balance_loss_mlp": 1.04404068, - "epoch": 0.09896287389147752, - "flos": 59379372910080.0, - "grad_norm": 0.8512889940087613, - "language_loss": 0.60885167, - "learning_rate": 3.950319031388119e-06, - "loss": 0.62992585, - "num_input_tokens_seen": 35381740, - "step": 1646, - "time_per_iteration": 4.752669095993042 - }, - { - "auxiliary_loss_clip": 0.01147006, - "auxiliary_loss_mlp": 0.0105976, - "balance_loss_clip": 1.0574733, - "balance_loss_mlp": 1.03464222, - "epoch": 0.0990229971441455, - "flos": 29642678530560.0, - "grad_norm": 5.785751121573768, - "language_loss": 0.73211443, - "learning_rate": 3.950232727180833e-06, - "loss": 0.7541821, - "num_input_tokens_seen": 35403760, - "step": 1647, - "time_per_iteration": 2.783442974090576 - }, - { - "auxiliary_loss_clip": 0.01161789, - "auxiliary_loss_mlp": 0.01066314, - "balance_loss_clip": 1.06016421, - "balance_loss_mlp": 1.04445136, - "epoch": 0.09908312039681347, - "flos": 21834873152640.0, - "grad_norm": 1.828428298130997, - "language_loss": 0.84094375, - "learning_rate": 3.950146349020525e-06, - "loss": 0.86322474, - "num_input_tokens_seen": 35424050, - "step": 1648, - "time_per_iteration": 2.709559679031372 - }, - { - "auxiliary_loss_clip": 0.01065954, - "auxiliary_loss_mlp": 0.01020799, - "balance_loss_clip": 1.02565169, - "balance_loss_mlp": 1.01722264, - "epoch": 0.09914324364948143, - "flos": 57564304807680.0, - "grad_norm": 0.7317434537206132, - "language_loss": 0.55672908, - "learning_rate": 3.950059896910473e-06, - "loss": 0.5775966, - "num_input_tokens_seen": 35481690, - "step": 1649, - "time_per_iteration": 3.0944156646728516 - }, - { - "auxiliary_loss_clip": 0.0117133, - "auxiliary_loss_mlp": 0.01049543, - "balance_loss_clip": 1.05603158, - "balance_loss_mlp": 1.02723897, - "epoch": 0.09920336690214941, - "flos": 34123934476800.0, - "grad_norm": 2.195431109372502, - "language_loss": 0.8975327, - "learning_rate": 3.949973370853954e-06, - "loss": 0.91974139, - "num_input_tokens_seen": 35498635, - "step": 1650, - "time_per_iteration": 2.7438554763793945 - }, - { - "auxiliary_loss_clip": 0.01033978, - "auxiliary_loss_mlp": 0.00758727, - "balance_loss_clip": 1.02943921, - "balance_loss_mlp": 0.9997822, - "epoch": 0.09926349015481738, - "flos": 71216428464000.0, - "grad_norm": 0.8036050505402587, - "language_loss": 0.63734978, - "learning_rate": 3.94988677085425e-06, - "loss": 0.65527683, - "num_input_tokens_seen": 35565720, - "step": 1651, - "time_per_iteration": 3.40269136428833 - }, - { - "auxiliary_loss_clip": 0.01170347, - "auxiliary_loss_mlp": 0.01062486, - "balance_loss_clip": 1.05790281, - "balance_loss_mlp": 1.03842974, - "epoch": 0.09932361340748534, - "flos": 23148700917120.0, - "grad_norm": 1.9744130417114842, - "language_loss": 0.88115525, - "learning_rate": 3.949800096914643e-06, - "loss": 0.90348363, - "num_input_tokens_seen": 35586000, - "step": 1652, - "time_per_iteration": 2.6695117950439453 - }, - { - "auxiliary_loss_clip": 0.0116773, - "auxiliary_loss_mlp": 0.01062073, - "balance_loss_clip": 1.06095552, - "balance_loss_mlp": 1.03895831, - "epoch": 0.09938373666015332, - "flos": 19828651847040.0, - "grad_norm": 2.166773052437996, - "language_loss": 0.81789082, - "learning_rate": 3.949713349038422e-06, - "loss": 0.84018886, - "num_input_tokens_seen": 35604355, - "step": 1653, - "time_per_iteration": 2.7136831283569336 - }, - { - "auxiliary_loss_clip": 0.01173152, - "auxiliary_loss_mlp": 0.00780466, - "balance_loss_clip": 1.05683279, - "balance_loss_mlp": 1.00016594, - "epoch": 0.09944385991282129, - "flos": 22090664880000.0, - "grad_norm": 1.662037391605293, - "language_loss": 0.79489207, - "learning_rate": 3.949626527228875e-06, - "loss": 0.81442821, - "num_input_tokens_seen": 35625495, - "step": 1654, - "time_per_iteration": 2.645875930786133 - }, - { - "auxiliary_loss_clip": 0.01187918, - "auxiliary_loss_mlp": 0.01056849, - "balance_loss_clip": 1.06405056, - "balance_loss_mlp": 1.03561759, - "epoch": 0.09950398316548925, - "flos": 19828867328640.0, - "grad_norm": 1.7263610037420916, - "language_loss": 0.81038272, - "learning_rate": 3.949539631489295e-06, - "loss": 0.83283037, - "num_input_tokens_seen": 35645030, - "step": 1655, - "time_per_iteration": 2.630404233932495 - }, - { - "auxiliary_loss_clip": 0.01181205, - "auxiliary_loss_mlp": 0.01055977, - "balance_loss_clip": 1.05679035, - "balance_loss_mlp": 1.03294599, - "epoch": 0.09956410641815722, - "flos": 25003701964800.0, - "grad_norm": 2.426795421082641, - "language_loss": 0.80429518, - "learning_rate": 3.9494526618229765e-06, - "loss": 0.82666701, - "num_input_tokens_seen": 35664305, - "step": 1656, - "time_per_iteration": 2.6283950805664062 - }, - { - "auxiliary_loss_clip": 0.01170003, - "auxiliary_loss_mlp": 0.01061881, - "balance_loss_clip": 1.05787742, - "balance_loss_mlp": 1.03870714, - "epoch": 0.0996242296708252, - "flos": 19317714837120.0, - "grad_norm": 1.4960238412267362, - "language_loss": 0.89040691, - "learning_rate": 3.949365618233217e-06, - "loss": 0.91272575, - "num_input_tokens_seen": 35684060, - "step": 1657, - "time_per_iteration": 2.653674602508545 - }, - { - "auxiliary_loss_clip": 0.01165842, - "auxiliary_loss_mlp": 0.01057352, - "balance_loss_clip": 1.05830753, - "balance_loss_mlp": 1.0329144, - "epoch": 0.09968435292349316, - "flos": 21871609787520.0, - "grad_norm": 2.1866084372248062, - "language_loss": 0.84684521, - "learning_rate": 3.9492785007233195e-06, - "loss": 0.86907715, - "num_input_tokens_seen": 35703250, - "step": 1658, - "time_per_iteration": 2.6897473335266113 - }, - { - "auxiliary_loss_clip": 0.01069806, - "auxiliary_loss_mlp": 0.01015844, - "balance_loss_clip": 1.02042234, - "balance_loss_mlp": 1.01292348, - "epoch": 0.09974447617616113, - "flos": 65384533313280.0, - "grad_norm": 0.9123227767672076, - "language_loss": 0.60828507, - "learning_rate": 3.949191309296585e-06, - "loss": 0.62914157, - "num_input_tokens_seen": 35762165, - "step": 1659, - "time_per_iteration": 3.273890495300293 - }, - { - "auxiliary_loss_clip": 0.01152432, - "auxiliary_loss_mlp": 0.01051829, - "balance_loss_clip": 1.05082798, - "balance_loss_mlp": 1.02814245, - "epoch": 0.0998045994288291, - "flos": 23659817495040.0, - "grad_norm": 1.9344290476513741, - "language_loss": 0.84892076, - "learning_rate": 3.949104043956321e-06, - "loss": 0.87096334, - "num_input_tokens_seen": 35781520, - "step": 1660, - "time_per_iteration": 2.788018226623535 - }, - { - "auxiliary_loss_clip": 0.01149163, - "auxiliary_loss_mlp": 0.01060092, - "balance_loss_clip": 1.05374026, - "balance_loss_mlp": 1.03514171, - "epoch": 0.09986472268149707, - "flos": 19609704495360.0, - "grad_norm": 1.9493882663610318, - "language_loss": 0.80024737, - "learning_rate": 3.949016704705836e-06, - "loss": 0.82234001, - "num_input_tokens_seen": 35799565, - "step": 1661, - "time_per_iteration": 2.6537399291992188 - }, - { - "auxiliary_loss_clip": 0.01172787, - "auxiliary_loss_mlp": 0.01055532, - "balance_loss_clip": 1.05715156, - "balance_loss_mlp": 1.03153503, - "epoch": 0.09992484593416504, - "flos": 26213317395840.0, - "grad_norm": 2.0152235709188377, - "language_loss": 0.83560598, - "learning_rate": 3.948929291548443e-06, - "loss": 0.85788912, - "num_input_tokens_seen": 35821085, - "step": 1662, - "time_per_iteration": 2.753807783126831 - }, - { - "auxiliary_loss_clip": 0.01154838, - "auxiliary_loss_mlp": 0.01061466, - "balance_loss_clip": 1.05079484, - "balance_loss_mlp": 1.03616929, - "epoch": 0.09998496918683301, - "flos": 17493632421120.0, - "grad_norm": 1.9355779644050557, - "language_loss": 0.88865256, - "learning_rate": 3.9488418044874546e-06, - "loss": 0.91081554, - "num_input_tokens_seen": 35839840, - "step": 1663, - "time_per_iteration": 2.6829047203063965 - }, - { - "auxiliary_loss_clip": 0.0118246, - "auxiliary_loss_mlp": 0.01061692, - "balance_loss_clip": 1.06228638, - "balance_loss_mlp": 1.03825521, - "epoch": 0.10004509243950098, - "flos": 22784925928320.0, - "grad_norm": 1.7925330820671084, - "language_loss": 0.70140731, - "learning_rate": 3.948754243526191e-06, - "loss": 0.72384882, - "num_input_tokens_seen": 35861545, - "step": 1664, - "time_per_iteration": 2.809300184249878 - }, - { - "auxiliary_loss_clip": 0.01142878, - "auxiliary_loss_mlp": 0.01055306, - "balance_loss_clip": 1.05475903, - "balance_loss_mlp": 1.03312087, - "epoch": 0.10010521569216894, - "flos": 16253385667200.0, - "grad_norm": 2.4978474602303895, - "language_loss": 0.78981555, - "learning_rate": 3.94866660866797e-06, - "loss": 0.81179744, - "num_input_tokens_seen": 35878295, - "step": 1665, - "time_per_iteration": 2.7010488510131836 - }, - { - "auxiliary_loss_clip": 0.01175861, - "auxiliary_loss_mlp": 0.01070341, - "balance_loss_clip": 1.06286561, - "balance_loss_mlp": 1.04742861, - "epoch": 0.10016533894483691, - "flos": 23402589223680.0, - "grad_norm": 3.1438625724360945, - "language_loss": 0.70054829, - "learning_rate": 3.9485788999161165e-06, - "loss": 0.7230103, - "num_input_tokens_seen": 35898990, - "step": 1666, - "time_per_iteration": 2.689879894256592 - }, - { - "auxiliary_loss_clip": 0.01110848, - "auxiliary_loss_mlp": 0.01074593, - "balance_loss_clip": 1.05082703, - "balance_loss_mlp": 1.04946339, - "epoch": 0.10022546219750489, - "flos": 19354164163200.0, - "grad_norm": 1.7583449522195267, - "language_loss": 0.78647351, - "learning_rate": 3.948491117273956e-06, - "loss": 0.80832791, - "num_input_tokens_seen": 35916225, - "step": 1667, - "time_per_iteration": 2.8973352909088135 - }, - { - "auxiliary_loss_clip": 0.01153352, - "auxiliary_loss_mlp": 0.01062819, - "balance_loss_clip": 1.05452693, - "balance_loss_mlp": 1.03752255, - "epoch": 0.10028558545017285, - "flos": 27085766837760.0, - "grad_norm": 2.4011089045072187, - "language_loss": 0.77357388, - "learning_rate": 3.948403260744817e-06, - "loss": 0.7957356, - "num_input_tokens_seen": 35934630, - "step": 1668, - "time_per_iteration": 3.2600321769714355 - }, - { - "auxiliary_loss_clip": 0.01184879, - "auxiliary_loss_mlp": 0.01059367, - "balance_loss_clip": 1.05833495, - "balance_loss_mlp": 1.03523922, - "epoch": 0.10034570870284082, - "flos": 25847136195840.0, - "grad_norm": 1.7407668002390366, - "language_loss": 0.77520061, - "learning_rate": 3.948315330332031e-06, - "loss": 0.79764307, - "num_input_tokens_seen": 35953855, - "step": 1669, - "time_per_iteration": 2.6899471282958984 - }, - { - "auxiliary_loss_clip": 0.0118887, - "auxiliary_loss_mlp": 0.01067842, - "balance_loss_clip": 1.05948365, - "balance_loss_mlp": 1.04416728, - "epoch": 0.1004058319555088, - "flos": 26249587153920.0, - "grad_norm": 5.441134829238958, - "language_loss": 0.85160148, - "learning_rate": 3.948227326038933e-06, - "loss": 0.87416857, - "num_input_tokens_seen": 35974555, - "step": 1670, - "time_per_iteration": 2.616867780685425 - }, - { - "auxiliary_loss_clip": 0.011763, - "auxiliary_loss_mlp": 0.01055607, - "balance_loss_clip": 1.05584121, - "balance_loss_mlp": 1.03354108, - "epoch": 0.10046595520817676, - "flos": 25374480105600.0, - "grad_norm": 1.4849262119454174, - "language_loss": 0.76836258, - "learning_rate": 3.9481392478688586e-06, - "loss": 0.79068166, - "num_input_tokens_seen": 35996830, - "step": 1671, - "time_per_iteration": 2.658254384994507 - }, - { - "auxiliary_loss_clip": 0.01061447, - "auxiliary_loss_mlp": 0.01017561, - "balance_loss_clip": 1.02178144, - "balance_loss_mlp": 1.01454473, - "epoch": 0.10052607846084473, - "flos": 67461821677440.0, - "grad_norm": 0.7781454358921105, - "language_loss": 0.60718858, - "learning_rate": 3.948051095825149e-06, - "loss": 0.62797856, - "num_input_tokens_seen": 36054465, - "step": 1672, - "time_per_iteration": 3.1269097328186035 - }, - { - "auxiliary_loss_clip": 0.01143177, - "auxiliary_loss_mlp": 0.01063346, - "balance_loss_clip": 1.05112922, - "balance_loss_mlp": 1.04055333, - "epoch": 0.10058620171351271, - "flos": 21360493209600.0, - "grad_norm": 2.433278134910662, - "language_loss": 0.7711426, - "learning_rate": 3.947962869911147e-06, - "loss": 0.79320776, - "num_input_tokens_seen": 36073480, - "step": 1673, - "time_per_iteration": 2.6931638717651367 - }, - { - "auxiliary_loss_clip": 0.01132094, - "auxiliary_loss_mlp": 0.01056611, - "balance_loss_clip": 1.04989302, - "balance_loss_mlp": 1.03262639, - "epoch": 0.10064632496618067, - "flos": 16800125558400.0, - "grad_norm": 2.074683072839241, - "language_loss": 0.73173523, - "learning_rate": 3.947874570130197e-06, - "loss": 0.75362229, - "num_input_tokens_seen": 36091830, - "step": 1674, - "time_per_iteration": 2.7188127040863037 - }, - { - "auxiliary_loss_clip": 0.01172389, - "auxiliary_loss_mlp": 0.00779533, - "balance_loss_clip": 1.0556165, - "balance_loss_mlp": 1.00024796, - "epoch": 0.10070644821884864, - "flos": 23624445576960.0, - "grad_norm": 2.1982379565146872, - "language_loss": 0.79456973, - "learning_rate": 3.947786196485649e-06, - "loss": 0.81408894, - "num_input_tokens_seen": 36111400, - "step": 1675, - "time_per_iteration": 2.712090253829956 - }, - { - "auxiliary_loss_clip": 0.01182659, - "auxiliary_loss_mlp": 0.01063327, - "balance_loss_clip": 1.05801332, - "balance_loss_mlp": 1.04239404, - "epoch": 0.1007665714715166, - "flos": 24462564595200.0, - "grad_norm": 2.408955682155161, - "language_loss": 0.8120935, - "learning_rate": 3.947697748980853e-06, - "loss": 0.83455336, - "num_input_tokens_seen": 36129345, - "step": 1676, - "time_per_iteration": 2.685472249984741 - }, - { - "auxiliary_loss_clip": 0.01175397, - "auxiliary_loss_mlp": 0.01057105, - "balance_loss_clip": 1.05950332, - "balance_loss_mlp": 1.03546858, - "epoch": 0.10082669472418458, - "flos": 16799119977600.0, - "grad_norm": 2.008035557658629, - "language_loss": 0.86132157, - "learning_rate": 3.947609227619163e-06, - "loss": 0.88364655, - "num_input_tokens_seen": 36146255, - "step": 1677, - "time_per_iteration": 2.6589157581329346 - }, - { - "auxiliary_loss_clip": 0.01162997, - "auxiliary_loss_mlp": 0.010508, - "balance_loss_clip": 1.05363441, - "balance_loss_mlp": 1.02896047, - "epoch": 0.10088681797685255, - "flos": 13553513844480.0, - "grad_norm": 2.160847391025828, - "language_loss": 0.86006588, - "learning_rate": 3.947520632403936e-06, - "loss": 0.88220382, - "num_input_tokens_seen": 36164050, - "step": 1678, - "time_per_iteration": 2.694347858428955 - }, - { - "auxiliary_loss_clip": 0.0116292, - "auxiliary_loss_mlp": 0.01056376, - "balance_loss_clip": 1.0587275, - "balance_loss_mlp": 1.03406048, - "epoch": 0.10094694122952051, - "flos": 25265706744960.0, - "grad_norm": 12.700254532531051, - "language_loss": 0.89978886, - "learning_rate": 3.947431963338532e-06, - "loss": 0.92198181, - "num_input_tokens_seen": 36183530, - "step": 1679, - "time_per_iteration": 2.6741397380828857 - }, - { - "auxiliary_loss_clip": 0.01071086, - "auxiliary_loss_mlp": 0.0101685, - "balance_loss_clip": 1.02328789, - "balance_loss_mlp": 1.01360798, - "epoch": 0.10100706448218849, - "flos": 69854299885440.0, - "grad_norm": 0.7882499243548835, - "language_loss": 0.52985126, - "learning_rate": 3.947343220426312e-06, - "loss": 0.55073065, - "num_input_tokens_seen": 36248550, - "step": 1680, - "time_per_iteration": 3.169893503189087 - }, - { - "auxiliary_loss_clip": 0.01185252, - "auxiliary_loss_mlp": 0.00779951, - "balance_loss_clip": 1.06022644, - "balance_loss_mlp": 1.00017488, - "epoch": 0.10106718773485646, - "flos": 20007163463040.0, - "grad_norm": 1.6642182724084642, - "language_loss": 0.76869059, - "learning_rate": 3.947254403670641e-06, - "loss": 0.7883426, - "num_input_tokens_seen": 36266065, - "step": 1681, - "time_per_iteration": 4.146950006484985 - }, - { - "auxiliary_loss_clip": 0.01156046, - "auxiliary_loss_mlp": 0.01059972, - "balance_loss_clip": 1.0539515, - "balance_loss_mlp": 1.03469992, - "epoch": 0.10112731098752442, - "flos": 13479825093120.0, - "grad_norm": 2.3884003317971225, - "language_loss": 0.93957508, - "learning_rate": 3.947165513074889e-06, - "loss": 0.96173531, - "num_input_tokens_seen": 36280960, - "step": 1682, - "time_per_iteration": 4.220505237579346 - }, - { - "auxiliary_loss_clip": 0.01173183, - "auxiliary_loss_mlp": 0.01053261, - "balance_loss_clip": 1.05487084, - "balance_loss_mlp": 1.03133821, - "epoch": 0.1011874342401924, - "flos": 18515901490560.0, - "grad_norm": 3.5300660189263917, - "language_loss": 0.87618893, - "learning_rate": 3.947076548642425e-06, - "loss": 0.89845335, - "num_input_tokens_seen": 36299010, - "step": 1683, - "time_per_iteration": 2.635636329650879 - }, - { - "auxiliary_loss_clip": 0.01128888, - "auxiliary_loss_mlp": 0.01063089, - "balance_loss_clip": 1.04814756, - "balance_loss_mlp": 1.04008126, - "epoch": 0.10124755749286037, - "flos": 20702861055360.0, - "grad_norm": 2.3337760241024923, - "language_loss": 0.74566805, - "learning_rate": 3.946987510376624e-06, - "loss": 0.76758784, - "num_input_tokens_seen": 36318400, - "step": 1684, - "time_per_iteration": 4.417364835739136 - }, - { - "auxiliary_loss_clip": 0.01053031, - "auxiliary_loss_mlp": 0.0101182, - "balance_loss_clip": 1.02547038, - "balance_loss_mlp": 1.00853014, - "epoch": 0.10130768074552833, - "flos": 56109456247680.0, - "grad_norm": 0.7564631726021327, - "language_loss": 0.61085057, - "learning_rate": 3.9468983982808615e-06, - "loss": 0.63149905, - "num_input_tokens_seen": 36381815, - "step": 1685, - "time_per_iteration": 4.87179970741272 - }, - { - "auxiliary_loss_clip": 0.01157045, - "auxiliary_loss_mlp": 0.01056064, - "balance_loss_clip": 1.05233479, - "balance_loss_mlp": 1.0341655, - "epoch": 0.1013678039981963, - "flos": 33402346156800.0, - "grad_norm": 4.297801792672815, - "language_loss": 0.61381406, - "learning_rate": 3.946809212358516e-06, - "loss": 0.6359452, - "num_input_tokens_seen": 36404320, - "step": 1686, - "time_per_iteration": 2.8289108276367188 - }, - { - "auxiliary_loss_clip": 0.01144631, - "auxiliary_loss_mlp": 0.01059888, - "balance_loss_clip": 1.05645001, - "balance_loss_mlp": 1.03678524, - "epoch": 0.10142792725086427, - "flos": 31905338008320.0, - "grad_norm": 2.21923850158845, - "language_loss": 0.81216162, - "learning_rate": 3.946719952612972e-06, - "loss": 0.83420682, - "num_input_tokens_seen": 36427510, - "step": 1687, - "time_per_iteration": 2.947535276412964 - }, - { - "auxiliary_loss_clip": 0.0117612, - "auxiliary_loss_mlp": 0.0105614, - "balance_loss_clip": 1.05933213, - "balance_loss_mlp": 1.03403926, - "epoch": 0.10148805050353224, - "flos": 28475905046400.0, - "grad_norm": 1.7955898786084035, - "language_loss": 0.71943259, - "learning_rate": 3.94663061904761e-06, - "loss": 0.74175525, - "num_input_tokens_seen": 36448230, - "step": 1688, - "time_per_iteration": 2.693249225616455 - }, - { - "auxiliary_loss_clip": 0.01151953, - "auxiliary_loss_mlp": 0.01063362, - "balance_loss_clip": 1.05288756, - "balance_loss_mlp": 1.04079556, - "epoch": 0.1015481737562002, - "flos": 25148888737920.0, - "grad_norm": 2.636795901714516, - "language_loss": 0.86876953, - "learning_rate": 3.94654121166582e-06, - "loss": 0.89092261, - "num_input_tokens_seen": 36464395, - "step": 1689, - "time_per_iteration": 2.677992820739746 - }, - { - "auxiliary_loss_clip": 0.01172188, - "auxiliary_loss_mlp": 0.01057982, - "balance_loss_clip": 1.05476904, - "balance_loss_mlp": 1.0378834, - "epoch": 0.10160829700886818, - "flos": 30882781630080.0, - "grad_norm": 2.2211105929909696, - "language_loss": 0.88170946, - "learning_rate": 3.946451730470993e-06, - "loss": 0.90401113, - "num_input_tokens_seen": 36486475, - "step": 1690, - "time_per_iteration": 2.707209348678589 - }, - { - "auxiliary_loss_clip": 0.01158767, - "auxiliary_loss_mlp": 0.01052386, - "balance_loss_clip": 1.05507553, - "balance_loss_mlp": 1.02973664, - "epoch": 0.10166842026153615, - "flos": 20412020632320.0, - "grad_norm": 2.08291471600754, - "language_loss": 0.83348423, - "learning_rate": 3.946362175466521e-06, - "loss": 0.85559577, - "num_input_tokens_seen": 36505310, - "step": 1691, - "time_per_iteration": 2.6521170139312744 - }, - { - "auxiliary_loss_clip": 0.01162159, - "auxiliary_loss_mlp": 0.01051716, - "balance_loss_clip": 1.05550599, - "balance_loss_mlp": 1.03016281, - "epoch": 0.10172854351420411, - "flos": 33476968661760.0, - "grad_norm": 1.704519528530946, - "language_loss": 0.66773653, - "learning_rate": 3.946272546655801e-06, - "loss": 0.68987525, - "num_input_tokens_seen": 36529820, - "step": 1692, - "time_per_iteration": 2.799353837966919 - }, - { - "auxiliary_loss_clip": 0.01144502, - "auxiliary_loss_mlp": 0.0107473, - "balance_loss_clip": 1.05057836, - "balance_loss_mlp": 1.05258095, - "epoch": 0.1017886667668721, - "flos": 23550325862400.0, - "grad_norm": 1.8345924563029705, - "language_loss": 0.75939322, - "learning_rate": 3.94618284404223e-06, - "loss": 0.78158557, - "num_input_tokens_seen": 36549000, - "step": 1693, - "time_per_iteration": 2.6711113452911377 - }, - { - "auxiliary_loss_clip": 0.01132621, - "auxiliary_loss_mlp": 0.01057162, - "balance_loss_clip": 1.04893303, - "balance_loss_mlp": 1.03289056, - "epoch": 0.10184879001954006, - "flos": 23296078419840.0, - "grad_norm": 1.7027745569702395, - "language_loss": 0.87503564, - "learning_rate": 3.9460930676292105e-06, - "loss": 0.89693356, - "num_input_tokens_seen": 36567515, - "step": 1694, - "time_per_iteration": 2.749119520187378 - }, - { - "auxiliary_loss_clip": 0.01130673, - "auxiliary_loss_mlp": 0.01058451, - "balance_loss_clip": 1.04954553, - "balance_loss_mlp": 1.033095, - "epoch": 0.10190891327220802, - "flos": 18333116156160.0, - "grad_norm": 1.7649462193878245, - "language_loss": 0.79299057, - "learning_rate": 3.946003217420147e-06, - "loss": 0.8148818, - "num_input_tokens_seen": 36586190, - "step": 1695, - "time_per_iteration": 2.839081048965454 - }, - { - "auxiliary_loss_clip": 0.0112732, - "auxiliary_loss_mlp": 0.01061103, - "balance_loss_clip": 1.04818296, - "balance_loss_mlp": 1.03772628, - "epoch": 0.10196903652487599, - "flos": 26465374108800.0, - "grad_norm": 2.7190993931598446, - "language_loss": 0.86494684, - "learning_rate": 3.945913293418447e-06, - "loss": 0.88683105, - "num_input_tokens_seen": 36607495, - "step": 1696, - "time_per_iteration": 2.7802348136901855 - }, - { - "auxiliary_loss_clip": 0.01168675, - "auxiliary_loss_mlp": 0.01054661, - "balance_loss_clip": 1.05711746, - "balance_loss_mlp": 1.03315568, - "epoch": 0.10202915977754397, - "flos": 21869526798720.0, - "grad_norm": 1.7889048836535288, - "language_loss": 0.82350796, - "learning_rate": 3.945823295627519e-06, - "loss": 0.84574133, - "num_input_tokens_seen": 36628555, - "step": 1697, - "time_per_iteration": 2.667962074279785 - }, - { - "auxiliary_loss_clip": 0.01184333, - "auxiliary_loss_mlp": 0.01055548, - "balance_loss_clip": 1.05680871, - "balance_loss_mlp": 1.033149, - "epoch": 0.10208928303021193, - "flos": 22309755886080.0, - "grad_norm": 2.0464291543972006, - "language_loss": 0.81198204, - "learning_rate": 3.9457332240507775e-06, - "loss": 0.83438087, - "num_input_tokens_seen": 36646250, - "step": 1698, - "time_per_iteration": 2.6484432220458984 - }, - { - "auxiliary_loss_clip": 0.01150498, - "auxiliary_loss_mlp": 0.01053546, - "balance_loss_clip": 1.05696845, - "balance_loss_mlp": 1.03226686, - "epoch": 0.1021494062828799, - "flos": 22125569921280.0, - "grad_norm": 2.3020250981163226, - "language_loss": 0.75612724, - "learning_rate": 3.945643078691637e-06, - "loss": 0.77816761, - "num_input_tokens_seen": 36666675, - "step": 1699, - "time_per_iteration": 2.8040614128112793 - }, - { - "auxiliary_loss_clip": 0.01162088, - "auxiliary_loss_mlp": 0.01050379, - "balance_loss_clip": 1.06041551, - "balance_loss_mlp": 1.02827764, - "epoch": 0.10220952953554788, - "flos": 19646728439040.0, - "grad_norm": 1.6839869206777538, - "language_loss": 0.80395639, - "learning_rate": 3.945552859553516e-06, - "loss": 0.8260811, - "num_input_tokens_seen": 36685225, - "step": 1700, - "time_per_iteration": 2.6701290607452393 - }, - { - "auxiliary_loss_clip": 0.0117076, - "auxiliary_loss_mlp": 0.0104804, - "balance_loss_clip": 1.05714083, - "balance_loss_mlp": 1.02653444, - "epoch": 0.10226965278821584, - "flos": 29787290686080.0, - "grad_norm": 2.102621975458346, - "language_loss": 0.76877582, - "learning_rate": 3.945462566639836e-06, - "loss": 0.79096377, - "num_input_tokens_seen": 36705985, - "step": 1701, - "time_per_iteration": 2.748201847076416 - }, - { - "auxiliary_loss_clip": 0.01182259, - "auxiliary_loss_mlp": 0.01050364, - "balance_loss_clip": 1.06157088, - "balance_loss_mlp": 1.02852523, - "epoch": 0.10232977604088381, - "flos": 27016818681600.0, - "grad_norm": 2.1099726588763965, - "language_loss": 0.77922845, - "learning_rate": 3.945372199954019e-06, - "loss": 0.80155474, - "num_input_tokens_seen": 36725815, - "step": 1702, - "time_per_iteration": 2.6703274250030518 - }, - { - "auxiliary_loss_clip": 0.01156323, - "auxiliary_loss_mlp": 0.01052524, - "balance_loss_clip": 1.05596721, - "balance_loss_mlp": 1.03126872, - "epoch": 0.10238989929355179, - "flos": 20777519473920.0, - "grad_norm": 2.2326457826946293, - "language_loss": 0.94093609, - "learning_rate": 3.945281759499494e-06, - "loss": 0.96302462, - "num_input_tokens_seen": 36742345, - "step": 1703, - "time_per_iteration": 2.6712698936462402 - }, - { - "auxiliary_loss_clip": 0.01034483, - "auxiliary_loss_mlp": 0.01037784, - "balance_loss_clip": 1.02765131, - "balance_loss_mlp": 1.03315914, - "epoch": 0.10245002254621975, - "flos": 57698322451200.0, - "grad_norm": 0.8815387598011586, - "language_loss": 0.55096036, - "learning_rate": 3.94519124527969e-06, - "loss": 0.57168299, - "num_input_tokens_seen": 36798775, - "step": 1704, - "time_per_iteration": 3.2863855361938477 - }, - { - "auxiliary_loss_clip": 0.01186822, - "auxiliary_loss_mlp": 0.01053701, - "balance_loss_clip": 1.06026638, - "balance_loss_mlp": 1.03088403, - "epoch": 0.10251014579888772, - "flos": 16800125558400.0, - "grad_norm": 2.051901555713709, - "language_loss": 0.84025991, - "learning_rate": 3.945100657298039e-06, - "loss": 0.86266518, - "num_input_tokens_seen": 36816295, - "step": 1705, - "time_per_iteration": 2.8991851806640625 - }, - { - "auxiliary_loss_clip": 0.01045354, - "auxiliary_loss_mlp": 0.01018361, - "balance_loss_clip": 1.02622223, - "balance_loss_mlp": 1.01526153, - "epoch": 0.1025702690515557, - "flos": 68565500922240.0, - "grad_norm": 0.7692746082941451, - "language_loss": 0.60408181, - "learning_rate": 3.9450099955579765e-06, - "loss": 0.62471896, - "num_input_tokens_seen": 36882030, - "step": 1706, - "time_per_iteration": 3.2174558639526367 - }, - { - "auxiliary_loss_clip": 0.01149922, - "auxiliary_loss_mlp": 0.01051211, - "balance_loss_clip": 1.05388391, - "balance_loss_mlp": 1.02812052, - "epoch": 0.10263039230422366, - "flos": 14866623336960.0, - "grad_norm": 2.201796189576969, - "language_loss": 0.85937822, - "learning_rate": 3.94491926006294e-06, - "loss": 0.88138962, - "num_input_tokens_seen": 36899245, - "step": 1707, - "time_per_iteration": 2.689208507537842 - }, - { - "auxiliary_loss_clip": 0.01169165, - "auxiliary_loss_mlp": 0.0105297, - "balance_loss_clip": 1.05941081, - "balance_loss_mlp": 1.03114319, - "epoch": 0.10269051555689163, - "flos": 25337599816320.0, - "grad_norm": 1.471109036018689, - "language_loss": 0.73299325, - "learning_rate": 3.944828450816369e-06, - "loss": 0.75521457, - "num_input_tokens_seen": 36920950, - "step": 1708, - "time_per_iteration": 2.679760456085205 - }, - { - "auxiliary_loss_clip": 0.01155833, - "auxiliary_loss_mlp": 0.00780571, - "balance_loss_clip": 1.05718231, - "balance_loss_mlp": 1.00042295, - "epoch": 0.10275063880955959, - "flos": 21068826773760.0, - "grad_norm": 1.7051644476897239, - "language_loss": 0.91616452, - "learning_rate": 3.944737567821709e-06, - "loss": 0.93552846, - "num_input_tokens_seen": 36938900, - "step": 1709, - "time_per_iteration": 2.6754679679870605 - }, - { - "auxiliary_loss_clip": 0.01124911, - "auxiliary_loss_mlp": 0.01057008, - "balance_loss_clip": 1.05144072, - "balance_loss_mlp": 1.0343945, - "epoch": 0.10281076206222757, - "flos": 30366780802560.0, - "grad_norm": 2.1056252966717275, - "language_loss": 0.88004494, - "learning_rate": 3.944646611082406e-06, - "loss": 0.90186411, - "num_input_tokens_seen": 36957010, - "step": 1710, - "time_per_iteration": 2.708723306655884 - }, - { - "auxiliary_loss_clip": 0.01171004, - "auxiliary_loss_mlp": 0.0105967, - "balance_loss_clip": 1.05658317, - "balance_loss_mlp": 1.036973, - "epoch": 0.10287088531489554, - "flos": 22418313765120.0, - "grad_norm": 1.7046493271202992, - "language_loss": 0.79370153, - "learning_rate": 3.944555580601908e-06, - "loss": 0.81600821, - "num_input_tokens_seen": 36977690, - "step": 1711, - "time_per_iteration": 2.631908416748047 - }, - { - "auxiliary_loss_clip": 0.01156003, - "auxiliary_loss_mlp": 0.01055126, - "balance_loss_clip": 1.05841637, - "balance_loss_mlp": 1.03189242, - "epoch": 0.1029310085675635, - "flos": 25115994858240.0, - "grad_norm": 3.2168061349371135, - "language_loss": 0.73666596, - "learning_rate": 3.944464476383668e-06, - "loss": 0.75877726, - "num_input_tokens_seen": 36997300, - "step": 1712, - "time_per_iteration": 2.7107467651367188 - }, - { - "auxiliary_loss_clip": 0.01133407, - "auxiliary_loss_mlp": 0.01056055, - "balance_loss_clip": 1.05496907, - "balance_loss_mlp": 1.03334546, - "epoch": 0.10299113182023148, - "flos": 19865639877120.0, - "grad_norm": 1.974447377126898, - "language_loss": 0.87049067, - "learning_rate": 3.94437329843114e-06, - "loss": 0.89238536, - "num_input_tokens_seen": 37016110, - "step": 1713, - "time_per_iteration": 2.6532411575317383 - }, - { - "auxiliary_loss_clip": 0.0116832, - "auxiliary_loss_mlp": 0.01060237, - "balance_loss_clip": 1.05669498, - "balance_loss_mlp": 1.03877962, - "epoch": 0.10305125507289944, - "flos": 20447608032000.0, - "grad_norm": 1.57388574383124, - "language_loss": 0.72406238, - "learning_rate": 3.944282046747782e-06, - "loss": 0.74634796, - "num_input_tokens_seen": 37036405, - "step": 1714, - "time_per_iteration": 2.5987610816955566 - }, - { - "auxiliary_loss_clip": 0.01174482, - "auxiliary_loss_mlp": 0.01063165, - "balance_loss_clip": 1.05715692, - "balance_loss_mlp": 1.03934693, - "epoch": 0.10311137832556741, - "flos": 26250772302720.0, - "grad_norm": 2.1959530175190434, - "language_loss": 0.91065919, - "learning_rate": 3.944190721337053e-06, - "loss": 0.93303567, - "num_input_tokens_seen": 37057580, - "step": 1715, - "time_per_iteration": 2.743833303451538 - }, - { - "auxiliary_loss_clip": 0.01170297, - "auxiliary_loss_mlp": 0.01054891, - "balance_loss_clip": 1.05448914, - "balance_loss_mlp": 1.03305221, - "epoch": 0.10317150157823539, - "flos": 35298932175360.0, - "grad_norm": 1.8741123562687005, - "language_loss": 0.75969976, - "learning_rate": 3.944099322202418e-06, - "loss": 0.78195167, - "num_input_tokens_seen": 37079120, - "step": 1716, - "time_per_iteration": 2.748903274536133 - }, - { - "auxiliary_loss_clip": 0.01162664, - "auxiliary_loss_mlp": 0.01061895, - "balance_loss_clip": 1.05617428, - "balance_loss_mlp": 1.03804111, - "epoch": 0.10323162483090335, - "flos": 25739943033600.0, - "grad_norm": 3.178190042364093, - "language_loss": 0.85308528, - "learning_rate": 3.944007849347342e-06, - "loss": 0.87533092, - "num_input_tokens_seen": 37099710, - "step": 1717, - "time_per_iteration": 2.690772533416748 - }, - { - "auxiliary_loss_clip": 0.01127019, - "auxiliary_loss_mlp": 0.01067935, - "balance_loss_clip": 1.05048633, - "balance_loss_mlp": 1.04436755, - "epoch": 0.10329174808357132, - "flos": 16289870906880.0, - "grad_norm": 1.8474438265561113, - "language_loss": 0.82945001, - "learning_rate": 3.943916302775292e-06, - "loss": 0.85139954, - "num_input_tokens_seen": 37117775, - "step": 1718, - "time_per_iteration": 2.7029476165771484 - }, - { - "auxiliary_loss_clip": 0.01171184, - "auxiliary_loss_mlp": 0.01049869, - "balance_loss_clip": 1.05912328, - "balance_loss_mlp": 1.02701616, - "epoch": 0.10335187133623928, - "flos": 36687166963200.0, - "grad_norm": 1.7728224248964342, - "language_loss": 0.73396438, - "learning_rate": 3.943824682489742e-06, - "loss": 0.75617492, - "num_input_tokens_seen": 37140280, - "step": 1719, - "time_per_iteration": 2.7653820514678955 - }, - { - "auxiliary_loss_clip": 0.01168859, - "auxiliary_loss_mlp": 0.01048444, - "balance_loss_clip": 1.05861163, - "balance_loss_mlp": 1.02786827, - "epoch": 0.10341199458890726, - "flos": 14975648092800.0, - "grad_norm": 1.7819459058763836, - "language_loss": 0.92692196, - "learning_rate": 3.9437329884941665e-06, - "loss": 0.94909501, - "num_input_tokens_seen": 37158350, - "step": 1720, - "time_per_iteration": 4.1962480545043945 - }, - { - "auxiliary_loss_clip": 0.01139894, - "auxiliary_loss_mlp": 0.01051033, - "balance_loss_clip": 1.05092323, - "balance_loss_mlp": 1.02827597, - "epoch": 0.10347211784157523, - "flos": 21031587348480.0, - "grad_norm": 1.6861044154168399, - "language_loss": 0.79497123, - "learning_rate": 3.943641220792039e-06, - "loss": 0.81688046, - "num_input_tokens_seen": 37177120, - "step": 1721, - "time_per_iteration": 4.524151802062988 - }, - { - "auxiliary_loss_clip": 0.01130482, - "auxiliary_loss_mlp": 0.01067754, - "balance_loss_clip": 1.05380797, - "balance_loss_mlp": 1.04109859, - "epoch": 0.1035322410942432, - "flos": 19792094780160.0, - "grad_norm": 1.951940775381607, - "language_loss": 0.80707669, - "learning_rate": 3.9435493793868434e-06, - "loss": 0.829059, - "num_input_tokens_seen": 37195895, - "step": 1722, - "time_per_iteration": 2.7972562313079834 - }, - { - "auxiliary_loss_clip": 0.01059018, - "auxiliary_loss_mlp": 0.01038991, - "balance_loss_clip": 1.02668202, - "balance_loss_mlp": 1.03536737, - "epoch": 0.10359236434691117, - "flos": 52698874947840.0, - "grad_norm": 0.9413879826908518, - "language_loss": 0.67161834, - "learning_rate": 3.943457464282059e-06, - "loss": 0.69259846, - "num_input_tokens_seen": 37247270, - "step": 1723, - "time_per_iteration": 4.899553060531616 - }, - { - "auxiliary_loss_clip": 0.01169875, - "auxiliary_loss_mlp": 0.01062977, - "balance_loss_clip": 1.05482125, - "balance_loss_mlp": 1.04193664, - "epoch": 0.10365248759957914, - "flos": 18405404277120.0, - "grad_norm": 2.8641520576523116, - "language_loss": 0.77715755, - "learning_rate": 3.9433654754811745e-06, - "loss": 0.7994861, - "num_input_tokens_seen": 37265595, - "step": 1724, - "time_per_iteration": 2.7613437175750732 - }, - { - "auxiliary_loss_clip": 0.01151829, - "auxiliary_loss_mlp": 0.01069246, - "balance_loss_clip": 1.05667496, - "balance_loss_mlp": 1.04753852, - "epoch": 0.1037126108522471, - "flos": 47553555335040.0, - "grad_norm": 2.6433978354033543, - "language_loss": 0.74533165, - "learning_rate": 3.943273412987676e-06, - "loss": 0.76754242, - "num_input_tokens_seen": 37286660, - "step": 1725, - "time_per_iteration": 4.557274580001831 - }, - { - "auxiliary_loss_clip": 0.01137065, - "auxiliary_loss_mlp": 0.01081067, - "balance_loss_clip": 1.05264461, - "balance_loss_mlp": 1.05832207, - "epoch": 0.10377273410491508, - "flos": 22816670572800.0, - "grad_norm": 2.2241153649877865, - "language_loss": 0.75043738, - "learning_rate": 3.943181276805054e-06, - "loss": 0.77261865, - "num_input_tokens_seen": 37304915, - "step": 1726, - "time_per_iteration": 2.7098495960235596 - }, - { - "auxiliary_loss_clip": 0.01150932, - "auxiliary_loss_mlp": 0.0107864, - "balance_loss_clip": 1.05345368, - "balance_loss_mlp": 1.05610991, - "epoch": 0.10383285735758305, - "flos": 26138694890880.0, - "grad_norm": 2.783771441956431, - "language_loss": 0.73243797, - "learning_rate": 3.9430890669368035e-06, - "loss": 0.75473368, - "num_input_tokens_seen": 37325265, - "step": 1727, - "time_per_iteration": 2.74774169921875 - }, - { - "auxiliary_loss_clip": 0.01157922, - "auxiliary_loss_mlp": 0.01068007, - "balance_loss_clip": 1.05303776, - "balance_loss_mlp": 1.04625082, - "epoch": 0.10389298061025101, - "flos": 17091791994240.0, - "grad_norm": 2.172978726198527, - "language_loss": 0.84373868, - "learning_rate": 3.942996783386422e-06, - "loss": 0.86599791, - "num_input_tokens_seen": 37341650, - "step": 1728, - "time_per_iteration": 2.675724744796753 - }, - { - "auxiliary_loss_clip": 0.01154897, - "auxiliary_loss_mlp": 0.01060505, - "balance_loss_clip": 1.0545603, - "balance_loss_mlp": 1.0393219, - "epoch": 0.10395310386291898, - "flos": 20776513893120.0, - "grad_norm": 2.1406499008555513, - "language_loss": 0.70776087, - "learning_rate": 3.942904426157406e-06, - "loss": 0.7299149, - "num_input_tokens_seen": 37360270, - "step": 1729, - "time_per_iteration": 2.6885008811950684 - }, - { - "auxiliary_loss_clip": 0.01158623, - "auxiliary_loss_mlp": 0.01068311, - "balance_loss_clip": 1.05437422, - "balance_loss_mlp": 1.04520774, - "epoch": 0.10401322711558696, - "flos": 12820540913280.0, - "grad_norm": 2.4133379049648283, - "language_loss": 0.81237471, - "learning_rate": 3.9428119952532605e-06, - "loss": 0.83464402, - "num_input_tokens_seen": 37375225, - "step": 1730, - "time_per_iteration": 2.6659536361694336 - }, - { - "auxiliary_loss_clip": 0.01085856, - "auxiliary_loss_mlp": 0.01063394, - "balance_loss_clip": 1.04733562, - "balance_loss_mlp": 1.04314065, - "epoch": 0.10407335036825492, - "flos": 23184683366400.0, - "grad_norm": 1.6634499611984725, - "language_loss": 0.75829297, - "learning_rate": 3.942719490677489e-06, - "loss": 0.77978551, - "num_input_tokens_seen": 37395165, - "step": 1731, - "time_per_iteration": 3.043125629425049 - }, - { - "auxiliary_loss_clip": 0.01129913, - "auxiliary_loss_mlp": 0.01065783, - "balance_loss_clip": 1.0526607, - "balance_loss_mlp": 1.04604149, - "epoch": 0.10413347362092289, - "flos": 26104184899200.0, - "grad_norm": 1.8280179918091173, - "language_loss": 0.8268069, - "learning_rate": 3.9426269124336e-06, - "loss": 0.84876388, - "num_input_tokens_seen": 37414845, - "step": 1732, - "time_per_iteration": 2.96221661567688 - }, - { - "auxiliary_loss_clip": 0.01141505, - "auxiliary_loss_mlp": 0.01067805, - "balance_loss_clip": 1.05805755, - "balance_loss_mlp": 1.04852867, - "epoch": 0.10419359687359087, - "flos": 12641059630080.0, - "grad_norm": 1.9919813178368582, - "language_loss": 0.83320522, - "learning_rate": 3.942534260525104e-06, - "loss": 0.85529828, - "num_input_tokens_seen": 37432490, - "step": 1733, - "time_per_iteration": 2.7364420890808105 - }, - { - "auxiliary_loss_clip": 0.01153374, - "auxiliary_loss_mlp": 0.0106675, - "balance_loss_clip": 1.05592012, - "balance_loss_mlp": 1.04654372, - "epoch": 0.10425372012625883, - "flos": 12125094716160.0, - "grad_norm": 2.4441875881355597, - "language_loss": 0.76683885, - "learning_rate": 3.942441534955514e-06, - "loss": 0.78904009, - "num_input_tokens_seen": 37449435, - "step": 1734, - "time_per_iteration": 2.669623851776123 - }, - { - "auxiliary_loss_clip": 0.0113597, - "auxiliary_loss_mlp": 0.01052567, - "balance_loss_clip": 1.05042601, - "balance_loss_mlp": 1.03255177, - "epoch": 0.1043138433789268, - "flos": 25337563902720.0, - "grad_norm": 1.6775801166329647, - "language_loss": 0.74826896, - "learning_rate": 3.9423487357283465e-06, - "loss": 0.7701543, - "num_input_tokens_seen": 37469105, - "step": 1735, - "time_per_iteration": 2.8477160930633545 - }, - { - "auxiliary_loss_clip": 0.01167698, - "auxiliary_loss_mlp": 0.01055716, - "balance_loss_clip": 1.05678105, - "balance_loss_mlp": 1.0344727, - "epoch": 0.10437396663159478, - "flos": 29167149352320.0, - "grad_norm": 1.7228393064183538, - "language_loss": 0.78835273, - "learning_rate": 3.94225586284712e-06, - "loss": 0.81058681, - "num_input_tokens_seen": 37490540, - "step": 1736, - "time_per_iteration": 2.690453052520752 - }, - { - "auxiliary_loss_clip": 0.0116734, - "auxiliary_loss_mlp": 0.01064692, - "balance_loss_clip": 1.05800533, - "balance_loss_mlp": 1.04357982, - "epoch": 0.10443408988426274, - "flos": 25080946162560.0, - "grad_norm": 1.8549131823334455, - "language_loss": 0.7058785, - "learning_rate": 3.942162916315356e-06, - "loss": 0.72819883, - "num_input_tokens_seen": 37511905, - "step": 1737, - "time_per_iteration": 2.6296744346618652 - }, - { - "auxiliary_loss_clip": 0.01150138, - "auxiliary_loss_mlp": 0.01059407, - "balance_loss_clip": 1.04806042, - "balance_loss_mlp": 1.03600669, - "epoch": 0.1044942131369307, - "flos": 26759662237440.0, - "grad_norm": 2.415613377802324, - "language_loss": 0.81624997, - "learning_rate": 3.942069896136581e-06, - "loss": 0.83834541, - "num_input_tokens_seen": 37533635, - "step": 1738, - "time_per_iteration": 2.7436723709106445 - }, - { - "auxiliary_loss_clip": 0.01181471, - "auxiliary_loss_mlp": 0.01062035, - "balance_loss_clip": 1.05579174, - "balance_loss_mlp": 1.03950453, - "epoch": 0.10455433638959867, - "flos": 18442571875200.0, - "grad_norm": 2.1004590024567897, - "language_loss": 0.75419426, - "learning_rate": 3.9419768023143196e-06, - "loss": 0.77662933, - "num_input_tokens_seen": 37552035, - "step": 1739, - "time_per_iteration": 2.585538148880005 - }, - { - "auxiliary_loss_clip": 0.01146716, - "auxiliary_loss_mlp": 0.01054893, - "balance_loss_clip": 1.05417264, - "balance_loss_mlp": 1.03348303, - "epoch": 0.10461445964226665, - "flos": 23218977876480.0, - "grad_norm": 1.586314706443492, - "language_loss": 0.77523744, - "learning_rate": 3.941883634852104e-06, - "loss": 0.79725355, - "num_input_tokens_seen": 37571540, - "step": 1740, - "time_per_iteration": 2.8947789669036865 - }, - { - "auxiliary_loss_clip": 0.01152077, - "auxiliary_loss_mlp": 0.01049503, - "balance_loss_clip": 1.05725431, - "balance_loss_mlp": 1.0288676, - "epoch": 0.10467458289493461, - "flos": 24345243797760.0, - "grad_norm": 1.964868695493703, - "language_loss": 0.85976374, - "learning_rate": 3.941790393753467e-06, - "loss": 0.88177955, - "num_input_tokens_seen": 37588265, - "step": 1741, - "time_per_iteration": 2.7706260681152344 - }, - { - "auxiliary_loss_clip": 0.01158134, - "auxiliary_loss_mlp": 0.01056311, - "balance_loss_clip": 1.05614483, - "balance_loss_mlp": 1.03350592, - "epoch": 0.10473470614760258, - "flos": 21287953693440.0, - "grad_norm": 5.197245251055922, - "language_loss": 0.75592613, - "learning_rate": 3.941697079021942e-06, - "loss": 0.77807057, - "num_input_tokens_seen": 37606860, - "step": 1742, - "time_per_iteration": 2.784748077392578 - }, - { - "auxiliary_loss_clip": 0.0113066, - "auxiliary_loss_mlp": 0.01057571, - "balance_loss_clip": 1.05678856, - "balance_loss_mlp": 1.03735304, - "epoch": 0.10479482940027056, - "flos": 21687208341120.0, - "grad_norm": 2.1426857583950416, - "language_loss": 0.87614191, - "learning_rate": 3.94160369066107e-06, - "loss": 0.89802414, - "num_input_tokens_seen": 37625210, - "step": 1743, - "time_per_iteration": 2.819350004196167 - }, - { - "auxiliary_loss_clip": 0.01139959, - "auxiliary_loss_mlp": 0.01048534, - "balance_loss_clip": 1.0552268, - "balance_loss_mlp": 1.0254786, - "epoch": 0.10485495265293852, - "flos": 21573694385280.0, - "grad_norm": 2.060686178474056, - "language_loss": 0.75927812, - "learning_rate": 3.941510228674391e-06, - "loss": 0.7811631, - "num_input_tokens_seen": 37644110, - "step": 1744, - "time_per_iteration": 2.7817211151123047 - }, - { - "auxiliary_loss_clip": 0.01170232, - "auxiliary_loss_mlp": 0.01054483, - "balance_loss_clip": 1.05992889, - "balance_loss_mlp": 1.03442037, - "epoch": 0.10491507590560649, - "flos": 37961923708800.0, - "grad_norm": 1.9689383181633062, - "language_loss": 0.78905094, - "learning_rate": 3.941416693065451e-06, - "loss": 0.81129813, - "num_input_tokens_seen": 37665800, - "step": 1745, - "time_per_iteration": 2.88080096244812 - }, - { - "auxiliary_loss_clip": 0.01180482, - "auxiliary_loss_mlp": 0.01060479, - "balance_loss_clip": 1.05740213, - "balance_loss_mlp": 1.03920031, - "epoch": 0.10497519915827447, - "flos": 26396282298240.0, - "grad_norm": 2.64819141351011, - "language_loss": 0.82568693, - "learning_rate": 3.941323083837794e-06, - "loss": 0.84809649, - "num_input_tokens_seen": 37685095, - "step": 1746, - "time_per_iteration": 2.7068004608154297 - }, - { - "auxiliary_loss_clip": 0.01158367, - "auxiliary_loss_mlp": 0.0105595, - "balance_loss_clip": 1.05737162, - "balance_loss_mlp": 1.03448033, - "epoch": 0.10503532241094243, - "flos": 40662190581120.0, - "grad_norm": 1.6274602877205533, - "language_loss": 0.70573747, - "learning_rate": 3.941229400994971e-06, - "loss": 0.7278806, - "num_input_tokens_seen": 37707445, - "step": 1747, - "time_per_iteration": 2.8689963817596436 - }, - { - "auxiliary_loss_clip": 0.01159389, - "auxiliary_loss_mlp": 0.01056346, - "balance_loss_clip": 1.06035507, - "balance_loss_mlp": 1.03492367, - "epoch": 0.1050954456636104, - "flos": 29789409588480.0, - "grad_norm": 2.386885173400054, - "language_loss": 0.8447504, - "learning_rate": 3.941135644540535e-06, - "loss": 0.86690772, - "num_input_tokens_seen": 37728325, - "step": 1748, - "time_per_iteration": 2.8022749423980713 - }, - { - "auxiliary_loss_clip": 0.01175489, - "auxiliary_loss_mlp": 0.01049407, - "balance_loss_clip": 1.05471563, - "balance_loss_mlp": 1.02701974, - "epoch": 0.10515556891627838, - "flos": 23948754497280.0, - "grad_norm": 1.759895679837136, - "language_loss": 0.71681082, - "learning_rate": 3.941041814478041e-06, - "loss": 0.73905981, - "num_input_tokens_seen": 37748910, - "step": 1749, - "time_per_iteration": 2.6568849086761475 - }, - { - "auxiliary_loss_clip": 0.01158221, - "auxiliary_loss_mlp": 0.01058697, - "balance_loss_clip": 1.05427456, - "balance_loss_mlp": 1.03590393, - "epoch": 0.10521569216894634, - "flos": 18259606972800.0, - "grad_norm": 2.95022560634889, - "language_loss": 0.81510806, - "learning_rate": 3.940947910811047e-06, - "loss": 0.83727717, - "num_input_tokens_seen": 37765745, - "step": 1750, - "time_per_iteration": 2.6282739639282227 - }, - { - "auxiliary_loss_clip": 0.01156475, - "auxiliary_loss_mlp": 0.01062657, - "balance_loss_clip": 1.06022298, - "balance_loss_mlp": 1.03973269, - "epoch": 0.10527581542161431, - "flos": 15630909949440.0, - "grad_norm": 2.2218325288878953, - "language_loss": 0.92364043, - "learning_rate": 3.940853933543114e-06, - "loss": 0.94583178, - "num_input_tokens_seen": 37780520, - "step": 1751, - "time_per_iteration": 2.703376531600952 - }, - { - "auxiliary_loss_clip": 0.01165779, - "auxiliary_loss_mlp": 0.01053304, - "balance_loss_clip": 1.0570029, - "balance_loss_mlp": 1.03171563, - "epoch": 0.10533593867428227, - "flos": 18296559089280.0, - "grad_norm": 2.0356912608722877, - "language_loss": 0.79293752, - "learning_rate": 3.940759882677805e-06, - "loss": 0.81512833, - "num_input_tokens_seen": 37799515, - "step": 1752, - "time_per_iteration": 2.6501150131225586 - }, - { - "auxiliary_loss_clip": 0.01116865, - "auxiliary_loss_mlp": 0.01055489, - "balance_loss_clip": 1.05116987, - "balance_loss_mlp": 1.03264856, - "epoch": 0.10539606192695025, - "flos": 29023219555200.0, - "grad_norm": 2.022904639316529, - "language_loss": 0.75978744, - "learning_rate": 3.940665758218686e-06, - "loss": 0.78151095, - "num_input_tokens_seen": 37818695, - "step": 1753, - "time_per_iteration": 2.871335744857788 - }, - { - "auxiliary_loss_clip": 0.01141721, - "auxiliary_loss_mlp": 0.01057356, - "balance_loss_clip": 1.05547547, - "balance_loss_mlp": 1.03415775, - "epoch": 0.10545618517961822, - "flos": 19969313506560.0, - "grad_norm": 2.0563919939847914, - "language_loss": 0.83969283, - "learning_rate": 3.940571560169328e-06, - "loss": 0.86168355, - "num_input_tokens_seen": 37837860, - "step": 1754, - "time_per_iteration": 2.685591459274292 - }, - { - "auxiliary_loss_clip": 0.01136802, - "auxiliary_loss_mlp": 0.01053577, - "balance_loss_clip": 1.05587101, - "balance_loss_mlp": 1.03034329, - "epoch": 0.10551630843228618, - "flos": 16143427157760.0, - "grad_norm": 2.7567281016961087, - "language_loss": 0.68732727, - "learning_rate": 3.940477288533302e-06, - "loss": 0.70923102, - "num_input_tokens_seen": 37856260, - "step": 1755, - "time_per_iteration": 2.754117727279663 - }, - { - "auxiliary_loss_clip": 0.01161626, - "auxiliary_loss_mlp": 0.010623, - "balance_loss_clip": 1.05367684, - "balance_loss_mlp": 1.040187, - "epoch": 0.10557643168495416, - "flos": 23440115957760.0, - "grad_norm": 2.26658946748733, - "language_loss": 0.76382339, - "learning_rate": 3.940382943314182e-06, - "loss": 0.7860626, - "num_input_tokens_seen": 37876960, - "step": 1756, - "time_per_iteration": 2.686790943145752 - }, - { - "auxiliary_loss_clip": 0.01182062, - "auxiliary_loss_mlp": 0.01062906, - "balance_loss_clip": 1.05688286, - "balance_loss_mlp": 1.04203284, - "epoch": 0.10563655493762213, - "flos": 21799034357760.0, - "grad_norm": 1.5917029795724482, - "language_loss": 0.79926664, - "learning_rate": 3.940288524515547e-06, - "loss": 0.82171631, - "num_input_tokens_seen": 37897070, - "step": 1757, - "time_per_iteration": 2.6543681621551514 - }, - { - "auxiliary_loss_clip": 0.01149304, - "auxiliary_loss_mlp": 0.01057523, - "balance_loss_clip": 1.0524838, - "balance_loss_mlp": 1.03563643, - "epoch": 0.10569667819029009, - "flos": 53800863275520.0, - "grad_norm": 1.6583181970862437, - "language_loss": 0.78714895, - "learning_rate": 3.940194032140976e-06, - "loss": 0.80921721, - "num_input_tokens_seen": 37923635, - "step": 1758, - "time_per_iteration": 3.013157367706299 - }, - { - "auxiliary_loss_clip": 0.01165597, - "auxiliary_loss_mlp": 0.01054919, - "balance_loss_clip": 1.05894113, - "balance_loss_mlp": 1.03347349, - "epoch": 0.10575680144295807, - "flos": 22925515760640.0, - "grad_norm": 1.870482409236857, - "language_loss": 0.91388202, - "learning_rate": 3.940099466194054e-06, - "loss": 0.93608713, - "num_input_tokens_seen": 37942650, - "step": 1759, - "time_per_iteration": 4.1841137409210205 - }, - { - "auxiliary_loss_clip": 0.0115455, - "auxiliary_loss_mlp": 0.01056708, - "balance_loss_clip": 1.05242109, - "balance_loss_mlp": 1.03346229, - "epoch": 0.10581692469562604, - "flos": 14136667148160.0, - "grad_norm": 2.509404173865799, - "language_loss": 0.77406812, - "learning_rate": 3.940004826678365e-06, - "loss": 0.79618067, - "num_input_tokens_seen": 37960660, - "step": 1760, - "time_per_iteration": 4.476959228515625 - }, - { - "auxiliary_loss_clip": 0.01161737, - "auxiliary_loss_mlp": 0.01064522, - "balance_loss_clip": 1.0536418, - "balance_loss_mlp": 1.04053712, - "epoch": 0.105877047948294, - "flos": 25958674903680.0, - "grad_norm": 2.27300461956159, - "language_loss": 0.88896096, - "learning_rate": 3.939910113597498e-06, - "loss": 0.91122353, - "num_input_tokens_seen": 37978625, - "step": 1761, - "time_per_iteration": 2.6907520294189453 - }, - { - "auxiliary_loss_clip": 0.01110571, - "auxiliary_loss_mlp": 0.00782389, - "balance_loss_clip": 1.04964042, - "balance_loss_mlp": 1.00012767, - "epoch": 0.10593717120096197, - "flos": 30664768032000.0, - "grad_norm": 2.010693315376097, - "language_loss": 0.7809304, - "learning_rate": 3.9398153269550464e-06, - "loss": 0.79986, - "num_input_tokens_seen": 38000005, - "step": 1762, - "time_per_iteration": 2.869051456451416 - }, - { - "auxiliary_loss_clip": 0.01053171, - "auxiliary_loss_mlp": 0.0105371, - "balance_loss_clip": 1.02694225, - "balance_loss_mlp": 1.05056334, - "epoch": 0.10599729445362994, - "flos": 66436682497920.0, - "grad_norm": 0.8956567750819878, - "language_loss": 0.60503203, - "learning_rate": 3.939720466754602e-06, - "loss": 0.6261009, - "num_input_tokens_seen": 38066165, - "step": 1763, - "time_per_iteration": 5.049196720123291 - }, - { - "auxiliary_loss_clip": 0.01156865, - "auxiliary_loss_mlp": 0.01048706, - "balance_loss_clip": 1.05424261, - "balance_loss_mlp": 1.02708137, - "epoch": 0.10605741770629791, - "flos": 23948179879680.0, - "grad_norm": 2.0510547250099633, - "language_loss": 0.80232942, - "learning_rate": 3.939625532999763e-06, - "loss": 0.82438517, - "num_input_tokens_seen": 38086150, - "step": 1764, - "time_per_iteration": 4.288762807846069 - }, - { - "auxiliary_loss_clip": 0.01136032, - "auxiliary_loss_mlp": 0.01055975, - "balance_loss_clip": 1.04879069, - "balance_loss_mlp": 1.03218043, - "epoch": 0.10611754095896588, - "flos": 19387524919680.0, - "grad_norm": 1.693202084864273, - "language_loss": 0.801691, - "learning_rate": 3.9395305256941314e-06, - "loss": 0.82361102, - "num_input_tokens_seen": 38104205, - "step": 1765, - "time_per_iteration": 2.931269407272339 - }, - { - "auxiliary_loss_clip": 0.01163261, - "auxiliary_loss_mlp": 0.01058956, - "balance_loss_clip": 1.05457163, - "balance_loss_mlp": 1.0367949, - "epoch": 0.10617766421163385, - "flos": 22237755073920.0, - "grad_norm": 1.7665774264343403, - "language_loss": 0.76864165, - "learning_rate": 3.939435444841306e-06, - "loss": 0.79086387, - "num_input_tokens_seen": 38122005, - "step": 1766, - "time_per_iteration": 2.5976176261901855 - }, - { - "auxiliary_loss_clip": 0.01182495, - "auxiliary_loss_mlp": 0.01059246, - "balance_loss_clip": 1.05923963, - "balance_loss_mlp": 1.03766894, - "epoch": 0.10623778746430182, - "flos": 28404407024640.0, - "grad_norm": 1.6265727447650185, - "language_loss": 0.77311498, - "learning_rate": 3.939340290444895e-06, - "loss": 0.79553241, - "num_input_tokens_seen": 38143365, - "step": 1767, - "time_per_iteration": 2.6356630325317383 - }, - { - "auxiliary_loss_clip": 0.01006515, - "auxiliary_loss_mlp": 0.01018751, - "balance_loss_clip": 1.03004837, - "balance_loss_mlp": 1.0151509, - "epoch": 0.10629791071696978, - "flos": 64234639221120.0, - "grad_norm": 0.9172341423433896, - "language_loss": 0.57889944, - "learning_rate": 3.939245062508506e-06, - "loss": 0.59915209, - "num_input_tokens_seen": 38210035, - "step": 1768, - "time_per_iteration": 3.6866471767425537 - }, - { - "auxiliary_loss_clip": 0.01144481, - "auxiliary_loss_mlp": 0.01047419, - "balance_loss_clip": 1.0546546, - "balance_loss_mlp": 1.02687907, - "epoch": 0.10635803396963776, - "flos": 22747578762240.0, - "grad_norm": 1.4529696494540971, - "language_loss": 0.86711109, - "learning_rate": 3.939149761035749e-06, - "loss": 0.8890301, - "num_input_tokens_seen": 38231230, - "step": 1769, - "time_per_iteration": 3.936905860900879 - }, - { - "auxiliary_loss_clip": 0.01141219, - "auxiliary_loss_mlp": 0.00780338, - "balance_loss_clip": 1.05321527, - "balance_loss_mlp": 1.00008726, - "epoch": 0.10641815722230573, - "flos": 31395586147200.0, - "grad_norm": 1.8275276693890916, - "language_loss": 0.61906171, - "learning_rate": 3.9390543860302395e-06, - "loss": 0.63827729, - "num_input_tokens_seen": 38253890, - "step": 1770, - "time_per_iteration": 2.8926138877868652 - }, - { - "auxiliary_loss_clip": 0.01057689, - "auxiliary_loss_mlp": 0.01010808, - "balance_loss_clip": 1.02007711, - "balance_loss_mlp": 1.00775671, - "epoch": 0.1064782804749737, - "flos": 58552527784320.0, - "grad_norm": 0.9163874753670794, - "language_loss": 0.57049137, - "learning_rate": 3.9389589374955925e-06, - "loss": 0.59117633, - "num_input_tokens_seen": 38304290, - "step": 1771, - "time_per_iteration": 3.0783088207244873 - }, - { - "auxiliary_loss_clip": 0.01146276, - "auxiliary_loss_mlp": 0.01065918, - "balance_loss_clip": 1.05574095, - "balance_loss_mlp": 1.04465103, - "epoch": 0.10653840372764166, - "flos": 23987825516160.0, - "grad_norm": 12.794881398939157, - "language_loss": 0.88265753, - "learning_rate": 3.938863415435429e-06, - "loss": 0.90477949, - "num_input_tokens_seen": 38324725, - "step": 1772, - "time_per_iteration": 2.770202159881592 - }, - { - "auxiliary_loss_clip": 0.0118421, - "auxiliary_loss_mlp": 0.01058161, - "balance_loss_clip": 1.05697048, - "balance_loss_mlp": 1.03497458, - "epoch": 0.10659852698030964, - "flos": 18294655668480.0, - "grad_norm": 2.576940958490313, - "language_loss": 0.76030588, - "learning_rate": 3.93876781985337e-06, - "loss": 0.78272957, - "num_input_tokens_seen": 38340735, - "step": 1773, - "time_per_iteration": 2.6177070140838623 - }, - { - "auxiliary_loss_clip": 0.01122733, - "auxiliary_loss_mlp": 0.01067657, - "balance_loss_clip": 1.04691553, - "balance_loss_mlp": 1.04205084, - "epoch": 0.1066586502329776, - "flos": 32160591031680.0, - "grad_norm": 1.868288871406422, - "language_loss": 0.8330853, - "learning_rate": 3.938672150753041e-06, - "loss": 0.85498923, - "num_input_tokens_seen": 38361315, - "step": 1774, - "time_per_iteration": 2.7396061420440674 - }, - { - "auxiliary_loss_clip": 0.01156305, - "auxiliary_loss_mlp": 0.00780518, - "balance_loss_clip": 1.05627465, - "balance_loss_mlp": 1.00011277, - "epoch": 0.10671877348564557, - "flos": 17785155202560.0, - "grad_norm": 2.73383407032925, - "language_loss": 0.76446521, - "learning_rate": 3.9385764081380704e-06, - "loss": 0.78383344, - "num_input_tokens_seen": 38377425, - "step": 1775, - "time_per_iteration": 2.624208927154541 - }, - { - "auxiliary_loss_clip": 0.01063199, - "auxiliary_loss_mlp": 0.01007654, - "balance_loss_clip": 1.01726675, - "balance_loss_mlp": 1.00443542, - "epoch": 0.10677889673831355, - "flos": 63510177813120.0, - "grad_norm": 0.8200823962511624, - "language_loss": 0.57477289, - "learning_rate": 3.9384805920120876e-06, - "loss": 0.5954814, - "num_input_tokens_seen": 38440275, - "step": 1776, - "time_per_iteration": 3.1782386302948 - }, - { - "auxiliary_loss_clip": 0.01150087, - "auxiliary_loss_mlp": 0.01066244, - "balance_loss_clip": 1.05192852, - "balance_loss_mlp": 1.0407691, - "epoch": 0.10683901999098151, - "flos": 22017694400640.0, - "grad_norm": 1.4232532718517703, - "language_loss": 0.83442962, - "learning_rate": 3.938384702378727e-06, - "loss": 0.85659301, - "num_input_tokens_seen": 38461820, - "step": 1777, - "time_per_iteration": 2.7342305183410645 - }, - { - "auxiliary_loss_clip": 0.01113855, - "auxiliary_loss_mlp": 0.00780712, - "balance_loss_clip": 1.04919302, - "balance_loss_mlp": 1.00015831, - "epoch": 0.10689914324364948, - "flos": 25042952551680.0, - "grad_norm": 1.8326039994575831, - "language_loss": 0.87207437, - "learning_rate": 3.938288739241625e-06, - "loss": 0.89102006, - "num_input_tokens_seen": 38482235, - "step": 1778, - "time_per_iteration": 2.859834671020508 - }, - { - "auxiliary_loss_clip": 0.01152509, - "auxiliary_loss_mlp": 0.00780436, - "balance_loss_clip": 1.06804752, - "balance_loss_mlp": 1.00019765, - "epoch": 0.10695926649631746, - "flos": 16435129507200.0, - "grad_norm": 2.4525249429301823, - "language_loss": 0.84165859, - "learning_rate": 3.938192702604417e-06, - "loss": 0.86098808, - "num_input_tokens_seen": 38500690, - "step": 1779, - "time_per_iteration": 2.81423020362854 - }, - { - "auxiliary_loss_clip": 0.01141718, - "auxiliary_loss_mlp": 0.00779857, - "balance_loss_clip": 1.05215359, - "balance_loss_mlp": 1.0001775, - "epoch": 0.10701938974898542, - "flos": 16979211792000.0, - "grad_norm": 1.9378348403129941, - "language_loss": 0.66915894, - "learning_rate": 3.9380965924707495e-06, - "loss": 0.68837464, - "num_input_tokens_seen": 38518405, - "step": 1780, - "time_per_iteration": 2.616684913635254 - }, - { - "auxiliary_loss_clip": 0.01166288, - "auxiliary_loss_mlp": 0.01054109, - "balance_loss_clip": 1.05843914, - "balance_loss_mlp": 1.03268683, - "epoch": 0.10707951300165339, - "flos": 15888102307200.0, - "grad_norm": 1.9168180254288365, - "language_loss": 0.92058647, - "learning_rate": 3.938000408844265e-06, - "loss": 0.94279045, - "num_input_tokens_seen": 38535060, - "step": 1781, - "time_per_iteration": 2.6167802810668945 - }, - { - "auxiliary_loss_clip": 0.0113109, - "auxiliary_loss_mlp": 0.01064554, - "balance_loss_clip": 1.0531441, - "balance_loss_mlp": 1.04344225, - "epoch": 0.10713963625432135, - "flos": 14247164361600.0, - "grad_norm": 1.8357670097294174, - "language_loss": 0.79336482, - "learning_rate": 3.9379041517286105e-06, - "loss": 0.81532121, - "num_input_tokens_seen": 38552855, - "step": 1782, - "time_per_iteration": 2.7669336795806885 - }, - { - "auxiliary_loss_clip": 0.01158369, - "auxiliary_loss_mlp": 0.01061646, - "balance_loss_clip": 1.05510604, - "balance_loss_mlp": 1.04016423, - "epoch": 0.10719975950698933, - "flos": 16756780821120.0, - "grad_norm": 2.0914095256513945, - "language_loss": 0.79086542, - "learning_rate": 3.937807821127436e-06, - "loss": 0.81306553, - "num_input_tokens_seen": 38570075, - "step": 1783, - "time_per_iteration": 2.6349542140960693 - }, - { - "auxiliary_loss_clip": 0.01164267, - "auxiliary_loss_mlp": 0.01065333, - "balance_loss_clip": 1.0570296, - "balance_loss_mlp": 1.04299295, - "epoch": 0.1072598827596573, - "flos": 22710626645760.0, - "grad_norm": 2.1874612027367806, - "language_loss": 0.86421812, - "learning_rate": 3.937711417044395e-06, - "loss": 0.88651407, - "num_input_tokens_seen": 38587970, - "step": 1784, - "time_per_iteration": 2.8452541828155518 - }, - { - "auxiliary_loss_clip": 0.01153461, - "auxiliary_loss_mlp": 0.01055605, - "balance_loss_clip": 1.05502176, - "balance_loss_mlp": 1.03321707, - "epoch": 0.10732000601232526, - "flos": 23258264376960.0, - "grad_norm": 2.4649130783319553, - "language_loss": 1.01192284, - "learning_rate": 3.937614939483143e-06, - "loss": 1.03401351, - "num_input_tokens_seen": 38605840, - "step": 1785, - "time_per_iteration": 2.690018653869629 - }, - { - "auxiliary_loss_clip": 0.01168517, - "auxiliary_loss_mlp": 0.01060763, - "balance_loss_clip": 1.05854678, - "balance_loss_mlp": 1.03984189, - "epoch": 0.10738012926499324, - "flos": 24207060176640.0, - "grad_norm": 1.397915549237645, - "language_loss": 0.84951413, - "learning_rate": 3.937518388447339e-06, - "loss": 0.87180698, - "num_input_tokens_seen": 38627070, - "step": 1786, - "time_per_iteration": 2.637430191040039 - }, - { - "auxiliary_loss_clip": 0.01183118, - "auxiliary_loss_mlp": 0.01059079, - "balance_loss_clip": 1.05716729, - "balance_loss_mlp": 1.03520155, - "epoch": 0.1074402525176612, - "flos": 20923065383040.0, - "grad_norm": 1.7951357311742837, - "language_loss": 0.78861409, - "learning_rate": 3.937421763940642e-06, - "loss": 0.81103605, - "num_input_tokens_seen": 38645840, - "step": 1787, - "time_per_iteration": 2.54508900642395 - }, - { - "auxiliary_loss_clip": 0.01174896, - "auxiliary_loss_mlp": 0.01047406, - "balance_loss_clip": 1.05971575, - "balance_loss_mlp": 1.02528071, - "epoch": 0.10750037577032917, - "flos": 16946928443520.0, - "grad_norm": 1.8536072321218278, - "language_loss": 0.82307518, - "learning_rate": 3.937325065966719e-06, - "loss": 0.84529817, - "num_input_tokens_seen": 38664770, - "step": 1788, - "time_per_iteration": 2.706247568130493 - }, - { - "auxiliary_loss_clip": 0.01180896, - "auxiliary_loss_mlp": 0.01064682, - "balance_loss_clip": 1.05843878, - "balance_loss_mlp": 1.04427314, - "epoch": 0.10756049902299715, - "flos": 20266546550400.0, - "grad_norm": 2.110245519520894, - "language_loss": 0.77840686, - "learning_rate": 3.9372282945292335e-06, - "loss": 0.80086267, - "num_input_tokens_seen": 38683865, - "step": 1789, - "time_per_iteration": 2.6274654865264893 - }, - { - "auxiliary_loss_clip": 0.01185566, - "auxiliary_loss_mlp": 0.01065099, - "balance_loss_clip": 1.0604099, - "balance_loss_mlp": 1.04049408, - "epoch": 0.10762062227566511, - "flos": 23586523793280.0, - "grad_norm": 2.7248977042722524, - "language_loss": 0.74817526, - "learning_rate": 3.937131449631859e-06, - "loss": 0.77068192, - "num_input_tokens_seen": 38702485, - "step": 1790, - "time_per_iteration": 2.624382972717285 - }, - { - "auxiliary_loss_clip": 0.01178128, - "auxiliary_loss_mlp": 0.00780572, - "balance_loss_clip": 1.06110644, - "balance_loss_mlp": 1.00021124, - "epoch": 0.10768074552833308, - "flos": 24310626065280.0, - "grad_norm": 2.350797373347828, - "language_loss": 0.78764236, - "learning_rate": 3.9370345312782645e-06, - "loss": 0.80722934, - "num_input_tokens_seen": 38722475, - "step": 1791, - "time_per_iteration": 2.696162223815918 - }, - { - "auxiliary_loss_clip": 0.01134133, - "auxiliary_loss_mlp": 0.01065057, - "balance_loss_clip": 1.05280125, - "balance_loss_mlp": 1.04117918, - "epoch": 0.10774086878100106, - "flos": 25299965341440.0, - "grad_norm": 1.5879424734455678, - "language_loss": 0.70638013, - "learning_rate": 3.936937539472126e-06, - "loss": 0.7283721, - "num_input_tokens_seen": 38743285, - "step": 1792, - "time_per_iteration": 2.770874261856079 - }, - { - "auxiliary_loss_clip": 0.01149934, - "auxiliary_loss_mlp": 0.01051019, - "balance_loss_clip": 1.05610943, - "balance_loss_mlp": 1.02764249, - "epoch": 0.10780099203366902, - "flos": 22054035985920.0, - "grad_norm": 1.920104493539276, - "language_loss": 0.76565266, - "learning_rate": 3.9368404742171236e-06, - "loss": 0.78766215, - "num_input_tokens_seen": 38763035, - "step": 1793, - "time_per_iteration": 2.7218761444091797 - }, - { - "auxiliary_loss_clip": 0.01116412, - "auxiliary_loss_mlp": 0.01064574, - "balance_loss_clip": 1.05029237, - "balance_loss_mlp": 1.0414238, - "epoch": 0.10786111528633699, - "flos": 22747471021440.0, - "grad_norm": 1.7475786500241859, - "language_loss": 0.85103315, - "learning_rate": 3.936743335516936e-06, - "loss": 0.87284303, - "num_input_tokens_seen": 38784900, - "step": 1794, - "time_per_iteration": 2.7590620517730713 - }, - { - "auxiliary_loss_clip": 0.01115198, - "auxiliary_loss_mlp": 0.01055294, - "balance_loss_clip": 1.04807687, - "balance_loss_mlp": 1.03146446, - "epoch": 0.10792123853900495, - "flos": 20851064570880.0, - "grad_norm": 2.5236234593460924, - "language_loss": 0.74585378, - "learning_rate": 3.936646123375246e-06, - "loss": 0.76755869, - "num_input_tokens_seen": 38804695, - "step": 1795, - "time_per_iteration": 2.8500585556030273 - }, - { - "auxiliary_loss_clip": 0.01124895, - "auxiliary_loss_mlp": 0.01058294, - "balance_loss_clip": 1.04831553, - "balance_loss_mlp": 1.03479767, - "epoch": 0.10798136179167293, - "flos": 17748705876480.0, - "grad_norm": 2.842374039298248, - "language_loss": 0.81653619, - "learning_rate": 3.936548837795741e-06, - "loss": 0.83836806, - "num_input_tokens_seen": 38822395, - "step": 1796, - "time_per_iteration": 2.7549750804901123 - }, - { - "auxiliary_loss_clip": 0.01140492, - "auxiliary_loss_mlp": 0.01083966, - "balance_loss_clip": 1.05246449, - "balance_loss_mlp": 1.05721593, - "epoch": 0.1080414850443409, - "flos": 13589639948160.0, - "grad_norm": 2.59635455269928, - "language_loss": 0.74233043, - "learning_rate": 3.936451478782111e-06, - "loss": 0.764575, - "num_input_tokens_seen": 38839865, - "step": 1797, - "time_per_iteration": 2.6396753787994385 - }, - { - "auxiliary_loss_clip": 0.01160286, - "auxiliary_loss_mlp": 0.01049954, - "balance_loss_clip": 1.05505061, - "balance_loss_mlp": 1.02874684, - "epoch": 0.10810160829700886, - "flos": 16253421580800.0, - "grad_norm": 2.0852339617015025, - "language_loss": 0.81855786, - "learning_rate": 3.936354046338046e-06, - "loss": 0.84066033, - "num_input_tokens_seen": 38857300, - "step": 1798, - "time_per_iteration": 2.7105324268341064 - }, - { - "auxiliary_loss_clip": 0.01142859, - "auxiliary_loss_mlp": 0.01054502, - "balance_loss_clip": 1.05379176, - "balance_loss_mlp": 1.03117299, - "epoch": 0.10816173154967684, - "flos": 15158002464000.0, - "grad_norm": 2.4443000829323687, - "language_loss": 0.85516405, - "learning_rate": 3.936256540467242e-06, - "loss": 0.87713766, - "num_input_tokens_seen": 38874960, - "step": 1799, - "time_per_iteration": 4.159978628158569 - }, - { - "auxiliary_loss_clip": 0.01154352, - "auxiliary_loss_mlp": 0.01062903, - "balance_loss_clip": 1.05493283, - "balance_loss_mlp": 1.04114687, - "epoch": 0.10822185480234481, - "flos": 17785334770560.0, - "grad_norm": 2.7405734706827825, - "language_loss": 0.77434146, - "learning_rate": 3.9361589611733955e-06, - "loss": 0.79651403, - "num_input_tokens_seen": 38893610, - "step": 1800, - "time_per_iteration": 4.52047872543335 - }, - { - "auxiliary_loss_clip": 0.01178634, - "auxiliary_loss_mlp": 0.0104758, - "balance_loss_clip": 1.05722904, - "balance_loss_mlp": 1.02689719, - "epoch": 0.10828197805501277, - "flos": 25556654908800.0, - "grad_norm": 1.582468034859118, - "language_loss": 0.72897375, - "learning_rate": 3.9360613084602075e-06, - "loss": 0.75123584, - "num_input_tokens_seen": 38913485, - "step": 1801, - "time_per_iteration": 4.291400909423828 - }, - { - "auxiliary_loss_clip": 0.01190595, - "auxiliary_loss_mlp": 0.01056056, - "balance_loss_clip": 1.06095624, - "balance_loss_mlp": 1.03478956, - "epoch": 0.10834210130768075, - "flos": 28984435845120.0, - "grad_norm": 1.951139287607183, - "language_loss": 0.6634692, - "learning_rate": 3.935963582331381e-06, - "loss": 0.68593562, - "num_input_tokens_seen": 38935650, - "step": 1802, - "time_per_iteration": 2.722628355026245 - }, - { - "auxiliary_loss_clip": 0.01155661, - "auxiliary_loss_mlp": 0.01059375, - "balance_loss_clip": 1.05326533, - "balance_loss_mlp": 1.03695142, - "epoch": 0.10840222456034872, - "flos": 20264212166400.0, - "grad_norm": 2.084551157592464, - "language_loss": 0.81612957, - "learning_rate": 3.935865782790621e-06, - "loss": 0.8382799, - "num_input_tokens_seen": 38954130, - "step": 1803, - "time_per_iteration": 4.239379167556763 - }, - { - "auxiliary_loss_clip": 0.01163104, - "auxiliary_loss_mlp": 0.01061781, - "balance_loss_clip": 1.0567112, - "balance_loss_mlp": 1.03921473, - "epoch": 0.10846234781301668, - "flos": 19863054097920.0, - "grad_norm": 1.9102934552723363, - "language_loss": 0.91127038, - "learning_rate": 3.9357679098416365e-06, - "loss": 0.93351918, - "num_input_tokens_seen": 38972905, - "step": 1804, - "time_per_iteration": 2.5836737155914307 - }, - { - "auxiliary_loss_clip": 0.01136188, - "auxiliary_loss_mlp": 0.01060133, - "balance_loss_clip": 1.05617714, - "balance_loss_mlp": 1.03718543, - "epoch": 0.10852247106568465, - "flos": 26469037296000.0, - "grad_norm": 2.5742522317806262, - "language_loss": 0.76198906, - "learning_rate": 3.935669963488139e-06, - "loss": 0.78395224, - "num_input_tokens_seen": 38993255, - "step": 1805, - "time_per_iteration": 2.783137321472168 - }, - { - "auxiliary_loss_clip": 0.01149468, - "auxiliary_loss_mlp": 0.01050946, - "balance_loss_clip": 1.05419612, - "balance_loss_mlp": 1.03050184, - "epoch": 0.10858259431835263, - "flos": 30081506987520.0, - "grad_norm": 1.7049574807827799, - "language_loss": 0.85876733, - "learning_rate": 3.935571943733843e-06, - "loss": 0.88077152, - "num_input_tokens_seen": 39012610, - "step": 1806, - "time_per_iteration": 2.8148701190948486 - }, - { - "auxiliary_loss_clip": 0.01168733, - "auxiliary_loss_mlp": 0.00779888, - "balance_loss_clip": 1.05462408, - "balance_loss_mlp": 1.00006652, - "epoch": 0.10864271757102059, - "flos": 19063180085760.0, - "grad_norm": 2.554050049117878, - "language_loss": 0.8108198, - "learning_rate": 3.9354738505824635e-06, - "loss": 0.83030605, - "num_input_tokens_seen": 39030120, - "step": 1807, - "time_per_iteration": 2.6275649070739746 - }, - { - "auxiliary_loss_clip": 0.01139085, - "auxiliary_loss_mlp": 0.01055438, - "balance_loss_clip": 1.05193985, - "balance_loss_mlp": 1.03522038, - "epoch": 0.10870284082368856, - "flos": 24715052271360.0, - "grad_norm": 1.834914777588586, - "language_loss": 0.78910971, - "learning_rate": 3.9353756840377225e-06, - "loss": 0.81105494, - "num_input_tokens_seen": 39049875, - "step": 1808, - "time_per_iteration": 2.722910165786743 - }, - { - "auxiliary_loss_clip": 0.01157997, - "auxiliary_loss_mlp": 0.01056971, - "balance_loss_clip": 1.05918014, - "balance_loss_mlp": 1.03548992, - "epoch": 0.10876296407635654, - "flos": 20627663932800.0, - "grad_norm": 1.6201371380093192, - "language_loss": 0.79013431, - "learning_rate": 3.935277444103342e-06, - "loss": 0.81228393, - "num_input_tokens_seen": 39068935, - "step": 1809, - "time_per_iteration": 2.7261481285095215 - }, - { - "auxiliary_loss_clip": 0.01180468, - "auxiliary_loss_mlp": 0.01057915, - "balance_loss_clip": 1.0568099, - "balance_loss_mlp": 1.03705359, - "epoch": 0.1088230873290245, - "flos": 21579835610880.0, - "grad_norm": 1.9004896030263678, - "language_loss": 0.85129547, - "learning_rate": 3.935179130783046e-06, - "loss": 0.87367928, - "num_input_tokens_seen": 39087370, - "step": 1810, - "time_per_iteration": 2.672696828842163 - }, - { - "auxiliary_loss_clip": 0.01124301, - "auxiliary_loss_mlp": 0.01057363, - "balance_loss_clip": 1.04580724, - "balance_loss_mlp": 1.0335803, - "epoch": 0.10888321058169247, - "flos": 26469037296000.0, - "grad_norm": 1.5993643379141278, - "language_loss": 0.63822675, - "learning_rate": 3.935080744080564e-06, - "loss": 0.66004336, - "num_input_tokens_seen": 39106635, - "step": 1811, - "time_per_iteration": 2.7731611728668213 - }, - { - "auxiliary_loss_clip": 0.01151891, - "auxiliary_loss_mlp": 0.01050225, - "balance_loss_clip": 1.05335796, - "balance_loss_mlp": 1.02836192, - "epoch": 0.10894333383436045, - "flos": 25848608653440.0, - "grad_norm": 1.9284151803363307, - "language_loss": 0.74238706, - "learning_rate": 3.934982283999626e-06, - "loss": 0.76440823, - "num_input_tokens_seen": 39126335, - "step": 1812, - "time_per_iteration": 2.727743625640869 - }, - { - "auxiliary_loss_clip": 0.01142498, - "auxiliary_loss_mlp": 0.01057826, - "balance_loss_clip": 1.05199611, - "balance_loss_mlp": 1.03546214, - "epoch": 0.10900345708702841, - "flos": 19537093152000.0, - "grad_norm": 1.5783196636767667, - "language_loss": 0.72746086, - "learning_rate": 3.934883750543966e-06, - "loss": 0.74946409, - "num_input_tokens_seen": 39144820, - "step": 1813, - "time_per_iteration": 2.798297166824341 - }, - { - "auxiliary_loss_clip": 0.0113892, - "auxiliary_loss_mlp": 0.01056639, - "balance_loss_clip": 1.0511452, - "balance_loss_mlp": 1.03515792, - "epoch": 0.10906358033969638, - "flos": 23623296341760.0, - "grad_norm": 1.635228619121262, - "language_loss": 0.82981038, - "learning_rate": 3.93478514371732e-06, - "loss": 0.85176599, - "num_input_tokens_seen": 39165945, - "step": 1814, - "time_per_iteration": 2.7120048999786377 - }, - { - "auxiliary_loss_clip": 0.01141958, - "auxiliary_loss_mlp": 0.01058857, - "balance_loss_clip": 1.0537864, - "balance_loss_mlp": 1.03787625, - "epoch": 0.10912370359236434, - "flos": 21214731818880.0, - "grad_norm": 1.9556743991494996, - "language_loss": 0.84310579, - "learning_rate": 3.934686463523429e-06, - "loss": 0.86511397, - "num_input_tokens_seen": 39183520, - "step": 1815, - "time_per_iteration": 2.788870096206665 - }, - { - "auxiliary_loss_clip": 0.01146878, - "auxiliary_loss_mlp": 0.01055141, - "balance_loss_clip": 1.05443966, - "balance_loss_mlp": 1.03182411, - "epoch": 0.10918382684503232, - "flos": 13553190622080.0, - "grad_norm": 2.5374826422013195, - "language_loss": 0.71670222, - "learning_rate": 3.9345877099660315e-06, - "loss": 0.73872244, - "num_input_tokens_seen": 39201190, - "step": 1816, - "time_per_iteration": 2.8424103260040283 - }, - { - "auxiliary_loss_clip": 0.01164173, - "auxiliary_loss_mlp": 0.01064184, - "balance_loss_clip": 1.05216932, - "balance_loss_mlp": 1.04052126, - "epoch": 0.10924395009770028, - "flos": 27964321591680.0, - "grad_norm": 2.016899555923086, - "language_loss": 0.72880268, - "learning_rate": 3.9344888830488744e-06, - "loss": 0.75108624, - "num_input_tokens_seen": 39221210, - "step": 1817, - "time_per_iteration": 2.7320947647094727 - }, - { - "auxiliary_loss_clip": 0.01116915, - "auxiliary_loss_mlp": 0.01057856, - "balance_loss_clip": 1.05173278, - "balance_loss_mlp": 1.03517008, - "epoch": 0.10930407335036825, - "flos": 25593750679680.0, - "grad_norm": 1.5988628345308824, - "language_loss": 0.67275256, - "learning_rate": 3.934389982775706e-06, - "loss": 0.69450033, - "num_input_tokens_seen": 39242025, - "step": 1818, - "time_per_iteration": 2.8700790405273438 - }, - { - "auxiliary_loss_clip": 0.01155804, - "auxiliary_loss_mlp": 0.01065952, - "balance_loss_clip": 1.05673873, - "balance_loss_mlp": 1.04313517, - "epoch": 0.10936419660303623, - "flos": 18406194376320.0, - "grad_norm": 3.593580913512793, - "language_loss": 0.73149616, - "learning_rate": 3.934291009150275e-06, - "loss": 0.75371373, - "num_input_tokens_seen": 39259870, - "step": 1819, - "time_per_iteration": 2.7091007232666016 - }, - { - "auxiliary_loss_clip": 0.01142955, - "auxiliary_loss_mlp": 0.00779155, - "balance_loss_clip": 1.05341268, - "balance_loss_mlp": 1.00027704, - "epoch": 0.1094243198557042, - "flos": 23840052963840.0, - "grad_norm": 4.531598275817935, - "language_loss": 0.73764241, - "learning_rate": 3.934191962176335e-06, - "loss": 0.75686359, - "num_input_tokens_seen": 39278500, - "step": 1820, - "time_per_iteration": 2.6513099670410156 - }, - { - "auxiliary_loss_clip": 0.01179358, - "auxiliary_loss_mlp": 0.01056073, - "balance_loss_clip": 1.05747604, - "balance_loss_mlp": 1.03297031, - "epoch": 0.10948444310837216, - "flos": 14643940970880.0, - "grad_norm": 2.2567103978329337, - "language_loss": 0.82532805, - "learning_rate": 3.934092841857642e-06, - "loss": 0.84768236, - "num_input_tokens_seen": 39294800, - "step": 1821, - "time_per_iteration": 2.5348384380340576 - }, - { - "auxiliary_loss_clip": 0.01148016, - "auxiliary_loss_mlp": 0.01052031, - "balance_loss_clip": 1.05133605, - "balance_loss_mlp": 1.03077567, - "epoch": 0.10954456636104014, - "flos": 27818811596160.0, - "grad_norm": 2.0770330480401578, - "language_loss": 0.76271641, - "learning_rate": 3.933993648197955e-06, - "loss": 0.7847169, - "num_input_tokens_seen": 39314625, - "step": 1822, - "time_per_iteration": 2.730079174041748 - }, - { - "auxiliary_loss_clip": 0.01142446, - "auxiliary_loss_mlp": 0.01049259, - "balance_loss_clip": 1.04849207, - "balance_loss_mlp": 1.02856421, - "epoch": 0.1096046896137081, - "flos": 33620934372480.0, - "grad_norm": 1.734419613996414, - "language_loss": 0.79309607, - "learning_rate": 3.933894381201034e-06, - "loss": 0.81501311, - "num_input_tokens_seen": 39336465, - "step": 1823, - "time_per_iteration": 2.756969928741455 - }, - { - "auxiliary_loss_clip": 0.01148165, - "auxiliary_loss_mlp": 0.01049595, - "balance_loss_clip": 1.05160606, - "balance_loss_mlp": 1.02745807, - "epoch": 0.10966481286637607, - "flos": 26980010219520.0, - "grad_norm": 1.4318009514182364, - "language_loss": 0.79590744, - "learning_rate": 3.933795040870645e-06, - "loss": 0.81788504, - "num_input_tokens_seen": 39357930, - "step": 1824, - "time_per_iteration": 2.798168182373047 - }, - { - "auxiliary_loss_clip": 0.01142146, - "auxiliary_loss_mlp": 0.01055513, - "balance_loss_clip": 1.05104232, - "balance_loss_mlp": 1.03381693, - "epoch": 0.10972493611904403, - "flos": 23036551678080.0, - "grad_norm": 2.127143421089703, - "language_loss": 0.88138539, - "learning_rate": 3.933695627210554e-06, - "loss": 0.90336192, - "num_input_tokens_seen": 39376380, - "step": 1825, - "time_per_iteration": 2.6804513931274414 - }, - { - "auxiliary_loss_clip": 0.01128623, - "auxiliary_loss_mlp": 0.01056127, - "balance_loss_clip": 1.04586983, - "balance_loss_mlp": 1.03439498, - "epoch": 0.10978505937171201, - "flos": 38104632443520.0, - "grad_norm": 1.721192594935189, - "language_loss": 0.76441038, - "learning_rate": 3.933596140224532e-06, - "loss": 0.78625786, - "num_input_tokens_seen": 39399935, - "step": 1826, - "time_per_iteration": 2.8315086364746094 - }, - { - "auxiliary_loss_clip": 0.01063155, - "auxiliary_loss_mlp": 0.01016957, - "balance_loss_clip": 1.02709544, - "balance_loss_mlp": 1.01409554, - "epoch": 0.10984518262437998, - "flos": 59849694616320.0, - "grad_norm": 0.8518463216820418, - "language_loss": 0.54997343, - "learning_rate": 3.93349657991635e-06, - "loss": 0.57077461, - "num_input_tokens_seen": 39460685, - "step": 1827, - "time_per_iteration": 3.1425766944885254 - }, - { - "auxiliary_loss_clip": 0.01072651, - "auxiliary_loss_mlp": 0.01010167, - "balance_loss_clip": 1.02693772, - "balance_loss_mlp": 1.00717473, - "epoch": 0.10990530587704794, - "flos": 66719837410560.0, - "grad_norm": 0.7375455878808789, - "language_loss": 0.55382878, - "learning_rate": 3.933396946289784e-06, - "loss": 0.57465696, - "num_input_tokens_seen": 39524765, - "step": 1828, - "time_per_iteration": 3.168165922164917 - }, - { - "auxiliary_loss_clip": 0.01156998, - "auxiliary_loss_mlp": 0.01059335, - "balance_loss_clip": 1.05407059, - "balance_loss_mlp": 1.03618491, - "epoch": 0.10996542912971592, - "flos": 25447199189760.0, - "grad_norm": 2.250827401167328, - "language_loss": 0.84010404, - "learning_rate": 3.933297239348612e-06, - "loss": 0.86226743, - "num_input_tokens_seen": 39543640, - "step": 1829, - "time_per_iteration": 2.7341628074645996 - }, - { - "auxiliary_loss_clip": 0.01130747, - "auxiliary_loss_mlp": 0.01053464, - "balance_loss_clip": 1.0547024, - "balance_loss_mlp": 1.03036165, - "epoch": 0.11002555238238389, - "flos": 44018186186880.0, - "grad_norm": 2.342204785330024, - "language_loss": 0.88880253, - "learning_rate": 3.933197459096614e-06, - "loss": 0.91064465, - "num_input_tokens_seen": 39567525, - "step": 1830, - "time_per_iteration": 2.9093260765075684 - }, - { - "auxiliary_loss_clip": 0.01049643, - "auxiliary_loss_mlp": 0.01009685, - "balance_loss_clip": 1.02618647, - "balance_loss_mlp": 1.00681162, - "epoch": 0.11008567563505185, - "flos": 54065133590400.0, - "grad_norm": 0.6882192363357665, - "language_loss": 0.55566543, - "learning_rate": 3.9330976055375756e-06, - "loss": 0.57625872, - "num_input_tokens_seen": 39628470, - "step": 1831, - "time_per_iteration": 3.1713974475860596 - }, - { - "auxiliary_loss_clip": 0.01156783, - "auxiliary_loss_mlp": 0.01073931, - "balance_loss_clip": 1.05708003, - "balance_loss_mlp": 1.04965997, - "epoch": 0.11014579888771983, - "flos": 24243150366720.0, - "grad_norm": 2.4937725361201495, - "language_loss": 0.90836191, - "learning_rate": 3.932997678675282e-06, - "loss": 0.93066907, - "num_input_tokens_seen": 39646670, - "step": 1832, - "time_per_iteration": 2.6786489486694336 - }, - { - "auxiliary_loss_clip": 0.0106111, - "auxiliary_loss_mlp": 0.01010664, - "balance_loss_clip": 1.02332854, - "balance_loss_mlp": 1.00769615, - "epoch": 0.1102059221403878, - "flos": 57743965658880.0, - "grad_norm": 0.7154576595208243, - "language_loss": 0.59911001, - "learning_rate": 3.932897678513523e-06, - "loss": 0.61982775, - "num_input_tokens_seen": 39712915, - "step": 1833, - "time_per_iteration": 3.1802401542663574 - }, - { - "auxiliary_loss_clip": 0.01167201, - "auxiliary_loss_mlp": 0.0105502, - "balance_loss_clip": 1.05312014, - "balance_loss_mlp": 1.03285873, - "epoch": 0.11026604539305576, - "flos": 16795923667200.0, - "grad_norm": 2.6772934272606923, - "language_loss": 0.80799395, - "learning_rate": 3.93279760505609e-06, - "loss": 0.83021617, - "num_input_tokens_seen": 39730650, - "step": 1834, - "time_per_iteration": 2.591374635696411 - }, - { - "auxiliary_loss_clip": 0.01141662, - "auxiliary_loss_mlp": 0.01054827, - "balance_loss_clip": 1.05557871, - "balance_loss_mlp": 1.03004324, - "epoch": 0.11032616864572373, - "flos": 23988076911360.0, - "grad_norm": 2.4853906687508247, - "language_loss": 0.89856094, - "learning_rate": 3.932697458306779e-06, - "loss": 0.92052579, - "num_input_tokens_seen": 39751065, - "step": 1835, - "time_per_iteration": 2.742330312728882 - }, - { - "auxiliary_loss_clip": 0.01131787, - "auxiliary_loss_mlp": 0.01063812, - "balance_loss_clip": 1.0524013, - "balance_loss_mlp": 1.03758645, - "epoch": 0.1103862918983917, - "flos": 19683141851520.0, - "grad_norm": 2.2754442269720023, - "language_loss": 0.63256055, - "learning_rate": 3.932597238269386e-06, - "loss": 0.65451658, - "num_input_tokens_seen": 39769245, - "step": 1836, - "time_per_iteration": 2.6935038566589355 - }, - { - "auxiliary_loss_clip": 0.01138919, - "auxiliary_loss_mlp": 0.01061469, - "balance_loss_clip": 1.05021358, - "balance_loss_mlp": 1.03954661, - "epoch": 0.11044641515105967, - "flos": 32160878340480.0, - "grad_norm": 1.6726289784191204, - "language_loss": 0.72792488, - "learning_rate": 3.932496944947711e-06, - "loss": 0.74992871, - "num_input_tokens_seen": 39790830, - "step": 1837, - "time_per_iteration": 2.7790510654449463 - }, - { - "auxiliary_loss_clip": 0.01165472, - "auxiliary_loss_mlp": 0.01057035, - "balance_loss_clip": 1.05463088, - "balance_loss_mlp": 1.03551781, - "epoch": 0.11050653840372764, - "flos": 16689233295360.0, - "grad_norm": 2.027055787194766, - "language_loss": 0.78489268, - "learning_rate": 3.93239657834556e-06, - "loss": 0.8071177, - "num_input_tokens_seen": 39809475, - "step": 1838, - "time_per_iteration": 4.098532438278198 - }, - { - "auxiliary_loss_clip": 0.01154042, - "auxiliary_loss_mlp": 0.01062407, - "balance_loss_clip": 1.05542612, - "balance_loss_mlp": 1.03970969, - "epoch": 0.11056666165639562, - "flos": 21208877902080.0, - "grad_norm": 2.046221888979386, - "language_loss": 0.71451718, - "learning_rate": 3.932296138466736e-06, - "loss": 0.7366817, - "num_input_tokens_seen": 39826355, - "step": 1839, - "time_per_iteration": 4.205714464187622 - }, - { - "auxiliary_loss_clip": 0.01187588, - "auxiliary_loss_mlp": 0.00781104, - "balance_loss_clip": 1.06183171, - "balance_loss_mlp": 1.00018013, - "epoch": 0.11062678490906358, - "flos": 19165488998400.0, - "grad_norm": 2.623062836625425, - "language_loss": 0.79027873, - "learning_rate": 3.93219562531505e-06, - "loss": 0.80996567, - "num_input_tokens_seen": 39845335, - "step": 1840, - "time_per_iteration": 2.6023378372192383 - }, - { - "auxiliary_loss_clip": 0.01156508, - "auxiliary_loss_mlp": 0.01052512, - "balance_loss_clip": 1.05206251, - "balance_loss_mlp": 1.02887261, - "epoch": 0.11068690816173155, - "flos": 24895287740160.0, - "grad_norm": 1.7551987843009527, - "language_loss": 0.88083529, - "learning_rate": 3.932095038894311e-06, - "loss": 0.90292549, - "num_input_tokens_seen": 39865065, - "step": 1841, - "time_per_iteration": 4.3361639976501465 - }, - { - "auxiliary_loss_clip": 0.01130203, - "auxiliary_loss_mlp": 0.01067683, - "balance_loss_clip": 1.05036247, - "balance_loss_mlp": 1.04453301, - "epoch": 0.11074703141439952, - "flos": 16472368932480.0, - "grad_norm": 3.1603067125494126, - "language_loss": 0.90521991, - "learning_rate": 3.931994379208334e-06, - "loss": 0.92719877, - "num_input_tokens_seen": 39882780, - "step": 1842, - "time_per_iteration": 2.7086760997772217 - }, - { - "auxiliary_loss_clip": 0.01152506, - "auxiliary_loss_mlp": 0.01061227, - "balance_loss_clip": 1.05065131, - "balance_loss_mlp": 1.03982854, - "epoch": 0.11080715466706749, - "flos": 19172420323200.0, - "grad_norm": 2.112801816568727, - "language_loss": 0.85845053, - "learning_rate": 3.931893646260937e-06, - "loss": 0.88058788, - "num_input_tokens_seen": 39900295, - "step": 1843, - "time_per_iteration": 4.263117790222168 - }, - { - "auxiliary_loss_clip": 0.01119254, - "auxiliary_loss_mlp": 0.00783076, - "balance_loss_clip": 1.05050898, - "balance_loss_mlp": 1.00012159, - "epoch": 0.11086727791973545, - "flos": 27704687109120.0, - "grad_norm": 1.4511349711086798, - "language_loss": 0.74735641, - "learning_rate": 3.931792840055941e-06, - "loss": 0.76637971, - "num_input_tokens_seen": 39922075, - "step": 1844, - "time_per_iteration": 2.7999000549316406 - }, - { - "auxiliary_loss_clip": 0.01180395, - "auxiliary_loss_mlp": 0.01055824, - "balance_loss_clip": 1.05662274, - "balance_loss_mlp": 1.03238785, - "epoch": 0.11092740117240343, - "flos": 18514967736960.0, - "grad_norm": 2.017286766878137, - "language_loss": 0.7566812, - "learning_rate": 3.931691960597165e-06, - "loss": 0.77904338, - "num_input_tokens_seen": 39940115, - "step": 1845, - "time_per_iteration": 2.5305535793304443 - }, - { - "auxiliary_loss_clip": 0.01153403, - "auxiliary_loss_mlp": 0.01058911, - "balance_loss_clip": 1.05442989, - "balance_loss_mlp": 1.03807366, - "epoch": 0.1109875244250714, - "flos": 20522446018560.0, - "grad_norm": 1.9628359583393364, - "language_loss": 0.75953126, - "learning_rate": 3.9315910078884375e-06, - "loss": 0.78165436, - "num_input_tokens_seen": 39959920, - "step": 1846, - "time_per_iteration": 2.719325542449951 - }, - { - "auxiliary_loss_clip": 0.01173899, - "auxiliary_loss_mlp": 0.01059369, - "balance_loss_clip": 1.05823123, - "balance_loss_mlp": 1.03717244, - "epoch": 0.11104764767773936, - "flos": 14098601710080.0, - "grad_norm": 2.612459533347621, - "language_loss": 0.8620472, - "learning_rate": 3.931489981933584e-06, - "loss": 0.88437986, - "num_input_tokens_seen": 39974755, - "step": 1847, - "time_per_iteration": 2.7705559730529785 - }, - { - "auxiliary_loss_clip": 0.01181158, - "auxiliary_loss_mlp": 0.01055145, - "balance_loss_clip": 1.05562854, - "balance_loss_mlp": 1.0322808, - "epoch": 0.11110777093040733, - "flos": 20594518657920.0, - "grad_norm": 1.8452742714770096, - "language_loss": 0.76981926, - "learning_rate": 3.931388882736438e-06, - "loss": 0.79218227, - "num_input_tokens_seen": 39993355, - "step": 1848, - "time_per_iteration": 2.605933666229248 - }, - { - "auxiliary_loss_clip": 0.01172398, - "auxiliary_loss_mlp": 0.01056349, - "balance_loss_clip": 1.06262445, - "balance_loss_mlp": 1.03455794, - "epoch": 0.11116789418307531, - "flos": 21870065502720.0, - "grad_norm": 1.6943193134392138, - "language_loss": 0.77621841, - "learning_rate": 3.931287710300832e-06, - "loss": 0.7985059, - "num_input_tokens_seen": 40012410, - "step": 1849, - "time_per_iteration": 2.678415536880493 - }, - { - "auxiliary_loss_clip": 0.01138995, - "auxiliary_loss_mlp": 0.00781122, - "balance_loss_clip": 1.05277848, - "balance_loss_mlp": 1.00010324, - "epoch": 0.11122801743574327, - "flos": 15523106256000.0, - "grad_norm": 3.3234972538165066, - "language_loss": 0.72098577, - "learning_rate": 3.931186464630601e-06, - "loss": 0.74018693, - "num_input_tokens_seen": 40029315, - "step": 1850, - "time_per_iteration": 2.7763028144836426 - }, - { - "auxiliary_loss_clip": 0.01170569, - "auxiliary_loss_mlp": 0.01061108, - "balance_loss_clip": 1.05759382, - "balance_loss_mlp": 1.03874469, - "epoch": 0.11128814068841124, - "flos": 14392279307520.0, - "grad_norm": 2.0638339407107873, - "language_loss": 0.81499028, - "learning_rate": 3.931085145729588e-06, - "loss": 0.83730704, - "num_input_tokens_seen": 40045765, - "step": 1851, - "time_per_iteration": 2.688854694366455 - }, - { - "auxiliary_loss_clip": 0.01164692, - "auxiliary_loss_mlp": 0.01061301, - "balance_loss_clip": 1.05789042, - "balance_loss_mlp": 1.04027295, - "epoch": 0.11134826394107922, - "flos": 16653933204480.0, - "grad_norm": 2.365035468310974, - "language_loss": 0.88270009, - "learning_rate": 3.930983753601631e-06, - "loss": 0.90496004, - "num_input_tokens_seen": 40061660, - "step": 1852, - "time_per_iteration": 2.659914493560791 - }, - { - "auxiliary_loss_clip": 0.01166772, - "auxiliary_loss_mlp": 0.01060698, - "balance_loss_clip": 1.05489326, - "balance_loss_mlp": 1.03791702, - "epoch": 0.11140838719374718, - "flos": 16690993061760.0, - "grad_norm": 2.1825610274136054, - "language_loss": 0.72492862, - "learning_rate": 3.930882288250578e-06, - "loss": 0.74720335, - "num_input_tokens_seen": 40080180, - "step": 1853, - "time_per_iteration": 2.7840964794158936 - }, - { - "auxiliary_loss_clip": 0.01069898, - "auxiliary_loss_mlp": 0.01019902, - "balance_loss_clip": 1.02549517, - "balance_loss_mlp": 1.01701725, - "epoch": 0.11146851044641515, - "flos": 60976355587200.0, - "grad_norm": 0.772231443606995, - "language_loss": 0.53664064, - "learning_rate": 3.930780749680273e-06, - "loss": 0.55753863, - "num_input_tokens_seen": 40138910, - "step": 1854, - "time_per_iteration": 3.089354991912842 - }, - { - "auxiliary_loss_clip": 0.01159576, - "auxiliary_loss_mlp": 0.0105585, - "balance_loss_clip": 1.05390525, - "balance_loss_mlp": 1.03184092, - "epoch": 0.11152863369908313, - "flos": 22193835719040.0, - "grad_norm": 1.863523240792578, - "language_loss": 0.8468501, - "learning_rate": 3.9306791378945705e-06, - "loss": 0.86900431, - "num_input_tokens_seen": 40157745, - "step": 1855, - "time_per_iteration": 2.7361156940460205 - }, - { - "auxiliary_loss_clip": 0.01147504, - "auxiliary_loss_mlp": 0.01064479, - "balance_loss_clip": 1.05225825, - "balance_loss_mlp": 1.0424726, - "epoch": 0.11158875695175109, - "flos": 19537524115200.0, - "grad_norm": 2.1217067547931756, - "language_loss": 0.81187081, - "learning_rate": 3.9305774528973205e-06, - "loss": 0.83399057, - "num_input_tokens_seen": 40175375, - "step": 1856, - "time_per_iteration": 2.7158002853393555 - }, - { - "auxiliary_loss_clip": 0.01168288, - "auxiliary_loss_mlp": 0.01052259, - "balance_loss_clip": 1.05843937, - "balance_loss_mlp": 1.02957392, - "epoch": 0.11164888020441906, - "flos": 25442709989760.0, - "grad_norm": 2.0555738298465314, - "language_loss": 0.82761133, - "learning_rate": 3.93047569469238e-06, - "loss": 0.8498168, - "num_input_tokens_seen": 40195715, - "step": 1857, - "time_per_iteration": 2.647184133529663 - }, - { - "auxiliary_loss_clip": 0.01144196, - "auxiliary_loss_mlp": 0.01044915, - "balance_loss_clip": 1.05255508, - "balance_loss_mlp": 1.02395833, - "epoch": 0.11170900345708702, - "flos": 15632741543040.0, - "grad_norm": 2.3199985887988914, - "language_loss": 0.83131742, - "learning_rate": 3.930373863283608e-06, - "loss": 0.85320854, - "num_input_tokens_seen": 40213975, - "step": 1858, - "time_per_iteration": 2.726905107498169 - }, - { - "auxiliary_loss_clip": 0.01134962, - "auxiliary_loss_mlp": 0.01067658, - "balance_loss_clip": 1.04900265, - "balance_loss_mlp": 1.04350638, - "epoch": 0.111769126709755, - "flos": 23039424766080.0, - "grad_norm": 2.0395414997027657, - "language_loss": 0.9133389, - "learning_rate": 3.930271958674866e-06, - "loss": 0.93536508, - "num_input_tokens_seen": 40233905, - "step": 1859, - "time_per_iteration": 3.0006766319274902 - }, - { - "auxiliary_loss_clip": 0.01167289, - "auxiliary_loss_mlp": 0.01049698, - "balance_loss_clip": 1.05445409, - "balance_loss_mlp": 1.02751315, - "epoch": 0.11182924996242297, - "flos": 20850705434880.0, - "grad_norm": 2.048197345879043, - "language_loss": 0.81528586, - "learning_rate": 3.930169980870018e-06, - "loss": 0.83745575, - "num_input_tokens_seen": 40252810, - "step": 1860, - "time_per_iteration": 2.7216553688049316 - }, - { - "auxiliary_loss_clip": 0.01154007, - "auxiliary_loss_mlp": 0.01060885, - "balance_loss_clip": 1.05737674, - "balance_loss_mlp": 1.03920078, - "epoch": 0.11188937321509093, - "flos": 17455315587840.0, - "grad_norm": 2.00330439318394, - "language_loss": 0.75250578, - "learning_rate": 3.930067929872931e-06, - "loss": 0.77465475, - "num_input_tokens_seen": 40272000, - "step": 1861, - "time_per_iteration": 2.6878490447998047 - }, - { - "auxiliary_loss_clip": 0.01177651, - "auxiliary_loss_mlp": 0.01054452, - "balance_loss_clip": 1.0565964, - "balance_loss_mlp": 1.03360212, - "epoch": 0.11194949646775891, - "flos": 24095916518400.0, - "grad_norm": 1.9427039767358767, - "language_loss": 0.88888168, - "learning_rate": 3.929965805687474e-06, - "loss": 0.91120267, - "num_input_tokens_seen": 40290660, - "step": 1862, - "time_per_iteration": 2.615057945251465 - }, - { - "auxiliary_loss_clip": 0.01164251, - "auxiliary_loss_mlp": 0.01062894, - "balance_loss_clip": 1.05994737, - "balance_loss_mlp": 1.04086459, - "epoch": 0.11200961972042688, - "flos": 25153880728320.0, - "grad_norm": 2.2273555113866847, - "language_loss": 0.87719512, - "learning_rate": 3.92986360831752e-06, - "loss": 0.89946657, - "num_input_tokens_seen": 40307820, - "step": 1863, - "time_per_iteration": 2.6778175830841064 - }, - { - "auxiliary_loss_clip": 0.01158667, - "auxiliary_loss_mlp": 0.01055299, - "balance_loss_clip": 1.05455208, - "balance_loss_mlp": 1.03071773, - "epoch": 0.11206974297309484, - "flos": 21288312829440.0, - "grad_norm": 2.8013407816012226, - "language_loss": 0.64245486, - "learning_rate": 3.929761337766945e-06, - "loss": 0.66459453, - "num_input_tokens_seen": 40327430, - "step": 1864, - "time_per_iteration": 2.724076509475708 - }, - { - "auxiliary_loss_clip": 0.01110154, - "auxiliary_loss_mlp": 0.01047933, - "balance_loss_clip": 1.04924703, - "balance_loss_mlp": 1.02672601, - "epoch": 0.11212986622576282, - "flos": 18915982151040.0, - "grad_norm": 2.0303098144917135, - "language_loss": 0.74043733, - "learning_rate": 3.929658994039627e-06, - "loss": 0.7620182, - "num_input_tokens_seen": 40344545, - "step": 1865, - "time_per_iteration": 2.8119356632232666 - }, - { - "auxiliary_loss_clip": 0.01114683, - "auxiliary_loss_mlp": 0.01070203, - "balance_loss_clip": 1.05348182, - "balance_loss_mlp": 1.04483545, - "epoch": 0.11218998947843078, - "flos": 22054754257920.0, - "grad_norm": 2.7389427033573375, - "language_loss": 0.84692436, - "learning_rate": 3.929556577139446e-06, - "loss": 0.86877316, - "num_input_tokens_seen": 40362300, - "step": 1866, - "time_per_iteration": 2.8022067546844482 - }, - { - "auxiliary_loss_clip": 0.01092364, - "auxiliary_loss_mlp": 0.00781014, - "balance_loss_clip": 1.04227424, - "balance_loss_mlp": 1.00006938, - "epoch": 0.11225011273109875, - "flos": 24571697091840.0, - "grad_norm": 1.704208120094955, - "language_loss": 0.8104012, - "learning_rate": 3.929454087070286e-06, - "loss": 0.82913494, - "num_input_tokens_seen": 40384720, - "step": 1867, - "time_per_iteration": 2.915989875793457 - }, - { - "auxiliary_loss_clip": 0.01179505, - "auxiliary_loss_mlp": 0.01060529, - "balance_loss_clip": 1.05720687, - "balance_loss_mlp": 1.03959608, - "epoch": 0.11231023598376672, - "flos": 28438665621120.0, - "grad_norm": 2.0811636681692844, - "language_loss": 0.86840278, - "learning_rate": 3.929351523836035e-06, - "loss": 0.8908031, - "num_input_tokens_seen": 40404000, - "step": 1868, - "time_per_iteration": 2.6855647563934326 - }, - { - "auxiliary_loss_clip": 0.01161412, - "auxiliary_loss_mlp": 0.00779977, - "balance_loss_clip": 1.06005311, - "balance_loss_mlp": 1.00010097, - "epoch": 0.1123703592364347, - "flos": 14426466076800.0, - "grad_norm": 2.1491178409138376, - "language_loss": 0.68308532, - "learning_rate": 3.9292488874405795e-06, - "loss": 0.70249927, - "num_input_tokens_seen": 40418665, - "step": 1869, - "time_per_iteration": 2.7404487133026123 - }, - { - "auxiliary_loss_clip": 0.01133783, - "auxiliary_loss_mlp": 0.01066188, - "balance_loss_clip": 1.04932964, - "balance_loss_mlp": 1.04225063, - "epoch": 0.11243048248910266, - "flos": 22236282616320.0, - "grad_norm": 1.5255545896853626, - "language_loss": 0.76943326, - "learning_rate": 3.929146177887814e-06, - "loss": 0.79143298, - "num_input_tokens_seen": 40437870, - "step": 1870, - "time_per_iteration": 2.809734344482422 - }, - { - "auxiliary_loss_clip": 0.01129358, - "auxiliary_loss_mlp": 0.01056867, - "balance_loss_clip": 1.0509038, - "balance_loss_mlp": 1.03300166, - "epoch": 0.11249060574177062, - "flos": 18584167288320.0, - "grad_norm": 1.8186132867503446, - "language_loss": 0.76056099, - "learning_rate": 3.929043395181631e-06, - "loss": 0.78242326, - "num_input_tokens_seen": 40455570, - "step": 1871, - "time_per_iteration": 2.727161169052124 - }, - { - "auxiliary_loss_clip": 0.01105662, - "auxiliary_loss_mlp": 0.01051114, - "balance_loss_clip": 1.04993379, - "balance_loss_mlp": 1.03026426, - "epoch": 0.1125507289944386, - "flos": 22856567604480.0, - "grad_norm": 1.9425066802508644, - "language_loss": 0.81811988, - "learning_rate": 3.928940539325929e-06, - "loss": 0.83968765, - "num_input_tokens_seen": 40473600, - "step": 1872, - "time_per_iteration": 2.851868152618408 - }, - { - "auxiliary_loss_clip": 0.01179923, - "auxiliary_loss_mlp": 0.01055722, - "balance_loss_clip": 1.05722499, - "balance_loss_mlp": 1.03359652, - "epoch": 0.11261085224710657, - "flos": 19676390094720.0, - "grad_norm": 2.186176467187071, - "language_loss": 0.8361913, - "learning_rate": 3.9288376103246095e-06, - "loss": 0.85854775, - "num_input_tokens_seen": 40490025, - "step": 1873, - "time_per_iteration": 2.6668763160705566 - }, - { - "auxiliary_loss_clip": 0.01144862, - "auxiliary_loss_mlp": 0.01054726, - "balance_loss_clip": 1.0525465, - "balance_loss_mlp": 1.03196871, - "epoch": 0.11267097549977453, - "flos": 26063246373120.0, - "grad_norm": 1.8822875514234196, - "language_loss": 0.92342389, - "learning_rate": 3.928734608181575e-06, - "loss": 0.94541967, - "num_input_tokens_seen": 40511580, - "step": 1874, - "time_per_iteration": 2.700533866882324 - }, - { - "auxiliary_loss_clip": 0.01140327, - "auxiliary_loss_mlp": 0.01056402, - "balance_loss_clip": 1.05100179, - "balance_loss_mlp": 1.03509891, - "epoch": 0.11273109875244251, - "flos": 21068036674560.0, - "grad_norm": 1.6564425098873434, - "language_loss": 0.75359404, - "learning_rate": 3.928631532900729e-06, - "loss": 0.77556133, - "num_input_tokens_seen": 40530155, - "step": 1875, - "time_per_iteration": 2.7642719745635986 - }, - { - "auxiliary_loss_clip": 0.01167091, - "auxiliary_loss_mlp": 0.01055271, - "balance_loss_clip": 1.05893159, - "balance_loss_mlp": 1.0348264, - "epoch": 0.11279122200511048, - "flos": 27088999061760.0, - "grad_norm": 2.12758140825061, - "language_loss": 0.71578634, - "learning_rate": 3.928528384485984e-06, - "loss": 0.73800993, - "num_input_tokens_seen": 40549500, - "step": 1876, - "time_per_iteration": 2.8505096435546875 - }, - { - "auxiliary_loss_clip": 0.01147417, - "auxiliary_loss_mlp": 0.01054094, - "balance_loss_clip": 1.05223966, - "balance_loss_mlp": 1.03200495, - "epoch": 0.11285134525777844, - "flos": 20187901722240.0, - "grad_norm": 1.8103612630164048, - "language_loss": 0.76795971, - "learning_rate": 3.9284251629412475e-06, - "loss": 0.78997481, - "num_input_tokens_seen": 40567475, - "step": 1877, - "time_per_iteration": 2.6972849369049072 - }, - { - "auxiliary_loss_clip": 0.01168106, - "auxiliary_loss_mlp": 0.01063056, - "balance_loss_clip": 1.05518627, - "balance_loss_mlp": 1.04026341, - "epoch": 0.11291146851044641, - "flos": 12458453863680.0, - "grad_norm": 2.1601834607000368, - "language_loss": 0.87843502, - "learning_rate": 3.928321868270436e-06, - "loss": 0.90074658, - "num_input_tokens_seen": 40583280, - "step": 1878, - "time_per_iteration": 5.6992692947387695 - }, - { - "auxiliary_loss_clip": 0.01140682, - "auxiliary_loss_mlp": 0.01054902, - "balance_loss_clip": 1.05420399, - "balance_loss_mlp": 1.03333724, - "epoch": 0.11297159176311439, - "flos": 23842315520640.0, - "grad_norm": 2.151084139284284, - "language_loss": 0.81623232, - "learning_rate": 3.928218500477466e-06, - "loss": 0.83818817, - "num_input_tokens_seen": 40603080, - "step": 1879, - "time_per_iteration": 2.8688366413116455 - }, - { - "auxiliary_loss_clip": 0.01155904, - "auxiliary_loss_mlp": 0.01059079, - "balance_loss_clip": 1.05238748, - "balance_loss_mlp": 1.03609526, - "epoch": 0.11303171501578235, - "flos": 29930538124800.0, - "grad_norm": 1.941623939252122, - "language_loss": 0.70234305, - "learning_rate": 3.928115059566259e-06, - "loss": 0.72449279, - "num_input_tokens_seen": 40623255, - "step": 1880, - "time_per_iteration": 5.567574739456177 - }, - { - "auxiliary_loss_clip": 0.01155691, - "auxiliary_loss_mlp": 0.01052309, - "balance_loss_clip": 1.05585837, - "balance_loss_mlp": 1.0306015, - "epoch": 0.11309183826845032, - "flos": 16180558842240.0, - "grad_norm": 1.6696082535169858, - "language_loss": 0.72690225, - "learning_rate": 3.928011545540734e-06, - "loss": 0.74898225, - "num_input_tokens_seen": 40641570, - "step": 1881, - "time_per_iteration": 2.792428493499756 - }, - { - "auxiliary_loss_clip": 0.011425, - "auxiliary_loss_mlp": 0.00781179, - "balance_loss_clip": 1.05046606, - "balance_loss_mlp": 1.00008667, - "epoch": 0.1131519615211183, - "flos": 12020702814720.0, - "grad_norm": 2.2964043184115783, - "language_loss": 0.74205768, - "learning_rate": 3.927907958404819e-06, - "loss": 0.76129448, - "num_input_tokens_seen": 40658775, - "step": 1882, - "time_per_iteration": 4.414916515350342 - }, - { - "auxiliary_loss_clip": 0.01177281, - "auxiliary_loss_mlp": 0.01054815, - "balance_loss_clip": 1.05680335, - "balance_loss_mlp": 1.03203452, - "epoch": 0.11321208477378626, - "flos": 26250125857920.0, - "grad_norm": 2.4326158086005965, - "language_loss": 0.7923016, - "learning_rate": 3.92780429816244e-06, - "loss": 0.81462252, - "num_input_tokens_seen": 40679555, - "step": 1883, - "time_per_iteration": 2.762615919113159 - }, - { - "auxiliary_loss_clip": 0.01140926, - "auxiliary_loss_mlp": 0.01058465, - "balance_loss_clip": 1.05226314, - "balance_loss_mlp": 1.03520727, - "epoch": 0.11327220802645423, - "flos": 13626376583040.0, - "grad_norm": 2.2898863699254974, - "language_loss": 0.77047318, - "learning_rate": 3.927700564817529e-06, - "loss": 0.79246712, - "num_input_tokens_seen": 40697295, - "step": 1884, - "time_per_iteration": 2.835468292236328 - }, - { - "auxiliary_loss_clip": 0.01074478, - "auxiliary_loss_mlp": 0.01009476, - "balance_loss_clip": 1.03993821, - "balance_loss_mlp": 1.00620937, - "epoch": 0.1133323312791222, - "flos": 57191802814080.0, - "grad_norm": 0.8138652948403053, - "language_loss": 0.55151373, - "learning_rate": 3.927596758374019e-06, - "loss": 0.5723533, - "num_input_tokens_seen": 40758095, - "step": 1885, - "time_per_iteration": 3.179532289505005 - }, - { - "auxiliary_loss_clip": 0.01083888, - "auxiliary_loss_mlp": 0.01050751, - "balance_loss_clip": 1.04415166, - "balance_loss_mlp": 1.02910316, - "epoch": 0.11339245453179017, - "flos": 24351708245760.0, - "grad_norm": 1.9836288003076585, - "language_loss": 0.90384823, - "learning_rate": 3.927492878835848e-06, - "loss": 0.92519462, - "num_input_tokens_seen": 40777140, - "step": 1886, - "time_per_iteration": 3.038928747177124 - }, - { - "auxiliary_loss_clip": 0.01116325, - "auxiliary_loss_mlp": 0.01057697, - "balance_loss_clip": 1.05137897, - "balance_loss_mlp": 1.03634632, - "epoch": 0.11345257778445814, - "flos": 22670693700480.0, - "grad_norm": 2.0132756022974023, - "language_loss": 0.84852886, - "learning_rate": 3.927388926206953e-06, - "loss": 0.87026906, - "num_input_tokens_seen": 40797505, - "step": 1887, - "time_per_iteration": 3.178863048553467 - }, - { - "auxiliary_loss_clip": 0.01136567, - "auxiliary_loss_mlp": 0.01056557, - "balance_loss_clip": 1.05091035, - "balance_loss_mlp": 1.03549314, - "epoch": 0.11351270103712612, - "flos": 20988242611200.0, - "grad_norm": 2.847610033990257, - "language_loss": 0.75826252, - "learning_rate": 3.927284900491277e-06, - "loss": 0.78019381, - "num_input_tokens_seen": 40812970, - "step": 1888, - "time_per_iteration": 2.7349846363067627 - }, - { - "auxiliary_loss_clip": 0.0113463, - "auxiliary_loss_mlp": 0.01062359, - "balance_loss_clip": 1.05614805, - "balance_loss_mlp": 1.03892243, - "epoch": 0.11357282428979408, - "flos": 37347923600640.0, - "grad_norm": 2.0598279187313624, - "language_loss": 0.68104899, - "learning_rate": 3.927180801692764e-06, - "loss": 0.7030189, - "num_input_tokens_seen": 40837745, - "step": 1889, - "time_per_iteration": 3.144444465637207 - }, - { - "auxiliary_loss_clip": 0.01177206, - "auxiliary_loss_mlp": 0.01049162, - "balance_loss_clip": 1.05653095, - "balance_loss_mlp": 1.02694094, - "epoch": 0.11363294754246205, - "flos": 21757018423680.0, - "grad_norm": 1.7896678692754837, - "language_loss": 0.83947051, - "learning_rate": 3.927076629815362e-06, - "loss": 0.86173415, - "num_input_tokens_seen": 40856490, - "step": 1890, - "time_per_iteration": 2.73126482963562 - }, - { - "auxiliary_loss_clip": 0.01145149, - "auxiliary_loss_mlp": 0.01056017, - "balance_loss_clip": 1.05039728, - "balance_loss_mlp": 1.03395164, - "epoch": 0.11369307079513001, - "flos": 22601637803520.0, - "grad_norm": 2.1678723202845256, - "language_loss": 0.64663875, - "learning_rate": 3.926972384863022e-06, - "loss": 0.66865045, - "num_input_tokens_seen": 40874070, - "step": 1891, - "time_per_iteration": 2.7474160194396973 - }, - { - "auxiliary_loss_clip": 0.01145505, - "auxiliary_loss_mlp": 0.01049015, - "balance_loss_clip": 1.05395687, - "balance_loss_mlp": 1.02773631, - "epoch": 0.11375319404779799, - "flos": 21944257044480.0, - "grad_norm": 2.126575023047711, - "language_loss": 0.87889415, - "learning_rate": 3.9268680668396956e-06, - "loss": 0.90083933, - "num_input_tokens_seen": 40892425, - "step": 1892, - "time_per_iteration": 2.795269250869751 - }, - { - "auxiliary_loss_clip": 0.01119535, - "auxiliary_loss_mlp": 0.01079586, - "balance_loss_clip": 1.05541015, - "balance_loss_mlp": 1.05461168, - "epoch": 0.11381331730046595, - "flos": 26395456285440.0, - "grad_norm": 3.1806920305576973, - "language_loss": 0.72902197, - "learning_rate": 3.926763675749339e-06, - "loss": 0.75101316, - "num_input_tokens_seen": 40912190, - "step": 1893, - "time_per_iteration": 2.890289306640625 - }, - { - "auxiliary_loss_clip": 0.01175698, - "auxiliary_loss_mlp": 0.0106591, - "balance_loss_clip": 1.05438137, - "balance_loss_mlp": 1.04290223, - "epoch": 0.11387344055313392, - "flos": 23804716959360.0, - "grad_norm": 1.8842571229841023, - "language_loss": 0.79247093, - "learning_rate": 3.92665921159591e-06, - "loss": 0.81488699, - "num_input_tokens_seen": 40928395, - "step": 1894, - "time_per_iteration": 2.6820743083953857 - }, - { - "auxiliary_loss_clip": 0.01150233, - "auxiliary_loss_mlp": 0.01061956, - "balance_loss_clip": 1.05356526, - "balance_loss_mlp": 1.03944933, - "epoch": 0.1139335638058019, - "flos": 34522865902080.0, - "grad_norm": 3.429237983174195, - "language_loss": 0.79718482, - "learning_rate": 3.926554674383371e-06, - "loss": 0.81930667, - "num_input_tokens_seen": 40946555, - "step": 1895, - "time_per_iteration": 2.829946994781494 - }, - { - "auxiliary_loss_clip": 0.01075529, - "auxiliary_loss_mlp": 0.01018518, - "balance_loss_clip": 1.03062391, - "balance_loss_mlp": 1.0155375, - "epoch": 0.11399368705846986, - "flos": 70587811520640.0, - "grad_norm": 0.8041110638842961, - "language_loss": 0.63357508, - "learning_rate": 3.926450064115686e-06, - "loss": 0.65451556, - "num_input_tokens_seen": 41004910, - "step": 1896, - "time_per_iteration": 3.3087315559387207 - }, - { - "auxiliary_loss_clip": 0.01147265, - "auxiliary_loss_mlp": 0.0106086, - "balance_loss_clip": 1.05560398, - "balance_loss_mlp": 1.03663635, - "epoch": 0.11405381031113783, - "flos": 21324259365120.0, - "grad_norm": 1.5952307342327186, - "language_loss": 0.85055745, - "learning_rate": 3.926345380796821e-06, - "loss": 0.8726387, - "num_input_tokens_seen": 41026385, - "step": 1897, - "time_per_iteration": 2.8522274494171143 - }, - { - "auxiliary_loss_clip": 0.0117836, - "auxiliary_loss_mlp": 0.00780276, - "balance_loss_clip": 1.05591989, - "balance_loss_mlp": 1.0001986, - "epoch": 0.11411393356380581, - "flos": 19719627091200.0, - "grad_norm": 3.3624139627125587, - "language_loss": 0.79675245, - "learning_rate": 3.9262406244307465e-06, - "loss": 0.81633884, - "num_input_tokens_seen": 41045315, - "step": 1898, - "time_per_iteration": 2.760057210922241 - }, - { - "auxiliary_loss_clip": 0.01115338, - "auxiliary_loss_mlp": 0.01064417, - "balance_loss_clip": 1.04594529, - "balance_loss_mlp": 1.03965724, - "epoch": 0.11417405681647377, - "flos": 17530440883200.0, - "grad_norm": 2.0191769665152903, - "language_loss": 0.73251313, - "learning_rate": 3.926135795021435e-06, - "loss": 0.75431061, - "num_input_tokens_seen": 41063390, - "step": 1899, - "time_per_iteration": 2.7363204956054688 - }, - { - "auxiliary_loss_clip": 0.01042449, - "auxiliary_loss_mlp": 0.01003313, - "balance_loss_clip": 1.03643703, - "balance_loss_mlp": 1.0003922, - "epoch": 0.11423418006914174, - "flos": 59674666619520.0, - "grad_norm": 0.9089505356695228, - "language_loss": 0.63434029, - "learning_rate": 3.92603089257286e-06, - "loss": 0.65479791, - "num_input_tokens_seen": 41124180, - "step": 1900, - "time_per_iteration": 3.2045955657958984 - }, - { - "auxiliary_loss_clip": 0.01113626, - "auxiliary_loss_mlp": 0.01066815, - "balance_loss_clip": 1.04929233, - "balance_loss_mlp": 1.04378414, - "epoch": 0.1142943033218097, - "flos": 22963114321920.0, - "grad_norm": 1.577500478750639, - "language_loss": 0.77943742, - "learning_rate": 3.925925917089001e-06, - "loss": 0.80124187, - "num_input_tokens_seen": 41143485, - "step": 1901, - "time_per_iteration": 2.745089530944824 - }, - { - "auxiliary_loss_clip": 0.01171621, - "auxiliary_loss_mlp": 0.01057834, - "balance_loss_clip": 1.05803061, - "balance_loss_mlp": 1.0359118, - "epoch": 0.11435442657447768, - "flos": 18256267008000.0, - "grad_norm": 2.175933638179557, - "language_loss": 0.84158623, - "learning_rate": 3.925820868573839e-06, - "loss": 0.86388075, - "num_input_tokens_seen": 41161695, - "step": 1902, - "time_per_iteration": 2.6433799266815186 - }, - { - "auxiliary_loss_clip": 0.01159941, - "auxiliary_loss_mlp": 0.01056662, - "balance_loss_clip": 1.05280399, - "balance_loss_mlp": 1.03122306, - "epoch": 0.11441454982714565, - "flos": 24061191045120.0, - "grad_norm": 1.7702735053047673, - "language_loss": 0.77720451, - "learning_rate": 3.925715747031356e-06, - "loss": 0.79937053, - "num_input_tokens_seen": 41181715, - "step": 1903, - "time_per_iteration": 2.6385905742645264 - }, - { - "auxiliary_loss_clip": 0.01145143, - "auxiliary_loss_mlp": 0.0104196, - "balance_loss_clip": 1.05293322, - "balance_loss_mlp": 1.02174175, - "epoch": 0.11447467307981361, - "flos": 25337707557120.0, - "grad_norm": 2.212790565732917, - "language_loss": 0.75751555, - "learning_rate": 3.925610552465539e-06, - "loss": 0.77938658, - "num_input_tokens_seen": 41201770, - "step": 1904, - "time_per_iteration": 2.632152557373047 - }, - { - "auxiliary_loss_clip": 0.01149375, - "auxiliary_loss_mlp": 0.01056532, - "balance_loss_clip": 1.05207586, - "balance_loss_mlp": 1.03279781, - "epoch": 0.11453479633248159, - "flos": 21726063878400.0, - "grad_norm": 2.4422699353972006, - "language_loss": 0.91853034, - "learning_rate": 3.9255052848803764e-06, - "loss": 0.94058943, - "num_input_tokens_seen": 41220590, - "step": 1905, - "time_per_iteration": 2.7421486377716064 - }, - { - "auxiliary_loss_clip": 0.01161686, - "auxiliary_loss_mlp": 0.01050264, - "balance_loss_clip": 1.04978943, - "balance_loss_mlp": 1.02612448, - "epoch": 0.11459491958514956, - "flos": 12969714096000.0, - "grad_norm": 2.5117992419356066, - "language_loss": 0.77484202, - "learning_rate": 3.925399944279861e-06, - "loss": 0.79696143, - "num_input_tokens_seen": 41237250, - "step": 1906, - "time_per_iteration": 2.69333553314209 - }, - { - "auxiliary_loss_clip": 0.0117911, - "auxiliary_loss_mlp": 0.01055129, - "balance_loss_clip": 1.05697322, - "balance_loss_mlp": 1.03222847, - "epoch": 0.11465504283781752, - "flos": 22711273090560.0, - "grad_norm": 2.0720467666322113, - "language_loss": 0.81739306, - "learning_rate": 3.925294530667986e-06, - "loss": 0.83973539, - "num_input_tokens_seen": 41256680, - "step": 1907, - "time_per_iteration": 2.6531317234039307 - }, - { - "auxiliary_loss_clip": 0.0113647, - "auxiliary_loss_mlp": 0.01065473, - "balance_loss_clip": 1.05235374, - "balance_loss_mlp": 1.04227471, - "epoch": 0.1147151660904855, - "flos": 23398387332480.0, - "grad_norm": 2.1769364553121293, - "language_loss": 0.84901214, - "learning_rate": 3.92518904404875e-06, - "loss": 0.87103164, - "num_input_tokens_seen": 41270955, - "step": 1908, - "time_per_iteration": 2.8768258094787598 - }, - { - "auxiliary_loss_clip": 0.01029536, - "auxiliary_loss_mlp": 0.01020856, - "balance_loss_clip": 1.02524137, - "balance_loss_mlp": 1.01694632, - "epoch": 0.11477528934315347, - "flos": 63011843498880.0, - "grad_norm": 0.9197306473097341, - "language_loss": 0.61072773, - "learning_rate": 3.925083484426153e-06, - "loss": 0.63123173, - "num_input_tokens_seen": 41319180, - "step": 1909, - "time_per_iteration": 3.0845727920532227 - }, - { - "auxiliary_loss_clip": 0.01182744, - "auxiliary_loss_mlp": 0.01054075, - "balance_loss_clip": 1.06014562, - "balance_loss_mlp": 1.03219986, - "epoch": 0.11483541259582143, - "flos": 16325601960960.0, - "grad_norm": 7.319166590530674, - "language_loss": 0.79170966, - "learning_rate": 3.924977851804197e-06, - "loss": 0.81407785, - "num_input_tokens_seen": 41337480, - "step": 1910, - "time_per_iteration": 2.708704710006714 - }, - { - "auxiliary_loss_clip": 0.01156489, - "auxiliary_loss_mlp": 0.01052406, - "balance_loss_clip": 1.0580864, - "balance_loss_mlp": 1.03029275, - "epoch": 0.1148955358484894, - "flos": 21580410228480.0, - "grad_norm": 2.117911712245717, - "language_loss": 0.7702589, - "learning_rate": 3.9248721461868875e-06, - "loss": 0.79234779, - "num_input_tokens_seen": 41354650, - "step": 1911, - "time_per_iteration": 2.7597720623016357 - }, - { - "auxiliary_loss_clip": 0.01159986, - "auxiliary_loss_mlp": 0.01054599, - "balance_loss_clip": 1.05726957, - "balance_loss_mlp": 1.03227139, - "epoch": 0.11495565910115738, - "flos": 27673696650240.0, - "grad_norm": 1.677508784227342, - "language_loss": 0.79177421, - "learning_rate": 3.9247663675782336e-06, - "loss": 0.81392002, - "num_input_tokens_seen": 41376935, - "step": 1912, - "time_per_iteration": 2.8143310546875 - }, - { - "auxiliary_loss_clip": 0.01183047, - "auxiliary_loss_mlp": 0.00779659, - "balance_loss_clip": 1.06065917, - "balance_loss_mlp": 1.00014925, - "epoch": 0.11501578235382534, - "flos": 20632368614400.0, - "grad_norm": 2.291252405113977, - "language_loss": 0.77942276, - "learning_rate": 3.924660515982246e-06, - "loss": 0.79904979, - "num_input_tokens_seen": 41396105, - "step": 1913, - "time_per_iteration": 2.696430206298828 - }, - { - "auxiliary_loss_clip": 0.01166892, - "auxiliary_loss_mlp": 0.01052769, - "balance_loss_clip": 1.05442226, - "balance_loss_mlp": 1.02953506, - "epoch": 0.1150759056064933, - "flos": 19829046896640.0, - "grad_norm": 1.8145547055361753, - "language_loss": 0.7003395, - "learning_rate": 3.924554591402939e-06, - "loss": 0.72253609, - "num_input_tokens_seen": 41415600, - "step": 1914, - "time_per_iteration": 2.739251136779785 - }, - { - "auxiliary_loss_clip": 0.01007182, - "auxiliary_loss_mlp": 0.01004682, - "balance_loss_clip": 1.02677619, - "balance_loss_mlp": 1.00191641, - "epoch": 0.11513602885916129, - "flos": 70045776311040.0, - "grad_norm": 0.7558771871458172, - "language_loss": 0.61059874, - "learning_rate": 3.92444859384433e-06, - "loss": 0.6307174, - "num_input_tokens_seen": 41478760, - "step": 1915, - "time_per_iteration": 3.56019926071167 - }, - { - "auxiliary_loss_clip": 0.01166434, - "auxiliary_loss_mlp": 0.01058573, - "balance_loss_clip": 1.05994964, - "balance_loss_mlp": 1.03595936, - "epoch": 0.11519615211182925, - "flos": 15741730385280.0, - "grad_norm": 2.437201506258279, - "language_loss": 0.93116963, - "learning_rate": 3.924342523310436e-06, - "loss": 0.95341969, - "num_input_tokens_seen": 41495720, - "step": 1916, - "time_per_iteration": 3.244772434234619 - }, - { - "auxiliary_loss_clip": 0.01161132, - "auxiliary_loss_mlp": 0.01059827, - "balance_loss_clip": 1.05798697, - "balance_loss_mlp": 1.03470993, - "epoch": 0.11525627536449722, - "flos": 20667632791680.0, - "grad_norm": 1.8909260082350545, - "language_loss": 0.72560197, - "learning_rate": 3.9242363798052806e-06, - "loss": 0.74781156, - "num_input_tokens_seen": 41513585, - "step": 1917, - "time_per_iteration": 4.502236843109131 - }, - { - "auxiliary_loss_clip": 0.01138773, - "auxiliary_loss_mlp": 0.0104964, - "balance_loss_clip": 1.05739903, - "balance_loss_mlp": 1.02700245, - "epoch": 0.1153163986171652, - "flos": 20303283185280.0, - "grad_norm": 9.147356795176979, - "language_loss": 0.74213129, - "learning_rate": 3.92413016333289e-06, - "loss": 0.76401544, - "num_input_tokens_seen": 41533390, - "step": 1918, - "time_per_iteration": 4.344711065292358 - }, - { - "auxiliary_loss_clip": 0.0114898, - "auxiliary_loss_mlp": 0.010469, - "balance_loss_clip": 1.05532503, - "balance_loss_mlp": 1.02450073, - "epoch": 0.11537652186983316, - "flos": 17639321984640.0, - "grad_norm": 3.182152136597976, - "language_loss": 0.86367452, - "learning_rate": 3.92402387389729e-06, - "loss": 0.88563335, - "num_input_tokens_seen": 41551015, - "step": 1919, - "time_per_iteration": 4.540036201477051 - }, - { - "auxiliary_loss_clip": 0.01134044, - "auxiliary_loss_mlp": 0.01067867, - "balance_loss_clip": 1.0496366, - "balance_loss_mlp": 1.04172444, - "epoch": 0.11543664512250112, - "flos": 21069401391360.0, - "grad_norm": 1.93595243799445, - "language_loss": 0.86735415, - "learning_rate": 3.923917511502512e-06, - "loss": 0.8893733, - "num_input_tokens_seen": 41568055, - "step": 1920, - "time_per_iteration": 2.7719242572784424 - }, - { - "auxiliary_loss_clip": 0.011686, - "auxiliary_loss_mlp": 0.010528, - "balance_loss_clip": 1.0593946, - "balance_loss_mlp": 1.0302341, - "epoch": 0.11549676837516909, - "flos": 22747542848640.0, - "grad_norm": 4.512761907267092, - "language_loss": 0.79294932, - "learning_rate": 3.923811076152589e-06, - "loss": 0.81516337, - "num_input_tokens_seen": 41587435, - "step": 1921, - "time_per_iteration": 2.798673629760742 - }, - { - "auxiliary_loss_clip": 0.01174604, - "auxiliary_loss_mlp": 0.01063526, - "balance_loss_clip": 1.05685806, - "balance_loss_mlp": 1.04007721, - "epoch": 0.11555689162783707, - "flos": 19168972617600.0, - "grad_norm": 2.4057040360661484, - "language_loss": 0.78464305, - "learning_rate": 3.923704567851557e-06, - "loss": 0.80702436, - "num_input_tokens_seen": 41604975, - "step": 1922, - "time_per_iteration": 4.352341651916504 - }, - { - "auxiliary_loss_clip": 0.01092284, - "auxiliary_loss_mlp": 0.01064602, - "balance_loss_clip": 1.04645681, - "balance_loss_mlp": 1.04229808, - "epoch": 0.11561701488050503, - "flos": 24572056227840.0, - "grad_norm": 1.8560991769949675, - "language_loss": 0.84293079, - "learning_rate": 3.923597986603456e-06, - "loss": 0.86449969, - "num_input_tokens_seen": 41626155, - "step": 1923, - "time_per_iteration": 3.2956740856170654 - }, - { - "auxiliary_loss_clip": 0.01171957, - "auxiliary_loss_mlp": 0.01056739, - "balance_loss_clip": 1.0600003, - "balance_loss_mlp": 1.03317094, - "epoch": 0.115677138133173, - "flos": 17092546179840.0, - "grad_norm": 1.944851076041885, - "language_loss": 0.80890471, - "learning_rate": 3.9234913324123264e-06, - "loss": 0.83119166, - "num_input_tokens_seen": 41644805, - "step": 1924, - "time_per_iteration": 3.0939247608184814 - }, - { - "auxiliary_loss_clip": 0.01055916, - "auxiliary_loss_mlp": 0.01027131, - "balance_loss_clip": 1.03045607, - "balance_loss_mlp": 1.02436543, - "epoch": 0.11573726138584098, - "flos": 62703875266560.0, - "grad_norm": 0.8171642061509322, - "language_loss": 0.61196578, - "learning_rate": 3.923384605282212e-06, - "loss": 0.63279623, - "num_input_tokens_seen": 41709345, - "step": 1925, - "time_per_iteration": 3.3765265941619873 - }, - { - "auxiliary_loss_clip": 0.01155845, - "auxiliary_loss_mlp": 0.01079328, - "balance_loss_clip": 1.05374098, - "balance_loss_mlp": 1.0549382, - "epoch": 0.11579738463850894, - "flos": 22601135013120.0, - "grad_norm": 1.7772533553430212, - "language_loss": 0.74766397, - "learning_rate": 3.923277805217161e-06, - "loss": 0.77001572, - "num_input_tokens_seen": 41730210, - "step": 1926, - "time_per_iteration": 2.754974126815796 - }, - { - "auxiliary_loss_clip": 0.01116228, - "auxiliary_loss_mlp": 0.00781701, - "balance_loss_clip": 1.04683304, - "balance_loss_mlp": 1.00016665, - "epoch": 0.11585750789117691, - "flos": 21726135705600.0, - "grad_norm": 4.731879086182685, - "language_loss": 0.71978599, - "learning_rate": 3.923170932221222e-06, - "loss": 0.7387653, - "num_input_tokens_seen": 41750270, - "step": 1927, - "time_per_iteration": 2.9454004764556885 - }, - { - "auxiliary_loss_clip": 0.01137955, - "auxiliary_loss_mlp": 0.01058796, - "balance_loss_clip": 1.05250621, - "balance_loss_mlp": 1.03572917, - "epoch": 0.11591763114384489, - "flos": 26287544851200.0, - "grad_norm": 1.5938674022456252, - "language_loss": 0.86854041, - "learning_rate": 3.92306398629845e-06, - "loss": 0.89050794, - "num_input_tokens_seen": 41772975, - "step": 1928, - "time_per_iteration": 2.832750082015991 - }, - { - "auxiliary_loss_clip": 0.01129041, - "auxiliary_loss_mlp": 0.01060836, - "balance_loss_clip": 1.05032003, - "balance_loss_mlp": 1.03706551, - "epoch": 0.11597775439651285, - "flos": 23000461488000.0, - "grad_norm": 1.6639520350020578, - "language_loss": 0.77450585, - "learning_rate": 3.922956967452898e-06, - "loss": 0.79640466, - "num_input_tokens_seen": 41791765, - "step": 1929, - "time_per_iteration": 2.7876811027526855 - }, - { - "auxiliary_loss_clip": 0.01176887, - "auxiliary_loss_mlp": 0.01063611, - "balance_loss_clip": 1.05667901, - "balance_loss_mlp": 1.0424509, - "epoch": 0.11603787764918082, - "flos": 31941715507200.0, - "grad_norm": 1.8085677541874856, - "language_loss": 0.76831949, - "learning_rate": 3.922849875688626e-06, - "loss": 0.79072452, - "num_input_tokens_seen": 41815615, - "step": 1930, - "time_per_iteration": 2.819934844970703 - }, - { - "auxiliary_loss_clip": 0.01145781, - "auxiliary_loss_mlp": 0.01054046, - "balance_loss_clip": 1.05066586, - "balance_loss_mlp": 1.03165817, - "epoch": 0.1160980009018488, - "flos": 22271654534400.0, - "grad_norm": 1.9434791543130712, - "language_loss": 0.72291863, - "learning_rate": 3.922742711009693e-06, - "loss": 0.74491692, - "num_input_tokens_seen": 41834810, - "step": 1931, - "time_per_iteration": 2.8078088760375977 - }, - { - "auxiliary_loss_clip": 0.01146409, - "auxiliary_loss_mlp": 0.01061336, - "balance_loss_clip": 1.05090261, - "balance_loss_mlp": 1.03575325, - "epoch": 0.11615812415451676, - "flos": 22783633038720.0, - "grad_norm": 1.7378937044391531, - "language_loss": 0.8222791, - "learning_rate": 3.922635473420164e-06, - "loss": 0.8443566, - "num_input_tokens_seen": 41854975, - "step": 1932, - "time_per_iteration": 2.7495200634002686 - }, - { - "auxiliary_loss_clip": 0.01030493, - "auxiliary_loss_mlp": 0.01018834, - "balance_loss_clip": 1.02184403, - "balance_loss_mlp": 1.01556778, - "epoch": 0.11621824740718473, - "flos": 67146096107520.0, - "grad_norm": 0.7669378012870447, - "language_loss": 0.61050332, - "learning_rate": 3.922528162924105e-06, - "loss": 0.63099658, - "num_input_tokens_seen": 41911105, - "step": 1933, - "time_per_iteration": 3.256678581237793 - }, - { - "auxiliary_loss_clip": 0.01108577, - "auxiliary_loss_mlp": 0.00780156, - "balance_loss_clip": 1.04764509, - "balance_loss_mlp": 1.00006175, - "epoch": 0.11627837065985269, - "flos": 20375930442240.0, - "grad_norm": 2.830760437639296, - "language_loss": 0.85790741, - "learning_rate": 3.922420779525586e-06, - "loss": 0.8767947, - "num_input_tokens_seen": 41931750, - "step": 1934, - "time_per_iteration": 2.9144253730773926 - }, - { - "auxiliary_loss_clip": 0.01117671, - "auxiliary_loss_mlp": 0.01059839, - "balance_loss_clip": 1.04929256, - "balance_loss_mlp": 1.03453088, - "epoch": 0.11633849391252067, - "flos": 21725812483200.0, - "grad_norm": 2.625764216143105, - "language_loss": 0.66222906, - "learning_rate": 3.9223133232286776e-06, - "loss": 0.68400419, - "num_input_tokens_seen": 41949400, - "step": 1935, - "time_per_iteration": 2.867152452468872 - }, - { - "auxiliary_loss_clip": 0.01183991, - "auxiliary_loss_mlp": 0.01052492, - "balance_loss_clip": 1.05868936, - "balance_loss_mlp": 1.03111792, - "epoch": 0.11639861716518864, - "flos": 18805341283200.0, - "grad_norm": 2.025938843377603, - "language_loss": 0.75678742, - "learning_rate": 3.922205794037456e-06, - "loss": 0.77915227, - "num_input_tokens_seen": 41968100, - "step": 1936, - "time_per_iteration": 2.7282185554504395 - }, - { - "auxiliary_loss_clip": 0.01179718, - "auxiliary_loss_mlp": 0.01049532, - "balance_loss_clip": 1.05632091, - "balance_loss_mlp": 1.02639306, - "epoch": 0.1164587404178566, - "flos": 21214983214080.0, - "grad_norm": 2.0032002399718905, - "language_loss": 0.84086847, - "learning_rate": 3.922098191955998e-06, - "loss": 0.86316097, - "num_input_tokens_seen": 41986375, - "step": 1937, - "time_per_iteration": 2.715386152267456 - }, - { - "auxiliary_loss_clip": 0.01152084, - "auxiliary_loss_mlp": 0.01048961, - "balance_loss_clip": 1.05258632, - "balance_loss_mlp": 1.0268234, - "epoch": 0.11651886367052458, - "flos": 27818632028160.0, - "grad_norm": 3.0485930101216607, - "language_loss": 0.7617709, - "learning_rate": 3.921990516988384e-06, - "loss": 0.78378135, - "num_input_tokens_seen": 42006055, - "step": 1938, - "time_per_iteration": 2.7624804973602295 - }, - { - "auxiliary_loss_clip": 0.01182576, - "auxiliary_loss_mlp": 0.01055104, - "balance_loss_clip": 1.05742419, - "balance_loss_mlp": 1.03250146, - "epoch": 0.11657898692319255, - "flos": 22889569224960.0, - "grad_norm": 1.7682499083089231, - "language_loss": 0.79677606, - "learning_rate": 3.921882769138696e-06, - "loss": 0.81915289, - "num_input_tokens_seen": 42024995, - "step": 1939, - "time_per_iteration": 2.71458101272583 - }, - { - "auxiliary_loss_clip": 0.01148291, - "auxiliary_loss_mlp": 0.01057951, - "balance_loss_clip": 1.05209351, - "balance_loss_mlp": 1.03508627, - "epoch": 0.11663911017586051, - "flos": 24315905364480.0, - "grad_norm": 2.2281245193552475, - "language_loss": 0.85916591, - "learning_rate": 3.9217749484110215e-06, - "loss": 0.88122833, - "num_input_tokens_seen": 42042640, - "step": 1940, - "time_per_iteration": 2.7322728633880615 - }, - { - "auxiliary_loss_clip": 0.01153746, - "auxiliary_loss_mlp": 0.01056301, - "balance_loss_clip": 1.05659437, - "balance_loss_mlp": 1.03548717, - "epoch": 0.11669923342852849, - "flos": 42340152470400.0, - "grad_norm": 1.4952807995381137, - "language_loss": 0.75590646, - "learning_rate": 3.921667054809449e-06, - "loss": 0.77800703, - "num_input_tokens_seen": 42067005, - "step": 1941, - "time_per_iteration": 2.9211390018463135 - }, - { - "auxiliary_loss_clip": 0.01149585, - "auxiliary_loss_mlp": 0.00780203, - "balance_loss_clip": 1.05181897, - "balance_loss_mlp": 1.00006557, - "epoch": 0.11675935668119646, - "flos": 14642288945280.0, - "grad_norm": 2.277225749463833, - "language_loss": 0.88847101, - "learning_rate": 3.921559088338068e-06, - "loss": 0.90776885, - "num_input_tokens_seen": 42082295, - "step": 1942, - "time_per_iteration": 2.7145469188690186 - }, - { - "auxiliary_loss_clip": 0.01165183, - "auxiliary_loss_mlp": 0.01056257, - "balance_loss_clip": 1.05553317, - "balance_loss_mlp": 1.03552663, - "epoch": 0.11681947993386442, - "flos": 35116470063360.0, - "grad_norm": 1.6547450593003057, - "language_loss": 0.67979252, - "learning_rate": 3.921451049000975e-06, - "loss": 0.70200694, - "num_input_tokens_seen": 42105295, - "step": 1943, - "time_per_iteration": 2.789701461791992 - }, - { - "auxiliary_loss_clip": 0.01153022, - "auxiliary_loss_mlp": 0.01047648, - "balance_loss_clip": 1.05515063, - "balance_loss_mlp": 1.02591634, - "epoch": 0.11687960318653239, - "flos": 38983259024640.0, - "grad_norm": 1.9817763000300312, - "language_loss": 0.69831288, - "learning_rate": 3.921342936802265e-06, - "loss": 0.72031963, - "num_input_tokens_seen": 42125520, - "step": 1944, - "time_per_iteration": 2.827150583267212 - }, - { - "auxiliary_loss_clip": 0.01155915, - "auxiliary_loss_mlp": 0.01051888, - "balance_loss_clip": 1.05038309, - "balance_loss_mlp": 1.03158641, - "epoch": 0.11693972643920036, - "flos": 25994980575360.0, - "grad_norm": 1.4963028532298175, - "language_loss": 0.82662582, - "learning_rate": 3.921234751746038e-06, - "loss": 0.84870374, - "num_input_tokens_seen": 42146335, - "step": 1945, - "time_per_iteration": 2.7190194129943848 - }, - { - "auxiliary_loss_clip": 0.01137101, - "auxiliary_loss_mlp": 0.01062082, - "balance_loss_clip": 1.04682803, - "balance_loss_mlp": 1.04005265, - "epoch": 0.11699984969186833, - "flos": 27272107618560.0, - "grad_norm": 2.3643045784637735, - "language_loss": 0.76298034, - "learning_rate": 3.9211264938363975e-06, - "loss": 0.78497219, - "num_input_tokens_seen": 42165320, - "step": 1946, - "time_per_iteration": 2.792555093765259 - }, - { - "auxiliary_loss_clip": 0.01134728, - "auxiliary_loss_mlp": 0.01056112, - "balance_loss_clip": 1.0507704, - "balance_loss_mlp": 1.03536999, - "epoch": 0.1170599729445363, - "flos": 15267853232640.0, - "grad_norm": 2.058923240355934, - "language_loss": 0.69014907, - "learning_rate": 3.921018163077448e-06, - "loss": 0.71205747, - "num_input_tokens_seen": 42182955, - "step": 1947, - "time_per_iteration": 2.643807888031006 - }, - { - "auxiliary_loss_clip": 0.01154759, - "auxiliary_loss_mlp": 0.01067767, - "balance_loss_clip": 1.05707347, - "balance_loss_mlp": 1.04604673, - "epoch": 0.11712009619720427, - "flos": 17164439251200.0, - "grad_norm": 2.0690991629011615, - "language_loss": 0.85044622, - "learning_rate": 3.920909759473295e-06, - "loss": 0.87267148, - "num_input_tokens_seen": 42200760, - "step": 1948, - "time_per_iteration": 2.6399292945861816 - }, - { - "auxiliary_loss_clip": 0.01051031, - "auxiliary_loss_mlp": 0.0075782, - "balance_loss_clip": 1.0245688, - "balance_loss_mlp": 0.99997467, - "epoch": 0.11718021944987224, - "flos": 70940991997440.0, - "grad_norm": 0.8206821069070506, - "language_loss": 0.65139282, - "learning_rate": 3.920801283028054e-06, - "loss": 0.66948134, - "num_input_tokens_seen": 42265745, - "step": 1949, - "time_per_iteration": 3.3030900955200195 - }, - { - "auxiliary_loss_clip": 0.01159399, - "auxiliary_loss_mlp": 0.01061163, - "balance_loss_clip": 1.05735683, - "balance_loss_mlp": 1.04054022, - "epoch": 0.1172403427025402, - "flos": 27453456408960.0, - "grad_norm": 1.512876015443777, - "language_loss": 0.71746683, - "learning_rate": 3.920692733745835e-06, - "loss": 0.73967248, - "num_input_tokens_seen": 42286245, - "step": 1950, - "time_per_iteration": 2.739341974258423 - }, - { - "auxiliary_loss_clip": 0.01175731, - "auxiliary_loss_mlp": 0.01061149, - "balance_loss_clip": 1.06152189, - "balance_loss_mlp": 1.03907192, - "epoch": 0.11730046595520818, - "flos": 15668723992320.0, - "grad_norm": 2.1258853115079996, - "language_loss": 0.76671386, - "learning_rate": 3.920584111630755e-06, - "loss": 0.78908259, - "num_input_tokens_seen": 42302710, - "step": 1951, - "time_per_iteration": 2.624788999557495 - }, - { - "auxiliary_loss_clip": 0.01129104, - "auxiliary_loss_mlp": 0.0106562, - "balance_loss_clip": 1.05285251, - "balance_loss_mlp": 1.04435349, - "epoch": 0.11736058920787615, - "flos": 25630164092160.0, - "grad_norm": 1.7264952730121887, - "language_loss": 0.75964963, - "learning_rate": 3.9204754166869325e-06, - "loss": 0.7815969, - "num_input_tokens_seen": 42324115, - "step": 1952, - "time_per_iteration": 2.824826955795288 - }, - { - "auxiliary_loss_clip": 0.01123677, - "auxiliary_loss_mlp": 0.01065929, - "balance_loss_clip": 1.04589534, - "balance_loss_mlp": 1.04451907, - "epoch": 0.11742071246054411, - "flos": 21434289701760.0, - "grad_norm": 2.2111022500713453, - "language_loss": 0.72316217, - "learning_rate": 3.920366648918491e-06, - "loss": 0.74505818, - "num_input_tokens_seen": 42342505, - "step": 1953, - "time_per_iteration": 2.7456531524658203 - }, - { - "auxiliary_loss_clip": 0.01149214, - "auxiliary_loss_mlp": 0.00781136, - "balance_loss_clip": 1.0549686, - "balance_loss_mlp": 1.0000577, - "epoch": 0.11748083571321208, - "flos": 15997845335040.0, - "grad_norm": 2.1208802652878522, - "language_loss": 0.79780388, - "learning_rate": 3.920257808329552e-06, - "loss": 0.81710744, - "num_input_tokens_seen": 42360525, - "step": 1954, - "time_per_iteration": 2.653949737548828 - }, - { - "auxiliary_loss_clip": 0.01112399, - "auxiliary_loss_mlp": 0.01059787, - "balance_loss_clip": 1.04880822, - "balance_loss_mlp": 1.03763783, - "epoch": 0.11754095896588006, - "flos": 16180056051840.0, - "grad_norm": 1.9673692595826442, - "language_loss": 0.8553021, - "learning_rate": 3.920148894924246e-06, - "loss": 0.87702394, - "num_input_tokens_seen": 42377045, - "step": 1955, - "time_per_iteration": 2.7987124919891357 - }, - { - "auxiliary_loss_clip": 0.01163172, - "auxiliary_loss_mlp": 0.00779783, - "balance_loss_clip": 1.05209899, - "balance_loss_mlp": 1.00016606, - "epoch": 0.11760108221854802, - "flos": 13261596013440.0, - "grad_norm": 2.12926288831445, - "language_loss": 0.78105426, - "learning_rate": 3.920039908706701e-06, - "loss": 0.80048382, - "num_input_tokens_seen": 42393960, - "step": 1956, - "time_per_iteration": 2.6247944831848145 - }, - { - "auxiliary_loss_clip": 0.01158287, - "auxiliary_loss_mlp": 0.01058454, - "balance_loss_clip": 1.05559933, - "balance_loss_mlp": 1.03601909, - "epoch": 0.11766120547121599, - "flos": 24498439303680.0, - "grad_norm": 2.264983200322237, - "language_loss": 0.80487299, - "learning_rate": 3.91993084968105e-06, - "loss": 0.82704043, - "num_input_tokens_seen": 42413160, - "step": 1957, - "time_per_iteration": 5.862411260604858 - }, - { - "auxiliary_loss_clip": 0.01168294, - "auxiliary_loss_mlp": 0.0105259, - "balance_loss_clip": 1.05703866, - "balance_loss_mlp": 1.0308696, - "epoch": 0.11772132872388397, - "flos": 17784005967360.0, - "grad_norm": 4.8672025609093215, - "language_loss": 0.77955222, - "learning_rate": 3.919821717851428e-06, - "loss": 0.80176103, - "num_input_tokens_seen": 42432590, - "step": 1958, - "time_per_iteration": 4.4218549728393555 - }, - { - "auxiliary_loss_clip": 0.01149976, - "auxiliary_loss_mlp": 0.0105003, - "balance_loss_clip": 1.05451894, - "balance_loss_mlp": 1.02680755, - "epoch": 0.11778145197655193, - "flos": 13217030213760.0, - "grad_norm": 1.7537692363765556, - "language_loss": 0.77002251, - "learning_rate": 3.919712513221976e-06, - "loss": 0.79202259, - "num_input_tokens_seen": 42450135, - "step": 1959, - "time_per_iteration": 2.674323558807373 - }, - { - "auxiliary_loss_clip": 0.01162585, - "auxiliary_loss_mlp": 0.01057019, - "balance_loss_clip": 1.05857027, - "balance_loss_mlp": 1.03484631, - "epoch": 0.1178415752292199, - "flos": 20230204965120.0, - "grad_norm": 2.2026367524708927, - "language_loss": 0.70078689, - "learning_rate": 3.919603235796832e-06, - "loss": 0.722983, - "num_input_tokens_seen": 42470050, - "step": 1960, - "time_per_iteration": 2.7704508304595947 - }, - { - "auxiliary_loss_clip": 0.01161089, - "auxiliary_loss_mlp": 0.01055224, - "balance_loss_clip": 1.05841374, - "balance_loss_mlp": 1.03228831, - "epoch": 0.11790169848188788, - "flos": 13040134709760.0, - "grad_norm": 2.663996374773888, - "language_loss": 0.81045067, - "learning_rate": 3.9194938855801406e-06, - "loss": 0.83261371, - "num_input_tokens_seen": 42484335, - "step": 1961, - "time_per_iteration": 4.67006778717041 - }, - { - "auxiliary_loss_clip": 0.01163817, - "auxiliary_loss_mlp": 0.00779643, - "balance_loss_clip": 1.05658793, - "balance_loss_mlp": 1.00009537, - "epoch": 0.11796182173455584, - "flos": 22265728790400.0, - "grad_norm": 1.71345119244153, - "language_loss": 0.92273545, - "learning_rate": 3.919384462576049e-06, - "loss": 0.94217002, - "num_input_tokens_seen": 42502720, - "step": 1962, - "time_per_iteration": 2.6559524536132812 - }, - { - "auxiliary_loss_clip": 0.01139826, - "auxiliary_loss_mlp": 0.01058964, - "balance_loss_clip": 1.05222392, - "balance_loss_mlp": 1.03704107, - "epoch": 0.1180219449872238, - "flos": 10635017892480.0, - "grad_norm": 2.157203116008796, - "language_loss": 0.87635934, - "learning_rate": 3.919274966788707e-06, - "loss": 0.8983472, - "num_input_tokens_seen": 42519460, - "step": 1963, - "time_per_iteration": 2.710042715072632 - }, - { - "auxiliary_loss_clip": 0.0115823, - "auxiliary_loss_mlp": 0.00779391, - "balance_loss_clip": 1.05600929, - "balance_loss_mlp": 1.00011134, - "epoch": 0.11808206823989177, - "flos": 20923532259840.0, - "grad_norm": 2.8331529324994333, - "language_loss": 0.83879703, - "learning_rate": 3.919165398222265e-06, - "loss": 0.85817325, - "num_input_tokens_seen": 42539420, - "step": 1964, - "time_per_iteration": 2.734941244125366 - }, - { - "auxiliary_loss_clip": 0.01122529, - "auxiliary_loss_mlp": 0.01069054, - "balance_loss_clip": 1.05171156, - "balance_loss_mlp": 1.04628491, - "epoch": 0.11814219149255975, - "flos": 20777770869120.0, - "grad_norm": 3.9132941826799543, - "language_loss": 0.8313272, - "learning_rate": 3.919055756880879e-06, - "loss": 0.85324299, - "num_input_tokens_seen": 42558225, - "step": 1965, - "time_per_iteration": 2.7427306175231934 - }, - { - "auxiliary_loss_clip": 0.01178673, - "auxiliary_loss_mlp": 0.01053338, - "balance_loss_clip": 1.05815279, - "balance_loss_mlp": 1.03163004, - "epoch": 0.11820231474522772, - "flos": 48759938542080.0, - "grad_norm": 1.6720023918141877, - "language_loss": 0.74227381, - "learning_rate": 3.918946042768707e-06, - "loss": 0.76459396, - "num_input_tokens_seen": 42580790, - "step": 1966, - "time_per_iteration": 2.8265397548675537 - }, - { - "auxiliary_loss_clip": 0.01163407, - "auxiliary_loss_mlp": 0.0106081, - "balance_loss_clip": 1.06309748, - "balance_loss_mlp": 1.03836274, - "epoch": 0.11826243799789568, - "flos": 16690598012160.0, - "grad_norm": 2.5628488285375397, - "language_loss": 0.73137337, - "learning_rate": 3.918836255889908e-06, - "loss": 0.7536155, - "num_input_tokens_seen": 42597355, - "step": 1967, - "time_per_iteration": 2.706193685531616 - }, - { - "auxiliary_loss_clip": 0.01167052, - "auxiliary_loss_mlp": 0.01053471, - "balance_loss_clip": 1.05852592, - "balance_loss_mlp": 1.03141701, - "epoch": 0.11832256125056366, - "flos": 16909868586240.0, - "grad_norm": 5.332816815546028, - "language_loss": 0.8831054, - "learning_rate": 3.9187263962486456e-06, - "loss": 0.90531063, - "num_input_tokens_seen": 42616060, - "step": 1968, - "time_per_iteration": 2.6308343410491943 - }, - { - "auxiliary_loss_clip": 0.01168356, - "auxiliary_loss_mlp": 0.01051817, - "balance_loss_clip": 1.06406927, - "balance_loss_mlp": 1.0294776, - "epoch": 0.11838268450323162, - "flos": 22820405587200.0, - "grad_norm": 2.252087054693662, - "language_loss": 0.67010254, - "learning_rate": 3.918616463849087e-06, - "loss": 0.69230425, - "num_input_tokens_seen": 42636285, - "step": 1969, - "time_per_iteration": 2.662480592727661 - }, - { - "auxiliary_loss_clip": 0.01130071, - "auxiliary_loss_mlp": 0.0106143, - "balance_loss_clip": 1.05177045, - "balance_loss_mlp": 1.03774357, - "epoch": 0.11844280775589959, - "flos": 33545844990720.0, - "grad_norm": 2.153814675458072, - "language_loss": 0.80455101, - "learning_rate": 3.918506458695399e-06, - "loss": 0.82646602, - "num_input_tokens_seen": 42658320, - "step": 1970, - "time_per_iteration": 2.798050880432129 - }, - { - "auxiliary_loss_clip": 0.01060284, - "auxiliary_loss_mlp": 0.01021383, - "balance_loss_clip": 1.02553701, - "balance_loss_mlp": 1.01892686, - "epoch": 0.11850293100856757, - "flos": 66350998604160.0, - "grad_norm": 0.8165911228106061, - "language_loss": 0.66192186, - "learning_rate": 3.918396380791754e-06, - "loss": 0.68273854, - "num_input_tokens_seen": 42721500, - "step": 1971, - "time_per_iteration": 3.167018413543701 - }, - { - "auxiliary_loss_clip": 0.01151504, - "auxiliary_loss_mlp": 0.0105629, - "balance_loss_clip": 1.05294323, - "balance_loss_mlp": 1.03422379, - "epoch": 0.11856305426123553, - "flos": 24681045070080.0, - "grad_norm": 2.1839859106137554, - "language_loss": 0.79782552, - "learning_rate": 3.918286230142327e-06, - "loss": 0.81990343, - "num_input_tokens_seen": 42739825, - "step": 1972, - "time_per_iteration": 2.6908793449401855 - }, - { - "auxiliary_loss_clip": 0.01133219, - "auxiliary_loss_mlp": 0.00778766, - "balance_loss_clip": 1.05341005, - "balance_loss_mlp": 1.00005877, - "epoch": 0.1186231775139035, - "flos": 24280102483200.0, - "grad_norm": 2.0473813607633384, - "language_loss": 0.72843599, - "learning_rate": 3.918176006751292e-06, - "loss": 0.74755585, - "num_input_tokens_seen": 42758695, - "step": 1973, - "time_per_iteration": 2.7801859378814697 - }, - { - "auxiliary_loss_clip": 0.01138022, - "auxiliary_loss_mlp": 0.01049764, - "balance_loss_clip": 1.05580497, - "balance_loss_mlp": 1.02707887, - "epoch": 0.11868330076657148, - "flos": 21757413473280.0, - "grad_norm": 1.6449677647733996, - "language_loss": 0.72019619, - "learning_rate": 3.918065710622832e-06, - "loss": 0.74207413, - "num_input_tokens_seen": 42778510, - "step": 1974, - "time_per_iteration": 2.7337663173675537 - }, - { - "auxiliary_loss_clip": 0.01129602, - "auxiliary_loss_mlp": 0.01043161, - "balance_loss_clip": 1.05265522, - "balance_loss_mlp": 1.02086854, - "epoch": 0.11874342401923944, - "flos": 17193274894080.0, - "grad_norm": 2.017372400194955, - "language_loss": 0.77409399, - "learning_rate": 3.917955341761128e-06, - "loss": 0.79582161, - "num_input_tokens_seen": 42793995, - "step": 1975, - "time_per_iteration": 2.669546604156494 - }, - { - "auxiliary_loss_clip": 0.01131477, - "auxiliary_loss_mlp": 0.01059968, - "balance_loss_clip": 1.05880177, - "balance_loss_mlp": 1.03908277, - "epoch": 0.11880354727190741, - "flos": 15229572312960.0, - "grad_norm": 2.3842578575289, - "language_loss": 0.75110453, - "learning_rate": 3.917844900170364e-06, - "loss": 0.77301902, - "num_input_tokens_seen": 42809000, - "step": 1976, - "time_per_iteration": 2.8439090251922607 - }, - { - "auxiliary_loss_clip": 0.0116819, - "auxiliary_loss_mlp": 0.01049523, - "balance_loss_clip": 1.05999744, - "balance_loss_mlp": 1.02835166, - "epoch": 0.11886367052457537, - "flos": 27309706179840.0, - "grad_norm": 1.8674311015318124, - "language_loss": 0.74877423, - "learning_rate": 3.91773438585473e-06, - "loss": 0.77095133, - "num_input_tokens_seen": 42831585, - "step": 1977, - "time_per_iteration": 2.6747169494628906 - }, - { - "auxiliary_loss_clip": 0.01182095, - "auxiliary_loss_mlp": 0.01059621, - "balance_loss_clip": 1.05954552, - "balance_loss_mlp": 1.03805614, - "epoch": 0.11892379377724335, - "flos": 21798280172160.0, - "grad_norm": 2.1793079873879604, - "language_loss": 0.74207634, - "learning_rate": 3.9176237988184165e-06, - "loss": 0.76449353, - "num_input_tokens_seen": 42848420, - "step": 1978, - "time_per_iteration": 2.631664514541626 - }, - { - "auxiliary_loss_clip": 0.01142323, - "auxiliary_loss_mlp": 0.01050585, - "balance_loss_clip": 1.06037045, - "balance_loss_mlp": 1.0289247, - "epoch": 0.11898391702991132, - "flos": 13991013498240.0, - "grad_norm": 1.7170872786869797, - "language_loss": 0.73256385, - "learning_rate": 3.917513139065616e-06, - "loss": 0.754493, - "num_input_tokens_seen": 42866645, - "step": 1979, - "time_per_iteration": 2.7442541122436523 - }, - { - "auxiliary_loss_clip": 0.01137516, - "auxiliary_loss_mlp": 0.01051378, - "balance_loss_clip": 1.0566175, - "balance_loss_mlp": 1.02968168, - "epoch": 0.11904404028257928, - "flos": 32234567091840.0, - "grad_norm": 1.876224505386343, - "language_loss": 0.98293436, - "learning_rate": 3.917402406600525e-06, - "loss": 1.00482333, - "num_input_tokens_seen": 42888515, - "step": 1980, - "time_per_iteration": 2.787667989730835 - }, - { - "auxiliary_loss_clip": 0.01153629, - "auxiliary_loss_mlp": 0.01053612, - "balance_loss_clip": 1.05595791, - "balance_loss_mlp": 1.03077161, - "epoch": 0.11910416353524726, - "flos": 23586272398080.0, - "grad_norm": 1.7507584506289393, - "language_loss": 0.86265099, - "learning_rate": 3.917291601427342e-06, - "loss": 0.88472342, - "num_input_tokens_seen": 42909035, - "step": 1981, - "time_per_iteration": 2.6680359840393066 - }, - { - "auxiliary_loss_clip": 0.01158736, - "auxiliary_loss_mlp": 0.01064978, - "balance_loss_clip": 1.06144083, - "balance_loss_mlp": 1.04214907, - "epoch": 0.11916428678791523, - "flos": 25333038789120.0, - "grad_norm": 1.8908045276276995, - "language_loss": 0.85375237, - "learning_rate": 3.91718072355027e-06, - "loss": 0.87598956, - "num_input_tokens_seen": 42927555, - "step": 1982, - "time_per_iteration": 2.732797861099243 - }, - { - "auxiliary_loss_clip": 0.01146432, - "auxiliary_loss_mlp": 0.01050259, - "balance_loss_clip": 1.05539966, - "balance_loss_mlp": 1.02843213, - "epoch": 0.11922441004058319, - "flos": 19788431592960.0, - "grad_norm": 2.3856086229742877, - "language_loss": 0.85202634, - "learning_rate": 3.917069772973513e-06, - "loss": 0.87399322, - "num_input_tokens_seen": 42945300, - "step": 1983, - "time_per_iteration": 2.6839804649353027 - }, - { - "auxiliary_loss_clip": 0.01126589, - "auxiliary_loss_mlp": 0.01056051, - "balance_loss_clip": 1.05602145, - "balance_loss_mlp": 1.03399742, - "epoch": 0.11928453329325117, - "flos": 21536347219200.0, - "grad_norm": 3.6641824085676022, - "language_loss": 0.7693429, - "learning_rate": 3.916958749701277e-06, - "loss": 0.79116929, - "num_input_tokens_seen": 42961295, - "step": 1984, - "time_per_iteration": 2.7008767127990723 - }, - { - "auxiliary_loss_clip": 0.01161623, - "auxiliary_loss_mlp": 0.01055251, - "balance_loss_clip": 1.05752373, - "balance_loss_mlp": 1.0334003, - "epoch": 0.11934465654591914, - "flos": 20815010294400.0, - "grad_norm": 1.917528093726237, - "language_loss": 0.83058321, - "learning_rate": 3.9168476537377745e-06, - "loss": 0.85275191, - "num_input_tokens_seen": 42980330, - "step": 1985, - "time_per_iteration": 2.6692728996276855 - }, - { - "auxiliary_loss_clip": 0.01151831, - "auxiliary_loss_mlp": 0.01050086, - "balance_loss_clip": 1.0541923, - "balance_loss_mlp": 1.02835393, - "epoch": 0.1194047797985871, - "flos": 19060486565760.0, - "grad_norm": 1.8732848573733223, - "language_loss": 0.74398553, - "learning_rate": 3.916736485087216e-06, - "loss": 0.76600474, - "num_input_tokens_seen": 42996125, - "step": 1986, - "time_per_iteration": 2.722013473510742 - }, - { - "auxiliary_loss_clip": 0.01146125, - "auxiliary_loss_mlp": 0.01059008, - "balance_loss_clip": 1.05472732, - "balance_loss_mlp": 1.03791952, - "epoch": 0.11946490305125507, - "flos": 27190805184000.0, - "grad_norm": 2.4724436343771083, - "language_loss": 0.72123617, - "learning_rate": 3.916625243753819e-06, - "loss": 0.74328756, - "num_input_tokens_seen": 43014180, - "step": 1987, - "time_per_iteration": 2.814481258392334 - }, - { - "auxiliary_loss_clip": 0.01156854, - "auxiliary_loss_mlp": 0.01054644, - "balance_loss_clip": 1.05747938, - "balance_loss_mlp": 1.03138638, - "epoch": 0.11952502630392305, - "flos": 21140791672320.0, - "grad_norm": 1.9246234449532542, - "language_loss": 0.72007513, - "learning_rate": 3.916513929741799e-06, - "loss": 0.74219012, - "num_input_tokens_seen": 43032120, - "step": 1988, - "time_per_iteration": 2.7242019176483154 - }, - { - "auxiliary_loss_clip": 0.0116348, - "auxiliary_loss_mlp": 0.01062102, - "balance_loss_clip": 1.05559146, - "balance_loss_mlp": 1.03913057, - "epoch": 0.11958514955659101, - "flos": 22124241118080.0, - "grad_norm": 1.7561483239324645, - "language_loss": 0.81144297, - "learning_rate": 3.91640254305538e-06, - "loss": 0.83369875, - "num_input_tokens_seen": 43052215, - "step": 1989, - "time_per_iteration": 2.6259546279907227 - }, - { - "auxiliary_loss_clip": 0.01135956, - "auxiliary_loss_mlp": 0.01057689, - "balance_loss_clip": 1.05254042, - "balance_loss_mlp": 1.03325129, - "epoch": 0.11964527280925898, - "flos": 17421452040960.0, - "grad_norm": 2.5516320258539795, - "language_loss": 0.75881672, - "learning_rate": 3.916291083698784e-06, - "loss": 0.7807532, - "num_input_tokens_seen": 43069720, - "step": 1990, - "time_per_iteration": 2.6779251098632812 - }, - { - "auxiliary_loss_clip": 0.0105322, - "auxiliary_loss_mlp": 0.01019112, - "balance_loss_clip": 1.02816892, - "balance_loss_mlp": 1.01647794, - "epoch": 0.11970539606192696, - "flos": 70679741402880.0, - "grad_norm": 0.8628582727639288, - "language_loss": 0.55184531, - "learning_rate": 3.916179551676238e-06, - "loss": 0.57256866, - "num_input_tokens_seen": 43123130, - "step": 1991, - "time_per_iteration": 3.3713693618774414 - }, - { - "auxiliary_loss_clip": 0.01136423, - "auxiliary_loss_mlp": 0.01053959, - "balance_loss_clip": 1.05748868, - "balance_loss_mlp": 1.03326464, - "epoch": 0.11976551931459492, - "flos": 21215019127680.0, - "grad_norm": 2.286300891386994, - "language_loss": 0.78371406, - "learning_rate": 3.916067946991971e-06, - "loss": 0.80561793, - "num_input_tokens_seen": 43140015, - "step": 1992, - "time_per_iteration": 2.6797914505004883 - }, - { - "auxiliary_loss_clip": 0.0117949, - "auxiliary_loss_mlp": 0.01056635, - "balance_loss_clip": 1.05811, - "balance_loss_mlp": 1.03453374, - "epoch": 0.11982564256726289, - "flos": 25989306226560.0, - "grad_norm": 1.8481811043026504, - "language_loss": 0.78911144, - "learning_rate": 3.915956269650216e-06, - "loss": 0.81147265, - "num_input_tokens_seen": 43160105, - "step": 1993, - "time_per_iteration": 2.691301107406616 - }, - { - "auxiliary_loss_clip": 0.01126423, - "auxiliary_loss_mlp": 0.0106217, - "balance_loss_clip": 1.05012226, - "balance_loss_mlp": 1.04081941, - "epoch": 0.11988576581993086, - "flos": 21650866755840.0, - "grad_norm": 1.644866568705103, - "language_loss": 0.82088816, - "learning_rate": 3.915844519655208e-06, - "loss": 0.84277415, - "num_input_tokens_seen": 43179835, - "step": 1994, - "time_per_iteration": 2.772905111312866 - }, - { - "auxiliary_loss_clip": 0.0115068, - "auxiliary_loss_mlp": 0.01063961, - "balance_loss_clip": 1.05523098, - "balance_loss_mlp": 1.0433259, - "epoch": 0.11994588907259883, - "flos": 17857407409920.0, - "grad_norm": 2.0065598513575247, - "language_loss": 0.88392794, - "learning_rate": 3.915732697011183e-06, - "loss": 0.9060744, - "num_input_tokens_seen": 43197210, - "step": 1995, - "time_per_iteration": 4.206532716751099 - }, - { - "auxiliary_loss_clip": 0.01153482, - "auxiliary_loss_mlp": 0.01066415, - "balance_loss_clip": 1.06005812, - "balance_loss_mlp": 1.0441823, - "epoch": 0.1200060123252668, - "flos": 24462744163200.0, - "grad_norm": 1.8775058007239456, - "language_loss": 0.73949909, - "learning_rate": 3.9156208017223825e-06, - "loss": 0.76169801, - "num_input_tokens_seen": 43215050, - "step": 1996, - "time_per_iteration": 2.7263944149017334 - }, - { - "auxiliary_loss_clip": 0.01141484, - "auxiliary_loss_mlp": 0.01060112, - "balance_loss_clip": 1.05754757, - "balance_loss_mlp": 1.03808212, - "epoch": 0.12006613557793476, - "flos": 18732191235840.0, - "grad_norm": 1.976051865072764, - "language_loss": 0.88125587, - "learning_rate": 3.915508833793048e-06, - "loss": 0.90327179, - "num_input_tokens_seen": 43233900, - "step": 1997, - "time_per_iteration": 4.29426383972168 - }, - { - "auxiliary_loss_clip": 0.01165634, - "auxiliary_loss_mlp": 0.00779568, - "balance_loss_clip": 1.05701697, - "balance_loss_mlp": 1.00001049, - "epoch": 0.12012625883060274, - "flos": 22267739952000.0, - "grad_norm": 2.1091392562336018, - "language_loss": 0.79031086, - "learning_rate": 3.915396793227428e-06, - "loss": 0.80976284, - "num_input_tokens_seen": 43252105, - "step": 1998, - "time_per_iteration": 4.330955266952515 - }, - { - "auxiliary_loss_clip": 0.0116661, - "auxiliary_loss_mlp": 0.00779642, - "balance_loss_clip": 1.0576719, - "balance_loss_mlp": 1.00002396, - "epoch": 0.1201863820832707, - "flos": 21758885930880.0, - "grad_norm": 1.799585336659533, - "language_loss": 0.73583078, - "learning_rate": 3.915284680029769e-06, - "loss": 0.75529337, - "num_input_tokens_seen": 43270315, - "step": 1999, - "time_per_iteration": 2.754770040512085 - }, - { - "auxiliary_loss_clip": 0.01178966, - "auxiliary_loss_mlp": 0.01073097, - "balance_loss_clip": 1.0602119, - "balance_loss_mlp": 1.05115068, - "epoch": 0.12024650533593867, - "flos": 21907987286400.0, - "grad_norm": 2.916355473014409, - "language_loss": 0.74854898, - "learning_rate": 3.915172494204323e-06, - "loss": 0.77106953, - "num_input_tokens_seen": 43289935, - "step": 2000, - "time_per_iteration": 4.3900322914123535 - }, - { - "auxiliary_loss_clip": 0.01149374, - "auxiliary_loss_mlp": 0.01069735, - "balance_loss_clip": 1.05375695, - "balance_loss_mlp": 1.04763341, - "epoch": 0.12030662858860665, - "flos": 21689219502720.0, - "grad_norm": 1.5203973891597686, - "language_loss": 0.8496564, - "learning_rate": 3.915060235755344e-06, - "loss": 0.87184751, - "num_input_tokens_seen": 43309325, - "step": 2001, - "time_per_iteration": 2.6912643909454346 - }, - { - "auxiliary_loss_clip": 0.01154057, - "auxiliary_loss_mlp": 0.01063637, - "balance_loss_clip": 1.05600786, - "balance_loss_mlp": 1.04265642, - "epoch": 0.12036675184127461, - "flos": 12933228856320.0, - "grad_norm": 2.932264271186656, - "language_loss": 0.74711967, - "learning_rate": 3.91494790468709e-06, - "loss": 0.76929653, - "num_input_tokens_seen": 43327010, - "step": 2002, - "time_per_iteration": 2.6991024017333984 - }, - { - "auxiliary_loss_clip": 0.01129169, - "auxiliary_loss_mlp": 0.01066705, - "balance_loss_clip": 1.05340302, - "balance_loss_mlp": 1.0429939, - "epoch": 0.12042687509394258, - "flos": 20851028657280.0, - "grad_norm": 2.117271428042382, - "language_loss": 0.78029454, - "learning_rate": 3.9148355010038185e-06, - "loss": 0.80225325, - "num_input_tokens_seen": 43345650, - "step": 2003, - "time_per_iteration": 2.731381416320801 - }, - { - "auxiliary_loss_clip": 0.01163252, - "auxiliary_loss_mlp": 0.01062886, - "balance_loss_clip": 1.05728662, - "balance_loss_mlp": 1.04073668, - "epoch": 0.12048699834661056, - "flos": 23878513451520.0, - "grad_norm": 1.585850552088038, - "language_loss": 0.72205627, - "learning_rate": 3.914723024709793e-06, - "loss": 0.74431765, - "num_input_tokens_seen": 43365555, - "step": 2004, - "time_per_iteration": 2.725092649459839 - }, - { - "auxiliary_loss_clip": 0.01160616, - "auxiliary_loss_mlp": 0.01069457, - "balance_loss_clip": 1.05870187, - "balance_loss_mlp": 1.04645014, - "epoch": 0.12054712159927852, - "flos": 19756363726080.0, - "grad_norm": 1.9357732467170252, - "language_loss": 0.78415942, - "learning_rate": 3.914610475809279e-06, - "loss": 0.8064602, - "num_input_tokens_seen": 43384990, - "step": 2005, - "time_per_iteration": 2.7232437133789062 - }, - { - "auxiliary_loss_clip": 0.01073016, - "auxiliary_loss_mlp": 0.00758901, - "balance_loss_clip": 1.02995479, - "balance_loss_mlp": 1.00011683, - "epoch": 0.12060724485194649, - "flos": 51672763123200.0, - "grad_norm": 0.9264315537536937, - "language_loss": 0.58087146, - "learning_rate": 3.914497854306543e-06, - "loss": 0.59919059, - "num_input_tokens_seen": 43436335, - "step": 2006, - "time_per_iteration": 2.9570157527923584 - }, - { - "auxiliary_loss_clip": 0.01155081, - "auxiliary_loss_mlp": 0.01053472, - "balance_loss_clip": 1.05803597, - "balance_loss_mlp": 1.03299201, - "epoch": 0.12066736810461445, - "flos": 18990425088000.0, - "grad_norm": 1.6109316320484448, - "language_loss": 0.76524282, - "learning_rate": 3.9143851602058575e-06, - "loss": 0.78732836, - "num_input_tokens_seen": 43456495, - "step": 2007, - "time_per_iteration": 2.763380289077759 - }, - { - "auxiliary_loss_clip": 0.01147254, - "auxiliary_loss_mlp": 0.01064209, - "balance_loss_clip": 1.05931091, - "balance_loss_mlp": 1.04177368, - "epoch": 0.12072749135728243, - "flos": 16471973882880.0, - "grad_norm": 2.449779851562752, - "language_loss": 0.83023942, - "learning_rate": 3.914272393511494e-06, - "loss": 0.85235405, - "num_input_tokens_seen": 43473085, - "step": 2008, - "time_per_iteration": 2.7693119049072266 - }, - { - "auxiliary_loss_clip": 0.01176157, - "auxiliary_loss_mlp": 0.01052894, - "balance_loss_clip": 1.0584172, - "balance_loss_mlp": 1.03135288, - "epoch": 0.1207876146099504, - "flos": 18077108947200.0, - "grad_norm": 2.203355340521787, - "language_loss": 0.83835697, - "learning_rate": 3.91415955422773e-06, - "loss": 0.86064744, - "num_input_tokens_seen": 43491135, - "step": 2009, - "time_per_iteration": 2.640944242477417 - }, - { - "auxiliary_loss_clip": 0.01180076, - "auxiliary_loss_mlp": 0.01053549, - "balance_loss_clip": 1.06196725, - "balance_loss_mlp": 1.02994514, - "epoch": 0.12084773786261836, - "flos": 21871573873920.0, - "grad_norm": 1.6799099601218046, - "language_loss": 0.83870012, - "learning_rate": 3.914046642358844e-06, - "loss": 0.8610363, - "num_input_tokens_seen": 43510440, - "step": 2010, - "time_per_iteration": 2.716127634048462 - }, - { - "auxiliary_loss_clip": 0.01145261, - "auxiliary_loss_mlp": 0.00780804, - "balance_loss_clip": 1.05555713, - "balance_loss_mlp": 1.0000627, - "epoch": 0.12090786111528634, - "flos": 18333044328960.0, - "grad_norm": 1.8933604390076018, - "language_loss": 0.84194541, - "learning_rate": 3.9139336579091174e-06, - "loss": 0.86120605, - "num_input_tokens_seen": 43530145, - "step": 2011, - "time_per_iteration": 2.73793625831604 - }, - { - "auxiliary_loss_clip": 0.01148418, - "auxiliary_loss_mlp": 0.01060974, - "balance_loss_clip": 1.05480969, - "balance_loss_mlp": 1.03905129, - "epoch": 0.1209679843679543, - "flos": 21105850717440.0, - "grad_norm": 2.0524904800028154, - "language_loss": 0.96236968, - "learning_rate": 3.913820600882834e-06, - "loss": 0.98446357, - "num_input_tokens_seen": 43549315, - "step": 2012, - "time_per_iteration": 2.7269980907440186 - }, - { - "auxiliary_loss_clip": 0.01146369, - "auxiliary_loss_mlp": 0.01051396, - "balance_loss_clip": 1.05808425, - "balance_loss_mlp": 1.0289607, - "epoch": 0.12102810762062227, - "flos": 29241053585280.0, - "grad_norm": 1.853151366811655, - "language_loss": 0.80903435, - "learning_rate": 3.913707471284283e-06, - "loss": 0.83101201, - "num_input_tokens_seen": 43569240, - "step": 2013, - "time_per_iteration": 2.740489959716797 - }, - { - "auxiliary_loss_clip": 0.01124703, - "auxiliary_loss_mlp": 0.0105341, - "balance_loss_clip": 1.05300117, - "balance_loss_mlp": 1.02962804, - "epoch": 0.12108823087329025, - "flos": 17930701111680.0, - "grad_norm": 5.099975898232357, - "language_loss": 0.77255923, - "learning_rate": 3.9135942691177515e-06, - "loss": 0.79434031, - "num_input_tokens_seen": 43587710, - "step": 2014, - "time_per_iteration": 2.7361485958099365 - }, - { - "auxiliary_loss_clip": 0.0116607, - "auxiliary_loss_mlp": 0.01051056, - "balance_loss_clip": 1.05832791, - "balance_loss_mlp": 1.02791715, - "epoch": 0.12114835412595822, - "flos": 22091850028800.0, - "grad_norm": 5.8570343294144465, - "language_loss": 0.87169874, - "learning_rate": 3.913480994387535e-06, - "loss": 0.89387, - "num_input_tokens_seen": 43606000, - "step": 2015, - "time_per_iteration": 2.6881515979766846 - }, - { - "auxiliary_loss_clip": 0.01170382, - "auxiliary_loss_mlp": 0.01051162, - "balance_loss_clip": 1.05500197, - "balance_loss_mlp": 1.0289886, - "epoch": 0.12120847737862618, - "flos": 20412343854720.0, - "grad_norm": 2.087765239068409, - "language_loss": 0.69146478, - "learning_rate": 3.913367647097926e-06, - "loss": 0.71368027, - "num_input_tokens_seen": 43624815, - "step": 2016, - "time_per_iteration": 2.7096211910247803 - }, - { - "auxiliary_loss_clip": 0.01152563, - "auxiliary_loss_mlp": 0.0104714, - "balance_loss_clip": 1.05737591, - "balance_loss_mlp": 1.02390599, - "epoch": 0.12126860063129415, - "flos": 22309037614080.0, - "grad_norm": 2.8043603396252865, - "language_loss": 0.79858959, - "learning_rate": 3.913254227253225e-06, - "loss": 0.82058656, - "num_input_tokens_seen": 43643960, - "step": 2017, - "time_per_iteration": 2.7042336463928223 - }, - { - "auxiliary_loss_clip": 0.01156022, - "auxiliary_loss_mlp": 0.0105052, - "balance_loss_clip": 1.05479789, - "balance_loss_mlp": 1.02740538, - "epoch": 0.12132872388396213, - "flos": 13699275235200.0, - "grad_norm": 2.8700241463026654, - "language_loss": 0.68828821, - "learning_rate": 3.913140734857731e-06, - "loss": 0.71035373, - "num_input_tokens_seen": 43662650, - "step": 2018, - "time_per_iteration": 2.7015058994293213 - }, - { - "auxiliary_loss_clip": 0.01136376, - "auxiliary_loss_mlp": 0.01050749, - "balance_loss_clip": 1.05524123, - "balance_loss_mlp": 1.02873111, - "epoch": 0.12138884713663009, - "flos": 26466954307200.0, - "grad_norm": 1.6132330771570709, - "language_loss": 0.72476816, - "learning_rate": 3.91302716991575e-06, - "loss": 0.74663943, - "num_input_tokens_seen": 43684205, - "step": 2019, - "time_per_iteration": 2.8956947326660156 - }, - { - "auxiliary_loss_clip": 0.01107167, - "auxiliary_loss_mlp": 0.01057916, - "balance_loss_clip": 1.05286384, - "balance_loss_mlp": 1.03482556, - "epoch": 0.12144897038929806, - "flos": 26141603892480.0, - "grad_norm": 1.853626515444831, - "language_loss": 0.92125106, - "learning_rate": 3.912913532431586e-06, - "loss": 0.94290185, - "num_input_tokens_seen": 43706320, - "step": 2020, - "time_per_iteration": 2.9980764389038086 - }, - { - "auxiliary_loss_clip": 0.0114145, - "auxiliary_loss_mlp": 0.01055455, - "balance_loss_clip": 1.05289125, - "balance_loss_mlp": 1.03360391, - "epoch": 0.12150909364196603, - "flos": 24717530309760.0, - "grad_norm": 1.9227427415613194, - "language_loss": 0.7772885, - "learning_rate": 3.912799822409549e-06, - "loss": 0.79925752, - "num_input_tokens_seen": 43724805, - "step": 2021, - "time_per_iteration": 3.01798939704895 - }, - { - "auxiliary_loss_clip": 0.0117749, - "auxiliary_loss_mlp": 0.01049007, - "balance_loss_clip": 1.0610733, - "balance_loss_mlp": 1.0277164, - "epoch": 0.121569216894634, - "flos": 25186990089600.0, - "grad_norm": 2.054228820960504, - "language_loss": 0.80712306, - "learning_rate": 3.912686039853952e-06, - "loss": 0.82938808, - "num_input_tokens_seen": 43742320, - "step": 2022, - "time_per_iteration": 2.684309244155884 - }, - { - "auxiliary_loss_clip": 0.01144749, - "auxiliary_loss_mlp": 0.0106163, - "balance_loss_clip": 1.055619, - "balance_loss_mlp": 1.03697765, - "epoch": 0.12162934014730196, - "flos": 13444094039040.0, - "grad_norm": 1.734031517866852, - "language_loss": 0.84842217, - "learning_rate": 3.912572184769108e-06, - "loss": 0.87048596, - "num_input_tokens_seen": 43760665, - "step": 2023, - "time_per_iteration": 2.6886441707611084 - }, - { - "auxiliary_loss_clip": 0.01139348, - "auxiliary_loss_mlp": 0.01053043, - "balance_loss_clip": 1.05162323, - "balance_loss_mlp": 1.03081048, - "epoch": 0.12168946339996994, - "flos": 16946138344320.0, - "grad_norm": 2.3397199529221546, - "language_loss": 0.85514021, - "learning_rate": 3.912458257159335e-06, - "loss": 0.87706411, - "num_input_tokens_seen": 43779020, - "step": 2024, - "time_per_iteration": 2.8043718338012695 - }, - { - "auxiliary_loss_clip": 0.01169767, - "auxiliary_loss_mlp": 0.01055534, - "balance_loss_clip": 1.05277538, - "balance_loss_mlp": 1.03389716, - "epoch": 0.12174958665263791, - "flos": 29821585196160.0, - "grad_norm": 1.8432491304976684, - "language_loss": 0.72088945, - "learning_rate": 3.912344257028954e-06, - "loss": 0.74314243, - "num_input_tokens_seen": 43798850, - "step": 2025, - "time_per_iteration": 2.704876184463501 - }, - { - "auxiliary_loss_clip": 0.01148564, - "auxiliary_loss_mlp": 0.01047618, - "balance_loss_clip": 1.05486572, - "balance_loss_mlp": 1.02555275, - "epoch": 0.12180970990530587, - "flos": 24641902224000.0, - "grad_norm": 1.4969552271445652, - "language_loss": 0.76075011, - "learning_rate": 3.912230184382286e-06, - "loss": 0.78271192, - "num_input_tokens_seen": 43820130, - "step": 2026, - "time_per_iteration": 2.6957921981811523 - }, - { - "auxiliary_loss_clip": 0.01147374, - "auxiliary_loss_mlp": 0.01046261, - "balance_loss_clip": 1.05086374, - "balance_loss_mlp": 1.02474427, - "epoch": 0.12186983315797385, - "flos": 20521691832960.0, - "grad_norm": 2.2064263994277478, - "language_loss": 0.88769746, - "learning_rate": 3.912116039223659e-06, - "loss": 0.90963376, - "num_input_tokens_seen": 43838485, - "step": 2027, - "time_per_iteration": 2.6847639083862305 - }, - { - "auxiliary_loss_clip": 0.01143778, - "auxiliary_loss_mlp": 0.01056715, - "balance_loss_clip": 1.05258501, - "balance_loss_mlp": 1.03667617, - "epoch": 0.12192995641064182, - "flos": 27818344719360.0, - "grad_norm": 1.5725885574076592, - "language_loss": 0.75544459, - "learning_rate": 3.912001821557399e-06, - "loss": 0.77744961, - "num_input_tokens_seen": 43859080, - "step": 2028, - "time_per_iteration": 2.7706027030944824 - }, - { - "auxiliary_loss_clip": 0.01123185, - "auxiliary_loss_mlp": 0.01057136, - "balance_loss_clip": 1.0518471, - "balance_loss_mlp": 1.03554714, - "epoch": 0.12199007966330978, - "flos": 22017119783040.0, - "grad_norm": 2.0550419223931193, - "language_loss": 0.76802504, - "learning_rate": 3.911887531387839e-06, - "loss": 0.78982824, - "num_input_tokens_seen": 43879030, - "step": 2029, - "time_per_iteration": 2.732637405395508 - }, - { - "auxiliary_loss_clip": 0.01156591, - "auxiliary_loss_mlp": 0.01052355, - "balance_loss_clip": 1.05253625, - "balance_loss_mlp": 1.03107572, - "epoch": 0.12205020291597775, - "flos": 23295216493440.0, - "grad_norm": 1.707195979328818, - "language_loss": 0.79164296, - "learning_rate": 3.911773168719313e-06, - "loss": 0.81373239, - "num_input_tokens_seen": 43898505, - "step": 2030, - "time_per_iteration": 2.7254061698913574 - }, - { - "auxiliary_loss_clip": 0.0116997, - "auxiliary_loss_mlp": 0.01051357, - "balance_loss_clip": 1.05618095, - "balance_loss_mlp": 1.02930319, - "epoch": 0.12211032616864573, - "flos": 26031609469440.0, - "grad_norm": 3.038077546298312, - "language_loss": 0.74411637, - "learning_rate": 3.911658733556155e-06, - "loss": 0.76632965, - "num_input_tokens_seen": 43917945, - "step": 2031, - "time_per_iteration": 2.6711080074310303 - }, - { - "auxiliary_loss_clip": 0.01174332, - "auxiliary_loss_mlp": 0.01045812, - "balance_loss_clip": 1.05888343, - "balance_loss_mlp": 1.02545118, - "epoch": 0.12217044942131369, - "flos": 20410943224320.0, - "grad_norm": 1.7636188348969384, - "language_loss": 0.75230348, - "learning_rate": 3.911544225902707e-06, - "loss": 0.7745049, - "num_input_tokens_seen": 43937385, - "step": 2032, - "time_per_iteration": 2.7134530544281006 - }, - { - "auxiliary_loss_clip": 0.01152363, - "auxiliary_loss_mlp": 0.01045735, - "balance_loss_clip": 1.05129802, - "balance_loss_mlp": 1.02538586, - "epoch": 0.12223057267398166, - "flos": 22857142222080.0, - "grad_norm": 1.5809359138264147, - "language_loss": 0.89502287, - "learning_rate": 3.911429645763311e-06, - "loss": 0.91700387, - "num_input_tokens_seen": 43958130, - "step": 2033, - "time_per_iteration": 2.7105965614318848 - }, - { - "auxiliary_loss_clip": 0.01155694, - "auxiliary_loss_mlp": 0.01051169, - "balance_loss_clip": 1.05740523, - "balance_loss_mlp": 1.03005767, - "epoch": 0.12229069592664964, - "flos": 20047563285120.0, - "grad_norm": 1.9580868921695649, - "language_loss": 0.65195286, - "learning_rate": 3.911314993142311e-06, - "loss": 0.67402148, - "num_input_tokens_seen": 43976800, - "step": 2034, - "time_per_iteration": 4.222668886184692 - }, - { - "auxiliary_loss_clip": 0.01152239, - "auxiliary_loss_mlp": 0.01055659, - "balance_loss_clip": 1.05550218, - "balance_loss_mlp": 1.0327704, - "epoch": 0.1223508191793176, - "flos": 22274240313600.0, - "grad_norm": 1.6376942269871653, - "language_loss": 0.76459455, - "learning_rate": 3.911200268044055e-06, - "loss": 0.78667355, - "num_input_tokens_seen": 43996620, - "step": 2035, - "time_per_iteration": 2.7306556701660156 - }, - { - "auxiliary_loss_clip": 0.01176703, - "auxiliary_loss_mlp": 0.01050008, - "balance_loss_clip": 1.0577215, - "balance_loss_mlp": 1.02798975, - "epoch": 0.12241094243198557, - "flos": 21285978445440.0, - "grad_norm": 1.8460180606974623, - "language_loss": 0.71294892, - "learning_rate": 3.911085470472892e-06, - "loss": 0.73521602, - "num_input_tokens_seen": 44016175, - "step": 2036, - "time_per_iteration": 2.7327258586883545 - }, - { - "auxiliary_loss_clip": 0.01144473, - "auxiliary_loss_mlp": 0.01058389, - "balance_loss_clip": 1.05778408, - "balance_loss_mlp": 1.03623962, - "epoch": 0.12247106568465355, - "flos": 17382381022080.0, - "grad_norm": 1.5772021569883852, - "language_loss": 0.83130831, - "learning_rate": 3.910970600433178e-06, - "loss": 0.85333693, - "num_input_tokens_seen": 44035060, - "step": 2037, - "time_per_iteration": 4.248440742492676 - }, - { - "auxiliary_loss_clip": 0.01153641, - "auxiliary_loss_mlp": 0.01060257, - "balance_loss_clip": 1.0556947, - "balance_loss_mlp": 1.0366174, - "epoch": 0.12253118893732151, - "flos": 27045438842880.0, - "grad_norm": 2.676780030246967, - "language_loss": 0.79765236, - "learning_rate": 3.910855657929267e-06, - "loss": 0.81979132, - "num_input_tokens_seen": 44053330, - "step": 2038, - "time_per_iteration": 2.7321341037750244 - }, - { - "auxiliary_loss_clip": 0.010642, - "auxiliary_loss_mlp": 0.00759248, - "balance_loss_clip": 1.02961969, - "balance_loss_mlp": 1.00006962, - "epoch": 0.12259131218998948, - "flos": 53861518368000.0, - "grad_norm": 0.8248048644272604, - "language_loss": 0.58659601, - "learning_rate": 3.910740642965518e-06, - "loss": 0.6048305, - "num_input_tokens_seen": 44107575, - "step": 2039, - "time_per_iteration": 4.739040851593018 - }, - { - "auxiliary_loss_clip": 0.01128, - "auxiliary_loss_mlp": 0.01064411, - "balance_loss_clip": 1.05292714, - "balance_loss_mlp": 1.03912663, - "epoch": 0.12265143544265744, - "flos": 17891917401600.0, - "grad_norm": 2.1548467753138136, - "language_loss": 0.80099291, - "learning_rate": 3.910625555546292e-06, - "loss": 0.82291704, - "num_input_tokens_seen": 44126075, - "step": 2040, - "time_per_iteration": 2.723247766494751 - }, - { - "auxiliary_loss_clip": 0.01149343, - "auxiliary_loss_mlp": 0.01058534, - "balance_loss_clip": 1.05517352, - "balance_loss_mlp": 1.03673029, - "epoch": 0.12271155869532542, - "flos": 21799932197760.0, - "grad_norm": 1.8247690225218605, - "language_loss": 0.82841176, - "learning_rate": 3.910510395675953e-06, - "loss": 0.85049051, - "num_input_tokens_seen": 44145605, - "step": 2041, - "time_per_iteration": 2.699110984802246 - }, - { - "auxiliary_loss_clip": 0.01136001, - "auxiliary_loss_mlp": 0.01053451, - "balance_loss_clip": 1.05120957, - "balance_loss_mlp": 1.03061032, - "epoch": 0.12277168194799339, - "flos": 19828759587840.0, - "grad_norm": 1.9386136063873771, - "language_loss": 0.67272276, - "learning_rate": 3.9103951633588694e-06, - "loss": 0.69461727, - "num_input_tokens_seen": 44164770, - "step": 2042, - "time_per_iteration": 2.7042133808135986 - }, - { - "auxiliary_loss_clip": 0.01133115, - "auxiliary_loss_mlp": 0.01056941, - "balance_loss_clip": 1.05079007, - "balance_loss_mlp": 1.03517294, - "epoch": 0.12283180520066135, - "flos": 23221024951680.0, - "grad_norm": 1.912164915278887, - "language_loss": 0.81765604, - "learning_rate": 3.910279858599409e-06, - "loss": 0.83955657, - "num_input_tokens_seen": 44184025, - "step": 2043, - "time_per_iteration": 2.6942050457000732 - }, - { - "auxiliary_loss_clip": 0.01146416, - "auxiliary_loss_mlp": 0.01052365, - "balance_loss_clip": 1.05161905, - "balance_loss_mlp": 1.03040695, - "epoch": 0.12289192845332933, - "flos": 18588476920320.0, - "grad_norm": 1.7894844734354058, - "language_loss": 0.80192459, - "learning_rate": 3.910164481401946e-06, - "loss": 0.82391244, - "num_input_tokens_seen": 44202950, - "step": 2044, - "time_per_iteration": 2.6227192878723145 - }, - { - "auxiliary_loss_clip": 0.01116285, - "auxiliary_loss_mlp": 0.01052013, - "balance_loss_clip": 1.05284619, - "balance_loss_mlp": 1.03055525, - "epoch": 0.1229520517059973, - "flos": 25769532862080.0, - "grad_norm": 1.7152742607840916, - "language_loss": 0.7794897, - "learning_rate": 3.910049031770853e-06, - "loss": 0.80117267, - "num_input_tokens_seen": 44221115, - "step": 2045, - "time_per_iteration": 2.769017219543457 - }, - { - "auxiliary_loss_clip": 0.01163545, - "auxiliary_loss_mlp": 0.01060468, - "balance_loss_clip": 1.05796146, - "balance_loss_mlp": 1.03827095, - "epoch": 0.12301217495866526, - "flos": 20887154760960.0, - "grad_norm": 1.852572781372854, - "language_loss": 0.67284262, - "learning_rate": 3.90993350971051e-06, - "loss": 0.69508278, - "num_input_tokens_seen": 44240575, - "step": 2046, - "time_per_iteration": 2.6377944946289062 - }, - { - "auxiliary_loss_clip": 0.01173803, - "auxiliary_loss_mlp": 0.01053755, - "balance_loss_clip": 1.06010675, - "balance_loss_mlp": 1.03202295, - "epoch": 0.12307229821133324, - "flos": 22378811783040.0, - "grad_norm": 4.982373490718116, - "language_loss": 0.72730684, - "learning_rate": 3.909817915225297e-06, - "loss": 0.74958241, - "num_input_tokens_seen": 44257145, - "step": 2047, - "time_per_iteration": 2.5791239738464355 - }, - { - "auxiliary_loss_clip": 0.01155159, - "auxiliary_loss_mlp": 0.01060632, - "balance_loss_clip": 1.05398846, - "balance_loss_mlp": 1.03817296, - "epoch": 0.1231324214640012, - "flos": 23367396873600.0, - "grad_norm": 1.8194194024321948, - "language_loss": 0.76583183, - "learning_rate": 3.909702248319597e-06, - "loss": 0.78798974, - "num_input_tokens_seen": 44278035, - "step": 2048, - "time_per_iteration": 2.6997592449188232 - }, - { - "auxiliary_loss_clip": 0.01146796, - "auxiliary_loss_mlp": 0.01047309, - "balance_loss_clip": 1.05524468, - "balance_loss_mlp": 1.02798486, - "epoch": 0.12319254471666917, - "flos": 23767154311680.0, - "grad_norm": 1.8097490634569602, - "language_loss": 0.85359102, - "learning_rate": 3.909586508997797e-06, - "loss": 0.87553203, - "num_input_tokens_seen": 44296980, - "step": 2049, - "time_per_iteration": 2.739617109298706 - }, - { - "auxiliary_loss_clip": 0.01120276, - "auxiliary_loss_mlp": 0.01050145, - "balance_loss_clip": 1.0533725, - "balance_loss_mlp": 1.02887857, - "epoch": 0.12325266796933713, - "flos": 23550146294400.0, - "grad_norm": 2.6582136339172724, - "language_loss": 0.75563407, - "learning_rate": 3.909470697264285e-06, - "loss": 0.77733827, - "num_input_tokens_seen": 44318005, - "step": 2050, - "time_per_iteration": 2.7814078330993652 - }, - { - "auxiliary_loss_clip": 0.01138568, - "auxiliary_loss_mlp": 0.01057939, - "balance_loss_clip": 1.05428278, - "balance_loss_mlp": 1.03608823, - "epoch": 0.12331279122200511, - "flos": 24423996366720.0, - "grad_norm": 1.81408967902731, - "language_loss": 0.81166679, - "learning_rate": 3.909354813123452e-06, - "loss": 0.83363187, - "num_input_tokens_seen": 44335260, - "step": 2051, - "time_per_iteration": 2.7555224895477295 - }, - { - "auxiliary_loss_clip": 0.01171646, - "auxiliary_loss_mlp": 0.00779218, - "balance_loss_clip": 1.05882978, - "balance_loss_mlp": 0.99996465, - "epoch": 0.12337291447467308, - "flos": 25484294960640.0, - "grad_norm": 1.8885516327307212, - "language_loss": 0.80445349, - "learning_rate": 3.909238856579693e-06, - "loss": 0.82396215, - "num_input_tokens_seen": 44355315, - "step": 2052, - "time_per_iteration": 2.7676405906677246 - }, - { - "auxiliary_loss_clip": 0.01165489, - "auxiliary_loss_mlp": 0.010569, - "balance_loss_clip": 1.0581975, - "balance_loss_mlp": 1.03537059, - "epoch": 0.12343303772734104, - "flos": 23550002640000.0, - "grad_norm": 2.171205541070781, - "language_loss": 0.73676848, - "learning_rate": 3.909122827637406e-06, - "loss": 0.75899243, - "num_input_tokens_seen": 44373020, - "step": 2053, - "time_per_iteration": 2.648609161376953 - }, - { - "auxiliary_loss_clip": 0.01168883, - "auxiliary_loss_mlp": 0.00778478, - "balance_loss_clip": 1.05302441, - "balance_loss_mlp": 0.99995315, - "epoch": 0.12349316098000902, - "flos": 47557074867840.0, - "grad_norm": 1.5051513438882418, - "language_loss": 0.7413671, - "learning_rate": 3.909006726300991e-06, - "loss": 0.76084077, - "num_input_tokens_seen": 44397525, - "step": 2054, - "time_per_iteration": 2.871469020843506 - }, - { - "auxiliary_loss_clip": 0.01147607, - "auxiliary_loss_mlp": 0.01044612, - "balance_loss_clip": 1.05402803, - "balance_loss_mlp": 1.02482307, - "epoch": 0.12355328423267699, - "flos": 25045969294080.0, - "grad_norm": 4.50189877271012, - "language_loss": 0.85417157, - "learning_rate": 3.908890552574849e-06, - "loss": 0.8760938, - "num_input_tokens_seen": 44415890, - "step": 2055, - "time_per_iteration": 2.7136077880859375 - }, - { - "auxiliary_loss_clip": 0.01133829, - "auxiliary_loss_mlp": 0.01047458, - "balance_loss_clip": 1.05999517, - "balance_loss_mlp": 1.02802706, - "epoch": 0.12361340748534495, - "flos": 27709140395520.0, - "grad_norm": 2.0629908776416688, - "language_loss": 0.77506042, - "learning_rate": 3.908774306463384e-06, - "loss": 0.79687333, - "num_input_tokens_seen": 44436625, - "step": 2056, - "time_per_iteration": 2.83107852935791 - }, - { - "auxiliary_loss_clip": 0.01158234, - "auxiliary_loss_mlp": 0.01055, - "balance_loss_clip": 1.05444396, - "balance_loss_mlp": 1.03405499, - "epoch": 0.12367353073801293, - "flos": 26140598311680.0, - "grad_norm": 1.9893743253373262, - "language_loss": 0.83361745, - "learning_rate": 3.908657987971009e-06, - "loss": 0.85574985, - "num_input_tokens_seen": 44455265, - "step": 2057, - "time_per_iteration": 2.6987085342407227 - }, - { - "auxiliary_loss_clip": 0.01141319, - "auxiliary_loss_mlp": 0.01051708, - "balance_loss_clip": 1.05057144, - "balance_loss_mlp": 1.02991605, - "epoch": 0.1237336539906809, - "flos": 25156035544320.0, - "grad_norm": 1.4905135493793764, - "language_loss": 0.77818203, - "learning_rate": 3.90854159710213e-06, - "loss": 0.80011231, - "num_input_tokens_seen": 44475815, - "step": 2058, - "time_per_iteration": 2.7149016857147217 - }, - { - "auxiliary_loss_clip": 0.01138087, - "auxiliary_loss_mlp": 0.01058134, - "balance_loss_clip": 1.05117273, - "balance_loss_mlp": 1.03482866, - "epoch": 0.12379377724334886, - "flos": 15304589867520.0, - "grad_norm": 1.8387803476985631, - "language_loss": 0.8342883, - "learning_rate": 3.9084251338611624e-06, - "loss": 0.85625052, - "num_input_tokens_seen": 44494045, - "step": 2059, - "time_per_iteration": 2.7030091285705566 - }, - { - "auxiliary_loss_clip": 0.01133517, - "auxiliary_loss_mlp": 0.01057399, - "balance_loss_clip": 1.05123472, - "balance_loss_mlp": 1.03445077, - "epoch": 0.12385390049601683, - "flos": 21316717509120.0, - "grad_norm": 2.7478129466394217, - "language_loss": 0.81420219, - "learning_rate": 3.908308598252523e-06, - "loss": 0.83611137, - "num_input_tokens_seen": 44509120, - "step": 2060, - "time_per_iteration": 2.738499402999878 - }, - { - "auxiliary_loss_clip": 0.01150334, - "auxiliary_loss_mlp": 0.01054424, - "balance_loss_clip": 1.05367386, - "balance_loss_mlp": 1.0315125, - "epoch": 0.1239140237486848, - "flos": 15116309752320.0, - "grad_norm": 1.8699548955873522, - "language_loss": 0.86224365, - "learning_rate": 3.9081919902806306e-06, - "loss": 0.88429129, - "num_input_tokens_seen": 44525780, - "step": 2061, - "time_per_iteration": 2.6492960453033447 - }, - { - "auxiliary_loss_clip": 0.0115523, - "auxiliary_loss_mlp": 0.01050307, - "balance_loss_clip": 1.05506253, - "balance_loss_mlp": 1.03031528, - "epoch": 0.12397414700135277, - "flos": 21976791788160.0, - "grad_norm": 2.006361909654615, - "language_loss": 0.84949362, - "learning_rate": 3.908075309949906e-06, - "loss": 0.87154901, - "num_input_tokens_seen": 44543125, - "step": 2062, - "time_per_iteration": 2.5925393104553223 - }, - { - "auxiliary_loss_clip": 0.01124676, - "auxiliary_loss_mlp": 0.01058304, - "balance_loss_clip": 1.05198252, - "balance_loss_mlp": 1.03498697, - "epoch": 0.12403427025402074, - "flos": 13400892956160.0, - "grad_norm": 1.6181471799462952, - "language_loss": 0.78765064, - "learning_rate": 3.907958557264774e-06, - "loss": 0.80948043, - "num_input_tokens_seen": 44560275, - "step": 2063, - "time_per_iteration": 2.7551674842834473 - }, - { - "auxiliary_loss_clip": 0.01124369, - "auxiliary_loss_mlp": 0.01057465, - "balance_loss_clip": 1.05492854, - "balance_loss_mlp": 1.03450513, - "epoch": 0.12409439350668872, - "flos": 15304374385920.0, - "grad_norm": 2.9315517002695017, - "language_loss": 0.79452097, - "learning_rate": 3.907841732229663e-06, - "loss": 0.81633931, - "num_input_tokens_seen": 44577640, - "step": 2064, - "time_per_iteration": 2.699711322784424 - }, - { - "auxiliary_loss_clip": 0.01144709, - "auxiliary_loss_mlp": 0.01058768, - "balance_loss_clip": 1.05316699, - "balance_loss_mlp": 1.03847849, - "epoch": 0.12415451675935668, - "flos": 25009376313600.0, - "grad_norm": 2.5611248351266016, - "language_loss": 0.92676973, - "learning_rate": 3.907724834849002e-06, - "loss": 0.9488045, - "num_input_tokens_seen": 44594860, - "step": 2065, - "time_per_iteration": 2.7114996910095215 - }, - { - "auxiliary_loss_clip": 0.01147841, - "auxiliary_loss_mlp": 0.01052058, - "balance_loss_clip": 1.05113554, - "balance_loss_mlp": 1.02943158, - "epoch": 0.12421464001202465, - "flos": 23659673840640.0, - "grad_norm": 1.7498294279318665, - "language_loss": 0.80540735, - "learning_rate": 3.907607865127225e-06, - "loss": 0.82740629, - "num_input_tokens_seen": 44614780, - "step": 2066, - "time_per_iteration": 2.6958389282226562 - }, - { - "auxiliary_loss_clip": 0.01030831, - "auxiliary_loss_mlp": 0.01051436, - "balance_loss_clip": 1.02768898, - "balance_loss_mlp": 1.04884958, - "epoch": 0.12427476326469263, - "flos": 65732904345600.0, - "grad_norm": 0.8715885531008962, - "language_loss": 0.63299954, - "learning_rate": 3.907490823068766e-06, - "loss": 0.6538223, - "num_input_tokens_seen": 44671240, - "step": 2067, - "time_per_iteration": 3.200000762939453 - }, - { - "auxiliary_loss_clip": 0.01117858, - "auxiliary_loss_mlp": 0.01057985, - "balance_loss_clip": 1.04878855, - "balance_loss_mlp": 1.0344646, - "epoch": 0.12433488651736059, - "flos": 24535427333760.0, - "grad_norm": 1.9218217735084064, - "language_loss": 0.93783462, - "learning_rate": 3.907373708678063e-06, - "loss": 0.959593, - "num_input_tokens_seen": 44691050, - "step": 2068, - "time_per_iteration": 2.7631025314331055 - }, - { - "auxiliary_loss_clip": 0.01166393, - "auxiliary_loss_mlp": 0.0105657, - "balance_loss_clip": 1.05994427, - "balance_loss_mlp": 1.03697169, - "epoch": 0.12439500977002856, - "flos": 21031659175680.0, - "grad_norm": 1.8717926968048342, - "language_loss": 0.80861229, - "learning_rate": 3.9072565219595596e-06, - "loss": 0.83084196, - "num_input_tokens_seen": 44709850, - "step": 2069, - "time_per_iteration": 2.6630098819732666 - }, - { - "auxiliary_loss_clip": 0.01113262, - "auxiliary_loss_mlp": 0.01062592, - "balance_loss_clip": 1.04863238, - "balance_loss_mlp": 1.03963184, - "epoch": 0.12445513302269653, - "flos": 26830621555200.0, - "grad_norm": 1.5649570979854035, - "language_loss": 0.777978, - "learning_rate": 3.907139262917696e-06, - "loss": 0.79973656, - "num_input_tokens_seen": 44731475, - "step": 2070, - "time_per_iteration": 2.7750463485717773 - }, - { - "auxiliary_loss_clip": 0.01156875, - "auxiliary_loss_mlp": 0.01052509, - "balance_loss_clip": 1.05520415, - "balance_loss_mlp": 1.03055048, - "epoch": 0.1245152562753645, - "flos": 18368919037440.0, - "grad_norm": 2.2051981544638166, - "language_loss": 0.80743957, - "learning_rate": 3.907021931556922e-06, - "loss": 0.8295334, - "num_input_tokens_seen": 44749685, - "step": 2071, - "time_per_iteration": 2.654171943664551 - }, - { - "auxiliary_loss_clip": 0.01154683, - "auxiliary_loss_mlp": 0.01055767, - "balance_loss_clip": 1.05492425, - "balance_loss_mlp": 1.03405952, - "epoch": 0.12457537952803246, - "flos": 33107986200960.0, - "grad_norm": 2.118828414072521, - "language_loss": 0.78278041, - "learning_rate": 3.906904527881684e-06, - "loss": 0.80488491, - "num_input_tokens_seen": 44772165, - "step": 2072, - "time_per_iteration": 2.753159284591675 - }, - { - "auxiliary_loss_clip": 0.0114568, - "auxiliary_loss_mlp": 0.01055287, - "balance_loss_clip": 1.05651307, - "balance_loss_mlp": 1.03381729, - "epoch": 0.12463550278070043, - "flos": 22270217990400.0, - "grad_norm": 7.360489773093417, - "language_loss": 0.752267, - "learning_rate": 3.9067870518964355e-06, - "loss": 0.77427667, - "num_input_tokens_seen": 44790580, - "step": 2073, - "time_per_iteration": 2.6561899185180664 - }, - { - "auxiliary_loss_clip": 0.01096485, - "auxiliary_loss_mlp": 0.01053193, - "balance_loss_clip": 1.04471385, - "balance_loss_mlp": 1.03086543, - "epoch": 0.12469562603336841, - "flos": 14679025580160.0, - "grad_norm": 1.9234955386089483, - "language_loss": 0.90560025, - "learning_rate": 3.906669503605631e-06, - "loss": 0.92709696, - "num_input_tokens_seen": 44806730, - "step": 2074, - "time_per_iteration": 2.7846343517303467 - }, - { - "auxiliary_loss_clip": 0.01105332, - "auxiliary_loss_mlp": 0.01056651, - "balance_loss_clip": 1.04977274, - "balance_loss_mlp": 1.03346491, - "epoch": 0.12475574928603637, - "flos": 24644775312000.0, - "grad_norm": 2.8321626325497493, - "language_loss": 0.83836985, - "learning_rate": 3.906551883013728e-06, - "loss": 0.8599897, - "num_input_tokens_seen": 44825550, - "step": 2075, - "time_per_iteration": 4.412928342819214 - }, - { - "auxiliary_loss_clip": 0.01107078, - "auxiliary_loss_mlp": 0.01062819, - "balance_loss_clip": 1.04380202, - "balance_loss_mlp": 1.03972864, - "epoch": 0.12481587253870434, - "flos": 21762980081280.0, - "grad_norm": 2.042892519020311, - "language_loss": 0.73648787, - "learning_rate": 3.9064341901251865e-06, - "loss": 0.75818682, - "num_input_tokens_seen": 44844155, - "step": 2076, - "time_per_iteration": 5.925223112106323 - }, - { - "auxiliary_loss_clip": 0.01101731, - "auxiliary_loss_mlp": 0.01048176, - "balance_loss_clip": 1.04774427, - "balance_loss_mlp": 1.02751708, - "epoch": 0.12487599579137232, - "flos": 21432529935360.0, - "grad_norm": 1.8779339700875872, - "language_loss": 0.7622484, - "learning_rate": 3.906316424944469e-06, - "loss": 0.78374755, - "num_input_tokens_seen": 44863780, - "step": 2077, - "time_per_iteration": 2.70566987991333 - }, - { - "auxiliary_loss_clip": 0.01156274, - "auxiliary_loss_mlp": 0.01062042, - "balance_loss_clip": 1.05365288, - "balance_loss_mlp": 1.04001164, - "epoch": 0.12493611904404028, - "flos": 16107624276480.0, - "grad_norm": 2.022280968605665, - "language_loss": 0.82290226, - "learning_rate": 3.906198587476043e-06, - "loss": 0.84508544, - "num_input_tokens_seen": 44881480, - "step": 2078, - "time_per_iteration": 4.302385568618774 - }, - { - "auxiliary_loss_clip": 0.01144821, - "auxiliary_loss_mlp": 0.01050482, - "balance_loss_clip": 1.05281842, - "balance_loss_mlp": 1.02855957, - "epoch": 0.12499624229670825, - "flos": 21580266574080.0, - "grad_norm": 1.6413520418295044, - "language_loss": 0.75195324, - "learning_rate": 3.906080677724374e-06, - "loss": 0.77390629, - "num_input_tokens_seen": 44900390, - "step": 2079, - "time_per_iteration": 2.6915946006774902 - }, - { - "auxiliary_loss_clip": 0.01166758, - "auxiliary_loss_mlp": 0.01058474, - "balance_loss_clip": 1.05881989, - "balance_loss_mlp": 1.03696847, - "epoch": 0.1250563655493762, - "flos": 25699040421120.0, - "grad_norm": 6.733284446627088, - "language_loss": 0.83874094, - "learning_rate": 3.905962695693935e-06, - "loss": 0.86099327, - "num_input_tokens_seen": 44920375, - "step": 2080, - "time_per_iteration": 2.7467572689056396 - }, - { - "auxiliary_loss_clip": 0.01156163, - "auxiliary_loss_mlp": 0.01059409, - "balance_loss_clip": 1.05525088, - "balance_loss_mlp": 1.03885686, - "epoch": 0.12511648880204418, - "flos": 16909509450240.0, - "grad_norm": 1.8581885454518776, - "language_loss": 0.84644079, - "learning_rate": 3.9058446413892e-06, - "loss": 0.86859655, - "num_input_tokens_seen": 44938415, - "step": 2081, - "time_per_iteration": 2.685875654220581 - }, - { - "auxiliary_loss_clip": 0.01156835, - "auxiliary_loss_mlp": 0.01046398, - "balance_loss_clip": 1.05375946, - "balance_loss_mlp": 1.02594149, - "epoch": 0.12517661205471217, - "flos": 17567500740480.0, - "grad_norm": 1.8191819349610059, - "language_loss": 0.76739037, - "learning_rate": 3.905726514814646e-06, - "loss": 0.78942269, - "num_input_tokens_seen": 44957135, - "step": 2082, - "time_per_iteration": 2.6133053302764893 - }, - { - "auxiliary_loss_clip": 0.01152911, - "auxiliary_loss_mlp": 0.0104632, - "balance_loss_clip": 1.05701911, - "balance_loss_mlp": 1.02463615, - "epoch": 0.12523673530738014, - "flos": 16033791870720.0, - "grad_norm": 2.5415589476696265, - "language_loss": 0.79044539, - "learning_rate": 3.9056083159747495e-06, - "loss": 0.81243765, - "num_input_tokens_seen": 44974480, - "step": 2083, - "time_per_iteration": 2.6963307857513428 - }, - { - "auxiliary_loss_clip": 0.01147874, - "auxiliary_loss_mlp": 0.01047351, - "balance_loss_clip": 1.05509973, - "balance_loss_mlp": 1.02421284, - "epoch": 0.1252968585600481, - "flos": 18807747494400.0, - "grad_norm": 2.1696249857299, - "language_loss": 0.89831448, - "learning_rate": 3.9054900448739966e-06, - "loss": 0.92026675, - "num_input_tokens_seen": 44990310, - "step": 2084, - "time_per_iteration": 2.6770403385162354 - }, - { - "auxiliary_loss_clip": 0.01131068, - "auxiliary_loss_mlp": 0.01048299, - "balance_loss_clip": 1.05299771, - "balance_loss_mlp": 1.02729464, - "epoch": 0.12535698181271607, - "flos": 27271568914560.0, - "grad_norm": 1.8896331095253402, - "language_loss": 0.80354226, - "learning_rate": 3.905371701516869e-06, - "loss": 0.82533598, - "num_input_tokens_seen": 45010720, - "step": 2085, - "time_per_iteration": 2.749783515930176 - }, - { - "auxiliary_loss_clip": 0.01170318, - "auxiliary_loss_mlp": 0.01051018, - "balance_loss_clip": 1.05725896, - "balance_loss_mlp": 1.03001356, - "epoch": 0.12541710506538403, - "flos": 22054107813120.0, - "grad_norm": 1.8300316094254767, - "language_loss": 0.88228154, - "learning_rate": 3.905253285907856e-06, - "loss": 0.90449488, - "num_input_tokens_seen": 45030360, - "step": 2086, - "time_per_iteration": 2.603515148162842 - }, - { - "auxiliary_loss_clip": 0.01134598, - "auxiliary_loss_mlp": 0.01044925, - "balance_loss_clip": 1.05278981, - "balance_loss_mlp": 1.02522027, - "epoch": 0.125477228318052, - "flos": 12603173760000.0, - "grad_norm": 2.0471238132540344, - "language_loss": 0.86819696, - "learning_rate": 3.905134798051447e-06, - "loss": 0.88999224, - "num_input_tokens_seen": 45045085, - "step": 2087, - "time_per_iteration": 2.6265859603881836 - }, - { - "auxiliary_loss_clip": 0.01146999, - "auxiliary_loss_mlp": 0.01058875, - "balance_loss_clip": 1.05599046, - "balance_loss_mlp": 1.03651142, - "epoch": 0.12553735157071996, - "flos": 23878549365120.0, - "grad_norm": 2.3362397674907758, - "language_loss": 0.73027468, - "learning_rate": 3.905016237952136e-06, - "loss": 0.75233346, - "num_input_tokens_seen": 45065145, - "step": 2088, - "time_per_iteration": 2.65324330329895 - }, - { - "auxiliary_loss_clip": 0.01062529, - "auxiliary_loss_mlp": 0.01013405, - "balance_loss_clip": 1.02985716, - "balance_loss_mlp": 1.01079392, - "epoch": 0.12559747482338796, - "flos": 69920841830400.0, - "grad_norm": 0.7742255614948045, - "language_loss": 0.61767036, - "learning_rate": 3.904897605614418e-06, - "loss": 0.6384297, - "num_input_tokens_seen": 45126230, - "step": 2089, - "time_per_iteration": 3.1219804286956787 - }, - { - "auxiliary_loss_clip": 0.01149606, - "auxiliary_loss_mlp": 0.01060841, - "balance_loss_clip": 1.05670094, - "balance_loss_mlp": 1.0388943, - "epoch": 0.12565759807605592, - "flos": 24279563779200.0, - "grad_norm": 1.817095421446176, - "language_loss": 0.7781918, - "learning_rate": 3.904778901042793e-06, - "loss": 0.80029625, - "num_input_tokens_seen": 45145545, - "step": 2090, - "time_per_iteration": 2.700425863265991 - }, - { - "auxiliary_loss_clip": 0.01046946, - "auxiliary_loss_mlp": 0.01013884, - "balance_loss_clip": 1.03125095, - "balance_loss_mlp": 1.01101136, - "epoch": 0.12571772132872389, - "flos": 56451180286080.0, - "grad_norm": 0.760599485634597, - "language_loss": 0.59434772, - "learning_rate": 3.90466012424176e-06, - "loss": 0.61495602, - "num_input_tokens_seen": 45206845, - "step": 2091, - "time_per_iteration": 3.0814294815063477 - }, - { - "auxiliary_loss_clip": 0.01159814, - "auxiliary_loss_mlp": 0.01060546, - "balance_loss_clip": 1.05760789, - "balance_loss_mlp": 1.041067, - "epoch": 0.12577784458139185, - "flos": 41245846675200.0, - "grad_norm": 1.6552462178493936, - "language_loss": 0.62916517, - "learning_rate": 3.904541275215825e-06, - "loss": 0.6513688, - "num_input_tokens_seen": 45228495, - "step": 2092, - "time_per_iteration": 2.7813880443573 - }, - { - "auxiliary_loss_clip": 0.01147016, - "auxiliary_loss_mlp": 0.01061963, - "balance_loss_clip": 1.05395663, - "balance_loss_mlp": 1.04069614, - "epoch": 0.12583796783405982, - "flos": 19755501799680.0, - "grad_norm": 2.279616692029291, - "language_loss": 0.80507946, - "learning_rate": 3.904422353969493e-06, - "loss": 0.82716924, - "num_input_tokens_seen": 45245720, - "step": 2093, - "time_per_iteration": 2.6768014430999756 - }, - { - "auxiliary_loss_clip": 0.01146976, - "auxiliary_loss_mlp": 0.01075616, - "balance_loss_clip": 1.0524025, - "balance_loss_mlp": 1.05380058, - "epoch": 0.12589809108672778, - "flos": 22602104680320.0, - "grad_norm": 1.7347385846840702, - "language_loss": 0.76003867, - "learning_rate": 3.904303360507276e-06, - "loss": 0.78226459, - "num_input_tokens_seen": 45265650, - "step": 2094, - "time_per_iteration": 2.6730611324310303 - }, - { - "auxiliary_loss_clip": 0.01117887, - "auxiliary_loss_mlp": 0.01069309, - "balance_loss_clip": 1.0500071, - "balance_loss_mlp": 1.04892457, - "epoch": 0.12595821433939577, - "flos": 45222845541120.0, - "grad_norm": 1.5703706409155747, - "language_loss": 0.76664734, - "learning_rate": 3.9041842948336835e-06, - "loss": 0.78851926, - "num_input_tokens_seen": 45287790, - "step": 2095, - "time_per_iteration": 2.958367109298706 - }, - { - "auxiliary_loss_clip": 0.01147751, - "auxiliary_loss_mlp": 0.01058477, - "balance_loss_clip": 1.05202031, - "balance_loss_mlp": 1.03782988, - "epoch": 0.12601833759206374, - "flos": 14319811618560.0, - "grad_norm": 2.2556524892449326, - "language_loss": 0.83266854, - "learning_rate": 3.904065156953232e-06, - "loss": 0.85473078, - "num_input_tokens_seen": 45305720, - "step": 2096, - "time_per_iteration": 2.7097342014312744 - }, - { - "auxiliary_loss_clip": 0.01163652, - "auxiliary_loss_mlp": 0.01056552, - "balance_loss_clip": 1.05806553, - "balance_loss_mlp": 1.03577375, - "epoch": 0.1260784608447317, - "flos": 21288241002240.0, - "grad_norm": 1.7589400475615893, - "language_loss": 0.75478256, - "learning_rate": 3.903945946870439e-06, - "loss": 0.77698463, - "num_input_tokens_seen": 45325290, - "step": 2097, - "time_per_iteration": 2.642056703567505 - }, - { - "auxiliary_loss_clip": 0.01156719, - "auxiliary_loss_mlp": 0.01063976, - "balance_loss_clip": 1.05648863, - "balance_loss_mlp": 1.04527175, - "epoch": 0.12613858409739967, - "flos": 26251311006720.0, - "grad_norm": 1.8828235460619742, - "language_loss": 0.87110066, - "learning_rate": 3.9038266645898246e-06, - "loss": 0.89330757, - "num_input_tokens_seen": 45344465, - "step": 2098, - "time_per_iteration": 2.63826584815979 - }, - { - "auxiliary_loss_clip": 0.01117414, - "auxiliary_loss_mlp": 0.01058025, - "balance_loss_clip": 1.04983974, - "balance_loss_mlp": 1.03475559, - "epoch": 0.12619870735006763, - "flos": 21579979265280.0, - "grad_norm": 1.8855647331078333, - "language_loss": 0.69494271, - "learning_rate": 3.903707310115912e-06, - "loss": 0.7166971, - "num_input_tokens_seen": 45362465, - "step": 2099, - "time_per_iteration": 2.7813057899475098 - }, - { - "auxiliary_loss_clip": 0.01142696, - "auxiliary_loss_mlp": 0.01061431, - "balance_loss_clip": 1.04979372, - "balance_loss_mlp": 1.03923464, - "epoch": 0.1262588306027356, - "flos": 23367037737600.0, - "grad_norm": 2.0457253500590498, - "language_loss": 0.81949925, - "learning_rate": 3.903587883453228e-06, - "loss": 0.84154058, - "num_input_tokens_seen": 45382700, - "step": 2100, - "time_per_iteration": 2.704871416091919 - }, - { - "auxiliary_loss_clip": 0.01159613, - "auxiliary_loss_mlp": 0.01055067, - "balance_loss_clip": 1.0620985, - "balance_loss_mlp": 1.03408623, - "epoch": 0.12631895385540357, - "flos": 23949185460480.0, - "grad_norm": 1.7810176086536167, - "language_loss": 0.80399859, - "learning_rate": 3.903468384606302e-06, - "loss": 0.82614541, - "num_input_tokens_seen": 45401005, - "step": 2101, - "time_per_iteration": 2.7071452140808105 - }, - { - "auxiliary_loss_clip": 0.0106985, - "auxiliary_loss_mlp": 0.01010859, - "balance_loss_clip": 1.02823138, - "balance_loss_mlp": 1.00803375, - "epoch": 0.12637907710807156, - "flos": 70282138780800.0, - "grad_norm": 0.7128618749962091, - "language_loss": 0.57087427, - "learning_rate": 3.903348813579662e-06, - "loss": 0.59168136, - "num_input_tokens_seen": 45466555, - "step": 2102, - "time_per_iteration": 3.20320987701416 - }, - { - "auxiliary_loss_clip": 0.01140495, - "auxiliary_loss_mlp": 0.01056574, - "balance_loss_clip": 1.053671, - "balance_loss_mlp": 1.03661788, - "epoch": 0.12643920036073952, - "flos": 18915084311040.0, - "grad_norm": 2.0306165352193988, - "language_loss": 0.93653679, - "learning_rate": 3.903229170377845e-06, - "loss": 0.95850742, - "num_input_tokens_seen": 45485165, - "step": 2103, - "time_per_iteration": 2.6628894805908203 - }, - { - "auxiliary_loss_clip": 0.01144405, - "auxiliary_loss_mlp": 0.01040745, - "balance_loss_clip": 1.04991472, - "balance_loss_mlp": 1.02174282, - "epoch": 0.1264993236134075, - "flos": 27782470010880.0, - "grad_norm": 1.5962316578756222, - "language_loss": 0.7804662, - "learning_rate": 3.903109455005387e-06, - "loss": 0.80231774, - "num_input_tokens_seen": 45504630, - "step": 2104, - "time_per_iteration": 2.6215474605560303 - }, - { - "auxiliary_loss_clip": 0.01135927, - "auxiliary_loss_mlp": 0.01056343, - "balance_loss_clip": 1.05414486, - "balance_loss_mlp": 1.03683996, - "epoch": 0.12655944686607545, - "flos": 24754697907840.0, - "grad_norm": 1.7362499149688688, - "language_loss": 0.80728614, - "learning_rate": 3.902989667466828e-06, - "loss": 0.82920885, - "num_input_tokens_seen": 45524885, - "step": 2105, - "time_per_iteration": 2.74128794670105 - }, - { - "auxiliary_loss_clip": 0.01162904, - "auxiliary_loss_mlp": 0.01056367, - "balance_loss_clip": 1.05482686, - "balance_loss_mlp": 1.03514743, - "epoch": 0.12661957011874342, - "flos": 24133048202880.0, - "grad_norm": 1.9810187943106816, - "language_loss": 0.83402872, - "learning_rate": 3.90286980776671e-06, - "loss": 0.85622144, - "num_input_tokens_seen": 45545000, - "step": 2106, - "time_per_iteration": 2.676694631576538 - }, - { - "auxiliary_loss_clip": 0.01126632, - "auxiliary_loss_mlp": 0.01052067, - "balance_loss_clip": 1.05697966, - "balance_loss_mlp": 1.03147984, - "epoch": 0.12667969337141138, - "flos": 24569614103040.0, - "grad_norm": 1.6951691508845637, - "language_loss": 0.73469931, - "learning_rate": 3.902749875909578e-06, - "loss": 0.7564863, - "num_input_tokens_seen": 45564210, - "step": 2107, - "time_per_iteration": 2.7506372928619385 - }, - { - "auxiliary_loss_clip": 0.01162931, - "auxiliary_loss_mlp": 0.01044317, - "balance_loss_clip": 1.05320692, - "balance_loss_mlp": 1.02599406, - "epoch": 0.12673981662407935, - "flos": 22961677777920.0, - "grad_norm": 2.0116792159666477, - "language_loss": 0.79395336, - "learning_rate": 3.90262987189998e-06, - "loss": 0.81602579, - "num_input_tokens_seen": 45583030, - "step": 2108, - "time_per_iteration": 2.6611146926879883 - }, - { - "auxiliary_loss_clip": 0.01168073, - "auxiliary_loss_mlp": 0.01049192, - "balance_loss_clip": 1.05300844, - "balance_loss_mlp": 1.02945089, - "epoch": 0.12679993987674734, - "flos": 17274864637440.0, - "grad_norm": 1.9298328790617403, - "language_loss": 0.7561394, - "learning_rate": 3.902509795742467e-06, - "loss": 0.77831209, - "num_input_tokens_seen": 45602265, - "step": 2109, - "time_per_iteration": 2.5963573455810547 - }, - { - "auxiliary_loss_clip": 0.01111025, - "auxiliary_loss_mlp": 0.01053822, - "balance_loss_clip": 1.04636049, - "balance_loss_mlp": 1.0335331, - "epoch": 0.1268600631294153, - "flos": 17275080119040.0, - "grad_norm": 1.6171901700648081, - "language_loss": 0.82806516, - "learning_rate": 3.902389647441592e-06, - "loss": 0.84971368, - "num_input_tokens_seen": 45620595, - "step": 2110, - "time_per_iteration": 2.6745550632476807 - }, - { - "auxiliary_loss_clip": 0.01145969, - "auxiliary_loss_mlp": 0.00778071, - "balance_loss_clip": 1.05419564, - "balance_loss_mlp": 0.99996144, - "epoch": 0.12692018638208327, - "flos": 24061047390720.0, - "grad_norm": 1.6765217216011241, - "language_loss": 0.78092968, - "learning_rate": 3.90226942700191e-06, - "loss": 0.80017006, - "num_input_tokens_seen": 45641140, - "step": 2111, - "time_per_iteration": 2.65983510017395 - }, - { - "auxiliary_loss_clip": 0.01130932, - "auxiliary_loss_mlp": 0.01076547, - "balance_loss_clip": 1.05490458, - "balance_loss_mlp": 1.05352807, - "epoch": 0.12698030963475124, - "flos": 31831900652160.0, - "grad_norm": 2.15738266202174, - "language_loss": 0.77103376, - "learning_rate": 3.902149134427982e-06, - "loss": 0.79310858, - "num_input_tokens_seen": 45662315, - "step": 2112, - "time_per_iteration": 2.870299816131592 - }, - { - "auxiliary_loss_clip": 0.01129438, - "auxiliary_loss_mlp": 0.01074863, - "balance_loss_clip": 1.05213726, - "balance_loss_mlp": 1.05427516, - "epoch": 0.1270404328874192, - "flos": 25187744275200.0, - "grad_norm": 1.9191529425470424, - "language_loss": 0.85806453, - "learning_rate": 3.902028769724367e-06, - "loss": 0.88010758, - "num_input_tokens_seen": 45680335, - "step": 2113, - "time_per_iteration": 4.26338267326355 - }, - { - "auxiliary_loss_clip": 0.01137468, - "auxiliary_loss_mlp": 0.01078067, - "balance_loss_clip": 1.05511892, - "balance_loss_mlp": 1.05670488, - "epoch": 0.12710055614008717, - "flos": 15997342544640.0, - "grad_norm": 1.9721234476704599, - "language_loss": 0.74027002, - "learning_rate": 3.9019083328956315e-06, - "loss": 0.7624253, - "num_input_tokens_seen": 45696240, - "step": 2114, - "time_per_iteration": 2.7573230266571045 - }, - { - "auxiliary_loss_clip": 0.01156713, - "auxiliary_loss_mlp": 0.01060574, - "balance_loss_clip": 1.05770111, - "balance_loss_mlp": 1.03924704, - "epoch": 0.12716067939275516, - "flos": 15085642515840.0, - "grad_norm": 1.7921743813213327, - "language_loss": 0.83240676, - "learning_rate": 3.901787823946341e-06, - "loss": 0.85457963, - "num_input_tokens_seen": 45713695, - "step": 2115, - "time_per_iteration": 4.1369829177856445 - }, - { - "auxiliary_loss_clip": 0.01154653, - "auxiliary_loss_mlp": 0.01065557, - "balance_loss_clip": 1.05875492, - "balance_loss_mlp": 1.04476702, - "epoch": 0.12722080264542313, - "flos": 28366736636160.0, - "grad_norm": 1.4840591347809418, - "language_loss": 0.87010503, - "learning_rate": 3.901667242881065e-06, - "loss": 0.89230716, - "num_input_tokens_seen": 45736655, - "step": 2116, - "time_per_iteration": 2.73896861076355 - }, - { - "auxiliary_loss_clip": 0.01139498, - "auxiliary_loss_mlp": 0.00777066, - "balance_loss_clip": 1.05413389, - "balance_loss_mlp": 0.99995339, - "epoch": 0.1272809258980911, - "flos": 32379897519360.0, - "grad_norm": 1.753205985010591, - "language_loss": 0.70374918, - "learning_rate": 3.9015465897043775e-06, - "loss": 0.72291481, - "num_input_tokens_seen": 45758195, - "step": 2117, - "time_per_iteration": 2.783156156539917 - }, - { - "auxiliary_loss_clip": 0.01127455, - "auxiliary_loss_mlp": 0.0106424, - "balance_loss_clip": 1.04978406, - "balance_loss_mlp": 1.04068434, - "epoch": 0.12734104915075906, - "flos": 16034402401920.0, - "grad_norm": 1.9957647698478755, - "language_loss": 0.86237884, - "learning_rate": 3.901425864420852e-06, - "loss": 0.8842957, - "num_input_tokens_seen": 45774280, - "step": 2118, - "time_per_iteration": 4.322036266326904 - }, - { - "auxiliary_loss_clip": 0.01161417, - "auxiliary_loss_mlp": 0.01049008, - "balance_loss_clip": 1.05827069, - "balance_loss_mlp": 1.02951694, - "epoch": 0.12740117240342702, - "flos": 18260325244800.0, - "grad_norm": 1.705293179953873, - "language_loss": 0.87577266, - "learning_rate": 3.901305067035068e-06, - "loss": 0.89787692, - "num_input_tokens_seen": 45792760, - "step": 2119, - "time_per_iteration": 2.6559741497039795 - }, - { - "auxiliary_loss_clip": 0.01145426, - "auxiliary_loss_mlp": 0.0077754, - "balance_loss_clip": 1.05233431, - "balance_loss_mlp": 0.99984539, - "epoch": 0.127461295656095, - "flos": 12121790664960.0, - "grad_norm": 2.05013605026053, - "language_loss": 0.87824571, - "learning_rate": 3.901184197551605e-06, - "loss": 0.89747536, - "num_input_tokens_seen": 45804300, - "step": 2120, - "time_per_iteration": 2.6154048442840576 - }, - { - "auxiliary_loss_clip": 0.01170497, - "auxiliary_loss_mlp": 0.01046075, - "balance_loss_clip": 1.05822706, - "balance_loss_mlp": 1.02626204, - "epoch": 0.12752141890876295, - "flos": 23149095966720.0, - "grad_norm": 1.9784951602308867, - "language_loss": 0.75584805, - "learning_rate": 3.901063255975046e-06, - "loss": 0.77801377, - "num_input_tokens_seen": 45823780, - "step": 2121, - "time_per_iteration": 2.579265832901001 - }, - { - "auxiliary_loss_clip": 0.0111249, - "auxiliary_loss_mlp": 0.01047949, - "balance_loss_clip": 1.04741263, - "balance_loss_mlp": 1.02727842, - "epoch": 0.12758154216143094, - "flos": 21615997628160.0, - "grad_norm": 2.0293629108662405, - "language_loss": 0.82732606, - "learning_rate": 3.900942242309978e-06, - "loss": 0.84893048, - "num_input_tokens_seen": 45840495, - "step": 2122, - "time_per_iteration": 2.793870210647583 - }, - { - "auxiliary_loss_clip": 0.01151713, - "auxiliary_loss_mlp": 0.01049724, - "balance_loss_clip": 1.05901408, - "balance_loss_mlp": 1.02983987, - "epoch": 0.1276416654140989, - "flos": 15924874855680.0, - "grad_norm": 1.7660235451894624, - "language_loss": 0.78699338, - "learning_rate": 3.90082115656099e-06, - "loss": 0.80900776, - "num_input_tokens_seen": 45857735, - "step": 2123, - "time_per_iteration": 2.70546293258667 - }, - { - "auxiliary_loss_clip": 0.01172823, - "auxiliary_loss_mlp": 0.01055328, - "balance_loss_clip": 1.05931985, - "balance_loss_mlp": 1.03478789, - "epoch": 0.12770178866676687, - "flos": 22382690451840.0, - "grad_norm": 1.5643885422181942, - "language_loss": 0.78931451, - "learning_rate": 3.900699998732673e-06, - "loss": 0.81159604, - "num_input_tokens_seen": 45876485, - "step": 2124, - "time_per_iteration": 2.661712408065796 - }, - { - "auxiliary_loss_clip": 0.01160474, - "auxiliary_loss_mlp": 0.00776885, - "balance_loss_clip": 1.05457389, - "balance_loss_mlp": 0.99987447, - "epoch": 0.12776191191943484, - "flos": 21652482867840.0, - "grad_norm": 1.9695028631977674, - "language_loss": 0.75605726, - "learning_rate": 3.900578768829623e-06, - "loss": 0.7754308, - "num_input_tokens_seen": 45894645, - "step": 2125, - "time_per_iteration": 2.696021556854248 - }, - { - "auxiliary_loss_clip": 0.01158163, - "auxiliary_loss_mlp": 0.00777059, - "balance_loss_clip": 1.05398965, - "balance_loss_mlp": 1.00002348, - "epoch": 0.1278220351721028, - "flos": 25735561574400.0, - "grad_norm": 3.019802885219414, - "language_loss": 0.78016824, - "learning_rate": 3.900457466856434e-06, - "loss": 0.79952049, - "num_input_tokens_seen": 45913755, - "step": 2126, - "time_per_iteration": 2.721435308456421 - }, - { - "auxiliary_loss_clip": 0.01124637, - "auxiliary_loss_mlp": 0.010537, - "balance_loss_clip": 1.05406642, - "balance_loss_mlp": 1.03504348, - "epoch": 0.12788215842477077, - "flos": 41243224982400.0, - "grad_norm": 1.3825945270792501, - "language_loss": 0.6927852, - "learning_rate": 3.9003360928177085e-06, - "loss": 0.71456861, - "num_input_tokens_seen": 45936095, - "step": 2127, - "time_per_iteration": 2.902101993560791 - }, - { - "auxiliary_loss_clip": 0.01030231, - "auxiliary_loss_mlp": 0.00759051, - "balance_loss_clip": 1.02830005, - "balance_loss_mlp": 1.00050259, - "epoch": 0.12794228167743876, - "flos": 70877430881280.0, - "grad_norm": 0.853491438999862, - "language_loss": 0.62831402, - "learning_rate": 3.900214646718047e-06, - "loss": 0.64620686, - "num_input_tokens_seen": 46004655, - "step": 2128, - "time_per_iteration": 3.3387396335601807 - }, - { - "auxiliary_loss_clip": 0.01145823, - "auxiliary_loss_mlp": 0.01047815, - "balance_loss_clip": 1.05080712, - "balance_loss_mlp": 1.02599955, - "epoch": 0.12800240493010673, - "flos": 16289727252480.0, - "grad_norm": 2.066959353069841, - "language_loss": 0.77626479, - "learning_rate": 3.900093128562056e-06, - "loss": 0.7982012, - "num_input_tokens_seen": 46023610, - "step": 2129, - "time_per_iteration": 2.611309766769409 - }, - { - "auxiliary_loss_clip": 0.01122914, - "auxiliary_loss_mlp": 0.01052577, - "balance_loss_clip": 1.05058527, - "balance_loss_mlp": 1.03029668, - "epoch": 0.1280625281827747, - "flos": 20631542601600.0, - "grad_norm": 2.1214737401843893, - "language_loss": 0.79263359, - "learning_rate": 3.899971538354343e-06, - "loss": 0.81438851, - "num_input_tokens_seen": 46041725, - "step": 2130, - "time_per_iteration": 2.753243923187256 - }, - { - "auxiliary_loss_clip": 0.01139626, - "auxiliary_loss_mlp": 0.01052453, - "balance_loss_clip": 1.05133748, - "balance_loss_mlp": 1.03147244, - "epoch": 0.12812265143544266, - "flos": 22638230784000.0, - "grad_norm": 1.7780274650921335, - "language_loss": 0.70945668, - "learning_rate": 3.899849876099518e-06, - "loss": 0.73137754, - "num_input_tokens_seen": 46061095, - "step": 2131, - "time_per_iteration": 2.6809306144714355 - }, - { - "auxiliary_loss_clip": 0.01102824, - "auxiliary_loss_mlp": 0.01052393, - "balance_loss_clip": 1.04982638, - "balance_loss_mlp": 1.03163886, - "epoch": 0.12818277468811062, - "flos": 34714701463680.0, - "grad_norm": 2.2916674504462655, - "language_loss": 0.72298968, - "learning_rate": 3.899728141802197e-06, - "loss": 0.74454176, - "num_input_tokens_seen": 46082670, - "step": 2132, - "time_per_iteration": 2.8769233226776123 - }, - { - "auxiliary_loss_clip": 0.01102594, - "auxiliary_loss_mlp": 0.01055993, - "balance_loss_clip": 1.04384947, - "balance_loss_mlp": 1.03348672, - "epoch": 0.1282428979407786, - "flos": 23112107936640.0, - "grad_norm": 2.0316054281953155, - "language_loss": 0.82128644, - "learning_rate": 3.8996063354669935e-06, - "loss": 0.84287226, - "num_input_tokens_seen": 46102410, - "step": 2133, - "time_per_iteration": 2.766897678375244 - }, - { - "auxiliary_loss_clip": 0.01163396, - "auxiliary_loss_mlp": 0.01057069, - "balance_loss_clip": 1.05397773, - "balance_loss_mlp": 1.03458595, - "epoch": 0.12830302119344655, - "flos": 20886508316160.0, - "grad_norm": 3.232115826630309, - "language_loss": 0.80001891, - "learning_rate": 3.899484457098528e-06, - "loss": 0.82222354, - "num_input_tokens_seen": 46121145, - "step": 2134, - "time_per_iteration": 2.6347672939300537 - }, - { - "auxiliary_loss_clip": 0.01159056, - "auxiliary_loss_mlp": 0.01046209, - "balance_loss_clip": 1.05907345, - "balance_loss_mlp": 1.02614641, - "epoch": 0.12836314444611455, - "flos": 21397768548480.0, - "grad_norm": 1.731952504909339, - "language_loss": 0.82657921, - "learning_rate": 3.899362506701421e-06, - "loss": 0.84863198, - "num_input_tokens_seen": 46140740, - "step": 2135, - "time_per_iteration": 2.6393656730651855 - }, - { - "auxiliary_loss_clip": 0.0114208, - "auxiliary_loss_mlp": 0.0105553, - "balance_loss_clip": 1.05345035, - "balance_loss_mlp": 1.03411996, - "epoch": 0.1284232676987825, - "flos": 13662466773120.0, - "grad_norm": 2.1083924470752278, - "language_loss": 0.7764526, - "learning_rate": 3.899240484280298e-06, - "loss": 0.79842871, - "num_input_tokens_seen": 46156805, - "step": 2136, - "time_per_iteration": 2.7195920944213867 - }, - { - "auxiliary_loss_clip": 0.01020946, - "auxiliary_loss_mlp": 0.01003991, - "balance_loss_clip": 1.01967573, - "balance_loss_mlp": 1.00096273, - "epoch": 0.12848339095145048, - "flos": 59994737735040.0, - "grad_norm": 0.8964253308146478, - "language_loss": 0.59152198, - "learning_rate": 3.899118389839785e-06, - "loss": 0.61177135, - "num_input_tokens_seen": 46222085, - "step": 2137, - "time_per_iteration": 3.416015625 - }, - { - "auxiliary_loss_clip": 0.01153694, - "auxiliary_loss_mlp": 0.01054623, - "balance_loss_clip": 1.05178177, - "balance_loss_mlp": 1.03483438, - "epoch": 0.12854351420411844, - "flos": 13881378211200.0, - "grad_norm": 3.244493357011547, - "language_loss": 0.82344306, - "learning_rate": 3.898996223384512e-06, - "loss": 0.84552622, - "num_input_tokens_seen": 46239970, - "step": 2138, - "time_per_iteration": 2.65515398979187 - }, - { - "auxiliary_loss_clip": 0.01159586, - "auxiliary_loss_mlp": 0.01049293, - "balance_loss_clip": 1.05592752, - "balance_loss_mlp": 1.02665496, - "epoch": 0.1286036374567864, - "flos": 22637943475200.0, - "grad_norm": 2.5417837252920323, - "language_loss": 0.78691363, - "learning_rate": 3.898873984919113e-06, - "loss": 0.8090024, - "num_input_tokens_seen": 46257740, - "step": 2139, - "time_per_iteration": 2.651132345199585 - }, - { - "auxiliary_loss_clip": 0.01136892, - "auxiliary_loss_mlp": 0.01045928, - "balance_loss_clip": 1.05267286, - "balance_loss_mlp": 1.02582908, - "epoch": 0.12866376070945437, - "flos": 16324775948160.0, - "grad_norm": 1.9541049485452633, - "language_loss": 0.85289955, - "learning_rate": 3.8987516744482215e-06, - "loss": 0.87472773, - "num_input_tokens_seen": 46275445, - "step": 2140, - "time_per_iteration": 2.730156183242798 - }, - { - "auxiliary_loss_clip": 0.01143134, - "auxiliary_loss_mlp": 0.01044337, - "balance_loss_clip": 1.05203128, - "balance_loss_mlp": 1.02482224, - "epoch": 0.12872388396212234, - "flos": 11874546374400.0, - "grad_norm": 1.8185491602156885, - "language_loss": 0.86268306, - "learning_rate": 3.898629291976476e-06, - "loss": 0.88455778, - "num_input_tokens_seen": 46291710, - "step": 2141, - "time_per_iteration": 2.62223482131958 - }, - { - "auxiliary_loss_clip": 0.01146971, - "auxiliary_loss_mlp": 0.01045813, - "balance_loss_clip": 1.0528295, - "balance_loss_mlp": 1.02548814, - "epoch": 0.12878400721479033, - "flos": 28366700722560.0, - "grad_norm": 3.1267362471736684, - "language_loss": 0.68282312, - "learning_rate": 3.898506837508518e-06, - "loss": 0.70475101, - "num_input_tokens_seen": 46311335, - "step": 2142, - "time_per_iteration": 2.71232271194458 - }, - { - "auxiliary_loss_clip": 0.01165678, - "auxiliary_loss_mlp": 0.0077895, - "balance_loss_clip": 1.05764627, - "balance_loss_mlp": 0.99990749, - "epoch": 0.1288441304674583, - "flos": 25885632597120.0, - "grad_norm": 2.373838274123079, - "language_loss": 0.83479214, - "learning_rate": 3.89838431104899e-06, - "loss": 0.85423845, - "num_input_tokens_seen": 46330985, - "step": 2143, - "time_per_iteration": 2.677692174911499 - }, - { - "auxiliary_loss_clip": 0.01175134, - "auxiliary_loss_mlp": 0.00777405, - "balance_loss_clip": 1.0598439, - "balance_loss_mlp": 0.99994075, - "epoch": 0.12890425372012626, - "flos": 20813789232000.0, - "grad_norm": 1.5662270309624111, - "language_loss": 0.81703234, - "learning_rate": 3.898261712602539e-06, - "loss": 0.83655775, - "num_input_tokens_seen": 46351295, - "step": 2144, - "time_per_iteration": 2.712620496749878 - }, - { - "auxiliary_loss_clip": 0.01130321, - "auxiliary_loss_mlp": 0.01053521, - "balance_loss_clip": 1.04658103, - "balance_loss_mlp": 1.03145528, - "epoch": 0.12896437697279423, - "flos": 22565870835840.0, - "grad_norm": 1.8026346290528672, - "language_loss": 0.78304374, - "learning_rate": 3.898139042173813e-06, - "loss": 0.80488217, - "num_input_tokens_seen": 46368600, - "step": 2145, - "time_per_iteration": 2.6766605377197266 - }, - { - "auxiliary_loss_clip": 0.01170585, - "auxiliary_loss_mlp": 0.01047893, - "balance_loss_clip": 1.0543592, - "balance_loss_mlp": 1.02662635, - "epoch": 0.1290245002254622, - "flos": 17493776075520.0, - "grad_norm": 2.147087506474235, - "language_loss": 0.82865375, - "learning_rate": 3.898016299767465e-06, - "loss": 0.85083848, - "num_input_tokens_seen": 46387370, - "step": 2146, - "time_per_iteration": 2.5860395431518555 - }, - { - "auxiliary_loss_clip": 0.01141916, - "auxiliary_loss_mlp": 0.0105138, - "balance_loss_clip": 1.05367482, - "balance_loss_mlp": 1.03062606, - "epoch": 0.12908462347813016, - "flos": 36315957859200.0, - "grad_norm": 2.344626501147968, - "language_loss": 0.71275079, - "learning_rate": 3.897893485388149e-06, - "loss": 0.73468375, - "num_input_tokens_seen": 46409570, - "step": 2147, - "time_per_iteration": 2.7870359420776367 - }, - { - "auxiliary_loss_clip": 0.01147238, - "auxiliary_loss_mlp": 0.01052291, - "balance_loss_clip": 1.05527067, - "balance_loss_mlp": 1.03297925, - "epoch": 0.12914474673079815, - "flos": 22528703237760.0, - "grad_norm": 2.120275205230366, - "language_loss": 0.71432978, - "learning_rate": 3.897770599040521e-06, - "loss": 0.73632509, - "num_input_tokens_seen": 46429320, - "step": 2148, - "time_per_iteration": 2.6865081787109375 - }, - { - "auxiliary_loss_clip": 0.01168479, - "auxiliary_loss_mlp": 0.01049575, - "balance_loss_clip": 1.05762172, - "balance_loss_mlp": 1.03016782, - "epoch": 0.12920486998346611, - "flos": 21471888263040.0, - "grad_norm": 1.6388902851592406, - "language_loss": 0.79064089, - "learning_rate": 3.897647640729242e-06, - "loss": 0.81282145, - "num_input_tokens_seen": 46450155, - "step": 2149, - "time_per_iteration": 2.6041862964630127 - }, - { - "auxiliary_loss_clip": 0.01159527, - "auxiliary_loss_mlp": 0.01046069, - "balance_loss_clip": 1.05377793, - "balance_loss_mlp": 1.02531469, - "epoch": 0.12926499323613408, - "flos": 27308556944640.0, - "grad_norm": 2.034796374339078, - "language_loss": 0.75976646, - "learning_rate": 3.897524610458975e-06, - "loss": 0.78182244, - "num_input_tokens_seen": 46470280, - "step": 2150, - "time_per_iteration": 2.647224187850952 - }, - { - "auxiliary_loss_clip": 0.01155787, - "auxiliary_loss_mlp": 0.01055192, - "balance_loss_clip": 1.05445433, - "balance_loss_mlp": 1.03491461, - "epoch": 0.12932511648880204, - "flos": 22091131756800.0, - "grad_norm": 2.3830500835005592, - "language_loss": 0.70986372, - "learning_rate": 3.8974015082343835e-06, - "loss": 0.73197353, - "num_input_tokens_seen": 46487605, - "step": 2151, - "time_per_iteration": 2.7008492946624756 - }, - { - "auxiliary_loss_clip": 0.01167835, - "auxiliary_loss_mlp": 0.0104951, - "balance_loss_clip": 1.05603719, - "balance_loss_mlp": 1.03017378, - "epoch": 0.12938523974147, - "flos": 20302780394880.0, - "grad_norm": 2.058334480733051, - "language_loss": 0.83964819, - "learning_rate": 3.897278334060137e-06, - "loss": 0.86182165, - "num_input_tokens_seen": 46505100, - "step": 2152, - "time_per_iteration": 2.6467373371124268 - }, - { - "auxiliary_loss_clip": 0.01158553, - "auxiliary_loss_mlp": 0.01058416, - "balance_loss_clip": 1.05283821, - "balance_loss_mlp": 1.03888893, - "epoch": 0.12944536299413797, - "flos": 19499961467520.0, - "grad_norm": 1.5624811365269535, - "language_loss": 0.78585124, - "learning_rate": 3.897155087940906e-06, - "loss": 0.80802095, - "num_input_tokens_seen": 46524020, - "step": 2153, - "time_per_iteration": 4.286921262741089 - }, - { - "auxiliary_loss_clip": 0.01113716, - "auxiliary_loss_mlp": 0.00777812, - "balance_loss_clip": 1.04707122, - "balance_loss_mlp": 0.99989671, - "epoch": 0.12950548624680594, - "flos": 27707919333120.0, - "grad_norm": 1.6189787343362376, - "language_loss": 0.80253434, - "learning_rate": 3.897031769881364e-06, - "loss": 0.82144964, - "num_input_tokens_seen": 46544640, - "step": 2154, - "time_per_iteration": 2.7602338790893555 - }, - { - "auxiliary_loss_clip": 0.01149958, - "auxiliary_loss_mlp": 0.0105188, - "balance_loss_clip": 1.05262971, - "balance_loss_mlp": 1.03099442, - "epoch": 0.12956560949947393, - "flos": 17565740974080.0, - "grad_norm": 1.8080432584650143, - "language_loss": 0.83717728, - "learning_rate": 3.896908379886188e-06, - "loss": 0.85919571, - "num_input_tokens_seen": 46561395, - "step": 2155, - "time_per_iteration": 5.696707010269165 - }, - { - "auxiliary_loss_clip": 0.01161999, - "auxiliary_loss_mlp": 0.01056273, - "balance_loss_clip": 1.05426383, - "balance_loss_mlp": 1.03611445, - "epoch": 0.1296257327521419, - "flos": 20740711011840.0, - "grad_norm": 2.4972858828122666, - "language_loss": 0.76114857, - "learning_rate": 3.896784917960055e-06, - "loss": 0.78333133, - "num_input_tokens_seen": 46579395, - "step": 2156, - "time_per_iteration": 2.6279313564300537 - }, - { - "auxiliary_loss_clip": 0.01105089, - "auxiliary_loss_mlp": 0.01056603, - "balance_loss_clip": 1.0510118, - "balance_loss_mlp": 1.03679013, - "epoch": 0.12968585600480986, - "flos": 16395735265920.0, - "grad_norm": 1.6652476704410177, - "language_loss": 0.86493659, - "learning_rate": 3.896661384107648e-06, - "loss": 0.88655347, - "num_input_tokens_seen": 46597090, - "step": 2157, - "time_per_iteration": 4.4089202880859375 - }, - { - "auxiliary_loss_clip": 0.01170107, - "auxiliary_loss_mlp": 0.01055814, - "balance_loss_clip": 1.05253935, - "balance_loss_mlp": 1.0349642, - "epoch": 0.12974597925747783, - "flos": 28329533124480.0, - "grad_norm": 2.5240136552338956, - "language_loss": 0.80393612, - "learning_rate": 3.896537778333651e-06, - "loss": 0.8261953, - "num_input_tokens_seen": 46617355, - "step": 2158, - "time_per_iteration": 2.702765703201294 - }, - { - "auxiliary_loss_clip": 0.01177017, - "auxiliary_loss_mlp": 0.01060365, - "balance_loss_clip": 1.05905974, - "balance_loss_mlp": 1.04050517, - "epoch": 0.1298061025101458, - "flos": 9683025782400.0, - "grad_norm": 2.5307604694159607, - "language_loss": 0.74881256, - "learning_rate": 3.896414100642752e-06, - "loss": 0.77118635, - "num_input_tokens_seen": 46633130, - "step": 2159, - "time_per_iteration": 2.534163475036621 - }, - { - "auxiliary_loss_clip": 0.01122909, - "auxiliary_loss_mlp": 0.01058309, - "balance_loss_clip": 1.04594469, - "balance_loss_mlp": 1.03471708, - "epoch": 0.12986622576281376, - "flos": 27709535445120.0, - "grad_norm": 1.954419432637739, - "language_loss": 0.8259204, - "learning_rate": 3.89629035103964e-06, - "loss": 0.84773254, - "num_input_tokens_seen": 46650575, - "step": 2160, - "time_per_iteration": 2.7358646392822266 - }, - { - "auxiliary_loss_clip": 0.01154348, - "auxiliary_loss_mlp": 0.01047243, - "balance_loss_clip": 1.05873609, - "balance_loss_mlp": 1.02732301, - "epoch": 0.12992634901548175, - "flos": 18802719590400.0, - "grad_norm": 1.7252123805741888, - "language_loss": 0.82310414, - "learning_rate": 3.896166529529008e-06, - "loss": 0.84512007, - "num_input_tokens_seen": 46668780, - "step": 2161, - "time_per_iteration": 2.7029623985290527 - }, - { - "auxiliary_loss_clip": 0.01145886, - "auxiliary_loss_mlp": 0.01060381, - "balance_loss_clip": 1.05145073, - "balance_loss_mlp": 1.03911448, - "epoch": 0.12998647226814972, - "flos": 29127575543040.0, - "grad_norm": 2.0780374068601253, - "language_loss": 0.82668459, - "learning_rate": 3.896042636115551e-06, - "loss": 0.84874725, - "num_input_tokens_seen": 46687550, - "step": 2162, - "time_per_iteration": 2.674825668334961 - }, - { - "auxiliary_loss_clip": 0.0113921, - "auxiliary_loss_mlp": 0.0105953, - "balance_loss_clip": 1.05468941, - "balance_loss_mlp": 1.03957474, - "epoch": 0.13004659552081768, - "flos": 19573686132480.0, - "grad_norm": 3.928222506771022, - "language_loss": 0.72579277, - "learning_rate": 3.895918670803968e-06, - "loss": 0.7477802, - "num_input_tokens_seen": 46706730, - "step": 2163, - "time_per_iteration": 2.678394079208374 - }, - { - "auxiliary_loss_clip": 0.01173873, - "auxiliary_loss_mlp": 0.00778662, - "balance_loss_clip": 1.05635965, - "balance_loss_mlp": 0.99994016, - "epoch": 0.13010671877348565, - "flos": 22490709626880.0, - "grad_norm": 2.0196348424542827, - "language_loss": 0.81330699, - "learning_rate": 3.895794633598958e-06, - "loss": 0.83283234, - "num_input_tokens_seen": 46724250, - "step": 2164, - "time_per_iteration": 2.6116931438446045 - }, - { - "auxiliary_loss_clip": 0.01119834, - "auxiliary_loss_mlp": 0.01050661, - "balance_loss_clip": 1.04808033, - "balance_loss_mlp": 1.03061032, - "epoch": 0.1301668420261536, - "flos": 23878226142720.0, - "grad_norm": 2.274563635903502, - "language_loss": 0.72262049, - "learning_rate": 3.8956705245052256e-06, - "loss": 0.74432552, - "num_input_tokens_seen": 46744105, - "step": 2165, - "time_per_iteration": 2.7646515369415283 - }, - { - "auxiliary_loss_clip": 0.01109832, - "auxiliary_loss_mlp": 0.01048351, - "balance_loss_clip": 1.05059505, - "balance_loss_mlp": 1.02707219, - "epoch": 0.13022696527882158, - "flos": 23150065633920.0, - "grad_norm": 2.8383873988269217, - "language_loss": 0.74749964, - "learning_rate": 3.8955463435274765e-06, - "loss": 0.76908153, - "num_input_tokens_seen": 46764250, - "step": 2166, - "time_per_iteration": 2.7939398288726807 - }, - { - "auxiliary_loss_clip": 0.01170298, - "auxiliary_loss_mlp": 0.01048037, - "balance_loss_clip": 1.05364752, - "balance_loss_mlp": 1.02827251, - "epoch": 0.13028708853148954, - "flos": 26908548111360.0, - "grad_norm": 1.5379857106114436, - "language_loss": 0.83098066, - "learning_rate": 3.895422090670421e-06, - "loss": 0.85316396, - "num_input_tokens_seen": 46786865, - "step": 2167, - "time_per_iteration": 2.700505495071411 - }, - { - "auxiliary_loss_clip": 0.01108628, - "auxiliary_loss_mlp": 0.01059921, - "balance_loss_clip": 1.04567361, - "balance_loss_mlp": 1.03841531, - "epoch": 0.13034721178415754, - "flos": 21251468453760.0, - "grad_norm": 1.6054044551173634, - "language_loss": 0.83578718, - "learning_rate": 3.89529776593877e-06, - "loss": 0.85747266, - "num_input_tokens_seen": 46807030, - "step": 2168, - "time_per_iteration": 2.839285135269165 - }, - { - "auxiliary_loss_clip": 0.01079188, - "auxiliary_loss_mlp": 0.01063413, - "balance_loss_clip": 1.04247975, - "balance_loss_mlp": 1.03861713, - "epoch": 0.1304073350368255, - "flos": 18767239931520.0, - "grad_norm": 1.950315007602454, - "language_loss": 0.79910588, - "learning_rate": 3.8951733693372375e-06, - "loss": 0.8205319, - "num_input_tokens_seen": 46826280, - "step": 2169, - "time_per_iteration": 2.8150076866149902 - }, - { - "auxiliary_loss_clip": 0.01174566, - "auxiliary_loss_mlp": 0.01044893, - "balance_loss_clip": 1.05822575, - "balance_loss_mlp": 1.02339983, - "epoch": 0.13046745828949347, - "flos": 28364653647360.0, - "grad_norm": 2.4117618540057766, - "language_loss": 0.66804767, - "learning_rate": 3.8950489008705406e-06, - "loss": 0.69024229, - "num_input_tokens_seen": 46846505, - "step": 2170, - "time_per_iteration": 2.722769021987915 - }, - { - "auxiliary_loss_clip": 0.0114216, - "auxiliary_loss_mlp": 0.01046684, - "balance_loss_clip": 1.05424142, - "balance_loss_mlp": 1.02637053, - "epoch": 0.13052758154216143, - "flos": 29605044055680.0, - "grad_norm": 1.9089846415842238, - "language_loss": 0.66768706, - "learning_rate": 3.8949243605434e-06, - "loss": 0.68957549, - "num_input_tokens_seen": 46867380, - "step": 2171, - "time_per_iteration": 2.7474682331085205 - }, - { - "auxiliary_loss_clip": 0.01157431, - "auxiliary_loss_mlp": 0.01049079, - "balance_loss_clip": 1.05283058, - "balance_loss_mlp": 1.02701378, - "epoch": 0.1305877047948294, - "flos": 19390864884480.0, - "grad_norm": 2.103440896006443, - "language_loss": 0.72157478, - "learning_rate": 3.894799748360537e-06, - "loss": 0.74363995, - "num_input_tokens_seen": 46886810, - "step": 2172, - "time_per_iteration": 2.8062691688537598 - }, - { - "auxiliary_loss_clip": 0.01131178, - "auxiliary_loss_mlp": 0.01045812, - "balance_loss_clip": 1.05676126, - "balance_loss_mlp": 1.0248909, - "epoch": 0.13064782804749736, - "flos": 16873527000960.0, - "grad_norm": 1.8662964619330822, - "language_loss": 0.75331408, - "learning_rate": 3.894675064326678e-06, - "loss": 0.77508402, - "num_input_tokens_seen": 46905620, - "step": 2173, - "time_per_iteration": 2.749630928039551 - }, - { - "auxiliary_loss_clip": 0.01132129, - "auxiliary_loss_mlp": 0.01056024, - "balance_loss_clip": 1.05241716, - "balance_loss_mlp": 1.03388715, - "epoch": 0.13070795130016533, - "flos": 24499085748480.0, - "grad_norm": 2.8034072456055426, - "language_loss": 0.70175481, - "learning_rate": 3.894550308446551e-06, - "loss": 0.72363639, - "num_input_tokens_seen": 46925120, - "step": 2174, - "time_per_iteration": 2.723314046859741 - }, - { - "auxiliary_loss_clip": 0.01047643, - "auxiliary_loss_mlp": 0.01015006, - "balance_loss_clip": 1.02629197, - "balance_loss_mlp": 1.01260972, - "epoch": 0.13076807455283332, - "flos": 71054505953280.0, - "grad_norm": 0.7998489021914615, - "language_loss": 0.59026134, - "learning_rate": 3.894425480724886e-06, - "loss": 0.61088777, - "num_input_tokens_seen": 46988195, - "step": 2175, - "time_per_iteration": 3.318049192428589 - }, - { - "auxiliary_loss_clip": 0.01159762, - "auxiliary_loss_mlp": 0.01053929, - "balance_loss_clip": 1.05441868, - "balance_loss_mlp": 1.03342521, - "epoch": 0.13082819780550128, - "flos": 20264499475200.0, - "grad_norm": 2.2309284705459707, - "language_loss": 0.80365628, - "learning_rate": 3.894300581166417e-06, - "loss": 0.82579315, - "num_input_tokens_seen": 47004720, - "step": 2176, - "time_per_iteration": 2.631732702255249 - }, - { - "auxiliary_loss_clip": 0.01169648, - "auxiliary_loss_mlp": 0.01047517, - "balance_loss_clip": 1.05513525, - "balance_loss_mlp": 1.02529645, - "epoch": 0.13088832105816925, - "flos": 34203441231360.0, - "grad_norm": 1.6906214681317566, - "language_loss": 0.74661696, - "learning_rate": 3.894175609775881e-06, - "loss": 0.76878858, - "num_input_tokens_seen": 47024255, - "step": 2177, - "time_per_iteration": 2.701422691345215 - }, - { - "auxiliary_loss_clip": 0.01131124, - "auxiliary_loss_mlp": 0.0105144, - "balance_loss_clip": 1.051373, - "balance_loss_mlp": 1.02905297, - "epoch": 0.13094844431083721, - "flos": 17894970057600.0, - "grad_norm": 1.8043513019060269, - "language_loss": 0.82266748, - "learning_rate": 3.894050566558015e-06, - "loss": 0.84449303, - "num_input_tokens_seen": 47042465, - "step": 2178, - "time_per_iteration": 2.6934497356414795 - }, - { - "auxiliary_loss_clip": 0.01170524, - "auxiliary_loss_mlp": 0.01047895, - "balance_loss_clip": 1.05729508, - "balance_loss_mlp": 1.02705729, - "epoch": 0.13100856756350518, - "flos": 17311313963520.0, - "grad_norm": 2.9251611149508276, - "language_loss": 0.74291968, - "learning_rate": 3.893925451517562e-06, - "loss": 0.76510382, - "num_input_tokens_seen": 47060370, - "step": 2179, - "time_per_iteration": 2.6111502647399902 - }, - { - "auxiliary_loss_clip": 0.01128297, - "auxiliary_loss_mlp": 0.01052407, - "balance_loss_clip": 1.04917574, - "balance_loss_mlp": 1.03184354, - "epoch": 0.13106869081617314, - "flos": 22200551562240.0, - "grad_norm": 1.9805514150688242, - "language_loss": 0.84366202, - "learning_rate": 3.893800264659266e-06, - "loss": 0.8654691, - "num_input_tokens_seen": 47081415, - "step": 2180, - "time_per_iteration": 2.731229543685913 - }, - { - "auxiliary_loss_clip": 0.01162028, - "auxiliary_loss_mlp": 0.0105845, - "balance_loss_clip": 1.05875921, - "balance_loss_mlp": 1.03757644, - "epoch": 0.13112881406884114, - "flos": 21763123735680.0, - "grad_norm": 1.8389866248015785, - "language_loss": 0.89840436, - "learning_rate": 3.8936750059878746e-06, - "loss": 0.92060918, - "num_input_tokens_seen": 47099860, - "step": 2181, - "time_per_iteration": 2.643890380859375 - }, - { - "auxiliary_loss_clip": 0.01153771, - "auxiliary_loss_mlp": 0.01051982, - "balance_loss_clip": 1.05222976, - "balance_loss_mlp": 1.03126323, - "epoch": 0.1311889373215091, - "flos": 23331091201920.0, - "grad_norm": 2.117586475019142, - "language_loss": 0.68813586, - "learning_rate": 3.893549675508137e-06, - "loss": 0.7101934, - "num_input_tokens_seen": 47118540, - "step": 2182, - "time_per_iteration": 2.6198863983154297 - }, - { - "auxiliary_loss_clip": 0.01123039, - "auxiliary_loss_mlp": 0.01051411, - "balance_loss_clip": 1.0502702, - "balance_loss_mlp": 1.0292381, - "epoch": 0.13124906057417707, - "flos": 21467363149440.0, - "grad_norm": 1.787500136217105, - "language_loss": 0.78694725, - "learning_rate": 3.893424273224806e-06, - "loss": 0.8086918, - "num_input_tokens_seen": 47136710, - "step": 2183, - "time_per_iteration": 2.715517520904541 - }, - { - "auxiliary_loss_clip": 0.01169106, - "auxiliary_loss_mlp": 0.01047098, - "balance_loss_clip": 1.05452895, - "balance_loss_mlp": 1.02586675, - "epoch": 0.13130918382684503, - "flos": 23255319461760.0, - "grad_norm": 26.753588494231124, - "language_loss": 0.85792655, - "learning_rate": 3.893298799142636e-06, - "loss": 0.88008863, - "num_input_tokens_seen": 47157155, - "step": 2184, - "time_per_iteration": 2.632539987564087 - }, - { - "auxiliary_loss_clip": 0.01138714, - "auxiliary_loss_mlp": 0.01054657, - "balance_loss_clip": 1.05349112, - "balance_loss_mlp": 1.03230524, - "epoch": 0.131369307079513, - "flos": 20850274471680.0, - "grad_norm": 2.50466124454056, - "language_loss": 0.82703435, - "learning_rate": 3.893173253266387e-06, - "loss": 0.84896809, - "num_input_tokens_seen": 47176820, - "step": 2185, - "time_per_iteration": 2.6809136867523193 - }, - { - "auxiliary_loss_clip": 0.01144077, - "auxiliary_loss_mlp": 0.01054121, - "balance_loss_clip": 1.05262399, - "balance_loss_mlp": 1.03236496, - "epoch": 0.13142943033218096, - "flos": 17858341163520.0, - "grad_norm": 1.8949462712827352, - "language_loss": 0.72956109, - "learning_rate": 3.893047635600818e-06, - "loss": 0.75154305, - "num_input_tokens_seen": 47195855, - "step": 2186, - "time_per_iteration": 2.628096342086792 - }, - { - "auxiliary_loss_clip": 0.01157778, - "auxiliary_loss_mlp": 0.01050695, - "balance_loss_clip": 1.05436552, - "balance_loss_mlp": 1.02783096, - "epoch": 0.13148955358484893, - "flos": 20996035862400.0, - "grad_norm": 1.9822444068613732, - "language_loss": 0.80363685, - "learning_rate": 3.892921946150693e-06, - "loss": 0.82572162, - "num_input_tokens_seen": 47214535, - "step": 2187, - "time_per_iteration": 2.762223720550537 - }, - { - "auxiliary_loss_clip": 0.01027324, - "auxiliary_loss_mlp": 0.0101023, - "balance_loss_clip": 1.02364707, - "balance_loss_mlp": 1.00792885, - "epoch": 0.13154967683751692, - "flos": 70172467580160.0, - "grad_norm": 0.8471850380496847, - "language_loss": 0.59082437, - "learning_rate": 3.892796184920778e-06, - "loss": 0.61119986, - "num_input_tokens_seen": 47270300, - "step": 2188, - "time_per_iteration": 3.302457571029663 - }, - { - "auxiliary_loss_clip": 0.01095126, - "auxiliary_loss_mlp": 0.01059346, - "balance_loss_clip": 1.04827487, - "balance_loss_mlp": 1.03676724, - "epoch": 0.1316098000901849, - "flos": 20376145923840.0, - "grad_norm": 1.7340345041340466, - "language_loss": 0.74211109, - "learning_rate": 3.892670351915842e-06, - "loss": 0.76365584, - "num_input_tokens_seen": 47290720, - "step": 2189, - "time_per_iteration": 2.7990496158599854 - }, - { - "auxiliary_loss_clip": 0.01160124, - "auxiliary_loss_mlp": 0.01049098, - "balance_loss_clip": 1.05551052, - "balance_loss_mlp": 1.02799821, - "epoch": 0.13166992334285285, - "flos": 23221132692480.0, - "grad_norm": 1.8160574809616576, - "language_loss": 0.73152113, - "learning_rate": 3.892544447140657e-06, - "loss": 0.75361335, - "num_input_tokens_seen": 47311820, - "step": 2190, - "time_per_iteration": 2.6485326290130615 - }, - { - "auxiliary_loss_clip": 0.01160351, - "auxiliary_loss_mlp": 0.01058461, - "balance_loss_clip": 1.05671644, - "balance_loss_mlp": 1.03811169, - "epoch": 0.13173004659552082, - "flos": 23330947547520.0, - "grad_norm": 1.8825588242208007, - "language_loss": 0.74617779, - "learning_rate": 3.892418470599996e-06, - "loss": 0.76836598, - "num_input_tokens_seen": 47331605, - "step": 2191, - "time_per_iteration": 2.644484281539917 - }, - { - "auxiliary_loss_clip": 0.0112783, - "auxiliary_loss_mlp": 0.01054712, - "balance_loss_clip": 1.05129039, - "balance_loss_mlp": 1.03356445, - "epoch": 0.13179016984818878, - "flos": 21251504367360.0, - "grad_norm": 1.8823393822145031, - "language_loss": 0.79093283, - "learning_rate": 3.892292422298637e-06, - "loss": 0.81275827, - "num_input_tokens_seen": 47350455, - "step": 2192, - "time_per_iteration": 2.735225200653076 - }, - { - "auxiliary_loss_clip": 0.0111282, - "auxiliary_loss_mlp": 0.01051113, - "balance_loss_clip": 1.04457211, - "balance_loss_mlp": 1.02936912, - "epoch": 0.13185029310085675, - "flos": 17778690754560.0, - "grad_norm": 1.7242105632860862, - "language_loss": 0.85350716, - "learning_rate": 3.892166302241361e-06, - "loss": 0.87514639, - "num_input_tokens_seen": 47368225, - "step": 2193, - "time_per_iteration": 4.262877941131592 - }, - { - "auxiliary_loss_clip": 0.0104173, - "auxiliary_loss_mlp": 0.01015651, - "balance_loss_clip": 1.02609122, - "balance_loss_mlp": 1.01280212, - "epoch": 0.1319104163535247, - "flos": 69851785933440.0, - "grad_norm": 0.7746813180799224, - "language_loss": 0.54112649, - "learning_rate": 3.8920401104329475e-06, - "loss": 0.56170022, - "num_input_tokens_seen": 47427125, - "step": 2194, - "time_per_iteration": 6.223008394241333 - }, - { - "auxiliary_loss_clip": 0.01168022, - "auxiliary_loss_mlp": 0.01048581, - "balance_loss_clip": 1.05420566, - "balance_loss_mlp": 1.02828002, - "epoch": 0.1319705396061927, - "flos": 25193095401600.0, - "grad_norm": 2.1079865649821925, - "language_loss": 0.72433972, - "learning_rate": 3.891913846878185e-06, - "loss": 0.74650574, - "num_input_tokens_seen": 47450275, - "step": 2195, - "time_per_iteration": 2.6357345581054688 - }, - { - "auxiliary_loss_clip": 0.01136503, - "auxiliary_loss_mlp": 0.00778731, - "balance_loss_clip": 1.05176425, - "balance_loss_mlp": 0.99996454, - "epoch": 0.13203066285886067, - "flos": 20740459616640.0, - "grad_norm": 1.5737174748369949, - "language_loss": 0.78126895, - "learning_rate": 3.891787511581859e-06, - "loss": 0.8004213, - "num_input_tokens_seen": 47469155, - "step": 2196, - "time_per_iteration": 2.7118594646453857 - }, - { - "auxiliary_loss_clip": 0.01162447, - "auxiliary_loss_mlp": 0.010526, - "balance_loss_clip": 1.05453539, - "balance_loss_mlp": 1.03210831, - "epoch": 0.13209078611152864, - "flos": 22054395121920.0, - "grad_norm": 1.9385650447291836, - "language_loss": 0.74632496, - "learning_rate": 3.89166110454876e-06, - "loss": 0.76847541, - "num_input_tokens_seen": 47488405, - "step": 2197, - "time_per_iteration": 4.270530939102173 - }, - { - "auxiliary_loss_clip": 0.01173786, - "auxiliary_loss_mlp": 0.01050846, - "balance_loss_clip": 1.05440533, - "balance_loss_mlp": 1.02947164, - "epoch": 0.1321509093641966, - "flos": 16284950743680.0, - "grad_norm": 1.785688190112577, - "language_loss": 0.79566747, - "learning_rate": 3.891534625783685e-06, - "loss": 0.81791383, - "num_input_tokens_seen": 47505650, - "step": 2198, - "time_per_iteration": 2.6145474910736084 - }, - { - "auxiliary_loss_clip": 0.01170264, - "auxiliary_loss_mlp": 0.01057159, - "balance_loss_clip": 1.05536175, - "balance_loss_mlp": 1.03647637, - "epoch": 0.13221103261686457, - "flos": 16983018633600.0, - "grad_norm": 2.56313218775589, - "language_loss": 0.82932216, - "learning_rate": 3.891408075291425e-06, - "loss": 0.85159647, - "num_input_tokens_seen": 47521540, - "step": 2199, - "time_per_iteration": 2.5715503692626953 - }, - { - "auxiliary_loss_clip": 0.01122554, - "auxiliary_loss_mlp": 0.01052148, - "balance_loss_clip": 1.05047798, - "balance_loss_mlp": 1.03045249, - "epoch": 0.13227115586953253, - "flos": 34233605677440.0, - "grad_norm": 1.8710902505917797, - "language_loss": 0.69579422, - "learning_rate": 3.8912814530767826e-06, - "loss": 0.71754128, - "num_input_tokens_seen": 47543625, - "step": 2200, - "time_per_iteration": 2.8001365661621094 - }, - { - "auxiliary_loss_clip": 0.01167798, - "auxiliary_loss_mlp": 0.01058155, - "balance_loss_clip": 1.05345917, - "balance_loss_mlp": 1.03618431, - "epoch": 0.13233127912220052, - "flos": 20704656735360.0, - "grad_norm": 1.647659287704997, - "language_loss": 0.84624702, - "learning_rate": 3.891154759144557e-06, - "loss": 0.86850655, - "num_input_tokens_seen": 47563740, - "step": 2201, - "time_per_iteration": 2.6485981941223145 - }, - { - "auxiliary_loss_clip": 0.0117188, - "auxiliary_loss_mlp": 0.01055627, - "balance_loss_clip": 1.05427861, - "balance_loss_mlp": 1.03431273, - "epoch": 0.1323914023748685, - "flos": 25805048434560.0, - "grad_norm": 1.7446392584198542, - "language_loss": 0.87088037, - "learning_rate": 3.891027993499554e-06, - "loss": 0.8931554, - "num_input_tokens_seen": 47582655, - "step": 2202, - "time_per_iteration": 2.5921456813812256 - }, - { - "auxiliary_loss_clip": 0.01139991, - "auxiliary_loss_mlp": 0.01053413, - "balance_loss_clip": 1.05299544, - "balance_loss_mlp": 1.03267026, - "epoch": 0.13245152562753645, - "flos": 21251540280960.0, - "grad_norm": 2.405254380671628, - "language_loss": 0.72801507, - "learning_rate": 3.89090115614658e-06, - "loss": 0.7499491, - "num_input_tokens_seen": 47600875, - "step": 2203, - "time_per_iteration": 2.6257405281066895 - }, - { - "auxiliary_loss_clip": 0.01124508, - "auxiliary_loss_mlp": 0.0105959, - "balance_loss_clip": 1.05080879, - "balance_loss_mlp": 1.03916979, - "epoch": 0.13251164888020442, - "flos": 26610955931520.0, - "grad_norm": 2.044348475010678, - "language_loss": 0.73170948, - "learning_rate": 3.890774247090444e-06, - "loss": 0.75355047, - "num_input_tokens_seen": 47619250, - "step": 2204, - "time_per_iteration": 2.753830909729004 - }, - { - "auxiliary_loss_clip": 0.01160826, - "auxiliary_loss_mlp": 0.01054406, - "balance_loss_clip": 1.05474758, - "balance_loss_mlp": 1.03225708, - "epoch": 0.13257177213287238, - "flos": 29826541272960.0, - "grad_norm": 2.094172729236468, - "language_loss": 0.78377104, - "learning_rate": 3.89064726633596e-06, - "loss": 0.80592328, - "num_input_tokens_seen": 47639445, - "step": 2205, - "time_per_iteration": 2.730682134628296 - }, - { - "auxiliary_loss_clip": 0.01125154, - "auxiliary_loss_mlp": 0.01048818, - "balance_loss_clip": 1.04975629, - "balance_loss_mlp": 1.02782559, - "epoch": 0.13263189538554035, - "flos": 21288456483840.0, - "grad_norm": 1.8609089802832188, - "language_loss": 0.78638101, - "learning_rate": 3.890520213887941e-06, - "loss": 0.80812073, - "num_input_tokens_seen": 47658740, - "step": 2206, - "time_per_iteration": 2.691962718963623 - }, - { - "auxiliary_loss_clip": 0.01124965, - "auxiliary_loss_mlp": 0.01045957, - "balance_loss_clip": 1.04958403, - "balance_loss_mlp": 1.02649069, - "epoch": 0.13269201863820831, - "flos": 16874101618560.0, - "grad_norm": 2.2777192787220066, - "language_loss": 0.74672282, - "learning_rate": 3.890393089751208e-06, - "loss": 0.76843208, - "num_input_tokens_seen": 47676880, - "step": 2207, - "time_per_iteration": 2.7062454223632812 - }, - { - "auxiliary_loss_clip": 0.01143208, - "auxiliary_loss_mlp": 0.01047941, - "balance_loss_clip": 1.05257845, - "balance_loss_mlp": 1.02672219, - "epoch": 0.1327521418908763, - "flos": 23768914078080.0, - "grad_norm": 1.692212064021935, - "language_loss": 0.84061795, - "learning_rate": 3.890265893930578e-06, - "loss": 0.8625294, - "num_input_tokens_seen": 47696635, - "step": 2208, - "time_per_iteration": 2.687717914581299 - }, - { - "auxiliary_loss_clip": 0.01152573, - "auxiliary_loss_mlp": 0.0105274, - "balance_loss_clip": 1.05847478, - "balance_loss_mlp": 1.03411973, - "epoch": 0.13281226514354427, - "flos": 26505594362880.0, - "grad_norm": 1.7032258459750478, - "language_loss": 0.85587811, - "learning_rate": 3.890138626430876e-06, - "loss": 0.8779313, - "num_input_tokens_seen": 47717760, - "step": 2209, - "time_per_iteration": 2.646015167236328 - }, - { - "auxiliary_loss_clip": 0.01138084, - "auxiliary_loss_mlp": 0.00778828, - "balance_loss_clip": 1.05316806, - "balance_loss_mlp": 1.00002563, - "epoch": 0.13287238839621224, - "flos": 24498762526080.0, - "grad_norm": 2.237247968175465, - "language_loss": 0.81797457, - "learning_rate": 3.890011287256929e-06, - "loss": 0.83714366, - "num_input_tokens_seen": 47737685, - "step": 2210, - "time_per_iteration": 2.676262378692627 - }, - { - "auxiliary_loss_clip": 0.0104445, - "auxiliary_loss_mlp": 0.00757817, - "balance_loss_clip": 1.03801322, - "balance_loss_mlp": 1.00007725, - "epoch": 0.1329325116488802, - "flos": 67694344369920.0, - "grad_norm": 0.7515252652740232, - "language_loss": 0.58031559, - "learning_rate": 3.889883876413563e-06, - "loss": 0.59833825, - "num_input_tokens_seen": 47802415, - "step": 2211, - "time_per_iteration": 3.3914146423339844 - }, - { - "auxiliary_loss_clip": 0.01064712, - "auxiliary_loss_mlp": 0.01012978, - "balance_loss_clip": 1.04205871, - "balance_loss_mlp": 1.01083231, - "epoch": 0.13299263490154817, - "flos": 72261894741120.0, - "grad_norm": 0.8012428422082742, - "language_loss": 0.55299425, - "learning_rate": 3.889756393905611e-06, - "loss": 0.57377112, - "num_input_tokens_seen": 47871485, - "step": 2212, - "time_per_iteration": 3.2910914421081543 - }, - { - "auxiliary_loss_clip": 0.01132433, - "auxiliary_loss_mlp": 0.01054299, - "balance_loss_clip": 1.05107963, - "balance_loss_mlp": 1.0331986, - "epoch": 0.13305275815421613, - "flos": 17931275729280.0, - "grad_norm": 2.484635795733661, - "language_loss": 0.74228692, - "learning_rate": 3.889628839737908e-06, - "loss": 0.7641542, - "num_input_tokens_seen": 47888315, - "step": 2213, - "time_per_iteration": 2.755777597427368 - }, - { - "auxiliary_loss_clip": 0.01114671, - "auxiliary_loss_mlp": 0.01051459, - "balance_loss_clip": 1.04682255, - "balance_loss_mlp": 1.03231359, - "epoch": 0.13311288140688413, - "flos": 22340889999360.0, - "grad_norm": 1.850943077435394, - "language_loss": 0.79699469, - "learning_rate": 3.889501213915291e-06, - "loss": 0.81865597, - "num_input_tokens_seen": 47906600, - "step": 2214, - "time_per_iteration": 2.702603340148926 - }, - { - "auxiliary_loss_clip": 0.01143494, - "auxiliary_loss_mlp": 0.01052411, - "balance_loss_clip": 1.05555344, - "balance_loss_mlp": 1.03171659, - "epoch": 0.1331730046595521, - "flos": 31868888682240.0, - "grad_norm": 1.8782588426913054, - "language_loss": 0.69341159, - "learning_rate": 3.889373516442597e-06, - "loss": 0.71537066, - "num_input_tokens_seen": 47927630, - "step": 2215, - "time_per_iteration": 2.769237518310547 - }, - { - "auxiliary_loss_clip": 0.01167307, - "auxiliary_loss_mlp": 0.01051423, - "balance_loss_clip": 1.06098068, - "balance_loss_mlp": 1.03132463, - "epoch": 0.13323312791222006, - "flos": 22566589107840.0, - "grad_norm": 1.884566493826098, - "language_loss": 0.81262428, - "learning_rate": 3.889245747324671e-06, - "loss": 0.83481157, - "num_input_tokens_seen": 47947935, - "step": 2216, - "time_per_iteration": 2.7427120208740234 - }, - { - "auxiliary_loss_clip": 0.01163681, - "auxiliary_loss_mlp": 0.01056545, - "balance_loss_clip": 1.06198788, - "balance_loss_mlp": 1.03631544, - "epoch": 0.13329325116488802, - "flos": 15085319293440.0, - "grad_norm": 3.783334161704178, - "language_loss": 0.87299347, - "learning_rate": 3.889117906566356e-06, - "loss": 0.89519572, - "num_input_tokens_seen": 47965515, - "step": 2217, - "time_per_iteration": 2.709527015686035 - }, - { - "auxiliary_loss_clip": 0.01152703, - "auxiliary_loss_mlp": 0.01056364, - "balance_loss_clip": 1.06054497, - "balance_loss_mlp": 1.0343225, - "epoch": 0.133353374417556, - "flos": 27453671890560.0, - "grad_norm": 4.412823416345162, - "language_loss": 0.73105222, - "learning_rate": 3.888989994172501e-06, - "loss": 0.75314289, - "num_input_tokens_seen": 47985675, - "step": 2218, - "time_per_iteration": 2.697733163833618 - }, - { - "auxiliary_loss_clip": 0.01129106, - "auxiliary_loss_mlp": 0.01051151, - "balance_loss_clip": 1.0535965, - "balance_loss_mlp": 1.02993202, - "epoch": 0.13341349767022395, - "flos": 24094695456000.0, - "grad_norm": 1.7935349411013712, - "language_loss": 0.86911142, - "learning_rate": 3.8888620101479565e-06, - "loss": 0.89091408, - "num_input_tokens_seen": 48004985, - "step": 2219, - "time_per_iteration": 2.7641642093658447 - }, - { - "auxiliary_loss_clip": 0.01141172, - "auxiliary_loss_mlp": 0.0106326, - "balance_loss_clip": 1.05751657, - "balance_loss_mlp": 1.04406714, - "epoch": 0.13347362092289192, - "flos": 24133335511680.0, - "grad_norm": 1.8604531362737113, - "language_loss": 0.77244747, - "learning_rate": 3.888733954497574e-06, - "loss": 0.79449183, - "num_input_tokens_seen": 48024965, - "step": 2220, - "time_per_iteration": 2.732160806655884 - }, - { - "auxiliary_loss_clip": 0.01146487, - "auxiliary_loss_mlp": 0.01048662, - "balance_loss_clip": 1.05399704, - "balance_loss_mlp": 1.03001785, - "epoch": 0.1335337441755599, - "flos": 18436538390400.0, - "grad_norm": 2.3004113327688955, - "language_loss": 0.79467338, - "learning_rate": 3.888605827226212e-06, - "loss": 0.81662482, - "num_input_tokens_seen": 48040890, - "step": 2221, - "time_per_iteration": 2.685612440109253 - }, - { - "auxiliary_loss_clip": 0.01062777, - "auxiliary_loss_mlp": 0.01021711, - "balance_loss_clip": 1.03293467, - "balance_loss_mlp": 1.0194701, - "epoch": 0.13359386742822787, - "flos": 50611997652480.0, - "grad_norm": 0.9755051104211709, - "language_loss": 0.68938822, - "learning_rate": 3.8884776283387275e-06, - "loss": 0.71023309, - "num_input_tokens_seen": 48091855, - "step": 2222, - "time_per_iteration": 3.0336835384368896 - }, - { - "auxiliary_loss_clip": 0.01130152, - "auxiliary_loss_mlp": 0.01058574, - "balance_loss_clip": 1.05544209, - "balance_loss_mlp": 1.03940475, - "epoch": 0.13365399068089584, - "flos": 22778569221120.0, - "grad_norm": 2.1295993667823416, - "language_loss": 0.67389107, - "learning_rate": 3.888349357839982e-06, - "loss": 0.69577825, - "num_input_tokens_seen": 48111350, - "step": 2223, - "time_per_iteration": 2.7134146690368652 - }, - { - "auxiliary_loss_clip": 0.01161386, - "auxiliary_loss_mlp": 0.01060571, - "balance_loss_clip": 1.05785358, - "balance_loss_mlp": 1.04010296, - "epoch": 0.1337141139335638, - "flos": 12531603911040.0, - "grad_norm": 4.277142483609355, - "language_loss": 0.82505226, - "learning_rate": 3.88822101573484e-06, - "loss": 0.84727186, - "num_input_tokens_seen": 48129840, - "step": 2224, - "time_per_iteration": 2.608372926712036 - }, - { - "auxiliary_loss_clip": 0.01173412, - "auxiliary_loss_mlp": 0.01050086, - "balance_loss_clip": 1.0573926, - "balance_loss_mlp": 1.0290221, - "epoch": 0.13377423718623177, - "flos": 23038957889280.0, - "grad_norm": 1.9890294619132924, - "language_loss": 0.66270435, - "learning_rate": 3.888092602028167e-06, - "loss": 0.68493932, - "num_input_tokens_seen": 48149240, - "step": 2225, - "time_per_iteration": 2.6304945945739746 - }, - { - "auxiliary_loss_clip": 0.01153626, - "auxiliary_loss_mlp": 0.01051637, - "balance_loss_clip": 1.05233717, - "balance_loss_mlp": 1.03180075, - "epoch": 0.13383436043889974, - "flos": 16216397637120.0, - "grad_norm": 2.2915668787246997, - "language_loss": 0.89469218, - "learning_rate": 3.887964116724835e-06, - "loss": 0.91674477, - "num_input_tokens_seen": 48166330, - "step": 2226, - "time_per_iteration": 2.6002328395843506 - }, - { - "auxiliary_loss_clip": 0.01150395, - "auxiliary_loss_mlp": 0.01054296, - "balance_loss_clip": 1.0549798, - "balance_loss_mlp": 1.03423262, - "epoch": 0.1338944836915677, - "flos": 24279671520000.0, - "grad_norm": 1.7271512115821777, - "language_loss": 0.73209751, - "learning_rate": 3.887835559829712e-06, - "loss": 0.75414443, - "num_input_tokens_seen": 48187600, - "step": 2227, - "time_per_iteration": 2.706193447113037 - }, - { - "auxiliary_loss_clip": 0.01157707, - "auxiliary_loss_mlp": 0.01047387, - "balance_loss_clip": 1.05518484, - "balance_loss_mlp": 1.02683568, - "epoch": 0.1339546069442357, - "flos": 17598742594560.0, - "grad_norm": 2.848999829625599, - "language_loss": 0.85160232, - "learning_rate": 3.8877069313476764e-06, - "loss": 0.87365323, - "num_input_tokens_seen": 48204400, - "step": 2228, - "time_per_iteration": 2.689209222793579 - }, - { - "auxiliary_loss_clip": 0.01132803, - "auxiliary_loss_mlp": 0.01052829, - "balance_loss_clip": 1.04935181, - "balance_loss_mlp": 1.03126431, - "epoch": 0.13401473019690366, - "flos": 18990065952000.0, - "grad_norm": 1.909679794697233, - "language_loss": 0.81460214, - "learning_rate": 3.8875782312836054e-06, - "loss": 0.83645844, - "num_input_tokens_seen": 48222180, - "step": 2229, - "time_per_iteration": 2.6380228996276855 - }, - { - "auxiliary_loss_clip": 0.0110557, - "auxiliary_loss_mlp": 0.01052684, - "balance_loss_clip": 1.04774594, - "balance_loss_mlp": 1.03233457, - "epoch": 0.13407485344957162, - "flos": 26943812288640.0, - "grad_norm": 1.7464076089691416, - "language_loss": 0.73822236, - "learning_rate": 3.887449459642378e-06, - "loss": 0.7598049, - "num_input_tokens_seen": 48243245, - "step": 2230, - "time_per_iteration": 2.7332983016967773 - }, - { - "auxiliary_loss_clip": 0.01125236, - "auxiliary_loss_mlp": 0.01058977, - "balance_loss_clip": 1.05213606, - "balance_loss_mlp": 1.03890252, - "epoch": 0.1341349767022396, - "flos": 20339373375360.0, - "grad_norm": 1.6827882777998602, - "language_loss": 0.80133682, - "learning_rate": 3.8873206164288785e-06, - "loss": 0.82317901, - "num_input_tokens_seen": 48262600, - "step": 2231, - "time_per_iteration": 2.6759045124053955 - }, - { - "auxiliary_loss_clip": 0.01111387, - "auxiliary_loss_mlp": 0.01057582, - "balance_loss_clip": 1.04997492, - "balance_loss_mlp": 1.03499198, - "epoch": 0.13419509995490755, - "flos": 29862020931840.0, - "grad_norm": 1.746756846769887, - "language_loss": 0.72152746, - "learning_rate": 3.887191701647992e-06, - "loss": 0.74321723, - "num_input_tokens_seen": 48285075, - "step": 2232, - "time_per_iteration": 4.391890048980713 - }, - { - "auxiliary_loss_clip": 0.0112104, - "auxiliary_loss_mlp": 0.01051805, - "balance_loss_clip": 1.0481019, - "balance_loss_mlp": 1.03039551, - "epoch": 0.13425522320757552, - "flos": 26942986275840.0, - "grad_norm": 2.4719586176391686, - "language_loss": 0.65116024, - "learning_rate": 3.8870627153046066e-06, - "loss": 0.67288864, - "num_input_tokens_seen": 48301285, - "step": 2233, - "time_per_iteration": 4.234508037567139 - }, - { - "auxiliary_loss_clip": 0.01167005, - "auxiliary_loss_mlp": 0.0104461, - "balance_loss_clip": 1.05189967, - "balance_loss_mlp": 1.02421367, - "epoch": 0.1343153464602435, - "flos": 15777281871360.0, - "grad_norm": 2.4864430088666656, - "language_loss": 0.80878961, - "learning_rate": 3.886933657403615e-06, - "loss": 0.8309058, - "num_input_tokens_seen": 48317835, - "step": 2234, - "time_per_iteration": 4.175215005874634 - }, - { - "auxiliary_loss_clip": 0.01140761, - "auxiliary_loss_mlp": 0.01054039, - "balance_loss_clip": 1.05052733, - "balance_loss_mlp": 1.03268874, - "epoch": 0.13437546971291148, - "flos": 24314756129280.0, - "grad_norm": 2.0569321713284827, - "language_loss": 0.82114553, - "learning_rate": 3.886804527949909e-06, - "loss": 0.84309351, - "num_input_tokens_seen": 48335670, - "step": 2235, - "time_per_iteration": 2.6588025093078613 - }, - { - "auxiliary_loss_clip": 0.01149093, - "auxiliary_loss_mlp": 0.01052015, - "balance_loss_clip": 1.05040097, - "balance_loss_mlp": 1.02983022, - "epoch": 0.13443559296557944, - "flos": 26650673395200.0, - "grad_norm": 1.6363146905087136, - "language_loss": 0.86092007, - "learning_rate": 3.8866753269483864e-06, - "loss": 0.88293117, - "num_input_tokens_seen": 48357805, - "step": 2236, - "time_per_iteration": 4.349383592605591 - }, - { - "auxiliary_loss_clip": 0.01166751, - "auxiliary_loss_mlp": 0.01047925, - "balance_loss_clip": 1.05288053, - "balance_loss_mlp": 1.02724242, - "epoch": 0.1344957162182474, - "flos": 21796197183360.0, - "grad_norm": 1.82135056053112, - "language_loss": 0.77258497, - "learning_rate": 3.886546054403946e-06, - "loss": 0.79473174, - "num_input_tokens_seen": 48377845, - "step": 2237, - "time_per_iteration": 2.6398766040802 - }, - { - "auxiliary_loss_clip": 0.01145425, - "auxiliary_loss_mlp": 0.01051006, - "balance_loss_clip": 1.05016851, - "balance_loss_mlp": 1.02919102, - "epoch": 0.13455583947091537, - "flos": 19865568049920.0, - "grad_norm": 2.440947698046141, - "language_loss": 0.78772336, - "learning_rate": 3.886416710321491e-06, - "loss": 0.80968761, - "num_input_tokens_seen": 48394735, - "step": 2238, - "time_per_iteration": 2.6556923389434814 - }, - { - "auxiliary_loss_clip": 0.01141594, - "auxiliary_loss_mlp": 0.01050085, - "balance_loss_clip": 1.05123293, - "balance_loss_mlp": 1.02878201, - "epoch": 0.13461596272358334, - "flos": 30846835094400.0, - "grad_norm": 2.9136729194949735, - "language_loss": 0.68486369, - "learning_rate": 3.886287294705924e-06, - "loss": 0.70678043, - "num_input_tokens_seen": 48414200, - "step": 2239, - "time_per_iteration": 2.6778814792633057 - }, - { - "auxiliary_loss_clip": 0.01147129, - "auxiliary_loss_mlp": 0.01052633, - "balance_loss_clip": 1.0515976, - "balance_loss_mlp": 1.03197384, - "epoch": 0.1346760859762513, - "flos": 12494436312960.0, - "grad_norm": 2.3763106012672925, - "language_loss": 0.81277847, - "learning_rate": 3.8861578075621555e-06, - "loss": 0.8347761, - "num_input_tokens_seen": 48431065, - "step": 2240, - "time_per_iteration": 2.5920939445495605 - }, - { - "auxiliary_loss_clip": 0.01107793, - "auxiliary_loss_mlp": 0.01049909, - "balance_loss_clip": 1.04459488, - "balance_loss_mlp": 1.02884459, - "epoch": 0.1347362092289193, - "flos": 21836022387840.0, - "grad_norm": 1.7269080191231387, - "language_loss": 0.77183759, - "learning_rate": 3.886028248895093e-06, - "loss": 0.79341465, - "num_input_tokens_seen": 48450335, - "step": 2241, - "time_per_iteration": 2.7224419116973877 - }, - { - "auxiliary_loss_clip": 0.0116331, - "auxiliary_loss_mlp": 0.01041419, - "balance_loss_clip": 1.05439126, - "balance_loss_mlp": 1.02324009, - "epoch": 0.13479633248158726, - "flos": 23509459163520.0, - "grad_norm": 2.0305903786470743, - "language_loss": 0.83062387, - "learning_rate": 3.88589861870965e-06, - "loss": 0.85267115, - "num_input_tokens_seen": 48468555, - "step": 2242, - "time_per_iteration": 2.5794169902801514 - }, - { - "auxiliary_loss_clip": 0.01170048, - "auxiliary_loss_mlp": 0.01056609, - "balance_loss_clip": 1.05504107, - "balance_loss_mlp": 1.03469825, - "epoch": 0.13485645573425523, - "flos": 29344332165120.0, - "grad_norm": 2.465549548535016, - "language_loss": 0.6498239, - "learning_rate": 3.885768917010744e-06, - "loss": 0.67209053, - "num_input_tokens_seen": 48488515, - "step": 2243, - "time_per_iteration": 2.6709110736846924 - }, - { - "auxiliary_loss_clip": 0.01125086, - "auxiliary_loss_mlp": 0.01046786, - "balance_loss_clip": 1.04593956, - "balance_loss_mlp": 1.02618706, - "epoch": 0.1349165789869232, - "flos": 28037112503040.0, - "grad_norm": 1.7770524512670738, - "language_loss": 0.72633034, - "learning_rate": 3.8856391438032895e-06, - "loss": 0.74804902, - "num_input_tokens_seen": 48510515, - "step": 2244, - "time_per_iteration": 2.713803768157959 - }, - { - "auxiliary_loss_clip": 0.0115377, - "auxiliary_loss_mlp": 0.0105148, - "balance_loss_clip": 1.05312431, - "balance_loss_mlp": 1.03209639, - "epoch": 0.13497670223959116, - "flos": 22853730430080.0, - "grad_norm": 1.7564166456764931, - "language_loss": 0.86023217, - "learning_rate": 3.88550929909221e-06, - "loss": 0.88228464, - "num_input_tokens_seen": 48529940, - "step": 2245, - "time_per_iteration": 2.626560926437378 - }, - { - "auxiliary_loss_clip": 0.01149467, - "auxiliary_loss_mlp": 0.0105327, - "balance_loss_clip": 1.05035663, - "balance_loss_mlp": 1.03346968, - "epoch": 0.13503682549225912, - "flos": 16504580453760.0, - "grad_norm": 1.7861449859595755, - "language_loss": 0.78912753, - "learning_rate": 3.88537938288243e-06, - "loss": 0.8111549, - "num_input_tokens_seen": 48548190, - "step": 2246, - "time_per_iteration": 2.6543703079223633 - }, - { - "auxiliary_loss_clip": 0.010304, - "auxiliary_loss_mlp": 0.01015407, - "balance_loss_clip": 1.03666449, - "balance_loss_mlp": 1.01285601, - "epoch": 0.1350969487449271, - "flos": 70756303242240.0, - "grad_norm": 0.7509256694227144, - "language_loss": 0.6054731, - "learning_rate": 3.885249395178874e-06, - "loss": 0.62593114, - "num_input_tokens_seen": 48613165, - "step": 2247, - "time_per_iteration": 3.3349809646606445 - }, - { - "auxiliary_loss_clip": 0.01162017, - "auxiliary_loss_mlp": 0.01056869, - "balance_loss_clip": 1.05492628, - "balance_loss_mlp": 1.03470767, - "epoch": 0.13515707199759508, - "flos": 23075981832960.0, - "grad_norm": 2.562042993856578, - "language_loss": 0.80841738, - "learning_rate": 3.885119335986473e-06, - "loss": 0.83060622, - "num_input_tokens_seen": 48631705, - "step": 2248, - "time_per_iteration": 2.6279287338256836 - }, - { - "auxiliary_loss_clip": 0.0114073, - "auxiliary_loss_mlp": 0.01049128, - "balance_loss_clip": 1.05086231, - "balance_loss_mlp": 1.03054309, - "epoch": 0.13521719525026304, - "flos": 23186371305600.0, - "grad_norm": 1.9247838227480492, - "language_loss": 0.77108699, - "learning_rate": 3.884989205310157e-06, - "loss": 0.79298556, - "num_input_tokens_seen": 48649740, - "step": 2249, - "time_per_iteration": 2.7100210189819336 - }, - { - "auxiliary_loss_clip": 0.0112733, - "auxiliary_loss_mlp": 0.01057649, - "balance_loss_clip": 1.05325472, - "balance_loss_mlp": 1.03863478, - "epoch": 0.135277318502931, - "flos": 24790931752320.0, - "grad_norm": 1.7403695434994237, - "language_loss": 0.84457541, - "learning_rate": 3.884859003154862e-06, - "loss": 0.86642522, - "num_input_tokens_seen": 48671565, - "step": 2250, - "time_per_iteration": 2.789350986480713 - }, - { - "auxiliary_loss_clip": 0.01155547, - "auxiliary_loss_mlp": 0.0105348, - "balance_loss_clip": 1.05310512, - "balance_loss_mlp": 1.03243995, - "epoch": 0.13533744175559898, - "flos": 21908525990400.0, - "grad_norm": 3.018154510939524, - "language_loss": 0.81796515, - "learning_rate": 3.884728729525524e-06, - "loss": 0.84005541, - "num_input_tokens_seen": 48690425, - "step": 2251, - "time_per_iteration": 2.685617208480835 - }, - { - "auxiliary_loss_clip": 0.01165433, - "auxiliary_loss_mlp": 0.01060257, - "balance_loss_clip": 1.05235004, - "balance_loss_mlp": 1.03888273, - "epoch": 0.13539756500826694, - "flos": 21211643249280.0, - "grad_norm": 1.7680273527580506, - "language_loss": 0.86173487, - "learning_rate": 3.884598384427084e-06, - "loss": 0.88399172, - "num_input_tokens_seen": 48707505, - "step": 2252, - "time_per_iteration": 2.597219467163086 - }, - { - "auxiliary_loss_clip": 0.01052296, - "auxiliary_loss_mlp": 0.01018557, - "balance_loss_clip": 1.02446079, - "balance_loss_mlp": 1.01632786, - "epoch": 0.1354576882609349, - "flos": 63242103634560.0, - "grad_norm": 0.8028920055572067, - "language_loss": 0.61837333, - "learning_rate": 3.884467967864485e-06, - "loss": 0.6390819, - "num_input_tokens_seen": 48775895, - "step": 2253, - "time_per_iteration": 3.25115704536438 - }, - { - "auxiliary_loss_clip": 0.01155107, - "auxiliary_loss_mlp": 0.01055639, - "balance_loss_clip": 1.0539906, - "balance_loss_mlp": 1.03587449, - "epoch": 0.1355178115136029, - "flos": 25483037984640.0, - "grad_norm": 1.6376691715964824, - "language_loss": 0.89441288, - "learning_rate": 3.884337479842671e-06, - "loss": 0.91652036, - "num_input_tokens_seen": 48798370, - "step": 2254, - "time_per_iteration": 2.6803932189941406 - }, - { - "auxiliary_loss_clip": 0.01131786, - "auxiliary_loss_mlp": 0.01063066, - "balance_loss_clip": 1.04506016, - "balance_loss_mlp": 1.03872383, - "epoch": 0.13557793476627086, - "flos": 21616967295360.0, - "grad_norm": 2.1104776784573787, - "language_loss": 0.84626925, - "learning_rate": 3.884206920366591e-06, - "loss": 0.86821771, - "num_input_tokens_seen": 48817955, - "step": 2255, - "time_per_iteration": 2.7074074745178223 - }, - { - "auxiliary_loss_clip": 0.01165481, - "auxiliary_loss_mlp": 0.01058458, - "balance_loss_clip": 1.05211091, - "balance_loss_mlp": 1.03767991, - "epoch": 0.13563805801893883, - "flos": 24928253447040.0, - "grad_norm": 4.791676738707355, - "language_loss": 0.74684238, - "learning_rate": 3.884076289441196e-06, - "loss": 0.76908177, - "num_input_tokens_seen": 48836330, - "step": 2256, - "time_per_iteration": 2.590178966522217 - }, - { - "auxiliary_loss_clip": 0.01127027, - "auxiliary_loss_mlp": 0.01054317, - "balance_loss_clip": 1.04977024, - "balance_loss_mlp": 1.03338361, - "epoch": 0.1356981812716068, - "flos": 14750272206720.0, - "grad_norm": 5.890843360804152, - "language_loss": 0.8309083, - "learning_rate": 3.88394558707144e-06, - "loss": 0.85272169, - "num_input_tokens_seen": 48851890, - "step": 2257, - "time_per_iteration": 2.642096519470215 - }, - { - "auxiliary_loss_clip": 0.0114984, - "auxiliary_loss_mlp": 0.00780177, - "balance_loss_clip": 1.05128407, - "balance_loss_mlp": 1.00013828, - "epoch": 0.13575830452427476, - "flos": 11108571822720.0, - "grad_norm": 2.1957250492246505, - "language_loss": 0.82045269, - "learning_rate": 3.883814813262277e-06, - "loss": 0.83975297, - "num_input_tokens_seen": 48865510, - "step": 2258, - "time_per_iteration": 2.6279473304748535 - }, - { - "auxiliary_loss_clip": 0.01155515, - "auxiliary_loss_mlp": 0.01054519, - "balance_loss_clip": 1.05172098, - "balance_loss_mlp": 1.03152323, - "epoch": 0.13581842777694272, - "flos": 17960290940160.0, - "grad_norm": 2.6364031487830464, - "language_loss": 0.82694167, - "learning_rate": 3.883683968018669e-06, - "loss": 0.849042, - "num_input_tokens_seen": 48882360, - "step": 2259, - "time_per_iteration": 2.677804708480835 - }, - { - "auxiliary_loss_clip": 0.01127201, - "auxiliary_loss_mlp": 0.01054646, - "balance_loss_clip": 1.0495683, - "balance_loss_mlp": 1.03547728, - "epoch": 0.1358785510296107, - "flos": 22857142222080.0, - "grad_norm": 2.0790748617118853, - "language_loss": 0.73916006, - "learning_rate": 3.8835530513455755e-06, - "loss": 0.76097858, - "num_input_tokens_seen": 48902700, - "step": 2260, - "time_per_iteration": 2.7416799068450928 - }, - { - "auxiliary_loss_clip": 0.01144177, - "auxiliary_loss_mlp": 0.01056881, - "balance_loss_clip": 1.05196047, - "balance_loss_mlp": 1.03691387, - "epoch": 0.13593867428227868, - "flos": 25739404329600.0, - "grad_norm": 3.546593987683097, - "language_loss": 0.74799728, - "learning_rate": 3.883422063247961e-06, - "loss": 0.77000785, - "num_input_tokens_seen": 48922525, - "step": 2261, - "time_per_iteration": 2.675342559814453 - }, - { - "auxiliary_loss_clip": 0.01170469, - "auxiliary_loss_mlp": 0.01050986, - "balance_loss_clip": 1.05486035, - "balance_loss_mlp": 1.03043413, - "epoch": 0.13599879753494665, - "flos": 31249214225280.0, - "grad_norm": 2.967396076139427, - "language_loss": 0.63602281, - "learning_rate": 3.883291003730794e-06, - "loss": 0.65823734, - "num_input_tokens_seen": 48942510, - "step": 2262, - "time_per_iteration": 2.660538911819458 - }, - { - "auxiliary_loss_clip": 0.01148004, - "auxiliary_loss_mlp": 0.01052118, - "balance_loss_clip": 1.0516696, - "balance_loss_mlp": 1.03216195, - "epoch": 0.1360589207876146, - "flos": 23915034604800.0, - "grad_norm": 2.301949377353301, - "language_loss": 0.81810403, - "learning_rate": 3.883159872799043e-06, - "loss": 0.84010524, - "num_input_tokens_seen": 48962625, - "step": 2263, - "time_per_iteration": 2.840043783187866 - }, - { - "auxiliary_loss_clip": 0.01098888, - "auxiliary_loss_mlp": 0.01064302, - "balance_loss_clip": 1.04875195, - "balance_loss_mlp": 1.0410558, - "epoch": 0.13611904404028258, - "flos": 19974197756160.0, - "grad_norm": 1.7561035968690553, - "language_loss": 0.87737143, - "learning_rate": 3.8830286704576815e-06, - "loss": 0.89900339, - "num_input_tokens_seen": 48982525, - "step": 2264, - "time_per_iteration": 2.784648895263672 - }, - { - "auxiliary_loss_clip": 0.01157618, - "auxiliary_loss_mlp": 0.01049521, - "balance_loss_clip": 1.05161715, - "balance_loss_mlp": 1.02709746, - "epoch": 0.13617916729295054, - "flos": 15340644144000.0, - "grad_norm": 3.151792845640157, - "language_loss": 0.7115528, - "learning_rate": 3.882897396711683e-06, - "loss": 0.7336241, - "num_input_tokens_seen": 48997605, - "step": 2265, - "time_per_iteration": 2.6108245849609375 - }, - { - "auxiliary_loss_clip": 0.01111831, - "auxiliary_loss_mlp": 0.01042545, - "balance_loss_clip": 1.05199265, - "balance_loss_mlp": 1.02256525, - "epoch": 0.1362392905456185, - "flos": 27451445247360.0, - "grad_norm": 4.918827494175735, - "language_loss": 0.6671263, - "learning_rate": 3.882766051566027e-06, - "loss": 0.68867004, - "num_input_tokens_seen": 49018535, - "step": 2266, - "time_per_iteration": 2.7810373306274414 - }, - { - "auxiliary_loss_clip": 0.01127539, - "auxiliary_loss_mlp": 0.01057589, - "balance_loss_clip": 1.05683684, - "balance_loss_mlp": 1.03739524, - "epoch": 0.1362994137982865, - "flos": 25009017177600.0, - "grad_norm": 1.707924588861666, - "language_loss": 0.7634865, - "learning_rate": 3.882634635025694e-06, - "loss": 0.78533769, - "num_input_tokens_seen": 49038865, - "step": 2267, - "time_per_iteration": 2.7682721614837646 - }, - { - "auxiliary_loss_clip": 0.01133448, - "auxiliary_loss_mlp": 0.01048207, - "balance_loss_clip": 1.04668903, - "balance_loss_mlp": 1.02641535, - "epoch": 0.13635953705095447, - "flos": 20303031790080.0, - "grad_norm": 2.9531688260339934, - "language_loss": 0.81653506, - "learning_rate": 3.882503147095667e-06, - "loss": 0.83835161, - "num_input_tokens_seen": 49058010, - "step": 2268, - "time_per_iteration": 2.645081043243408 - }, - { - "auxiliary_loss_clip": 0.01155147, - "auxiliary_loss_mlp": 0.01048448, - "balance_loss_clip": 1.05424881, - "balance_loss_mlp": 1.02738333, - "epoch": 0.13641966030362243, - "flos": 31358418549120.0, - "grad_norm": 1.9923150848418427, - "language_loss": 0.75975174, - "learning_rate": 3.882371587780931e-06, - "loss": 0.78178769, - "num_input_tokens_seen": 49080330, - "step": 2269, - "time_per_iteration": 2.6764814853668213 - }, - { - "auxiliary_loss_clip": 0.0113465, - "auxiliary_loss_mlp": 0.01049702, - "balance_loss_clip": 1.04941857, - "balance_loss_mlp": 1.02844727, - "epoch": 0.1364797835562904, - "flos": 20478095700480.0, - "grad_norm": 2.1475090354855473, - "language_loss": 0.81328762, - "learning_rate": 3.882239957086477e-06, - "loss": 0.83513117, - "num_input_tokens_seen": 49097035, - "step": 2270, - "time_per_iteration": 2.6801655292510986 - }, - { - "auxiliary_loss_clip": 0.01142111, - "auxiliary_loss_mlp": 0.010594, - "balance_loss_clip": 1.04989171, - "balance_loss_mlp": 1.03773928, - "epoch": 0.13653990680895836, - "flos": 13078343802240.0, - "grad_norm": 3.2227070482893976, - "language_loss": 0.75812757, - "learning_rate": 3.882108255017295e-06, - "loss": 0.78014266, - "num_input_tokens_seen": 49113945, - "step": 2271, - "time_per_iteration": 4.197805166244507 - }, - { - "auxiliary_loss_clip": 0.01156913, - "auxiliary_loss_mlp": 0.01061846, - "balance_loss_clip": 1.05097795, - "balance_loss_mlp": 1.03921962, - "epoch": 0.13660003006162633, - "flos": 16946712961920.0, - "grad_norm": 2.2800716885469754, - "language_loss": 0.80251753, - "learning_rate": 3.881976481578379e-06, - "loss": 0.82470512, - "num_input_tokens_seen": 49132855, - "step": 2272, - "time_per_iteration": 4.1461029052734375 - }, - { - "auxiliary_loss_clip": 0.01055091, - "auxiliary_loss_mlp": 0.01042701, - "balance_loss_clip": 1.02539539, - "balance_loss_mlp": 1.04001904, - "epoch": 0.1366601533142943, - "flos": 68682749892480.0, - "grad_norm": 0.7097054685047118, - "language_loss": 0.60739923, - "learning_rate": 3.8818446367747255e-06, - "loss": 0.62837708, - "num_input_tokens_seen": 49198310, - "step": 2273, - "time_per_iteration": 4.731219530105591 - }, - { - "auxiliary_loss_clip": 0.01165514, - "auxiliary_loss_mlp": 0.00780474, - "balance_loss_clip": 1.0523783, - "balance_loss_mlp": 1.00008452, - "epoch": 0.13672027656696228, - "flos": 19244241567360.0, - "grad_norm": 2.4844725334882583, - "language_loss": 0.77506429, - "learning_rate": 3.881712720611336e-06, - "loss": 0.79452413, - "num_input_tokens_seen": 49217250, - "step": 2274, - "time_per_iteration": 2.7122738361358643 - }, - { - "auxiliary_loss_clip": 0.01154937, - "auxiliary_loss_mlp": 0.01054542, - "balance_loss_clip": 1.05082417, - "balance_loss_mlp": 1.03271496, - "epoch": 0.13678039981963025, - "flos": 24534924543360.0, - "grad_norm": 2.391437383339344, - "language_loss": 0.78256011, - "learning_rate": 3.881580733093211e-06, - "loss": 0.8046549, - "num_input_tokens_seen": 49236615, - "step": 2275, - "time_per_iteration": 2.6674444675445557 - }, - { - "auxiliary_loss_clip": 0.01154585, - "auxiliary_loss_mlp": 0.01044634, - "balance_loss_clip": 1.05220842, - "balance_loss_mlp": 1.02449977, - "epoch": 0.13684052307229821, - "flos": 15669334523520.0, - "grad_norm": 2.271072834476717, - "language_loss": 0.81682789, - "learning_rate": 3.881448674225356e-06, - "loss": 0.83882004, - "num_input_tokens_seen": 49253935, - "step": 2276, - "time_per_iteration": 4.202202558517456 - }, - { - "auxiliary_loss_clip": 0.01164941, - "auxiliary_loss_mlp": 0.01060078, - "balance_loss_clip": 1.05228245, - "balance_loss_mlp": 1.03604531, - "epoch": 0.13690064632496618, - "flos": 28364689560960.0, - "grad_norm": 5.063053962589045, - "language_loss": 0.69948691, - "learning_rate": 3.881316544012779e-06, - "loss": 0.72173715, - "num_input_tokens_seen": 49273605, - "step": 2277, - "time_per_iteration": 2.708591938018799 - }, - { - "auxiliary_loss_clip": 0.01160044, - "auxiliary_loss_mlp": 0.00780297, - "balance_loss_clip": 1.05169702, - "balance_loss_mlp": 1.00017083, - "epoch": 0.13696076957763414, - "flos": 23404779953280.0, - "grad_norm": 2.062701620585305, - "language_loss": 0.80197465, - "learning_rate": 3.88118434246049e-06, - "loss": 0.82137805, - "num_input_tokens_seen": 49291785, - "step": 2278, - "time_per_iteration": 2.6916158199310303 - }, - { - "auxiliary_loss_clip": 0.01159146, - "auxiliary_loss_mlp": 0.01060686, - "balance_loss_clip": 1.05954766, - "balance_loss_mlp": 1.03925228, - "epoch": 0.1370208928303021, - "flos": 37196595601920.0, - "grad_norm": 7.088344486179519, - "language_loss": 0.75048816, - "learning_rate": 3.881052069573502e-06, - "loss": 0.77268648, - "num_input_tokens_seen": 49311405, - "step": 2279, - "time_per_iteration": 2.7316977977752686 - }, - { - "auxiliary_loss_clip": 0.01101952, - "auxiliary_loss_mlp": 0.01066685, - "balance_loss_clip": 1.04605758, - "balance_loss_mlp": 1.04485774, - "epoch": 0.13708101608297008, - "flos": 26976311118720.0, - "grad_norm": 2.5293116992138223, - "language_loss": 0.76743513, - "learning_rate": 3.880919725356831e-06, - "loss": 0.78912151, - "num_input_tokens_seen": 49331835, - "step": 2280, - "time_per_iteration": 2.813720941543579 - }, - { - "auxiliary_loss_clip": 0.01108594, - "auxiliary_loss_mlp": 0.01060805, - "balance_loss_clip": 1.04457331, - "balance_loss_mlp": 1.04022956, - "epoch": 0.13714113933563807, - "flos": 32556864850560.0, - "grad_norm": 2.0597640944890325, - "language_loss": 0.79657966, - "learning_rate": 3.880787309815496e-06, - "loss": 0.81827366, - "num_input_tokens_seen": 49352290, - "step": 2281, - "time_per_iteration": 2.8325345516204834 - }, - { - "auxiliary_loss_clip": 0.0117656, - "auxiliary_loss_mlp": 0.0107773, - "balance_loss_clip": 1.05715084, - "balance_loss_mlp": 1.05671358, - "epoch": 0.13720126258830603, - "flos": 16101267569280.0, - "grad_norm": 2.0769142230572877, - "language_loss": 0.83383757, - "learning_rate": 3.880654822954518e-06, - "loss": 0.85638046, - "num_input_tokens_seen": 49370285, - "step": 2282, - "time_per_iteration": 2.5988755226135254 - }, - { - "auxiliary_loss_clip": 0.01142098, - "auxiliary_loss_mlp": 0.01075909, - "balance_loss_clip": 1.04898703, - "balance_loss_mlp": 1.05583453, - "epoch": 0.137261385840974, - "flos": 18953544798720.0, - "grad_norm": 1.5269487193470777, - "language_loss": 0.73526621, - "learning_rate": 3.8805222647789195e-06, - "loss": 0.75744629, - "num_input_tokens_seen": 49389610, - "step": 2283, - "time_per_iteration": 2.7099714279174805 - }, - { - "auxiliary_loss_clip": 0.01160178, - "auxiliary_loss_mlp": 0.01062577, - "balance_loss_clip": 1.05577087, - "balance_loss_mlp": 1.04173923, - "epoch": 0.13732150909364196, - "flos": 23295360147840.0, - "grad_norm": 2.2306012559941455, - "language_loss": 0.83934438, - "learning_rate": 3.880389635293729e-06, - "loss": 0.86157191, - "num_input_tokens_seen": 49408390, - "step": 2284, - "time_per_iteration": 2.7315831184387207 - }, - { - "auxiliary_loss_clip": 0.01151427, - "auxiliary_loss_mlp": 0.01070288, - "balance_loss_clip": 1.05204272, - "balance_loss_mlp": 1.04779351, - "epoch": 0.13738163234630993, - "flos": 29351263489920.0, - "grad_norm": 2.0900141273659223, - "language_loss": 0.7557056, - "learning_rate": 3.880256934503974e-06, - "loss": 0.77792281, - "num_input_tokens_seen": 49427725, - "step": 2285, - "time_per_iteration": 2.7257747650146484 - }, - { - "auxiliary_loss_clip": 0.01144078, - "auxiliary_loss_mlp": 0.01064539, - "balance_loss_clip": 1.05233073, - "balance_loss_mlp": 1.04392731, - "epoch": 0.1374417555989779, - "flos": 26651319840000.0, - "grad_norm": 2.727019945657865, - "language_loss": 0.74521589, - "learning_rate": 3.880124162414689e-06, - "loss": 0.76730204, - "num_input_tokens_seen": 49449000, - "step": 2286, - "time_per_iteration": 2.742582082748413 - }, - { - "auxiliary_loss_clip": 0.0112541, - "auxiliary_loss_mlp": 0.01059198, - "balance_loss_clip": 1.04906356, - "balance_loss_mlp": 1.03659606, - "epoch": 0.1375018788516459, - "flos": 28403401443840.0, - "grad_norm": 2.2168449035378357, - "language_loss": 0.86683542, - "learning_rate": 3.879991319030908e-06, - "loss": 0.88868147, - "num_input_tokens_seen": 49468360, - "step": 2287, - "time_per_iteration": 2.802088499069214 - }, - { - "auxiliary_loss_clip": 0.01124712, - "auxiliary_loss_mlp": 0.01064517, - "balance_loss_clip": 1.04803944, - "balance_loss_mlp": 1.04207003, - "epoch": 0.13756200210431385, - "flos": 37413783187200.0, - "grad_norm": 2.0592152854463106, - "language_loss": 0.68410838, - "learning_rate": 3.879858404357666e-06, - "loss": 0.70600063, - "num_input_tokens_seen": 49493450, - "step": 2288, - "time_per_iteration": 2.861175537109375 - }, - { - "auxiliary_loss_clip": 0.01112106, - "auxiliary_loss_mlp": 0.01071262, - "balance_loss_clip": 1.05062151, - "balance_loss_mlp": 1.04666936, - "epoch": 0.13762212535698182, - "flos": 22711021695360.0, - "grad_norm": 2.3933568244149357, - "language_loss": 0.87090456, - "learning_rate": 3.879725418400005e-06, - "loss": 0.89273822, - "num_input_tokens_seen": 49511220, - "step": 2289, - "time_per_iteration": 2.7185773849487305 - }, - { - "auxiliary_loss_clip": 0.01130193, - "auxiliary_loss_mlp": 0.00781167, - "balance_loss_clip": 1.0480957, - "balance_loss_mlp": 1.00019848, - "epoch": 0.13768224860964978, - "flos": 23952130375680.0, - "grad_norm": 1.8106848287624444, - "language_loss": 0.74668044, - "learning_rate": 3.879592361162969e-06, - "loss": 0.76579404, - "num_input_tokens_seen": 49529820, - "step": 2290, - "time_per_iteration": 2.6751222610473633 - }, - { - "auxiliary_loss_clip": 0.01039657, - "auxiliary_loss_mlp": 0.01081332, - "balance_loss_clip": 1.03094769, - "balance_loss_mlp": 1.07881641, - "epoch": 0.13774237186231775, - "flos": 63590438753280.0, - "grad_norm": 0.7179159366671727, - "language_loss": 0.51597112, - "learning_rate": 3.8794592326516015e-06, - "loss": 0.53718102, - "num_input_tokens_seen": 49595325, - "step": 2291, - "time_per_iteration": 3.2823359966278076 - }, - { - "auxiliary_loss_clip": 0.01157406, - "auxiliary_loss_mlp": 0.01052846, - "balance_loss_clip": 1.05224037, - "balance_loss_mlp": 1.03123331, - "epoch": 0.1378024951149857, - "flos": 24279456038400.0, - "grad_norm": 1.9326408617769533, - "language_loss": 0.71273667, - "learning_rate": 3.879326032870952e-06, - "loss": 0.7348392, - "num_input_tokens_seen": 49615850, - "step": 2292, - "time_per_iteration": 2.74045729637146 - }, - { - "auxiliary_loss_clip": 0.01156871, - "auxiliary_loss_mlp": 0.01049315, - "balance_loss_clip": 1.05427122, - "balance_loss_mlp": 1.02931166, - "epoch": 0.13786261836765368, - "flos": 14021537080320.0, - "grad_norm": 6.592759889378346, - "language_loss": 0.8047784, - "learning_rate": 3.879192761826071e-06, - "loss": 0.82684022, - "num_input_tokens_seen": 49631860, - "step": 2293, - "time_per_iteration": 2.587576389312744 - }, - { - "auxiliary_loss_clip": 0.0115787, - "auxiliary_loss_mlp": 0.0104972, - "balance_loss_clip": 1.0554558, - "balance_loss_mlp": 1.02921653, - "epoch": 0.13792274162032167, - "flos": 28878679226880.0, - "grad_norm": 1.9082895606463517, - "language_loss": 0.78440171, - "learning_rate": 3.879059419522011e-06, - "loss": 0.80647767, - "num_input_tokens_seen": 49652145, - "step": 2294, - "time_per_iteration": 2.7152793407440186 - }, - { - "auxiliary_loss_clip": 0.01126374, - "auxiliary_loss_mlp": 0.01050648, - "balance_loss_clip": 1.05281758, - "balance_loss_mlp": 1.03104973, - "epoch": 0.13798286487298964, - "flos": 21141150808320.0, - "grad_norm": 1.991103290125302, - "language_loss": 0.80339509, - "learning_rate": 3.878926005963831e-06, - "loss": 0.82516527, - "num_input_tokens_seen": 49669880, - "step": 2295, - "time_per_iteration": 2.7026021480560303 - }, - { - "auxiliary_loss_clip": 0.01154693, - "auxiliary_loss_mlp": 0.01052186, - "balance_loss_clip": 1.05239046, - "balance_loss_mlp": 1.03102624, - "epoch": 0.1380429881256576, - "flos": 22487477402880.0, - "grad_norm": 1.7450624966187134, - "language_loss": 0.78661883, - "learning_rate": 3.878792521156588e-06, - "loss": 0.80868757, - "num_input_tokens_seen": 49687255, - "step": 2296, - "time_per_iteration": 2.566929340362549 - }, - { - "auxiliary_loss_clip": 0.01153425, - "auxiliary_loss_mlp": 0.01069343, - "balance_loss_clip": 1.05437231, - "balance_loss_mlp": 1.04811132, - "epoch": 0.13810311137832557, - "flos": 21393674398080.0, - "grad_norm": 1.7434096141785573, - "language_loss": 0.78663194, - "learning_rate": 3.8786589651053446e-06, - "loss": 0.80885959, - "num_input_tokens_seen": 49706650, - "step": 2297, - "time_per_iteration": 2.6254489421844482 - }, - { - "auxiliary_loss_clip": 0.01110905, - "auxiliary_loss_mlp": 0.01059754, - "balance_loss_clip": 1.05296302, - "balance_loss_mlp": 1.03871369, - "epoch": 0.13816323463099353, - "flos": 25989844930560.0, - "grad_norm": 1.929043788877404, - "language_loss": 0.69199705, - "learning_rate": 3.878525337815164e-06, - "loss": 0.71370363, - "num_input_tokens_seen": 49725715, - "step": 2298, - "time_per_iteration": 2.791301965713501 - }, - { - "auxiliary_loss_clip": 0.01137772, - "auxiliary_loss_mlp": 0.01061768, - "balance_loss_clip": 1.0517292, - "balance_loss_mlp": 1.04059684, - "epoch": 0.1382233578836615, - "flos": 19244313394560.0, - "grad_norm": 1.7910922430646712, - "language_loss": 0.86382294, - "learning_rate": 3.878391639291116e-06, - "loss": 0.88581836, - "num_input_tokens_seen": 49744710, - "step": 2299, - "time_per_iteration": 2.6075453758239746 - }, - { - "auxiliary_loss_clip": 0.01166817, - "auxiliary_loss_mlp": 0.01054863, - "balance_loss_clip": 1.05378175, - "balance_loss_mlp": 1.03292871, - "epoch": 0.1382834811363295, - "flos": 25666290195840.0, - "grad_norm": 2.2378660690879606, - "language_loss": 0.75468475, - "learning_rate": 3.878257869538267e-06, - "loss": 0.77690154, - "num_input_tokens_seen": 49764300, - "step": 2300, - "time_per_iteration": 2.663328170776367 - }, - { - "auxiliary_loss_clip": 0.01130608, - "auxiliary_loss_mlp": 0.01047248, - "balance_loss_clip": 1.05274105, - "balance_loss_mlp": 1.02664876, - "epoch": 0.13834360438899745, - "flos": 19784193788160.0, - "grad_norm": 2.5571861214345963, - "language_loss": 0.82463622, - "learning_rate": 3.878124028561692e-06, - "loss": 0.8464148, - "num_input_tokens_seen": 49778380, - "step": 2301, - "time_per_iteration": 2.6705129146575928 - }, - { - "auxiliary_loss_clip": 0.0113862, - "auxiliary_loss_mlp": 0.00777879, - "balance_loss_clip": 1.05323792, - "balance_loss_mlp": 1.00021625, - "epoch": 0.13840372764166542, - "flos": 26651858544000.0, - "grad_norm": 1.9612043619218924, - "language_loss": 0.85957694, - "learning_rate": 3.877990116366466e-06, - "loss": 0.87874192, - "num_input_tokens_seen": 49797460, - "step": 2302, - "time_per_iteration": 2.679797410964966 - }, - { - "auxiliary_loss_clip": 0.01059341, - "auxiliary_loss_mlp": 0.01025212, - "balance_loss_clip": 1.03226125, - "balance_loss_mlp": 1.02244604, - "epoch": 0.13846385089433338, - "flos": 70510998286080.0, - "grad_norm": 0.7598813547967705, - "language_loss": 0.65591633, - "learning_rate": 3.877856132957667e-06, - "loss": 0.67676187, - "num_input_tokens_seen": 49868005, - "step": 2303, - "time_per_iteration": 3.3249399662017822 - }, - { - "auxiliary_loss_clip": 0.01151443, - "auxiliary_loss_mlp": 0.01046478, - "balance_loss_clip": 1.05337632, - "balance_loss_mlp": 1.02655792, - "epoch": 0.13852397414700135, - "flos": 17348732956800.0, - "grad_norm": 3.141207945865242, - "language_loss": 0.78663635, - "learning_rate": 3.877722078340374e-06, - "loss": 0.80861557, - "num_input_tokens_seen": 49885825, - "step": 2304, - "time_per_iteration": 2.7364001274108887 - }, - { - "auxiliary_loss_clip": 0.01157514, - "auxiliary_loss_mlp": 0.01043253, - "balance_loss_clip": 1.05566275, - "balance_loss_mlp": 1.02385736, - "epoch": 0.13858409739966931, - "flos": 21543781334400.0, - "grad_norm": 1.7487365854034607, - "language_loss": 0.77559888, - "learning_rate": 3.877587952519672e-06, - "loss": 0.79760659, - "num_input_tokens_seen": 49905975, - "step": 2305, - "time_per_iteration": 2.7814202308654785 - }, - { - "auxiliary_loss_clip": 0.01074766, - "auxiliary_loss_mlp": 0.01055718, - "balance_loss_clip": 1.04160607, - "balance_loss_mlp": 1.03473723, - "epoch": 0.13864422065233728, - "flos": 21579907438080.0, - "grad_norm": 1.8207477060355044, - "language_loss": 0.87737936, - "learning_rate": 3.877453755500647e-06, - "loss": 0.89868426, - "num_input_tokens_seen": 49925800, - "step": 2306, - "time_per_iteration": 2.917616605758667 - }, - { - "auxiliary_loss_clip": 0.01064826, - "auxiliary_loss_mlp": 0.0101208, - "balance_loss_clip": 1.02692199, - "balance_loss_mlp": 1.0094099, - "epoch": 0.13870434390500527, - "flos": 53371156872960.0, - "grad_norm": 0.8728538231155298, - "language_loss": 0.59008431, - "learning_rate": 3.877319487288387e-06, - "loss": 0.61085337, - "num_input_tokens_seen": 49977620, - "step": 2307, - "time_per_iteration": 3.4345149993896484 - }, - { - "auxiliary_loss_clip": 0.01169624, - "auxiliary_loss_mlp": 0.00778134, - "balance_loss_clip": 1.05528641, - "balance_loss_mlp": 1.00021303, - "epoch": 0.13876446715767324, - "flos": 22565906749440.0, - "grad_norm": 1.8467673932802395, - "language_loss": 0.79483795, - "learning_rate": 3.877185147887984e-06, - "loss": 0.81431556, - "num_input_tokens_seen": 49996650, - "step": 2308, - "time_per_iteration": 2.7137296199798584 - }, - { - "auxiliary_loss_clip": 0.01131024, - "auxiliary_loss_mlp": 0.01050332, - "balance_loss_clip": 1.05118585, - "balance_loss_mlp": 1.03054297, - "epoch": 0.1388245904103412, - "flos": 20705231352960.0, - "grad_norm": 2.352128383160346, - "language_loss": 0.78101134, - "learning_rate": 3.877050737304533e-06, - "loss": 0.80282485, - "num_input_tokens_seen": 50015640, - "step": 2309, - "time_per_iteration": 2.9259471893310547 - }, - { - "auxiliary_loss_clip": 0.01128109, - "auxiliary_loss_mlp": 0.01057348, - "balance_loss_clip": 1.04979932, - "balance_loss_mlp": 1.03620028, - "epoch": 0.13888471366300917, - "flos": 20554729367040.0, - "grad_norm": 3.914796791761399, - "language_loss": 0.68133545, - "learning_rate": 3.876916255543129e-06, - "loss": 0.70318997, - "num_input_tokens_seen": 50033500, - "step": 2310, - "time_per_iteration": 4.27877140045166 - }, - { - "auxiliary_loss_clip": 0.01164985, - "auxiliary_loss_mlp": 0.01062516, - "balance_loss_clip": 1.05356944, - "balance_loss_mlp": 1.04021168, - "epoch": 0.13894483691567713, - "flos": 13838033473920.0, - "grad_norm": 1.934954545600412, - "language_loss": 0.84295756, - "learning_rate": 3.8767817026088725e-06, - "loss": 0.86523259, - "num_input_tokens_seen": 50050075, - "step": 2311, - "time_per_iteration": 2.5612359046936035 - }, - { - "auxiliary_loss_clip": 0.01173749, - "auxiliary_loss_mlp": 0.01055474, - "balance_loss_clip": 1.05752683, - "balance_loss_mlp": 1.0350771, - "epoch": 0.1390049601683451, - "flos": 28031186759040.0, - "grad_norm": 2.9213009430481143, - "language_loss": 0.82358992, - "learning_rate": 3.876647078506866e-06, - "loss": 0.84588212, - "num_input_tokens_seen": 50070080, - "step": 2312, - "time_per_iteration": 5.737139701843262 - }, - { - "auxiliary_loss_clip": 0.01129781, - "auxiliary_loss_mlp": 0.00778347, - "balance_loss_clip": 1.05464363, - "balance_loss_mlp": 1.00023031, - "epoch": 0.13906508342101306, - "flos": 26756860976640.0, - "grad_norm": 2.109799495913242, - "language_loss": 0.86732674, - "learning_rate": 3.876512383242215e-06, - "loss": 0.88640809, - "num_input_tokens_seen": 50090040, - "step": 2313, - "time_per_iteration": 2.8402304649353027 - }, - { - "auxiliary_loss_clip": 0.01168088, - "auxiliary_loss_mlp": 0.01061738, - "balance_loss_clip": 1.05670547, - "balance_loss_mlp": 1.04115057, - "epoch": 0.13912520667368106, - "flos": 24535104111360.0, - "grad_norm": 1.784990717237318, - "language_loss": 0.79935932, - "learning_rate": 3.876377616820024e-06, - "loss": 0.8216576, - "num_input_tokens_seen": 50110595, - "step": 2314, - "time_per_iteration": 2.683448076248169 - }, - { - "auxiliary_loss_clip": 0.01124732, - "auxiliary_loss_mlp": 0.01061041, - "balance_loss_clip": 1.04845023, - "balance_loss_mlp": 1.04103708, - "epoch": 0.13918532992634902, - "flos": 19383215287680.0, - "grad_norm": 2.585875079553688, - "language_loss": 0.85367405, - "learning_rate": 3.876242779245409e-06, - "loss": 0.87553179, - "num_input_tokens_seen": 50125430, - "step": 2315, - "time_per_iteration": 4.332594394683838 - }, - { - "auxiliary_loss_clip": 0.01156122, - "auxiliary_loss_mlp": 0.01058532, - "balance_loss_clip": 1.05397022, - "balance_loss_mlp": 1.0372889, - "epoch": 0.139245453179017, - "flos": 21323756574720.0, - "grad_norm": 2.333331492160627, - "language_loss": 0.77170396, - "learning_rate": 3.876107870523477e-06, - "loss": 0.79385042, - "num_input_tokens_seen": 50144120, - "step": 2316, - "time_per_iteration": 2.654604911804199 - }, - { - "auxiliary_loss_clip": 0.01163967, - "auxiliary_loss_mlp": 0.00780027, - "balance_loss_clip": 1.05353916, - "balance_loss_mlp": 1.00024533, - "epoch": 0.13930557643168495, - "flos": 19500607912320.0, - "grad_norm": 2.1485284032262086, - "language_loss": 0.76820493, - "learning_rate": 3.875972890659349e-06, - "loss": 0.78764486, - "num_input_tokens_seen": 50162500, - "step": 2317, - "time_per_iteration": 2.6501235961914062 - }, - { - "auxiliary_loss_clip": 0.01144052, - "auxiliary_loss_mlp": 0.01061042, - "balance_loss_clip": 1.05156648, - "balance_loss_mlp": 1.04074025, - "epoch": 0.13936569968435292, - "flos": 25410821690880.0, - "grad_norm": 1.7797832869421444, - "language_loss": 0.80185997, - "learning_rate": 3.875837839658139e-06, - "loss": 0.82391089, - "num_input_tokens_seen": 50182415, - "step": 2318, - "time_per_iteration": 2.7097995281219482 - }, - { - "auxiliary_loss_clip": 0.01049096, - "auxiliary_loss_mlp": 0.01048478, - "balance_loss_clip": 1.03358936, - "balance_loss_mlp": 1.04518783, - "epoch": 0.13942582293702088, - "flos": 70771063731840.0, - "grad_norm": 0.854553938374386, - "language_loss": 0.59004617, - "learning_rate": 3.87570271752497e-06, - "loss": 0.61102188, - "num_input_tokens_seen": 50245160, - "step": 2319, - "time_per_iteration": 3.2631640434265137 - }, - { - "auxiliary_loss_clip": 0.0111484, - "auxiliary_loss_mlp": 0.01055367, - "balance_loss_clip": 1.04508984, - "balance_loss_mlp": 1.03437412, - "epoch": 0.13948594618968888, - "flos": 35590885920000.0, - "grad_norm": 2.3313836691947722, - "language_loss": 0.64993447, - "learning_rate": 3.875567524264967e-06, - "loss": 0.67163646, - "num_input_tokens_seen": 50268215, - "step": 2320, - "time_per_iteration": 2.8668782711029053 - }, - { - "auxiliary_loss_clip": 0.01096421, - "auxiliary_loss_mlp": 0.01056652, - "balance_loss_clip": 1.04400086, - "balance_loss_mlp": 1.03521848, - "epoch": 0.13954606944235684, - "flos": 21105204272640.0, - "grad_norm": 2.285151015895421, - "language_loss": 0.70708811, - "learning_rate": 3.875432259883256e-06, - "loss": 0.72861886, - "num_input_tokens_seen": 50288575, - "step": 2321, - "time_per_iteration": 2.8273603916168213 - }, - { - "auxiliary_loss_clip": 0.01117698, - "auxiliary_loss_mlp": 0.01061754, - "balance_loss_clip": 1.04603076, - "balance_loss_mlp": 1.03698206, - "epoch": 0.1396061926950248, - "flos": 25044425009280.0, - "grad_norm": 1.7926270181208543, - "language_loss": 0.85931206, - "learning_rate": 3.875296924384965e-06, - "loss": 0.88110662, - "num_input_tokens_seen": 50308735, - "step": 2322, - "time_per_iteration": 2.833807945251465 - }, - { - "auxiliary_loss_clip": 0.01120545, - "auxiliary_loss_mlp": 0.01055036, - "balance_loss_clip": 1.04616976, - "balance_loss_mlp": 1.03568828, - "epoch": 0.13966631594769277, - "flos": 37634023428480.0, - "grad_norm": 1.5963293576391182, - "language_loss": 0.67159557, - "learning_rate": 3.875161517775226e-06, - "loss": 0.69335139, - "num_input_tokens_seen": 50331025, - "step": 2323, - "time_per_iteration": 2.875265121459961 - }, - { - "auxiliary_loss_clip": 0.01127992, - "auxiliary_loss_mlp": 0.01055173, - "balance_loss_clip": 1.04900301, - "balance_loss_mlp": 1.03432369, - "epoch": 0.13972643920036074, - "flos": 16690993061760.0, - "grad_norm": 2.0757452253793485, - "language_loss": 0.88878977, - "learning_rate": 3.875026040059175e-06, - "loss": 0.9106214, - "num_input_tokens_seen": 50349725, - "step": 2324, - "time_per_iteration": 2.6841063499450684 - }, - { - "auxiliary_loss_clip": 0.01154799, - "auxiliary_loss_mlp": 0.01056834, - "balance_loss_clip": 1.05145955, - "balance_loss_mlp": 1.03541231, - "epoch": 0.1397865624530287, - "flos": 23331055288320.0, - "grad_norm": 2.8450589371660526, - "language_loss": 0.70621002, - "learning_rate": 3.8748904912419485e-06, - "loss": 0.72832638, - "num_input_tokens_seen": 50367965, - "step": 2325, - "time_per_iteration": 2.694218397140503 - }, - { - "auxiliary_loss_clip": 0.01134393, - "auxiliary_loss_mlp": 0.00778751, - "balance_loss_clip": 1.05273592, - "balance_loss_mlp": 1.00028229, - "epoch": 0.13984668570569667, - "flos": 22778317825920.0, - "grad_norm": 2.230299294128946, - "language_loss": 0.81657004, - "learning_rate": 3.874754871328688e-06, - "loss": 0.83570141, - "num_input_tokens_seen": 50385605, - "step": 2326, - "time_per_iteration": 2.715306282043457 - }, - { - "auxiliary_loss_clip": 0.01151297, - "auxiliary_loss_mlp": 0.01045813, - "balance_loss_clip": 1.05490732, - "balance_loss_mlp": 1.02745473, - "epoch": 0.13990680895836466, - "flos": 19464553635840.0, - "grad_norm": 1.729713540462037, - "language_loss": 0.89241689, - "learning_rate": 3.874619180324534e-06, - "loss": 0.91438794, - "num_input_tokens_seen": 50403985, - "step": 2327, - "time_per_iteration": 2.679626941680908 - }, - { - "auxiliary_loss_clip": 0.01119996, - "auxiliary_loss_mlp": 0.01057397, - "balance_loss_clip": 1.04873121, - "balance_loss_mlp": 1.0352242, - "epoch": 0.13996693221103262, - "flos": 20303283185280.0, - "grad_norm": 2.9217951598838363, - "language_loss": 0.84760427, - "learning_rate": 3.874483418234632e-06, - "loss": 0.86937821, - "num_input_tokens_seen": 50421590, - "step": 2328, - "time_per_iteration": 2.7277352809906006 - }, - { - "auxiliary_loss_clip": 0.01151775, - "auxiliary_loss_mlp": 0.0104443, - "balance_loss_clip": 1.05300856, - "balance_loss_mlp": 1.02421176, - "epoch": 0.1400270554637006, - "flos": 26617707688320.0, - "grad_norm": 1.6116398320348613, - "language_loss": 0.73835862, - "learning_rate": 3.874347585064131e-06, - "loss": 0.76032066, - "num_input_tokens_seen": 50443945, - "step": 2329, - "time_per_iteration": 2.6911025047302246 - }, - { - "auxiliary_loss_clip": 0.01153137, - "auxiliary_loss_mlp": 0.01046755, - "balance_loss_clip": 1.05254042, - "balance_loss_mlp": 1.02644169, - "epoch": 0.14008717871636855, - "flos": 19391475415680.0, - "grad_norm": 2.565670250114109, - "language_loss": 0.78373277, - "learning_rate": 3.874211680818183e-06, - "loss": 0.80573165, - "num_input_tokens_seen": 50462065, - "step": 2330, - "time_per_iteration": 2.703225612640381 - }, - { - "auxiliary_loss_clip": 0.01144455, - "auxiliary_loss_mlp": 0.01046085, - "balance_loss_clip": 1.05247569, - "balance_loss_mlp": 1.02692819, - "epoch": 0.14014730196903652, - "flos": 15304266645120.0, - "grad_norm": 2.2215524337864143, - "language_loss": 0.72115719, - "learning_rate": 3.87407570550194e-06, - "loss": 0.74306256, - "num_input_tokens_seen": 50479565, - "step": 2331, - "time_per_iteration": 2.7044217586517334 - }, - { - "auxiliary_loss_clip": 0.01159691, - "auxiliary_loss_mlp": 0.01051771, - "balance_loss_clip": 1.0558939, - "balance_loss_mlp": 1.03234017, - "epoch": 0.14020742522170448, - "flos": 14939701557120.0, - "grad_norm": 1.5806705357110964, - "language_loss": 0.72634697, - "learning_rate": 3.873939659120557e-06, - "loss": 0.7484616, - "num_input_tokens_seen": 50497305, - "step": 2332, - "time_per_iteration": 2.647564649581909 - }, - { - "auxiliary_loss_clip": 0.01063058, - "auxiliary_loss_mlp": 0.01022564, - "balance_loss_clip": 1.03391051, - "balance_loss_mlp": 1.01944101, - "epoch": 0.14026754847437245, - "flos": 48824580044160.0, - "grad_norm": 0.8445516092095569, - "language_loss": 0.56185365, - "learning_rate": 3.873803541679196e-06, - "loss": 0.58270991, - "num_input_tokens_seen": 50549735, - "step": 2333, - "time_per_iteration": 3.038390636444092 - }, - { - "auxiliary_loss_clip": 0.01127793, - "auxiliary_loss_mlp": 0.01045888, - "balance_loss_clip": 1.05246043, - "balance_loss_mlp": 1.02587318, - "epoch": 0.14032767172704044, - "flos": 25773267876480.0, - "grad_norm": 1.7702774265545234, - "language_loss": 0.82728767, - "learning_rate": 3.873667353183016e-06, - "loss": 0.84902453, - "num_input_tokens_seen": 50570100, - "step": 2334, - "time_per_iteration": 2.7205803394317627 - }, - { - "auxiliary_loss_clip": 0.01129244, - "auxiliary_loss_mlp": 0.01044663, - "balance_loss_clip": 1.05110407, - "balance_loss_mlp": 1.02593565, - "epoch": 0.1403877949797084, - "flos": 21216312017280.0, - "grad_norm": 1.7790720657464538, - "language_loss": 0.80958998, - "learning_rate": 3.8735310936371825e-06, - "loss": 0.83132899, - "num_input_tokens_seen": 50589185, - "step": 2335, - "time_per_iteration": 2.7844314575195312 - }, - { - "auxiliary_loss_clip": 0.01108373, - "auxiliary_loss_mlp": 0.0104374, - "balance_loss_clip": 1.04802513, - "balance_loss_mlp": 1.02160311, - "epoch": 0.14044791823237637, - "flos": 22747973811840.0, - "grad_norm": 1.739505291070366, - "language_loss": 0.81987065, - "learning_rate": 3.873394763046862e-06, - "loss": 0.84139174, - "num_input_tokens_seen": 50609645, - "step": 2336, - "time_per_iteration": 2.7787351608276367 - }, - { - "auxiliary_loss_clip": 0.01150445, - "auxiliary_loss_mlp": 0.01046319, - "balance_loss_clip": 1.05603921, - "balance_loss_mlp": 1.02709103, - "epoch": 0.14050804148504434, - "flos": 22964443125120.0, - "grad_norm": 1.7584048007565314, - "language_loss": 0.80606967, - "learning_rate": 3.873258361417225e-06, - "loss": 0.82803738, - "num_input_tokens_seen": 50628385, - "step": 2337, - "time_per_iteration": 2.6119275093078613 - }, - { - "auxiliary_loss_clip": 0.01150898, - "auxiliary_loss_mlp": 0.01051074, - "balance_loss_clip": 1.05363941, - "balance_loss_mlp": 1.03202438, - "epoch": 0.1405681647377123, - "flos": 22200336080640.0, - "grad_norm": 2.383737065589604, - "language_loss": 0.78994334, - "learning_rate": 3.873121888753442e-06, - "loss": 0.81196302, - "num_input_tokens_seen": 50647260, - "step": 2338, - "time_per_iteration": 2.672427177429199 - }, - { - "auxiliary_loss_clip": 0.01158377, - "auxiliary_loss_mlp": 0.01050168, - "balance_loss_clip": 1.05894089, - "balance_loss_mlp": 1.02919865, - "epoch": 0.14062828799038027, - "flos": 23732787974400.0, - "grad_norm": 2.117725014058833, - "language_loss": 0.79766536, - "learning_rate": 3.87298534506069e-06, - "loss": 0.81975079, - "num_input_tokens_seen": 50666130, - "step": 2339, - "time_per_iteration": 2.68635892868042 - }, - { - "auxiliary_loss_clip": 0.01097095, - "auxiliary_loss_mlp": 0.01065327, - "balance_loss_clip": 1.04686952, - "balance_loss_mlp": 1.04463232, - "epoch": 0.14068841124304826, - "flos": 39202493685120.0, - "grad_norm": 2.0269377249156793, - "language_loss": 0.65632963, - "learning_rate": 3.872848730344146e-06, - "loss": 0.67795384, - "num_input_tokens_seen": 50687440, - "step": 2340, - "time_per_iteration": 2.9426286220550537 - }, - { - "auxiliary_loss_clip": 0.0114865, - "auxiliary_loss_mlp": 0.01050723, - "balance_loss_clip": 1.05418086, - "balance_loss_mlp": 1.0310297, - "epoch": 0.14074853449571623, - "flos": 20192283181440.0, - "grad_norm": 2.8518792803213917, - "language_loss": 0.78760445, - "learning_rate": 3.87271204460899e-06, - "loss": 0.80959821, - "num_input_tokens_seen": 50704030, - "step": 2341, - "time_per_iteration": 2.8814899921417236 - }, - { - "auxiliary_loss_clip": 0.01162758, - "auxiliary_loss_mlp": 0.01057334, - "balance_loss_clip": 1.0554986, - "balance_loss_mlp": 1.03876162, - "epoch": 0.1408086577483842, - "flos": 18405871153920.0, - "grad_norm": 2.2693198584224454, - "language_loss": 0.80322361, - "learning_rate": 3.8725752878604066e-06, - "loss": 0.82542449, - "num_input_tokens_seen": 50723305, - "step": 2342, - "time_per_iteration": 2.604814291000366 - }, - { - "auxiliary_loss_clip": 0.01152048, - "auxiliary_loss_mlp": 0.01056552, - "balance_loss_clip": 1.05776191, - "balance_loss_mlp": 1.03858757, - "epoch": 0.14086878100105216, - "flos": 25264593423360.0, - "grad_norm": 2.4727499245104343, - "language_loss": 0.77686632, - "learning_rate": 3.87243846010358e-06, - "loss": 0.79895234, - "num_input_tokens_seen": 50743270, - "step": 2343, - "time_per_iteration": 2.676823854446411 - }, - { - "auxiliary_loss_clip": 0.0105659, - "auxiliary_loss_mlp": 0.01037584, - "balance_loss_clip": 1.03650093, - "balance_loss_mlp": 1.03438878, - "epoch": 0.14092890425372012, - "flos": 65978388869760.0, - "grad_norm": 0.8521752699932517, - "language_loss": 0.61553669, - "learning_rate": 3.872301561343699e-06, - "loss": 0.63647842, - "num_input_tokens_seen": 50802710, - "step": 2344, - "time_per_iteration": 3.156792402267456 - }, - { - "auxiliary_loss_clip": 0.01147637, - "auxiliary_loss_mlp": 0.01049362, - "balance_loss_clip": 1.05167484, - "balance_loss_mlp": 1.03121877, - "epoch": 0.1409890275063881, - "flos": 23694973931520.0, - "grad_norm": 1.558783678159347, - "language_loss": 0.64331692, - "learning_rate": 3.872164591585956e-06, - "loss": 0.6652869, - "num_input_tokens_seen": 50822625, - "step": 2345, - "time_per_iteration": 2.654100179672241 - }, - { - "auxiliary_loss_clip": 0.01154879, - "auxiliary_loss_mlp": 0.0104633, - "balance_loss_clip": 1.05009735, - "balance_loss_mlp": 1.02562308, - "epoch": 0.14104915075905605, - "flos": 23623152687360.0, - "grad_norm": 2.26337760563351, - "language_loss": 0.73892581, - "learning_rate": 3.8720275508355435e-06, - "loss": 0.76093793, - "num_input_tokens_seen": 50842330, - "step": 2346, - "time_per_iteration": 2.7032830715179443 - }, - { - "auxiliary_loss_clip": 0.0115447, - "auxiliary_loss_mlp": 0.01048793, - "balance_loss_clip": 1.0572027, - "balance_loss_mlp": 1.02929008, - "epoch": 0.14110927401172405, - "flos": 20595165102720.0, - "grad_norm": 1.7675181118684058, - "language_loss": 0.7727294, - "learning_rate": 3.8718904390976585e-06, - "loss": 0.79476202, - "num_input_tokens_seen": 50861035, - "step": 2347, - "time_per_iteration": 2.678647518157959 - }, - { - "auxiliary_loss_clip": 0.01164131, - "auxiliary_loss_mlp": 0.01052088, - "balance_loss_clip": 1.05490732, - "balance_loss_mlp": 1.03370619, - "epoch": 0.141169397264392, - "flos": 28548049512960.0, - "grad_norm": 2.592464695784388, - "language_loss": 0.76753062, - "learning_rate": 3.8717532563775e-06, - "loss": 0.78969282, - "num_input_tokens_seen": 50880105, - "step": 2348, - "time_per_iteration": 2.7450597286224365 - }, - { - "auxiliary_loss_clip": 0.01147264, - "auxiliary_loss_mlp": 0.01042525, - "balance_loss_clip": 1.05267334, - "balance_loss_mlp": 1.02295136, - "epoch": 0.14122952051705998, - "flos": 17092258871040.0, - "grad_norm": 1.8617784303344698, - "language_loss": 0.86794335, - "learning_rate": 3.871616002680272e-06, - "loss": 0.8898412, - "num_input_tokens_seen": 50897720, - "step": 2349, - "time_per_iteration": 2.662508964538574 - }, - { - "auxiliary_loss_clip": 0.01150971, - "auxiliary_loss_mlp": 0.01048616, - "balance_loss_clip": 1.05632985, - "balance_loss_mlp": 1.02897048, - "epoch": 0.14128964376972794, - "flos": 28946801370240.0, - "grad_norm": 2.650060051711467, - "language_loss": 0.88758218, - "learning_rate": 3.871478678011177e-06, - "loss": 0.90957808, - "num_input_tokens_seen": 50918385, - "step": 2350, - "time_per_iteration": 4.1697962284088135 - }, - { - "auxiliary_loss_clip": 0.01142704, - "auxiliary_loss_mlp": 0.01045134, - "balance_loss_clip": 1.05369377, - "balance_loss_mlp": 1.02442729, - "epoch": 0.1413497670223959, - "flos": 18989778643200.0, - "grad_norm": 1.801090232061166, - "language_loss": 0.8094542, - "learning_rate": 3.871341282375423e-06, - "loss": 0.83133256, - "num_input_tokens_seen": 50938270, - "step": 2351, - "time_per_iteration": 2.6769907474517822 - }, - { - "auxiliary_loss_clip": 0.01149546, - "auxiliary_loss_mlp": 0.01040141, - "balance_loss_clip": 1.05100775, - "balance_loss_mlp": 1.02096045, - "epoch": 0.14140989027506387, - "flos": 29862236413440.0, - "grad_norm": 2.590933181784672, - "language_loss": 0.82796198, - "learning_rate": 3.871203815778219e-06, - "loss": 0.84985888, - "num_input_tokens_seen": 50958155, - "step": 2352, - "time_per_iteration": 5.713203430175781 - }, - { - "auxiliary_loss_clip": 0.01063742, - "auxiliary_loss_mlp": 0.01009803, - "balance_loss_clip": 1.03462291, - "balance_loss_mlp": 1.0060122, - "epoch": 0.14147001352773186, - "flos": 62079532041600.0, - "grad_norm": 0.9118003008214054, - "language_loss": 0.61876011, - "learning_rate": 3.87106627822478e-06, - "loss": 0.63949555, - "num_input_tokens_seen": 51020705, - "step": 2353, - "time_per_iteration": 3.1698319911956787 - }, - { - "auxiliary_loss_clip": 0.01134069, - "auxiliary_loss_mlp": 0.01049094, - "balance_loss_clip": 1.0536828, - "balance_loss_mlp": 1.03039002, - "epoch": 0.14153013678039983, - "flos": 22017514832640.0, - "grad_norm": 1.5909284402791886, - "language_loss": 0.87075388, - "learning_rate": 3.8709286697203196e-06, - "loss": 0.89258552, - "num_input_tokens_seen": 51039995, - "step": 2354, - "time_per_iteration": 2.6781272888183594 - }, - { - "auxiliary_loss_clip": 0.01124592, - "auxiliary_loss_mlp": 0.0104583, - "balance_loss_clip": 1.0527302, - "balance_loss_mlp": 1.02562428, - "epoch": 0.1415902600330678, - "flos": 19720093968000.0, - "grad_norm": 2.035812967878614, - "language_loss": 0.74701214, - "learning_rate": 3.870790990270057e-06, - "loss": 0.76871634, - "num_input_tokens_seen": 51059075, - "step": 2355, - "time_per_iteration": 4.464852571487427 - }, - { - "auxiliary_loss_clip": 0.01062228, - "auxiliary_loss_mlp": 0.01003337, - "balance_loss_clip": 1.03320074, - "balance_loss_mlp": 0.99947417, - "epoch": 0.14165038328573576, - "flos": 65900929190400.0, - "grad_norm": 0.6801443738216844, - "language_loss": 0.51819825, - "learning_rate": 3.870653239879212e-06, - "loss": 0.53885388, - "num_input_tokens_seen": 51120380, - "step": 2356, - "time_per_iteration": 3.094026803970337 - }, - { - "auxiliary_loss_clip": 0.01165635, - "auxiliary_loss_mlp": 0.01057535, - "balance_loss_clip": 1.05662966, - "balance_loss_mlp": 1.0379492, - "epoch": 0.14171050653840372, - "flos": 12130158533760.0, - "grad_norm": 1.9928903491175036, - "language_loss": 0.70598352, - "learning_rate": 3.8705154185530095e-06, - "loss": 0.72821522, - "num_input_tokens_seen": 51136950, - "step": 2357, - "time_per_iteration": 2.569486141204834 - }, - { - "auxiliary_loss_clip": 0.01117022, - "auxiliary_loss_mlp": 0.01054948, - "balance_loss_clip": 1.04706419, - "balance_loss_mlp": 1.0355413, - "epoch": 0.1417706297910717, - "flos": 20412487509120.0, - "grad_norm": 2.1046358800035234, - "language_loss": 0.82020235, - "learning_rate": 3.870377526296674e-06, - "loss": 0.84192204, - "num_input_tokens_seen": 51155175, - "step": 2358, - "time_per_iteration": 2.719344139099121 - }, - { - "auxiliary_loss_clip": 0.01145283, - "auxiliary_loss_mlp": 0.01050239, - "balance_loss_clip": 1.05257189, - "balance_loss_mlp": 1.02932954, - "epoch": 0.14183075304373965, - "flos": 22380607463040.0, - "grad_norm": 2.2336131404929787, - "language_loss": 0.71575904, - "learning_rate": 3.870239563115436e-06, - "loss": 0.73771417, - "num_input_tokens_seen": 51174500, - "step": 2359, - "time_per_iteration": 2.6914820671081543 - }, - { - "auxiliary_loss_clip": 0.0111529, - "auxiliary_loss_mlp": 0.007787, - "balance_loss_clip": 1.0526464, - "balance_loss_mlp": 1.00033379, - "epoch": 0.14189087629640765, - "flos": 21580913018880.0, - "grad_norm": 2.4314273775499906, - "language_loss": 0.7541784, - "learning_rate": 3.870101529014526e-06, - "loss": 0.77311832, - "num_input_tokens_seen": 51194270, - "step": 2360, - "time_per_iteration": 2.803493022918701 - }, - { - "auxiliary_loss_clip": 0.01108644, - "auxiliary_loss_mlp": 0.01053684, - "balance_loss_clip": 1.0491271, - "balance_loss_mlp": 1.03136814, - "epoch": 0.1419509995490756, - "flos": 20008564093440.0, - "grad_norm": 2.374719540518049, - "language_loss": 0.81920552, - "learning_rate": 3.869963423999178e-06, - "loss": 0.84082878, - "num_input_tokens_seen": 51211850, - "step": 2361, - "time_per_iteration": 2.8039920330047607 - }, - { - "auxiliary_loss_clip": 0.0115065, - "auxiliary_loss_mlp": 0.01057946, - "balance_loss_clip": 1.05230403, - "balance_loss_mlp": 1.03802609, - "epoch": 0.14201112280174358, - "flos": 31941464112000.0, - "grad_norm": 1.9397979109407166, - "language_loss": 0.74081504, - "learning_rate": 3.86982524807463e-06, - "loss": 0.76290095, - "num_input_tokens_seen": 51233545, - "step": 2362, - "time_per_iteration": 2.7272114753723145 - }, - { - "auxiliary_loss_clip": 0.0115354, - "auxiliary_loss_mlp": 0.01048321, - "balance_loss_clip": 1.05355787, - "balance_loss_mlp": 1.02861547, - "epoch": 0.14207124605441154, - "flos": 41464147582080.0, - "grad_norm": 1.7489521991344694, - "language_loss": 0.74221587, - "learning_rate": 3.869687001246122e-06, - "loss": 0.76423442, - "num_input_tokens_seen": 51257615, - "step": 2363, - "time_per_iteration": 2.789802312850952 - }, - { - "auxiliary_loss_clip": 0.01128802, - "auxiliary_loss_mlp": 0.0105205, - "balance_loss_clip": 1.04769099, - "balance_loss_mlp": 1.03180885, - "epoch": 0.1421313693070795, - "flos": 31905086613120.0, - "grad_norm": 1.7832713632097879, - "language_loss": 0.73034167, - "learning_rate": 3.8695486835188946e-06, - "loss": 0.75215018, - "num_input_tokens_seen": 51279645, - "step": 2364, - "time_per_iteration": 2.8508312702178955 - }, - { - "auxiliary_loss_clip": 0.01142769, - "auxiliary_loss_mlp": 0.01049829, - "balance_loss_clip": 1.05160844, - "balance_loss_mlp": 1.03207827, - "epoch": 0.14219149255974747, - "flos": 26871165031680.0, - "grad_norm": 1.875477198706701, - "language_loss": 0.90395916, - "learning_rate": 3.869410294898195e-06, - "loss": 0.92588514, - "num_input_tokens_seen": 51299775, - "step": 2365, - "time_per_iteration": 2.6807806491851807 - }, - { - "auxiliary_loss_clip": 0.01127252, - "auxiliary_loss_mlp": 0.01054912, - "balance_loss_clip": 1.04759967, - "balance_loss_mlp": 1.03394318, - "epoch": 0.14225161581241544, - "flos": 27454426076160.0, - "grad_norm": 1.719218863067841, - "language_loss": 0.65305161, - "learning_rate": 3.869271835389268e-06, - "loss": 0.67487329, - "num_input_tokens_seen": 51319430, - "step": 2366, - "time_per_iteration": 2.7293641567230225 - }, - { - "auxiliary_loss_clip": 0.01143576, - "auxiliary_loss_mlp": 0.01051629, - "balance_loss_clip": 1.05218709, - "balance_loss_mlp": 1.03058839, - "epoch": 0.14231173906508343, - "flos": 10561436881920.0, - "grad_norm": 2.3740196514966256, - "language_loss": 0.80331928, - "learning_rate": 3.8691333049973665e-06, - "loss": 0.82527137, - "num_input_tokens_seen": 51336045, - "step": 2367, - "time_per_iteration": 2.67529296875 - }, - { - "auxiliary_loss_clip": 0.01138517, - "auxiliary_loss_mlp": 0.01062653, - "balance_loss_clip": 1.05117869, - "balance_loss_mlp": 1.0402534, - "epoch": 0.1423718623177514, - "flos": 28360882719360.0, - "grad_norm": 2.0081973718426283, - "language_loss": 0.82346755, - "learning_rate": 3.868994703727742e-06, - "loss": 0.84547925, - "num_input_tokens_seen": 51357030, - "step": 2368, - "time_per_iteration": 2.7447288036346436 - }, - { - "auxiliary_loss_clip": 0.01122755, - "auxiliary_loss_mlp": 0.01052229, - "balance_loss_clip": 1.05180073, - "balance_loss_mlp": 1.03065228, - "epoch": 0.14243198557041936, - "flos": 19354235990400.0, - "grad_norm": 2.6586279461428144, - "language_loss": 0.8711772, - "learning_rate": 3.868856031585652e-06, - "loss": 0.89292705, - "num_input_tokens_seen": 51374890, - "step": 2369, - "time_per_iteration": 2.736872673034668 - }, - { - "auxiliary_loss_clip": 0.01127301, - "auxiliary_loss_mlp": 0.0104182, - "balance_loss_clip": 1.05011857, - "balance_loss_mlp": 1.02170992, - "epoch": 0.14249210882308733, - "flos": 28806857982720.0, - "grad_norm": 1.7900856007188275, - "language_loss": 0.75828248, - "learning_rate": 3.868717288576354e-06, - "loss": 0.77997375, - "num_input_tokens_seen": 51398100, - "step": 2370, - "time_per_iteration": 2.762603998184204 - }, - { - "auxiliary_loss_clip": 0.01158195, - "auxiliary_loss_mlp": 0.00781098, - "balance_loss_clip": 1.05268764, - "balance_loss_mlp": 1.00028419, - "epoch": 0.1425522320757553, - "flos": 21835016807040.0, - "grad_norm": 1.7770434161065212, - "language_loss": 0.82934797, - "learning_rate": 3.868578474705109e-06, - "loss": 0.84874088, - "num_input_tokens_seen": 51418745, - "step": 2371, - "time_per_iteration": 2.6224656105041504 - }, - { - "auxiliary_loss_clip": 0.01173447, - "auxiliary_loss_mlp": 0.0105718, - "balance_loss_clip": 1.05837953, - "balance_loss_mlp": 1.03638947, - "epoch": 0.14261235532842326, - "flos": 17311457617920.0, - "grad_norm": 2.0431625041319825, - "language_loss": 0.82982123, - "learning_rate": 3.868439589977181e-06, - "loss": 0.85212755, - "num_input_tokens_seen": 51437455, - "step": 2372, - "time_per_iteration": 2.575690269470215 - }, - { - "auxiliary_loss_clip": 0.01172196, - "auxiliary_loss_mlp": 0.0105022, - "balance_loss_clip": 1.0581125, - "balance_loss_mlp": 1.0285356, - "epoch": 0.14267247858109125, - "flos": 18806741913600.0, - "grad_norm": 3.3704326167450582, - "language_loss": 0.8438468, - "learning_rate": 3.868300634397836e-06, - "loss": 0.86607099, - "num_input_tokens_seen": 51455710, - "step": 2373, - "time_per_iteration": 2.7160356044769287 - }, - { - "auxiliary_loss_clip": 0.01141742, - "auxiliary_loss_mlp": 0.01055295, - "balance_loss_clip": 1.05160809, - "balance_loss_mlp": 1.03598261, - "epoch": 0.14273260183375922, - "flos": 11358904682880.0, - "grad_norm": 3.5035356392631836, - "language_loss": 0.86027539, - "learning_rate": 3.8681616079723445e-06, - "loss": 0.88224572, - "num_input_tokens_seen": 51471270, - "step": 2374, - "time_per_iteration": 2.6845595836639404 - }, - { - "auxiliary_loss_clip": 0.01164623, - "auxiliary_loss_mlp": 0.01061957, - "balance_loss_clip": 1.05515146, - "balance_loss_mlp": 1.03996301, - "epoch": 0.14279272508642718, - "flos": 27567688636800.0, - "grad_norm": 1.6059368749673757, - "language_loss": 0.79169822, - "learning_rate": 3.868022510705977e-06, - "loss": 0.81396401, - "num_input_tokens_seen": 51492705, - "step": 2375, - "time_per_iteration": 2.738156795501709 - }, - { - "auxiliary_loss_clip": 0.01163115, - "auxiliary_loss_mlp": 0.01058224, - "balance_loss_clip": 1.05641222, - "balance_loss_mlp": 1.0368259, - "epoch": 0.14285284833909515, - "flos": 16252559654400.0, - "grad_norm": 2.559097553272684, - "language_loss": 0.76907504, - "learning_rate": 3.867883342604009e-06, - "loss": 0.79128844, - "num_input_tokens_seen": 51510780, - "step": 2376, - "time_per_iteration": 2.751178741455078 - }, - { - "auxiliary_loss_clip": 0.01160115, - "auxiliary_loss_mlp": 0.0105168, - "balance_loss_clip": 1.054515, - "balance_loss_mlp": 1.03040111, - "epoch": 0.1429129715917631, - "flos": 19755609540480.0, - "grad_norm": 2.7331999261828592, - "language_loss": 0.92795181, - "learning_rate": 3.867744103671717e-06, - "loss": 0.95006979, - "num_input_tokens_seen": 51531400, - "step": 2377, - "time_per_iteration": 2.6584725379943848 - }, - { - "auxiliary_loss_clip": 0.01147246, - "auxiliary_loss_mlp": 0.01061419, - "balance_loss_clip": 1.05362535, - "balance_loss_mlp": 1.03793442, - "epoch": 0.14297309484443108, - "flos": 21137092571520.0, - "grad_norm": 2.9252003733204894, - "language_loss": 0.91754365, - "learning_rate": 3.867604793914382e-06, - "loss": 0.93963027, - "num_input_tokens_seen": 51548215, - "step": 2378, - "time_per_iteration": 2.8107075691223145 - }, - { - "auxiliary_loss_clip": 0.01164153, - "auxiliary_loss_mlp": 0.0105303, - "balance_loss_clip": 1.05712187, - "balance_loss_mlp": 1.03092849, - "epoch": 0.14303321809709904, - "flos": 23586667447680.0, - "grad_norm": 2.1292902842232966, - "language_loss": 0.73961306, - "learning_rate": 3.8674654133372864e-06, - "loss": 0.76178491, - "num_input_tokens_seen": 51566820, - "step": 2379, - "time_per_iteration": 2.7029881477355957 - }, - { - "auxiliary_loss_clip": 0.01137551, - "auxiliary_loss_mlp": 0.01055012, - "balance_loss_clip": 1.05204058, - "balance_loss_mlp": 1.0330174, - "epoch": 0.14309334134976703, - "flos": 15888281875200.0, - "grad_norm": 2.1898245228218784, - "language_loss": 0.78818595, - "learning_rate": 3.867325961945714e-06, - "loss": 0.81011152, - "num_input_tokens_seen": 51585075, - "step": 2380, - "time_per_iteration": 2.7213294506073 - }, - { - "auxiliary_loss_clip": 0.01126442, - "auxiliary_loss_mlp": 0.01057409, - "balance_loss_clip": 1.05457354, - "balance_loss_mlp": 1.03580785, - "epoch": 0.143153464602435, - "flos": 16325601960960.0, - "grad_norm": 4.699041640805274, - "language_loss": 0.87895483, - "learning_rate": 3.867186439744955e-06, - "loss": 0.90079331, - "num_input_tokens_seen": 51603185, - "step": 2381, - "time_per_iteration": 2.7144110202789307 - }, - { - "auxiliary_loss_clip": 0.01141327, - "auxiliary_loss_mlp": 0.01052708, - "balance_loss_clip": 1.05200005, - "balance_loss_mlp": 1.03088117, - "epoch": 0.14321358785510296, - "flos": 17092079303040.0, - "grad_norm": 2.47508592106904, - "language_loss": 0.76396096, - "learning_rate": 3.867046846740299e-06, - "loss": 0.78590137, - "num_input_tokens_seen": 51620880, - "step": 2382, - "time_per_iteration": 2.6185953617095947 - }, - { - "auxiliary_loss_clip": 0.01132222, - "auxiliary_loss_mlp": 0.01054019, - "balance_loss_clip": 1.05162048, - "balance_loss_mlp": 1.03319359, - "epoch": 0.14327371110777093, - "flos": 26322916769280.0, - "grad_norm": 4.3017095308344375, - "language_loss": 0.76636785, - "learning_rate": 3.866907182937039e-06, - "loss": 0.7882303, - "num_input_tokens_seen": 51640170, - "step": 2383, - "time_per_iteration": 2.7408525943756104 - }, - { - "auxiliary_loss_clip": 0.01139698, - "auxiliary_loss_mlp": 0.01052888, - "balance_loss_clip": 1.05078864, - "balance_loss_mlp": 1.02926064, - "epoch": 0.1433338343604389, - "flos": 18076462502400.0, - "grad_norm": 2.3526544982502284, - "language_loss": 0.87649417, - "learning_rate": 3.866767448340471e-06, - "loss": 0.8984201, - "num_input_tokens_seen": 51656580, - "step": 2384, - "time_per_iteration": 2.6798789501190186 - }, - { - "auxiliary_loss_clip": 0.01164805, - "auxiliary_loss_mlp": 0.01053206, - "balance_loss_clip": 1.05644679, - "balance_loss_mlp": 1.02985239, - "epoch": 0.14339395761310686, - "flos": 15522783033600.0, - "grad_norm": 2.6134761315069284, - "language_loss": 0.79340684, - "learning_rate": 3.866627642955895e-06, - "loss": 0.81558692, - "num_input_tokens_seen": 51674645, - "step": 2385, - "time_per_iteration": 2.5856544971466064 - }, - { - "auxiliary_loss_clip": 0.01156607, - "auxiliary_loss_mlp": 0.01042784, - "balance_loss_clip": 1.05148256, - "balance_loss_mlp": 1.02182722, - "epoch": 0.14345408086577485, - "flos": 28548767784960.0, - "grad_norm": 2.6990187663653247, - "language_loss": 0.74960196, - "learning_rate": 3.866487766788612e-06, - "loss": 0.77159584, - "num_input_tokens_seen": 51695770, - "step": 2386, - "time_per_iteration": 2.6670751571655273 - }, - { - "auxiliary_loss_clip": 0.01171639, - "auxiliary_loss_mlp": 0.01048096, - "balance_loss_clip": 1.05699563, - "balance_loss_mlp": 1.02733016, - "epoch": 0.14351420411844282, - "flos": 20230061310720.0, - "grad_norm": 2.299870083842227, - "language_loss": 0.78659731, - "learning_rate": 3.866347819843925e-06, - "loss": 0.80879462, - "num_input_tokens_seen": 51714165, - "step": 2387, - "time_per_iteration": 2.5805532932281494 - }, - { - "auxiliary_loss_clip": 0.01140581, - "auxiliary_loss_mlp": 0.01055299, - "balance_loss_clip": 1.05355716, - "balance_loss_mlp": 1.03317428, - "epoch": 0.14357432737111078, - "flos": 19865029345920.0, - "grad_norm": 6.554164509194222, - "language_loss": 0.82492924, - "learning_rate": 3.866207802127143e-06, - "loss": 0.84688807, - "num_input_tokens_seen": 51734440, - "step": 2388, - "time_per_iteration": 2.656609058380127 - }, - { - "auxiliary_loss_clip": 0.01155007, - "auxiliary_loss_mlp": 0.01047154, - "balance_loss_clip": 1.0537287, - "balance_loss_mlp": 1.02674508, - "epoch": 0.14363445062377875, - "flos": 28256814040320.0, - "grad_norm": 2.5973624291758655, - "language_loss": 0.82025754, - "learning_rate": 3.866067713643573e-06, - "loss": 0.84227914, - "num_input_tokens_seen": 51753730, - "step": 2389, - "time_per_iteration": 4.21793794631958 - }, - { - "auxiliary_loss_clip": 0.01145665, - "auxiliary_loss_mlp": 0.01046852, - "balance_loss_clip": 1.05107975, - "balance_loss_mlp": 1.02513266, - "epoch": 0.1436945738764467, - "flos": 18186672407040.0, - "grad_norm": 3.7970835440683097, - "language_loss": 0.83056784, - "learning_rate": 3.8659275543985285e-06, - "loss": 0.85249299, - "num_input_tokens_seen": 51771195, - "step": 2390, - "time_per_iteration": 2.6859514713287354 - }, - { - "auxiliary_loss_clip": 0.01152608, - "auxiliary_loss_mlp": 0.01054404, - "balance_loss_clip": 1.05400729, - "balance_loss_mlp": 1.0334475, - "epoch": 0.14375469712911468, - "flos": 27307910499840.0, - "grad_norm": 1.8176612067028404, - "language_loss": 0.75018179, - "learning_rate": 3.865787324397324e-06, - "loss": 0.77225184, - "num_input_tokens_seen": 51792290, - "step": 2391, - "time_per_iteration": 5.726900577545166 - }, - { - "auxiliary_loss_clip": 0.01045505, - "auxiliary_loss_mlp": 0.01033342, - "balance_loss_clip": 1.03226101, - "balance_loss_mlp": 1.0303973, - "epoch": 0.14381482038178264, - "flos": 56891445287040.0, - "grad_norm": 0.8787809928903102, - "language_loss": 0.61848003, - "learning_rate": 3.865647023645277e-06, - "loss": 0.63926852, - "num_input_tokens_seen": 51843675, - "step": 2392, - "time_per_iteration": 3.113558053970337 - }, - { - "auxiliary_loss_clip": 0.01158698, - "auxiliary_loss_mlp": 0.01058807, - "balance_loss_clip": 1.05467868, - "balance_loss_mlp": 1.03608608, - "epoch": 0.14387494363445064, - "flos": 14282177143680.0, - "grad_norm": 2.718376715006273, - "language_loss": 0.77346605, - "learning_rate": 3.865506652147709e-06, - "loss": 0.79564106, - "num_input_tokens_seen": 51860285, - "step": 2393, - "time_per_iteration": 2.6578521728515625 - }, - { - "auxiliary_loss_clip": 0.0116951, - "auxiliary_loss_mlp": 0.01052986, - "balance_loss_clip": 1.05671048, - "balance_loss_mlp": 1.03287578, - "epoch": 0.1439350668871186, - "flos": 26761493831040.0, - "grad_norm": 5.715284956255472, - "language_loss": 0.76301813, - "learning_rate": 3.865366209909941e-06, - "loss": 0.78524309, - "num_input_tokens_seen": 51880105, - "step": 2394, - "time_per_iteration": 4.345217943191528 - }, - { - "auxiliary_loss_clip": 0.01165266, - "auxiliary_loss_mlp": 0.01053501, - "balance_loss_clip": 1.05325842, - "balance_loss_mlp": 1.03365326, - "epoch": 0.14399519013978657, - "flos": 40700040537600.0, - "grad_norm": 2.2496244390836893, - "language_loss": 0.85859704, - "learning_rate": 3.8652256969372994e-06, - "loss": 0.88078463, - "num_input_tokens_seen": 51905175, - "step": 2395, - "time_per_iteration": 2.739717483520508 - }, - { - "auxiliary_loss_clip": 0.0112523, - "auxiliary_loss_mlp": 0.01051092, - "balance_loss_clip": 1.04946184, - "balance_loss_mlp": 1.028669, - "epoch": 0.14405531339245453, - "flos": 20557530627840.0, - "grad_norm": 4.117082508421602, - "language_loss": 0.82894099, - "learning_rate": 3.865085113235113e-06, - "loss": 0.85070425, - "num_input_tokens_seen": 51924490, - "step": 2396, - "time_per_iteration": 2.686732053756714 - }, - { - "auxiliary_loss_clip": 0.01126754, - "auxiliary_loss_mlp": 0.00779833, - "balance_loss_clip": 1.04752374, - "balance_loss_mlp": 1.00036597, - "epoch": 0.1441154366451225, - "flos": 19572931946880.0, - "grad_norm": 6.956399779275871, - "language_loss": 0.82801461, - "learning_rate": 3.864944458808712e-06, - "loss": 0.84708053, - "num_input_tokens_seen": 51940490, - "step": 2397, - "time_per_iteration": 2.742809534072876 - }, - { - "auxiliary_loss_clip": 0.01168871, - "auxiliary_loss_mlp": 0.0104994, - "balance_loss_clip": 1.05485702, - "balance_loss_mlp": 1.02892387, - "epoch": 0.14417555989779046, - "flos": 18515721922560.0, - "grad_norm": 8.355198005975433, - "language_loss": 0.8001197, - "learning_rate": 3.86480373366343e-06, - "loss": 0.82230783, - "num_input_tokens_seen": 51957910, - "step": 2398, - "time_per_iteration": 2.573267936706543 - }, - { - "auxiliary_loss_clip": 0.01152449, - "auxiliary_loss_mlp": 0.01053407, - "balance_loss_clip": 1.05287588, - "balance_loss_mlp": 1.03336823, - "epoch": 0.14423568315045843, - "flos": 26031681296640.0, - "grad_norm": 3.294581575970509, - "language_loss": 0.64690518, - "learning_rate": 3.864662937804603e-06, - "loss": 0.66896379, - "num_input_tokens_seen": 51978010, - "step": 2399, - "time_per_iteration": 2.6831774711608887 - }, - { - "auxiliary_loss_clip": 0.01134916, - "auxiliary_loss_mlp": 0.01052493, - "balance_loss_clip": 1.04998159, - "balance_loss_mlp": 1.03119016, - "epoch": 0.14429580640312642, - "flos": 21288743792640.0, - "grad_norm": 3.586256880371596, - "language_loss": 0.82207137, - "learning_rate": 3.864522071237571e-06, - "loss": 0.84394544, - "num_input_tokens_seen": 51998515, - "step": 2400, - "time_per_iteration": 2.6812663078308105 - }, - { - "auxiliary_loss_clip": 0.01149983, - "auxiliary_loss_mlp": 0.01051884, - "balance_loss_clip": 1.0567503, - "balance_loss_mlp": 1.02954376, - "epoch": 0.14435592965579438, - "flos": 25627865621760.0, - "grad_norm": 2.3908005596579165, - "language_loss": 0.74217784, - "learning_rate": 3.864381133967676e-06, - "loss": 0.76419652, - "num_input_tokens_seen": 52019270, - "step": 2401, - "time_per_iteration": 2.773838520050049 - }, - { - "auxiliary_loss_clip": 0.01137207, - "auxiliary_loss_mlp": 0.01047592, - "balance_loss_clip": 1.05065656, - "balance_loss_mlp": 1.02671885, - "epoch": 0.14441605290846235, - "flos": 22965053656320.0, - "grad_norm": 2.616063077702737, - "language_loss": 0.80771816, - "learning_rate": 3.86424012600026e-06, - "loss": 0.82956612, - "num_input_tokens_seen": 52039315, - "step": 2402, - "time_per_iteration": 2.786031723022461 - }, - { - "auxiliary_loss_clip": 0.01120897, - "auxiliary_loss_mlp": 0.01052115, - "balance_loss_clip": 1.04718328, - "balance_loss_mlp": 1.02988231, - "epoch": 0.14447617616113032, - "flos": 17347655548800.0, - "grad_norm": 2.397935571801219, - "language_loss": 0.84159613, - "learning_rate": 3.864099047340673e-06, - "loss": 0.86332625, - "num_input_tokens_seen": 52056555, - "step": 2403, - "time_per_iteration": 2.8113911151885986 - }, - { - "auxiliary_loss_clip": 0.01129082, - "auxiliary_loss_mlp": 0.00783127, - "balance_loss_clip": 1.04854488, - "balance_loss_mlp": 1.00030184, - "epoch": 0.14453629941379828, - "flos": 24060185464320.0, - "grad_norm": 2.224282169770823, - "language_loss": 0.70142806, - "learning_rate": 3.863957897994262e-06, - "loss": 0.72055018, - "num_input_tokens_seen": 52075800, - "step": 2404, - "time_per_iteration": 2.7748003005981445 - }, - { - "auxiliary_loss_clip": 0.01144289, - "auxiliary_loss_mlp": 0.01051404, - "balance_loss_clip": 1.05279732, - "balance_loss_mlp": 1.03099549, - "epoch": 0.14459642266646625, - "flos": 14429554646400.0, - "grad_norm": 2.429117427076043, - "language_loss": 0.73179376, - "learning_rate": 3.863816677966381e-06, - "loss": 0.75375068, - "num_input_tokens_seen": 52092585, - "step": 2405, - "time_per_iteration": 2.7927868366241455 - }, - { - "auxiliary_loss_clip": 0.01108387, - "auxiliary_loss_mlp": 0.01054584, - "balance_loss_clip": 1.04661417, - "balance_loss_mlp": 1.0326612, - "epoch": 0.14465654591913424, - "flos": 9867032179200.0, - "grad_norm": 7.089523066959408, - "language_loss": 0.73039794, - "learning_rate": 3.863675387262386e-06, - "loss": 0.75202763, - "num_input_tokens_seen": 52108990, - "step": 2406, - "time_per_iteration": 2.742253303527832 - }, - { - "auxiliary_loss_clip": 0.01157268, - "auxiliary_loss_mlp": 0.01054465, - "balance_loss_clip": 1.05420268, - "balance_loss_mlp": 1.03198171, - "epoch": 0.1447166691718022, - "flos": 24972926987520.0, - "grad_norm": 5.383630788916188, - "language_loss": 0.75570732, - "learning_rate": 3.8635340258876325e-06, - "loss": 0.77782464, - "num_input_tokens_seen": 52125385, - "step": 2407, - "time_per_iteration": 2.654636859893799 - }, - { - "auxiliary_loss_clip": 0.0116674, - "auxiliary_loss_mlp": 0.01054642, - "balance_loss_clip": 1.05440819, - "balance_loss_mlp": 1.03392315, - "epoch": 0.14477679242447017, - "flos": 21908023200000.0, - "grad_norm": 2.0240540465866146, - "language_loss": 0.79426706, - "learning_rate": 3.8633925938474826e-06, - "loss": 0.81648088, - "num_input_tokens_seen": 52144985, - "step": 2408, - "time_per_iteration": 2.663611650466919 - }, - { - "auxiliary_loss_clip": 0.01155332, - "auxiliary_loss_mlp": 0.01053557, - "balance_loss_clip": 1.05411625, - "balance_loss_mlp": 1.03107429, - "epoch": 0.14483691567713813, - "flos": 20740746925440.0, - "grad_norm": 2.249858190268702, - "language_loss": 0.82188261, - "learning_rate": 3.863251091147299e-06, - "loss": 0.84397143, - "num_input_tokens_seen": 52163885, - "step": 2409, - "time_per_iteration": 2.6218342781066895 - }, - { - "auxiliary_loss_clip": 0.01116852, - "auxiliary_loss_mlp": 0.01065498, - "balance_loss_clip": 1.04859877, - "balance_loss_mlp": 1.04340839, - "epoch": 0.1448970389298061, - "flos": 35407705536000.0, - "grad_norm": 3.918408886138166, - "language_loss": 0.74477464, - "learning_rate": 3.863109517792446e-06, - "loss": 0.76659817, - "num_input_tokens_seen": 52184325, - "step": 2410, - "time_per_iteration": 2.8525002002716064 - }, - { - "auxiliary_loss_clip": 0.01166422, - "auxiliary_loss_mlp": 0.0105028, - "balance_loss_clip": 1.05447876, - "balance_loss_mlp": 1.0300622, - "epoch": 0.14495716218247406, - "flos": 15414368808960.0, - "grad_norm": 2.976325973684052, - "language_loss": 0.81616414, - "learning_rate": 3.8629678737882945e-06, - "loss": 0.8383311, - "num_input_tokens_seen": 52202740, - "step": 2411, - "time_per_iteration": 2.580059051513672 - }, - { - "auxiliary_loss_clip": 0.01143671, - "auxiliary_loss_mlp": 0.01055066, - "balance_loss_clip": 1.05553794, - "balance_loss_mlp": 1.03366852, - "epoch": 0.14501728543514203, - "flos": 33693222493440.0, - "grad_norm": 2.049708152728223, - "language_loss": 0.69947547, - "learning_rate": 3.862826159140214e-06, - "loss": 0.72146285, - "num_input_tokens_seen": 52223100, - "step": 2412, - "time_per_iteration": 2.792389392852783 - }, - { - "auxiliary_loss_clip": 0.01153861, - "auxiliary_loss_mlp": 0.01047504, - "balance_loss_clip": 1.05600309, - "balance_loss_mlp": 1.02669024, - "epoch": 0.14507740868781002, - "flos": 15596112648960.0, - "grad_norm": 1.9741671649406984, - "language_loss": 0.76655865, - "learning_rate": 3.862684373853579e-06, - "loss": 0.78857231, - "num_input_tokens_seen": 52239690, - "step": 2413, - "time_per_iteration": 2.6535370349884033 - }, - { - "auxiliary_loss_clip": 0.01072879, - "auxiliary_loss_mlp": 0.01028499, - "balance_loss_clip": 1.04041791, - "balance_loss_mlp": 1.0252564, - "epoch": 0.145137531940478, - "flos": 66675343438080.0, - "grad_norm": 0.9047547971056389, - "language_loss": 0.58883119, - "learning_rate": 3.8625425179337656e-06, - "loss": 0.60984492, - "num_input_tokens_seen": 52296705, - "step": 2414, - "time_per_iteration": 3.1230342388153076 - }, - { - "auxiliary_loss_clip": 0.01059489, - "auxiliary_loss_mlp": 0.01009718, - "balance_loss_clip": 1.03874373, - "balance_loss_mlp": 1.00692892, - "epoch": 0.14519765519314595, - "flos": 67521578929920.0, - "grad_norm": 0.8422279258983576, - "language_loss": 0.62171185, - "learning_rate": 3.862400591386154e-06, - "loss": 0.64240396, - "num_input_tokens_seen": 52361830, - "step": 2415, - "time_per_iteration": 3.1932270526885986 - }, - { - "auxiliary_loss_clip": 0.01151643, - "auxiliary_loss_mlp": 0.01046675, - "balance_loss_clip": 1.05383611, - "balance_loss_mlp": 1.02500319, - "epoch": 0.14525777844581392, - "flos": 17198913329280.0, - "grad_norm": 2.2913061581681036, - "language_loss": 0.71468806, - "learning_rate": 3.8622585942161245e-06, - "loss": 0.73667121, - "num_input_tokens_seen": 52379420, - "step": 2416, - "time_per_iteration": 2.5892374515533447 - }, - { - "auxiliary_loss_clip": 0.01050816, - "auxiliary_loss_mlp": 0.010049, - "balance_loss_clip": 1.03675056, - "balance_loss_mlp": 1.00211036, - "epoch": 0.14531790169848188, - "flos": 65404609015680.0, - "grad_norm": 0.7147623603004897, - "language_loss": 0.6037569, - "learning_rate": 3.8621165264290635e-06, - "loss": 0.62431407, - "num_input_tokens_seen": 52446290, - "step": 2417, - "time_per_iteration": 3.3065359592437744 - }, - { - "auxiliary_loss_clip": 0.01168766, - "auxiliary_loss_mlp": 0.01053548, - "balance_loss_clip": 1.05357766, - "balance_loss_mlp": 1.03275824, - "epoch": 0.14537802495114985, - "flos": 32562467372160.0, - "grad_norm": 3.7032433533234346, - "language_loss": 0.78014368, - "learning_rate": 3.861974388030356e-06, - "loss": 0.80236679, - "num_input_tokens_seen": 52467295, - "step": 2418, - "time_per_iteration": 2.887986183166504 - }, - { - "auxiliary_loss_clip": 0.01114137, - "auxiliary_loss_mlp": 0.01049779, - "balance_loss_clip": 1.04354823, - "balance_loss_mlp": 1.02911985, - "epoch": 0.1454381482038178, - "flos": 20226685432320.0, - "grad_norm": 2.096300480609688, - "language_loss": 0.71208847, - "learning_rate": 3.861832179025394e-06, - "loss": 0.73372757, - "num_input_tokens_seen": 52487295, - "step": 2419, - "time_per_iteration": 2.764268636703491 - }, - { - "auxiliary_loss_clip": 0.01142427, - "auxiliary_loss_mlp": 0.01054976, - "balance_loss_clip": 1.05351484, - "balance_loss_mlp": 1.03300607, - "epoch": 0.1454982714564858, - "flos": 22893124671360.0, - "grad_norm": 2.414673655978061, - "language_loss": 0.89847761, - "learning_rate": 3.861689899419569e-06, - "loss": 0.92045164, - "num_input_tokens_seen": 52504220, - "step": 2420, - "time_per_iteration": 2.7500016689300537 - }, - { - "auxiliary_loss_clip": 0.01155004, - "auxiliary_loss_mlp": 0.01060929, - "balance_loss_clip": 1.05202007, - "balance_loss_mlp": 1.04072309, - "epoch": 0.14555839470915377, - "flos": 20229845829120.0, - "grad_norm": 2.0953123539002383, - "language_loss": 0.82278717, - "learning_rate": 3.861547549218276e-06, - "loss": 0.8449465, - "num_input_tokens_seen": 52521900, - "step": 2421, - "time_per_iteration": 2.672722816467285 - }, - { - "auxiliary_loss_clip": 0.01099277, - "auxiliary_loss_mlp": 0.01056793, - "balance_loss_clip": 1.04282439, - "balance_loss_mlp": 1.03507352, - "epoch": 0.14561851796182174, - "flos": 22236282616320.0, - "grad_norm": 1.667429152986229, - "language_loss": 0.81741488, - "learning_rate": 3.861405128426914e-06, - "loss": 0.83897555, - "num_input_tokens_seen": 52540495, - "step": 2422, - "time_per_iteration": 2.739992141723633 - }, - { - "auxiliary_loss_clip": 0.01031842, - "auxiliary_loss_mlp": 0.00760413, - "balance_loss_clip": 1.0271318, - "balance_loss_mlp": 1.00019872, - "epoch": 0.1456786412144897, - "flos": 52636786289280.0, - "grad_norm": 0.9102961670465963, - "language_loss": 0.63342595, - "learning_rate": 3.861262637050883e-06, - "loss": 0.65134847, - "num_input_tokens_seen": 52603305, - "step": 2423, - "time_per_iteration": 3.2704036235809326 - }, - { - "auxiliary_loss_clip": 0.01112855, - "auxiliary_loss_mlp": 0.00780065, - "balance_loss_clip": 1.05457556, - "balance_loss_mlp": 1.00038898, - "epoch": 0.14573876446715767, - "flos": 23221671396480.0, - "grad_norm": 2.2239460229896206, - "language_loss": 0.82163274, - "learning_rate": 3.861120075095585e-06, - "loss": 0.84056193, - "num_input_tokens_seen": 52623435, - "step": 2424, - "time_per_iteration": 2.7993249893188477 - }, - { - "auxiliary_loss_clip": 0.01141208, - "auxiliary_loss_mlp": 0.01069468, - "balance_loss_clip": 1.0535512, - "balance_loss_mlp": 1.0496788, - "epoch": 0.14579888771982563, - "flos": 18114384286080.0, - "grad_norm": 2.769336045727131, - "language_loss": 0.78602695, - "learning_rate": 3.860977442566429e-06, - "loss": 0.80813372, - "num_input_tokens_seen": 52642255, - "step": 2425, - "time_per_iteration": 2.698594093322754 - }, - { - "auxiliary_loss_clip": 0.01156078, - "auxiliary_loss_mlp": 0.01062133, - "balance_loss_clip": 1.05603778, - "balance_loss_mlp": 1.04148602, - "epoch": 0.14585901097249362, - "flos": 23001107932800.0, - "grad_norm": 50.77412231982301, - "language_loss": 0.83184898, - "learning_rate": 3.860834739468821e-06, - "loss": 0.85403109, - "num_input_tokens_seen": 52658700, - "step": 2426, - "time_per_iteration": 2.6948676109313965 - }, - { - "auxiliary_loss_clip": 0.01166642, - "auxiliary_loss_mlp": 0.01060596, - "balance_loss_clip": 1.05706, - "balance_loss_mlp": 1.04040194, - "epoch": 0.1459191342251616, - "flos": 21908669644800.0, - "grad_norm": 3.7420612082917475, - "language_loss": 0.87215799, - "learning_rate": 3.860691965808173e-06, - "loss": 0.8944304, - "num_input_tokens_seen": 52678140, - "step": 2427, - "time_per_iteration": 2.6479666233062744 - }, - { - "auxiliary_loss_clip": 0.01128634, - "auxiliary_loss_mlp": 0.01064346, - "balance_loss_clip": 1.04835391, - "balance_loss_mlp": 1.0405997, - "epoch": 0.14597925747782955, - "flos": 14975504438400.0, - "grad_norm": 1.9221483903926033, - "language_loss": 0.66815829, - "learning_rate": 3.8605491215899e-06, - "loss": 0.69008809, - "num_input_tokens_seen": 52696825, - "step": 2428, - "time_per_iteration": 2.6971306800842285 - }, - { - "auxiliary_loss_clip": 0.01155557, - "auxiliary_loss_mlp": 0.01059343, - "balance_loss_clip": 1.05335426, - "balance_loss_mlp": 1.03842235, - "epoch": 0.14603938073049752, - "flos": 21068898600960.0, - "grad_norm": 2.0918238083564242, - "language_loss": 0.83231717, - "learning_rate": 3.860406206819417e-06, - "loss": 0.8544662, - "num_input_tokens_seen": 52715125, - "step": 2429, - "time_per_iteration": 4.283279895782471 - }, - { - "auxiliary_loss_clip": 0.01120809, - "auxiliary_loss_mlp": 0.01053505, - "balance_loss_clip": 1.04625869, - "balance_loss_mlp": 1.03446746, - "epoch": 0.14609950398316549, - "flos": 19864777950720.0, - "grad_norm": 2.4559042296603746, - "language_loss": 0.79087842, - "learning_rate": 3.860263221502145e-06, - "loss": 0.81262159, - "num_input_tokens_seen": 52734015, - "step": 2430, - "time_per_iteration": 4.197890758514404 - }, - { - "auxiliary_loss_clip": 0.01170782, - "auxiliary_loss_mlp": 0.01061965, - "balance_loss_clip": 1.05820751, - "balance_loss_mlp": 1.04179525, - "epoch": 0.14615962723583345, - "flos": 22418852469120.0, - "grad_norm": 2.4376691278662506, - "language_loss": 0.82910693, - "learning_rate": 3.860120165643504e-06, - "loss": 0.85143435, - "num_input_tokens_seen": 52753025, - "step": 2431, - "time_per_iteration": 4.162708282470703 - }, - { - "auxiliary_loss_clip": 0.011607, - "auxiliary_loss_mlp": 0.01060112, - "balance_loss_clip": 1.05553937, - "balance_loss_mlp": 1.03853524, - "epoch": 0.14621975048850142, - "flos": 22346241125760.0, - "grad_norm": 2.881661839068268, - "language_loss": 0.78330141, - "learning_rate": 3.859977039248921e-06, - "loss": 0.80550951, - "num_input_tokens_seen": 52773420, - "step": 2432, - "time_per_iteration": 2.6907777786254883 - }, - { - "auxiliary_loss_clip": 0.01165399, - "auxiliary_loss_mlp": 0.00782861, - "balance_loss_clip": 1.05517077, - "balance_loss_mlp": 1.00040507, - "epoch": 0.1462798737411694, - "flos": 24389163152640.0, - "grad_norm": 2.3488382544651887, - "language_loss": 0.79515982, - "learning_rate": 3.859833842323822e-06, - "loss": 0.81464243, - "num_input_tokens_seen": 52792870, - "step": 2433, - "time_per_iteration": 2.719841241836548 - }, - { - "auxiliary_loss_clip": 0.01124303, - "auxiliary_loss_mlp": 0.01055776, - "balance_loss_clip": 1.05385411, - "balance_loss_mlp": 1.03484273, - "epoch": 0.14633999699383737, - "flos": 19244672530560.0, - "grad_norm": 2.0782880949269926, - "language_loss": 0.77905983, - "learning_rate": 3.859690574873638e-06, - "loss": 0.80086064, - "num_input_tokens_seen": 52811615, - "step": 2434, - "time_per_iteration": 4.371506929397583 - }, - { - "auxiliary_loss_clip": 0.01066282, - "auxiliary_loss_mlp": 0.01033141, - "balance_loss_clip": 1.05327988, - "balance_loss_mlp": 1.03022039, - "epoch": 0.14640012024650534, - "flos": 62660638270080.0, - "grad_norm": 0.8566726319617045, - "language_loss": 0.58453119, - "learning_rate": 3.8595472369038e-06, - "loss": 0.60552537, - "num_input_tokens_seen": 52873230, - "step": 2435, - "time_per_iteration": 3.229882001876831 - }, - { - "auxiliary_loss_clip": 0.01160087, - "auxiliary_loss_mlp": 0.01045043, - "balance_loss_clip": 1.05263698, - "balance_loss_mlp": 1.0257076, - "epoch": 0.1464602434991733, - "flos": 12276243146880.0, - "grad_norm": 3.775553645712452, - "language_loss": 0.88436592, - "learning_rate": 3.859403828419744e-06, - "loss": 0.90641725, - "num_input_tokens_seen": 52889325, - "step": 2436, - "time_per_iteration": 2.568624973297119 - }, - { - "auxiliary_loss_clip": 0.011561, - "auxiliary_loss_mlp": 0.00780257, - "balance_loss_clip": 1.05587268, - "balance_loss_mlp": 1.00041819, - "epoch": 0.14652036675184127, - "flos": 20922311197440.0, - "grad_norm": 2.028718201913856, - "language_loss": 0.74904168, - "learning_rate": 3.85926034942691e-06, - "loss": 0.7684052, - "num_input_tokens_seen": 52909705, - "step": 2437, - "time_per_iteration": 2.6361188888549805 - }, - { - "auxiliary_loss_clip": 0.01165187, - "auxiliary_loss_mlp": 0.01050068, - "balance_loss_clip": 1.05295086, - "balance_loss_mlp": 1.02729869, - "epoch": 0.14658049000450923, - "flos": 27703681528320.0, - "grad_norm": 3.0822234004311033, - "language_loss": 0.73914421, - "learning_rate": 3.859116799930736e-06, - "loss": 0.76129669, - "num_input_tokens_seen": 52930300, - "step": 2438, - "time_per_iteration": 2.7590928077697754 - }, - { - "auxiliary_loss_clip": 0.01154571, - "auxiliary_loss_mlp": 0.01046509, - "balance_loss_clip": 1.05747688, - "balance_loss_mlp": 1.02708936, - "epoch": 0.14664061325717723, - "flos": 24936513575040.0, - "grad_norm": 4.476318678757457, - "language_loss": 0.74410725, - "learning_rate": 3.858973179936668e-06, - "loss": 0.76611805, - "num_input_tokens_seen": 52949955, - "step": 2439, - "time_per_iteration": 2.627037763595581 - }, - { - "auxiliary_loss_clip": 0.01152452, - "auxiliary_loss_mlp": 0.01051294, - "balance_loss_clip": 1.05477583, - "balance_loss_mlp": 1.0309453, - "epoch": 0.1467007365098452, - "flos": 40297661406720.0, - "grad_norm": 2.1583973700525343, - "language_loss": 0.74123728, - "learning_rate": 3.85882948945015e-06, - "loss": 0.76327467, - "num_input_tokens_seen": 52972905, - "step": 2440, - "time_per_iteration": 2.79715633392334 - }, - { - "auxiliary_loss_clip": 0.01160843, - "auxiliary_loss_mlp": 0.01044034, - "balance_loss_clip": 1.05471611, - "balance_loss_mlp": 1.02493691, - "epoch": 0.14676085976251316, - "flos": 26541074021760.0, - "grad_norm": 1.9756103236146798, - "language_loss": 0.82730794, - "learning_rate": 3.85868572847663e-06, - "loss": 0.84935671, - "num_input_tokens_seen": 52994850, - "step": 2441, - "time_per_iteration": 2.6505653858184814 - }, - { - "auxiliary_loss_clip": 0.01152605, - "auxiliary_loss_mlp": 0.01049175, - "balance_loss_clip": 1.05408478, - "balance_loss_mlp": 1.02796757, - "epoch": 0.14682098301518112, - "flos": 23550110380800.0, - "grad_norm": 2.582118236216862, - "language_loss": 0.71455544, - "learning_rate": 3.858541897021563e-06, - "loss": 0.73657322, - "num_input_tokens_seen": 53014740, - "step": 2442, - "time_per_iteration": 2.772648572921753 - }, - { - "auxiliary_loss_clip": 0.0113053, - "auxiliary_loss_mlp": 0.0104246, - "balance_loss_clip": 1.05283213, - "balance_loss_mlp": 1.02224207, - "epoch": 0.1468811062678491, - "flos": 11651073909120.0, - "grad_norm": 3.6780587187273155, - "language_loss": 0.81992352, - "learning_rate": 3.8583979950904e-06, - "loss": 0.84165335, - "num_input_tokens_seen": 53029780, - "step": 2443, - "time_per_iteration": 2.6979780197143555 - }, - { - "auxiliary_loss_clip": 0.01147138, - "auxiliary_loss_mlp": 0.0105693, - "balance_loss_clip": 1.05402422, - "balance_loss_mlp": 1.03474557, - "epoch": 0.14694122952051705, - "flos": 23002616304000.0, - "grad_norm": 3.190851099873364, - "language_loss": 0.83093917, - "learning_rate": 3.858254022688599e-06, - "loss": 0.85297978, - "num_input_tokens_seen": 53048620, - "step": 2444, - "time_per_iteration": 2.7177255153656006 - }, - { - "auxiliary_loss_clip": 0.01134628, - "auxiliary_loss_mlp": 0.01051986, - "balance_loss_clip": 1.05385137, - "balance_loss_mlp": 1.03213811, - "epoch": 0.14700135277318502, - "flos": 26502972670080.0, - "grad_norm": 3.1425569240832414, - "language_loss": 0.71183646, - "learning_rate": 3.85810997982162e-06, - "loss": 0.7337026, - "num_input_tokens_seen": 53070055, - "step": 2445, - "time_per_iteration": 2.735361099243164 - }, - { - "auxiliary_loss_clip": 0.01095177, - "auxiliary_loss_mlp": 0.01023118, - "balance_loss_clip": 1.05335557, - "balance_loss_mlp": 1.01999438, - "epoch": 0.147061476025853, - "flos": 59449434387840.0, - "grad_norm": 0.824990401786658, - "language_loss": 0.63083708, - "learning_rate": 3.857965866494923e-06, - "loss": 0.65202004, - "num_input_tokens_seen": 53126945, - "step": 2446, - "time_per_iteration": 3.0853025913238525 - }, - { - "auxiliary_loss_clip": 0.01120664, - "auxiliary_loss_mlp": 0.01045249, - "balance_loss_clip": 1.05621576, - "balance_loss_mlp": 1.02491164, - "epoch": 0.14712159927852098, - "flos": 28330897841280.0, - "grad_norm": 2.813052009295296, - "language_loss": 0.74895924, - "learning_rate": 3.857821682713975e-06, - "loss": 0.77061838, - "num_input_tokens_seen": 53149130, - "step": 2447, - "time_per_iteration": 2.858643054962158 - }, - { - "auxiliary_loss_clip": 0.01168929, - "auxiliary_loss_mlp": 0.01042907, - "balance_loss_clip": 1.0604012, - "balance_loss_mlp": 1.02383327, - "epoch": 0.14718172253118894, - "flos": 27089825074560.0, - "grad_norm": 2.2427639286159367, - "language_loss": 0.8528471, - "learning_rate": 3.857677428484242e-06, - "loss": 0.87496543, - "num_input_tokens_seen": 53167120, - "step": 2448, - "time_per_iteration": 2.699781894683838 - }, - { - "auxiliary_loss_clip": 0.01092169, - "auxiliary_loss_mlp": 0.01019616, - "balance_loss_clip": 1.05051064, - "balance_loss_mlp": 1.01654005, - "epoch": 0.1472418457838569, - "flos": 66706764860160.0, - "grad_norm": 0.7683837313264128, - "language_loss": 0.56829578, - "learning_rate": 3.857533103811195e-06, - "loss": 0.58941364, - "num_input_tokens_seen": 53227945, - "step": 2449, - "time_per_iteration": 3.1478211879730225 - }, - { - "auxiliary_loss_clip": 0.01135016, - "auxiliary_loss_mlp": 0.01050801, - "balance_loss_clip": 1.05464292, - "balance_loss_mlp": 1.03023791, - "epoch": 0.14730196903652487, - "flos": 19573578391680.0, - "grad_norm": 1.9048653074507311, - "language_loss": 0.85067344, - "learning_rate": 3.857388708700307e-06, - "loss": 0.87253165, - "num_input_tokens_seen": 53244615, - "step": 2450, - "time_per_iteration": 2.726008653640747 - }, - { - "auxiliary_loss_clip": 0.01158708, - "auxiliary_loss_mlp": 0.01049735, - "balance_loss_clip": 1.05984712, - "balance_loss_mlp": 1.02994645, - "epoch": 0.14736209228919284, - "flos": 16071031296000.0, - "grad_norm": 2.306043539040143, - "language_loss": 0.74523091, - "learning_rate": 3.857244243157052e-06, - "loss": 0.76731533, - "num_input_tokens_seen": 53262205, - "step": 2451, - "time_per_iteration": 2.641082286834717 - }, - { - "auxiliary_loss_clip": 0.01133915, - "auxiliary_loss_mlp": 0.01038458, - "balance_loss_clip": 1.05399728, - "balance_loss_mlp": 1.02031422, - "epoch": 0.1474222155418608, - "flos": 23039460679680.0, - "grad_norm": 1.8026547738986978, - "language_loss": 0.82384264, - "learning_rate": 3.85709970718691e-06, - "loss": 0.84556639, - "num_input_tokens_seen": 53282445, - "step": 2452, - "time_per_iteration": 2.7810096740722656 - }, - { - "auxiliary_loss_clip": 0.01101553, - "auxiliary_loss_mlp": 0.01041864, - "balance_loss_clip": 1.05924153, - "balance_loss_mlp": 1.0238874, - "epoch": 0.1474823387945288, - "flos": 17018641946880.0, - "grad_norm": 1.6675065143572472, - "language_loss": 0.74075705, - "learning_rate": 3.856955100795361e-06, - "loss": 0.76219124, - "num_input_tokens_seen": 53299060, - "step": 2453, - "time_per_iteration": 2.7913167476654053 - }, - { - "auxiliary_loss_clip": 0.01141798, - "auxiliary_loss_mlp": 0.0104607, - "balance_loss_clip": 1.05557632, - "balance_loss_mlp": 1.026353, - "epoch": 0.14754246204719676, - "flos": 17895041884800.0, - "grad_norm": 1.9958141581621542, - "language_loss": 0.7558704, - "learning_rate": 3.856810423987889e-06, - "loss": 0.77774906, - "num_input_tokens_seen": 53315970, - "step": 2454, - "time_per_iteration": 2.7199089527130127 - }, - { - "auxiliary_loss_clip": 0.01147348, - "auxiliary_loss_mlp": 0.01038134, - "balance_loss_clip": 1.05733335, - "balance_loss_mlp": 1.01864362, - "epoch": 0.14760258529986472, - "flos": 13079097987840.0, - "grad_norm": 2.0858167958418674, - "language_loss": 0.83077228, - "learning_rate": 3.856665676769979e-06, - "loss": 0.85262716, - "num_input_tokens_seen": 53332940, - "step": 2455, - "time_per_iteration": 2.75616192817688 - }, - { - "auxiliary_loss_clip": 0.01130504, - "auxiliary_loss_mlp": 0.01042951, - "balance_loss_clip": 1.05704689, - "balance_loss_mlp": 1.02452159, - "epoch": 0.1476627085525327, - "flos": 30806399358720.0, - "grad_norm": 2.3702229998953976, - "language_loss": 0.83881497, - "learning_rate": 3.85652085914712e-06, - "loss": 0.86054951, - "num_input_tokens_seen": 53353295, - "step": 2456, - "time_per_iteration": 2.7914254665374756 - }, - { - "auxiliary_loss_clip": 0.01154014, - "auxiliary_loss_mlp": 0.01043715, - "balance_loss_clip": 1.05863023, - "balance_loss_mlp": 1.02514231, - "epoch": 0.14772283180520066, - "flos": 21689434984320.0, - "grad_norm": 2.4172359629848996, - "language_loss": 0.84154665, - "learning_rate": 3.856375971124805e-06, - "loss": 0.86352402, - "num_input_tokens_seen": 53373410, - "step": 2457, - "time_per_iteration": 2.688265323638916 - }, - { - "auxiliary_loss_clip": 0.01155788, - "auxiliary_loss_mlp": 0.01042903, - "balance_loss_clip": 1.06250155, - "balance_loss_mlp": 1.02529585, - "epoch": 0.14778295505786862, - "flos": 18770400328320.0, - "grad_norm": 6.310680797376285, - "language_loss": 0.75692672, - "learning_rate": 3.856231012708527e-06, - "loss": 0.77891362, - "num_input_tokens_seen": 53391430, - "step": 2458, - "time_per_iteration": 2.698697805404663 - }, - { - "auxiliary_loss_clip": 0.01117404, - "auxiliary_loss_mlp": 0.01047753, - "balance_loss_clip": 1.05451179, - "balance_loss_mlp": 1.02718902, - "epoch": 0.1478430783105366, - "flos": 22893555634560.0, - "grad_norm": 3.1268711361266393, - "language_loss": 0.83348328, - "learning_rate": 3.856085983903782e-06, - "loss": 0.85513484, - "num_input_tokens_seen": 53409960, - "step": 2459, - "time_per_iteration": 2.790552854537964 - }, - { - "auxiliary_loss_clip": 0.01126767, - "auxiliary_loss_mlp": 0.01042293, - "balance_loss_clip": 1.05070424, - "balance_loss_mlp": 1.02435231, - "epoch": 0.14790320156320458, - "flos": 15085319293440.0, - "grad_norm": 3.1203941208753534, - "language_loss": 0.7554391, - "learning_rate": 3.855940884716071e-06, - "loss": 0.77712965, - "num_input_tokens_seen": 53426160, - "step": 2460, - "time_per_iteration": 2.815455675125122 - }, - { - "auxiliary_loss_clip": 0.01134117, - "auxiliary_loss_mlp": 0.01056838, - "balance_loss_clip": 1.05845904, - "balance_loss_mlp": 1.03770471, - "epoch": 0.14796332481587254, - "flos": 26504768350080.0, - "grad_norm": 3.59241393994, - "language_loss": 0.81227219, - "learning_rate": 3.855795715150896e-06, - "loss": 0.83418173, - "num_input_tokens_seen": 53448530, - "step": 2461, - "time_per_iteration": 2.785569190979004 - }, - { - "auxiliary_loss_clip": 0.01156748, - "auxiliary_loss_mlp": 0.01051178, - "balance_loss_clip": 1.05812359, - "balance_loss_mlp": 1.03044713, - "epoch": 0.1480234480685405, - "flos": 17563191108480.0, - "grad_norm": 3.2910626990147183, - "language_loss": 0.66117477, - "learning_rate": 3.855650475213761e-06, - "loss": 0.683254, - "num_input_tokens_seen": 53465915, - "step": 2462, - "time_per_iteration": 2.7222983837127686 - }, - { - "auxiliary_loss_clip": 0.01136035, - "auxiliary_loss_mlp": 0.01049537, - "balance_loss_clip": 1.05622339, - "balance_loss_mlp": 1.02965331, - "epoch": 0.14808357132120847, - "flos": 53582203232640.0, - "grad_norm": 1.8120706772856114, - "language_loss": 0.67226064, - "learning_rate": 3.8555051649101745e-06, - "loss": 0.69411635, - "num_input_tokens_seen": 53496055, - "step": 2463, - "time_per_iteration": 3.0344398021698 - }, - { - "auxiliary_loss_clip": 0.01153077, - "auxiliary_loss_mlp": 0.01050435, - "balance_loss_clip": 1.05550933, - "balance_loss_mlp": 1.0307889, - "epoch": 0.14814369457387644, - "flos": 19829190551040.0, - "grad_norm": 1.9881580745750587, - "language_loss": 0.76870739, - "learning_rate": 3.855359784245646e-06, - "loss": 0.79074258, - "num_input_tokens_seen": 53513790, - "step": 2464, - "time_per_iteration": 2.69480037689209 - }, - { - "auxiliary_loss_clip": 0.01133748, - "auxiliary_loss_mlp": 0.01057139, - "balance_loss_clip": 1.05392432, - "balance_loss_mlp": 1.03769565, - "epoch": 0.1482038178265444, - "flos": 23914962777600.0, - "grad_norm": 1.8401367705559406, - "language_loss": 0.79628456, - "learning_rate": 3.855214333225688e-06, - "loss": 0.81819344, - "num_input_tokens_seen": 53533410, - "step": 2465, - "time_per_iteration": 2.6989939212799072 - }, - { - "auxiliary_loss_clip": 0.01170385, - "auxiliary_loss_mlp": 0.01054925, - "balance_loss_clip": 1.06119514, - "balance_loss_mlp": 1.03568494, - "epoch": 0.1482639410792124, - "flos": 24170503109760.0, - "grad_norm": 2.005541134809237, - "language_loss": 0.76272273, - "learning_rate": 3.855068811855817e-06, - "loss": 0.78497583, - "num_input_tokens_seen": 53554775, - "step": 2466, - "time_per_iteration": 2.646245002746582 - }, - { - "auxiliary_loss_clip": 0.01018939, - "auxiliary_loss_mlp": 0.0114331, - "balance_loss_clip": 1.03313899, - "balance_loss_mlp": 1.14004362, - "epoch": 0.14832406433188036, - "flos": 66191051341440.0, - "grad_norm": 0.8320983618395327, - "language_loss": 0.6004858, - "learning_rate": 3.854923220141551e-06, - "loss": 0.62210834, - "num_input_tokens_seen": 53609675, - "step": 2467, - "time_per_iteration": 3.33776593208313 - }, - { - "auxiliary_loss_clip": 0.01141854, - "auxiliary_loss_mlp": 0.01044026, - "balance_loss_clip": 1.05437851, - "balance_loss_mlp": 1.02509522, - "epoch": 0.14838418758454833, - "flos": 25411252654080.0, - "grad_norm": 2.92694776694492, - "language_loss": 0.87666196, - "learning_rate": 3.85477755808841e-06, - "loss": 0.89852077, - "num_input_tokens_seen": 53626950, - "step": 2468, - "time_per_iteration": 4.266207456588745 - }, - { - "auxiliary_loss_clip": 0.01130189, - "auxiliary_loss_mlp": 0.01048186, - "balance_loss_clip": 1.05255163, - "balance_loss_mlp": 1.02782488, - "epoch": 0.1484443108372163, - "flos": 23289901280640.0, - "grad_norm": 2.2284173124426223, - "language_loss": 0.7598694, - "learning_rate": 3.854631825701919e-06, - "loss": 0.78165317, - "num_input_tokens_seen": 53644200, - "step": 2469, - "time_per_iteration": 4.217481851577759 - }, - { - "auxiliary_loss_clip": 0.01126269, - "auxiliary_loss_mlp": 0.0104139, - "balance_loss_clip": 1.05208421, - "balance_loss_mlp": 1.02251911, - "epoch": 0.14850443408988426, - "flos": 14647675985280.0, - "grad_norm": 6.591244267451795, - "language_loss": 0.75895017, - "learning_rate": 3.854486022987603e-06, - "loss": 0.78062677, - "num_input_tokens_seen": 53659650, - "step": 2470, - "time_per_iteration": 2.7157187461853027 - }, - { - "auxiliary_loss_clip": 0.01161157, - "auxiliary_loss_mlp": 0.01044729, - "balance_loss_clip": 1.05831027, - "balance_loss_mlp": 1.02571499, - "epoch": 0.14856455734255222, - "flos": 23548314700800.0, - "grad_norm": 1.8610043660805562, - "language_loss": 0.7215873, - "learning_rate": 3.8543401499509905e-06, - "loss": 0.74364614, - "num_input_tokens_seen": 53680275, - "step": 2471, - "time_per_iteration": 4.162387132644653 - }, - { - "auxiliary_loss_clip": 0.01135244, - "auxiliary_loss_mlp": 0.01047611, - "balance_loss_clip": 1.05438995, - "balance_loss_mlp": 1.02717888, - "epoch": 0.1486246805952202, - "flos": 18077288515200.0, - "grad_norm": 1.979025280241548, - "language_loss": 0.89558828, - "learning_rate": 3.854194206597615e-06, - "loss": 0.91741687, - "num_input_tokens_seen": 53698270, - "step": 2472, - "time_per_iteration": 2.739457607269287 - }, - { - "auxiliary_loss_clip": 0.01134625, - "auxiliary_loss_mlp": 0.01049109, - "balance_loss_clip": 1.06334805, - "balance_loss_mlp": 1.02964163, - "epoch": 0.14868480384788818, - "flos": 19353625459200.0, - "grad_norm": 2.6029609251362764, - "language_loss": 0.80801564, - "learning_rate": 3.854048192933008e-06, - "loss": 0.82985294, - "num_input_tokens_seen": 53716845, - "step": 2473, - "time_per_iteration": 4.412883758544922 - }, - { - "auxiliary_loss_clip": 0.01161034, - "auxiliary_loss_mlp": 0.01051306, - "balance_loss_clip": 1.0626657, - "balance_loss_mlp": 1.03267312, - "epoch": 0.14874492710055615, - "flos": 22200192426240.0, - "grad_norm": 3.426519274325147, - "language_loss": 0.77372944, - "learning_rate": 3.853902108962709e-06, - "loss": 0.79585278, - "num_input_tokens_seen": 53734970, - "step": 2474, - "time_per_iteration": 2.6879520416259766 - }, - { - "auxiliary_loss_clip": 0.01124216, - "auxiliary_loss_mlp": 0.01059785, - "balance_loss_clip": 1.05597806, - "balance_loss_mlp": 1.04041362, - "epoch": 0.1488050503532241, - "flos": 21103444506240.0, - "grad_norm": 2.4771626433268734, - "language_loss": 0.82151824, - "learning_rate": 3.853755954692255e-06, - "loss": 0.84335828, - "num_input_tokens_seen": 53753415, - "step": 2475, - "time_per_iteration": 2.7828469276428223 - }, - { - "auxiliary_loss_clip": 0.01115855, - "auxiliary_loss_mlp": 0.01052322, - "balance_loss_clip": 1.0614953, - "balance_loss_mlp": 1.03341544, - "epoch": 0.14886517360589208, - "flos": 12786569625600.0, - "grad_norm": 1.9349243252831771, - "language_loss": 0.80917645, - "learning_rate": 3.85360973012719e-06, - "loss": 0.83085823, - "num_input_tokens_seen": 53770305, - "step": 2476, - "time_per_iteration": 2.7227590084075928 - }, - { - "auxiliary_loss_clip": 0.01156019, - "auxiliary_loss_mlp": 0.0105036, - "balance_loss_clip": 1.06338036, - "balance_loss_mlp": 1.03216898, - "epoch": 0.14892529685856004, - "flos": 29022860419200.0, - "grad_norm": 2.0032169897498346, - "language_loss": 0.77659523, - "learning_rate": 3.853463435273058e-06, - "loss": 0.79865897, - "num_input_tokens_seen": 53788895, - "step": 2477, - "time_per_iteration": 2.740241765975952 - }, - { - "auxiliary_loss_clip": 0.0110234, - "auxiliary_loss_mlp": 0.01092005, - "balance_loss_clip": 1.07879949, - "balance_loss_mlp": 1.08730817, - "epoch": 0.148985420111228, - "flos": 61926121054080.0, - "grad_norm": 0.8188153224748298, - "language_loss": 0.60153681, - "learning_rate": 3.853317070135407e-06, - "loss": 0.62348026, - "num_input_tokens_seen": 53850260, - "step": 2478, - "time_per_iteration": 3.2467947006225586 - }, - { - "auxiliary_loss_clip": 0.01107417, - "auxiliary_loss_mlp": 0.01048452, - "balance_loss_clip": 1.0516423, - "balance_loss_mlp": 1.03041577, - "epoch": 0.149045543363896, - "flos": 23915106432000.0, - "grad_norm": 2.666109649137694, - "language_loss": 0.7139731, - "learning_rate": 3.853170634719787e-06, - "loss": 0.73553181, - "num_input_tokens_seen": 53867520, - "step": 2479, - "time_per_iteration": 2.7973475456237793 - }, - { - "auxiliary_loss_clip": 0.01140551, - "auxiliary_loss_mlp": 0.01043104, - "balance_loss_clip": 1.05563831, - "balance_loss_mlp": 1.02407789, - "epoch": 0.14910566661656396, - "flos": 23654394541440.0, - "grad_norm": 1.7687137634424535, - "language_loss": 0.80758464, - "learning_rate": 3.853024129031751e-06, - "loss": 0.82942122, - "num_input_tokens_seen": 53886620, - "step": 2480, - "time_per_iteration": 2.7238829135894775 - }, - { - "auxiliary_loss_clip": 0.01138106, - "auxiliary_loss_mlp": 0.0104537, - "balance_loss_clip": 1.0584991, - "balance_loss_mlp": 1.02627277, - "epoch": 0.14916578986923193, - "flos": 20515299212160.0, - "grad_norm": 4.65741826395702, - "language_loss": 0.84375542, - "learning_rate": 3.852877553076854e-06, - "loss": 0.86559021, - "num_input_tokens_seen": 53902230, - "step": 2481, - "time_per_iteration": 2.791550874710083 - }, - { - "auxiliary_loss_clip": 0.01149484, - "auxiliary_loss_mlp": 0.01050268, - "balance_loss_clip": 1.05772805, - "balance_loss_mlp": 1.02948999, - "epoch": 0.1492259131218999, - "flos": 22491822948480.0, - "grad_norm": 8.035113387353048, - "language_loss": 0.77703977, - "learning_rate": 3.8527309068606546e-06, - "loss": 0.79903734, - "num_input_tokens_seen": 53919475, - "step": 2482, - "time_per_iteration": 2.7310593128204346 - }, - { - "auxiliary_loss_clip": 0.01133163, - "auxiliary_loss_mlp": 0.01040426, - "balance_loss_clip": 1.05452228, - "balance_loss_mlp": 1.02032781, - "epoch": 0.14928603637456786, - "flos": 23185868515200.0, - "grad_norm": 2.207731010812049, - "language_loss": 0.78967929, - "learning_rate": 3.852584190388713e-06, - "loss": 0.81141514, - "num_input_tokens_seen": 53939150, - "step": 2483, - "time_per_iteration": 2.749671220779419 - }, - { - "auxiliary_loss_clip": 0.01154122, - "auxiliary_loss_mlp": 0.00776708, - "balance_loss_clip": 1.06144214, - "balance_loss_mlp": 1.00029397, - "epoch": 0.14934615962723582, - "flos": 21653237053440.0, - "grad_norm": 2.020127706544282, - "language_loss": 0.70361555, - "learning_rate": 3.852437403666595e-06, - "loss": 0.72292387, - "num_input_tokens_seen": 53958735, - "step": 2484, - "time_per_iteration": 2.737781524658203 - }, - { - "auxiliary_loss_clip": 0.01141919, - "auxiliary_loss_mlp": 0.00778215, - "balance_loss_clip": 1.05718136, - "balance_loss_mlp": 1.00030363, - "epoch": 0.1494062828799038, - "flos": 27010066924800.0, - "grad_norm": 2.165877689982274, - "language_loss": 0.84666765, - "learning_rate": 3.852290546699863e-06, - "loss": 0.86586899, - "num_input_tokens_seen": 53975065, - "step": 2485, - "time_per_iteration": 2.697976589202881 - }, - { - "auxiliary_loss_clip": 0.01145272, - "auxiliary_loss_mlp": 0.0104224, - "balance_loss_clip": 1.05639958, - "balance_loss_mlp": 1.02257001, - "epoch": 0.14946640613257178, - "flos": 21214947300480.0, - "grad_norm": 2.5229241908443023, - "language_loss": 0.8476423, - "learning_rate": 3.8521436194940894e-06, - "loss": 0.86951739, - "num_input_tokens_seen": 53993330, - "step": 2486, - "time_per_iteration": 2.6799628734588623 - }, - { - "auxiliary_loss_clip": 0.01149031, - "auxiliary_loss_mlp": 0.01039312, - "balance_loss_clip": 1.05667424, - "balance_loss_mlp": 1.0230875, - "epoch": 0.14952652938523975, - "flos": 13370872164480.0, - "grad_norm": 2.1822908802725203, - "language_loss": 0.74762607, - "learning_rate": 3.851996622054842e-06, - "loss": 0.76950949, - "num_input_tokens_seen": 54010515, - "step": 2487, - "time_per_iteration": 2.8037290573120117 - }, - { - "auxiliary_loss_clip": 0.01153097, - "auxiliary_loss_mlp": 0.01044274, - "balance_loss_clip": 1.05934322, - "balance_loss_mlp": 1.02611899, - "epoch": 0.1495866526379077, - "flos": 35517699959040.0, - "grad_norm": 16.320028017118723, - "language_loss": 0.72210175, - "learning_rate": 3.8518495543877e-06, - "loss": 0.74407548, - "num_input_tokens_seen": 54031315, - "step": 2488, - "time_per_iteration": 2.8031094074249268 - }, - { - "auxiliary_loss_clip": 0.01137536, - "auxiliary_loss_mlp": 0.01054916, - "balance_loss_clip": 1.05569518, - "balance_loss_mlp": 1.03636682, - "epoch": 0.14964677589057568, - "flos": 17632749795840.0, - "grad_norm": 3.2458980886023143, - "language_loss": 0.71352434, - "learning_rate": 3.851702416498235e-06, - "loss": 0.73544884, - "num_input_tokens_seen": 54045965, - "step": 2489, - "time_per_iteration": 2.648883819580078 - }, - { - "auxiliary_loss_clip": 0.0113767, - "auxiliary_loss_mlp": 0.01052603, - "balance_loss_clip": 1.05376494, - "balance_loss_mlp": 1.03357768, - "epoch": 0.14970689914324364, - "flos": 20185280029440.0, - "grad_norm": 3.893198448080141, - "language_loss": 0.81559736, - "learning_rate": 3.8515552083920295e-06, - "loss": 0.8375001, - "num_input_tokens_seen": 54059960, - "step": 2490, - "time_per_iteration": 2.702808380126953 - }, - { - "auxiliary_loss_clip": 0.01125097, - "auxiliary_loss_mlp": 0.01055928, - "balance_loss_clip": 1.05606139, - "balance_loss_mlp": 1.03803492, - "epoch": 0.1497670223959116, - "flos": 37228699382400.0, - "grad_norm": 1.9071281232744548, - "language_loss": 0.80057055, - "learning_rate": 3.851407930074666e-06, - "loss": 0.82238084, - "num_input_tokens_seen": 54079330, - "step": 2491, - "time_per_iteration": 2.833272933959961 - }, - { - "auxiliary_loss_clip": 0.01143407, - "auxiliary_loss_mlp": 0.01052558, - "balance_loss_clip": 1.05301452, - "balance_loss_mlp": 1.03195894, - "epoch": 0.1498271456485796, - "flos": 24455848752000.0, - "grad_norm": 2.3105790695512294, - "language_loss": 0.90820229, - "learning_rate": 3.851260581551727e-06, - "loss": 0.93016195, - "num_input_tokens_seen": 54097555, - "step": 2492, - "time_per_iteration": 2.684178352355957 - }, - { - "auxiliary_loss_clip": 0.01152331, - "auxiliary_loss_mlp": 0.01063543, - "balance_loss_clip": 1.05835843, - "balance_loss_mlp": 1.04508913, - "epoch": 0.14988726890124757, - "flos": 16253601148800.0, - "grad_norm": 6.881290297472923, - "language_loss": 0.79406559, - "learning_rate": 3.851113162828802e-06, - "loss": 0.81622434, - "num_input_tokens_seen": 54115600, - "step": 2493, - "time_per_iteration": 2.6558918952941895 - }, - { - "auxiliary_loss_clip": 0.0114858, - "auxiliary_loss_mlp": 0.01052018, - "balance_loss_clip": 1.05345511, - "balance_loss_mlp": 1.03258693, - "epoch": 0.14994739215391553, - "flos": 20666555383680.0, - "grad_norm": 2.3431247769189967, - "language_loss": 0.79894584, - "learning_rate": 3.85096567391148e-06, - "loss": 0.82095182, - "num_input_tokens_seen": 54135220, - "step": 2494, - "time_per_iteration": 2.6774168014526367 - }, - { - "auxiliary_loss_clip": 0.01137216, - "auxiliary_loss_mlp": 0.01050857, - "balance_loss_clip": 1.05474579, - "balance_loss_mlp": 1.03212965, - "epoch": 0.1500075154065835, - "flos": 70652375239680.0, - "grad_norm": 1.928941284350508, - "language_loss": 0.66480517, - "learning_rate": 3.850818114805354e-06, - "loss": 0.68668592, - "num_input_tokens_seen": 54161065, - "step": 2495, - "time_per_iteration": 3.1090729236602783 - }, - { - "auxiliary_loss_clip": 0.01103374, - "auxiliary_loss_mlp": 0.01038654, - "balance_loss_clip": 1.06896818, - "balance_loss_mlp": 1.03560257, - "epoch": 0.15006763865925146, - "flos": 68011937447040.0, - "grad_norm": 0.9030283421527312, - "language_loss": 0.59524739, - "learning_rate": 3.850670485516019e-06, - "loss": 0.61666763, - "num_input_tokens_seen": 54225095, - "step": 2496, - "time_per_iteration": 3.2250726222991943 - }, - { - "auxiliary_loss_clip": 0.01163934, - "auxiliary_loss_mlp": 0.01055725, - "balance_loss_clip": 1.05690169, - "balance_loss_mlp": 1.0360074, - "epoch": 0.15012776191191943, - "flos": 18916269459840.0, - "grad_norm": 3.063784198565679, - "language_loss": 0.65276247, - "learning_rate": 3.850522786049075e-06, - "loss": 0.67495906, - "num_input_tokens_seen": 54243750, - "step": 2497, - "time_per_iteration": 2.619946002960205 - }, - { - "auxiliary_loss_clip": 0.01125657, - "auxiliary_loss_mlp": 0.01054091, - "balance_loss_clip": 1.05308235, - "balance_loss_mlp": 1.03316998, - "epoch": 0.1501878851645874, - "flos": 23701330638720.0, - "grad_norm": 1.5552670947231086, - "language_loss": 0.75182658, - "learning_rate": 3.850375016410121e-06, - "loss": 0.77362406, - "num_input_tokens_seen": 54266185, - "step": 2498, - "time_per_iteration": 2.778163433074951 - }, - { - "auxiliary_loss_clip": 0.01132738, - "auxiliary_loss_mlp": 0.01046919, - "balance_loss_clip": 1.05919099, - "balance_loss_mlp": 1.02701163, - "epoch": 0.15024800841725539, - "flos": 20412523422720.0, - "grad_norm": 3.357364003851319, - "language_loss": 0.71821117, - "learning_rate": 3.850227176604761e-06, - "loss": 0.74000776, - "num_input_tokens_seen": 54283940, - "step": 2499, - "time_per_iteration": 2.6929259300231934 - }, - { - "auxiliary_loss_clip": 0.01134239, - "auxiliary_loss_mlp": 0.01051817, - "balance_loss_clip": 1.0547812, - "balance_loss_mlp": 1.03236222, - "epoch": 0.15030813166992335, - "flos": 31831002812160.0, - "grad_norm": 2.1406696998963652, - "language_loss": 0.7206136, - "learning_rate": 3.850079266638601e-06, - "loss": 0.7424742, - "num_input_tokens_seen": 54304830, - "step": 2500, - "time_per_iteration": 2.769988536834717 - }, - { - "auxiliary_loss_clip": 0.01134021, - "auxiliary_loss_mlp": 0.0105021, - "balance_loss_clip": 1.06063724, - "balance_loss_mlp": 1.03181624, - "epoch": 0.15036825492259132, - "flos": 35657822914560.0, - "grad_norm": 2.0251881980439306, - "language_loss": 0.65127194, - "learning_rate": 3.849931286517249e-06, - "loss": 0.6731143, - "num_input_tokens_seen": 54325595, - "step": 2501, - "time_per_iteration": 2.810945510864258 - }, - { - "auxiliary_loss_clip": 0.01137877, - "auxiliary_loss_mlp": 0.01055223, - "balance_loss_clip": 1.0541079, - "balance_loss_mlp": 1.03511274, - "epoch": 0.15042837817525928, - "flos": 18838163335680.0, - "grad_norm": 2.209666371186328, - "language_loss": 0.83401144, - "learning_rate": 3.849783236246318e-06, - "loss": 0.85594243, - "num_input_tokens_seen": 54342180, - "step": 2502, - "time_per_iteration": 2.6780545711517334 - }, - { - "auxiliary_loss_clip": 0.01122961, - "auxiliary_loss_mlp": 0.01049887, - "balance_loss_clip": 1.05318308, - "balance_loss_mlp": 1.0323875, - "epoch": 0.15048850142792725, - "flos": 19535548867200.0, - "grad_norm": 2.0319272128830947, - "language_loss": 0.77134645, - "learning_rate": 3.849635115831421e-06, - "loss": 0.79307491, - "num_input_tokens_seen": 54360255, - "step": 2503, - "time_per_iteration": 2.7579123973846436 - }, - { - "auxiliary_loss_clip": 0.01159116, - "auxiliary_loss_mlp": 0.01044094, - "balance_loss_clip": 1.05766046, - "balance_loss_mlp": 1.02692807, - "epoch": 0.1505486246805952, - "flos": 22017550746240.0, - "grad_norm": 1.9852139459946199, - "language_loss": 0.85514295, - "learning_rate": 3.849486925278176e-06, - "loss": 0.87717503, - "num_input_tokens_seen": 54378260, - "step": 2504, - "time_per_iteration": 2.631882905960083 - }, - { - "auxiliary_loss_clip": 0.01146113, - "auxiliary_loss_mlp": 0.01048035, - "balance_loss_clip": 1.05622697, - "balance_loss_mlp": 1.03098798, - "epoch": 0.15060874793326318, - "flos": 20743153136640.0, - "grad_norm": 1.8222645508164372, - "language_loss": 0.83178544, - "learning_rate": 3.8493386645922e-06, - "loss": 0.85372692, - "num_input_tokens_seen": 54399745, - "step": 2505, - "time_per_iteration": 2.7706007957458496 - }, - { - "auxiliary_loss_clip": 0.01125699, - "auxiliary_loss_mlp": 0.01053819, - "balance_loss_clip": 1.05586648, - "balance_loss_mlp": 1.03590202, - "epoch": 0.15066887118593117, - "flos": 16471902055680.0, - "grad_norm": 2.0148067518000445, - "language_loss": 0.76044405, - "learning_rate": 3.849190333779117e-06, - "loss": 0.7822392, - "num_input_tokens_seen": 54417105, - "step": 2506, - "time_per_iteration": 2.70989990234375 - }, - { - "auxiliary_loss_clip": 0.01165314, - "auxiliary_loss_mlp": 0.01041911, - "balance_loss_clip": 1.05785728, - "balance_loss_mlp": 1.02305174, - "epoch": 0.15072899443859913, - "flos": 19859319083520.0, - "grad_norm": 2.823460856599666, - "language_loss": 0.76220375, - "learning_rate": 3.849041932844552e-06, - "loss": 0.78427601, - "num_input_tokens_seen": 54433920, - "step": 2507, - "time_per_iteration": 2.5367634296417236 - }, - { - "auxiliary_loss_clip": 0.01144479, - "auxiliary_loss_mlp": 0.01041094, - "balance_loss_clip": 1.05261898, - "balance_loss_mlp": 1.02306986, - "epoch": 0.1507891176912671, - "flos": 20776226584320.0, - "grad_norm": 2.5197772895304906, - "language_loss": 0.68633789, - "learning_rate": 3.848893461794131e-06, - "loss": 0.70819366, - "num_input_tokens_seen": 54451540, - "step": 2508, - "time_per_iteration": 4.303388833999634 - }, - { - "auxiliary_loss_clip": 0.01130299, - "auxiliary_loss_mlp": 0.01046507, - "balance_loss_clip": 1.05477214, - "balance_loss_mlp": 1.02835178, - "epoch": 0.15084924094393506, - "flos": 23586631534080.0, - "grad_norm": 2.840517748098311, - "language_loss": 0.77994299, - "learning_rate": 3.8487449206334845e-06, - "loss": 0.80171108, - "num_input_tokens_seen": 54470800, - "step": 2509, - "time_per_iteration": 4.380200147628784 - }, - { - "auxiliary_loss_clip": 0.01141335, - "auxiliary_loss_mlp": 0.00776843, - "balance_loss_clip": 1.05463386, - "balance_loss_mlp": 1.00027037, - "epoch": 0.15090936419660303, - "flos": 18911313383040.0, - "grad_norm": 2.53406994590866, - "language_loss": 0.79959804, - "learning_rate": 3.848596309368246e-06, - "loss": 0.81877983, - "num_input_tokens_seen": 54486525, - "step": 2510, - "time_per_iteration": 4.219487428665161 - }, - { - "auxiliary_loss_clip": 0.01150641, - "auxiliary_loss_mlp": 0.01047345, - "balance_loss_clip": 1.05529225, - "balance_loss_mlp": 1.02794981, - "epoch": 0.150969487449271, - "flos": 17928223073280.0, - "grad_norm": 1.8628702139594306, - "language_loss": 0.73398602, - "learning_rate": 3.8484476280040495e-06, - "loss": 0.75596589, - "num_input_tokens_seen": 54503795, - "step": 2511, - "time_per_iteration": 2.62237811088562 - }, - { - "auxiliary_loss_clip": 0.01094269, - "auxiliary_loss_mlp": 0.0104236, - "balance_loss_clip": 1.04747009, - "balance_loss_mlp": 1.02365553, - "epoch": 0.151029610701939, - "flos": 24243078539520.0, - "grad_norm": 2.20399257021602, - "language_loss": 0.68716824, - "learning_rate": 3.848298876546534e-06, - "loss": 0.70853454, - "num_input_tokens_seen": 54523025, - "step": 2512, - "time_per_iteration": 2.823359489440918 - }, - { - "auxiliary_loss_clip": 0.01149398, - "auxiliary_loss_mlp": 0.01043296, - "balance_loss_clip": 1.05574036, - "balance_loss_mlp": 1.02615356, - "epoch": 0.15108973395460695, - "flos": 30262496641920.0, - "grad_norm": 2.6278607305338877, - "language_loss": 0.73833561, - "learning_rate": 3.84815005500134e-06, - "loss": 0.76026255, - "num_input_tokens_seen": 54545025, - "step": 2513, - "time_per_iteration": 4.386258602142334 - }, - { - "auxiliary_loss_clip": 0.01059691, - "auxiliary_loss_mlp": 0.01109321, - "balance_loss_clip": 1.0685482, - "balance_loss_mlp": 1.10529137, - "epoch": 0.15114985720727492, - "flos": 60437624428800.0, - "grad_norm": 0.9017688875456507, - "language_loss": 0.64720047, - "learning_rate": 3.84800116337411e-06, - "loss": 0.6688906, - "num_input_tokens_seen": 54604545, - "step": 2514, - "time_per_iteration": 3.254983425140381 - }, - { - "auxiliary_loss_clip": 0.01146323, - "auxiliary_loss_mlp": 0.0104352, - "balance_loss_clip": 1.05674648, - "balance_loss_mlp": 1.02584124, - "epoch": 0.15120998045994288, - "flos": 20521691832960.0, - "grad_norm": 3.178381755435586, - "language_loss": 0.72995645, - "learning_rate": 3.8478522016704916e-06, - "loss": 0.7518549, - "num_input_tokens_seen": 54620590, - "step": 2515, - "time_per_iteration": 2.67921781539917 - }, - { - "auxiliary_loss_clip": 0.01133382, - "auxiliary_loss_mlp": 0.01040315, - "balance_loss_clip": 1.05675673, - "balance_loss_mlp": 1.02120531, - "epoch": 0.15127010371261085, - "flos": 21178893024000.0, - "grad_norm": 2.0712989062813243, - "language_loss": 0.7773214, - "learning_rate": 3.8477031698961325e-06, - "loss": 0.79905832, - "num_input_tokens_seen": 54640410, - "step": 2516, - "time_per_iteration": 2.763467788696289 - }, - { - "auxiliary_loss_clip": 0.01087601, - "auxiliary_loss_mlp": 0.01004779, - "balance_loss_clip": 1.05344796, - "balance_loss_mlp": 1.00160813, - "epoch": 0.1513302269652788, - "flos": 65320648974720.0, - "grad_norm": 0.7270407819118658, - "language_loss": 0.54622567, - "learning_rate": 3.8475540680566835e-06, - "loss": 0.56714946, - "num_input_tokens_seen": 54701430, - "step": 2517, - "time_per_iteration": 3.2293660640716553 - }, - { - "auxiliary_loss_clip": 0.01110142, - "auxiliary_loss_mlp": 0.0104362, - "balance_loss_clip": 1.04499209, - "balance_loss_mlp": 1.02427244, - "epoch": 0.15139035021794678, - "flos": 19135827342720.0, - "grad_norm": 3.035771526476276, - "language_loss": 0.78264821, - "learning_rate": 3.8474048961577995e-06, - "loss": 0.80418587, - "num_input_tokens_seen": 54720845, - "step": 2518, - "time_per_iteration": 2.8154754638671875 - }, - { - "auxiliary_loss_clip": 0.01147342, - "auxiliary_loss_mlp": 0.01056368, - "balance_loss_clip": 1.05279088, - "balance_loss_mlp": 1.03681803, - "epoch": 0.15145047347061477, - "flos": 26578564842240.0, - "grad_norm": 2.1881526177791097, - "language_loss": 0.70480245, - "learning_rate": 3.847255654205137e-06, - "loss": 0.72683954, - "num_input_tokens_seen": 54740495, - "step": 2519, - "time_per_iteration": 2.7098515033721924 - }, - { - "auxiliary_loss_clip": 0.01152463, - "auxiliary_loss_mlp": 0.01056975, - "balance_loss_clip": 1.05683672, - "balance_loss_mlp": 1.03802037, - "epoch": 0.15151059672328274, - "flos": 20302959962880.0, - "grad_norm": 1.9048594994100874, - "language_loss": 0.78681207, - "learning_rate": 3.847106342204354e-06, - "loss": 0.80890644, - "num_input_tokens_seen": 54758415, - "step": 2520, - "time_per_iteration": 2.664187431335449 - }, - { - "auxiliary_loss_clip": 0.01140573, - "auxiliary_loss_mlp": 0.01071607, - "balance_loss_clip": 1.05435348, - "balance_loss_mlp": 1.05244994, - "epoch": 0.1515707199759507, - "flos": 27228367831680.0, - "grad_norm": 3.950911503454746, - "language_loss": 0.74849677, - "learning_rate": 3.846956960161114e-06, - "loss": 0.77061862, - "num_input_tokens_seen": 54779355, - "step": 2521, - "time_per_iteration": 2.7900772094726562 - }, - { - "auxiliary_loss_clip": 0.01132038, - "auxiliary_loss_mlp": 0.01055874, - "balance_loss_clip": 1.05052209, - "balance_loss_mlp": 1.0360136, - "epoch": 0.15163084322861867, - "flos": 23587349806080.0, - "grad_norm": 4.620979243079986, - "language_loss": 0.8253814, - "learning_rate": 3.84680750808108e-06, - "loss": 0.84726053, - "num_input_tokens_seen": 54799465, - "step": 2522, - "time_per_iteration": 2.7216525077819824 - }, - { - "auxiliary_loss_clip": 0.01051858, - "auxiliary_loss_mlp": 0.01048797, - "balance_loss_clip": 1.05645704, - "balance_loss_mlp": 1.04595995, - "epoch": 0.15169096648128663, - "flos": 66889622021760.0, - "grad_norm": 0.8362305181264502, - "language_loss": 0.57885599, - "learning_rate": 3.846657985969922e-06, - "loss": 0.59986252, - "num_input_tokens_seen": 54857665, - "step": 2523, - "time_per_iteration": 3.2375056743621826 - }, - { - "auxiliary_loss_clip": 0.0114147, - "auxiliary_loss_mlp": 0.01057964, - "balance_loss_clip": 1.05213499, - "balance_loss_mlp": 1.0368042, - "epoch": 0.1517510897339546, - "flos": 29095435848960.0, - "grad_norm": 1.8054087157705183, - "language_loss": 0.74795163, - "learning_rate": 3.8465083938333066e-06, - "loss": 0.76994598, - "num_input_tokens_seen": 54879895, - "step": 2524, - "time_per_iteration": 2.711557388305664 - }, - { - "auxiliary_loss_clip": 0.01138185, - "auxiliary_loss_mlp": 0.01057236, - "balance_loss_clip": 1.05304718, - "balance_loss_mlp": 1.03865099, - "epoch": 0.1518112129866226, - "flos": 18406553512320.0, - "grad_norm": 1.8255227790100423, - "language_loss": 0.74631184, - "learning_rate": 3.8463587316769085e-06, - "loss": 0.76826608, - "num_input_tokens_seen": 54898245, - "step": 2525, - "time_per_iteration": 2.6936984062194824 - }, - { - "auxiliary_loss_clip": 0.01144047, - "auxiliary_loss_mlp": 0.01057009, - "balance_loss_clip": 1.05403006, - "balance_loss_mlp": 1.03747034, - "epoch": 0.15187133623929056, - "flos": 19425410789760.0, - "grad_norm": 1.8907352833287865, - "language_loss": 0.79600316, - "learning_rate": 3.846208999506402e-06, - "loss": 0.81801373, - "num_input_tokens_seen": 54917060, - "step": 2526, - "time_per_iteration": 2.651494264602661 - }, - { - "auxiliary_loss_clip": 0.01135228, - "auxiliary_loss_mlp": 0.01047798, - "balance_loss_clip": 1.05538774, - "balance_loss_mlp": 1.03056002, - "epoch": 0.15193145949195852, - "flos": 17566207850880.0, - "grad_norm": 1.7677336965262924, - "language_loss": 0.8443349, - "learning_rate": 3.846059197327466e-06, - "loss": 0.86616516, - "num_input_tokens_seen": 54936365, - "step": 2527, - "time_per_iteration": 2.702683448791504 - }, - { - "auxiliary_loss_clip": 0.01124925, - "auxiliary_loss_mlp": 0.01049207, - "balance_loss_clip": 1.04976487, - "balance_loss_mlp": 1.02985954, - "epoch": 0.15199158274462649, - "flos": 36176265866880.0, - "grad_norm": 1.85678489681458, - "language_loss": 0.69361663, - "learning_rate": 3.845909325145779e-06, - "loss": 0.7153579, - "num_input_tokens_seen": 54961365, - "step": 2528, - "time_per_iteration": 2.9250690937042236 - }, - { - "auxiliary_loss_clip": 0.01134092, - "auxiliary_loss_mlp": 0.01055056, - "balance_loss_clip": 1.05266535, - "balance_loss_mlp": 1.03587484, - "epoch": 0.15205170599729445, - "flos": 23074042498560.0, - "grad_norm": 2.004144148858156, - "language_loss": 0.86482549, - "learning_rate": 3.845759382967026e-06, - "loss": 0.88671696, - "num_input_tokens_seen": 54980750, - "step": 2529, - "time_per_iteration": 2.7277863025665283 - }, - { - "auxiliary_loss_clip": 0.01124798, - "auxiliary_loss_mlp": 0.01041651, - "balance_loss_clip": 1.05046487, - "balance_loss_mlp": 1.02297091, - "epoch": 0.15211182924996242, - "flos": 21908382336000.0, - "grad_norm": 2.544775548600603, - "language_loss": 0.83399373, - "learning_rate": 3.845609370796893e-06, - "loss": 0.85565823, - "num_input_tokens_seen": 54999675, - "step": 2530, - "time_per_iteration": 2.8717291355133057 - }, - { - "auxiliary_loss_clip": 0.01125761, - "auxiliary_loss_mlp": 0.01048121, - "balance_loss_clip": 1.05035281, - "balance_loss_mlp": 1.02940559, - "epoch": 0.15217195250263038, - "flos": 13881521865600.0, - "grad_norm": 2.1410437006568723, - "language_loss": 0.80404246, - "learning_rate": 3.845459288641066e-06, - "loss": 0.82578129, - "num_input_tokens_seen": 55018295, - "step": 2531, - "time_per_iteration": 2.8444995880126953 - }, - { - "auxiliary_loss_clip": 0.01143114, - "auxiliary_loss_mlp": 0.01043494, - "balance_loss_clip": 1.05216551, - "balance_loss_mlp": 1.02613723, - "epoch": 0.15223207575529837, - "flos": 24535319592960.0, - "grad_norm": 1.7922494378130023, - "language_loss": 0.78874445, - "learning_rate": 3.8453091365052394e-06, - "loss": 0.81061059, - "num_input_tokens_seen": 55037975, - "step": 2532, - "time_per_iteration": 2.9122390747070312 - }, - { - "auxiliary_loss_clip": 0.01149502, - "auxiliary_loss_mlp": 0.0104596, - "balance_loss_clip": 1.05737543, - "balance_loss_mlp": 1.02676702, - "epoch": 0.15229219900796634, - "flos": 25556798563200.0, - "grad_norm": 1.9533698136575197, - "language_loss": 0.87679356, - "learning_rate": 3.845158914395105e-06, - "loss": 0.89874816, - "num_input_tokens_seen": 55057135, - "step": 2533, - "time_per_iteration": 2.7987985610961914 - }, - { - "auxiliary_loss_clip": 0.01117955, - "auxiliary_loss_mlp": 0.01048672, - "balance_loss_clip": 1.05235386, - "balance_loss_mlp": 1.02983665, - "epoch": 0.1523523222606343, - "flos": 18217806520320.0, - "grad_norm": 2.391026063452041, - "language_loss": 0.78886449, - "learning_rate": 3.84500862231636e-06, - "loss": 0.81053078, - "num_input_tokens_seen": 55075525, - "step": 2534, - "time_per_iteration": 2.7587406635284424 - }, - { - "auxiliary_loss_clip": 0.01164218, - "auxiliary_loss_mlp": 0.0104722, - "balance_loss_clip": 1.05609345, - "balance_loss_mlp": 1.0270381, - "epoch": 0.15241244551330227, - "flos": 13260087642240.0, - "grad_norm": 2.689732363294508, - "language_loss": 0.76809752, - "learning_rate": 3.844858260274702e-06, - "loss": 0.79021192, - "num_input_tokens_seen": 55090845, - "step": 2535, - "time_per_iteration": 2.7494406700134277 - }, - { - "auxiliary_loss_clip": 0.01142628, - "auxiliary_loss_mlp": 0.01042905, - "balance_loss_clip": 1.05345285, - "balance_loss_mlp": 1.02401042, - "epoch": 0.15247256876597023, - "flos": 19715568854400.0, - "grad_norm": 2.2235871255319446, - "language_loss": 0.78301942, - "learning_rate": 3.844707828275835e-06, - "loss": 0.80487478, - "num_input_tokens_seen": 55108750, - "step": 2536, - "time_per_iteration": 2.738638401031494 - }, - { - "auxiliary_loss_clip": 0.01128919, - "auxiliary_loss_mlp": 0.0105368, - "balance_loss_clip": 1.05349088, - "balance_loss_mlp": 1.03497589, - "epoch": 0.1525326920186382, - "flos": 20375858615040.0, - "grad_norm": 2.311649941233105, - "language_loss": 0.75824189, - "learning_rate": 3.844557326325461e-06, - "loss": 0.78006792, - "num_input_tokens_seen": 55126750, - "step": 2537, - "time_per_iteration": 2.632373809814453 - }, - { - "auxiliary_loss_clip": 0.0114911, - "auxiliary_loss_mlp": 0.01041421, - "balance_loss_clip": 1.05675745, - "balance_loss_mlp": 1.02331281, - "epoch": 0.15259281527130616, - "flos": 13589963170560.0, - "grad_norm": 2.193148723631548, - "language_loss": 0.77737647, - "learning_rate": 3.8444067544292896e-06, - "loss": 0.79928178, - "num_input_tokens_seen": 55144690, - "step": 2538, - "time_per_iteration": 2.6835639476776123 - }, - { - "auxiliary_loss_clip": 0.01109367, - "auxiliary_loss_mlp": 0.01042256, - "balance_loss_clip": 1.05477905, - "balance_loss_mlp": 1.02480412, - "epoch": 0.15265293852397416, - "flos": 22860374446080.0, - "grad_norm": 2.951423477379744, - "language_loss": 0.89502335, - "learning_rate": 3.844256112593029e-06, - "loss": 0.91653961, - "num_input_tokens_seen": 55166055, - "step": 2539, - "time_per_iteration": 2.7825794219970703 - }, - { - "auxiliary_loss_clip": 0.01142581, - "auxiliary_loss_mlp": 0.01045856, - "balance_loss_clip": 1.05367279, - "balance_loss_mlp": 1.02721143, - "epoch": 0.15271306177664212, - "flos": 29238108670080.0, - "grad_norm": 2.1073423273657044, - "language_loss": 0.93423879, - "learning_rate": 3.844105400822391e-06, - "loss": 0.95612311, - "num_input_tokens_seen": 55186285, - "step": 2540, - "time_per_iteration": 2.717541456222534 - }, - { - "auxiliary_loss_clip": 0.01131603, - "auxiliary_loss_mlp": 0.01041863, - "balance_loss_clip": 1.05122495, - "balance_loss_mlp": 1.0240885, - "epoch": 0.1527731850293101, - "flos": 31246269310080.0, - "grad_norm": 2.084754505375857, - "language_loss": 0.75217843, - "learning_rate": 3.843954619123092e-06, - "loss": 0.77391309, - "num_input_tokens_seen": 55207915, - "step": 2541, - "time_per_iteration": 2.8376123905181885 - }, - { - "auxiliary_loss_clip": 0.01116303, - "auxiliary_loss_mlp": 0.01045227, - "balance_loss_clip": 1.04877007, - "balance_loss_mlp": 1.0268805, - "epoch": 0.15283330828197805, - "flos": 22382079920640.0, - "grad_norm": 2.037290364787748, - "language_loss": 0.80996066, - "learning_rate": 3.84380376750085e-06, - "loss": 0.83157599, - "num_input_tokens_seen": 55227860, - "step": 2542, - "time_per_iteration": 2.7110376358032227 - }, - { - "auxiliary_loss_clip": 0.01160331, - "auxiliary_loss_mlp": 0.01048661, - "balance_loss_clip": 1.0566076, - "balance_loss_mlp": 1.02992105, - "epoch": 0.15289343153464602, - "flos": 25520133755520.0, - "grad_norm": 3.2152362880248857, - "language_loss": 0.77796149, - "learning_rate": 3.843652845961383e-06, - "loss": 0.80005145, - "num_input_tokens_seen": 55247330, - "step": 2543, - "time_per_iteration": 2.674131155014038 - }, - { - "auxiliary_loss_clip": 0.01145565, - "auxiliary_loss_mlp": 0.01042133, - "balance_loss_clip": 1.05380869, - "balance_loss_mlp": 1.02388239, - "epoch": 0.15295355478731398, - "flos": 22710016114560.0, - "grad_norm": 2.4890924021550918, - "language_loss": 0.85898137, - "learning_rate": 3.843501854510416e-06, - "loss": 0.88085836, - "num_input_tokens_seen": 55266195, - "step": 2544, - "time_per_iteration": 2.685840606689453 - }, - { - "auxiliary_loss_clip": 0.01149904, - "auxiliary_loss_mlp": 0.01051141, - "balance_loss_clip": 1.05162692, - "balance_loss_mlp": 1.03061318, - "epoch": 0.15301367803998198, - "flos": 23251907669760.0, - "grad_norm": 1.9817931887295275, - "language_loss": 0.83159137, - "learning_rate": 3.843350793153673e-06, - "loss": 0.85360181, - "num_input_tokens_seen": 55283305, - "step": 2545, - "time_per_iteration": 2.7415812015533447 - }, - { - "auxiliary_loss_clip": 0.01158976, - "auxiliary_loss_mlp": 0.01040888, - "balance_loss_clip": 1.05556524, - "balance_loss_mlp": 1.02257705, - "epoch": 0.15307380129264994, - "flos": 25886279041920.0, - "grad_norm": 6.0131413628182, - "language_loss": 0.71669161, - "learning_rate": 3.843199661896884e-06, - "loss": 0.73869026, - "num_input_tokens_seen": 55303035, - "step": 2546, - "time_per_iteration": 2.6626265048980713 - }, - { - "auxiliary_loss_clip": 0.01130357, - "auxiliary_loss_mlp": 0.01047635, - "balance_loss_clip": 1.05013335, - "balance_loss_mlp": 1.02688098, - "epoch": 0.1531339245453179, - "flos": 46973239205760.0, - "grad_norm": 1.6563553629779504, - "language_loss": 0.77438712, - "learning_rate": 3.843048460745779e-06, - "loss": 0.79616702, - "num_input_tokens_seen": 55327570, - "step": 2547, - "time_per_iteration": 4.451423168182373 - }, - { - "auxiliary_loss_clip": 0.01107553, - "auxiliary_loss_mlp": 0.01044692, - "balance_loss_clip": 1.04845536, - "balance_loss_mlp": 1.02517736, - "epoch": 0.15319404779798587, - "flos": 35882049565440.0, - "grad_norm": 2.3544675813743834, - "language_loss": 0.74357474, - "learning_rate": 3.842897189706092e-06, - "loss": 0.7650972, - "num_input_tokens_seen": 55351090, - "step": 2548, - "time_per_iteration": 2.846991539001465 - }, - { - "auxiliary_loss_clip": 0.01138346, - "auxiliary_loss_mlp": 0.0105294, - "balance_loss_clip": 1.05340147, - "balance_loss_mlp": 1.03304434, - "epoch": 0.15325417105065384, - "flos": 25664638170240.0, - "grad_norm": 1.446042531021912, - "language_loss": 0.80296385, - "learning_rate": 3.842745848783558e-06, - "loss": 0.82487667, - "num_input_tokens_seen": 55371050, - "step": 2549, - "time_per_iteration": 5.8849101066589355 - }, - { - "auxiliary_loss_clip": 0.01144858, - "auxiliary_loss_mlp": 0.01041292, - "balance_loss_clip": 1.05108786, - "balance_loss_mlp": 1.02255249, - "epoch": 0.1533142943033218, - "flos": 18770831291520.0, - "grad_norm": 1.6149920159034452, - "language_loss": 0.74602014, - "learning_rate": 3.842594437983917e-06, - "loss": 0.76788169, - "num_input_tokens_seen": 55390375, - "step": 2550, - "time_per_iteration": 2.684868812561035 - }, - { - "auxiliary_loss_clip": 0.01149823, - "auxiliary_loss_mlp": 0.01040743, - "balance_loss_clip": 1.05212283, - "balance_loss_mlp": 1.02129996, - "epoch": 0.15337441755598977, - "flos": 23107367341440.0, - "grad_norm": 2.33086854575276, - "language_loss": 0.76910275, - "learning_rate": 3.8424429573129115e-06, - "loss": 0.79100841, - "num_input_tokens_seen": 55408890, - "step": 2551, - "time_per_iteration": 4.415414333343506 - }, - { - "auxiliary_loss_clip": 0.01086721, - "auxiliary_loss_mlp": 0.01054065, - "balance_loss_clip": 1.05333817, - "balance_loss_mlp": 1.05116868, - "epoch": 0.15343454080865776, - "flos": 59861079227520.0, - "grad_norm": 0.9493148205555214, - "language_loss": 0.5665558, - "learning_rate": 3.842291406776283e-06, - "loss": 0.5879637, - "num_input_tokens_seen": 55463815, - "step": 2552, - "time_per_iteration": 3.1105730533599854 - }, - { - "auxiliary_loss_clip": 0.011128, - "auxiliary_loss_mlp": 0.01039619, - "balance_loss_clip": 1.05131924, - "balance_loss_mlp": 1.0204618, - "epoch": 0.15349466406132573, - "flos": 11910887959680.0, - "grad_norm": 2.183188616823757, - "language_loss": 0.88550794, - "learning_rate": 3.84213978637978e-06, - "loss": 0.90703207, - "num_input_tokens_seen": 55481050, - "step": 2553, - "time_per_iteration": 2.748298406600952 - }, - { - "auxiliary_loss_clip": 0.01147024, - "auxiliary_loss_mlp": 0.01042929, - "balance_loss_clip": 1.05247378, - "balance_loss_mlp": 1.0232954, - "epoch": 0.1535547873139937, - "flos": 24096922099200.0, - "grad_norm": 1.8094820084348213, - "language_loss": 0.7800495, - "learning_rate": 3.841988096129152e-06, - "loss": 0.80194902, - "num_input_tokens_seen": 55500050, - "step": 2554, - "time_per_iteration": 2.6555569171905518 - }, - { - "auxiliary_loss_clip": 0.01094445, - "auxiliary_loss_mlp": 0.01053684, - "balance_loss_clip": 1.04876757, - "balance_loss_mlp": 1.03291798, - "epoch": 0.15361491056666166, - "flos": 17566459246080.0, - "grad_norm": 2.372022486587551, - "language_loss": 0.77472258, - "learning_rate": 3.841836336030151e-06, - "loss": 0.79620385, - "num_input_tokens_seen": 55518125, - "step": 2555, - "time_per_iteration": 2.7507212162017822 - }, - { - "auxiliary_loss_clip": 0.01129555, - "auxiliary_loss_mlp": 0.01046723, - "balance_loss_clip": 1.05400753, - "balance_loss_mlp": 1.02873409, - "epoch": 0.15367503381932962, - "flos": 25046041121280.0, - "grad_norm": 1.5517643759455655, - "language_loss": 0.77453947, - "learning_rate": 3.8416845060885305e-06, - "loss": 0.79630232, - "num_input_tokens_seen": 55540960, - "step": 2556, - "time_per_iteration": 2.7947654724121094 - }, - { - "auxiliary_loss_clip": 0.01140725, - "auxiliary_loss_mlp": 0.0077646, - "balance_loss_clip": 1.05336452, - "balance_loss_mlp": 1.00054574, - "epoch": 0.15373515707199759, - "flos": 21507332008320.0, - "grad_norm": 1.8786460244833383, - "language_loss": 0.90098578, - "learning_rate": 3.84153260631005e-06, - "loss": 0.92015761, - "num_input_tokens_seen": 55559210, - "step": 2557, - "time_per_iteration": 2.702029228210449 - }, - { - "auxiliary_loss_clip": 0.01137441, - "auxiliary_loss_mlp": 0.01048546, - "balance_loss_clip": 1.05146766, - "balance_loss_mlp": 1.02862656, - "epoch": 0.15379528032466555, - "flos": 25994729180160.0, - "grad_norm": 2.4046585493240102, - "language_loss": 0.7092281, - "learning_rate": 3.841380636700468e-06, - "loss": 0.73108798, - "num_input_tokens_seen": 55578925, - "step": 2558, - "time_per_iteration": 2.815653085708618 - }, - { - "auxiliary_loss_clip": 0.01131603, - "auxiliary_loss_mlp": 0.01045983, - "balance_loss_clip": 1.04937947, - "balance_loss_mlp": 1.02659965, - "epoch": 0.15385540357733354, - "flos": 19277315015040.0, - "grad_norm": 2.1050139676488535, - "language_loss": 0.92165422, - "learning_rate": 3.841228597265548e-06, - "loss": 0.94343007, - "num_input_tokens_seen": 55597255, - "step": 2559, - "time_per_iteration": 2.7363967895507812 - }, - { - "auxiliary_loss_clip": 0.011375, - "auxiliary_loss_mlp": 0.01057878, - "balance_loss_clip": 1.05492043, - "balance_loss_mlp": 1.03711152, - "epoch": 0.1539155268300015, - "flos": 28549126920960.0, - "grad_norm": 2.149412909113977, - "language_loss": 0.63330692, - "learning_rate": 3.841076488011055e-06, - "loss": 0.65526068, - "num_input_tokens_seen": 55619515, - "step": 2560, - "time_per_iteration": 2.811800003051758 - }, - { - "auxiliary_loss_clip": 0.01132154, - "auxiliary_loss_mlp": 0.01043974, - "balance_loss_clip": 1.04914606, - "balance_loss_mlp": 1.02416182, - "epoch": 0.15397565008266947, - "flos": 23547883737600.0, - "grad_norm": 2.066473237183783, - "language_loss": 0.88155699, - "learning_rate": 3.8409243089427574e-06, - "loss": 0.90331829, - "num_input_tokens_seen": 55640050, - "step": 2561, - "time_per_iteration": 2.7991089820861816 - }, - { - "auxiliary_loss_clip": 0.0114054, - "auxiliary_loss_mlp": 0.01041879, - "balance_loss_clip": 1.05085099, - "balance_loss_mlp": 1.02380693, - "epoch": 0.15403577333533744, - "flos": 17129821518720.0, - "grad_norm": 1.906051405357337, - "language_loss": 0.83117974, - "learning_rate": 3.840772060066425e-06, - "loss": 0.85300398, - "num_input_tokens_seen": 55658695, - "step": 2562, - "time_per_iteration": 2.6410810947418213 - }, - { - "auxiliary_loss_clip": 0.01128756, - "auxiliary_loss_mlp": 0.00778205, - "balance_loss_clip": 1.04988563, - "balance_loss_mlp": 1.00058532, - "epoch": 0.1540958965880054, - "flos": 17894503180800.0, - "grad_norm": 2.3547297997270906, - "language_loss": 0.74647415, - "learning_rate": 3.840619741387832e-06, - "loss": 0.76554382, - "num_input_tokens_seen": 55676340, - "step": 2563, - "time_per_iteration": 2.6813745498657227 - }, - { - "auxiliary_loss_clip": 0.01116857, - "auxiliary_loss_mlp": 0.0104411, - "balance_loss_clip": 1.05126941, - "balance_loss_mlp": 1.02444029, - "epoch": 0.15415601984067337, - "flos": 32161057908480.0, - "grad_norm": 2.842824767177756, - "language_loss": 0.7609179, - "learning_rate": 3.8404673529127534e-06, - "loss": 0.78252757, - "num_input_tokens_seen": 55698890, - "step": 2564, - "time_per_iteration": 2.832885265350342 - }, - { - "auxiliary_loss_clip": 0.01133461, - "auxiliary_loss_mlp": 0.01052887, - "balance_loss_clip": 1.05174518, - "balance_loss_mlp": 1.03443313, - "epoch": 0.15421614309334136, - "flos": 24024418496640.0, - "grad_norm": 2.0125869911748575, - "language_loss": 0.70960921, - "learning_rate": 3.840314894646969e-06, - "loss": 0.73147273, - "num_input_tokens_seen": 55718535, - "step": 2565, - "time_per_iteration": 2.7352514266967773 - }, - { - "auxiliary_loss_clip": 0.01137766, - "auxiliary_loss_mlp": 0.01046908, - "balance_loss_clip": 1.04731965, - "balance_loss_mlp": 1.02787066, - "epoch": 0.15427626634600933, - "flos": 24386290064640.0, - "grad_norm": 2.1021891280826965, - "language_loss": 0.71605748, - "learning_rate": 3.840162366596259e-06, - "loss": 0.73790431, - "num_input_tokens_seen": 55738970, - "step": 2566, - "time_per_iteration": 2.681710720062256 - }, - { - "auxiliary_loss_clip": 0.01150619, - "auxiliary_loss_mlp": 0.01040725, - "balance_loss_clip": 1.04834008, - "balance_loss_mlp": 1.02271223, - "epoch": 0.1543363895986773, - "flos": 23331522165120.0, - "grad_norm": 1.7167104030167524, - "language_loss": 0.84746087, - "learning_rate": 3.840009768766408e-06, - "loss": 0.86937428, - "num_input_tokens_seen": 55759585, - "step": 2567, - "time_per_iteration": 2.6413686275482178 - }, - { - "auxiliary_loss_clip": 0.01104646, - "auxiliary_loss_mlp": 0.01050344, - "balance_loss_clip": 1.04447246, - "balance_loss_mlp": 1.03164053, - "epoch": 0.15439651285134526, - "flos": 24274284480000.0, - "grad_norm": 2.9101336164483014, - "language_loss": 0.78074998, - "learning_rate": 3.839857101163202e-06, - "loss": 0.80229992, - "num_input_tokens_seen": 55779250, - "step": 2568, - "time_per_iteration": 2.7385261058807373 - }, - { - "auxiliary_loss_clip": 0.01121993, - "auxiliary_loss_mlp": 0.01037084, - "balance_loss_clip": 1.04715753, - "balance_loss_mlp": 1.01684201, - "epoch": 0.15445663610401322, - "flos": 22456163721600.0, - "grad_norm": 1.852436867559063, - "language_loss": 0.6991998, - "learning_rate": 3.83970436379243e-06, - "loss": 0.72079051, - "num_input_tokens_seen": 55800470, - "step": 2569, - "time_per_iteration": 2.746974229812622 - }, - { - "auxiliary_loss_clip": 0.01124209, - "auxiliary_loss_mlp": 0.01040299, - "balance_loss_clip": 1.04695952, - "balance_loss_mlp": 1.02178574, - "epoch": 0.1545167593566812, - "flos": 22049510872320.0, - "grad_norm": 1.7212875994527412, - "language_loss": 0.76482332, - "learning_rate": 3.839551556659884e-06, - "loss": 0.78646845, - "num_input_tokens_seen": 55817795, - "step": 2570, - "time_per_iteration": 2.7470619678497314 - }, - { - "auxiliary_loss_clip": 0.01137702, - "auxiliary_loss_mlp": 0.01038561, - "balance_loss_clip": 1.04993737, - "balance_loss_mlp": 1.0192852, - "epoch": 0.15457688260934915, - "flos": 19318253541120.0, - "grad_norm": 2.5033166184578066, - "language_loss": 0.77997506, - "learning_rate": 3.839398679771359e-06, - "loss": 0.80173767, - "num_input_tokens_seen": 55836125, - "step": 2571, - "time_per_iteration": 2.692863702774048 - }, - { - "auxiliary_loss_clip": 0.0113208, - "auxiliary_loss_mlp": 0.0104519, - "balance_loss_clip": 1.0498451, - "balance_loss_mlp": 1.02704597, - "epoch": 0.15463700586201715, - "flos": 24133981956480.0, - "grad_norm": 4.3242380509309015, - "language_loss": 0.82932413, - "learning_rate": 3.839245733132652e-06, - "loss": 0.85109681, - "num_input_tokens_seen": 55855280, - "step": 2572, - "time_per_iteration": 2.8341822624206543 - }, - { - "auxiliary_loss_clip": 0.01156188, - "auxiliary_loss_mlp": 0.01042592, - "balance_loss_clip": 1.05181205, - "balance_loss_mlp": 1.02383995, - "epoch": 0.1546971291146851, - "flos": 22420935457920.0, - "grad_norm": 1.5874704718869805, - "language_loss": 0.90373385, - "learning_rate": 3.839092716749563e-06, - "loss": 0.92572165, - "num_input_tokens_seen": 55875695, - "step": 2573, - "time_per_iteration": 2.740121364593506 - }, - { - "auxiliary_loss_clip": 0.01088424, - "auxiliary_loss_mlp": 0.01049893, - "balance_loss_clip": 1.04328668, - "balance_loss_mlp": 1.03003311, - "epoch": 0.15475725236735308, - "flos": 17530225401600.0, - "grad_norm": 1.596795561637076, - "language_loss": 0.70298707, - "learning_rate": 3.838939630627893e-06, - "loss": 0.72437024, - "num_input_tokens_seen": 55894575, - "step": 2574, - "time_per_iteration": 2.7629144191741943 - }, - { - "auxiliary_loss_clip": 0.01127537, - "auxiliary_loss_mlp": 0.01045732, - "balance_loss_clip": 1.04714394, - "balance_loss_mlp": 1.02509642, - "epoch": 0.15481737562002104, - "flos": 22561740771840.0, - "grad_norm": 6.018921028505516, - "language_loss": 0.82426423, - "learning_rate": 3.838786474773448e-06, - "loss": 0.84599686, - "num_input_tokens_seen": 55912855, - "step": 2575, - "time_per_iteration": 2.656783103942871 - }, - { - "auxiliary_loss_clip": 0.01127415, - "auxiliary_loss_mlp": 0.01043354, - "balance_loss_clip": 1.04681587, - "balance_loss_mlp": 1.02584219, - "epoch": 0.154877498872689, - "flos": 24900567039360.0, - "grad_norm": 1.8376318938002576, - "language_loss": 0.85038638, - "learning_rate": 3.838633249192036e-06, - "loss": 0.87209404, - "num_input_tokens_seen": 55932375, - "step": 2576, - "time_per_iteration": 2.648484230041504 - }, - { - "auxiliary_loss_clip": 0.01152547, - "auxiliary_loss_mlp": 0.01043401, - "balance_loss_clip": 1.04872847, - "balance_loss_mlp": 1.02499545, - "epoch": 0.15493762212535697, - "flos": 28147501975680.0, - "grad_norm": 1.8027999188827728, - "language_loss": 0.82271254, - "learning_rate": 3.838479953889465e-06, - "loss": 0.84467208, - "num_input_tokens_seen": 55953970, - "step": 2577, - "time_per_iteration": 2.6355643272399902 - }, - { - "auxiliary_loss_clip": 0.01126009, - "auxiliary_loss_mlp": 0.01049018, - "balance_loss_clip": 1.05147958, - "balance_loss_mlp": 1.02984881, - "epoch": 0.15499774537802496, - "flos": 25411073086080.0, - "grad_norm": 2.1677069711314463, - "language_loss": 0.76556361, - "learning_rate": 3.8383265888715525e-06, - "loss": 0.78731394, - "num_input_tokens_seen": 55973120, - "step": 2578, - "time_per_iteration": 2.649043560028076 - }, - { - "auxiliary_loss_clip": 0.01123677, - "auxiliary_loss_mlp": 0.01044461, - "balance_loss_clip": 1.05155993, - "balance_loss_mlp": 1.0253042, - "epoch": 0.15505786863069293, - "flos": 22091562720000.0, - "grad_norm": 1.9614380224881987, - "language_loss": 0.82443559, - "learning_rate": 3.83817315414411e-06, - "loss": 0.8461169, - "num_input_tokens_seen": 55993260, - "step": 2579, - "time_per_iteration": 2.62631893157959 - }, - { - "auxiliary_loss_clip": 0.01143904, - "auxiliary_loss_mlp": 0.01044324, - "balance_loss_clip": 1.05856657, - "balance_loss_mlp": 1.02556014, - "epoch": 0.1551179918833609, - "flos": 18917131386240.0, - "grad_norm": 2.610374735790095, - "language_loss": 0.80465376, - "learning_rate": 3.838019649712958e-06, - "loss": 0.82653606, - "num_input_tokens_seen": 56012130, - "step": 2580, - "time_per_iteration": 2.6512253284454346 - }, - { - "auxiliary_loss_clip": 0.0107737, - "auxiliary_loss_mlp": 0.01006304, - "balance_loss_clip": 1.04551053, - "balance_loss_mlp": 1.00360954, - "epoch": 0.15517811513602886, - "flos": 66239172587520.0, - "grad_norm": 0.842131683094019, - "language_loss": 0.58823448, - "learning_rate": 3.8378660755839166e-06, - "loss": 0.60907125, - "num_input_tokens_seen": 56079045, - "step": 2581, - "time_per_iteration": 3.357855796813965 - }, - { - "auxiliary_loss_clip": 0.01108206, - "auxiliary_loss_mlp": 0.01047031, - "balance_loss_clip": 1.04392648, - "balance_loss_mlp": 1.0249418, - "epoch": 0.15523823838869683, - "flos": 24021078531840.0, - "grad_norm": 1.9584677228939371, - "language_loss": 0.84773678, - "learning_rate": 3.8377124317628095e-06, - "loss": 0.86928916, - "num_input_tokens_seen": 56098745, - "step": 2582, - "time_per_iteration": 2.727062702178955 - }, - { - "auxiliary_loss_clip": 0.01144131, - "auxiliary_loss_mlp": 0.01051911, - "balance_loss_clip": 1.05233002, - "balance_loss_mlp": 1.03175235, - "epoch": 0.1552983616413648, - "flos": 20485062938880.0, - "grad_norm": 2.466663791870015, - "language_loss": 0.79050052, - "learning_rate": 3.8375587182554625e-06, - "loss": 0.81246096, - "num_input_tokens_seen": 56117655, - "step": 2583, - "time_per_iteration": 2.664794683456421 - }, - { - "auxiliary_loss_clip": 0.01139818, - "auxiliary_loss_mlp": 0.01054771, - "balance_loss_clip": 1.04957032, - "balance_loss_mlp": 1.03252697, - "epoch": 0.15535848489403276, - "flos": 32123710742400.0, - "grad_norm": 1.8743170599575527, - "language_loss": 0.76320136, - "learning_rate": 3.837404935067705e-06, - "loss": 0.78514719, - "num_input_tokens_seen": 56141960, - "step": 2584, - "time_per_iteration": 2.757392168045044 - }, - { - "auxiliary_loss_clip": 0.01137324, - "auxiliary_loss_mlp": 0.01042496, - "balance_loss_clip": 1.04884958, - "balance_loss_mlp": 1.02302885, - "epoch": 0.15541860814670075, - "flos": 19098444263040.0, - "grad_norm": 1.6493041410587026, - "language_loss": 0.75269651, - "learning_rate": 3.837251082205368e-06, - "loss": 0.77449471, - "num_input_tokens_seen": 56161430, - "step": 2585, - "time_per_iteration": 2.6497461795806885 - }, - { - "auxiliary_loss_clip": 0.01116144, - "auxiliary_loss_mlp": 0.01042356, - "balance_loss_clip": 1.04862189, - "balance_loss_mlp": 1.02321053, - "epoch": 0.1554787313993687, - "flos": 19172097100800.0, - "grad_norm": 2.068989677221064, - "language_loss": 0.61187196, - "learning_rate": 3.837097159674286e-06, - "loss": 0.63345695, - "num_input_tokens_seen": 56179390, - "step": 2586, - "time_per_iteration": 2.697852373123169 - }, - { - "auxiliary_loss_clip": 0.01129408, - "auxiliary_loss_mlp": 0.01042187, - "balance_loss_clip": 1.04842281, - "balance_loss_mlp": 1.02341127, - "epoch": 0.15553885465203668, - "flos": 16143822207360.0, - "grad_norm": 1.8484108176722505, - "language_loss": 0.81318939, - "learning_rate": 3.836943167480296e-06, - "loss": 0.83490539, - "num_input_tokens_seen": 56198020, - "step": 2587, - "time_per_iteration": 4.212551593780518 - }, - { - "auxiliary_loss_clip": 0.01160891, - "auxiliary_loss_mlp": 0.01054822, - "balance_loss_clip": 1.05309868, - "balance_loss_mlp": 1.03325701, - "epoch": 0.15559897790470464, - "flos": 25337779384320.0, - "grad_norm": 1.866779523391448, - "language_loss": 0.88716942, - "learning_rate": 3.836789105629236e-06, - "loss": 0.90932655, - "num_input_tokens_seen": 56218165, - "step": 2588, - "time_per_iteration": 4.192267894744873 - }, - { - "auxiliary_loss_clip": 0.01094981, - "auxiliary_loss_mlp": 0.01052123, - "balance_loss_clip": 1.04558384, - "balance_loss_mlp": 1.03164268, - "epoch": 0.1556591011573726, - "flos": 23148772744320.0, - "grad_norm": 2.018423224363699, - "language_loss": 0.64624381, - "learning_rate": 3.83663497412695e-06, - "loss": 0.66771483, - "num_input_tokens_seen": 56237160, - "step": 2589, - "time_per_iteration": 4.303871154785156 - }, - { - "auxiliary_loss_clip": 0.01104407, - "auxiliary_loss_mlp": 0.01041976, - "balance_loss_clip": 1.04520249, - "balance_loss_mlp": 1.02123344, - "epoch": 0.15571922441004057, - "flos": 25370888745600.0, - "grad_norm": 1.784618480549341, - "language_loss": 0.82832813, - "learning_rate": 3.836480772979281e-06, - "loss": 0.84979194, - "num_input_tokens_seen": 56257610, - "step": 2590, - "time_per_iteration": 4.460350751876831 - }, - { - "auxiliary_loss_clip": 0.011248, - "auxiliary_loss_mlp": 0.01047287, - "balance_loss_clip": 1.05032134, - "balance_loss_mlp": 1.02694952, - "epoch": 0.15577934766270854, - "flos": 14501375890560.0, - "grad_norm": 2.6687659077907484, - "language_loss": 0.78766, - "learning_rate": 3.836326502192077e-06, - "loss": 0.80938083, - "num_input_tokens_seen": 56275215, - "step": 2591, - "time_per_iteration": 2.73305606842041 - }, - { - "auxiliary_loss_clip": 0.01143879, - "auxiliary_loss_mlp": 0.01049015, - "balance_loss_clip": 1.05174232, - "balance_loss_mlp": 1.03137255, - "epoch": 0.15583947091537653, - "flos": 37414537372800.0, - "grad_norm": 2.0331558547393054, - "language_loss": 0.65025747, - "learning_rate": 3.836172161771189e-06, - "loss": 0.67218637, - "num_input_tokens_seen": 56297130, - "step": 2592, - "time_per_iteration": 2.8582632541656494 - }, - { - "auxiliary_loss_clip": 0.01136043, - "auxiliary_loss_mlp": 0.01052096, - "balance_loss_clip": 1.05417228, - "balance_loss_mlp": 1.0322001, - "epoch": 0.1558995941680445, - "flos": 21834729498240.0, - "grad_norm": 2.311634250072179, - "language_loss": 0.82506329, - "learning_rate": 3.836017751722467e-06, - "loss": 0.84694475, - "num_input_tokens_seen": 56314995, - "step": 2593, - "time_per_iteration": 2.7230453491210938 - }, - { - "auxiliary_loss_clip": 0.01142565, - "auxiliary_loss_mlp": 0.01046037, - "balance_loss_clip": 1.05237365, - "balance_loss_mlp": 1.02676034, - "epoch": 0.15595971742071246, - "flos": 19792633484160.0, - "grad_norm": 2.778410683125911, - "language_loss": 0.73220694, - "learning_rate": 3.8358632720517695e-06, - "loss": 0.75409293, - "num_input_tokens_seen": 56334005, - "step": 2594, - "time_per_iteration": 2.708063840866089 - }, - { - "auxiliary_loss_clip": 0.01117989, - "auxiliary_loss_mlp": 0.01040106, - "balance_loss_clip": 1.0453043, - "balance_loss_mlp": 1.02077007, - "epoch": 0.15601984067338043, - "flos": 26722135503360.0, - "grad_norm": 2.1444704922101105, - "language_loss": 0.81569934, - "learning_rate": 3.835708722764952e-06, - "loss": 0.83728027, - "num_input_tokens_seen": 56353795, - "step": 2595, - "time_per_iteration": 2.716334581375122 - }, - { - "auxiliary_loss_clip": 0.01155359, - "auxiliary_loss_mlp": 0.01043269, - "balance_loss_clip": 1.05093551, - "balance_loss_mlp": 1.0238502, - "epoch": 0.1560799639260484, - "flos": 18369278173440.0, - "grad_norm": 1.8943501893042642, - "language_loss": 0.86674929, - "learning_rate": 3.835554103867876e-06, - "loss": 0.88873553, - "num_input_tokens_seen": 56373195, - "step": 2596, - "time_per_iteration": 2.5947446823120117 - }, - { - "auxiliary_loss_clip": 0.01144729, - "auxiliary_loss_mlp": 0.01042109, - "balance_loss_clip": 1.05225515, - "balance_loss_mlp": 1.02360725, - "epoch": 0.15614008717871636, - "flos": 22598980197120.0, - "grad_norm": 1.8059460934517404, - "language_loss": 0.68772388, - "learning_rate": 3.835399415366404e-06, - "loss": 0.70959222, - "num_input_tokens_seen": 56391525, - "step": 2597, - "time_per_iteration": 2.8101041316986084 - }, - { - "auxiliary_loss_clip": 0.01130069, - "auxiliary_loss_mlp": 0.01050835, - "balance_loss_clip": 1.05409336, - "balance_loss_mlp": 1.03165436, - "epoch": 0.15620021043138435, - "flos": 22746860490240.0, - "grad_norm": 1.9103744906429732, - "language_loss": 0.79860938, - "learning_rate": 3.8352446572664035e-06, - "loss": 0.82041842, - "num_input_tokens_seen": 56410715, - "step": 2598, - "time_per_iteration": 2.695117950439453 - }, - { - "auxiliary_loss_clip": 0.0112861, - "auxiliary_loss_mlp": 0.00776118, - "balance_loss_clip": 1.04750216, - "balance_loss_mlp": 1.0006249, - "epoch": 0.15626033368405232, - "flos": 13114936782720.0, - "grad_norm": 3.1104681024188827, - "language_loss": 0.83092594, - "learning_rate": 3.8350898295737405e-06, - "loss": 0.84997326, - "num_input_tokens_seen": 56429170, - "step": 2599, - "time_per_iteration": 2.665703773498535 - }, - { - "auxiliary_loss_clip": 0.01160593, - "auxiliary_loss_mlp": 0.0105002, - "balance_loss_clip": 1.05274248, - "balance_loss_mlp": 1.02924192, - "epoch": 0.15632045693672028, - "flos": 16472297105280.0, - "grad_norm": 2.2910683048406266, - "language_loss": 0.81530893, - "learning_rate": 3.834934932294287e-06, - "loss": 0.83741504, - "num_input_tokens_seen": 56445685, - "step": 2600, - "time_per_iteration": 2.615651845932007 - }, - { - "auxiliary_loss_clip": 0.01161023, - "auxiliary_loss_mlp": 0.00776671, - "balance_loss_clip": 1.05562234, - "balance_loss_mlp": 1.00063944, - "epoch": 0.15638058018938825, - "flos": 20850346298880.0, - "grad_norm": 1.7832591469657297, - "language_loss": 0.88511437, - "learning_rate": 3.834779965433917e-06, - "loss": 0.90449131, - "num_input_tokens_seen": 56465900, - "step": 2601, - "time_per_iteration": 2.6833529472351074 - }, - { - "auxiliary_loss_clip": 0.0116257, - "auxiliary_loss_mlp": 0.0106307, - "balance_loss_clip": 1.05569744, - "balance_loss_mlp": 1.04120743, - "epoch": 0.1564407034420562, - "flos": 21872220318720.0, - "grad_norm": 1.9421054688538308, - "language_loss": 0.78707534, - "learning_rate": 3.834624928998508e-06, - "loss": 0.80933177, - "num_input_tokens_seen": 56485020, - "step": 2602, - "time_per_iteration": 2.6296608448028564 - }, - { - "auxiliary_loss_clip": 0.01126653, - "auxiliary_loss_mlp": 0.01043676, - "balance_loss_clip": 1.05035329, - "balance_loss_mlp": 1.02419758, - "epoch": 0.15650082669472418, - "flos": 21834549930240.0, - "grad_norm": 1.8230718276715763, - "language_loss": 0.74029547, - "learning_rate": 3.8344698229939376e-06, - "loss": 0.76199877, - "num_input_tokens_seen": 56505205, - "step": 2603, - "time_per_iteration": 2.744508743286133 - }, - { - "auxiliary_loss_clip": 0.01143305, - "auxiliary_loss_mlp": 0.01051047, - "balance_loss_clip": 1.04820418, - "balance_loss_mlp": 1.03112721, - "epoch": 0.15656094994739214, - "flos": 13800542653440.0, - "grad_norm": 4.041164356714064, - "language_loss": 0.87723601, - "learning_rate": 3.8343146474260865e-06, - "loss": 0.89917958, - "num_input_tokens_seen": 56521495, - "step": 2604, - "time_per_iteration": 2.682457447052002 - }, - { - "auxiliary_loss_clip": 0.01145351, - "auxiliary_loss_mlp": 0.01044759, - "balance_loss_clip": 1.04976749, - "balance_loss_mlp": 1.0256021, - "epoch": 0.15662107320006013, - "flos": 27308197808640.0, - "grad_norm": 2.260429022209425, - "language_loss": 0.8573193, - "learning_rate": 3.834159402300841e-06, - "loss": 0.87922043, - "num_input_tokens_seen": 56540665, - "step": 2605, - "time_per_iteration": 2.7724974155426025 - }, - { - "auxiliary_loss_clip": 0.0115108, - "auxiliary_loss_mlp": 0.01047256, - "balance_loss_clip": 1.05181313, - "balance_loss_mlp": 1.02676356, - "epoch": 0.1566811964527281, - "flos": 26685075646080.0, - "grad_norm": 1.7309636492693905, - "language_loss": 0.73101914, - "learning_rate": 3.834004087624087e-06, - "loss": 0.75300246, - "num_input_tokens_seen": 56560805, - "step": 2606, - "time_per_iteration": 2.7490081787109375 - }, - { - "auxiliary_loss_clip": 0.01158388, - "auxiliary_loss_mlp": 0.01049752, - "balance_loss_clip": 1.0552665, - "balance_loss_mlp": 1.03165627, - "epoch": 0.15674131970539606, - "flos": 16103422385280.0, - "grad_norm": 2.968092109370304, - "language_loss": 0.76497948, - "learning_rate": 3.8338487034017145e-06, - "loss": 0.78706092, - "num_input_tokens_seen": 56576335, - "step": 2607, - "time_per_iteration": 2.6597230434417725 - }, - { - "auxiliary_loss_clip": 0.01120645, - "auxiliary_loss_mlp": 0.01047174, - "balance_loss_clip": 1.05131412, - "balance_loss_mlp": 1.0284934, - "epoch": 0.15680144295806403, - "flos": 19169690889600.0, - "grad_norm": 1.7981763092074996, - "language_loss": 0.82107675, - "learning_rate": 3.833693249639615e-06, - "loss": 0.84275496, - "num_input_tokens_seen": 56595880, - "step": 2608, - "time_per_iteration": 2.7072103023529053 - }, - { - "auxiliary_loss_clip": 0.0112834, - "auxiliary_loss_mlp": 0.01045106, - "balance_loss_clip": 1.04685056, - "balance_loss_mlp": 1.02436399, - "epoch": 0.156861566210732, - "flos": 20813430096000.0, - "grad_norm": 1.6817301031159713, - "language_loss": 0.72335941, - "learning_rate": 3.833537726343684e-06, - "loss": 0.74509382, - "num_input_tokens_seen": 56615130, - "step": 2609, - "time_per_iteration": 2.690690755844116 - }, - { - "auxiliary_loss_clip": 0.01143972, - "auxiliary_loss_mlp": 0.01036718, - "balance_loss_clip": 1.04901087, - "balance_loss_mlp": 1.01756072, - "epoch": 0.15692168946339996, - "flos": 20047922421120.0, - "grad_norm": 5.132438477880424, - "language_loss": 0.72317064, - "learning_rate": 3.833382133519818e-06, - "loss": 0.74497753, - "num_input_tokens_seen": 56634005, - "step": 2610, - "time_per_iteration": 2.6515614986419678 - }, - { - "auxiliary_loss_clip": 0.01159588, - "auxiliary_loss_mlp": 0.01051513, - "balance_loss_clip": 1.05216432, - "balance_loss_mlp": 1.03063977, - "epoch": 0.15698181271606793, - "flos": 21398019943680.0, - "grad_norm": 2.0600295188113935, - "language_loss": 0.72915608, - "learning_rate": 3.833226471173919e-06, - "loss": 0.75126708, - "num_input_tokens_seen": 56653480, - "step": 2611, - "time_per_iteration": 2.630988359451294 - }, - { - "auxiliary_loss_clip": 0.01141924, - "auxiliary_loss_mlp": 0.01042538, - "balance_loss_clip": 1.04917872, - "balance_loss_mlp": 1.0231905, - "epoch": 0.15704193596873592, - "flos": 20845785271680.0, - "grad_norm": 2.0339762399532186, - "language_loss": 0.70766544, - "learning_rate": 3.833070739311887e-06, - "loss": 0.72951007, - "num_input_tokens_seen": 56672270, - "step": 2612, - "time_per_iteration": 2.6569461822509766 - }, - { - "auxiliary_loss_clip": 0.01116284, - "auxiliary_loss_mlp": 0.01051299, - "balance_loss_clip": 1.04844582, - "balance_loss_mlp": 1.03221321, - "epoch": 0.15710205922140388, - "flos": 21762908254080.0, - "grad_norm": 1.9704781930994688, - "language_loss": 0.76294881, - "learning_rate": 3.83291493793963e-06, - "loss": 0.78462464, - "num_input_tokens_seen": 56691510, - "step": 2613, - "time_per_iteration": 2.7188539505004883 - }, - { - "auxiliary_loss_clip": 0.01115155, - "auxiliary_loss_mlp": 0.01049301, - "balance_loss_clip": 1.04504919, - "balance_loss_mlp": 1.02956033, - "epoch": 0.15716218247407185, - "flos": 25007760201600.0, - "grad_norm": 2.137998057111896, - "language_loss": 0.65944499, - "learning_rate": 3.832759067063055e-06, - "loss": 0.68108952, - "num_input_tokens_seen": 56712230, - "step": 2614, - "time_per_iteration": 2.7550084590911865 - }, - { - "auxiliary_loss_clip": 0.01151987, - "auxiliary_loss_mlp": 0.01044173, - "balance_loss_clip": 1.05387104, - "balance_loss_mlp": 1.02374101, - "epoch": 0.1572223057267398, - "flos": 20191780391040.0, - "grad_norm": 2.2755662506820915, - "language_loss": 0.75204211, - "learning_rate": 3.832603126688072e-06, - "loss": 0.77400374, - "num_input_tokens_seen": 56727490, - "step": 2615, - "time_per_iteration": 2.683225154876709 - }, - { - "auxiliary_loss_clip": 0.01138545, - "auxiliary_loss_mlp": 0.01050891, - "balance_loss_clip": 1.05209839, - "balance_loss_mlp": 1.03078008, - "epoch": 0.15728242897940778, - "flos": 20959514709120.0, - "grad_norm": 2.581872009488739, - "language_loss": 0.73064095, - "learning_rate": 3.832447116820594e-06, - "loss": 0.75253528, - "num_input_tokens_seen": 56747385, - "step": 2616, - "time_per_iteration": 2.6660919189453125 - }, - { - "auxiliary_loss_clip": 0.01130717, - "auxiliary_loss_mlp": 0.01047511, - "balance_loss_clip": 1.04999971, - "balance_loss_mlp": 1.02794933, - "epoch": 0.15734255223207574, - "flos": 23038275530880.0, - "grad_norm": 2.813587490853999, - "language_loss": 0.72425079, - "learning_rate": 3.832291037466539e-06, - "loss": 0.74603307, - "num_input_tokens_seen": 56768055, - "step": 2617, - "time_per_iteration": 2.768561363220215 - }, - { - "auxiliary_loss_clip": 0.01138315, - "auxiliary_loss_mlp": 0.0104637, - "balance_loss_clip": 1.04947805, - "balance_loss_mlp": 1.02548432, - "epoch": 0.15740267548474374, - "flos": 20551281661440.0, - "grad_norm": 2.3222819484870016, - "language_loss": 0.74358094, - "learning_rate": 3.8321348886318235e-06, - "loss": 0.76542777, - "num_input_tokens_seen": 56785110, - "step": 2618, - "time_per_iteration": 2.66121768951416 - }, - { - "auxiliary_loss_clip": 0.01162954, - "auxiliary_loss_mlp": 0.01046178, - "balance_loss_clip": 1.05417252, - "balance_loss_mlp": 1.02526867, - "epoch": 0.1574627987374117, - "flos": 22666922772480.0, - "grad_norm": 1.8808629075569874, - "language_loss": 0.78896272, - "learning_rate": 3.8319786703223695e-06, - "loss": 0.81105405, - "num_input_tokens_seen": 56804975, - "step": 2619, - "time_per_iteration": 2.6743338108062744 - }, - { - "auxiliary_loss_clip": 0.01126081, - "auxiliary_loss_mlp": 0.01055551, - "balance_loss_clip": 1.05046356, - "balance_loss_mlp": 1.03576207, - "epoch": 0.15752292199007967, - "flos": 16800664262400.0, - "grad_norm": 1.9082963728737496, - "language_loss": 0.76517296, - "learning_rate": 3.831822382544101e-06, - "loss": 0.78698927, - "num_input_tokens_seen": 56822470, - "step": 2620, - "time_per_iteration": 2.6481080055236816 - }, - { - "auxiliary_loss_clip": 0.01136128, - "auxiliary_loss_mlp": 0.0104575, - "balance_loss_clip": 1.05097985, - "balance_loss_mlp": 1.02488887, - "epoch": 0.15758304524274763, - "flos": 29826002568960.0, - "grad_norm": 1.6603432400664486, - "language_loss": 0.7136035, - "learning_rate": 3.831666025302944e-06, - "loss": 0.73542225, - "num_input_tokens_seen": 56842100, - "step": 2621, - "time_per_iteration": 2.70985746383667 - }, - { - "auxiliary_loss_clip": 0.01103274, - "auxiliary_loss_mlp": 0.01052522, - "balance_loss_clip": 1.04624665, - "balance_loss_mlp": 1.02921629, - "epoch": 0.1576431684954156, - "flos": 53577426723840.0, - "grad_norm": 2.1843515622778624, - "language_loss": 0.72136736, - "learning_rate": 3.831509598604828e-06, - "loss": 0.74292529, - "num_input_tokens_seen": 56865920, - "step": 2622, - "time_per_iteration": 3.024561643600464 - }, - { - "auxiliary_loss_clip": 0.01095163, - "auxiliary_loss_mlp": 0.01043948, - "balance_loss_clip": 1.04474711, - "balance_loss_mlp": 1.02464843, - "epoch": 0.15770329174808356, - "flos": 20813609664000.0, - "grad_norm": 1.6586715789846178, - "language_loss": 0.87637675, - "learning_rate": 3.831353102455684e-06, - "loss": 0.8977679, - "num_input_tokens_seen": 56885265, - "step": 2623, - "time_per_iteration": 2.9600114822387695 - }, - { - "auxiliary_loss_clip": 0.01158714, - "auxiliary_loss_mlp": 0.01044337, - "balance_loss_clip": 1.05476475, - "balance_loss_mlp": 1.02564478, - "epoch": 0.15776341500075153, - "flos": 24974004395520.0, - "grad_norm": 1.6915331173398198, - "language_loss": 0.81600082, - "learning_rate": 3.831196536861448e-06, - "loss": 0.83803129, - "num_input_tokens_seen": 56906710, - "step": 2624, - "time_per_iteration": 2.6621103286743164 - }, - { - "auxiliary_loss_clip": 0.01122344, - "auxiliary_loss_mlp": 0.01049423, - "balance_loss_clip": 1.04776418, - "balance_loss_mlp": 1.02990842, - "epoch": 0.15782353825341952, - "flos": 21907915459200.0, - "grad_norm": 2.879465237309773, - "language_loss": 0.79977828, - "learning_rate": 3.831039901828054e-06, - "loss": 0.82149595, - "num_input_tokens_seen": 56924275, - "step": 2625, - "time_per_iteration": 2.7291064262390137 - }, - { - "auxiliary_loss_clip": 0.01157938, - "auxiliary_loss_mlp": 0.01046203, - "balance_loss_clip": 1.05403268, - "balance_loss_mlp": 1.02857196, - "epoch": 0.15788366150608749, - "flos": 26177191292160.0, - "grad_norm": 2.133783972400447, - "language_loss": 0.80332482, - "learning_rate": 3.830883197361445e-06, - "loss": 0.8253662, - "num_input_tokens_seen": 56941525, - "step": 2626, - "time_per_iteration": 4.252760171890259 - }, - { - "auxiliary_loss_clip": 0.01102762, - "auxiliary_loss_mlp": 0.01057658, - "balance_loss_clip": 1.05214024, - "balance_loss_mlp": 1.03512752, - "epoch": 0.15794378475875545, - "flos": 27709822753920.0, - "grad_norm": 3.9802810067864045, - "language_loss": 0.73636395, - "learning_rate": 3.830726423467561e-06, - "loss": 0.75796819, - "num_input_tokens_seen": 56962145, - "step": 2627, - "time_per_iteration": 4.328871250152588 - }, - { - "auxiliary_loss_clip": 0.01117433, - "auxiliary_loss_mlp": 0.01055032, - "balance_loss_clip": 1.0503006, - "balance_loss_mlp": 1.0351001, - "epoch": 0.15800390801142342, - "flos": 12130158533760.0, - "grad_norm": 2.0211273696228216, - "language_loss": 0.84589541, - "learning_rate": 3.830569580152348e-06, - "loss": 0.86762005, - "num_input_tokens_seen": 56977505, - "step": 2628, - "time_per_iteration": 2.6785013675689697 - }, - { - "auxiliary_loss_clip": 0.01129476, - "auxiliary_loss_mlp": 0.01040858, - "balance_loss_clip": 1.05065978, - "balance_loss_mlp": 1.02308416, - "epoch": 0.15806403126409138, - "flos": 20704728562560.0, - "grad_norm": 1.897214582222077, - "language_loss": 0.76437485, - "learning_rate": 3.830412667421752e-06, - "loss": 0.78607821, - "num_input_tokens_seen": 56996770, - "step": 2629, - "time_per_iteration": 4.2878499031066895 - }, - { - "auxiliary_loss_clip": 0.01143973, - "auxiliary_loss_mlp": 0.01046449, - "balance_loss_clip": 1.0529623, - "balance_loss_mlp": 1.02675569, - "epoch": 0.15812415451675935, - "flos": 17821712269440.0, - "grad_norm": 2.252423233454998, - "language_loss": 0.73337436, - "learning_rate": 3.8302556852817245e-06, - "loss": 0.75527859, - "num_input_tokens_seen": 57014970, - "step": 2630, - "time_per_iteration": 4.253108263015747 - }, - { - "auxiliary_loss_clip": 0.01156261, - "auxiliary_loss_mlp": 0.01045602, - "balance_loss_clip": 1.05644512, - "balance_loss_mlp": 1.02615929, - "epoch": 0.15818427776942734, - "flos": 20084048524800.0, - "grad_norm": 2.390369083551665, - "language_loss": 0.83678091, - "learning_rate": 3.8300986337382184e-06, - "loss": 0.85879952, - "num_input_tokens_seen": 57034045, - "step": 2631, - "time_per_iteration": 2.6145882606506348 - }, - { - "auxiliary_loss_clip": 0.01159092, - "auxiliary_loss_mlp": 0.01045772, - "balance_loss_clip": 1.05313432, - "balance_loss_mlp": 1.02746117, - "epoch": 0.1582444010220953, - "flos": 21214911386880.0, - "grad_norm": 1.8755653224160422, - "language_loss": 0.78415525, - "learning_rate": 3.8299415127971895e-06, - "loss": 0.80620384, - "num_input_tokens_seen": 57053695, - "step": 2632, - "time_per_iteration": 2.656691551208496 - }, - { - "auxiliary_loss_clip": 0.01151481, - "auxiliary_loss_mlp": 0.01057283, - "balance_loss_clip": 1.05574381, - "balance_loss_mlp": 1.03769732, - "epoch": 0.15830452427476327, - "flos": 17858341163520.0, - "grad_norm": 2.079450153413421, - "language_loss": 0.8301838, - "learning_rate": 3.829784322464594e-06, - "loss": 0.85227144, - "num_input_tokens_seen": 57071290, - "step": 2633, - "time_per_iteration": 2.622725248336792 - }, - { - "auxiliary_loss_clip": 0.01165069, - "auxiliary_loss_mlp": 0.01041545, - "balance_loss_clip": 1.05761647, - "balance_loss_mlp": 1.02223265, - "epoch": 0.15836464752743123, - "flos": 24534960456960.0, - "grad_norm": 2.1719104392782813, - "language_loss": 0.77448404, - "learning_rate": 3.829627062746394e-06, - "loss": 0.79655015, - "num_input_tokens_seen": 57091465, - "step": 2634, - "time_per_iteration": 2.6383235454559326 - }, - { - "auxiliary_loss_clip": 0.01127407, - "auxiliary_loss_mlp": 0.00777775, - "balance_loss_clip": 1.05277348, - "balance_loss_mlp": 1.00136137, - "epoch": 0.1584247707800992, - "flos": 20120821073280.0, - "grad_norm": 3.5133527254089087, - "language_loss": 0.88479185, - "learning_rate": 3.829469733648552e-06, - "loss": 0.90384364, - "num_input_tokens_seen": 57110075, - "step": 2635, - "time_per_iteration": 2.725924491882324 - }, - { - "auxiliary_loss_clip": 0.01096223, - "auxiliary_loss_mlp": 0.01058885, - "balance_loss_clip": 1.04816198, - "balance_loss_mlp": 1.03847599, - "epoch": 0.15848489403276717, - "flos": 20375966355840.0, - "grad_norm": 2.8627721083207627, - "language_loss": 0.75762677, - "learning_rate": 3.829312335177034e-06, - "loss": 0.77917778, - "num_input_tokens_seen": 57128945, - "step": 2636, - "time_per_iteration": 2.775310516357422 - }, - { - "auxiliary_loss_clip": 0.01120174, - "auxiliary_loss_mlp": 0.01043834, - "balance_loss_clip": 1.05117822, - "balance_loss_mlp": 1.02350879, - "epoch": 0.15854501728543513, - "flos": 39346890359040.0, - "grad_norm": 2.388418559522659, - "language_loss": 0.71977961, - "learning_rate": 3.82915486733781e-06, - "loss": 0.74141967, - "num_input_tokens_seen": 57152385, - "step": 2637, - "time_per_iteration": 2.8375279903411865 - }, - { - "auxiliary_loss_clip": 0.0115052, - "auxiliary_loss_mlp": 0.01044842, - "balance_loss_clip": 1.05661607, - "balance_loss_mlp": 1.02640057, - "epoch": 0.15860514053810312, - "flos": 24864225454080.0, - "grad_norm": 2.1640345554565057, - "language_loss": 0.78352648, - "learning_rate": 3.82899733013685e-06, - "loss": 0.80548006, - "num_input_tokens_seen": 57172620, - "step": 2638, - "time_per_iteration": 2.7298176288604736 - }, - { - "auxiliary_loss_clip": 0.01129706, - "auxiliary_loss_mlp": 0.01057375, - "balance_loss_clip": 1.05311394, - "balance_loss_mlp": 1.03715718, - "epoch": 0.1586652637907711, - "flos": 26177694082560.0, - "grad_norm": 2.325769963269074, - "language_loss": 0.75845039, - "learning_rate": 3.828839723580128e-06, - "loss": 0.78032124, - "num_input_tokens_seen": 57194680, - "step": 2639, - "time_per_iteration": 2.7731449604034424 - }, - { - "auxiliary_loss_clip": 0.01104856, - "auxiliary_loss_mlp": 0.01057283, - "balance_loss_clip": 1.05350864, - "balance_loss_mlp": 1.03772068, - "epoch": 0.15872538704343905, - "flos": 19792058866560.0, - "grad_norm": 2.173238447343554, - "language_loss": 0.81319505, - "learning_rate": 3.82868204767362e-06, - "loss": 0.83481646, - "num_input_tokens_seen": 57214675, - "step": 2640, - "time_per_iteration": 2.8024139404296875 - }, - { - "auxiliary_loss_clip": 0.01135166, - "auxiliary_loss_mlp": 0.01054673, - "balance_loss_clip": 1.05492401, - "balance_loss_mlp": 1.03426492, - "epoch": 0.15878551029610702, - "flos": 28475366342400.0, - "grad_norm": 2.013499020988034, - "language_loss": 0.66893363, - "learning_rate": 3.828524302423306e-06, - "loss": 0.69083202, - "num_input_tokens_seen": 57235830, - "step": 2641, - "time_per_iteration": 2.7519116401672363 - }, - { - "auxiliary_loss_clip": 0.01149448, - "auxiliary_loss_mlp": 0.01051949, - "balance_loss_clip": 1.05758858, - "balance_loss_mlp": 1.0326376, - "epoch": 0.15884563354877498, - "flos": 24206701040640.0, - "grad_norm": 2.139760259286454, - "language_loss": 0.7552591, - "learning_rate": 3.828366487835167e-06, - "loss": 0.77727306, - "num_input_tokens_seen": 57255970, - "step": 2642, - "time_per_iteration": 2.706136465072632 - }, - { - "auxiliary_loss_clip": 0.01156917, - "auxiliary_loss_mlp": 0.01042142, - "balance_loss_clip": 1.06263423, - "balance_loss_mlp": 1.02323556, - "epoch": 0.15890575680144295, - "flos": 23949795991680.0, - "grad_norm": 1.9419610036505286, - "language_loss": 0.70564604, - "learning_rate": 3.828208603915186e-06, - "loss": 0.72763658, - "num_input_tokens_seen": 57274435, - "step": 2643, - "time_per_iteration": 2.682015895843506 - }, - { - "auxiliary_loss_clip": 0.01161783, - "auxiliary_loss_mlp": 0.01041643, - "balance_loss_clip": 1.05891204, - "balance_loss_mlp": 1.02389312, - "epoch": 0.15896588005411091, - "flos": 21215019127680.0, - "grad_norm": 1.846517711414915, - "language_loss": 0.78057045, - "learning_rate": 3.828050650669353e-06, - "loss": 0.80260473, - "num_input_tokens_seen": 57293115, - "step": 2644, - "time_per_iteration": 2.683790922164917 - }, - { - "auxiliary_loss_clip": 0.01151239, - "auxiliary_loss_mlp": 0.01050105, - "balance_loss_clip": 1.05701637, - "balance_loss_mlp": 1.03154373, - "epoch": 0.1590260033067789, - "flos": 24352390604160.0, - "grad_norm": 3.757920662841351, - "language_loss": 0.81961924, - "learning_rate": 3.827892628103657e-06, - "loss": 0.84163266, - "num_input_tokens_seen": 57312565, - "step": 2645, - "time_per_iteration": 2.698085069656372 - }, - { - "auxiliary_loss_clip": 0.01162748, - "auxiliary_loss_mlp": 0.01048492, - "balance_loss_clip": 1.05487716, - "balance_loss_mlp": 1.02854836, - "epoch": 0.15908612655944687, - "flos": 32048944583040.0, - "grad_norm": 2.056693785790565, - "language_loss": 0.69412929, - "learning_rate": 3.827734536224087e-06, - "loss": 0.71624172, - "num_input_tokens_seen": 57333360, - "step": 2646, - "time_per_iteration": 2.7166528701782227 - }, - { - "auxiliary_loss_clip": 0.01135067, - "auxiliary_loss_mlp": 0.01040314, - "balance_loss_clip": 1.05435526, - "balance_loss_mlp": 1.02223015, - "epoch": 0.15914624981211484, - "flos": 17785370684160.0, - "grad_norm": 2.5975497323405055, - "language_loss": 0.62932581, - "learning_rate": 3.827576375036642e-06, - "loss": 0.65107965, - "num_input_tokens_seen": 57350575, - "step": 2647, - "time_per_iteration": 2.7405354976654053 - }, - { - "auxiliary_loss_clip": 0.01160144, - "auxiliary_loss_mlp": 0.01047955, - "balance_loss_clip": 1.05654776, - "balance_loss_mlp": 1.02896523, - "epoch": 0.1592063730647828, - "flos": 17712507945600.0, - "grad_norm": 2.2161421076431025, - "language_loss": 0.89490473, - "learning_rate": 3.827418144547318e-06, - "loss": 0.91698575, - "num_input_tokens_seen": 57367570, - "step": 2648, - "time_per_iteration": 2.6193346977233887 - }, - { - "auxiliary_loss_clip": 0.01158791, - "auxiliary_loss_mlp": 0.01048086, - "balance_loss_clip": 1.05630398, - "balance_loss_mlp": 1.03072906, - "epoch": 0.15926649631745077, - "flos": 18803545603200.0, - "grad_norm": 1.9960039108301237, - "language_loss": 0.91307199, - "learning_rate": 3.827259844762114e-06, - "loss": 0.93514073, - "num_input_tokens_seen": 57383980, - "step": 2649, - "time_per_iteration": 2.6137378215789795 - }, - { - "auxiliary_loss_clip": 0.01099661, - "auxiliary_loss_mlp": 0.01044384, - "balance_loss_clip": 1.05474401, - "balance_loss_mlp": 1.02439272, - "epoch": 0.15932661957011873, - "flos": 17566243764480.0, - "grad_norm": 2.3504548368335767, - "language_loss": 0.71782613, - "learning_rate": 3.827101475687033e-06, - "loss": 0.73926663, - "num_input_tokens_seen": 57400840, - "step": 2650, - "time_per_iteration": 2.8883376121520996 - }, - { - "auxiliary_loss_clip": 0.01146809, - "auxiliary_loss_mlp": 0.01041815, - "balance_loss_clip": 1.05386841, - "balance_loss_mlp": 1.02476835, - "epoch": 0.15938674282278673, - "flos": 13334351011200.0, - "grad_norm": 1.8238326955956992, - "language_loss": 0.71427429, - "learning_rate": 3.826943037328082e-06, - "loss": 0.73616046, - "num_input_tokens_seen": 57419230, - "step": 2651, - "time_per_iteration": 2.607879638671875 - }, - { - "auxiliary_loss_clip": 0.01118842, - "auxiliary_loss_mlp": 0.00777496, - "balance_loss_clip": 1.05154157, - "balance_loss_mlp": 1.00132799, - "epoch": 0.1594468660754547, - "flos": 22488842119680.0, - "grad_norm": 1.8928974850955373, - "language_loss": 0.80185902, - "learning_rate": 3.8267845296912674e-06, - "loss": 0.82082248, - "num_input_tokens_seen": 57439315, - "step": 2652, - "time_per_iteration": 2.718695640563965 - }, - { - "auxiliary_loss_clip": 0.01138048, - "auxiliary_loss_mlp": 0.00775, - "balance_loss_clip": 1.0567826, - "balance_loss_mlp": 1.00124729, - "epoch": 0.15950698932812266, - "flos": 15007320910080.0, - "grad_norm": 2.6116065834427387, - "language_loss": 0.69539076, - "learning_rate": 3.826625952782601e-06, - "loss": 0.71452117, - "num_input_tokens_seen": 57454635, - "step": 2653, - "time_per_iteration": 2.7088639736175537 - }, - { - "auxiliary_loss_clip": 0.01144826, - "auxiliary_loss_mlp": 0.01038735, - "balance_loss_clip": 1.05257821, - "balance_loss_mlp": 1.02050805, - "epoch": 0.15956711258079062, - "flos": 30155052084480.0, - "grad_norm": 2.1937273620657307, - "language_loss": 0.76670635, - "learning_rate": 3.826467306608095e-06, - "loss": 0.78854191, - "num_input_tokens_seen": 57476805, - "step": 2654, - "time_per_iteration": 2.79425048828125 - }, - { - "auxiliary_loss_clip": 0.01114313, - "auxiliary_loss_mlp": 0.01041134, - "balance_loss_clip": 1.04714727, - "balance_loss_mlp": 1.02248931, - "epoch": 0.1596272358334586, - "flos": 21032700670080.0, - "grad_norm": 2.0572535633716247, - "language_loss": 0.81873977, - "learning_rate": 3.826308591173765e-06, - "loss": 0.84029424, - "num_input_tokens_seen": 57496400, - "step": 2655, - "time_per_iteration": 2.6990878582000732 - }, - { - "auxiliary_loss_clip": 0.01112525, - "auxiliary_loss_mlp": 0.01046346, - "balance_loss_clip": 1.04670715, - "balance_loss_mlp": 1.02849984, - "epoch": 0.15968735908612655, - "flos": 15268032800640.0, - "grad_norm": 2.0964800101687486, - "language_loss": 0.73768878, - "learning_rate": 3.826149806485631e-06, - "loss": 0.75927746, - "num_input_tokens_seen": 57513700, - "step": 2656, - "time_per_iteration": 2.7409873008728027 - }, - { - "auxiliary_loss_clip": 0.01111218, - "auxiliary_loss_mlp": 0.01039948, - "balance_loss_clip": 1.04749918, - "balance_loss_mlp": 1.02220988, - "epoch": 0.15974748233879452, - "flos": 52665726695040.0, - "grad_norm": 2.516351978408242, - "language_loss": 0.77637637, - "learning_rate": 3.825990952549713e-06, - "loss": 0.79788804, - "num_input_tokens_seen": 57536180, - "step": 2657, - "time_per_iteration": 2.984161376953125 - }, - { - "auxiliary_loss_clip": 0.01142397, - "auxiliary_loss_mlp": 0.01048058, - "balance_loss_clip": 1.05276513, - "balance_loss_mlp": 1.02984321, - "epoch": 0.1598076055914625, - "flos": 18733232730240.0, - "grad_norm": 2.1741432296797303, - "language_loss": 0.74654955, - "learning_rate": 3.825832029372035e-06, - "loss": 0.76845407, - "num_input_tokens_seen": 57555025, - "step": 2658, - "time_per_iteration": 2.6795172691345215 - }, - { - "auxiliary_loss_clip": 0.01137294, - "auxiliary_loss_mlp": 0.01047097, - "balance_loss_clip": 1.05887127, - "balance_loss_mlp": 1.02581763, - "epoch": 0.15986772884413047, - "flos": 34349238535680.0, - "grad_norm": 2.2676743120149916, - "language_loss": 0.75164986, - "learning_rate": 3.825673036958624e-06, - "loss": 0.77349377, - "num_input_tokens_seen": 57577660, - "step": 2659, - "time_per_iteration": 2.885744094848633 - }, - { - "auxiliary_loss_clip": 0.01122752, - "auxiliary_loss_mlp": 0.0105323, - "balance_loss_clip": 1.0512991, - "balance_loss_mlp": 1.0334295, - "epoch": 0.15992785209679844, - "flos": 22054969739520.0, - "grad_norm": 2.181311046841435, - "language_loss": 0.90998709, - "learning_rate": 3.825513975315508e-06, - "loss": 0.93174696, - "num_input_tokens_seen": 57596335, - "step": 2660, - "time_per_iteration": 2.7562267780303955 - }, - { - "auxiliary_loss_clip": 0.01114547, - "auxiliary_loss_mlp": 0.01058378, - "balance_loss_clip": 1.05538487, - "balance_loss_mlp": 1.03590751, - "epoch": 0.1599879753494664, - "flos": 33066652625280.0, - "grad_norm": 1.746468400789071, - "language_loss": 0.77724659, - "learning_rate": 3.82535484444872e-06, - "loss": 0.79897583, - "num_input_tokens_seen": 57616830, - "step": 2661, - "time_per_iteration": 2.9896914958953857 - }, - { - "auxiliary_loss_clip": 0.0113781, - "auxiliary_loss_mlp": 0.00777461, - "balance_loss_clip": 1.05382478, - "balance_loss_mlp": 1.00132632, - "epoch": 0.16004809860213437, - "flos": 28038010343040.0, - "grad_norm": 2.0483033922540086, - "language_loss": 0.74442393, - "learning_rate": 3.825195644364292e-06, - "loss": 0.76357663, - "num_input_tokens_seen": 57635515, - "step": 2662, - "time_per_iteration": 2.7993714809417725 - }, - { - "auxiliary_loss_clip": 0.01135674, - "auxiliary_loss_mlp": 0.00780783, - "balance_loss_clip": 1.05392313, - "balance_loss_mlp": 1.0016191, - "epoch": 0.16010822185480234, - "flos": 22780113505920.0, - "grad_norm": 2.9903694104875984, - "language_loss": 0.82515085, - "learning_rate": 3.825036375068263e-06, - "loss": 0.84431541, - "num_input_tokens_seen": 57654250, - "step": 2663, - "time_per_iteration": 2.678490161895752 - }, - { - "auxiliary_loss_clip": 0.01112205, - "auxiliary_loss_mlp": 0.01044917, - "balance_loss_clip": 1.05182636, - "balance_loss_mlp": 1.02574801, - "epoch": 0.16016834510747033, - "flos": 20084012611200.0, - "grad_norm": 2.06786422122115, - "language_loss": 0.7951405, - "learning_rate": 3.824877036566672e-06, - "loss": 0.81671166, - "num_input_tokens_seen": 57672645, - "step": 2664, - "time_per_iteration": 2.819880962371826 - }, - { - "auxiliary_loss_clip": 0.01151449, - "auxiliary_loss_mlp": 0.01048023, - "balance_loss_clip": 1.05374622, - "balance_loss_mlp": 1.02886605, - "epoch": 0.1602284683601383, - "flos": 21173829206400.0, - "grad_norm": 1.6697703441146605, - "language_loss": 0.93748474, - "learning_rate": 3.824717628865561e-06, - "loss": 0.95947945, - "num_input_tokens_seen": 57691055, - "step": 2665, - "time_per_iteration": 2.697660446166992 - }, - { - "auxiliary_loss_clip": 0.01127607, - "auxiliary_loss_mlp": 0.01047415, - "balance_loss_clip": 1.05185676, - "balance_loss_mlp": 1.02774525, - "epoch": 0.16028859161280626, - "flos": 14647568244480.0, - "grad_norm": 2.9655602739253095, - "language_loss": 0.85237324, - "learning_rate": 3.824558151970974e-06, - "loss": 0.87412339, - "num_input_tokens_seen": 57707235, - "step": 2666, - "time_per_iteration": 4.282273530960083 - }, - { - "auxiliary_loss_clip": 0.01129818, - "auxiliary_loss_mlp": 0.00777125, - "balance_loss_clip": 1.05257225, - "balance_loss_mlp": 1.00145936, - "epoch": 0.16034871486547422, - "flos": 20990325600000.0, - "grad_norm": 1.8366839898970433, - "language_loss": 0.81284773, - "learning_rate": 3.8243986058889595e-06, - "loss": 0.83191717, - "num_input_tokens_seen": 57724190, - "step": 2667, - "time_per_iteration": 2.69508695602417 - }, - { - "auxiliary_loss_clip": 0.0116556, - "auxiliary_loss_mlp": 0.01046526, - "balance_loss_clip": 1.06089485, - "balance_loss_mlp": 1.02643883, - "epoch": 0.1604088381181422, - "flos": 21397732634880.0, - "grad_norm": 1.958935842080623, - "language_loss": 0.74031079, - "learning_rate": 3.824238990625567e-06, - "loss": 0.76243162, - "num_input_tokens_seen": 57743620, - "step": 2668, - "time_per_iteration": 4.2559425830841064 - }, - { - "auxiliary_loss_clip": 0.01148853, - "auxiliary_loss_mlp": 0.01051992, - "balance_loss_clip": 1.05547619, - "balance_loss_mlp": 1.03240585, - "epoch": 0.16046896137081015, - "flos": 23877040993920.0, - "grad_norm": 1.7737626564305047, - "language_loss": 0.77495629, - "learning_rate": 3.824079306186848e-06, - "loss": 0.7969647, - "num_input_tokens_seen": 57764810, - "step": 2669, - "time_per_iteration": 2.6424050331115723 - }, - { - "auxiliary_loss_clip": 0.01097339, - "auxiliary_loss_mlp": 0.01012737, - "balance_loss_clip": 1.06351233, - "balance_loss_mlp": 1.00986385, - "epoch": 0.16052908462347812, - "flos": 59806709015040.0, - "grad_norm": 0.8041290684345284, - "language_loss": 0.5549804, - "learning_rate": 3.823919552578861e-06, - "loss": 0.57608116, - "num_input_tokens_seen": 57824390, - "step": 2670, - "time_per_iteration": 4.765664100646973 - }, - { - "auxiliary_loss_clip": 0.01149639, - "auxiliary_loss_mlp": 0.01043383, - "balance_loss_clip": 1.05322218, - "balance_loss_mlp": 1.02430916, - "epoch": 0.1605892078761461, - "flos": 18296559089280.0, - "grad_norm": 2.6306224128650464, - "language_loss": 0.77778888, - "learning_rate": 3.82375972980766e-06, - "loss": 0.7997191, - "num_input_tokens_seen": 57843665, - "step": 2671, - "time_per_iteration": 2.6876416206359863 - }, - { - "auxiliary_loss_clip": 0.01151164, - "auxiliary_loss_mlp": 0.01043962, - "balance_loss_clip": 1.05529547, - "balance_loss_mlp": 1.02503204, - "epoch": 0.16064933112881408, - "flos": 32160734686080.0, - "grad_norm": 1.9167251889277674, - "language_loss": 0.64766788, - "learning_rate": 3.8235998378793086e-06, - "loss": 0.66961908, - "num_input_tokens_seen": 57863305, - "step": 2672, - "time_per_iteration": 2.7102553844451904 - }, - { - "auxiliary_loss_clip": 0.01150206, - "auxiliary_loss_mlp": 0.01046785, - "balance_loss_clip": 1.05674481, - "balance_loss_mlp": 1.02554154, - "epoch": 0.16070945438148204, - "flos": 19828795501440.0, - "grad_norm": 2.045175098484539, - "language_loss": 0.85708207, - "learning_rate": 3.8234398767998675e-06, - "loss": 0.87905198, - "num_input_tokens_seen": 57883025, - "step": 2673, - "time_per_iteration": 2.656360626220703 - }, - { - "auxiliary_loss_clip": 0.01125542, - "auxiliary_loss_mlp": 0.01055838, - "balance_loss_clip": 1.05366015, - "balance_loss_mlp": 1.03716969, - "epoch": 0.16076957763415, - "flos": 18913144976640.0, - "grad_norm": 2.339006860757087, - "language_loss": 0.7289716, - "learning_rate": 3.823279846575403e-06, - "loss": 0.75078535, - "num_input_tokens_seen": 57901430, - "step": 2674, - "time_per_iteration": 2.7122414112091064 - }, - { - "auxiliary_loss_clip": 0.01150063, - "auxiliary_loss_mlp": 0.01045468, - "balance_loss_clip": 1.05416465, - "balance_loss_mlp": 1.02464211, - "epoch": 0.16082970088681797, - "flos": 16764358590720.0, - "grad_norm": 1.9341682597436423, - "language_loss": 0.84438515, - "learning_rate": 3.823119747211986e-06, - "loss": 0.86634052, - "num_input_tokens_seen": 57919550, - "step": 2675, - "time_per_iteration": 2.6646435260772705 - }, - { - "auxiliary_loss_clip": 0.01116221, - "auxiliary_loss_mlp": 0.01049343, - "balance_loss_clip": 1.05220723, - "balance_loss_mlp": 1.02823126, - "epoch": 0.16088982413948594, - "flos": 35150261783040.0, - "grad_norm": 1.871909119220515, - "language_loss": 0.82216591, - "learning_rate": 3.822959578715685e-06, - "loss": 0.84382153, - "num_input_tokens_seen": 57939890, - "step": 2676, - "time_per_iteration": 2.8457534313201904 - }, - { - "auxiliary_loss_clip": 0.01151157, - "auxiliary_loss_mlp": 0.01049874, - "balance_loss_clip": 1.05746996, - "balance_loss_mlp": 1.03162253, - "epoch": 0.1609499473921539, - "flos": 18625105814400.0, - "grad_norm": 2.1166154816193923, - "language_loss": 0.73485494, - "learning_rate": 3.822799341092573e-06, - "loss": 0.75686526, - "num_input_tokens_seen": 57957410, - "step": 2677, - "time_per_iteration": 2.65387225151062 - }, - { - "auxiliary_loss_clip": 0.01138188, - "auxiliary_loss_mlp": 0.01044363, - "balance_loss_clip": 1.05438483, - "balance_loss_mlp": 1.02537322, - "epoch": 0.1610100706448219, - "flos": 33145728416640.0, - "grad_norm": 3.229282061984371, - "language_loss": 0.76305777, - "learning_rate": 3.822639034348728e-06, - "loss": 0.78488332, - "num_input_tokens_seen": 57977900, - "step": 2678, - "time_per_iteration": 2.836071014404297 - }, - { - "auxiliary_loss_clip": 0.01148252, - "auxiliary_loss_mlp": 0.01047887, - "balance_loss_clip": 1.05379987, - "balance_loss_mlp": 1.02789569, - "epoch": 0.16107019389748986, - "flos": 34676707852800.0, - "grad_norm": 8.295814069484678, - "language_loss": 0.70340431, - "learning_rate": 3.822478658490228e-06, - "loss": 0.7253657, - "num_input_tokens_seen": 57998210, - "step": 2679, - "time_per_iteration": 2.771185874938965 - }, - { - "auxiliary_loss_clip": 0.01059502, - "auxiliary_loss_mlp": 0.00758644, - "balance_loss_clip": 1.04695845, - "balance_loss_mlp": 1.00150955, - "epoch": 0.16113031715015783, - "flos": 65713403260800.0, - "grad_norm": 0.7819629653273137, - "language_loss": 0.51843339, - "learning_rate": 3.822318213523154e-06, - "loss": 0.53661484, - "num_input_tokens_seen": 58059420, - "step": 2680, - "time_per_iteration": 3.3107378482818604 - }, - { - "auxiliary_loss_clip": 0.01144342, - "auxiliary_loss_mlp": 0.01047358, - "balance_loss_clip": 1.05360317, - "balance_loss_mlp": 1.02632904, - "epoch": 0.1611904404028258, - "flos": 20810413353600.0, - "grad_norm": 1.6718368455031125, - "language_loss": 0.8028667, - "learning_rate": 3.8221576994535925e-06, - "loss": 0.82478368, - "num_input_tokens_seen": 58078370, - "step": 2681, - "time_per_iteration": 2.6986513137817383 - }, - { - "auxiliary_loss_clip": 0.01139192, - "auxiliary_loss_mlp": 0.01055518, - "balance_loss_clip": 1.05603266, - "balance_loss_mlp": 1.03602743, - "epoch": 0.16125056365549376, - "flos": 27013335062400.0, - "grad_norm": 2.154781054673542, - "language_loss": 0.68957973, - "learning_rate": 3.821997116287627e-06, - "loss": 0.71152687, - "num_input_tokens_seen": 58097395, - "step": 2682, - "time_per_iteration": 2.794686794281006 - }, - { - "auxiliary_loss_clip": 0.01139216, - "auxiliary_loss_mlp": 0.01052349, - "balance_loss_clip": 1.05670619, - "balance_loss_mlp": 1.03195262, - "epoch": 0.16131068690816172, - "flos": 19276524915840.0, - "grad_norm": 1.9802191055590168, - "language_loss": 0.87362224, - "learning_rate": 3.821836464031348e-06, - "loss": 0.89553785, - "num_input_tokens_seen": 58115630, - "step": 2683, - "time_per_iteration": 2.703634262084961 - }, - { - "auxiliary_loss_clip": 0.01165497, - "auxiliary_loss_mlp": 0.0105575, - "balance_loss_clip": 1.05714059, - "balance_loss_mlp": 1.03491259, - "epoch": 0.16137081016082971, - "flos": 35337931367040.0, - "grad_norm": 1.939499216066865, - "language_loss": 0.74143028, - "learning_rate": 3.821675742690849e-06, - "loss": 0.76364273, - "num_input_tokens_seen": 58138655, - "step": 2684, - "time_per_iteration": 2.7890264987945557 - }, - { - "auxiliary_loss_clip": 0.01136683, - "auxiliary_loss_mlp": 0.00778989, - "balance_loss_clip": 1.05435085, - "balance_loss_mlp": 1.00176883, - "epoch": 0.16143093341349768, - "flos": 34235257703040.0, - "grad_norm": 1.9009911635557044, - "language_loss": 0.70506597, - "learning_rate": 3.821514952272223e-06, - "loss": 0.72422272, - "num_input_tokens_seen": 58157440, - "step": 2685, - "time_per_iteration": 2.803942918777466 - }, - { - "auxiliary_loss_clip": 0.01116315, - "auxiliary_loss_mlp": 0.01059092, - "balance_loss_clip": 1.05291295, - "balance_loss_mlp": 1.03757524, - "epoch": 0.16149105666616564, - "flos": 27999262546560.0, - "grad_norm": 2.295686008167468, - "language_loss": 0.72060591, - "learning_rate": 3.821354092781567e-06, - "loss": 0.74236, - "num_input_tokens_seen": 58176660, - "step": 2686, - "time_per_iteration": 2.850309133529663 - }, - { - "auxiliary_loss_clip": 0.01153803, - "auxiliary_loss_mlp": 0.01048887, - "balance_loss_clip": 1.05603862, - "balance_loss_mlp": 1.02922952, - "epoch": 0.1615511799188336, - "flos": 19422214479360.0, - "grad_norm": 2.056921120199424, - "language_loss": 0.81720114, - "learning_rate": 3.821193164224981e-06, - "loss": 0.83922803, - "num_input_tokens_seen": 58195085, - "step": 2687, - "time_per_iteration": 2.7077832221984863 - }, - { - "auxiliary_loss_clip": 0.01154388, - "auxiliary_loss_mlp": 0.01050682, - "balance_loss_clip": 1.05335689, - "balance_loss_mlp": 1.02910483, - "epoch": 0.16161130317150157, - "flos": 22854915578880.0, - "grad_norm": 1.6747986106054085, - "language_loss": 0.71680355, - "learning_rate": 3.821032166608568e-06, - "loss": 0.73885429, - "num_input_tokens_seen": 58213540, - "step": 2688, - "time_per_iteration": 2.700073480606079 - }, - { - "auxiliary_loss_clip": 0.0112226, - "auxiliary_loss_mlp": 0.0105252, - "balance_loss_clip": 1.0517168, - "balance_loss_mlp": 1.03330338, - "epoch": 0.16167142642416954, - "flos": 26110577520000.0, - "grad_norm": 2.2887064413695253, - "language_loss": 0.76168394, - "learning_rate": 3.8208710999384325e-06, - "loss": 0.78343177, - "num_input_tokens_seen": 58236995, - "step": 2689, - "time_per_iteration": 2.846964120864868 - }, - { - "auxiliary_loss_clip": 0.01166324, - "auxiliary_loss_mlp": 0.01052979, - "balance_loss_clip": 1.05979431, - "balance_loss_mlp": 1.03308284, - "epoch": 0.1617315496768375, - "flos": 22779646629120.0, - "grad_norm": 2.045037041298705, - "language_loss": 0.87211925, - "learning_rate": 3.820709964220683e-06, - "loss": 0.89431226, - "num_input_tokens_seen": 58257230, - "step": 2690, - "time_per_iteration": 2.704497814178467 - }, - { - "auxiliary_loss_clip": 0.01143898, - "auxiliary_loss_mlp": 0.01046571, - "balance_loss_clip": 1.05318451, - "balance_loss_mlp": 1.02890396, - "epoch": 0.1617916729295055, - "flos": 22017299351040.0, - "grad_norm": 1.7518031225399346, - "language_loss": 0.87899524, - "learning_rate": 3.8205487594614284e-06, - "loss": 0.90089989, - "num_input_tokens_seen": 58277080, - "step": 2691, - "time_per_iteration": 2.6763153076171875 - }, - { - "auxiliary_loss_clip": 0.01150265, - "auxiliary_loss_mlp": 0.01053114, - "balance_loss_clip": 1.05237532, - "balance_loss_mlp": 1.03142977, - "epoch": 0.16185179618217346, - "flos": 23438248450560.0, - "grad_norm": 2.1723450057475313, - "language_loss": 0.81989783, - "learning_rate": 3.820387485666784e-06, - "loss": 0.84193164, - "num_input_tokens_seen": 58294815, - "step": 2692, - "time_per_iteration": 2.6381001472473145 - }, - { - "auxiliary_loss_clip": 0.01167881, - "auxiliary_loss_mlp": 0.0104606, - "balance_loss_clip": 1.05555534, - "balance_loss_mlp": 1.02499604, - "epoch": 0.16191191943484143, - "flos": 25666110627840.0, - "grad_norm": 2.194958172554253, - "language_loss": 0.81381011, - "learning_rate": 3.820226142842862e-06, - "loss": 0.83594954, - "num_input_tokens_seen": 58313215, - "step": 2693, - "time_per_iteration": 2.6366944313049316 - }, - { - "auxiliary_loss_clip": 0.01164466, - "auxiliary_loss_mlp": 0.01058298, - "balance_loss_clip": 1.0587461, - "balance_loss_mlp": 1.03991616, - "epoch": 0.1619720426875094, - "flos": 23477355383040.0, - "grad_norm": 2.778189532536263, - "language_loss": 0.83837044, - "learning_rate": 3.820064730995783e-06, - "loss": 0.86059809, - "num_input_tokens_seen": 58333215, - "step": 2694, - "time_per_iteration": 2.7802140712738037 - }, - { - "auxiliary_loss_clip": 0.01116209, - "auxiliary_loss_mlp": 0.0105764, - "balance_loss_clip": 1.04927421, - "balance_loss_mlp": 1.0366354, - "epoch": 0.16203216594017736, - "flos": 24133658734080.0, - "grad_norm": 1.8201511645490482, - "language_loss": 0.69709098, - "learning_rate": 3.819903250131667e-06, - "loss": 0.71882945, - "num_input_tokens_seen": 58351160, - "step": 2695, - "time_per_iteration": 2.756904125213623 - }, - { - "auxiliary_loss_clip": 0.01155526, - "auxiliary_loss_mlp": 0.01050837, - "balance_loss_clip": 1.05799723, - "balance_loss_mlp": 1.03026128, - "epoch": 0.16209228919284532, - "flos": 22340889999360.0, - "grad_norm": 2.1550523064219487, - "language_loss": 0.82986331, - "learning_rate": 3.819741700256637e-06, - "loss": 0.85192692, - "num_input_tokens_seen": 58368505, - "step": 2696, - "time_per_iteration": 2.651510238647461 - }, - { - "auxiliary_loss_clip": 0.01174193, - "auxiliary_loss_mlp": 0.01052819, - "balance_loss_clip": 1.05826569, - "balance_loss_mlp": 1.03095615, - "epoch": 0.1621524124455133, - "flos": 15815131827840.0, - "grad_norm": 2.9267990143146503, - "language_loss": 0.8862049, - "learning_rate": 3.8195800813768194e-06, - "loss": 0.90847504, - "num_input_tokens_seen": 58385085, - "step": 2697, - "time_per_iteration": 2.5935380458831787 - }, - { - "auxiliary_loss_clip": 0.01158945, - "auxiliary_loss_mlp": 0.01045471, - "balance_loss_clip": 1.0552485, - "balance_loss_mlp": 1.02719641, - "epoch": 0.16221253569818128, - "flos": 30186688988160.0, - "grad_norm": 1.7480298293719791, - "language_loss": 0.80844599, - "learning_rate": 3.819418393498343e-06, - "loss": 0.83049017, - "num_input_tokens_seen": 58406985, - "step": 2698, - "time_per_iteration": 2.6685965061187744 - }, - { - "auxiliary_loss_clip": 0.01151678, - "auxiliary_loss_mlp": 0.01050084, - "balance_loss_clip": 1.05785704, - "balance_loss_mlp": 1.03060579, - "epoch": 0.16227265895084925, - "flos": 24605991601920.0, - "grad_norm": 1.590231062064763, - "language_loss": 0.77499473, - "learning_rate": 3.819256636627339e-06, - "loss": 0.79701245, - "num_input_tokens_seen": 58426205, - "step": 2699, - "time_per_iteration": 2.7206287384033203 - }, - { - "auxiliary_loss_clip": 0.01134482, - "auxiliary_loss_mlp": 0.01043888, - "balance_loss_clip": 1.0504272, - "balance_loss_mlp": 1.02510071, - "epoch": 0.1623327822035172, - "flos": 19573326996480.0, - "grad_norm": 2.299083669251571, - "language_loss": 0.85903585, - "learning_rate": 3.81909481076994e-06, - "loss": 0.88081944, - "num_input_tokens_seen": 58443830, - "step": 2700, - "time_per_iteration": 2.6440224647521973 - }, - { - "auxiliary_loss_clip": 0.01150266, - "auxiliary_loss_mlp": 0.00778348, - "balance_loss_clip": 1.05360484, - "balance_loss_mlp": 1.00180686, - "epoch": 0.16239290545618518, - "flos": 26468462678400.0, - "grad_norm": 1.7679372116400307, - "language_loss": 0.80424523, - "learning_rate": 3.818932915932284e-06, - "loss": 0.82353133, - "num_input_tokens_seen": 58464405, - "step": 2701, - "time_per_iteration": 2.6943976879119873 - }, - { - "auxiliary_loss_clip": 0.01144477, - "auxiliary_loss_mlp": 0.01046291, - "balance_loss_clip": 1.05771017, - "balance_loss_mlp": 1.02664542, - "epoch": 0.16245302870885314, - "flos": 15851940289920.0, - "grad_norm": 1.6539412057050027, - "language_loss": 0.72777367, - "learning_rate": 3.818770952120511e-06, - "loss": 0.74968135, - "num_input_tokens_seen": 58483295, - "step": 2702, - "time_per_iteration": 2.6914141178131104 - }, - { - "auxiliary_loss_clip": 0.01156069, - "auxiliary_loss_mlp": 0.01050141, - "balance_loss_clip": 1.05802381, - "balance_loss_mlp": 1.02896905, - "epoch": 0.1625131519615211, - "flos": 14756521173120.0, - "grad_norm": 1.8265391375227176, - "language_loss": 0.7273894, - "learning_rate": 3.81860891934076e-06, - "loss": 0.74945152, - "num_input_tokens_seen": 58501205, - "step": 2703, - "time_per_iteration": 2.6301820278167725 - }, - { - "auxiliary_loss_clip": 0.01165642, - "auxiliary_loss_mlp": 0.01050857, - "balance_loss_clip": 1.0553968, - "balance_loss_mlp": 1.02942359, - "epoch": 0.1625732752141891, - "flos": 28220508368640.0, - "grad_norm": 3.0329584489902666, - "language_loss": 0.70018482, - "learning_rate": 3.818446817599176e-06, - "loss": 0.72234988, - "num_input_tokens_seen": 58522315, - "step": 2704, - "time_per_iteration": 2.6667227745056152 - }, - { - "auxiliary_loss_clip": 0.01034679, - "auxiliary_loss_mlp": 0.01001657, - "balance_loss_clip": 1.03343439, - "balance_loss_mlp": 0.99865305, - "epoch": 0.16263339846685707, - "flos": 67327947688320.0, - "grad_norm": 0.7801109588151329, - "language_loss": 0.5336051, - "learning_rate": 3.818284646901907e-06, - "loss": 0.55396849, - "num_input_tokens_seen": 58586695, - "step": 2705, - "time_per_iteration": 4.808594465255737 - }, - { - "auxiliary_loss_clip": 0.01138628, - "auxiliary_loss_mlp": 0.00781324, - "balance_loss_clip": 1.0539608, - "balance_loss_mlp": 1.00171995, - "epoch": 0.16269352171952503, - "flos": 14319165173760.0, - "grad_norm": 2.3827832530074455, - "language_loss": 0.7536028, - "learning_rate": 3.818122407255102e-06, - "loss": 0.77280229, - "num_input_tokens_seen": 58602435, - "step": 2706, - "time_per_iteration": 4.126614570617676 - }, - { - "auxiliary_loss_clip": 0.01130684, - "auxiliary_loss_mlp": 0.01047489, - "balance_loss_clip": 1.0523324, - "balance_loss_mlp": 1.02859437, - "epoch": 0.162753644972193, - "flos": 28361205941760.0, - "grad_norm": 2.2272392184651038, - "language_loss": 0.72203928, - "learning_rate": 3.817960098664914e-06, - "loss": 0.74382102, - "num_input_tokens_seen": 58621275, - "step": 2707, - "time_per_iteration": 4.2739410400390625 - }, - { - "auxiliary_loss_clip": 0.01142142, - "auxiliary_loss_mlp": 0.01047652, - "balance_loss_clip": 1.05433679, - "balance_loss_mlp": 1.02898431, - "epoch": 0.16281376822486096, - "flos": 19937856170880.0, - "grad_norm": 3.192481802987827, - "language_loss": 0.83481139, - "learning_rate": 3.817797721137495e-06, - "loss": 0.85670936, - "num_input_tokens_seen": 58637550, - "step": 2708, - "time_per_iteration": 2.7163965702056885 - }, - { - "auxiliary_loss_clip": 0.01101561, - "auxiliary_loss_mlp": 0.00781217, - "balance_loss_clip": 1.04896522, - "balance_loss_mlp": 1.00177419, - "epoch": 0.16287389147752893, - "flos": 21251719848960.0, - "grad_norm": 2.2850459718507654, - "language_loss": 0.86162847, - "learning_rate": 3.817635274679006e-06, - "loss": 0.88045627, - "num_input_tokens_seen": 58654135, - "step": 2709, - "time_per_iteration": 4.474989652633667 - }, - { - "auxiliary_loss_clip": 0.0114031, - "auxiliary_loss_mlp": 0.00777602, - "balance_loss_clip": 1.05267572, - "balance_loss_mlp": 1.00172114, - "epoch": 0.1629340147301969, - "flos": 19244672530560.0, - "grad_norm": 2.581053296112052, - "language_loss": 0.91410124, - "learning_rate": 3.817472759295605e-06, - "loss": 0.93328035, - "num_input_tokens_seen": 58674320, - "step": 2710, - "time_per_iteration": 2.6951892375946045 - }, - { - "auxiliary_loss_clip": 0.01118597, - "auxiliary_loss_mlp": 0.01054854, - "balance_loss_clip": 1.05254805, - "balance_loss_mlp": 1.03451669, - "epoch": 0.16299413798286488, - "flos": 21249816428160.0, - "grad_norm": 2.4322540773438437, - "language_loss": 0.81690979, - "learning_rate": 3.817310174993453e-06, - "loss": 0.83864427, - "num_input_tokens_seen": 58691000, - "step": 2711, - "time_per_iteration": 2.7854437828063965 - }, - { - "auxiliary_loss_clip": 0.01146056, - "auxiliary_loss_mlp": 0.01040648, - "balance_loss_clip": 1.04954815, - "balance_loss_mlp": 1.02107334, - "epoch": 0.16305426123553285, - "flos": 18770579896320.0, - "grad_norm": 3.73256798888747, - "language_loss": 0.8091476, - "learning_rate": 3.817147521778719e-06, - "loss": 0.83101463, - "num_input_tokens_seen": 58710230, - "step": 2712, - "time_per_iteration": 2.834291458129883 - }, - { - "auxiliary_loss_clip": 0.01171211, - "auxiliary_loss_mlp": 0.01053015, - "balance_loss_clip": 1.0590024, - "balance_loss_mlp": 1.03273714, - "epoch": 0.16311438448820081, - "flos": 22087648137600.0, - "grad_norm": 2.3460895846171996, - "language_loss": 0.7681579, - "learning_rate": 3.816984799657568e-06, - "loss": 0.79040015, - "num_input_tokens_seen": 58728610, - "step": 2713, - "time_per_iteration": 2.6188278198242188 - }, - { - "auxiliary_loss_clip": 0.01156539, - "auxiliary_loss_mlp": 0.0105792, - "balance_loss_clip": 1.06240916, - "balance_loss_mlp": 1.03832221, - "epoch": 0.16317450774086878, - "flos": 16467700164480.0, - "grad_norm": 2.543173325075216, - "language_loss": 0.79012156, - "learning_rate": 3.8168220086361715e-06, - "loss": 0.81226611, - "num_input_tokens_seen": 58744385, - "step": 2714, - "time_per_iteration": 2.6534018516540527 - }, - { - "auxiliary_loss_clip": 0.01149567, - "auxiliary_loss_mlp": 0.01056152, - "balance_loss_clip": 1.05467987, - "balance_loss_mlp": 1.03724504, - "epoch": 0.16323463099353674, - "flos": 24352929308160.0, - "grad_norm": 1.614702766215493, - "language_loss": 0.77693665, - "learning_rate": 3.816659148720702e-06, - "loss": 0.79899377, - "num_input_tokens_seen": 58763905, - "step": 2715, - "time_per_iteration": 2.856006383895874 - }, - { - "auxiliary_loss_clip": 0.01129437, - "auxiliary_loss_mlp": 0.01044046, - "balance_loss_clip": 1.04810584, - "balance_loss_mlp": 1.02525854, - "epoch": 0.1632947542462047, - "flos": 24900782520960.0, - "grad_norm": 2.374975046722651, - "language_loss": 0.81513858, - "learning_rate": 3.816496219917336e-06, - "loss": 0.83687335, - "num_input_tokens_seen": 58785580, - "step": 2716, - "time_per_iteration": 2.6750845909118652 - }, - { - "auxiliary_loss_clip": 0.01144393, - "auxiliary_loss_mlp": 0.01055927, - "balance_loss_clip": 1.05851114, - "balance_loss_mlp": 1.03703237, - "epoch": 0.1633548774988727, - "flos": 24900279730560.0, - "grad_norm": 1.8186679286330678, - "language_loss": 0.86522418, - "learning_rate": 3.816333222232251e-06, - "loss": 0.88722742, - "num_input_tokens_seen": 58806075, - "step": 2717, - "time_per_iteration": 2.761622428894043 - }, - { - "auxiliary_loss_clip": 0.01135377, - "auxiliary_loss_mlp": 0.01045964, - "balance_loss_clip": 1.05334044, - "balance_loss_mlp": 1.0274632, - "epoch": 0.16341500075154067, - "flos": 30441798357120.0, - "grad_norm": 1.8799656187942837, - "language_loss": 0.76924133, - "learning_rate": 3.816170155671629e-06, - "loss": 0.79105473, - "num_input_tokens_seen": 58827405, - "step": 2718, - "time_per_iteration": 2.7946770191192627 - }, - { - "auxiliary_loss_clip": 0.01145146, - "auxiliary_loss_mlp": 0.01043682, - "balance_loss_clip": 1.05553615, - "balance_loss_mlp": 1.02566922, - "epoch": 0.16347512400420863, - "flos": 22784530878720.0, - "grad_norm": 2.2449478392049906, - "language_loss": 0.73827291, - "learning_rate": 3.816007020241652e-06, - "loss": 0.76016116, - "num_input_tokens_seen": 58847205, - "step": 2719, - "time_per_iteration": 2.719980478286743 - }, - { - "auxiliary_loss_clip": 0.01128361, - "auxiliary_loss_mlp": 0.01045887, - "balance_loss_clip": 1.04900515, - "balance_loss_mlp": 1.02732563, - "epoch": 0.1635352472568766, - "flos": 22633274707200.0, - "grad_norm": 1.7092252575708884, - "language_loss": 0.72267497, - "learning_rate": 3.815843815948507e-06, - "loss": 0.74441749, - "num_input_tokens_seen": 58866865, - "step": 2720, - "time_per_iteration": 2.8737292289733887 - }, - { - "auxiliary_loss_clip": 0.01109456, - "auxiliary_loss_mlp": 0.01049703, - "balance_loss_clip": 1.05004287, - "balance_loss_mlp": 1.02840054, - "epoch": 0.16359537050954456, - "flos": 15522998515200.0, - "grad_norm": 2.1621365878543153, - "language_loss": 0.75120997, - "learning_rate": 3.8156805427983824e-06, - "loss": 0.77280164, - "num_input_tokens_seen": 58885200, - "step": 2721, - "time_per_iteration": 2.785296678543091 - }, - { - "auxiliary_loss_clip": 0.01110342, - "auxiliary_loss_mlp": 0.01059955, - "balance_loss_clip": 1.04597676, - "balance_loss_mlp": 1.03734064, - "epoch": 0.16365549376221253, - "flos": 22090162089600.0, - "grad_norm": 1.9032438792006017, - "language_loss": 0.79073942, - "learning_rate": 3.8155172007974695e-06, - "loss": 0.81244236, - "num_input_tokens_seen": 58906385, - "step": 2722, - "time_per_iteration": 2.7850708961486816 - }, - { - "auxiliary_loss_clip": 0.01149809, - "auxiliary_loss_mlp": 0.00778798, - "balance_loss_clip": 1.05395257, - "balance_loss_mlp": 1.00171757, - "epoch": 0.1637156170148805, - "flos": 24060400945920.0, - "grad_norm": 2.3019049903761215, - "language_loss": 0.84954333, - "learning_rate": 3.8153537899519624e-06, - "loss": 0.86882937, - "num_input_tokens_seen": 58925040, - "step": 2723, - "time_per_iteration": 2.7268764972686768 - }, - { - "auxiliary_loss_clip": 0.01108328, - "auxiliary_loss_mlp": 0.01044851, - "balance_loss_clip": 1.04805517, - "balance_loss_mlp": 1.02493143, - "epoch": 0.1637757402675485, - "flos": 26685362954880.0, - "grad_norm": 1.8985615531712963, - "language_loss": 0.71018666, - "learning_rate": 3.815190310268058e-06, - "loss": 0.73171842, - "num_input_tokens_seen": 58944790, - "step": 2724, - "time_per_iteration": 2.7691783905029297 - }, - { - "auxiliary_loss_clip": 0.01118053, - "auxiliary_loss_mlp": 0.01041883, - "balance_loss_clip": 1.05226958, - "balance_loss_mlp": 1.02364373, - "epoch": 0.16383586352021645, - "flos": 16106941918080.0, - "grad_norm": 2.1059770262776136, - "language_loss": 0.70552838, - "learning_rate": 3.815026761751955e-06, - "loss": 0.72712779, - "num_input_tokens_seen": 58962500, - "step": 2725, - "time_per_iteration": 2.6936957836151123 - }, - { - "auxiliary_loss_clip": 0.01112368, - "auxiliary_loss_mlp": 0.01046594, - "balance_loss_clip": 1.04912174, - "balance_loss_mlp": 1.028391, - "epoch": 0.16389598677288442, - "flos": 19165991788800.0, - "grad_norm": 2.27810298992254, - "language_loss": 0.88491893, - "learning_rate": 3.814863144409855e-06, - "loss": 0.90650856, - "num_input_tokens_seen": 58980355, - "step": 2726, - "time_per_iteration": 2.7967143058776855 - }, - { - "auxiliary_loss_clip": 0.01157668, - "auxiliary_loss_mlp": 0.0105068, - "balance_loss_clip": 1.06062055, - "balance_loss_mlp": 1.03099847, - "epoch": 0.16395611002555238, - "flos": 21507008785920.0, - "grad_norm": 2.0584475237926303, - "language_loss": 0.7469939, - "learning_rate": 3.814699458247963e-06, - "loss": 0.7690773, - "num_input_tokens_seen": 58999505, - "step": 2727, - "time_per_iteration": 2.6818623542785645 - }, - { - "auxiliary_loss_clip": 0.01150971, - "auxiliary_loss_mlp": 0.01052077, - "balance_loss_clip": 1.0570507, - "balance_loss_mlp": 1.03527999, - "epoch": 0.16401623327822035, - "flos": 21470918595840.0, - "grad_norm": 1.6112579442237729, - "language_loss": 0.83097756, - "learning_rate": 3.8145357032724855e-06, - "loss": 0.85300803, - "num_input_tokens_seen": 59017930, - "step": 2728, - "time_per_iteration": 2.675360918045044 - }, - { - "auxiliary_loss_clip": 0.01156153, - "auxiliary_loss_mlp": 0.01045609, - "balance_loss_clip": 1.05826735, - "balance_loss_mlp": 1.02602315, - "epoch": 0.1640763565308883, - "flos": 13626232928640.0, - "grad_norm": 2.5738755626941106, - "language_loss": 0.84892929, - "learning_rate": 3.814371879489633e-06, - "loss": 0.87094688, - "num_input_tokens_seen": 59035130, - "step": 2729, - "time_per_iteration": 2.7004599571228027 - }, - { - "auxiliary_loss_clip": 0.01167293, - "auxiliary_loss_mlp": 0.01048461, - "balance_loss_clip": 1.0591594, - "balance_loss_mlp": 1.03053224, - "epoch": 0.16413647978355628, - "flos": 15451464579840.0, - "grad_norm": 1.9897225699042427, - "language_loss": 0.72895479, - "learning_rate": 3.814207986905616e-06, - "loss": 0.75111228, - "num_input_tokens_seen": 59053080, - "step": 2730, - "time_per_iteration": 2.593179702758789 - }, - { - "auxiliary_loss_clip": 0.01142509, - "auxiliary_loss_mlp": 0.01050071, - "balance_loss_clip": 1.05208349, - "balance_loss_mlp": 1.02908981, - "epoch": 0.16419660303622427, - "flos": 45878682015360.0, - "grad_norm": 1.6754501336017709, - "language_loss": 0.74384654, - "learning_rate": 3.814044025526651e-06, - "loss": 0.76577234, - "num_input_tokens_seen": 59075610, - "step": 2731, - "time_per_iteration": 2.8702962398529053 - }, - { - "auxiliary_loss_clip": 0.01122791, - "auxiliary_loss_mlp": 0.01047176, - "balance_loss_clip": 1.05006754, - "balance_loss_mlp": 1.02650499, - "epoch": 0.16425672628889224, - "flos": 18952826526720.0, - "grad_norm": 2.031351475505915, - "language_loss": 0.79190683, - "learning_rate": 3.8138799953589548e-06, - "loss": 0.8136065, - "num_input_tokens_seen": 59094555, - "step": 2732, - "time_per_iteration": 2.734529972076416 - }, - { - "auxiliary_loss_clip": 0.01141118, - "auxiliary_loss_mlp": 0.01047385, - "balance_loss_clip": 1.05340672, - "balance_loss_mlp": 1.02796555, - "epoch": 0.1643168495415602, - "flos": 24312996362880.0, - "grad_norm": 2.250003976384769, - "language_loss": 0.69526887, - "learning_rate": 3.8137158964087473e-06, - "loss": 0.71715385, - "num_input_tokens_seen": 59113515, - "step": 2733, - "time_per_iteration": 2.672377109527588 - }, - { - "auxiliary_loss_clip": 0.01143332, - "auxiliary_loss_mlp": 0.01053232, - "balance_loss_clip": 1.05603123, - "balance_loss_mlp": 1.0325135, - "epoch": 0.16437697279422817, - "flos": 26428421992320.0, - "grad_norm": 2.000873580428856, - "language_loss": 0.80976766, - "learning_rate": 3.8135517286822508e-06, - "loss": 0.83173329, - "num_input_tokens_seen": 59133275, - "step": 2734, - "time_per_iteration": 2.710293769836426 - }, - { - "auxiliary_loss_clip": 0.01135758, - "auxiliary_loss_mlp": 0.01056722, - "balance_loss_clip": 1.05488348, - "balance_loss_mlp": 1.03470409, - "epoch": 0.16443709604689613, - "flos": 34532239351680.0, - "grad_norm": 2.100664117201308, - "language_loss": 0.81810421, - "learning_rate": 3.8133874921856914e-06, - "loss": 0.840029, - "num_input_tokens_seen": 59154095, - "step": 2735, - "time_per_iteration": 2.8074140548706055 - }, - { - "auxiliary_loss_clip": 0.01070875, - "auxiliary_loss_mlp": 0.01044313, - "balance_loss_clip": 1.04323888, - "balance_loss_mlp": 1.02508426, - "epoch": 0.1644972192995641, - "flos": 23258048895360.0, - "grad_norm": 2.405088987017839, - "language_loss": 0.78515649, - "learning_rate": 3.813223186925296e-06, - "loss": 0.80630839, - "num_input_tokens_seen": 59173795, - "step": 2736, - "time_per_iteration": 2.839087963104248 - }, - { - "auxiliary_loss_clip": 0.01147998, - "auxiliary_loss_mlp": 0.01054659, - "balance_loss_clip": 1.05859447, - "balance_loss_mlp": 1.03513288, - "epoch": 0.1645573425522321, - "flos": 26979543342720.0, - "grad_norm": 1.9462182296456145, - "language_loss": 0.81052899, - "learning_rate": 3.8130588129072964e-06, - "loss": 0.83255553, - "num_input_tokens_seen": 59191610, - "step": 2737, - "time_per_iteration": 2.7328996658325195 - }, - { - "auxiliary_loss_clip": 0.01150424, - "auxiliary_loss_mlp": 0.01052207, - "balance_loss_clip": 1.0559026, - "balance_loss_mlp": 1.03065443, - "epoch": 0.16461746580490005, - "flos": 28731768600960.0, - "grad_norm": 1.8596348168124566, - "language_loss": 0.87449318, - "learning_rate": 3.8128943701379246e-06, - "loss": 0.89651948, - "num_input_tokens_seen": 59213000, - "step": 2738, - "time_per_iteration": 2.7345526218414307 - }, - { - "auxiliary_loss_clip": 0.01139154, - "auxiliary_loss_mlp": 0.0106055, - "balance_loss_clip": 1.05534518, - "balance_loss_mlp": 1.04079759, - "epoch": 0.16467758905756802, - "flos": 24930156867840.0, - "grad_norm": 1.728421510231393, - "language_loss": 0.71997833, - "learning_rate": 3.8127298586234167e-06, - "loss": 0.74197543, - "num_input_tokens_seen": 59232340, - "step": 2739, - "time_per_iteration": 2.7091422080993652 - }, - { - "auxiliary_loss_clip": 0.01154419, - "auxiliary_loss_mlp": 0.0105106, - "balance_loss_clip": 1.05673754, - "balance_loss_mlp": 1.0312835, - "epoch": 0.16473771231023598, - "flos": 24826519152000.0, - "grad_norm": 1.8559436932352185, - "language_loss": 0.81645715, - "learning_rate": 3.8125652783700104e-06, - "loss": 0.83851194, - "num_input_tokens_seen": 59253950, - "step": 2740, - "time_per_iteration": 2.712658166885376 - }, - { - "auxiliary_loss_clip": 0.01114061, - "auxiliary_loss_mlp": 0.01068725, - "balance_loss_clip": 1.04991829, - "balance_loss_mlp": 1.04307163, - "epoch": 0.16479783556290395, - "flos": 39896072375040.0, - "grad_norm": 2.0528021789830837, - "language_loss": 0.69467485, - "learning_rate": 3.8124006293839475e-06, - "loss": 0.71650267, - "num_input_tokens_seen": 59275545, - "step": 2741, - "time_per_iteration": 2.8629493713378906 - }, - { - "auxiliary_loss_clip": 0.01167543, - "auxiliary_loss_mlp": 0.01048721, - "balance_loss_clip": 1.05907226, - "balance_loss_mlp": 1.02906334, - "epoch": 0.16485795881557191, - "flos": 19897061299200.0, - "grad_norm": 1.7765193730452222, - "language_loss": 0.79811072, - "learning_rate": 3.812235911671472e-06, - "loss": 0.8202734, - "num_input_tokens_seen": 59293480, - "step": 2742, - "time_per_iteration": 2.626775026321411 - }, - { - "auxiliary_loss_clip": 0.01141681, - "auxiliary_loss_mlp": 0.01055663, - "balance_loss_clip": 1.05664062, - "balance_loss_mlp": 1.03477716, - "epoch": 0.16491808206823988, - "flos": 20556129997440.0, - "grad_norm": 1.91797408289014, - "language_loss": 0.8499459, - "learning_rate": 3.8120711252388274e-06, - "loss": 0.87191939, - "num_input_tokens_seen": 59313435, - "step": 2743, - "time_per_iteration": 2.8218302726745605 - }, - { - "auxiliary_loss_clip": 0.01162447, - "auxiliary_loss_mlp": 0.01051969, - "balance_loss_clip": 1.05743837, - "balance_loss_mlp": 1.03196514, - "epoch": 0.16497820532090787, - "flos": 23800802376960.0, - "grad_norm": 1.4425200129075006, - "language_loss": 0.85558498, - "learning_rate": 3.811906270092265e-06, - "loss": 0.87772918, - "num_input_tokens_seen": 59331535, - "step": 2744, - "time_per_iteration": 4.206263542175293 - }, - { - "auxiliary_loss_clip": 0.01131671, - "auxiliary_loss_mlp": 0.0104676, - "balance_loss_clip": 1.05206287, - "balance_loss_mlp": 1.02812767, - "epoch": 0.16503832857357584, - "flos": 25482642935040.0, - "grad_norm": 1.6285200980820358, - "language_loss": 0.82770813, - "learning_rate": 3.811741346238036e-06, - "loss": 0.84949243, - "num_input_tokens_seen": 59350680, - "step": 2745, - "time_per_iteration": 4.331594467163086 - }, - { - "auxiliary_loss_clip": 0.011344, - "auxiliary_loss_mlp": 0.01057242, - "balance_loss_clip": 1.05874014, - "balance_loss_mlp": 1.03825223, - "epoch": 0.1650984518262438, - "flos": 17676058619520.0, - "grad_norm": 6.766690288332402, - "language_loss": 0.76811314, - "learning_rate": 3.8115763536823923e-06, - "loss": 0.79002959, - "num_input_tokens_seen": 59367020, - "step": 2746, - "time_per_iteration": 4.225586414337158 - }, - { - "auxiliary_loss_clip": 0.01164296, - "auxiliary_loss_mlp": 0.01055636, - "balance_loss_clip": 1.05781221, - "balance_loss_mlp": 1.03533494, - "epoch": 0.16515857507891177, - "flos": 18698327688960.0, - "grad_norm": 1.9760186874049024, - "language_loss": 0.80818808, - "learning_rate": 3.811411292431592e-06, - "loss": 0.83038735, - "num_input_tokens_seen": 59386075, - "step": 2747, - "time_per_iteration": 2.6862480640411377 - }, - { - "auxiliary_loss_clip": 0.01157975, - "auxiliary_loss_mlp": 0.0104673, - "balance_loss_clip": 1.05990267, - "balance_loss_mlp": 1.02664328, - "epoch": 0.16521869833157973, - "flos": 15010481306880.0, - "grad_norm": 2.0608482379031337, - "language_loss": 0.69433749, - "learning_rate": 3.8112461624918945e-06, - "loss": 0.71638453, - "num_input_tokens_seen": 59402690, - "step": 2748, - "time_per_iteration": 2.6520986557006836 - }, - { - "auxiliary_loss_clip": 0.01169692, - "auxiliary_loss_mlp": 0.00778195, - "balance_loss_clip": 1.06237423, - "balance_loss_mlp": 1.00173104, - "epoch": 0.1652788215842477, - "flos": 22121152548480.0, - "grad_norm": 2.259215537482641, - "language_loss": 0.88012803, - "learning_rate": 3.811080963869561e-06, - "loss": 0.89960694, - "num_input_tokens_seen": 59421130, - "step": 2749, - "time_per_iteration": 4.260679244995117 - }, - { - "auxiliary_loss_clip": 0.01154179, - "auxiliary_loss_mlp": 0.01045617, - "balance_loss_clip": 1.05586052, - "balance_loss_mlp": 1.02542281, - "epoch": 0.16533894483691566, - "flos": 18333080242560.0, - "grad_norm": 2.0880864906339864, - "language_loss": 0.79240286, - "learning_rate": 3.8109156965708557e-06, - "loss": 0.81440079, - "num_input_tokens_seen": 59438970, - "step": 2750, - "time_per_iteration": 2.6335251331329346 - }, - { - "auxiliary_loss_clip": 0.01153343, - "auxiliary_loss_mlp": 0.0104591, - "balance_loss_clip": 1.0579437, - "balance_loss_mlp": 1.02602625, - "epoch": 0.16539906808958366, - "flos": 22382115834240.0, - "grad_norm": 1.6952801391084946, - "language_loss": 0.94854712, - "learning_rate": 3.8107503606020455e-06, - "loss": 0.97053963, - "num_input_tokens_seen": 59458510, - "step": 2751, - "time_per_iteration": 2.697174310684204 - }, - { - "auxiliary_loss_clip": 0.0106803, - "auxiliary_loss_mlp": 0.0105236, - "balance_loss_clip": 1.04625726, - "balance_loss_mlp": 1.03247619, - "epoch": 0.16545919134225162, - "flos": 22711093522560.0, - "grad_norm": 2.614588592950962, - "language_loss": 0.71231711, - "learning_rate": 3.8105849559693997e-06, - "loss": 0.73352098, - "num_input_tokens_seen": 59477110, - "step": 2752, - "time_per_iteration": 2.7780745029449463 - }, - { - "auxiliary_loss_clip": 0.01090521, - "auxiliary_loss_mlp": 0.01022104, - "balance_loss_clip": 1.05741131, - "balance_loss_mlp": 1.01941013, - "epoch": 0.1655193145949196, - "flos": 67802974076160.0, - "grad_norm": 0.7721529651221379, - "language_loss": 0.54058975, - "learning_rate": 3.810419482679192e-06, - "loss": 0.56171602, - "num_input_tokens_seen": 59541155, - "step": 2753, - "time_per_iteration": 3.3371469974517822 - }, - { - "auxiliary_loss_clip": 0.01163808, - "auxiliary_loss_mlp": 0.00778536, - "balance_loss_clip": 1.05587018, - "balance_loss_mlp": 1.00172091, - "epoch": 0.16557943784758755, - "flos": 24280389792000.0, - "grad_norm": 1.6411537728312637, - "language_loss": 0.75436741, - "learning_rate": 3.8102539407376954e-06, - "loss": 0.7737909, - "num_input_tokens_seen": 59561155, - "step": 2754, - "time_per_iteration": 2.6382133960723877 - }, - { - "auxiliary_loss_clip": 0.01139421, - "auxiliary_loss_mlp": 0.01060584, - "balance_loss_clip": 1.05406713, - "balance_loss_mlp": 1.03768396, - "epoch": 0.16563956110025552, - "flos": 20083617561600.0, - "grad_norm": 2.4067479946694137, - "language_loss": 0.86654639, - "learning_rate": 3.810088330151188e-06, - "loss": 0.88854647, - "num_input_tokens_seen": 59580460, - "step": 2755, - "time_per_iteration": 2.6590075492858887 - }, - { - "auxiliary_loss_clip": 0.01122817, - "auxiliary_loss_mlp": 0.01053169, - "balance_loss_clip": 1.04948378, - "balance_loss_mlp": 1.03293943, - "epoch": 0.16569968435292348, - "flos": 28034454896640.0, - "grad_norm": 1.7268487777137649, - "language_loss": 0.73350251, - "learning_rate": 3.80992265092595e-06, - "loss": 0.75526237, - "num_input_tokens_seen": 59600025, - "step": 2756, - "time_per_iteration": 2.771820545196533 - }, - { - "auxiliary_loss_clip": 0.01128662, - "auxiliary_loss_mlp": 0.01049666, - "balance_loss_clip": 1.05550277, - "balance_loss_mlp": 1.02969813, - "epoch": 0.16575980760559147, - "flos": 26250233598720.0, - "grad_norm": 1.5540667033085804, - "language_loss": 0.75308084, - "learning_rate": 3.8097569030682636e-06, - "loss": 0.77486414, - "num_input_tokens_seen": 59620600, - "step": 2757, - "time_per_iteration": 2.8106157779693604 - }, - { - "auxiliary_loss_clip": 0.01143608, - "auxiliary_loss_mlp": 0.01054064, - "balance_loss_clip": 1.057634, - "balance_loss_mlp": 1.03390563, - "epoch": 0.16581993085825944, - "flos": 26943955943040.0, - "grad_norm": 1.8675154897424497, - "language_loss": 0.84604371, - "learning_rate": 3.8095910865844137e-06, - "loss": 0.86802036, - "num_input_tokens_seen": 59641385, - "step": 2758, - "time_per_iteration": 2.8663368225097656 - }, - { - "auxiliary_loss_clip": 0.01168186, - "auxiliary_loss_mlp": 0.01058337, - "balance_loss_clip": 1.06166434, - "balance_loss_mlp": 1.03952527, - "epoch": 0.1658800541109274, - "flos": 21653632103040.0, - "grad_norm": 2.0824774555850243, - "language_loss": 0.78848934, - "learning_rate": 3.809425201480689e-06, - "loss": 0.81075454, - "num_input_tokens_seen": 59659865, - "step": 2759, - "time_per_iteration": 2.655371904373169 - }, - { - "auxiliary_loss_clip": 0.01098973, - "auxiliary_loss_mlp": 0.0104879, - "balance_loss_clip": 1.0491066, - "balance_loss_mlp": 1.02846527, - "epoch": 0.16594017736359537, - "flos": 16435488643200.0, - "grad_norm": 2.4005603702739613, - "language_loss": 0.75130272, - "learning_rate": 3.8092592477633793e-06, - "loss": 0.77278036, - "num_input_tokens_seen": 59678780, - "step": 2760, - "time_per_iteration": 2.767866611480713 - }, - { - "auxiliary_loss_clip": 0.01117278, - "auxiliary_loss_mlp": 0.0104823, - "balance_loss_clip": 1.05129814, - "balance_loss_mlp": 1.02867997, - "epoch": 0.16600030061626334, - "flos": 22637297030400.0, - "grad_norm": 1.5792623632565632, - "language_loss": 0.73425764, - "learning_rate": 3.8090932254387774e-06, - "loss": 0.75591272, - "num_input_tokens_seen": 59698795, - "step": 2761, - "time_per_iteration": 2.762836456298828 - }, - { - "auxiliary_loss_clip": 0.0113507, - "auxiliary_loss_mlp": 0.01050415, - "balance_loss_clip": 1.05250192, - "balance_loss_mlp": 1.03018475, - "epoch": 0.1660604238689313, - "flos": 26396569607040.0, - "grad_norm": 2.9515424803015033, - "language_loss": 0.88832974, - "learning_rate": 3.8089271345131788e-06, - "loss": 0.91018462, - "num_input_tokens_seen": 59718795, - "step": 2762, - "time_per_iteration": 2.766324281692505 - }, - { - "auxiliary_loss_clip": 0.01115163, - "auxiliary_loss_mlp": 0.01050144, - "balance_loss_clip": 1.05208707, - "balance_loss_mlp": 1.03080845, - "epoch": 0.16612054712159927, - "flos": 23039999383680.0, - "grad_norm": 1.84507980271118, - "language_loss": 0.87992418, - "learning_rate": 3.8087609749928822e-06, - "loss": 0.90157735, - "num_input_tokens_seen": 59737555, - "step": 2763, - "time_per_iteration": 2.7734055519104004 - }, - { - "auxiliary_loss_clip": 0.01086152, - "auxiliary_loss_mlp": 0.01013622, - "balance_loss_clip": 1.0448606, - "balance_loss_mlp": 1.01065338, - "epoch": 0.16618067037426726, - "flos": 59241225202560.0, - "grad_norm": 0.7790832079967882, - "language_loss": 0.59799927, - "learning_rate": 3.8085947468841885e-06, - "loss": 0.61899698, - "num_input_tokens_seen": 59800915, - "step": 2764, - "time_per_iteration": 3.1728692054748535 - }, - { - "auxiliary_loss_clip": 0.01152232, - "auxiliary_loss_mlp": 0.01053607, - "balance_loss_clip": 1.05467176, - "balance_loss_mlp": 1.03254318, - "epoch": 0.16624079362693522, - "flos": 27198813916800.0, - "grad_norm": 1.7436496772383425, - "language_loss": 0.82260036, - "learning_rate": 3.808428450193401e-06, - "loss": 0.84465873, - "num_input_tokens_seen": 59822910, - "step": 2765, - "time_per_iteration": 2.72440767288208 - }, - { - "auxiliary_loss_clip": 0.01171844, - "auxiliary_loss_mlp": 0.01049085, - "balance_loss_clip": 1.05882454, - "balance_loss_mlp": 1.02746069, - "epoch": 0.1663009168796032, - "flos": 10925068216320.0, - "grad_norm": 2.128015994498251, - "language_loss": 0.69980019, - "learning_rate": 3.8082620849268244e-06, - "loss": 0.72200948, - "num_input_tokens_seen": 59838805, - "step": 2766, - "time_per_iteration": 2.5810647010803223 - }, - { - "auxiliary_loss_clip": 0.0115036, - "auxiliary_loss_mlp": 0.01047665, - "balance_loss_clip": 1.05772817, - "balance_loss_mlp": 1.02792454, - "epoch": 0.16636104013227115, - "flos": 17894431353600.0, - "grad_norm": 2.107381123394178, - "language_loss": 0.8845337, - "learning_rate": 3.808095651090769e-06, - "loss": 0.90651393, - "num_input_tokens_seen": 59855345, - "step": 2767, - "time_per_iteration": 2.659240245819092 - }, - { - "auxiliary_loss_clip": 0.01077283, - "auxiliary_loss_mlp": 0.01002999, - "balance_loss_clip": 1.046556, - "balance_loss_mlp": 1.00020981, - "epoch": 0.16642116338493912, - "flos": 66726050463360.0, - "grad_norm": 0.6403612433239105, - "language_loss": 0.5289067, - "learning_rate": 3.8079291486915447e-06, - "loss": 0.54970956, - "num_input_tokens_seen": 59917710, - "step": 2768, - "time_per_iteration": 3.28488826751709 - }, - { - "auxiliary_loss_clip": 0.01137637, - "auxiliary_loss_mlp": 0.01051692, - "balance_loss_clip": 1.05451822, - "balance_loss_mlp": 1.03034163, - "epoch": 0.16648128663760708, - "flos": 19026048401280.0, - "grad_norm": 2.4342686570828267, - "language_loss": 0.84962058, - "learning_rate": 3.8077625777354667e-06, - "loss": 0.87151396, - "num_input_tokens_seen": 59935105, - "step": 2769, - "time_per_iteration": 2.753257989883423 - }, - { - "auxiliary_loss_clip": 0.01068987, - "auxiliary_loss_mlp": 0.0100573, - "balance_loss_clip": 1.04678345, - "balance_loss_mlp": 1.00316668, - "epoch": 0.16654140989027508, - "flos": 70134976759680.0, - "grad_norm": 0.8107434108728753, - "language_loss": 0.57455683, - "learning_rate": 3.80759593822885e-06, - "loss": 0.59530401, - "num_input_tokens_seen": 59984085, - "step": 2770, - "time_per_iteration": 3.2202906608581543 - }, - { - "auxiliary_loss_clip": 0.01054548, - "auxiliary_loss_mlp": 0.01003676, - "balance_loss_clip": 1.04637623, - "balance_loss_mlp": 1.00086308, - "epoch": 0.16660153314294304, - "flos": 70272406195200.0, - "grad_norm": 0.8940719168038874, - "language_loss": 0.56241393, - "learning_rate": 3.807429230178015e-06, - "loss": 0.58299619, - "num_input_tokens_seen": 60043470, - "step": 2771, - "time_per_iteration": 3.3302085399627686 - }, - { - "auxiliary_loss_clip": 0.01110714, - "auxiliary_loss_mlp": 0.01053994, - "balance_loss_clip": 1.04819679, - "balance_loss_mlp": 1.03316772, - "epoch": 0.166661656395611, - "flos": 23075048079360.0, - "grad_norm": 2.9137693497887778, - "language_loss": 0.70419657, - "learning_rate": 3.8072624535892817e-06, - "loss": 0.72584367, - "num_input_tokens_seen": 60063045, - "step": 2772, - "time_per_iteration": 2.845414161682129 - }, - { - "auxiliary_loss_clip": 0.0114592, - "auxiliary_loss_mlp": 0.01049708, - "balance_loss_clip": 1.05082583, - "balance_loss_mlp": 1.02923954, - "epoch": 0.16672177964827897, - "flos": 28366341586560.0, - "grad_norm": 2.20945076195277, - "language_loss": 0.86324167, - "learning_rate": 3.807095608468975e-06, - "loss": 0.88519788, - "num_input_tokens_seen": 60081945, - "step": 2773, - "time_per_iteration": 2.669412851333618 - }, - { - "auxiliary_loss_clip": 0.01095425, - "auxiliary_loss_mlp": 0.01049097, - "balance_loss_clip": 1.04436934, - "balance_loss_mlp": 1.0300827, - "epoch": 0.16678190290094694, - "flos": 19091010147840.0, - "grad_norm": 2.0211952616678937, - "language_loss": 0.82141376, - "learning_rate": 3.8069286948234224e-06, - "loss": 0.84285897, - "num_input_tokens_seen": 60096820, - "step": 2774, - "time_per_iteration": 2.7111308574676514 - }, - { - "auxiliary_loss_clip": 0.01123493, - "auxiliary_loss_mlp": 0.01045144, - "balance_loss_clip": 1.05252421, - "balance_loss_mlp": 1.02446127, - "epoch": 0.1668420261536149, - "flos": 21799106184960.0, - "grad_norm": 3.3781068524499, - "language_loss": 0.8298822, - "learning_rate": 3.806761712658952e-06, - "loss": 0.85156858, - "num_input_tokens_seen": 60116140, - "step": 2775, - "time_per_iteration": 2.7367632389068604 - }, - { - "auxiliary_loss_clip": 0.01150495, - "auxiliary_loss_mlp": 0.01051475, - "balance_loss_clip": 1.05761933, - "balance_loss_mlp": 1.03264022, - "epoch": 0.16690214940628287, - "flos": 19062533640960.0, - "grad_norm": 1.8115651629444076, - "language_loss": 0.80919641, - "learning_rate": 3.806594661981897e-06, - "loss": 0.8312161, - "num_input_tokens_seen": 60134235, - "step": 2776, - "time_per_iteration": 2.651723623275757 - }, - { - "auxiliary_loss_clip": 0.0113775, - "auxiliary_loss_mlp": 0.01054199, - "balance_loss_clip": 1.05518723, - "balance_loss_mlp": 1.0346483, - "epoch": 0.16696227265895086, - "flos": 18588548747520.0, - "grad_norm": 2.7510345221850336, - "language_loss": 0.80203485, - "learning_rate": 3.8064275427985906e-06, - "loss": 0.82395434, - "num_input_tokens_seen": 60153275, - "step": 2777, - "time_per_iteration": 2.6380929946899414 - }, - { - "auxiliary_loss_clip": 0.01147967, - "auxiliary_loss_mlp": 0.01045166, - "balance_loss_clip": 1.05270481, - "balance_loss_mlp": 1.02640271, - "epoch": 0.16702239591161883, - "flos": 23294139085440.0, - "grad_norm": 1.6179722336290305, - "language_loss": 0.85384095, - "learning_rate": 3.806260355115371e-06, - "loss": 0.87577224, - "num_input_tokens_seen": 60173215, - "step": 2778, - "time_per_iteration": 2.754652500152588 - }, - { - "auxiliary_loss_clip": 0.01136802, - "auxiliary_loss_mlp": 0.01040643, - "balance_loss_clip": 1.0531714, - "balance_loss_mlp": 1.02148652, - "epoch": 0.1670825191642868, - "flos": 24425648392320.0, - "grad_norm": 3.2091470007324414, - "language_loss": 0.74180603, - "learning_rate": 3.8060930989385778e-06, - "loss": 0.76358056, - "num_input_tokens_seen": 60190515, - "step": 2779, - "time_per_iteration": 2.777193784713745 - }, - { - "auxiliary_loss_clip": 0.01112683, - "auxiliary_loss_mlp": 0.00777451, - "balance_loss_clip": 1.04981184, - "balance_loss_mlp": 1.0015173, - "epoch": 0.16714264241695476, - "flos": 26797512193920.0, - "grad_norm": 2.127789274190337, - "language_loss": 0.6557346, - "learning_rate": 3.805925774274554e-06, - "loss": 0.67463589, - "num_input_tokens_seen": 60211655, - "step": 2780, - "time_per_iteration": 2.896976947784424 - }, - { - "auxiliary_loss_clip": 0.01120921, - "auxiliary_loss_mlp": 0.01045506, - "balance_loss_clip": 1.04843462, - "balance_loss_mlp": 1.02547836, - "epoch": 0.16720276566962272, - "flos": 21835304115840.0, - "grad_norm": 2.46647860258999, - "language_loss": 0.78422606, - "learning_rate": 3.805758381129643e-06, - "loss": 0.80589032, - "num_input_tokens_seen": 60230860, - "step": 2781, - "time_per_iteration": 2.725782632827759 - }, - { - "auxiliary_loss_clip": 0.01094692, - "auxiliary_loss_mlp": 0.01050104, - "balance_loss_clip": 1.04439843, - "balance_loss_mlp": 1.03056526, - "epoch": 0.1672628889222907, - "flos": 21470415805440.0, - "grad_norm": 26.23767952829368, - "language_loss": 0.75119764, - "learning_rate": 3.805590919510193e-06, - "loss": 0.77264553, - "num_input_tokens_seen": 60250535, - "step": 2782, - "time_per_iteration": 2.7064197063446045 - }, - { - "auxiliary_loss_clip": 0.01129162, - "auxiliary_loss_mlp": 0.01047612, - "balance_loss_clip": 1.05152631, - "balance_loss_mlp": 1.02764392, - "epoch": 0.16732301217495865, - "flos": 30774008269440.0, - "grad_norm": 2.116531296279042, - "language_loss": 0.67398441, - "learning_rate": 3.8054233894225547e-06, - "loss": 0.69575214, - "num_input_tokens_seen": 60269530, - "step": 2783, - "time_per_iteration": 2.7901556491851807 - }, - { - "auxiliary_loss_clip": 0.01158882, - "auxiliary_loss_mlp": 0.0105166, - "balance_loss_clip": 1.05460215, - "balance_loss_mlp": 1.03271747, - "epoch": 0.16738313542762664, - "flos": 23474625949440.0, - "grad_norm": 1.7768362036873409, - "language_loss": 0.69919086, - "learning_rate": 3.805255790873081e-06, - "loss": 0.72129631, - "num_input_tokens_seen": 60289900, - "step": 2784, - "time_per_iteration": 5.714844226837158 - }, - { - "auxiliary_loss_clip": 0.01137618, - "auxiliary_loss_mlp": 0.01056022, - "balance_loss_clip": 1.05217624, - "balance_loss_mlp": 1.03539932, - "epoch": 0.1674432586802946, - "flos": 29789086366080.0, - "grad_norm": 4.741795209709136, - "language_loss": 0.60970068, - "learning_rate": 3.805088123868126e-06, - "loss": 0.6316371, - "num_input_tokens_seen": 60310025, - "step": 2785, - "time_per_iteration": 4.219547510147095 - }, - { - "auxiliary_loss_clip": 0.01057886, - "auxiliary_loss_mlp": 0.0100398, - "balance_loss_clip": 1.03758883, - "balance_loss_mlp": 1.00141752, - "epoch": 0.16750338193296258, - "flos": 66136073575680.0, - "grad_norm": 0.773077721474628, - "language_loss": 0.58780885, - "learning_rate": 3.8049203884140492e-06, - "loss": 0.60842752, - "num_input_tokens_seen": 60377800, - "step": 2786, - "time_per_iteration": 3.2306320667266846 - }, - { - "auxiliary_loss_clip": 0.0113927, - "auxiliary_loss_mlp": 0.01044966, - "balance_loss_clip": 1.0496738, - "balance_loss_mlp": 1.02589226, - "epoch": 0.16756350518563054, - "flos": 25696777864320.0, - "grad_norm": 1.7333132735183339, - "language_loss": 0.76308596, - "learning_rate": 3.80475258451721e-06, - "loss": 0.78492826, - "num_input_tokens_seen": 60398215, - "step": 2787, - "time_per_iteration": 2.6434125900268555 - }, - { - "auxiliary_loss_clip": 0.01146924, - "auxiliary_loss_mlp": 0.01043386, - "balance_loss_clip": 1.0529089, - "balance_loss_mlp": 1.02544546, - "epoch": 0.1676236284382985, - "flos": 23836102467840.0, - "grad_norm": 1.7210472408736244, - "language_loss": 0.7717936, - "learning_rate": 3.804584712183972e-06, - "loss": 0.79369676, - "num_input_tokens_seen": 60416910, - "step": 2788, - "time_per_iteration": 4.359618425369263 - }, - { - "auxiliary_loss_clip": 0.01054629, - "auxiliary_loss_mlp": 0.00999991, - "balance_loss_clip": 1.03482509, - "balance_loss_mlp": 0.99746382, - "epoch": 0.16768375169096647, - "flos": 59874902985600.0, - "grad_norm": 0.8596744797543817, - "language_loss": 0.59331679, - "learning_rate": 3.8044167714207013e-06, - "loss": 0.61386299, - "num_input_tokens_seen": 60468660, - "step": 2789, - "time_per_iteration": 3.0742650032043457 - }, - { - "auxiliary_loss_clip": 0.01148272, - "auxiliary_loss_mlp": 0.01053856, - "balance_loss_clip": 1.05450928, - "balance_loss_mlp": 1.03428209, - "epoch": 0.16774387494363446, - "flos": 38435657207040.0, - "grad_norm": 1.689036486923415, - "language_loss": 0.7012763, - "learning_rate": 3.804248762233765e-06, - "loss": 0.7232976, - "num_input_tokens_seen": 60492370, - "step": 2790, - "time_per_iteration": 2.872232437133789 - }, - { - "auxiliary_loss_clip": 0.0112492, - "auxiliary_loss_mlp": 0.01051622, - "balance_loss_clip": 1.0497216, - "balance_loss_mlp": 1.0334661, - "epoch": 0.16780399819630243, - "flos": 22637620252800.0, - "grad_norm": 1.864386369112868, - "language_loss": 0.79464513, - "learning_rate": 3.8040806846295356e-06, - "loss": 0.81641054, - "num_input_tokens_seen": 60512655, - "step": 2791, - "time_per_iteration": 2.7180140018463135 - }, - { - "auxiliary_loss_clip": 0.01122456, - "auxiliary_loss_mlp": 0.01050939, - "balance_loss_clip": 1.04977369, - "balance_loss_mlp": 1.03106701, - "epoch": 0.1678641214489704, - "flos": 32891516887680.0, - "grad_norm": 1.705849915566178, - "language_loss": 0.71547955, - "learning_rate": 3.8039125386143853e-06, - "loss": 0.73721349, - "num_input_tokens_seen": 60533090, - "step": 2792, - "time_per_iteration": 2.9221818447113037 - }, - { - "auxiliary_loss_clip": 0.01131469, - "auxiliary_loss_mlp": 0.01044061, - "balance_loss_clip": 1.05479562, - "balance_loss_mlp": 1.02551246, - "epoch": 0.16792424470163836, - "flos": 19974916028160.0, - "grad_norm": 1.9301593564774673, - "language_loss": 0.71581644, - "learning_rate": 3.803744324194691e-06, - "loss": 0.73757172, - "num_input_tokens_seen": 60553190, - "step": 2793, - "time_per_iteration": 2.75104022026062 - }, - { - "auxiliary_loss_clip": 0.01143072, - "auxiliary_loss_mlp": 0.01053231, - "balance_loss_clip": 1.05276942, - "balance_loss_mlp": 1.03452659, - "epoch": 0.16798436795430632, - "flos": 19719878486400.0, - "grad_norm": 2.3859650274226833, - "language_loss": 0.7717455, - "learning_rate": 3.803576041376831e-06, - "loss": 0.79370856, - "num_input_tokens_seen": 60571995, - "step": 2794, - "time_per_iteration": 2.6007745265960693 - }, - { - "auxiliary_loss_clip": 0.01137828, - "auxiliary_loss_mlp": 0.0104987, - "balance_loss_clip": 1.05250025, - "balance_loss_mlp": 1.03010476, - "epoch": 0.1680444912069743, - "flos": 28104839596800.0, - "grad_norm": 2.7692472240964747, - "language_loss": 0.71609265, - "learning_rate": 3.803407690167187e-06, - "loss": 0.73796958, - "num_input_tokens_seen": 60591275, - "step": 2795, - "time_per_iteration": 2.693826198577881 - }, - { - "auxiliary_loss_clip": 0.01131865, - "auxiliary_loss_mlp": 0.01041012, - "balance_loss_clip": 1.04973865, - "balance_loss_mlp": 1.02302384, - "epoch": 0.16810461445964225, - "flos": 18075205526400.0, - "grad_norm": 1.990096863808903, - "language_loss": 0.84230494, - "learning_rate": 3.803239270572142e-06, - "loss": 0.8640337, - "num_input_tokens_seen": 60609235, - "step": 2796, - "time_per_iteration": 2.697253465652466 - }, - { - "auxiliary_loss_clip": 0.01101634, - "auxiliary_loss_mlp": 0.01045196, - "balance_loss_clip": 1.04877055, - "balance_loss_mlp": 1.0262773, - "epoch": 0.16816473771231025, - "flos": 23878657105920.0, - "grad_norm": 1.9272276676322646, - "language_loss": 0.81609607, - "learning_rate": 3.8030707825980838e-06, - "loss": 0.83756441, - "num_input_tokens_seen": 60629880, - "step": 2797, - "time_per_iteration": 2.8784244060516357 - }, - { - "auxiliary_loss_clip": 0.0114057, - "auxiliary_loss_mlp": 0.01041282, - "balance_loss_clip": 1.05136061, - "balance_loss_mlp": 1.02448523, - "epoch": 0.1682248609649782, - "flos": 22783597125120.0, - "grad_norm": 1.7015769336052518, - "language_loss": 0.74811113, - "learning_rate": 3.802902226251401e-06, - "loss": 0.76992965, - "num_input_tokens_seen": 60651175, - "step": 2798, - "time_per_iteration": 2.700727939605713 - }, - { - "auxiliary_loss_clip": 0.01161342, - "auxiliary_loss_mlp": 0.01048462, - "balance_loss_clip": 1.05728281, - "balance_loss_mlp": 1.03075945, - "epoch": 0.16828498421764618, - "flos": 20705123612160.0, - "grad_norm": 1.5964091182578661, - "language_loss": 0.79693568, - "learning_rate": 3.8027336015384845e-06, - "loss": 0.81903368, - "num_input_tokens_seen": 60670210, - "step": 2799, - "time_per_iteration": 2.6582021713256836 - }, - { - "auxiliary_loss_clip": 0.01077177, - "auxiliary_loss_mlp": 0.01045216, - "balance_loss_clip": 1.04514158, - "balance_loss_mlp": 1.02374637, - "epoch": 0.16834510747031414, - "flos": 29420606695680.0, - "grad_norm": 4.227726163531211, - "language_loss": 0.70963746, - "learning_rate": 3.8025649084657296e-06, - "loss": 0.73086143, - "num_input_tokens_seen": 60690895, - "step": 2800, - "time_per_iteration": 2.8856699466705322 - }, - { - "auxiliary_loss_clip": 0.01108822, - "auxiliary_loss_mlp": 0.00777078, - "balance_loss_clip": 1.04776788, - "balance_loss_mlp": 1.00161195, - "epoch": 0.1684052307229821, - "flos": 18145374744960.0, - "grad_norm": 1.9902029671619985, - "language_loss": 0.83663505, - "learning_rate": 3.8023961470395326e-06, - "loss": 0.85549408, - "num_input_tokens_seen": 60708280, - "step": 2801, - "time_per_iteration": 2.6917035579681396 - }, - { - "auxiliary_loss_clip": 0.01128148, - "auxiliary_loss_mlp": 0.01049324, - "balance_loss_clip": 1.05011535, - "balance_loss_mlp": 1.03084683, - "epoch": 0.16846535397565007, - "flos": 16574929240320.0, - "grad_norm": 2.4052305427948735, - "language_loss": 0.82509923, - "learning_rate": 3.8022273172662933e-06, - "loss": 0.84687394, - "num_input_tokens_seen": 60724150, - "step": 2802, - "time_per_iteration": 2.882611036300659 - }, - { - "auxiliary_loss_clip": 0.01150156, - "auxiliary_loss_mlp": 0.01048717, - "balance_loss_clip": 1.05517435, - "balance_loss_mlp": 1.02885723, - "epoch": 0.16852547722831807, - "flos": 30408868563840.0, - "grad_norm": 3.107584498439891, - "language_loss": 0.80643189, - "learning_rate": 3.802058419152413e-06, - "loss": 0.8284207, - "num_input_tokens_seen": 60746485, - "step": 2803, - "time_per_iteration": 2.7886922359466553 - }, - { - "auxiliary_loss_clip": 0.01148107, - "auxiliary_loss_mlp": 0.01047852, - "balance_loss_clip": 1.0556829, - "balance_loss_mlp": 1.02918339, - "epoch": 0.16858560048098603, - "flos": 33507420416640.0, - "grad_norm": 2.2127389669880713, - "language_loss": 0.76168799, - "learning_rate": 3.801889452704297e-06, - "loss": 0.7836476, - "num_input_tokens_seen": 60762875, - "step": 2804, - "time_per_iteration": 2.7588601112365723 - }, - { - "auxiliary_loss_clip": 0.01045171, - "auxiliary_loss_mlp": 0.01013955, - "balance_loss_clip": 1.03581083, - "balance_loss_mlp": 1.01078367, - "epoch": 0.168645723733654, - "flos": 67370502326400.0, - "grad_norm": 0.8536034833258724, - "language_loss": 0.55464876, - "learning_rate": 3.8017204179283526e-06, - "loss": 0.57524002, - "num_input_tokens_seen": 60825510, - "step": 2805, - "time_per_iteration": 3.2089412212371826 - }, - { - "auxiliary_loss_clip": 0.01138275, - "auxiliary_loss_mlp": 0.0103974, - "balance_loss_clip": 1.05013156, - "balance_loss_mlp": 1.02239537, - "epoch": 0.16870584698632196, - "flos": 21324618501120.0, - "grad_norm": 2.2836767274778427, - "language_loss": 0.73090243, - "learning_rate": 3.8015513148309892e-06, - "loss": 0.75268269, - "num_input_tokens_seen": 60844440, - "step": 2806, - "time_per_iteration": 2.643596649169922 - }, - { - "auxiliary_loss_clip": 0.01117063, - "auxiliary_loss_mlp": 0.01045402, - "balance_loss_clip": 1.05330753, - "balance_loss_mlp": 1.02766335, - "epoch": 0.16876597023898993, - "flos": 20740746925440.0, - "grad_norm": 1.8406859431587912, - "language_loss": 0.69773197, - "learning_rate": 3.80138214341862e-06, - "loss": 0.71935666, - "num_input_tokens_seen": 60863210, - "step": 2807, - "time_per_iteration": 2.6946568489074707 - }, - { - "auxiliary_loss_clip": 0.01130702, - "auxiliary_loss_mlp": 0.01047199, - "balance_loss_clip": 1.04842246, - "balance_loss_mlp": 1.02794707, - "epoch": 0.1688260934916579, - "flos": 20303498666880.0, - "grad_norm": 3.042021842274248, - "language_loss": 0.70280695, - "learning_rate": 3.8012129036976587e-06, - "loss": 0.72458601, - "num_input_tokens_seen": 60882510, - "step": 2808, - "time_per_iteration": 2.6656088829040527 - }, - { - "auxiliary_loss_clip": 0.01119025, - "auxiliary_loss_mlp": 0.01041739, - "balance_loss_clip": 1.05019665, - "balance_loss_mlp": 1.02164018, - "epoch": 0.16888621674432586, - "flos": 20340702178560.0, - "grad_norm": 2.0835789337145965, - "language_loss": 0.79903001, - "learning_rate": 3.8010435956745236e-06, - "loss": 0.8206377, - "num_input_tokens_seen": 60901105, - "step": 2809, - "time_per_iteration": 2.7665679454803467 - }, - { - "auxiliary_loss_clip": 0.01155146, - "auxiliary_loss_mlp": 0.01042018, - "balance_loss_clip": 1.0557605, - "balance_loss_mlp": 1.02252758, - "epoch": 0.16894633999699385, - "flos": 16244802316800.0, - "grad_norm": 2.0672093223845245, - "language_loss": 0.88076419, - "learning_rate": 3.8008742193556358e-06, - "loss": 0.90273583, - "num_input_tokens_seen": 60915340, - "step": 2810, - "time_per_iteration": 2.6186363697052 - }, - { - "auxiliary_loss_clip": 0.01149997, - "auxiliary_loss_mlp": 0.0104631, - "balance_loss_clip": 1.05503082, - "balance_loss_mlp": 1.02715337, - "epoch": 0.16900646324966181, - "flos": 19610171372160.0, - "grad_norm": 1.8921026809528976, - "language_loss": 0.92376304, - "learning_rate": 3.800704774747416e-06, - "loss": 0.9457261, - "num_input_tokens_seen": 60933735, - "step": 2811, - "time_per_iteration": 2.6567442417144775 - }, - { - "auxiliary_loss_clip": 0.01140053, - "auxiliary_loss_mlp": 0.01049063, - "balance_loss_clip": 1.05383325, - "balance_loss_mlp": 1.03039432, - "epoch": 0.16906658650232978, - "flos": 22018089450240.0, - "grad_norm": 2.116573413654177, - "language_loss": 0.78582352, - "learning_rate": 3.800535261856291e-06, - "loss": 0.8077147, - "num_input_tokens_seen": 60953105, - "step": 2812, - "time_per_iteration": 2.6796023845672607 - }, - { - "auxiliary_loss_clip": 0.01147895, - "auxiliary_loss_mlp": 0.01043917, - "balance_loss_clip": 1.05772316, - "balance_loss_mlp": 1.02653646, - "epoch": 0.16912670975499774, - "flos": 11763690024960.0, - "grad_norm": 2.5483899062625093, - "language_loss": 0.75195068, - "learning_rate": 3.8003656806886887e-06, - "loss": 0.7738688, - "num_input_tokens_seen": 60969150, - "step": 2813, - "time_per_iteration": 2.621772050857544 - }, - { - "auxiliary_loss_clip": 0.01136313, - "auxiliary_loss_mlp": 0.01045037, - "balance_loss_clip": 1.05311871, - "balance_loss_mlp": 1.02599943, - "epoch": 0.1691868330076657, - "flos": 17161386595200.0, - "grad_norm": 3.0041182480764554, - "language_loss": 0.69118392, - "learning_rate": 3.8001960312510396e-06, - "loss": 0.7129975, - "num_input_tokens_seen": 60982825, - "step": 2814, - "time_per_iteration": 2.837264060974121 - }, - { - "auxiliary_loss_clip": 0.01163835, - "auxiliary_loss_mlp": 0.01039837, - "balance_loss_clip": 1.05900145, - "balance_loss_mlp": 1.02134776, - "epoch": 0.16924695626033368, - "flos": 22416553998720.0, - "grad_norm": 3.1079956206415833, - "language_loss": 0.61439502, - "learning_rate": 3.800026313549776e-06, - "loss": 0.63643175, - "num_input_tokens_seen": 61000875, - "step": 2815, - "time_per_iteration": 2.6967194080352783 - }, - { - "auxiliary_loss_clip": 0.01129827, - "auxiliary_loss_mlp": 0.01042692, - "balance_loss_clip": 1.05139673, - "balance_loss_mlp": 1.02382088, - "epoch": 0.16930707951300164, - "flos": 25739655724800.0, - "grad_norm": 1.7930623183302479, - "language_loss": 0.82490849, - "learning_rate": 3.7998565275913342e-06, - "loss": 0.84663367, - "num_input_tokens_seen": 61021940, - "step": 2816, - "time_per_iteration": 2.7227163314819336 - }, - { - "auxiliary_loss_clip": 0.01133129, - "auxiliary_loss_mlp": 0.01047914, - "balance_loss_clip": 1.05375743, - "balance_loss_mlp": 1.02853012, - "epoch": 0.16936720276566963, - "flos": 22747040058240.0, - "grad_norm": 3.083808689594852, - "language_loss": 0.87322289, - "learning_rate": 3.799686673382153e-06, - "loss": 0.89503324, - "num_input_tokens_seen": 61040285, - "step": 2817, - "time_per_iteration": 2.733180522918701 - }, - { - "auxiliary_loss_clip": 0.01141455, - "auxiliary_loss_mlp": 0.01052753, - "balance_loss_clip": 1.05800366, - "balance_loss_mlp": 1.03352427, - "epoch": 0.1694273260183376, - "flos": 19573973441280.0, - "grad_norm": 1.8594303503608436, - "language_loss": 0.81247765, - "learning_rate": 3.799516750928672e-06, - "loss": 0.83441973, - "num_input_tokens_seen": 61059020, - "step": 2818, - "time_per_iteration": 2.7384097576141357 - }, - { - "auxiliary_loss_clip": 0.01160132, - "auxiliary_loss_mlp": 0.01044196, - "balance_loss_clip": 1.05699944, - "balance_loss_mlp": 1.02496791, - "epoch": 0.16948744927100556, - "flos": 12457843332480.0, - "grad_norm": 2.739998367204505, - "language_loss": 0.80788404, - "learning_rate": 3.799346760237336e-06, - "loss": 0.82992733, - "num_input_tokens_seen": 61074245, - "step": 2819, - "time_per_iteration": 2.609870672225952 - }, - { - "auxiliary_loss_clip": 0.01069019, - "auxiliary_loss_mlp": 0.01015301, - "balance_loss_clip": 1.0485003, - "balance_loss_mlp": 1.0125947, - "epoch": 0.16954757252367353, - "flos": 71291694435840.0, - "grad_norm": 0.9309223426502673, - "language_loss": 0.61031163, - "learning_rate": 3.7991767013145902e-06, - "loss": 0.63115478, - "num_input_tokens_seen": 61127080, - "step": 2820, - "time_per_iteration": 3.161051034927368 - }, - { - "auxiliary_loss_clip": 0.01125604, - "auxiliary_loss_mlp": 0.0105036, - "balance_loss_clip": 1.05106986, - "balance_loss_mlp": 1.03207326, - "epoch": 0.1696076957763415, - "flos": 29606516513280.0, - "grad_norm": 1.8682266790688726, - "language_loss": 0.78265435, - "learning_rate": 3.7990065741668844e-06, - "loss": 0.80441403, - "num_input_tokens_seen": 61146955, - "step": 2821, - "time_per_iteration": 2.838730573654175 - }, - { - "auxiliary_loss_clip": 0.0113863, - "auxiliary_loss_mlp": 0.01055528, - "balance_loss_clip": 1.05282724, - "balance_loss_mlp": 1.03494084, - "epoch": 0.16966781902900946, - "flos": 24388588535040.0, - "grad_norm": 2.1667405259997516, - "language_loss": 0.78521514, - "learning_rate": 3.7988363788006685e-06, - "loss": 0.80715668, - "num_input_tokens_seen": 61166605, - "step": 2822, - "time_per_iteration": 2.783385753631592 - }, - { - "auxiliary_loss_clip": 0.01143597, - "auxiliary_loss_mlp": 0.00777154, - "balance_loss_clip": 1.05367076, - "balance_loss_mlp": 1.00129986, - "epoch": 0.16972794228167745, - "flos": 23038814234880.0, - "grad_norm": 1.8038457392731222, - "language_loss": 0.74939907, - "learning_rate": 3.7986661152223967e-06, - "loss": 0.76860654, - "num_input_tokens_seen": 61186535, - "step": 2823, - "time_per_iteration": 4.329328298568726 - }, - { - "auxiliary_loss_clip": 0.01129469, - "auxiliary_loss_mlp": 0.0105385, - "balance_loss_clip": 1.05166912, - "balance_loss_mlp": 1.03496754, - "epoch": 0.16978806553434542, - "flos": 35228691129600.0, - "grad_norm": 3.336653609493179, - "language_loss": 0.60266119, - "learning_rate": 3.7984957834385257e-06, - "loss": 0.62449437, - "num_input_tokens_seen": 61208965, - "step": 2824, - "time_per_iteration": 5.892346620559692 - }, - { - "auxiliary_loss_clip": 0.01138249, - "auxiliary_loss_mlp": 0.01042322, - "balance_loss_clip": 1.05565047, - "balance_loss_mlp": 1.02287912, - "epoch": 0.16984818878701338, - "flos": 32014290936960.0, - "grad_norm": 2.152838804074104, - "language_loss": 0.73322558, - "learning_rate": 3.7983253834555144e-06, - "loss": 0.75503135, - "num_input_tokens_seen": 61230670, - "step": 2825, - "time_per_iteration": 2.834482431411743 - }, - { - "auxiliary_loss_clip": 0.01161467, - "auxiliary_loss_mlp": 0.01047701, - "balance_loss_clip": 1.05502653, - "balance_loss_mlp": 1.02762675, - "epoch": 0.16990831203968135, - "flos": 22818609907200.0, - "grad_norm": 2.05671259677731, - "language_loss": 0.85638934, - "learning_rate": 3.7981549152798245e-06, - "loss": 0.87848103, - "num_input_tokens_seen": 61249510, - "step": 2826, - "time_per_iteration": 2.6443135738372803 - }, - { - "auxiliary_loss_clip": 0.01139368, - "auxiliary_loss_mlp": 0.01047749, - "balance_loss_clip": 1.05266595, - "balance_loss_mlp": 1.02856779, - "epoch": 0.1699684352923493, - "flos": 23039604334080.0, - "grad_norm": 1.9562557148441426, - "language_loss": 0.82465482, - "learning_rate": 3.7979843789179196e-06, - "loss": 0.84652597, - "num_input_tokens_seen": 61269440, - "step": 2827, - "time_per_iteration": 2.7683157920837402 - }, - { - "auxiliary_loss_clip": 0.01131885, - "auxiliary_loss_mlp": 0.0104561, - "balance_loss_clip": 1.05320346, - "balance_loss_mlp": 1.02536786, - "epoch": 0.17002855854501728, - "flos": 21434110133760.0, - "grad_norm": 1.7386401818136152, - "language_loss": 0.73704529, - "learning_rate": 3.797813774376267e-06, - "loss": 0.75882024, - "num_input_tokens_seen": 61288195, - "step": 2828, - "time_per_iteration": 4.465311288833618 - }, - { - "auxiliary_loss_clip": 0.01061458, - "auxiliary_loss_mlp": 0.01009538, - "balance_loss_clip": 1.04764342, - "balance_loss_mlp": 1.00620067, - "epoch": 0.17008868179768524, - "flos": 71453509205760.0, - "grad_norm": 0.7670168832041738, - "language_loss": 0.56426483, - "learning_rate": 3.797643101661336e-06, - "loss": 0.58497471, - "num_input_tokens_seen": 61350850, - "step": 2829, - "time_per_iteration": 3.3114631175994873 - }, - { - "auxiliary_loss_clip": 0.01111753, - "auxiliary_loss_mlp": 0.01051557, - "balance_loss_clip": 1.04527223, - "balance_loss_mlp": 1.03088641, - "epoch": 0.17014880505035324, - "flos": 24900315644160.0, - "grad_norm": 1.7961285206560338, - "language_loss": 0.83465374, - "learning_rate": 3.7974723607795983e-06, - "loss": 0.85628688, - "num_input_tokens_seen": 61370765, - "step": 2830, - "time_per_iteration": 2.795253038406372 - }, - { - "auxiliary_loss_clip": 0.01121533, - "auxiliary_loss_mlp": 0.0104408, - "balance_loss_clip": 1.04901659, - "balance_loss_mlp": 1.02442193, - "epoch": 0.1702089283030212, - "flos": 29862415981440.0, - "grad_norm": 2.4873654173451727, - "language_loss": 0.78360993, - "learning_rate": 3.797301551737529e-06, - "loss": 0.80526608, - "num_input_tokens_seen": 61388935, - "step": 2831, - "time_per_iteration": 2.7864232063293457 - }, - { - "auxiliary_loss_clip": 0.01123612, - "auxiliary_loss_mlp": 0.01051154, - "balance_loss_clip": 1.05275893, - "balance_loss_mlp": 1.0311985, - "epoch": 0.17026905155568917, - "flos": 17744180762880.0, - "grad_norm": 2.532473263441992, - "language_loss": 0.79668158, - "learning_rate": 3.7971306745416044e-06, - "loss": 0.81842923, - "num_input_tokens_seen": 61407350, - "step": 2832, - "time_per_iteration": 2.842217206954956 - }, - { - "auxiliary_loss_clip": 0.01127135, - "auxiliary_loss_mlp": 0.01048966, - "balance_loss_clip": 1.05029321, - "balance_loss_mlp": 1.02984488, - "epoch": 0.17032917480835713, - "flos": 23148665003520.0, - "grad_norm": 1.8387196201649116, - "language_loss": 0.88638175, - "learning_rate": 3.7969597291983046e-06, - "loss": 0.90814275, - "num_input_tokens_seen": 61429010, - "step": 2833, - "time_per_iteration": 2.75942325592041 - }, - { - "auxiliary_loss_clip": 0.01158799, - "auxiliary_loss_mlp": 0.01046883, - "balance_loss_clip": 1.05633831, - "balance_loss_mlp": 1.02842951, - "epoch": 0.1703892980610251, - "flos": 39202565512320.0, - "grad_norm": 2.49094605220443, - "language_loss": 0.71924698, - "learning_rate": 3.7967887157141115e-06, - "loss": 0.74130386, - "num_input_tokens_seen": 61450040, - "step": 2834, - "time_per_iteration": 2.9035184383392334 - }, - { - "auxiliary_loss_clip": 0.01119873, - "auxiliary_loss_mlp": 0.01052215, - "balance_loss_clip": 1.05165124, - "balance_loss_mlp": 1.03428626, - "epoch": 0.17044942131369306, - "flos": 23039101543680.0, - "grad_norm": 1.9093816511111852, - "language_loss": 0.86831236, - "learning_rate": 3.7966176340955106e-06, - "loss": 0.89003325, - "num_input_tokens_seen": 61468585, - "step": 2835, - "time_per_iteration": 2.7627484798431396 - }, - { - "auxiliary_loss_clip": 0.01149332, - "auxiliary_loss_mlp": 0.01049844, - "balance_loss_clip": 1.0536654, - "balance_loss_mlp": 1.02887547, - "epoch": 0.17050954456636103, - "flos": 17054983532160.0, - "grad_norm": 2.1227367002258153, - "language_loss": 0.74483943, - "learning_rate": 3.796446484348989e-06, - "loss": 0.76683116, - "num_input_tokens_seen": 61486330, - "step": 2836, - "time_per_iteration": 2.6748619079589844 - }, - { - "auxiliary_loss_clip": 0.01102249, - "auxiliary_loss_mlp": 0.01049533, - "balance_loss_clip": 1.04775679, - "balance_loss_mlp": 1.02790809, - "epoch": 0.17056966781902902, - "flos": 16836969934080.0, - "grad_norm": 2.1718385109372824, - "language_loss": 0.79959226, - "learning_rate": 3.796275266481036e-06, - "loss": 0.82111007, - "num_input_tokens_seen": 61503950, - "step": 2837, - "time_per_iteration": 2.757340908050537 - }, - { - "auxiliary_loss_clip": 0.01144378, - "auxiliary_loss_mlp": 0.01044803, - "balance_loss_clip": 1.05493581, - "balance_loss_mlp": 1.02644491, - "epoch": 0.17062979107169698, - "flos": 17712543859200.0, - "grad_norm": 1.6825251002952497, - "language_loss": 0.83258498, - "learning_rate": 3.7961039804981456e-06, - "loss": 0.85447681, - "num_input_tokens_seen": 61523550, - "step": 2838, - "time_per_iteration": 2.705357551574707 - }, - { - "auxiliary_loss_clip": 0.0110604, - "auxiliary_loss_mlp": 0.01044889, - "balance_loss_clip": 1.05217135, - "balance_loss_mlp": 1.02685261, - "epoch": 0.17068991432436495, - "flos": 22525040050560.0, - "grad_norm": 1.7789799751303759, - "language_loss": 0.93788463, - "learning_rate": 3.795932626406812e-06, - "loss": 0.95939398, - "num_input_tokens_seen": 61542720, - "step": 2839, - "time_per_iteration": 2.7881791591644287 - }, - { - "auxiliary_loss_clip": 0.01126465, - "auxiliary_loss_mlp": 0.01045617, - "balance_loss_clip": 1.05183244, - "balance_loss_mlp": 1.0250175, - "epoch": 0.17075003757703291, - "flos": 25882939077120.0, - "grad_norm": 2.3337760403585435, - "language_loss": 0.83974946, - "learning_rate": 3.7957612042135336e-06, - "loss": 0.86147022, - "num_input_tokens_seen": 61563040, - "step": 2840, - "time_per_iteration": 2.7564892768859863 - }, - { - "auxiliary_loss_clip": 0.01151834, - "auxiliary_loss_mlp": 0.01044417, - "balance_loss_clip": 1.05555129, - "balance_loss_mlp": 1.02449679, - "epoch": 0.17081016082970088, - "flos": 20120713332480.0, - "grad_norm": 1.9037435592597944, - "language_loss": 0.76307738, - "learning_rate": 3.79558971392481e-06, - "loss": 0.7850399, - "num_input_tokens_seen": 61581890, - "step": 2841, - "time_per_iteration": 2.695525646209717 - }, - { - "auxiliary_loss_clip": 0.01136217, - "auxiliary_loss_mlp": 0.01045847, - "balance_loss_clip": 1.0527097, - "balance_loss_mlp": 1.02744126, - "epoch": 0.17087028408236885, - "flos": 24936477661440.0, - "grad_norm": 1.7844240011089845, - "language_loss": 0.77076876, - "learning_rate": 3.7954181555471443e-06, - "loss": 0.79258937, - "num_input_tokens_seen": 61602095, - "step": 2842, - "time_per_iteration": 2.773792266845703 - }, - { - "auxiliary_loss_clip": 0.01155915, - "auxiliary_loss_mlp": 0.01043896, - "balance_loss_clip": 1.05616069, - "balance_loss_mlp": 1.02503705, - "epoch": 0.17093040733503684, - "flos": 19057864872960.0, - "grad_norm": 1.8430349199993477, - "language_loss": 0.85694385, - "learning_rate": 3.795246529087043e-06, - "loss": 0.87894201, - "num_input_tokens_seen": 61620400, - "step": 2843, - "time_per_iteration": 2.5860671997070312 - }, - { - "auxiliary_loss_clip": 0.01154742, - "auxiliary_loss_mlp": 0.01044059, - "balance_loss_clip": 1.05549574, - "balance_loss_mlp": 1.02608204, - "epoch": 0.1709905305877048, - "flos": 13078954333440.0, - "grad_norm": 2.0353470349004485, - "language_loss": 0.68646181, - "learning_rate": 3.7950748345510126e-06, - "loss": 0.70844984, - "num_input_tokens_seen": 61637680, - "step": 2844, - "time_per_iteration": 2.5961523056030273 - }, - { - "auxiliary_loss_clip": 0.01133396, - "auxiliary_loss_mlp": 0.00778162, - "balance_loss_clip": 1.05117011, - "balance_loss_mlp": 1.00112617, - "epoch": 0.17105065384037277, - "flos": 19209336526080.0, - "grad_norm": 2.027694794878894, - "language_loss": 0.78771943, - "learning_rate": 3.7949030719455646e-06, - "loss": 0.806835, - "num_input_tokens_seen": 61655630, - "step": 2845, - "time_per_iteration": 2.720193386077881 - }, - { - "auxiliary_loss_clip": 0.01145033, - "auxiliary_loss_mlp": 0.01047407, - "balance_loss_clip": 1.05443549, - "balance_loss_mlp": 1.02914453, - "epoch": 0.17111077709304073, - "flos": 18515183218560.0, - "grad_norm": 2.2586144454646306, - "language_loss": 0.7811147, - "learning_rate": 3.7947312412772127e-06, - "loss": 0.80303913, - "num_input_tokens_seen": 61673475, - "step": 2846, - "time_per_iteration": 2.691033363342285 - }, - { - "auxiliary_loss_clip": 0.01143809, - "auxiliary_loss_mlp": 0.0104645, - "balance_loss_clip": 1.05425262, - "balance_loss_mlp": 1.02865243, - "epoch": 0.1711709003457087, - "flos": 25082670015360.0, - "grad_norm": 2.2208975060456426, - "language_loss": 0.79762948, - "learning_rate": 3.794559342552472e-06, - "loss": 0.8195321, - "num_input_tokens_seen": 61693370, - "step": 2847, - "time_per_iteration": 2.7504522800445557 - }, - { - "auxiliary_loss_clip": 0.01142651, - "auxiliary_loss_mlp": 0.01045695, - "balance_loss_clip": 1.05101562, - "balance_loss_mlp": 1.02668071, - "epoch": 0.17123102359837666, - "flos": 17566387418880.0, - "grad_norm": 2.4457083156230017, - "language_loss": 0.8665086, - "learning_rate": 3.7943873757778614e-06, - "loss": 0.88839209, - "num_input_tokens_seen": 61710820, - "step": 2848, - "time_per_iteration": 2.642946720123291 - }, - { - "auxiliary_loss_clip": 0.0111167, - "auxiliary_loss_mlp": 0.01044479, - "balance_loss_clip": 1.04839015, - "balance_loss_mlp": 1.02559662, - "epoch": 0.17129114685104463, - "flos": 26173635845760.0, - "grad_norm": 3.6033710399461856, - "language_loss": 0.75238276, - "learning_rate": 3.794215340959902e-06, - "loss": 0.77394426, - "num_input_tokens_seen": 61729855, - "step": 2849, - "time_per_iteration": 2.7511017322540283 - }, - { - "auxiliary_loss_clip": 0.0103263, - "auxiliary_loss_mlp": 0.01006833, - "balance_loss_clip": 1.02775574, - "balance_loss_mlp": 1.00413883, - "epoch": 0.17135127010371262, - "flos": 69269710037760.0, - "grad_norm": 0.7881928427119427, - "language_loss": 0.57514679, - "learning_rate": 3.7940432381051163e-06, - "loss": 0.59554148, - "num_input_tokens_seen": 61790290, - "step": 2850, - "time_per_iteration": 3.234609603881836 - }, - { - "auxiliary_loss_clip": 0.01115021, - "auxiliary_loss_mlp": 0.01044381, - "balance_loss_clip": 1.05049884, - "balance_loss_mlp": 1.02661848, - "epoch": 0.1714113933563806, - "flos": 23550110380800.0, - "grad_norm": 2.962731712990184, - "language_loss": 0.81328994, - "learning_rate": 3.793871067220031e-06, - "loss": 0.83488399, - "num_input_tokens_seen": 61809265, - "step": 2851, - "time_per_iteration": 2.78957200050354 - }, - { - "auxiliary_loss_clip": 0.01114419, - "auxiliary_loss_mlp": 0.01043587, - "balance_loss_clip": 1.05193233, - "balance_loss_mlp": 1.02592039, - "epoch": 0.17147151660904855, - "flos": 21142443697920.0, - "grad_norm": 2.049906502724323, - "language_loss": 0.93085313, - "learning_rate": 3.7936988283111764e-06, - "loss": 0.95243311, - "num_input_tokens_seen": 61828980, - "step": 2852, - "time_per_iteration": 2.8247029781341553 - }, - { - "auxiliary_loss_clip": 0.01123258, - "auxiliary_loss_mlp": 0.01048953, - "balance_loss_clip": 1.04961288, - "balance_loss_mlp": 1.03045225, - "epoch": 0.17153163986171652, - "flos": 18624890332800.0, - "grad_norm": 1.8770741979814063, - "language_loss": 0.69465554, - "learning_rate": 3.7935265213850817e-06, - "loss": 0.71637762, - "num_input_tokens_seen": 61847915, - "step": 2853, - "time_per_iteration": 2.814162492752075 - }, - { - "auxiliary_loss_clip": 0.01120856, - "auxiliary_loss_mlp": 0.0104692, - "balance_loss_clip": 1.05593121, - "balance_loss_mlp": 1.02899122, - "epoch": 0.17159176311438448, - "flos": 18223265387520.0, - "grad_norm": 2.5884803351111705, - "language_loss": 0.66611075, - "learning_rate": 3.7933541464482815e-06, - "loss": 0.68778855, - "num_input_tokens_seen": 61865570, - "step": 2854, - "time_per_iteration": 2.7968995571136475 - }, - { - "auxiliary_loss_clip": 0.01120742, - "auxiliary_loss_mlp": 0.01052217, - "balance_loss_clip": 1.04853106, - "balance_loss_mlp": 1.0349679, - "epoch": 0.17165188636705245, - "flos": 20738987159040.0, - "grad_norm": 1.705510390491261, - "language_loss": 0.8929621, - "learning_rate": 3.7931817035073124e-06, - "loss": 0.91469175, - "num_input_tokens_seen": 61883340, - "step": 2855, - "time_per_iteration": 2.7045016288757324 - }, - { - "auxiliary_loss_clip": 0.01157319, - "auxiliary_loss_mlp": 0.01043813, - "balance_loss_clip": 1.05505848, - "balance_loss_mlp": 1.02662265, - "epoch": 0.17171200961972044, - "flos": 24899884680960.0, - "grad_norm": 2.117219134143716, - "language_loss": 0.83963835, - "learning_rate": 3.7930091925687134e-06, - "loss": 0.86164963, - "num_input_tokens_seen": 61900610, - "step": 2856, - "time_per_iteration": 2.7349936962127686 - }, - { - "auxiliary_loss_clip": 0.01150108, - "auxiliary_loss_mlp": 0.0104615, - "balance_loss_clip": 1.05812418, - "balance_loss_mlp": 1.02783966, - "epoch": 0.1717721328723884, - "flos": 20157234485760.0, - "grad_norm": 2.234025867710235, - "language_loss": 0.86309886, - "learning_rate": 3.792836613639026e-06, - "loss": 0.88506144, - "num_input_tokens_seen": 61916795, - "step": 2857, - "time_per_iteration": 2.749356746673584 - }, - { - "auxiliary_loss_clip": 0.01144467, - "auxiliary_loss_mlp": 0.0105057, - "balance_loss_clip": 1.05469525, - "balance_loss_mlp": 1.0324626, - "epoch": 0.17183225612505637, - "flos": 23361650697600.0, - "grad_norm": 2.069122070501307, - "language_loss": 0.78334701, - "learning_rate": 3.7926639667247947e-06, - "loss": 0.80529737, - "num_input_tokens_seen": 61936665, - "step": 2858, - "time_per_iteration": 2.6673583984375 - }, - { - "auxiliary_loss_clip": 0.01147374, - "auxiliary_loss_mlp": 0.0105371, - "balance_loss_clip": 1.05591416, - "balance_loss_mlp": 1.03263378, - "epoch": 0.17189237937772434, - "flos": 18114240631680.0, - "grad_norm": 2.1629422323642453, - "language_loss": 0.77565676, - "learning_rate": 3.7924912518325663e-06, - "loss": 0.79766762, - "num_input_tokens_seen": 61954415, - "step": 2859, - "time_per_iteration": 2.646648645401001 - }, - { - "auxiliary_loss_clip": 0.0110879, - "auxiliary_loss_mlp": 0.01047481, - "balance_loss_clip": 1.05317724, - "balance_loss_mlp": 1.02887201, - "epoch": 0.1719525026303923, - "flos": 23258408031360.0, - "grad_norm": 2.088627069497316, - "language_loss": 0.77088714, - "learning_rate": 3.7923184689688902e-06, - "loss": 0.79244983, - "num_input_tokens_seen": 61973940, - "step": 2860, - "time_per_iteration": 2.7671573162078857 - }, - { - "auxiliary_loss_clip": 0.01145562, - "auxiliary_loss_mlp": 0.01042048, - "balance_loss_clip": 1.05316472, - "balance_loss_mlp": 1.02416611, - "epoch": 0.17201262588306027, - "flos": 20810413353600.0, - "grad_norm": 2.1608688480628304, - "language_loss": 0.81384242, - "learning_rate": 3.792145618140317e-06, - "loss": 0.83571851, - "num_input_tokens_seen": 61991845, - "step": 2861, - "time_per_iteration": 2.6492061614990234 - }, - { - "auxiliary_loss_clip": 0.011306, - "auxiliary_loss_mlp": 0.01051558, - "balance_loss_clip": 1.05280077, - "balance_loss_mlp": 1.0335927, - "epoch": 0.17207274913572823, - "flos": 20375858615040.0, - "grad_norm": 2.0128324416816192, - "language_loss": 0.85691392, - "learning_rate": 3.7919726993534038e-06, - "loss": 0.87873554, - "num_input_tokens_seen": 62009395, - "step": 2862, - "time_per_iteration": 4.290126323699951 - }, - { - "auxiliary_loss_clip": 0.01116765, - "auxiliary_loss_mlp": 0.01043444, - "balance_loss_clip": 1.05126834, - "balance_loss_mlp": 1.02655208, - "epoch": 0.17213287238839622, - "flos": 26797727675520.0, - "grad_norm": 3.7047120479299993, - "language_loss": 0.78047049, - "learning_rate": 3.7917997126147054e-06, - "loss": 0.80207253, - "num_input_tokens_seen": 62029005, - "step": 2863, - "time_per_iteration": 4.275500774383545 - }, - { - "auxiliary_loss_clip": 0.01122315, - "auxiliary_loss_mlp": 0.00776596, - "balance_loss_clip": 1.05132961, - "balance_loss_mlp": 1.00090909, - "epoch": 0.1721929956410642, - "flos": 26030819370240.0, - "grad_norm": 1.7350128683820358, - "language_loss": 0.72135127, - "learning_rate": 3.7916266579307823e-06, - "loss": 0.74034035, - "num_input_tokens_seen": 62048730, - "step": 2864, - "time_per_iteration": 4.414710998535156 - }, - { - "auxiliary_loss_clip": 0.01121488, - "auxiliary_loss_mlp": 0.01049611, - "balance_loss_clip": 1.05114079, - "balance_loss_mlp": 1.03099, - "epoch": 0.17225311889373215, - "flos": 22273091078400.0, - "grad_norm": 1.9270646210248614, - "language_loss": 0.73002023, - "learning_rate": 3.7914535353081973e-06, - "loss": 0.75173128, - "num_input_tokens_seen": 62069000, - "step": 2865, - "time_per_iteration": 2.7463715076446533 - }, - { - "auxiliary_loss_clip": 0.01145037, - "auxiliary_loss_mlp": 0.0077644, - "balance_loss_clip": 1.05669165, - "balance_loss_mlp": 1.00120521, - "epoch": 0.17231324214640012, - "flos": 21287774125440.0, - "grad_norm": 2.669585642962841, - "language_loss": 0.78357804, - "learning_rate": 3.7912803447535145e-06, - "loss": 0.80279285, - "num_input_tokens_seen": 62086750, - "step": 2866, - "time_per_iteration": 2.785146713256836 - }, - { - "auxiliary_loss_clip": 0.01157272, - "auxiliary_loss_mlp": 0.01044358, - "balance_loss_clip": 1.05600274, - "balance_loss_mlp": 1.02536821, - "epoch": 0.17237336539906808, - "flos": 19680735640320.0, - "grad_norm": 2.551277931358127, - "language_loss": 0.79755104, - "learning_rate": 3.7911070862733016e-06, - "loss": 0.81956732, - "num_input_tokens_seen": 62106240, - "step": 2867, - "time_per_iteration": 4.3145318031311035 - }, - { - "auxiliary_loss_clip": 0.01132297, - "auxiliary_loss_mlp": 0.01041396, - "balance_loss_clip": 1.0529356, - "balance_loss_mlp": 1.02274013, - "epoch": 0.17243348865173605, - "flos": 17529650784000.0, - "grad_norm": 1.8689780270661371, - "language_loss": 0.79206991, - "learning_rate": 3.7909337598741276e-06, - "loss": 0.81380683, - "num_input_tokens_seen": 62124895, - "step": 2868, - "time_per_iteration": 2.7683827877044678 - }, - { - "auxiliary_loss_clip": 0.01111702, - "auxiliary_loss_mlp": 0.01041717, - "balance_loss_clip": 1.05331647, - "balance_loss_mlp": 1.02427697, - "epoch": 0.17249361190440402, - "flos": 18259858368000.0, - "grad_norm": 2.0344588273772923, - "language_loss": 0.84221756, - "learning_rate": 3.7907603655625674e-06, - "loss": 0.86375177, - "num_input_tokens_seen": 62143510, - "step": 2869, - "time_per_iteration": 2.729156970977783 - }, - { - "auxiliary_loss_clip": 0.01132999, - "auxiliary_loss_mlp": 0.01048405, - "balance_loss_clip": 1.0535363, - "balance_loss_mlp": 1.02955842, - "epoch": 0.172553735157072, - "flos": 21174367910400.0, - "grad_norm": 1.8935704627114847, - "language_loss": 0.77299273, - "learning_rate": 3.7905869033451932e-06, - "loss": 0.79480684, - "num_input_tokens_seen": 62162285, - "step": 2870, - "time_per_iteration": 2.752739191055298 - }, - { - "auxiliary_loss_clip": 0.0115398, - "auxiliary_loss_mlp": 0.01037809, - "balance_loss_clip": 1.05671024, - "balance_loss_mlp": 1.02110744, - "epoch": 0.17261385840973997, - "flos": 22273270646400.0, - "grad_norm": 2.0115587398764396, - "language_loss": 0.77409238, - "learning_rate": 3.7904133732285857e-06, - "loss": 0.79601026, - "num_input_tokens_seen": 62180970, - "step": 2871, - "time_per_iteration": 2.660627603530884 - }, - { - "auxiliary_loss_clip": 0.01132474, - "auxiliary_loss_mlp": 0.01041073, - "balance_loss_clip": 1.05313993, - "balance_loss_mlp": 1.0222379, - "epoch": 0.17267398166240794, - "flos": 27922233830400.0, - "grad_norm": 2.203011669690562, - "language_loss": 0.74197829, - "learning_rate": 3.7902397752193228e-06, - "loss": 0.76371384, - "num_input_tokens_seen": 62198965, - "step": 2872, - "time_per_iteration": 2.6959900856018066 - }, - { - "auxiliary_loss_clip": 0.01150773, - "auxiliary_loss_mlp": 0.01041508, - "balance_loss_clip": 1.05359554, - "balance_loss_mlp": 1.02362645, - "epoch": 0.1727341049150759, - "flos": 21945118970880.0, - "grad_norm": 1.7914171074077658, - "language_loss": 0.82336062, - "learning_rate": 3.790066109323988e-06, - "loss": 0.84528345, - "num_input_tokens_seen": 62219890, - "step": 2873, - "time_per_iteration": 2.603564977645874 - }, - { - "auxiliary_loss_clip": 0.01108819, - "auxiliary_loss_mlp": 0.01044995, - "balance_loss_clip": 1.04744792, - "balance_loss_mlp": 1.02522969, - "epoch": 0.17279422816774387, - "flos": 18107883924480.0, - "grad_norm": 3.7341652608759297, - "language_loss": 0.75355422, - "learning_rate": 3.7898923755491678e-06, - "loss": 0.77509236, - "num_input_tokens_seen": 62237140, - "step": 2874, - "time_per_iteration": 2.8438260555267334 - }, - { - "auxiliary_loss_clip": 0.01159322, - "auxiliary_loss_mlp": 0.01044415, - "balance_loss_clip": 1.05658269, - "balance_loss_mlp": 1.02404249, - "epoch": 0.17285435142041183, - "flos": 21835447770240.0, - "grad_norm": 2.7053876793207037, - "language_loss": 0.80239916, - "learning_rate": 3.7897185739014487e-06, - "loss": 0.82443655, - "num_input_tokens_seen": 62255405, - "step": 2875, - "time_per_iteration": 2.625183343887329 - }, - { - "auxiliary_loss_clip": 0.01135727, - "auxiliary_loss_mlp": 0.0105273, - "balance_loss_clip": 1.0535475, - "balance_loss_mlp": 1.03297722, - "epoch": 0.17291447467307983, - "flos": 18368452160640.0, - "grad_norm": 3.840653645811056, - "language_loss": 0.87621164, - "learning_rate": 3.7895447043874217e-06, - "loss": 0.8980962, - "num_input_tokens_seen": 62271280, - "step": 2876, - "time_per_iteration": 2.6782751083374023 - }, - { - "auxiliary_loss_clip": 0.01136898, - "auxiliary_loss_mlp": 0.01044228, - "balance_loss_clip": 1.05730534, - "balance_loss_mlp": 1.02559566, - "epoch": 0.1729745979257478, - "flos": 18624638937600.0, - "grad_norm": 1.8931416121171032, - "language_loss": 0.84386718, - "learning_rate": 3.789370767013681e-06, - "loss": 0.86567843, - "num_input_tokens_seen": 62289140, - "step": 2877, - "time_per_iteration": 2.681131362915039 - }, - { - "auxiliary_loss_clip": 0.01120759, - "auxiliary_loss_mlp": 0.01043962, - "balance_loss_clip": 1.05222571, - "balance_loss_mlp": 1.02499604, - "epoch": 0.17303472117841576, - "flos": 22998234844800.0, - "grad_norm": 2.106635210245156, - "language_loss": 0.79660022, - "learning_rate": 3.7891967617868204e-06, - "loss": 0.81824744, - "num_input_tokens_seen": 62307490, - "step": 2878, - "time_per_iteration": 2.8118834495544434 - }, - { - "auxiliary_loss_clip": 0.01136112, - "auxiliary_loss_mlp": 0.01047222, - "balance_loss_clip": 1.05593777, - "balance_loss_mlp": 1.02953172, - "epoch": 0.17309484443108372, - "flos": 25664386775040.0, - "grad_norm": 1.9675557254753375, - "language_loss": 0.70236337, - "learning_rate": 3.78902268871344e-06, - "loss": 0.72419673, - "num_input_tokens_seen": 62328570, - "step": 2879, - "time_per_iteration": 2.7998502254486084 - }, - { - "auxiliary_loss_clip": 0.01130517, - "auxiliary_loss_mlp": 0.01051722, - "balance_loss_clip": 1.05183411, - "balance_loss_mlp": 1.03337598, - "epoch": 0.1731549676837517, - "flos": 13552903313280.0, - "grad_norm": 2.0545155253910163, - "language_loss": 0.82884222, - "learning_rate": 3.78884854780014e-06, - "loss": 0.85066462, - "num_input_tokens_seen": 62345735, - "step": 2880, - "time_per_iteration": 2.6707684993743896 - }, - { - "auxiliary_loss_clip": 0.01110706, - "auxiliary_loss_mlp": 0.01054327, - "balance_loss_clip": 1.05214918, - "balance_loss_mlp": 1.03303647, - "epoch": 0.17321509093641965, - "flos": 22857070394880.0, - "grad_norm": 1.9029231217608267, - "language_loss": 0.80879176, - "learning_rate": 3.7886743390535236e-06, - "loss": 0.83044201, - "num_input_tokens_seen": 62365525, - "step": 2881, - "time_per_iteration": 2.7851576805114746 - }, - { - "auxiliary_loss_clip": 0.01135983, - "auxiliary_loss_mlp": 0.01046895, - "balance_loss_clip": 1.05544055, - "balance_loss_mlp": 1.02921653, - "epoch": 0.17327521418908762, - "flos": 24352785653760.0, - "grad_norm": 2.753231520615002, - "language_loss": 0.77268815, - "learning_rate": 3.788500062480197e-06, - "loss": 0.79451692, - "num_input_tokens_seen": 62385160, - "step": 2882, - "time_per_iteration": 2.7785212993621826 - }, - { - "auxiliary_loss_clip": 0.01124099, - "auxiliary_loss_mlp": 0.01047516, - "balance_loss_clip": 1.0633558, - "balance_loss_mlp": 1.02947998, - "epoch": 0.1733353374417556, - "flos": 33105651816960.0, - "grad_norm": 2.096311926604511, - "language_loss": 0.76714236, - "learning_rate": 3.788325718086769e-06, - "loss": 0.78885853, - "num_input_tokens_seen": 62405280, - "step": 2883, - "time_per_iteration": 2.838848352432251 - }, - { - "auxiliary_loss_clip": 0.01110924, - "auxiliary_loss_mlp": 0.0104619, - "balance_loss_clip": 1.04929209, - "balance_loss_mlp": 1.02821302, - "epoch": 0.17339546069442358, - "flos": 24388947671040.0, - "grad_norm": 2.1194201700326873, - "language_loss": 0.8555252, - "learning_rate": 3.7881513058798503e-06, - "loss": 0.87709635, - "num_input_tokens_seen": 62423665, - "step": 2884, - "time_per_iteration": 2.829376220703125 - }, - { - "auxiliary_loss_clip": 0.01133962, - "auxiliary_loss_mlp": 0.00775817, - "balance_loss_clip": 1.05472779, - "balance_loss_mlp": 1.00088096, - "epoch": 0.17345558394709154, - "flos": 27454174680960.0, - "grad_norm": 1.7131036779262108, - "language_loss": 0.74756771, - "learning_rate": 3.787976825866055e-06, - "loss": 0.76666546, - "num_input_tokens_seen": 62445170, - "step": 2885, - "time_per_iteration": 2.8710989952087402 - }, - { - "auxiliary_loss_clip": 0.01128977, - "auxiliary_loss_mlp": 0.01044901, - "balance_loss_clip": 1.05498922, - "balance_loss_mlp": 1.0280925, - "epoch": 0.1735157071997595, - "flos": 24682158391680.0, - "grad_norm": 2.374438581614022, - "language_loss": 0.7107017, - "learning_rate": 3.7878022780519998e-06, - "loss": 0.73244053, - "num_input_tokens_seen": 62466135, - "step": 2886, - "time_per_iteration": 2.726621150970459 - }, - { - "auxiliary_loss_clip": 0.01142411, - "auxiliary_loss_mlp": 0.01041857, - "balance_loss_clip": 1.05233932, - "balance_loss_mlp": 1.02408338, - "epoch": 0.17357583045242747, - "flos": 21688932193920.0, - "grad_norm": 2.0566537172661747, - "language_loss": 0.69906294, - "learning_rate": 3.7876276624443024e-06, - "loss": 0.72090566, - "num_input_tokens_seen": 62483910, - "step": 2887, - "time_per_iteration": 2.7066688537597656 - }, - { - "auxiliary_loss_clip": 0.01116425, - "auxiliary_loss_mlp": 0.01045383, - "balance_loss_clip": 1.05328536, - "balance_loss_mlp": 1.02728677, - "epoch": 0.17363595370509544, - "flos": 15375728753280.0, - "grad_norm": 2.038016964464323, - "language_loss": 0.85257947, - "learning_rate": 3.787452979049585e-06, - "loss": 0.87419748, - "num_input_tokens_seen": 62501530, - "step": 2888, - "time_per_iteration": 2.7514970302581787 - }, - { - "auxiliary_loss_clip": 0.01095063, - "auxiliary_loss_mlp": 0.01049413, - "balance_loss_clip": 1.05020595, - "balance_loss_mlp": 1.02822983, - "epoch": 0.1736960769577634, - "flos": 23440941970560.0, - "grad_norm": 2.196318077733749, - "language_loss": 0.78491282, - "learning_rate": 3.7872782278744718e-06, - "loss": 0.80635762, - "num_input_tokens_seen": 62521295, - "step": 2889, - "time_per_iteration": 2.8221559524536133 - }, - { - "auxiliary_loss_clip": 0.01112139, - "auxiliary_loss_mlp": 0.0077601, - "balance_loss_clip": 1.05236733, - "balance_loss_mlp": 1.00114667, - "epoch": 0.1737562002104314, - "flos": 18587830475520.0, - "grad_norm": 2.333227367674716, - "language_loss": 0.84076989, - "learning_rate": 3.7871034089255883e-06, - "loss": 0.85965133, - "num_input_tokens_seen": 62539615, - "step": 2890, - "time_per_iteration": 2.7213382720947266 - }, - { - "auxiliary_loss_clip": 0.01142218, - "auxiliary_loss_mlp": 0.01054918, - "balance_loss_clip": 1.05530691, - "balance_loss_mlp": 1.03752589, - "epoch": 0.17381632346309936, - "flos": 15998060816640.0, - "grad_norm": 2.7278091568285596, - "language_loss": 0.82205319, - "learning_rate": 3.7869285222095653e-06, - "loss": 0.84402454, - "num_input_tokens_seen": 62556820, - "step": 2891, - "time_per_iteration": 2.625162363052368 - }, - { - "auxiliary_loss_clip": 0.01097361, - "auxiliary_loss_mlp": 0.01050012, - "balance_loss_clip": 1.04281187, - "balance_loss_mlp": 1.02876878, - "epoch": 0.17387644671576732, - "flos": 13369830670080.0, - "grad_norm": 1.9017653264876209, - "language_loss": 0.81200826, - "learning_rate": 3.7867535677330334e-06, - "loss": 0.83348203, - "num_input_tokens_seen": 62572450, - "step": 2892, - "time_per_iteration": 2.7682459354400635 - }, - { - "auxiliary_loss_clip": 0.01148834, - "auxiliary_loss_mlp": 0.0105551, - "balance_loss_clip": 1.05707812, - "balance_loss_mlp": 1.03631687, - "epoch": 0.1739365699684353, - "flos": 26615516958720.0, - "grad_norm": 2.0056711213447436, - "language_loss": 0.73950225, - "learning_rate": 3.786578545502627e-06, - "loss": 0.76154572, - "num_input_tokens_seen": 62592580, - "step": 2893, - "time_per_iteration": 2.8463022708892822 - }, - { - "auxiliary_loss_clip": 0.01132474, - "auxiliary_loss_mlp": 0.01043509, - "balance_loss_clip": 1.05198765, - "balance_loss_mlp": 1.02443516, - "epoch": 0.17399669322110325, - "flos": 23367971491200.0, - "grad_norm": 4.010773627073901, - "language_loss": 0.82507658, - "learning_rate": 3.7864034555249828e-06, - "loss": 0.84683645, - "num_input_tokens_seen": 62611220, - "step": 2894, - "time_per_iteration": 2.719564914703369 - }, - { - "auxiliary_loss_clip": 0.01113951, - "auxiliary_loss_mlp": 0.01046249, - "balance_loss_clip": 1.0506922, - "balance_loss_mlp": 1.02463603, - "epoch": 0.17405681647377122, - "flos": 22054107813120.0, - "grad_norm": 2.3322053123967574, - "language_loss": 0.73826683, - "learning_rate": 3.786228297806741e-06, - "loss": 0.7598688, - "num_input_tokens_seen": 62629185, - "step": 2895, - "time_per_iteration": 2.743992805480957 - }, - { - "auxiliary_loss_clip": 0.01037578, - "auxiliary_loss_mlp": 0.01011099, - "balance_loss_clip": 1.0404408, - "balance_loss_mlp": 1.00788069, - "epoch": 0.1741169397264392, - "flos": 61457559114240.0, - "grad_norm": 0.8765647158253519, - "language_loss": 0.62754023, - "learning_rate": 3.7860530723545435e-06, - "loss": 0.64802706, - "num_input_tokens_seen": 62691895, - "step": 2896, - "time_per_iteration": 3.345099687576294 - }, - { - "auxiliary_loss_clip": 0.0113101, - "auxiliary_loss_mlp": 0.00776588, - "balance_loss_clip": 1.05246758, - "balance_loss_mlp": 1.00102258, - "epoch": 0.17417706297910718, - "flos": 27017680608000.0, - "grad_norm": 1.7338863964520728, - "language_loss": 0.75822324, - "learning_rate": 3.785877779175034e-06, - "loss": 0.77729923, - "num_input_tokens_seen": 62713790, - "step": 2897, - "time_per_iteration": 2.772292137145996 - }, - { - "auxiliary_loss_clip": 0.01141357, - "auxiliary_loss_mlp": 0.01042983, - "balance_loss_clip": 1.0545547, - "balance_loss_mlp": 1.02512598, - "epoch": 0.17423718623177514, - "flos": 33508856960640.0, - "grad_norm": 1.944569306659421, - "language_loss": 0.6883949, - "learning_rate": 3.7857024182748606e-06, - "loss": 0.71023834, - "num_input_tokens_seen": 62736285, - "step": 2898, - "time_per_iteration": 2.7278554439544678 - }, - { - "auxiliary_loss_clip": 0.01128715, - "auxiliary_loss_mlp": 0.01044216, - "balance_loss_clip": 1.05251193, - "balance_loss_mlp": 1.02504694, - "epoch": 0.1742973094844431, - "flos": 27198634348800.0, - "grad_norm": 2.99011081330885, - "language_loss": 0.76445562, - "learning_rate": 3.7855269896606717e-06, - "loss": 0.78618491, - "num_input_tokens_seen": 62756240, - "step": 2899, - "time_per_iteration": 2.8052010536193848 - }, - { - "auxiliary_loss_clip": 0.01095069, - "auxiliary_loss_mlp": 0.01045896, - "balance_loss_clip": 1.04680347, - "balance_loss_mlp": 1.02632213, - "epoch": 0.17435743273711107, - "flos": 22710734386560.0, - "grad_norm": 3.2965812335226357, - "language_loss": 0.72860038, - "learning_rate": 3.785351493339121e-06, - "loss": 0.75001007, - "num_input_tokens_seen": 62775910, - "step": 2900, - "time_per_iteration": 2.868218421936035 - }, - { - "auxiliary_loss_clip": 0.01110522, - "auxiliary_loss_mlp": 0.00776698, - "balance_loss_clip": 1.05202782, - "balance_loss_mlp": 1.000983, - "epoch": 0.17441755598977904, - "flos": 41646466039680.0, - "grad_norm": 1.5488662608930523, - "language_loss": 0.69946706, - "learning_rate": 3.785175929316863e-06, - "loss": 0.71833932, - "num_input_tokens_seen": 62799385, - "step": 2901, - "time_per_iteration": 4.407040596008301 - }, - { - "auxiliary_loss_clip": 0.01129098, - "auxiliary_loss_mlp": 0.01045525, - "balance_loss_clip": 1.05246592, - "balance_loss_mlp": 1.02764344, - "epoch": 0.174477679242447, - "flos": 26287077974400.0, - "grad_norm": 2.1785959913748965, - "language_loss": 0.76588804, - "learning_rate": 3.7850002976005543e-06, - "loss": 0.78763425, - "num_input_tokens_seen": 62819380, - "step": 2902, - "time_per_iteration": 4.2244462966918945 - }, - { - "auxiliary_loss_clip": 0.01145685, - "auxiliary_loss_mlp": 0.0104382, - "balance_loss_clip": 1.0531354, - "balance_loss_mlp": 1.02567625, - "epoch": 0.174537802495115, - "flos": 17858412990720.0, - "grad_norm": 2.2508699895191073, - "language_loss": 0.81588745, - "learning_rate": 3.7848245981968558e-06, - "loss": 0.83778256, - "num_input_tokens_seen": 62836205, - "step": 2903, - "time_per_iteration": 4.132925271987915 - }, - { - "auxiliary_loss_clip": 0.01126443, - "auxiliary_loss_mlp": 0.0103942, - "balance_loss_clip": 1.05449986, - "balance_loss_mlp": 1.02135992, - "epoch": 0.17459792574778296, - "flos": 16940715390720.0, - "grad_norm": 2.4085694554154187, - "language_loss": 0.73316491, - "learning_rate": 3.784648831112429e-06, - "loss": 0.75482351, - "num_input_tokens_seen": 62854045, - "step": 2904, - "time_per_iteration": 2.7033374309539795 - }, - { - "auxiliary_loss_clip": 0.01105192, - "auxiliary_loss_mlp": 0.0104577, - "balance_loss_clip": 1.05250716, - "balance_loss_mlp": 1.02822256, - "epoch": 0.17465804900045093, - "flos": 25520026014720.0, - "grad_norm": 1.8783326609306377, - "language_loss": 0.64233291, - "learning_rate": 3.7844729963539406e-06, - "loss": 0.66384256, - "num_input_tokens_seen": 62873075, - "step": 2905, - "time_per_iteration": 2.8325791358947754 - }, - { - "auxiliary_loss_clip": 0.01135256, - "auxiliary_loss_mlp": 0.01053006, - "balance_loss_clip": 1.05869055, - "balance_loss_mlp": 1.03370619, - "epoch": 0.1747181722531189, - "flos": 24129708238080.0, - "grad_norm": 2.820817719352069, - "language_loss": 0.79504299, - "learning_rate": 3.7842970939280566e-06, - "loss": 0.81692564, - "num_input_tokens_seen": 62892675, - "step": 2906, - "time_per_iteration": 4.491498231887817 - }, - { - "auxiliary_loss_clip": 0.01146195, - "auxiliary_loss_mlp": 0.01050729, - "balance_loss_clip": 1.05623174, - "balance_loss_mlp": 1.03258538, - "epoch": 0.17477829550578686, - "flos": 17748813617280.0, - "grad_norm": 2.262709441571415, - "language_loss": 0.81318873, - "learning_rate": 3.784121123841449e-06, - "loss": 0.83515799, - "num_input_tokens_seen": 62910675, - "step": 2907, - "time_per_iteration": 2.6855854988098145 - }, - { - "auxiliary_loss_clip": 0.01143202, - "auxiliary_loss_mlp": 0.01043315, - "balance_loss_clip": 1.05374384, - "balance_loss_mlp": 1.0253861, - "epoch": 0.17483841875845482, - "flos": 15377344865280.0, - "grad_norm": 2.068635027461873, - "language_loss": 0.81342787, - "learning_rate": 3.7839450861007886e-06, - "loss": 0.83529305, - "num_input_tokens_seen": 62928130, - "step": 2908, - "time_per_iteration": 2.6449570655822754 - }, - { - "auxiliary_loss_clip": 0.01127136, - "auxiliary_loss_mlp": 0.01050925, - "balance_loss_clip": 1.05178046, - "balance_loss_mlp": 1.03163743, - "epoch": 0.17489854201112282, - "flos": 17163254102400.0, - "grad_norm": 3.147433356867123, - "language_loss": 0.80020624, - "learning_rate": 3.7837689807127518e-06, - "loss": 0.82198691, - "num_input_tokens_seen": 62944290, - "step": 2909, - "time_per_iteration": 2.6820569038391113 - }, - { - "auxiliary_loss_clip": 0.0109059, - "auxiliary_loss_mlp": 0.01052625, - "balance_loss_clip": 1.05020881, - "balance_loss_mlp": 1.0310595, - "epoch": 0.17495866526379078, - "flos": 19755286318080.0, - "grad_norm": 1.6978440546881337, - "language_loss": 0.76742244, - "learning_rate": 3.783592807684017e-06, - "loss": 0.7888546, - "num_input_tokens_seen": 62963505, - "step": 2910, - "time_per_iteration": 2.6980416774749756 - }, - { - "auxiliary_loss_clip": 0.01158552, - "auxiliary_loss_mlp": 0.01049407, - "balance_loss_clip": 1.05618358, - "balance_loss_mlp": 1.03059566, - "epoch": 0.17501878851645875, - "flos": 28511133310080.0, - "grad_norm": 1.9812610358315632, - "language_loss": 0.8698765, - "learning_rate": 3.7834165670212645e-06, - "loss": 0.89195609, - "num_input_tokens_seen": 62985020, - "step": 2911, - "time_per_iteration": 2.692662477493286 - }, - { - "auxiliary_loss_clip": 0.01154744, - "auxiliary_loss_mlp": 0.00777232, - "balance_loss_clip": 1.05323184, - "balance_loss_mlp": 1.00110698, - "epoch": 0.1750789117691267, - "flos": 17931203902080.0, - "grad_norm": 3.030740090796483, - "language_loss": 0.89883876, - "learning_rate": 3.7832402587311764e-06, - "loss": 0.91815847, - "num_input_tokens_seen": 63001745, - "step": 2912, - "time_per_iteration": 2.600738763809204 - }, - { - "auxiliary_loss_clip": 0.01146165, - "auxiliary_loss_mlp": 0.01045616, - "balance_loss_clip": 1.0538094, - "balance_loss_mlp": 1.02655411, - "epoch": 0.17513903502179468, - "flos": 18259427404800.0, - "grad_norm": 2.03479884577424, - "language_loss": 0.72818935, - "learning_rate": 3.783063882820439e-06, - "loss": 0.75010711, - "num_input_tokens_seen": 63019750, - "step": 2913, - "time_per_iteration": 2.623342275619507 - }, - { - "auxiliary_loss_clip": 0.01140074, - "auxiliary_loss_mlp": 0.01043928, - "balance_loss_clip": 1.05781865, - "balance_loss_mlp": 1.02557003, - "epoch": 0.17519915827446264, - "flos": 20704728562560.0, - "grad_norm": 2.137073079496124, - "language_loss": 0.6891731, - "learning_rate": 3.782887439295741e-06, - "loss": 0.71101314, - "num_input_tokens_seen": 63039500, - "step": 2914, - "time_per_iteration": 2.7065770626068115 - }, - { - "auxiliary_loss_clip": 0.01142434, - "auxiliary_loss_mlp": 0.01045043, - "balance_loss_clip": 1.05532789, - "balance_loss_mlp": 1.02649403, - "epoch": 0.1752592815271306, - "flos": 20523415685760.0, - "grad_norm": 2.051329837479214, - "language_loss": 0.93125081, - "learning_rate": 3.782710928163772e-06, - "loss": 0.9531256, - "num_input_tokens_seen": 63059785, - "step": 2915, - "time_per_iteration": 2.659029245376587 - }, - { - "auxiliary_loss_clip": 0.01114731, - "auxiliary_loss_mlp": 0.01040999, - "balance_loss_clip": 1.04957223, - "balance_loss_mlp": 1.02243853, - "epoch": 0.1753194047797986, - "flos": 21799178012160.0, - "grad_norm": 1.604344576738792, - "language_loss": 0.81092978, - "learning_rate": 3.782534349431226e-06, - "loss": 0.83248705, - "num_input_tokens_seen": 63079385, - "step": 2916, - "time_per_iteration": 2.7099549770355225 - }, - { - "auxiliary_loss_clip": 0.0114211, - "auxiliary_loss_mlp": 0.01046221, - "balance_loss_clip": 1.05090034, - "balance_loss_mlp": 1.02780342, - "epoch": 0.17537952803246656, - "flos": 20668351063680.0, - "grad_norm": 3.7582760939418716, - "language_loss": 0.73829222, - "learning_rate": 3.782357703104799e-06, - "loss": 0.76017547, - "num_input_tokens_seen": 63098970, - "step": 2917, - "time_per_iteration": 2.666717767715454 - }, - { - "auxiliary_loss_clip": 0.01133449, - "auxiliary_loss_mlp": 0.01047353, - "balance_loss_clip": 1.05319786, - "balance_loss_mlp": 1.02821994, - "epoch": 0.17543965128513453, - "flos": 23295072839040.0, - "grad_norm": 1.813699779869167, - "language_loss": 0.76739681, - "learning_rate": 3.7821809891911897e-06, - "loss": 0.78920484, - "num_input_tokens_seen": 63118750, - "step": 2918, - "time_per_iteration": 2.647634744644165 - }, - { - "auxiliary_loss_clip": 0.01093958, - "auxiliary_loss_mlp": 0.01045643, - "balance_loss_clip": 1.0476644, - "balance_loss_mlp": 1.02425694, - "epoch": 0.1754997745378025, - "flos": 29095615416960.0, - "grad_norm": 2.436739755969174, - "language_loss": 0.73624814, - "learning_rate": 3.782004207697098e-06, - "loss": 0.75764406, - "num_input_tokens_seen": 63136865, - "step": 2919, - "time_per_iteration": 2.7904632091522217 - }, - { - "auxiliary_loss_clip": 0.0112465, - "auxiliary_loss_mlp": 0.01046524, - "balance_loss_clip": 1.04938293, - "balance_loss_mlp": 1.02805829, - "epoch": 0.17555989779047046, - "flos": 30371844620160.0, - "grad_norm": 2.5113730227003814, - "language_loss": 0.74840331, - "learning_rate": 3.781827358629228e-06, - "loss": 0.77011508, - "num_input_tokens_seen": 63158325, - "step": 2920, - "time_per_iteration": 2.727890968322754 - }, - { - "auxiliary_loss_clip": 0.01117257, - "auxiliary_loss_mlp": 0.01042893, - "balance_loss_clip": 1.0462867, - "balance_loss_mlp": 1.02371216, - "epoch": 0.17562002104313842, - "flos": 23287746464640.0, - "grad_norm": 3.6617213109535536, - "language_loss": 0.79731411, - "learning_rate": 3.7816504419942873e-06, - "loss": 0.81891561, - "num_input_tokens_seen": 63173115, - "step": 2921, - "time_per_iteration": 2.753817558288574 - }, - { - "auxiliary_loss_clip": 0.01121718, - "auxiliary_loss_mlp": 0.01046234, - "balance_loss_clip": 1.05232286, - "balance_loss_mlp": 1.02679133, - "epoch": 0.1756801442958064, - "flos": 24790500789120.0, - "grad_norm": 2.6301689129577546, - "language_loss": 0.87826073, - "learning_rate": 3.7814734577989823e-06, - "loss": 0.89994025, - "num_input_tokens_seen": 63192880, - "step": 2922, - "time_per_iteration": 2.7411837577819824 - }, - { - "auxiliary_loss_clip": 0.01144004, - "auxiliary_loss_mlp": 0.01047403, - "balance_loss_clip": 1.05196273, - "balance_loss_mlp": 1.02778149, - "epoch": 0.17574026754847438, - "flos": 25771651764480.0, - "grad_norm": 4.4893841411537085, - "language_loss": 0.62347209, - "learning_rate": 3.7812964060500253e-06, - "loss": 0.64538622, - "num_input_tokens_seen": 63214395, - "step": 2923, - "time_per_iteration": 2.7666683197021484 - }, - { - "auxiliary_loss_clip": 0.01134872, - "auxiliary_loss_mlp": 0.01048692, - "balance_loss_clip": 1.05887377, - "balance_loss_mlp": 1.02847457, - "epoch": 0.17580039080114235, - "flos": 17456608477440.0, - "grad_norm": 2.8552131957437914, - "language_loss": 0.80392253, - "learning_rate": 3.78111928675413e-06, - "loss": 0.82575822, - "num_input_tokens_seen": 63231020, - "step": 2924, - "time_per_iteration": 2.729403257369995 - }, - { - "auxiliary_loss_clip": 0.01132783, - "auxiliary_loss_mlp": 0.01051456, - "balance_loss_clip": 1.05193377, - "balance_loss_mlp": 1.03082108, - "epoch": 0.1758605140538103, - "flos": 14864648088960.0, - "grad_norm": 5.080042666316876, - "language_loss": 0.71374178, - "learning_rate": 3.7809420999180126e-06, - "loss": 0.73558426, - "num_input_tokens_seen": 63246245, - "step": 2925, - "time_per_iteration": 2.9538233280181885 - }, - { - "auxiliary_loss_clip": 0.01117196, - "auxiliary_loss_mlp": 0.01045706, - "balance_loss_clip": 1.05052948, - "balance_loss_mlp": 1.02744341, - "epoch": 0.17592063730647828, - "flos": 23004268329600.0, - "grad_norm": 1.6620026542608322, - "language_loss": 0.71931666, - "learning_rate": 3.7807648455483934e-06, - "loss": 0.74094564, - "num_input_tokens_seen": 63267790, - "step": 2926, - "time_per_iteration": 2.7738964557647705 - }, - { - "auxiliary_loss_clip": 0.01105944, - "auxiliary_loss_mlp": 0.01045732, - "balance_loss_clip": 1.04915071, - "balance_loss_mlp": 1.02253425, - "epoch": 0.17598076055914624, - "flos": 20741501111040.0, - "grad_norm": 2.6318732447225837, - "language_loss": 0.84724289, - "learning_rate": 3.7805875236519918e-06, - "loss": 0.86875963, - "num_input_tokens_seen": 63286830, - "step": 2927, - "time_per_iteration": 2.704437494277954 - }, - { - "auxiliary_loss_clip": 0.01100437, - "auxiliary_loss_mlp": 0.01046684, - "balance_loss_clip": 1.05039644, - "balance_loss_mlp": 1.02887452, - "epoch": 0.1760408838118142, - "flos": 34092441227520.0, - "grad_norm": 1.9547597089289632, - "language_loss": 0.72147644, - "learning_rate": 3.7804101342355336e-06, - "loss": 0.74294758, - "num_input_tokens_seen": 63308870, - "step": 2928, - "time_per_iteration": 2.793802261352539 - }, - { - "auxiliary_loss_clip": 0.01120251, - "auxiliary_loss_mlp": 0.01045623, - "balance_loss_clip": 1.0516876, - "balance_loss_mlp": 1.02679992, - "epoch": 0.1761010070644822, - "flos": 24168384207360.0, - "grad_norm": 1.8474008440192304, - "language_loss": 0.83097279, - "learning_rate": 3.780232677305744e-06, - "loss": 0.85263157, - "num_input_tokens_seen": 63329005, - "step": 2929, - "time_per_iteration": 2.733339786529541 - }, - { - "auxiliary_loss_clip": 0.01124127, - "auxiliary_loss_mlp": 0.01042521, - "balance_loss_clip": 1.04853475, - "balance_loss_mlp": 1.02479422, - "epoch": 0.17616113031715017, - "flos": 26576697335040.0, - "grad_norm": 2.4427170552109163, - "language_loss": 0.79211783, - "learning_rate": 3.7800551528693535e-06, - "loss": 0.81378424, - "num_input_tokens_seen": 63349390, - "step": 2930, - "time_per_iteration": 2.748080015182495 - }, - { - "auxiliary_loss_clip": 0.01160654, - "auxiliary_loss_mlp": 0.01047281, - "balance_loss_clip": 1.05925918, - "balance_loss_mlp": 1.02758813, - "epoch": 0.17622125356981813, - "flos": 25666685245440.0, - "grad_norm": 2.504124366499191, - "language_loss": 0.76502466, - "learning_rate": 3.7798775609330927e-06, - "loss": 0.78710401, - "num_input_tokens_seen": 63368835, - "step": 2931, - "time_per_iteration": 2.6691603660583496 - }, - { - "auxiliary_loss_clip": 0.01076453, - "auxiliary_loss_mlp": 0.01043586, - "balance_loss_clip": 1.04577017, - "balance_loss_mlp": 1.02478647, - "epoch": 0.1762813768224861, - "flos": 16508530949760.0, - "grad_norm": 2.941321746162514, - "language_loss": 0.76070881, - "learning_rate": 3.779699901503696e-06, - "loss": 0.78190923, - "num_input_tokens_seen": 63385220, - "step": 2932, - "time_per_iteration": 2.809630870819092 - }, - { - "auxiliary_loss_clip": 0.01148627, - "auxiliary_loss_mlp": 0.01043149, - "balance_loss_clip": 1.05284405, - "balance_loss_mlp": 1.0229789, - "epoch": 0.17634150007515406, - "flos": 11211850402560.0, - "grad_norm": 5.168612276821382, - "language_loss": 0.90027422, - "learning_rate": 3.7795221745879016e-06, - "loss": 0.92219198, - "num_input_tokens_seen": 63400865, - "step": 2933, - "time_per_iteration": 2.6665337085723877 - }, - { - "auxiliary_loss_clip": 0.01154114, - "auxiliary_loss_mlp": 0.01055985, - "balance_loss_clip": 1.05539656, - "balance_loss_mlp": 1.03766203, - "epoch": 0.17640162332782203, - "flos": 23659925235840.0, - "grad_norm": 2.009210784374188, - "language_loss": 0.88323247, - "learning_rate": 3.779344380192448e-06, - "loss": 0.90533352, - "num_input_tokens_seen": 63421390, - "step": 2934, - "time_per_iteration": 2.6649580001831055 - }, - { - "auxiliary_loss_clip": 0.01128495, - "auxiliary_loss_mlp": 0.01048067, - "balance_loss_clip": 1.05581188, - "balance_loss_mlp": 1.03028131, - "epoch": 0.17646174658049, - "flos": 53796984606720.0, - "grad_norm": 1.6302121247923247, - "language_loss": 0.70403945, - "learning_rate": 3.779166518324077e-06, - "loss": 0.72580504, - "num_input_tokens_seen": 63444715, - "step": 2935, - "time_per_iteration": 3.006019115447998 - }, - { - "auxiliary_loss_clip": 0.01126189, - "auxiliary_loss_mlp": 0.01040034, - "balance_loss_clip": 1.05360174, - "balance_loss_mlp": 1.02135396, - "epoch": 0.17652186983315798, - "flos": 24243868638720.0, - "grad_norm": 2.5931578566124807, - "language_loss": 0.69721985, - "learning_rate": 3.7789885889895325e-06, - "loss": 0.71888208, - "num_input_tokens_seen": 63465525, - "step": 2936, - "time_per_iteration": 2.7517428398132324 - }, - { - "auxiliary_loss_clip": 0.01105644, - "auxiliary_loss_mlp": 0.01045896, - "balance_loss_clip": 1.05023837, - "balance_loss_mlp": 1.02737129, - "epoch": 0.17658199308582595, - "flos": 27454282421760.0, - "grad_norm": 1.9170676229980566, - "language_loss": 0.71288073, - "learning_rate": 3.7788105921955634e-06, - "loss": 0.73439616, - "num_input_tokens_seen": 63485815, - "step": 2937, - "time_per_iteration": 2.837181329727173 - }, - { - "auxiliary_loss_clip": 0.01141008, - "auxiliary_loss_mlp": 0.01046843, - "balance_loss_clip": 1.05945122, - "balance_loss_mlp": 1.02674472, - "epoch": 0.17664211633849392, - "flos": 22418672901120.0, - "grad_norm": 2.267148270780071, - "language_loss": 0.75439745, - "learning_rate": 3.7786325279489184e-06, - "loss": 0.77627593, - "num_input_tokens_seen": 63503905, - "step": 2938, - "time_per_iteration": 2.883162021636963 - }, - { - "auxiliary_loss_clip": 0.01147345, - "auxiliary_loss_mlp": 0.01043976, - "balance_loss_clip": 1.05576169, - "balance_loss_mlp": 1.02553487, - "epoch": 0.17670223959116188, - "flos": 24715124098560.0, - "grad_norm": 2.921726967662053, - "language_loss": 0.71015209, - "learning_rate": 3.7784543962563495e-06, - "loss": 0.73206532, - "num_input_tokens_seen": 63521985, - "step": 2939, - "time_per_iteration": 2.6938419342041016 - }, - { - "auxiliary_loss_clip": 0.01160437, - "auxiliary_loss_mlp": 0.01046921, - "balance_loss_clip": 1.05818558, - "balance_loss_mlp": 1.02794337, - "epoch": 0.17676236284382985, - "flos": 22527051212160.0, - "grad_norm": 3.114901170192376, - "language_loss": 0.73513985, - "learning_rate": 3.7782761971246115e-06, - "loss": 0.75721341, - "num_input_tokens_seen": 63539830, - "step": 2940, - "time_per_iteration": 4.145469665527344 - }, - { - "auxiliary_loss_clip": 0.0112582, - "auxiliary_loss_mlp": 0.01046611, - "balance_loss_clip": 1.05631542, - "balance_loss_mlp": 1.02731109, - "epoch": 0.1768224860964978, - "flos": 12385160161920.0, - "grad_norm": 3.071469776016301, - "language_loss": 0.85375023, - "learning_rate": 3.7780979305604616e-06, - "loss": 0.87547457, - "num_input_tokens_seen": 63555495, - "step": 2941, - "time_per_iteration": 4.279599666595459 - }, - { - "auxiliary_loss_clip": 0.01161068, - "auxiliary_loss_mlp": 0.01045254, - "balance_loss_clip": 1.05717027, - "balance_loss_mlp": 1.0257628, - "epoch": 0.1768826093491658, - "flos": 24353360271360.0, - "grad_norm": 2.434766510066968, - "language_loss": 0.76885259, - "learning_rate": 3.7779195965706607e-06, - "loss": 0.79091585, - "num_input_tokens_seen": 63575290, - "step": 2942, - "time_per_iteration": 4.2280871868133545 - }, - { - "auxiliary_loss_clip": 0.01106234, - "auxiliary_loss_mlp": 0.00780676, - "balance_loss_clip": 1.04992843, - "balance_loss_mlp": 1.00087166, - "epoch": 0.17694273260183377, - "flos": 23587062497280.0, - "grad_norm": 3.301743041114179, - "language_loss": 0.8024286, - "learning_rate": 3.77774119516197e-06, - "loss": 0.82129776, - "num_input_tokens_seen": 63594670, - "step": 2943, - "time_per_iteration": 2.8921029567718506 - }, - { - "auxiliary_loss_clip": 0.01132848, - "auxiliary_loss_mlp": 0.01052225, - "balance_loss_clip": 1.05352235, - "balance_loss_mlp": 1.03124392, - "epoch": 0.17700285585450173, - "flos": 26760991040640.0, - "grad_norm": 5.7613375603973465, - "language_loss": 0.80809408, - "learning_rate": 3.777562726341155e-06, - "loss": 0.82994485, - "num_input_tokens_seen": 63614780, - "step": 2944, - "time_per_iteration": 2.692831039428711 - }, - { - "auxiliary_loss_clip": 0.01161854, - "auxiliary_loss_mlp": 0.01056825, - "balance_loss_clip": 1.05807233, - "balance_loss_mlp": 1.03796625, - "epoch": 0.1770629791071697, - "flos": 42776323320960.0, - "grad_norm": 2.4257754996125227, - "language_loss": 0.73812854, - "learning_rate": 3.7773841901149835e-06, - "loss": 0.7603153, - "num_input_tokens_seen": 63637190, - "step": 2945, - "time_per_iteration": 2.782910108566284 - }, - { - "auxiliary_loss_clip": 0.011481, - "auxiliary_loss_mlp": 0.01047361, - "balance_loss_clip": 1.05756998, - "balance_loss_mlp": 1.02862108, - "epoch": 0.17712310235983766, - "flos": 17345572560000.0, - "grad_norm": 2.8106797532110637, - "language_loss": 0.7793628, - "learning_rate": 3.7772055864902256e-06, - "loss": 0.80131739, - "num_input_tokens_seen": 63652140, - "step": 2946, - "time_per_iteration": 4.278741121292114 - }, - { - "auxiliary_loss_clip": 0.01109059, - "auxiliary_loss_mlp": 0.01052842, - "balance_loss_clip": 1.04997015, - "balance_loss_mlp": 1.03341079, - "epoch": 0.17718322561250563, - "flos": 23878477537920.0, - "grad_norm": 2.172386857191393, - "language_loss": 0.76068008, - "learning_rate": 3.7770269154736535e-06, - "loss": 0.7822991, - "num_input_tokens_seen": 63671700, - "step": 2947, - "time_per_iteration": 2.7949914932250977 - }, - { - "auxiliary_loss_clip": 0.0114934, - "auxiliary_loss_mlp": 0.01044342, - "balance_loss_clip": 1.05480659, - "balance_loss_mlp": 1.025388, - "epoch": 0.1772433488651736, - "flos": 36466352104320.0, - "grad_norm": 2.6793588646204745, - "language_loss": 0.72557831, - "learning_rate": 3.7768481770720424e-06, - "loss": 0.74751514, - "num_input_tokens_seen": 63691685, - "step": 2948, - "time_per_iteration": 2.901662826538086 - }, - { - "auxiliary_loss_clip": 0.01151572, - "auxiliary_loss_mlp": 0.01050692, - "balance_loss_clip": 1.05921662, - "balance_loss_mlp": 1.03236949, - "epoch": 0.1773034721178416, - "flos": 26684716510080.0, - "grad_norm": 1.8296543316983853, - "language_loss": 0.81782824, - "learning_rate": 3.776669371292171e-06, - "loss": 0.8398509, - "num_input_tokens_seen": 63711720, - "step": 2949, - "time_per_iteration": 2.7284891605377197 - }, - { - "auxiliary_loss_clip": 0.01080853, - "auxiliary_loss_mlp": 0.0100651, - "balance_loss_clip": 1.04975748, - "balance_loss_mlp": 1.00226629, - "epoch": 0.17736359537050955, - "flos": 57117467617920.0, - "grad_norm": 0.768126622018234, - "language_loss": 0.64989161, - "learning_rate": 3.7764904981408186e-06, - "loss": 0.67076528, - "num_input_tokens_seen": 63776280, - "step": 2950, - "time_per_iteration": 3.2761552333831787 - }, - { - "auxiliary_loss_clip": 0.01121454, - "auxiliary_loss_mlp": 0.01045861, - "balance_loss_clip": 1.05373287, - "balance_loss_mlp": 1.02743077, - "epoch": 0.17742371862317752, - "flos": 27198203385600.0, - "grad_norm": 2.9882590699755927, - "language_loss": 0.83619881, - "learning_rate": 3.7763115576247686e-06, - "loss": 0.85787189, - "num_input_tokens_seen": 63797535, - "step": 2951, - "time_per_iteration": 2.7637627124786377 - }, - { - "auxiliary_loss_clip": 0.01125929, - "auxiliary_loss_mlp": 0.01046039, - "balance_loss_clip": 1.05109882, - "balance_loss_mlp": 1.02682269, - "epoch": 0.17748384187584548, - "flos": 20959694277120.0, - "grad_norm": 2.3133151959471796, - "language_loss": 0.80395055, - "learning_rate": 3.776132549750806e-06, - "loss": 0.82567012, - "num_input_tokens_seen": 63817045, - "step": 2952, - "time_per_iteration": 2.7605957984924316 - }, - { - "auxiliary_loss_clip": 0.01162679, - "auxiliary_loss_mlp": 0.01044862, - "balance_loss_clip": 1.05858529, - "balance_loss_mlp": 1.02513337, - "epoch": 0.17754396512851345, - "flos": 25009986844800.0, - "grad_norm": 2.8185319653472116, - "language_loss": 0.79273909, - "learning_rate": 3.7759534745257194e-06, - "loss": 0.81481451, - "num_input_tokens_seen": 63837665, - "step": 2953, - "time_per_iteration": 2.798912525177002 - }, - { - "auxiliary_loss_clip": 0.0112399, - "auxiliary_loss_mlp": 0.01043314, - "balance_loss_clip": 1.05482125, - "balance_loss_mlp": 1.02470589, - "epoch": 0.1776040883811814, - "flos": 32051566275840.0, - "grad_norm": 2.017710353628998, - "language_loss": 0.87963271, - "learning_rate": 3.7757743319562994e-06, - "loss": 0.90130568, - "num_input_tokens_seen": 63858455, - "step": 2954, - "time_per_iteration": 2.838931083679199 - }, - { - "auxiliary_loss_clip": 0.01144028, - "auxiliary_loss_mlp": 0.01052958, - "balance_loss_clip": 1.06043494, - "balance_loss_mlp": 1.03296697, - "epoch": 0.17766421163384938, - "flos": 21574125348480.0, - "grad_norm": 1.9130853947826985, - "language_loss": 0.85313326, - "learning_rate": 3.7755951220493386e-06, - "loss": 0.87510312, - "num_input_tokens_seen": 63876935, - "step": 2955, - "time_per_iteration": 2.7965714931488037 - }, - { - "auxiliary_loss_clip": 0.01127677, - "auxiliary_loss_mlp": 0.01047004, - "balance_loss_clip": 1.05093336, - "balance_loss_mlp": 1.02660692, - "epoch": 0.17772433488651737, - "flos": 22419319345920.0, - "grad_norm": 18.24238703278013, - "language_loss": 0.71152055, - "learning_rate": 3.7754158448116327e-06, - "loss": 0.73326737, - "num_input_tokens_seen": 63896815, - "step": 2956, - "time_per_iteration": 2.8358442783355713 - }, - { - "auxiliary_loss_clip": 0.01150063, - "auxiliary_loss_mlp": 0.010506, - "balance_loss_clip": 1.05813813, - "balance_loss_mlp": 1.03156281, - "epoch": 0.17778445813918534, - "flos": 25629445820160.0, - "grad_norm": 2.981126112172262, - "language_loss": 0.82881534, - "learning_rate": 3.7752365002499795e-06, - "loss": 0.85082197, - "num_input_tokens_seen": 63916140, - "step": 2957, - "time_per_iteration": 2.7034976482391357 - }, - { - "auxiliary_loss_clip": 0.01100452, - "auxiliary_loss_mlp": 0.01047239, - "balance_loss_clip": 1.04976833, - "balance_loss_mlp": 1.02789164, - "epoch": 0.1778445813918533, - "flos": 25628871202560.0, - "grad_norm": 2.7180995933425622, - "language_loss": 0.75164193, - "learning_rate": 3.7750570883711807e-06, - "loss": 0.77311885, - "num_input_tokens_seen": 63935220, - "step": 2958, - "time_per_iteration": 2.8312718868255615 - }, - { - "auxiliary_loss_clip": 0.01146025, - "auxiliary_loss_mlp": 0.01043359, - "balance_loss_clip": 1.06117964, - "balance_loss_mlp": 1.02502513, - "epoch": 0.17790470464452127, - "flos": 22345522853760.0, - "grad_norm": 9.439636088267013, - "language_loss": 0.80363399, - "learning_rate": 3.7748776091820397e-06, - "loss": 0.82552785, - "num_input_tokens_seen": 63954550, - "step": 2959, - "time_per_iteration": 2.722102642059326 - }, - { - "auxiliary_loss_clip": 0.01164621, - "auxiliary_loss_mlp": 0.01049069, - "balance_loss_clip": 1.05812871, - "balance_loss_mlp": 1.02938771, - "epoch": 0.17796482789718923, - "flos": 18765875214720.0, - "grad_norm": 2.62580469975692, - "language_loss": 0.51511085, - "learning_rate": 3.774698062689362e-06, - "loss": 0.53724772, - "num_input_tokens_seen": 63972425, - "step": 2960, - "time_per_iteration": 2.6222047805786133 - }, - { - "auxiliary_loss_clip": 0.01111843, - "auxiliary_loss_mlp": 0.01052801, - "balance_loss_clip": 1.05275989, - "balance_loss_mlp": 1.03228474, - "epoch": 0.1780249511498572, - "flos": 23440941970560.0, - "grad_norm": 1.7626913000215665, - "language_loss": 0.88908094, - "learning_rate": 3.7745184488999548e-06, - "loss": 0.91072738, - "num_input_tokens_seen": 63992165, - "step": 2961, - "time_per_iteration": 2.8088786602020264 - }, - { - "auxiliary_loss_clip": 0.01116231, - "auxiliary_loss_mlp": 0.01054867, - "balance_loss_clip": 1.05181062, - "balance_loss_mlp": 1.03385067, - "epoch": 0.1780850744025252, - "flos": 23367468700800.0, - "grad_norm": 1.716412227369414, - "language_loss": 0.79170465, - "learning_rate": 3.774338767820631e-06, - "loss": 0.81341565, - "num_input_tokens_seen": 64013470, - "step": 2962, - "time_per_iteration": 2.7546913623809814 - }, - { - "auxiliary_loss_clip": 0.01145526, - "auxiliary_loss_mlp": 0.01052794, - "balance_loss_clip": 1.05649889, - "balance_loss_mlp": 1.03104997, - "epoch": 0.17814519765519315, - "flos": 13771994319360.0, - "grad_norm": 2.3241756501763446, - "language_loss": 0.74910223, - "learning_rate": 3.774159019458203e-06, - "loss": 0.77108544, - "num_input_tokens_seen": 64030975, - "step": 2963, - "time_per_iteration": 2.680356979370117 - }, - { - "auxiliary_loss_clip": 0.01140656, - "auxiliary_loss_mlp": 0.01043225, - "balance_loss_clip": 1.05769885, - "balance_loss_mlp": 1.02347231, - "epoch": 0.17820532090786112, - "flos": 21976396738560.0, - "grad_norm": 1.747536927551571, - "language_loss": 0.78837025, - "learning_rate": 3.7739792038194877e-06, - "loss": 0.81020904, - "num_input_tokens_seen": 64050075, - "step": 2964, - "time_per_iteration": 2.748398780822754 - }, - { - "auxiliary_loss_clip": 0.01151685, - "auxiliary_loss_mlp": 0.00776982, - "balance_loss_clip": 1.05950594, - "balance_loss_mlp": 1.00098181, - "epoch": 0.17826544416052909, - "flos": 24790752184320.0, - "grad_norm": 3.046027397796258, - "language_loss": 0.81160808, - "learning_rate": 3.7737993209113027e-06, - "loss": 0.83089471, - "num_input_tokens_seen": 64071920, - "step": 2965, - "time_per_iteration": 2.8090012073516846 - }, - { - "auxiliary_loss_clip": 0.01151658, - "auxiliary_loss_mlp": 0.01047086, - "balance_loss_clip": 1.06002402, - "balance_loss_mlp": 1.02916884, - "epoch": 0.17832556741319705, - "flos": 13879582531200.0, - "grad_norm": 2.554359630612449, - "language_loss": 0.95307338, - "learning_rate": 3.7736193707404698e-06, - "loss": 0.97506082, - "num_input_tokens_seen": 64086835, - "step": 2966, - "time_per_iteration": 2.7159550189971924 - }, - { - "auxiliary_loss_clip": 0.01112928, - "auxiliary_loss_mlp": 0.00777395, - "balance_loss_clip": 1.05336046, - "balance_loss_mlp": 1.00083637, - "epoch": 0.17838569066586502, - "flos": 36641703323520.0, - "grad_norm": 7.5683867487642065, - "language_loss": 0.72833109, - "learning_rate": 3.7734393533138127e-06, - "loss": 0.74723434, - "num_input_tokens_seen": 64107360, - "step": 2967, - "time_per_iteration": 2.9540669918060303 - }, - { - "auxiliary_loss_clip": 0.01129124, - "auxiliary_loss_mlp": 0.01046817, - "balance_loss_clip": 1.05574143, - "balance_loss_mlp": 1.02775562, - "epoch": 0.17844581391853298, - "flos": 18727271072640.0, - "grad_norm": 2.1617023205672523, - "language_loss": 0.76897681, - "learning_rate": 3.773259268638157e-06, - "loss": 0.7907362, - "num_input_tokens_seen": 64124690, - "step": 2968, - "time_per_iteration": 2.752717971801758 - }, - { - "auxiliary_loss_clip": 0.01085006, - "auxiliary_loss_mlp": 0.01044958, - "balance_loss_clip": 1.04640651, - "balance_loss_mlp": 1.02559829, - "epoch": 0.17850593717120097, - "flos": 27378259286400.0, - "grad_norm": 2.039560504387258, - "language_loss": 0.75839806, - "learning_rate": 3.7730791167203333e-06, - "loss": 0.77969772, - "num_input_tokens_seen": 64146315, - "step": 2969, - "time_per_iteration": 2.9161994457244873 - }, - { - "auxiliary_loss_clip": 0.01075271, - "auxiliary_loss_mlp": 0.01013071, - "balance_loss_clip": 1.06177902, - "balance_loss_mlp": 1.00932813, - "epoch": 0.17856606042386894, - "flos": 66996025084800.0, - "grad_norm": 0.8520394227890811, - "language_loss": 0.69012916, - "learning_rate": 3.772898897567171e-06, - "loss": 0.7110126, - "num_input_tokens_seen": 64210875, - "step": 2970, - "time_per_iteration": 3.3269262313842773 - }, - { - "auxiliary_loss_clip": 0.011313, - "auxiliary_loss_mlp": 0.01044166, - "balance_loss_clip": 1.05561864, - "balance_loss_mlp": 1.02493763, - "epoch": 0.1786261836765369, - "flos": 36977001805440.0, - "grad_norm": 1.9951166568015506, - "language_loss": 0.67617297, - "learning_rate": 3.772718611185505e-06, - "loss": 0.69792765, - "num_input_tokens_seen": 64230740, - "step": 2971, - "time_per_iteration": 2.8691961765289307 - }, - { - "auxiliary_loss_clip": 0.01110831, - "auxiliary_loss_mlp": 0.01052779, - "balance_loss_clip": 1.05309939, - "balance_loss_mlp": 1.03266823, - "epoch": 0.17868630692920487, - "flos": 24825441744000.0, - "grad_norm": 1.5664358375440484, - "language_loss": 0.8971802, - "learning_rate": 3.7725382575821717e-06, - "loss": 0.91881633, - "num_input_tokens_seen": 64252300, - "step": 2972, - "time_per_iteration": 2.893923759460449 - }, - { - "auxiliary_loss_clip": 0.01124705, - "auxiliary_loss_mlp": 0.01055871, - "balance_loss_clip": 1.05635929, - "balance_loss_mlp": 1.03466403, - "epoch": 0.17874643018187283, - "flos": 16981977139200.0, - "grad_norm": 2.4611679901229153, - "language_loss": 0.88593906, - "learning_rate": 3.77235783676401e-06, - "loss": 0.90774482, - "num_input_tokens_seen": 64270105, - "step": 2973, - "time_per_iteration": 2.7340333461761475 - }, - { - "auxiliary_loss_clip": 0.01164127, - "auxiliary_loss_mlp": 0.01047073, - "balance_loss_clip": 1.06285155, - "balance_loss_mlp": 1.0283215, - "epoch": 0.1788065534345408, - "flos": 21032233793280.0, - "grad_norm": 3.4039298885336557, - "language_loss": 0.7668556, - "learning_rate": 3.7721773487378615e-06, - "loss": 0.78896761, - "num_input_tokens_seen": 64287250, - "step": 2974, - "time_per_iteration": 2.632495403289795 - }, - { - "auxiliary_loss_clip": 0.0114187, - "auxiliary_loss_mlp": 0.01053, - "balance_loss_clip": 1.06101942, - "balance_loss_mlp": 1.03390288, - "epoch": 0.17886667668720876, - "flos": 23987717775360.0, - "grad_norm": 2.484949778027245, - "language_loss": 0.74701655, - "learning_rate": 3.7719967935105705e-06, - "loss": 0.76896524, - "num_input_tokens_seen": 64307140, - "step": 2975, - "time_per_iteration": 2.704012870788574 - }, - { - "auxiliary_loss_clip": 0.01149026, - "auxiliary_loss_mlp": 0.01048788, - "balance_loss_clip": 1.05678535, - "balance_loss_mlp": 1.03004813, - "epoch": 0.17892679993987676, - "flos": 25739476156800.0, - "grad_norm": 1.518747487377626, - "language_loss": 0.73032069, - "learning_rate": 3.7718161710889833e-06, - "loss": 0.75229883, - "num_input_tokens_seen": 64328760, - "step": 2976, - "time_per_iteration": 2.7357017993927 - }, - { - "auxiliary_loss_clip": 0.01150398, - "auxiliary_loss_mlp": 0.01038685, - "balance_loss_clip": 1.06239033, - "balance_loss_mlp": 1.0229373, - "epoch": 0.17898692319254472, - "flos": 25699686865920.0, - "grad_norm": 1.4579507247258654, - "language_loss": 0.770594, - "learning_rate": 3.7716354814799495e-06, - "loss": 0.79248488, - "num_input_tokens_seen": 64348800, - "step": 2977, - "time_per_iteration": 2.727318286895752 - }, - { - "auxiliary_loss_clip": 0.01131521, - "auxiliary_loss_mlp": 0.01045834, - "balance_loss_clip": 1.06618452, - "balance_loss_mlp": 1.02841735, - "epoch": 0.1790470464452127, - "flos": 19317786664320.0, - "grad_norm": 2.7286854986191282, - "language_loss": 0.80235189, - "learning_rate": 3.7714547246903203e-06, - "loss": 0.82412547, - "num_input_tokens_seen": 64367955, - "step": 2978, - "time_per_iteration": 2.8178791999816895 - }, - { - "auxiliary_loss_clip": 0.0114307, - "auxiliary_loss_mlp": 0.01052978, - "balance_loss_clip": 1.05818772, - "balance_loss_mlp": 1.03330874, - "epoch": 0.17910716969788065, - "flos": 30044267562240.0, - "grad_norm": 1.4967765935497133, - "language_loss": 0.76192784, - "learning_rate": 3.7712739007269508e-06, - "loss": 0.7838884, - "num_input_tokens_seen": 64389805, - "step": 2979, - "time_per_iteration": 4.241487741470337 - }, - { - "auxiliary_loss_clip": 0.01122958, - "auxiliary_loss_mlp": 0.0104457, - "balance_loss_clip": 1.0590893, - "balance_loss_mlp": 1.02660525, - "epoch": 0.17916729295054862, - "flos": 19427709260160.0, - "grad_norm": 1.9491816848203256, - "language_loss": 0.68945503, - "learning_rate": 3.7710930095966976e-06, - "loss": 0.71113026, - "num_input_tokens_seen": 64408220, - "step": 2980, - "time_per_iteration": 2.6817352771759033 - }, - { - "auxiliary_loss_clip": 0.01152986, - "auxiliary_loss_mlp": 0.0104519, - "balance_loss_clip": 1.0588038, - "balance_loss_mlp": 1.02497244, - "epoch": 0.17922741620321658, - "flos": 14611549881600.0, - "grad_norm": 1.9134992191513662, - "language_loss": 0.70793843, - "learning_rate": 3.7709120513064196e-06, - "loss": 0.72992027, - "num_input_tokens_seen": 64426380, - "step": 2981, - "time_per_iteration": 4.310532331466675 - }, - { - "auxiliary_loss_clip": 0.01137747, - "auxiliary_loss_mlp": 0.01056086, - "balance_loss_clip": 1.06083858, - "balance_loss_mlp": 1.03686976, - "epoch": 0.17928753945588458, - "flos": 17165301177600.0, - "grad_norm": 2.529665562311581, - "language_loss": 0.8190546, - "learning_rate": 3.7707310258629796e-06, - "loss": 0.84099293, - "num_input_tokens_seen": 64444355, - "step": 2982, - "time_per_iteration": 2.710726261138916 - }, - { - "auxiliary_loss_clip": 0.01162978, - "auxiliary_loss_mlp": 0.01041014, - "balance_loss_clip": 1.06181359, - "balance_loss_mlp": 1.02306128, - "epoch": 0.17934766270855254, - "flos": 31395622060800.0, - "grad_norm": 1.6440716861921114, - "language_loss": 0.83123535, - "learning_rate": 3.7705499332732413e-06, - "loss": 0.85327524, - "num_input_tokens_seen": 64467800, - "step": 2983, - "time_per_iteration": 2.700378656387329 - }, - { - "auxiliary_loss_clip": 0.01153001, - "auxiliary_loss_mlp": 0.01048341, - "balance_loss_clip": 1.05694914, - "balance_loss_mlp": 1.02932739, - "epoch": 0.1794077859612205, - "flos": 20814184281600.0, - "grad_norm": 1.6703280507743268, - "language_loss": 0.85149562, - "learning_rate": 3.7703687735440718e-06, - "loss": 0.87350899, - "num_input_tokens_seen": 64487230, - "step": 2984, - "time_per_iteration": 2.6529407501220703 - }, - { - "auxiliary_loss_clip": 0.01126981, - "auxiliary_loss_mlp": 0.01043442, - "balance_loss_clip": 1.05520201, - "balance_loss_mlp": 1.02424896, - "epoch": 0.17946790921388847, - "flos": 28986447006720.0, - "grad_norm": 2.4609160562432053, - "language_loss": 0.8935222, - "learning_rate": 3.7701875466823416e-06, - "loss": 0.9152264, - "num_input_tokens_seen": 64509165, - "step": 2985, - "time_per_iteration": 4.528426170349121 - }, - { - "auxiliary_loss_clip": 0.01160091, - "auxiliary_loss_mlp": 0.01040749, - "balance_loss_clip": 1.06142831, - "balance_loss_mlp": 1.02434587, - "epoch": 0.17952803246655644, - "flos": 20737406960640.0, - "grad_norm": 2.095497349072142, - "language_loss": 0.69538593, - "learning_rate": 3.770006252694922e-06, - "loss": 0.71739429, - "num_input_tokens_seen": 64527940, - "step": 2986, - "time_per_iteration": 2.6890172958374023 - }, - { - "auxiliary_loss_clip": 0.01158556, - "auxiliary_loss_mlp": 0.00776, - "balance_loss_clip": 1.05752599, - "balance_loss_mlp": 1.00081134, - "epoch": 0.1795881557192244, - "flos": 28255988027520.0, - "grad_norm": 2.4599229747435123, - "language_loss": 0.77855188, - "learning_rate": 3.769824891588688e-06, - "loss": 0.79789746, - "num_input_tokens_seen": 64545230, - "step": 2987, - "time_per_iteration": 2.650761842727661 - }, - { - "auxiliary_loss_clip": 0.0116216, - "auxiliary_loss_mlp": 0.01043775, - "balance_loss_clip": 1.05775642, - "balance_loss_mlp": 1.02441502, - "epoch": 0.17964827897189237, - "flos": 18552027594240.0, - "grad_norm": 2.0190394876224467, - "language_loss": 0.77958816, - "learning_rate": 3.7696434633705164e-06, - "loss": 0.80164748, - "num_input_tokens_seen": 64563820, - "step": 2988, - "time_per_iteration": 2.6151437759399414 - }, - { - "auxiliary_loss_clip": 0.01059513, - "auxiliary_loss_mlp": 0.00756906, - "balance_loss_clip": 1.07071137, - "balance_loss_mlp": 1.00131369, - "epoch": 0.17970840222456036, - "flos": 58165088711040.0, - "grad_norm": 0.7650122273387262, - "language_loss": 0.62709254, - "learning_rate": 3.7694619680472875e-06, - "loss": 0.64525676, - "num_input_tokens_seen": 64621315, - "step": 2989, - "time_per_iteration": 3.1990275382995605 - }, - { - "auxiliary_loss_clip": 0.01137168, - "auxiliary_loss_mlp": 0.01038826, - "balance_loss_clip": 1.05553865, - "balance_loss_mlp": 1.02128983, - "epoch": 0.17976852547722832, - "flos": 20300805146880.0, - "grad_norm": 2.3566032567209483, - "language_loss": 0.71070904, - "learning_rate": 3.7692804056258837e-06, - "loss": 0.73246896, - "num_input_tokens_seen": 64639885, - "step": 2990, - "time_per_iteration": 2.7275335788726807 - }, - { - "auxiliary_loss_clip": 0.01135847, - "auxiliary_loss_mlp": 0.01044966, - "balance_loss_clip": 1.05398035, - "balance_loss_mlp": 1.02639365, - "epoch": 0.1798286487298963, - "flos": 39669367685760.0, - "grad_norm": 1.8035266350414116, - "language_loss": 0.68888462, - "learning_rate": 3.7690987761131893e-06, - "loss": 0.7106927, - "num_input_tokens_seen": 64661220, - "step": 2991, - "time_per_iteration": 2.8237311840057373 - }, - { - "auxiliary_loss_clip": 0.01104375, - "auxiliary_loss_mlp": 0.01046061, - "balance_loss_clip": 1.05156851, - "balance_loss_mlp": 1.02663028, - "epoch": 0.17988877198256426, - "flos": 25520313323520.0, - "grad_norm": 1.6063564491400402, - "language_loss": 0.82933879, - "learning_rate": 3.7689170795160924e-06, - "loss": 0.85084313, - "num_input_tokens_seen": 64682530, - "step": 2992, - "time_per_iteration": 2.8303778171539307 - }, - { - "auxiliary_loss_clip": 0.01140805, - "auxiliary_loss_mlp": 0.01035603, - "balance_loss_clip": 1.05302262, - "balance_loss_mlp": 1.0187583, - "epoch": 0.17994889523523222, - "flos": 18807496099200.0, - "grad_norm": 2.076285453641059, - "language_loss": 0.82228035, - "learning_rate": 3.7687353158414822e-06, - "loss": 0.84404445, - "num_input_tokens_seen": 64701025, - "step": 2993, - "time_per_iteration": 2.710369110107422 - }, - { - "auxiliary_loss_clip": 0.01135151, - "auxiliary_loss_mlp": 0.01040493, - "balance_loss_clip": 1.05135202, - "balance_loss_mlp": 1.02236176, - "epoch": 0.18000901848790019, - "flos": 21104450087040.0, - "grad_norm": 1.7027458997386926, - "language_loss": 0.78129464, - "learning_rate": 3.7685534850962517e-06, - "loss": 0.80305111, - "num_input_tokens_seen": 64719570, - "step": 2994, - "time_per_iteration": 2.6666738986968994 - }, - { - "auxiliary_loss_clip": 0.01158877, - "auxiliary_loss_mlp": 0.01045455, - "balance_loss_clip": 1.05657315, - "balance_loss_mlp": 1.02819359, - "epoch": 0.18006914174056818, - "flos": 19646441130240.0, - "grad_norm": 2.4198973911698434, - "language_loss": 0.81139499, - "learning_rate": 3.768371587287296e-06, - "loss": 0.83343828, - "num_input_tokens_seen": 64738110, - "step": 2995, - "time_per_iteration": 2.699521541595459 - }, - { - "auxiliary_loss_clip": 0.01142902, - "auxiliary_loss_mlp": 0.01047606, - "balance_loss_clip": 1.05350447, - "balance_loss_mlp": 1.0310601, - "epoch": 0.18012926499323614, - "flos": 19499889640320.0, - "grad_norm": 1.8607496799697536, - "language_loss": 0.84162772, - "learning_rate": 3.768189622421512e-06, - "loss": 0.86353278, - "num_input_tokens_seen": 64756345, - "step": 2996, - "time_per_iteration": 2.696723461151123 - }, - { - "auxiliary_loss_clip": 0.01127214, - "auxiliary_loss_mlp": 0.01039953, - "balance_loss_clip": 1.06094205, - "balance_loss_mlp": 1.02273917, - "epoch": 0.1801893882459041, - "flos": 19464553635840.0, - "grad_norm": 2.1291201116421283, - "language_loss": 0.88189137, - "learning_rate": 3.7680075905058006e-06, - "loss": 0.90356302, - "num_input_tokens_seen": 64776375, - "step": 2997, - "time_per_iteration": 2.785522699356079 - }, - { - "auxiliary_loss_clip": 0.01134376, - "auxiliary_loss_mlp": 0.01045962, - "balance_loss_clip": 1.04949927, - "balance_loss_mlp": 1.02753246, - "epoch": 0.18024951149857207, - "flos": 26870590414080.0, - "grad_norm": 1.7579499924576911, - "language_loss": 0.85068727, - "learning_rate": 3.7678254915470643e-06, - "loss": 0.87249064, - "num_input_tokens_seen": 64796210, - "step": 2998, - "time_per_iteration": 2.6912384033203125 - }, - { - "auxiliary_loss_clip": 0.01159537, - "auxiliary_loss_mlp": 0.01044427, - "balance_loss_clip": 1.06019807, - "balance_loss_mlp": 1.02641416, - "epoch": 0.18030963475124004, - "flos": 30226621933440.0, - "grad_norm": 1.8075624565441775, - "language_loss": 0.84176779, - "learning_rate": 3.7676433255522084e-06, - "loss": 0.86380744, - "num_input_tokens_seen": 64818590, - "step": 2999, - "time_per_iteration": 2.722447395324707 - }, - { - "auxiliary_loss_clip": 0.01143605, - "auxiliary_loss_mlp": 0.01047321, - "balance_loss_clip": 1.05324686, - "balance_loss_mlp": 1.02870023, - "epoch": 0.180369758003908, - "flos": 22307493329280.0, - "grad_norm": 1.8789697336390492, - "language_loss": 0.75206578, - "learning_rate": 3.76746109252814e-06, - "loss": 0.77397501, - "num_input_tokens_seen": 64838350, - "step": 3000, - "time_per_iteration": 2.669875144958496 - }, - { - "auxiliary_loss_clip": 0.01130052, - "auxiliary_loss_mlp": 0.00775745, - "balance_loss_clip": 1.0526886, - "balance_loss_mlp": 1.00060582, - "epoch": 0.18042988125657597, - "flos": 23732033788800.0, - "grad_norm": 2.1714361871851704, - "language_loss": 0.71088028, - "learning_rate": 3.76727879248177e-06, - "loss": 0.72993821, - "num_input_tokens_seen": 64858065, - "step": 3001, - "time_per_iteration": 2.7207603454589844 - }, - { - "auxiliary_loss_clip": 0.01150091, - "auxiliary_loss_mlp": 0.01044695, - "balance_loss_clip": 1.05701649, - "balance_loss_mlp": 1.02605033, - "epoch": 0.18049000450924396, - "flos": 24093582134400.0, - "grad_norm": 2.218812983953599, - "language_loss": 0.8849982, - "learning_rate": 3.767096425420011e-06, - "loss": 0.90694606, - "num_input_tokens_seen": 64877305, - "step": 3002, - "time_per_iteration": 2.6577625274658203 - }, - { - "auxiliary_loss_clip": 0.01157827, - "auxiliary_loss_mlp": 0.01048268, - "balance_loss_clip": 1.05624068, - "balance_loss_mlp": 1.03076851, - "epoch": 0.18055012776191193, - "flos": 22163168482560.0, - "grad_norm": 1.6287780165264572, - "language_loss": 0.80328667, - "learning_rate": 3.7669139913497788e-06, - "loss": 0.8253476, - "num_input_tokens_seen": 64896955, - "step": 3003, - "time_per_iteration": 2.6274783611297607 - }, - { - "auxiliary_loss_clip": 0.01158367, - "auxiliary_loss_mlp": 0.01043654, - "balance_loss_clip": 1.05622995, - "balance_loss_mlp": 1.02596307, - "epoch": 0.1806102510145799, - "flos": 28913512440960.0, - "grad_norm": 2.3308952017896956, - "language_loss": 0.67250973, - "learning_rate": 3.7667314902779907e-06, - "loss": 0.69452989, - "num_input_tokens_seen": 64917080, - "step": 3004, - "time_per_iteration": 2.6652631759643555 - }, - { - "auxiliary_loss_clip": 0.01147517, - "auxiliary_loss_mlp": 0.01054518, - "balance_loss_clip": 1.05606318, - "balance_loss_mlp": 1.03528929, - "epoch": 0.18067037426724786, - "flos": 19025689265280.0, - "grad_norm": 2.592432277036083, - "language_loss": 0.85111535, - "learning_rate": 3.7665489222115677e-06, - "loss": 0.87313569, - "num_input_tokens_seen": 64935215, - "step": 3005, - "time_per_iteration": 2.654977560043335 - }, - { - "auxiliary_loss_clip": 0.0114499, - "auxiliary_loss_mlp": 0.01041993, - "balance_loss_clip": 1.05690646, - "balance_loss_mlp": 1.02489829, - "epoch": 0.18073049751991582, - "flos": 27453635976960.0, - "grad_norm": 1.5217876402754629, - "language_loss": 0.83215338, - "learning_rate": 3.766366287157432e-06, - "loss": 0.85402322, - "num_input_tokens_seen": 64956275, - "step": 3006, - "time_per_iteration": 2.7118306159973145 - }, - { - "auxiliary_loss_clip": 0.01127168, - "auxiliary_loss_mlp": 0.01050084, - "balance_loss_clip": 1.05063033, - "balance_loss_mlp": 1.03105807, - "epoch": 0.1807906207725838, - "flos": 28729039167360.0, - "grad_norm": 1.6327495611050657, - "language_loss": 0.77377248, - "learning_rate": 3.7661835851225103e-06, - "loss": 0.79554498, - "num_input_tokens_seen": 64979390, - "step": 3007, - "time_per_iteration": 2.7996537685394287 - }, - { - "auxiliary_loss_clip": 0.01070026, - "auxiliary_loss_mlp": 0.01030441, - "balance_loss_clip": 1.04936945, - "balance_loss_mlp": 1.02712655, - "epoch": 0.18085074402525175, - "flos": 64466515468800.0, - "grad_norm": 0.801982400183398, - "language_loss": 0.56987137, - "learning_rate": 3.7660008161137294e-06, - "loss": 0.5908761, - "num_input_tokens_seen": 65043135, - "step": 3008, - "time_per_iteration": 3.4269092082977295 - }, - { - "auxiliary_loss_clip": 0.01130838, - "auxiliary_loss_mlp": 0.01047085, - "balance_loss_clip": 1.05308366, - "balance_loss_mlp": 1.02686691, - "epoch": 0.18091086727791975, - "flos": 23476960333440.0, - "grad_norm": 1.8424126412451678, - "language_loss": 0.67248082, - "learning_rate": 3.765817980138021e-06, - "loss": 0.69426012, - "num_input_tokens_seen": 65062845, - "step": 3009, - "time_per_iteration": 2.7875866889953613 - }, - { - "auxiliary_loss_clip": 0.01161719, - "auxiliary_loss_mlp": 0.01044187, - "balance_loss_clip": 1.0595516, - "balance_loss_mlp": 1.02673507, - "epoch": 0.1809709905305877, - "flos": 24170467196160.0, - "grad_norm": 2.4429360498363986, - "language_loss": 0.75690198, - "learning_rate": 3.7656350772023177e-06, - "loss": 0.778961, - "num_input_tokens_seen": 65082110, - "step": 3010, - "time_per_iteration": 2.6060268878936768 - }, - { - "auxiliary_loss_clip": 0.01127916, - "auxiliary_loss_mlp": 0.01037817, - "balance_loss_clip": 1.05715132, - "balance_loss_mlp": 1.02063942, - "epoch": 0.18103111378325568, - "flos": 21650902669440.0, - "grad_norm": 1.6324915654296899, - "language_loss": 0.67356348, - "learning_rate": 3.7654521073135553e-06, - "loss": 0.69522083, - "num_input_tokens_seen": 65101985, - "step": 3011, - "time_per_iteration": 2.763596534729004 - }, - { - "auxiliary_loss_clip": 0.01105034, - "auxiliary_loss_mlp": 0.00777475, - "balance_loss_clip": 1.04540467, - "balance_loss_mlp": 1.00078559, - "epoch": 0.18109123703592364, - "flos": 53686918356480.0, - "grad_norm": 1.551526807882757, - "language_loss": 0.71288514, - "learning_rate": 3.7652690704786723e-06, - "loss": 0.73171026, - "num_input_tokens_seen": 65129295, - "step": 3012, - "time_per_iteration": 3.037775993347168 - }, - { - "auxiliary_loss_clip": 0.01132189, - "auxiliary_loss_mlp": 0.01052085, - "balance_loss_clip": 1.05564284, - "balance_loss_mlp": 1.03348863, - "epoch": 0.1811513602885916, - "flos": 35845564325760.0, - "grad_norm": 2.095737131475866, - "language_loss": 0.62309992, - "learning_rate": 3.765085966704609e-06, - "loss": 0.64494264, - "num_input_tokens_seen": 65150625, - "step": 3013, - "time_per_iteration": 2.7692227363586426 - }, - { - "auxiliary_loss_clip": 0.01131323, - "auxiliary_loss_mlp": 0.0105253, - "balance_loss_clip": 1.05343401, - "balance_loss_mlp": 1.03486276, - "epoch": 0.18121148354125957, - "flos": 23732572492800.0, - "grad_norm": 1.6679267545988328, - "language_loss": 0.76147234, - "learning_rate": 3.764902795998309e-06, - "loss": 0.78331089, - "num_input_tokens_seen": 65170880, - "step": 3014, - "time_per_iteration": 2.7296786308288574 - }, - { - "auxiliary_loss_clip": 0.01163543, - "auxiliary_loss_mlp": 0.01050053, - "balance_loss_clip": 1.05964816, - "balance_loss_mlp": 1.02987087, - "epoch": 0.18127160679392756, - "flos": 28728320895360.0, - "grad_norm": 2.1234423596691796, - "language_loss": 0.66310829, - "learning_rate": 3.7647195583667184e-06, - "loss": 0.6852442, - "num_input_tokens_seen": 65192530, - "step": 3015, - "time_per_iteration": 2.7575571537017822 - }, - { - "auxiliary_loss_clip": 0.0113004, - "auxiliary_loss_mlp": 0.00776613, - "balance_loss_clip": 1.05429327, - "balance_loss_mlp": 1.00067461, - "epoch": 0.18133173004659553, - "flos": 20485062938880.0, - "grad_norm": 1.7837261279259933, - "language_loss": 0.78152305, - "learning_rate": 3.764536253816785e-06, - "loss": 0.80058956, - "num_input_tokens_seen": 65211675, - "step": 3016, - "time_per_iteration": 2.6718828678131104 - }, - { - "auxiliary_loss_clip": 0.01145073, - "auxiliary_loss_mlp": 0.01049504, - "balance_loss_clip": 1.05684161, - "balance_loss_mlp": 1.03068125, - "epoch": 0.1813918532992635, - "flos": 22852078404480.0, - "grad_norm": 1.7248072345223011, - "language_loss": 0.8351965, - "learning_rate": 3.7643528823554602e-06, - "loss": 0.85714233, - "num_input_tokens_seen": 65231185, - "step": 3017, - "time_per_iteration": 2.6879045963287354 - }, - { - "auxiliary_loss_clip": 0.0114091, - "auxiliary_loss_mlp": 0.01042994, - "balance_loss_clip": 1.05404854, - "balance_loss_mlp": 1.02539897, - "epoch": 0.18145197655193146, - "flos": 36065122208640.0, - "grad_norm": 2.2664795482488924, - "language_loss": 0.6769017, - "learning_rate": 3.764169443989697e-06, - "loss": 0.69874066, - "num_input_tokens_seen": 65251645, - "step": 3018, - "time_per_iteration": 4.31333327293396 - }, - { - "auxiliary_loss_clip": 0.01147629, - "auxiliary_loss_mlp": 0.00776661, - "balance_loss_clip": 1.05706179, - "balance_loss_mlp": 1.00074184, - "epoch": 0.18151209980459942, - "flos": 24023951619840.0, - "grad_norm": 1.8935259017451227, - "language_loss": 0.76396847, - "learning_rate": 3.7639859387264518e-06, - "loss": 0.78321135, - "num_input_tokens_seen": 65271125, - "step": 3019, - "time_per_iteration": 2.7667160034179688 - }, - { - "auxiliary_loss_clip": 0.01121465, - "auxiliary_loss_mlp": 0.01046742, - "balance_loss_clip": 1.05550635, - "balance_loss_mlp": 1.02722728, - "epoch": 0.1815722230572674, - "flos": 23951627585280.0, - "grad_norm": 2.042490471678265, - "language_loss": 0.81550395, - "learning_rate": 3.7638023665726834e-06, - "loss": 0.83718598, - "num_input_tokens_seen": 65290600, - "step": 3020, - "time_per_iteration": 4.3900346755981445 - }, - { - "auxiliary_loss_clip": 0.01136424, - "auxiliary_loss_mlp": 0.01046217, - "balance_loss_clip": 1.05758023, - "balance_loss_mlp": 1.02567708, - "epoch": 0.18163234630993536, - "flos": 24386469632640.0, - "grad_norm": 1.9628186536024828, - "language_loss": 0.7757082, - "learning_rate": 3.763618727535352e-06, - "loss": 0.79753458, - "num_input_tokens_seen": 65311040, - "step": 3021, - "time_per_iteration": 4.3029396533966064 - }, - { - "auxiliary_loss_clip": 0.01143245, - "auxiliary_loss_mlp": 0.01047278, - "balance_loss_clip": 1.05453348, - "balance_loss_mlp": 1.02907431, - "epoch": 0.18169246956260335, - "flos": 24681332378880.0, - "grad_norm": 1.725306643191844, - "language_loss": 0.84863859, - "learning_rate": 3.763435021621422e-06, - "loss": 0.87054378, - "num_input_tokens_seen": 65332115, - "step": 3022, - "time_per_iteration": 2.7353312969207764 - }, - { - "auxiliary_loss_clip": 0.01132435, - "auxiliary_loss_mlp": 0.01042747, - "balance_loss_clip": 1.05769348, - "balance_loss_mlp": 1.0235188, - "epoch": 0.1817525928152713, - "flos": 24243294021120.0, - "grad_norm": 2.230341519134859, - "language_loss": 0.69367266, - "learning_rate": 3.763251248837859e-06, - "loss": 0.71542448, - "num_input_tokens_seen": 65352210, - "step": 3023, - "time_per_iteration": 2.775200605392456 - }, - { - "auxiliary_loss_clip": 0.01127605, - "auxiliary_loss_mlp": 0.01043947, - "balance_loss_clip": 1.04900002, - "balance_loss_mlp": 1.02556491, - "epoch": 0.18181271606793928, - "flos": 16472081623680.0, - "grad_norm": 2.150764188548567, - "language_loss": 0.74107385, - "learning_rate": 3.7630674091916317e-06, - "loss": 0.76278937, - "num_input_tokens_seen": 65370600, - "step": 3024, - "time_per_iteration": 2.7364041805267334 - }, - { - "auxiliary_loss_clip": 0.01145205, - "auxiliary_loss_mlp": 0.01046837, - "balance_loss_clip": 1.05719447, - "balance_loss_mlp": 1.02900314, - "epoch": 0.18187283932060724, - "flos": 18581042805120.0, - "grad_norm": 2.148591016046099, - "language_loss": 0.8835662, - "learning_rate": 3.7628835026897123e-06, - "loss": 0.90548658, - "num_input_tokens_seen": 65387270, - "step": 3025, - "time_per_iteration": 4.274658679962158 - }, - { - "auxiliary_loss_clip": 0.01133667, - "auxiliary_loss_mlp": 0.01050575, - "balance_loss_clip": 1.05470932, - "balance_loss_mlp": 1.03137028, - "epoch": 0.1819329625732752, - "flos": 20266833859200.0, - "grad_norm": 3.6399614210311206, - "language_loss": 0.79041791, - "learning_rate": 3.7626995293390735e-06, - "loss": 0.81226033, - "num_input_tokens_seen": 65406550, - "step": 3026, - "time_per_iteration": 2.7589778900146484 - }, - { - "auxiliary_loss_clip": 0.01132736, - "auxiliary_loss_mlp": 0.01055367, - "balance_loss_clip": 1.05774415, - "balance_loss_mlp": 1.03679442, - "epoch": 0.18199308582594317, - "flos": 25915186512000.0, - "grad_norm": 1.6980721374313217, - "language_loss": 0.759978, - "learning_rate": 3.762515489146692e-06, - "loss": 0.78185904, - "num_input_tokens_seen": 65425955, - "step": 3027, - "time_per_iteration": 2.7347826957702637 - }, - { - "auxiliary_loss_clip": 0.01163558, - "auxiliary_loss_mlp": 0.01053369, - "balance_loss_clip": 1.05835891, - "balance_loss_mlp": 1.03378284, - "epoch": 0.18205320907861114, - "flos": 15377524433280.0, - "grad_norm": 2.2893837743041368, - "language_loss": 0.85592651, - "learning_rate": 3.762331382119546e-06, - "loss": 0.87809575, - "num_input_tokens_seen": 65442820, - "step": 3028, - "time_per_iteration": 2.598905563354492 - }, - { - "auxiliary_loss_clip": 0.01156921, - "auxiliary_loss_mlp": 0.0104449, - "balance_loss_clip": 1.0578618, - "balance_loss_mlp": 1.0260129, - "epoch": 0.18211333233127913, - "flos": 25624310175360.0, - "grad_norm": 1.8897570500397638, - "language_loss": 0.82807779, - "learning_rate": 3.7621472082646183e-06, - "loss": 0.85009193, - "num_input_tokens_seen": 65461825, - "step": 3029, - "time_per_iteration": 2.677332639694214 - }, - { - "auxiliary_loss_clip": 0.01114993, - "auxiliary_loss_mlp": 0.01050232, - "balance_loss_clip": 1.05223596, - "balance_loss_mlp": 1.02931094, - "epoch": 0.1821734555839471, - "flos": 14976007228800.0, - "grad_norm": 10.840079090220346, - "language_loss": 0.78091359, - "learning_rate": 3.761962967588891e-06, - "loss": 0.80256593, - "num_input_tokens_seen": 65479480, - "step": 3030, - "time_per_iteration": 2.6865499019622803 - }, - { - "auxiliary_loss_clip": 0.01139676, - "auxiliary_loss_mlp": 0.01043273, - "balance_loss_clip": 1.05401075, - "balance_loss_mlp": 1.0240562, - "epoch": 0.18223357883661506, - "flos": 20194007034240.0, - "grad_norm": 2.05958060196279, - "language_loss": 0.85162055, - "learning_rate": 3.761778660099352e-06, - "loss": 0.87345004, - "num_input_tokens_seen": 65497775, - "step": 3031, - "time_per_iteration": 2.6336488723754883 - }, - { - "auxiliary_loss_clip": 0.01116657, - "auxiliary_loss_mlp": 0.00776186, - "balance_loss_clip": 1.0497843, - "balance_loss_mlp": 1.00052071, - "epoch": 0.18229370208928303, - "flos": 15231978524160.0, - "grad_norm": 1.83501853384953, - "language_loss": 0.79992211, - "learning_rate": 3.76159428580299e-06, - "loss": 0.81885058, - "num_input_tokens_seen": 65516505, - "step": 3032, - "time_per_iteration": 2.6879780292510986 - }, - { - "auxiliary_loss_clip": 0.01166412, - "auxiliary_loss_mlp": 0.01048902, - "balance_loss_clip": 1.06163025, - "balance_loss_mlp": 1.03038836, - "epoch": 0.182353825341951, - "flos": 23840483927040.0, - "grad_norm": 1.8132660189598853, - "language_loss": 0.81316388, - "learning_rate": 3.761409844706795e-06, - "loss": 0.83531702, - "num_input_tokens_seen": 65536160, - "step": 3033, - "time_per_iteration": 2.628100872039795 - }, - { - "auxiliary_loss_clip": 0.01048591, - "auxiliary_loss_mlp": 0.0100128, - "balance_loss_clip": 1.05392861, - "balance_loss_mlp": 0.99850291, - "epoch": 0.18241394859461896, - "flos": 61190957393280.0, - "grad_norm": 0.8825814513625035, - "language_loss": 0.63439631, - "learning_rate": 3.7612253368177625e-06, - "loss": 0.65489495, - "num_input_tokens_seen": 65589375, - "step": 3034, - "time_per_iteration": 3.2329187393188477 - }, - { - "auxiliary_loss_clip": 0.0112853, - "auxiliary_loss_mlp": 0.01041043, - "balance_loss_clip": 1.05698252, - "balance_loss_mlp": 1.02384114, - "epoch": 0.18247407184728695, - "flos": 18471694826880.0, - "grad_norm": 3.107937736318082, - "language_loss": 0.79893476, - "learning_rate": 3.7610407621428893e-06, - "loss": 0.82063049, - "num_input_tokens_seen": 65606720, - "step": 3035, - "time_per_iteration": 2.7644357681274414 - }, - { - "auxiliary_loss_clip": 0.01134115, - "auxiliary_loss_mlp": 0.01046396, - "balance_loss_clip": 1.05675578, - "balance_loss_mlp": 1.02906322, - "epoch": 0.18253419509995492, - "flos": 21795191602560.0, - "grad_norm": 1.870086430131469, - "language_loss": 0.85076666, - "learning_rate": 3.7608561206891735e-06, - "loss": 0.87257177, - "num_input_tokens_seen": 65625495, - "step": 3036, - "time_per_iteration": 2.7102303504943848 - }, - { - "auxiliary_loss_clip": 0.01140083, - "auxiliary_loss_mlp": 0.01039078, - "balance_loss_clip": 1.05572963, - "balance_loss_mlp": 1.02192414, - "epoch": 0.18259431835262288, - "flos": 20149764456960.0, - "grad_norm": 2.1821496235124727, - "language_loss": 0.80254716, - "learning_rate": 3.760671412463617e-06, - "loss": 0.82433879, - "num_input_tokens_seen": 65643515, - "step": 3037, - "time_per_iteration": 2.6703832149505615 - }, - { - "auxiliary_loss_clip": 0.01139652, - "auxiliary_loss_mlp": 0.00776941, - "balance_loss_clip": 1.05986989, - "balance_loss_mlp": 1.00062871, - "epoch": 0.18265444160529085, - "flos": 16981653916800.0, - "grad_norm": 3.0764011293768023, - "language_loss": 0.7950514, - "learning_rate": 3.7604866374732246e-06, - "loss": 0.81421733, - "num_input_tokens_seen": 65658155, - "step": 3038, - "time_per_iteration": 2.7410895824432373 - }, - { - "auxiliary_loss_clip": 0.01125628, - "auxiliary_loss_mlp": 0.01044597, - "balance_loss_clip": 1.05254972, - "balance_loss_mlp": 1.02551126, - "epoch": 0.1827145648579588, - "flos": 34423250509440.0, - "grad_norm": 1.9524772610579864, - "language_loss": 0.67722493, - "learning_rate": 3.7603017957250023e-06, - "loss": 0.69892722, - "num_input_tokens_seen": 65679310, - "step": 3039, - "time_per_iteration": 2.756833076477051 - }, - { - "auxiliary_loss_clip": 0.0113051, - "auxiliary_loss_mlp": 0.01051065, - "balance_loss_clip": 1.053087, - "balance_loss_mlp": 1.03304029, - "epoch": 0.18277468811062678, - "flos": 53287017264000.0, - "grad_norm": 1.8757227718998248, - "language_loss": 0.73394251, - "learning_rate": 3.7601168872259593e-06, - "loss": 0.75575823, - "num_input_tokens_seen": 65705235, - "step": 3040, - "time_per_iteration": 3.026679039001465 - }, - { - "auxiliary_loss_clip": 0.01143558, - "auxiliary_loss_mlp": 0.01042261, - "balance_loss_clip": 1.05585194, - "balance_loss_mlp": 1.02373624, - "epoch": 0.18283481136329474, - "flos": 31650659602560.0, - "grad_norm": 2.017308993436446, - "language_loss": 0.60348576, - "learning_rate": 3.7599319119831075e-06, - "loss": 0.62534392, - "num_input_tokens_seen": 65727575, - "step": 3041, - "time_per_iteration": 2.738554000854492 - }, - { - "auxiliary_loss_clip": 0.01116972, - "auxiliary_loss_mlp": 0.01053827, - "balance_loss_clip": 1.05058599, - "balance_loss_mlp": 1.03544497, - "epoch": 0.18289493461596273, - "flos": 53137664513280.0, - "grad_norm": 2.3558133433802104, - "language_loss": 0.59825706, - "learning_rate": 3.7597468700034616e-06, - "loss": 0.61996508, - "num_input_tokens_seen": 65751370, - "step": 3042, - "time_per_iteration": 3.0009193420410156 - }, - { - "auxiliary_loss_clip": 0.0112422, - "auxiliary_loss_mlp": 0.01046569, - "balance_loss_clip": 1.05319464, - "balance_loss_mlp": 1.02917695, - "epoch": 0.1829550578686307, - "flos": 25589369220480.0, - "grad_norm": 1.5313119565207096, - "language_loss": 0.8757726, - "learning_rate": 3.7595617612940374e-06, - "loss": 0.89748049, - "num_input_tokens_seen": 65771040, - "step": 3043, - "time_per_iteration": 2.7406487464904785 - }, - { - "auxiliary_loss_clip": 0.01056788, - "auxiliary_loss_mlp": 0.01056357, - "balance_loss_clip": 1.04592645, - "balance_loss_mlp": 1.03712869, - "epoch": 0.18301518112129866, - "flos": 22601422321920.0, - "grad_norm": 2.144378235575635, - "language_loss": 0.70980251, - "learning_rate": 3.7593765858618552e-06, - "loss": 0.73093396, - "num_input_tokens_seen": 65789345, - "step": 3044, - "time_per_iteration": 2.785931348800659 - }, - { - "auxiliary_loss_clip": 0.01105073, - "auxiliary_loss_mlp": 0.01059118, - "balance_loss_clip": 1.05111921, - "balance_loss_mlp": 1.0381608, - "epoch": 0.18307530437396663, - "flos": 34020799551360.0, - "grad_norm": 3.097061979225562, - "language_loss": 0.64460731, - "learning_rate": 3.7591913437139365e-06, - "loss": 0.66624922, - "num_input_tokens_seen": 65810990, - "step": 3045, - "time_per_iteration": 2.8085720539093018 - }, - { - "auxiliary_loss_clip": 0.01155246, - "auxiliary_loss_mlp": 0.01044973, - "balance_loss_clip": 1.05604315, - "balance_loss_mlp": 1.02780676, - "epoch": 0.1831354276266346, - "flos": 21279765392640.0, - "grad_norm": 11.455833434854163, - "language_loss": 0.78461385, - "learning_rate": 3.7590060348573066e-06, - "loss": 0.80661607, - "num_input_tokens_seen": 65827230, - "step": 3046, - "time_per_iteration": 2.603299140930176 - }, - { - "auxiliary_loss_clip": 0.01118725, - "auxiliary_loss_mlp": 0.01042864, - "balance_loss_clip": 1.04837, - "balance_loss_mlp": 1.0240643, - "epoch": 0.18319555087930256, - "flos": 21032952065280.0, - "grad_norm": 1.9889932097770582, - "language_loss": 0.78733194, - "learning_rate": 3.7588206592989903e-06, - "loss": 0.8089478, - "num_input_tokens_seen": 65845900, - "step": 3047, - "time_per_iteration": 2.7109453678131104 - }, - { - "auxiliary_loss_clip": 0.01144516, - "auxiliary_loss_mlp": 0.01042422, - "balance_loss_clip": 1.05723858, - "balance_loss_mlp": 1.0254705, - "epoch": 0.18325567413197055, - "flos": 34382958428160.0, - "grad_norm": 1.5191744259185578, - "language_loss": 0.80704039, - "learning_rate": 3.7586352170460194e-06, - "loss": 0.82890975, - "num_input_tokens_seen": 65868730, - "step": 3048, - "time_per_iteration": 2.7485053539276123 - }, - { - "auxiliary_loss_clip": 0.01139433, - "auxiliary_loss_mlp": 0.01046004, - "balance_loss_clip": 1.05405188, - "balance_loss_mlp": 1.02552414, - "epoch": 0.18331579738463852, - "flos": 20558464381440.0, - "grad_norm": 2.1437824577601354, - "language_loss": 0.86579728, - "learning_rate": 3.758449708105424e-06, - "loss": 0.88765168, - "num_input_tokens_seen": 65888420, - "step": 3049, - "time_per_iteration": 2.6876962184906006 - }, - { - "auxiliary_loss_clip": 0.01143881, - "auxiliary_loss_mlp": 0.01045208, - "balance_loss_clip": 1.05379057, - "balance_loss_mlp": 1.02544308, - "epoch": 0.18337592063730648, - "flos": 19607872901760.0, - "grad_norm": 2.616661567020713, - "language_loss": 0.77827966, - "learning_rate": 3.75826413248424e-06, - "loss": 0.80017054, - "num_input_tokens_seen": 65905840, - "step": 3050, - "time_per_iteration": 2.5814058780670166 - }, - { - "auxiliary_loss_clip": 0.01126116, - "auxiliary_loss_mlp": 0.01041302, - "balance_loss_clip": 1.04954183, - "balance_loss_mlp": 1.0238502, - "epoch": 0.18343604388997445, - "flos": 20850885002880.0, - "grad_norm": 2.3686375880611656, - "language_loss": 0.99064422, - "learning_rate": 3.7580784901895035e-06, - "loss": 1.01231837, - "num_input_tokens_seen": 65922845, - "step": 3051, - "time_per_iteration": 2.701848268508911 - }, - { - "auxiliary_loss_clip": 0.01125492, - "auxiliary_loss_mlp": 0.010397, - "balance_loss_clip": 1.05189931, - "balance_loss_mlp": 1.02078128, - "epoch": 0.1834961671426424, - "flos": 24394370624640.0, - "grad_norm": 2.0338529701436237, - "language_loss": 0.8607648, - "learning_rate": 3.7578927812282542e-06, - "loss": 0.88241673, - "num_input_tokens_seen": 65945555, - "step": 3052, - "time_per_iteration": 2.7252042293548584 - }, - { - "auxiliary_loss_clip": 0.01152967, - "auxiliary_loss_mlp": 0.01044648, - "balance_loss_clip": 1.05449986, - "balance_loss_mlp": 1.02737474, - "epoch": 0.18355629039531038, - "flos": 21251612108160.0, - "grad_norm": 1.8649432496703628, - "language_loss": 0.73393309, - "learning_rate": 3.7577070056075356e-06, - "loss": 0.7559092, - "num_input_tokens_seen": 65963965, - "step": 3053, - "time_per_iteration": 2.6331369876861572 - }, - { - "auxiliary_loss_clip": 0.01158728, - "auxiliary_loss_mlp": 0.01044052, - "balance_loss_clip": 1.05783379, - "balance_loss_mlp": 1.02565801, - "epoch": 0.18361641364797834, - "flos": 28656499651200.0, - "grad_norm": 1.5358769917973574, - "language_loss": 0.61891186, - "learning_rate": 3.7575211633343902e-06, - "loss": 0.64093965, - "num_input_tokens_seen": 65985965, - "step": 3054, - "time_per_iteration": 2.6792421340942383 - }, - { - "auxiliary_loss_clip": 0.01108826, - "auxiliary_loss_mlp": 0.01042654, - "balance_loss_clip": 1.05558836, - "balance_loss_mlp": 1.02502322, - "epoch": 0.18367653690064634, - "flos": 20918827578240.0, - "grad_norm": 2.2474279661883667, - "language_loss": 0.78218341, - "learning_rate": 3.7573352544158663e-06, - "loss": 0.80369824, - "num_input_tokens_seen": 66005645, - "step": 3055, - "time_per_iteration": 2.778691053390503 - }, - { - "auxiliary_loss_clip": 0.01096638, - "auxiliary_loss_mlp": 0.01050677, - "balance_loss_clip": 1.05003095, - "balance_loss_mlp": 1.03211594, - "epoch": 0.1837366601533143, - "flos": 28765596234240.0, - "grad_norm": 1.8043720478204575, - "language_loss": 0.7022509, - "learning_rate": 3.757149278859014e-06, - "loss": 0.72372401, - "num_input_tokens_seen": 66025675, - "step": 3056, - "time_per_iteration": 2.794254779815674 - }, - { - "auxiliary_loss_clip": 0.01140367, - "auxiliary_loss_mlp": 0.01038358, - "balance_loss_clip": 1.05211461, - "balance_loss_mlp": 1.02181149, - "epoch": 0.18379678340598227, - "flos": 21251432540160.0, - "grad_norm": 1.8709784760841586, - "language_loss": 0.80357504, - "learning_rate": 3.7569632366708842e-06, - "loss": 0.82536227, - "num_input_tokens_seen": 66046125, - "step": 3057, - "time_per_iteration": 2.644728899002075 - }, - { - "auxiliary_loss_clip": 0.01150041, - "auxiliary_loss_mlp": 0.01043781, - "balance_loss_clip": 1.05482352, - "balance_loss_mlp": 1.02332497, - "epoch": 0.18385690665865023, - "flos": 20449619193600.0, - "grad_norm": 7.225766788646501, - "language_loss": 0.82570755, - "learning_rate": 3.756777127858533e-06, - "loss": 0.84764576, - "num_input_tokens_seen": 66064375, - "step": 3058, - "time_per_iteration": 4.136845588684082 - }, - { - "auxiliary_loss_clip": 0.01119139, - "auxiliary_loss_mlp": 0.00776668, - "balance_loss_clip": 1.04992914, - "balance_loss_mlp": 1.00066566, - "epoch": 0.1839170299113182, - "flos": 26140562398080.0, - "grad_norm": 2.277694088171661, - "language_loss": 0.85071868, - "learning_rate": 3.756590952429017e-06, - "loss": 0.86967677, - "num_input_tokens_seen": 66084590, - "step": 3059, - "time_per_iteration": 2.745020866394043 - }, - { - "auxiliary_loss_clip": 0.01151831, - "auxiliary_loss_mlp": 0.00775088, - "balance_loss_clip": 1.05359423, - "balance_loss_mlp": 1.00077426, - "epoch": 0.18397715316398616, - "flos": 31758032332800.0, - "grad_norm": 2.3540516696336216, - "language_loss": 0.72983348, - "learning_rate": 3.756404710389396e-06, - "loss": 0.74910271, - "num_input_tokens_seen": 66107105, - "step": 3060, - "time_per_iteration": 5.792214393615723 - }, - { - "auxiliary_loss_clip": 0.01149482, - "auxiliary_loss_mlp": 0.01041417, - "balance_loss_clip": 1.05812132, - "balance_loss_mlp": 1.02266574, - "epoch": 0.18403727641665413, - "flos": 24611989173120.0, - "grad_norm": 1.5810457302838978, - "language_loss": 0.73126459, - "learning_rate": 3.7562184017467323e-06, - "loss": 0.75317359, - "num_input_tokens_seen": 66129295, - "step": 3061, - "time_per_iteration": 2.754167318344116 - }, - { - "auxiliary_loss_clip": 0.01138281, - "auxiliary_loss_mlp": 0.01043599, - "balance_loss_clip": 1.05435956, - "balance_loss_mlp": 1.02379823, - "epoch": 0.18409739966932212, - "flos": 23439900476160.0, - "grad_norm": 1.8413104246803462, - "language_loss": 0.81937188, - "learning_rate": 3.7560320265080906e-06, - "loss": 0.8411907, - "num_input_tokens_seen": 66146910, - "step": 3062, - "time_per_iteration": 2.7545394897460938 - }, - { - "auxiliary_loss_clip": 0.01144664, - "auxiliary_loss_mlp": 0.01040639, - "balance_loss_clip": 1.05668104, - "balance_loss_mlp": 1.02259111, - "epoch": 0.18415752292199009, - "flos": 21872112577920.0, - "grad_norm": 2.011374259171591, - "language_loss": 0.72994816, - "learning_rate": 3.7558455846805383e-06, - "loss": 0.75180125, - "num_input_tokens_seen": 66165370, - "step": 3063, - "time_per_iteration": 2.738293170928955 - }, - { - "auxiliary_loss_clip": 0.01133824, - "auxiliary_loss_mlp": 0.01040987, - "balance_loss_clip": 1.05164194, - "balance_loss_mlp": 1.02490544, - "epoch": 0.18421764617465805, - "flos": 25410678036480.0, - "grad_norm": 2.2975785147287953, - "language_loss": 0.65614092, - "learning_rate": 3.7556590762711463e-06, - "loss": 0.67788899, - "num_input_tokens_seen": 66186210, - "step": 3064, - "time_per_iteration": 4.404583930969238 - }, - { - "auxiliary_loss_clip": 0.01141547, - "auxiliary_loss_mlp": 0.01042996, - "balance_loss_clip": 1.05395937, - "balance_loss_mlp": 1.02498376, - "epoch": 0.18427776942732602, - "flos": 27198131558400.0, - "grad_norm": 2.1874829734431898, - "language_loss": 0.68347883, - "learning_rate": 3.7554725012869853e-06, - "loss": 0.70532429, - "num_input_tokens_seen": 66204800, - "step": 3065, - "time_per_iteration": 2.7149577140808105 - }, - { - "auxiliary_loss_clip": 0.01136969, - "auxiliary_loss_mlp": 0.01045319, - "balance_loss_clip": 1.05518305, - "balance_loss_mlp": 1.02674615, - "epoch": 0.18433789267999398, - "flos": 27852351920640.0, - "grad_norm": 2.2758854533642925, - "language_loss": 0.73142231, - "learning_rate": 3.7552858597351318e-06, - "loss": 0.75324523, - "num_input_tokens_seen": 66222195, - "step": 3066, - "time_per_iteration": 2.672675609588623 - }, - { - "auxiliary_loss_clip": 0.01125186, - "auxiliary_loss_mlp": 0.01043389, - "balance_loss_clip": 1.04947495, - "balance_loss_mlp": 1.0256983, - "epoch": 0.18439801593266195, - "flos": 17856940533120.0, - "grad_norm": 2.1067167513095444, - "language_loss": 0.82191038, - "learning_rate": 3.7550991516226622e-06, - "loss": 0.8435961, - "num_input_tokens_seen": 66239505, - "step": 3067, - "time_per_iteration": 2.697768211364746 - }, - { - "auxiliary_loss_clip": 0.01082345, - "auxiliary_loss_mlp": 0.00756782, - "balance_loss_clip": 1.04466891, - "balance_loss_mlp": 1.00113225, - "epoch": 0.18445813918532994, - "flos": 56389522590720.0, - "grad_norm": 0.7960107429271657, - "language_loss": 0.59750569, - "learning_rate": 3.754912376956657e-06, - "loss": 0.61589694, - "num_input_tokens_seen": 66295695, - "step": 3068, - "time_per_iteration": 3.0305213928222656 - }, - { - "auxiliary_loss_clip": 0.01127048, - "auxiliary_loss_mlp": 0.01041294, - "balance_loss_clip": 1.05452299, - "balance_loss_mlp": 1.02356791, - "epoch": 0.1845182624379979, - "flos": 20957180325120.0, - "grad_norm": 3.7299324256794244, - "language_loss": 0.76434112, - "learning_rate": 3.7547255357441987e-06, - "loss": 0.78602457, - "num_input_tokens_seen": 66315315, - "step": 3069, - "time_per_iteration": 2.6757962703704834 - }, - { - "auxiliary_loss_clip": 0.01146412, - "auxiliary_loss_mlp": 0.010456, - "balance_loss_clip": 1.05468106, - "balance_loss_mlp": 1.02798057, - "epoch": 0.18457838569066587, - "flos": 20485170679680.0, - "grad_norm": 1.9225240149566294, - "language_loss": 0.8491416, - "learning_rate": 3.7545386279923718e-06, - "loss": 0.87106168, - "num_input_tokens_seen": 66333675, - "step": 3070, - "time_per_iteration": 2.617023229598999 - }, - { - "auxiliary_loss_clip": 0.01127789, - "auxiliary_loss_mlp": 0.01043452, - "balance_loss_clip": 1.0553112, - "balance_loss_mlp": 1.02510571, - "epoch": 0.18463850894333383, - "flos": 25010022758400.0, - "grad_norm": 6.700503585098448, - "language_loss": 0.77807182, - "learning_rate": 3.754351653708265e-06, - "loss": 0.79978424, - "num_input_tokens_seen": 66354075, - "step": 3071, - "time_per_iteration": 2.847329616546631 - }, - { - "auxiliary_loss_clip": 0.01109458, - "auxiliary_loss_mlp": 0.01049978, - "balance_loss_clip": 1.05054557, - "balance_loss_mlp": 1.03154778, - "epoch": 0.1846986321960018, - "flos": 16800628348800.0, - "grad_norm": 2.0836336776071565, - "language_loss": 0.77414191, - "learning_rate": 3.7541646128989674e-06, - "loss": 0.79573631, - "num_input_tokens_seen": 66372520, - "step": 3072, - "time_per_iteration": 2.780921220779419 - }, - { - "auxiliary_loss_clip": 0.01138997, - "auxiliary_loss_mlp": 0.01043594, - "balance_loss_clip": 1.05106127, - "balance_loss_mlp": 1.02465141, - "epoch": 0.18475875544866976, - "flos": 20814327936000.0, - "grad_norm": 4.959080593148226, - "language_loss": 0.86546457, - "learning_rate": 3.7539775055715715e-06, - "loss": 0.88729048, - "num_input_tokens_seen": 66390745, - "step": 3073, - "time_per_iteration": 2.631913661956787 - }, - { - "auxiliary_loss_clip": 0.01158717, - "auxiliary_loss_mlp": 0.0104013, - "balance_loss_clip": 1.05862749, - "balance_loss_mlp": 1.02366686, - "epoch": 0.18481887870133773, - "flos": 22601422321920.0, - "grad_norm": 2.162700927804164, - "language_loss": 0.91831195, - "learning_rate": 3.7537903317331732e-06, - "loss": 0.94030046, - "num_input_tokens_seen": 66410525, - "step": 3074, - "time_per_iteration": 2.6152567863464355 - }, - { - "auxiliary_loss_clip": 0.01104968, - "auxiliary_loss_mlp": 0.01047718, - "balance_loss_clip": 1.04757643, - "balance_loss_mlp": 1.02763104, - "epoch": 0.18487900195400572, - "flos": 29458815788160.0, - "grad_norm": 1.9967983521568784, - "language_loss": 0.64783108, - "learning_rate": 3.75360309139087e-06, - "loss": 0.66935796, - "num_input_tokens_seen": 66432535, - "step": 3075, - "time_per_iteration": 2.763559103012085 - }, - { - "auxiliary_loss_clip": 0.01135247, - "auxiliary_loss_mlp": 0.01046601, - "balance_loss_clip": 1.05689573, - "balance_loss_mlp": 1.02913702, - "epoch": 0.1849391252066737, - "flos": 20628777254400.0, - "grad_norm": 1.8996898495981898, - "language_loss": 0.72803432, - "learning_rate": 3.753415784551761e-06, - "loss": 0.74985278, - "num_input_tokens_seen": 66450620, - "step": 3076, - "time_per_iteration": 2.76629376411438 - }, - { - "auxiliary_loss_clip": 0.01124833, - "auxiliary_loss_mlp": 0.01042344, - "balance_loss_clip": 1.0584389, - "balance_loss_mlp": 1.0249157, - "epoch": 0.18499924845934165, - "flos": 14428549065600.0, - "grad_norm": 2.4862024108169556, - "language_loss": 0.80772626, - "learning_rate": 3.7532284112229507e-06, - "loss": 0.82939804, - "num_input_tokens_seen": 66467865, - "step": 3077, - "time_per_iteration": 2.7296142578125 - }, - { - "auxiliary_loss_clip": 0.01128471, - "auxiliary_loss_mlp": 0.01041495, - "balance_loss_clip": 1.05401397, - "balance_loss_mlp": 1.02428079, - "epoch": 0.18505937171200962, - "flos": 23727652329600.0, - "grad_norm": 1.8214336253769514, - "language_loss": 0.78693211, - "learning_rate": 3.7530409714115424e-06, - "loss": 0.80863178, - "num_input_tokens_seen": 66486245, - "step": 3078, - "time_per_iteration": 2.715838670730591 - }, - { - "auxiliary_loss_clip": 0.01154963, - "auxiliary_loss_mlp": 0.01043373, - "balance_loss_clip": 1.05546641, - "balance_loss_mlp": 1.02655268, - "epoch": 0.18511949496467758, - "flos": 25957489754880.0, - "grad_norm": 1.7455066055145632, - "language_loss": 0.77326959, - "learning_rate": 3.7528534651246453e-06, - "loss": 0.79525292, - "num_input_tokens_seen": 66506510, - "step": 3079, - "time_per_iteration": 2.674128770828247 - }, - { - "auxiliary_loss_clip": 0.01119079, - "auxiliary_loss_mlp": 0.01041512, - "balance_loss_clip": 1.04717147, - "balance_loss_mlp": 1.02328515, - "epoch": 0.18517961821734555, - "flos": 42413553912960.0, - "grad_norm": 1.885086933557342, - "language_loss": 0.82143807, - "learning_rate": 3.752665892369369e-06, - "loss": 0.84304404, - "num_input_tokens_seen": 66530960, - "step": 3080, - "time_per_iteration": 2.906940460205078 - }, - { - "auxiliary_loss_clip": 0.01123637, - "auxiliary_loss_mlp": 0.01044031, - "balance_loss_clip": 1.05894399, - "balance_loss_mlp": 1.02563691, - "epoch": 0.18523974147001354, - "flos": 24097568544000.0, - "grad_norm": 2.065822240576764, - "language_loss": 0.73973286, - "learning_rate": 3.7524782531528266e-06, - "loss": 0.76140958, - "num_input_tokens_seen": 66550275, - "step": 3081, - "time_per_iteration": 2.7960739135742188 - }, - { - "auxiliary_loss_clip": 0.01126977, - "auxiliary_loss_mlp": 0.01051674, - "balance_loss_clip": 1.05360913, - "balance_loss_mlp": 1.03286242, - "epoch": 0.1852998647226815, - "flos": 27375278457600.0, - "grad_norm": 1.9854893879184425, - "language_loss": 0.71991849, - "learning_rate": 3.7522905474821334e-06, - "loss": 0.74170506, - "num_input_tokens_seen": 66569040, - "step": 3082, - "time_per_iteration": 2.6965079307556152 - }, - { - "auxiliary_loss_clip": 0.01124933, - "auxiliary_loss_mlp": 0.01046296, - "balance_loss_clip": 1.05649543, - "balance_loss_mlp": 1.02694798, - "epoch": 0.18535998797534947, - "flos": 18332757020160.0, - "grad_norm": 2.0424653419479886, - "language_loss": 0.69580144, - "learning_rate": 3.752102775364407e-06, - "loss": 0.71751374, - "num_input_tokens_seen": 66587775, - "step": 3083, - "time_per_iteration": 2.727252721786499 - }, - { - "auxiliary_loss_clip": 0.01122388, - "auxiliary_loss_mlp": 0.01046999, - "balance_loss_clip": 1.05204451, - "balance_loss_mlp": 1.02964258, - "epoch": 0.18542011122801744, - "flos": 37845859887360.0, - "grad_norm": 2.185713468975319, - "language_loss": 0.68965334, - "learning_rate": 3.751914936806767e-06, - "loss": 0.71134722, - "num_input_tokens_seen": 66610800, - "step": 3084, - "time_per_iteration": 2.95849871635437 - }, - { - "auxiliary_loss_clip": 0.01155184, - "auxiliary_loss_mlp": 0.01043029, - "balance_loss_clip": 1.05578482, - "balance_loss_mlp": 1.0257436, - "epoch": 0.1854802344806854, - "flos": 25186128163200.0, - "grad_norm": 1.6859724806626923, - "language_loss": 0.77390355, - "learning_rate": 3.7517270318163377e-06, - "loss": 0.79588568, - "num_input_tokens_seen": 66630960, - "step": 3085, - "time_per_iteration": 2.68961501121521 - }, - { - "auxiliary_loss_clip": 0.01152089, - "auxiliary_loss_mlp": 0.01049004, - "balance_loss_clip": 1.05316019, - "balance_loss_mlp": 1.03142118, - "epoch": 0.18554035773335337, - "flos": 26684788337280.0, - "grad_norm": 1.993169596996871, - "language_loss": 0.73752379, - "learning_rate": 3.751539060400244e-06, - "loss": 0.75953472, - "num_input_tokens_seen": 66650585, - "step": 3086, - "time_per_iteration": 2.652475595474243 - }, - { - "auxiliary_loss_clip": 0.01142754, - "auxiliary_loss_mlp": 0.01049865, - "balance_loss_clip": 1.05530787, - "balance_loss_mlp": 1.03134012, - "epoch": 0.18560048098602133, - "flos": 22346887570560.0, - "grad_norm": 7.927127736744579, - "language_loss": 0.69762361, - "learning_rate": 3.7513510225656132e-06, - "loss": 0.71954978, - "num_input_tokens_seen": 66670045, - "step": 3087, - "time_per_iteration": 2.668849229812622 - }, - { - "auxiliary_loss_clip": 0.01119022, - "auxiliary_loss_mlp": 0.01055302, - "balance_loss_clip": 1.05543649, - "balance_loss_mlp": 1.03546548, - "epoch": 0.18566060423868933, - "flos": 17748526308480.0, - "grad_norm": 2.1117122734340263, - "language_loss": 0.72513628, - "learning_rate": 3.7511629183195764e-06, - "loss": 0.74687952, - "num_input_tokens_seen": 66688790, - "step": 3088, - "time_per_iteration": 2.7150719165802 - }, - { - "auxiliary_loss_clip": 0.0112638, - "auxiliary_loss_mlp": 0.01044188, - "balance_loss_clip": 1.04933047, - "balance_loss_mlp": 1.02616334, - "epoch": 0.1857207274913573, - "flos": 24677274142080.0, - "grad_norm": 2.112009927874319, - "language_loss": 0.91859758, - "learning_rate": 3.7509747476692663e-06, - "loss": 0.94030321, - "num_input_tokens_seen": 66708090, - "step": 3089, - "time_per_iteration": 2.7239248752593994 - }, - { - "auxiliary_loss_clip": 0.01104754, - "auxiliary_loss_mlp": 0.01046981, - "balance_loss_clip": 1.0494597, - "balance_loss_mlp": 1.02919531, - "epoch": 0.18578085074402526, - "flos": 28147825198080.0, - "grad_norm": 2.490831087537115, - "language_loss": 0.57275403, - "learning_rate": 3.7507865106218176e-06, - "loss": 0.59427136, - "num_input_tokens_seen": 66727320, - "step": 3090, - "time_per_iteration": 2.8263309001922607 - }, - { - "auxiliary_loss_clip": 0.01125877, - "auxiliary_loss_mlp": 0.0104478, - "balance_loss_clip": 1.04981184, - "balance_loss_mlp": 1.02636242, - "epoch": 0.18584097399669322, - "flos": 23951878980480.0, - "grad_norm": 1.7797305478565062, - "language_loss": 0.81704801, - "learning_rate": 3.7505982071843695e-06, - "loss": 0.83875453, - "num_input_tokens_seen": 66747505, - "step": 3091, - "time_per_iteration": 2.697525978088379 - }, - { - "auxiliary_loss_clip": 0.01101743, - "auxiliary_loss_mlp": 0.01050837, - "balance_loss_clip": 1.04999971, - "balance_loss_mlp": 1.03277707, - "epoch": 0.18590109724936119, - "flos": 17201678676480.0, - "grad_norm": 2.0826959244757832, - "language_loss": 0.83704746, - "learning_rate": 3.7504098373640617e-06, - "loss": 0.8585732, - "num_input_tokens_seen": 66766425, - "step": 3092, - "time_per_iteration": 2.8379435539245605 - }, - { - "auxiliary_loss_clip": 0.01136846, - "auxiliary_loss_mlp": 0.01048758, - "balance_loss_clip": 1.05389428, - "balance_loss_mlp": 1.03036356, - "epoch": 0.18596122050202915, - "flos": 17234644383360.0, - "grad_norm": 5.439917179387958, - "language_loss": 0.93443698, - "learning_rate": 3.750221401168038e-06, - "loss": 0.95629299, - "num_input_tokens_seen": 66781130, - "step": 3093, - "time_per_iteration": 2.8053483963012695 - }, - { - "auxiliary_loss_clip": 0.01130362, - "auxiliary_loss_mlp": 0.01042367, - "balance_loss_clip": 1.05440521, - "balance_loss_mlp": 1.02464092, - "epoch": 0.18602134375469712, - "flos": 19020733188480.0, - "grad_norm": 1.7318887555782294, - "language_loss": 0.77516603, - "learning_rate": 3.750032898603443e-06, - "loss": 0.7968933, - "num_input_tokens_seen": 66797535, - "step": 3094, - "time_per_iteration": 2.7402310371398926 - }, - { - "auxiliary_loss_clip": 0.0109741, - "auxiliary_loss_mlp": 0.01049219, - "balance_loss_clip": 1.0519228, - "balance_loss_mlp": 1.0323391, - "epoch": 0.1860814670073651, - "flos": 50950094417280.0, - "grad_norm": 1.7033453736007413, - "language_loss": 0.69854707, - "learning_rate": 3.749844329677425e-06, - "loss": 0.72001338, - "num_input_tokens_seen": 66821720, - "step": 3095, - "time_per_iteration": 3.133192777633667 - }, - { - "auxiliary_loss_clip": 0.01113224, - "auxiliary_loss_mlp": 0.010546, - "balance_loss_clip": 1.0511899, - "balance_loss_mlp": 1.03415525, - "epoch": 0.18614159026003307, - "flos": 19390972625280.0, - "grad_norm": 2.2828801406167307, - "language_loss": 0.81214821, - "learning_rate": 3.749655694397135e-06, - "loss": 0.83382642, - "num_input_tokens_seen": 66839060, - "step": 3096, - "time_per_iteration": 2.7599101066589355 - }, - { - "auxiliary_loss_clip": 0.01147399, - "auxiliary_loss_mlp": 0.0104683, - "balance_loss_clip": 1.05678356, - "balance_loss_mlp": 1.02810192, - "epoch": 0.18620171351270104, - "flos": 21798782962560.0, - "grad_norm": 2.430947734084612, - "language_loss": 0.75326216, - "learning_rate": 3.7494669927697255e-06, - "loss": 0.77520448, - "num_input_tokens_seen": 66857760, - "step": 3097, - "time_per_iteration": 4.255983114242554 - }, - { - "auxiliary_loss_clip": 0.01133757, - "auxiliary_loss_mlp": 0.01050365, - "balance_loss_clip": 1.05756521, - "balance_loss_mlp": 1.03228104, - "epoch": 0.186261836765369, - "flos": 16362877299840.0, - "grad_norm": 2.553895603581972, - "language_loss": 0.66602015, - "learning_rate": 3.749278224802352e-06, - "loss": 0.68786132, - "num_input_tokens_seen": 66876460, - "step": 3098, - "time_per_iteration": 2.723567247390747 - }, - { - "auxiliary_loss_clip": 0.01163461, - "auxiliary_loss_mlp": 0.01052357, - "balance_loss_clip": 1.05991709, - "balance_loss_mlp": 1.03212702, - "epoch": 0.18632196001803697, - "flos": 23370054480000.0, - "grad_norm": 1.6168121451860142, - "language_loss": 0.69838905, - "learning_rate": 3.7490893905021733e-06, - "loss": 0.7205472, - "num_input_tokens_seen": 66897960, - "step": 3099, - "time_per_iteration": 5.687380075454712 - }, - { - "auxiliary_loss_clip": 0.01148363, - "auxiliary_loss_mlp": 0.01051556, - "balance_loss_clip": 1.05713868, - "balance_loss_mlp": 1.03243458, - "epoch": 0.18638208327070493, - "flos": 22492002516480.0, - "grad_norm": 1.7060244708994476, - "language_loss": 0.71840072, - "learning_rate": 3.7489004898763494e-06, - "loss": 0.74039996, - "num_input_tokens_seen": 66917675, - "step": 3100, - "time_per_iteration": 2.6711015701293945 - }, - { - "auxiliary_loss_clip": 0.01138377, - "auxiliary_loss_mlp": 0.01050667, - "balance_loss_clip": 1.05749035, - "balance_loss_mlp": 1.03133154, - "epoch": 0.18644220652337293, - "flos": 29165245931520.0, - "grad_norm": 1.9639279354826686, - "language_loss": 0.80343997, - "learning_rate": 3.7487115229320444e-06, - "loss": 0.82533038, - "num_input_tokens_seen": 66936000, - "step": 3101, - "time_per_iteration": 2.6996583938598633 - }, - { - "auxiliary_loss_clip": 0.01112778, - "auxiliary_loss_mlp": 0.01042097, - "balance_loss_clip": 1.05307627, - "balance_loss_mlp": 1.02478826, - "epoch": 0.1865023297760409, - "flos": 24243796811520.0, - "grad_norm": 1.8804860702941575, - "language_loss": 0.77053607, - "learning_rate": 3.7485224896764222e-06, - "loss": 0.79208481, - "num_input_tokens_seen": 66955700, - "step": 3102, - "time_per_iteration": 2.726146936416626 - }, - { - "auxiliary_loss_clip": 0.01150817, - "auxiliary_loss_mlp": 0.01039303, - "balance_loss_clip": 1.057688, - "balance_loss_mlp": 1.0213027, - "epoch": 0.18656245302870886, - "flos": 19128716449920.0, - "grad_norm": 2.314682178811096, - "language_loss": 0.76689744, - "learning_rate": 3.7483333901166525e-06, - "loss": 0.78879869, - "num_input_tokens_seen": 66972815, - "step": 3103, - "time_per_iteration": 4.374122619628906 - }, - { - "auxiliary_loss_clip": 0.01132531, - "auxiliary_loss_mlp": 0.0104481, - "balance_loss_clip": 1.05477643, - "balance_loss_mlp": 1.02671361, - "epoch": 0.18662257628137682, - "flos": 17786088956160.0, - "grad_norm": 1.6956506235876265, - "language_loss": 0.79252636, - "learning_rate": 3.7481442242599054e-06, - "loss": 0.8142997, - "num_input_tokens_seen": 66992280, - "step": 3104, - "time_per_iteration": 2.695012092590332 - }, - { - "auxiliary_loss_clip": 0.01106786, - "auxiliary_loss_mlp": 0.01050273, - "balance_loss_clip": 1.05117702, - "balance_loss_mlp": 1.03096056, - "epoch": 0.1866826995340448, - "flos": 24024382583040.0, - "grad_norm": 2.065624302338532, - "language_loss": 0.8496474, - "learning_rate": 3.747954992113354e-06, - "loss": 0.87121809, - "num_input_tokens_seen": 67012220, - "step": 3105, - "time_per_iteration": 2.761521816253662 - }, - { - "auxiliary_loss_clip": 0.0112324, - "auxiliary_loss_mlp": 0.01043689, - "balance_loss_clip": 1.05166531, - "balance_loss_mlp": 1.02407932, - "epoch": 0.18674282278671275, - "flos": 26141244756480.0, - "grad_norm": 1.8352441384571676, - "language_loss": 0.86880243, - "learning_rate": 3.7477656936841742e-06, - "loss": 0.8904717, - "num_input_tokens_seen": 67032030, - "step": 3106, - "time_per_iteration": 2.785738706588745 - }, - { - "auxiliary_loss_clip": 0.01150222, - "auxiliary_loss_mlp": 0.01040973, - "balance_loss_clip": 1.0566026, - "balance_loss_mlp": 1.02281737, - "epoch": 0.18680294603938072, - "flos": 19201938324480.0, - "grad_norm": 2.128833658771433, - "language_loss": 0.78226906, - "learning_rate": 3.7475763289795445e-06, - "loss": 0.80418098, - "num_input_tokens_seen": 67048920, - "step": 3107, - "time_per_iteration": 2.693995237350464 - }, - { - "auxiliary_loss_clip": 0.01153763, - "auxiliary_loss_mlp": 0.01053056, - "balance_loss_clip": 1.05873394, - "balance_loss_mlp": 1.03341043, - "epoch": 0.1868630692920487, - "flos": 28544889116160.0, - "grad_norm": 3.0927798335187506, - "language_loss": 0.74159014, - "learning_rate": 3.7473868980066446e-06, - "loss": 0.7636584, - "num_input_tokens_seen": 67068645, - "step": 3108, - "time_per_iteration": 2.795715570449829 - }, - { - "auxiliary_loss_clip": 0.01107582, - "auxiliary_loss_mlp": 0.01042714, - "balance_loss_clip": 1.05207491, - "balance_loss_mlp": 1.02451098, - "epoch": 0.18692319254471668, - "flos": 17238020261760.0, - "grad_norm": 1.6837485322309411, - "language_loss": 0.74348569, - "learning_rate": 3.747197400772658e-06, - "loss": 0.76498872, - "num_input_tokens_seen": 67087075, - "step": 3109, - "time_per_iteration": 2.7627830505371094 - }, - { - "auxiliary_loss_clip": 0.01145572, - "auxiliary_loss_mlp": 0.01044117, - "balance_loss_clip": 1.05631042, - "balance_loss_mlp": 1.02526462, - "epoch": 0.18698331579738464, - "flos": 23185186156800.0, - "grad_norm": 1.499459601293056, - "language_loss": 0.84250218, - "learning_rate": 3.747007837284772e-06, - "loss": 0.86439908, - "num_input_tokens_seen": 67108040, - "step": 3110, - "time_per_iteration": 2.7665328979492188 - }, - { - "auxiliary_loss_clip": 0.01147578, - "auxiliary_loss_mlp": 0.01042389, - "balance_loss_clip": 1.05929494, - "balance_loss_mlp": 1.02381575, - "epoch": 0.1870434390500526, - "flos": 25516721963520.0, - "grad_norm": 1.9108380391903876, - "language_loss": 0.84738445, - "learning_rate": 3.7468182075501737e-06, - "loss": 0.86928415, - "num_input_tokens_seen": 67127605, - "step": 3111, - "time_per_iteration": 2.729233741760254 - }, - { - "auxiliary_loss_clip": 0.01128, - "auxiliary_loss_mlp": 0.01044544, - "balance_loss_clip": 1.05348754, - "balance_loss_mlp": 1.02635229, - "epoch": 0.18710356230272057, - "flos": 19500823393920.0, - "grad_norm": 1.8704338434966796, - "language_loss": 0.76875687, - "learning_rate": 3.7466285115760536e-06, - "loss": 0.79048228, - "num_input_tokens_seen": 67145785, - "step": 3112, - "time_per_iteration": 2.7392494678497314 - }, - { - "auxiliary_loss_clip": 0.0114846, - "auxiliary_loss_mlp": 0.0104709, - "balance_loss_clip": 1.05636978, - "balance_loss_mlp": 1.02913654, - "epoch": 0.18716368555538854, - "flos": 26760847386240.0, - "grad_norm": 1.8996972204761096, - "language_loss": 0.64466536, - "learning_rate": 3.7464387493696046e-06, - "loss": 0.66662085, - "num_input_tokens_seen": 67165930, - "step": 3113, - "time_per_iteration": 2.7393765449523926 - }, - { - "auxiliary_loss_clip": 0.01153807, - "auxiliary_loss_mlp": 0.01048748, - "balance_loss_clip": 1.05685568, - "balance_loss_mlp": 1.02900672, - "epoch": 0.1872238088080565, - "flos": 25189827264000.0, - "grad_norm": 6.483287708452815, - "language_loss": 0.817972, - "learning_rate": 3.746248920938024e-06, - "loss": 0.83999759, - "num_input_tokens_seen": 67185830, - "step": 3114, - "time_per_iteration": 2.740229368209839 - }, - { - "auxiliary_loss_clip": 0.01104278, - "auxiliary_loss_mlp": 0.01050738, - "balance_loss_clip": 1.04921412, - "balance_loss_mlp": 1.03024614, - "epoch": 0.1872839320607245, - "flos": 24134305178880.0, - "grad_norm": 2.3064843449079175, - "language_loss": 0.57413173, - "learning_rate": 3.74605902628851e-06, - "loss": 0.59568191, - "num_input_tokens_seen": 67206930, - "step": 3115, - "time_per_iteration": 2.811549663543701 - }, - { - "auxiliary_loss_clip": 0.01123025, - "auxiliary_loss_mlp": 0.01052226, - "balance_loss_clip": 1.05446446, - "balance_loss_mlp": 1.03241396, - "epoch": 0.18734405531339246, - "flos": 21173793292800.0, - "grad_norm": 2.577640519639585, - "language_loss": 0.70842528, - "learning_rate": 3.745869065428261e-06, - "loss": 0.73017788, - "num_input_tokens_seen": 67226290, - "step": 3116, - "time_per_iteration": 2.8053951263427734 - }, - { - "auxiliary_loss_clip": 0.0115042, - "auxiliary_loss_mlp": 0.01035569, - "balance_loss_clip": 1.05196476, - "balance_loss_mlp": 1.01787841, - "epoch": 0.18740417856606043, - "flos": 17237697039360.0, - "grad_norm": 3.010261965906642, - "language_loss": 0.78994375, - "learning_rate": 3.7456790383644833e-06, - "loss": 0.81180358, - "num_input_tokens_seen": 67244410, - "step": 3117, - "time_per_iteration": 2.819415330886841 - }, - { - "auxiliary_loss_clip": 0.01132901, - "auxiliary_loss_mlp": 0.01049724, - "balance_loss_clip": 1.05260777, - "balance_loss_mlp": 1.03047204, - "epoch": 0.1874643018187284, - "flos": 32558049999360.0, - "grad_norm": 2.2828109389679865, - "language_loss": 0.83903432, - "learning_rate": 3.745488945104381e-06, - "loss": 0.86086059, - "num_input_tokens_seen": 67264470, - "step": 3118, - "time_per_iteration": 2.783804416656494 - }, - { - "auxiliary_loss_clip": 0.01144867, - "auxiliary_loss_mlp": 0.0104452, - "balance_loss_clip": 1.05412436, - "balance_loss_mlp": 1.02688873, - "epoch": 0.18752442507139636, - "flos": 23258156636160.0, - "grad_norm": 3.566737352043019, - "language_loss": 0.76283264, - "learning_rate": 3.7452987856551636e-06, - "loss": 0.78472656, - "num_input_tokens_seen": 67284315, - "step": 3119, - "time_per_iteration": 2.6872506141662598 - }, - { - "auxiliary_loss_clip": 0.01156835, - "auxiliary_loss_mlp": 0.01046653, - "balance_loss_clip": 1.05519438, - "balance_loss_mlp": 1.02899814, - "epoch": 0.18758454832406432, - "flos": 21760933006080.0, - "grad_norm": 1.7224942549361077, - "language_loss": 0.82017547, - "learning_rate": 3.7451085600240406e-06, - "loss": 0.84221041, - "num_input_tokens_seen": 67302780, - "step": 3120, - "time_per_iteration": 2.637505292892456 - }, - { - "auxiliary_loss_clip": 0.0113033, - "auxiliary_loss_mlp": 0.01035538, - "balance_loss_clip": 1.05060756, - "balance_loss_mlp": 1.01828837, - "epoch": 0.1876446715767323, - "flos": 29570210841600.0, - "grad_norm": 2.5027223446471982, - "language_loss": 0.84992659, - "learning_rate": 3.7449182682182263e-06, - "loss": 0.87158525, - "num_input_tokens_seen": 67323405, - "step": 3121, - "time_per_iteration": 2.788353681564331 - }, - { - "auxiliary_loss_clip": 0.01096681, - "auxiliary_loss_mlp": 0.0104429, - "balance_loss_clip": 1.045645, - "balance_loss_mlp": 1.02599168, - "epoch": 0.18770479482940028, - "flos": 30339992234880.0, - "grad_norm": 2.1738591443482362, - "language_loss": 0.70032287, - "learning_rate": 3.744727910244937e-06, - "loss": 0.72173256, - "num_input_tokens_seen": 67345800, - "step": 3122, - "time_per_iteration": 3.0225250720977783 - }, - { - "auxiliary_loss_clip": 0.01153439, - "auxiliary_loss_mlp": 0.01042355, - "balance_loss_clip": 1.05445123, - "balance_loss_mlp": 1.02288795, - "epoch": 0.18776491808206824, - "flos": 14465357527680.0, - "grad_norm": 4.839579375412361, - "language_loss": 0.70661515, - "learning_rate": 3.7445374861113905e-06, - "loss": 0.72857308, - "num_input_tokens_seen": 67363575, - "step": 3123, - "time_per_iteration": 2.779904365539551 - }, - { - "auxiliary_loss_clip": 0.01142265, - "auxiliary_loss_mlp": 0.01041425, - "balance_loss_clip": 1.05286181, - "balance_loss_mlp": 1.02454507, - "epoch": 0.1878250413347362, - "flos": 24498547044480.0, - "grad_norm": 2.057520579072589, - "language_loss": 0.74103826, - "learning_rate": 3.7443469958248066e-06, - "loss": 0.76287514, - "num_input_tokens_seen": 67381765, - "step": 3124, - "time_per_iteration": 2.6336071491241455 - }, - { - "auxiliary_loss_clip": 0.01157579, - "auxiliary_loss_mlp": 0.01052509, - "balance_loss_clip": 1.05653572, - "balance_loss_mlp": 1.03333998, - "epoch": 0.18788516458740417, - "flos": 39786185692800.0, - "grad_norm": 3.0670363966795096, - "language_loss": 0.80654436, - "learning_rate": 3.7441564393924106e-06, - "loss": 0.82864523, - "num_input_tokens_seen": 67405000, - "step": 3125, - "time_per_iteration": 2.7224199771881104 - }, - { - "auxiliary_loss_clip": 0.01046615, - "auxiliary_loss_mlp": 0.01006504, - "balance_loss_clip": 1.04444218, - "balance_loss_mlp": 1.00435853, - "epoch": 0.18794528784007214, - "flos": 64699250664960.0, - "grad_norm": 0.9424570711133922, - "language_loss": 0.63647306, - "learning_rate": 3.7439658168214273e-06, - "loss": 0.65700436, - "num_input_tokens_seen": 67467140, - "step": 3126, - "time_per_iteration": 3.313321113586426 - }, - { - "auxiliary_loss_clip": 0.01128308, - "auxiliary_loss_mlp": 0.01040458, - "balance_loss_clip": 1.05377257, - "balance_loss_mlp": 1.02236164, - "epoch": 0.1880054110927401, - "flos": 28622061486720.0, - "grad_norm": 1.8734163453478039, - "language_loss": 0.81308508, - "learning_rate": 3.7437751281190857e-06, - "loss": 0.83477271, - "num_input_tokens_seen": 67487980, - "step": 3127, - "time_per_iteration": 2.7137866020202637 - }, - { - "auxiliary_loss_clip": 0.01088267, - "auxiliary_loss_mlp": 0.0101138, - "balance_loss_clip": 1.04814553, - "balance_loss_mlp": 1.00912714, - "epoch": 0.1880655343454081, - "flos": 64488958490880.0, - "grad_norm": 0.7699217277386954, - "language_loss": 0.61922526, - "learning_rate": 3.7435843732926164e-06, - "loss": 0.64022171, - "num_input_tokens_seen": 67552500, - "step": 3128, - "time_per_iteration": 3.264270782470703 - }, - { - "auxiliary_loss_clip": 0.01108205, - "auxiliary_loss_mlp": 0.01049422, - "balance_loss_clip": 1.04763842, - "balance_loss_mlp": 1.02907288, - "epoch": 0.18812565759807606, - "flos": 32124464928000.0, - "grad_norm": 2.4867495334212175, - "language_loss": 0.70985162, - "learning_rate": 3.7433935523492536e-06, - "loss": 0.73142785, - "num_input_tokens_seen": 67573295, - "step": 3129, - "time_per_iteration": 2.79929256439209 - }, - { - "auxiliary_loss_clip": 0.01158485, - "auxiliary_loss_mlp": 0.01050611, - "balance_loss_clip": 1.05767536, - "balance_loss_mlp": 1.03109634, - "epoch": 0.18818578085074403, - "flos": 20624539449600.0, - "grad_norm": 2.4831518001798676, - "language_loss": 0.85035253, - "learning_rate": 3.7432026652962314e-06, - "loss": 0.87244344, - "num_input_tokens_seen": 67590010, - "step": 3130, - "time_per_iteration": 2.60624361038208 - }, - { - "auxiliary_loss_clip": 0.01107202, - "auxiliary_loss_mlp": 0.01049966, - "balance_loss_clip": 1.04649067, - "balance_loss_mlp": 1.03023696, - "epoch": 0.188245904103412, - "flos": 28840506048000.0, - "grad_norm": 9.096753382647533, - "language_loss": 0.7643525, - "learning_rate": 3.7430117121407897e-06, - "loss": 0.7859242, - "num_input_tokens_seen": 67611110, - "step": 3131, - "time_per_iteration": 2.759230136871338 - }, - { - "auxiliary_loss_clip": 0.0112329, - "auxiliary_loss_mlp": 0.01049221, - "balance_loss_clip": 1.05344164, - "balance_loss_mlp": 1.03014708, - "epoch": 0.18830602735607996, - "flos": 29420319386880.0, - "grad_norm": 2.109252219381847, - "language_loss": 0.80713749, - "learning_rate": 3.74282069289017e-06, - "loss": 0.82886261, - "num_input_tokens_seen": 67631990, - "step": 3132, - "time_per_iteration": 2.773817777633667 - }, - { - "auxiliary_loss_clip": 0.01093588, - "auxiliary_loss_mlp": 0.00779094, - "balance_loss_clip": 1.04652429, - "balance_loss_mlp": 1.00091529, - "epoch": 0.18836615060874792, - "flos": 28872933050880.0, - "grad_norm": 2.092242478448591, - "language_loss": 0.79653811, - "learning_rate": 3.742629607551614e-06, - "loss": 0.81526494, - "num_input_tokens_seen": 67650490, - "step": 3133, - "time_per_iteration": 2.7873754501342773 - }, - { - "auxiliary_loss_clip": 0.01119878, - "auxiliary_loss_mlp": 0.01059381, - "balance_loss_clip": 1.05341148, - "balance_loss_mlp": 1.03921056, - "epoch": 0.18842627386141592, - "flos": 22601673717120.0, - "grad_norm": 1.9069857551930867, - "language_loss": 0.83001804, - "learning_rate": 3.7424384561323698e-06, - "loss": 0.85181063, - "num_input_tokens_seen": 67668860, - "step": 3134, - "time_per_iteration": 2.9284298419952393 - }, - { - "auxiliary_loss_clip": 0.01131578, - "auxiliary_loss_mlp": 0.01046681, - "balance_loss_clip": 1.05168402, - "balance_loss_mlp": 1.02802503, - "epoch": 0.18848639711408388, - "flos": 24573600512640.0, - "grad_norm": 2.0376543711114152, - "language_loss": 0.82859468, - "learning_rate": 3.742247238639684e-06, - "loss": 0.85037726, - "num_input_tokens_seen": 67690220, - "step": 3135, - "time_per_iteration": 2.8006811141967773 - }, - { - "auxiliary_loss_clip": 0.01143148, - "auxiliary_loss_mlp": 0.01050197, - "balance_loss_clip": 1.05505157, - "balance_loss_mlp": 1.03146911, - "epoch": 0.18854652036675185, - "flos": 34166920078080.0, - "grad_norm": 1.9728388324049713, - "language_loss": 0.78658557, - "learning_rate": 3.7420559550808083e-06, - "loss": 0.80851901, - "num_input_tokens_seen": 67709820, - "step": 3136, - "time_per_iteration": 4.256143569946289 - }, - { - "auxiliary_loss_clip": 0.01135545, - "auxiliary_loss_mlp": 0.01048618, - "balance_loss_clip": 1.05388892, - "balance_loss_mlp": 1.03006911, - "epoch": 0.1886066436194198, - "flos": 24200236592640.0, - "grad_norm": 1.7483697887361769, - "language_loss": 0.80820233, - "learning_rate": 3.741864605462996e-06, - "loss": 0.83004391, - "num_input_tokens_seen": 67729490, - "step": 3137, - "time_per_iteration": 2.7538130283355713 - }, - { - "auxiliary_loss_clip": 0.01159054, - "auxiliary_loss_mlp": 0.01048373, - "balance_loss_clip": 1.05827475, - "balance_loss_mlp": 1.03107548, - "epoch": 0.18866676687208778, - "flos": 21251109317760.0, - "grad_norm": 1.9799764624272802, - "language_loss": 0.81274408, - "learning_rate": 3.741673189793504e-06, - "loss": 0.83481836, - "num_input_tokens_seen": 67749665, - "step": 3138, - "time_per_iteration": 4.143909931182861 - }, - { - "auxiliary_loss_clip": 0.01150082, - "auxiliary_loss_mlp": 0.01056444, - "balance_loss_clip": 1.05626798, - "balance_loss_mlp": 1.03713167, - "epoch": 0.18872689012475574, - "flos": 37308673013760.0, - "grad_norm": 2.326218248348143, - "language_loss": 0.63655496, - "learning_rate": 3.7414817080795896e-06, - "loss": 0.65862024, - "num_input_tokens_seen": 67776230, - "step": 3139, - "time_per_iteration": 4.30991268157959 - }, - { - "auxiliary_loss_clip": 0.0115289, - "auxiliary_loss_mlp": 0.01043021, - "balance_loss_clip": 1.05286491, - "balance_loss_mlp": 1.02356625, - "epoch": 0.1887870133774237, - "flos": 21652303299840.0, - "grad_norm": 2.1185902638296525, - "language_loss": 0.7148211, - "learning_rate": 3.741290160328514e-06, - "loss": 0.73678017, - "num_input_tokens_seen": 67795080, - "step": 3140, - "time_per_iteration": 2.6880578994750977 - }, - { - "auxiliary_loss_clip": 0.01154738, - "auxiliary_loss_mlp": 0.01043099, - "balance_loss_clip": 1.05349982, - "balance_loss_mlp": 1.02382278, - "epoch": 0.1888471366300917, - "flos": 15924659374080.0, - "grad_norm": 2.6250212982316574, - "language_loss": 0.87069929, - "learning_rate": 3.7410985465475412e-06, - "loss": 0.89267766, - "num_input_tokens_seen": 67813110, - "step": 3141, - "time_per_iteration": 2.6677181720733643 - }, - { - "auxiliary_loss_clip": 0.01130655, - "auxiliary_loss_mlp": 0.01052882, - "balance_loss_clip": 1.0507834, - "balance_loss_mlp": 1.03243756, - "epoch": 0.18890725988275966, - "flos": 18551955767040.0, - "grad_norm": 1.873404502116747, - "language_loss": 0.7744689, - "learning_rate": 3.7409068667439378e-06, - "loss": 0.79630429, - "num_input_tokens_seen": 67831070, - "step": 3142, - "time_per_iteration": 2.63077449798584 - }, - { - "auxiliary_loss_clip": 0.01128192, - "auxiliary_loss_mlp": 0.01038074, - "balance_loss_clip": 1.05298221, - "balance_loss_mlp": 1.02132463, - "epoch": 0.18896738313542763, - "flos": 28840865184000.0, - "grad_norm": 1.6611052928231447, - "language_loss": 0.78867507, - "learning_rate": 3.740715120924971e-06, - "loss": 0.81033778, - "num_input_tokens_seen": 67852170, - "step": 3143, - "time_per_iteration": 4.417406797409058 - }, - { - "auxiliary_loss_clip": 0.0111986, - "auxiliary_loss_mlp": 0.01048019, - "balance_loss_clip": 1.05024099, - "balance_loss_mlp": 1.02821851, - "epoch": 0.1890275063880956, - "flos": 22412747157120.0, - "grad_norm": 2.855732191409361, - "language_loss": 0.71476078, - "learning_rate": 3.740523309097912e-06, - "loss": 0.73643959, - "num_input_tokens_seen": 67869945, - "step": 3144, - "time_per_iteration": 2.8104894161224365 - }, - { - "auxiliary_loss_clip": 0.01125398, - "auxiliary_loss_mlp": 0.01044816, - "balance_loss_clip": 1.05102479, - "balance_loss_mlp": 1.02492023, - "epoch": 0.18908762964076356, - "flos": 24243904552320.0, - "grad_norm": 2.5973078221757144, - "language_loss": 0.73390597, - "learning_rate": 3.7403314312700356e-06, - "loss": 0.75560808, - "num_input_tokens_seen": 67890240, - "step": 3145, - "time_per_iteration": 2.715609312057495 - }, - { - "auxiliary_loss_clip": 0.01110308, - "auxiliary_loss_mlp": 0.01042542, - "balance_loss_clip": 1.04543984, - "balance_loss_mlp": 1.02446938, - "epoch": 0.18914775289343153, - "flos": 16982910892800.0, - "grad_norm": 2.915733862437625, - "language_loss": 0.76263785, - "learning_rate": 3.740139487448616e-06, - "loss": 0.78416634, - "num_input_tokens_seen": 67907825, - "step": 3146, - "time_per_iteration": 2.777221202850342 - }, - { - "auxiliary_loss_clip": 0.01092807, - "auxiliary_loss_mlp": 0.01049336, - "balance_loss_clip": 1.04319823, - "balance_loss_mlp": 1.02829611, - "epoch": 0.1892078761460995, - "flos": 21543781334400.0, - "grad_norm": 1.988128972125699, - "language_loss": 0.7837925, - "learning_rate": 3.7399474776409326e-06, - "loss": 0.80521393, - "num_input_tokens_seen": 67926670, - "step": 3147, - "time_per_iteration": 2.8039205074310303 - }, - { - "auxiliary_loss_clip": 0.01143577, - "auxiliary_loss_mlp": 0.01042953, - "balance_loss_clip": 1.0548687, - "balance_loss_mlp": 1.02454758, - "epoch": 0.18926799939876748, - "flos": 23001538896000.0, - "grad_norm": 3.932544798883504, - "language_loss": 0.67477876, - "learning_rate": 3.739755401854267e-06, - "loss": 0.69664401, - "num_input_tokens_seen": 67943645, - "step": 3148, - "time_per_iteration": 2.7273359298706055 - }, - { - "auxiliary_loss_clip": 0.01112331, - "auxiliary_loss_mlp": 0.01039139, - "balance_loss_clip": 1.04617155, - "balance_loss_mlp": 1.02014899, - "epoch": 0.18932812265143545, - "flos": 22273019251200.0, - "grad_norm": 2.9848849244070315, - "language_loss": 0.76207471, - "learning_rate": 3.739563260095902e-06, - "loss": 0.78358936, - "num_input_tokens_seen": 67962345, - "step": 3149, - "time_per_iteration": 2.8031978607177734 - }, - { - "auxiliary_loss_clip": 0.01130375, - "auxiliary_loss_mlp": 0.01045773, - "balance_loss_clip": 1.05438852, - "balance_loss_mlp": 1.02797484, - "epoch": 0.1893882459041034, - "flos": 18624423456000.0, - "grad_norm": 2.3661599820320136, - "language_loss": 0.80378366, - "learning_rate": 3.7393710523731245e-06, - "loss": 0.82554519, - "num_input_tokens_seen": 67979760, - "step": 3150, - "time_per_iteration": 2.7836129665374756 - }, - { - "auxiliary_loss_clip": 0.01137112, - "auxiliary_loss_mlp": 0.0104876, - "balance_loss_clip": 1.0528239, - "balance_loss_mlp": 1.03019929, - "epoch": 0.18944836915677138, - "flos": 22892981016960.0, - "grad_norm": 2.0711129864945956, - "language_loss": 0.85251844, - "learning_rate": 3.7391787786932215e-06, - "loss": 0.87437713, - "num_input_tokens_seen": 67996895, - "step": 3151, - "time_per_iteration": 2.7782201766967773 - }, - { - "auxiliary_loss_clip": 0.01121267, - "auxiliary_loss_mlp": 0.01046776, - "balance_loss_clip": 1.05223882, - "balance_loss_mlp": 1.02839363, - "epoch": 0.18950849240943934, - "flos": 26796542526720.0, - "grad_norm": 2.1337439707996673, - "language_loss": 0.74114192, - "learning_rate": 3.7389864390634857e-06, - "loss": 0.76282233, - "num_input_tokens_seen": 68018365, - "step": 3152, - "time_per_iteration": 2.8767755031585693 - }, - { - "auxiliary_loss_clip": 0.01120312, - "auxiliary_loss_mlp": 0.0104438, - "balance_loss_clip": 1.05119991, - "balance_loss_mlp": 1.02463925, - "epoch": 0.1895686156621073, - "flos": 24971239048320.0, - "grad_norm": 1.9471461777193173, - "language_loss": 0.75520492, - "learning_rate": 3.738794033491209e-06, - "loss": 0.77685189, - "num_input_tokens_seen": 68037985, - "step": 3153, - "time_per_iteration": 2.7722980976104736 - }, - { - "auxiliary_loss_clip": 0.01158287, - "auxiliary_loss_mlp": 0.01049678, - "balance_loss_clip": 1.0559293, - "balance_loss_mlp": 1.03102183, - "epoch": 0.1896287389147753, - "flos": 21944544353280.0, - "grad_norm": 2.099749434473157, - "language_loss": 0.79984629, - "learning_rate": 3.7386015619836887e-06, - "loss": 0.82192594, - "num_input_tokens_seen": 68057975, - "step": 3154, - "time_per_iteration": 2.6530587673187256 - }, - { - "auxiliary_loss_clip": 0.01117992, - "auxiliary_loss_mlp": 0.01056707, - "balance_loss_clip": 1.04851115, - "balance_loss_mlp": 1.03536844, - "epoch": 0.18968886216744327, - "flos": 18179058723840.0, - "grad_norm": 3.210440214164498, - "language_loss": 0.73046303, - "learning_rate": 3.738409024548223e-06, - "loss": 0.75220996, - "num_input_tokens_seen": 68074175, - "step": 3155, - "time_per_iteration": 2.729832410812378 - }, - { - "auxiliary_loss_clip": 0.01126019, - "auxiliary_loss_mlp": 0.01045659, - "balance_loss_clip": 1.05104291, - "balance_loss_mlp": 1.02626419, - "epoch": 0.18974898542011123, - "flos": 20412487509120.0, - "grad_norm": 1.8299076145086866, - "language_loss": 0.73869717, - "learning_rate": 3.7382164211921136e-06, - "loss": 0.76041389, - "num_input_tokens_seen": 68095230, - "step": 3156, - "time_per_iteration": 2.6747231483459473 - }, - { - "auxiliary_loss_clip": 0.01156549, - "auxiliary_loss_mlp": 0.0104418, - "balance_loss_clip": 1.05489409, - "balance_loss_mlp": 1.02645326, - "epoch": 0.1898091086727792, - "flos": 23985024255360.0, - "grad_norm": 1.9629652277148564, - "language_loss": 0.68053937, - "learning_rate": 3.7380237519226623e-06, - "loss": 0.70254672, - "num_input_tokens_seen": 68113805, - "step": 3157, - "time_per_iteration": 2.7092478275299072 - }, - { - "auxiliary_loss_clip": 0.01114914, - "auxiliary_loss_mlp": 0.01044181, - "balance_loss_clip": 1.04805827, - "balance_loss_mlp": 1.02533436, - "epoch": 0.18986923192544716, - "flos": 27637067756160.0, - "grad_norm": 1.7829025355963362, - "language_loss": 0.79893303, - "learning_rate": 3.737831016747176e-06, - "loss": 0.82052404, - "num_input_tokens_seen": 68133190, - "step": 3158, - "time_per_iteration": 2.7921364307403564 - }, - { - "auxiliary_loss_clip": 0.01163231, - "auxiliary_loss_mlp": 0.01049502, - "balance_loss_clip": 1.05787683, - "balance_loss_mlp": 1.02923679, - "epoch": 0.18992935517811513, - "flos": 25484151306240.0, - "grad_norm": 1.856283461980025, - "language_loss": 0.72348613, - "learning_rate": 3.737638215672964e-06, - "loss": 0.74561346, - "num_input_tokens_seen": 68152330, - "step": 3159, - "time_per_iteration": 2.6111273765563965 - }, - { - "auxiliary_loss_clip": 0.01149613, - "auxiliary_loss_mlp": 0.01053808, - "balance_loss_clip": 1.05840325, - "balance_loss_mlp": 1.03386414, - "epoch": 0.1899894784307831, - "flos": 17420805596160.0, - "grad_norm": 2.2573250756933647, - "language_loss": 0.84977192, - "learning_rate": 3.7374453487073366e-06, - "loss": 0.87180614, - "num_input_tokens_seen": 68170185, - "step": 3160, - "time_per_iteration": 2.659259796142578 - }, - { - "auxiliary_loss_clip": 0.01129342, - "auxiliary_loss_mlp": 0.01049909, - "balance_loss_clip": 1.05297387, - "balance_loss_mlp": 1.03289795, - "epoch": 0.19004960168345109, - "flos": 27492240119040.0, - "grad_norm": 2.752358611011079, - "language_loss": 0.73407793, - "learning_rate": 3.7372524158576074e-06, - "loss": 0.7558704, - "num_input_tokens_seen": 68191665, - "step": 3161, - "time_per_iteration": 2.784040689468384 - }, - { - "auxiliary_loss_clip": 0.01139858, - "auxiliary_loss_mlp": 0.0105519, - "balance_loss_clip": 1.05456805, - "balance_loss_mlp": 1.03476942, - "epoch": 0.19010972493611905, - "flos": 38654676385920.0, - "grad_norm": 1.6629026055958476, - "language_loss": 0.8115741, - "learning_rate": 3.7370594171310926e-06, - "loss": 0.83352458, - "num_input_tokens_seen": 68214635, - "step": 3162, - "time_per_iteration": 2.9375386238098145 - }, - { - "auxiliary_loss_clip": 0.01157449, - "auxiliary_loss_mlp": 0.01040035, - "balance_loss_clip": 1.05625844, - "balance_loss_mlp": 1.02062798, - "epoch": 0.19016984818878702, - "flos": 19244744357760.0, - "grad_norm": 2.448016750033594, - "language_loss": 0.75615001, - "learning_rate": 3.73686635253511e-06, - "loss": 0.77812481, - "num_input_tokens_seen": 68232150, - "step": 3163, - "time_per_iteration": 2.7344541549682617 - }, - { - "auxiliary_loss_clip": 0.0110099, - "auxiliary_loss_mlp": 0.01050093, - "balance_loss_clip": 1.050578, - "balance_loss_mlp": 1.02880192, - "epoch": 0.19022997144145498, - "flos": 37596891744000.0, - "grad_norm": 2.2644227245470514, - "language_loss": 0.74093997, - "learning_rate": 3.736673222076982e-06, - "loss": 0.76245081, - "num_input_tokens_seen": 68253370, - "step": 3164, - "time_per_iteration": 2.9165730476379395 - }, - { - "auxiliary_loss_clip": 0.01141317, - "auxiliary_loss_mlp": 0.01038043, - "balance_loss_clip": 1.05518687, - "balance_loss_mlp": 1.0195303, - "epoch": 0.19029009469412295, - "flos": 61530921665280.0, - "grad_norm": 1.5484522746055986, - "language_loss": 0.66844344, - "learning_rate": 3.7364800257640313e-06, - "loss": 0.69023699, - "num_input_tokens_seen": 68278895, - "step": 3165, - "time_per_iteration": 3.006096124649048 - }, - { - "auxiliary_loss_clip": 0.01146225, - "auxiliary_loss_mlp": 0.0104856, - "balance_loss_clip": 1.05512285, - "balance_loss_mlp": 1.02848506, - "epoch": 0.1903502179467909, - "flos": 13954851480960.0, - "grad_norm": 2.8598536292657144, - "language_loss": 0.74239767, - "learning_rate": 3.7362867636035835e-06, - "loss": 0.76434553, - "num_input_tokens_seen": 68294880, - "step": 3166, - "time_per_iteration": 2.678844928741455 - }, - { - "auxiliary_loss_clip": 0.01050093, - "auxiliary_loss_mlp": 0.01014959, - "balance_loss_clip": 1.04342103, - "balance_loss_mlp": 1.01201403, - "epoch": 0.1904103411994589, - "flos": 66899641916160.0, - "grad_norm": 0.7754190343967906, - "language_loss": 0.50311053, - "learning_rate": 3.736093435602968e-06, - "loss": 0.52376103, - "num_input_tokens_seen": 68359665, - "step": 3167, - "time_per_iteration": 3.277529239654541 - }, - { - "auxiliary_loss_clip": 0.01138483, - "auxiliary_loss_mlp": 0.01051348, - "balance_loss_clip": 1.05485487, - "balance_loss_mlp": 1.03293037, - "epoch": 0.19047046445212687, - "flos": 21908741472000.0, - "grad_norm": 2.3487387451986192, - "language_loss": 0.74504036, - "learning_rate": 3.7359000417695156e-06, - "loss": 0.76693863, - "num_input_tokens_seen": 68378950, - "step": 3168, - "time_per_iteration": 2.690995216369629 - }, - { - "auxiliary_loss_clip": 0.01040165, - "auxiliary_loss_mlp": 0.01023518, - "balance_loss_clip": 1.03869283, - "balance_loss_mlp": 1.02085996, - "epoch": 0.19053058770479483, - "flos": 59255156701440.0, - "grad_norm": 0.8605055473788603, - "language_loss": 0.60079956, - "learning_rate": 3.73570658211056e-06, - "loss": 0.62143636, - "num_input_tokens_seen": 68434235, - "step": 3169, - "time_per_iteration": 3.2108101844787598 - }, - { - "auxiliary_loss_clip": 0.01103792, - "auxiliary_loss_mlp": 0.01056606, - "balance_loss_clip": 1.05267787, - "balance_loss_mlp": 1.03741288, - "epoch": 0.1905907109574628, - "flos": 23951304362880.0, - "grad_norm": 1.5575975614891868, - "language_loss": 0.78179795, - "learning_rate": 3.735513056633436e-06, - "loss": 0.80340189, - "num_input_tokens_seen": 68453830, - "step": 3170, - "time_per_iteration": 2.832043409347534 - }, - { - "auxiliary_loss_clip": 0.01142047, - "auxiliary_loss_mlp": 0.01045041, - "balance_loss_clip": 1.05325115, - "balance_loss_mlp": 1.02605128, - "epoch": 0.19065083421013077, - "flos": 20812316774400.0, - "grad_norm": 1.7671932984988854, - "language_loss": 0.78177166, - "learning_rate": 3.7353194653454834e-06, - "loss": 0.80364257, - "num_input_tokens_seen": 68473005, - "step": 3171, - "time_per_iteration": 2.7823612689971924 - }, - { - "auxiliary_loss_clip": 0.01158227, - "auxiliary_loss_mlp": 0.01047345, - "balance_loss_clip": 1.05499291, - "balance_loss_mlp": 1.0285697, - "epoch": 0.19071095746279873, - "flos": 31284981192960.0, - "grad_norm": 2.1976685633770905, - "language_loss": 0.77953529, - "learning_rate": 3.7351258082540426e-06, - "loss": 0.80159104, - "num_input_tokens_seen": 68493470, - "step": 3172, - "time_per_iteration": 2.746279001235962 - }, - { - "auxiliary_loss_clip": 0.01145112, - "auxiliary_loss_mlp": 0.01055334, - "balance_loss_clip": 1.05438328, - "balance_loss_mlp": 1.03703523, - "epoch": 0.1907710807154667, - "flos": 14356117290240.0, - "grad_norm": 1.5258786569967644, - "language_loss": 0.80223799, - "learning_rate": 3.7349320853664576e-06, - "loss": 0.82424247, - "num_input_tokens_seen": 68511290, - "step": 3173, - "time_per_iteration": 2.7396810054779053 - }, - { - "auxiliary_loss_clip": 0.01113266, - "auxiliary_loss_mlp": 0.00778142, - "balance_loss_clip": 1.04967713, - "balance_loss_mlp": 1.00094676, - "epoch": 0.1908312039681347, - "flos": 26907039740160.0, - "grad_norm": 1.5341307852526682, - "language_loss": 0.78495061, - "learning_rate": 3.7347382966900735e-06, - "loss": 0.80386466, - "num_input_tokens_seen": 68532575, - "step": 3174, - "time_per_iteration": 2.8579304218292236 - }, - { - "auxiliary_loss_clip": 0.01106714, - "auxiliary_loss_mlp": 0.01047557, - "balance_loss_clip": 1.04928994, - "balance_loss_mlp": 1.02838778, - "epoch": 0.19089132722080265, - "flos": 14494695960960.0, - "grad_norm": 1.8075853216546063, - "language_loss": 0.81067109, - "learning_rate": 3.7345444422322395e-06, - "loss": 0.83221382, - "num_input_tokens_seen": 68548760, - "step": 3175, - "time_per_iteration": 2.718254804611206 - }, - { - "auxiliary_loss_clip": 0.01080497, - "auxiliary_loss_mlp": 0.01053652, - "balance_loss_clip": 1.04361629, - "balance_loss_mlp": 1.0342685, - "epoch": 0.19095145047347062, - "flos": 13952876232960.0, - "grad_norm": 2.2545261224105873, - "language_loss": 0.85529047, - "learning_rate": 3.7343505220003067e-06, - "loss": 0.87663192, - "num_input_tokens_seen": 68563100, - "step": 3176, - "time_per_iteration": 4.2962729930877686 - }, - { - "auxiliary_loss_clip": 0.0113361, - "auxiliary_loss_mlp": 0.01059849, - "balance_loss_clip": 1.05418086, - "balance_loss_mlp": 1.03928506, - "epoch": 0.19101157372613858, - "flos": 25301832848640.0, - "grad_norm": 2.0896270593066832, - "language_loss": 0.813025, - "learning_rate": 3.7341565360016285e-06, - "loss": 0.83495957, - "num_input_tokens_seen": 68581650, - "step": 3177, - "time_per_iteration": 2.815127372741699 - }, - { - "auxiliary_loss_clip": 0.01122377, - "auxiliary_loss_mlp": 0.01044946, - "balance_loss_clip": 1.0482533, - "balance_loss_mlp": 1.0265398, - "epoch": 0.19107169697880655, - "flos": 20558212986240.0, - "grad_norm": 2.67963335978105, - "language_loss": 0.7530241, - "learning_rate": 3.73396248424356e-06, - "loss": 0.7746973, - "num_input_tokens_seen": 68600360, - "step": 3178, - "time_per_iteration": 4.351228475570679 - }, - { - "auxiliary_loss_clip": 0.01146729, - "auxiliary_loss_mlp": 0.01042476, - "balance_loss_clip": 1.05574143, - "balance_loss_mlp": 1.02458286, - "epoch": 0.19113182023147451, - "flos": 22163204396160.0, - "grad_norm": 4.753014277211421, - "language_loss": 0.81381619, - "learning_rate": 3.7337683667334606e-06, - "loss": 0.83570826, - "num_input_tokens_seen": 68617885, - "step": 3179, - "time_per_iteration": 4.259284019470215 - }, - { - "auxiliary_loss_clip": 0.01147837, - "auxiliary_loss_mlp": 0.01048144, - "balance_loss_clip": 1.05645823, - "balance_loss_mlp": 1.0291661, - "epoch": 0.19119194348414248, - "flos": 18581796990720.0, - "grad_norm": 2.753081884541086, - "language_loss": 0.79384613, - "learning_rate": 3.733574183478691e-06, - "loss": 0.81580591, - "num_input_tokens_seen": 68634550, - "step": 3180, - "time_per_iteration": 2.6609203815460205 - }, - { - "auxiliary_loss_clip": 0.01129361, - "auxiliary_loss_mlp": 0.0105402, - "balance_loss_clip": 1.05249727, - "balance_loss_mlp": 1.03445804, - "epoch": 0.19125206673681047, - "flos": 19026623018880.0, - "grad_norm": 2.660238694189741, - "language_loss": 0.79517245, - "learning_rate": 3.733379934486615e-06, - "loss": 0.81700623, - "num_input_tokens_seen": 68651895, - "step": 3181, - "time_per_iteration": 2.6877176761627197 - }, - { - "auxiliary_loss_clip": 0.0114301, - "auxiliary_loss_mlp": 0.01053621, - "balance_loss_clip": 1.05339336, - "balance_loss_mlp": 1.03527462, - "epoch": 0.19131218998947844, - "flos": 21690153256320.0, - "grad_norm": 2.2179888965480243, - "language_loss": 0.74570775, - "learning_rate": 3.7331856197645973e-06, - "loss": 0.76767409, - "num_input_tokens_seen": 68671500, - "step": 3182, - "time_per_iteration": 4.2829508781433105 - }, - { - "auxiliary_loss_clip": 0.01128679, - "auxiliary_loss_mlp": 0.01044063, - "balance_loss_clip": 1.05578041, - "balance_loss_mlp": 1.02575254, - "epoch": 0.1913723132421464, - "flos": 18442500048000.0, - "grad_norm": 1.7534728284311585, - "language_loss": 0.64618582, - "learning_rate": 3.7329912393200084e-06, - "loss": 0.66791326, - "num_input_tokens_seen": 68690570, - "step": 3183, - "time_per_iteration": 2.7652854919433594 - }, - { - "auxiliary_loss_clip": 0.01132257, - "auxiliary_loss_mlp": 0.01050867, - "balance_loss_clip": 1.0512805, - "balance_loss_mlp": 1.0311259, - "epoch": 0.19143243649481437, - "flos": 27160102033920.0, - "grad_norm": 1.555926798692704, - "language_loss": 0.73459226, - "learning_rate": 3.7327967931602173e-06, - "loss": 0.75642347, - "num_input_tokens_seen": 68709735, - "step": 3184, - "time_per_iteration": 2.6929056644439697 - }, - { - "auxiliary_loss_clip": 0.01122578, - "auxiliary_loss_mlp": 0.01054123, - "balance_loss_clip": 1.05015373, - "balance_loss_mlp": 1.03347623, - "epoch": 0.19149255974748233, - "flos": 21718952985600.0, - "grad_norm": 2.0989643169058514, - "language_loss": 0.87983418, - "learning_rate": 3.732602281292598e-06, - "loss": 0.9016012, - "num_input_tokens_seen": 68727565, - "step": 3185, - "time_per_iteration": 2.6859230995178223 - }, - { - "auxiliary_loss_clip": 0.01153787, - "auxiliary_loss_mlp": 0.01044436, - "balance_loss_clip": 1.05334914, - "balance_loss_mlp": 1.02505302, - "epoch": 0.1915526830001503, - "flos": 22963293889920.0, - "grad_norm": 2.4520480945942587, - "language_loss": 0.73240852, - "learning_rate": 3.7324077037245267e-06, - "loss": 0.75439072, - "num_input_tokens_seen": 68748110, - "step": 3186, - "time_per_iteration": 2.6398978233337402 - }, - { - "auxiliary_loss_clip": 0.01132874, - "auxiliary_loss_mlp": 0.01044989, - "balance_loss_clip": 1.05609488, - "balance_loss_mlp": 1.02379346, - "epoch": 0.1916128062528183, - "flos": 26140741966080.0, - "grad_norm": 2.739457234253781, - "language_loss": 0.83550584, - "learning_rate": 3.7322130604633825e-06, - "loss": 0.85728443, - "num_input_tokens_seen": 68769765, - "step": 3187, - "time_per_iteration": 2.7476372718811035 - }, - { - "auxiliary_loss_clip": 0.01076264, - "auxiliary_loss_mlp": 0.01021317, - "balance_loss_clip": 1.04604995, - "balance_loss_mlp": 1.01892138, - "epoch": 0.19167292950548626, - "flos": 54925767457920.0, - "grad_norm": 0.8659386797819415, - "language_loss": 0.55824959, - "learning_rate": 3.732018351516544e-06, - "loss": 0.57922542, - "num_input_tokens_seen": 68826815, - "step": 3188, - "time_per_iteration": 3.2144031524658203 - }, - { - "auxiliary_loss_clip": 0.01139007, - "auxiliary_loss_mlp": 0.01054399, - "balance_loss_clip": 1.054564, - "balance_loss_mlp": 1.03537333, - "epoch": 0.19173305275815422, - "flos": 29935601942400.0, - "grad_norm": 2.2897904709915573, - "language_loss": 0.69839454, - "learning_rate": 3.731823576891397e-06, - "loss": 0.72032857, - "num_input_tokens_seen": 68847585, - "step": 3189, - "time_per_iteration": 2.7998950481414795 - }, - { - "auxiliary_loss_clip": 0.01118438, - "auxiliary_loss_mlp": 0.01038566, - "balance_loss_clip": 1.04930174, - "balance_loss_mlp": 1.02116132, - "epoch": 0.1917931760108222, - "flos": 24752471264640.0, - "grad_norm": 2.362312815249866, - "language_loss": 0.74320328, - "learning_rate": 3.7316287365953266e-06, - "loss": 0.76477331, - "num_input_tokens_seen": 68866620, - "step": 3190, - "time_per_iteration": 2.7386670112609863 - }, - { - "auxiliary_loss_clip": 0.01111071, - "auxiliary_loss_mlp": 0.0106718, - "balance_loss_clip": 1.04946983, - "balance_loss_mlp": 1.04702199, - "epoch": 0.19185329926349015, - "flos": 18843550375680.0, - "grad_norm": 3.545467698458187, - "language_loss": 0.8444041, - "learning_rate": 3.73143383063572e-06, - "loss": 0.8661865, - "num_input_tokens_seen": 68885515, - "step": 3191, - "time_per_iteration": 2.7025794982910156 - }, - { - "auxiliary_loss_clip": 0.01127894, - "auxiliary_loss_mlp": 0.01039849, - "balance_loss_clip": 1.05251908, - "balance_loss_mlp": 1.02231336, - "epoch": 0.19191342251615812, - "flos": 22086858038400.0, - "grad_norm": 2.0663841109071526, - "language_loss": 0.89985192, - "learning_rate": 3.73123885901997e-06, - "loss": 0.92152941, - "num_input_tokens_seen": 68903225, - "step": 3192, - "time_per_iteration": 2.802852153778076 - }, - { - "auxiliary_loss_clip": 0.01130336, - "auxiliary_loss_mlp": 0.01054766, - "balance_loss_clip": 1.05716372, - "balance_loss_mlp": 1.03509688, - "epoch": 0.19197354576882608, - "flos": 22199115018240.0, - "grad_norm": 2.3467564445058775, - "language_loss": 0.75159264, - "learning_rate": 3.7310438217554687e-06, - "loss": 0.77344358, - "num_input_tokens_seen": 68922860, - "step": 3193, - "time_per_iteration": 2.7680914402008057 - }, - { - "auxiliary_loss_clip": 0.01128303, - "auxiliary_loss_mlp": 0.00777332, - "balance_loss_clip": 1.05222785, - "balance_loss_mlp": 1.00071752, - "epoch": 0.19203366902149407, - "flos": 24896185580160.0, - "grad_norm": 2.078743387775855, - "language_loss": 0.75189757, - "learning_rate": 3.730848718849612e-06, - "loss": 0.77095383, - "num_input_tokens_seen": 68943000, - "step": 3194, - "time_per_iteration": 2.7537553310394287 - }, - { - "auxiliary_loss_clip": 0.01068142, - "auxiliary_loss_mlp": 0.01004387, - "balance_loss_clip": 1.03910232, - "balance_loss_mlp": 1.00182378, - "epoch": 0.19209379227416204, - "flos": 68416722789120.0, - "grad_norm": 0.7955224937316553, - "language_loss": 0.68507159, - "learning_rate": 3.7306535503097985e-06, - "loss": 0.70579696, - "num_input_tokens_seen": 69000255, - "step": 3195, - "time_per_iteration": 3.117191791534424 - }, - { - "auxiliary_loss_clip": 0.01116081, - "auxiliary_loss_mlp": 0.01052392, - "balance_loss_clip": 1.05205238, - "balance_loss_mlp": 1.0320189, - "epoch": 0.19215391552683, - "flos": 22055185221120.0, - "grad_norm": 2.6559439291645757, - "language_loss": 0.73141015, - "learning_rate": 3.730458316143429e-06, - "loss": 0.75309479, - "num_input_tokens_seen": 69019665, - "step": 3196, - "time_per_iteration": 2.7234303951263428 - }, - { - "auxiliary_loss_clip": 0.01139018, - "auxiliary_loss_mlp": 0.01044947, - "balance_loss_clip": 1.06151462, - "balance_loss_mlp": 1.02596927, - "epoch": 0.19221403877949797, - "flos": 20302959962880.0, - "grad_norm": 3.0997718824135734, - "language_loss": 0.83654135, - "learning_rate": 3.7302630163579068e-06, - "loss": 0.85838103, - "num_input_tokens_seen": 69039055, - "step": 3197, - "time_per_iteration": 2.72575306892395 - }, - { - "auxiliary_loss_clip": 0.01086216, - "auxiliary_loss_mlp": 0.01055059, - "balance_loss_clip": 1.04615641, - "balance_loss_mlp": 1.03320754, - "epoch": 0.19227416203216594, - "flos": 23185329811200.0, - "grad_norm": 2.2465298420006383, - "language_loss": 0.80656433, - "learning_rate": 3.7300676509606373e-06, - "loss": 0.82797706, - "num_input_tokens_seen": 69056370, - "step": 3198, - "time_per_iteration": 2.741678237915039 - }, - { - "auxiliary_loss_clip": 0.01135487, - "auxiliary_loss_mlp": 0.01056572, - "balance_loss_clip": 1.05502987, - "balance_loss_mlp": 1.03655636, - "epoch": 0.1923342852848339, - "flos": 25776607841280.0, - "grad_norm": 1.9205907836873994, - "language_loss": 0.78993976, - "learning_rate": 3.729872219959029e-06, - "loss": 0.81186032, - "num_input_tokens_seen": 69075915, - "step": 3199, - "time_per_iteration": 2.7821297645568848 - }, - { - "auxiliary_loss_clip": 0.01116808, - "auxiliary_loss_mlp": 0.01056964, - "balance_loss_clip": 1.05010581, - "balance_loss_mlp": 1.036412, - "epoch": 0.19239440853750187, - "flos": 17128349061120.0, - "grad_norm": 3.662083840248298, - "language_loss": 0.83574522, - "learning_rate": 3.7296767233604934e-06, - "loss": 0.85748297, - "num_input_tokens_seen": 69094145, - "step": 3200, - "time_per_iteration": 2.7095022201538086 - }, - { - "auxiliary_loss_clip": 0.01159025, - "auxiliary_loss_mlp": 0.01048823, - "balance_loss_clip": 1.05997193, - "balance_loss_mlp": 1.03060746, - "epoch": 0.19245453179016986, - "flos": 16435093593600.0, - "grad_norm": 1.9278966392289572, - "language_loss": 0.79092836, - "learning_rate": 3.729481161172443e-06, - "loss": 0.81300688, - "num_input_tokens_seen": 69111110, - "step": 3201, - "time_per_iteration": 2.684979200363159 - }, - { - "auxiliary_loss_clip": 0.01103349, - "auxiliary_loss_mlp": 0.01053366, - "balance_loss_clip": 1.04825675, - "balance_loss_mlp": 1.03418541, - "epoch": 0.19251465504283782, - "flos": 20230276792320.0, - "grad_norm": 2.4062417134527645, - "language_loss": 0.69276404, - "learning_rate": 3.7292855334022927e-06, - "loss": 0.71433127, - "num_input_tokens_seen": 69130280, - "step": 3202, - "time_per_iteration": 2.8284943103790283 - }, - { - "auxiliary_loss_clip": 0.01132334, - "auxiliary_loss_mlp": 0.01041011, - "balance_loss_clip": 1.05389905, - "balance_loss_mlp": 1.02256894, - "epoch": 0.1925747782955058, - "flos": 19464374067840.0, - "grad_norm": 1.9491265782204168, - "language_loss": 0.91396749, - "learning_rate": 3.7290898400574627e-06, - "loss": 0.93570089, - "num_input_tokens_seen": 69149570, - "step": 3203, - "time_per_iteration": 2.802433729171753 - }, - { - "auxiliary_loss_clip": 0.0114953, - "auxiliary_loss_mlp": 0.01049732, - "balance_loss_clip": 1.05674863, - "balance_loss_mlp": 1.02959776, - "epoch": 0.19263490154817375, - "flos": 17785586165760.0, - "grad_norm": 5.05881669068558, - "language_loss": 0.81689429, - "learning_rate": 3.7288940811453725e-06, - "loss": 0.83888692, - "num_input_tokens_seen": 69168190, - "step": 3204, - "time_per_iteration": 2.671285629272461 - }, - { - "auxiliary_loss_clip": 0.01116988, - "auxiliary_loss_mlp": 0.01048941, - "balance_loss_clip": 1.04950142, - "balance_loss_mlp": 1.0298202, - "epoch": 0.19269502480084172, - "flos": 17457075354240.0, - "grad_norm": 2.296941025186916, - "language_loss": 0.76167846, - "learning_rate": 3.7286982566734454e-06, - "loss": 0.78333771, - "num_input_tokens_seen": 69186950, - "step": 3205, - "time_per_iteration": 2.8654470443725586 - }, - { - "auxiliary_loss_clip": 0.01140852, - "auxiliary_loss_mlp": 0.01046651, - "balance_loss_clip": 1.05839586, - "balance_loss_mlp": 1.02749407, - "epoch": 0.19275514805350968, - "flos": 21506901045120.0, - "grad_norm": 3.761768843322395, - "language_loss": 0.83394569, - "learning_rate": 3.728502366649107e-06, - "loss": 0.85582072, - "num_input_tokens_seen": 69204850, - "step": 3206, - "time_per_iteration": 2.8610613346099854 - }, - { - "auxiliary_loss_clip": 0.0105715, - "auxiliary_loss_mlp": 0.01004055, - "balance_loss_clip": 1.03779244, - "balance_loss_mlp": 1.00174224, - "epoch": 0.19281527130617768, - "flos": 47695979738880.0, - "grad_norm": 0.8644529519848262, - "language_loss": 0.60561717, - "learning_rate": 3.728306411079786e-06, - "loss": 0.62622917, - "num_input_tokens_seen": 69259200, - "step": 3207, - "time_per_iteration": 3.126537322998047 - }, - { - "auxiliary_loss_clip": 0.01120285, - "auxiliary_loss_mlp": 0.01045527, - "balance_loss_clip": 1.05201781, - "balance_loss_mlp": 1.02678764, - "epoch": 0.19287539455884564, - "flos": 11801252672640.0, - "grad_norm": 2.296187182186814, - "language_loss": 0.75463599, - "learning_rate": 3.7281103899729125e-06, - "loss": 0.77629405, - "num_input_tokens_seen": 69275835, - "step": 3208, - "time_per_iteration": 2.6978750228881836 - }, - { - "auxiliary_loss_clip": 0.01150534, - "auxiliary_loss_mlp": 0.00777875, - "balance_loss_clip": 1.05520236, - "balance_loss_mlp": 1.00063884, - "epoch": 0.1929355178115136, - "flos": 20631434860800.0, - "grad_norm": 1.9483983315924505, - "language_loss": 0.60869855, - "learning_rate": 3.7279143033359195e-06, - "loss": 0.62798262, - "num_input_tokens_seen": 69294810, - "step": 3209, - "time_per_iteration": 2.699798107147217 - }, - { - "auxiliary_loss_clip": 0.01158758, - "auxiliary_loss_mlp": 0.01053815, - "balance_loss_clip": 1.05472994, - "balance_loss_mlp": 1.03261995, - "epoch": 0.19299564106418157, - "flos": 40807916058240.0, - "grad_norm": 1.9992177661428934, - "language_loss": 0.80025005, - "learning_rate": 3.727718151176243e-06, - "loss": 0.82237577, - "num_input_tokens_seen": 69316065, - "step": 3210, - "time_per_iteration": 2.832665205001831 - }, - { - "auxiliary_loss_clip": 0.01118997, - "auxiliary_loss_mlp": 0.01047494, - "balance_loss_clip": 1.05044246, - "balance_loss_mlp": 1.02920699, - "epoch": 0.19305576431684954, - "flos": 11361418634880.0, - "grad_norm": 2.515510367397107, - "language_loss": 0.82571948, - "learning_rate": 3.7275219335013217e-06, - "loss": 0.84738445, - "num_input_tokens_seen": 69332900, - "step": 3211, - "time_per_iteration": 2.7664191722869873 - }, - { - "auxiliary_loss_clip": 0.01073663, - "auxiliary_loss_mlp": 0.01002544, - "balance_loss_clip": 1.03501034, - "balance_loss_mlp": 1.00021982, - "epoch": 0.1931158875695175, - "flos": 54511895975040.0, - "grad_norm": 0.9633495631759209, - "language_loss": 0.63641912, - "learning_rate": 3.7273256503185953e-06, - "loss": 0.6571812, - "num_input_tokens_seen": 69382535, - "step": 3212, - "time_per_iteration": 2.974940299987793 - }, - { - "auxiliary_loss_clip": 0.01131742, - "auxiliary_loss_mlp": 0.01044059, - "balance_loss_clip": 1.05586314, - "balance_loss_mlp": 1.02565336, - "epoch": 0.19317601082218547, - "flos": 19828436365440.0, - "grad_norm": 1.7209148950717332, - "language_loss": 0.76375663, - "learning_rate": 3.7271293016355074e-06, - "loss": 0.78551459, - "num_input_tokens_seen": 69400600, - "step": 3213, - "time_per_iteration": 2.7898454666137695 - }, - { - "auxiliary_loss_clip": 0.01123196, - "auxiliary_loss_mlp": 0.0105066, - "balance_loss_clip": 1.05261111, - "balance_loss_mlp": 1.03116894, - "epoch": 0.19323613407485346, - "flos": 13152068467200.0, - "grad_norm": 2.349758973823363, - "language_loss": 0.70871878, - "learning_rate": 3.726932887459503e-06, - "loss": 0.73045731, - "num_input_tokens_seen": 69417350, - "step": 3214, - "time_per_iteration": 2.8155152797698975 - }, - { - "auxiliary_loss_clip": 0.01155585, - "auxiliary_loss_mlp": 0.01047831, - "balance_loss_clip": 1.05412841, - "balance_loss_mlp": 1.02807808, - "epoch": 0.19329625732752143, - "flos": 14027247342720.0, - "grad_norm": 2.190607045917922, - "language_loss": 0.75067955, - "learning_rate": 3.72673640779803e-06, - "loss": 0.77271378, - "num_input_tokens_seen": 69431845, - "step": 3215, - "time_per_iteration": 4.111938238143921 - }, - { - "auxiliary_loss_clip": 0.01112217, - "auxiliary_loss_mlp": 0.01049964, - "balance_loss_clip": 1.04928339, - "balance_loss_mlp": 1.0323447, - "epoch": 0.1933563805801894, - "flos": 23441732069760.0, - "grad_norm": 1.7842520268521305, - "language_loss": 0.88426638, - "learning_rate": 3.72653986265854e-06, - "loss": 0.9058882, - "num_input_tokens_seen": 69453275, - "step": 3216, - "time_per_iteration": 2.7699615955352783 - }, - { - "auxiliary_loss_clip": 0.01153806, - "auxiliary_loss_mlp": 0.01052131, - "balance_loss_clip": 1.05435801, - "balance_loss_mlp": 1.03442836, - "epoch": 0.19341650383285736, - "flos": 20485314334080.0, - "grad_norm": 1.6996051239972392, - "language_loss": 0.7974773, - "learning_rate": 3.726343252048485e-06, - "loss": 0.81953669, - "num_input_tokens_seen": 69471830, - "step": 3217, - "time_per_iteration": 2.6788718700408936 - }, - { - "auxiliary_loss_clip": 0.01143281, - "auxiliary_loss_mlp": 0.0104914, - "balance_loss_clip": 1.05695105, - "balance_loss_mlp": 1.02864754, - "epoch": 0.19347662708552532, - "flos": 17858484817920.0, - "grad_norm": 4.708784796317305, - "language_loss": 0.6161437, - "learning_rate": 3.7261465759753206e-06, - "loss": 0.6380679, - "num_input_tokens_seen": 69489320, - "step": 3218, - "time_per_iteration": 4.352849960327148 - }, - { - "auxiliary_loss_clip": 0.01157355, - "auxiliary_loss_mlp": 0.01047211, - "balance_loss_clip": 1.05723107, - "balance_loss_mlp": 1.02873373, - "epoch": 0.1935367503381933, - "flos": 18187247024640.0, - "grad_norm": 1.9724785552136583, - "language_loss": 0.80345452, - "learning_rate": 3.7259498344465053e-06, - "loss": 0.82550013, - "num_input_tokens_seen": 69506665, - "step": 3219, - "time_per_iteration": 4.1739161014556885 - }, - { - "auxiliary_loss_clip": 0.01104687, - "auxiliary_loss_mlp": 0.01047672, - "balance_loss_clip": 1.05145359, - "balance_loss_mlp": 1.02819324, - "epoch": 0.19359687359086128, - "flos": 15957122290560.0, - "grad_norm": 2.7508533279024077, - "language_loss": 0.85693008, - "learning_rate": 3.7257530274694993e-06, - "loss": 0.87845367, - "num_input_tokens_seen": 69523835, - "step": 3220, - "time_per_iteration": 2.777284622192383 - }, - { - "auxiliary_loss_clip": 0.01149581, - "auxiliary_loss_mlp": 0.01041747, - "balance_loss_clip": 1.05441856, - "balance_loss_mlp": 1.02511764, - "epoch": 0.19365699684352924, - "flos": 21215198695680.0, - "grad_norm": 2.05545450883527, - "language_loss": 0.84637755, - "learning_rate": 3.725556155051766e-06, - "loss": 0.86829084, - "num_input_tokens_seen": 69542620, - "step": 3221, - "time_per_iteration": 4.224115371704102 - }, - { - "auxiliary_loss_clip": 0.01143661, - "auxiliary_loss_mlp": 0.01044558, - "balance_loss_clip": 1.05466259, - "balance_loss_mlp": 1.02730846, - "epoch": 0.1937171200961972, - "flos": 17311098481920.0, - "grad_norm": 2.658004231066563, - "language_loss": 0.86087942, - "learning_rate": 3.7253592172007702e-06, - "loss": 0.8827616, - "num_input_tokens_seen": 69561130, - "step": 3222, - "time_per_iteration": 2.6400530338287354 - }, - { - "auxiliary_loss_clip": 0.01069453, - "auxiliary_loss_mlp": 0.01045281, - "balance_loss_clip": 1.04206085, - "balance_loss_mlp": 1.02599275, - "epoch": 0.19377724334886517, - "flos": 22635968227200.0, - "grad_norm": 1.8604116943694204, - "language_loss": 0.78510809, - "learning_rate": 3.72516221392398e-06, - "loss": 0.8062554, - "num_input_tokens_seen": 69580425, - "step": 3223, - "time_per_iteration": 2.9685652256011963 - }, - { - "auxiliary_loss_clip": 0.01146062, - "auxiliary_loss_mlp": 0.01046815, - "balance_loss_clip": 1.05697751, - "balance_loss_mlp": 1.02819431, - "epoch": 0.19383736660153314, - "flos": 15077813351040.0, - "grad_norm": 1.8958208586464897, - "language_loss": 0.75391948, - "learning_rate": 3.7249651452288653e-06, - "loss": 0.77584827, - "num_input_tokens_seen": 69597085, - "step": 3224, - "time_per_iteration": 2.665294885635376 - }, - { - "auxiliary_loss_clip": 0.01102293, - "auxiliary_loss_mlp": 0.01050181, - "balance_loss_clip": 1.04728186, - "balance_loss_mlp": 1.02927208, - "epoch": 0.1938974898542011, - "flos": 47119934350080.0, - "grad_norm": 3.358076005999295, - "language_loss": 0.71180636, - "learning_rate": 3.7247680111229e-06, - "loss": 0.73333108, - "num_input_tokens_seen": 69618885, - "step": 3225, - "time_per_iteration": 2.997511863708496 - }, - { - "auxiliary_loss_clip": 0.0112035, - "auxiliary_loss_mlp": 0.01053167, - "balance_loss_clip": 1.0519309, - "balance_loss_mlp": 1.03480864, - "epoch": 0.19395761310686907, - "flos": 25812554376960.0, - "grad_norm": 2.42331686427639, - "language_loss": 0.69379079, - "learning_rate": 3.7245708116135585e-06, - "loss": 0.71552593, - "num_input_tokens_seen": 69638200, - "step": 3226, - "time_per_iteration": 2.746338129043579 - }, - { - "auxiliary_loss_clip": 0.01126783, - "auxiliary_loss_mlp": 0.01042276, - "balance_loss_clip": 1.05692983, - "balance_loss_mlp": 1.02264214, - "epoch": 0.19401773635953706, - "flos": 23039604334080.0, - "grad_norm": 2.1006513764454864, - "language_loss": 0.76236808, - "learning_rate": 3.7243735467083193e-06, - "loss": 0.78405869, - "num_input_tokens_seen": 69657550, - "step": 3227, - "time_per_iteration": 2.760087728500366 - }, - { - "auxiliary_loss_clip": 0.01117794, - "auxiliary_loss_mlp": 0.010438, - "balance_loss_clip": 1.05304587, - "balance_loss_mlp": 1.0256561, - "epoch": 0.19407785961220503, - "flos": 15920780705280.0, - "grad_norm": 2.8268368707906397, - "language_loss": 0.69577461, - "learning_rate": 3.724176216414662e-06, - "loss": 0.71739054, - "num_input_tokens_seen": 69675005, - "step": 3228, - "time_per_iteration": 2.6779348850250244 - }, - { - "auxiliary_loss_clip": 0.01148199, - "auxiliary_loss_mlp": 0.01042315, - "balance_loss_clip": 1.05775642, - "balance_loss_mlp": 1.02445757, - "epoch": 0.194137982864873, - "flos": 25921722787200.0, - "grad_norm": 1.7694943420266864, - "language_loss": 0.74160898, - "learning_rate": 3.72397882074007e-06, - "loss": 0.76351416, - "num_input_tokens_seen": 69696455, - "step": 3229, - "time_per_iteration": 2.7229623794555664 - }, - { - "auxiliary_loss_clip": 0.01119678, - "auxiliary_loss_mlp": 0.01044155, - "balance_loss_clip": 1.05435359, - "balance_loss_mlp": 1.0262022, - "epoch": 0.19419810611754096, - "flos": 13261344618240.0, - "grad_norm": 1.9766126324167548, - "language_loss": 0.65722096, - "learning_rate": 3.7237813596920285e-06, - "loss": 0.67885935, - "num_input_tokens_seen": 69714245, - "step": 3230, - "time_per_iteration": 2.740324020385742 - }, - { - "auxiliary_loss_clip": 0.01124671, - "auxiliary_loss_mlp": 0.00776003, - "balance_loss_clip": 1.05223823, - "balance_loss_mlp": 1.00081468, - "epoch": 0.19425822937020892, - "flos": 15705568368000.0, - "grad_norm": 1.9307338208311895, - "language_loss": 0.82042694, - "learning_rate": 3.7235838332780254e-06, - "loss": 0.83943367, - "num_input_tokens_seen": 69731515, - "step": 3231, - "time_per_iteration": 2.7453513145446777 - }, - { - "auxiliary_loss_clip": 0.0113141, - "auxiliary_loss_mlp": 0.01042332, - "balance_loss_clip": 1.05393946, - "balance_loss_mlp": 1.02220988, - "epoch": 0.1943183526228769, - "flos": 23105392093440.0, - "grad_norm": 10.866686758212083, - "language_loss": 0.87038374, - "learning_rate": 3.72338624150555e-06, - "loss": 0.89212114, - "num_input_tokens_seen": 69748885, - "step": 3232, - "time_per_iteration": 2.7575178146362305 - }, - { - "auxiliary_loss_clip": 0.01100451, - "auxiliary_loss_mlp": 0.01050878, - "balance_loss_clip": 1.05029583, - "balance_loss_mlp": 1.03102958, - "epoch": 0.19437847587554485, - "flos": 24712610146560.0, - "grad_norm": 2.531838729905544, - "language_loss": 0.85189134, - "learning_rate": 3.723188584382096e-06, - "loss": 0.87340462, - "num_input_tokens_seen": 69767540, - "step": 3233, - "time_per_iteration": 2.8617444038391113 - }, - { - "auxiliary_loss_clip": 0.01149478, - "auxiliary_loss_mlp": 0.01054519, - "balance_loss_clip": 1.0574832, - "balance_loss_mlp": 1.0357672, - "epoch": 0.19443859912821285, - "flos": 23116130259840.0, - "grad_norm": 1.7408859410354203, - "language_loss": 0.89099532, - "learning_rate": 3.722990861915158e-06, - "loss": 0.91303527, - "num_input_tokens_seen": 69789340, - "step": 3234, - "time_per_iteration": 2.7648239135742188 - }, - { - "auxiliary_loss_clip": 0.01135157, - "auxiliary_loss_mlp": 0.01044708, - "balance_loss_clip": 1.05003643, - "balance_loss_mlp": 1.02544403, - "epoch": 0.1944987223808808, - "flos": 15084385539840.0, - "grad_norm": 2.4074482975555926, - "language_loss": 0.78673434, - "learning_rate": 3.722793074112234e-06, - "loss": 0.80853301, - "num_input_tokens_seen": 69806470, - "step": 3235, - "time_per_iteration": 2.76930832862854 - }, - { - "auxiliary_loss_clip": 0.01136497, - "auxiliary_loss_mlp": 0.01046749, - "balance_loss_clip": 1.0580672, - "balance_loss_mlp": 1.0293448, - "epoch": 0.19455884563354878, - "flos": 17126876603520.0, - "grad_norm": 2.2511193258734354, - "language_loss": 0.79391634, - "learning_rate": 3.7225952209808233e-06, - "loss": 0.81574875, - "num_input_tokens_seen": 69822655, - "step": 3236, - "time_per_iteration": 2.7060179710388184 - }, - { - "auxiliary_loss_clip": 0.01156991, - "auxiliary_loss_mlp": 0.01044638, - "balance_loss_clip": 1.05862045, - "balance_loss_mlp": 1.02482522, - "epoch": 0.19461896888621674, - "flos": 20193396503040.0, - "grad_norm": 2.1553329609131713, - "language_loss": 0.76224017, - "learning_rate": 3.72239730252843e-06, - "loss": 0.78425646, - "num_input_tokens_seen": 69841895, - "step": 3237, - "time_per_iteration": 2.642235040664673 - }, - { - "auxiliary_loss_clip": 0.01158804, - "auxiliary_loss_mlp": 0.01051059, - "balance_loss_clip": 1.05648041, - "balance_loss_mlp": 1.03289127, - "epoch": 0.1946790921388847, - "flos": 25301365971840.0, - "grad_norm": 1.5204653275468003, - "language_loss": 0.74828202, - "learning_rate": 3.7221993187625583e-06, - "loss": 0.77038062, - "num_input_tokens_seen": 69862220, - "step": 3238, - "time_per_iteration": 2.6618688106536865 - }, - { - "auxiliary_loss_clip": 0.01108331, - "auxiliary_loss_mlp": 0.01046572, - "balance_loss_clip": 1.04992437, - "balance_loss_mlp": 1.02791595, - "epoch": 0.19473921539155267, - "flos": 20193396503040.0, - "grad_norm": 3.1324225641798518, - "language_loss": 0.734164, - "learning_rate": 3.7220012696907155e-06, - "loss": 0.75571299, - "num_input_tokens_seen": 69881830, - "step": 3239, - "time_per_iteration": 2.7637152671813965 - }, - { - "auxiliary_loss_clip": 0.01132567, - "auxiliary_loss_mlp": 0.01047988, - "balance_loss_clip": 1.05458641, - "balance_loss_mlp": 1.02947509, - "epoch": 0.19479933864422067, - "flos": 20887549810560.0, - "grad_norm": 2.155392951393246, - "language_loss": 0.73291272, - "learning_rate": 3.721803155320412e-06, - "loss": 0.7547183, - "num_input_tokens_seen": 69900515, - "step": 3240, - "time_per_iteration": 2.6980888843536377 - }, - { - "auxiliary_loss_clip": 0.01131601, - "auxiliary_loss_mlp": 0.0103943, - "balance_loss_clip": 1.05846488, - "balance_loss_mlp": 1.02208555, - "epoch": 0.19485946189688863, - "flos": 23295072839040.0, - "grad_norm": 5.847648280625993, - "language_loss": 0.65809447, - "learning_rate": 3.7216049756591606e-06, - "loss": 0.6798048, - "num_input_tokens_seen": 69920060, - "step": 3241, - "time_per_iteration": 2.659707546234131 - }, - { - "auxiliary_loss_clip": 0.01128971, - "auxiliary_loss_mlp": 0.01048707, - "balance_loss_clip": 1.05226684, - "balance_loss_mlp": 1.03039646, - "epoch": 0.1949195851495566, - "flos": 23295036925440.0, - "grad_norm": 1.4408225707306088, - "language_loss": 0.82747853, - "learning_rate": 3.7214067307144754e-06, - "loss": 0.84925532, - "num_input_tokens_seen": 69939820, - "step": 3242, - "time_per_iteration": 2.7137632369995117 - }, - { - "auxiliary_loss_clip": 0.01077632, - "auxiliary_loss_mlp": 0.01014225, - "balance_loss_clip": 1.04083347, - "balance_loss_mlp": 1.01131678, - "epoch": 0.19497970840222456, - "flos": 64962871557120.0, - "grad_norm": 0.853263603243422, - "language_loss": 0.57500821, - "learning_rate": 3.721208420493875e-06, - "loss": 0.59592682, - "num_input_tokens_seen": 70002145, - "step": 3243, - "time_per_iteration": 3.1446309089660645 - }, - { - "auxiliary_loss_clip": 0.01138548, - "auxiliary_loss_mlp": 0.01050428, - "balance_loss_clip": 1.05331421, - "balance_loss_mlp": 1.02988815, - "epoch": 0.19503983165489253, - "flos": 19644717277440.0, - "grad_norm": 7.2345723863132, - "language_loss": 0.83789021, - "learning_rate": 3.7210100450048784e-06, - "loss": 0.85977995, - "num_input_tokens_seen": 70020510, - "step": 3244, - "time_per_iteration": 2.6194229125976562 - }, - { - "auxiliary_loss_clip": 0.01143261, - "auxiliary_loss_mlp": 0.01046223, - "balance_loss_clip": 1.05732584, - "balance_loss_mlp": 1.02869976, - "epoch": 0.1950999549075605, - "flos": 21141976821120.0, - "grad_norm": 2.0710390949438837, - "language_loss": 0.7739507, - "learning_rate": 3.7208116042550088e-06, - "loss": 0.79584551, - "num_input_tokens_seen": 70040760, - "step": 3245, - "time_per_iteration": 2.6684374809265137 - }, - { - "auxiliary_loss_clip": 0.01142874, - "auxiliary_loss_mlp": 0.01043114, - "balance_loss_clip": 1.05566645, - "balance_loss_mlp": 1.02431464, - "epoch": 0.19516007816022846, - "flos": 20884820376960.0, - "grad_norm": 2.1010289547443133, - "language_loss": 0.83988321, - "learning_rate": 3.7206130982517906e-06, - "loss": 0.86174309, - "num_input_tokens_seen": 70058720, - "step": 3246, - "time_per_iteration": 2.6595354080200195 - }, - { - "auxiliary_loss_clip": 0.0114599, - "auxiliary_loss_mlp": 0.00776442, - "balance_loss_clip": 1.05517101, - "balance_loss_mlp": 1.00080454, - "epoch": 0.19522020141289645, - "flos": 16910515031040.0, - "grad_norm": 3.3581015873305438, - "language_loss": 0.76840878, - "learning_rate": 3.7204145270027514e-06, - "loss": 0.78763306, - "num_input_tokens_seen": 70076470, - "step": 3247, - "time_per_iteration": 2.7777793407440186 - }, - { - "auxiliary_loss_clip": 0.01121778, - "auxiliary_loss_mlp": 0.01043977, - "balance_loss_clip": 1.05689096, - "balance_loss_mlp": 1.02651262, - "epoch": 0.19528032466556441, - "flos": 26724829023360.0, - "grad_norm": 1.8981807103962522, - "language_loss": 0.75459039, - "learning_rate": 3.720215890515421e-06, - "loss": 0.77624786, - "num_input_tokens_seen": 70096220, - "step": 3248, - "time_per_iteration": 2.8088901042938232 - }, - { - "auxiliary_loss_clip": 0.01156017, - "auxiliary_loss_mlp": 0.01048303, - "balance_loss_clip": 1.05548215, - "balance_loss_mlp": 1.03008783, - "epoch": 0.19534044791823238, - "flos": 21032808410880.0, - "grad_norm": 2.7209722336942135, - "language_loss": 0.77774823, - "learning_rate": 3.7200171887973316e-06, - "loss": 0.79979146, - "num_input_tokens_seen": 70114800, - "step": 3249, - "time_per_iteration": 2.610877752304077 - }, - { - "auxiliary_loss_clip": 0.01148434, - "auxiliary_loss_mlp": 0.01050332, - "balance_loss_clip": 1.05689144, - "balance_loss_mlp": 1.03299928, - "epoch": 0.19540057117090034, - "flos": 22344050396160.0, - "grad_norm": 1.5551573885822045, - "language_loss": 0.73118901, - "learning_rate": 3.7198184218560176e-06, - "loss": 0.75317669, - "num_input_tokens_seen": 70134930, - "step": 3250, - "time_per_iteration": 2.5901567935943604 - }, - { - "auxiliary_loss_clip": 0.01101628, - "auxiliary_loss_mlp": 0.01046467, - "balance_loss_clip": 1.05080378, - "balance_loss_mlp": 1.02876413, - "epoch": 0.1954606944235683, - "flos": 20301631159680.0, - "grad_norm": 2.030501302548557, - "language_loss": 0.79203367, - "learning_rate": 3.719619589699017e-06, - "loss": 0.81351459, - "num_input_tokens_seen": 70152045, - "step": 3251, - "time_per_iteration": 2.6619749069213867 - }, - { - "auxiliary_loss_clip": 0.0115825, - "auxiliary_loss_mlp": 0.01044132, - "balance_loss_clip": 1.05741858, - "balance_loss_mlp": 1.02606022, - "epoch": 0.19552081767623627, - "flos": 17346865449600.0, - "grad_norm": 7.451515078679223, - "language_loss": 0.83871722, - "learning_rate": 3.7194206923338695e-06, - "loss": 0.86074108, - "num_input_tokens_seen": 70169240, - "step": 3252, - "time_per_iteration": 2.5029656887054443 - }, - { - "auxiliary_loss_clip": 0.01142752, - "auxiliary_loss_mlp": 0.01057294, - "balance_loss_clip": 1.05278862, - "balance_loss_mlp": 1.03518057, - "epoch": 0.19558094092890424, - "flos": 31977626129280.0, - "grad_norm": 1.7140417843701068, - "language_loss": 0.73995864, - "learning_rate": 3.719221729768117e-06, - "loss": 0.76195908, - "num_input_tokens_seen": 70192690, - "step": 3253, - "time_per_iteration": 2.609117269515991 - }, - { - "auxiliary_loss_clip": 0.01102675, - "auxiliary_loss_mlp": 0.01046707, - "balance_loss_clip": 1.04759037, - "balance_loss_mlp": 1.02782381, - "epoch": 0.19564106418157223, - "flos": 22268889187200.0, - "grad_norm": 2.1302159220485675, - "language_loss": 0.76167047, - "learning_rate": 3.7190227020093037e-06, - "loss": 0.78316426, - "num_input_tokens_seen": 70209685, - "step": 3254, - "time_per_iteration": 4.174965858459473 - }, - { - "auxiliary_loss_clip": 0.01043127, - "auxiliary_loss_mlp": 0.01006966, - "balance_loss_clip": 1.04737842, - "balance_loss_mlp": 1.0036757, - "epoch": 0.1957011874342402, - "flos": 54364554385920.0, - "grad_norm": 0.84452007287803, - "language_loss": 0.55275303, - "learning_rate": 3.7188236090649774e-06, - "loss": 0.57325399, - "num_input_tokens_seen": 70265050, - "step": 3255, - "time_per_iteration": 3.2241716384887695 - }, - { - "auxiliary_loss_clip": 0.01133721, - "auxiliary_loss_mlp": 0.01041696, - "balance_loss_clip": 1.0557251, - "balance_loss_mlp": 1.02349281, - "epoch": 0.19576131068690816, - "flos": 16506699356160.0, - "grad_norm": 2.6103802859468392, - "language_loss": 0.70870697, - "learning_rate": 3.718624450942688e-06, - "loss": 0.73046112, - "num_input_tokens_seen": 70281830, - "step": 3256, - "time_per_iteration": 2.641296148300171 - }, - { - "auxiliary_loss_clip": 0.01152768, - "auxiliary_loss_mlp": 0.01042867, - "balance_loss_clip": 1.0544858, - "balance_loss_mlp": 1.02523613, - "epoch": 0.19582143393957613, - "flos": 14719676797440.0, - "grad_norm": 2.649319646209249, - "language_loss": 0.80722409, - "learning_rate": 3.718425227649987e-06, - "loss": 0.82918048, - "num_input_tokens_seen": 70297420, - "step": 3257, - "time_per_iteration": 4.258259057998657 - }, - { - "auxiliary_loss_clip": 0.01106644, - "auxiliary_loss_mlp": 0.01043385, - "balance_loss_clip": 1.05470431, - "balance_loss_mlp": 1.02601588, - "epoch": 0.1958815571922441, - "flos": 24425504737920.0, - "grad_norm": 6.015808523610408, - "language_loss": 0.75124931, - "learning_rate": 3.7182259391944292e-06, - "loss": 0.77274966, - "num_input_tokens_seen": 70319210, - "step": 3258, - "time_per_iteration": 4.386433362960815 - }, - { - "auxiliary_loss_clip": 0.01082287, - "auxiliary_loss_mlp": 0.01044148, - "balance_loss_clip": 1.04533339, - "balance_loss_mlp": 1.0237875, - "epoch": 0.19594168044491206, - "flos": 24900279730560.0, - "grad_norm": 1.8034996675319444, - "language_loss": 0.73872411, - "learning_rate": 3.7180265855835714e-06, - "loss": 0.75998843, - "num_input_tokens_seen": 70339045, - "step": 3259, - "time_per_iteration": 2.815469264984131 - }, - { - "auxiliary_loss_clip": 0.01131793, - "auxiliary_loss_mlp": 0.01043364, - "balance_loss_clip": 1.05167735, - "balance_loss_mlp": 1.02392125, - "epoch": 0.19600180369758005, - "flos": 12057008486400.0, - "grad_norm": 2.2096667980592, - "language_loss": 0.77053022, - "learning_rate": 3.7178271668249735e-06, - "loss": 0.79228187, - "num_input_tokens_seen": 70356505, - "step": 3260, - "time_per_iteration": 4.2817702293396 - }, - { - "auxiliary_loss_clip": 0.01148118, - "auxiliary_loss_mlp": 0.01043761, - "balance_loss_clip": 1.0551343, - "balance_loss_mlp": 1.0248661, - "epoch": 0.19606192695024802, - "flos": 20850202644480.0, - "grad_norm": 5.605178759176999, - "language_loss": 0.82261205, - "learning_rate": 3.7176276829261975e-06, - "loss": 0.84453082, - "num_input_tokens_seen": 70375410, - "step": 3261, - "time_per_iteration": 2.673092842102051 - }, - { - "auxiliary_loss_clip": 0.01121379, - "auxiliary_loss_mlp": 0.01044043, - "balance_loss_clip": 1.0550617, - "balance_loss_mlp": 1.02488637, - "epoch": 0.19612205020291598, - "flos": 28475509996800.0, - "grad_norm": 1.8492209450679535, - "language_loss": 0.76671481, - "learning_rate": 3.717428133894807e-06, - "loss": 0.78836906, - "num_input_tokens_seen": 70396315, - "step": 3262, - "time_per_iteration": 2.803938150405884 - }, - { - "auxiliary_loss_clip": 0.01148893, - "auxiliary_loss_mlp": 0.01047259, - "balance_loss_clip": 1.05960584, - "balance_loss_mlp": 1.02950907, - "epoch": 0.19618217345558395, - "flos": 25556618995200.0, - "grad_norm": 1.7278621785184562, - "language_loss": 0.8668195, - "learning_rate": 3.71722851973837e-06, - "loss": 0.88878107, - "num_input_tokens_seen": 70417945, - "step": 3263, - "time_per_iteration": 2.6677918434143066 - }, - { - "auxiliary_loss_clip": 0.0113123, - "auxiliary_loss_mlp": 0.01042546, - "balance_loss_clip": 1.05328059, - "balance_loss_mlp": 1.02505815, - "epoch": 0.1962422967082519, - "flos": 25264413855360.0, - "grad_norm": 3.447639973868791, - "language_loss": 0.73775035, - "learning_rate": 3.717028840464455e-06, - "loss": 0.75948811, - "num_input_tokens_seen": 70438690, - "step": 3264, - "time_per_iteration": 2.6973094940185547 - }, - { - "auxiliary_loss_clip": 0.01144053, - "auxiliary_loss_mlp": 0.01049918, - "balance_loss_clip": 1.05736756, - "balance_loss_mlp": 1.03223944, - "epoch": 0.19630241996091988, - "flos": 18807352444800.0, - "grad_norm": 2.4424358562200927, - "language_loss": 0.78513813, - "learning_rate": 3.7168290960806344e-06, - "loss": 0.80707777, - "num_input_tokens_seen": 70455385, - "step": 3265, - "time_per_iteration": 2.625739336013794 - }, - { - "auxiliary_loss_clip": 0.01031434, - "auxiliary_loss_mlp": 0.01002481, - "balance_loss_clip": 1.03386986, - "balance_loss_mlp": 0.99983466, - "epoch": 0.19636254321358784, - "flos": 62321137896960.0, - "grad_norm": 0.7932330660809486, - "language_loss": 0.53389955, - "learning_rate": 3.716629286594483e-06, - "loss": 0.55423868, - "num_input_tokens_seen": 70514280, - "step": 3266, - "time_per_iteration": 3.2586586475372314 - }, - { - "auxiliary_loss_clip": 0.01124628, - "auxiliary_loss_mlp": 0.00776501, - "balance_loss_clip": 1.04957044, - "balance_loss_mlp": 1.00080895, - "epoch": 0.19642266646625584, - "flos": 21069329564160.0, - "grad_norm": 2.0008611208986133, - "language_loss": 0.80109024, - "learning_rate": 3.7164294120135767e-06, - "loss": 0.8201015, - "num_input_tokens_seen": 70531800, - "step": 3267, - "time_per_iteration": 2.678537368774414 - }, - { - "auxiliary_loss_clip": 0.01130982, - "auxiliary_loss_mlp": 0.01043983, - "balance_loss_clip": 1.05263019, - "balance_loss_mlp": 1.02660179, - "epoch": 0.1964827897189238, - "flos": 14538651229440.0, - "grad_norm": 1.9909459598185588, - "language_loss": 0.86758262, - "learning_rate": 3.7162294723454953e-06, - "loss": 0.88933229, - "num_input_tokens_seen": 70550615, - "step": 3268, - "time_per_iteration": 2.6949849128723145 - }, - { - "auxiliary_loss_clip": 0.01099432, - "auxiliary_loss_mlp": 0.01041621, - "balance_loss_clip": 1.04954004, - "balance_loss_mlp": 1.02408528, - "epoch": 0.19654291297159177, - "flos": 19244636616960.0, - "grad_norm": 2.2632495429204127, - "language_loss": 0.68785441, - "learning_rate": 3.7160294675978197e-06, - "loss": 0.70926493, - "num_input_tokens_seen": 70568690, - "step": 3269, - "time_per_iteration": 2.770078182220459 - }, - { - "auxiliary_loss_clip": 0.01116538, - "auxiliary_loss_mlp": 0.01052319, - "balance_loss_clip": 1.05113554, - "balance_loss_mlp": 1.03330541, - "epoch": 0.19660303622425973, - "flos": 25775710001280.0, - "grad_norm": 7.1863103423452355, - "language_loss": 0.80241841, - "learning_rate": 3.715829397778135e-06, - "loss": 0.82410699, - "num_input_tokens_seen": 70588665, - "step": 3270, - "time_per_iteration": 2.7294864654541016 - }, - { - "auxiliary_loss_clip": 0.01139501, - "auxiliary_loss_mlp": 0.01045694, - "balance_loss_clip": 1.05189824, - "balance_loss_mlp": 1.02833724, - "epoch": 0.1966631594769277, - "flos": 20595093275520.0, - "grad_norm": 1.9668649321541274, - "language_loss": 0.83912349, - "learning_rate": 3.715629262894028e-06, - "loss": 0.86097538, - "num_input_tokens_seen": 70606900, - "step": 3271, - "time_per_iteration": 2.640235662460327 - }, - { - "auxiliary_loss_clip": 0.01139368, - "auxiliary_loss_mlp": 0.01051303, - "balance_loss_clip": 1.05468225, - "balance_loss_mlp": 1.0332067, - "epoch": 0.19672328272959566, - "flos": 23623188600960.0, - "grad_norm": 1.9968416702279483, - "language_loss": 0.79902714, - "learning_rate": 3.715429062953087e-06, - "loss": 0.82093388, - "num_input_tokens_seen": 70625955, - "step": 3272, - "time_per_iteration": 2.636629343032837 - }, - { - "auxiliary_loss_clip": 0.01124328, - "auxiliary_loss_mlp": 0.01058493, - "balance_loss_clip": 1.05192566, - "balance_loss_mlp": 1.03715479, - "epoch": 0.19678340598226365, - "flos": 23110922787840.0, - "grad_norm": 1.7302013075823783, - "language_loss": 0.80942369, - "learning_rate": 3.7152287979629043e-06, - "loss": 0.83125186, - "num_input_tokens_seen": 70646090, - "step": 3273, - "time_per_iteration": 2.6967809200286865 - }, - { - "auxiliary_loss_clip": 0.01144024, - "auxiliary_loss_mlp": 0.01054564, - "balance_loss_clip": 1.05456042, - "balance_loss_mlp": 1.03655195, - "epoch": 0.19684352923493162, - "flos": 24534852716160.0, - "grad_norm": 2.225126358921887, - "language_loss": 0.77984649, - "learning_rate": 3.7150284679310735e-06, - "loss": 0.80183232, - "num_input_tokens_seen": 70666065, - "step": 3274, - "time_per_iteration": 2.6808643341064453 - }, - { - "auxiliary_loss_clip": 0.01141267, - "auxiliary_loss_mlp": 0.01046445, - "balance_loss_clip": 1.05480242, - "balance_loss_mlp": 1.02840877, - "epoch": 0.19690365248759958, - "flos": 21796448578560.0, - "grad_norm": 2.318697297640889, - "language_loss": 0.81433225, - "learning_rate": 3.7148280728651914e-06, - "loss": 0.8362093, - "num_input_tokens_seen": 70681580, - "step": 3275, - "time_per_iteration": 2.672672986984253 - }, - { - "auxiliary_loss_clip": 0.01115756, - "auxiliary_loss_mlp": 0.01045314, - "balance_loss_clip": 1.05148947, - "balance_loss_mlp": 1.02686024, - "epoch": 0.19696377574026755, - "flos": 19056643810560.0, - "grad_norm": 2.4665004531377166, - "language_loss": 0.80909657, - "learning_rate": 3.7146276127728563e-06, - "loss": 0.83070731, - "num_input_tokens_seen": 70697745, - "step": 3276, - "time_per_iteration": 2.726970672607422 - }, - { - "auxiliary_loss_clip": 0.01142619, - "auxiliary_loss_mlp": 0.01043042, - "balance_loss_clip": 1.05443609, - "balance_loss_mlp": 1.02491045, - "epoch": 0.19702389899293551, - "flos": 22820656982400.0, - "grad_norm": 2.17541075016206, - "language_loss": 0.89113599, - "learning_rate": 3.7144270876616713e-06, - "loss": 0.9129926, - "num_input_tokens_seen": 70715110, - "step": 3277, - "time_per_iteration": 2.6738827228546143 - }, - { - "auxiliary_loss_clip": 0.01103709, - "auxiliary_loss_mlp": 0.01048433, - "balance_loss_clip": 1.04638815, - "balance_loss_mlp": 1.02864444, - "epoch": 0.19708402224560348, - "flos": 22894237992960.0, - "grad_norm": 2.640727897616601, - "language_loss": 0.62070847, - "learning_rate": 3.714226497539239e-06, - "loss": 0.64222991, - "num_input_tokens_seen": 70734715, - "step": 3278, - "time_per_iteration": 2.7382938861846924 - }, - { - "auxiliary_loss_clip": 0.01115303, - "auxiliary_loss_mlp": 0.0105759, - "balance_loss_clip": 1.05033016, - "balance_loss_mlp": 1.03793263, - "epoch": 0.19714414549827144, - "flos": 25662519267840.0, - "grad_norm": 1.930104581155035, - "language_loss": 0.73606467, - "learning_rate": 3.714025842413166e-06, - "loss": 0.75779366, - "num_input_tokens_seen": 70752650, - "step": 3279, - "time_per_iteration": 2.8123648166656494 - }, - { - "auxiliary_loss_clip": 0.0114648, - "auxiliary_loss_mlp": 0.01042853, - "balance_loss_clip": 1.05422091, - "balance_loss_mlp": 1.02567458, - "epoch": 0.19720426875093944, - "flos": 23915824704000.0, - "grad_norm": 1.7034036878345749, - "language_loss": 0.82685816, - "learning_rate": 3.713825122291061e-06, - "loss": 0.84875143, - "num_input_tokens_seen": 70772365, - "step": 3280, - "time_per_iteration": 2.7000861167907715 - }, - { - "auxiliary_loss_clip": 0.01106655, - "auxiliary_loss_mlp": 0.01048884, - "balance_loss_clip": 1.04887283, - "balance_loss_mlp": 1.03071654, - "epoch": 0.1972643920036074, - "flos": 13881952828800.0, - "grad_norm": 2.435959864664923, - "language_loss": 0.78173983, - "learning_rate": 3.713624337180536e-06, - "loss": 0.80329525, - "num_input_tokens_seen": 70790340, - "step": 3281, - "time_per_iteration": 2.7017247676849365 - }, - { - "auxiliary_loss_clip": 0.01125353, - "auxiliary_loss_mlp": 0.0104135, - "balance_loss_clip": 1.05461836, - "balance_loss_mlp": 1.02519727, - "epoch": 0.19732451525627537, - "flos": 19863592801920.0, - "grad_norm": 1.7390973872526612, - "language_loss": 0.79777479, - "learning_rate": 3.7134234870892045e-06, - "loss": 0.8194418, - "num_input_tokens_seen": 70809295, - "step": 3282, - "time_per_iteration": 2.7064146995544434 - }, - { - "auxiliary_loss_clip": 0.01112073, - "auxiliary_loss_mlp": 0.01043047, - "balance_loss_clip": 1.05485284, - "balance_loss_mlp": 1.02538049, - "epoch": 0.19738463850894333, - "flos": 24973429777920.0, - "grad_norm": 2.512566515566025, - "language_loss": 0.7192747, - "learning_rate": 3.7132225720246826e-06, - "loss": 0.74082589, - "num_input_tokens_seen": 70828765, - "step": 3283, - "time_per_iteration": 2.775297164916992 - }, - { - "auxiliary_loss_clip": 0.01137498, - "auxiliary_loss_mlp": 0.01043438, - "balance_loss_clip": 1.05320621, - "balance_loss_mlp": 1.02665281, - "epoch": 0.1974447617616113, - "flos": 18368883123840.0, - "grad_norm": 1.8864815757917637, - "language_loss": 0.78981179, - "learning_rate": 3.7130215919945886e-06, - "loss": 0.81162113, - "num_input_tokens_seen": 70846805, - "step": 3284, - "time_per_iteration": 2.6344916820526123 - }, - { - "auxiliary_loss_clip": 0.01126512, - "auxiliary_loss_mlp": 0.00776821, - "balance_loss_clip": 1.05065584, - "balance_loss_mlp": 1.00114048, - "epoch": 0.19750488501427926, - "flos": 22892945103360.0, - "grad_norm": 2.1903874509936982, - "language_loss": 0.86317503, - "learning_rate": 3.7128205470065445e-06, - "loss": 0.88220835, - "num_input_tokens_seen": 70863805, - "step": 3285, - "time_per_iteration": 2.725186586380005 - }, - { - "auxiliary_loss_clip": 0.01115791, - "auxiliary_loss_mlp": 0.01044707, - "balance_loss_clip": 1.05167055, - "balance_loss_mlp": 1.02658761, - "epoch": 0.19756500826694723, - "flos": 21871502046720.0, - "grad_norm": 2.208260347555195, - "language_loss": 0.88770825, - "learning_rate": 3.712619437068174e-06, - "loss": 0.90931326, - "num_input_tokens_seen": 70882660, - "step": 3286, - "time_per_iteration": 2.6819698810577393 - }, - { - "auxiliary_loss_clip": 0.01118742, - "auxiliary_loss_mlp": 0.01052526, - "balance_loss_clip": 1.05227792, - "balance_loss_mlp": 1.03016233, - "epoch": 0.19762513151961522, - "flos": 15158972131200.0, - "grad_norm": 2.0768117117784874, - "language_loss": 0.77941382, - "learning_rate": 3.712418262187102e-06, - "loss": 0.80112648, - "num_input_tokens_seen": 70898765, - "step": 3287, - "time_per_iteration": 2.641193389892578 - }, - { - "auxiliary_loss_clip": 0.01127955, - "auxiliary_loss_mlp": 0.01047337, - "balance_loss_clip": 1.0526104, - "balance_loss_mlp": 1.02849019, - "epoch": 0.1976852547722832, - "flos": 16979175878400.0, - "grad_norm": 2.061421898899755, - "language_loss": 0.80853081, - "learning_rate": 3.7122170223709584e-06, - "loss": 0.83028376, - "num_input_tokens_seen": 70916370, - "step": 3288, - "time_per_iteration": 2.625068426132202 - }, - { - "auxiliary_loss_clip": 0.01132408, - "auxiliary_loss_mlp": 0.01048194, - "balance_loss_clip": 1.05143857, - "balance_loss_mlp": 1.03045535, - "epoch": 0.19774537802495115, - "flos": 20302924049280.0, - "grad_norm": 2.345717890688315, - "language_loss": 0.7317158, - "learning_rate": 3.712015717627374e-06, - "loss": 0.75352174, - "num_input_tokens_seen": 70934870, - "step": 3289, - "time_per_iteration": 2.6319406032562256 - }, - { - "auxiliary_loss_clip": 0.01133413, - "auxiliary_loss_mlp": 0.01045224, - "balance_loss_clip": 1.05575252, - "balance_loss_mlp": 1.02678204, - "epoch": 0.19780550127761912, - "flos": 27235478724480.0, - "grad_norm": 1.9087552003653308, - "language_loss": 0.79608113, - "learning_rate": 3.7118143479639813e-06, - "loss": 0.81786746, - "num_input_tokens_seen": 70955140, - "step": 3290, - "time_per_iteration": 2.706570863723755 - }, - { - "auxiliary_loss_clip": 0.01049926, - "auxiliary_loss_mlp": 0.0101105, - "balance_loss_clip": 1.0327636, - "balance_loss_mlp": 1.00853467, - "epoch": 0.19786562453028708, - "flos": 63550972684800.0, - "grad_norm": 0.8952067644857119, - "language_loss": 0.60318571, - "learning_rate": 3.711612913388418e-06, - "loss": 0.62379545, - "num_input_tokens_seen": 71012005, - "step": 3291, - "time_per_iteration": 3.2849009037017822 - }, - { - "auxiliary_loss_clip": 0.01158891, - "auxiliary_loss_mlp": 0.01040785, - "balance_loss_clip": 1.05417156, - "balance_loss_mlp": 1.02088892, - "epoch": 0.19792574778295505, - "flos": 26286647011200.0, - "grad_norm": 1.932789926440358, - "language_loss": 0.81595641, - "learning_rate": 3.7114114139083204e-06, - "loss": 0.83795315, - "num_input_tokens_seen": 71031140, - "step": 3292, - "time_per_iteration": 2.6751551628112793 - }, - { - "auxiliary_loss_clip": 0.01119797, - "auxiliary_loss_mlp": 0.00778082, - "balance_loss_clip": 1.05296063, - "balance_loss_mlp": 1.00086236, - "epoch": 0.19798587103562304, - "flos": 19938107566080.0, - "grad_norm": 2.409042629875397, - "language_loss": 0.81013, - "learning_rate": 3.7112098495313313e-06, - "loss": 0.82910883, - "num_input_tokens_seen": 71050250, - "step": 3293, - "time_per_iteration": 4.3039703369140625 - }, - { - "auxiliary_loss_clip": 0.01137316, - "auxiliary_loss_mlp": 0.01052434, - "balance_loss_clip": 1.05370128, - "balance_loss_mlp": 1.03277683, - "epoch": 0.198045994288291, - "flos": 20120282369280.0, - "grad_norm": 1.8764131105986912, - "language_loss": 0.61480314, - "learning_rate": 3.711008220265093e-06, - "loss": 0.63670063, - "num_input_tokens_seen": 71068665, - "step": 3294, - "time_per_iteration": 2.671241044998169 - }, - { - "auxiliary_loss_clip": 0.01132208, - "auxiliary_loss_mlp": 0.01039978, - "balance_loss_clip": 1.05456376, - "balance_loss_mlp": 1.02201271, - "epoch": 0.19810611754095897, - "flos": 17967653228160.0, - "grad_norm": 2.0334748560156393, - "language_loss": 0.87313825, - "learning_rate": 3.710806526117251e-06, - "loss": 0.89486015, - "num_input_tokens_seen": 71085320, - "step": 3295, - "time_per_iteration": 2.659680128097534 - }, - { - "auxiliary_loss_clip": 0.01113106, - "auxiliary_loss_mlp": 0.01050184, - "balance_loss_clip": 1.05079484, - "balance_loss_mlp": 1.03256536, - "epoch": 0.19816624079362694, - "flos": 15084996071040.0, - "grad_norm": 2.5215255479345067, - "language_loss": 0.80839241, - "learning_rate": 3.7106047670954544e-06, - "loss": 0.83002532, - "num_input_tokens_seen": 71102020, - "step": 3296, - "time_per_iteration": 4.299339294433594 - }, - { - "auxiliary_loss_clip": 0.01123906, - "auxiliary_loss_mlp": 0.01045438, - "balance_loss_clip": 1.05233586, - "balance_loss_mlp": 1.02522039, - "epoch": 0.1982263640462949, - "flos": 24900315644160.0, - "grad_norm": 2.528943220563754, - "language_loss": 0.68126047, - "learning_rate": 3.710402943207354e-06, - "loss": 0.70295388, - "num_input_tokens_seen": 71123390, - "step": 3297, - "time_per_iteration": 4.258284091949463 - }, - { - "auxiliary_loss_clip": 0.01153129, - "auxiliary_loss_mlp": 0.01037574, - "balance_loss_clip": 1.05660713, - "balance_loss_mlp": 1.02031219, - "epoch": 0.19828648729896287, - "flos": 20376181837440.0, - "grad_norm": 1.9083451106828888, - "language_loss": 0.81310993, - "learning_rate": 3.7102010544606016e-06, - "loss": 0.83501697, - "num_input_tokens_seen": 71141800, - "step": 3298, - "time_per_iteration": 2.6156656742095947 - }, - { - "auxiliary_loss_clip": 0.01137409, - "auxiliary_loss_mlp": 0.01042227, - "balance_loss_clip": 1.0573976, - "balance_loss_mlp": 1.02159238, - "epoch": 0.19834661055163083, - "flos": 18880035615360.0, - "grad_norm": 1.8996943203321497, - "language_loss": 0.85154539, - "learning_rate": 3.7099991008628544e-06, - "loss": 0.87334174, - "num_input_tokens_seen": 71159505, - "step": 3299, - "time_per_iteration": 2.6749041080474854 - }, - { - "auxiliary_loss_clip": 0.01036953, - "auxiliary_loss_mlp": 0.01013935, - "balance_loss_clip": 1.02875936, - "balance_loss_mlp": 1.01106215, - "epoch": 0.19840673380429882, - "flos": 60259184640000.0, - "grad_norm": 0.82907550606663, - "language_loss": 0.53206414, - "learning_rate": 3.7097970824217706e-06, - "loss": 0.55257303, - "num_input_tokens_seen": 71223265, - "step": 3300, - "time_per_iteration": 4.83857798576355 - }, - { - "auxiliary_loss_clip": 0.01105122, - "auxiliary_loss_mlp": 0.01064471, - "balance_loss_clip": 1.04748702, - "balance_loss_mlp": 1.0410459, - "epoch": 0.1984668570569668, - "flos": 19902017376000.0, - "grad_norm": 316.1702389408657, - "language_loss": 0.73014295, - "learning_rate": 3.7095949991450093e-06, - "loss": 0.75183886, - "num_input_tokens_seen": 71242385, - "step": 3301, - "time_per_iteration": 2.700654983520508 - }, - { - "auxiliary_loss_clip": 0.01118926, - "auxiliary_loss_mlp": 0.01044315, - "balance_loss_clip": 1.05295372, - "balance_loss_mlp": 1.02619529, - "epoch": 0.19852698030963475, - "flos": 15630766295040.0, - "grad_norm": 2.410718710355122, - "language_loss": 0.88264418, - "learning_rate": 3.709392851040235e-06, - "loss": 0.90427655, - "num_input_tokens_seen": 71258990, - "step": 3302, - "time_per_iteration": 2.7190146446228027 - }, - { - "auxiliary_loss_clip": 0.01118067, - "auxiliary_loss_mlp": 0.01045078, - "balance_loss_clip": 1.05155802, - "balance_loss_mlp": 1.02661204, - "epoch": 0.19858710356230272, - "flos": 43143007311360.0, - "grad_norm": 2.210364764996701, - "language_loss": 0.73592931, - "learning_rate": 3.709190638115111e-06, - "loss": 0.75756073, - "num_input_tokens_seen": 71282770, - "step": 3303, - "time_per_iteration": 2.9379186630249023 - }, - { - "auxiliary_loss_clip": 0.01143275, - "auxiliary_loss_mlp": 0.01048515, - "balance_loss_clip": 1.05491257, - "balance_loss_mlp": 1.03002524, - "epoch": 0.19864722681497068, - "flos": 35144084643840.0, - "grad_norm": 1.9482807590384623, - "language_loss": 0.75103521, - "learning_rate": 3.7089883603773084e-06, - "loss": 0.77295315, - "num_input_tokens_seen": 71301410, - "step": 3304, - "time_per_iteration": 2.743474245071411 - }, - { - "auxiliary_loss_clip": 0.01133571, - "auxiliary_loss_mlp": 0.01034983, - "balance_loss_clip": 1.05309725, - "balance_loss_mlp": 1.01710188, - "epoch": 0.19870735006763865, - "flos": 19426200888960.0, - "grad_norm": 1.8722016114425952, - "language_loss": 0.8628391, - "learning_rate": 3.7087860178344955e-06, - "loss": 0.8845247, - "num_input_tokens_seen": 71319670, - "step": 3305, - "time_per_iteration": 2.7129390239715576 - }, - { - "auxiliary_loss_clip": 0.01128329, - "auxiliary_loss_mlp": 0.01044081, - "balance_loss_clip": 1.04770195, - "balance_loss_mlp": 1.02603281, - "epoch": 0.19876747332030664, - "flos": 23547380947200.0, - "grad_norm": 2.9829227362861106, - "language_loss": 0.68476367, - "learning_rate": 3.7085836104943445e-06, - "loss": 0.70648777, - "num_input_tokens_seen": 71339850, - "step": 3306, - "time_per_iteration": 2.7083208560943604 - }, - { - "auxiliary_loss_clip": 0.01119386, - "auxiliary_loss_mlp": 0.01038782, - "balance_loss_clip": 1.04822719, - "balance_loss_mlp": 1.02168787, - "epoch": 0.1988275965729746, - "flos": 19829406032640.0, - "grad_norm": 1.683647244561179, - "language_loss": 0.76433122, - "learning_rate": 3.7083811383645332e-06, - "loss": 0.78591287, - "num_input_tokens_seen": 71359795, - "step": 3307, - "time_per_iteration": 2.728661298751831 - }, - { - "auxiliary_loss_clip": 0.01157548, - "auxiliary_loss_mlp": 0.01044665, - "balance_loss_clip": 1.05895782, - "balance_loss_mlp": 1.02714145, - "epoch": 0.19888771982564257, - "flos": 23513625141120.0, - "grad_norm": 2.438172575069382, - "language_loss": 0.75991976, - "learning_rate": 3.708178601452737e-06, - "loss": 0.78194201, - "num_input_tokens_seen": 71378885, - "step": 3308, - "time_per_iteration": 2.6580557823181152 - }, - { - "auxiliary_loss_clip": 0.01107283, - "auxiliary_loss_mlp": 0.01041656, - "balance_loss_clip": 1.05453563, - "balance_loss_mlp": 1.02307141, - "epoch": 0.19894784307831054, - "flos": 18150510389760.0, - "grad_norm": 1.928689575161362, - "language_loss": 0.76043576, - "learning_rate": 3.7079759997666374e-06, - "loss": 0.7819252, - "num_input_tokens_seen": 71397285, - "step": 3309, - "time_per_iteration": 2.77226185798645 - }, - { - "auxiliary_loss_clip": 0.0114115, - "auxiliary_loss_mlp": 0.01045061, - "balance_loss_clip": 1.05222607, - "balance_loss_mlp": 1.02592754, - "epoch": 0.1990079663309785, - "flos": 24276044246400.0, - "grad_norm": 75.17312936609292, - "language_loss": 0.87855697, - "learning_rate": 3.707773333313917e-06, - "loss": 0.90041906, - "num_input_tokens_seen": 71415775, - "step": 3310, - "time_per_iteration": 2.6789662837982178 - }, - { - "auxiliary_loss_clip": 0.01153037, - "auxiliary_loss_mlp": 0.01039864, - "balance_loss_clip": 1.05415869, - "balance_loss_mlp": 1.02139854, - "epoch": 0.19906808958364647, - "flos": 34897666366080.0, - "grad_norm": 2.3155756588664342, - "language_loss": 0.63650048, - "learning_rate": 3.70757060210226e-06, - "loss": 0.6584295, - "num_input_tokens_seen": 71437315, - "step": 3311, - "time_per_iteration": 2.7604620456695557 - }, - { - "auxiliary_loss_clip": 0.01115133, - "auxiliary_loss_mlp": 0.01043871, - "balance_loss_clip": 1.04763019, - "balance_loss_mlp": 1.02501202, - "epoch": 0.19912821283631443, - "flos": 24024885373440.0, - "grad_norm": 3.8064295514597717, - "language_loss": 0.74542546, - "learning_rate": 3.707367806139355e-06, - "loss": 0.76701546, - "num_input_tokens_seen": 71456320, - "step": 3312, - "time_per_iteration": 2.796475410461426 - }, - { - "auxiliary_loss_clip": 0.01141587, - "auxiliary_loss_mlp": 0.01037435, - "balance_loss_clip": 1.05358124, - "balance_loss_mlp": 1.02017355, - "epoch": 0.19918833608898243, - "flos": 19859031774720.0, - "grad_norm": 2.2312990164825943, - "language_loss": 0.84033173, - "learning_rate": 3.7071649454328915e-06, - "loss": 0.86212194, - "num_input_tokens_seen": 71475360, - "step": 3313, - "time_per_iteration": 2.6044952869415283 - }, - { - "auxiliary_loss_clip": 0.01146797, - "auxiliary_loss_mlp": 0.01042166, - "balance_loss_clip": 1.05695391, - "balance_loss_mlp": 1.02422476, - "epoch": 0.1992484593416504, - "flos": 29095794984960.0, - "grad_norm": 3.856678450124864, - "language_loss": 0.810305, - "learning_rate": 3.7069620199905625e-06, - "loss": 0.83219463, - "num_input_tokens_seen": 71496155, - "step": 3314, - "time_per_iteration": 2.68841814994812 - }, - { - "auxiliary_loss_clip": 0.01112846, - "auxiliary_loss_mlp": 0.01043677, - "balance_loss_clip": 1.04617178, - "balance_loss_mlp": 1.02643955, - "epoch": 0.19930858259431836, - "flos": 23295001011840.0, - "grad_norm": 1.4822079401394097, - "language_loss": 0.87391549, - "learning_rate": 3.7067590298200627e-06, - "loss": 0.89548075, - "num_input_tokens_seen": 71517295, - "step": 3315, - "time_per_iteration": 2.720093011856079 - }, - { - "auxiliary_loss_clip": 0.0111589, - "auxiliary_loss_mlp": 0.00777002, - "balance_loss_clip": 1.04992676, - "balance_loss_mlp": 1.00093687, - "epoch": 0.19936870584698632, - "flos": 25378825651200.0, - "grad_norm": 1.7805516248937883, - "language_loss": 0.70957202, - "learning_rate": 3.7065559749290892e-06, - "loss": 0.72850096, - "num_input_tokens_seen": 71540000, - "step": 3316, - "time_per_iteration": 2.850100517272949 - }, - { - "auxiliary_loss_clip": 0.01019745, - "auxiliary_loss_mlp": 0.01012504, - "balance_loss_clip": 1.03032303, - "balance_loss_mlp": 1.01003671, - "epoch": 0.1994288290996543, - "flos": 62168053109760.0, - "grad_norm": 0.8326978726055106, - "language_loss": 0.66287398, - "learning_rate": 3.706352855325342e-06, - "loss": 0.68319643, - "num_input_tokens_seen": 71607880, - "step": 3317, - "time_per_iteration": 3.425114870071411 - }, - { - "auxiliary_loss_clip": 0.01148059, - "auxiliary_loss_mlp": 0.01048913, - "balance_loss_clip": 1.05397809, - "balance_loss_mlp": 1.02964854, - "epoch": 0.19948895235232225, - "flos": 19025832919680.0, - "grad_norm": 2.282515690517884, - "language_loss": 0.74494618, - "learning_rate": 3.7061496710165233e-06, - "loss": 0.76691592, - "num_input_tokens_seen": 71625695, - "step": 3318, - "time_per_iteration": 2.6815896034240723 - }, - { - "auxiliary_loss_clip": 0.01114942, - "auxiliary_loss_mlp": 0.01044681, - "balance_loss_clip": 1.04767084, - "balance_loss_mlp": 1.02786088, - "epoch": 0.19954907560499022, - "flos": 37815803182080.0, - "grad_norm": 1.8966456913695608, - "language_loss": 0.78894758, - "learning_rate": 3.7059464220103385e-06, - "loss": 0.81054389, - "num_input_tokens_seen": 71648520, - "step": 3319, - "time_per_iteration": 2.847911834716797 - }, - { - "auxiliary_loss_clip": 0.01134557, - "auxiliary_loss_mlp": 0.01042988, - "balance_loss_clip": 1.05354095, - "balance_loss_mlp": 1.02312756, - "epoch": 0.1996091988576582, - "flos": 49565199594240.0, - "grad_norm": 2.1348540211051197, - "language_loss": 0.76006937, - "learning_rate": 3.7057431083144945e-06, - "loss": 0.78184479, - "num_input_tokens_seen": 71672185, - "step": 3320, - "time_per_iteration": 2.9324615001678467 - }, - { - "auxiliary_loss_clip": 0.01120226, - "auxiliary_loss_mlp": 0.01042998, - "balance_loss_clip": 1.05083311, - "balance_loss_mlp": 1.02496171, - "epoch": 0.19966932211032618, - "flos": 22635788659200.0, - "grad_norm": 2.2436863685702546, - "language_loss": 0.80077857, - "learning_rate": 3.705539729936701e-06, - "loss": 0.82241082, - "num_input_tokens_seen": 71692890, - "step": 3321, - "time_per_iteration": 2.7534186840057373 - }, - { - "auxiliary_loss_clip": 0.01033096, - "auxiliary_loss_mlp": 0.01011167, - "balance_loss_clip": 1.02391553, - "balance_loss_mlp": 1.00828266, - "epoch": 0.19972944536299414, - "flos": 54082117745280.0, - "grad_norm": 0.874673110280983, - "language_loss": 0.65145189, - "learning_rate": 3.7053362868846696e-06, - "loss": 0.67189455, - "num_input_tokens_seen": 71745815, - "step": 3322, - "time_per_iteration": 3.0398683547973633 - }, - { - "auxiliary_loss_clip": 0.01039999, - "auxiliary_loss_mlp": 0.01007775, - "balance_loss_clip": 1.02971482, - "balance_loss_mlp": 1.00479472, - "epoch": 0.1997895686156621, - "flos": 69355031817600.0, - "grad_norm": 0.7915334307535052, - "language_loss": 0.56919783, - "learning_rate": 3.7051327791661153e-06, - "loss": 0.58967561, - "num_input_tokens_seen": 71806915, - "step": 3323, - "time_per_iteration": 3.2814581394195557 - }, - { - "auxiliary_loss_clip": 0.01131487, - "auxiliary_loss_mlp": 0.00776139, - "balance_loss_clip": 1.05244064, - "balance_loss_mlp": 1.00085235, - "epoch": 0.19984969186833007, - "flos": 18552063507840.0, - "grad_norm": 1.8766856730809967, - "language_loss": 0.80573648, - "learning_rate": 3.7049292067887555e-06, - "loss": 0.82481277, - "num_input_tokens_seen": 71824645, - "step": 3324, - "time_per_iteration": 2.66456937789917 - }, - { - "auxiliary_loss_clip": 0.01132572, - "auxiliary_loss_mlp": 0.01050254, - "balance_loss_clip": 1.04625165, - "balance_loss_mlp": 1.03027487, - "epoch": 0.19990981512099804, - "flos": 26429678968320.0, - "grad_norm": 2.4535669107623486, - "language_loss": 0.53931105, - "learning_rate": 3.7047255697603092e-06, - "loss": 0.56113935, - "num_input_tokens_seen": 71845125, - "step": 3325, - "time_per_iteration": 2.696556329727173 - }, - { - "auxiliary_loss_clip": 0.01130165, - "auxiliary_loss_mlp": 0.01050725, - "balance_loss_clip": 1.05065942, - "balance_loss_mlp": 1.03328443, - "epoch": 0.19996993837366603, - "flos": 16325997010560.0, - "grad_norm": 2.1570763946475187, - "language_loss": 0.86074936, - "learning_rate": 3.7045218680884984e-06, - "loss": 0.88255823, - "num_input_tokens_seen": 71863500, - "step": 3326, - "time_per_iteration": 2.7167885303497314 - }, - { - "auxiliary_loss_clip": 0.0115173, - "auxiliary_loss_mlp": 0.01042065, - "balance_loss_clip": 1.05427039, - "balance_loss_mlp": 1.02511311, - "epoch": 0.200030061626334, - "flos": 20844169159680.0, - "grad_norm": 2.0419576492150395, - "language_loss": 0.71793801, - "learning_rate": 3.7043181017810476e-06, - "loss": 0.73987597, - "num_input_tokens_seen": 71881845, - "step": 3327, - "time_per_iteration": 2.6097662448883057 - }, - { - "auxiliary_loss_clip": 0.01131035, - "auxiliary_loss_mlp": 0.01052756, - "balance_loss_clip": 1.05146813, - "balance_loss_mlp": 1.03290796, - "epoch": 0.20009018487900196, - "flos": 23762629198080.0, - "grad_norm": 1.8948781463857982, - "language_loss": 0.7668376, - "learning_rate": 3.7041142708456833e-06, - "loss": 0.78867549, - "num_input_tokens_seen": 71900940, - "step": 3328, - "time_per_iteration": 2.6869349479675293 - }, - { - "auxiliary_loss_clip": 0.01118681, - "auxiliary_loss_mlp": 0.01044603, - "balance_loss_clip": 1.04693103, - "balance_loss_mlp": 1.02799726, - "epoch": 0.20015030813166992, - "flos": 28111555440000.0, - "grad_norm": 2.0833377369651984, - "language_loss": 0.69400644, - "learning_rate": 3.7039103752901353e-06, - "loss": 0.71563935, - "num_input_tokens_seen": 71921925, - "step": 3329, - "time_per_iteration": 2.844280481338501 - }, - { - "auxiliary_loss_clip": 0.01107384, - "auxiliary_loss_mlp": 0.01069575, - "balance_loss_clip": 1.04727411, - "balance_loss_mlp": 1.04641271, - "epoch": 0.2002104313843379, - "flos": 26067160955520.0, - "grad_norm": 3.099532194576676, - "language_loss": 0.81395614, - "learning_rate": 3.7037064151221353e-06, - "loss": 0.83572567, - "num_input_tokens_seen": 71941855, - "step": 3330, - "time_per_iteration": 2.841885566711426 - }, - { - "auxiliary_loss_clip": 0.01137825, - "auxiliary_loss_mlp": 0.01048123, - "balance_loss_clip": 1.05147684, - "balance_loss_mlp": 1.02977705, - "epoch": 0.20027055463700585, - "flos": 22966633854720.0, - "grad_norm": 2.224132696455658, - "language_loss": 0.76606882, - "learning_rate": 3.703502390349417e-06, - "loss": 0.78792834, - "num_input_tokens_seen": 71960915, - "step": 3331, - "time_per_iteration": 2.7007360458374023 - }, - { - "auxiliary_loss_clip": 0.01093521, - "auxiliary_loss_mlp": 0.01069739, - "balance_loss_clip": 1.04292202, - "balance_loss_mlp": 1.04851985, - "epoch": 0.20033067788967382, - "flos": 17165660313600.0, - "grad_norm": 2.044808670508971, - "language_loss": 0.79330826, - "learning_rate": 3.7032983009797176e-06, - "loss": 0.81494087, - "num_input_tokens_seen": 71979220, - "step": 3332, - "time_per_iteration": 4.518973112106323 - }, - { - "auxiliary_loss_clip": 0.01046467, - "auxiliary_loss_mlp": 0.010754, - "balance_loss_clip": 1.02134657, - "balance_loss_mlp": 1.07303989, - "epoch": 0.2003908011423418, - "flos": 60825566292480.0, - "grad_norm": 0.9607431077817938, - "language_loss": 0.61968678, - "learning_rate": 3.703094147020776e-06, - "loss": 0.64090544, - "num_input_tokens_seen": 72033950, - "step": 3333, - "time_per_iteration": 3.074782371520996 - }, - { - "auxiliary_loss_clip": 0.01112058, - "auxiliary_loss_mlp": 0.00777645, - "balance_loss_clip": 1.04686844, - "balance_loss_mlp": 1.00099933, - "epoch": 0.20045092439500978, - "flos": 24206234163840.0, - "grad_norm": 2.9954165903614447, - "language_loss": 0.81385547, - "learning_rate": 3.7028899284803334e-06, - "loss": 0.83275253, - "num_input_tokens_seen": 72051395, - "step": 3334, - "time_per_iteration": 4.270732641220093 - }, - { - "auxiliary_loss_clip": 0.01096467, - "auxiliary_loss_mlp": 0.01058699, - "balance_loss_clip": 1.04709518, - "balance_loss_mlp": 1.03889799, - "epoch": 0.20051104764767774, - "flos": 29387605075200.0, - "grad_norm": 2.9016061168315703, - "language_loss": 0.74238038, - "learning_rate": 3.702685645366134e-06, - "loss": 0.76393211, - "num_input_tokens_seen": 72071305, - "step": 3335, - "time_per_iteration": 4.376626491546631 - }, - { - "auxiliary_loss_clip": 0.01149242, - "auxiliary_loss_mlp": 0.01059851, - "balance_loss_clip": 1.05611062, - "balance_loss_mlp": 1.04120684, - "epoch": 0.2005711709003457, - "flos": 23513804709120.0, - "grad_norm": 1.700795836589561, - "language_loss": 0.79981416, - "learning_rate": 3.7024812976859243e-06, - "loss": 0.82190514, - "num_input_tokens_seen": 72090165, - "step": 3336, - "time_per_iteration": 2.7031586170196533 - }, - { - "auxiliary_loss_clip": 0.01116655, - "auxiliary_loss_mlp": 0.01048065, - "balance_loss_clip": 1.04808092, - "balance_loss_mlp": 1.0272038, - "epoch": 0.20063129415301367, - "flos": 22523388024960.0, - "grad_norm": 2.0182523905302157, - "language_loss": 0.7761423, - "learning_rate": 3.7022768854474532e-06, - "loss": 0.79778945, - "num_input_tokens_seen": 72107210, - "step": 3337, - "time_per_iteration": 2.6990835666656494 - }, - { - "auxiliary_loss_clip": 0.01158617, - "auxiliary_loss_mlp": 0.01045618, - "balance_loss_clip": 1.05752003, - "balance_loss_mlp": 1.02631783, - "epoch": 0.20069141740568164, - "flos": 25958243940480.0, - "grad_norm": 2.232061800350416, - "language_loss": 0.69108742, - "learning_rate": 3.7020724086584724e-06, - "loss": 0.71312982, - "num_input_tokens_seen": 72126315, - "step": 3338, - "time_per_iteration": 2.6827659606933594 - }, - { - "auxiliary_loss_clip": 0.01117671, - "auxiliary_loss_mlp": 0.01053755, - "balance_loss_clip": 1.04930723, - "balance_loss_mlp": 1.03543282, - "epoch": 0.2007515406583496, - "flos": 24790608529920.0, - "grad_norm": 2.685005372503905, - "language_loss": 0.68898237, - "learning_rate": 3.701867867326735e-06, - "loss": 0.71069658, - "num_input_tokens_seen": 72146470, - "step": 3339, - "time_per_iteration": 4.430418014526367 - }, - { - "auxiliary_loss_clip": 0.01123098, - "auxiliary_loss_mlp": 0.01041763, - "balance_loss_clip": 1.05656064, - "balance_loss_mlp": 1.02408433, - "epoch": 0.2008116639110176, - "flos": 37925582123520.0, - "grad_norm": 2.0597617887640607, - "language_loss": 0.66606021, - "learning_rate": 3.7016632614599974e-06, - "loss": 0.6877088, - "num_input_tokens_seen": 72166600, - "step": 3340, - "time_per_iteration": 3.0020461082458496 - }, - { - "auxiliary_loss_clip": 0.01145166, - "auxiliary_loss_mlp": 0.01036815, - "balance_loss_clip": 1.05326021, - "balance_loss_mlp": 1.01712155, - "epoch": 0.20087178716368556, - "flos": 20740531443840.0, - "grad_norm": 6.669810478748975, - "language_loss": 0.74554622, - "learning_rate": 3.701458591066019e-06, - "loss": 0.76736599, - "num_input_tokens_seen": 72185160, - "step": 3341, - "time_per_iteration": 2.762573480606079 - }, - { - "auxiliary_loss_clip": 0.01110242, - "auxiliary_loss_mlp": 0.01044424, - "balance_loss_clip": 1.04981375, - "balance_loss_mlp": 1.02595794, - "epoch": 0.20093191041635353, - "flos": 23842279607040.0, - "grad_norm": 7.177474445031109, - "language_loss": 0.71779013, - "learning_rate": 3.70125385615256e-06, - "loss": 0.73933673, - "num_input_tokens_seen": 72205160, - "step": 3342, - "time_per_iteration": 2.7128167152404785 - }, - { - "auxiliary_loss_clip": 0.01114025, - "auxiliary_loss_mlp": 0.01045057, - "balance_loss_clip": 1.05036438, - "balance_loss_mlp": 1.02749765, - "epoch": 0.2009920336690215, - "flos": 21792067119360.0, - "grad_norm": 2.3652416151608873, - "language_loss": 0.72892809, - "learning_rate": 3.701049056727384e-06, - "loss": 0.75051892, - "num_input_tokens_seen": 72223555, - "step": 3343, - "time_per_iteration": 2.8155410289764404 - }, - { - "auxiliary_loss_clip": 0.01113341, - "auxiliary_loss_mlp": 0.01046556, - "balance_loss_clip": 1.04568779, - "balance_loss_mlp": 1.02762532, - "epoch": 0.20105215692168946, - "flos": 26359222440960.0, - "grad_norm": 2.2972411099560195, - "language_loss": 0.80645263, - "learning_rate": 3.7008441927982574e-06, - "loss": 0.82805163, - "num_input_tokens_seen": 72242465, - "step": 3344, - "time_per_iteration": 2.780198335647583 - }, - { - "auxiliary_loss_clip": 0.01155099, - "auxiliary_loss_mlp": 0.01045938, - "balance_loss_clip": 1.05386972, - "balance_loss_mlp": 1.02773499, - "epoch": 0.20111228017435742, - "flos": 18807280617600.0, - "grad_norm": 2.2640230255386125, - "language_loss": 0.83114576, - "learning_rate": 3.700639264372948e-06, - "loss": 0.85315621, - "num_input_tokens_seen": 72260655, - "step": 3345, - "time_per_iteration": 2.6209781169891357 - }, - { - "auxiliary_loss_clip": 0.01093716, - "auxiliary_loss_mlp": 0.01041329, - "balance_loss_clip": 1.04619193, - "balance_loss_mlp": 1.02492619, - "epoch": 0.20117240342702541, - "flos": 19975059682560.0, - "grad_norm": 1.7610524328763844, - "language_loss": 0.67947632, - "learning_rate": 3.7004342714592283e-06, - "loss": 0.70082676, - "num_input_tokens_seen": 72279055, - "step": 3346, - "time_per_iteration": 2.692222833633423 - }, - { - "auxiliary_loss_clip": 0.01114086, - "auxiliary_loss_mlp": 0.01048128, - "balance_loss_clip": 1.04710329, - "balance_loss_mlp": 1.03028262, - "epoch": 0.20123252667969338, - "flos": 23142703345920.0, - "grad_norm": 2.3067659385334958, - "language_loss": 0.72993439, - "learning_rate": 3.70022921406487e-06, - "loss": 0.75155658, - "num_input_tokens_seen": 72297895, - "step": 3347, - "time_per_iteration": 2.7501564025878906 - }, - { - "auxiliary_loss_clip": 0.01142236, - "auxiliary_loss_mlp": 0.01047715, - "balance_loss_clip": 1.05465829, - "balance_loss_mlp": 1.03122878, - "epoch": 0.20129264993236134, - "flos": 23221671396480.0, - "grad_norm": 1.5798788242702444, - "language_loss": 0.86869538, - "learning_rate": 3.70002409219765e-06, - "loss": 0.8905949, - "num_input_tokens_seen": 72318385, - "step": 3348, - "time_per_iteration": 2.688606023788452 - }, - { - "auxiliary_loss_clip": 0.01099793, - "auxiliary_loss_mlp": 0.01045183, - "balance_loss_clip": 1.04737949, - "balance_loss_mlp": 1.02587092, - "epoch": 0.2013527731850293, - "flos": 21871466133120.0, - "grad_norm": 1.8024729376762028, - "language_loss": 0.71082795, - "learning_rate": 3.699818905865346e-06, - "loss": 0.73227775, - "num_input_tokens_seen": 72338235, - "step": 3349, - "time_per_iteration": 2.8423163890838623 - }, - { - "auxiliary_loss_clip": 0.01119982, - "auxiliary_loss_mlp": 0.01044662, - "balance_loss_clip": 1.0504061, - "balance_loss_mlp": 1.02520752, - "epoch": 0.20141289643769728, - "flos": 18040803275520.0, - "grad_norm": 1.7324672298731074, - "language_loss": 0.71324664, - "learning_rate": 3.6996136550757377e-06, - "loss": 0.73489314, - "num_input_tokens_seen": 72357825, - "step": 3350, - "time_per_iteration": 2.7691454887390137 - }, - { - "auxiliary_loss_clip": 0.01126392, - "auxiliary_loss_mlp": 0.01043835, - "balance_loss_clip": 1.0497458, - "balance_loss_mlp": 1.02312887, - "epoch": 0.20147301969036524, - "flos": 23951412103680.0, - "grad_norm": 2.3965463087123107, - "language_loss": 0.76391226, - "learning_rate": 3.69940833983661e-06, - "loss": 0.78561449, - "num_input_tokens_seen": 72376335, - "step": 3351, - "time_per_iteration": 2.701244592666626 - }, - { - "auxiliary_loss_clip": 0.01134085, - "auxiliary_loss_mlp": 0.01047695, - "balance_loss_clip": 1.05303741, - "balance_loss_mlp": 1.02840734, - "epoch": 0.2015331429430332, - "flos": 25588471380480.0, - "grad_norm": 1.5574195085232978, - "language_loss": 0.80808926, - "learning_rate": 3.699202960155748e-06, - "loss": 0.82990712, - "num_input_tokens_seen": 72395440, - "step": 3352, - "time_per_iteration": 2.707792043685913 - }, - { - "auxiliary_loss_clip": 0.011457, - "auxiliary_loss_mlp": 0.01042883, - "balance_loss_clip": 1.05415952, - "balance_loss_mlp": 1.0244298, - "epoch": 0.2015932661957012, - "flos": 26724972677760.0, - "grad_norm": 1.9831574274346238, - "language_loss": 0.80594563, - "learning_rate": 3.6989975160409396e-06, - "loss": 0.82783151, - "num_input_tokens_seen": 72414670, - "step": 3353, - "time_per_iteration": 2.675960063934326 - }, - { - "auxiliary_loss_clip": 0.01126272, - "auxiliary_loss_mlp": 0.01045978, - "balance_loss_clip": 1.05195928, - "balance_loss_mlp": 1.02787042, - "epoch": 0.20165338944836916, - "flos": 15633136592640.0, - "grad_norm": 2.0684163707657763, - "language_loss": 0.90046668, - "learning_rate": 3.6987920074999747e-06, - "loss": 0.92218912, - "num_input_tokens_seen": 72432210, - "step": 3354, - "time_per_iteration": 2.6648361682891846 - }, - { - "auxiliary_loss_clip": 0.0104514, - "auxiliary_loss_mlp": 0.0075774, - "balance_loss_clip": 1.0285337, - "balance_loss_mlp": 1.00170481, - "epoch": 0.20171351270103713, - "flos": 57912529207680.0, - "grad_norm": 0.8264169258847935, - "language_loss": 0.55863291, - "learning_rate": 3.6985864345406465e-06, - "loss": 0.57666171, - "num_input_tokens_seen": 72489225, - "step": 3355, - "time_per_iteration": 3.155352830886841 - }, - { - "auxiliary_loss_clip": 0.01127799, - "auxiliary_loss_mlp": 0.00776255, - "balance_loss_clip": 1.05133796, - "balance_loss_mlp": 1.00109434, - "epoch": 0.2017736359537051, - "flos": 20814363849600.0, - "grad_norm": 1.8367443502770229, - "language_loss": 0.84333616, - "learning_rate": 3.698380797170751e-06, - "loss": 0.86237669, - "num_input_tokens_seen": 72508715, - "step": 3356, - "time_per_iteration": 2.754645586013794 - }, - { - "auxiliary_loss_clip": 0.01127514, - "auxiliary_loss_mlp": 0.01052066, - "balance_loss_clip": 1.04904747, - "balance_loss_mlp": 1.02811635, - "epoch": 0.20183375920637306, - "flos": 17092043389440.0, - "grad_norm": 3.2349249330618504, - "language_loss": 0.70046175, - "learning_rate": 3.698175095398085e-06, - "loss": 0.72225749, - "num_input_tokens_seen": 72525135, - "step": 3357, - "time_per_iteration": 2.6905863285064697 - }, - { - "auxiliary_loss_clip": 0.0113535, - "auxiliary_loss_mlp": 0.01044956, - "balance_loss_clip": 1.05209541, - "balance_loss_mlp": 1.02590632, - "epoch": 0.20189388245904102, - "flos": 18661339658880.0, - "grad_norm": 2.41944886120848, - "language_loss": 0.7169627, - "learning_rate": 3.6979693292304493e-06, - "loss": 0.73876572, - "num_input_tokens_seen": 72543690, - "step": 3358, - "time_per_iteration": 2.696295738220215 - }, - { - "auxiliary_loss_clip": 0.01139673, - "auxiliary_loss_mlp": 0.01052145, - "balance_loss_clip": 1.05050206, - "balance_loss_mlp": 1.03496706, - "epoch": 0.20195400571170902, - "flos": 16797539779200.0, - "grad_norm": 2.6870341127491675, - "language_loss": 0.83242267, - "learning_rate": 3.6977634986756463e-06, - "loss": 0.85434085, - "num_input_tokens_seen": 72560725, - "step": 3359, - "time_per_iteration": 2.6779677867889404 - }, - { - "auxiliary_loss_clip": 0.01052166, - "auxiliary_loss_mlp": 0.01026452, - "balance_loss_clip": 1.02534354, - "balance_loss_mlp": 1.02345943, - "epoch": 0.20201412896437698, - "flos": 67174716268800.0, - "grad_norm": 0.8259567660078829, - "language_loss": 0.58980465, - "learning_rate": 3.697557603741482e-06, - "loss": 0.61059082, - "num_input_tokens_seen": 72621940, - "step": 3360, - "time_per_iteration": 3.1175289154052734 - }, - { - "auxiliary_loss_clip": 0.01096543, - "auxiliary_loss_mlp": 0.01051237, - "balance_loss_clip": 1.05081403, - "balance_loss_mlp": 1.03154337, - "epoch": 0.20207425221704495, - "flos": 21325013550720.0, - "grad_norm": 2.668010943284884, - "language_loss": 0.63219774, - "learning_rate": 3.697351644435763e-06, - "loss": 0.65367556, - "num_input_tokens_seen": 72639135, - "step": 3361, - "time_per_iteration": 2.7732017040252686 - }, - { - "auxiliary_loss_clip": 0.01119862, - "auxiliary_loss_mlp": 0.01069748, - "balance_loss_clip": 1.04988885, - "balance_loss_mlp": 1.05035317, - "epoch": 0.2021343754697129, - "flos": 22527158952960.0, - "grad_norm": 1.9150118782569074, - "language_loss": 0.75946522, - "learning_rate": 3.6971456207662993e-06, - "loss": 0.78136134, - "num_input_tokens_seen": 72658525, - "step": 3362, - "time_per_iteration": 2.755686044692993 - }, - { - "auxiliary_loss_clip": 0.01139499, - "auxiliary_loss_mlp": 0.00777827, - "balance_loss_clip": 1.05068207, - "balance_loss_mlp": 1.0011797, - "epoch": 0.20219449872238088, - "flos": 19062785036160.0, - "grad_norm": 2.043450343479612, - "language_loss": 0.76542944, - "learning_rate": 3.6969395327409035e-06, - "loss": 0.78460264, - "num_input_tokens_seen": 72678085, - "step": 3363, - "time_per_iteration": 2.788773775100708 - }, - { - "auxiliary_loss_clip": 0.01143235, - "auxiliary_loss_mlp": 0.01068217, - "balance_loss_clip": 1.05241406, - "balance_loss_mlp": 1.0511229, - "epoch": 0.20225462197504884, - "flos": 24717027519360.0, - "grad_norm": 1.8380065969237507, - "language_loss": 0.75088942, - "learning_rate": 3.696733380367391e-06, - "loss": 0.773004, - "num_input_tokens_seen": 72698695, - "step": 3364, - "time_per_iteration": 2.7484803199768066 - }, - { - "auxiliary_loss_clip": 0.01111683, - "auxiliary_loss_mlp": 0.01065374, - "balance_loss_clip": 1.05202723, - "balance_loss_mlp": 1.04583549, - "epoch": 0.2023147452277168, - "flos": 22018304931840.0, - "grad_norm": 2.1478979049108395, - "language_loss": 0.71917796, - "learning_rate": 3.6965271636535783e-06, - "loss": 0.7409485, - "num_input_tokens_seen": 72717880, - "step": 3365, - "time_per_iteration": 2.770939350128174 - }, - { - "auxiliary_loss_clip": 0.01110149, - "auxiliary_loss_mlp": 0.01064133, - "balance_loss_clip": 1.04989934, - "balance_loss_mlp": 1.04559648, - "epoch": 0.2023748684803848, - "flos": 17745365911680.0, - "grad_norm": 2.2136098995040228, - "language_loss": 0.85318875, - "learning_rate": 3.696320882607286e-06, - "loss": 0.87493157, - "num_input_tokens_seen": 72736410, - "step": 3366, - "time_per_iteration": 2.717759609222412 - }, - { - "auxiliary_loss_clip": 0.01116913, - "auxiliary_loss_mlp": 0.0106476, - "balance_loss_clip": 1.050488, - "balance_loss_mlp": 1.04605615, - "epoch": 0.20243499173305277, - "flos": 31138932493440.0, - "grad_norm": 2.048733189447585, - "language_loss": 0.69766563, - "learning_rate": 3.696114537236335e-06, - "loss": 0.71948242, - "num_input_tokens_seen": 72758295, - "step": 3367, - "time_per_iteration": 2.788444995880127 - }, - { - "auxiliary_loss_clip": 0.01144949, - "auxiliary_loss_mlp": 0.01060722, - "balance_loss_clip": 1.04997301, - "balance_loss_mlp": 1.03857303, - "epoch": 0.20249511498572073, - "flos": 33839235279360.0, - "grad_norm": 1.942153338299175, - "language_loss": 0.68162113, - "learning_rate": 3.6959081275485512e-06, - "loss": 0.70367789, - "num_input_tokens_seen": 72782495, - "step": 3368, - "time_per_iteration": 2.7339746952056885 - }, - { - "auxiliary_loss_clip": 0.01123527, - "auxiliary_loss_mlp": 0.01063426, - "balance_loss_clip": 1.0543493, - "balance_loss_mlp": 1.04405439, - "epoch": 0.2025552382383887, - "flos": 21215629658880.0, - "grad_norm": 1.8860162071579365, - "language_loss": 0.77298439, - "learning_rate": 3.6957016535517615e-06, - "loss": 0.79485393, - "num_input_tokens_seen": 72801885, - "step": 3369, - "time_per_iteration": 2.739088535308838 - }, - { - "auxiliary_loss_clip": 0.01136965, - "auxiliary_loss_mlp": 0.01071822, - "balance_loss_clip": 1.05140853, - "balance_loss_mlp": 1.05315351, - "epoch": 0.20261536149105666, - "flos": 14647388676480.0, - "grad_norm": 2.9806431283259354, - "language_loss": 0.65055734, - "learning_rate": 3.695495115253795e-06, - "loss": 0.67264521, - "num_input_tokens_seen": 72816990, - "step": 3370, - "time_per_iteration": 2.7082977294921875 - }, - { - "auxiliary_loss_clip": 0.0105828, - "auxiliary_loss_mlp": 0.01019528, - "balance_loss_clip": 1.03235602, - "balance_loss_mlp": 1.01690567, - "epoch": 0.20267548474372463, - "flos": 66783649921920.0, - "grad_norm": 0.678414814309544, - "language_loss": 0.58126765, - "learning_rate": 3.6952885126624834e-06, - "loss": 0.60204571, - "num_input_tokens_seen": 72879240, - "step": 3371, - "time_per_iteration": 4.805691242218018 - }, - { - "auxiliary_loss_clip": 0.01117624, - "auxiliary_loss_mlp": 0.01050757, - "balance_loss_clip": 1.04833245, - "balance_loss_mlp": 1.0329231, - "epoch": 0.2027356079963926, - "flos": 24680793674880.0, - "grad_norm": 2.167047343870177, - "language_loss": 0.91830015, - "learning_rate": 3.6950818457856617e-06, - "loss": 0.9399839, - "num_input_tokens_seen": 72899030, - "step": 3372, - "time_per_iteration": 4.306687831878662 - }, - { - "auxiliary_loss_clip": 0.01137734, - "auxiliary_loss_mlp": 0.01057192, - "balance_loss_clip": 1.05065978, - "balance_loss_mlp": 1.03598428, - "epoch": 0.20279573124906058, - "flos": 26392762765440.0, - "grad_norm": 2.1240220719821195, - "language_loss": 0.78505349, - "learning_rate": 3.694875114631167e-06, - "loss": 0.80700278, - "num_input_tokens_seen": 72919190, - "step": 3373, - "time_per_iteration": 4.223219394683838 - }, - { - "auxiliary_loss_clip": 0.01091396, - "auxiliary_loss_mlp": 0.01058555, - "balance_loss_clip": 1.04464257, - "balance_loss_mlp": 1.03719246, - "epoch": 0.20285585450172855, - "flos": 33799984692480.0, - "grad_norm": 2.5403716567908745, - "language_loss": 0.71275264, - "learning_rate": 3.6946683192068377e-06, - "loss": 0.7342521, - "num_input_tokens_seen": 72939720, - "step": 3374, - "time_per_iteration": 2.853079319000244 - }, - { - "auxiliary_loss_clip": 0.01042818, - "auxiliary_loss_mlp": 0.01010518, - "balance_loss_clip": 1.02580416, - "balance_loss_mlp": 1.00797904, - "epoch": 0.20291597775439651, - "flos": 71164823598720.0, - "grad_norm": 0.9711663240936556, - "language_loss": 0.62466931, - "learning_rate": 3.694461459520516e-06, - "loss": 0.64520264, - "num_input_tokens_seen": 73000015, - "step": 3375, - "time_per_iteration": 3.2016799449920654 - }, - { - "auxiliary_loss_clip": 0.01153133, - "auxiliary_loss_mlp": 0.01048539, - "balance_loss_clip": 1.05278802, - "balance_loss_mlp": 1.03021622, - "epoch": 0.20297610100706448, - "flos": 19494287118720.0, - "grad_norm": 1.613636998778186, - "language_loss": 0.82316196, - "learning_rate": 3.6942545355800463e-06, - "loss": 0.84517872, - "num_input_tokens_seen": 73017675, - "step": 3376, - "time_per_iteration": 2.6073458194732666 - }, - { - "auxiliary_loss_clip": 0.01142412, - "auxiliary_loss_mlp": 0.01038523, - "balance_loss_clip": 1.0506475, - "balance_loss_mlp": 1.01912737, - "epoch": 0.20303622425973245, - "flos": 25044245441280.0, - "grad_norm": 2.0454517065820026, - "language_loss": 0.81243992, - "learning_rate": 3.6940475473932743e-06, - "loss": 0.83424926, - "num_input_tokens_seen": 73036135, - "step": 3377, - "time_per_iteration": 2.6802914142608643 - }, - { - "auxiliary_loss_clip": 0.01127133, - "auxiliary_loss_mlp": 0.01049784, - "balance_loss_clip": 1.05416846, - "balance_loss_mlp": 1.03053212, - "epoch": 0.2030963475124004, - "flos": 21979988098560.0, - "grad_norm": 1.9719049052811064, - "language_loss": 0.76726258, - "learning_rate": 3.69384049496805e-06, - "loss": 0.78903174, - "num_input_tokens_seen": 73054075, - "step": 3378, - "time_per_iteration": 2.7052531242370605 - }, - { - "auxiliary_loss_clip": 0.01087342, - "auxiliary_loss_mlp": 0.01049115, - "balance_loss_clip": 1.04531622, - "balance_loss_mlp": 1.02726364, - "epoch": 0.2031564707650684, - "flos": 19500392430720.0, - "grad_norm": 2.0079998756584017, - "language_loss": 0.7982831, - "learning_rate": 3.6936333783122242e-06, - "loss": 0.81964767, - "num_input_tokens_seen": 73073530, - "step": 3379, - "time_per_iteration": 4.379331588745117 - }, - { - "auxiliary_loss_clip": 0.01139431, - "auxiliary_loss_mlp": 0.01039085, - "balance_loss_clip": 1.05384874, - "balance_loss_mlp": 1.02164412, - "epoch": 0.20321659401773637, - "flos": 22747075971840.0, - "grad_norm": 1.5868581768713355, - "language_loss": 0.86639273, - "learning_rate": 3.6934261974336505e-06, - "loss": 0.88817787, - "num_input_tokens_seen": 73092820, - "step": 3380, - "time_per_iteration": 2.7405402660369873 - }, - { - "auxiliary_loss_clip": 0.01156702, - "auxiliary_loss_mlp": 0.01053775, - "balance_loss_clip": 1.05730438, - "balance_loss_mlp": 1.03507149, - "epoch": 0.20327671727040433, - "flos": 22455840499200.0, - "grad_norm": 2.063467458189152, - "language_loss": 0.74637043, - "learning_rate": 3.693218952340186e-06, - "loss": 0.76847517, - "num_input_tokens_seen": 73113385, - "step": 3381, - "time_per_iteration": 2.6237549781799316 - }, - { - "auxiliary_loss_clip": 0.01118794, - "auxiliary_loss_mlp": 0.01042351, - "balance_loss_clip": 1.04590273, - "balance_loss_mlp": 1.02289653, - "epoch": 0.2033368405230723, - "flos": 19535010163200.0, - "grad_norm": 1.6994666268173182, - "language_loss": 0.79167414, - "learning_rate": 3.6930116430396895e-06, - "loss": 0.81328559, - "num_input_tokens_seen": 73131195, - "step": 3382, - "time_per_iteration": 2.6707420349121094 - }, - { - "auxiliary_loss_clip": 0.01113758, - "auxiliary_loss_mlp": 0.00779415, - "balance_loss_clip": 1.0459373, - "balance_loss_mlp": 1.00091934, - "epoch": 0.20339696377574026, - "flos": 13809233744640.0, - "grad_norm": 1.9483404178521286, - "language_loss": 0.8042953, - "learning_rate": 3.6928042695400214e-06, - "loss": 0.82322699, - "num_input_tokens_seen": 73148850, - "step": 3383, - "time_per_iteration": 2.7859487533569336 - }, - { - "auxiliary_loss_clip": 0.01100731, - "auxiliary_loss_mlp": 0.01046151, - "balance_loss_clip": 1.04473877, - "balance_loss_mlp": 1.02621913, - "epoch": 0.20345708702840823, - "flos": 20339409288960.0, - "grad_norm": 3.0507793260875693, - "language_loss": 0.74539214, - "learning_rate": 3.6925968318490464e-06, - "loss": 0.76686096, - "num_input_tokens_seen": 73166775, - "step": 3384, - "time_per_iteration": 2.802645206451416 - }, - { - "auxiliary_loss_clip": 0.0114772, - "auxiliary_loss_mlp": 0.01042851, - "balance_loss_clip": 1.05207324, - "balance_loss_mlp": 1.02232289, - "epoch": 0.2035172102810762, - "flos": 20333950421760.0, - "grad_norm": 7.661095363155204, - "language_loss": 0.76801658, - "learning_rate": 3.6923893299746293e-06, - "loss": 0.7899223, - "num_input_tokens_seen": 73183215, - "step": 3385, - "time_per_iteration": 2.823343515396118 - }, - { - "auxiliary_loss_clip": 0.01107407, - "auxiliary_loss_mlp": 0.01063941, - "balance_loss_clip": 1.04730904, - "balance_loss_mlp": 1.04331779, - "epoch": 0.2035773335337442, - "flos": 23330983461120.0, - "grad_norm": 41.05937457193927, - "language_loss": 0.68458641, - "learning_rate": 3.692181763924639e-06, - "loss": 0.70629984, - "num_input_tokens_seen": 73203290, - "step": 3386, - "time_per_iteration": 2.830810546875 - }, - { - "auxiliary_loss_clip": 0.01104248, - "auxiliary_loss_mlp": 0.01064893, - "balance_loss_clip": 1.04774165, - "balance_loss_mlp": 1.04379284, - "epoch": 0.20363745678641215, - "flos": 28330287310080.0, - "grad_norm": 3.4161658794101384, - "language_loss": 0.80985248, - "learning_rate": 3.691974133706947e-06, - "loss": 0.83154386, - "num_input_tokens_seen": 73226185, - "step": 3387, - "time_per_iteration": 2.8204662799835205 - }, - { - "auxiliary_loss_clip": 0.0112504, - "auxiliary_loss_mlp": 0.01049361, - "balance_loss_clip": 1.05224109, - "balance_loss_mlp": 1.03000104, - "epoch": 0.20369758003908012, - "flos": 18915658928640.0, - "grad_norm": 2.703878094865874, - "language_loss": 0.7988956, - "learning_rate": 3.6917664393294262e-06, - "loss": 0.82063961, - "num_input_tokens_seen": 73243300, - "step": 3388, - "time_per_iteration": 2.687053918838501 - }, - { - "auxiliary_loss_clip": 0.01157403, - "auxiliary_loss_mlp": 0.01048089, - "balance_loss_clip": 1.05471182, - "balance_loss_mlp": 1.0281812, - "epoch": 0.20375770329174808, - "flos": 19206499351680.0, - "grad_norm": 1.8133180655285324, - "language_loss": 0.7184962, - "learning_rate": 3.6915586807999527e-06, - "loss": 0.74055111, - "num_input_tokens_seen": 73261490, - "step": 3389, - "time_per_iteration": 2.614321708679199 - }, - { - "auxiliary_loss_clip": 0.01141855, - "auxiliary_loss_mlp": 0.01054311, - "balance_loss_clip": 1.05387521, - "balance_loss_mlp": 1.0351541, - "epoch": 0.20381782654441605, - "flos": 19391008538880.0, - "grad_norm": 1.8982692343761227, - "language_loss": 0.87280858, - "learning_rate": 3.691350858126404e-06, - "loss": 0.89477026, - "num_input_tokens_seen": 73280180, - "step": 3390, - "time_per_iteration": 2.6770312786102295 - }, - { - "auxiliary_loss_clip": 0.01125093, - "auxiliary_loss_mlp": 0.01052498, - "balance_loss_clip": 1.05142403, - "balance_loss_mlp": 1.03129053, - "epoch": 0.203877949797084, - "flos": 24827704300800.0, - "grad_norm": 2.3308941901233355, - "language_loss": 0.71194077, - "learning_rate": 3.691142971316662e-06, - "loss": 0.73371667, - "num_input_tokens_seen": 73300680, - "step": 3391, - "time_per_iteration": 2.7198221683502197 - }, - { - "auxiliary_loss_clip": 0.01120121, - "auxiliary_loss_mlp": 0.01051383, - "balance_loss_clip": 1.05222178, - "balance_loss_mlp": 1.0318923, - "epoch": 0.20393807304975198, - "flos": 18003707504640.0, - "grad_norm": 2.4765720957839217, - "language_loss": 0.86745828, - "learning_rate": 3.6909350203786086e-06, - "loss": 0.88917333, - "num_input_tokens_seen": 73316760, - "step": 3392, - "time_per_iteration": 2.6961052417755127 - }, - { - "auxiliary_loss_clip": 0.01145712, - "auxiliary_loss_mlp": 0.01051212, - "balance_loss_clip": 1.05204964, - "balance_loss_mlp": 1.03236461, - "epoch": 0.20399819630241997, - "flos": 24206988349440.0, - "grad_norm": 1.665333238668028, - "language_loss": 0.80659354, - "learning_rate": 3.69072700532013e-06, - "loss": 0.82856286, - "num_input_tokens_seen": 73339385, - "step": 3393, - "time_per_iteration": 2.6883490085601807 - }, - { - "auxiliary_loss_clip": 0.01123025, - "auxiliary_loss_mlp": 0.010424, - "balance_loss_clip": 1.04751348, - "balance_loss_mlp": 1.02385163, - "epoch": 0.20405831955508794, - "flos": 20777124424320.0, - "grad_norm": 1.8745864895680615, - "language_loss": 0.86126244, - "learning_rate": 3.6905189261491137e-06, - "loss": 0.88291663, - "num_input_tokens_seen": 73357235, - "step": 3394, - "time_per_iteration": 2.758887767791748 - }, - { - "auxiliary_loss_clip": 0.0114219, - "auxiliary_loss_mlp": 0.01049288, - "balance_loss_clip": 1.05699492, - "balance_loss_mlp": 1.03088212, - "epoch": 0.2041184428077559, - "flos": 15486908325120.0, - "grad_norm": 2.5133342949273416, - "language_loss": 0.83761692, - "learning_rate": 3.69031078287345e-06, - "loss": 0.85953164, - "num_input_tokens_seen": 73374435, - "step": 3395, - "time_per_iteration": 2.6468729972839355 - }, - { - "auxiliary_loss_clip": 0.01145796, - "auxiliary_loss_mlp": 0.01039804, - "balance_loss_clip": 1.05311751, - "balance_loss_mlp": 1.0200156, - "epoch": 0.20417856606042387, - "flos": 15588463052160.0, - "grad_norm": 2.8477422591662376, - "language_loss": 0.83736277, - "learning_rate": 3.690102575501033e-06, - "loss": 0.85921878, - "num_input_tokens_seen": 73391025, - "step": 3396, - "time_per_iteration": 2.6296958923339844 - }, - { - "auxiliary_loss_clip": 0.01112843, - "auxiliary_loss_mlp": 0.01045334, - "balance_loss_clip": 1.04787922, - "balance_loss_mlp": 1.02616525, - "epoch": 0.20423868931309183, - "flos": 24279348297600.0, - "grad_norm": 2.1192113228666303, - "language_loss": 0.77199841, - "learning_rate": 3.6898943040397556e-06, - "loss": 0.79358017, - "num_input_tokens_seen": 73409270, - "step": 3397, - "time_per_iteration": 2.776784896850586 - }, - { - "auxiliary_loss_clip": 0.01128614, - "auxiliary_loss_mlp": 0.01050131, - "balance_loss_clip": 1.05143905, - "balance_loss_mlp": 1.03264332, - "epoch": 0.2042988125657598, - "flos": 18614870438400.0, - "grad_norm": 3.16091809956727, - "language_loss": 0.8791461, - "learning_rate": 3.689685968497518e-06, - "loss": 0.9009335, - "num_input_tokens_seen": 73425225, - "step": 3398, - "time_per_iteration": 2.6866374015808105 - }, - { - "auxiliary_loss_clip": 0.01126796, - "auxiliary_loss_mlp": 0.01052169, - "balance_loss_clip": 1.05476117, - "balance_loss_mlp": 1.03316689, - "epoch": 0.2043589358184278, - "flos": 17851230270720.0, - "grad_norm": 2.139785862197821, - "language_loss": 0.78045064, - "learning_rate": 3.6894775688822186e-06, - "loss": 0.80224031, - "num_input_tokens_seen": 73440940, - "step": 3399, - "time_per_iteration": 2.6545825004577637 - }, - { - "auxiliary_loss_clip": 0.01144155, - "auxiliary_loss_mlp": 0.01042424, - "balance_loss_clip": 1.05252838, - "balance_loss_mlp": 1.02299261, - "epoch": 0.20441905907109575, - "flos": 21435223455360.0, - "grad_norm": 3.6374157446104802, - "language_loss": 0.76563728, - "learning_rate": 3.6892691052017603e-06, - "loss": 0.787503, - "num_input_tokens_seen": 73458805, - "step": 3400, - "time_per_iteration": 2.7279481887817383 - }, - { - "auxiliary_loss_clip": 0.01121071, - "auxiliary_loss_mlp": 0.00776799, - "balance_loss_clip": 1.05304742, - "balance_loss_mlp": 1.00072634, - "epoch": 0.20447918232376372, - "flos": 27707703851520.0, - "grad_norm": 1.8758513970592474, - "language_loss": 0.79382575, - "learning_rate": 3.6890605774640487e-06, - "loss": 0.81280446, - "num_input_tokens_seen": 73479380, - "step": 3401, - "time_per_iteration": 2.7918031215667725 - }, - { - "auxiliary_loss_clip": 0.01131319, - "auxiliary_loss_mlp": 0.01044892, - "balance_loss_clip": 1.0484674, - "balance_loss_mlp": 1.02540183, - "epoch": 0.20453930557643168, - "flos": 30524214113280.0, - "grad_norm": 2.2159471948141034, - "language_loss": 0.69798994, - "learning_rate": 3.688851985676991e-06, - "loss": 0.71975207, - "num_input_tokens_seen": 73505105, - "step": 3402, - "time_per_iteration": 2.79670786857605 - }, - { - "auxiliary_loss_clip": 0.01120554, - "auxiliary_loss_mlp": 0.01043946, - "balance_loss_clip": 1.05060196, - "balance_loss_mlp": 1.02439535, - "epoch": 0.20459942882909965, - "flos": 18987767481600.0, - "grad_norm": 1.7908768446457861, - "language_loss": 0.81114817, - "learning_rate": 3.688643329848496e-06, - "loss": 0.83279312, - "num_input_tokens_seen": 73523700, - "step": 3403, - "time_per_iteration": 2.70182728767395 - }, - { - "auxiliary_loss_clip": 0.01144248, - "auxiliary_loss_mlp": 0.01041199, - "balance_loss_clip": 1.05348516, - "balance_loss_mlp": 1.02295971, - "epoch": 0.20465955208176762, - "flos": 20339050152960.0, - "grad_norm": 2.511955552730785, - "language_loss": 0.83403814, - "learning_rate": 3.6884346099864772e-06, - "loss": 0.8558926, - "num_input_tokens_seen": 73542625, - "step": 3404, - "time_per_iteration": 2.630807399749756 - }, - { - "auxiliary_loss_clip": 0.01138937, - "auxiliary_loss_mlp": 0.01048101, - "balance_loss_clip": 1.04838705, - "balance_loss_mlp": 1.0292058, - "epoch": 0.20471967533443558, - "flos": 21251288885760.0, - "grad_norm": 1.7149716538767368, - "language_loss": 0.86209136, - "learning_rate": 3.6882258260988487e-06, - "loss": 0.88396174, - "num_input_tokens_seen": 73561450, - "step": 3405, - "time_per_iteration": 2.6076929569244385 - }, - { - "auxiliary_loss_clip": 0.01116224, - "auxiliary_loss_mlp": 0.0104429, - "balance_loss_clip": 1.05039132, - "balance_loss_mlp": 1.02621806, - "epoch": 0.20477979858710357, - "flos": 14501555458560.0, - "grad_norm": 2.1633598971137435, - "language_loss": 0.84356105, - "learning_rate": 3.6880169781935276e-06, - "loss": 0.86516619, - "num_input_tokens_seen": 73577155, - "step": 3406, - "time_per_iteration": 2.768890142440796 - }, - { - "auxiliary_loss_clip": 0.01152751, - "auxiliary_loss_mlp": 0.01039548, - "balance_loss_clip": 1.0542599, - "balance_loss_mlp": 1.02191663, - "epoch": 0.20483992183977154, - "flos": 11400310085760.0, - "grad_norm": 2.4892039461455675, - "language_loss": 0.67453218, - "learning_rate": 3.6878080662784336e-06, - "loss": 0.69645512, - "num_input_tokens_seen": 73594900, - "step": 3407, - "time_per_iteration": 2.5661377906799316 - }, - { - "auxiliary_loss_clip": 0.0115175, - "auxiliary_loss_mlp": 0.01050505, - "balance_loss_clip": 1.05328465, - "balance_loss_mlp": 1.03294516, - "epoch": 0.2049000450924395, - "flos": 19060271084160.0, - "grad_norm": 2.4363182538361285, - "language_loss": 0.84214294, - "learning_rate": 3.6875990903614886e-06, - "loss": 0.86416554, - "num_input_tokens_seen": 73613810, - "step": 3408, - "time_per_iteration": 2.585186004638672 - }, - { - "auxiliary_loss_clip": 0.01154901, - "auxiliary_loss_mlp": 0.01042295, - "balance_loss_clip": 1.0536257, - "balance_loss_mlp": 1.02471161, - "epoch": 0.20496016834510747, - "flos": 14574561851520.0, - "grad_norm": 2.317815935455145, - "language_loss": 0.63898516, - "learning_rate": 3.6873900504506166e-06, - "loss": 0.6609571, - "num_input_tokens_seen": 73631495, - "step": 3409, - "time_per_iteration": 2.5877959728240967 - }, - { - "auxiliary_loss_clip": 0.0113795, - "auxiliary_loss_mlp": 0.01042481, - "balance_loss_clip": 1.04903567, - "balance_loss_mlp": 1.02409852, - "epoch": 0.20502029159777543, - "flos": 22126647329280.0, - "grad_norm": 1.3925959707869588, - "language_loss": 0.80547982, - "learning_rate": 3.687180946553745e-06, - "loss": 0.8272841, - "num_input_tokens_seen": 73652840, - "step": 3410, - "time_per_iteration": 4.1697752475738525 - }, - { - "auxiliary_loss_clip": 0.01099823, - "auxiliary_loss_mlp": 0.01046015, - "balance_loss_clip": 1.05186486, - "balance_loss_mlp": 1.02820492, - "epoch": 0.2050804148504434, - "flos": 25367907916800.0, - "grad_norm": 2.407452066099965, - "language_loss": 0.75804615, - "learning_rate": 3.686971778678803e-06, - "loss": 0.77950454, - "num_input_tokens_seen": 73672150, - "step": 3411, - "time_per_iteration": 2.8072102069854736 - }, - { - "auxiliary_loss_clip": 0.0113879, - "auxiliary_loss_mlp": 0.01046868, - "balance_loss_clip": 1.05501246, - "balance_loss_mlp": 1.02887905, - "epoch": 0.2051405381031114, - "flos": 23620171858560.0, - "grad_norm": 2.4936494073109445, - "language_loss": 0.73356283, - "learning_rate": 3.686762546833722e-06, - "loss": 0.75541937, - "num_input_tokens_seen": 73691940, - "step": 3412, - "time_per_iteration": 5.778446912765503 - }, - { - "auxiliary_loss_clip": 0.01127692, - "auxiliary_loss_mlp": 0.01057937, - "balance_loss_clip": 1.04926813, - "balance_loss_mlp": 1.03748107, - "epoch": 0.20520066135577936, - "flos": 19565533745280.0, - "grad_norm": 2.3541654180764353, - "language_loss": 0.77958596, - "learning_rate": 3.6865532510264362e-06, - "loss": 0.80144227, - "num_input_tokens_seen": 73709080, - "step": 3413, - "time_per_iteration": 2.6457245349884033 - }, - { - "auxiliary_loss_clip": 0.0110869, - "auxiliary_loss_mlp": 0.01047866, - "balance_loss_clip": 1.04991519, - "balance_loss_mlp": 1.02862608, - "epoch": 0.20526078460844732, - "flos": 17676345928320.0, - "grad_norm": 2.4834314093653673, - "language_loss": 0.85112405, - "learning_rate": 3.6863438912648823e-06, - "loss": 0.8726896, - "num_input_tokens_seen": 73727670, - "step": 3414, - "time_per_iteration": 2.7343668937683105 - }, - { - "auxiliary_loss_clip": 0.01140219, - "auxiliary_loss_mlp": 0.01039448, - "balance_loss_clip": 1.05012155, - "balance_loss_mlp": 1.02118468, - "epoch": 0.2053209078611153, - "flos": 21500328856320.0, - "grad_norm": 2.0410772094937433, - "language_loss": 0.80372798, - "learning_rate": 3.6861344675569986e-06, - "loss": 0.82552463, - "num_input_tokens_seen": 73747170, - "step": 3415, - "time_per_iteration": 2.6669082641601562 - }, - { - "auxiliary_loss_clip": 0.01087022, - "auxiliary_loss_mlp": 0.01042771, - "balance_loss_clip": 1.04786301, - "balance_loss_mlp": 1.02643943, - "epoch": 0.20538103111378325, - "flos": 25663524848640.0, - "grad_norm": 1.941742032659622, - "language_loss": 0.72958827, - "learning_rate": 3.6859249799107275e-06, - "loss": 0.75088626, - "num_input_tokens_seen": 73767690, - "step": 3416, - "time_per_iteration": 2.892782211303711 - }, - { - "auxiliary_loss_clip": 0.01145149, - "auxiliary_loss_mlp": 0.01044328, - "balance_loss_clip": 1.05453372, - "balance_loss_mlp": 1.02577877, - "epoch": 0.20544115436645122, - "flos": 23148952312320.0, - "grad_norm": 2.508583707985938, - "language_loss": 0.78741407, - "learning_rate": 3.6857154283340115e-06, - "loss": 0.80930889, - "num_input_tokens_seen": 73786900, - "step": 3417, - "time_per_iteration": 2.7298929691314697 - }, - { - "auxiliary_loss_clip": 0.01145459, - "auxiliary_loss_mlp": 0.0104683, - "balance_loss_clip": 1.0536468, - "balance_loss_mlp": 1.02819777, - "epoch": 0.20550127761911918, - "flos": 19390433921280.0, - "grad_norm": 2.4305498920504043, - "language_loss": 0.8729043, - "learning_rate": 3.685505812834798e-06, - "loss": 0.89482725, - "num_input_tokens_seen": 73804515, - "step": 3418, - "time_per_iteration": 4.382033109664917 - }, - { - "auxiliary_loss_clip": 0.01140182, - "auxiliary_loss_mlp": 0.01046543, - "balance_loss_clip": 1.05682349, - "balance_loss_mlp": 1.02776778, - "epoch": 0.20556140087178718, - "flos": 22893124671360.0, - "grad_norm": 14.690715253896212, - "language_loss": 0.62538671, - "learning_rate": 3.685296133421035e-06, - "loss": 0.64725399, - "num_input_tokens_seen": 73822910, - "step": 3419, - "time_per_iteration": 2.7318668365478516 - }, - { - "auxiliary_loss_clip": 0.01139691, - "auxiliary_loss_mlp": 0.01046928, - "balance_loss_clip": 1.05550981, - "balance_loss_mlp": 1.02651954, - "epoch": 0.20562152412445514, - "flos": 19789652655360.0, - "grad_norm": 1.8153871521224594, - "language_loss": 0.86339438, - "learning_rate": 3.685086390100674e-06, - "loss": 0.88526058, - "num_input_tokens_seen": 73841160, - "step": 3420, - "time_per_iteration": 2.723606824874878 - }, - { - "auxiliary_loss_clip": 0.01104401, - "auxiliary_loss_mlp": 0.00780617, - "balance_loss_clip": 1.04621911, - "balance_loss_mlp": 1.00071514, - "epoch": 0.2056816473771231, - "flos": 31501989210240.0, - "grad_norm": 2.3982854973621954, - "language_loss": 0.7127136, - "learning_rate": 3.684876582881668e-06, - "loss": 0.73156381, - "num_input_tokens_seen": 73862795, - "step": 3421, - "time_per_iteration": 2.8138315677642822 - }, - { - "auxiliary_loss_clip": 0.01153254, - "auxiliary_loss_mlp": 0.01039984, - "balance_loss_clip": 1.05382609, - "balance_loss_mlp": 1.02160168, - "epoch": 0.20574177062979107, - "flos": 23258372117760.0, - "grad_norm": 6.231519820465981, - "language_loss": 0.70559299, - "learning_rate": 3.6846667117719732e-06, - "loss": 0.72752541, - "num_input_tokens_seen": 73881525, - "step": 3422, - "time_per_iteration": 2.6411848068237305 - }, - { - "auxiliary_loss_clip": 0.01062123, - "auxiliary_loss_mlp": 0.01005097, - "balance_loss_clip": 1.03459418, - "balance_loss_mlp": 1.00220013, - "epoch": 0.20580189388245904, - "flos": 70312518708480.0, - "grad_norm": 0.740118932422812, - "language_loss": 0.55461621, - "learning_rate": 3.684456776779548e-06, - "loss": 0.57528841, - "num_input_tokens_seen": 73937775, - "step": 3423, - "time_per_iteration": 3.259685516357422 - }, - { - "auxiliary_loss_clip": 0.01104389, - "auxiliary_loss_mlp": 0.01039296, - "balance_loss_clip": 1.04975653, - "balance_loss_mlp": 1.02089024, - "epoch": 0.205862017135127, - "flos": 30737846252160.0, - "grad_norm": 1.9242047681435088, - "language_loss": 0.71910381, - "learning_rate": 3.684246777912353e-06, - "loss": 0.74054068, - "num_input_tokens_seen": 73958250, - "step": 3424, - "time_per_iteration": 2.800283432006836 - }, - { - "auxiliary_loss_clip": 0.01125916, - "auxiliary_loss_mlp": 0.00777945, - "balance_loss_clip": 1.05704927, - "balance_loss_mlp": 1.00086677, - "epoch": 0.20592214038779497, - "flos": 21324546673920.0, - "grad_norm": 1.6235965502825092, - "language_loss": 0.74980927, - "learning_rate": 3.684036715178351e-06, - "loss": 0.76884782, - "num_input_tokens_seen": 73977775, - "step": 3425, - "time_per_iteration": 2.751030206680298 - }, - { - "auxiliary_loss_clip": 0.01104665, - "auxiliary_loss_mlp": 0.01058685, - "balance_loss_clip": 1.05047321, - "balance_loss_mlp": 1.03983784, - "epoch": 0.20598226364046296, - "flos": 22891652213760.0, - "grad_norm": 1.7765616723027935, - "language_loss": 0.87936616, - "learning_rate": 3.683826588585508e-06, - "loss": 0.90099961, - "num_input_tokens_seen": 73996590, - "step": 3426, - "time_per_iteration": 2.8539180755615234 - }, - { - "auxiliary_loss_clip": 0.01144422, - "auxiliary_loss_mlp": 0.01045493, - "balance_loss_clip": 1.05773449, - "balance_loss_mlp": 1.0281601, - "epoch": 0.20604238689313092, - "flos": 23878549365120.0, - "grad_norm": 1.836530467647624, - "language_loss": 0.76435733, - "learning_rate": 3.6836163981417926e-06, - "loss": 0.78625643, - "num_input_tokens_seen": 74015935, - "step": 3427, - "time_per_iteration": 2.7024967670440674 - }, - { - "auxiliary_loss_clip": 0.01159387, - "auxiliary_loss_mlp": 0.01050023, - "balance_loss_clip": 1.0577209, - "balance_loss_mlp": 1.03185558, - "epoch": 0.2061025101457989, - "flos": 22491535639680.0, - "grad_norm": 2.7350574840199964, - "language_loss": 0.74176943, - "learning_rate": 3.683406143855174e-06, - "loss": 0.76386356, - "num_input_tokens_seen": 74036575, - "step": 3428, - "time_per_iteration": 2.593151569366455 - }, - { - "auxiliary_loss_clip": 0.01132797, - "auxiliary_loss_mlp": 0.01046534, - "balance_loss_clip": 1.05232322, - "balance_loss_mlp": 1.0274843, - "epoch": 0.20616263339846685, - "flos": 22778928357120.0, - "grad_norm": 3.829070534376961, - "language_loss": 0.73316109, - "learning_rate": 3.6831958257336256e-06, - "loss": 0.75495446, - "num_input_tokens_seen": 74055365, - "step": 3429, - "time_per_iteration": 2.7357261180877686 - }, - { - "auxiliary_loss_clip": 0.01144108, - "auxiliary_loss_mlp": 0.01049081, - "balance_loss_clip": 1.05838966, - "balance_loss_mlp": 1.03030515, - "epoch": 0.20622275665113482, - "flos": 20882198684160.0, - "grad_norm": 2.201354934958512, - "language_loss": 0.85586745, - "learning_rate": 3.6829854437851237e-06, - "loss": 0.87779927, - "num_input_tokens_seen": 74074875, - "step": 3430, - "time_per_iteration": 2.658486843109131 - }, - { - "auxiliary_loss_clip": 0.01088509, - "auxiliary_loss_mlp": 0.01053254, - "balance_loss_clip": 1.04814601, - "balance_loss_mlp": 1.03387105, - "epoch": 0.20628287990380278, - "flos": 19354415558400.0, - "grad_norm": 1.8292569880077065, - "language_loss": 0.68859613, - "learning_rate": 3.6827749980176444e-06, - "loss": 0.71001375, - "num_input_tokens_seen": 74094505, - "step": 3431, - "time_per_iteration": 2.811061143875122 - }, - { - "auxiliary_loss_clip": 0.01027012, - "auxiliary_loss_mlp": 0.01012446, - "balance_loss_clip": 1.03099978, - "balance_loss_mlp": 1.00976419, - "epoch": 0.20634300315647078, - "flos": 71517932248320.0, - "grad_norm": 0.8066063325789609, - "language_loss": 0.60172188, - "learning_rate": 3.6825644884391693e-06, - "loss": 0.62211645, - "num_input_tokens_seen": 74158500, - "step": 3432, - "time_per_iteration": 3.415828227996826 - }, - { - "auxiliary_loss_clip": 0.01146488, - "auxiliary_loss_mlp": 0.01044703, - "balance_loss_clip": 1.0583806, - "balance_loss_mlp": 1.02669072, - "epoch": 0.20640312640913874, - "flos": 21723944976000.0, - "grad_norm": 2.5535613418278116, - "language_loss": 0.72622889, - "learning_rate": 3.682353915057679e-06, - "loss": 0.74814081, - "num_input_tokens_seen": 74176685, - "step": 3433, - "time_per_iteration": 2.715195655822754 - }, - { - "auxiliary_loss_clip": 0.0109694, - "auxiliary_loss_mlp": 0.01050867, - "balance_loss_clip": 1.04781306, - "balance_loss_mlp": 1.03019655, - "epoch": 0.2064632496618067, - "flos": 20554621626240.0, - "grad_norm": 2.096486283687917, - "language_loss": 0.87233114, - "learning_rate": 3.6821432778811604e-06, - "loss": 0.8938092, - "num_input_tokens_seen": 74194935, - "step": 3434, - "time_per_iteration": 2.7781460285186768 - }, - { - "auxiliary_loss_clip": 0.01151381, - "auxiliary_loss_mlp": 0.01045497, - "balance_loss_clip": 1.05561388, - "balance_loss_mlp": 1.02719867, - "epoch": 0.20652337291447467, - "flos": 29823273135360.0, - "grad_norm": 1.7621185839090663, - "language_loss": 0.69533503, - "learning_rate": 3.6819325769176004e-06, - "loss": 0.71730381, - "num_input_tokens_seen": 74215400, - "step": 3435, - "time_per_iteration": 2.7425992488861084 - }, - { - "auxiliary_loss_clip": 0.01127853, - "auxiliary_loss_mlp": 0.01045604, - "balance_loss_clip": 1.05583, - "balance_loss_mlp": 1.02672172, - "epoch": 0.20658349616714264, - "flos": 26213640618240.0, - "grad_norm": 30.077934868422773, - "language_loss": 0.89116997, - "learning_rate": 3.681721812174988e-06, - "loss": 0.91290456, - "num_input_tokens_seen": 74234090, - "step": 3436, - "time_per_iteration": 2.7460577487945557 - }, - { - "auxiliary_loss_clip": 0.01118033, - "auxiliary_loss_mlp": 0.01041557, - "balance_loss_clip": 1.05178559, - "balance_loss_mlp": 1.02168477, - "epoch": 0.2066436194198106, - "flos": 25994370044160.0, - "grad_norm": 1.7370712778981523, - "language_loss": 0.77330887, - "learning_rate": 3.6815109836613163e-06, - "loss": 0.79490477, - "num_input_tokens_seen": 74253345, - "step": 3437, - "time_per_iteration": 2.7507588863372803 - }, - { - "auxiliary_loss_clip": 0.01144607, - "auxiliary_loss_mlp": 0.01040376, - "balance_loss_clip": 1.05298507, - "balance_loss_mlp": 1.02323389, - "epoch": 0.20670374267247857, - "flos": 21361067827200.0, - "grad_norm": 1.8326742989814773, - "language_loss": 0.77813125, - "learning_rate": 3.6813000913845795e-06, - "loss": 0.799981, - "num_input_tokens_seen": 74271615, - "step": 3438, - "time_per_iteration": 2.7624385356903076 - }, - { - "auxiliary_loss_clip": 0.01063811, - "auxiliary_loss_mlp": 0.01002308, - "balance_loss_clip": 1.03603387, - "balance_loss_mlp": 0.9995541, - "epoch": 0.20676386592514656, - "flos": 66383281952640.0, - "grad_norm": 0.8298524953876073, - "language_loss": 0.67093015, - "learning_rate": 3.6810891353527747e-06, - "loss": 0.69159138, - "num_input_tokens_seen": 74331390, - "step": 3439, - "time_per_iteration": 3.2026216983795166 - }, - { - "auxiliary_loss_clip": 0.01148913, - "auxiliary_loss_mlp": 0.01041213, - "balance_loss_clip": 1.05590546, - "balance_loss_mlp": 1.02299786, - "epoch": 0.20682398917781453, - "flos": 17274577328640.0, - "grad_norm": 1.9537104709510729, - "language_loss": 0.83907467, - "learning_rate": 3.6808781155739014e-06, - "loss": 0.86097592, - "num_input_tokens_seen": 74347335, - "step": 3440, - "time_per_iteration": 2.6949758529663086 - }, - { - "auxiliary_loss_clip": 0.01147739, - "auxiliary_loss_mlp": 0.01041939, - "balance_loss_clip": 1.05509627, - "balance_loss_mlp": 1.02458239, - "epoch": 0.2068841124304825, - "flos": 18077288515200.0, - "grad_norm": 1.8008884636634683, - "language_loss": 0.84828413, - "learning_rate": 3.6806670320559614e-06, - "loss": 0.8701809, - "num_input_tokens_seen": 74366310, - "step": 3441, - "time_per_iteration": 2.6440463066101074 - }, - { - "auxiliary_loss_clip": 0.01110175, - "auxiliary_loss_mlp": 0.01048552, - "balance_loss_clip": 1.05599904, - "balance_loss_mlp": 1.03050399, - "epoch": 0.20694423568315046, - "flos": 27347017432320.0, - "grad_norm": 1.7415147413468661, - "language_loss": 0.85854685, - "learning_rate": 3.680455884806959e-06, - "loss": 0.88013411, - "num_input_tokens_seen": 74387100, - "step": 3442, - "time_per_iteration": 2.8222689628601074 - }, - { - "auxiliary_loss_clip": 0.01078025, - "auxiliary_loss_mlp": 0.01050799, - "balance_loss_clip": 1.05186844, - "balance_loss_mlp": 1.03095019, - "epoch": 0.20700435893581842, - "flos": 20229845829120.0, - "grad_norm": 1.9775081815037283, - "language_loss": 0.73038852, - "learning_rate": 3.6802446738349014e-06, - "loss": 0.75167674, - "num_input_tokens_seen": 74404460, - "step": 3443, - "time_per_iteration": 2.8044140338897705 - }, - { - "auxiliary_loss_clip": 0.01127625, - "auxiliary_loss_mlp": 0.00776303, - "balance_loss_clip": 1.05408895, - "balance_loss_mlp": 1.00079513, - "epoch": 0.2070644821884864, - "flos": 20631111638400.0, - "grad_norm": 1.84636320729986, - "language_loss": 0.85586846, - "learning_rate": 3.680033399147797e-06, - "loss": 0.87490773, - "num_input_tokens_seen": 74423790, - "step": 3444, - "time_per_iteration": 2.7582647800445557 - }, - { - "auxiliary_loss_clip": 0.01036759, - "auxiliary_loss_mlp": 0.01007145, - "balance_loss_clip": 1.03905272, - "balance_loss_mlp": 1.0042963, - "epoch": 0.20712460544115438, - "flos": 65941077617280.0, - "grad_norm": 0.6999396122177431, - "language_loss": 0.57092249, - "learning_rate": 3.6798220607536585e-06, - "loss": 0.59136152, - "num_input_tokens_seen": 74488130, - "step": 3445, - "time_per_iteration": 3.249602794647217 - }, - { - "auxiliary_loss_clip": 0.01152738, - "auxiliary_loss_mlp": 0.00776634, - "balance_loss_clip": 1.0538106, - "balance_loss_mlp": 1.00088191, - "epoch": 0.20718472869382235, - "flos": 19425734012160.0, - "grad_norm": 1.6453630130444594, - "language_loss": 0.78469276, - "learning_rate": 3.6796106586604987e-06, - "loss": 0.80398649, - "num_input_tokens_seen": 74506720, - "step": 3446, - "time_per_iteration": 2.6341898441314697 - }, - { - "auxiliary_loss_clip": 0.01151445, - "auxiliary_loss_mlp": 0.01043774, - "balance_loss_clip": 1.05439711, - "balance_loss_mlp": 1.02297151, - "epoch": 0.2072448519464903, - "flos": 24499049834880.0, - "grad_norm": 2.013256457797304, - "language_loss": 0.63031304, - "learning_rate": 3.679399192876334e-06, - "loss": 0.65226525, - "num_input_tokens_seen": 74525330, - "step": 3447, - "time_per_iteration": 2.6912922859191895 - }, - { - "auxiliary_loss_clip": 0.01103828, - "auxiliary_loss_mlp": 0.01058453, - "balance_loss_clip": 1.04668319, - "balance_loss_mlp": 1.03828287, - "epoch": 0.20730497519915828, - "flos": 23075694524160.0, - "grad_norm": 1.7423220349735584, - "language_loss": 0.86291325, - "learning_rate": 3.679187663409184e-06, - "loss": 0.88453603, - "num_input_tokens_seen": 74544535, - "step": 3448, - "time_per_iteration": 2.787576675415039 - }, - { - "auxiliary_loss_clip": 0.01128629, - "auxiliary_loss_mlp": 0.01045151, - "balance_loss_clip": 1.049932, - "balance_loss_mlp": 1.02556467, - "epoch": 0.20736509845182624, - "flos": 21069042255360.0, - "grad_norm": 3.8253504349982044, - "language_loss": 0.75264204, - "learning_rate": 3.6789760702670696e-06, - "loss": 0.77437979, - "num_input_tokens_seen": 74562300, - "step": 3449, - "time_per_iteration": 4.354467391967773 - }, - { - "auxiliary_loss_clip": 0.01141162, - "auxiliary_loss_mlp": 0.01050212, - "balance_loss_clip": 1.0534308, - "balance_loss_mlp": 1.03073323, - "epoch": 0.2074252217044942, - "flos": 17633288499840.0, - "grad_norm": 2.156163289660715, - "language_loss": 0.76558924, - "learning_rate": 3.6787644134580134e-06, - "loss": 0.787503, - "num_input_tokens_seen": 74580080, - "step": 3450, - "time_per_iteration": 2.7020533084869385 - }, - { - "auxiliary_loss_clip": 0.01128554, - "auxiliary_loss_mlp": 0.01044182, - "balance_loss_clip": 1.05234683, - "balance_loss_mlp": 1.02522802, - "epoch": 0.20748534495716217, - "flos": 23546985897600.0, - "grad_norm": 1.6446708221415856, - "language_loss": 0.82074821, - "learning_rate": 3.6785526929900436e-06, - "loss": 0.84247565, - "num_input_tokens_seen": 74598980, - "step": 3451, - "time_per_iteration": 2.7753186225891113 - }, - { - "auxiliary_loss_clip": 0.01064426, - "auxiliary_loss_mlp": 0.01003577, - "balance_loss_clip": 1.02722275, - "balance_loss_mlp": 1.00099015, - "epoch": 0.20754546820983016, - "flos": 52252935598080.0, - "grad_norm": 0.793594031040259, - "language_loss": 0.56562752, - "learning_rate": 3.6783409088711875e-06, - "loss": 0.58630753, - "num_input_tokens_seen": 74655275, - "step": 3452, - "time_per_iteration": 6.257205963134766 - }, - { - "auxiliary_loss_clip": 0.01124123, - "auxiliary_loss_mlp": 0.00776806, - "balance_loss_clip": 1.05206704, - "balance_loss_mlp": 1.0008918, - "epoch": 0.20760559146249813, - "flos": 20412379768320.0, - "grad_norm": 2.245823129763223, - "language_loss": 0.88341558, - "learning_rate": 3.6781290611094755e-06, - "loss": 0.90242493, - "num_input_tokens_seen": 74674560, - "step": 3453, - "time_per_iteration": 2.7009050846099854 - }, - { - "auxiliary_loss_clip": 0.01146287, - "auxiliary_loss_mlp": 0.01044217, - "balance_loss_clip": 1.05471313, - "balance_loss_mlp": 1.02521539, - "epoch": 0.2076657147151661, - "flos": 23186012169600.0, - "grad_norm": 2.2325669459725574, - "language_loss": 0.79920429, - "learning_rate": 3.6779171497129407e-06, - "loss": 0.82110935, - "num_input_tokens_seen": 74694500, - "step": 3454, - "time_per_iteration": 2.7080893516540527 - }, - { - "auxiliary_loss_clip": 0.01104984, - "auxiliary_loss_mlp": 0.00777717, - "balance_loss_clip": 1.04356718, - "balance_loss_mlp": 1.0007751, - "epoch": 0.20772583796783406, - "flos": 18293219124480.0, - "grad_norm": 3.601668384502942, - "language_loss": 0.76601356, - "learning_rate": 3.6777051746896202e-06, - "loss": 0.78484058, - "num_input_tokens_seen": 74710485, - "step": 3455, - "time_per_iteration": 2.6733248233795166 - }, - { - "auxiliary_loss_clip": 0.01115407, - "auxiliary_loss_mlp": 0.01050321, - "balance_loss_clip": 1.04759336, - "balance_loss_mlp": 1.0326066, - "epoch": 0.20778596122050202, - "flos": 17602800831360.0, - "grad_norm": 1.908671081537558, - "language_loss": 0.80200219, - "learning_rate": 3.6774931360475516e-06, - "loss": 0.82365942, - "num_input_tokens_seen": 74727450, - "step": 3456, - "time_per_iteration": 2.6950278282165527 - }, - { - "auxiliary_loss_clip": 0.01112832, - "auxiliary_loss_mlp": 0.00777675, - "balance_loss_clip": 1.05166578, - "balance_loss_mlp": 1.00099969, - "epoch": 0.20784608447317, - "flos": 23805578885760.0, - "grad_norm": 2.135320694722552, - "language_loss": 0.78070557, - "learning_rate": 3.6772810337947745e-06, - "loss": 0.79961067, - "num_input_tokens_seen": 74746725, - "step": 3457, - "time_per_iteration": 4.381137132644653 - }, - { - "auxiliary_loss_clip": 0.01082177, - "auxiliary_loss_mlp": 0.01058291, - "balance_loss_clip": 1.04310393, - "balance_loss_mlp": 1.03651094, - "epoch": 0.20790620772583795, - "flos": 17639286071040.0, - "grad_norm": 1.7652855773158553, - "language_loss": 0.8360287, - "learning_rate": 3.677068867939333e-06, - "loss": 0.85743344, - "num_input_tokens_seen": 74765255, - "step": 3458, - "time_per_iteration": 2.7332653999328613 - }, - { - "auxiliary_loss_clip": 0.01140275, - "auxiliary_loss_mlp": 0.0077698, - "balance_loss_clip": 1.05156302, - "balance_loss_mlp": 1.00095606, - "epoch": 0.20796633097850595, - "flos": 27673481168640.0, - "grad_norm": 11.883071119862361, - "language_loss": 0.75769317, - "learning_rate": 3.676856638489272e-06, - "loss": 0.77686572, - "num_input_tokens_seen": 74785710, - "step": 3459, - "time_per_iteration": 2.705026626586914 - }, - { - "auxiliary_loss_clip": 0.01089168, - "auxiliary_loss_mlp": 0.01038825, - "balance_loss_clip": 1.04769015, - "balance_loss_mlp": 1.02081251, - "epoch": 0.2080264542311739, - "flos": 19245606284160.0, - "grad_norm": 2.1071303009051428, - "language_loss": 0.77105331, - "learning_rate": 3.6766443454526382e-06, - "loss": 0.79233319, - "num_input_tokens_seen": 74804490, - "step": 3460, - "time_per_iteration": 2.749965190887451 - }, - { - "auxiliary_loss_clip": 0.0109477, - "auxiliary_loss_mlp": 0.01047592, - "balance_loss_clip": 1.04938984, - "balance_loss_mlp": 1.02838707, - "epoch": 0.20808657748384188, - "flos": 27525924097920.0, - "grad_norm": 9.5480036120023, - "language_loss": 0.75802225, - "learning_rate": 3.6764319888374836e-06, - "loss": 0.77944589, - "num_input_tokens_seen": 74826340, - "step": 3461, - "time_per_iteration": 2.7929086685180664 - }, - { - "auxiliary_loss_clip": 0.01124748, - "auxiliary_loss_mlp": 0.01041543, - "balance_loss_clip": 1.04610133, - "balance_loss_mlp": 1.02203989, - "epoch": 0.20814670073650984, - "flos": 26906931999360.0, - "grad_norm": 2.001927586001653, - "language_loss": 0.8848443, - "learning_rate": 3.6762195686518604e-06, - "loss": 0.90650725, - "num_input_tokens_seen": 74844960, - "step": 3462, - "time_per_iteration": 2.7031619548797607 - }, - { - "auxiliary_loss_clip": 0.01023861, - "auxiliary_loss_mlp": 0.00757905, - "balance_loss_clip": 1.02540636, - "balance_loss_mlp": 1.00168896, - "epoch": 0.2082068239891778, - "flos": 70175735717760.0, - "grad_norm": 0.7622558664505636, - "language_loss": 0.59010452, - "learning_rate": 3.6760070849038226e-06, - "loss": 0.6079222, - "num_input_tokens_seen": 74909075, - "step": 3463, - "time_per_iteration": 3.4111485481262207 - }, - { - "auxiliary_loss_clip": 0.01132553, - "auxiliary_loss_mlp": 0.01047591, - "balance_loss_clip": 1.04893148, - "balance_loss_mlp": 1.02866018, - "epoch": 0.20826694724184577, - "flos": 24608074590720.0, - "grad_norm": 2.6002828602708283, - "language_loss": 0.66744608, - "learning_rate": 3.675794537601429e-06, - "loss": 0.68924749, - "num_input_tokens_seen": 74928125, - "step": 3464, - "time_per_iteration": 2.718229293823242 - }, - { - "auxiliary_loss_clip": 0.0112374, - "auxiliary_loss_mlp": 0.0104712, - "balance_loss_clip": 1.05101657, - "balance_loss_mlp": 1.02755797, - "epoch": 0.20832707049451377, - "flos": 12892829034240.0, - "grad_norm": 2.9384916482598205, - "language_loss": 0.84044278, - "learning_rate": 3.6755819267527373e-06, - "loss": 0.86215138, - "num_input_tokens_seen": 74945090, - "step": 3465, - "time_per_iteration": 2.732109546661377 - }, - { - "auxiliary_loss_clip": 0.01096712, - "auxiliary_loss_mlp": 0.01040605, - "balance_loss_clip": 1.04373813, - "balance_loss_mlp": 1.02221096, - "epoch": 0.20838719374718173, - "flos": 22198827709440.0, - "grad_norm": 2.576139197384499, - "language_loss": 0.81923312, - "learning_rate": 3.6753692523658113e-06, - "loss": 0.84060633, - "num_input_tokens_seen": 74963630, - "step": 3466, - "time_per_iteration": 2.7758567333221436 - }, - { - "auxiliary_loss_clip": 0.01140158, - "auxiliary_loss_mlp": 0.01044188, - "balance_loss_clip": 1.05322194, - "balance_loss_mlp": 1.02787983, - "epoch": 0.2084473169998497, - "flos": 15158648908800.0, - "grad_norm": 4.780862188541671, - "language_loss": 0.82008922, - "learning_rate": 3.675156514448716e-06, - "loss": 0.84193271, - "num_input_tokens_seen": 74981875, - "step": 3467, - "time_per_iteration": 2.5788159370422363 - }, - { - "auxiliary_loss_clip": 0.01149826, - "auxiliary_loss_mlp": 0.01040027, - "balance_loss_clip": 1.05362797, - "balance_loss_mlp": 1.02265835, - "epoch": 0.20850744025251766, - "flos": 17456788045440.0, - "grad_norm": 2.009157691583003, - "language_loss": 0.82178962, - "learning_rate": 3.674943713009518e-06, - "loss": 0.84368813, - "num_input_tokens_seen": 74999155, - "step": 3468, - "time_per_iteration": 2.5874218940734863 - }, - { - "auxiliary_loss_clip": 0.01143942, - "auxiliary_loss_mlp": 0.01048537, - "balance_loss_clip": 1.05300629, - "balance_loss_mlp": 1.02774715, - "epoch": 0.20856756350518563, - "flos": 25698968593920.0, - "grad_norm": 2.0793964386868584, - "language_loss": 0.90328556, - "learning_rate": 3.6747308480562856e-06, - "loss": 0.92521036, - "num_input_tokens_seen": 75017850, - "step": 3469, - "time_per_iteration": 2.6595447063446045 - }, - { - "auxiliary_loss_clip": 0.01125181, - "auxiliary_loss_mlp": 0.0104984, - "balance_loss_clip": 1.05548537, - "balance_loss_mlp": 1.03175592, - "epoch": 0.2086276867578536, - "flos": 37889060970240.0, - "grad_norm": 1.9058635967771913, - "language_loss": 0.76809812, - "learning_rate": 3.674517919597092e-06, - "loss": 0.78984833, - "num_input_tokens_seen": 75039270, - "step": 3470, - "time_per_iteration": 2.908046245574951 - }, - { - "auxiliary_loss_clip": 0.01133446, - "auxiliary_loss_mlp": 0.01047618, - "balance_loss_clip": 1.0551517, - "balance_loss_mlp": 1.02942634, - "epoch": 0.20868781001052156, - "flos": 25557049958400.0, - "grad_norm": 2.301093296435647, - "language_loss": 0.75801277, - "learning_rate": 3.674304927640011e-06, - "loss": 0.77982342, - "num_input_tokens_seen": 75059350, - "step": 3471, - "time_per_iteration": 2.713533401489258 - }, - { - "auxiliary_loss_clip": 0.01123818, - "auxiliary_loss_mlp": 0.01053513, - "balance_loss_clip": 1.04961812, - "balance_loss_mlp": 1.03384328, - "epoch": 0.20874793326318955, - "flos": 27529192235520.0, - "grad_norm": 2.366290140730035, - "language_loss": 0.75703716, - "learning_rate": 3.67409187219312e-06, - "loss": 0.77881044, - "num_input_tokens_seen": 75080150, - "step": 3472, - "time_per_iteration": 2.785034656524658 - }, - { - "auxiliary_loss_clip": 0.01140589, - "auxiliary_loss_mlp": 0.01046494, - "balance_loss_clip": 1.05084538, - "balance_loss_mlp": 1.02854145, - "epoch": 0.20880805651585752, - "flos": 18548795370240.0, - "grad_norm": 7.277377921302429, - "language_loss": 0.84276807, - "learning_rate": 3.6738787532644966e-06, - "loss": 0.86463886, - "num_input_tokens_seen": 75097920, - "step": 3473, - "time_per_iteration": 2.6236281394958496 - }, - { - "auxiliary_loss_clip": 0.01057043, - "auxiliary_loss_mlp": 0.01037704, - "balance_loss_clip": 1.05363917, - "balance_loss_mlp": 1.03434241, - "epoch": 0.20886817976852548, - "flos": 65946644225280.0, - "grad_norm": 0.9045809123115837, - "language_loss": 0.63652557, - "learning_rate": 3.6736655708622235e-06, - "loss": 0.65747303, - "num_input_tokens_seen": 75152410, - "step": 3474, - "time_per_iteration": 3.1946537494659424 - }, - { - "auxiliary_loss_clip": 0.0113535, - "auxiliary_loss_mlp": 0.01045984, - "balance_loss_clip": 1.05276895, - "balance_loss_mlp": 1.02782845, - "epoch": 0.20892830302119345, - "flos": 36539178929280.0, - "grad_norm": 3.2311626254468795, - "language_loss": 0.69970965, - "learning_rate": 3.6734523249943844e-06, - "loss": 0.72152305, - "num_input_tokens_seen": 75173265, - "step": 3475, - "time_per_iteration": 2.7967529296875 - }, - { - "auxiliary_loss_clip": 0.01158022, - "auxiliary_loss_mlp": 0.01046944, - "balance_loss_clip": 1.05606794, - "balance_loss_mlp": 1.02862167, - "epoch": 0.2089884262738614, - "flos": 20956749361920.0, - "grad_norm": 1.9789108228051473, - "language_loss": 0.70372891, - "learning_rate": 3.673239015669065e-06, - "loss": 0.72577858, - "num_input_tokens_seen": 75193640, - "step": 3476, - "time_per_iteration": 2.629687786102295 - }, - { - "auxiliary_loss_clip": 0.01131765, - "auxiliary_loss_mlp": 0.01045236, - "balance_loss_clip": 1.05439556, - "balance_loss_mlp": 1.02722347, - "epoch": 0.20904854952652938, - "flos": 22784028088320.0, - "grad_norm": 2.3868812434184603, - "language_loss": 0.89227062, - "learning_rate": 3.6730256428943544e-06, - "loss": 0.91404068, - "num_input_tokens_seen": 75212545, - "step": 3477, - "time_per_iteration": 2.7574357986450195 - }, - { - "auxiliary_loss_clip": 0.01092922, - "auxiliary_loss_mlp": 0.01046119, - "balance_loss_clip": 1.045825, - "balance_loss_mlp": 1.02737951, - "epoch": 0.20910867277919734, - "flos": 27303277645440.0, - "grad_norm": 2.6092415644893814, - "language_loss": 0.67816859, - "learning_rate": 3.672812206678344e-06, - "loss": 0.69955903, - "num_input_tokens_seen": 75230865, - "step": 3478, - "time_per_iteration": 2.7929017543792725 - }, - { - "auxiliary_loss_clip": 0.01094689, - "auxiliary_loss_mlp": 0.01042766, - "balance_loss_clip": 1.04024661, - "balance_loss_mlp": 1.02308464, - "epoch": 0.20916879603186533, - "flos": 14319237000960.0, - "grad_norm": 4.056245481336458, - "language_loss": 0.84239435, - "learning_rate": 3.672598707029127e-06, - "loss": 0.86376888, - "num_input_tokens_seen": 75248285, - "step": 3479, - "time_per_iteration": 2.743544816970825 - }, - { - "auxiliary_loss_clip": 0.01111533, - "auxiliary_loss_mlp": 0.01050991, - "balance_loss_clip": 1.04863191, - "balance_loss_mlp": 1.03028417, - "epoch": 0.2092289192845333, - "flos": 22273019251200.0, - "grad_norm": 9.599906344578406, - "language_loss": 0.74294043, - "learning_rate": 3.6723851439548003e-06, - "loss": 0.76456571, - "num_input_tokens_seen": 75266310, - "step": 3480, - "time_per_iteration": 2.7278034687042236 - }, - { - "auxiliary_loss_clip": 0.01107791, - "auxiliary_loss_mlp": 0.01038756, - "balance_loss_clip": 1.04748154, - "balance_loss_mlp": 1.02226901, - "epoch": 0.20928904253720126, - "flos": 14830712714880.0, - "grad_norm": 2.178942595840573, - "language_loss": 0.75664043, - "learning_rate": 3.67217151746346e-06, - "loss": 0.77810597, - "num_input_tokens_seen": 75284175, - "step": 3481, - "time_per_iteration": 2.71073842048645 - }, - { - "auxiliary_loss_clip": 0.01090021, - "auxiliary_loss_mlp": 0.01046234, - "balance_loss_clip": 1.04561555, - "balance_loss_mlp": 1.02727938, - "epoch": 0.20934916578986923, - "flos": 23259162216960.0, - "grad_norm": 1.816378391984801, - "language_loss": 0.8517971, - "learning_rate": 3.671957827563209e-06, - "loss": 0.87315965, - "num_input_tokens_seen": 75303465, - "step": 3482, - "time_per_iteration": 2.8777174949645996 - }, - { - "auxiliary_loss_clip": 0.01099298, - "auxiliary_loss_mlp": 0.01046228, - "balance_loss_clip": 1.05039477, - "balance_loss_mlp": 1.02817941, - "epoch": 0.2094092890425372, - "flos": 32014398677760.0, - "grad_norm": 1.802490425012806, - "language_loss": 0.70550174, - "learning_rate": 3.6717440742621494e-06, - "loss": 0.72695696, - "num_input_tokens_seen": 75325290, - "step": 3483, - "time_per_iteration": 2.8599836826324463 - }, - { - "auxiliary_loss_clip": 0.01127333, - "auxiliary_loss_mlp": 0.01048954, - "balance_loss_clip": 1.05204535, - "balance_loss_mlp": 1.03082263, - "epoch": 0.20946941229520516, - "flos": 20010647082240.0, - "grad_norm": 1.9649551735344426, - "language_loss": 0.74867833, - "learning_rate": 3.6715302575683865e-06, - "loss": 0.77044123, - "num_input_tokens_seen": 75343895, - "step": 3484, - "time_per_iteration": 2.655538320541382 - }, - { - "auxiliary_loss_clip": 0.01117623, - "auxiliary_loss_mlp": 0.01046902, - "balance_loss_clip": 1.0514648, - "balance_loss_mlp": 1.0274353, - "epoch": 0.20952953554787315, - "flos": 30740072895360.0, - "grad_norm": 1.6308141537991403, - "language_loss": 0.70815694, - "learning_rate": 3.6713163774900292e-06, - "loss": 0.72980225, - "num_input_tokens_seen": 75367100, - "step": 3485, - "time_per_iteration": 2.744417667388916 - }, - { - "auxiliary_loss_clip": 0.01083098, - "auxiliary_loss_mlp": 0.00777163, - "balance_loss_clip": 1.0433619, - "balance_loss_mlp": 1.00097859, - "epoch": 0.20958965880054112, - "flos": 27049209770880.0, - "grad_norm": 2.030771632516388, - "language_loss": 0.83274543, - "learning_rate": 3.6711024340351875e-06, - "loss": 0.85134804, - "num_input_tokens_seen": 75389925, - "step": 3486, - "time_per_iteration": 2.742042303085327 - }, - { - "auxiliary_loss_clip": 0.01140212, - "auxiliary_loss_mlp": 0.01048337, - "balance_loss_clip": 1.05242062, - "balance_loss_mlp": 1.03115916, - "epoch": 0.20964978205320908, - "flos": 34204123589760.0, - "grad_norm": 1.6926372989653347, - "language_loss": 0.87134725, - "learning_rate": 3.6708884272119737e-06, - "loss": 0.89323276, - "num_input_tokens_seen": 75408575, - "step": 3487, - "time_per_iteration": 2.708331346511841 - }, - { - "auxiliary_loss_clip": 0.01112214, - "auxiliary_loss_mlp": 0.01041678, - "balance_loss_clip": 1.04791641, - "balance_loss_mlp": 1.0228194, - "epoch": 0.20970990530587705, - "flos": 23477391296640.0, - "grad_norm": 4.471143750410675, - "language_loss": 0.72291327, - "learning_rate": 3.670674357028504e-06, - "loss": 0.74445224, - "num_input_tokens_seen": 75427155, - "step": 3488, - "time_per_iteration": 4.250715970993042 - }, - { - "auxiliary_loss_clip": 0.01121403, - "auxiliary_loss_mlp": 0.01037296, - "balance_loss_clip": 1.05096245, - "balance_loss_mlp": 1.02014148, - "epoch": 0.209770028558545, - "flos": 18551452976640.0, - "grad_norm": 2.6694226497987437, - "language_loss": 0.79665899, - "learning_rate": 3.6704602234928945e-06, - "loss": 0.81824595, - "num_input_tokens_seen": 75444450, - "step": 3489, - "time_per_iteration": 2.6926958560943604 - }, - { - "auxiliary_loss_clip": 0.01152639, - "auxiliary_loss_mlp": 0.01045785, - "balance_loss_clip": 1.05325401, - "balance_loss_mlp": 1.02875018, - "epoch": 0.20983015181121298, - "flos": 21617003208960.0, - "grad_norm": 2.022409198347131, - "language_loss": 0.72505707, - "learning_rate": 3.670246026613266e-06, - "loss": 0.74704129, - "num_input_tokens_seen": 75462625, - "step": 3490, - "time_per_iteration": 4.133761644363403 - }, - { - "auxiliary_loss_clip": 0.01122247, - "auxiliary_loss_mlp": 0.01050283, - "balance_loss_clip": 1.0509479, - "balance_loss_mlp": 1.03402328, - "epoch": 0.20989027506388094, - "flos": 16614718531200.0, - "grad_norm": 1.8035978449536252, - "language_loss": 0.70332754, - "learning_rate": 3.6700317663977415e-06, - "loss": 0.72505283, - "num_input_tokens_seen": 75480640, - "step": 3491, - "time_per_iteration": 2.667243003845215 - }, - { - "auxiliary_loss_clip": 0.0113848, - "auxiliary_loss_mlp": 0.0077627, - "balance_loss_clip": 1.05017376, - "balance_loss_mlp": 1.00098944, - "epoch": 0.20995039831654894, - "flos": 23216823060480.0, - "grad_norm": 2.379943808529104, - "language_loss": 0.79751909, - "learning_rate": 3.669817442854444e-06, - "loss": 0.81666666, - "num_input_tokens_seen": 75494900, - "step": 3492, - "time_per_iteration": 4.270704984664917 - }, - { - "auxiliary_loss_clip": 0.01138825, - "auxiliary_loss_mlp": 0.00776339, - "balance_loss_clip": 1.05182219, - "balance_loss_mlp": 1.00108409, - "epoch": 0.2100105215692169, - "flos": 18147493647360.0, - "grad_norm": 2.2783194747149906, - "language_loss": 0.86987948, - "learning_rate": 3.669603055991502e-06, - "loss": 0.88903111, - "num_input_tokens_seen": 75513370, - "step": 3493, - "time_per_iteration": 2.7830448150634766 - }, - { - "auxiliary_loss_clip": 0.01110786, - "auxiliary_loss_mlp": 0.01037681, - "balance_loss_clip": 1.04520118, - "balance_loss_mlp": 1.02105093, - "epoch": 0.21007064482188487, - "flos": 15961611490560.0, - "grad_norm": 6.813030650079402, - "language_loss": 0.68622243, - "learning_rate": 3.6693886058170455e-06, - "loss": 0.70770705, - "num_input_tokens_seen": 75532480, - "step": 3494, - "time_per_iteration": 2.8479061126708984 - }, - { - "auxiliary_loss_clip": 0.01145467, - "auxiliary_loss_mlp": 0.01037272, - "balance_loss_clip": 1.05302739, - "balance_loss_mlp": 1.01998639, - "epoch": 0.21013076807455283, - "flos": 32234315696640.0, - "grad_norm": 1.7516454579581615, - "language_loss": 0.78848761, - "learning_rate": 3.6691740923392053e-06, - "loss": 0.81031501, - "num_input_tokens_seen": 75552745, - "step": 3495, - "time_per_iteration": 2.9313197135925293 - }, - { - "auxiliary_loss_clip": 0.01119614, - "auxiliary_loss_mlp": 0.01045108, - "balance_loss_clip": 1.04760814, - "balance_loss_mlp": 1.02708316, - "epoch": 0.2101908913272208, - "flos": 23696625957120.0, - "grad_norm": 2.1492916784611844, - "language_loss": 0.77302933, - "learning_rate": 3.668959515566116e-06, - "loss": 0.79467654, - "num_input_tokens_seen": 75574355, - "step": 3496, - "time_per_iteration": 4.467881441116333 - }, - { - "auxiliary_loss_clip": 0.01135202, - "auxiliary_loss_mlp": 0.01046618, - "balance_loss_clip": 1.05169654, - "balance_loss_mlp": 1.02839065, - "epoch": 0.21025101457988876, - "flos": 20375786787840.0, - "grad_norm": 2.146148958862047, - "language_loss": 0.82076812, - "learning_rate": 3.668744875505915e-06, - "loss": 0.8425864, - "num_input_tokens_seen": 75592215, - "step": 3497, - "time_per_iteration": 2.683037281036377 - }, - { - "auxiliary_loss_clip": 0.01144559, - "auxiliary_loss_mlp": 0.01047188, - "balance_loss_clip": 1.05445957, - "balance_loss_mlp": 1.02967596, - "epoch": 0.21031113783255675, - "flos": 25775638174080.0, - "grad_norm": 1.732381679276629, - "language_loss": 0.67239833, - "learning_rate": 3.668530172166741e-06, - "loss": 0.69431579, - "num_input_tokens_seen": 75610740, - "step": 3498, - "time_per_iteration": 2.685481548309326 - }, - { - "auxiliary_loss_clip": 0.01121255, - "auxiliary_loss_mlp": 0.01044553, - "balance_loss_clip": 1.04974794, - "balance_loss_mlp": 1.02611172, - "epoch": 0.21037126108522472, - "flos": 22018197191040.0, - "grad_norm": 1.7892967196850054, - "language_loss": 0.80832362, - "learning_rate": 3.6683154055567352e-06, - "loss": 0.82998168, - "num_input_tokens_seen": 75631005, - "step": 3499, - "time_per_iteration": 2.744995355606079 - }, - { - "auxiliary_loss_clip": 0.01139753, - "auxiliary_loss_mlp": 0.01039729, - "balance_loss_clip": 1.05226696, - "balance_loss_mlp": 1.02312946, - "epoch": 0.21043138433789269, - "flos": 25334403505920.0, - "grad_norm": 1.6464696881852638, - "language_loss": 0.77983701, - "learning_rate": 3.668100575684043e-06, - "loss": 0.80163181, - "num_input_tokens_seen": 75650655, - "step": 3500, - "time_per_iteration": 2.7704038619995117 - }, - { - "auxiliary_loss_clip": 0.01129369, - "auxiliary_loss_mlp": 0.01042187, - "balance_loss_clip": 1.05095315, - "balance_loss_mlp": 1.02390063, - "epoch": 0.21049150759056065, - "flos": 25556654908800.0, - "grad_norm": 1.5981262394728393, - "language_loss": 0.74450207, - "learning_rate": 3.6678856825568094e-06, - "loss": 0.76621759, - "num_input_tokens_seen": 75669895, - "step": 3501, - "time_per_iteration": 2.7066893577575684 - }, - { - "auxiliary_loss_clip": 0.01134924, - "auxiliary_loss_mlp": 0.01039556, - "balance_loss_clip": 1.04989994, - "balance_loss_mlp": 1.02227044, - "epoch": 0.21055163084322862, - "flos": 24495602129280.0, - "grad_norm": 1.6188770382514572, - "language_loss": 0.75278366, - "learning_rate": 3.667670726183183e-06, - "loss": 0.77452844, - "num_input_tokens_seen": 75689535, - "step": 3502, - "time_per_iteration": 2.724635124206543 - }, - { - "auxiliary_loss_clip": 0.01098479, - "auxiliary_loss_mlp": 0.01040924, - "balance_loss_clip": 1.04576206, - "balance_loss_mlp": 1.02248216, - "epoch": 0.21061175409589658, - "flos": 25739045193600.0, - "grad_norm": 1.9441266701933382, - "language_loss": 0.77188909, - "learning_rate": 3.667455706571316e-06, - "loss": 0.7932831, - "num_input_tokens_seen": 75709265, - "step": 3503, - "time_per_iteration": 2.7545289993286133 - }, - { - "auxiliary_loss_clip": 0.010957, - "auxiliary_loss_mlp": 0.01045911, - "balance_loss_clip": 1.04817343, - "balance_loss_mlp": 1.02478695, - "epoch": 0.21067187734856455, - "flos": 18989168112000.0, - "grad_norm": 2.256374081289255, - "language_loss": 0.78297234, - "learning_rate": 3.6672406237293617e-06, - "loss": 0.8043884, - "num_input_tokens_seen": 75727050, - "step": 3504, - "time_per_iteration": 2.7454304695129395 - }, - { - "auxiliary_loss_clip": 0.01117408, - "auxiliary_loss_mlp": 0.01049815, - "balance_loss_clip": 1.0488404, - "balance_loss_mlp": 1.03152788, - "epoch": 0.21073200060123254, - "flos": 24681368292480.0, - "grad_norm": 1.5753219052286964, - "language_loss": 0.76731002, - "learning_rate": 3.6670254776654754e-06, - "loss": 0.78898227, - "num_input_tokens_seen": 75747175, - "step": 3505, - "time_per_iteration": 2.7509703636169434 - }, - { - "auxiliary_loss_clip": 0.01120291, - "auxiliary_loss_mlp": 0.01052026, - "balance_loss_clip": 1.04882348, - "balance_loss_mlp": 1.03383446, - "epoch": 0.2107921238539005, - "flos": 28549342402560.0, - "grad_norm": 1.9938386598136906, - "language_loss": 0.63933277, - "learning_rate": 3.6668102683878163e-06, - "loss": 0.66105598, - "num_input_tokens_seen": 75767690, - "step": 3506, - "time_per_iteration": 2.773611545562744 - }, - { - "auxiliary_loss_clip": 0.01138444, - "auxiliary_loss_mlp": 0.01050655, - "balance_loss_clip": 1.05078697, - "balance_loss_mlp": 1.03257108, - "epoch": 0.21085224710656847, - "flos": 25885848078720.0, - "grad_norm": 2.170999698474249, - "language_loss": 0.82010436, - "learning_rate": 3.6665949959045443e-06, - "loss": 0.84199536, - "num_input_tokens_seen": 75787255, - "step": 3507, - "time_per_iteration": 2.6604206562042236 - }, - { - "auxiliary_loss_clip": 0.01136754, - "auxiliary_loss_mlp": 0.01043314, - "balance_loss_clip": 1.04972744, - "balance_loss_mlp": 1.02472949, - "epoch": 0.21091237035923643, - "flos": 14976294537600.0, - "grad_norm": 2.0519706535557414, - "language_loss": 0.75213134, - "learning_rate": 3.666379660223824e-06, - "loss": 0.77393204, - "num_input_tokens_seen": 75805890, - "step": 3508, - "time_per_iteration": 2.7164604663848877 - }, - { - "auxiliary_loss_clip": 0.01154655, - "auxiliary_loss_mlp": 0.01036811, - "balance_loss_clip": 1.05263913, - "balance_loss_mlp": 1.01894128, - "epoch": 0.2109724936119044, - "flos": 16362518163840.0, - "grad_norm": 3.4182125548434112, - "language_loss": 0.84984946, - "learning_rate": 3.6661642613538192e-06, - "loss": 0.87176406, - "num_input_tokens_seen": 75821620, - "step": 3509, - "time_per_iteration": 2.661743402481079 - }, - { - "auxiliary_loss_clip": 0.01120944, - "auxiliary_loss_mlp": 0.01044014, - "balance_loss_clip": 1.05299115, - "balance_loss_mlp": 1.02443957, - "epoch": 0.21103261686457236, - "flos": 31502492000640.0, - "grad_norm": 2.210880078691599, - "language_loss": 0.68125075, - "learning_rate": 3.6659487993026987e-06, - "loss": 0.70290035, - "num_input_tokens_seen": 75842490, - "step": 3510, - "time_per_iteration": 2.7881460189819336 - }, - { - "auxiliary_loss_clip": 0.01152569, - "auxiliary_loss_mlp": 0.01046993, - "balance_loss_clip": 1.05026078, - "balance_loss_mlp": 1.02892137, - "epoch": 0.21109274011724033, - "flos": 27344072517120.0, - "grad_norm": 1.958863999940011, - "language_loss": 0.72639364, - "learning_rate": 3.6657332740786327e-06, - "loss": 0.74838924, - "num_input_tokens_seen": 75865985, - "step": 3511, - "time_per_iteration": 2.6942689418792725 - }, - { - "auxiliary_loss_clip": 0.01066393, - "auxiliary_loss_mlp": 0.01041278, - "balance_loss_clip": 1.04279399, - "balance_loss_mlp": 1.0208931, - "epoch": 0.21115286336990832, - "flos": 17820383466240.0, - "grad_norm": 3.2801391377369686, - "language_loss": 0.69354337, - "learning_rate": 3.665517685689794e-06, - "loss": 0.71462011, - "num_input_tokens_seen": 75882745, - "step": 3512, - "time_per_iteration": 2.8260998725891113 - }, - { - "auxiliary_loss_clip": 0.01140043, - "auxiliary_loss_mlp": 0.01050555, - "balance_loss_clip": 1.04943943, - "balance_loss_mlp": 1.03082585, - "epoch": 0.2112129866225763, - "flos": 27197987904000.0, - "grad_norm": 2.072678482519775, - "language_loss": 0.73145646, - "learning_rate": 3.6653020341443584e-06, - "loss": 0.75336242, - "num_input_tokens_seen": 75904305, - "step": 3513, - "time_per_iteration": 2.9639391899108887 - }, - { - "auxiliary_loss_clip": 0.01121964, - "auxiliary_loss_mlp": 0.01038325, - "balance_loss_clip": 1.04785061, - "balance_loss_mlp": 1.02089679, - "epoch": 0.21127310987524425, - "flos": 23731279603200.0, - "grad_norm": 2.0322171916220086, - "language_loss": 0.74422491, - "learning_rate": 3.665086319450502e-06, - "loss": 0.76582778, - "num_input_tokens_seen": 75923710, - "step": 3514, - "time_per_iteration": 2.7379143238067627 - }, - { - "auxiliary_loss_clip": 0.01136944, - "auxiliary_loss_mlp": 0.01038225, - "balance_loss_clip": 1.05334568, - "balance_loss_mlp": 1.01941383, - "epoch": 0.21133323312791222, - "flos": 18332505624960.0, - "grad_norm": 2.431934297389972, - "language_loss": 0.76738697, - "learning_rate": 3.6648705416164062e-06, - "loss": 0.78913867, - "num_input_tokens_seen": 75942625, - "step": 3515, - "time_per_iteration": 2.6339287757873535 - }, - { - "auxiliary_loss_clip": 0.011289, - "auxiliary_loss_mlp": 0.01047482, - "balance_loss_clip": 1.05247736, - "balance_loss_mlp": 1.0288614, - "epoch": 0.21139335638058018, - "flos": 17931203902080.0, - "grad_norm": 2.7460645413082756, - "language_loss": 0.68756706, - "learning_rate": 3.6646547006502518e-06, - "loss": 0.70933092, - "num_input_tokens_seen": 75959930, - "step": 3516, - "time_per_iteration": 2.6489672660827637 - }, - { - "auxiliary_loss_clip": 0.01118182, - "auxiliary_loss_mlp": 0.01049447, - "balance_loss_clip": 1.05634522, - "balance_loss_mlp": 1.03045666, - "epoch": 0.21145347963324815, - "flos": 24572092141440.0, - "grad_norm": 1.8368744753927078, - "language_loss": 0.85010064, - "learning_rate": 3.664438796560225e-06, - "loss": 0.87177694, - "num_input_tokens_seen": 75980335, - "step": 3517, - "time_per_iteration": 2.745887279510498 - }, - { - "auxiliary_loss_clip": 0.01125904, - "auxiliary_loss_mlp": 0.01042813, - "balance_loss_clip": 1.04719234, - "balance_loss_mlp": 1.02506244, - "epoch": 0.21151360288591614, - "flos": 35845959375360.0, - "grad_norm": 2.246330970109572, - "language_loss": 0.63672101, - "learning_rate": 3.664222829354512e-06, - "loss": 0.65840822, - "num_input_tokens_seen": 76002095, - "step": 3518, - "time_per_iteration": 2.7990219593048096 - }, - { - "auxiliary_loss_clip": 0.01089367, - "auxiliary_loss_mlp": 0.01057733, - "balance_loss_clip": 1.05040181, - "balance_loss_mlp": 1.04001832, - "epoch": 0.2115737261385841, - "flos": 24641579001600.0, - "grad_norm": 2.1349107177710875, - "language_loss": 0.89256221, - "learning_rate": 3.664006799041303e-06, - "loss": 0.91403317, - "num_input_tokens_seen": 76020425, - "step": 3519, - "time_per_iteration": 2.8022944927215576 - }, - { - "auxiliary_loss_clip": 0.01135146, - "auxiliary_loss_mlp": 0.01049587, - "balance_loss_clip": 1.05320001, - "balance_loss_mlp": 1.03140712, - "epoch": 0.21163384939125207, - "flos": 25226887121280.0, - "grad_norm": 1.8050755180524396, - "language_loss": 0.81235015, - "learning_rate": 3.6637907056287886e-06, - "loss": 0.8341974, - "num_input_tokens_seen": 76041210, - "step": 3520, - "time_per_iteration": 2.750988245010376 - }, - { - "auxiliary_loss_clip": 0.01124406, - "auxiliary_loss_mlp": 0.01048631, - "balance_loss_clip": 1.05111551, - "balance_loss_mlp": 1.03095269, - "epoch": 0.21169397264392004, - "flos": 26067520091520.0, - "grad_norm": 1.92815865975435, - "language_loss": 0.76254267, - "learning_rate": 3.6635745491251642e-06, - "loss": 0.78427303, - "num_input_tokens_seen": 76062685, - "step": 3521, - "time_per_iteration": 2.7965810298919678 - }, - { - "auxiliary_loss_clip": 0.0109789, - "auxiliary_loss_mlp": 0.01044794, - "balance_loss_clip": 1.04872918, - "balance_loss_mlp": 1.02841413, - "epoch": 0.211754095896588, - "flos": 23108265181440.0, - "grad_norm": 2.0270933567011302, - "language_loss": 0.75752926, - "learning_rate": 3.663358329538626e-06, - "loss": 0.77895606, - "num_input_tokens_seen": 76082300, - "step": 3522, - "time_per_iteration": 2.8280131816864014 - }, - { - "auxiliary_loss_clip": 0.01153324, - "auxiliary_loss_mlp": 0.01053431, - "balance_loss_clip": 1.05353725, - "balance_loss_mlp": 1.03541851, - "epoch": 0.21181421914925597, - "flos": 27922341571200.0, - "grad_norm": 1.8399634756194385, - "language_loss": 0.70481133, - "learning_rate": 3.663142046877374e-06, - "loss": 0.72687888, - "num_input_tokens_seen": 76101135, - "step": 3523, - "time_per_iteration": 2.6909022331237793 - }, - { - "auxiliary_loss_clip": 0.01139749, - "auxiliary_loss_mlp": 0.01054127, - "balance_loss_clip": 1.05166054, - "balance_loss_mlp": 1.03619766, - "epoch": 0.21187434240192393, - "flos": 17128636369920.0, - "grad_norm": 2.455264594190525, - "language_loss": 0.77290082, - "learning_rate": 3.6629257011496085e-06, - "loss": 0.7948395, - "num_input_tokens_seen": 76119320, - "step": 3524, - "time_per_iteration": 2.6844334602355957 - }, - { - "auxiliary_loss_clip": 0.01132697, - "auxiliary_loss_mlp": 0.0104457, - "balance_loss_clip": 1.05066419, - "balance_loss_mlp": 1.02621162, - "epoch": 0.21193446565459192, - "flos": 22347318533760.0, - "grad_norm": 1.841652047976503, - "language_loss": 0.81680572, - "learning_rate": 3.6627092923635338e-06, - "loss": 0.83857846, - "num_input_tokens_seen": 76137445, - "step": 3525, - "time_per_iteration": 2.71073842048645 - }, - { - "auxiliary_loss_clip": 0.01088536, - "auxiliary_loss_mlp": 0.01041509, - "balance_loss_clip": 1.04158318, - "balance_loss_mlp": 1.02353263, - "epoch": 0.2119945889072599, - "flos": 27199316707200.0, - "grad_norm": 1.867957043941215, - "language_loss": 0.75627208, - "learning_rate": 3.662492820527356e-06, - "loss": 0.77757257, - "num_input_tokens_seen": 76159500, - "step": 3526, - "time_per_iteration": 2.973966598510742 - }, - { - "auxiliary_loss_clip": 0.0115455, - "auxiliary_loss_mlp": 0.01041027, - "balance_loss_clip": 1.05324817, - "balance_loss_mlp": 1.023229, - "epoch": 0.21205471215992786, - "flos": 20991869884800.0, - "grad_norm": 1.8230643924086412, - "language_loss": 0.77070421, - "learning_rate": 3.662276285649284e-06, - "loss": 0.79265994, - "num_input_tokens_seen": 76177990, - "step": 3527, - "time_per_iteration": 2.648961067199707 - }, - { - "auxiliary_loss_clip": 0.01151081, - "auxiliary_loss_mlp": 0.0104874, - "balance_loss_clip": 1.05143785, - "balance_loss_mlp": 1.02977419, - "epoch": 0.21211483541259582, - "flos": 20777663128320.0, - "grad_norm": 2.807984733302778, - "language_loss": 0.7815178, - "learning_rate": 3.662059687737528e-06, - "loss": 0.80351603, - "num_input_tokens_seen": 76197125, - "step": 3528, - "time_per_iteration": 4.401185989379883 - }, - { - "auxiliary_loss_clip": 0.01135768, - "auxiliary_loss_mlp": 0.01045736, - "balance_loss_clip": 1.04889631, - "balance_loss_mlp": 1.02817655, - "epoch": 0.21217495866526379, - "flos": 18989994124800.0, - "grad_norm": 2.1271435469609257, - "language_loss": 0.8128866, - "learning_rate": 3.6618430268003024e-06, - "loss": 0.8347016, - "num_input_tokens_seen": 76216215, - "step": 3529, - "time_per_iteration": 4.309772968292236 - }, - { - "auxiliary_loss_clip": 0.0113319, - "auxiliary_loss_mlp": 0.00777373, - "balance_loss_clip": 1.04967499, - "balance_loss_mlp": 1.00112891, - "epoch": 0.21223508191793175, - "flos": 20667309569280.0, - "grad_norm": 1.9704727824538568, - "language_loss": 0.76427567, - "learning_rate": 3.6616263028458235e-06, - "loss": 0.78338128, - "num_input_tokens_seen": 76237010, - "step": 3530, - "time_per_iteration": 2.7592365741729736 - }, - { - "auxiliary_loss_clip": 0.0115078, - "auxiliary_loss_mlp": 0.01047067, - "balance_loss_clip": 1.0522244, - "balance_loss_mlp": 1.02990103, - "epoch": 0.21229520517059972, - "flos": 21616464504960.0, - "grad_norm": 2.1154933827202274, - "language_loss": 0.82973897, - "learning_rate": 3.661409515882308e-06, - "loss": 0.85171747, - "num_input_tokens_seen": 76255965, - "step": 3531, - "time_per_iteration": 4.168981313705444 - }, - { - "auxiliary_loss_clip": 0.01120152, - "auxiliary_loss_mlp": 0.01042697, - "balance_loss_clip": 1.04767489, - "balance_loss_mlp": 1.02313459, - "epoch": 0.2123553284232677, - "flos": 13991049411840.0, - "grad_norm": 2.335526210972433, - "language_loss": 0.73087364, - "learning_rate": 3.661192665917977e-06, - "loss": 0.75250214, - "num_input_tokens_seen": 76272150, - "step": 3532, - "time_per_iteration": 2.6797189712524414 - }, - { - "auxiliary_loss_clip": 0.01126693, - "auxiliary_loss_mlp": 0.01041409, - "balance_loss_clip": 1.0539782, - "balance_loss_mlp": 1.02269292, - "epoch": 0.21241545167593567, - "flos": 18296774570880.0, - "grad_norm": 6.22254473074881, - "language_loss": 0.74268675, - "learning_rate": 3.660975752961054e-06, - "loss": 0.76436776, - "num_input_tokens_seen": 76291425, - "step": 3533, - "time_per_iteration": 2.741152048110962 - }, - { - "auxiliary_loss_clip": 0.01146682, - "auxiliary_loss_mlp": 0.0104438, - "balance_loss_clip": 1.05342829, - "balance_loss_mlp": 1.0265224, - "epoch": 0.21247557492860364, - "flos": 34713121265280.0, - "grad_norm": 2.0406923816018714, - "language_loss": 0.70889592, - "learning_rate": 3.6607587770197634e-06, - "loss": 0.73080653, - "num_input_tokens_seen": 76313975, - "step": 3534, - "time_per_iteration": 2.8210513591766357 - }, - { - "auxiliary_loss_clip": 0.01133157, - "auxiliary_loss_mlp": 0.01043651, - "balance_loss_clip": 1.05234385, - "balance_loss_mlp": 1.02463722, - "epoch": 0.2125356981812716, - "flos": 22053820504320.0, - "grad_norm": 2.102271516852891, - "language_loss": 0.71675557, - "learning_rate": 3.6605417381023346e-06, - "loss": 0.73852366, - "num_input_tokens_seen": 76330955, - "step": 3535, - "time_per_iteration": 2.804506540298462 - }, - { - "auxiliary_loss_clip": 0.01137461, - "auxiliary_loss_mlp": 0.01053804, - "balance_loss_clip": 1.05108476, - "balance_loss_mlp": 1.03607774, - "epoch": 0.21259582143393957, - "flos": 28548336821760.0, - "grad_norm": 24.01704513629389, - "language_loss": 0.70639503, - "learning_rate": 3.660324636216996e-06, - "loss": 0.72830772, - "num_input_tokens_seen": 76352680, - "step": 3536, - "time_per_iteration": 4.442729473114014 - }, - { - "auxiliary_loss_clip": 0.011554, - "auxiliary_loss_mlp": 0.01049939, - "balance_loss_clip": 1.05231214, - "balance_loss_mlp": 1.03082991, - "epoch": 0.21265594468660753, - "flos": 20120892900480.0, - "grad_norm": 2.2527167001205806, - "language_loss": 0.8784188, - "learning_rate": 3.660107471371981e-06, - "loss": 0.90047216, - "num_input_tokens_seen": 76370750, - "step": 3537, - "time_per_iteration": 2.6365723609924316 - }, - { - "auxiliary_loss_clip": 0.01137536, - "auxiliary_loss_mlp": 0.00776226, - "balance_loss_clip": 1.04911351, - "balance_loss_mlp": 1.00101614, - "epoch": 0.21271606793927553, - "flos": 23076161400960.0, - "grad_norm": 1.8080285651248438, - "language_loss": 0.80480909, - "learning_rate": 3.659890243575524e-06, - "loss": 0.82394671, - "num_input_tokens_seen": 76390610, - "step": 3538, - "time_per_iteration": 2.7403554916381836 - }, - { - "auxiliary_loss_clip": 0.01080631, - "auxiliary_loss_mlp": 0.0105169, - "balance_loss_clip": 1.04171312, - "balance_loss_mlp": 1.03219926, - "epoch": 0.2127761911919435, - "flos": 26388201738240.0, - "grad_norm": 2.705287390300715, - "language_loss": 0.86691839, - "learning_rate": 3.659672952835863e-06, - "loss": 0.88824159, - "num_input_tokens_seen": 76408860, - "step": 3539, - "time_per_iteration": 2.8177876472473145 - }, - { - "auxiliary_loss_clip": 0.01120184, - "auxiliary_loss_mlp": 0.01047424, - "balance_loss_clip": 1.04577422, - "balance_loss_mlp": 1.0295074, - "epoch": 0.21283631444461146, - "flos": 20228265630720.0, - "grad_norm": 5.212413836862573, - "language_loss": 0.57756186, - "learning_rate": 3.659455599161237e-06, - "loss": 0.59923792, - "num_input_tokens_seen": 76424980, - "step": 3540, - "time_per_iteration": 2.786552667617798 - }, - { - "auxiliary_loss_clip": 0.01154193, - "auxiliary_loss_mlp": 0.010403, - "balance_loss_clip": 1.05276537, - "balance_loss_mlp": 1.02131045, - "epoch": 0.21289643769727942, - "flos": 13516992691200.0, - "grad_norm": 2.318388810062464, - "language_loss": 0.76114893, - "learning_rate": 3.659238182559888e-06, - "loss": 0.78309381, - "num_input_tokens_seen": 76443135, - "step": 3541, - "time_per_iteration": 2.646207332611084 - }, - { - "auxiliary_loss_clip": 0.01108241, - "auxiliary_loss_mlp": 0.01044876, - "balance_loss_clip": 1.0464325, - "balance_loss_mlp": 1.02676797, - "epoch": 0.2129565609499474, - "flos": 24827021942400.0, - "grad_norm": 3.508596736579257, - "language_loss": 0.69749588, - "learning_rate": 3.6590207030400615e-06, - "loss": 0.71902704, - "num_input_tokens_seen": 76462470, - "step": 3542, - "time_per_iteration": 2.746612787246704 - }, - { - "auxiliary_loss_clip": 0.01149445, - "auxiliary_loss_mlp": 0.01038617, - "balance_loss_clip": 1.05146265, - "balance_loss_mlp": 1.02160525, - "epoch": 0.21301668420261535, - "flos": 23659242877440.0, - "grad_norm": 2.3488794859192397, - "language_loss": 0.75651306, - "learning_rate": 3.658803160610004e-06, - "loss": 0.77839369, - "num_input_tokens_seen": 76481995, - "step": 3543, - "time_per_iteration": 2.665900230407715 - }, - { - "auxiliary_loss_clip": 0.0112855, - "auxiliary_loss_mlp": 0.01042048, - "balance_loss_clip": 1.05257249, - "balance_loss_mlp": 1.02409506, - "epoch": 0.21307680745528332, - "flos": 16362805472640.0, - "grad_norm": 1.8076409354305347, - "language_loss": 0.66981912, - "learning_rate": 3.6585855552779634e-06, - "loss": 0.6915251, - "num_input_tokens_seen": 76500245, - "step": 3544, - "time_per_iteration": 2.6692638397216797 - }, - { - "auxiliary_loss_clip": 0.01121216, - "auxiliary_loss_mlp": 0.01046395, - "balance_loss_clip": 1.0480237, - "balance_loss_mlp": 1.02897835, - "epoch": 0.2131369307079513, - "flos": 19099054794240.0, - "grad_norm": 1.8644107460894377, - "language_loss": 0.70977402, - "learning_rate": 3.6583678870521934e-06, - "loss": 0.73145014, - "num_input_tokens_seen": 76519535, - "step": 3545, - "time_per_iteration": 2.686939001083374 - }, - { - "auxiliary_loss_clip": 0.01128605, - "auxiliary_loss_mlp": 0.01048325, - "balance_loss_clip": 1.05368018, - "balance_loss_mlp": 1.0300498, - "epoch": 0.21319705396061928, - "flos": 30372275583360.0, - "grad_norm": 1.8809403827144264, - "language_loss": 0.72329843, - "learning_rate": 3.658150155940946e-06, - "loss": 0.74506772, - "num_input_tokens_seen": 76542065, - "step": 3546, - "time_per_iteration": 2.8044040203094482 - }, - { - "auxiliary_loss_clip": 0.01115103, - "auxiliary_loss_mlp": 0.01050245, - "balance_loss_clip": 1.0539, - "balance_loss_mlp": 1.03250647, - "epoch": 0.21325717721328724, - "flos": 21756192410880.0, - "grad_norm": 3.48585993087404, - "language_loss": 0.80431038, - "learning_rate": 3.657932361952479e-06, - "loss": 0.82596385, - "num_input_tokens_seen": 76560540, - "step": 3547, - "time_per_iteration": 2.7981739044189453 - }, - { - "auxiliary_loss_clip": 0.01154388, - "auxiliary_loss_mlp": 0.01045355, - "balance_loss_clip": 1.05115056, - "balance_loss_mlp": 1.02685428, - "epoch": 0.2133173004659552, - "flos": 28730870760960.0, - "grad_norm": 2.460294966859189, - "language_loss": 0.7449761, - "learning_rate": 3.6577145050950504e-06, - "loss": 0.7669735, - "num_input_tokens_seen": 76581760, - "step": 3548, - "time_per_iteration": 2.709476947784424 - }, - { - "auxiliary_loss_clip": 0.01117193, - "auxiliary_loss_mlp": 0.01059153, - "balance_loss_clip": 1.05099797, - "balance_loss_mlp": 1.03938842, - "epoch": 0.21337742371862317, - "flos": 16837077674880.0, - "grad_norm": 2.783715227630402, - "language_loss": 0.74218595, - "learning_rate": 3.657496585376922e-06, - "loss": 0.76394939, - "num_input_tokens_seen": 76599940, - "step": 3549, - "time_per_iteration": 2.751401662826538 - }, - { - "auxiliary_loss_clip": 0.01121431, - "auxiliary_loss_mlp": 0.01050546, - "balance_loss_clip": 1.05331278, - "balance_loss_mlp": 1.03283179, - "epoch": 0.21343754697129114, - "flos": 24424930120320.0, - "grad_norm": 1.8583266555890872, - "language_loss": 0.80719978, - "learning_rate": 3.657278602806357e-06, - "loss": 0.82891953, - "num_input_tokens_seen": 76619580, - "step": 3550, - "time_per_iteration": 2.74678373336792 - }, - { - "auxiliary_loss_clip": 0.01151996, - "auxiliary_loss_mlp": 0.01048347, - "balance_loss_clip": 1.05428052, - "balance_loss_mlp": 1.03147876, - "epoch": 0.21349767022395913, - "flos": 19277817805440.0, - "grad_norm": 1.7548210279469212, - "language_loss": 0.88234103, - "learning_rate": 3.657060557391621e-06, - "loss": 0.90434444, - "num_input_tokens_seen": 76638195, - "step": 3551, - "time_per_iteration": 2.746938705444336 - }, - { - "auxiliary_loss_clip": 0.01151269, - "auxiliary_loss_mlp": 0.01048306, - "balance_loss_clip": 1.05139017, - "balance_loss_mlp": 1.03111625, - "epoch": 0.2135577934766271, - "flos": 17347547808000.0, - "grad_norm": 1.8976063035050816, - "language_loss": 0.83877259, - "learning_rate": 3.656842449140983e-06, - "loss": 0.86076838, - "num_input_tokens_seen": 76656695, - "step": 3552, - "time_per_iteration": 2.616567373275757 - }, - { - "auxiliary_loss_clip": 0.0113626, - "auxiliary_loss_mlp": 0.01050705, - "balance_loss_clip": 1.04937124, - "balance_loss_mlp": 1.0325495, - "epoch": 0.21361791672929506, - "flos": 24057204635520.0, - "grad_norm": 2.604872460919843, - "language_loss": 0.76370007, - "learning_rate": 3.656624278062713e-06, - "loss": 0.78556973, - "num_input_tokens_seen": 76677430, - "step": 3553, - "time_per_iteration": 2.730829954147339 - }, - { - "auxiliary_loss_clip": 0.01142267, - "auxiliary_loss_mlp": 0.01046102, - "balance_loss_clip": 1.05434144, - "balance_loss_mlp": 1.02915072, - "epoch": 0.21367803998196302, - "flos": 22162306556160.0, - "grad_norm": 1.5008078028945642, - "language_loss": 0.72580731, - "learning_rate": 3.6564060441650843e-06, - "loss": 0.74769098, - "num_input_tokens_seen": 76697615, - "step": 3554, - "time_per_iteration": 2.701207399368286 - }, - { - "auxiliary_loss_clip": 0.01097601, - "auxiliary_loss_mlp": 0.00776401, - "balance_loss_clip": 1.04785013, - "balance_loss_mlp": 1.00128174, - "epoch": 0.213738163234631, - "flos": 20886867452160.0, - "grad_norm": 2.0681583889949957, - "language_loss": 0.67728174, - "learning_rate": 3.6561877474563724e-06, - "loss": 0.69602168, - "num_input_tokens_seen": 76715685, - "step": 3555, - "time_per_iteration": 2.76454758644104 - }, - { - "auxiliary_loss_clip": 0.01124456, - "auxiliary_loss_mlp": 0.01045031, - "balance_loss_clip": 1.06086278, - "balance_loss_mlp": 1.02689981, - "epoch": 0.21379828648729896, - "flos": 28403114135040.0, - "grad_norm": 2.155752981705525, - "language_loss": 0.64553648, - "learning_rate": 3.6559693879448553e-06, - "loss": 0.66723132, - "num_input_tokens_seen": 76735405, - "step": 3556, - "time_per_iteration": 2.839993953704834 - }, - { - "auxiliary_loss_clip": 0.01139371, - "auxiliary_loss_mlp": 0.01051642, - "balance_loss_clip": 1.05236566, - "balance_loss_mlp": 1.0331769, - "epoch": 0.21385840973996692, - "flos": 25479662106240.0, - "grad_norm": 1.7378281716746964, - "language_loss": 0.72588408, - "learning_rate": 3.6557509656388125e-06, - "loss": 0.74779421, - "num_input_tokens_seen": 76754395, - "step": 3557, - "time_per_iteration": 2.7678587436676025 - }, - { - "auxiliary_loss_clip": 0.01151319, - "auxiliary_loss_mlp": 0.00776703, - "balance_loss_clip": 1.0647192, - "balance_loss_mlp": 1.00117195, - "epoch": 0.2139185329926349, - "flos": 28074280101120.0, - "grad_norm": 1.8333462571334693, - "language_loss": 0.6714859, - "learning_rate": 3.655532480546528e-06, - "loss": 0.6907661, - "num_input_tokens_seen": 76777210, - "step": 3558, - "time_per_iteration": 2.7584826946258545 - }, - { - "auxiliary_loss_clip": 0.01159331, - "auxiliary_loss_mlp": 0.01041115, - "balance_loss_clip": 1.0541842, - "balance_loss_mlp": 1.02297139, - "epoch": 0.21397865624530288, - "flos": 19608698914560.0, - "grad_norm": 1.8974456617751176, - "language_loss": 0.79882181, - "learning_rate": 3.655313932676286e-06, - "loss": 0.82082617, - "num_input_tokens_seen": 76795830, - "step": 3559, - "time_per_iteration": 2.6918041706085205 - }, - { - "auxiliary_loss_clip": 0.01155068, - "auxiliary_loss_mlp": 0.01046018, - "balance_loss_clip": 1.05566323, - "balance_loss_mlp": 1.0295198, - "epoch": 0.21403877949797084, - "flos": 24681476033280.0, - "grad_norm": 1.8730564704536732, - "language_loss": 0.68085694, - "learning_rate": 3.655095322036373e-06, - "loss": 0.70286781, - "num_input_tokens_seen": 76814700, - "step": 3560, - "time_per_iteration": 2.6445770263671875 - }, - { - "auxiliary_loss_clip": 0.01145074, - "auxiliary_loss_mlp": 0.01043706, - "balance_loss_clip": 1.0535686, - "balance_loss_mlp": 1.02537155, - "epoch": 0.2140989027506388, - "flos": 19861150677120.0, - "grad_norm": 1.8952415763477797, - "language_loss": 0.73272544, - "learning_rate": 3.65487664863508e-06, - "loss": 0.75461322, - "num_input_tokens_seen": 76833400, - "step": 3561, - "time_per_iteration": 2.6568899154663086 - }, - { - "auxiliary_loss_clip": 0.01133795, - "auxiliary_loss_mlp": 0.01044555, - "balance_loss_clip": 1.05333674, - "balance_loss_mlp": 1.02700794, - "epoch": 0.21415902600330677, - "flos": 19135324552320.0, - "grad_norm": 2.1953085541278203, - "language_loss": 0.78028738, - "learning_rate": 3.654657912480698e-06, - "loss": 0.80207092, - "num_input_tokens_seen": 76850645, - "step": 3562, - "time_per_iteration": 2.73655104637146 - }, - { - "auxiliary_loss_clip": 0.01155634, - "auxiliary_loss_mlp": 0.01042255, - "balance_loss_clip": 1.05661631, - "balance_loss_mlp": 1.02457595, - "epoch": 0.21421914925597474, - "flos": 22272624201600.0, - "grad_norm": 3.5245068195694937, - "language_loss": 0.84338713, - "learning_rate": 3.6544391135815237e-06, - "loss": 0.86536604, - "num_input_tokens_seen": 76870135, - "step": 3563, - "time_per_iteration": 2.676630973815918 - }, - { - "auxiliary_loss_clip": 0.01157426, - "auxiliary_loss_mlp": 0.01036109, - "balance_loss_clip": 1.05830729, - "balance_loss_mlp": 1.01957488, - "epoch": 0.2142792725086427, - "flos": 33875109987840.0, - "grad_norm": 1.5172669047015535, - "language_loss": 0.76581991, - "learning_rate": 3.6542202519458507e-06, - "loss": 0.78775525, - "num_input_tokens_seen": 76893905, - "step": 3564, - "time_per_iteration": 2.7504193782806396 - }, - { - "auxiliary_loss_clip": 0.01134427, - "auxiliary_loss_mlp": 0.01044002, - "balance_loss_clip": 1.06131172, - "balance_loss_mlp": 1.02674031, - "epoch": 0.2143393957613107, - "flos": 19860216923520.0, - "grad_norm": 1.7115347614953564, - "language_loss": 0.88466394, - "learning_rate": 3.654001327581981e-06, - "loss": 0.90644825, - "num_input_tokens_seen": 76914205, - "step": 3565, - "time_per_iteration": 2.7911624908447266 - }, - { - "auxiliary_loss_clip": 0.01071735, - "auxiliary_loss_mlp": 0.01008336, - "balance_loss_clip": 1.05462575, - "balance_loss_mlp": 1.0057019, - "epoch": 0.21439951901397866, - "flos": 68530093090560.0, - "grad_norm": 0.8339683756542131, - "language_loss": 0.52192736, - "learning_rate": 3.653782340498215e-06, - "loss": 0.54272807, - "num_input_tokens_seen": 76975650, - "step": 3566, - "time_per_iteration": 3.1801936626434326 - }, - { - "auxiliary_loss_clip": 0.01141614, - "auxiliary_loss_mlp": 0.01041326, - "balance_loss_clip": 1.05527854, - "balance_loss_mlp": 1.02505386, - "epoch": 0.21445964226664663, - "flos": 19682998197120.0, - "grad_norm": 1.8485820369681922, - "language_loss": 0.67324477, - "learning_rate": 3.6535632907028566e-06, - "loss": 0.6950742, - "num_input_tokens_seen": 76992615, - "step": 3567, - "time_per_iteration": 2.6948626041412354 - }, - { - "auxiliary_loss_clip": 0.01123629, - "auxiliary_loss_mlp": 0.01045447, - "balance_loss_clip": 1.05142832, - "balance_loss_mlp": 1.02749455, - "epoch": 0.2145197655193146, - "flos": 31107259676160.0, - "grad_norm": 3.2542445550844317, - "language_loss": 0.74213678, - "learning_rate": 3.6533441782042126e-06, - "loss": 0.76382756, - "num_input_tokens_seen": 77017005, - "step": 3568, - "time_per_iteration": 4.396210670471191 - }, - { - "auxiliary_loss_clip": 0.01140095, - "auxiliary_loss_mlp": 0.01050029, - "balance_loss_clip": 1.05480075, - "balance_loss_mlp": 1.03333998, - "epoch": 0.21457988877198256, - "flos": 20120785159680.0, - "grad_norm": 1.7132363384404574, - "language_loss": 0.77343202, - "learning_rate": 3.6531250030105917e-06, - "loss": 0.79533333, - "num_input_tokens_seen": 77034990, - "step": 3569, - "time_per_iteration": 4.224002122879028 - }, - { - "auxiliary_loss_clip": 0.011511, - "auxiliary_loss_mlp": 0.0104435, - "balance_loss_clip": 1.05651093, - "balance_loss_mlp": 1.02521753, - "epoch": 0.21464001202465052, - "flos": 18588045957120.0, - "grad_norm": 2.6050136504577583, - "language_loss": 0.70278227, - "learning_rate": 3.6529057651303053e-06, - "loss": 0.72473681, - "num_input_tokens_seen": 77052610, - "step": 3570, - "time_per_iteration": 2.668304681777954 - }, - { - "auxiliary_loss_clip": 0.01158856, - "auxiliary_loss_mlp": 0.01046783, - "balance_loss_clip": 1.05765057, - "balance_loss_mlp": 1.02955759, - "epoch": 0.21470013527731852, - "flos": 21835160461440.0, - "grad_norm": 2.5503136440013647, - "language_loss": 0.79031628, - "learning_rate": 3.6526864645716666e-06, - "loss": 0.81237268, - "num_input_tokens_seen": 77072475, - "step": 3571, - "time_per_iteration": 4.066440105438232 - }, - { - "auxiliary_loss_clip": 0.0113831, - "auxiliary_loss_mlp": 0.01047146, - "balance_loss_clip": 1.05283594, - "balance_loss_mlp": 1.02703547, - "epoch": 0.21476025852998648, - "flos": 17603195880960.0, - "grad_norm": 1.9606975528380188, - "language_loss": 0.82601345, - "learning_rate": 3.652467101342991e-06, - "loss": 0.84786803, - "num_input_tokens_seen": 77089930, - "step": 3572, - "time_per_iteration": 2.6096267700195312 - }, - { - "auxiliary_loss_clip": 0.01134964, - "auxiliary_loss_mlp": 0.01041355, - "balance_loss_clip": 1.05588293, - "balance_loss_mlp": 1.02358127, - "epoch": 0.21482038178265445, - "flos": 24828135264000.0, - "grad_norm": 4.1014522432452285, - "language_loss": 0.65240026, - "learning_rate": 3.652247675452598e-06, - "loss": 0.67416352, - "num_input_tokens_seen": 77108970, - "step": 3573, - "time_per_iteration": 2.690986394882202 - }, - { - "auxiliary_loss_clip": 0.01147698, - "auxiliary_loss_mlp": 0.01048414, - "balance_loss_clip": 1.05253768, - "balance_loss_mlp": 1.03140295, - "epoch": 0.2148805050353224, - "flos": 23258228463360.0, - "grad_norm": 2.3397683674355565, - "language_loss": 0.75229824, - "learning_rate": 3.652028186908807e-06, - "loss": 0.77425939, - "num_input_tokens_seen": 77126045, - "step": 3574, - "time_per_iteration": 2.621736526489258 - }, - { - "auxiliary_loss_clip": 0.01138272, - "auxiliary_loss_mlp": 0.01041549, - "balance_loss_clip": 1.0526228, - "balance_loss_mlp": 1.02414417, - "epoch": 0.21494062828799038, - "flos": 21321098968320.0, - "grad_norm": 1.8157113535402463, - "language_loss": 0.72179317, - "learning_rate": 3.6518086357199416e-06, - "loss": 0.74359143, - "num_input_tokens_seen": 77144600, - "step": 3575, - "time_per_iteration": 4.362869501113892 - }, - { - "auxiliary_loss_clip": 0.01126687, - "auxiliary_loss_mlp": 0.01041186, - "balance_loss_clip": 1.05261374, - "balance_loss_mlp": 1.02422237, - "epoch": 0.21500075154065834, - "flos": 18843334894080.0, - "grad_norm": 3.8402092268612216, - "language_loss": 0.68255925, - "learning_rate": 3.6515890218943277e-06, - "loss": 0.70423794, - "num_input_tokens_seen": 77162965, - "step": 3576, - "time_per_iteration": 2.665370225906372 - }, - { - "auxiliary_loss_clip": 0.01138295, - "auxiliary_loss_mlp": 0.01049053, - "balance_loss_clip": 1.05064976, - "balance_loss_mlp": 1.02859676, - "epoch": 0.2150608747933263, - "flos": 18441997257600.0, - "grad_norm": 2.2101409055401566, - "language_loss": 0.88707685, - "learning_rate": 3.651369345440292e-06, - "loss": 0.90895033, - "num_input_tokens_seen": 77179960, - "step": 3577, - "time_per_iteration": 2.655118465423584 - }, - { - "auxiliary_loss_clip": 0.01070337, - "auxiliary_loss_mlp": 0.01022454, - "balance_loss_clip": 1.0487709, - "balance_loss_mlp": 1.01998615, - "epoch": 0.2151209980459943, - "flos": 66598242894720.0, - "grad_norm": 0.8146982557647512, - "language_loss": 0.56184745, - "learning_rate": 3.6511496063661654e-06, - "loss": 0.58277535, - "num_input_tokens_seen": 77239500, - "step": 3578, - "time_per_iteration": 3.2133536338806152 - }, - { - "auxiliary_loss_clip": 0.01144391, - "auxiliary_loss_mlp": 0.00775114, - "balance_loss_clip": 1.05492067, - "balance_loss_mlp": 1.00130272, - "epoch": 0.21518112129866226, - "flos": 21575885114880.0, - "grad_norm": 2.988933296047806, - "language_loss": 0.88686001, - "learning_rate": 3.6509298046802807e-06, - "loss": 0.90605509, - "num_input_tokens_seen": 77254680, - "step": 3579, - "time_per_iteration": 2.6801605224609375 - }, - { - "auxiliary_loss_clip": 0.01143273, - "auxiliary_loss_mlp": 0.0104707, - "balance_loss_clip": 1.05253708, - "balance_loss_mlp": 1.02945101, - "epoch": 0.21524124455133023, - "flos": 20047635112320.0, - "grad_norm": 1.8556029181899094, - "language_loss": 0.77953792, - "learning_rate": 3.650709940390972e-06, - "loss": 0.80144137, - "num_input_tokens_seen": 77274060, - "step": 3580, - "time_per_iteration": 2.6932644844055176 - }, - { - "auxiliary_loss_clip": 0.01145284, - "auxiliary_loss_mlp": 0.01043211, - "balance_loss_clip": 1.05702484, - "balance_loss_mlp": 1.02543712, - "epoch": 0.2153013678039982, - "flos": 23951807153280.0, - "grad_norm": 1.9843281400180077, - "language_loss": 0.72948015, - "learning_rate": 3.6504900135065775e-06, - "loss": 0.75136507, - "num_input_tokens_seen": 77293255, - "step": 3581, - "time_per_iteration": 2.712376117706299 - }, - { - "auxiliary_loss_clip": 0.01138503, - "auxiliary_loss_mlp": 0.0104555, - "balance_loss_clip": 1.05348194, - "balance_loss_mlp": 1.0269891, - "epoch": 0.21536149105666616, - "flos": 20594841880320.0, - "grad_norm": 2.4257233983700113, - "language_loss": 0.70726413, - "learning_rate": 3.6502700240354357e-06, - "loss": 0.72910464, - "num_input_tokens_seen": 77312390, - "step": 3582, - "time_per_iteration": 2.67122220993042 - }, - { - "auxiliary_loss_clip": 0.01154755, - "auxiliary_loss_mlp": 0.01040327, - "balance_loss_clip": 1.05591798, - "balance_loss_mlp": 1.0227195, - "epoch": 0.21542161430933413, - "flos": 12860042895360.0, - "grad_norm": 2.4025311229753363, - "language_loss": 0.84906816, - "learning_rate": 3.650049971985889e-06, - "loss": 0.87101901, - "num_input_tokens_seen": 77330985, - "step": 3583, - "time_per_iteration": 2.6395328044891357 - }, - { - "auxiliary_loss_clip": 0.01133287, - "auxiliary_loss_mlp": 0.01047024, - "balance_loss_clip": 1.05368245, - "balance_loss_mlp": 1.02971518, - "epoch": 0.21548173756200212, - "flos": 26103933504000.0, - "grad_norm": 2.7569743809923533, - "language_loss": 0.83223897, - "learning_rate": 3.6498298573662824e-06, - "loss": 0.85404205, - "num_input_tokens_seen": 77350770, - "step": 3584, - "time_per_iteration": 2.730823040008545 - }, - { - "auxiliary_loss_clip": 0.01118851, - "auxiliary_loss_mlp": 0.00774813, - "balance_loss_clip": 1.0520674, - "balance_loss_mlp": 1.00120699, - "epoch": 0.21554186081467008, - "flos": 22163779013760.0, - "grad_norm": 1.9634031706782962, - "language_loss": 0.90054697, - "learning_rate": 3.6496096801849625e-06, - "loss": 0.9194836, - "num_input_tokens_seen": 77370510, - "step": 3585, - "time_per_iteration": 2.722216844558716 - }, - { - "auxiliary_loss_clip": 0.01145179, - "auxiliary_loss_mlp": 0.01045359, - "balance_loss_clip": 1.05783939, - "balance_loss_mlp": 1.02793026, - "epoch": 0.21560198406733805, - "flos": 22966741595520.0, - "grad_norm": 1.9859337557251673, - "language_loss": 0.74663597, - "learning_rate": 3.649389440450277e-06, - "loss": 0.76854134, - "num_input_tokens_seen": 77390645, - "step": 3586, - "time_per_iteration": 2.7681503295898438 - }, - { - "auxiliary_loss_clip": 0.01120328, - "auxiliary_loss_mlp": 0.01046334, - "balance_loss_clip": 1.05628061, - "balance_loss_mlp": 1.03011, - "epoch": 0.215662107320006, - "flos": 22784064001920.0, - "grad_norm": 2.903090853788092, - "language_loss": 0.83029532, - "learning_rate": 3.6491691381705804e-06, - "loss": 0.85196197, - "num_input_tokens_seen": 77409655, - "step": 3587, - "time_per_iteration": 2.788416624069214 - }, - { - "auxiliary_loss_clip": 0.01109364, - "auxiliary_loss_mlp": 0.00776304, - "balance_loss_clip": 1.05255485, - "balance_loss_mlp": 1.00129569, - "epoch": 0.21572223057267398, - "flos": 30883859038080.0, - "grad_norm": 1.7067147212291012, - "language_loss": 0.75593436, - "learning_rate": 3.648948773354224e-06, - "loss": 0.774791, - "num_input_tokens_seen": 77430560, - "step": 3588, - "time_per_iteration": 2.866584062576294 - }, - { - "auxiliary_loss_clip": 0.01136336, - "auxiliary_loss_mlp": 0.01039583, - "balance_loss_clip": 1.04921389, - "balance_loss_mlp": 1.0224762, - "epoch": 0.21578235382534194, - "flos": 26910487445760.0, - "grad_norm": 1.721393113594195, - "language_loss": 0.80745661, - "learning_rate": 3.6487283460095643e-06, - "loss": 0.82921582, - "num_input_tokens_seen": 77455000, - "step": 3589, - "time_per_iteration": 2.8839404582977295 - }, - { - "auxiliary_loss_clip": 0.01157121, - "auxiliary_loss_mlp": 0.010363, - "balance_loss_clip": 1.05677748, - "balance_loss_mlp": 1.01992083, - "epoch": 0.2158424770780099, - "flos": 24425720219520.0, - "grad_norm": 2.201221744880259, - "language_loss": 0.72849286, - "learning_rate": 3.648507856144961e-06, - "loss": 0.75042707, - "num_input_tokens_seen": 77475075, - "step": 3590, - "time_per_iteration": 2.6692256927490234 - }, - { - "auxiliary_loss_clip": 0.01134591, - "auxiliary_loss_mlp": 0.01044904, - "balance_loss_clip": 1.05195427, - "balance_loss_mlp": 1.02623618, - "epoch": 0.2159026003306779, - "flos": 23949975559680.0, - "grad_norm": 2.25677544320114, - "language_loss": 0.8402462, - "learning_rate": 3.648287303768775e-06, - "loss": 0.86204112, - "num_input_tokens_seen": 77495945, - "step": 3591, - "time_per_iteration": 2.7531416416168213 - }, - { - "auxiliary_loss_clip": 0.01123784, - "auxiliary_loss_mlp": 0.01049552, - "balance_loss_clip": 1.05391979, - "balance_loss_mlp": 1.02972734, - "epoch": 0.21596272358334587, - "flos": 30040963511040.0, - "grad_norm": 2.2410681113576585, - "language_loss": 0.69175243, - "learning_rate": 3.6480666888893686e-06, - "loss": 0.71348578, - "num_input_tokens_seen": 77517140, - "step": 3592, - "time_per_iteration": 2.8716177940368652 - }, - { - "auxiliary_loss_clip": 0.01117322, - "auxiliary_loss_mlp": 0.01050667, - "balance_loss_clip": 1.04998767, - "balance_loss_mlp": 1.03179634, - "epoch": 0.21602284683601383, - "flos": 20376217751040.0, - "grad_norm": 2.3652325886308123, - "language_loss": 0.84022737, - "learning_rate": 3.647846011515108e-06, - "loss": 0.86190724, - "num_input_tokens_seen": 77536085, - "step": 3593, - "time_per_iteration": 2.7185158729553223 - }, - { - "auxiliary_loss_clip": 0.01123006, - "auxiliary_loss_mlp": 0.01048394, - "balance_loss_clip": 1.05243289, - "balance_loss_mlp": 1.029809, - "epoch": 0.2160829700886818, - "flos": 20777339905920.0, - "grad_norm": 4.017970268493579, - "language_loss": 0.75192308, - "learning_rate": 3.6476252716543625e-06, - "loss": 0.77363706, - "num_input_tokens_seen": 77553675, - "step": 3594, - "time_per_iteration": 2.726027011871338 - }, - { - "auxiliary_loss_clip": 0.01140408, - "auxiliary_loss_mlp": 0.01044406, - "balance_loss_clip": 1.05318236, - "balance_loss_mlp": 1.02650058, - "epoch": 0.21614309334134976, - "flos": 22309755886080.0, - "grad_norm": 1.541030891618627, - "language_loss": 0.80459857, - "learning_rate": 3.6474044693155007e-06, - "loss": 0.82644665, - "num_input_tokens_seen": 77573360, - "step": 3595, - "time_per_iteration": 2.66504168510437 - }, - { - "auxiliary_loss_clip": 0.01119754, - "auxiliary_loss_mlp": 0.01039521, - "balance_loss_clip": 1.05060601, - "balance_loss_mlp": 1.02125788, - "epoch": 0.21620321659401773, - "flos": 19609524927360.0, - "grad_norm": 2.1030283577585007, - "language_loss": 0.78930759, - "learning_rate": 3.647183604506897e-06, - "loss": 0.81090033, - "num_input_tokens_seen": 77591865, - "step": 3596, - "time_per_iteration": 2.7159698009490967 - }, - { - "auxiliary_loss_clip": 0.01080261, - "auxiliary_loss_mlp": 0.01047978, - "balance_loss_clip": 1.04591155, - "balance_loss_mlp": 1.03106225, - "epoch": 0.2162633398466857, - "flos": 18844555956480.0, - "grad_norm": 1.6709210997095376, - "language_loss": 0.83061242, - "learning_rate": 3.6469626772369253e-06, - "loss": 0.85189474, - "num_input_tokens_seen": 77611600, - "step": 3597, - "time_per_iteration": 2.79276704788208 - }, - { - "auxiliary_loss_clip": 0.01133147, - "auxiliary_loss_mlp": 0.00775626, - "balance_loss_clip": 1.05385637, - "balance_loss_mlp": 1.00146937, - "epoch": 0.21632346309935369, - "flos": 18768820129920.0, - "grad_norm": 1.6388312470031852, - "language_loss": 0.80549502, - "learning_rate": 3.6467416875139642e-06, - "loss": 0.8245827, - "num_input_tokens_seen": 77630665, - "step": 3598, - "time_per_iteration": 2.6823580265045166 - }, - { - "auxiliary_loss_clip": 0.01123845, - "auxiliary_loss_mlp": 0.01051638, - "balance_loss_clip": 1.05069876, - "balance_loss_mlp": 1.03218365, - "epoch": 0.21638358635202165, - "flos": 26324173745280.0, - "grad_norm": 1.9066675721358164, - "language_loss": 0.82023275, - "learning_rate": 3.6465206353463934e-06, - "loss": 0.84198749, - "num_input_tokens_seen": 77650835, - "step": 3599, - "time_per_iteration": 2.73583722114563 - }, - { - "auxiliary_loss_clip": 0.0110774, - "auxiliary_loss_mlp": 0.00775854, - "balance_loss_clip": 1.04651821, - "balance_loss_mlp": 1.00131536, - "epoch": 0.21644370960468962, - "flos": 20740854666240.0, - "grad_norm": 2.996184273033617, - "language_loss": 0.76724887, - "learning_rate": 3.6462995207425947e-06, - "loss": 0.78608489, - "num_input_tokens_seen": 77669000, - "step": 3600, - "time_per_iteration": 2.695081949234009 - }, - { - "auxiliary_loss_clip": 0.01112458, - "auxiliary_loss_mlp": 0.01044855, - "balance_loss_clip": 1.04869664, - "balance_loss_mlp": 1.02886891, - "epoch": 0.21650383285735758, - "flos": 23952238116480.0, - "grad_norm": 2.259096111885494, - "language_loss": 0.80784452, - "learning_rate": 3.6460783437109533e-06, - "loss": 0.82941765, - "num_input_tokens_seen": 77688745, - "step": 3601, - "time_per_iteration": 2.8094849586486816 - }, - { - "auxiliary_loss_clip": 0.01155408, - "auxiliary_loss_mlp": 0.01046912, - "balance_loss_clip": 1.0550983, - "balance_loss_mlp": 1.02973413, - "epoch": 0.21656395611002555, - "flos": 23696087253120.0, - "grad_norm": 2.558776342313561, - "language_loss": 0.83192647, - "learning_rate": 3.6458571042598565e-06, - "loss": 0.85394967, - "num_input_tokens_seen": 77708445, - "step": 3602, - "time_per_iteration": 2.652876377105713 - }, - { - "auxiliary_loss_clip": 0.0115161, - "auxiliary_loss_mlp": 0.0105032, - "balance_loss_clip": 1.0525223, - "balance_loss_mlp": 1.03286743, - "epoch": 0.2166240793626935, - "flos": 20666052593280.0, - "grad_norm": 1.768938326380195, - "language_loss": 0.7449019, - "learning_rate": 3.645635802397693e-06, - "loss": 0.76692116, - "num_input_tokens_seen": 77728465, - "step": 3603, - "time_per_iteration": 2.619614601135254 - }, - { - "auxiliary_loss_clip": 0.01116481, - "auxiliary_loss_mlp": 0.01047384, - "balance_loss_clip": 1.04873598, - "balance_loss_mlp": 1.02883554, - "epoch": 0.2166842026153615, - "flos": 21580410228480.0, - "grad_norm": 1.6710689829239502, - "language_loss": 0.74178421, - "learning_rate": 3.645414438132855e-06, - "loss": 0.76342291, - "num_input_tokens_seen": 77746735, - "step": 3604, - "time_per_iteration": 2.730182647705078 - }, - { - "auxiliary_loss_clip": 0.01138214, - "auxiliary_loss_mlp": 0.01038079, - "balance_loss_clip": 1.05246544, - "balance_loss_mlp": 1.02124691, - "epoch": 0.21674432586802947, - "flos": 25629948610560.0, - "grad_norm": 1.7167946204354523, - "language_loss": 0.7990489, - "learning_rate": 3.6451930114737366e-06, - "loss": 0.82081187, - "num_input_tokens_seen": 77768105, - "step": 3605, - "time_per_iteration": 2.67668080329895 - }, - { - "auxiliary_loss_clip": 0.01079717, - "auxiliary_loss_mlp": 0.01002026, - "balance_loss_clip": 1.0400598, - "balance_loss_mlp": 0.99942732, - "epoch": 0.21680444912069743, - "flos": 56417783616000.0, - "grad_norm": 0.7112415560884942, - "language_loss": 0.5834192, - "learning_rate": 3.6449715224287347e-06, - "loss": 0.6042366, - "num_input_tokens_seen": 77833750, - "step": 3606, - "time_per_iteration": 3.2736570835113525 - }, - { - "auxiliary_loss_clip": 0.01155294, - "auxiliary_loss_mlp": 0.01043491, - "balance_loss_clip": 1.05404341, - "balance_loss_mlp": 1.02498984, - "epoch": 0.2168645723733654, - "flos": 23878944414720.0, - "grad_norm": 2.2731951350022275, - "language_loss": 0.73142302, - "learning_rate": 3.644749971006248e-06, - "loss": 0.75341088, - "num_input_tokens_seen": 77853780, - "step": 3607, - "time_per_iteration": 4.267899990081787 - }, - { - "auxiliary_loss_clip": 0.01133762, - "auxiliary_loss_mlp": 0.01046639, - "balance_loss_clip": 1.05282903, - "balance_loss_mlp": 1.02789962, - "epoch": 0.21692469562603336, - "flos": 16946174257920.0, - "grad_norm": 2.181379073292718, - "language_loss": 0.76540339, - "learning_rate": 3.6445283572146765e-06, - "loss": 0.78720737, - "num_input_tokens_seen": 77872575, - "step": 3608, - "time_per_iteration": 4.285630464553833 - }, - { - "auxiliary_loss_clip": 0.01080204, - "auxiliary_loss_mlp": 0.01047623, - "balance_loss_clip": 1.04536235, - "balance_loss_mlp": 1.0309217, - "epoch": 0.21698481887870133, - "flos": 25119047514240.0, - "grad_norm": 2.042587105390135, - "language_loss": 0.74584132, - "learning_rate": 3.6443066810624255e-06, - "loss": 0.76711953, - "num_input_tokens_seen": 77892700, - "step": 3609, - "time_per_iteration": 2.802569627761841 - }, - { - "auxiliary_loss_clip": 0.01131798, - "auxiliary_loss_mlp": 0.01049353, - "balance_loss_clip": 1.05227149, - "balance_loss_mlp": 1.03159094, - "epoch": 0.2170449421313693, - "flos": 17894682748800.0, - "grad_norm": 1.9074832440543417, - "language_loss": 0.89132321, - "learning_rate": 3.6440849425579e-06, - "loss": 0.91313475, - "num_input_tokens_seen": 77911060, - "step": 3610, - "time_per_iteration": 4.189727306365967 - }, - { - "auxiliary_loss_clip": 0.01155294, - "auxiliary_loss_mlp": 0.01044238, - "balance_loss_clip": 1.05534768, - "balance_loss_mlp": 1.02649963, - "epoch": 0.2171050653840373, - "flos": 22638446265600.0, - "grad_norm": 2.058717355808165, - "language_loss": 0.77779067, - "learning_rate": 3.6438631417095095e-06, - "loss": 0.79978603, - "num_input_tokens_seen": 77929930, - "step": 3611, - "time_per_iteration": 2.6317896842956543 - }, - { - "auxiliary_loss_clip": 0.01088447, - "auxiliary_loss_mlp": 0.01047447, - "balance_loss_clip": 1.04764366, - "balance_loss_mlp": 1.03026867, - "epoch": 0.21716518863670525, - "flos": 19499997381120.0, - "grad_norm": 2.3883055198257184, - "language_loss": 0.63578451, - "learning_rate": 3.6436412785256637e-06, - "loss": 0.65714347, - "num_input_tokens_seen": 77949060, - "step": 3612, - "time_per_iteration": 2.8771228790283203 - }, - { - "auxiliary_loss_clip": 0.01091118, - "auxiliary_loss_mlp": 0.01053996, - "balance_loss_clip": 1.04585218, - "balance_loss_mlp": 1.03454065, - "epoch": 0.21722531188937322, - "flos": 19792022952960.0, - "grad_norm": 1.801964584441428, - "language_loss": 0.75912857, - "learning_rate": 3.643419353014776e-06, - "loss": 0.78057969, - "num_input_tokens_seen": 77967920, - "step": 3613, - "time_per_iteration": 2.710601568222046 - }, - { - "auxiliary_loss_clip": 0.0110572, - "auxiliary_loss_mlp": 0.01051253, - "balance_loss_clip": 1.05008733, - "balance_loss_mlp": 1.03121352, - "epoch": 0.21728543514204118, - "flos": 13334386924800.0, - "grad_norm": 1.9293696862218277, - "language_loss": 0.71047795, - "learning_rate": 3.643197365185261e-06, - "loss": 0.73204768, - "num_input_tokens_seen": 77985330, - "step": 3614, - "time_per_iteration": 4.407632112503052 - }, - { - "auxiliary_loss_clip": 0.0114355, - "auxiliary_loss_mlp": 0.01048776, - "balance_loss_clip": 1.05521107, - "balance_loss_mlp": 1.0306083, - "epoch": 0.21734555839470915, - "flos": 15231870783360.0, - "grad_norm": 1.7289280951335333, - "language_loss": 0.73030001, - "learning_rate": 3.6429753150455378e-06, - "loss": 0.75222325, - "num_input_tokens_seen": 78003105, - "step": 3615, - "time_per_iteration": 2.6358401775360107 - }, - { - "auxiliary_loss_clip": 0.01145731, - "auxiliary_loss_mlp": 0.01046632, - "balance_loss_clip": 1.05206716, - "balance_loss_mlp": 1.02703404, - "epoch": 0.2174056816473771, - "flos": 19973982274560.0, - "grad_norm": 2.3648922858816976, - "language_loss": 0.90127194, - "learning_rate": 3.6427532026040263e-06, - "loss": 0.92319548, - "num_input_tokens_seen": 78019655, - "step": 3616, - "time_per_iteration": 2.659787178039551 - }, - { - "auxiliary_loss_clip": 0.01103597, - "auxiliary_loss_mlp": 0.01040899, - "balance_loss_clip": 1.048136, - "balance_loss_mlp": 1.02244496, - "epoch": 0.21746580490004508, - "flos": 16687293960960.0, - "grad_norm": 2.928463545610362, - "language_loss": 0.81107831, - "learning_rate": 3.642531027869148e-06, - "loss": 0.83252329, - "num_input_tokens_seen": 78036025, - "step": 3617, - "time_per_iteration": 2.7723491191864014 - }, - { - "auxiliary_loss_clip": 0.01132531, - "auxiliary_loss_mlp": 0.01041286, - "balance_loss_clip": 1.05330408, - "balance_loss_mlp": 1.02382231, - "epoch": 0.21752592815271307, - "flos": 25772298209280.0, - "grad_norm": 1.9251992817215786, - "language_loss": 0.75688154, - "learning_rate": 3.642308790849329e-06, - "loss": 0.77861977, - "num_input_tokens_seen": 78055645, - "step": 3618, - "time_per_iteration": 2.7608227729797363 - }, - { - "auxiliary_loss_clip": 0.01147874, - "auxiliary_loss_mlp": 0.01048647, - "balance_loss_clip": 1.05600834, - "balance_loss_mlp": 1.03045571, - "epoch": 0.21758605140538104, - "flos": 11254692349440.0, - "grad_norm": 2.18435089101569, - "language_loss": 0.69099152, - "learning_rate": 3.642086491552996e-06, - "loss": 0.71295673, - "num_input_tokens_seen": 78071660, - "step": 3619, - "time_per_iteration": 2.671637773513794 - }, - { - "auxiliary_loss_clip": 0.01144421, - "auxiliary_loss_mlp": 0.01042659, - "balance_loss_clip": 1.05394137, - "balance_loss_mlp": 1.02482569, - "epoch": 0.217646174658049, - "flos": 19242625455360.0, - "grad_norm": 4.829425462001391, - "language_loss": 0.78716505, - "learning_rate": 3.641864129988579e-06, - "loss": 0.8090359, - "num_input_tokens_seen": 78091265, - "step": 3620, - "time_per_iteration": 2.7232043743133545 - }, - { - "auxiliary_loss_clip": 0.01148457, - "auxiliary_loss_mlp": 0.01042109, - "balance_loss_clip": 1.05161178, - "balance_loss_mlp": 1.02507412, - "epoch": 0.21770629791071697, - "flos": 21945083057280.0, - "grad_norm": 1.4663479636678602, - "language_loss": 0.79966211, - "learning_rate": 3.641641706164509e-06, - "loss": 0.82156777, - "num_input_tokens_seen": 78110095, - "step": 3621, - "time_per_iteration": 2.6326823234558105 - }, - { - "auxiliary_loss_clip": 0.01143183, - "auxiliary_loss_mlp": 0.01035793, - "balance_loss_clip": 1.05334592, - "balance_loss_mlp": 1.01955688, - "epoch": 0.21776642116338493, - "flos": 24936764970240.0, - "grad_norm": 1.609721344037994, - "language_loss": 0.87796915, - "learning_rate": 3.641419220089221e-06, - "loss": 0.89975888, - "num_input_tokens_seen": 78129475, - "step": 3622, - "time_per_iteration": 2.6864428520202637 - }, - { - "auxiliary_loss_clip": 0.01146899, - "auxiliary_loss_mlp": 0.01037591, - "balance_loss_clip": 1.05495822, - "balance_loss_mlp": 1.01801729, - "epoch": 0.2178265444160529, - "flos": 17821317219840.0, - "grad_norm": 1.856609178217172, - "language_loss": 0.77077621, - "learning_rate": 3.641196671771152e-06, - "loss": 0.79262108, - "num_input_tokens_seen": 78146880, - "step": 3623, - "time_per_iteration": 2.743601083755493 - }, - { - "auxiliary_loss_clip": 0.01121788, - "auxiliary_loss_mlp": 0.01052122, - "balance_loss_clip": 1.05279899, - "balance_loss_mlp": 1.03226197, - "epoch": 0.2178866676687209, - "flos": 17712902995200.0, - "grad_norm": 2.4362835431673036, - "language_loss": 0.84600008, - "learning_rate": 3.640974061218741e-06, - "loss": 0.86773914, - "num_input_tokens_seen": 78165065, - "step": 3624, - "time_per_iteration": 2.7499353885650635 - }, - { - "auxiliary_loss_clip": 0.01139543, - "auxiliary_loss_mlp": 0.01057514, - "balance_loss_clip": 1.05353129, - "balance_loss_mlp": 1.03804684, - "epoch": 0.21794679092138886, - "flos": 16945851035520.0, - "grad_norm": 2.4333310175924905, - "language_loss": 0.78037703, - "learning_rate": 3.640751388440429e-06, - "loss": 0.80234766, - "num_input_tokens_seen": 78180005, - "step": 3625, - "time_per_iteration": 2.6314821243286133 - }, - { - "auxiliary_loss_clip": 0.01061536, - "auxiliary_loss_mlp": 0.01003869, - "balance_loss_clip": 1.03318405, - "balance_loss_mlp": 1.00130582, - "epoch": 0.21800691417405682, - "flos": 63718566566400.0, - "grad_norm": 0.8242097668179436, - "language_loss": 0.60701489, - "learning_rate": 3.64052865344466e-06, - "loss": 0.62766898, - "num_input_tokens_seen": 78245350, - "step": 3626, - "time_per_iteration": 3.257289409637451 - }, - { - "auxiliary_loss_clip": 0.0112643, - "auxiliary_loss_mlp": 0.00776719, - "balance_loss_clip": 1.05120194, - "balance_loss_mlp": 1.00134754, - "epoch": 0.21806703742672479, - "flos": 21616392677760.0, - "grad_norm": 2.2464694521793094, - "language_loss": 0.9077245, - "learning_rate": 3.6403058562398795e-06, - "loss": 0.92675602, - "num_input_tokens_seen": 78264165, - "step": 3627, - "time_per_iteration": 2.6639885902404785 - }, - { - "auxiliary_loss_clip": 0.0109778, - "auxiliary_loss_mlp": 0.01043665, - "balance_loss_clip": 1.04912198, - "balance_loss_mlp": 1.02471113, - "epoch": 0.21812716067939275, - "flos": 19354882435200.0, - "grad_norm": 1.8437472480823303, - "language_loss": 0.73480809, - "learning_rate": 3.6400829968345365e-06, - "loss": 0.75622261, - "num_input_tokens_seen": 78283745, - "step": 3628, - "time_per_iteration": 2.7430238723754883 - }, - { - "auxiliary_loss_clip": 0.01151444, - "auxiliary_loss_mlp": 0.01042108, - "balance_loss_clip": 1.05143893, - "balance_loss_mlp": 1.02391696, - "epoch": 0.21818728393206072, - "flos": 23548063305600.0, - "grad_norm": 2.8127332529660296, - "language_loss": 0.77337319, - "learning_rate": 3.6398600752370826e-06, - "loss": 0.79530871, - "num_input_tokens_seen": 78302900, - "step": 3629, - "time_per_iteration": 2.6468687057495117 - }, - { - "auxiliary_loss_clip": 0.01142447, - "auxiliary_loss_mlp": 0.01044137, - "balance_loss_clip": 1.0532223, - "balance_loss_mlp": 1.02709055, - "epoch": 0.21824740718472868, - "flos": 30225652266240.0, - "grad_norm": 1.7154004506833416, - "language_loss": 0.71373391, - "learning_rate": 3.63963709145597e-06, - "loss": 0.73559982, - "num_input_tokens_seen": 78326470, - "step": 3630, - "time_per_iteration": 2.7334208488464355 - }, - { - "auxiliary_loss_clip": 0.01089422, - "auxiliary_loss_mlp": 0.01040838, - "balance_loss_clip": 1.04771948, - "balance_loss_mlp": 1.02488792, - "epoch": 0.21830753043739667, - "flos": 26134672567680.0, - "grad_norm": 2.4394061962398625, - "language_loss": 0.76502508, - "learning_rate": 3.6394140454996544e-06, - "loss": 0.78632766, - "num_input_tokens_seen": 78345810, - "step": 3631, - "time_per_iteration": 2.9277098178863525 - }, - { - "auxiliary_loss_clip": 0.01153805, - "auxiliary_loss_mlp": 0.01036973, - "balance_loss_clip": 1.05322635, - "balance_loss_mlp": 1.01950908, - "epoch": 0.21836765369006464, - "flos": 21720712752000.0, - "grad_norm": 3.3333075141454556, - "language_loss": 0.75291955, - "learning_rate": 3.639190937376594e-06, - "loss": 0.77482736, - "num_input_tokens_seen": 78364085, - "step": 3632, - "time_per_iteration": 2.666961908340454 - }, - { - "auxiliary_loss_clip": 0.01149425, - "auxiliary_loss_mlp": 0.01038996, - "balance_loss_clip": 1.05168736, - "balance_loss_mlp": 1.02262831, - "epoch": 0.2184277769427326, - "flos": 19937604775680.0, - "grad_norm": 2.135610011090477, - "language_loss": 0.83723396, - "learning_rate": 3.638967767095249e-06, - "loss": 0.85911822, - "num_input_tokens_seen": 78381385, - "step": 3633, - "time_per_iteration": 2.6193437576293945 - }, - { - "auxiliary_loss_clip": 0.0112373, - "auxiliary_loss_mlp": 0.01049933, - "balance_loss_clip": 1.05514872, - "balance_loss_mlp": 1.03280258, - "epoch": 0.21848790019540057, - "flos": 20340235301760.0, - "grad_norm": 1.713148643324746, - "language_loss": 0.81381643, - "learning_rate": 3.6387445346640823e-06, - "loss": 0.83555305, - "num_input_tokens_seen": 78400500, - "step": 3634, - "time_per_iteration": 2.7383267879486084 - }, - { - "auxiliary_loss_clip": 0.01144832, - "auxiliary_loss_mlp": 0.01040423, - "balance_loss_clip": 1.0548327, - "balance_loss_mlp": 1.02263677, - "epoch": 0.21854802344806853, - "flos": 15450818135040.0, - "grad_norm": 1.8988648345390304, - "language_loss": 0.74810624, - "learning_rate": 3.638521240091558e-06, - "loss": 0.76995879, - "num_input_tokens_seen": 78418340, - "step": 3635, - "time_per_iteration": 2.7461390495300293 - }, - { - "auxiliary_loss_clip": 0.01124703, - "auxiliary_loss_mlp": 0.01052922, - "balance_loss_clip": 1.05011106, - "balance_loss_mlp": 1.03524303, - "epoch": 0.2186081467007365, - "flos": 16320717711360.0, - "grad_norm": 2.2147010555825295, - "language_loss": 0.88340998, - "learning_rate": 3.6382978833861445e-06, - "loss": 0.90518618, - "num_input_tokens_seen": 78434375, - "step": 3636, - "time_per_iteration": 2.631352186203003 - }, - { - "auxiliary_loss_clip": 0.01121776, - "auxiliary_loss_mlp": 0.00776363, - "balance_loss_clip": 1.05596519, - "balance_loss_mlp": 1.00133038, - "epoch": 0.2186682699534045, - "flos": 21689255416320.0, - "grad_norm": 2.464516707854487, - "language_loss": 0.76037598, - "learning_rate": 3.638074464556311e-06, - "loss": 0.77935731, - "num_input_tokens_seen": 78451735, - "step": 3637, - "time_per_iteration": 2.823063373565674 - }, - { - "auxiliary_loss_clip": 0.01137371, - "auxiliary_loss_mlp": 0.0104323, - "balance_loss_clip": 1.05512452, - "balance_loss_mlp": 1.02393031, - "epoch": 0.21872839320607246, - "flos": 17739260599680.0, - "grad_norm": 2.6753688852020328, - "language_loss": 0.89996254, - "learning_rate": 3.63785098361053e-06, - "loss": 0.92176855, - "num_input_tokens_seen": 78462730, - "step": 3638, - "time_per_iteration": 2.6404030323028564 - }, - { - "auxiliary_loss_clip": 0.01142035, - "auxiliary_loss_mlp": 0.01051888, - "balance_loss_clip": 1.0538702, - "balance_loss_mlp": 1.03351748, - "epoch": 0.21878851645874042, - "flos": 18652289431680.0, - "grad_norm": 2.4375531856602692, - "language_loss": 0.89243078, - "learning_rate": 3.637627440557275e-06, - "loss": 0.91436994, - "num_input_tokens_seen": 78476300, - "step": 3639, - "time_per_iteration": 2.6214118003845215 - }, - { - "auxiliary_loss_clip": 0.01134092, - "auxiliary_loss_mlp": 0.00776277, - "balance_loss_clip": 1.05406988, - "balance_loss_mlp": 1.00129211, - "epoch": 0.2188486397114084, - "flos": 25557301353600.0, - "grad_norm": 1.9800691484462982, - "language_loss": 0.79167712, - "learning_rate": 3.637403835405024e-06, - "loss": 0.81078082, - "num_input_tokens_seen": 78496135, - "step": 3640, - "time_per_iteration": 2.7559502124786377 - }, - { - "auxiliary_loss_clip": 0.01149345, - "auxiliary_loss_mlp": 0.01055855, - "balance_loss_clip": 1.05816483, - "balance_loss_mlp": 1.03617346, - "epoch": 0.21890876296407635, - "flos": 17892061056000.0, - "grad_norm": 2.2045237000129942, - "language_loss": 0.71708757, - "learning_rate": 3.637180168162255e-06, - "loss": 0.73913956, - "num_input_tokens_seen": 78513855, - "step": 3641, - "time_per_iteration": 2.6673953533172607 - }, - { - "auxiliary_loss_clip": 0.01130115, - "auxiliary_loss_mlp": 0.0104373, - "balance_loss_clip": 1.05217481, - "balance_loss_mlp": 1.02593243, - "epoch": 0.21896888621674432, - "flos": 17749100926080.0, - "grad_norm": 1.9358190088314053, - "language_loss": 0.81427026, - "learning_rate": 3.63695643883745e-06, - "loss": 0.83600873, - "num_input_tokens_seen": 78531740, - "step": 3642, - "time_per_iteration": 2.6722965240478516 - }, - { - "auxiliary_loss_clip": 0.01150265, - "auxiliary_loss_mlp": 0.01044184, - "balance_loss_clip": 1.05707705, - "balance_loss_mlp": 1.02520561, - "epoch": 0.21902900946941228, - "flos": 23076161400960.0, - "grad_norm": 2.2890480980316865, - "language_loss": 0.7124145, - "learning_rate": 3.6367326474390928e-06, - "loss": 0.73435903, - "num_input_tokens_seen": 78549600, - "step": 3643, - "time_per_iteration": 2.6586625576019287 - }, - { - "auxiliary_loss_clip": 0.01156283, - "auxiliary_loss_mlp": 0.01046488, - "balance_loss_clip": 1.05430686, - "balance_loss_mlp": 1.02728367, - "epoch": 0.21908913272208028, - "flos": 48178545004800.0, - "grad_norm": 2.705040309825256, - "language_loss": 0.68497038, - "learning_rate": 3.6365087939756696e-06, - "loss": 0.70699811, - "num_input_tokens_seen": 78573350, - "step": 3644, - "time_per_iteration": 2.835944414138794 - }, - { - "auxiliary_loss_clip": 0.01157461, - "auxiliary_loss_mlp": 0.01049851, - "balance_loss_clip": 1.05381823, - "balance_loss_mlp": 1.03175521, - "epoch": 0.21914925597474824, - "flos": 22236749493120.0, - "grad_norm": 2.498314523319793, - "language_loss": 0.77761143, - "learning_rate": 3.636284878455669e-06, - "loss": 0.79968452, - "num_input_tokens_seen": 78591005, - "step": 3645, - "time_per_iteration": 2.6053528785705566 - }, - { - "auxiliary_loss_clip": 0.01142456, - "auxiliary_loss_mlp": 0.01054431, - "balance_loss_clip": 1.05606842, - "balance_loss_mlp": 1.03732491, - "epoch": 0.2192093792274162, - "flos": 22125605834880.0, - "grad_norm": 3.1951942186566766, - "language_loss": 0.82604313, - "learning_rate": 3.636060900887582e-06, - "loss": 0.84801197, - "num_input_tokens_seen": 78610645, - "step": 3646, - "time_per_iteration": 4.198619842529297 - }, - { - "auxiliary_loss_clip": 0.01141068, - "auxiliary_loss_mlp": 0.01040772, - "balance_loss_clip": 1.05287766, - "balance_loss_mlp": 1.02365351, - "epoch": 0.21926950248008417, - "flos": 15669442264320.0, - "grad_norm": 1.720246481727725, - "language_loss": 0.82877636, - "learning_rate": 3.635836861279901e-06, - "loss": 0.85059476, - "num_input_tokens_seen": 78628340, - "step": 3647, - "time_per_iteration": 4.229920387268066 - }, - { - "auxiliary_loss_clip": 0.0115057, - "auxiliary_loss_mlp": 0.01054202, - "balance_loss_clip": 1.05145597, - "balance_loss_mlp": 1.03685677, - "epoch": 0.21932962573275214, - "flos": 30262496641920.0, - "grad_norm": 1.6932394069108108, - "language_loss": 0.72652817, - "learning_rate": 3.635612759641123e-06, - "loss": 0.74857587, - "num_input_tokens_seen": 78649355, - "step": 3648, - "time_per_iteration": 2.7226104736328125 - }, - { - "auxiliary_loss_clip": 0.01110484, - "auxiliary_loss_mlp": 0.01057841, - "balance_loss_clip": 1.04757857, - "balance_loss_mlp": 1.03643107, - "epoch": 0.2193897489854201, - "flos": 10780132838400.0, - "grad_norm": 3.9115777702699175, - "language_loss": 0.74917972, - "learning_rate": 3.635388595979745e-06, - "loss": 0.77086294, - "num_input_tokens_seen": 78664915, - "step": 3649, - "time_per_iteration": 4.201031446456909 - }, - { - "auxiliary_loss_clip": 0.01138726, - "auxiliary_loss_mlp": 0.0105421, - "balance_loss_clip": 1.0536499, - "balance_loss_mlp": 1.03718746, - "epoch": 0.21944987223808807, - "flos": 19133313390720.0, - "grad_norm": 1.8914434058388716, - "language_loss": 0.86353791, - "learning_rate": 3.635164370304267e-06, - "loss": 0.88546729, - "num_input_tokens_seen": 78681475, - "step": 3650, - "time_per_iteration": 2.6061322689056396 - }, - { - "auxiliary_loss_clip": 0.01130852, - "auxiliary_loss_mlp": 0.01052398, - "balance_loss_clip": 1.04992914, - "balance_loss_mlp": 1.03439701, - "epoch": 0.21950999549075606, - "flos": 22711093522560.0, - "grad_norm": 2.798139483493165, - "language_loss": 0.83541161, - "learning_rate": 3.6349400826231927e-06, - "loss": 0.85724407, - "num_input_tokens_seen": 78702300, - "step": 3651, - "time_per_iteration": 2.7605133056640625 - }, - { - "auxiliary_loss_clip": 0.01143643, - "auxiliary_loss_mlp": 0.0105251, - "balance_loss_clip": 1.05282581, - "balance_loss_mlp": 1.03511763, - "epoch": 0.21957011874342403, - "flos": 10561329141120.0, - "grad_norm": 1.9065881796375543, - "language_loss": 0.74475014, - "learning_rate": 3.634715732945027e-06, - "loss": 0.76671165, - "num_input_tokens_seen": 78720230, - "step": 3652, - "time_per_iteration": 2.597443103790283 - }, - { - "auxiliary_loss_clip": 0.01038431, - "auxiliary_loss_mlp": 0.01009267, - "balance_loss_clip": 1.0361495, - "balance_loss_mlp": 1.0068711, - "epoch": 0.219630241996092, - "flos": 65747913252480.0, - "grad_norm": 0.7482502800744824, - "language_loss": 0.51550615, - "learning_rate": 3.6344913212782764e-06, - "loss": 0.5359832, - "num_input_tokens_seen": 78780200, - "step": 3653, - "time_per_iteration": 3.324497699737549 - }, - { - "auxiliary_loss_clip": 0.01125533, - "auxiliary_loss_mlp": 0.01062527, - "balance_loss_clip": 1.05436754, - "balance_loss_mlp": 1.04470527, - "epoch": 0.21969036524875996, - "flos": 23696518216320.0, - "grad_norm": 1.9578946934595152, - "language_loss": 0.75356162, - "learning_rate": 3.6342668476314514e-06, - "loss": 0.77544224, - "num_input_tokens_seen": 78800575, - "step": 3654, - "time_per_iteration": 4.296064615249634 - }, - { - "auxiliary_loss_clip": 0.01152337, - "auxiliary_loss_mlp": 0.01051249, - "balance_loss_clip": 1.05944824, - "balance_loss_mlp": 1.03376114, - "epoch": 0.21975048850142792, - "flos": 19640910435840.0, - "grad_norm": 1.8387519277823352, - "language_loss": 0.72646022, - "learning_rate": 3.634042312013064e-06, - "loss": 0.74849606, - "num_input_tokens_seen": 78819585, - "step": 3655, - "time_per_iteration": 2.6634860038757324 - }, - { - "auxiliary_loss_clip": 0.01130021, - "auxiliary_loss_mlp": 0.01048784, - "balance_loss_clip": 1.05423379, - "balance_loss_mlp": 1.03071189, - "epoch": 0.21981061175409589, - "flos": 22448550038400.0, - "grad_norm": 1.722985511504472, - "language_loss": 0.80795759, - "learning_rate": 3.6338177144316276e-06, - "loss": 0.82974565, - "num_input_tokens_seen": 78837330, - "step": 3656, - "time_per_iteration": 2.730391502380371 - }, - { - "auxiliary_loss_clip": 0.01124773, - "auxiliary_loss_mlp": 0.00776202, - "balance_loss_clip": 1.06113994, - "balance_loss_mlp": 1.00139225, - "epoch": 0.21987073500676388, - "flos": 18151049093760.0, - "grad_norm": 2.646453773467974, - "language_loss": 0.84885842, - "learning_rate": 3.63359305489566e-06, - "loss": 0.86786819, - "num_input_tokens_seen": 78854955, - "step": 3657, - "time_per_iteration": 2.657607078552246 - }, - { - "auxiliary_loss_clip": 0.01142645, - "auxiliary_loss_mlp": 0.01040533, - "balance_loss_clip": 1.05631852, - "balance_loss_mlp": 1.02260423, - "epoch": 0.21993085825943184, - "flos": 25626177682560.0, - "grad_norm": 2.6990832263195585, - "language_loss": 0.80355585, - "learning_rate": 3.6333683334136803e-06, - "loss": 0.82538766, - "num_input_tokens_seen": 78874965, - "step": 3658, - "time_per_iteration": 2.6584107875823975 - }, - { - "auxiliary_loss_clip": 0.01048937, - "auxiliary_loss_mlp": 0.0100499, - "balance_loss_clip": 1.03857517, - "balance_loss_mlp": 1.00202215, - "epoch": 0.2199909815120998, - "flos": 70923217743360.0, - "grad_norm": 0.7788612160796681, - "language_loss": 0.58191586, - "learning_rate": 3.6331435499942095e-06, - "loss": 0.60245514, - "num_input_tokens_seen": 78937740, - "step": 3659, - "time_per_iteration": 3.3395371437072754 - }, - { - "auxiliary_loss_clip": 0.01111007, - "auxiliary_loss_mlp": 0.0105329, - "balance_loss_clip": 1.05029392, - "balance_loss_mlp": 1.03471744, - "epoch": 0.22005110476476777, - "flos": 21543529939200.0, - "grad_norm": 4.382741616753977, - "language_loss": 0.7477597, - "learning_rate": 3.632918704645772e-06, - "loss": 0.76940262, - "num_input_tokens_seen": 78955055, - "step": 3660, - "time_per_iteration": 2.782975435256958 - }, - { - "auxiliary_loss_clip": 0.01147277, - "auxiliary_loss_mlp": 0.01044652, - "balance_loss_clip": 1.05691171, - "balance_loss_mlp": 1.02653265, - "epoch": 0.22011122801743574, - "flos": 22054502862720.0, - "grad_norm": 1.8856077512582532, - "language_loss": 0.81484449, - "learning_rate": 3.632693797376893e-06, - "loss": 0.83676374, - "num_input_tokens_seen": 78974895, - "step": 3661, - "time_per_iteration": 2.7780110836029053 - }, - { - "auxiliary_loss_clip": 0.01126694, - "auxiliary_loss_mlp": 0.01056397, - "balance_loss_clip": 1.05167532, - "balance_loss_mlp": 1.03800273, - "epoch": 0.2201713512701037, - "flos": 26687589598080.0, - "grad_norm": 1.9746283079458686, - "language_loss": 0.73154199, - "learning_rate": 3.632468828196102e-06, - "loss": 0.75337297, - "num_input_tokens_seen": 78994990, - "step": 3662, - "time_per_iteration": 2.7189040184020996 - }, - { - "auxiliary_loss_clip": 0.0113519, - "auxiliary_loss_mlp": 0.01051686, - "balance_loss_clip": 1.05718994, - "balance_loss_mlp": 1.03555691, - "epoch": 0.22023147452277167, - "flos": 22162198815360.0, - "grad_norm": 2.0576168655035714, - "language_loss": 0.78066969, - "learning_rate": 3.632243797111929e-06, - "loss": 0.80253839, - "num_input_tokens_seen": 79014405, - "step": 3663, - "time_per_iteration": 2.731412410736084 - }, - { - "auxiliary_loss_clip": 0.01142837, - "auxiliary_loss_mlp": 0.01063521, - "balance_loss_clip": 1.05659413, - "balance_loss_mlp": 1.04352939, - "epoch": 0.22029159777543966, - "flos": 22523280284160.0, - "grad_norm": 1.752119258875799, - "language_loss": 0.80294079, - "learning_rate": 3.632018704132908e-06, - "loss": 0.82500434, - "num_input_tokens_seen": 79032375, - "step": 3664, - "time_per_iteration": 2.7043297290802 - }, - { - "auxiliary_loss_clip": 0.01134207, - "auxiliary_loss_mlp": 0.01044352, - "balance_loss_clip": 1.05424213, - "balance_loss_mlp": 1.02474177, - "epoch": 0.22035172102810763, - "flos": 13042469093760.0, - "grad_norm": 3.138103913885462, - "language_loss": 0.76388288, - "learning_rate": 3.6317935492675742e-06, - "loss": 0.78566849, - "num_input_tokens_seen": 79049635, - "step": 3665, - "time_per_iteration": 2.68300199508667 - }, - { - "auxiliary_loss_clip": 0.01128405, - "auxiliary_loss_mlp": 0.01053304, - "balance_loss_clip": 1.05599689, - "balance_loss_mlp": 1.03589976, - "epoch": 0.2204118442807756, - "flos": 12165817760640.0, - "grad_norm": 2.9738702224471583, - "language_loss": 0.9800086, - "learning_rate": 3.631568332524466e-06, - "loss": 1.00182581, - "num_input_tokens_seen": 79062890, - "step": 3666, - "time_per_iteration": 2.702584981918335 - }, - { - "auxiliary_loss_clip": 0.01141573, - "auxiliary_loss_mlp": 0.00776689, - "balance_loss_clip": 1.05254698, - "balance_loss_mlp": 1.00133562, - "epoch": 0.22047196753344356, - "flos": 40108806673920.0, - "grad_norm": 1.894759892223008, - "language_loss": 0.80946934, - "learning_rate": 3.631343053912122e-06, - "loss": 0.82865196, - "num_input_tokens_seen": 79085495, - "step": 3667, - "time_per_iteration": 2.8920814990997314 - }, - { - "auxiliary_loss_clip": 0.01149896, - "auxiliary_loss_mlp": 0.01051036, - "balance_loss_clip": 1.06145239, - "balance_loss_mlp": 1.03161693, - "epoch": 0.22053209078611152, - "flos": 20701137202560.0, - "grad_norm": 1.8771463594277091, - "language_loss": 0.7736783, - "learning_rate": 3.631117713439087e-06, - "loss": 0.79568756, - "num_input_tokens_seen": 79101820, - "step": 3668, - "time_per_iteration": 2.6733500957489014 - }, - { - "auxiliary_loss_clip": 0.01143618, - "auxiliary_loss_mlp": 0.01047462, - "balance_loss_clip": 1.05955744, - "balance_loss_mlp": 1.02972412, - "epoch": 0.2205922140387795, - "flos": 24716309247360.0, - "grad_norm": 1.7809066581326154, - "language_loss": 0.71624571, - "learning_rate": 3.630892311113904e-06, - "loss": 0.7381565, - "num_input_tokens_seen": 79123320, - "step": 3669, - "time_per_iteration": 2.7298974990844727 - }, - { - "auxiliary_loss_clip": 0.01155448, - "auxiliary_loss_mlp": 0.01039044, - "balance_loss_clip": 1.0544126, - "balance_loss_mlp": 1.0217346, - "epoch": 0.22065233729144745, - "flos": 23477247642240.0, - "grad_norm": 2.1257290130035082, - "language_loss": 0.85160267, - "learning_rate": 3.6306668469451215e-06, - "loss": 0.87354761, - "num_input_tokens_seen": 79141615, - "step": 3670, - "time_per_iteration": 2.6624948978424072 - }, - { - "auxiliary_loss_clip": 0.01137906, - "auxiliary_loss_mlp": 0.01042298, - "balance_loss_clip": 1.05475712, - "balance_loss_mlp": 1.02376091, - "epoch": 0.22071246054411545, - "flos": 35225566646400.0, - "grad_norm": 1.8008957470192373, - "language_loss": 0.76928926, - "learning_rate": 3.6304413209412886e-06, - "loss": 0.79109132, - "num_input_tokens_seen": 79164910, - "step": 3671, - "time_per_iteration": 2.7914648056030273 - }, - { - "auxiliary_loss_clip": 0.01126159, - "auxiliary_loss_mlp": 0.01040764, - "balance_loss_clip": 1.05423856, - "balance_loss_mlp": 1.02281129, - "epoch": 0.2207725837967834, - "flos": 18150294908160.0, - "grad_norm": 2.015071454696955, - "language_loss": 0.80643147, - "learning_rate": 3.6302157331109573e-06, - "loss": 0.82810068, - "num_input_tokens_seen": 79179685, - "step": 3672, - "time_per_iteration": 2.674381732940674 - }, - { - "auxiliary_loss_clip": 0.01149005, - "auxiliary_loss_mlp": 0.01047239, - "balance_loss_clip": 1.05706501, - "balance_loss_mlp": 1.02992952, - "epoch": 0.22083270704945138, - "flos": 20479675898880.0, - "grad_norm": 2.222038104071356, - "language_loss": 0.73278964, - "learning_rate": 3.629990083462682e-06, - "loss": 0.75475204, - "num_input_tokens_seen": 79196285, - "step": 3673, - "time_per_iteration": 2.6856846809387207 - }, - { - "auxiliary_loss_clip": 0.01121745, - "auxiliary_loss_mlp": 0.01044908, - "balance_loss_clip": 1.05473876, - "balance_loss_mlp": 1.02608538, - "epoch": 0.22089283030211934, - "flos": 34125801984000.0, - "grad_norm": 1.9530426336903413, - "language_loss": 0.76384282, - "learning_rate": 3.6297643720050203e-06, - "loss": 0.78550935, - "num_input_tokens_seen": 79216060, - "step": 3674, - "time_per_iteration": 2.816190242767334 - }, - { - "auxiliary_loss_clip": 0.01156134, - "auxiliary_loss_mlp": 0.01047969, - "balance_loss_clip": 1.05650616, - "balance_loss_mlp": 1.02850175, - "epoch": 0.2209529535547873, - "flos": 18077216688000.0, - "grad_norm": 2.045565300481816, - "language_loss": 0.74367136, - "learning_rate": 3.6295385987465293e-06, - "loss": 0.76571238, - "num_input_tokens_seen": 79235145, - "step": 3675, - "time_per_iteration": 2.69748592376709 - }, - { - "auxiliary_loss_clip": 0.01155113, - "auxiliary_loss_mlp": 0.01045626, - "balance_loss_clip": 1.05442023, - "balance_loss_mlp": 1.02800727, - "epoch": 0.22101307680745527, - "flos": 27235335070080.0, - "grad_norm": 1.898816078558846, - "language_loss": 0.79801333, - "learning_rate": 3.629312763695772e-06, - "loss": 0.82002068, - "num_input_tokens_seen": 79256960, - "step": 3676, - "time_per_iteration": 2.6792948246002197 - }, - { - "auxiliary_loss_clip": 0.01133095, - "auxiliary_loss_mlp": 0.01049823, - "balance_loss_clip": 1.05366707, - "balance_loss_mlp": 1.03257358, - "epoch": 0.22107320006012326, - "flos": 16543256423040.0, - "grad_norm": 2.1537198076644954, - "language_loss": 0.75327688, - "learning_rate": 3.6290868668613107e-06, - "loss": 0.77510607, - "num_input_tokens_seen": 79274860, - "step": 3677, - "time_per_iteration": 2.781393527984619 - }, - { - "auxiliary_loss_clip": 0.0111612, - "auxiliary_loss_mlp": 0.01050059, - "balance_loss_clip": 1.04986429, - "balance_loss_mlp": 1.03212988, - "epoch": 0.22113332331279123, - "flos": 22054466949120.0, - "grad_norm": 1.7875463894855461, - "language_loss": 0.83287871, - "learning_rate": 3.628860908251712e-06, - "loss": 0.85454059, - "num_input_tokens_seen": 79294005, - "step": 3678, - "time_per_iteration": 2.752838611602783 - }, - { - "auxiliary_loss_clip": 0.01094052, - "auxiliary_loss_mlp": 0.01058605, - "balance_loss_clip": 1.04951406, - "balance_loss_mlp": 1.03992522, - "epoch": 0.2211934465654592, - "flos": 26612787525120.0, - "grad_norm": 1.6742153249136704, - "language_loss": 0.89135075, - "learning_rate": 3.6286348878755452e-06, - "loss": 0.91287732, - "num_input_tokens_seen": 79314005, - "step": 3679, - "time_per_iteration": 2.8282527923583984 - }, - { - "auxiliary_loss_clip": 0.01147641, - "auxiliary_loss_mlp": 0.01054276, - "balance_loss_clip": 1.05507338, - "balance_loss_mlp": 1.03615618, - "epoch": 0.22125356981812716, - "flos": 16360363347840.0, - "grad_norm": 3.092644946410345, - "language_loss": 0.8649044, - "learning_rate": 3.6284088057413803e-06, - "loss": 0.88692355, - "num_input_tokens_seen": 79331030, - "step": 3680, - "time_per_iteration": 2.630829095840454 - }, - { - "auxiliary_loss_clip": 0.0111249, - "auxiliary_loss_mlp": 0.01052062, - "balance_loss_clip": 1.05374503, - "balance_loss_mlp": 1.03395414, - "epoch": 0.22131369307079513, - "flos": 21651118151040.0, - "grad_norm": 1.9427224492838853, - "language_loss": 0.81773758, - "learning_rate": 3.6281826618577894e-06, - "loss": 0.83938313, - "num_input_tokens_seen": 79348560, - "step": 3681, - "time_per_iteration": 2.805880069732666 - }, - { - "auxiliary_loss_clip": 0.01148508, - "auxiliary_loss_mlp": 0.00775652, - "balance_loss_clip": 1.0530386, - "balance_loss_mlp": 1.00146043, - "epoch": 0.2213738163234631, - "flos": 19609524927360.0, - "grad_norm": 2.296553230959153, - "language_loss": 0.80099678, - "learning_rate": 3.62795645623335e-06, - "loss": 0.82023835, - "num_input_tokens_seen": 79367175, - "step": 3682, - "time_per_iteration": 2.624234199523926 - }, - { - "auxiliary_loss_clip": 0.0112405, - "auxiliary_loss_mlp": 0.0105126, - "balance_loss_clip": 1.0500052, - "balance_loss_mlp": 1.03198409, - "epoch": 0.22143393957613106, - "flos": 23623404082560.0, - "grad_norm": 1.6781760642146926, - "language_loss": 0.77394038, - "learning_rate": 3.627730188876638e-06, - "loss": 0.7956934, - "num_input_tokens_seen": 79388435, - "step": 3683, - "time_per_iteration": 2.6746323108673096 - }, - { - "auxiliary_loss_clip": 0.01129753, - "auxiliary_loss_mlp": 0.01051291, - "balance_loss_clip": 1.05048668, - "balance_loss_mlp": 1.03411245, - "epoch": 0.22149406282879905, - "flos": 26177801823360.0, - "grad_norm": 2.1201256685163323, - "language_loss": 0.72406399, - "learning_rate": 3.627503859796234e-06, - "loss": 0.7458744, - "num_input_tokens_seen": 79407910, - "step": 3684, - "time_per_iteration": 2.695958375930786 - }, - { - "auxiliary_loss_clip": 0.01084051, - "auxiliary_loss_mlp": 0.01045612, - "balance_loss_clip": 1.04670835, - "balance_loss_mlp": 1.02571654, - "epoch": 0.221554186081467, - "flos": 14538758970240.0, - "grad_norm": 2.1308896442870893, - "language_loss": 0.79817796, - "learning_rate": 3.6272774690007207e-06, - "loss": 0.81947458, - "num_input_tokens_seen": 79424020, - "step": 3685, - "time_per_iteration": 2.7443795204162598 - }, - { - "auxiliary_loss_clip": 0.01147394, - "auxiliary_loss_mlp": 0.01045457, - "balance_loss_clip": 1.05201805, - "balance_loss_mlp": 1.02867222, - "epoch": 0.22161430933413498, - "flos": 22238257864320.0, - "grad_norm": 1.6870532517893482, - "language_loss": 0.87305272, - "learning_rate": 3.6270510164986823e-06, - "loss": 0.89498115, - "num_input_tokens_seen": 79445605, - "step": 3686, - "time_per_iteration": 4.388494968414307 - }, - { - "auxiliary_loss_clip": 0.01137917, - "auxiliary_loss_mlp": 0.0104367, - "balance_loss_clip": 1.052562, - "balance_loss_mlp": 1.02620554, - "epoch": 0.22167443258680294, - "flos": 23476529370240.0, - "grad_norm": 1.8821221420403713, - "language_loss": 0.78069639, - "learning_rate": 3.626824502298707e-06, - "loss": 0.80251229, - "num_input_tokens_seen": 79463850, - "step": 3687, - "time_per_iteration": 4.123531103134155 - }, - { - "auxiliary_loss_clip": 0.0112545, - "auxiliary_loss_mlp": 0.01052599, - "balance_loss_clip": 1.0494144, - "balance_loss_mlp": 1.0331558, - "epoch": 0.2217345558394709, - "flos": 23221132692480.0, - "grad_norm": 1.8251811803295879, - "language_loss": 0.84860861, - "learning_rate": 3.626597926409383e-06, - "loss": 0.8703891, - "num_input_tokens_seen": 79482845, - "step": 3688, - "time_per_iteration": 4.287938594818115 - }, - { - "auxiliary_loss_clip": 0.01110764, - "auxiliary_loss_mlp": 0.01051634, - "balance_loss_clip": 1.04967332, - "balance_loss_mlp": 1.03254843, - "epoch": 0.22179467909213887, - "flos": 20011078045440.0, - "grad_norm": 1.7785994747216247, - "language_loss": 0.81150943, - "learning_rate": 3.6263712888393027e-06, - "loss": 0.83313334, - "num_input_tokens_seen": 79501550, - "step": 3689, - "time_per_iteration": 2.7521302700042725 - }, - { - "auxiliary_loss_clip": 0.01124628, - "auxiliary_loss_mlp": 0.01048971, - "balance_loss_clip": 1.05078936, - "balance_loss_mlp": 1.03131568, - "epoch": 0.22185480234480687, - "flos": 19683034110720.0, - "grad_norm": 1.7481542974535997, - "language_loss": 0.70018351, - "learning_rate": 3.626144589597061e-06, - "loss": 0.72191954, - "num_input_tokens_seen": 79519680, - "step": 3690, - "time_per_iteration": 2.6664223670959473 - }, - { - "auxiliary_loss_clip": 0.01147193, - "auxiliary_loss_mlp": 0.00777365, - "balance_loss_clip": 1.0537169, - "balance_loss_mlp": 1.00153625, - "epoch": 0.22191492559747483, - "flos": 21981316901760.0, - "grad_norm": 1.8112729447523994, - "language_loss": 0.72609359, - "learning_rate": 3.6259178286912528e-06, - "loss": 0.74533916, - "num_input_tokens_seen": 79539000, - "step": 3691, - "time_per_iteration": 2.6724495887756348 - }, - { - "auxiliary_loss_clip": 0.01144688, - "auxiliary_loss_mlp": 0.01046427, - "balance_loss_clip": 1.05663919, - "balance_loss_mlp": 1.0275923, - "epoch": 0.2219750488501428, - "flos": 23222066446080.0, - "grad_norm": 1.8134603978799304, - "language_loss": 0.71503472, - "learning_rate": 3.625691006130477e-06, - "loss": 0.73694593, - "num_input_tokens_seen": 79559695, - "step": 3692, - "time_per_iteration": 2.6743686199188232 - }, - { - "auxiliary_loss_clip": 0.01147828, - "auxiliary_loss_mlp": 0.01048973, - "balance_loss_clip": 1.05410266, - "balance_loss_mlp": 1.03098464, - "epoch": 0.22203517210281076, - "flos": 22453685683200.0, - "grad_norm": 2.1147705582229577, - "language_loss": 0.87551594, - "learning_rate": 3.6254641219233362e-06, - "loss": 0.89748394, - "num_input_tokens_seen": 79579095, - "step": 3693, - "time_per_iteration": 4.2962939739227295 - }, - { - "auxiliary_loss_clip": 0.01141134, - "auxiliary_loss_mlp": 0.01041066, - "balance_loss_clip": 1.0537045, - "balance_loss_mlp": 1.02479386, - "epoch": 0.22209529535547873, - "flos": 17564555825280.0, - "grad_norm": 1.9865017520636683, - "language_loss": 0.85553116, - "learning_rate": 3.6252371760784325e-06, - "loss": 0.87735319, - "num_input_tokens_seen": 79596430, - "step": 3694, - "time_per_iteration": 2.585657835006714 - }, - { - "auxiliary_loss_clip": 0.01107468, - "auxiliary_loss_mlp": 0.01045482, - "balance_loss_clip": 1.04370403, - "balance_loss_mlp": 1.02640843, - "epoch": 0.2221554186081467, - "flos": 21469015175040.0, - "grad_norm": 2.1752375595399136, - "language_loss": 0.68740189, - "learning_rate": 3.6250101686043725e-06, - "loss": 0.70893133, - "num_input_tokens_seen": 79615825, - "step": 3695, - "time_per_iteration": 2.744264841079712 - }, - { - "auxiliary_loss_clip": 0.01118075, - "auxiliary_loss_mlp": 0.01047291, - "balance_loss_clip": 1.051736, - "balance_loss_mlp": 1.0310905, - "epoch": 0.22221554186081466, - "flos": 27673445255040.0, - "grad_norm": 1.6851408018575031, - "language_loss": 0.71540272, - "learning_rate": 3.6247830995097637e-06, - "loss": 0.73705637, - "num_input_tokens_seen": 79637875, - "step": 3696, - "time_per_iteration": 2.7320780754089355 - }, - { - "auxiliary_loss_clip": 0.01140935, - "auxiliary_loss_mlp": 0.0104304, - "balance_loss_clip": 1.05123305, - "balance_loss_mlp": 1.02455115, - "epoch": 0.22227566511348265, - "flos": 25958926298880.0, - "grad_norm": 1.7186386141421306, - "language_loss": 0.87905443, - "learning_rate": 3.624555968803217e-06, - "loss": 0.90089417, - "num_input_tokens_seen": 79656970, - "step": 3697, - "time_per_iteration": 2.65919828414917 - }, - { - "auxiliary_loss_clip": 0.01118987, - "auxiliary_loss_mlp": 0.0104214, - "balance_loss_clip": 1.04718316, - "balance_loss_mlp": 1.0255338, - "epoch": 0.22233578836615062, - "flos": 39203678833920.0, - "grad_norm": 1.6515031384229777, - "language_loss": 0.65900242, - "learning_rate": 3.624328776493346e-06, - "loss": 0.6806137, - "num_input_tokens_seen": 79680275, - "step": 3698, - "time_per_iteration": 2.7708024978637695 - }, - { - "auxiliary_loss_clip": 0.01142696, - "auxiliary_loss_mlp": 0.01049333, - "balance_loss_clip": 1.05630088, - "balance_loss_mlp": 1.03102303, - "epoch": 0.22239591161881858, - "flos": 36283782251520.0, - "grad_norm": 1.9634592665257078, - "language_loss": 0.82520199, - "learning_rate": 3.6241015225887637e-06, - "loss": 0.84712231, - "num_input_tokens_seen": 79701255, - "step": 3699, - "time_per_iteration": 2.7743008136749268 - }, - { - "auxiliary_loss_clip": 0.01129692, - "auxiliary_loss_mlp": 0.01047594, - "balance_loss_clip": 1.05154991, - "balance_loss_mlp": 1.02939105, - "epoch": 0.22245603487148655, - "flos": 19719591177600.0, - "grad_norm": 1.6711069078421557, - "language_loss": 0.79384553, - "learning_rate": 3.62387420709809e-06, - "loss": 0.8156184, - "num_input_tokens_seen": 79721315, - "step": 3700, - "time_per_iteration": 2.652172327041626 - }, - { - "auxiliary_loss_clip": 0.01111144, - "auxiliary_loss_mlp": 0.01045464, - "balance_loss_clip": 1.04893112, - "balance_loss_mlp": 1.02608061, - "epoch": 0.2225161581241545, - "flos": 46280450615040.0, - "grad_norm": 2.123831341506728, - "language_loss": 0.72503817, - "learning_rate": 3.623646830029943e-06, - "loss": 0.74660432, - "num_input_tokens_seen": 79742705, - "step": 3701, - "time_per_iteration": 2.943124294281006 - }, - { - "auxiliary_loss_clip": 0.01139412, - "auxiliary_loss_mlp": 0.0104206, - "balance_loss_clip": 1.05053067, - "balance_loss_mlp": 1.0246197, - "epoch": 0.22257628137682248, - "flos": 23696194993920.0, - "grad_norm": 1.9127522113256972, - "language_loss": 0.79901838, - "learning_rate": 3.6234193913929454e-06, - "loss": 0.82083315, - "num_input_tokens_seen": 79763000, - "step": 3702, - "time_per_iteration": 2.6978282928466797 - }, - { - "auxiliary_loss_clip": 0.01129024, - "auxiliary_loss_mlp": 0.01044082, - "balance_loss_clip": 1.04707038, - "balance_loss_mlp": 1.02655816, - "epoch": 0.22263640462949044, - "flos": 19353984595200.0, - "grad_norm": 1.8258996761992496, - "language_loss": 0.78237271, - "learning_rate": 3.623191891195723e-06, - "loss": 0.80410373, - "num_input_tokens_seen": 79781335, - "step": 3703, - "time_per_iteration": 2.6528990268707275 - }, - { - "auxiliary_loss_clip": 0.01140219, - "auxiliary_loss_mlp": 0.01036919, - "balance_loss_clip": 1.0503273, - "balance_loss_mlp": 1.0171181, - "epoch": 0.22269652788215843, - "flos": 20776047016320.0, - "grad_norm": 2.1693263198920563, - "language_loss": 0.74490714, - "learning_rate": 3.6229643294469005e-06, - "loss": 0.76667851, - "num_input_tokens_seen": 79800150, - "step": 3704, - "time_per_iteration": 2.679184913635254 - }, - { - "auxiliary_loss_clip": 0.0110341, - "auxiliary_loss_mlp": 0.01043861, - "balance_loss_clip": 1.046996, - "balance_loss_mlp": 1.02684951, - "epoch": 0.2227566511348264, - "flos": 47958843467520.0, - "grad_norm": 1.8279463297536431, - "language_loss": 0.644319, - "learning_rate": 3.6227367061551074e-06, - "loss": 0.66579175, - "num_input_tokens_seen": 79822390, - "step": 3705, - "time_per_iteration": 2.972221612930298 - }, - { - "auxiliary_loss_clip": 0.01037239, - "auxiliary_loss_mlp": 0.01023153, - "balance_loss_clip": 1.03748369, - "balance_loss_mlp": 1.02111423, - "epoch": 0.22281677438749437, - "flos": 66218953230720.0, - "grad_norm": 1.2472387125776994, - "language_loss": 0.65169704, - "learning_rate": 3.6225090213289766e-06, - "loss": 0.67230093, - "num_input_tokens_seen": 79873350, - "step": 3706, - "time_per_iteration": 3.118619203567505 - }, - { - "auxiliary_loss_clip": 0.01116185, - "auxiliary_loss_mlp": 0.01040401, - "balance_loss_clip": 1.04938805, - "balance_loss_mlp": 1.02290082, - "epoch": 0.22287689764016233, - "flos": 21871609787520.0, - "grad_norm": 1.912279921070755, - "language_loss": 0.80597419, - "learning_rate": 3.622281274977141e-06, - "loss": 0.8275401, - "num_input_tokens_seen": 79891715, - "step": 3707, - "time_per_iteration": 2.6555368900299072 - }, - { - "auxiliary_loss_clip": 0.01149897, - "auxiliary_loss_mlp": 0.01039316, - "balance_loss_clip": 1.05199265, - "balance_loss_mlp": 1.02203059, - "epoch": 0.2229370208928303, - "flos": 27672475587840.0, - "grad_norm": 1.9339558574691282, - "language_loss": 0.78542316, - "learning_rate": 3.6220534671082367e-06, - "loss": 0.80731529, - "num_input_tokens_seen": 79911175, - "step": 3708, - "time_per_iteration": 2.7179131507873535 - }, - { - "auxiliary_loss_clip": 0.01128276, - "auxiliary_loss_mlp": 0.01042525, - "balance_loss_clip": 1.05055118, - "balance_loss_mlp": 1.02363038, - "epoch": 0.22299714414549826, - "flos": 30154657034880.0, - "grad_norm": 1.8085596793383067, - "language_loss": 0.80606776, - "learning_rate": 3.6218255977309024e-06, - "loss": 0.82777578, - "num_input_tokens_seen": 79931875, - "step": 3709, - "time_per_iteration": 2.810605764389038 - }, - { - "auxiliary_loss_clip": 0.01135044, - "auxiliary_loss_mlp": 0.00777248, - "balance_loss_clip": 1.0480969, - "balance_loss_mlp": 1.0014261, - "epoch": 0.22305726739816625, - "flos": 23143134309120.0, - "grad_norm": 2.100780376064183, - "language_loss": 0.69068789, - "learning_rate": 3.6215976668537787e-06, - "loss": 0.70981085, - "num_input_tokens_seen": 79952445, - "step": 3710, - "time_per_iteration": 2.7197980880737305 - }, - { - "auxiliary_loss_clip": 0.01111671, - "auxiliary_loss_mlp": 0.01050475, - "balance_loss_clip": 1.04630041, - "balance_loss_mlp": 1.03220057, - "epoch": 0.22311739065083422, - "flos": 19172061187200.0, - "grad_norm": 2.1025491711486763, - "language_loss": 0.90782154, - "learning_rate": 3.6213696744855096e-06, - "loss": 0.92944294, - "num_input_tokens_seen": 79971030, - "step": 3711, - "time_per_iteration": 2.808014154434204 - }, - { - "auxiliary_loss_clip": 0.01117969, - "auxiliary_loss_mlp": 0.01059175, - "balance_loss_clip": 1.04696095, - "balance_loss_mlp": 1.03921938, - "epoch": 0.22317751390350218, - "flos": 13617757319040.0, - "grad_norm": 6.2447945102939615, - "language_loss": 0.89070308, - "learning_rate": 3.6211416206347395e-06, - "loss": 0.91247451, - "num_input_tokens_seen": 79982085, - "step": 3712, - "time_per_iteration": 2.6701955795288086 - }, - { - "auxiliary_loss_clip": 0.01150852, - "auxiliary_loss_mlp": 0.01044271, - "balance_loss_clip": 1.05445373, - "balance_loss_mlp": 1.02627039, - "epoch": 0.22323763715617015, - "flos": 11029065068160.0, - "grad_norm": 5.249819485386642, - "language_loss": 0.75858659, - "learning_rate": 3.620913505310117e-06, - "loss": 0.78053784, - "num_input_tokens_seen": 79997460, - "step": 3713, - "time_per_iteration": 2.5961148738861084 - }, - { - "auxiliary_loss_clip": 0.01106588, - "auxiliary_loss_mlp": 0.01043158, - "balance_loss_clip": 1.05345535, - "balance_loss_mlp": 1.0252645, - "epoch": 0.22329776040883811, - "flos": 41351531466240.0, - "grad_norm": 1.7774284049242903, - "language_loss": 0.62422931, - "learning_rate": 3.6206853285202917e-06, - "loss": 0.6457268, - "num_input_tokens_seen": 80022450, - "step": 3714, - "time_per_iteration": 2.9655838012695312 - }, - { - "auxiliary_loss_clip": 0.0112071, - "auxiliary_loss_mlp": 0.01033065, - "balance_loss_clip": 1.05258489, - "balance_loss_mlp": 1.0163759, - "epoch": 0.22335788366150608, - "flos": 25119478477440.0, - "grad_norm": 5.465931600334143, - "language_loss": 0.79076529, - "learning_rate": 3.6204570902739164e-06, - "loss": 0.81230301, - "num_input_tokens_seen": 80042100, - "step": 3715, - "time_per_iteration": 2.8040106296539307 - }, - { - "auxiliary_loss_clip": 0.01113318, - "auxiliary_loss_mlp": 0.01049585, - "balance_loss_clip": 1.05601192, - "balance_loss_mlp": 1.03176367, - "epoch": 0.22341800691417404, - "flos": 16983377769600.0, - "grad_norm": 2.696607190089822, - "language_loss": 0.77416688, - "learning_rate": 3.620228790579645e-06, - "loss": 0.79579592, - "num_input_tokens_seen": 80059690, - "step": 3716, - "time_per_iteration": 2.721008777618408 - }, - { - "auxiliary_loss_clip": 0.01123787, - "auxiliary_loss_mlp": 0.01043954, - "balance_loss_clip": 1.04860306, - "balance_loss_mlp": 1.02644157, - "epoch": 0.22347813016684204, - "flos": 14136738975360.0, - "grad_norm": 3.4762745813408884, - "language_loss": 0.79258984, - "learning_rate": 3.6200004294461367e-06, - "loss": 0.81426722, - "num_input_tokens_seen": 80076060, - "step": 3717, - "time_per_iteration": 2.724637746810913 - }, - { - "auxiliary_loss_clip": 0.0107853, - "auxiliary_loss_mlp": 0.01042478, - "balance_loss_clip": 1.04485083, - "balance_loss_mlp": 1.02390504, - "epoch": 0.22353825341951, - "flos": 23583147914880.0, - "grad_norm": 1.9798483733973138, - "language_loss": 0.67890245, - "learning_rate": 3.6197720068820497e-06, - "loss": 0.70011252, - "num_input_tokens_seen": 80094760, - "step": 3718, - "time_per_iteration": 2.8178799152374268 - }, - { - "auxiliary_loss_clip": 0.01128946, - "auxiliary_loss_mlp": 0.01043035, - "balance_loss_clip": 1.04887676, - "balance_loss_mlp": 1.02374721, - "epoch": 0.22359837667217797, - "flos": 29824206888960.0, - "grad_norm": 1.6261924310986715, - "language_loss": 0.81046188, - "learning_rate": 3.619543522896045e-06, - "loss": 0.83218175, - "num_input_tokens_seen": 80114475, - "step": 3719, - "time_per_iteration": 2.8068079948425293 - }, - { - "auxiliary_loss_clip": 0.0112823, - "auxiliary_loss_mlp": 0.0105526, - "balance_loss_clip": 1.05054009, - "balance_loss_mlp": 1.03555441, - "epoch": 0.22365849992484593, - "flos": 17603088140160.0, - "grad_norm": 2.128611791985372, - "language_loss": 0.86535168, - "learning_rate": 3.6193149774967885e-06, - "loss": 0.88718653, - "num_input_tokens_seen": 80132920, - "step": 3720, - "time_per_iteration": 2.726252794265747 - }, - { - "auxiliary_loss_clip": 0.01123833, - "auxiliary_loss_mlp": 0.01039252, - "balance_loss_clip": 1.05347347, - "balance_loss_mlp": 1.0207628, - "epoch": 0.2237186231775139, - "flos": 22710949868160.0, - "grad_norm": 1.725668609175168, - "language_loss": 0.7471531, - "learning_rate": 3.619086370692945e-06, - "loss": 0.76878393, - "num_input_tokens_seen": 80152845, - "step": 3721, - "time_per_iteration": 2.77329158782959 - }, - { - "auxiliary_loss_clip": 0.01158005, - "auxiliary_loss_mlp": 0.01043442, - "balance_loss_clip": 1.05607998, - "balance_loss_mlp": 1.02497673, - "epoch": 0.22377874643018186, - "flos": 13371518609280.0, - "grad_norm": 3.166607303525693, - "language_loss": 0.7957024, - "learning_rate": 3.6188577024931844e-06, - "loss": 0.8177169, - "num_input_tokens_seen": 80170680, - "step": 3722, - "time_per_iteration": 2.7204909324645996 - }, - { - "auxiliary_loss_clip": 0.01113056, - "auxiliary_loss_mlp": 0.01041868, - "balance_loss_clip": 1.0520618, - "balance_loss_mlp": 1.02571511, - "epoch": 0.22383886968284986, - "flos": 17894970057600.0, - "grad_norm": 2.0043774256219997, - "language_loss": 0.82129884, - "learning_rate": 3.618628972906178e-06, - "loss": 0.84284806, - "num_input_tokens_seen": 80189030, - "step": 3723, - "time_per_iteration": 2.7908549308776855 - }, - { - "auxiliary_loss_clip": 0.01155309, - "auxiliary_loss_mlp": 0.01046826, - "balance_loss_clip": 1.05468059, - "balance_loss_mlp": 1.02857494, - "epoch": 0.22389899293551782, - "flos": 23879123982720.0, - "grad_norm": 2.0838579777085022, - "language_loss": 0.84742224, - "learning_rate": 3.6184001819405984e-06, - "loss": 0.86944354, - "num_input_tokens_seen": 80208365, - "step": 3724, - "time_per_iteration": 2.691678047180176 - }, - { - "auxiliary_loss_clip": 0.01123425, - "auxiliary_loss_mlp": 0.01042537, - "balance_loss_clip": 1.0494504, - "balance_loss_mlp": 1.02516866, - "epoch": 0.2239591161881858, - "flos": 27272430840960.0, - "grad_norm": 1.76453761267329, - "language_loss": 0.79456621, - "learning_rate": 3.618171329605121e-06, - "loss": 0.81622583, - "num_input_tokens_seen": 80228685, - "step": 3725, - "time_per_iteration": 4.339299917221069 - }, - { - "auxiliary_loss_clip": 0.01091555, - "auxiliary_loss_mlp": 0.01043361, - "balance_loss_clip": 1.05116296, - "balance_loss_mlp": 1.02538443, - "epoch": 0.22401923944085375, - "flos": 22236857233920.0, - "grad_norm": 1.776149940187026, - "language_loss": 0.77333415, - "learning_rate": 3.6179424159084254e-06, - "loss": 0.79468334, - "num_input_tokens_seen": 80247635, - "step": 3726, - "time_per_iteration": 4.320322275161743 - }, - { - "auxiliary_loss_clip": 0.0115151, - "auxiliary_loss_mlp": 0.01047267, - "balance_loss_clip": 1.05424356, - "balance_loss_mlp": 1.02664328, - "epoch": 0.22407936269352172, - "flos": 12053668521600.0, - "grad_norm": 2.83844669603944, - "language_loss": 0.72643399, - "learning_rate": 3.6177134408591914e-06, - "loss": 0.74842173, - "num_input_tokens_seen": 80260045, - "step": 3727, - "time_per_iteration": 4.218656539916992 - }, - { - "auxiliary_loss_clip": 0.01157504, - "auxiliary_loss_mlp": 0.01043436, - "balance_loss_clip": 1.0541296, - "balance_loss_mlp": 1.02321815, - "epoch": 0.22413948594618968, - "flos": 19353553632000.0, - "grad_norm": 2.250671737688348, - "language_loss": 0.86600292, - "learning_rate": 3.6174844044661013e-06, - "loss": 0.88801229, - "num_input_tokens_seen": 80277680, - "step": 3728, - "time_per_iteration": 2.650423765182495 - }, - { - "auxiliary_loss_clip": 0.01122602, - "auxiliary_loss_mlp": 0.01053562, - "balance_loss_clip": 1.050982, - "balance_loss_mlp": 1.03134131, - "epoch": 0.22419960919885765, - "flos": 24170000319360.0, - "grad_norm": 2.1953419048873877, - "language_loss": 0.80038953, - "learning_rate": 3.6172553067378406e-06, - "loss": 0.82215106, - "num_input_tokens_seen": 80294795, - "step": 3729, - "time_per_iteration": 2.7553794384002686 - }, - { - "auxiliary_loss_clip": 0.01126228, - "auxiliary_loss_mlp": 0.01046911, - "balance_loss_clip": 1.05183935, - "balance_loss_mlp": 1.02992368, - "epoch": 0.22425973245152564, - "flos": 27378977558400.0, - "grad_norm": 1.8211738544282683, - "language_loss": 0.86968076, - "learning_rate": 3.6170261476830964e-06, - "loss": 0.89141214, - "num_input_tokens_seen": 80315425, - "step": 3730, - "time_per_iteration": 2.8044395446777344 - }, - { - "auxiliary_loss_clip": 0.01121982, - "auxiliary_loss_mlp": 0.00775761, - "balance_loss_clip": 1.04924226, - "balance_loss_mlp": 1.00148201, - "epoch": 0.2243198557041936, - "flos": 13735652734080.0, - "grad_norm": 2.1817469574553017, - "language_loss": 0.73091185, - "learning_rate": 3.616796927310559e-06, - "loss": 0.74988931, - "num_input_tokens_seen": 80333905, - "step": 3731, - "time_per_iteration": 2.764198064804077 - }, - { - "auxiliary_loss_clip": 0.01127044, - "auxiliary_loss_mlp": 0.0104235, - "balance_loss_clip": 1.05654919, - "balance_loss_mlp": 1.02467108, - "epoch": 0.22437997895686157, - "flos": 19530700531200.0, - "grad_norm": 2.1924274894904787, - "language_loss": 0.75427651, - "learning_rate": 3.6165676456289195e-06, - "loss": 0.77597046, - "num_input_tokens_seen": 80352165, - "step": 3732, - "time_per_iteration": 4.544835090637207 - }, - { - "auxiliary_loss_clip": 0.01155285, - "auxiliary_loss_mlp": 0.01053522, - "balance_loss_clip": 1.05655456, - "balance_loss_mlp": 1.03560436, - "epoch": 0.22444010220952954, - "flos": 23696230907520.0, - "grad_norm": 1.745203479087184, - "language_loss": 0.88139856, - "learning_rate": 3.616338302646873e-06, - "loss": 0.90348667, - "num_input_tokens_seen": 80371305, - "step": 3733, - "time_per_iteration": 2.7097933292388916 - }, - { - "auxiliary_loss_clip": 0.0110922, - "auxiliary_loss_mlp": 0.01040674, - "balance_loss_clip": 1.05094051, - "balance_loss_mlp": 1.02264953, - "epoch": 0.2245002254621975, - "flos": 22382905933440.0, - "grad_norm": 1.6873732683679492, - "language_loss": 0.84643197, - "learning_rate": 3.6161088983731166e-06, - "loss": 0.86793089, - "num_input_tokens_seen": 80391020, - "step": 3734, - "time_per_iteration": 2.7647547721862793 - }, - { - "auxiliary_loss_clip": 0.0113181, - "auxiliary_loss_mlp": 0.01049327, - "balance_loss_clip": 1.05362856, - "balance_loss_mlp": 1.03149319, - "epoch": 0.22456034871486547, - "flos": 26942303917440.0, - "grad_norm": 1.774553175519815, - "language_loss": 0.7679311, - "learning_rate": 3.6158794328163482e-06, - "loss": 0.78974247, - "num_input_tokens_seen": 80411365, - "step": 3735, - "time_per_iteration": 2.7682430744171143 - }, - { - "auxiliary_loss_clip": 0.01138858, - "auxiliary_loss_mlp": 0.01045746, - "balance_loss_clip": 1.06029248, - "balance_loss_mlp": 1.02927136, - "epoch": 0.22462047196753343, - "flos": 28983538005120.0, - "grad_norm": 1.671324371931155, - "language_loss": 0.842767, - "learning_rate": 3.6156499059852702e-06, - "loss": 0.86461306, - "num_input_tokens_seen": 80431075, - "step": 3736, - "time_per_iteration": 3.009368419647217 - }, - { - "auxiliary_loss_clip": 0.0111279, - "auxiliary_loss_mlp": 0.01044111, - "balance_loss_clip": 1.05240226, - "balance_loss_mlp": 1.02677774, - "epoch": 0.22468059522020142, - "flos": 20011329440640.0, - "grad_norm": 1.8971112354532307, - "language_loss": 0.86643183, - "learning_rate": 3.615420317888586e-06, - "loss": 0.88800085, - "num_input_tokens_seen": 80449240, - "step": 3737, - "time_per_iteration": 2.792965888977051 - }, - { - "auxiliary_loss_clip": 0.0115891, - "auxiliary_loss_mlp": 0.0104972, - "balance_loss_clip": 1.05792093, - "balance_loss_mlp": 1.03051496, - "epoch": 0.2247407184728694, - "flos": 29314239546240.0, - "grad_norm": 6.664079021041442, - "language_loss": 0.79027152, - "learning_rate": 3.6151906685350006e-06, - "loss": 0.81235784, - "num_input_tokens_seen": 80467900, - "step": 3738, - "time_per_iteration": 2.716878652572632 - }, - { - "auxiliary_loss_clip": 0.01122737, - "auxiliary_loss_mlp": 0.01047993, - "balance_loss_clip": 1.0520165, - "balance_loss_mlp": 1.0315063, - "epoch": 0.22480084172553735, - "flos": 22310366417280.0, - "grad_norm": 1.837059456311059, - "language_loss": 0.76693523, - "learning_rate": 3.614960957933224e-06, - "loss": 0.78864253, - "num_input_tokens_seen": 80487100, - "step": 3739, - "time_per_iteration": 2.743222713470459 - }, - { - "auxiliary_loss_clip": 0.01116493, - "auxiliary_loss_mlp": 0.01049772, - "balance_loss_clip": 1.05008686, - "balance_loss_mlp": 1.03011417, - "epoch": 0.22486096497820532, - "flos": 25591272641280.0, - "grad_norm": 2.2924613412630133, - "language_loss": 0.74577379, - "learning_rate": 3.6147311860919655e-06, - "loss": 0.7674365, - "num_input_tokens_seen": 80508625, - "step": 3740, - "time_per_iteration": 2.7339253425598145 - }, - { - "auxiliary_loss_clip": 0.01152276, - "auxiliary_loss_mlp": 0.01045147, - "balance_loss_clip": 1.05556941, - "balance_loss_mlp": 1.02728927, - "epoch": 0.22492108823087328, - "flos": 17639824775040.0, - "grad_norm": 1.9086069443180373, - "language_loss": 0.75610423, - "learning_rate": 3.614501353019939e-06, - "loss": 0.77807844, - "num_input_tokens_seen": 80527345, - "step": 3741, - "time_per_iteration": 2.7347571849823 - }, - { - "auxiliary_loss_clip": 0.01133279, - "auxiliary_loss_mlp": 0.01039745, - "balance_loss_clip": 1.05599904, - "balance_loss_mlp": 1.02316284, - "epoch": 0.22498121148354125, - "flos": 16034653797120.0, - "grad_norm": 1.7754272123040742, - "language_loss": 0.87332213, - "learning_rate": 3.6142714587258592e-06, - "loss": 0.89505225, - "num_input_tokens_seen": 80545545, - "step": 3742, - "time_per_iteration": 2.702103614807129 - }, - { - "auxiliary_loss_clip": 0.01095068, - "auxiliary_loss_mlp": 0.01053093, - "balance_loss_clip": 1.04728913, - "balance_loss_mlp": 1.03398395, - "epoch": 0.22504133473620924, - "flos": 24023772051840.0, - "grad_norm": 2.1035678371185256, - "language_loss": 0.812823, - "learning_rate": 3.614041503218444e-06, - "loss": 0.83430457, - "num_input_tokens_seen": 80565040, - "step": 3743, - "time_per_iteration": 2.777566909790039 - }, - { - "auxiliary_loss_clip": 0.01142483, - "auxiliary_loss_mlp": 0.01040692, - "balance_loss_clip": 1.05282855, - "balance_loss_mlp": 1.02319252, - "epoch": 0.2251014579888772, - "flos": 16763963541120.0, - "grad_norm": 2.836562973763206, - "language_loss": 0.63821399, - "learning_rate": 3.6138114865064134e-06, - "loss": 0.66004574, - "num_input_tokens_seen": 80582815, - "step": 3744, - "time_per_iteration": 2.6738698482513428 - }, - { - "auxiliary_loss_clip": 0.01139201, - "auxiliary_loss_mlp": 0.01043137, - "balance_loss_clip": 1.05523586, - "balance_loss_mlp": 1.0255779, - "epoch": 0.22516158124154517, - "flos": 13991013498240.0, - "grad_norm": 4.405698565190268, - "language_loss": 0.76340199, - "learning_rate": 3.613581408598489e-06, - "loss": 0.78522527, - "num_input_tokens_seen": 80600865, - "step": 3745, - "time_per_iteration": 2.8423044681549072 - }, - { - "auxiliary_loss_clip": 0.01116037, - "auxiliary_loss_mlp": 0.0104407, - "balance_loss_clip": 1.04906797, - "balance_loss_mlp": 1.0267489, - "epoch": 0.22522170449421314, - "flos": 14390016750720.0, - "grad_norm": 7.51155110796741, - "language_loss": 0.8056733, - "learning_rate": 3.6133512695033965e-06, - "loss": 0.82727438, - "num_input_tokens_seen": 80617455, - "step": 3746, - "time_per_iteration": 2.743417739868164 - }, - { - "auxiliary_loss_clip": 0.01142091, - "auxiliary_loss_mlp": 0.01050597, - "balance_loss_clip": 1.05323768, - "balance_loss_mlp": 1.0328114, - "epoch": 0.2252818277468811, - "flos": 23805542972160.0, - "grad_norm": 2.6189948571262116, - "language_loss": 0.86153656, - "learning_rate": 3.613121069229862e-06, - "loss": 0.88346344, - "num_input_tokens_seen": 80635125, - "step": 3747, - "time_per_iteration": 2.7622148990631104 - }, - { - "auxiliary_loss_clip": 0.01138021, - "auxiliary_loss_mlp": 0.0077598, - "balance_loss_clip": 1.05126321, - "balance_loss_mlp": 1.00154519, - "epoch": 0.22534195099954907, - "flos": 24718033100160.0, - "grad_norm": 2.3477587169419483, - "language_loss": 0.76400602, - "learning_rate": 3.6128908077866145e-06, - "loss": 0.78314602, - "num_input_tokens_seen": 80656370, - "step": 3748, - "time_per_iteration": 2.7347261905670166 - }, - { - "auxiliary_loss_clip": 0.01156837, - "auxiliary_loss_mlp": 0.01043045, - "balance_loss_clip": 1.05704546, - "balance_loss_mlp": 1.02525926, - "epoch": 0.22540207425221703, - "flos": 21032341534080.0, - "grad_norm": 1.5503962030073002, - "language_loss": 0.7984724, - "learning_rate": 3.6126604851823864e-06, - "loss": 0.82047117, - "num_input_tokens_seen": 80676495, - "step": 3749, - "time_per_iteration": 2.6900558471679688 - }, - { - "auxiliary_loss_clip": 0.01123701, - "auxiliary_loss_mlp": 0.01041028, - "balance_loss_clip": 1.05050755, - "balance_loss_mlp": 1.02436304, - "epoch": 0.22546219750488503, - "flos": 19390362094080.0, - "grad_norm": 3.015206251853355, - "language_loss": 0.79585081, - "learning_rate": 3.6124301014259108e-06, - "loss": 0.81749809, - "num_input_tokens_seen": 80694755, - "step": 3750, - "time_per_iteration": 2.727651596069336 - }, - { - "auxiliary_loss_clip": 0.01097337, - "auxiliary_loss_mlp": 0.01055462, - "balance_loss_clip": 1.05065274, - "balance_loss_mlp": 1.03756917, - "epoch": 0.225522320757553, - "flos": 25192628524800.0, - "grad_norm": 2.662961533862713, - "language_loss": 0.82433236, - "learning_rate": 3.6121996565259244e-06, - "loss": 0.84586036, - "num_input_tokens_seen": 80713670, - "step": 3751, - "time_per_iteration": 2.827995538711548 - }, - { - "auxiliary_loss_clip": 0.01121046, - "auxiliary_loss_mlp": 0.01046103, - "balance_loss_clip": 1.05429292, - "balance_loss_mlp": 1.02828133, - "epoch": 0.22558244401022096, - "flos": 17163110448000.0, - "grad_norm": 2.0142745824369315, - "language_loss": 0.83813727, - "learning_rate": 3.611969150491165e-06, - "loss": 0.8598088, - "num_input_tokens_seen": 80731450, - "step": 3752, - "time_per_iteration": 2.78725266456604 - }, - { - "auxiliary_loss_clip": 0.01152116, - "auxiliary_loss_mlp": 0.01037502, - "balance_loss_clip": 1.05584741, - "balance_loss_mlp": 1.02123034, - "epoch": 0.22564256726288892, - "flos": 15231008856960.0, - "grad_norm": 1.9292267305553392, - "language_loss": 0.78254855, - "learning_rate": 3.611738583330375e-06, - "loss": 0.80444479, - "num_input_tokens_seen": 80748415, - "step": 3753, - "time_per_iteration": 2.7116169929504395 - }, - { - "auxiliary_loss_clip": 0.01126321, - "auxiliary_loss_mlp": 0.0104341, - "balance_loss_clip": 1.05120027, - "balance_loss_mlp": 1.02546871, - "epoch": 0.2257026905155569, - "flos": 34568652764160.0, - "grad_norm": 1.8777790089425805, - "language_loss": 0.78391469, - "learning_rate": 3.611507955052295e-06, - "loss": 0.80561191, - "num_input_tokens_seen": 80770835, - "step": 3754, - "time_per_iteration": 2.91738224029541 - }, - { - "auxiliary_loss_clip": 0.01128102, - "auxiliary_loss_mlp": 0.01048192, - "balance_loss_clip": 1.05648673, - "balance_loss_mlp": 1.03040624, - "epoch": 0.22576281376822485, - "flos": 19938430788480.0, - "grad_norm": 1.9337610105869587, - "language_loss": 0.70648986, - "learning_rate": 3.6112772656656727e-06, - "loss": 0.72825277, - "num_input_tokens_seen": 80787840, - "step": 3755, - "time_per_iteration": 2.7427992820739746 - }, - { - "auxiliary_loss_clip": 0.01126515, - "auxiliary_loss_mlp": 0.01053366, - "balance_loss_clip": 1.05531752, - "balance_loss_mlp": 1.03559232, - "epoch": 0.22582293702089282, - "flos": 24602005192320.0, - "grad_norm": 3.9817469401483216, - "language_loss": 0.77865845, - "learning_rate": 3.6110465151792547e-06, - "loss": 0.80045724, - "num_input_tokens_seen": 80806335, - "step": 3756, - "time_per_iteration": 2.7879996299743652 - }, - { - "auxiliary_loss_clip": 0.01132066, - "auxiliary_loss_mlp": 0.01044227, - "balance_loss_clip": 1.0559032, - "balance_loss_mlp": 1.0261426, - "epoch": 0.2258830602735608, - "flos": 23035438356480.0, - "grad_norm": 1.801741818571408, - "language_loss": 0.82615864, - "learning_rate": 3.6108157036017916e-06, - "loss": 0.84792161, - "num_input_tokens_seen": 80825355, - "step": 3757, - "time_per_iteration": 2.685218095779419 - }, - { - "auxiliary_loss_clip": 0.01140048, - "auxiliary_loss_mlp": 0.01047555, - "balance_loss_clip": 1.05321026, - "balance_loss_mlp": 1.02917302, - "epoch": 0.22594318352622877, - "flos": 22158427887360.0, - "grad_norm": 2.3786564016745495, - "language_loss": 0.73007452, - "learning_rate": 3.6105848309420358e-06, - "loss": 0.7519505, - "num_input_tokens_seen": 80842570, - "step": 3758, - "time_per_iteration": 2.6716878414154053 - }, - { - "auxiliary_loss_clip": 0.01137739, - "auxiliary_loss_mlp": 0.01048984, - "balance_loss_clip": 1.0577718, - "balance_loss_mlp": 1.03019619, - "epoch": 0.22600330677889674, - "flos": 20594303176320.0, - "grad_norm": 2.226232476294752, - "language_loss": 0.77150333, - "learning_rate": 3.6103538972087412e-06, - "loss": 0.79337054, - "num_input_tokens_seen": 80858745, - "step": 3759, - "time_per_iteration": 2.787487030029297 - }, - { - "auxiliary_loss_clip": 0.01104852, - "auxiliary_loss_mlp": 0.01043473, - "balance_loss_clip": 1.04747176, - "balance_loss_mlp": 1.02507949, - "epoch": 0.2260634300315647, - "flos": 35659798162560.0, - "grad_norm": 1.6253921855068183, - "language_loss": 0.78189945, - "learning_rate": 3.6101229024106655e-06, - "loss": 0.80338269, - "num_input_tokens_seen": 80880085, - "step": 3760, - "time_per_iteration": 2.8760766983032227 - }, - { - "auxiliary_loss_clip": 0.01042849, - "auxiliary_loss_mlp": 0.01009599, - "balance_loss_clip": 1.03235281, - "balance_loss_mlp": 1.00633264, - "epoch": 0.22612355328423267, - "flos": 72090455126400.0, - "grad_norm": 0.9481639821873915, - "language_loss": 0.60083473, - "learning_rate": 3.609891846556569e-06, - "loss": 0.62135923, - "num_input_tokens_seen": 80937660, - "step": 3761, - "time_per_iteration": 3.2168753147125244 - }, - { - "auxiliary_loss_clip": 0.01114836, - "auxiliary_loss_mlp": 0.01051216, - "balance_loss_clip": 1.0493567, - "balance_loss_mlp": 1.03295338, - "epoch": 0.22618367653690064, - "flos": 22783776693120.0, - "grad_norm": 2.3328987294287047, - "language_loss": 0.76767397, - "learning_rate": 3.609660729655211e-06, - "loss": 0.78933448, - "num_input_tokens_seen": 80956265, - "step": 3762, - "time_per_iteration": 2.8012428283691406 - }, - { - "auxiliary_loss_clip": 0.01128732, - "auxiliary_loss_mlp": 0.01042327, - "balance_loss_clip": 1.05266595, - "balance_loss_mlp": 1.02190685, - "epoch": 0.22624379978956863, - "flos": 20448254476800.0, - "grad_norm": 2.7297545785195907, - "language_loss": 0.79000401, - "learning_rate": 3.6094295517153573e-06, - "loss": 0.81171465, - "num_input_tokens_seen": 80975185, - "step": 3763, - "time_per_iteration": 2.7217857837677 - }, - { - "auxiliary_loss_clip": 0.01142679, - "auxiliary_loss_mlp": 0.01057425, - "balance_loss_clip": 1.0557214, - "balance_loss_mlp": 1.03835177, - "epoch": 0.2263039230422366, - "flos": 17494314779520.0, - "grad_norm": 31.68022075556768, - "language_loss": 0.91241246, - "learning_rate": 3.6091983127457743e-06, - "loss": 0.93441343, - "num_input_tokens_seen": 80992830, - "step": 3764, - "time_per_iteration": 4.232046842575073 - }, - { - "auxiliary_loss_clip": 0.01131876, - "auxiliary_loss_mlp": 0.01055516, - "balance_loss_clip": 1.05196834, - "balance_loss_mlp": 1.0367409, - "epoch": 0.22636404629490456, - "flos": 28329748606080.0, - "grad_norm": 1.9816130101247444, - "language_loss": 0.75202596, - "learning_rate": 3.6089670127552293e-06, - "loss": 0.77389991, - "num_input_tokens_seen": 81013675, - "step": 3765, - "time_per_iteration": 4.291628122329712 - }, - { - "auxiliary_loss_clip": 0.01140284, - "auxiliary_loss_mlp": 0.01047009, - "balance_loss_clip": 1.05632913, - "balance_loss_mlp": 1.02942574, - "epoch": 0.22642416954757252, - "flos": 17489143221120.0, - "grad_norm": 2.1881182413466176, - "language_loss": 0.8966549, - "learning_rate": 3.608735651752494e-06, - "loss": 0.91852784, - "num_input_tokens_seen": 81030345, - "step": 3766, - "time_per_iteration": 2.6462960243225098 - }, - { - "auxiliary_loss_clip": 0.01126107, - "auxiliary_loss_mlp": 0.01047462, - "balance_loss_clip": 1.05579042, - "balance_loss_mlp": 1.02950931, - "epoch": 0.2264842928002405, - "flos": 24384530298240.0, - "grad_norm": 1.6297384952566736, - "language_loss": 0.74816859, - "learning_rate": 3.6085042297463417e-06, - "loss": 0.76990426, - "num_input_tokens_seen": 81051000, - "step": 3767, - "time_per_iteration": 4.181917667388916 - }, - { - "auxiliary_loss_clip": 0.01139766, - "auxiliary_loss_mlp": 0.01048037, - "balance_loss_clip": 1.05206823, - "balance_loss_mlp": 1.02981031, - "epoch": 0.22654441605290845, - "flos": 19830519354240.0, - "grad_norm": 1.6389844555489992, - "language_loss": 0.71764815, - "learning_rate": 3.6082727467455477e-06, - "loss": 0.73952615, - "num_input_tokens_seen": 81071205, - "step": 3768, - "time_per_iteration": 2.6622893810272217 - }, - { - "auxiliary_loss_clip": 0.01143239, - "auxiliary_loss_mlp": 0.01057198, - "balance_loss_clip": 1.05766034, - "balance_loss_mlp": 1.03895879, - "epoch": 0.22660453930557642, - "flos": 27454569730560.0, - "grad_norm": 1.5883345705718652, - "language_loss": 0.78320074, - "learning_rate": 3.6080412027588905e-06, - "loss": 0.80520505, - "num_input_tokens_seen": 81091880, - "step": 3769, - "time_per_iteration": 2.692366123199463 - }, - { - "auxiliary_loss_clip": 0.01121985, - "auxiliary_loss_mlp": 0.01045951, - "balance_loss_clip": 1.0452522, - "balance_loss_mlp": 1.02712774, - "epoch": 0.2266646625582444, - "flos": 23988148738560.0, - "grad_norm": 1.8427419299971495, - "language_loss": 0.6877771, - "learning_rate": 3.6078095977951488e-06, - "loss": 0.70945644, - "num_input_tokens_seen": 81113290, - "step": 3770, - "time_per_iteration": 2.7605137825012207 - }, - { - "auxiliary_loss_clip": 0.01155061, - "auxiliary_loss_mlp": 0.01053072, - "balance_loss_clip": 1.0551908, - "balance_loss_mlp": 1.03454649, - "epoch": 0.22672478581091238, - "flos": 26028054023040.0, - "grad_norm": 1.6594447480271795, - "language_loss": 0.80540276, - "learning_rate": 3.6075779318631067e-06, - "loss": 0.82748413, - "num_input_tokens_seen": 81133535, - "step": 3771, - "time_per_iteration": 4.265140771865845 - }, - { - "auxiliary_loss_clip": 0.0110854, - "auxiliary_loss_mlp": 0.01058177, - "balance_loss_clip": 1.04661536, - "balance_loss_mlp": 1.04091501, - "epoch": 0.22678490906358034, - "flos": 23841812730240.0, - "grad_norm": 1.6696234119475444, - "language_loss": 0.78947794, - "learning_rate": 3.6073462049715486e-06, - "loss": 0.81114507, - "num_input_tokens_seen": 81154650, - "step": 3772, - "time_per_iteration": 2.7325806617736816 - }, - { - "auxiliary_loss_clip": 0.01036659, - "auxiliary_loss_mlp": 0.0100656, - "balance_loss_clip": 1.0461247, - "balance_loss_mlp": 1.00336492, - "epoch": 0.2268450323162483, - "flos": 65048088574080.0, - "grad_norm": 0.653194629863103, - "language_loss": 0.54380804, - "learning_rate": 3.607114417129261e-06, - "loss": 0.56424022, - "num_input_tokens_seen": 81221240, - "step": 3773, - "time_per_iteration": 3.3729567527770996 - }, - { - "auxiliary_loss_clip": 0.0111914, - "auxiliary_loss_mlp": 0.01046238, - "balance_loss_clip": 1.05257821, - "balance_loss_mlp": 1.02851129, - "epoch": 0.22690515556891627, - "flos": 22526081544960.0, - "grad_norm": 1.81548541557593, - "language_loss": 0.70406783, - "learning_rate": 3.6068825683450334e-06, - "loss": 0.7257216, - "num_input_tokens_seen": 81241520, - "step": 3774, - "time_per_iteration": 2.7159364223480225 - }, - { - "auxiliary_loss_clip": 0.01125586, - "auxiliary_loss_mlp": 0.01046805, - "balance_loss_clip": 1.05404115, - "balance_loss_mlp": 1.02929282, - "epoch": 0.22696527882158424, - "flos": 18223444955520.0, - "grad_norm": 2.2603412716687523, - "language_loss": 0.74377871, - "learning_rate": 3.606650658627658e-06, - "loss": 0.76550257, - "num_input_tokens_seen": 81256825, - "step": 3775, - "time_per_iteration": 2.7857720851898193 - }, - { - "auxiliary_loss_clip": 0.01152024, - "auxiliary_loss_mlp": 0.01045868, - "balance_loss_clip": 1.05331159, - "balance_loss_mlp": 1.02915478, - "epoch": 0.22702540207425223, - "flos": 17019252478080.0, - "grad_norm": 1.8428958927362264, - "language_loss": 0.81582248, - "learning_rate": 3.606418687985928e-06, - "loss": 0.83780146, - "num_input_tokens_seen": 81275695, - "step": 3776, - "time_per_iteration": 2.6054935455322266 - }, - { - "auxiliary_loss_clip": 0.01135081, - "auxiliary_loss_mlp": 0.01043769, - "balance_loss_clip": 1.05466735, - "balance_loss_mlp": 1.02654314, - "epoch": 0.2270855253269202, - "flos": 21325731822720.0, - "grad_norm": 1.7711090356153572, - "language_loss": 0.82893199, - "learning_rate": 3.606186656428641e-06, - "loss": 0.85072052, - "num_input_tokens_seen": 81294920, - "step": 3777, - "time_per_iteration": 2.722621202468872 - }, - { - "auxiliary_loss_clip": 0.01127657, - "auxiliary_loss_mlp": 0.01042436, - "balance_loss_clip": 1.05438471, - "balance_loss_mlp": 1.02435195, - "epoch": 0.22714564857958816, - "flos": 23550469516800.0, - "grad_norm": 2.3905711679994295, - "language_loss": 0.72538829, - "learning_rate": 3.6059545639645955e-06, - "loss": 0.74708927, - "num_input_tokens_seen": 81314275, - "step": 3778, - "time_per_iteration": 2.730919599533081 - }, - { - "auxiliary_loss_clip": 0.01112853, - "auxiliary_loss_mlp": 0.01040216, - "balance_loss_clip": 1.05304575, - "balance_loss_mlp": 1.02241838, - "epoch": 0.22720577183225613, - "flos": 25989880844160.0, - "grad_norm": 2.4150679449588535, - "language_loss": 0.64176035, - "learning_rate": 3.605722410602591e-06, - "loss": 0.66329098, - "num_input_tokens_seen": 81333890, - "step": 3779, - "time_per_iteration": 2.7663822174072266 - }, - { - "auxiliary_loss_clip": 0.01132359, - "auxiliary_loss_mlp": 0.01047274, - "balance_loss_clip": 1.05292106, - "balance_loss_mlp": 1.02928495, - "epoch": 0.2272658950849241, - "flos": 20814076540800.0, - "grad_norm": 1.6627524387617407, - "language_loss": 0.70659381, - "learning_rate": 3.6054901963514323e-06, - "loss": 0.72839016, - "num_input_tokens_seen": 81353640, - "step": 3780, - "time_per_iteration": 2.666081666946411 - }, - { - "auxiliary_loss_clip": 0.0114157, - "auxiliary_loss_mlp": 0.01046965, - "balance_loss_clip": 1.05450416, - "balance_loss_mlp": 1.02880907, - "epoch": 0.22732601833759206, - "flos": 23909324342400.0, - "grad_norm": 1.783300050979337, - "language_loss": 0.89418924, - "learning_rate": 3.6052579212199246e-06, - "loss": 0.91607457, - "num_input_tokens_seen": 81371595, - "step": 3781, - "time_per_iteration": 2.686478614807129 - }, - { - "auxiliary_loss_clip": 0.01152428, - "auxiliary_loss_mlp": 0.01041162, - "balance_loss_clip": 1.05349672, - "balance_loss_mlp": 1.02354264, - "epoch": 0.22738614159026002, - "flos": 15924407978880.0, - "grad_norm": 19.977426185094338, - "language_loss": 0.74404943, - "learning_rate": 3.6050255852168753e-06, - "loss": 0.76598531, - "num_input_tokens_seen": 81388435, - "step": 3782, - "time_per_iteration": 2.5633177757263184 - }, - { - "auxiliary_loss_clip": 0.01129007, - "auxiliary_loss_mlp": 0.01045443, - "balance_loss_clip": 1.05195391, - "balance_loss_mlp": 1.02926588, - "epoch": 0.22744626484292801, - "flos": 24205515891840.0, - "grad_norm": 2.051662638457334, - "language_loss": 0.82665169, - "learning_rate": 3.604793188351095e-06, - "loss": 0.84839618, - "num_input_tokens_seen": 81410195, - "step": 3783, - "time_per_iteration": 2.742572069168091 - }, - { - "auxiliary_loss_clip": 0.01129724, - "auxiliary_loss_mlp": 0.01043254, - "balance_loss_clip": 1.055516, - "balance_loss_mlp": 1.02495527, - "epoch": 0.22750638809559598, - "flos": 24791614110720.0, - "grad_norm": 2.0126417567412256, - "language_loss": 0.75996566, - "learning_rate": 3.6045607306313964e-06, - "loss": 0.78169543, - "num_input_tokens_seen": 81430060, - "step": 3784, - "time_per_iteration": 2.7283668518066406 - }, - { - "auxiliary_loss_clip": 0.01148666, - "auxiliary_loss_mlp": 0.01041397, - "balance_loss_clip": 1.05224681, - "balance_loss_mlp": 1.02382576, - "epoch": 0.22756651134826394, - "flos": 22236498097920.0, - "grad_norm": 1.784429661746796, - "language_loss": 0.7105484, - "learning_rate": 3.604328212066594e-06, - "loss": 0.73244894, - "num_input_tokens_seen": 81447375, - "step": 3785, - "time_per_iteration": 2.627401351928711 - }, - { - "auxiliary_loss_clip": 0.01042691, - "auxiliary_loss_mlp": 0.0101642, - "balance_loss_clip": 1.03303862, - "balance_loss_mlp": 1.01427412, - "epoch": 0.2276266346009319, - "flos": 62707466626560.0, - "grad_norm": 0.8323137639565091, - "language_loss": 0.6189881, - "learning_rate": 3.6040956326655047e-06, - "loss": 0.63957924, - "num_input_tokens_seen": 81505235, - "step": 3786, - "time_per_iteration": 3.321380376815796 - }, - { - "auxiliary_loss_clip": 0.01135149, - "auxiliary_loss_mlp": 0.01044526, - "balance_loss_clip": 1.0540669, - "balance_loss_mlp": 1.02645397, - "epoch": 0.22768675785359987, - "flos": 18613936684800.0, - "grad_norm": 2.677223616893363, - "language_loss": 0.86047274, - "learning_rate": 3.6038629924369486e-06, - "loss": 0.8822695, - "num_input_tokens_seen": 81518685, - "step": 3787, - "time_per_iteration": 2.72554349899292 - }, - { - "auxiliary_loss_clip": 0.01129718, - "auxiliary_loss_mlp": 0.01039908, - "balance_loss_clip": 1.05296564, - "balance_loss_mlp": 1.02323031, - "epoch": 0.22774688110626784, - "flos": 26870195364480.0, - "grad_norm": 1.361320938410825, - "language_loss": 0.72755021, - "learning_rate": 3.6036302913897474e-06, - "loss": 0.74924648, - "num_input_tokens_seen": 81538940, - "step": 3788, - "time_per_iteration": 2.7717456817626953 - }, - { - "auxiliary_loss_clip": 0.01125411, - "auxiliary_loss_mlp": 0.01035437, - "balance_loss_clip": 1.05099773, - "balance_loss_mlp": 1.01800895, - "epoch": 0.2278070043589358, - "flos": 15553593924480.0, - "grad_norm": 2.510042380876752, - "language_loss": 0.67785919, - "learning_rate": 3.6033975295327243e-06, - "loss": 0.69946766, - "num_input_tokens_seen": 81555525, - "step": 3789, - "time_per_iteration": 2.6492021083831787 - }, - { - "auxiliary_loss_clip": 0.01114067, - "auxiliary_loss_mlp": 0.01042939, - "balance_loss_clip": 1.04577208, - "balance_loss_mlp": 1.0244137, - "epoch": 0.2278671276116038, - "flos": 22416805393920.0, - "grad_norm": 2.807016388048184, - "language_loss": 0.76026487, - "learning_rate": 3.6031647068747065e-06, - "loss": 0.7818349, - "num_input_tokens_seen": 81576305, - "step": 3790, - "time_per_iteration": 2.789419412612915 - }, - { - "auxiliary_loss_clip": 0.01094774, - "auxiliary_loss_mlp": 0.01043575, - "balance_loss_clip": 1.04942632, - "balance_loss_mlp": 1.02388144, - "epoch": 0.22792725086427176, - "flos": 20631363033600.0, - "grad_norm": 2.1998519418279843, - "language_loss": 0.9070015, - "learning_rate": 3.602931823424522e-06, - "loss": 0.92838502, - "num_input_tokens_seen": 81594115, - "step": 3791, - "time_per_iteration": 2.74957275390625 - }, - { - "auxiliary_loss_clip": 0.01143903, - "auxiliary_loss_mlp": 0.01039768, - "balance_loss_clip": 1.05332911, - "balance_loss_mlp": 1.02229166, - "epoch": 0.22798737411693973, - "flos": 31428946903680.0, - "grad_norm": 1.6288404079645773, - "language_loss": 0.82029706, - "learning_rate": 3.6026988791910026e-06, - "loss": 0.84213376, - "num_input_tokens_seen": 81615355, - "step": 3792, - "time_per_iteration": 2.7578563690185547 - }, - { - "auxiliary_loss_clip": 0.01074793, - "auxiliary_loss_mlp": 0.01002047, - "balance_loss_clip": 1.03528738, - "balance_loss_mlp": 0.99944824, - "epoch": 0.2280474973696077, - "flos": 52396685827200.0, - "grad_norm": 1.1490057531785423, - "language_loss": 0.65688264, - "learning_rate": 3.602465874182981e-06, - "loss": 0.67765105, - "num_input_tokens_seen": 81662075, - "step": 3793, - "time_per_iteration": 2.892385959625244 - }, - { - "auxiliary_loss_clip": 0.01156846, - "auxiliary_loss_mlp": 0.01048751, - "balance_loss_clip": 1.05509233, - "balance_loss_mlp": 1.03063166, - "epoch": 0.22810762062227566, - "flos": 26396066816640.0, - "grad_norm": 2.315054268007893, - "language_loss": 0.77095032, - "learning_rate": 3.602232808409293e-06, - "loss": 0.79300624, - "num_input_tokens_seen": 81681625, - "step": 3794, - "time_per_iteration": 2.6432933807373047 - }, - { - "auxiliary_loss_clip": 0.01106797, - "auxiliary_loss_mlp": 0.0104554, - "balance_loss_clip": 1.04641223, - "balance_loss_mlp": 1.02560771, - "epoch": 0.22816774387494362, - "flos": 25630271832960.0, - "grad_norm": 2.8263872836139194, - "language_loss": 0.80649161, - "learning_rate": 3.6019996818787755e-06, - "loss": 0.82801497, - "num_input_tokens_seen": 81701170, - "step": 3795, - "time_per_iteration": 2.748461961746216 - }, - { - "auxiliary_loss_clip": 0.01136851, - "auxiliary_loss_mlp": 0.01049098, - "balance_loss_clip": 1.0527277, - "balance_loss_mlp": 1.03194404, - "epoch": 0.22822786712761162, - "flos": 22451602694400.0, - "grad_norm": 1.970346796529307, - "language_loss": 0.77348727, - "learning_rate": 3.6017664946002704e-06, - "loss": 0.79534674, - "num_input_tokens_seen": 81721265, - "step": 3796, - "time_per_iteration": 2.6720409393310547 - }, - { - "auxiliary_loss_clip": 0.01111647, - "auxiliary_loss_mlp": 0.0077572, - "balance_loss_clip": 1.04920197, - "balance_loss_mlp": 1.00161827, - "epoch": 0.22828799038027958, - "flos": 12202554395520.0, - "grad_norm": 3.9384070064251793, - "language_loss": 0.95837742, - "learning_rate": 3.6015332465826188e-06, - "loss": 0.97725105, - "num_input_tokens_seen": 81736565, - "step": 3797, - "time_per_iteration": 2.730684995651245 - }, - { - "auxiliary_loss_clip": 0.01140956, - "auxiliary_loss_mlp": 0.00774906, - "balance_loss_clip": 1.05310869, - "balance_loss_mlp": 1.00178146, - "epoch": 0.22834811363294755, - "flos": 22085708803200.0, - "grad_norm": 2.215225796779507, - "language_loss": 0.81875294, - "learning_rate": 3.601299937834666e-06, - "loss": 0.83791155, - "num_input_tokens_seen": 81756240, - "step": 3798, - "time_per_iteration": 2.7082717418670654 - }, - { - "auxiliary_loss_clip": 0.01113838, - "auxiliary_loss_mlp": 0.01041342, - "balance_loss_clip": 1.04808974, - "balance_loss_mlp": 1.02263761, - "epoch": 0.2284082368856155, - "flos": 24860634094080.0, - "grad_norm": 2.1089113145856344, - "language_loss": 0.78796971, - "learning_rate": 3.6010665683652596e-06, - "loss": 0.8095215, - "num_input_tokens_seen": 81775720, - "step": 3799, - "time_per_iteration": 2.7810587882995605 - }, - { - "auxiliary_loss_clip": 0.01121546, - "auxiliary_loss_mlp": 0.01055329, - "balance_loss_clip": 1.04926765, - "balance_loss_mlp": 1.03627968, - "epoch": 0.22846836013828348, - "flos": 23292882109440.0, - "grad_norm": 1.7973625036918341, - "language_loss": 0.75191152, - "learning_rate": 3.6008331381832484e-06, - "loss": 0.77368033, - "num_input_tokens_seen": 81795830, - "step": 3800, - "time_per_iteration": 2.7185163497924805 - }, - { - "auxiliary_loss_clip": 0.01121477, - "auxiliary_loss_mlp": 0.01037963, - "balance_loss_clip": 1.04833913, - "balance_loss_mlp": 1.02235246, - "epoch": 0.22852848339095144, - "flos": 27416288810880.0, - "grad_norm": 1.7410667809724167, - "language_loss": 0.64073247, - "learning_rate": 3.600599647297484e-06, - "loss": 0.66232693, - "num_input_tokens_seen": 81815745, - "step": 3801, - "time_per_iteration": 2.7509078979492188 - }, - { - "auxiliary_loss_clip": 0.01129432, - "auxiliary_loss_mlp": 0.01038736, - "balance_loss_clip": 1.05498147, - "balance_loss_mlp": 1.02301216, - "epoch": 0.2285886066436194, - "flos": 26321157002880.0, - "grad_norm": 1.6732672610702524, - "language_loss": 0.81560862, - "learning_rate": 3.60036609571682e-06, - "loss": 0.83729029, - "num_input_tokens_seen": 81835155, - "step": 3802, - "time_per_iteration": 2.7188339233398438 - }, - { - "auxiliary_loss_clip": 0.01126952, - "auxiliary_loss_mlp": 0.0105215, - "balance_loss_clip": 1.05203629, - "balance_loss_mlp": 1.0342809, - "epoch": 0.2286487298962874, - "flos": 29716475022720.0, - "grad_norm": 2.0652844737971625, - "language_loss": 0.78909743, - "learning_rate": 3.600132483450114e-06, - "loss": 0.81088841, - "num_input_tokens_seen": 81855655, - "step": 3803, - "time_per_iteration": 2.7760777473449707 - }, - { - "auxiliary_loss_clip": 0.01109356, - "auxiliary_loss_mlp": 0.01043096, - "balance_loss_clip": 1.04399478, - "balance_loss_mlp": 1.02511966, - "epoch": 0.22870885314895537, - "flos": 21287199507840.0, - "grad_norm": 1.7519930287683254, - "language_loss": 0.84902716, - "learning_rate": 3.5998988105062235e-06, - "loss": 0.87055165, - "num_input_tokens_seen": 81876385, - "step": 3804, - "time_per_iteration": 5.891911745071411 - }, - { - "auxiliary_loss_clip": 0.01141965, - "auxiliary_loss_mlp": 0.01040951, - "balance_loss_clip": 1.05229163, - "balance_loss_mlp": 1.02440476, - "epoch": 0.22876897640162333, - "flos": 14939450161920.0, - "grad_norm": 2.045415026345325, - "language_loss": 0.76673448, - "learning_rate": 3.59966507689401e-06, - "loss": 0.78856367, - "num_input_tokens_seen": 81893225, - "step": 3805, - "time_per_iteration": 2.643104076385498 - }, - { - "auxiliary_loss_clip": 0.0112853, - "auxiliary_loss_mlp": 0.00775286, - "balance_loss_clip": 1.05192351, - "balance_loss_mlp": 1.00156116, - "epoch": 0.2288290996542913, - "flos": 18113917409280.0, - "grad_norm": 2.368547935700865, - "language_loss": 0.78250653, - "learning_rate": 3.5994312826223363e-06, - "loss": 0.80154467, - "num_input_tokens_seen": 81911350, - "step": 3806, - "time_per_iteration": 4.312817335128784 - }, - { - "auxiliary_loss_clip": 0.01123441, - "auxiliary_loss_mlp": 0.01052484, - "balance_loss_clip": 1.05244482, - "balance_loss_mlp": 1.03282619, - "epoch": 0.22888922290695926, - "flos": 39855457071360.0, - "grad_norm": 2.0706298183861, - "language_loss": 0.700813, - "learning_rate": 3.5991974277000684e-06, - "loss": 0.72257227, - "num_input_tokens_seen": 81935420, - "step": 3807, - "time_per_iteration": 2.8060836791992188 - }, - { - "auxiliary_loss_clip": 0.01143724, - "auxiliary_loss_mlp": 0.01057417, - "balance_loss_clip": 1.0545013, - "balance_loss_mlp": 1.03891551, - "epoch": 0.22894934615962723, - "flos": 23403774372480.0, - "grad_norm": 4.007429648995762, - "language_loss": 0.6543591, - "learning_rate": 3.5989635121360733e-06, - "loss": 0.6763705, - "num_input_tokens_seen": 81953845, - "step": 3808, - "time_per_iteration": 2.703885078430176 - }, - { - "auxiliary_loss_clip": 0.0109921, - "auxiliary_loss_mlp": 0.01061828, - "balance_loss_clip": 1.04773676, - "balance_loss_mlp": 1.04295671, - "epoch": 0.22900946941229522, - "flos": 18843011671680.0, - "grad_norm": 2.028069656557901, - "language_loss": 0.74749511, - "learning_rate": 3.598729535939222e-06, - "loss": 0.76910543, - "num_input_tokens_seen": 81972100, - "step": 3809, - "time_per_iteration": 2.726862907409668 - }, - { - "auxiliary_loss_clip": 0.01128097, - "auxiliary_loss_mlp": 0.01053112, - "balance_loss_clip": 1.0527637, - "balance_loss_mlp": 1.03666139, - "epoch": 0.22906959266496318, - "flos": 22929394429440.0, - "grad_norm": 1.6287389468918274, - "language_loss": 0.81654954, - "learning_rate": 3.5984954991183862e-06, - "loss": 0.83836162, - "num_input_tokens_seen": 81992760, - "step": 3810, - "time_per_iteration": 2.6750009059906006 - }, - { - "auxiliary_loss_clip": 0.01132496, - "auxiliary_loss_mlp": 0.01040979, - "balance_loss_clip": 1.05216146, - "balance_loss_mlp": 1.0247184, - "epoch": 0.22912971591763115, - "flos": 19354523299200.0, - "grad_norm": 2.375204791625097, - "language_loss": 0.78126299, - "learning_rate": 3.598261401682441e-06, - "loss": 0.80299771, - "num_input_tokens_seen": 82009080, - "step": 3811, - "time_per_iteration": 4.302153587341309 - }, - { - "auxiliary_loss_clip": 0.01130856, - "auxiliary_loss_mlp": 0.00775213, - "balance_loss_clip": 1.05357778, - "balance_loss_mlp": 1.00159776, - "epoch": 0.22918983917029911, - "flos": 19933546538880.0, - "grad_norm": 1.797699433224321, - "language_loss": 0.82817954, - "learning_rate": 3.5980272436402632e-06, - "loss": 0.84724021, - "num_input_tokens_seen": 82026705, - "step": 3812, - "time_per_iteration": 2.635796308517456 - }, - { - "auxiliary_loss_clip": 0.01089198, - "auxiliary_loss_mlp": 0.01067747, - "balance_loss_clip": 1.04705882, - "balance_loss_mlp": 1.0480535, - "epoch": 0.22924996242296708, - "flos": 16690885320960.0, - "grad_norm": 3.3357789636694952, - "language_loss": 0.82689399, - "learning_rate": 3.5977930250007324e-06, - "loss": 0.84846342, - "num_input_tokens_seen": 82043245, - "step": 3813, - "time_per_iteration": 2.7896463871002197 - }, - { - "auxiliary_loss_clip": 0.01135441, - "auxiliary_loss_mlp": 0.01044219, - "balance_loss_clip": 1.05230987, - "balance_loss_mlp": 1.02743411, - "epoch": 0.22931008567563504, - "flos": 33036164956800.0, - "grad_norm": 1.5779710642832598, - "language_loss": 0.70018709, - "learning_rate": 3.5975587457727298e-06, - "loss": 0.72198373, - "num_input_tokens_seen": 82066870, - "step": 3814, - "time_per_iteration": 2.759460687637329 - }, - { - "auxiliary_loss_clip": 0.01141204, - "auxiliary_loss_mlp": 0.01046745, - "balance_loss_clip": 1.05307984, - "balance_loss_mlp": 1.02947164, - "epoch": 0.229370208928303, - "flos": 23330696152320.0, - "grad_norm": 2.3195881009003174, - "language_loss": 0.66811371, - "learning_rate": 3.597324405965139e-06, - "loss": 0.6899932, - "num_input_tokens_seen": 82083180, - "step": 3815, - "time_per_iteration": 2.6878743171691895 - }, - { - "auxiliary_loss_clip": 0.01142177, - "auxiliary_loss_mlp": 0.01045942, - "balance_loss_clip": 1.05412412, - "balance_loss_mlp": 1.02921689, - "epoch": 0.229430332180971, - "flos": 28617213150720.0, - "grad_norm": 2.436037188170917, - "language_loss": 0.83555114, - "learning_rate": 3.597090005586848e-06, - "loss": 0.85743231, - "num_input_tokens_seen": 82102950, - "step": 3816, - "time_per_iteration": 2.702638626098633 - }, - { - "auxiliary_loss_clip": 0.01142001, - "auxiliary_loss_mlp": 0.01037145, - "balance_loss_clip": 1.05649173, - "balance_loss_mlp": 1.01952624, - "epoch": 0.22949045543363897, - "flos": 17238199829760.0, - "grad_norm": 2.261586370580253, - "language_loss": 0.8657164, - "learning_rate": 3.596855544646742e-06, - "loss": 0.88750786, - "num_input_tokens_seen": 82119510, - "step": 3817, - "time_per_iteration": 2.6439061164855957 - }, - { - "auxiliary_loss_clip": 0.01125222, - "auxiliary_loss_mlp": 0.01048919, - "balance_loss_clip": 1.0493896, - "balance_loss_mlp": 1.03166902, - "epoch": 0.22955057868630693, - "flos": 27489438858240.0, - "grad_norm": 3.8274774650765706, - "language_loss": 0.74976468, - "learning_rate": 3.5966210231537154e-06, - "loss": 0.77150607, - "num_input_tokens_seen": 82140095, - "step": 3818, - "time_per_iteration": 2.7610766887664795 - }, - { - "auxiliary_loss_clip": 0.01146421, - "auxiliary_loss_mlp": 0.01043004, - "balance_loss_clip": 1.05866313, - "balance_loss_mlp": 1.02550387, - "epoch": 0.2296107019389749, - "flos": 23476421629440.0, - "grad_norm": 1.7490504114150227, - "language_loss": 0.74682397, - "learning_rate": 3.596386441116659e-06, - "loss": 0.76871818, - "num_input_tokens_seen": 82159510, - "step": 3819, - "time_per_iteration": 2.7125203609466553 - }, - { - "auxiliary_loss_clip": 0.0114108, - "auxiliary_loss_mlp": 0.0104377, - "balance_loss_clip": 1.05479693, - "balance_loss_mlp": 1.02630615, - "epoch": 0.22967082519164286, - "flos": 31285160760960.0, - "grad_norm": 2.0230347194773732, - "language_loss": 0.81103987, - "learning_rate": 3.5961517985444684e-06, - "loss": 0.83288836, - "num_input_tokens_seen": 82179580, - "step": 3820, - "time_per_iteration": 2.7268714904785156 - }, - { - "auxiliary_loss_clip": 0.01129285, - "auxiliary_loss_mlp": 0.01044606, - "balance_loss_clip": 1.05326903, - "balance_loss_mlp": 1.02627158, - "epoch": 0.22973094844431083, - "flos": 14642935390080.0, - "grad_norm": 2.2801321869619153, - "language_loss": 0.69099033, - "learning_rate": 3.595917095446042e-06, - "loss": 0.71272922, - "num_input_tokens_seen": 82195585, - "step": 3821, - "time_per_iteration": 2.659498691558838 - }, - { - "auxiliary_loss_clip": 0.01098739, - "auxiliary_loss_mlp": 0.01036962, - "balance_loss_clip": 1.05118072, - "balance_loss_mlp": 1.01888967, - "epoch": 0.2297910716969788, - "flos": 22823853292800.0, - "grad_norm": 1.473505926288008, - "language_loss": 0.82876307, - "learning_rate": 3.5956823318302796e-06, - "loss": 0.85012007, - "num_input_tokens_seen": 82217530, - "step": 3822, - "time_per_iteration": 2.898287057876587 - }, - { - "auxiliary_loss_clip": 0.01149833, - "auxiliary_loss_mlp": 0.01044764, - "balance_loss_clip": 1.05239797, - "balance_loss_mlp": 1.02617884, - "epoch": 0.2298511949496468, - "flos": 23039029716480.0, - "grad_norm": 2.077495396622281, - "language_loss": 0.66552204, - "learning_rate": 3.5954475077060833e-06, - "loss": 0.68746805, - "num_input_tokens_seen": 82237980, - "step": 3823, - "time_per_iteration": 2.6397016048431396 - }, - { - "auxiliary_loss_clip": 0.01064018, - "auxiliary_loss_mlp": 0.01005373, - "balance_loss_clip": 1.04052305, - "balance_loss_mlp": 1.00196409, - "epoch": 0.22991131820231475, - "flos": 66890914911360.0, - "grad_norm": 0.8015900374762405, - "language_loss": 0.56731141, - "learning_rate": 3.595212623082357e-06, - "loss": 0.5880053, - "num_input_tokens_seen": 82301785, - "step": 3824, - "time_per_iteration": 3.2301526069641113 - }, - { - "auxiliary_loss_clip": 0.01123513, - "auxiliary_loss_mlp": 0.01037782, - "balance_loss_clip": 1.0506382, - "balance_loss_mlp": 1.02098525, - "epoch": 0.22997144145498272, - "flos": 17887248633600.0, - "grad_norm": 2.0770938093466995, - "language_loss": 0.7301755, - "learning_rate": 3.594977677968009e-06, - "loss": 0.7517885, - "num_input_tokens_seen": 82317355, - "step": 3825, - "time_per_iteration": 2.6161818504333496 - }, - { - "auxiliary_loss_clip": 0.01147516, - "auxiliary_loss_mlp": 0.01049665, - "balance_loss_clip": 1.05828226, - "balance_loss_mlp": 1.03119957, - "epoch": 0.23003156470765068, - "flos": 24676843178880.0, - "grad_norm": 1.8689845885894332, - "language_loss": 0.87652314, - "learning_rate": 3.5947426723719473e-06, - "loss": 0.89849496, - "num_input_tokens_seen": 82336645, - "step": 3826, - "time_per_iteration": 2.668858766555786 - }, - { - "auxiliary_loss_clip": 0.01134406, - "auxiliary_loss_mlp": 0.01045536, - "balance_loss_clip": 1.05722022, - "balance_loss_mlp": 1.02697468, - "epoch": 0.23009168796031865, - "flos": 15814126247040.0, - "grad_norm": 2.4660324215504312, - "language_loss": 0.81861693, - "learning_rate": 3.594507606303083e-06, - "loss": 0.84041631, - "num_input_tokens_seen": 82354225, - "step": 3827, - "time_per_iteration": 2.67173171043396 - }, - { - "auxiliary_loss_clip": 0.01083629, - "auxiliary_loss_mlp": 0.01046658, - "balance_loss_clip": 1.04976189, - "balance_loss_mlp": 1.02728689, - "epoch": 0.2301518112129866, - "flos": 16212842190720.0, - "grad_norm": 1.9417227311694012, - "language_loss": 0.86676306, - "learning_rate": 3.5942724797703314e-06, - "loss": 0.88806593, - "num_input_tokens_seen": 82370240, - "step": 3828, - "time_per_iteration": 2.7641990184783936 - }, - { - "auxiliary_loss_clip": 0.01126786, - "auxiliary_loss_mlp": 0.01048261, - "balance_loss_clip": 1.05381465, - "balance_loss_mlp": 1.02981901, - "epoch": 0.2302119344656546, - "flos": 20595452411520.0, - "grad_norm": 2.6386744924703223, - "language_loss": 0.7044189, - "learning_rate": 3.594037292782607e-06, - "loss": 0.72616941, - "num_input_tokens_seen": 82389145, - "step": 3829, - "time_per_iteration": 2.6674952507019043 - }, - { - "auxiliary_loss_clip": 0.01085573, - "auxiliary_loss_mlp": 0.01045126, - "balance_loss_clip": 1.04650855, - "balance_loss_mlp": 1.02835345, - "epoch": 0.23027205771832257, - "flos": 26796901662720.0, - "grad_norm": 1.6431866637768902, - "language_loss": 0.84075069, - "learning_rate": 3.5938020453488293e-06, - "loss": 0.86205769, - "num_input_tokens_seen": 82409185, - "step": 3830, - "time_per_iteration": 2.8631880283355713 - }, - { - "auxiliary_loss_clip": 0.01132962, - "auxiliary_loss_mlp": 0.01052116, - "balance_loss_clip": 1.0506047, - "balance_loss_mlp": 1.03415167, - "epoch": 0.23033218097099054, - "flos": 43873143068160.0, - "grad_norm": 2.3429509345019213, - "language_loss": 0.67036134, - "learning_rate": 3.5935667374779177e-06, - "loss": 0.6922121, - "num_input_tokens_seen": 82432070, - "step": 3831, - "time_per_iteration": 2.91282320022583 - }, - { - "auxiliary_loss_clip": 0.0111204, - "auxiliary_loss_mlp": 0.01053367, - "balance_loss_clip": 1.05277622, - "balance_loss_mlp": 1.03496158, - "epoch": 0.2303923042236585, - "flos": 26067663745920.0, - "grad_norm": 2.3469890931023194, - "language_loss": 0.75711727, - "learning_rate": 3.5933313691787957e-06, - "loss": 0.7787714, - "num_input_tokens_seen": 82450625, - "step": 3832, - "time_per_iteration": 2.759467601776123 - }, - { - "auxiliary_loss_clip": 0.0110298, - "auxiliary_loss_mlp": 0.01044565, - "balance_loss_clip": 1.05044174, - "balance_loss_mlp": 1.02596867, - "epoch": 0.23045242747632647, - "flos": 18296379521280.0, - "grad_norm": 1.7769817461106177, - "language_loss": 0.87558299, - "learning_rate": 3.593095940460389e-06, - "loss": 0.89705843, - "num_input_tokens_seen": 82468575, - "step": 3833, - "time_per_iteration": 2.8548035621643066 - }, - { - "auxiliary_loss_clip": 0.01116173, - "auxiliary_loss_mlp": 0.01046082, - "balance_loss_clip": 1.05032015, - "balance_loss_mlp": 1.02814126, - "epoch": 0.23051255072899443, - "flos": 25520528805120.0, - "grad_norm": 2.030934473686878, - "language_loss": 0.74736786, - "learning_rate": 3.592860451331624e-06, - "loss": 0.7689904, - "num_input_tokens_seen": 82488655, - "step": 3834, - "time_per_iteration": 2.719237804412842 - }, - { - "auxiliary_loss_clip": 0.01104525, - "auxiliary_loss_mlp": 0.01064338, - "balance_loss_clip": 1.04610491, - "balance_loss_mlp": 1.043679, - "epoch": 0.2305726739816624, - "flos": 21215198695680.0, - "grad_norm": 1.9050082770497696, - "language_loss": 0.86071098, - "learning_rate": 3.592624901801432e-06, - "loss": 0.88239956, - "num_input_tokens_seen": 82507220, - "step": 3835, - "time_per_iteration": 2.627782106399536 - }, - { - "auxiliary_loss_clip": 0.01115977, - "auxiliary_loss_mlp": 0.01060727, - "balance_loss_clip": 1.04934275, - "balance_loss_mlp": 1.03979373, - "epoch": 0.2306327972343304, - "flos": 23331127115520.0, - "grad_norm": 2.798777841757382, - "language_loss": 0.82434011, - "learning_rate": 3.5923892918787432e-06, - "loss": 0.84610713, - "num_input_tokens_seen": 82527920, - "step": 3836, - "time_per_iteration": 2.6091606616973877 - }, - { - "auxiliary_loss_clip": 0.01144536, - "auxiliary_loss_mlp": 0.0105466, - "balance_loss_clip": 1.06090033, - "balance_loss_mlp": 1.03683817, - "epoch": 0.23069292048699835, - "flos": 20666734951680.0, - "grad_norm": 1.7189193248017045, - "language_loss": 0.79633009, - "learning_rate": 3.5921536215724934e-06, - "loss": 0.81832206, - "num_input_tokens_seen": 82549040, - "step": 3837, - "time_per_iteration": 2.535435914993286 - }, - { - "auxiliary_loss_clip": 0.01057695, - "auxiliary_loss_mlp": 0.01033541, - "balance_loss_clip": 1.04840386, - "balance_loss_mlp": 1.03003633, - "epoch": 0.23075304373966632, - "flos": 70454832393600.0, - "grad_norm": 0.9031703200773207, - "language_loss": 0.65381849, - "learning_rate": 3.5919178908916184e-06, - "loss": 0.67473078, - "num_input_tokens_seen": 82604070, - "step": 3838, - "time_per_iteration": 3.0868518352508545 - }, - { - "auxiliary_loss_clip": 0.01138177, - "auxiliary_loss_mlp": 0.01056497, - "balance_loss_clip": 1.05361629, - "balance_loss_mlp": 1.0395453, - "epoch": 0.23081316699233428, - "flos": 16617986668800.0, - "grad_norm": 2.5143705705619097, - "language_loss": 0.75403488, - "learning_rate": 3.591682099845058e-06, - "loss": 0.77598161, - "num_input_tokens_seen": 82619665, - "step": 3839, - "time_per_iteration": 2.6391067504882812 - }, - { - "auxiliary_loss_clip": 0.01125705, - "auxiliary_loss_mlp": 0.01046933, - "balance_loss_clip": 1.05447173, - "balance_loss_mlp": 1.02882481, - "epoch": 0.23087329024500225, - "flos": 13298081253120.0, - "grad_norm": 1.8684605740856612, - "language_loss": 0.68962026, - "learning_rate": 3.591446248441752e-06, - "loss": 0.71134663, - "num_input_tokens_seen": 82637530, - "step": 3840, - "time_per_iteration": 2.6295006275177 - }, - { - "auxiliary_loss_clip": 0.01158019, - "auxiliary_loss_mlp": 0.01046048, - "balance_loss_clip": 1.05840647, - "balance_loss_mlp": 1.026057, - "epoch": 0.23093341349767021, - "flos": 17785729820160.0, - "grad_norm": 2.5615469809997697, - "language_loss": 0.80033958, - "learning_rate": 3.591210336690645e-06, - "loss": 0.8223803, - "num_input_tokens_seen": 82656130, - "step": 3841, - "time_per_iteration": 2.6512410640716553 - }, - { - "auxiliary_loss_clip": 0.01145317, - "auxiliary_loss_mlp": 0.01047066, - "balance_loss_clip": 1.05756617, - "balance_loss_mlp": 1.0301621, - "epoch": 0.23099353675033818, - "flos": 23988076911360.0, - "grad_norm": 1.7953422744525294, - "language_loss": 0.83389241, - "learning_rate": 3.590974364600683e-06, - "loss": 0.85581625, - "num_input_tokens_seen": 82675295, - "step": 3842, - "time_per_iteration": 2.7676117420196533 - }, - { - "auxiliary_loss_clip": 0.01144752, - "auxiliary_loss_mlp": 0.01044783, - "balance_loss_clip": 1.05491304, - "balance_loss_mlp": 1.02650845, - "epoch": 0.23105366000300617, - "flos": 35995168471680.0, - "grad_norm": 1.8421697704365976, - "language_loss": 0.66661239, - "learning_rate": 3.5907383321808135e-06, - "loss": 0.68850774, - "num_input_tokens_seen": 82703260, - "step": 3843, - "time_per_iteration": 5.82958722114563 - }, - { - "auxiliary_loss_clip": 0.01142299, - "auxiliary_loss_mlp": 0.01047166, - "balance_loss_clip": 1.05609, - "balance_loss_mlp": 1.02914143, - "epoch": 0.23111378325567414, - "flos": 31245335556480.0, - "grad_norm": 1.8996188882256444, - "language_loss": 0.77221334, - "learning_rate": 3.590502239439987e-06, - "loss": 0.79410803, - "num_input_tokens_seen": 82725060, - "step": 3844, - "time_per_iteration": 2.771226406097412 - }, - { - "auxiliary_loss_clip": 0.01141796, - "auxiliary_loss_mlp": 0.01045598, - "balance_loss_clip": 1.05503309, - "balance_loss_mlp": 1.02607179, - "epoch": 0.2311739065083421, - "flos": 19208223204480.0, - "grad_norm": 1.9651801579729304, - "language_loss": 0.78155982, - "learning_rate": 3.590266086387156e-06, - "loss": 0.80343372, - "num_input_tokens_seen": 82742960, - "step": 3845, - "time_per_iteration": 4.247429370880127 - }, - { - "auxiliary_loss_clip": 0.01117167, - "auxiliary_loss_mlp": 0.01039426, - "balance_loss_clip": 1.05274439, - "balance_loss_mlp": 1.02292788, - "epoch": 0.23123402976101007, - "flos": 23360178240000.0, - "grad_norm": 2.083958857623256, - "language_loss": 0.76397669, - "learning_rate": 3.590029873031276e-06, - "loss": 0.78554261, - "num_input_tokens_seen": 82760205, - "step": 3846, - "time_per_iteration": 2.7805917263031006 - }, - { - "auxiliary_loss_clip": 0.01131462, - "auxiliary_loss_mlp": 0.01049247, - "balance_loss_clip": 1.05376291, - "balance_loss_mlp": 1.03193808, - "epoch": 0.23129415301367803, - "flos": 13735365425280.0, - "grad_norm": 1.8827740097117207, - "language_loss": 0.70281041, - "learning_rate": 3.589793599381304e-06, - "loss": 0.72461748, - "num_input_tokens_seen": 82778590, - "step": 3847, - "time_per_iteration": 2.6848642826080322 - }, - { - "auxiliary_loss_clip": 0.01065475, - "auxiliary_loss_mlp": 0.01006045, - "balance_loss_clip": 1.04309821, - "balance_loss_mlp": 1.00356507, - "epoch": 0.231354276266346, - "flos": 69737015001600.0, - "grad_norm": 0.7955227467680892, - "language_loss": 0.61006129, - "learning_rate": 3.589557265446198e-06, - "loss": 0.63077646, - "num_input_tokens_seen": 82833925, - "step": 3848, - "time_per_iteration": 3.08832049369812 - }, - { - "auxiliary_loss_clip": 0.01142916, - "auxiliary_loss_mlp": 0.01044943, - "balance_loss_clip": 1.05631924, - "balance_loss_mlp": 1.02640557, - "epoch": 0.231414399519014, - "flos": 18835900778880.0, - "grad_norm": 1.9602331138800266, - "language_loss": 0.78082883, - "learning_rate": 3.589320871234923e-06, - "loss": 0.80270743, - "num_input_tokens_seen": 82850625, - "step": 3849, - "time_per_iteration": 2.6830787658691406 - }, - { - "auxiliary_loss_clip": 0.01137959, - "auxiliary_loss_mlp": 0.01044864, - "balance_loss_clip": 1.05184579, - "balance_loss_mlp": 1.02630353, - "epoch": 0.23147452277168196, - "flos": 36135470995200.0, - "grad_norm": 2.354271482082729, - "language_loss": 0.71243513, - "learning_rate": 3.5890844167564405e-06, - "loss": 0.7342633, - "num_input_tokens_seen": 82872105, - "step": 3850, - "time_per_iteration": 4.467762231826782 - }, - { - "auxiliary_loss_clip": 0.01121609, - "auxiliary_loss_mlp": 0.00776401, - "balance_loss_clip": 1.05099773, - "balance_loss_mlp": 1.00153255, - "epoch": 0.23153464602434992, - "flos": 20812927305600.0, - "grad_norm": 4.184777043510671, - "language_loss": 0.76577097, - "learning_rate": 3.588847902019718e-06, - "loss": 0.78475106, - "num_input_tokens_seen": 82890595, - "step": 3851, - "time_per_iteration": 2.7452898025512695 - }, - { - "auxiliary_loss_clip": 0.01152703, - "auxiliary_loss_mlp": 0.01038649, - "balance_loss_clip": 1.05650854, - "balance_loss_mlp": 1.0206244, - "epoch": 0.2315947692770179, - "flos": 19939256801280.0, - "grad_norm": 2.0528428588063914, - "language_loss": 0.69642782, - "learning_rate": 3.588611327033723e-06, - "loss": 0.71834141, - "num_input_tokens_seen": 82908910, - "step": 3852, - "time_per_iteration": 2.613687038421631 - }, - { - "auxiliary_loss_clip": 0.0110964, - "auxiliary_loss_mlp": 0.01050002, - "balance_loss_clip": 1.05097961, - "balance_loss_mlp": 1.0303328, - "epoch": 0.23165489252968585, - "flos": 12855553695360.0, - "grad_norm": 2.8596642791724993, - "language_loss": 0.67063856, - "learning_rate": 3.588374691807428e-06, - "loss": 0.69223493, - "num_input_tokens_seen": 82925405, - "step": 3853, - "time_per_iteration": 2.6974282264709473 - }, - { - "auxiliary_loss_clip": 0.01146149, - "auxiliary_loss_mlp": 0.01041525, - "balance_loss_clip": 1.05749798, - "balance_loss_mlp": 1.02340484, - "epoch": 0.23171501578235382, - "flos": 30628282792320.0, - "grad_norm": 1.7603397459637538, - "language_loss": 0.80139267, - "learning_rate": 3.5881379963498053e-06, - "loss": 0.82326943, - "num_input_tokens_seen": 82945615, - "step": 3854, - "time_per_iteration": 2.712125062942505 - }, - { - "auxiliary_loss_clip": 0.01115767, - "auxiliary_loss_mlp": 0.01052387, - "balance_loss_clip": 1.04737794, - "balance_loss_mlp": 1.03070331, - "epoch": 0.23177513903502178, - "flos": 23842782397440.0, - "grad_norm": 1.9709775740629982, - "language_loss": 0.65103847, - "learning_rate": 3.587901240669831e-06, - "loss": 0.67272007, - "num_input_tokens_seen": 82967570, - "step": 3855, - "time_per_iteration": 2.718756675720215 - }, - { - "auxiliary_loss_clip": 0.01153506, - "auxiliary_loss_mlp": 0.01048508, - "balance_loss_clip": 1.05417824, - "balance_loss_mlp": 1.03050709, - "epoch": 0.23183526228768978, - "flos": 29570282668800.0, - "grad_norm": 1.7803112411977504, - "language_loss": 0.70386064, - "learning_rate": 3.5876644247764815e-06, - "loss": 0.7258808, - "num_input_tokens_seen": 82987435, - "step": 3856, - "time_per_iteration": 2.798675060272217 - }, - { - "auxiliary_loss_clip": 0.01103018, - "auxiliary_loss_mlp": 0.01035927, - "balance_loss_clip": 1.05080032, - "balance_loss_mlp": 1.0200007, - "epoch": 0.23189538554035774, - "flos": 34458694254720.0, - "grad_norm": 1.7837780829213195, - "language_loss": 0.77101243, - "learning_rate": 3.5874275486787387e-06, - "loss": 0.79240191, - "num_input_tokens_seen": 83010505, - "step": 3857, - "time_per_iteration": 2.8545501232147217 - }, - { - "auxiliary_loss_clip": 0.01136868, - "auxiliary_loss_mlp": 0.00777317, - "balance_loss_clip": 1.0528996, - "balance_loss_mlp": 1.00133562, - "epoch": 0.2319555087930257, - "flos": 18003815245440.0, - "grad_norm": 2.445609387195472, - "language_loss": 0.91629225, - "learning_rate": 3.587190612385584e-06, - "loss": 0.9354341, - "num_input_tokens_seen": 83026705, - "step": 3858, - "time_per_iteration": 2.7018845081329346 - }, - { - "auxiliary_loss_clip": 0.01095626, - "auxiliary_loss_mlp": 0.01043975, - "balance_loss_clip": 1.04882586, - "balance_loss_mlp": 1.0263319, - "epoch": 0.23201563204569367, - "flos": 23143852581120.0, - "grad_norm": 1.987074492721614, - "language_loss": 0.76833785, - "learning_rate": 3.5869536159060026e-06, - "loss": 0.78973383, - "num_input_tokens_seen": 83046500, - "step": 3859, - "time_per_iteration": 2.7465155124664307 - }, - { - "auxiliary_loss_clip": 0.01136816, - "auxiliary_loss_mlp": 0.01041128, - "balance_loss_clip": 1.05060959, - "balance_loss_mlp": 1.02316284, - "epoch": 0.23207575529836164, - "flos": 20667991927680.0, - "grad_norm": 1.7166447387893018, - "language_loss": 0.84341264, - "learning_rate": 3.58671655924898e-06, - "loss": 0.86519206, - "num_input_tokens_seen": 83065280, - "step": 3860, - "time_per_iteration": 2.6602063179016113 - }, - { - "auxiliary_loss_clip": 0.01091436, - "auxiliary_loss_mlp": 0.01044571, - "balance_loss_clip": 1.04641938, - "balance_loss_mlp": 1.02640343, - "epoch": 0.2321358785510296, - "flos": 16472189364480.0, - "grad_norm": 2.014536853896284, - "language_loss": 0.83431923, - "learning_rate": 3.586479442423508e-06, - "loss": 0.85567933, - "num_input_tokens_seen": 83082310, - "step": 3861, - "time_per_iteration": 2.728750228881836 - }, - { - "auxiliary_loss_clip": 0.01130655, - "auxiliary_loss_mlp": 0.00776368, - "balance_loss_clip": 1.05122983, - "balance_loss_mlp": 1.00149858, - "epoch": 0.2321960018036976, - "flos": 21616320850560.0, - "grad_norm": 1.8874922149770945, - "language_loss": 0.85921204, - "learning_rate": 3.586242265438576e-06, - "loss": 0.87828225, - "num_input_tokens_seen": 83102065, - "step": 3862, - "time_per_iteration": 2.7289161682128906 - }, - { - "auxiliary_loss_clip": 0.01112788, - "auxiliary_loss_mlp": 0.0104236, - "balance_loss_clip": 1.04956031, - "balance_loss_mlp": 1.02645802, - "epoch": 0.23225612505636556, - "flos": 22271474966400.0, - "grad_norm": 1.4078274786009342, - "language_loss": 0.75131166, - "learning_rate": 3.5860050283031773e-06, - "loss": 0.77286315, - "num_input_tokens_seen": 83121445, - "step": 3863, - "time_per_iteration": 2.7308037281036377 - }, - { - "auxiliary_loss_clip": 0.01109911, - "auxiliary_loss_mlp": 0.0104503, - "balance_loss_clip": 1.05320251, - "balance_loss_mlp": 1.02840066, - "epoch": 0.23231624830903352, - "flos": 17052325925760.0, - "grad_norm": 1.8195520841096788, - "language_loss": 0.74952984, - "learning_rate": 3.58576773102631e-06, - "loss": 0.77107918, - "num_input_tokens_seen": 83138175, - "step": 3864, - "time_per_iteration": 2.669403314590454 - }, - { - "auxiliary_loss_clip": 0.01148697, - "auxiliary_loss_mlp": 0.01038596, - "balance_loss_clip": 1.05258274, - "balance_loss_mlp": 1.02182317, - "epoch": 0.2323763715617015, - "flos": 34640043045120.0, - "grad_norm": 1.757817857347048, - "language_loss": 0.70438093, - "learning_rate": 3.5855303736169714e-06, - "loss": 0.72625393, - "num_input_tokens_seen": 83161975, - "step": 3865, - "time_per_iteration": 2.766399621963501 - }, - { - "auxiliary_loss_clip": 0.01156124, - "auxiliary_loss_mlp": 0.01048904, - "balance_loss_clip": 1.05352104, - "balance_loss_mlp": 1.02978325, - "epoch": 0.23243649481436945, - "flos": 25551698832000.0, - "grad_norm": 1.8965816841290546, - "language_loss": 0.94702542, - "learning_rate": 3.5852929560841617e-06, - "loss": 0.96907574, - "num_input_tokens_seen": 83180905, - "step": 3866, - "time_per_iteration": 2.659867525100708 - }, - { - "auxiliary_loss_clip": 0.01131283, - "auxiliary_loss_mlp": 0.01044032, - "balance_loss_clip": 1.04904807, - "balance_loss_mlp": 1.02683008, - "epoch": 0.23249661806703742, - "flos": 20483482740480.0, - "grad_norm": 4.181849364953483, - "language_loss": 0.73026884, - "learning_rate": 3.5850554784368846e-06, - "loss": 0.75202191, - "num_input_tokens_seen": 83196390, - "step": 3867, - "time_per_iteration": 2.645481586456299 - }, - { - "auxiliary_loss_clip": 0.0112954, - "auxiliary_loss_mlp": 0.01046355, - "balance_loss_clip": 1.05079126, - "balance_loss_mlp": 1.02855754, - "epoch": 0.23255674131970538, - "flos": 20376612800640.0, - "grad_norm": 1.9671041323983256, - "language_loss": 0.82770872, - "learning_rate": 3.584817940684145e-06, - "loss": 0.84946775, - "num_input_tokens_seen": 83216165, - "step": 3868, - "time_per_iteration": 2.7670326232910156 - }, - { - "auxiliary_loss_clip": 0.01125563, - "auxiliary_loss_mlp": 0.01043558, - "balance_loss_clip": 1.04875207, - "balance_loss_mlp": 1.02648687, - "epoch": 0.23261686457237338, - "flos": 17056096853760.0, - "grad_norm": 2.1100994183362967, - "language_loss": 0.72952414, - "learning_rate": 3.58458034283495e-06, - "loss": 0.75121534, - "num_input_tokens_seen": 83233845, - "step": 3869, - "time_per_iteration": 2.6661763191223145 - }, - { - "auxiliary_loss_clip": 0.01132223, - "auxiliary_loss_mlp": 0.0105087, - "balance_loss_clip": 1.05129242, - "balance_loss_mlp": 1.03382349, - "epoch": 0.23267698782504134, - "flos": 29169878785920.0, - "grad_norm": 2.500604422715561, - "language_loss": 0.79142725, - "learning_rate": 3.5843426848983097e-06, - "loss": 0.81325811, - "num_input_tokens_seen": 83254930, - "step": 3870, - "time_per_iteration": 2.707321882247925 - }, - { - "auxiliary_loss_clip": 0.01152434, - "auxiliary_loss_mlp": 0.01046711, - "balance_loss_clip": 1.05334866, - "balance_loss_mlp": 1.02924728, - "epoch": 0.2327371110777093, - "flos": 21174655219200.0, - "grad_norm": 2.176894576680098, - "language_loss": 0.70915782, - "learning_rate": 3.5841049668832357e-06, - "loss": 0.73114932, - "num_input_tokens_seen": 83272095, - "step": 3871, - "time_per_iteration": 2.6389646530151367 - }, - { - "auxiliary_loss_clip": 0.01139847, - "auxiliary_loss_mlp": 0.01051541, - "balance_loss_clip": 1.05543458, - "balance_loss_mlp": 1.03244328, - "epoch": 0.23279723433037727, - "flos": 24863112132480.0, - "grad_norm": 1.8306984701748774, - "language_loss": 0.68877381, - "learning_rate": 3.5838671887987433e-06, - "loss": 0.71068764, - "num_input_tokens_seen": 83290980, - "step": 3872, - "time_per_iteration": 2.662309408187866 - }, - { - "auxiliary_loss_clip": 0.0114472, - "auxiliary_loss_mlp": 0.01042459, - "balance_loss_clip": 1.05313611, - "balance_loss_mlp": 1.02388597, - "epoch": 0.23285735758304524, - "flos": 38800617344640.0, - "grad_norm": 1.5710106481349988, - "language_loss": 0.779724, - "learning_rate": 3.5836293506538474e-06, - "loss": 0.80159569, - "num_input_tokens_seen": 83315175, - "step": 3873, - "time_per_iteration": 2.884542942047119 - }, - { - "auxiliary_loss_clip": 0.01053683, - "auxiliary_loss_mlp": 0.01022765, - "balance_loss_clip": 1.03691578, - "balance_loss_mlp": 1.02038097, - "epoch": 0.2329174808357132, - "flos": 53944113692160.0, - "grad_norm": 0.8561383552409444, - "language_loss": 0.6051712, - "learning_rate": 3.5833914524575687e-06, - "loss": 0.62593567, - "num_input_tokens_seen": 83372060, - "step": 3874, - "time_per_iteration": 3.165809392929077 - }, - { - "auxiliary_loss_clip": 0.0112779, - "auxiliary_loss_mlp": 0.01040869, - "balance_loss_clip": 1.05157447, - "balance_loss_mlp": 1.02328515, - "epoch": 0.23297760408838117, - "flos": 21216024708480.0, - "grad_norm": 2.5039775977564522, - "language_loss": 0.80842507, - "learning_rate": 3.583153494218927e-06, - "loss": 0.83011162, - "num_input_tokens_seen": 83389795, - "step": 3875, - "time_per_iteration": 2.673657178878784 - }, - { - "auxiliary_loss_clip": 0.01147803, - "auxiliary_loss_mlp": 0.00774568, - "balance_loss_clip": 1.05367982, - "balance_loss_mlp": 1.00145388, - "epoch": 0.23303772734104916, - "flos": 28403006394240.0, - "grad_norm": 4.3174446976030465, - "language_loss": 0.6123395, - "learning_rate": 3.5829154759469464e-06, - "loss": 0.63156319, - "num_input_tokens_seen": 83410005, - "step": 3876, - "time_per_iteration": 2.6973021030426025 - }, - { - "auxiliary_loss_clip": 0.01116571, - "auxiliary_loss_mlp": 0.01051971, - "balance_loss_clip": 1.05002618, - "balance_loss_mlp": 1.03345811, - "epoch": 0.23309785059371713, - "flos": 24314720215680.0, - "grad_norm": 2.4263361529850447, - "language_loss": 0.70649457, - "learning_rate": 3.5826773976506523e-06, - "loss": 0.72817999, - "num_input_tokens_seen": 83430250, - "step": 3877, - "time_per_iteration": 2.7506351470947266 - }, - { - "auxiliary_loss_clip": 0.01143537, - "auxiliary_loss_mlp": 0.01051311, - "balance_loss_clip": 1.05495286, - "balance_loss_mlp": 1.03245187, - "epoch": 0.2331579738463851, - "flos": 15992925171840.0, - "grad_norm": 2.202899784913125, - "language_loss": 0.80724835, - "learning_rate": 3.582439259339073e-06, - "loss": 0.82919687, - "num_input_tokens_seen": 83447950, - "step": 3878, - "time_per_iteration": 2.6945395469665527 - }, - { - "auxiliary_loss_clip": 0.0109123, - "auxiliary_loss_mlp": 0.01049547, - "balance_loss_clip": 1.04632592, - "balance_loss_mlp": 1.0298301, - "epoch": 0.23321809709905306, - "flos": 36426957863040.0, - "grad_norm": 1.857420507716431, - "language_loss": 0.7521472, - "learning_rate": 3.5822010610212374e-06, - "loss": 0.77355498, - "num_input_tokens_seen": 83467785, - "step": 3879, - "time_per_iteration": 2.8909342288970947 - }, - { - "auxiliary_loss_clip": 0.01095967, - "auxiliary_loss_mlp": 0.01051433, - "balance_loss_clip": 1.04621899, - "balance_loss_mlp": 1.03238297, - "epoch": 0.23327822035172102, - "flos": 21324762155520.0, - "grad_norm": 2.179587653719585, - "language_loss": 0.89532614, - "learning_rate": 3.5819628027061795e-06, - "loss": 0.91680014, - "num_input_tokens_seen": 83485390, - "step": 3880, - "time_per_iteration": 2.7358896732330322 - }, - { - "auxiliary_loss_clip": 0.01127816, - "auxiliary_loss_mlp": 0.01049697, - "balance_loss_clip": 1.05119944, - "balance_loss_mlp": 1.0319109, - "epoch": 0.233338343604389, - "flos": 19171881619200.0, - "grad_norm": 1.6825190155617658, - "language_loss": 0.71915156, - "learning_rate": 3.5817244844029334e-06, - "loss": 0.74092674, - "num_input_tokens_seen": 83504890, - "step": 3881, - "time_per_iteration": 2.702533721923828 - }, - { - "auxiliary_loss_clip": 0.01148084, - "auxiliary_loss_mlp": 0.0104282, - "balance_loss_clip": 1.05186546, - "balance_loss_mlp": 1.02497458, - "epoch": 0.23339846685705698, - "flos": 26908368543360.0, - "grad_norm": 1.5464986217430505, - "language_loss": 0.68210357, - "learning_rate": 3.581486106120537e-06, - "loss": 0.70401263, - "num_input_tokens_seen": 83526475, - "step": 3882, - "time_per_iteration": 2.6449384689331055 - }, - { - "auxiliary_loss_clip": 0.01106984, - "auxiliary_loss_mlp": 0.01053219, - "balance_loss_clip": 1.04567862, - "balance_loss_mlp": 1.03457499, - "epoch": 0.23345859010972494, - "flos": 32343160884480.0, - "grad_norm": 2.180831821464153, - "language_loss": 0.77379489, - "learning_rate": 3.5812476678680287e-06, - "loss": 0.79539698, - "num_input_tokens_seen": 83546620, - "step": 3883, - "time_per_iteration": 5.806958913803101 - }, - { - "auxiliary_loss_clip": 0.01053192, - "auxiliary_loss_mlp": 0.01007679, - "balance_loss_clip": 1.03368068, - "balance_loss_mlp": 1.0053544, - "epoch": 0.2335187133623929, - "flos": 58484229050880.0, - "grad_norm": 0.7945750769740417, - "language_loss": 0.59117424, - "learning_rate": 3.58100916965445e-06, - "loss": 0.61178291, - "num_input_tokens_seen": 83616160, - "step": 3884, - "time_per_iteration": 3.3524324893951416 - }, - { - "auxiliary_loss_clip": 0.01117007, - "auxiliary_loss_mlp": 0.01034005, - "balance_loss_clip": 1.04925692, - "balance_loss_mlp": 1.01704168, - "epoch": 0.23357883661506088, - "flos": 24502317972480.0, - "grad_norm": 1.6775563031527567, - "language_loss": 0.80286831, - "learning_rate": 3.5807706114888455e-06, - "loss": 0.82437843, - "num_input_tokens_seen": 83636795, - "step": 3885, - "time_per_iteration": 4.295818328857422 - }, - { - "auxiliary_loss_clip": 0.01136024, - "auxiliary_loss_mlp": 0.01040639, - "balance_loss_clip": 1.05494285, - "balance_loss_mlp": 1.02274597, - "epoch": 0.23363895986772884, - "flos": 18948516894720.0, - "grad_norm": 2.2066793657203116, - "language_loss": 0.88230193, - "learning_rate": 3.580531993380261e-06, - "loss": 0.90406859, - "num_input_tokens_seen": 83654050, - "step": 3886, - "time_per_iteration": 2.6672091484069824 - }, - { - "auxiliary_loss_clip": 0.01150675, - "auxiliary_loss_mlp": 0.01042457, - "balance_loss_clip": 1.05293703, - "balance_loss_mlp": 1.02512443, - "epoch": 0.2336990831203968, - "flos": 31686821619840.0, - "grad_norm": 4.0082984179074055, - "language_loss": 0.73170543, - "learning_rate": 3.5802933153377445e-06, - "loss": 0.75363672, - "num_input_tokens_seen": 83673720, - "step": 3887, - "time_per_iteration": 2.7338294982910156 - }, - { - "auxiliary_loss_clip": 0.01140271, - "auxiliary_loss_mlp": 0.0104923, - "balance_loss_clip": 1.05201173, - "balance_loss_mlp": 1.03183722, - "epoch": 0.23375920637306477, - "flos": 27709750926720.0, - "grad_norm": 2.677865426107907, - "language_loss": 0.84125429, - "learning_rate": 3.5800545773703475e-06, - "loss": 0.86314929, - "num_input_tokens_seen": 83693470, - "step": 3888, - "time_per_iteration": 2.7020208835601807 - }, - { - "auxiliary_loss_clip": 0.01121847, - "auxiliary_loss_mlp": 0.010605, - "balance_loss_clip": 1.04974008, - "balance_loss_mlp": 1.04121208, - "epoch": 0.23381932962573276, - "flos": 17675627656320.0, - "grad_norm": 3.2074942430893976, - "language_loss": 0.87298381, - "learning_rate": 3.5798157794871225e-06, - "loss": 0.89480728, - "num_input_tokens_seen": 83711620, - "step": 3889, - "time_per_iteration": 4.319674491882324 - }, - { - "auxiliary_loss_clip": 0.01141703, - "auxiliary_loss_mlp": 0.01046248, - "balance_loss_clip": 1.05330396, - "balance_loss_mlp": 1.02877164, - "epoch": 0.23387945287840073, - "flos": 14390842763520.0, - "grad_norm": 3.8719217250511164, - "language_loss": 0.76830876, - "learning_rate": 3.579576921697125e-06, - "loss": 0.79018819, - "num_input_tokens_seen": 83727890, - "step": 3890, - "time_per_iteration": 2.6133198738098145 - }, - { - "auxiliary_loss_clip": 0.01107139, - "auxiliary_loss_mlp": 0.00775386, - "balance_loss_clip": 1.04837406, - "balance_loss_mlp": 1.00124502, - "epoch": 0.2339395761310687, - "flos": 46097988503040.0, - "grad_norm": 1.8304579433009527, - "language_loss": 0.73385048, - "learning_rate": 3.579338004009412e-06, - "loss": 0.75267571, - "num_input_tokens_seen": 83749370, - "step": 3891, - "time_per_iteration": 3.008927583694458 - }, - { - "auxiliary_loss_clip": 0.01145053, - "auxiliary_loss_mlp": 0.01047702, - "balance_loss_clip": 1.05121398, - "balance_loss_mlp": 1.03035665, - "epoch": 0.23399969938373666, - "flos": 22382044007040.0, - "grad_norm": 1.8316289897122906, - "language_loss": 0.82725632, - "learning_rate": 3.5790990264330433e-06, - "loss": 0.84918392, - "num_input_tokens_seen": 83769560, - "step": 3892, - "time_per_iteration": 2.6455893516540527 - }, - { - "auxiliary_loss_clip": 0.01100914, - "auxiliary_loss_mlp": 0.01055558, - "balance_loss_clip": 1.04450488, - "balance_loss_mlp": 1.03491104, - "epoch": 0.23405982263640462, - "flos": 43508542066560.0, - "grad_norm": 2.707564715226966, - "language_loss": 0.64982933, - "learning_rate": 3.578859988977082e-06, - "loss": 0.67139405, - "num_input_tokens_seen": 83795635, - "step": 3893, - "time_per_iteration": 2.9392964839935303 - }, - { - "auxiliary_loss_clip": 0.01106007, - "auxiliary_loss_mlp": 0.01045218, - "balance_loss_clip": 1.04782617, - "balance_loss_mlp": 1.02701449, - "epoch": 0.2341199458890726, - "flos": 22564685687040.0, - "grad_norm": 2.5782091790717105, - "language_loss": 0.79415286, - "learning_rate": 3.5786208916505916e-06, - "loss": 0.81566513, - "num_input_tokens_seen": 83814090, - "step": 3894, - "time_per_iteration": 2.839935541152954 - }, - { - "auxiliary_loss_clip": 0.01134295, - "auxiliary_loss_mlp": 0.01049748, - "balance_loss_clip": 1.04747164, - "balance_loss_mlp": 1.03253388, - "epoch": 0.23418006914174055, - "flos": 25633970933760.0, - "grad_norm": 1.551347830991082, - "language_loss": 0.81978422, - "learning_rate": 3.5783817344626383e-06, - "loss": 0.84162462, - "num_input_tokens_seen": 83836870, - "step": 3895, - "time_per_iteration": 2.739955425262451 - }, - { - "auxiliary_loss_clip": 0.01134592, - "auxiliary_loss_mlp": 0.01052429, - "balance_loss_clip": 1.04999852, - "balance_loss_mlp": 1.03514385, - "epoch": 0.23424019239440855, - "flos": 13545936074880.0, - "grad_norm": 1.8690411936118732, - "language_loss": 0.80239451, - "learning_rate": 3.578142517422292e-06, - "loss": 0.82426476, - "num_input_tokens_seen": 83853275, - "step": 3896, - "time_per_iteration": 2.681114435195923 - }, - { - "auxiliary_loss_clip": 0.01125586, - "auxiliary_loss_mlp": 0.01045792, - "balance_loss_clip": 1.04685259, - "balance_loss_mlp": 1.02779162, - "epoch": 0.2343003156470765, - "flos": 22419498913920.0, - "grad_norm": 2.2492510100498087, - "language_loss": 0.83249009, - "learning_rate": 3.577903240538623e-06, - "loss": 0.85420382, - "num_input_tokens_seen": 83872340, - "step": 3897, - "time_per_iteration": 2.728916645050049 - }, - { - "auxiliary_loss_clip": 0.01134669, - "auxiliary_loss_mlp": 0.01058403, - "balance_loss_clip": 1.04949594, - "balance_loss_mlp": 1.04016376, - "epoch": 0.23436043889974448, - "flos": 14790815683200.0, - "grad_norm": 1.5875861860902294, - "language_loss": 0.78903484, - "learning_rate": 3.577663903820705e-06, - "loss": 0.81096554, - "num_input_tokens_seen": 83888795, - "step": 3898, - "time_per_iteration": 2.6597952842712402 - }, - { - "auxiliary_loss_clip": 0.01109182, - "auxiliary_loss_mlp": 0.01055226, - "balance_loss_clip": 1.04657888, - "balance_loss_mlp": 1.03785777, - "epoch": 0.23442056215241244, - "flos": 22965700101120.0, - "grad_norm": 1.9975380770167093, - "language_loss": 0.73769581, - "learning_rate": 3.577424507277614e-06, - "loss": 0.75933987, - "num_input_tokens_seen": 83906820, - "step": 3899, - "time_per_iteration": 2.7511518001556396 - }, - { - "auxiliary_loss_clip": 0.01110646, - "auxiliary_loss_mlp": 0.01053556, - "balance_loss_clip": 1.04662895, - "balance_loss_mlp": 1.03530502, - "epoch": 0.2344806854050804, - "flos": 23071887682560.0, - "grad_norm": 2.822835219305806, - "language_loss": 0.75323856, - "learning_rate": 3.5771850509184277e-06, - "loss": 0.77488053, - "num_input_tokens_seen": 83926370, - "step": 3900, - "time_per_iteration": 2.7366316318511963 - }, - { - "auxiliary_loss_clip": 0.01097598, - "auxiliary_loss_mlp": 0.01047935, - "balance_loss_clip": 1.04771769, - "balance_loss_mlp": 1.03019702, - "epoch": 0.23454080865774837, - "flos": 16327074418560.0, - "grad_norm": 1.7042292639984586, - "language_loss": 0.67123592, - "learning_rate": 3.5769455347522256e-06, - "loss": 0.69269133, - "num_input_tokens_seen": 83944600, - "step": 3901, - "time_per_iteration": 2.857386589050293 - }, - { - "auxiliary_loss_clip": 0.01029196, - "auxiliary_loss_mlp": 0.01060621, - "balance_loss_clip": 1.02959871, - "balance_loss_mlp": 1.0584631, - "epoch": 0.23460093191041637, - "flos": 67760958142080.0, - "grad_norm": 0.7708596717968548, - "language_loss": 0.58189189, - "learning_rate": 3.576705958788091e-06, - "loss": 0.60279006, - "num_input_tokens_seen": 84005100, - "step": 3902, - "time_per_iteration": 3.2769579887390137 - }, - { - "auxiliary_loss_clip": 0.01126982, - "auxiliary_loss_mlp": 0.01045748, - "balance_loss_clip": 1.05044544, - "balance_loss_mlp": 1.02691305, - "epoch": 0.23466105516308433, - "flos": 20077619990400.0, - "grad_norm": 2.0309755154884708, - "language_loss": 0.80396789, - "learning_rate": 3.576466323035108e-06, - "loss": 0.82569516, - "num_input_tokens_seen": 84023775, - "step": 3903, - "time_per_iteration": 2.683908462524414 - }, - { - "auxiliary_loss_clip": 0.01092072, - "auxiliary_loss_mlp": 0.01044121, - "balance_loss_clip": 1.04248238, - "balance_loss_mlp": 1.02614391, - "epoch": 0.2347211784157523, - "flos": 24535714642560.0, - "grad_norm": 1.970422818337997, - "language_loss": 0.82400727, - "learning_rate": 3.5762266275023645e-06, - "loss": 0.84536922, - "num_input_tokens_seen": 84042605, - "step": 3904, - "time_per_iteration": 2.8023037910461426 - }, - { - "auxiliary_loss_clip": 0.01147463, - "auxiliary_loss_mlp": 0.01043559, - "balance_loss_clip": 1.05247784, - "balance_loss_mlp": 1.02620173, - "epoch": 0.23478130166842026, - "flos": 23805040181760.0, - "grad_norm": 1.9105311329606578, - "language_loss": 0.71330345, - "learning_rate": 3.57598687219895e-06, - "loss": 0.73521364, - "num_input_tokens_seen": 84061520, - "step": 3905, - "time_per_iteration": 2.650956869125366 - }, - { - "auxiliary_loss_clip": 0.01143661, - "auxiliary_loss_mlp": 0.01035514, - "balance_loss_clip": 1.05086017, - "balance_loss_mlp": 1.01877677, - "epoch": 0.23484142492108823, - "flos": 24093618048000.0, - "grad_norm": 2.334164983860831, - "language_loss": 0.71415532, - "learning_rate": 3.5757470571339543e-06, - "loss": 0.73594707, - "num_input_tokens_seen": 84081800, - "step": 3906, - "time_per_iteration": 2.6635055541992188 - }, - { - "auxiliary_loss_clip": 0.01138147, - "auxiliary_loss_mlp": 0.01042098, - "balance_loss_clip": 1.04703832, - "balance_loss_mlp": 1.02246392, - "epoch": 0.2349015481737562, - "flos": 29095830898560.0, - "grad_norm": 2.5527171953873693, - "language_loss": 0.74024308, - "learning_rate": 3.575507182316473e-06, - "loss": 0.7620455, - "num_input_tokens_seen": 84102340, - "step": 3907, - "time_per_iteration": 2.751154661178589 - }, - { - "auxiliary_loss_clip": 0.01135101, - "auxiliary_loss_mlp": 0.01047433, - "balance_loss_clip": 1.04911268, - "balance_loss_mlp": 1.02950394, - "epoch": 0.23496167142642416, - "flos": 18916305373440.0, - "grad_norm": 1.9847054585906883, - "language_loss": 0.72428519, - "learning_rate": 3.575267247755601e-06, - "loss": 0.74611056, - "num_input_tokens_seen": 84120370, - "step": 3908, - "time_per_iteration": 2.631162166595459 - }, - { - "auxiliary_loss_clip": 0.01053013, - "auxiliary_loss_mlp": 0.01020478, - "balance_loss_clip": 1.03362584, - "balance_loss_mlp": 1.01765239, - "epoch": 0.23502179467909215, - "flos": 55868062896000.0, - "grad_norm": 1.0307072678924762, - "language_loss": 0.73359185, - "learning_rate": 3.5750272534604367e-06, - "loss": 0.75432676, - "num_input_tokens_seen": 84165515, - "step": 3909, - "time_per_iteration": 2.974531650543213 - }, - { - "auxiliary_loss_clip": 0.01136436, - "auxiliary_loss_mlp": 0.01046445, - "balance_loss_clip": 1.05006361, - "balance_loss_mlp": 1.02797985, - "epoch": 0.23508191793176011, - "flos": 23401763210880.0, - "grad_norm": 1.6771333047394956, - "language_loss": 0.88288009, - "learning_rate": 3.5747871994400822e-06, - "loss": 0.90470886, - "num_input_tokens_seen": 84184540, - "step": 3910, - "time_per_iteration": 2.6615123748779297 - }, - { - "auxiliary_loss_clip": 0.01134757, - "auxiliary_loss_mlp": 0.01038734, - "balance_loss_clip": 1.04980493, - "balance_loss_mlp": 1.02188933, - "epoch": 0.23514204118442808, - "flos": 20047671025920.0, - "grad_norm": 1.9388895528834493, - "language_loss": 0.76067305, - "learning_rate": 3.5745470857036386e-06, - "loss": 0.78240794, - "num_input_tokens_seen": 84202025, - "step": 3911, - "time_per_iteration": 2.6846752166748047 - }, - { - "auxiliary_loss_clip": 0.01130294, - "auxiliary_loss_mlp": 0.01041364, - "balance_loss_clip": 1.04968345, - "balance_loss_mlp": 1.02546179, - "epoch": 0.23520216443709605, - "flos": 21580589796480.0, - "grad_norm": 1.5851255377793763, - "language_loss": 0.81651384, - "learning_rate": 3.5743069122602122e-06, - "loss": 0.83823043, - "num_input_tokens_seen": 84221895, - "step": 3912, - "time_per_iteration": 2.6340627670288086 - }, - { - "auxiliary_loss_clip": 0.01123815, - "auxiliary_loss_mlp": 0.01046223, - "balance_loss_clip": 1.05082059, - "balance_loss_mlp": 1.02836537, - "epoch": 0.235262287689764, - "flos": 23185796688000.0, - "grad_norm": 3.1390338867327165, - "language_loss": 0.71748006, - "learning_rate": 3.574066679118909e-06, - "loss": 0.73918045, - "num_input_tokens_seen": 84240455, - "step": 3913, - "time_per_iteration": 2.6716067790985107 - }, - { - "auxiliary_loss_clip": 0.01141007, - "auxiliary_loss_mlp": 0.00776535, - "balance_loss_clip": 1.05018401, - "balance_loss_mlp": 1.00136077, - "epoch": 0.23532241094243198, - "flos": 23185222070400.0, - "grad_norm": 1.7080087282408476, - "language_loss": 0.76152158, - "learning_rate": 3.57382638628884e-06, - "loss": 0.78069693, - "num_input_tokens_seen": 84261605, - "step": 3914, - "time_per_iteration": 2.706982135772705 - }, - { - "auxiliary_loss_clip": 0.01088532, - "auxiliary_loss_mlp": 0.01039819, - "balance_loss_clip": 1.0485754, - "balance_loss_mlp": 1.02153206, - "epoch": 0.23538253419509997, - "flos": 17019324305280.0, - "grad_norm": 2.2148128973951877, - "language_loss": 0.89692557, - "learning_rate": 3.5735860337791174e-06, - "loss": 0.91820902, - "num_input_tokens_seen": 84278675, - "step": 3915, - "time_per_iteration": 2.8005998134613037 - }, - { - "auxiliary_loss_clip": 0.01045613, - "auxiliary_loss_mlp": 0.0100868, - "balance_loss_clip": 1.02860212, - "balance_loss_mlp": 1.00596201, - "epoch": 0.23544265744776793, - "flos": 63448588967040.0, - "grad_norm": 0.8066012642326402, - "language_loss": 0.59382623, - "learning_rate": 3.573345621598854e-06, - "loss": 0.61436915, - "num_input_tokens_seen": 84329765, - "step": 3916, - "time_per_iteration": 3.168708086013794 - }, - { - "auxiliary_loss_clip": 0.01027738, - "auxiliary_loss_mlp": 0.01005192, - "balance_loss_clip": 1.03619492, - "balance_loss_mlp": 1.00231957, - "epoch": 0.2355027807004359, - "flos": 70515343831680.0, - "grad_norm": 0.7680467252570666, - "language_loss": 0.49518228, - "learning_rate": 3.5731051497571675e-06, - "loss": 0.51551157, - "num_input_tokens_seen": 84393680, - "step": 3917, - "time_per_iteration": 3.3240060806274414 - }, - { - "auxiliary_loss_clip": 0.01112941, - "auxiliary_loss_mlp": 0.01048231, - "balance_loss_clip": 1.04929173, - "balance_loss_mlp": 1.03133857, - "epoch": 0.23556290395310386, - "flos": 21434289701760.0, - "grad_norm": 1.9721662885337694, - "language_loss": 0.76349282, - "learning_rate": 3.5728646182631756e-06, - "loss": 0.78510457, - "num_input_tokens_seen": 84412640, - "step": 3918, - "time_per_iteration": 2.739431619644165 - }, - { - "auxiliary_loss_clip": 0.0109904, - "auxiliary_loss_mlp": 0.01052049, - "balance_loss_clip": 1.04440236, - "balance_loss_mlp": 1.03514528, - "epoch": 0.23562302720577183, - "flos": 18186421011840.0, - "grad_norm": 2.001330675769641, - "language_loss": 0.69002521, - "learning_rate": 3.5726240271259995e-06, - "loss": 0.71153617, - "num_input_tokens_seen": 84431605, - "step": 3919, - "time_per_iteration": 2.8809926509857178 - }, - { - "auxiliary_loss_clip": 0.01106851, - "auxiliary_loss_mlp": 0.01039357, - "balance_loss_clip": 1.04772878, - "balance_loss_mlp": 1.02221501, - "epoch": 0.2356831504584398, - "flos": 33730497832320.0, - "grad_norm": 1.6908780146896767, - "language_loss": 0.70500779, - "learning_rate": 3.5723833763547634e-06, - "loss": 0.72646987, - "num_input_tokens_seen": 84454210, - "step": 3920, - "time_per_iteration": 2.7984554767608643 - }, - { - "auxiliary_loss_clip": 0.01124832, - "auxiliary_loss_mlp": 0.01054073, - "balance_loss_clip": 1.05141807, - "balance_loss_mlp": 1.03756285, - "epoch": 0.23574327371110776, - "flos": 24932778560640.0, - "grad_norm": 1.7460619151295316, - "language_loss": 0.77363533, - "learning_rate": 3.5721426659585916e-06, - "loss": 0.7954244, - "num_input_tokens_seen": 84475540, - "step": 3921, - "time_per_iteration": 2.8038690090179443 - }, - { - "auxiliary_loss_clip": 0.01113499, - "auxiliary_loss_mlp": 0.01043793, - "balance_loss_clip": 1.05042887, - "balance_loss_mlp": 1.02692485, - "epoch": 0.23580339696377575, - "flos": 17822107319040.0, - "grad_norm": 2.2761735813493775, - "language_loss": 0.74768102, - "learning_rate": 3.571901895946612e-06, - "loss": 0.76925397, - "num_input_tokens_seen": 84494580, - "step": 3922, - "time_per_iteration": 5.741380929946899 - }, - { - "auxiliary_loss_clip": 0.01116057, - "auxiliary_loss_mlp": 0.01041318, - "balance_loss_clip": 1.04831624, - "balance_loss_mlp": 1.02577269, - "epoch": 0.23586352021644372, - "flos": 26286611097600.0, - "grad_norm": 3.3386441952016868, - "language_loss": 0.79846609, - "learning_rate": 3.571661066327956e-06, - "loss": 0.82003981, - "num_input_tokens_seen": 84513850, - "step": 3923, - "time_per_iteration": 2.7889180183410645 - }, - { - "auxiliary_loss_clip": 0.01089456, - "auxiliary_loss_mlp": 0.0105728, - "balance_loss_clip": 1.04471469, - "balance_loss_mlp": 1.03935063, - "epoch": 0.23592364346911168, - "flos": 14246697484800.0, - "grad_norm": 4.698975622885271, - "language_loss": 0.74874711, - "learning_rate": 3.571420177111754e-06, - "loss": 0.77021456, - "num_input_tokens_seen": 84532315, - "step": 3924, - "time_per_iteration": 4.272740125656128 - }, - { - "auxiliary_loss_clip": 0.01145554, - "auxiliary_loss_mlp": 0.01046876, - "balance_loss_clip": 1.05115998, - "balance_loss_mlp": 1.030568, - "epoch": 0.23598376672177965, - "flos": 18587938216320.0, - "grad_norm": 2.8676741031402977, - "language_loss": 0.82357788, - "learning_rate": 3.5711792283071416e-06, - "loss": 0.8455022, - "num_input_tokens_seen": 84550970, - "step": 3925, - "time_per_iteration": 2.6825013160705566 - }, - { - "auxiliary_loss_clip": 0.0112035, - "auxiliary_loss_mlp": 0.01048071, - "balance_loss_clip": 1.04567564, - "balance_loss_mlp": 1.0315721, - "epoch": 0.2360438899744476, - "flos": 22675542036480.0, - "grad_norm": 1.5755651433289561, - "language_loss": 0.59533024, - "learning_rate": 3.5709382199232564e-06, - "loss": 0.61701441, - "num_input_tokens_seen": 84571655, - "step": 3926, - "time_per_iteration": 2.6960842609405518 - }, - { - "auxiliary_loss_clip": 0.01125496, - "auxiliary_loss_mlp": 0.01046163, - "balance_loss_clip": 1.04914129, - "balance_loss_mlp": 1.0302484, - "epoch": 0.23610401322711558, - "flos": 29570139014400.0, - "grad_norm": 2.4179456581838212, - "language_loss": 0.7155292, - "learning_rate": 3.570697151969235e-06, - "loss": 0.7372458, - "num_input_tokens_seen": 84593130, - "step": 3927, - "time_per_iteration": 2.786576986312866 - }, - { - "auxiliary_loss_clip": 0.01120941, - "auxiliary_loss_mlp": 0.01047009, - "balance_loss_clip": 1.04764938, - "balance_loss_mlp": 1.03125572, - "epoch": 0.23616413647978354, - "flos": 17858520731520.0, - "grad_norm": 1.9380358164668718, - "language_loss": 0.74792278, - "learning_rate": 3.570456024454221e-06, - "loss": 0.76960224, - "num_input_tokens_seen": 84612410, - "step": 3928, - "time_per_iteration": 4.450765609741211 - }, - { - "auxiliary_loss_clip": 0.01118656, - "auxiliary_loss_mlp": 0.01047112, - "balance_loss_clip": 1.04935324, - "balance_loss_mlp": 1.02949333, - "epoch": 0.23622425973245154, - "flos": 11034847157760.0, - "grad_norm": 4.3448767989564745, - "language_loss": 0.81905198, - "learning_rate": 3.5702148373873576e-06, - "loss": 0.84070963, - "num_input_tokens_seen": 84627610, - "step": 3929, - "time_per_iteration": 2.654085874557495 - }, - { - "auxiliary_loss_clip": 0.01151721, - "auxiliary_loss_mlp": 0.0105167, - "balance_loss_clip": 1.05143714, - "balance_loss_mlp": 1.03314447, - "epoch": 0.2362843829851195, - "flos": 23404061681280.0, - "grad_norm": 3.048788180104446, - "language_loss": 0.72323942, - "learning_rate": 3.569973590777789e-06, - "loss": 0.74527335, - "num_input_tokens_seen": 84648415, - "step": 3930, - "time_per_iteration": 2.67429780960083 - }, - { - "auxiliary_loss_clip": 0.01143652, - "auxiliary_loss_mlp": 0.01036151, - "balance_loss_clip": 1.04880345, - "balance_loss_mlp": 1.01985574, - "epoch": 0.23634450623778747, - "flos": 39529855261440.0, - "grad_norm": 2.7450987997323333, - "language_loss": 0.74105632, - "learning_rate": 3.569732284634665e-06, - "loss": 0.76285434, - "num_input_tokens_seen": 84670080, - "step": 3931, - "time_per_iteration": 2.8017847537994385 - }, - { - "auxiliary_loss_clip": 0.01137617, - "auxiliary_loss_mlp": 0.01046002, - "balance_loss_clip": 1.05250037, - "balance_loss_mlp": 1.02853799, - "epoch": 0.23640462949045543, - "flos": 24207167917440.0, - "grad_norm": 2.2419024865888852, - "language_loss": 0.8018778, - "learning_rate": 3.569490918967136e-06, - "loss": 0.82371396, - "num_input_tokens_seen": 84686465, - "step": 3932, - "time_per_iteration": 2.6295793056488037 - }, - { - "auxiliary_loss_clip": 0.01108498, - "auxiliary_loss_mlp": 0.0104053, - "balance_loss_clip": 1.04981244, - "balance_loss_mlp": 1.02614117, - "epoch": 0.2364647527431234, - "flos": 26177622255360.0, - "grad_norm": 2.247824561482015, - "language_loss": 0.85683465, - "learning_rate": 3.5692494937843537e-06, - "loss": 0.87832487, - "num_input_tokens_seen": 84708825, - "step": 3933, - "time_per_iteration": 2.7401201725006104 - }, - { - "auxiliary_loss_clip": 0.01101933, - "auxiliary_loss_mlp": 0.010512, - "balance_loss_clip": 1.04680276, - "balance_loss_mlp": 1.03112483, - "epoch": 0.23652487599579136, - "flos": 22637009721600.0, - "grad_norm": 2.0287283132247547, - "language_loss": 0.83179402, - "learning_rate": 3.5690080090954727e-06, - "loss": 0.85332537, - "num_input_tokens_seen": 84726165, - "step": 3934, - "time_per_iteration": 2.8152921199798584 - }, - { - "auxiliary_loss_clip": 0.01148508, - "auxiliary_loss_mlp": 0.01042164, - "balance_loss_clip": 1.05208373, - "balance_loss_mlp": 1.02556968, - "epoch": 0.23658499924845935, - "flos": 21762261809280.0, - "grad_norm": 1.8368151879100059, - "language_loss": 0.78513408, - "learning_rate": 3.5687664649096515e-06, - "loss": 0.80704081, - "num_input_tokens_seen": 84745815, - "step": 3935, - "time_per_iteration": 2.6769750118255615 - }, - { - "auxiliary_loss_clip": 0.01134595, - "auxiliary_loss_mlp": 0.01034926, - "balance_loss_clip": 1.05270088, - "balance_loss_mlp": 1.01891589, - "epoch": 0.23664512250112732, - "flos": 21798998444160.0, - "grad_norm": 1.5615220666884744, - "language_loss": 0.79614085, - "learning_rate": 3.5685248612360487e-06, - "loss": 0.81783605, - "num_input_tokens_seen": 84765415, - "step": 3936, - "time_per_iteration": 2.7037193775177 - }, - { - "auxiliary_loss_clip": 0.01126163, - "auxiliary_loss_mlp": 0.01034739, - "balance_loss_clip": 1.04967618, - "balance_loss_mlp": 1.01779902, - "epoch": 0.23670524575379528, - "flos": 22637871648000.0, - "grad_norm": 1.671201383656535, - "language_loss": 0.7915628, - "learning_rate": 3.568283198083826e-06, - "loss": 0.81317174, - "num_input_tokens_seen": 84787080, - "step": 3937, - "time_per_iteration": 2.7639834880828857 - }, - { - "auxiliary_loss_clip": 0.01134519, - "auxiliary_loss_mlp": 0.01038533, - "balance_loss_clip": 1.05320358, - "balance_loss_mlp": 1.02313685, - "epoch": 0.23676536900646325, - "flos": 16725000263040.0, - "grad_norm": 1.8758026172480324, - "language_loss": 0.85389286, - "learning_rate": 3.568041475462147e-06, - "loss": 0.8756234, - "num_input_tokens_seen": 84805395, - "step": 3938, - "time_per_iteration": 2.6919057369232178 - }, - { - "auxiliary_loss_clip": 0.01145522, - "auxiliary_loss_mlp": 0.01047488, - "balance_loss_clip": 1.05159402, - "balance_loss_mlp": 1.03076303, - "epoch": 0.23682549225913122, - "flos": 11135611785600.0, - "grad_norm": 4.660879571039018, - "language_loss": 0.9365679, - "learning_rate": 3.5677996933801785e-06, - "loss": 0.958498, - "num_input_tokens_seen": 84818090, - "step": 3939, - "time_per_iteration": 2.7249948978424072 - }, - { - "auxiliary_loss_clip": 0.01149288, - "auxiliary_loss_mlp": 0.01041833, - "balance_loss_clip": 1.0512023, - "balance_loss_mlp": 1.02463138, - "epoch": 0.23688561551179918, - "flos": 22559226819840.0, - "grad_norm": 1.884439522765895, - "language_loss": 0.82347792, - "learning_rate": 3.567557851847088e-06, - "loss": 0.84538913, - "num_input_tokens_seen": 84837695, - "step": 3940, - "time_per_iteration": 2.666647434234619 - }, - { - "auxiliary_loss_clip": 0.01128412, - "auxiliary_loss_mlp": 0.00775407, - "balance_loss_clip": 1.05063081, - "balance_loss_mlp": 1.00109661, - "epoch": 0.23694573876446715, - "flos": 18514895909760.0, - "grad_norm": 2.7155330970608214, - "language_loss": 0.88959104, - "learning_rate": 3.5673159508720464e-06, - "loss": 0.90862918, - "num_input_tokens_seen": 84854630, - "step": 3941, - "time_per_iteration": 2.6898627281188965 - }, - { - "auxiliary_loss_clip": 0.01147095, - "auxiliary_loss_mlp": 0.01040548, - "balance_loss_clip": 1.04976177, - "balance_loss_mlp": 1.0227741, - "epoch": 0.23700586201713514, - "flos": 15335723980800.0, - "grad_norm": 2.436898535695529, - "language_loss": 0.8484506, - "learning_rate": 3.5670739904642274e-06, - "loss": 0.870327, - "num_input_tokens_seen": 84871805, - "step": 3942, - "time_per_iteration": 2.560166835784912 - }, - { - "auxiliary_loss_clip": 0.01109105, - "auxiliary_loss_mlp": 0.01042997, - "balance_loss_clip": 1.04736543, - "balance_loss_mlp": 1.02447248, - "epoch": 0.2370659852698031, - "flos": 23947605262080.0, - "grad_norm": 1.9848651824816348, - "language_loss": 0.81126499, - "learning_rate": 3.5668319706328065e-06, - "loss": 0.83278596, - "num_input_tokens_seen": 84889815, - "step": 3943, - "time_per_iteration": 2.7389075756073 - }, - { - "auxiliary_loss_clip": 0.01114013, - "auxiliary_loss_mlp": 0.01044642, - "balance_loss_clip": 1.0464983, - "balance_loss_mlp": 1.02618814, - "epoch": 0.23712610852247107, - "flos": 15332527670400.0, - "grad_norm": 2.1611381488400143, - "language_loss": 0.67060351, - "learning_rate": 3.566589891386959e-06, - "loss": 0.69219005, - "num_input_tokens_seen": 84904380, - "step": 3944, - "time_per_iteration": 2.6382999420166016 - }, - { - "auxiliary_loss_clip": 0.01117531, - "auxiliary_loss_mlp": 0.01038157, - "balance_loss_clip": 1.04629564, - "balance_loss_mlp": 1.02003753, - "epoch": 0.23718623177513903, - "flos": 19682567233920.0, - "grad_norm": 1.9578725621632602, - "language_loss": 0.75573617, - "learning_rate": 3.566347752735866e-06, - "loss": 0.77729309, - "num_input_tokens_seen": 84922935, - "step": 3945, - "time_per_iteration": 2.678377628326416 - }, - { - "auxiliary_loss_clip": 0.01128604, - "auxiliary_loss_mlp": 0.01039043, - "balance_loss_clip": 1.0493716, - "balance_loss_mlp": 1.02255654, - "epoch": 0.237246355027807, - "flos": 24973322037120.0, - "grad_norm": 1.4378865328543082, - "language_loss": 0.63750178, - "learning_rate": 3.5661055546887094e-06, - "loss": 0.65917826, - "num_input_tokens_seen": 84943685, - "step": 3946, - "time_per_iteration": 2.77178955078125 - }, - { - "auxiliary_loss_clip": 0.01130702, - "auxiliary_loss_mlp": 0.01036796, - "balance_loss_clip": 1.0460459, - "balance_loss_mlp": 1.0186162, - "epoch": 0.23730647828047496, - "flos": 15377416692480.0, - "grad_norm": 2.53957699605931, - "language_loss": 0.77666485, - "learning_rate": 3.5658632972546734e-06, - "loss": 0.79833984, - "num_input_tokens_seen": 84959505, - "step": 3947, - "time_per_iteration": 2.65461802482605 - }, - { - "auxiliary_loss_clip": 0.01145835, - "auxiliary_loss_mlp": 0.01040502, - "balance_loss_clip": 1.0566994, - "balance_loss_mlp": 1.02299047, - "epoch": 0.23736660153314296, - "flos": 28150662372480.0, - "grad_norm": 2.0053805098120123, - "language_loss": 0.80706096, - "learning_rate": 3.565620980442944e-06, - "loss": 0.82892442, - "num_input_tokens_seen": 84982130, - "step": 3948, - "time_per_iteration": 2.756716012954712 - }, - { - "auxiliary_loss_clip": 0.01129664, - "auxiliary_loss_mlp": 0.01044051, - "balance_loss_clip": 1.05104828, - "balance_loss_mlp": 1.02643192, - "epoch": 0.23742672478581092, - "flos": 22086570729600.0, - "grad_norm": 2.5980612684471374, - "language_loss": 0.80257607, - "learning_rate": 3.5653786042627107e-06, - "loss": 0.82431316, - "num_input_tokens_seen": 85000640, - "step": 3949, - "time_per_iteration": 2.74457049369812 - }, - { - "auxiliary_loss_clip": 0.0112363, - "auxiliary_loss_mlp": 0.01038665, - "balance_loss_clip": 1.04977036, - "balance_loss_mlp": 1.02109337, - "epoch": 0.2374868480384789, - "flos": 19537093152000.0, - "grad_norm": 2.0592081961125093, - "language_loss": 0.73239946, - "learning_rate": 3.565136168723163e-06, - "loss": 0.75402236, - "num_input_tokens_seen": 85018970, - "step": 3950, - "time_per_iteration": 2.650508165359497 - }, - { - "auxiliary_loss_clip": 0.01145426, - "auxiliary_loss_mlp": 0.01037947, - "balance_loss_clip": 1.05055118, - "balance_loss_mlp": 1.02204442, - "epoch": 0.23754697129114685, - "flos": 19422501788160.0, - "grad_norm": 1.9969465766046124, - "language_loss": 0.72794384, - "learning_rate": 3.564893673833495e-06, - "loss": 0.74977756, - "num_input_tokens_seen": 85035905, - "step": 3951, - "time_per_iteration": 2.652399778366089 - }, - { - "auxiliary_loss_clip": 0.01122477, - "auxiliary_loss_mlp": 0.01039445, - "balance_loss_clip": 1.05080223, - "balance_loss_mlp": 1.0216229, - "epoch": 0.23760709454381482, - "flos": 19501002961920.0, - "grad_norm": 3.398248459712791, - "language_loss": 0.73703241, - "learning_rate": 3.564651119602903e-06, - "loss": 0.75865161, - "num_input_tokens_seen": 85054560, - "step": 3952, - "time_per_iteration": 2.7522144317626953 - }, - { - "auxiliary_loss_clip": 0.01100804, - "auxiliary_loss_mlp": 0.01042567, - "balance_loss_clip": 1.04366636, - "balance_loss_mlp": 1.02566266, - "epoch": 0.23766721779648278, - "flos": 27636600879360.0, - "grad_norm": 1.7524267936836437, - "language_loss": 0.71314329, - "learning_rate": 3.564408506040583e-06, - "loss": 0.73457694, - "num_input_tokens_seen": 85074425, - "step": 3953, - "time_per_iteration": 2.7846672534942627 - }, - { - "auxiliary_loss_clip": 0.01151909, - "auxiliary_loss_mlp": 0.01047443, - "balance_loss_clip": 1.05282676, - "balance_loss_mlp": 1.02854872, - "epoch": 0.23772734104915075, - "flos": 23404348990080.0, - "grad_norm": 1.9722222736847754, - "language_loss": 0.81792426, - "learning_rate": 3.5641658331557356e-06, - "loss": 0.83991784, - "num_input_tokens_seen": 85092865, - "step": 3954, - "time_per_iteration": 2.6262643337249756 - }, - { - "auxiliary_loss_clip": 0.01127802, - "auxiliary_loss_mlp": 0.01044439, - "balance_loss_clip": 1.05239391, - "balance_loss_mlp": 1.02616453, - "epoch": 0.23778746430181874, - "flos": 15705496540800.0, - "grad_norm": 2.2607510345904824, - "language_loss": 0.66270143, - "learning_rate": 3.5639231009575634e-06, - "loss": 0.68442386, - "num_input_tokens_seen": 85110175, - "step": 3955, - "time_per_iteration": 2.672151803970337 - }, - { - "auxiliary_loss_clip": 0.01149182, - "auxiliary_loss_mlp": 0.0104812, - "balance_loss_clip": 1.05219805, - "balance_loss_mlp": 1.03104961, - "epoch": 0.2378475875544867, - "flos": 19426452284160.0, - "grad_norm": 1.4117933502593074, - "language_loss": 0.83963013, - "learning_rate": 3.5636803094552704e-06, - "loss": 0.86160314, - "num_input_tokens_seen": 85129925, - "step": 3956, - "time_per_iteration": 2.6483681201934814 - }, - { - "auxiliary_loss_clip": 0.01103304, - "auxiliary_loss_mlp": 0.01042938, - "balance_loss_clip": 1.04726648, - "balance_loss_mlp": 1.02556944, - "epoch": 0.23790771080715467, - "flos": 22268565964800.0, - "grad_norm": 2.308539718278817, - "language_loss": 0.8482393, - "learning_rate": 3.5634374586580635e-06, - "loss": 0.86970174, - "num_input_tokens_seen": 85147755, - "step": 3957, - "time_per_iteration": 2.718961715698242 - }, - { - "auxiliary_loss_clip": 0.01087747, - "auxiliary_loss_mlp": 0.01039974, - "balance_loss_clip": 1.04701853, - "balance_loss_mlp": 1.02428651, - "epoch": 0.23796783405982264, - "flos": 20047311889920.0, - "grad_norm": 2.068360920278316, - "language_loss": 0.70373344, - "learning_rate": 3.563194548575151e-06, - "loss": 0.72501063, - "num_input_tokens_seen": 85165270, - "step": 3958, - "time_per_iteration": 2.818115472793579 - }, - { - "auxiliary_loss_clip": 0.01102632, - "auxiliary_loss_mlp": 0.01042002, - "balance_loss_clip": 1.04540312, - "balance_loss_mlp": 1.02276158, - "epoch": 0.2380279573124906, - "flos": 14245943299200.0, - "grad_norm": 2.474231994209954, - "language_loss": 0.66273189, - "learning_rate": 3.562951579215745e-06, - "loss": 0.68417823, - "num_input_tokens_seen": 85181555, - "step": 3959, - "time_per_iteration": 2.71085786819458 - }, - { - "auxiliary_loss_clip": 0.01103257, - "auxiliary_loss_mlp": 0.01044748, - "balance_loss_clip": 1.04910731, - "balance_loss_mlp": 1.02760553, - "epoch": 0.23808808056515857, - "flos": 21179180332800.0, - "grad_norm": 1.922923950627842, - "language_loss": 0.72140026, - "learning_rate": 3.5627085505890586e-06, - "loss": 0.74288028, - "num_input_tokens_seen": 85199455, - "step": 3960, - "time_per_iteration": 2.724398612976074 - }, - { - "auxiliary_loss_clip": 0.01065725, - "auxiliary_loss_mlp": 0.01041352, - "balance_loss_clip": 1.04778433, - "balance_loss_mlp": 1.02385175, - "epoch": 0.23814820381782653, - "flos": 22528308188160.0, - "grad_norm": 1.836282299199184, - "language_loss": 0.74303818, - "learning_rate": 3.562465462704307e-06, - "loss": 0.76410902, - "num_input_tokens_seen": 85219170, - "step": 3961, - "time_per_iteration": 4.592544794082642 - }, - { - "auxiliary_loss_clip": 0.01149701, - "auxiliary_loss_mlp": 0.010511, - "balance_loss_clip": 1.05083704, - "balance_loss_mlp": 1.0321815, - "epoch": 0.23820832707049452, - "flos": 22304332932480.0, - "grad_norm": 1.6798300631958207, - "language_loss": 0.6562922, - "learning_rate": 3.5622223155707085e-06, - "loss": 0.67830026, - "num_input_tokens_seen": 85238480, - "step": 3962, - "time_per_iteration": 4.40812087059021 - }, - { - "auxiliary_loss_clip": 0.01121684, - "auxiliary_loss_mlp": 0.01042601, - "balance_loss_clip": 1.04743505, - "balance_loss_mlp": 1.02511263, - "epoch": 0.2382684503231625, - "flos": 24864225454080.0, - "grad_norm": 1.838705722688445, - "language_loss": 0.74284148, - "learning_rate": 3.561979109197483e-06, - "loss": 0.76448429, - "num_input_tokens_seen": 85259180, - "step": 3963, - "time_per_iteration": 2.7173969745635986 - }, - { - "auxiliary_loss_clip": 0.01120014, - "auxiliary_loss_mlp": 0.01045721, - "balance_loss_clip": 1.0530858, - "balance_loss_mlp": 1.02756512, - "epoch": 0.23832857357583045, - "flos": 21871609787520.0, - "grad_norm": 2.045875790034744, - "language_loss": 0.77264321, - "learning_rate": 3.5617358435938538e-06, - "loss": 0.79430056, - "num_input_tokens_seen": 85278550, - "step": 3964, - "time_per_iteration": 4.25124716758728 - }, - { - "auxiliary_loss_clip": 0.01108604, - "auxiliary_loss_mlp": 0.01048343, - "balance_loss_clip": 1.04783297, - "balance_loss_mlp": 1.03124809, - "epoch": 0.23838869682849842, - "flos": 21288061434240.0, - "grad_norm": 2.3097885565999894, - "language_loss": 0.71521109, - "learning_rate": 3.561492518769045e-06, - "loss": 0.73678052, - "num_input_tokens_seen": 85297345, - "step": 3965, - "time_per_iteration": 2.757647752761841 - }, - { - "auxiliary_loss_clip": 0.01115176, - "auxiliary_loss_mlp": 0.01043319, - "balance_loss_clip": 1.04632521, - "balance_loss_mlp": 1.02647483, - "epoch": 0.23844882008116638, - "flos": 16180594755840.0, - "grad_norm": 2.673966650516871, - "language_loss": 0.78003007, - "learning_rate": 3.561249134732282e-06, - "loss": 0.801615, - "num_input_tokens_seen": 85315105, - "step": 3966, - "time_per_iteration": 2.71159291267395 - }, - { - "auxiliary_loss_clip": 0.01124693, - "auxiliary_loss_mlp": 0.01045448, - "balance_loss_clip": 1.05071902, - "balance_loss_mlp": 1.02899134, - "epoch": 0.23850894333383435, - "flos": 21069724613760.0, - "grad_norm": 2.116401462724705, - "language_loss": 0.68767631, - "learning_rate": 3.561005691492797e-06, - "loss": 0.70937771, - "num_input_tokens_seen": 85334735, - "step": 3967, - "time_per_iteration": 2.7072744369506836 - }, - { - "auxiliary_loss_clip": 0.01116174, - "auxiliary_loss_mlp": 0.01055757, - "balance_loss_clip": 1.04883289, - "balance_loss_mlp": 1.03803015, - "epoch": 0.23856906658650234, - "flos": 17201606849280.0, - "grad_norm": 3.581336577718575, - "language_loss": 0.68005061, - "learning_rate": 3.5607621890598185e-06, - "loss": 0.70176995, - "num_input_tokens_seen": 85352875, - "step": 3968, - "time_per_iteration": 4.378219842910767 - }, - { - "auxiliary_loss_clip": 0.01097883, - "auxiliary_loss_mlp": 0.01044394, - "balance_loss_clip": 1.05052614, - "balance_loss_mlp": 1.0274837, - "epoch": 0.2386291898391703, - "flos": 29494223619840.0, - "grad_norm": 2.210255088762028, - "language_loss": 0.77106255, - "learning_rate": 3.5605186274425823e-06, - "loss": 0.79248536, - "num_input_tokens_seen": 85372205, - "step": 3969, - "time_per_iteration": 2.847663164138794 - }, - { - "auxiliary_loss_clip": 0.01121681, - "auxiliary_loss_mlp": 0.01039809, - "balance_loss_clip": 1.0498476, - "balance_loss_mlp": 1.02334595, - "epoch": 0.23868931309183827, - "flos": 21142443697920.0, - "grad_norm": 2.1326335149840583, - "language_loss": 0.7617563, - "learning_rate": 3.5602750066503225e-06, - "loss": 0.78337121, - "num_input_tokens_seen": 85389705, - "step": 3970, - "time_per_iteration": 2.766862392425537 - }, - { - "auxiliary_loss_clip": 0.01106309, - "auxiliary_loss_mlp": 0.01049131, - "balance_loss_clip": 1.04287159, - "balance_loss_mlp": 1.03111875, - "epoch": 0.23874943634450624, - "flos": 25659394784640.0, - "grad_norm": 2.3319107764636415, - "language_loss": 0.85474384, - "learning_rate": 3.5600313266922793e-06, - "loss": 0.87629819, - "num_input_tokens_seen": 85407855, - "step": 3971, - "time_per_iteration": 2.7597670555114746 - }, - { - "auxiliary_loss_clip": 0.01062507, - "auxiliary_loss_mlp": 0.01039144, - "balance_loss_clip": 1.03465796, - "balance_loss_mlp": 1.03661716, - "epoch": 0.2388095595971742, - "flos": 58986618624000.0, - "grad_norm": 0.7451796217314707, - "language_loss": 0.62797832, - "learning_rate": 3.5597875875776915e-06, - "loss": 0.6489948, - "num_input_tokens_seen": 85470885, - "step": 3972, - "time_per_iteration": 3.2572779655456543 - }, - { - "auxiliary_loss_clip": 0.0112174, - "auxiliary_loss_mlp": 0.01037931, - "balance_loss_clip": 1.0492239, - "balance_loss_mlp": 1.02109838, - "epoch": 0.23886968284984217, - "flos": 16800341040000.0, - "grad_norm": 1.9449657433446057, - "language_loss": 0.82093811, - "learning_rate": 3.5595437893158013e-06, - "loss": 0.84253484, - "num_input_tokens_seen": 85488460, - "step": 3973, - "time_per_iteration": 2.6394145488739014 - }, - { - "auxiliary_loss_clip": 0.01115852, - "auxiliary_loss_mlp": 0.01050239, - "balance_loss_clip": 1.04884124, - "balance_loss_mlp": 1.03272736, - "epoch": 0.23892980610251013, - "flos": 22382654538240.0, - "grad_norm": 1.5639820592628684, - "language_loss": 0.79418832, - "learning_rate": 3.5592999319158546e-06, - "loss": 0.81584924, - "num_input_tokens_seen": 85508590, - "step": 3974, - "time_per_iteration": 2.6926944255828857 - }, - { - "auxiliary_loss_clip": 0.01134012, - "auxiliary_loss_mlp": 0.01042703, - "balance_loss_clip": 1.05169725, - "balance_loss_mlp": 1.02475047, - "epoch": 0.23898992935517813, - "flos": 12823198519680.0, - "grad_norm": 1.8382350241534648, - "language_loss": 0.8420803, - "learning_rate": 3.5590560153870984e-06, - "loss": 0.86384743, - "num_input_tokens_seen": 85525970, - "step": 3975, - "time_per_iteration": 2.6402463912963867 - }, - { - "auxiliary_loss_clip": 0.01126962, - "auxiliary_loss_mlp": 0.01042445, - "balance_loss_clip": 1.04938245, - "balance_loss_mlp": 1.02545786, - "epoch": 0.2390500526078461, - "flos": 22345666508160.0, - "grad_norm": 2.129124681208868, - "language_loss": 0.84249294, - "learning_rate": 3.5588120397387816e-06, - "loss": 0.864187, - "num_input_tokens_seen": 85543700, - "step": 3976, - "time_per_iteration": 2.624758720397949 - }, - { - "auxiliary_loss_clip": 0.01075224, - "auxiliary_loss_mlp": 0.01036827, - "balance_loss_clip": 1.0434798, - "balance_loss_mlp": 1.02103186, - "epoch": 0.23911017586051406, - "flos": 22635142214400.0, - "grad_norm": 1.8888081312271703, - "language_loss": 0.74451673, - "learning_rate": 3.5585680049801566e-06, - "loss": 0.76563722, - "num_input_tokens_seen": 85562765, - "step": 3977, - "time_per_iteration": 2.848529815673828 - }, - { - "auxiliary_loss_clip": 0.01151335, - "auxiliary_loss_mlp": 0.01045957, - "balance_loss_clip": 1.05476987, - "balance_loss_mlp": 1.02829063, - "epoch": 0.23917029911318202, - "flos": 23653281219840.0, - "grad_norm": 1.6816446874821869, - "language_loss": 0.72515011, - "learning_rate": 3.5583239111204764e-06, - "loss": 0.74712306, - "num_input_tokens_seen": 85581755, - "step": 3978, - "time_per_iteration": 2.6967527866363525 - }, - { - "auxiliary_loss_clip": 0.01123321, - "auxiliary_loss_mlp": 0.01045192, - "balance_loss_clip": 1.04713726, - "balance_loss_mlp": 1.02802634, - "epoch": 0.23923042236585, - "flos": 22783597125120.0, - "grad_norm": 2.5130493367739413, - "language_loss": 0.78474021, - "learning_rate": 3.558079758168997e-06, - "loss": 0.80642533, - "num_input_tokens_seen": 85599455, - "step": 3979, - "time_per_iteration": 2.6679623126983643 - }, - { - "auxiliary_loss_clip": 0.01123187, - "auxiliary_loss_mlp": 0.01052255, - "balance_loss_clip": 1.04774463, - "balance_loss_mlp": 1.03390861, - "epoch": 0.23929054561851795, - "flos": 28147717457280.0, - "grad_norm": 1.8353092232149775, - "language_loss": 0.81943917, - "learning_rate": 3.557835546134977e-06, - "loss": 0.84119362, - "num_input_tokens_seen": 85619970, - "step": 3980, - "time_per_iteration": 2.7941136360168457 - }, - { - "auxiliary_loss_clip": 0.01094849, - "auxiliary_loss_mlp": 0.01037854, - "balance_loss_clip": 1.04719615, - "balance_loss_mlp": 1.02036595, - "epoch": 0.23935066887118592, - "flos": 21686525982720.0, - "grad_norm": 1.7388406045293963, - "language_loss": 0.83562148, - "learning_rate": 3.5575912750276775e-06, - "loss": 0.85694849, - "num_input_tokens_seen": 85638850, - "step": 3981, - "time_per_iteration": 2.773372173309326 - }, - { - "auxiliary_loss_clip": 0.01126579, - "auxiliary_loss_mlp": 0.01045152, - "balance_loss_clip": 1.05084574, - "balance_loss_mlp": 1.0267818, - "epoch": 0.2394107921238539, - "flos": 32122274198400.0, - "grad_norm": 2.0270942419393676, - "language_loss": 0.76690662, - "learning_rate": 3.5573469448563607e-06, - "loss": 0.78862393, - "num_input_tokens_seen": 85656285, - "step": 3982, - "time_per_iteration": 2.770089864730835 - }, - { - "auxiliary_loss_clip": 0.01107786, - "auxiliary_loss_mlp": 0.01043737, - "balance_loss_clip": 1.04928303, - "balance_loss_mlp": 1.02757215, - "epoch": 0.23947091537652188, - "flos": 17019180650880.0, - "grad_norm": 2.333665248317953, - "language_loss": 0.78243405, - "learning_rate": 3.5571025556302915e-06, - "loss": 0.80394924, - "num_input_tokens_seen": 85673020, - "step": 3983, - "time_per_iteration": 2.8361902236938477 - }, - { - "auxiliary_loss_clip": 0.01136012, - "auxiliary_loss_mlp": 0.00775416, - "balance_loss_clip": 1.0530262, - "balance_loss_mlp": 1.00106907, - "epoch": 0.23953103862918984, - "flos": 20593584904320.0, - "grad_norm": 1.8468424363822287, - "language_loss": 0.73274761, - "learning_rate": 3.556858107358737e-06, - "loss": 0.75186193, - "num_input_tokens_seen": 85692565, - "step": 3984, - "time_per_iteration": 2.720289468765259 - }, - { - "auxiliary_loss_clip": 0.01102619, - "auxiliary_loss_mlp": 0.01051209, - "balance_loss_clip": 1.04748976, - "balance_loss_mlp": 1.0330658, - "epoch": 0.2395911618818578, - "flos": 20704405340160.0, - "grad_norm": 1.906378165207968, - "language_loss": 0.79090226, - "learning_rate": 3.5566136000509674e-06, - "loss": 0.81244051, - "num_input_tokens_seen": 85709730, - "step": 3985, - "time_per_iteration": 2.8464138507843018 - }, - { - "auxiliary_loss_clip": 0.01102898, - "auxiliary_loss_mlp": 0.01047238, - "balance_loss_clip": 1.04676175, - "balance_loss_mlp": 1.02930927, - "epoch": 0.23965128513452577, - "flos": 27053519402880.0, - "grad_norm": 1.780185130038595, - "language_loss": 0.73194253, - "learning_rate": 3.556369033716254e-06, - "loss": 0.7534439, - "num_input_tokens_seen": 85730045, - "step": 3986, - "time_per_iteration": 2.873837471008301 - }, - { - "auxiliary_loss_clip": 0.01143561, - "auxiliary_loss_mlp": 0.01052533, - "balance_loss_clip": 1.05392861, - "balance_loss_mlp": 1.03523529, - "epoch": 0.23971140838719374, - "flos": 23144319457920.0, - "grad_norm": 1.9275946084378768, - "language_loss": 0.88014174, - "learning_rate": 3.556124408363871e-06, - "loss": 0.90210271, - "num_input_tokens_seen": 85747590, - "step": 3987, - "time_per_iteration": 2.778970718383789 - }, - { - "auxiliary_loss_clip": 0.01131181, - "auxiliary_loss_mlp": 0.01037226, - "balance_loss_clip": 1.05180991, - "balance_loss_mlp": 1.02253985, - "epoch": 0.23977153163986173, - "flos": 18034554309120.0, - "grad_norm": 8.94948058332038, - "language_loss": 0.82985806, - "learning_rate": 3.5558797240030945e-06, - "loss": 0.85154212, - "num_input_tokens_seen": 85763460, - "step": 3988, - "time_per_iteration": 2.6707162857055664 - }, - { - "auxiliary_loss_clip": 0.01132219, - "auxiliary_loss_mlp": 0.01039377, - "balance_loss_clip": 1.04952908, - "balance_loss_mlp": 1.02213907, - "epoch": 0.2398316548925297, - "flos": 18113378705280.0, - "grad_norm": 1.6085860818119202, - "language_loss": 0.85336304, - "learning_rate": 3.5556349806432035e-06, - "loss": 0.87507904, - "num_input_tokens_seen": 85782050, - "step": 3989, - "time_per_iteration": 2.644075632095337 - }, - { - "auxiliary_loss_clip": 0.01144734, - "auxiliary_loss_mlp": 0.01039049, - "balance_loss_clip": 1.05094743, - "balance_loss_mlp": 1.02263403, - "epoch": 0.23989177814519766, - "flos": 12567730014720.0, - "grad_norm": 1.981474679784042, - "language_loss": 0.84109843, - "learning_rate": 3.555390178293477e-06, - "loss": 0.86293626, - "num_input_tokens_seen": 85797400, - "step": 3990, - "time_per_iteration": 2.5778160095214844 - }, - { - "auxiliary_loss_clip": 0.01131361, - "auxiliary_loss_mlp": 0.01042102, - "balance_loss_clip": 1.04863191, - "balance_loss_mlp": 1.02565074, - "epoch": 0.23995190139786562, - "flos": 25264593423360.0, - "grad_norm": 1.5352138463261382, - "language_loss": 0.75853264, - "learning_rate": 3.5551453169631994e-06, - "loss": 0.78026724, - "num_input_tokens_seen": 85818995, - "step": 3991, - "time_per_iteration": 2.7569639682769775 - }, - { - "auxiliary_loss_clip": 0.01040828, - "auxiliary_loss_mlp": 0.0100398, - "balance_loss_clip": 1.02825403, - "balance_loss_mlp": 1.00114298, - "epoch": 0.2400120246505336, - "flos": 61960379650560.0, - "grad_norm": 0.8795356934357302, - "language_loss": 0.63683558, - "learning_rate": 3.554900396661656e-06, - "loss": 0.65728366, - "num_input_tokens_seen": 85876695, - "step": 3992, - "time_per_iteration": 3.2559213638305664 - }, - { - "auxiliary_loss_clip": 0.01055123, - "auxiliary_loss_mlp": 0.01005737, - "balance_loss_clip": 1.02834392, - "balance_loss_mlp": 1.00292385, - "epoch": 0.24007214790320155, - "flos": 66708560540160.0, - "grad_norm": 0.7639831296699208, - "language_loss": 0.6297875, - "learning_rate": 3.5546554173981334e-06, - "loss": 0.65039611, - "num_input_tokens_seen": 85940990, - "step": 3993, - "time_per_iteration": 3.2946221828460693 - }, - { - "auxiliary_loss_clip": 0.0110983, - "auxiliary_loss_mlp": 0.01048609, - "balance_loss_clip": 1.05077267, - "balance_loss_mlp": 1.03078759, - "epoch": 0.24013227115586952, - "flos": 25809070757760.0, - "grad_norm": 1.7227387633537015, - "language_loss": 0.7656548, - "learning_rate": 3.5544103791819218e-06, - "loss": 0.78723919, - "num_input_tokens_seen": 85961165, - "step": 3994, - "time_per_iteration": 2.7735466957092285 - }, - { - "auxiliary_loss_clip": 0.01120115, - "auxiliary_loss_mlp": 0.01051235, - "balance_loss_clip": 1.04648936, - "balance_loss_mlp": 1.0323168, - "epoch": 0.2401923944085375, - "flos": 25557480921600.0, - "grad_norm": 1.7819538389347498, - "language_loss": 0.78550023, - "learning_rate": 3.5541652820223124e-06, - "loss": 0.80721372, - "num_input_tokens_seen": 85982710, - "step": 3995, - "time_per_iteration": 2.8184118270874023 - }, - { - "auxiliary_loss_clip": 0.01034, - "auxiliary_loss_mlp": 0.01026353, - "balance_loss_clip": 1.02876425, - "balance_loss_mlp": 1.0237658, - "epoch": 0.24025251766120548, - "flos": 54941138478720.0, - "grad_norm": 0.9088717203971356, - "language_loss": 0.6345036, - "learning_rate": 3.5539201259286006e-06, - "loss": 0.65510708, - "num_input_tokens_seen": 86046935, - "step": 3996, - "time_per_iteration": 3.304704189300537 - }, - { - "auxiliary_loss_clip": 0.01122635, - "auxiliary_loss_mlp": 0.01046678, - "balance_loss_clip": 1.04812241, - "balance_loss_mlp": 1.02960706, - "epoch": 0.24031264091387344, - "flos": 20631075724800.0, - "grad_norm": 2.5673853359086403, - "language_loss": 0.69455099, - "learning_rate": 3.5536749109100808e-06, - "loss": 0.7162441, - "num_input_tokens_seen": 86064355, - "step": 3997, - "time_per_iteration": 2.6638269424438477 - }, - { - "auxiliary_loss_clip": 0.01136246, - "auxiliary_loss_mlp": 0.01041204, - "balance_loss_clip": 1.0500989, - "balance_loss_mlp": 1.02390659, - "epoch": 0.2403727641665414, - "flos": 20886256920960.0, - "grad_norm": 1.9944619018673675, - "language_loss": 0.87352818, - "learning_rate": 3.5534296369760535e-06, - "loss": 0.89530265, - "num_input_tokens_seen": 86081340, - "step": 3998, - "time_per_iteration": 2.6837756633758545 - }, - { - "auxiliary_loss_clip": 0.01126262, - "auxiliary_loss_mlp": 0.01038814, - "balance_loss_clip": 1.04337883, - "balance_loss_mlp": 1.02173114, - "epoch": 0.24043288741920937, - "flos": 22820046451200.0, - "grad_norm": 1.5798261831400109, - "language_loss": 0.75723118, - "learning_rate": 3.5531843041358183e-06, - "loss": 0.77888191, - "num_input_tokens_seen": 86102260, - "step": 3999, - "time_per_iteration": 2.659717321395874 - }, - { - "auxiliary_loss_clip": 0.01116532, - "auxiliary_loss_mlp": 0.01049627, - "balance_loss_clip": 1.04679537, - "balance_loss_mlp": 1.03259242, - "epoch": 0.24049301067187734, - "flos": 27959652823680.0, - "grad_norm": 2.380373207595884, - "language_loss": 0.72602308, - "learning_rate": 3.552938912398679e-06, - "loss": 0.74768472, - "num_input_tokens_seen": 86123400, - "step": 4000, - "time_per_iteration": 4.285717487335205 - }, - { - "auxiliary_loss_clip": 0.01138397, - "auxiliary_loss_mlp": 0.01040819, - "balance_loss_clip": 1.05207551, - "balance_loss_mlp": 1.02389169, - "epoch": 0.24055313392454533, - "flos": 27451409333760.0, - "grad_norm": 2.3105318706157862, - "language_loss": 0.67128104, - "learning_rate": 3.5526934617739397e-06, - "loss": 0.69307321, - "num_input_tokens_seen": 86144060, - "step": 4001, - "time_per_iteration": 4.2180609703063965 - }, - { - "auxiliary_loss_clip": 0.01144863, - "auxiliary_loss_mlp": 0.01043304, - "balance_loss_clip": 1.04859209, - "balance_loss_mlp": 1.02525568, - "epoch": 0.2406132571772133, - "flos": 25556618995200.0, - "grad_norm": 2.360624564793828, - "language_loss": 0.82895994, - "learning_rate": 3.5524479522709095e-06, - "loss": 0.85084158, - "num_input_tokens_seen": 86163005, - "step": 4002, - "time_per_iteration": 2.6369640827178955 - }, - { - "auxiliary_loss_clip": 0.01106477, - "auxiliary_loss_mlp": 0.01045072, - "balance_loss_clip": 1.0493201, - "balance_loss_mlp": 1.0283823, - "epoch": 0.24067338042988126, - "flos": 24791398629120.0, - "grad_norm": 2.016027139567785, - "language_loss": 0.83058953, - "learning_rate": 3.552202383898897e-06, - "loss": 0.85210502, - "num_input_tokens_seen": 86182580, - "step": 4003, - "time_per_iteration": 4.312098979949951 - }, - { - "auxiliary_loss_clip": 0.01114745, - "auxiliary_loss_mlp": 0.01042117, - "balance_loss_clip": 1.0474503, - "balance_loss_mlp": 1.02458131, - "epoch": 0.24073350368254923, - "flos": 21177923356800.0, - "grad_norm": 1.971328156333658, - "language_loss": 0.8672772, - "learning_rate": 3.551956756667215e-06, - "loss": 0.8888458, - "num_input_tokens_seen": 86200665, - "step": 4004, - "time_per_iteration": 2.646578311920166 - }, - { - "auxiliary_loss_clip": 0.01115631, - "auxiliary_loss_mlp": 0.01054344, - "balance_loss_clip": 1.04529011, - "balance_loss_mlp": 1.03736866, - "epoch": 0.2407936269352172, - "flos": 22494300986880.0, - "grad_norm": 1.9965130860947515, - "language_loss": 0.78239757, - "learning_rate": 3.551711070585177e-06, - "loss": 0.80409735, - "num_input_tokens_seen": 86221640, - "step": 4005, - "time_per_iteration": 2.7220566272735596 - }, - { - "auxiliary_loss_clip": 0.01090518, - "auxiliary_loss_mlp": 0.01039515, - "balance_loss_clip": 1.04414058, - "balance_loss_mlp": 1.02164578, - "epoch": 0.24085375018788516, - "flos": 18551129754240.0, - "grad_norm": 1.6390993289809686, - "language_loss": 0.79391652, - "learning_rate": 3.5514653256620995e-06, - "loss": 0.8152169, - "num_input_tokens_seen": 86240795, - "step": 4006, - "time_per_iteration": 2.7188642024993896 - }, - { - "auxiliary_loss_clip": 0.01130191, - "auxiliary_loss_mlp": 0.00777161, - "balance_loss_clip": 1.0482645, - "balance_loss_mlp": 1.00115335, - "epoch": 0.24091387344055312, - "flos": 24170539023360.0, - "grad_norm": 1.6765272633695874, - "language_loss": 0.71939242, - "learning_rate": 3.551219521907302e-06, - "loss": 0.73846585, - "num_input_tokens_seen": 86262000, - "step": 4007, - "time_per_iteration": 4.3504638671875 - }, - { - "auxiliary_loss_clip": 0.01101925, - "auxiliary_loss_mlp": 0.01047677, - "balance_loss_clip": 1.04589975, - "balance_loss_mlp": 1.03132153, - "epoch": 0.24097399669322112, - "flos": 11036319615360.0, - "grad_norm": 1.6891966370612705, - "language_loss": 0.76460171, - "learning_rate": 3.5509736593301042e-06, - "loss": 0.78609765, - "num_input_tokens_seen": 86279680, - "step": 4008, - "time_per_iteration": 2.700744152069092 - }, - { - "auxiliary_loss_clip": 0.01136495, - "auxiliary_loss_mlp": 0.01038852, - "balance_loss_clip": 1.05069256, - "balance_loss_mlp": 1.02192402, - "epoch": 0.24103411994588908, - "flos": 17165085696000.0, - "grad_norm": 2.427830882471808, - "language_loss": 0.74601823, - "learning_rate": 3.5507277379398295e-06, - "loss": 0.76777172, - "num_input_tokens_seen": 86297180, - "step": 4009, - "time_per_iteration": 2.6175808906555176 - }, - { - "auxiliary_loss_clip": 0.01134079, - "auxiliary_loss_mlp": 0.01041957, - "balance_loss_clip": 1.05032861, - "balance_loss_mlp": 1.02532756, - "epoch": 0.24109424319855705, - "flos": 20667956014080.0, - "grad_norm": 1.6643292794637636, - "language_loss": 0.80064976, - "learning_rate": 3.550481757745804e-06, - "loss": 0.82241005, - "num_input_tokens_seen": 86317660, - "step": 4010, - "time_per_iteration": 2.680511236190796 - }, - { - "auxiliary_loss_clip": 0.01118599, - "auxiliary_loss_mlp": 0.01047241, - "balance_loss_clip": 1.04658401, - "balance_loss_mlp": 1.02779818, - "epoch": 0.241154366451225, - "flos": 28181796485760.0, - "grad_norm": 3.8737422865874245, - "language_loss": 0.70889425, - "learning_rate": 3.5502357187573555e-06, - "loss": 0.73055267, - "num_input_tokens_seen": 86338325, - "step": 4011, - "time_per_iteration": 2.716404676437378 - }, - { - "auxiliary_loss_clip": 0.01065208, - "auxiliary_loss_mlp": 0.01047099, - "balance_loss_clip": 1.0414176, - "balance_loss_mlp": 1.02802527, - "epoch": 0.24121448970389298, - "flos": 21689722293120.0, - "grad_norm": 1.675052333388822, - "language_loss": 0.69279736, - "learning_rate": 3.5499896209838118e-06, - "loss": 0.71392041, - "num_input_tokens_seen": 86357615, - "step": 4012, - "time_per_iteration": 2.804694890975952 - }, - { - "auxiliary_loss_clip": 0.01138123, - "auxiliary_loss_mlp": 0.0104149, - "balance_loss_clip": 1.05126536, - "balance_loss_mlp": 1.02213097, - "epoch": 0.24127461295656094, - "flos": 39676191269760.0, - "grad_norm": 1.5084253296098848, - "language_loss": 0.732813, - "learning_rate": 3.5497434644345073e-06, - "loss": 0.75460911, - "num_input_tokens_seen": 86380355, - "step": 4013, - "time_per_iteration": 2.8192849159240723 - }, - { - "auxiliary_loss_clip": 0.01148497, - "auxiliary_loss_mlp": 0.01037798, - "balance_loss_clip": 1.05201018, - "balance_loss_mlp": 1.02044141, - "epoch": 0.2413347362092289, - "flos": 19135863256320.0, - "grad_norm": 1.8372553923739565, - "language_loss": 0.88272971, - "learning_rate": 3.5494972491187753e-06, - "loss": 0.90459263, - "num_input_tokens_seen": 86399125, - "step": 4014, - "time_per_iteration": 2.6029160022735596 - }, - { - "auxiliary_loss_clip": 0.0111397, - "auxiliary_loss_mlp": 0.01046282, - "balance_loss_clip": 1.04315281, - "balance_loss_mlp": 1.0278163, - "epoch": 0.2413948594618969, - "flos": 26939430829440.0, - "grad_norm": 1.9589493379590102, - "language_loss": 0.94862974, - "learning_rate": 3.549250975045952e-06, - "loss": 0.97023225, - "num_input_tokens_seen": 86418625, - "step": 4015, - "time_per_iteration": 2.6958773136138916 - }, - { - "auxiliary_loss_clip": 0.01120117, - "auxiliary_loss_mlp": 0.01041079, - "balance_loss_clip": 1.04570341, - "balance_loss_mlp": 1.02331638, - "epoch": 0.24145498271456486, - "flos": 25228108183680.0, - "grad_norm": 1.5486712647521637, - "language_loss": 0.8271699, - "learning_rate": 3.5490046422253768e-06, - "loss": 0.84878188, - "num_input_tokens_seen": 86438375, - "step": 4016, - "time_per_iteration": 2.7045071125030518 - }, - { - "auxiliary_loss_clip": 0.01098573, - "auxiliary_loss_mlp": 0.01045564, - "balance_loss_clip": 1.04334974, - "balance_loss_mlp": 1.02838039, - "epoch": 0.24151510596723283, - "flos": 40661759617920.0, - "grad_norm": 1.8022012115417119, - "language_loss": 0.69207114, - "learning_rate": 3.54875825066639e-06, - "loss": 0.71351254, - "num_input_tokens_seen": 86463230, - "step": 4017, - "time_per_iteration": 2.8596649169921875 - }, - { - "auxiliary_loss_clip": 0.01141299, - "auxiliary_loss_mlp": 0.01051243, - "balance_loss_clip": 1.05106175, - "balance_loss_mlp": 1.03278995, - "epoch": 0.2415752292199008, - "flos": 18146667634560.0, - "grad_norm": 1.6419835865444041, - "language_loss": 0.84953403, - "learning_rate": 3.5485118003783353e-06, - "loss": 0.87145936, - "num_input_tokens_seen": 86481230, - "step": 4018, - "time_per_iteration": 2.627629518508911 - }, - { - "auxiliary_loss_clip": 0.01046489, - "auxiliary_loss_mlp": 0.01014362, - "balance_loss_clip": 1.02139664, - "balance_loss_mlp": 1.01140559, - "epoch": 0.24163535247256876, - "flos": 67288409792640.0, - "grad_norm": 0.8221446343976555, - "language_loss": 0.60642469, - "learning_rate": 3.548265291370558e-06, - "loss": 0.62703323, - "num_input_tokens_seen": 86541260, - "step": 4019, - "time_per_iteration": 3.269498586654663 - }, - { - "auxiliary_loss_clip": 0.01114983, - "auxiliary_loss_mlp": 0.01049089, - "balance_loss_clip": 1.04582107, - "balance_loss_mlp": 1.0312674, - "epoch": 0.24169547572523672, - "flos": 24929941386240.0, - "grad_norm": 1.8826005215725077, - "language_loss": 0.73324752, - "learning_rate": 3.5480187236524055e-06, - "loss": 0.75488818, - "num_input_tokens_seen": 86559580, - "step": 4020, - "time_per_iteration": 2.7341055870056152 - }, - { - "auxiliary_loss_clip": 0.01111064, - "auxiliary_loss_mlp": 0.01040515, - "balance_loss_clip": 1.04833841, - "balance_loss_mlp": 1.02315772, - "epoch": 0.24175559897790472, - "flos": 18728312567040.0, - "grad_norm": 1.7964731743776612, - "language_loss": 0.81617332, - "learning_rate": 3.5477720972332285e-06, - "loss": 0.83768916, - "num_input_tokens_seen": 86577560, - "step": 4021, - "time_per_iteration": 2.7154345512390137 - }, - { - "auxiliary_loss_clip": 0.01149117, - "auxiliary_loss_mlp": 0.01050015, - "balance_loss_clip": 1.04972911, - "balance_loss_mlp": 1.03070307, - "epoch": 0.24181572223057268, - "flos": 23039281111680.0, - "grad_norm": 2.078765142897874, - "language_loss": 0.76601863, - "learning_rate": 3.547525412122378e-06, - "loss": 0.78800994, - "num_input_tokens_seen": 86595350, - "step": 4022, - "time_per_iteration": 2.622262716293335 - }, - { - "auxiliary_loss_clip": 0.01102927, - "auxiliary_loss_mlp": 0.01053151, - "balance_loss_clip": 1.042714, - "balance_loss_mlp": 1.03271914, - "epoch": 0.24187584548324065, - "flos": 20376145923840.0, - "grad_norm": 1.7360501926549048, - "language_loss": 0.75283015, - "learning_rate": 3.5472786683292083e-06, - "loss": 0.774391, - "num_input_tokens_seen": 86614805, - "step": 4023, - "time_per_iteration": 2.7339353561401367 - }, - { - "auxiliary_loss_clip": 0.01121416, - "auxiliary_loss_mlp": 0.01047921, - "balance_loss_clip": 1.04916334, - "balance_loss_mlp": 1.0309217, - "epoch": 0.2419359687359086, - "flos": 21397517153280.0, - "grad_norm": 2.4319797200103466, - "language_loss": 0.82542646, - "learning_rate": 3.5470318658630766e-06, - "loss": 0.84711981, - "num_input_tokens_seen": 86633700, - "step": 4024, - "time_per_iteration": 2.6887242794036865 - }, - { - "auxiliary_loss_clip": 0.01133297, - "auxiliary_loss_mlp": 0.01047865, - "balance_loss_clip": 1.05029452, - "balance_loss_mlp": 1.03038907, - "epoch": 0.24199609198857658, - "flos": 18369385914240.0, - "grad_norm": 1.7776330743080708, - "language_loss": 0.85974258, - "learning_rate": 3.5467850047333424e-06, - "loss": 0.88155425, - "num_input_tokens_seen": 86650905, - "step": 4025, - "time_per_iteration": 2.7049782276153564 - }, - { - "auxiliary_loss_clip": 0.01092706, - "auxiliary_loss_mlp": 0.01064486, - "balance_loss_clip": 1.04161918, - "balance_loss_mlp": 1.04456651, - "epoch": 0.24205621524124454, - "flos": 19463871277440.0, - "grad_norm": 1.8800874250001207, - "language_loss": 0.71681315, - "learning_rate": 3.546538084949365e-06, - "loss": 0.73838508, - "num_input_tokens_seen": 86669185, - "step": 4026, - "time_per_iteration": 2.7773284912109375 - }, - { - "auxiliary_loss_clip": 0.01135992, - "auxiliary_loss_mlp": 0.01046992, - "balance_loss_clip": 1.05109096, - "balance_loss_mlp": 1.03088713, - "epoch": 0.2421163384939125, - "flos": 14976330451200.0, - "grad_norm": 1.967847260356932, - "language_loss": 0.64436764, - "learning_rate": 3.546291106520509e-06, - "loss": 0.66619748, - "num_input_tokens_seen": 86686805, - "step": 4027, - "time_per_iteration": 2.6143524646759033 - }, - { - "auxiliary_loss_clip": 0.01136637, - "auxiliary_loss_mlp": 0.00775283, - "balance_loss_clip": 1.05106425, - "balance_loss_mlp": 1.00103092, - "epoch": 0.2421764617465805, - "flos": 18662057930880.0, - "grad_norm": 3.6118562291520813, - "language_loss": 0.70909715, - "learning_rate": 3.5460440694561388e-06, - "loss": 0.72821641, - "num_input_tokens_seen": 86705520, - "step": 4028, - "time_per_iteration": 2.656334400177002 - }, - { - "auxiliary_loss_clip": 0.01053475, - "auxiliary_loss_mlp": 0.01050053, - "balance_loss_clip": 1.02715707, - "balance_loss_mlp": 1.04756165, - "epoch": 0.24223658499924847, - "flos": 64347327164160.0, - "grad_norm": 0.865443083354021, - "language_loss": 0.55302447, - "learning_rate": 3.545796973765623e-06, - "loss": 0.57405978, - "num_input_tokens_seen": 86767320, - "step": 4029, - "time_per_iteration": 3.1736607551574707 - }, - { - "auxiliary_loss_clip": 0.0113268, - "auxiliary_loss_mlp": 0.01051074, - "balance_loss_clip": 1.04679179, - "balance_loss_mlp": 1.03252554, - "epoch": 0.24229670825191643, - "flos": 25775243124480.0, - "grad_norm": 1.6290009052774777, - "language_loss": 0.74065894, - "learning_rate": 3.54554981945833e-06, - "loss": 0.76249647, - "num_input_tokens_seen": 86788110, - "step": 4030, - "time_per_iteration": 2.644153118133545 - }, - { - "auxiliary_loss_clip": 0.01146282, - "auxiliary_loss_mlp": 0.01053008, - "balance_loss_clip": 1.04945433, - "balance_loss_mlp": 1.03495932, - "epoch": 0.2423568315045844, - "flos": 20667094087680.0, - "grad_norm": 2.044571760348203, - "language_loss": 0.76492965, - "learning_rate": 3.5453026065436343e-06, - "loss": 0.78692257, - "num_input_tokens_seen": 86807640, - "step": 4031, - "time_per_iteration": 2.608718156814575 - }, - { - "auxiliary_loss_clip": 0.01130345, - "auxiliary_loss_mlp": 0.00776083, - "balance_loss_clip": 1.04857934, - "balance_loss_mlp": 1.00130129, - "epoch": 0.24241695475725236, - "flos": 22416805393920.0, - "grad_norm": 2.367928778009572, - "language_loss": 0.65578043, - "learning_rate": 3.5450553350309083e-06, - "loss": 0.67484468, - "num_input_tokens_seen": 86826795, - "step": 4032, - "time_per_iteration": 2.713796377182007 - }, - { - "auxiliary_loss_clip": 0.01128183, - "auxiliary_loss_mlp": 0.0104339, - "balance_loss_clip": 1.04551542, - "balance_loss_mlp": 1.02591443, - "epoch": 0.24247707800992033, - "flos": 17128995505920.0, - "grad_norm": 2.055558599382263, - "language_loss": 0.81589901, - "learning_rate": 3.5448080049295286e-06, - "loss": 0.83761466, - "num_input_tokens_seen": 86843175, - "step": 4033, - "time_per_iteration": 2.6381332874298096 - }, - { - "auxiliary_loss_clip": 0.01101134, - "auxiliary_loss_mlp": 0.01042507, - "balance_loss_clip": 1.04264998, - "balance_loss_mlp": 1.02450657, - "epoch": 0.2425372012625883, - "flos": 31613743399680.0, - "grad_norm": 2.655330103252085, - "language_loss": 0.68830204, - "learning_rate": 3.5445606162488754e-06, - "loss": 0.70973849, - "num_input_tokens_seen": 86863185, - "step": 4034, - "time_per_iteration": 2.8269567489624023 - }, - { - "auxiliary_loss_clip": 0.01129717, - "auxiliary_loss_mlp": 0.01036472, - "balance_loss_clip": 1.05142426, - "balance_loss_mlp": 1.01839972, - "epoch": 0.24259732451525629, - "flos": 16326032924160.0, - "grad_norm": 2.305872962411053, - "language_loss": 0.96432853, - "learning_rate": 3.5443131689983283e-06, - "loss": 0.98599035, - "num_input_tokens_seen": 86880040, - "step": 4035, - "time_per_iteration": 2.687131643295288 - }, - { - "auxiliary_loss_clip": 0.01116249, - "auxiliary_loss_mlp": 0.01051012, - "balance_loss_clip": 1.0467937, - "balance_loss_mlp": 1.03419125, - "epoch": 0.24265744776792425, - "flos": 22856639431680.0, - "grad_norm": 1.5931877581057647, - "language_loss": 0.7820307, - "learning_rate": 3.5440656631872715e-06, - "loss": 0.80370331, - "num_input_tokens_seen": 86900610, - "step": 4036, - "time_per_iteration": 2.7576112747192383 - }, - { - "auxiliary_loss_clip": 0.01137826, - "auxiliary_loss_mlp": 0.01049747, - "balance_loss_clip": 1.05010104, - "balance_loss_mlp": 1.03141224, - "epoch": 0.24271757102059222, - "flos": 21871573873920.0, - "grad_norm": 1.6332934168141529, - "language_loss": 0.74266672, - "learning_rate": 3.5438180988250898e-06, - "loss": 0.76454246, - "num_input_tokens_seen": 86919385, - "step": 4037, - "time_per_iteration": 2.7860629558563232 - }, - { - "auxiliary_loss_clip": 0.01100993, - "auxiliary_loss_mlp": 0.01042879, - "balance_loss_clip": 1.04173183, - "balance_loss_mlp": 1.02453303, - "epoch": 0.24277769427326018, - "flos": 19208582340480.0, - "grad_norm": 8.14050816007968, - "language_loss": 0.76632005, - "learning_rate": 3.543570475921171e-06, - "loss": 0.78775871, - "num_input_tokens_seen": 86938885, - "step": 4038, - "time_per_iteration": 2.691695213317871 - }, - { - "auxiliary_loss_clip": 0.01129874, - "auxiliary_loss_mlp": 0.01043604, - "balance_loss_clip": 1.04768467, - "balance_loss_mlp": 1.0249598, - "epoch": 0.24283781752592815, - "flos": 19499889640320.0, - "grad_norm": 3.2334161052349817, - "language_loss": 0.71992457, - "learning_rate": 3.543322794484905e-06, - "loss": 0.7416594, - "num_input_tokens_seen": 86957705, - "step": 4039, - "time_per_iteration": 4.128135442733765 - }, - { - "auxiliary_loss_clip": 0.0112766, - "auxiliary_loss_mlp": 0.01048109, - "balance_loss_clip": 1.04597354, - "balance_loss_mlp": 1.02921474, - "epoch": 0.2428979407785961, - "flos": 19902196944000.0, - "grad_norm": 1.6158763194283545, - "language_loss": 0.78655136, - "learning_rate": 3.5430750545256843e-06, - "loss": 0.80830908, - "num_input_tokens_seen": 86975845, - "step": 4040, - "time_per_iteration": 4.174723863601685 - }, - { - "auxiliary_loss_clip": 0.01090567, - "auxiliary_loss_mlp": 0.01038965, - "balance_loss_clip": 1.04526615, - "balance_loss_mlp": 1.02268124, - "epoch": 0.2429580640312641, - "flos": 24715878284160.0, - "grad_norm": 2.432557236688664, - "language_loss": 0.80599713, - "learning_rate": 3.5428272560529027e-06, - "loss": 0.8272925, - "num_input_tokens_seen": 86994800, - "step": 4041, - "time_per_iteration": 2.7933273315429688 - }, - { - "auxiliary_loss_clip": 0.01108653, - "auxiliary_loss_mlp": 0.01044101, - "balance_loss_clip": 1.04587245, - "balance_loss_mlp": 1.02733982, - "epoch": 0.24301818728393207, - "flos": 25630343660160.0, - "grad_norm": 1.9967913274059828, - "language_loss": 0.76708287, - "learning_rate": 3.542579399075957e-06, - "loss": 0.78861034, - "num_input_tokens_seen": 87016845, - "step": 4042, - "time_per_iteration": 4.336673021316528 - }, - { - "auxiliary_loss_clip": 0.01056541, - "auxiliary_loss_mlp": 0.01035377, - "balance_loss_clip": 1.04354727, - "balance_loss_mlp": 1.01928389, - "epoch": 0.24307831053660003, - "flos": 26141388410880.0, - "grad_norm": 1.8431659047813937, - "language_loss": 0.81232125, - "learning_rate": 3.542331483604246e-06, - "loss": 0.83324039, - "num_input_tokens_seen": 87036270, - "step": 4043, - "time_per_iteration": 2.9156856536865234 - }, - { - "auxiliary_loss_clip": 0.01126576, - "auxiliary_loss_mlp": 0.01038857, - "balance_loss_clip": 1.04610896, - "balance_loss_mlp": 1.02012897, - "epoch": 0.243138433789268, - "flos": 14972415868800.0, - "grad_norm": 2.052349433785912, - "language_loss": 0.73095596, - "learning_rate": 3.5420835096471706e-06, - "loss": 0.75261033, - "num_input_tokens_seen": 87049920, - "step": 4044, - "time_per_iteration": 2.6324286460876465 - }, - { - "auxiliary_loss_clip": 0.0113453, - "auxiliary_loss_mlp": 0.01042417, - "balance_loss_clip": 1.04967666, - "balance_loss_mlp": 1.02445269, - "epoch": 0.24319855704193596, - "flos": 25191694771200.0, - "grad_norm": 1.8848950918191658, - "language_loss": 0.83676481, - "learning_rate": 3.5418354772141337e-06, - "loss": 0.85853434, - "num_input_tokens_seen": 87068230, - "step": 4045, - "time_per_iteration": 2.68994402885437 - }, - { - "auxiliary_loss_clip": 0.010753, - "auxiliary_loss_mlp": 0.01047988, - "balance_loss_clip": 1.04608011, - "balance_loss_mlp": 1.03117943, - "epoch": 0.24325868029460393, - "flos": 22127221946880.0, - "grad_norm": 1.9701839557075844, - "language_loss": 0.86895847, - "learning_rate": 3.541587386314541e-06, - "loss": 0.89019132, - "num_input_tokens_seen": 87086435, - "step": 4046, - "time_per_iteration": 2.908737897872925 - }, - { - "auxiliary_loss_clip": 0.01120714, - "auxiliary_loss_mlp": 0.01038682, - "balance_loss_clip": 1.04705977, - "balance_loss_mlp": 1.02070522, - "epoch": 0.2433188035472719, - "flos": 23582106420480.0, - "grad_norm": 1.8855160425980928, - "language_loss": 0.72759771, - "learning_rate": 3.5413392369578e-06, - "loss": 0.74919164, - "num_input_tokens_seen": 87105340, - "step": 4047, - "time_per_iteration": 4.310218095779419 - }, - { - "auxiliary_loss_clip": 0.01124014, - "auxiliary_loss_mlp": 0.01045256, - "balance_loss_clip": 1.04447186, - "balance_loss_mlp": 1.02637279, - "epoch": 0.2433789267999399, - "flos": 24462815990400.0, - "grad_norm": 2.592486480291502, - "language_loss": 0.73029542, - "learning_rate": 3.5410910291533213e-06, - "loss": 0.75198811, - "num_input_tokens_seen": 87125780, - "step": 4048, - "time_per_iteration": 2.699544668197632 - }, - { - "auxiliary_loss_clip": 0.01112707, - "auxiliary_loss_mlp": 0.01045312, - "balance_loss_clip": 1.04923105, - "balance_loss_mlp": 1.02869391, - "epoch": 0.24343905005260785, - "flos": 16727909264640.0, - "grad_norm": 1.921127999919884, - "language_loss": 0.73616529, - "learning_rate": 3.5408427629105155e-06, - "loss": 0.7577455, - "num_input_tokens_seen": 87144470, - "step": 4049, - "time_per_iteration": 2.6988370418548584 - }, - { - "auxiliary_loss_clip": 0.01093349, - "auxiliary_loss_mlp": 0.01041657, - "balance_loss_clip": 1.04289758, - "balance_loss_mlp": 1.02583802, - "epoch": 0.24349917330527582, - "flos": 20043756443520.0, - "grad_norm": 2.073976648883723, - "language_loss": 0.7377705, - "learning_rate": 3.5405944382387985e-06, - "loss": 0.75912058, - "num_input_tokens_seen": 87162830, - "step": 4050, - "time_per_iteration": 2.718212604522705 - }, - { - "auxiliary_loss_clip": 0.01116995, - "auxiliary_loss_mlp": 0.01043968, - "balance_loss_clip": 1.04518783, - "balance_loss_mlp": 1.02800608, - "epoch": 0.24355929655794378, - "flos": 17420554200960.0, - "grad_norm": 2.361179977901575, - "language_loss": 0.75518602, - "learning_rate": 3.5403460551475854e-06, - "loss": 0.77679563, - "num_input_tokens_seen": 87180905, - "step": 4051, - "time_per_iteration": 2.6522655487060547 - }, - { - "auxiliary_loss_clip": 0.01092567, - "auxiliary_loss_mlp": 0.01042511, - "balance_loss_clip": 1.04197812, - "balance_loss_mlp": 1.02507067, - "epoch": 0.24361941981061175, - "flos": 25410929431680.0, - "grad_norm": 2.2644912923037985, - "language_loss": 0.70717591, - "learning_rate": 3.540097613646296e-06, - "loss": 0.72852671, - "num_input_tokens_seen": 87202290, - "step": 4052, - "time_per_iteration": 2.794059991836548 - }, - { - "auxiliary_loss_clip": 0.0111622, - "auxiliary_loss_mlp": 0.01045494, - "balance_loss_clip": 1.04823005, - "balance_loss_mlp": 1.02833986, - "epoch": 0.2436795430632797, - "flos": 22820800636800.0, - "grad_norm": 1.7022998331113812, - "language_loss": 0.80989587, - "learning_rate": 3.539849113744351e-06, - "loss": 0.83151299, - "num_input_tokens_seen": 87221650, - "step": 4053, - "time_per_iteration": 2.682805299758911 - }, - { - "auxiliary_loss_clip": 0.01148244, - "auxiliary_loss_mlp": 0.01038109, - "balance_loss_clip": 1.05124915, - "balance_loss_mlp": 1.0210743, - "epoch": 0.2437396663159477, - "flos": 15157786982400.0, - "grad_norm": 1.5338885161808513, - "language_loss": 0.77628779, - "learning_rate": 3.539600555451172e-06, - "loss": 0.79815125, - "num_input_tokens_seen": 87238515, - "step": 4054, - "time_per_iteration": 2.635181427001953 - }, - { - "auxiliary_loss_clip": 0.01095192, - "auxiliary_loss_mlp": 0.01055244, - "balance_loss_clip": 1.04067969, - "balance_loss_mlp": 1.03783989, - "epoch": 0.24379978956861567, - "flos": 22091131756800.0, - "grad_norm": 1.8808929031646056, - "language_loss": 0.84398115, - "learning_rate": 3.5393519387761866e-06, - "loss": 0.86548549, - "num_input_tokens_seen": 87256290, - "step": 4055, - "time_per_iteration": 2.757601261138916 - }, - { - "auxiliary_loss_clip": 0.01110063, - "auxiliary_loss_mlp": 0.01045315, - "balance_loss_clip": 1.04298997, - "balance_loss_mlp": 1.02767169, - "epoch": 0.24385991282128364, - "flos": 31467766527360.0, - "grad_norm": 2.5636936013515776, - "language_loss": 0.55038011, - "learning_rate": 3.5391032637288217e-06, - "loss": 0.57193393, - "num_input_tokens_seen": 87277085, - "step": 4056, - "time_per_iteration": 2.7788894176483154 - }, - { - "auxiliary_loss_clip": 0.0113756, - "auxiliary_loss_mlp": 0.01046233, - "balance_loss_clip": 1.04897046, - "balance_loss_mlp": 1.02876842, - "epoch": 0.2439200360739516, - "flos": 23838795987840.0, - "grad_norm": 2.64902132986976, - "language_loss": 0.80583262, - "learning_rate": 3.538854530318506e-06, - "loss": 0.82767057, - "num_input_tokens_seen": 87293020, - "step": 4057, - "time_per_iteration": 2.78110671043396 - }, - { - "auxiliary_loss_clip": 0.01132987, - "auxiliary_loss_mlp": 0.01048497, - "balance_loss_clip": 1.04877245, - "balance_loss_mlp": 1.03145027, - "epoch": 0.24398015932661957, - "flos": 19169978198400.0, - "grad_norm": 1.8133503864036424, - "language_loss": 0.79202968, - "learning_rate": 3.538605738554673e-06, - "loss": 0.81384456, - "num_input_tokens_seen": 87311445, - "step": 4058, - "time_per_iteration": 2.6609115600585938 - }, - { - "auxiliary_loss_clip": 0.01147749, - "auxiliary_loss_mlp": 0.01045059, - "balance_loss_clip": 1.04827118, - "balance_loss_mlp": 1.02920449, - "epoch": 0.24404028257928753, - "flos": 25262474520960.0, - "grad_norm": 3.3482411666646086, - "language_loss": 0.85503888, - "learning_rate": 3.538356888446756e-06, - "loss": 0.87696695, - "num_input_tokens_seen": 87332055, - "step": 4059, - "time_per_iteration": 2.724241256713867 - }, - { - "auxiliary_loss_clip": 0.01126127, - "auxiliary_loss_mlp": 0.01038967, - "balance_loss_clip": 1.04837418, - "balance_loss_mlp": 1.02296889, - "epoch": 0.2441004058319555, - "flos": 26467600752000.0, - "grad_norm": 2.2060888459440617, - "language_loss": 0.7483452, - "learning_rate": 3.5381079800041913e-06, - "loss": 0.76999605, - "num_input_tokens_seen": 87351295, - "step": 4060, - "time_per_iteration": 2.6769304275512695 - }, - { - "auxiliary_loss_clip": 0.01111679, - "auxiliary_loss_mlp": 0.01051445, - "balance_loss_clip": 1.04629493, - "balance_loss_mlp": 1.03247917, - "epoch": 0.2441605290846235, - "flos": 26760524163840.0, - "grad_norm": 2.624850134940939, - "language_loss": 0.73482168, - "learning_rate": 3.5378590132364182e-06, - "loss": 0.75645292, - "num_input_tokens_seen": 87370650, - "step": 4061, - "time_per_iteration": 2.7570559978485107 - }, - { - "auxiliary_loss_clip": 0.01144554, - "auxiliary_loss_mlp": 0.01039707, - "balance_loss_clip": 1.05180097, - "balance_loss_mlp": 1.02394772, - "epoch": 0.24422065233729146, - "flos": 21105850717440.0, - "grad_norm": 4.11905418985837, - "language_loss": 0.76135921, - "learning_rate": 3.5376099881528768e-06, - "loss": 0.78320187, - "num_input_tokens_seen": 87389020, - "step": 4062, - "time_per_iteration": 2.6387689113616943 - }, - { - "auxiliary_loss_clip": 0.01104974, - "auxiliary_loss_mlp": 0.01041222, - "balance_loss_clip": 1.04618907, - "balance_loss_mlp": 1.02458024, - "epoch": 0.24428077558995942, - "flos": 25263156879360.0, - "grad_norm": 2.5628995075758954, - "language_loss": 0.85376853, - "learning_rate": 3.537360904763011e-06, - "loss": 0.87523055, - "num_input_tokens_seen": 87409695, - "step": 4063, - "time_per_iteration": 2.7785301208496094 - }, - { - "auxiliary_loss_clip": 0.01119987, - "auxiliary_loss_mlp": 0.01047158, - "balance_loss_clip": 1.04776239, - "balance_loss_mlp": 1.02789354, - "epoch": 0.24434089884262739, - "flos": 20485278420480.0, - "grad_norm": 2.760332484942286, - "language_loss": 0.6845879, - "learning_rate": 3.5371117630762656e-06, - "loss": 0.70625937, - "num_input_tokens_seen": 87428250, - "step": 4064, - "time_per_iteration": 2.6691763401031494 - }, - { - "auxiliary_loss_clip": 0.01138225, - "auxiliary_loss_mlp": 0.01046639, - "balance_loss_clip": 1.04773867, - "balance_loss_mlp": 1.02892423, - "epoch": 0.24440102209529535, - "flos": 23621895711360.0, - "grad_norm": 1.603702751214229, - "language_loss": 0.70247531, - "learning_rate": 3.536862563102088e-06, - "loss": 0.72432399, - "num_input_tokens_seen": 87449380, - "step": 4065, - "time_per_iteration": 2.6677680015563965 - }, - { - "auxiliary_loss_clip": 0.01150465, - "auxiliary_loss_mlp": 0.0104697, - "balance_loss_clip": 1.05127215, - "balance_loss_mlp": 1.02803993, - "epoch": 0.24446114534796332, - "flos": 20554729367040.0, - "grad_norm": 1.788543447431289, - "language_loss": 0.84282506, - "learning_rate": 3.5366133048499282e-06, - "loss": 0.86479944, - "num_input_tokens_seen": 87465365, - "step": 4066, - "time_per_iteration": 2.5993456840515137 - }, - { - "auxiliary_loss_clip": 0.01067736, - "auxiliary_loss_mlp": 0.01002523, - "balance_loss_clip": 1.03198457, - "balance_loss_mlp": 1.00028193, - "epoch": 0.24452126860063128, - "flos": 60389575009920.0, - "grad_norm": 0.7359455307187547, - "language_loss": 0.52283657, - "learning_rate": 3.5363639883292374e-06, - "loss": 0.54353911, - "num_input_tokens_seen": 87522525, - "step": 4067, - "time_per_iteration": 3.056666374206543 - }, - { - "auxiliary_loss_clip": 0.01123042, - "auxiliary_loss_mlp": 0.01045731, - "balance_loss_clip": 1.04955244, - "balance_loss_mlp": 1.0279212, - "epoch": 0.24458139185329927, - "flos": 15121660878720.0, - "grad_norm": 2.6392300526537493, - "language_loss": 0.7185899, - "learning_rate": 3.5361146135494706e-06, - "loss": 0.74027765, - "num_input_tokens_seen": 87539170, - "step": 4068, - "time_per_iteration": 2.700847864151001 - }, - { - "auxiliary_loss_clip": 0.01086004, - "auxiliary_loss_mlp": 0.01047493, - "balance_loss_clip": 1.04378593, - "balance_loss_mlp": 1.02920675, - "epoch": 0.24464151510596724, - "flos": 27998723842560.0, - "grad_norm": 2.4202919064349744, - "language_loss": 0.78083313, - "learning_rate": 3.5358651805200835e-06, - "loss": 0.80216813, - "num_input_tokens_seen": 87558875, - "step": 4069, - "time_per_iteration": 2.9363162517547607 - }, - { - "auxiliary_loss_clip": 0.01119666, - "auxiliary_loss_mlp": 0.0105204, - "balance_loss_clip": 1.05164659, - "balance_loss_mlp": 1.03445613, - "epoch": 0.2447016383586352, - "flos": 19792884879360.0, - "grad_norm": 4.167143793475273, - "language_loss": 0.80607939, - "learning_rate": 3.5356156892505347e-06, - "loss": 0.82779646, - "num_input_tokens_seen": 87576485, - "step": 4070, - "time_per_iteration": 2.658191204071045 - }, - { - "auxiliary_loss_clip": 0.01127014, - "auxiliary_loss_mlp": 0.01049283, - "balance_loss_clip": 1.04832387, - "balance_loss_mlp": 1.03218853, - "epoch": 0.24476176161130317, - "flos": 26067340523520.0, - "grad_norm": 1.5316441932107319, - "language_loss": 0.84351504, - "learning_rate": 3.5353661397502854e-06, - "loss": 0.86527801, - "num_input_tokens_seen": 87598620, - "step": 4071, - "time_per_iteration": 2.7118849754333496 - }, - { - "auxiliary_loss_clip": 0.01120333, - "auxiliary_loss_mlp": 0.01057334, - "balance_loss_clip": 1.04778695, - "balance_loss_mlp": 1.03601933, - "epoch": 0.24482188486397113, - "flos": 18843550375680.0, - "grad_norm": 1.8860726044388547, - "language_loss": 0.80115497, - "learning_rate": 3.535116532028798e-06, - "loss": 0.82293165, - "num_input_tokens_seen": 87616595, - "step": 4072, - "time_per_iteration": 2.6662774085998535 - }, - { - "auxiliary_loss_clip": 0.01134806, - "auxiliary_loss_mlp": 0.0104215, - "balance_loss_clip": 1.05156791, - "balance_loss_mlp": 1.02614021, - "epoch": 0.2448820081166391, - "flos": 21251791676160.0, - "grad_norm": 3.990887653020168, - "language_loss": 0.70466423, - "learning_rate": 3.5348668660955382e-06, - "loss": 0.72643375, - "num_input_tokens_seen": 87635755, - "step": 4073, - "time_per_iteration": 2.7366209030151367 - }, - { - "auxiliary_loss_clip": 0.01110472, - "auxiliary_loss_mlp": 0.01047265, - "balance_loss_clip": 1.04666865, - "balance_loss_mlp": 1.03090906, - "epoch": 0.2449421313693071, - "flos": 23950586090880.0, - "grad_norm": 2.943884117668681, - "language_loss": 0.67292917, - "learning_rate": 3.5346171419599728e-06, - "loss": 0.69450659, - "num_input_tokens_seen": 87652885, - "step": 4074, - "time_per_iteration": 2.7158730030059814 - }, - { - "auxiliary_loss_clip": 0.01062567, - "auxiliary_loss_mlp": 0.01002121, - "balance_loss_clip": 1.02741885, - "balance_loss_mlp": 0.99986744, - "epoch": 0.24500225462197506, - "flos": 60687669980160.0, - "grad_norm": 0.8927046346070237, - "language_loss": 0.68608266, - "learning_rate": 3.5343673596315718e-06, - "loss": 0.70672953, - "num_input_tokens_seen": 87713220, - "step": 4075, - "time_per_iteration": 3.2283740043640137 - }, - { - "auxiliary_loss_clip": 0.01146172, - "auxiliary_loss_mlp": 0.01042507, - "balance_loss_clip": 1.05287361, - "balance_loss_mlp": 1.02612722, - "epoch": 0.24506237787464302, - "flos": 26284204886400.0, - "grad_norm": 2.3370219869490563, - "language_loss": 0.79263043, - "learning_rate": 3.5341175191198063e-06, - "loss": 0.81451714, - "num_input_tokens_seen": 87732680, - "step": 4076, - "time_per_iteration": 2.6744346618652344 - }, - { - "auxiliary_loss_clip": 0.01128421, - "auxiliary_loss_mlp": 0.00775989, - "balance_loss_clip": 1.04903293, - "balance_loss_mlp": 1.001266, - "epoch": 0.245122501127311, - "flos": 20552287242240.0, - "grad_norm": 1.816414447330212, - "language_loss": 0.81986046, - "learning_rate": 3.533867620434151e-06, - "loss": 0.83890456, - "num_input_tokens_seen": 87751880, - "step": 4077, - "time_per_iteration": 2.729391098022461 - }, - { - "auxiliary_loss_clip": 0.01148302, - "auxiliary_loss_mlp": 0.01047154, - "balance_loss_clip": 1.05185413, - "balance_loss_mlp": 1.0288794, - "epoch": 0.24518262437997895, - "flos": 29132603447040.0, - "grad_norm": 2.0328430965985045, - "language_loss": 0.62790757, - "learning_rate": 3.533617663584082e-06, - "loss": 0.64986217, - "num_input_tokens_seen": 87771795, - "step": 4078, - "time_per_iteration": 2.694767713546753 - }, - { - "auxiliary_loss_clip": 0.01114498, - "auxiliary_loss_mlp": 0.01039203, - "balance_loss_clip": 1.04953861, - "balance_loss_mlp": 1.02270436, - "epoch": 0.24524274763264692, - "flos": 23476924419840.0, - "grad_norm": 1.5687748074794818, - "language_loss": 0.75811553, - "learning_rate": 3.5333676485790765e-06, - "loss": 0.7796526, - "num_input_tokens_seen": 87793640, - "step": 4079, - "time_per_iteration": 4.288895130157471 - }, - { - "auxiliary_loss_clip": 0.01142871, - "auxiliary_loss_mlp": 0.01047138, - "balance_loss_clip": 1.04899406, - "balance_loss_mlp": 1.02955461, - "epoch": 0.24530287088531488, - "flos": 17201175886080.0, - "grad_norm": 1.8811380892336844, - "language_loss": 0.74537313, - "learning_rate": 3.5331175754286173e-06, - "loss": 0.76727325, - "num_input_tokens_seen": 87812390, - "step": 4080, - "time_per_iteration": 2.683969736099243 - }, - { - "auxiliary_loss_clip": 0.01115604, - "auxiliary_loss_mlp": 0.01041593, - "balance_loss_clip": 1.04717278, - "balance_loss_mlp": 1.02558291, - "epoch": 0.24536299413798288, - "flos": 14867449349760.0, - "grad_norm": 2.2859558621761997, - "language_loss": 0.83389306, - "learning_rate": 3.532867444142186e-06, - "loss": 0.85546505, - "num_input_tokens_seen": 87830640, - "step": 4081, - "time_per_iteration": 2.772573947906494 - }, - { - "auxiliary_loss_clip": 0.01114607, - "auxiliary_loss_mlp": 0.01040674, - "balance_loss_clip": 1.04734826, - "balance_loss_mlp": 1.02473605, - "epoch": 0.24542311739065084, - "flos": 35262051886080.0, - "grad_norm": 1.8658741711896472, - "language_loss": 0.73223484, - "learning_rate": 3.532617254729267e-06, - "loss": 0.7537877, - "num_input_tokens_seen": 87850450, - "step": 4082, - "time_per_iteration": 4.3304970264434814 - }, - { - "auxiliary_loss_clip": 0.01104397, - "auxiliary_loss_mlp": 0.01047151, - "balance_loss_clip": 1.04542649, - "balance_loss_mlp": 1.03163004, - "epoch": 0.2454832406433188, - "flos": 21503130117120.0, - "grad_norm": 1.7143564189307843, - "language_loss": 0.72032338, - "learning_rate": 3.5323670071993485e-06, - "loss": 0.74183893, - "num_input_tokens_seen": 87868810, - "step": 4083, - "time_per_iteration": 2.7463390827178955 - }, - { - "auxiliary_loss_clip": 0.01115479, - "auxiliary_loss_mlp": 0.01048832, - "balance_loss_clip": 1.04441845, - "balance_loss_mlp": 1.02979386, - "epoch": 0.24554336389598677, - "flos": 14756664827520.0, - "grad_norm": 2.556114612666859, - "language_loss": 0.74363655, - "learning_rate": 3.532116701561919e-06, - "loss": 0.76527965, - "num_input_tokens_seen": 87885685, - "step": 4084, - "time_per_iteration": 2.6828086376190186 - }, - { - "auxiliary_loss_clip": 0.01126215, - "auxiliary_loss_mlp": 0.01040078, - "balance_loss_clip": 1.04541206, - "balance_loss_mlp": 1.02269721, - "epoch": 0.24560348714865474, - "flos": 14976402278400.0, - "grad_norm": 2.030442784512354, - "language_loss": 0.85540497, - "learning_rate": 3.531866337826471e-06, - "loss": 0.87706792, - "num_input_tokens_seen": 87903715, - "step": 4085, - "time_per_iteration": 4.236302852630615 - }, - { - "auxiliary_loss_clip": 0.01110493, - "auxiliary_loss_mlp": 0.01046501, - "balance_loss_clip": 1.04634261, - "balance_loss_mlp": 1.02932286, - "epoch": 0.2456636104013227, - "flos": 22675326554880.0, - "grad_norm": 2.028282258660301, - "language_loss": 0.78985649, - "learning_rate": 3.5316159160024982e-06, - "loss": 0.8114264, - "num_input_tokens_seen": 87923375, - "step": 4086, - "time_per_iteration": 2.6638717651367188 - }, - { - "auxiliary_loss_clip": 0.01087456, - "auxiliary_loss_mlp": 0.0104508, - "balance_loss_clip": 1.04792905, - "balance_loss_mlp": 1.02847362, - "epoch": 0.2457237336539907, - "flos": 27417869009280.0, - "grad_norm": 5.7080500305845865, - "language_loss": 0.75053227, - "learning_rate": 3.531365436099496e-06, - "loss": 0.77185762, - "num_input_tokens_seen": 87943115, - "step": 4087, - "time_per_iteration": 2.8027901649475098 - }, - { - "auxiliary_loss_clip": 0.01090549, - "auxiliary_loss_mlp": 0.01045493, - "balance_loss_clip": 1.04807436, - "balance_loss_mlp": 1.02680135, - "epoch": 0.24578385690665866, - "flos": 20412379768320.0, - "grad_norm": 2.066557704160291, - "language_loss": 0.79291761, - "learning_rate": 3.5311148981269635e-06, - "loss": 0.81427807, - "num_input_tokens_seen": 87959505, - "step": 4088, - "time_per_iteration": 2.78812575340271 - }, - { - "auxiliary_loss_clip": 0.0110062, - "auxiliary_loss_mlp": 0.01035541, - "balance_loss_clip": 1.04435658, - "balance_loss_mlp": 1.01949525, - "epoch": 0.24584398015932662, - "flos": 23915393740800.0, - "grad_norm": 1.4918864539426413, - "language_loss": 0.77053773, - "learning_rate": 3.5308643020944e-06, - "loss": 0.79189926, - "num_input_tokens_seen": 87979725, - "step": 4089, - "time_per_iteration": 2.75034761428833 - }, - { - "auxiliary_loss_clip": 0.01125156, - "auxiliary_loss_mlp": 0.0104201, - "balance_loss_clip": 1.04609382, - "balance_loss_mlp": 1.02470064, - "epoch": 0.2459041034119946, - "flos": 41496359103360.0, - "grad_norm": 2.3383647352821737, - "language_loss": 0.81814516, - "learning_rate": 3.530613648011309e-06, - "loss": 0.83981681, - "num_input_tokens_seen": 87998270, - "step": 4090, - "time_per_iteration": 2.891878604888916 - }, - { - "auxiliary_loss_clip": 0.01121872, - "auxiliary_loss_mlp": 0.01050145, - "balance_loss_clip": 1.04687834, - "balance_loss_mlp": 1.03163147, - "epoch": 0.24596422666466256, - "flos": 19936814676480.0, - "grad_norm": 1.8221600402702927, - "language_loss": 0.73833978, - "learning_rate": 3.5303629358871946e-06, - "loss": 0.76005995, - "num_input_tokens_seen": 88016760, - "step": 4091, - "time_per_iteration": 2.6410961151123047 - }, - { - "auxiliary_loss_clip": 0.01114038, - "auxiliary_loss_mlp": 0.01045509, - "balance_loss_clip": 1.05517268, - "balance_loss_mlp": 1.0279969, - "epoch": 0.24602434991733052, - "flos": 21544391865600.0, - "grad_norm": 1.8983812190731213, - "language_loss": 0.7706998, - "learning_rate": 3.5301121657315653e-06, - "loss": 0.79229522, - "num_input_tokens_seen": 88036465, - "step": 4092, - "time_per_iteration": 2.7038323879241943 - }, - { - "auxiliary_loss_clip": 0.01115501, - "auxiliary_loss_mlp": 0.01040797, - "balance_loss_clip": 1.04371238, - "balance_loss_mlp": 1.02255797, - "epoch": 0.24608447316999849, - "flos": 23185078416000.0, - "grad_norm": 3.1365051823944627, - "language_loss": 0.81200075, - "learning_rate": 3.5298613375539287e-06, - "loss": 0.83356375, - "num_input_tokens_seen": 88053270, - "step": 4093, - "time_per_iteration": 2.680634021759033 - }, - { - "auxiliary_loss_clip": 0.01135527, - "auxiliary_loss_mlp": 0.01043826, - "balance_loss_clip": 1.04879606, - "balance_loss_mlp": 1.02613521, - "epoch": 0.24614459642266648, - "flos": 19641951930240.0, - "grad_norm": 1.9167765067224862, - "language_loss": 0.86932534, - "learning_rate": 3.529610451363797e-06, - "loss": 0.89111882, - "num_input_tokens_seen": 88072305, - "step": 4094, - "time_per_iteration": 2.6558003425598145 - }, - { - "auxiliary_loss_clip": 0.01007267, - "auxiliary_loss_mlp": 0.01019789, - "balance_loss_clip": 1.03124738, - "balance_loss_mlp": 1.01697576, - "epoch": 0.24620471967533444, - "flos": 61739816186880.0, - "grad_norm": 0.7554163750993251, - "language_loss": 0.57503664, - "learning_rate": 3.5293595071706833e-06, - "loss": 0.59530711, - "num_input_tokens_seen": 88137995, - "step": 4095, - "time_per_iteration": 3.3576478958129883 - }, - { - "auxiliary_loss_clip": 0.01051219, - "auxiliary_loss_mlp": 0.0102022, - "balance_loss_clip": 1.03409493, - "balance_loss_mlp": 1.01790738, - "epoch": 0.2462648429280024, - "flos": 69154436315520.0, - "grad_norm": 0.655284075812517, - "language_loss": 0.56260574, - "learning_rate": 3.5291085049841042e-06, - "loss": 0.58332014, - "num_input_tokens_seen": 88208490, - "step": 4096, - "time_per_iteration": 3.376516580581665 - }, - { - "auxiliary_loss_clip": 0.0112712, - "auxiliary_loss_mlp": 0.01040362, - "balance_loss_clip": 1.05330801, - "balance_loss_mlp": 1.0236733, - "epoch": 0.24632496618067037, - "flos": 29459605887360.0, - "grad_norm": 1.7306008966026363, - "language_loss": 0.77629399, - "learning_rate": 3.5288574448135773e-06, - "loss": 0.79796875, - "num_input_tokens_seen": 88228050, - "step": 4097, - "time_per_iteration": 2.6973912715911865 - }, - { - "auxiliary_loss_clip": 0.01114293, - "auxiliary_loss_mlp": 0.01047339, - "balance_loss_clip": 1.04898906, - "balance_loss_mlp": 1.02842093, - "epoch": 0.24638508943333834, - "flos": 24316444068480.0, - "grad_norm": 2.4079595240953613, - "language_loss": 0.75890571, - "learning_rate": 3.5286063266686235e-06, - "loss": 0.78052205, - "num_input_tokens_seen": 88248090, - "step": 4098, - "time_per_iteration": 2.739947557449341 - }, - { - "auxiliary_loss_clip": 0.0112794, - "auxiliary_loss_mlp": 0.01046194, - "balance_loss_clip": 1.05179596, - "balance_loss_mlp": 1.03002954, - "epoch": 0.2464452126860063, - "flos": 26613254401920.0, - "grad_norm": 2.5671853201902737, - "language_loss": 0.68179071, - "learning_rate": 3.528355150558764e-06, - "loss": 0.7035321, - "num_input_tokens_seen": 88267545, - "step": 4099, - "time_per_iteration": 2.7144618034362793 - }, - { - "auxiliary_loss_clip": 0.01133513, - "auxiliary_loss_mlp": 0.01045673, - "balance_loss_clip": 1.05187321, - "balance_loss_mlp": 1.02897191, - "epoch": 0.24650533593867427, - "flos": 31212405763200.0, - "grad_norm": 2.0343787496625656, - "language_loss": 0.65915, - "learning_rate": 3.5281039164935237e-06, - "loss": 0.68094188, - "num_input_tokens_seen": 88289785, - "step": 4100, - "time_per_iteration": 2.724008560180664 - }, - { - "auxiliary_loss_clip": 0.01054067, - "auxiliary_loss_mlp": 0.01041004, - "balance_loss_clip": 1.03763318, - "balance_loss_mlp": 1.03830957, - "epoch": 0.24656545919134226, - "flos": 68494002900480.0, - "grad_norm": 0.7229502883874133, - "language_loss": 0.61514676, - "learning_rate": 3.5278526244824304e-06, - "loss": 0.63609749, - "num_input_tokens_seen": 88357320, - "step": 4101, - "time_per_iteration": 3.3748011589050293 - }, - { - "auxiliary_loss_clip": 0.01144305, - "auxiliary_loss_mlp": 0.01041937, - "balance_loss_clip": 1.05133915, - "balance_loss_mlp": 1.02455676, - "epoch": 0.24662558244401023, - "flos": 20084192179200.0, - "grad_norm": 2.2333045722985028, - "language_loss": 0.73272061, - "learning_rate": 3.527601274535012e-06, - "loss": 0.754583, - "num_input_tokens_seen": 88377040, - "step": 4102, - "time_per_iteration": 2.7457518577575684 - }, - { - "auxiliary_loss_clip": 0.01124231, - "auxiliary_loss_mlp": 0.01043636, - "balance_loss_clip": 1.04909408, - "balance_loss_mlp": 1.02699423, - "epoch": 0.2466857056966782, - "flos": 30701361012480.0, - "grad_norm": 2.9311552217427774, - "language_loss": 0.76528364, - "learning_rate": 3.5273498666608004e-06, - "loss": 0.78696227, - "num_input_tokens_seen": 88395085, - "step": 4103, - "time_per_iteration": 2.732285499572754 - }, - { - "auxiliary_loss_clip": 0.01128751, - "auxiliary_loss_mlp": 0.01051695, - "balance_loss_clip": 1.04730439, - "balance_loss_mlp": 1.03313375, - "epoch": 0.24674582894934616, - "flos": 22528523669760.0, - "grad_norm": 2.3173933836652902, - "language_loss": 0.78658336, - "learning_rate": 3.5270984008693288e-06, - "loss": 0.80838788, - "num_input_tokens_seen": 88413205, - "step": 4104, - "time_per_iteration": 2.7234179973602295 - }, - { - "auxiliary_loss_clip": 0.01134641, - "auxiliary_loss_mlp": 0.01045411, - "balance_loss_clip": 1.05110276, - "balance_loss_mlp": 1.02601588, - "epoch": 0.24680595220201412, - "flos": 20704297599360.0, - "grad_norm": 1.883953093480743, - "language_loss": 0.8375451, - "learning_rate": 3.526846877170133e-06, - "loss": 0.85934561, - "num_input_tokens_seen": 88431525, - "step": 4105, - "time_per_iteration": 2.7051403522491455 - }, - { - "auxiliary_loss_clip": 0.01149885, - "auxiliary_loss_mlp": 0.01051204, - "balance_loss_clip": 1.05490828, - "balance_loss_mlp": 1.03340602, - "epoch": 0.2468660754546821, - "flos": 21831174051840.0, - "grad_norm": 1.9903096770852142, - "language_loss": 0.76503521, - "learning_rate": 3.52659529557275e-06, - "loss": 0.78704607, - "num_input_tokens_seen": 88451210, - "step": 4106, - "time_per_iteration": 2.6324243545532227 - }, - { - "auxiliary_loss_clip": 0.01107346, - "auxiliary_loss_mlp": 0.01058334, - "balance_loss_clip": 1.0438261, - "balance_loss_mlp": 1.03743649, - "epoch": 0.24692619870735008, - "flos": 15267709578240.0, - "grad_norm": 2.3469304270549487, - "language_loss": 0.72399199, - "learning_rate": 3.5263436560867205e-06, - "loss": 0.74564874, - "num_input_tokens_seen": 88467790, - "step": 4107, - "time_per_iteration": 2.6767516136169434 - }, - { - "auxiliary_loss_clip": 0.01149014, - "auxiliary_loss_mlp": 0.01055902, - "balance_loss_clip": 1.05365527, - "balance_loss_mlp": 1.03840184, - "epoch": 0.24698632196001805, - "flos": 29680097523840.0, - "grad_norm": 2.655550859638868, - "language_loss": 0.65495557, - "learning_rate": 3.526091958721587e-06, - "loss": 0.67700469, - "num_input_tokens_seen": 88490330, - "step": 4108, - "time_per_iteration": 2.666501760482788 - }, - { - "auxiliary_loss_clip": 0.01095567, - "auxiliary_loss_mlp": 0.01053352, - "balance_loss_clip": 1.04577923, - "balance_loss_mlp": 1.0351851, - "epoch": 0.247046445212686, - "flos": 39165469741440.0, - "grad_norm": 1.631565192024798, - "language_loss": 0.72685403, - "learning_rate": 3.5258402034868936e-06, - "loss": 0.74834323, - "num_input_tokens_seen": 88512435, - "step": 4109, - "time_per_iteration": 2.8588712215423584 - }, - { - "auxiliary_loss_clip": 0.01110552, - "auxiliary_loss_mlp": 0.01048877, - "balance_loss_clip": 1.04754984, - "balance_loss_mlp": 1.03132939, - "epoch": 0.24710656846535398, - "flos": 22998845376000.0, - "grad_norm": 1.9000447272053396, - "language_loss": 0.79328829, - "learning_rate": 3.5255883903921866e-06, - "loss": 0.81488264, - "num_input_tokens_seen": 88529780, - "step": 4110, - "time_per_iteration": 2.7403078079223633 - }, - { - "auxiliary_loss_clip": 0.01114435, - "auxiliary_loss_mlp": 0.0104359, - "balance_loss_clip": 1.04750848, - "balance_loss_mlp": 1.02536333, - "epoch": 0.24716669171802194, - "flos": 26432803451520.0, - "grad_norm": 1.9757162932013852, - "language_loss": 0.80630267, - "learning_rate": 3.5253365194470144e-06, - "loss": 0.82788301, - "num_input_tokens_seen": 88547200, - "step": 4111, - "time_per_iteration": 2.6893255710601807 - }, - { - "auxiliary_loss_clip": 0.01143907, - "auxiliary_loss_mlp": 0.0104799, - "balance_loss_clip": 1.0493356, - "balance_loss_mlp": 1.03203976, - "epoch": 0.2472268149706899, - "flos": 23329870139520.0, - "grad_norm": 1.928179444788623, - "language_loss": 0.75401616, - "learning_rate": 3.5250845906609294e-06, - "loss": 0.77593511, - "num_input_tokens_seen": 88566415, - "step": 4112, - "time_per_iteration": 2.641103506088257 - }, - { - "auxiliary_loss_clip": 0.01112249, - "auxiliary_loss_mlp": 0.00775958, - "balance_loss_clip": 1.04847336, - "balance_loss_mlp": 1.00114262, - "epoch": 0.24728693822335787, - "flos": 23768734510080.0, - "grad_norm": 2.1227710866712908, - "language_loss": 0.8244158, - "learning_rate": 3.5248326040434835e-06, - "loss": 0.84329784, - "num_input_tokens_seen": 88585225, - "step": 4113, - "time_per_iteration": 2.831209182739258 - }, - { - "auxiliary_loss_clip": 0.01143893, - "auxiliary_loss_mlp": 0.01043423, - "balance_loss_clip": 1.04927897, - "balance_loss_mlp": 1.02574396, - "epoch": 0.24734706147602586, - "flos": 19317499355520.0, - "grad_norm": 2.5263325514304813, - "language_loss": 0.8704375, - "learning_rate": 3.5245805596042322e-06, - "loss": 0.89231074, - "num_input_tokens_seen": 88603280, - "step": 4114, - "time_per_iteration": 2.7264626026153564 - }, - { - "auxiliary_loss_clip": 0.01096969, - "auxiliary_loss_mlp": 0.01047533, - "balance_loss_clip": 1.04748011, - "balance_loss_mlp": 1.03005731, - "epoch": 0.24740718472869383, - "flos": 28036932935040.0, - "grad_norm": 1.6498261942323098, - "language_loss": 0.75283766, - "learning_rate": 3.524328457352734e-06, - "loss": 0.77428269, - "num_input_tokens_seen": 88624925, - "step": 4115, - "time_per_iteration": 2.755342483520508 - }, - { - "auxiliary_loss_clip": 0.01018711, - "auxiliary_loss_mlp": 0.01070163, - "balance_loss_clip": 1.03186083, - "balance_loss_mlp": 1.06756425, - "epoch": 0.2474673079813618, - "flos": 68107569408000.0, - "grad_norm": 0.6879904854197085, - "language_loss": 0.58123159, - "learning_rate": 3.5240762972985475e-06, - "loss": 0.60212028, - "num_input_tokens_seen": 88691475, - "step": 4116, - "time_per_iteration": 3.4015462398529053 - }, - { - "auxiliary_loss_clip": 0.01122111, - "auxiliary_loss_mlp": 0.01038886, - "balance_loss_clip": 1.04813063, - "balance_loss_mlp": 1.02213693, - "epoch": 0.24752743123402976, - "flos": 29462119839360.0, - "grad_norm": 19.427234883564427, - "language_loss": 0.83599627, - "learning_rate": 3.523824079451235e-06, - "loss": 0.85760617, - "num_input_tokens_seen": 88713425, - "step": 4117, - "time_per_iteration": 2.7881336212158203 - }, - { - "auxiliary_loss_clip": 0.01041379, - "auxiliary_loss_mlp": 0.00755386, - "balance_loss_clip": 1.02616835, - "balance_loss_mlp": 1.0023396, - "epoch": 0.24758755448669773, - "flos": 58350459824640.0, - "grad_norm": 0.909523411860611, - "language_loss": 0.63518536, - "learning_rate": 3.5235718038203602e-06, - "loss": 0.65315294, - "num_input_tokens_seen": 88769995, - "step": 4118, - "time_per_iteration": 3.1125216484069824 - }, - { - "auxiliary_loss_clip": 0.01126335, - "auxiliary_loss_mlp": 0.01048787, - "balance_loss_clip": 1.04487431, - "balance_loss_mlp": 1.03127515, - "epoch": 0.2476476777393657, - "flos": 20484416494080.0, - "grad_norm": 2.1708029437062546, - "language_loss": 0.79272264, - "learning_rate": 3.523319470415491e-06, - "loss": 0.81447387, - "num_input_tokens_seen": 88789970, - "step": 4119, - "time_per_iteration": 6.294121503829956 - }, - { - "auxiliary_loss_clip": 0.01133521, - "auxiliary_loss_mlp": 0.01044138, - "balance_loss_clip": 1.05223441, - "balance_loss_mlp": 1.02707899, - "epoch": 0.24770780099203366, - "flos": 20485853038080.0, - "grad_norm": 1.7395275513138477, - "language_loss": 0.74590164, - "learning_rate": 3.5230670792461943e-06, - "loss": 0.76767826, - "num_input_tokens_seen": 88810000, - "step": 4120, - "time_per_iteration": 2.6947290897369385 - }, - { - "auxiliary_loss_clip": 0.01135162, - "auxiliary_loss_mlp": 0.01051636, - "balance_loss_clip": 1.04963648, - "balance_loss_mlp": 1.03435111, - "epoch": 0.24776792424470165, - "flos": 15153405523200.0, - "grad_norm": 3.32651820696464, - "language_loss": 0.88006538, - "learning_rate": 3.522814630322041e-06, - "loss": 0.90193337, - "num_input_tokens_seen": 88827515, - "step": 4121, - "time_per_iteration": 4.181556224822998 - }, - { - "auxiliary_loss_clip": 0.01147178, - "auxiliary_loss_mlp": 0.01042601, - "balance_loss_clip": 1.05039763, - "balance_loss_mlp": 1.02431381, - "epoch": 0.2478280474973696, - "flos": 21725453347200.0, - "grad_norm": 2.0457274986343204, - "language_loss": 0.69676709, - "learning_rate": 3.5225621236526045e-06, - "loss": 0.71866482, - "num_input_tokens_seen": 88845025, - "step": 4122, - "time_per_iteration": 2.7041239738464355 - }, - { - "auxiliary_loss_clip": 0.01147132, - "auxiliary_loss_mlp": 0.01045532, - "balance_loss_clip": 1.05045271, - "balance_loss_mlp": 1.02655339, - "epoch": 0.24788817075003758, - "flos": 20412200200320.0, - "grad_norm": 2.4058135017179976, - "language_loss": 0.8026911, - "learning_rate": 3.5223095592474596e-06, - "loss": 0.82461774, - "num_input_tokens_seen": 88861740, - "step": 4123, - "time_per_iteration": 2.6154532432556152 - }, - { - "auxiliary_loss_clip": 0.01085408, - "auxiliary_loss_mlp": 0.0105298, - "balance_loss_clip": 1.04720712, - "balance_loss_mlp": 1.0354923, - "epoch": 0.24794829400270554, - "flos": 22594455083520.0, - "grad_norm": 2.2195758993023578, - "language_loss": 0.74967635, - "learning_rate": 3.5220569371161846e-06, - "loss": 0.77106017, - "num_input_tokens_seen": 88879740, - "step": 4124, - "time_per_iteration": 2.787986993789673 - }, - { - "auxiliary_loss_clip": 0.01131947, - "auxiliary_loss_mlp": 0.01044392, - "balance_loss_clip": 1.04892588, - "balance_loss_mlp": 1.02809608, - "epoch": 0.2480084172553735, - "flos": 39676047615360.0, - "grad_norm": 1.4128536066198873, - "language_loss": 0.73432529, - "learning_rate": 3.521804257268357e-06, - "loss": 0.75608873, - "num_input_tokens_seen": 88904095, - "step": 4125, - "time_per_iteration": 4.472416162490845 - }, - { - "auxiliary_loss_clip": 0.01109646, - "auxiliary_loss_mlp": 0.00776697, - "balance_loss_clip": 1.04420686, - "balance_loss_mlp": 1.00122678, - "epoch": 0.24806854050804147, - "flos": 22053712763520.0, - "grad_norm": 1.9607758383710057, - "language_loss": 0.69630861, - "learning_rate": 3.5215515197135595e-06, - "loss": 0.71517205, - "num_input_tokens_seen": 88920740, - "step": 4126, - "time_per_iteration": 2.7412056922912598 - }, - { - "auxiliary_loss_clip": 0.01133758, - "auxiliary_loss_mlp": 0.01051914, - "balance_loss_clip": 1.047984, - "balance_loss_mlp": 1.03331721, - "epoch": 0.24812866376070947, - "flos": 15486764670720.0, - "grad_norm": 2.275786464609162, - "language_loss": 0.81219494, - "learning_rate": 3.5212987244613764e-06, - "loss": 0.83405173, - "num_input_tokens_seen": 88938510, - "step": 4127, - "time_per_iteration": 2.620143413543701 - }, - { - "auxiliary_loss_clip": 0.01136685, - "auxiliary_loss_mlp": 0.00775421, - "balance_loss_clip": 1.04974318, - "balance_loss_mlp": 1.00120401, - "epoch": 0.24818878701337743, - "flos": 14757419013120.0, - "grad_norm": 6.503475382998669, - "language_loss": 0.8435086, - "learning_rate": 3.5210458715213927e-06, - "loss": 0.86262965, - "num_input_tokens_seen": 88955235, - "step": 4128, - "time_per_iteration": 2.6764745712280273 - }, - { - "auxiliary_loss_clip": 0.01117625, - "auxiliary_loss_mlp": 0.01057179, - "balance_loss_clip": 1.04831362, - "balance_loss_mlp": 1.03814149, - "epoch": 0.2482489102660454, - "flos": 27089501852160.0, - "grad_norm": 7.318299516736359, - "language_loss": 0.6572547, - "learning_rate": 3.5207929609031973e-06, - "loss": 0.67900276, - "num_input_tokens_seen": 88975210, - "step": 4129, - "time_per_iteration": 2.7178256511688232 - }, - { - "auxiliary_loss_clip": 0.01098796, - "auxiliary_loss_mlp": 0.01044421, - "balance_loss_clip": 1.04595077, - "balance_loss_mlp": 1.02570498, - "epoch": 0.24830903351871336, - "flos": 26467528924800.0, - "grad_norm": 1.8507928533331595, - "language_loss": 0.7496134, - "learning_rate": 3.5205399926163806e-06, - "loss": 0.77104557, - "num_input_tokens_seen": 88996120, - "step": 4130, - "time_per_iteration": 2.82098126411438 - }, - { - "auxiliary_loss_clip": 0.01078173, - "auxiliary_loss_mlp": 0.01050295, - "balance_loss_clip": 1.04238284, - "balance_loss_mlp": 1.03163934, - "epoch": 0.24836915677138133, - "flos": 10228436870400.0, - "grad_norm": 2.098795320061471, - "language_loss": 0.7680133, - "learning_rate": 3.520286966670535e-06, - "loss": 0.78929794, - "num_input_tokens_seen": 89008685, - "step": 4131, - "time_per_iteration": 2.7543740272521973 - }, - { - "auxiliary_loss_clip": 0.0113176, - "auxiliary_loss_mlp": 0.0104424, - "balance_loss_clip": 1.04992545, - "balance_loss_mlp": 1.02781272, - "epoch": 0.2484292800240493, - "flos": 30080429579520.0, - "grad_norm": 2.181098565661814, - "language_loss": 0.83579504, - "learning_rate": 3.520033883075255e-06, - "loss": 0.85755503, - "num_input_tokens_seen": 89031160, - "step": 4132, - "time_per_iteration": 2.681339979171753 - }, - { - "auxiliary_loss_clip": 0.01120332, - "auxiliary_loss_mlp": 0.01043901, - "balance_loss_clip": 1.04574823, - "balance_loss_mlp": 1.02506626, - "epoch": 0.24848940327671726, - "flos": 13442944803840.0, - "grad_norm": 1.8557605687682572, - "language_loss": 0.71320271, - "learning_rate": 3.5197807418401386e-06, - "loss": 0.73484504, - "num_input_tokens_seen": 89047235, - "step": 4133, - "time_per_iteration": 2.6573541164398193 - }, - { - "auxiliary_loss_clip": 0.01150987, - "auxiliary_loss_mlp": 0.0104789, - "balance_loss_clip": 1.05105197, - "balance_loss_mlp": 1.02624202, - "epoch": 0.24854952652938525, - "flos": 19970247260160.0, - "grad_norm": 3.222598228665933, - "language_loss": 0.61894202, - "learning_rate": 3.5195275429747834e-06, - "loss": 0.64093071, - "num_input_tokens_seen": 89064790, - "step": 4134, - "time_per_iteration": 2.5639493465423584 - }, - { - "auxiliary_loss_clip": 0.01135356, - "auxiliary_loss_mlp": 0.01045434, - "balance_loss_clip": 1.04877877, - "balance_loss_mlp": 1.02764797, - "epoch": 0.24860964978205322, - "flos": 18150187167360.0, - "grad_norm": 1.882175713893398, - "language_loss": 0.78382719, - "learning_rate": 3.5192742864887914e-06, - "loss": 0.80563509, - "num_input_tokens_seen": 89083250, - "step": 4135, - "time_per_iteration": 2.6075639724731445 - }, - { - "auxiliary_loss_clip": 0.01123928, - "auxiliary_loss_mlp": 0.01035702, - "balance_loss_clip": 1.05297661, - "balance_loss_mlp": 1.01917946, - "epoch": 0.24866977303472118, - "flos": 11728641329280.0, - "grad_norm": 2.4269193192884186, - "language_loss": 0.83582413, - "learning_rate": 3.5190209723917662e-06, - "loss": 0.85742044, - "num_input_tokens_seen": 89100905, - "step": 4136, - "time_per_iteration": 2.623377799987793 - }, - { - "auxiliary_loss_clip": 0.01119838, - "auxiliary_loss_mlp": 0.01045223, - "balance_loss_clip": 1.05071807, - "balance_loss_mlp": 1.02713883, - "epoch": 0.24872989628738915, - "flos": 34823582565120.0, - "grad_norm": 2.1322549527950665, - "language_loss": 0.7057327, - "learning_rate": 3.518767600693314e-06, - "loss": 0.72738326, - "num_input_tokens_seen": 89122630, - "step": 4137, - "time_per_iteration": 2.814115524291992 - }, - { - "auxiliary_loss_clip": 0.01133507, - "auxiliary_loss_mlp": 0.00775347, - "balance_loss_clip": 1.0449059, - "balance_loss_mlp": 1.00107706, - "epoch": 0.2487900195400571, - "flos": 13699347062400.0, - "grad_norm": 2.085766315480858, - "language_loss": 0.66914427, - "learning_rate": 3.518514171403042e-06, - "loss": 0.68823284, - "num_input_tokens_seen": 89141050, - "step": 4138, - "time_per_iteration": 2.646043539047241 - }, - { - "auxiliary_loss_clip": 0.01103579, - "auxiliary_loss_mlp": 0.01036477, - "balance_loss_clip": 1.04612446, - "balance_loss_mlp": 1.02000237, - "epoch": 0.24885014279272508, - "flos": 25337815297920.0, - "grad_norm": 1.983116672544965, - "language_loss": 0.83913636, - "learning_rate": 3.51826068453056e-06, - "loss": 0.86053687, - "num_input_tokens_seen": 89160810, - "step": 4139, - "time_per_iteration": 2.741090774536133 - }, - { - "auxiliary_loss_clip": 0.01111549, - "auxiliary_loss_mlp": 0.01040434, - "balance_loss_clip": 1.04586422, - "balance_loss_mlp": 1.02192068, - "epoch": 0.24891026604539307, - "flos": 20631434860800.0, - "grad_norm": 1.4951428686450043, - "language_loss": 0.78923917, - "learning_rate": 3.518007140085481e-06, - "loss": 0.81075907, - "num_input_tokens_seen": 89180610, - "step": 4140, - "time_per_iteration": 2.712780237197876 - }, - { - "auxiliary_loss_clip": 0.01048621, - "auxiliary_loss_mlp": 0.01096526, - "balance_loss_clip": 1.02931261, - "balance_loss_mlp": 1.09464228, - "epoch": 0.24897038929806103, - "flos": 66960294030720.0, - "grad_norm": 0.8293539951671052, - "language_loss": 0.61007011, - "learning_rate": 3.51775353807742e-06, - "loss": 0.63152146, - "num_input_tokens_seen": 89241880, - "step": 4141, - "time_per_iteration": 3.240020513534546 - }, - { - "auxiliary_loss_clip": 0.01147379, - "auxiliary_loss_mlp": 0.01049841, - "balance_loss_clip": 1.05116534, - "balance_loss_mlp": 1.03240097, - "epoch": 0.249030512550729, - "flos": 36392555612160.0, - "grad_norm": 2.1246942361961025, - "language_loss": 0.72794569, - "learning_rate": 3.5174998785159913e-06, - "loss": 0.74991786, - "num_input_tokens_seen": 89263340, - "step": 4142, - "time_per_iteration": 2.7316160202026367 - }, - { - "auxiliary_loss_clip": 0.01133287, - "auxiliary_loss_mlp": 0.01044374, - "balance_loss_clip": 1.04780602, - "balance_loss_mlp": 1.02705276, - "epoch": 0.24909063580339696, - "flos": 20154576879360.0, - "grad_norm": 1.7635050074541005, - "language_loss": 0.80630821, - "learning_rate": 3.5172461614108157e-06, - "loss": 0.82808483, - "num_input_tokens_seen": 89282870, - "step": 4143, - "time_per_iteration": 2.6763389110565186 - }, - { - "auxiliary_loss_clip": 0.01117552, - "auxiliary_loss_mlp": 0.01036613, - "balance_loss_clip": 1.04615402, - "balance_loss_mlp": 1.02026916, - "epoch": 0.24915075905606493, - "flos": 26396569607040.0, - "grad_norm": 2.7235452599944145, - "language_loss": 0.59766376, - "learning_rate": 3.5169923867715137e-06, - "loss": 0.61920542, - "num_input_tokens_seen": 89303830, - "step": 4144, - "time_per_iteration": 2.789417266845703 - }, - { - "auxiliary_loss_clip": 0.01128344, - "auxiliary_loss_mlp": 0.01045393, - "balance_loss_clip": 1.04464769, - "balance_loss_mlp": 1.02850127, - "epoch": 0.2492108823087329, - "flos": 27527216987520.0, - "grad_norm": 2.1754585056135047, - "language_loss": 0.78476733, - "learning_rate": 3.516738554607708e-06, - "loss": 0.80650467, - "num_input_tokens_seen": 89324350, - "step": 4145, - "time_per_iteration": 2.8416056632995605 - }, - { - "auxiliary_loss_clip": 0.01140077, - "auxiliary_loss_mlp": 0.00778414, - "balance_loss_clip": 1.04980016, - "balance_loss_mlp": 1.00122261, - "epoch": 0.24927100556140086, - "flos": 16691388111360.0, - "grad_norm": 2.035933799021365, - "language_loss": 0.64925039, - "learning_rate": 3.5164846649290253e-06, - "loss": 0.66843534, - "num_input_tokens_seen": 89342875, - "step": 4146, - "time_per_iteration": 2.818240165710449 - }, - { - "auxiliary_loss_clip": 0.01036642, - "auxiliary_loss_mlp": 0.0100618, - "balance_loss_clip": 1.02582741, - "balance_loss_mlp": 1.00403452, - "epoch": 0.24933112881406885, - "flos": 62772464286720.0, - "grad_norm": 0.9560925601012792, - "language_loss": 0.67304933, - "learning_rate": 3.5162307177450915e-06, - "loss": 0.69347757, - "num_input_tokens_seen": 89404925, - "step": 4147, - "time_per_iteration": 3.339989185333252 - }, - { - "auxiliary_loss_clip": 0.01123141, - "auxiliary_loss_mlp": 0.0104863, - "balance_loss_clip": 1.04991198, - "balance_loss_mlp": 1.03078485, - "epoch": 0.24939125206673682, - "flos": 26651894457600.0, - "grad_norm": 2.4221411280533554, - "language_loss": 0.89285177, - "learning_rate": 3.5159767130655366e-06, - "loss": 0.9145695, - "num_input_tokens_seen": 89425090, - "step": 4148, - "time_per_iteration": 2.7497105598449707 - }, - { - "auxiliary_loss_clip": 0.01098234, - "auxiliary_loss_mlp": 0.01049718, - "balance_loss_clip": 1.04725289, - "balance_loss_mlp": 1.02874899, - "epoch": 0.24945137531940478, - "flos": 20704333512960.0, - "grad_norm": 1.90098046882646, - "language_loss": 0.68272161, - "learning_rate": 3.5157226508999935e-06, - "loss": 0.70420116, - "num_input_tokens_seen": 89442615, - "step": 4149, - "time_per_iteration": 2.7739884853363037 - }, - { - "auxiliary_loss_clip": 0.01134907, - "auxiliary_loss_mlp": 0.01044357, - "balance_loss_clip": 1.0508213, - "balance_loss_mlp": 1.02747655, - "epoch": 0.24951149857207275, - "flos": 23768662682880.0, - "grad_norm": 1.67166255010053, - "language_loss": 0.71424097, - "learning_rate": 3.515468531258095e-06, - "loss": 0.73603356, - "num_input_tokens_seen": 89463025, - "step": 4150, - "time_per_iteration": 2.6801233291625977 - }, - { - "auxiliary_loss_clip": 0.01098898, - "auxiliary_loss_mlp": 0.0104939, - "balance_loss_clip": 1.04628861, - "balance_loss_mlp": 1.03149676, - "epoch": 0.2495716218247407, - "flos": 15664881237120.0, - "grad_norm": 4.371450104119659, - "language_loss": 0.72732216, - "learning_rate": 3.515214354149478e-06, - "loss": 0.74880505, - "num_input_tokens_seen": 89480225, - "step": 4151, - "time_per_iteration": 2.7118351459503174 - }, - { - "auxiliary_loss_clip": 0.01142805, - "auxiliary_loss_mlp": 0.01054095, - "balance_loss_clip": 1.05117846, - "balance_loss_mlp": 1.0357486, - "epoch": 0.24963174507740868, - "flos": 24052499953920.0, - "grad_norm": 3.4200711789217397, - "language_loss": 0.63707078, - "learning_rate": 3.514960119583781e-06, - "loss": 0.65903974, - "num_input_tokens_seen": 89496985, - "step": 4152, - "time_per_iteration": 2.6352219581604004 - }, - { - "auxiliary_loss_clip": 0.01128057, - "auxiliary_loss_mlp": 0.01043812, - "balance_loss_clip": 1.05110407, - "balance_loss_mlp": 1.02628791, - "epoch": 0.24969186833007664, - "flos": 21799501234560.0, - "grad_norm": 3.664579624689737, - "language_loss": 0.77259195, - "learning_rate": 3.514705827570645e-06, - "loss": 0.79431069, - "num_input_tokens_seen": 89514420, - "step": 4153, - "time_per_iteration": 2.6120872497558594 - }, - { - "auxiliary_loss_clip": 0.01135035, - "auxiliary_loss_mlp": 0.01042909, - "balance_loss_clip": 1.05221617, - "balance_loss_mlp": 1.02620757, - "epoch": 0.24975199158274464, - "flos": 19938143479680.0, - "grad_norm": 2.5781435797973833, - "language_loss": 0.7677725, - "learning_rate": 3.514451478119711e-06, - "loss": 0.78955191, - "num_input_tokens_seen": 89532925, - "step": 4154, - "time_per_iteration": 2.7488853931427 - }, - { - "auxiliary_loss_clip": 0.0113655, - "auxiliary_loss_mlp": 0.01051969, - "balance_loss_clip": 1.05146766, - "balance_loss_mlp": 1.03251421, - "epoch": 0.2498121148354126, - "flos": 25338389915520.0, - "grad_norm": 1.9052782276095375, - "language_loss": 0.70335877, - "learning_rate": 3.5141970712406258e-06, - "loss": 0.72524405, - "num_input_tokens_seen": 89552855, - "step": 4155, - "time_per_iteration": 2.6622395515441895 - }, - { - "auxiliary_loss_clip": 0.01127695, - "auxiliary_loss_mlp": 0.01047805, - "balance_loss_clip": 1.05243564, - "balance_loss_mlp": 1.03074658, - "epoch": 0.24987223808808057, - "flos": 20558787603840.0, - "grad_norm": 1.6974192026095432, - "language_loss": 0.74953228, - "learning_rate": 3.513942606943036e-06, - "loss": 0.77128726, - "num_input_tokens_seen": 89572830, - "step": 4156, - "time_per_iteration": 2.7599329948425293 - }, - { - "auxiliary_loss_clip": 0.01127061, - "auxiliary_loss_mlp": 0.01040498, - "balance_loss_clip": 1.04922485, - "balance_loss_mlp": 1.02404737, - "epoch": 0.24993236134074853, - "flos": 19749037351680.0, - "grad_norm": 2.6541448192787858, - "language_loss": 0.76703429, - "learning_rate": 3.513688085236591e-06, - "loss": 0.78870988, - "num_input_tokens_seen": 89590345, - "step": 4157, - "time_per_iteration": 4.172720432281494 - }, - { - "auxiliary_loss_clip": 0.01087279, - "auxiliary_loss_mlp": 0.01050682, - "balance_loss_clip": 1.04686046, - "balance_loss_mlp": 1.03302717, - "epoch": 0.2499924845934165, - "flos": 18770292587520.0, - "grad_norm": 6.508490360255271, - "language_loss": 0.81656492, - "learning_rate": 3.513433506130942e-06, - "loss": 0.83794451, - "num_input_tokens_seen": 89610295, - "step": 4158, - "time_per_iteration": 4.373260736465454 - }, - { - "auxiliary_loss_clip": 0.01115824, - "auxiliary_loss_mlp": 0.01039502, - "balance_loss_clip": 1.04740119, - "balance_loss_mlp": 1.02166879, - "epoch": 0.25005260784608446, - "flos": 16872198197760.0, - "grad_norm": 2.799032697181286, - "language_loss": 0.76568067, - "learning_rate": 3.5131788696357427e-06, - "loss": 0.78723395, - "num_input_tokens_seen": 89627795, - "step": 4159, - "time_per_iteration": 2.6529338359832764 - }, - { - "auxiliary_loss_clip": 0.01139337, - "auxiliary_loss_mlp": 0.01038581, - "balance_loss_clip": 1.05149508, - "balance_loss_mlp": 1.02013946, - "epoch": 0.2501127310987524, - "flos": 22124923476480.0, - "grad_norm": 2.4918403268433122, - "language_loss": 0.71557873, - "learning_rate": 3.512924175760649e-06, - "loss": 0.73735791, - "num_input_tokens_seen": 89648090, - "step": 4160, - "time_per_iteration": 4.178418874740601 - }, - { - "auxiliary_loss_clip": 0.01062459, - "auxiliary_loss_mlp": 0.01001923, - "balance_loss_clip": 1.02823949, - "balance_loss_mlp": 0.99992067, - "epoch": 0.2501728543514204, - "flos": 69458061980160.0, - "grad_norm": 0.7611682305123987, - "language_loss": 0.56783372, - "learning_rate": 3.5126694245153186e-06, - "loss": 0.58847755, - "num_input_tokens_seen": 89710345, - "step": 4161, - "time_per_iteration": 3.1690969467163086 - }, - { - "auxiliary_loss_clip": 0.0114076, - "auxiliary_loss_mlp": 0.01048659, - "balance_loss_clip": 1.05206347, - "balance_loss_mlp": 1.0308131, - "epoch": 0.25023297760408836, - "flos": 16289978647680.0, - "grad_norm": 4.523737291621751, - "language_loss": 0.80654883, - "learning_rate": 3.5124146159094125e-06, - "loss": 0.82844305, - "num_input_tokens_seen": 89729390, - "step": 4162, - "time_per_iteration": 2.630491018295288 - }, - { - "auxiliary_loss_clip": 0.01127145, - "auxiliary_loss_mlp": 0.00776859, - "balance_loss_clip": 1.04807281, - "balance_loss_mlp": 1.00124371, - "epoch": 0.2502931008567563, - "flos": 12237998140800.0, - "grad_norm": 3.0029202967601107, - "language_loss": 0.87312925, - "learning_rate": 3.5121597499525927e-06, - "loss": 0.89216936, - "num_input_tokens_seen": 89742805, - "step": 4163, - "time_per_iteration": 2.660985231399536 - }, - { - "auxiliary_loss_clip": 0.01133331, - "auxiliary_loss_mlp": 0.01039671, - "balance_loss_clip": 1.0538981, - "balance_loss_mlp": 1.02234972, - "epoch": 0.25035322410942434, - "flos": 23181882105600.0, - "grad_norm": 1.700690076898522, - "language_loss": 0.83170879, - "learning_rate": 3.5119048266545232e-06, - "loss": 0.85343885, - "num_input_tokens_seen": 89761145, - "step": 4164, - "time_per_iteration": 4.217406988143921 - }, - { - "auxiliary_loss_clip": 0.01131608, - "auxiliary_loss_mlp": 0.01047174, - "balance_loss_clip": 1.05681539, - "balance_loss_mlp": 1.0309732, - "epoch": 0.2504133473620923, - "flos": 20917534688640.0, - "grad_norm": 1.61687510361108, - "language_loss": 0.73889691, - "learning_rate": 3.5116498460248716e-06, - "loss": 0.76068473, - "num_input_tokens_seen": 89780905, - "step": 4165, - "time_per_iteration": 2.7395150661468506 - }, - { - "auxiliary_loss_clip": 0.01112927, - "auxiliary_loss_mlp": 0.01043589, - "balance_loss_clip": 1.04912043, - "balance_loss_mlp": 1.02611279, - "epoch": 0.2504734706147603, - "flos": 20776549806720.0, - "grad_norm": 1.856982928728685, - "language_loss": 0.74739552, - "learning_rate": 3.5113948080733062e-06, - "loss": 0.7689606, - "num_input_tokens_seen": 89799230, - "step": 4166, - "time_per_iteration": 2.7567081451416016 - }, - { - "auxiliary_loss_clip": 0.01110594, - "auxiliary_loss_mlp": 0.01042647, - "balance_loss_clip": 1.04968488, - "balance_loss_mlp": 1.02651834, - "epoch": 0.25053359386742824, - "flos": 24349373861760.0, - "grad_norm": 2.0013578528528724, - "language_loss": 0.82254446, - "learning_rate": 3.5111397128094973e-06, - "loss": 0.84407687, - "num_input_tokens_seen": 89818240, - "step": 4167, - "time_per_iteration": 2.692664384841919 - }, - { - "auxiliary_loss_clip": 0.01130059, - "auxiliary_loss_mlp": 0.01043694, - "balance_loss_clip": 1.05185139, - "balance_loss_mlp": 1.02695727, - "epoch": 0.2505937171200962, - "flos": 21214336769280.0, - "grad_norm": 2.4392619558537407, - "language_loss": 0.79381847, - "learning_rate": 3.51088456024312e-06, - "loss": 0.81555605, - "num_input_tokens_seen": 89834485, - "step": 4168, - "time_per_iteration": 2.6286962032318115 - }, - { - "auxiliary_loss_clip": 0.01138966, - "auxiliary_loss_mlp": 0.01046302, - "balance_loss_clip": 1.05118442, - "balance_loss_mlp": 1.02704966, - "epoch": 0.25065384037276417, - "flos": 41427231379200.0, - "grad_norm": 2.2753262043393243, - "language_loss": 0.69603884, - "learning_rate": 3.510629350383849e-06, - "loss": 0.71789157, - "num_input_tokens_seen": 89855645, - "step": 4169, - "time_per_iteration": 2.7935590744018555 - }, - { - "auxiliary_loss_clip": 0.01110761, - "auxiliary_loss_mlp": 0.01049625, - "balance_loss_clip": 1.04870963, - "balance_loss_mlp": 1.03274524, - "epoch": 0.25071396362543213, - "flos": 26102389219200.0, - "grad_norm": 1.8250030020409629, - "language_loss": 0.78045398, - "learning_rate": 3.510374083241361e-06, - "loss": 0.80205786, - "num_input_tokens_seen": 89874895, - "step": 4170, - "time_per_iteration": 2.7728679180145264 - }, - { - "auxiliary_loss_clip": 0.01128286, - "auxiliary_loss_mlp": 0.01043437, - "balance_loss_clip": 1.05320668, - "balance_loss_mlp": 1.02662849, - "epoch": 0.2507740868781001, - "flos": 19098982967040.0, - "grad_norm": 2.5073993684848004, - "language_loss": 0.76440209, - "learning_rate": 3.5101187588253368e-06, - "loss": 0.78611928, - "num_input_tokens_seen": 89891700, - "step": 4171, - "time_per_iteration": 2.7825160026550293 - }, - { - "auxiliary_loss_clip": 0.01061117, - "auxiliary_loss_mlp": 0.01002396, - "balance_loss_clip": 1.027282, - "balance_loss_mlp": 1.00034571, - "epoch": 0.25083421013076806, - "flos": 64341868296960.0, - "grad_norm": 0.8424544393272001, - "language_loss": 0.6006161, - "learning_rate": 3.509863377145458e-06, - "loss": 0.62125123, - "num_input_tokens_seen": 89955775, - "step": 4172, - "time_per_iteration": 3.1981940269470215 - }, - { - "auxiliary_loss_clip": 0.01125517, - "auxiliary_loss_mlp": 0.01046213, - "balance_loss_clip": 1.05005789, - "balance_loss_mlp": 1.02821243, - "epoch": 0.25089433338343603, - "flos": 24279599692800.0, - "grad_norm": 1.4368714421460043, - "language_loss": 0.79106563, - "learning_rate": 3.509607938211409e-06, - "loss": 0.81278288, - "num_input_tokens_seen": 89977150, - "step": 4173, - "time_per_iteration": 2.8311028480529785 - }, - { - "auxiliary_loss_clip": 0.01152553, - "auxiliary_loss_mlp": 0.0104675, - "balance_loss_clip": 1.05725241, - "balance_loss_mlp": 1.02986968, - "epoch": 0.250954456636104, - "flos": 14721472477440.0, - "grad_norm": 2.103663042812158, - "language_loss": 0.83371937, - "learning_rate": 3.509352442032875e-06, - "loss": 0.85571229, - "num_input_tokens_seen": 89994925, - "step": 4174, - "time_per_iteration": 2.696199893951416 - }, - { - "auxiliary_loss_clip": 0.01095749, - "auxiliary_loss_mlp": 0.01049206, - "balance_loss_clip": 1.04728913, - "balance_loss_mlp": 1.03095484, - "epoch": 0.25101457988877196, - "flos": 22273593868800.0, - "grad_norm": 43.022796554959484, - "language_loss": 0.71023381, - "learning_rate": 3.509096888619545e-06, - "loss": 0.73168337, - "num_input_tokens_seen": 90013235, - "step": 4175, - "time_per_iteration": 2.8337926864624023 - }, - { - "auxiliary_loss_clip": 0.01119154, - "auxiliary_loss_mlp": 0.01038924, - "balance_loss_clip": 1.05135846, - "balance_loss_mlp": 1.02145982, - "epoch": 0.2510747031414399, - "flos": 25188929424000.0, - "grad_norm": 2.017414900854033, - "language_loss": 0.80957019, - "learning_rate": 3.50884127798111e-06, - "loss": 0.83115101, - "num_input_tokens_seen": 90032150, - "step": 4176, - "time_per_iteration": 2.936908483505249 - }, - { - "auxiliary_loss_clip": 0.01127542, - "auxiliary_loss_mlp": 0.0104611, - "balance_loss_clip": 1.0535233, - "balance_loss_mlp": 1.02753711, - "epoch": 0.25113482639410795, - "flos": 20704189858560.0, - "grad_norm": 2.475574978330162, - "language_loss": 0.82294285, - "learning_rate": 3.5085856101272623e-06, - "loss": 0.84467936, - "num_input_tokens_seen": 90049085, - "step": 4177, - "time_per_iteration": 2.7630460262298584 - }, - { - "auxiliary_loss_clip": 0.01110202, - "auxiliary_loss_mlp": 0.01051495, - "balance_loss_clip": 1.05168724, - "balance_loss_mlp": 1.03386414, - "epoch": 0.2511949496467759, - "flos": 21506936958720.0, - "grad_norm": 2.1761277698635593, - "language_loss": 0.82517993, - "learning_rate": 3.508329885067698e-06, - "loss": 0.84679693, - "num_input_tokens_seen": 90067695, - "step": 4178, - "time_per_iteration": 2.7356274127960205 - }, - { - "auxiliary_loss_clip": 0.01145101, - "auxiliary_loss_mlp": 0.00775573, - "balance_loss_clip": 1.05324888, - "balance_loss_mlp": 1.00148535, - "epoch": 0.2512550728994439, - "flos": 20701999128960.0, - "grad_norm": 2.1475299559000947, - "language_loss": 0.75229692, - "learning_rate": 3.508074102812112e-06, - "loss": 0.77150369, - "num_input_tokens_seen": 90083890, - "step": 4179, - "time_per_iteration": 2.631096363067627 - }, - { - "auxiliary_loss_clip": 0.01109293, - "auxiliary_loss_mlp": 0.01056583, - "balance_loss_clip": 1.04920673, - "balance_loss_mlp": 1.03833175, - "epoch": 0.25131519615211184, - "flos": 18478626151680.0, - "grad_norm": 1.9599833122138943, - "language_loss": 0.69976825, - "learning_rate": 3.507818263370206e-06, - "loss": 0.72142696, - "num_input_tokens_seen": 90100995, - "step": 4180, - "time_per_iteration": 2.708122730255127 - }, - { - "auxiliary_loss_clip": 0.01147992, - "auxiliary_loss_mlp": 0.01045783, - "balance_loss_clip": 1.05343485, - "balance_loss_mlp": 1.02909422, - "epoch": 0.2513753194047798, - "flos": 20484955198080.0, - "grad_norm": 1.8622914556591927, - "language_loss": 0.85940182, - "learning_rate": 3.5075623667516796e-06, - "loss": 0.88133955, - "num_input_tokens_seen": 90120365, - "step": 4181, - "time_per_iteration": 2.633091449737549 - }, - { - "auxiliary_loss_clip": 0.01148017, - "auxiliary_loss_mlp": 0.01049707, - "balance_loss_clip": 1.05351245, - "balance_loss_mlp": 1.03270781, - "epoch": 0.25143544265744777, - "flos": 37670077704960.0, - "grad_norm": 2.0695978407502467, - "language_loss": 0.6856631, - "learning_rate": 3.507306412966238e-06, - "loss": 0.70764029, - "num_input_tokens_seen": 90142610, - "step": 4182, - "time_per_iteration": 2.8169894218444824 - }, - { - "auxiliary_loss_clip": 0.01041202, - "auxiliary_loss_mlp": 0.010083, - "balance_loss_clip": 1.02456141, - "balance_loss_mlp": 1.00577307, - "epoch": 0.25149556591011574, - "flos": 69367457923200.0, - "grad_norm": 0.8403189096432666, - "language_loss": 0.70032597, - "learning_rate": 3.5070504020235853e-06, - "loss": 0.72082102, - "num_input_tokens_seen": 90200555, - "step": 4183, - "time_per_iteration": 3.2070610523223877 - }, - { - "auxiliary_loss_clip": 0.01130203, - "auxiliary_loss_mlp": 0.01042834, - "balance_loss_clip": 1.05145216, - "balance_loss_mlp": 1.02441609, - "epoch": 0.2515556891627837, - "flos": 13990402967040.0, - "grad_norm": 1.8802113118438855, - "language_loss": 0.73834902, - "learning_rate": 3.506794333933431e-06, - "loss": 0.76007938, - "num_input_tokens_seen": 90218120, - "step": 4184, - "time_per_iteration": 2.691950559616089 - }, - { - "auxiliary_loss_clip": 0.01136971, - "auxiliary_loss_mlp": 0.01047362, - "balance_loss_clip": 1.05233765, - "balance_loss_mlp": 1.0297792, - "epoch": 0.25161581241545167, - "flos": 22163527618560.0, - "grad_norm": 1.8676646084141537, - "language_loss": 0.8334859, - "learning_rate": 3.506538208705484e-06, - "loss": 0.85532916, - "num_input_tokens_seen": 90236790, - "step": 4185, - "time_per_iteration": 2.6931228637695312 - }, - { - "auxiliary_loss_clip": 0.01022217, - "auxiliary_loss_mlp": 0.01010846, - "balance_loss_clip": 1.03471541, - "balance_loss_mlp": 1.00902176, - "epoch": 0.25167593566811963, - "flos": 69358407696000.0, - "grad_norm": 0.7883550117667959, - "language_loss": 0.61448294, - "learning_rate": 3.5062820263494574e-06, - "loss": 0.63481361, - "num_input_tokens_seen": 90297070, - "step": 4186, - "time_per_iteration": 3.175295829772949 - }, - { - "auxiliary_loss_clip": 0.01107804, - "auxiliary_loss_mlp": 0.01041844, - "balance_loss_clip": 1.04873872, - "balance_loss_mlp": 1.02405787, - "epoch": 0.2517360589207876, - "flos": 13261452359040.0, - "grad_norm": 1.8553357788385085, - "language_loss": 0.79070914, - "learning_rate": 3.5060257868750656e-06, - "loss": 0.81220555, - "num_input_tokens_seen": 90315255, - "step": 4187, - "time_per_iteration": 2.887378215789795 - }, - { - "auxiliary_loss_clip": 0.01091434, - "auxiliary_loss_mlp": 0.01049489, - "balance_loss_clip": 1.0482558, - "balance_loss_mlp": 1.03138089, - "epoch": 0.25179618217345556, - "flos": 20376828282240.0, - "grad_norm": 3.7749228259968586, - "language_loss": 0.79629189, - "learning_rate": 3.5057694902920244e-06, - "loss": 0.8177011, - "num_input_tokens_seen": 90334990, - "step": 4188, - "time_per_iteration": 2.8985629081726074 - }, - { - "auxiliary_loss_clip": 0.01133381, - "auxiliary_loss_mlp": 0.01046993, - "balance_loss_clip": 1.05168021, - "balance_loss_mlp": 1.03012538, - "epoch": 0.25185630542612353, - "flos": 27664718250240.0, - "grad_norm": 1.7363151422402578, - "language_loss": 0.74419165, - "learning_rate": 3.5055131366100534e-06, - "loss": 0.76599538, - "num_input_tokens_seen": 90351825, - "step": 4189, - "time_per_iteration": 2.697097063064575 - }, - { - "auxiliary_loss_clip": 0.01118534, - "auxiliary_loss_mlp": 0.01044827, - "balance_loss_clip": 1.04871011, - "balance_loss_mlp": 1.02862656, - "epoch": 0.25191642867879155, - "flos": 20996430912000.0, - "grad_norm": 2.0536634388060078, - "language_loss": 0.84721291, - "learning_rate": 3.5052567258388745e-06, - "loss": 0.86884648, - "num_input_tokens_seen": 90369860, - "step": 4190, - "time_per_iteration": 2.731227397918701 - }, - { - "auxiliary_loss_clip": 0.01118209, - "auxiliary_loss_mlp": 0.01044895, - "balance_loss_clip": 1.04597688, - "balance_loss_mlp": 1.02633369, - "epoch": 0.2519765519314595, - "flos": 21105671149440.0, - "grad_norm": 2.0130913170662783, - "language_loss": 0.75695485, - "learning_rate": 3.5050002579882082e-06, - "loss": 0.77858591, - "num_input_tokens_seen": 90389245, - "step": 4191, - "time_per_iteration": 2.7403173446655273 - }, - { - "auxiliary_loss_clip": 0.01048031, - "auxiliary_loss_mlp": 0.01014765, - "balance_loss_clip": 1.02375531, - "balance_loss_mlp": 1.0122261, - "epoch": 0.2520366751841275, - "flos": 62744993360640.0, - "grad_norm": 0.7280864395517058, - "language_loss": 0.57129633, - "learning_rate": 3.5047437330677823e-06, - "loss": 0.59192419, - "num_input_tokens_seen": 90456735, - "step": 4192, - "time_per_iteration": 3.237478017807007 - }, - { - "auxiliary_loss_clip": 0.01121978, - "auxiliary_loss_mlp": 0.01041578, - "balance_loss_clip": 1.05535698, - "balance_loss_mlp": 1.02374434, - "epoch": 0.25209679843679544, - "flos": 22230716008320.0, - "grad_norm": 1.8423117439969312, - "language_loss": 0.76066267, - "learning_rate": 3.504487151087323e-06, - "loss": 0.78229821, - "num_input_tokens_seen": 90474165, - "step": 4193, - "time_per_iteration": 2.699486255645752 - }, - { - "auxiliary_loss_clip": 0.01137884, - "auxiliary_loss_mlp": 0.01046125, - "balance_loss_clip": 1.05232048, - "balance_loss_mlp": 1.02869618, - "epoch": 0.2521569216894634, - "flos": 12166643773440.0, - "grad_norm": 3.5003037089711437, - "language_loss": 0.84335077, - "learning_rate": 3.5042305120565598e-06, - "loss": 0.86519086, - "num_input_tokens_seen": 90491660, - "step": 4194, - "time_per_iteration": 2.6561896800994873 - }, - { - "auxiliary_loss_clip": 0.01149932, - "auxiliary_loss_mlp": 0.01050793, - "balance_loss_clip": 1.05253458, - "balance_loss_mlp": 1.03461599, - "epoch": 0.2522170449421314, - "flos": 23699786353920.0, - "grad_norm": 1.3753304678825264, - "language_loss": 0.88249695, - "learning_rate": 3.5039738159852253e-06, - "loss": 0.90450418, - "num_input_tokens_seen": 90514025, - "step": 4195, - "time_per_iteration": 2.67887806892395 - }, - { - "auxiliary_loss_clip": 0.01150202, - "auxiliary_loss_mlp": 0.01041959, - "balance_loss_clip": 1.05412734, - "balance_loss_mlp": 1.02199149, - "epoch": 0.25227716819479934, - "flos": 20955456472320.0, - "grad_norm": 2.4146072325129087, - "language_loss": 0.85488242, - "learning_rate": 3.503717062883053e-06, - "loss": 0.87680399, - "num_input_tokens_seen": 90533530, - "step": 4196, - "time_per_iteration": 2.6358916759490967 - }, - { - "auxiliary_loss_clip": 0.01137804, - "auxiliary_loss_mlp": 0.01049246, - "balance_loss_clip": 1.05213511, - "balance_loss_mlp": 1.03193665, - "epoch": 0.2523372914474673, - "flos": 23331342597120.0, - "grad_norm": 1.9329643035636839, - "language_loss": 0.8319478, - "learning_rate": 3.5034602527597786e-06, - "loss": 0.8538183, - "num_input_tokens_seen": 90554025, - "step": 4197, - "time_per_iteration": 5.738839387893677 - }, - { - "auxiliary_loss_clip": 0.01140063, - "auxiliary_loss_mlp": 0.01051416, - "balance_loss_clip": 1.05392218, - "balance_loss_mlp": 1.03224671, - "epoch": 0.25239741470013527, - "flos": 36970321875840.0, - "grad_norm": 2.1358917159416104, - "language_loss": 0.72820318, - "learning_rate": 3.5032033856251405e-06, - "loss": 0.75011802, - "num_input_tokens_seen": 90576930, - "step": 4198, - "time_per_iteration": 2.8819963932037354 - }, - { - "auxiliary_loss_clip": 0.01152924, - "auxiliary_loss_mlp": 0.01048555, - "balance_loss_clip": 1.05455935, - "balance_loss_mlp": 1.03045893, - "epoch": 0.25245753795280323, - "flos": 18515757836160.0, - "grad_norm": 6.722547943004915, - "language_loss": 0.76560014, - "learning_rate": 3.50294646148888e-06, - "loss": 0.78761488, - "num_input_tokens_seen": 90595710, - "step": 4199, - "time_per_iteration": 2.636993169784546 - }, - { - "auxiliary_loss_clip": 0.01125413, - "auxiliary_loss_mlp": 0.00776026, - "balance_loss_clip": 1.05274642, - "balance_loss_mlp": 1.00117147, - "epoch": 0.2525176612054712, - "flos": 32344884737280.0, - "grad_norm": 1.814097809936595, - "language_loss": 0.73571241, - "learning_rate": 3.502689480360739e-06, - "loss": 0.75472683, - "num_input_tokens_seen": 90617945, - "step": 4200, - "time_per_iteration": 4.297755002975464 - }, - { - "auxiliary_loss_clip": 0.01137136, - "auxiliary_loss_mlp": 0.01047957, - "balance_loss_clip": 1.05050063, - "balance_loss_mlp": 1.03187585, - "epoch": 0.25257778445813917, - "flos": 45258217459200.0, - "grad_norm": 1.6490086858694837, - "language_loss": 0.8223114, - "learning_rate": 3.5024324422504616e-06, - "loss": 0.84416234, - "num_input_tokens_seen": 90640855, - "step": 4201, - "time_per_iteration": 2.859703302383423 - }, - { - "auxiliary_loss_clip": 0.01098423, - "auxiliary_loss_mlp": 0.01048, - "balance_loss_clip": 1.05422068, - "balance_loss_mlp": 1.03126347, - "epoch": 0.25263790771080713, - "flos": 23367791923200.0, - "grad_norm": 1.9307441853812024, - "language_loss": 0.74854887, - "learning_rate": 3.5021753471677965e-06, - "loss": 0.77001321, - "num_input_tokens_seen": 90661350, - "step": 4202, - "time_per_iteration": 2.7475366592407227 - }, - { - "auxiliary_loss_clip": 0.01134371, - "auxiliary_loss_mlp": 0.01040637, - "balance_loss_clip": 1.05362439, - "balance_loss_mlp": 1.02392364, - "epoch": 0.25269803096347515, - "flos": 18515039564160.0, - "grad_norm": 1.882597455778369, - "language_loss": 0.7323755, - "learning_rate": 3.501918195122491e-06, - "loss": 0.75412554, - "num_input_tokens_seen": 90680540, - "step": 4203, - "time_per_iteration": 2.6547653675079346 - }, - { - "auxiliary_loss_clip": 0.01128208, - "auxiliary_loss_mlp": 0.01039636, - "balance_loss_clip": 1.05176711, - "balance_loss_mlp": 1.02239835, - "epoch": 0.2527581542161431, - "flos": 24610552629120.0, - "grad_norm": 1.4386036639708744, - "language_loss": 0.77731073, - "learning_rate": 3.501660986124297e-06, - "loss": 0.79898918, - "num_input_tokens_seen": 90703460, - "step": 4204, - "time_per_iteration": 4.4116432666778564 - }, - { - "auxiliary_loss_clip": 0.01115267, - "auxiliary_loss_mlp": 0.01052396, - "balance_loss_clip": 1.05262613, - "balance_loss_mlp": 1.03453815, - "epoch": 0.2528182774688111, - "flos": 12641275111680.0, - "grad_norm": 1.9357035590368088, - "language_loss": 0.72175288, - "learning_rate": 3.5014037201829684e-06, - "loss": 0.74342954, - "num_input_tokens_seen": 90718815, - "step": 4205, - "time_per_iteration": 2.6750712394714355 - }, - { - "auxiliary_loss_clip": 0.01124756, - "auxiliary_loss_mlp": 0.01044172, - "balance_loss_clip": 1.05032194, - "balance_loss_mlp": 1.02801895, - "epoch": 0.25287840072147905, - "flos": 46936789879680.0, - "grad_norm": 1.4680577763339375, - "language_loss": 0.75594378, - "learning_rate": 3.50114639730826e-06, - "loss": 0.77763301, - "num_input_tokens_seen": 90742125, - "step": 4206, - "time_per_iteration": 2.876408815383911 - }, - { - "auxiliary_loss_clip": 0.01107683, - "auxiliary_loss_mlp": 0.01044618, - "balance_loss_clip": 1.04771221, - "balance_loss_mlp": 1.02780974, - "epoch": 0.252938523974147, - "flos": 18879712392960.0, - "grad_norm": 1.5378963492414741, - "language_loss": 0.78807724, - "learning_rate": 3.5008890175099296e-06, - "loss": 0.80960023, - "num_input_tokens_seen": 90760785, - "step": 4207, - "time_per_iteration": 2.7176475524902344 - }, - { - "auxiliary_loss_clip": 0.01133715, - "auxiliary_loss_mlp": 0.01055631, - "balance_loss_clip": 1.0547328, - "balance_loss_mlp": 1.03984797, - "epoch": 0.252998647226815, - "flos": 21434720664960.0, - "grad_norm": 1.5723877129370716, - "language_loss": 0.76399815, - "learning_rate": 3.5006315807977375e-06, - "loss": 0.78589159, - "num_input_tokens_seen": 90780045, - "step": 4208, - "time_per_iteration": 2.797658920288086 - }, - { - "auxiliary_loss_clip": 0.01131059, - "auxiliary_loss_mlp": 0.01040866, - "balance_loss_clip": 1.05162513, - "balance_loss_mlp": 1.02465391, - "epoch": 0.25305877047948294, - "flos": 25442171285760.0, - "grad_norm": 3.9595354320915166, - "language_loss": 0.69848049, - "learning_rate": 3.5003740871814456e-06, - "loss": 0.72019976, - "num_input_tokens_seen": 90797980, - "step": 4209, - "time_per_iteration": 2.738159418106079 - }, - { - "auxiliary_loss_clip": 0.01046521, - "auxiliary_loss_mlp": 0.0100386, - "balance_loss_clip": 1.02250004, - "balance_loss_mlp": 1.0015471, - "epoch": 0.2531188937321509, - "flos": 60185603629440.0, - "grad_norm": 0.7787603502724176, - "language_loss": 0.55091059, - "learning_rate": 3.5001165366708175e-06, - "loss": 0.57141441, - "num_input_tokens_seen": 90864865, - "step": 4210, - "time_per_iteration": 3.196953535079956 - }, - { - "auxiliary_loss_clip": 0.01113643, - "auxiliary_loss_mlp": 0.01038759, - "balance_loss_clip": 1.05103207, - "balance_loss_mlp": 1.02215338, - "epoch": 0.25317901698481887, - "flos": 19682387665920.0, - "grad_norm": 1.8504444580052586, - "language_loss": 0.8006835, - "learning_rate": 3.4998589292756204e-06, - "loss": 0.82220757, - "num_input_tokens_seen": 90882885, - "step": 4211, - "time_per_iteration": 2.7241647243499756 - }, - { - "auxiliary_loss_clip": 0.01095085, - "auxiliary_loss_mlp": 0.01044368, - "balance_loss_clip": 1.04594803, - "balance_loss_mlp": 1.02844775, - "epoch": 0.25323914023748684, - "flos": 24424355502720.0, - "grad_norm": 1.531596575729193, - "language_loss": 0.78362429, - "learning_rate": 3.499601265005622e-06, - "loss": 0.80501878, - "num_input_tokens_seen": 90902985, - "step": 4212, - "time_per_iteration": 2.788607358932495 - }, - { - "auxiliary_loss_clip": 0.01133893, - "auxiliary_loss_mlp": 0.01041038, - "balance_loss_clip": 1.04857254, - "balance_loss_mlp": 1.02401471, - "epoch": 0.2532992634901548, - "flos": 25447450584960.0, - "grad_norm": 2.123277134845907, - "language_loss": 0.53516036, - "learning_rate": 3.4993435438705938e-06, - "loss": 0.55690968, - "num_input_tokens_seen": 90923550, - "step": 4213, - "time_per_iteration": 2.6675784587860107 - }, - { - "auxiliary_loss_clip": 0.01120924, - "auxiliary_loss_mlp": 0.01044765, - "balance_loss_clip": 1.05005503, - "balance_loss_mlp": 1.0273726, - "epoch": 0.25335938674282277, - "flos": 18880538405760.0, - "grad_norm": 2.4965805840577002, - "language_loss": 0.65416414, - "learning_rate": 3.499085765880308e-06, - "loss": 0.67582107, - "num_input_tokens_seen": 90943260, - "step": 4214, - "time_per_iteration": 2.691359281539917 - }, - { - "auxiliary_loss_clip": 0.01046401, - "auxiliary_loss_mlp": 0.01002761, - "balance_loss_clip": 1.02238619, - "balance_loss_mlp": 1.00056791, - "epoch": 0.25341950999549073, - "flos": 53062649936640.0, - "grad_norm": 0.8515065776804692, - "language_loss": 0.58004916, - "learning_rate": 3.4988279310445396e-06, - "loss": 0.60054076, - "num_input_tokens_seen": 90996295, - "step": 4215, - "time_per_iteration": 2.981840133666992 - }, - { - "auxiliary_loss_clip": 0.01124794, - "auxiliary_loss_mlp": 0.01043531, - "balance_loss_clip": 1.05316496, - "balance_loss_mlp": 1.02655554, - "epoch": 0.2534796332481587, - "flos": 39020247054720.0, - "grad_norm": 1.7497766885830588, - "language_loss": 0.83251095, - "learning_rate": 3.498570039373066e-06, - "loss": 0.85419416, - "num_input_tokens_seen": 91017545, - "step": 4216, - "time_per_iteration": 2.912137508392334 - }, - { - "auxiliary_loss_clip": 0.0112972, - "auxiliary_loss_mlp": 0.01040052, - "balance_loss_clip": 1.05088937, - "balance_loss_mlp": 1.02338624, - "epoch": 0.2535397565008267, - "flos": 23586990670080.0, - "grad_norm": 3.3733415491927996, - "language_loss": 0.80008072, - "learning_rate": 3.498312090875666e-06, - "loss": 0.82177842, - "num_input_tokens_seen": 91037715, - "step": 4217, - "time_per_iteration": 2.6532363891601562 - }, - { - "auxiliary_loss_clip": 0.01116019, - "auxiliary_loss_mlp": 0.01038346, - "balance_loss_clip": 1.04436612, - "balance_loss_mlp": 1.02234793, - "epoch": 0.2535998797534947, - "flos": 19281373251840.0, - "grad_norm": 2.333881972650505, - "language_loss": 0.75585902, - "learning_rate": 3.4980540855621218e-06, - "loss": 0.77740264, - "num_input_tokens_seen": 91055295, - "step": 4218, - "time_per_iteration": 2.650867223739624 - }, - { - "auxiliary_loss_clip": 0.0113544, - "auxiliary_loss_mlp": 0.0104021, - "balance_loss_clip": 1.04940748, - "balance_loss_mlp": 1.0229727, - "epoch": 0.25366000300616265, - "flos": 24024382583040.0, - "grad_norm": 2.040148074486094, - "language_loss": 0.74188256, - "learning_rate": 3.4977960234422167e-06, - "loss": 0.76363909, - "num_input_tokens_seen": 91075485, - "step": 4219, - "time_per_iteration": 2.727161169052124 - }, - { - "auxiliary_loss_clip": 0.01138406, - "auxiliary_loss_mlp": 0.01048455, - "balance_loss_clip": 1.05222011, - "balance_loss_mlp": 1.03138447, - "epoch": 0.2537201262588306, - "flos": 16289368116480.0, - "grad_norm": 4.990704095966988, - "language_loss": 0.81355274, - "learning_rate": 3.497537904525736e-06, - "loss": 0.83542132, - "num_input_tokens_seen": 91093620, - "step": 4220, - "time_per_iteration": 2.6146652698516846 - }, - { - "auxiliary_loss_clip": 0.01100698, - "auxiliary_loss_mlp": 0.01049127, - "balance_loss_clip": 1.04988587, - "balance_loss_mlp": 1.03041148, - "epoch": 0.2537802495114986, - "flos": 23294677789440.0, - "grad_norm": 2.3092995740689197, - "language_loss": 0.70819569, - "learning_rate": 3.497279728822468e-06, - "loss": 0.72969389, - "num_input_tokens_seen": 91114110, - "step": 4221, - "time_per_iteration": 2.851747751235962 - }, - { - "auxiliary_loss_clip": 0.0114682, - "auxiliary_loss_mlp": 0.01039444, - "balance_loss_clip": 1.05224657, - "balance_loss_mlp": 1.02257586, - "epoch": 0.25384037276416654, - "flos": 17639142416640.0, - "grad_norm": 2.4229893622177188, - "language_loss": 0.61689377, - "learning_rate": 3.497021496342202e-06, - "loss": 0.63875645, - "num_input_tokens_seen": 91133135, - "step": 4222, - "time_per_iteration": 2.6394412517547607 - }, - { - "auxiliary_loss_clip": 0.01138378, - "auxiliary_loss_mlp": 0.01051871, - "balance_loss_clip": 1.05371165, - "balance_loss_mlp": 1.03528929, - "epoch": 0.2539004960168345, - "flos": 21507044699520.0, - "grad_norm": 1.6839261376783914, - "language_loss": 0.74744058, - "learning_rate": 3.496763207094731e-06, - "loss": 0.76934308, - "num_input_tokens_seen": 91151805, - "step": 4223, - "time_per_iteration": 2.648322105407715 - }, - { - "auxiliary_loss_clip": 0.01092255, - "auxiliary_loss_mlp": 0.01039082, - "balance_loss_clip": 1.04767203, - "balance_loss_mlp": 1.02325082, - "epoch": 0.2539606192695025, - "flos": 23950909313280.0, - "grad_norm": 1.7092524284111348, - "language_loss": 0.80226004, - "learning_rate": 3.49650486108985e-06, - "loss": 0.82357341, - "num_input_tokens_seen": 91172270, - "step": 4224, - "time_per_iteration": 2.7572662830352783 - }, - { - "auxiliary_loss_clip": 0.01130506, - "auxiliary_loss_mlp": 0.00774076, - "balance_loss_clip": 1.05102324, - "balance_loss_mlp": 1.00112057, - "epoch": 0.25402074252217044, - "flos": 24169784837760.0, - "grad_norm": 1.4497407173280796, - "language_loss": 0.77330017, - "learning_rate": 3.496246458337354e-06, - "loss": 0.792346, - "num_input_tokens_seen": 91192080, - "step": 4225, - "time_per_iteration": 2.7661190032958984 - }, - { - "auxiliary_loss_clip": 0.01130647, - "auxiliary_loss_mlp": 0.01049954, - "balance_loss_clip": 1.04919255, - "balance_loss_mlp": 1.03271639, - "epoch": 0.2540808657748384, - "flos": 22303758314880.0, - "grad_norm": 2.0615353379683055, - "language_loss": 0.84638137, - "learning_rate": 3.4959879988470426e-06, - "loss": 0.86818743, - "num_input_tokens_seen": 91211450, - "step": 4226, - "time_per_iteration": 2.690683126449585 - }, - { - "auxiliary_loss_clip": 0.01143268, - "auxiliary_loss_mlp": 0.01043336, - "balance_loss_clip": 1.05067408, - "balance_loss_mlp": 1.02613425, - "epoch": 0.25414098902750637, - "flos": 27599541022080.0, - "grad_norm": 1.5600656222031943, - "language_loss": 0.70886129, - "learning_rate": 3.4957294826287164e-06, - "loss": 0.73072731, - "num_input_tokens_seen": 91231835, - "step": 4227, - "time_per_iteration": 2.6647307872772217 - }, - { - "auxiliary_loss_clip": 0.01055229, - "auxiliary_loss_mlp": 0.01001956, - "balance_loss_clip": 1.02168798, - "balance_loss_mlp": 0.9995476, - "epoch": 0.25420111228017434, - "flos": 58170834887040.0, - "grad_norm": 0.9869295588353136, - "language_loss": 0.61927998, - "learning_rate": 3.4954709096921785e-06, - "loss": 0.63985181, - "num_input_tokens_seen": 91288755, - "step": 4228, - "time_per_iteration": 2.986067533493042 - }, - { - "auxiliary_loss_clip": 0.01124878, - "auxiliary_loss_mlp": 0.01040149, - "balance_loss_clip": 1.0464859, - "balance_loss_mlp": 1.02212501, - "epoch": 0.2542612355328423, - "flos": 11464409905920.0, - "grad_norm": 2.314170874410929, - "language_loss": 0.86946094, - "learning_rate": 3.4952122800472336e-06, - "loss": 0.8911112, - "num_input_tokens_seen": 91302485, - "step": 4229, - "time_per_iteration": 2.629518985748291 - }, - { - "auxiliary_loss_clip": 0.01102882, - "auxiliary_loss_mlp": 0.01042519, - "balance_loss_clip": 1.04811144, - "balance_loss_mlp": 1.0241369, - "epoch": 0.2543213587855103, - "flos": 22965879669120.0, - "grad_norm": 1.7811216632522446, - "language_loss": 0.77265114, - "learning_rate": 3.4949535937036892e-06, - "loss": 0.79410517, - "num_input_tokens_seen": 91321120, - "step": 4230, - "time_per_iteration": 2.715655565261841 - }, - { - "auxiliary_loss_clip": 0.01133364, - "auxiliary_loss_mlp": 0.01047482, - "balance_loss_clip": 1.0504818, - "balance_loss_mlp": 1.03074503, - "epoch": 0.2543814820381783, - "flos": 18253178438400.0, - "grad_norm": 1.8956341732473607, - "language_loss": 0.7550717, - "learning_rate": 3.4946948506713544e-06, - "loss": 0.77688015, - "num_input_tokens_seen": 91338575, - "step": 4231, - "time_per_iteration": 2.6945316791534424 - }, - { - "auxiliary_loss_clip": 0.0113214, - "auxiliary_loss_mlp": 0.01038979, - "balance_loss_clip": 1.04939127, - "balance_loss_mlp": 1.0230999, - "epoch": 0.25444160529084625, - "flos": 15632705629440.0, - "grad_norm": 1.6179274617095247, - "language_loss": 0.73618764, - "learning_rate": 3.4944360509600416e-06, - "loss": 0.75789881, - "num_input_tokens_seen": 91357355, - "step": 4232, - "time_per_iteration": 2.6219112873077393 - }, - { - "auxiliary_loss_clip": 0.01149145, - "auxiliary_loss_mlp": 0.01043104, - "balance_loss_clip": 1.05579972, - "balance_loss_mlp": 1.02589035, - "epoch": 0.2545017285435142, - "flos": 24601610142720.0, - "grad_norm": 2.2856831174377388, - "language_loss": 0.86333203, - "learning_rate": 3.4941771945795637e-06, - "loss": 0.88525456, - "num_input_tokens_seen": 91376515, - "step": 4233, - "time_per_iteration": 2.675877809524536 - }, - { - "auxiliary_loss_clip": 0.01080108, - "auxiliary_loss_mlp": 0.01040124, - "balance_loss_clip": 1.04641938, - "balance_loss_mlp": 1.02457917, - "epoch": 0.2545618517961822, - "flos": 24679069822080.0, - "grad_norm": 1.5382450997432586, - "language_loss": 0.75319451, - "learning_rate": 3.493918281539737e-06, - "loss": 0.77439684, - "num_input_tokens_seen": 91397595, - "step": 4234, - "time_per_iteration": 2.9050087928771973 - }, - { - "auxiliary_loss_clip": 0.01117427, - "auxiliary_loss_mlp": 0.01044439, - "balance_loss_clip": 1.05171227, - "balance_loss_mlp": 1.02897787, - "epoch": 0.25462197504885015, - "flos": 23915106432000.0, - "grad_norm": 2.6382014960101765, - "language_loss": 0.74923635, - "learning_rate": 3.493659311850379e-06, - "loss": 0.77085495, - "num_input_tokens_seen": 91417775, - "step": 4235, - "time_per_iteration": 2.788041353225708 - }, - { - "auxiliary_loss_clip": 0.01124445, - "auxiliary_loss_mlp": 0.00776537, - "balance_loss_clip": 1.05315781, - "balance_loss_mlp": 1.00115323, - "epoch": 0.2546820983015181, - "flos": 24789387467520.0, - "grad_norm": 1.9882672691222136, - "language_loss": 0.64451182, - "learning_rate": 3.4934002855213106e-06, - "loss": 0.66352159, - "num_input_tokens_seen": 91437665, - "step": 4236, - "time_per_iteration": 2.8649141788482666 - }, - { - "auxiliary_loss_clip": 0.01144465, - "auxiliary_loss_mlp": 0.01036249, - "balance_loss_clip": 1.05185175, - "balance_loss_mlp": 1.02122915, - "epoch": 0.2547422215541861, - "flos": 18734130570240.0, - "grad_norm": 1.6410229940010734, - "language_loss": 0.6714325, - "learning_rate": 3.493141202562354e-06, - "loss": 0.69323969, - "num_input_tokens_seen": 91456705, - "step": 4237, - "time_per_iteration": 4.262012958526611 - }, - { - "auxiliary_loss_clip": 0.01147064, - "auxiliary_loss_mlp": 0.01049012, - "balance_loss_clip": 1.05240059, - "balance_loss_mlp": 1.03203678, - "epoch": 0.25480234480685404, - "flos": 21032449274880.0, - "grad_norm": 2.0013967295828237, - "language_loss": 0.75415373, - "learning_rate": 3.492882062983333e-06, - "loss": 0.77611452, - "num_input_tokens_seen": 91475535, - "step": 4238, - "time_per_iteration": 2.6378636360168457 - }, - { - "auxiliary_loss_clip": 0.01137265, - "auxiliary_loss_mlp": 0.01046047, - "balance_loss_clip": 1.05366278, - "balance_loss_mlp": 1.02843964, - "epoch": 0.254862468059522, - "flos": 25082167224960.0, - "grad_norm": 3.4417299363308613, - "language_loss": 0.80712521, - "learning_rate": 3.492622866794074e-06, - "loss": 0.82895833, - "num_input_tokens_seen": 91499140, - "step": 4239, - "time_per_iteration": 4.348390579223633 - }, - { - "auxiliary_loss_clip": 0.01128023, - "auxiliary_loss_mlp": 0.01045872, - "balance_loss_clip": 1.0522213, - "balance_loss_mlp": 1.02870631, - "epoch": 0.25492259131219, - "flos": 20558392554240.0, - "grad_norm": 1.7312526359597522, - "language_loss": 0.77521586, - "learning_rate": 3.492363614004407e-06, - "loss": 0.79695487, - "num_input_tokens_seen": 91518335, - "step": 4240, - "time_per_iteration": 2.7501273155212402 - }, - { - "auxiliary_loss_clip": 0.01151347, - "auxiliary_loss_mlp": 0.01040734, - "balance_loss_clip": 1.05296493, - "balance_loss_mlp": 1.0226146, - "epoch": 0.25498271456485794, - "flos": 25042485674880.0, - "grad_norm": 3.3593092651087595, - "language_loss": 0.83430749, - "learning_rate": 3.492104304624162e-06, - "loss": 0.85622829, - "num_input_tokens_seen": 91537655, - "step": 4241, - "time_per_iteration": 2.7480928897857666 - }, - { - "auxiliary_loss_clip": 0.01137407, - "auxiliary_loss_mlp": 0.01045384, - "balance_loss_clip": 1.05306387, - "balance_loss_mlp": 1.02887392, - "epoch": 0.2550428378175259, - "flos": 26178412354560.0, - "grad_norm": 1.6379574895871623, - "language_loss": 0.73322648, - "learning_rate": 3.4918449386631725e-06, - "loss": 0.75505441, - "num_input_tokens_seen": 91557545, - "step": 4242, - "time_per_iteration": 2.713635206222534 - }, - { - "auxiliary_loss_clip": 0.0114709, - "auxiliary_loss_mlp": 0.00774169, - "balance_loss_clip": 1.05182981, - "balance_loss_mlp": 1.00115824, - "epoch": 0.2551029610701939, - "flos": 15267170874240.0, - "grad_norm": 3.2486673035230993, - "language_loss": 0.72336024, - "learning_rate": 3.491585516131273e-06, - "loss": 0.7425729, - "num_input_tokens_seen": 91574405, - "step": 4243, - "time_per_iteration": 4.298815727233887 - }, - { - "auxiliary_loss_clip": 0.0113532, - "auxiliary_loss_mlp": 0.01045095, - "balance_loss_clip": 1.05183125, - "balance_loss_mlp": 1.02797616, - "epoch": 0.2551630843228619, - "flos": 18112193556480.0, - "grad_norm": 1.8323151946393021, - "language_loss": 0.82076979, - "learning_rate": 3.491326037038301e-06, - "loss": 0.842574, - "num_input_tokens_seen": 91593755, - "step": 4244, - "time_per_iteration": 2.6497015953063965 - }, - { - "auxiliary_loss_clip": 0.01054616, - "auxiliary_loss_mlp": 0.01017916, - "balance_loss_clip": 1.03294289, - "balance_loss_mlp": 1.01572227, - "epoch": 0.25522320757552985, - "flos": 70520192167680.0, - "grad_norm": 0.6914168393706984, - "language_loss": 0.57701397, - "learning_rate": 3.4910665013940967e-06, - "loss": 0.59773928, - "num_input_tokens_seen": 91660335, - "step": 4245, - "time_per_iteration": 3.2938833236694336 - }, - { - "auxiliary_loss_clip": 0.01146552, - "auxiliary_loss_mlp": 0.01052395, - "balance_loss_clip": 1.0508852, - "balance_loss_mlp": 1.03577745, - "epoch": 0.2552833308281978, - "flos": 22893088757760.0, - "grad_norm": 2.1326330958670567, - "language_loss": 0.65120399, - "learning_rate": 3.4908069092085015e-06, - "loss": 0.6731934, - "num_input_tokens_seen": 91678500, - "step": 4246, - "time_per_iteration": 2.5949065685272217 - }, - { - "auxiliary_loss_clip": 0.01127579, - "auxiliary_loss_mlp": 0.01044633, - "balance_loss_clip": 1.04806828, - "balance_loss_mlp": 1.02944601, - "epoch": 0.2553434540808658, - "flos": 22053605022720.0, - "grad_norm": 1.7151532201527704, - "language_loss": 0.81580049, - "learning_rate": 3.4905472604913585e-06, - "loss": 0.83752257, - "num_input_tokens_seen": 91696430, - "step": 4247, - "time_per_iteration": 2.673624277114868 - }, - { - "auxiliary_loss_clip": 0.01140059, - "auxiliary_loss_mlp": 0.01044068, - "balance_loss_clip": 1.05152941, - "balance_loss_mlp": 1.02543616, - "epoch": 0.25540357733353375, - "flos": 16544190176640.0, - "grad_norm": 2.241724474505105, - "language_loss": 0.83335149, - "learning_rate": 3.490287555252514e-06, - "loss": 0.85519278, - "num_input_tokens_seen": 91713270, - "step": 4248, - "time_per_iteration": 2.617570400238037 - }, - { - "auxiliary_loss_clip": 0.01112618, - "auxiliary_loss_mlp": 0.01042154, - "balance_loss_clip": 1.04433584, - "balance_loss_mlp": 1.02458215, - "epoch": 0.2554637005862017, - "flos": 17565022702080.0, - "grad_norm": 2.084670538042193, - "language_loss": 0.84011936, - "learning_rate": 3.4900277935018166e-06, - "loss": 0.8616671, - "num_input_tokens_seen": 91728865, - "step": 4249, - "time_per_iteration": 2.6617467403411865 - }, - { - "auxiliary_loss_clip": 0.01001275, - "auxiliary_loss_mlp": 0.01002657, - "balance_loss_clip": 1.0228157, - "balance_loss_mlp": 0.9996174, - "epoch": 0.2555238238388697, - "flos": 72244763953920.0, - "grad_norm": 0.765792812565725, - "language_loss": 0.56274796, - "learning_rate": 3.489767975249115e-06, - "loss": 0.58278728, - "num_input_tokens_seen": 91787470, - "step": 4250, - "time_per_iteration": 3.24300479888916 - }, - { - "auxiliary_loss_clip": 0.01117816, - "auxiliary_loss_mlp": 0.01036136, - "balance_loss_clip": 1.04929769, - "balance_loss_mlp": 1.01839769, - "epoch": 0.25558394709153764, - "flos": 24389414547840.0, - "grad_norm": 2.294460262471245, - "language_loss": 0.80566651, - "learning_rate": 3.4895081005042632e-06, - "loss": 0.82720602, - "num_input_tokens_seen": 91805640, - "step": 4251, - "time_per_iteration": 2.732752561569214 - }, - { - "auxiliary_loss_clip": 0.01030367, - "auxiliary_loss_mlp": 0.01001193, - "balance_loss_clip": 1.02468216, - "balance_loss_mlp": 0.99888068, - "epoch": 0.2556440703442056, - "flos": 69231213636480.0, - "grad_norm": 0.7932625116211053, - "language_loss": 0.6608988, - "learning_rate": 3.4892481692771146e-06, - "loss": 0.68121445, - "num_input_tokens_seen": 91869695, - "step": 4252, - "time_per_iteration": 3.304985523223877 - }, - { - "auxiliary_loss_clip": 0.01130428, - "auxiliary_loss_mlp": 0.01036056, - "balance_loss_clip": 1.0499115, - "balance_loss_mlp": 1.02097619, - "epoch": 0.2557041935968736, - "flos": 24863902231680.0, - "grad_norm": 2.60951363435401, - "language_loss": 0.73882902, - "learning_rate": 3.4889881815775267e-06, - "loss": 0.76049387, - "num_input_tokens_seen": 91889920, - "step": 4253, - "time_per_iteration": 2.706052303314209 - }, - { - "auxiliary_loss_clip": 0.01097964, - "auxiliary_loss_mlp": 0.01044298, - "balance_loss_clip": 1.04340124, - "balance_loss_mlp": 1.02782309, - "epoch": 0.25576431684954154, - "flos": 22492110257280.0, - "grad_norm": 2.978807414856607, - "language_loss": 0.72565317, - "learning_rate": 3.488728137415357e-06, - "loss": 0.7470758, - "num_input_tokens_seen": 91908665, - "step": 4254, - "time_per_iteration": 2.7579715251922607 - }, - { - "auxiliary_loss_clip": 0.01098791, - "auxiliary_loss_mlp": 0.00774228, - "balance_loss_clip": 1.04665136, - "balance_loss_mlp": 1.001104, - "epoch": 0.2558244401022095, - "flos": 19826748426240.0, - "grad_norm": 1.7240740787107458, - "language_loss": 0.80729312, - "learning_rate": 3.4884680368004675e-06, - "loss": 0.82602334, - "num_input_tokens_seen": 91927855, - "step": 4255, - "time_per_iteration": 2.788978099822998 - }, - { - "auxiliary_loss_clip": 0.01124525, - "auxiliary_loss_mlp": 0.01040748, - "balance_loss_clip": 1.05111384, - "balance_loss_mlp": 1.02414227, - "epoch": 0.2558845633548775, - "flos": 23220486247680.0, - "grad_norm": 1.5275751549355678, - "language_loss": 0.85734111, - "learning_rate": 3.488207879742721e-06, - "loss": 0.87899381, - "num_input_tokens_seen": 91948500, - "step": 4256, - "time_per_iteration": 2.7916831970214844 - }, - { - "auxiliary_loss_clip": 0.01102599, - "auxiliary_loss_mlp": 0.01049743, - "balance_loss_clip": 1.04525566, - "balance_loss_mlp": 1.03164732, - "epoch": 0.2559446866075455, - "flos": 16837867774080.0, - "grad_norm": 1.8301502951270987, - "language_loss": 0.74872649, - "learning_rate": 3.4879476662519826e-06, - "loss": 0.77024996, - "num_input_tokens_seen": 91968375, - "step": 4257, - "time_per_iteration": 2.7754952907562256 - }, - { - "auxiliary_loss_clip": 0.0102418, - "auxiliary_loss_mlp": 0.01011535, - "balance_loss_clip": 1.03534186, - "balance_loss_mlp": 1.00959146, - "epoch": 0.25600480986021346, - "flos": 57593786895360.0, - "grad_norm": 0.8003890262370261, - "language_loss": 0.65255105, - "learning_rate": 3.4876873963381196e-06, - "loss": 0.67290819, - "num_input_tokens_seen": 92028490, - "step": 4258, - "time_per_iteration": 3.269063949584961 - }, - { - "auxiliary_loss_clip": 0.01091736, - "auxiliary_loss_mlp": 0.00773347, - "balance_loss_clip": 1.04549718, - "balance_loss_mlp": 1.00111449, - "epoch": 0.2560649331128814, - "flos": 27819529868160.0, - "grad_norm": 1.5266978755669562, - "language_loss": 0.76443565, - "learning_rate": 3.4874270700110013e-06, - "loss": 0.78308654, - "num_input_tokens_seen": 92048060, - "step": 4259, - "time_per_iteration": 2.805574893951416 - }, - { - "auxiliary_loss_clip": 0.01026212, - "auxiliary_loss_mlp": 0.01016368, - "balance_loss_clip": 1.02208054, - "balance_loss_mlp": 1.01372147, - "epoch": 0.2561250563655494, - "flos": 70950509101440.0, - "grad_norm": 0.7927643603688844, - "language_loss": 0.58455491, - "learning_rate": 3.4871666872804994e-06, - "loss": 0.60498071, - "num_input_tokens_seen": 92118180, - "step": 4260, - "time_per_iteration": 3.3904550075531006 - }, - { - "auxiliary_loss_clip": 0.01133193, - "auxiliary_loss_mlp": 0.01048996, - "balance_loss_clip": 1.04874313, - "balance_loss_mlp": 1.03204465, - "epoch": 0.25618517961821735, - "flos": 27012329481600.0, - "grad_norm": 3.3188145253338543, - "language_loss": 0.77064955, - "learning_rate": 3.4869062481564875e-06, - "loss": 0.79247141, - "num_input_tokens_seen": 92137570, - "step": 4261, - "time_per_iteration": 2.769864082336426 - }, - { - "auxiliary_loss_clip": 0.01144035, - "auxiliary_loss_mlp": 0.01040091, - "balance_loss_clip": 1.05178332, - "balance_loss_mlp": 1.02465355, - "epoch": 0.2562453028708853, - "flos": 23068296322560.0, - "grad_norm": 1.5699122250769224, - "language_loss": 0.83367205, - "learning_rate": 3.486645752648842e-06, - "loss": 0.85551333, - "num_input_tokens_seen": 92157625, - "step": 4262, - "time_per_iteration": 2.682828426361084 - }, - { - "auxiliary_loss_clip": 0.01134556, - "auxiliary_loss_mlp": 0.01041299, - "balance_loss_clip": 1.05219626, - "balance_loss_mlp": 1.02344143, - "epoch": 0.2563054261235533, - "flos": 15120942606720.0, - "grad_norm": 2.340862226505914, - "language_loss": 0.73892939, - "learning_rate": 3.4863852007674405e-06, - "loss": 0.76068795, - "num_input_tokens_seen": 92175350, - "step": 4263, - "time_per_iteration": 2.70947003364563 - }, - { - "auxiliary_loss_clip": 0.0111297, - "auxiliary_loss_mlp": 0.00773371, - "balance_loss_clip": 1.05221081, - "balance_loss_mlp": 1.00093555, - "epoch": 0.25636554937622125, - "flos": 27854865872640.0, - "grad_norm": 1.8143922917988324, - "language_loss": 0.82766259, - "learning_rate": 3.486124592522163e-06, - "loss": 0.84652603, - "num_input_tokens_seen": 92196070, - "step": 4264, - "time_per_iteration": 2.7249553203582764 - }, - { - "auxiliary_loss_clip": 0.01133012, - "auxiliary_loss_mlp": 0.01041877, - "balance_loss_clip": 1.05265546, - "balance_loss_mlp": 1.02468669, - "epoch": 0.2564256726288892, - "flos": 28906509288960.0, - "grad_norm": 2.8986425954305206, - "language_loss": 0.74346334, - "learning_rate": 3.4858639279228924e-06, - "loss": 0.76521224, - "num_input_tokens_seen": 92216310, - "step": 4265, - "time_per_iteration": 2.7149150371551514 - }, - { - "auxiliary_loss_clip": 0.01110152, - "auxiliary_loss_mlp": 0.01036531, - "balance_loss_clip": 1.04754925, - "balance_loss_mlp": 1.02034247, - "epoch": 0.2564857958815572, - "flos": 18514931823360.0, - "grad_norm": 15.50909821859273, - "language_loss": 0.81623137, - "learning_rate": 3.485603206979513e-06, - "loss": 0.83769822, - "num_input_tokens_seen": 92234510, - "step": 4266, - "time_per_iteration": 2.6890153884887695 - }, - { - "auxiliary_loss_clip": 0.01083702, - "auxiliary_loss_mlp": 0.01050109, - "balance_loss_clip": 1.0468955, - "balance_loss_mlp": 1.0318346, - "epoch": 0.25654591913422514, - "flos": 25808280658560.0, - "grad_norm": 2.4522850064786037, - "language_loss": 0.79120672, - "learning_rate": 3.4853424297019103e-06, - "loss": 0.81254482, - "num_input_tokens_seen": 92254070, - "step": 4267, - "time_per_iteration": 2.8390700817108154 - }, - { - "auxiliary_loss_clip": 0.01094597, - "auxiliary_loss_mlp": 0.01044608, - "balance_loss_clip": 1.04643822, - "balance_loss_mlp": 1.0276804, - "epoch": 0.2566060423868931, - "flos": 19099665325440.0, - "grad_norm": 1.6765306902124857, - "language_loss": 0.79241312, - "learning_rate": 3.4850815960999736e-06, - "loss": 0.81380516, - "num_input_tokens_seen": 92275060, - "step": 4268, - "time_per_iteration": 2.7324178218841553 - }, - { - "auxiliary_loss_clip": 0.01106667, - "auxiliary_loss_mlp": 0.00778662, - "balance_loss_clip": 1.04940808, - "balance_loss_mlp": 1.00098729, - "epoch": 0.25666616563956113, - "flos": 23842674656640.0, - "grad_norm": 1.8248642507450341, - "language_loss": 0.67737979, - "learning_rate": 3.484820706183595e-06, - "loss": 0.69623303, - "num_input_tokens_seen": 92293610, - "step": 4269, - "time_per_iteration": 2.7897677421569824 - }, - { - "auxiliary_loss_clip": 0.01123993, - "auxiliary_loss_mlp": 0.01043408, - "balance_loss_clip": 1.05155373, - "balance_loss_mlp": 1.02596736, - "epoch": 0.2567262888922291, - "flos": 14604259420800.0, - "grad_norm": 3.069203267679029, - "language_loss": 0.79117787, - "learning_rate": 3.484559759962666e-06, - "loss": 0.81285185, - "num_input_tokens_seen": 92308305, - "step": 4270, - "time_per_iteration": 2.8076114654541016 - }, - { - "auxiliary_loss_clip": 0.01094814, - "auxiliary_loss_mlp": 0.010436, - "balance_loss_clip": 1.04357839, - "balance_loss_mlp": 1.02393079, - "epoch": 0.25678641214489706, - "flos": 32923117877760.0, - "grad_norm": 2.413207422396751, - "language_loss": 0.68088073, - "learning_rate": 3.4842987574470816e-06, - "loss": 0.7022649, - "num_input_tokens_seen": 92329875, - "step": 4271, - "time_per_iteration": 2.8195667266845703 - }, - { - "auxiliary_loss_clip": 0.01136281, - "auxiliary_loss_mlp": 0.00774788, - "balance_loss_clip": 1.05146289, - "balance_loss_mlp": 1.00110972, - "epoch": 0.256846535397565, - "flos": 24098933260800.0, - "grad_norm": 3.3671515903121216, - "language_loss": 0.87362605, - "learning_rate": 3.4840376986467403e-06, - "loss": 0.89273679, - "num_input_tokens_seen": 92348780, - "step": 4272, - "time_per_iteration": 2.6910364627838135 - }, - { - "auxiliary_loss_clip": 0.01122968, - "auxiliary_loss_mlp": 0.01046328, - "balance_loss_clip": 1.05348301, - "balance_loss_mlp": 1.02854192, - "epoch": 0.256906658650233, - "flos": 19718441942400.0, - "grad_norm": 1.6813472119330561, - "language_loss": 0.81420678, - "learning_rate": 3.483776583571541e-06, - "loss": 0.83589977, - "num_input_tokens_seen": 92368175, - "step": 4273, - "time_per_iteration": 2.6883673667907715 - }, - { - "auxiliary_loss_clip": 0.01097944, - "auxiliary_loss_mlp": 0.01041741, - "balance_loss_clip": 1.043715, - "balance_loss_mlp": 1.02459884, - "epoch": 0.25696678190290095, - "flos": 22926018551040.0, - "grad_norm": 3.3251008044769947, - "language_loss": 0.76944637, - "learning_rate": 3.4835154122313846e-06, - "loss": 0.79084325, - "num_input_tokens_seen": 92387755, - "step": 4274, - "time_per_iteration": 2.7613401412963867 - }, - { - "auxiliary_loss_clip": 0.01112797, - "auxiliary_loss_mlp": 0.01039272, - "balance_loss_clip": 1.04380774, - "balance_loss_mlp": 1.02220166, - "epoch": 0.2570269051555689, - "flos": 27307838672640.0, - "grad_norm": 2.1172072427968933, - "language_loss": 0.83780324, - "learning_rate": 3.4832541846361743e-06, - "loss": 0.85932392, - "num_input_tokens_seen": 92409850, - "step": 4275, - "time_per_iteration": 2.7835779190063477 - }, - { - "auxiliary_loss_clip": 0.01120289, - "auxiliary_loss_mlp": 0.01039714, - "balance_loss_clip": 1.05141211, - "balance_loss_mlp": 1.02223814, - "epoch": 0.2570870284082369, - "flos": 27563414918400.0, - "grad_norm": 2.725989678545036, - "language_loss": 0.7874397, - "learning_rate": 3.4829929007958175e-06, - "loss": 0.80903983, - "num_input_tokens_seen": 92431250, - "step": 4276, - "time_per_iteration": 5.679298400878906 - }, - { - "auxiliary_loss_clip": 0.01136261, - "auxiliary_loss_mlp": 0.01046327, - "balance_loss_clip": 1.05269814, - "balance_loss_mlp": 1.02982879, - "epoch": 0.25714715166090485, - "flos": 28730834847360.0, - "grad_norm": 4.7083902318823885, - "language_loss": 0.79273927, - "learning_rate": 3.4827315607202214e-06, - "loss": 0.81456512, - "num_input_tokens_seen": 92452065, - "step": 4277, - "time_per_iteration": 2.691035270690918 - }, - { - "auxiliary_loss_clip": 0.01146238, - "auxiliary_loss_mlp": 0.01040113, - "balance_loss_clip": 1.05214763, - "balance_loss_mlp": 1.02367437, - "epoch": 0.2572072749135728, - "flos": 20116152305280.0, - "grad_norm": 2.017980063834791, - "language_loss": 0.78986102, - "learning_rate": 3.482470164419295e-06, - "loss": 0.81172454, - "num_input_tokens_seen": 92470025, - "step": 4278, - "time_per_iteration": 4.2404680252075195 - }, - { - "auxiliary_loss_clip": 0.01126121, - "auxiliary_loss_mlp": 0.01037901, - "balance_loss_clip": 1.05176449, - "balance_loss_mlp": 1.02102113, - "epoch": 0.2572673981662408, - "flos": 26030855283840.0, - "grad_norm": 2.8070462448385904, - "language_loss": 0.74898899, - "learning_rate": 3.482208711902952e-06, - "loss": 0.77062923, - "num_input_tokens_seen": 92489825, - "step": 4279, - "time_per_iteration": 2.65977144241333 - }, - { - "auxiliary_loss_clip": 0.01134686, - "auxiliary_loss_mlp": 0.01051687, - "balance_loss_clip": 1.04973292, - "balance_loss_mlp": 1.03423464, - "epoch": 0.25732752141890874, - "flos": 16106618695680.0, - "grad_norm": 2.4256697448035687, - "language_loss": 0.85603923, - "learning_rate": 3.4819472031811065e-06, - "loss": 0.87790298, - "num_input_tokens_seen": 92507270, - "step": 4280, - "time_per_iteration": 2.6072864532470703 - }, - { - "auxiliary_loss_clip": 0.01136623, - "auxiliary_loss_mlp": 0.01039056, - "balance_loss_clip": 1.05183434, - "balance_loss_mlp": 1.02147269, - "epoch": 0.2573876446715767, - "flos": 22524429519360.0, - "grad_norm": 3.9579835716917695, - "language_loss": 0.79381943, - "learning_rate": 3.4816856382636744e-06, - "loss": 0.8155762, - "num_input_tokens_seen": 92526300, - "step": 4281, - "time_per_iteration": 2.613163471221924 - }, - { - "auxiliary_loss_clip": 0.01110196, - "auxiliary_loss_mlp": 0.01038018, - "balance_loss_clip": 1.04847932, - "balance_loss_mlp": 1.02099478, - "epoch": 0.2574477679242447, - "flos": 23950837486080.0, - "grad_norm": 2.240063499401578, - "language_loss": 0.87314785, - "learning_rate": 3.4814240171605737e-06, - "loss": 0.89462996, - "num_input_tokens_seen": 92546465, - "step": 4282, - "time_per_iteration": 4.489396333694458 - }, - { - "auxiliary_loss_clip": 0.01148783, - "auxiliary_loss_mlp": 0.01046594, - "balance_loss_clip": 1.0526619, - "balance_loss_mlp": 1.02959502, - "epoch": 0.2575078911769127, - "flos": 21981711951360.0, - "grad_norm": 1.5167715532309152, - "language_loss": 0.70110047, - "learning_rate": 3.4811623398817267e-06, - "loss": 0.72305429, - "num_input_tokens_seen": 92567260, - "step": 4283, - "time_per_iteration": 2.619131565093994 - }, - { - "auxiliary_loss_clip": 0.01144466, - "auxiliary_loss_mlp": 0.00774605, - "balance_loss_clip": 1.05443883, - "balance_loss_mlp": 1.0010494, - "epoch": 0.25756801442958066, - "flos": 21945406279680.0, - "grad_norm": 1.950947388276708, - "language_loss": 0.80411774, - "learning_rate": 3.4809006064370553e-06, - "loss": 0.82330847, - "num_input_tokens_seen": 92585425, - "step": 4284, - "time_per_iteration": 2.656998634338379 - }, - { - "auxiliary_loss_clip": 0.01105473, - "auxiliary_loss_mlp": 0.01039993, - "balance_loss_clip": 1.05797076, - "balance_loss_mlp": 1.02488899, - "epoch": 0.2576281376822486, - "flos": 35261980058880.0, - "grad_norm": 2.2559612506718434, - "language_loss": 0.70473522, - "learning_rate": 3.4806388168364835e-06, - "loss": 0.72618985, - "num_input_tokens_seen": 92604770, - "step": 4285, - "time_per_iteration": 2.880835771560669 - }, - { - "auxiliary_loss_clip": 0.01127807, - "auxiliary_loss_mlp": 0.0104515, - "balance_loss_clip": 1.05229783, - "balance_loss_mlp": 1.02971268, - "epoch": 0.2576882609349166, - "flos": 14132285688960.0, - "grad_norm": 1.8739093647405893, - "language_loss": 0.58494061, - "learning_rate": 3.4803769710899402e-06, - "loss": 0.6066702, - "num_input_tokens_seen": 92622635, - "step": 4286, - "time_per_iteration": 2.63923978805542 - }, - { - "auxiliary_loss_clip": 0.01138174, - "auxiliary_loss_mlp": 0.01046794, - "balance_loss_clip": 1.05271184, - "balance_loss_mlp": 1.03020048, - "epoch": 0.25774838418758456, - "flos": 23258336204160.0, - "grad_norm": 1.4732857929087761, - "language_loss": 0.63687879, - "learning_rate": 3.480115069207354e-06, - "loss": 0.65872842, - "num_input_tokens_seen": 92642960, - "step": 4287, - "time_per_iteration": 2.67764949798584 - }, - { - "auxiliary_loss_clip": 0.01127889, - "auxiliary_loss_mlp": 0.01045385, - "balance_loss_clip": 1.05252934, - "balance_loss_mlp": 1.02769411, - "epoch": 0.2578085074402525, - "flos": 22601745544320.0, - "grad_norm": 2.134546441867425, - "language_loss": 0.71780413, - "learning_rate": 3.4798531111986557e-06, - "loss": 0.73953688, - "num_input_tokens_seen": 92662455, - "step": 4288, - "time_per_iteration": 2.7174036502838135 - }, - { - "auxiliary_loss_clip": 0.0110996, - "auxiliary_loss_mlp": 0.01042748, - "balance_loss_clip": 1.04934072, - "balance_loss_mlp": 1.02691674, - "epoch": 0.2578686306929205, - "flos": 24571840746240.0, - "grad_norm": 1.4449800602700236, - "language_loss": 0.77059102, - "learning_rate": 3.4795910970737786e-06, - "loss": 0.79211813, - "num_input_tokens_seen": 92683520, - "step": 4289, - "time_per_iteration": 2.748249053955078 - }, - { - "auxiliary_loss_clip": 0.01146276, - "auxiliary_loss_mlp": 0.00775089, - "balance_loss_clip": 1.05252326, - "balance_loss_mlp": 1.001122, - "epoch": 0.25792875394558845, - "flos": 18113953322880.0, - "grad_norm": 2.0235699584636295, - "language_loss": 0.85416883, - "learning_rate": 3.4793290268426592e-06, - "loss": 0.87338245, - "num_input_tokens_seen": 92701450, - "step": 4290, - "time_per_iteration": 2.593461751937866 - }, - { - "auxiliary_loss_clip": 0.01114221, - "auxiliary_loss_mlp": 0.01056837, - "balance_loss_clip": 1.05081999, - "balance_loss_mlp": 1.03660691, - "epoch": 0.2579888771982564, - "flos": 17712902995200.0, - "grad_norm": 2.4272093439618847, - "language_loss": 0.72360331, - "learning_rate": 3.4790669005152354e-06, - "loss": 0.74531388, - "num_input_tokens_seen": 92720355, - "step": 4291, - "time_per_iteration": 2.6838138103485107 - }, - { - "auxiliary_loss_clip": 0.01150945, - "auxiliary_loss_mlp": 0.0104494, - "balance_loss_clip": 1.05378067, - "balance_loss_mlp": 1.02758288, - "epoch": 0.2580490004509244, - "flos": 16434878112000.0, - "grad_norm": 2.78045823134535, - "language_loss": 0.80846477, - "learning_rate": 3.4788047181014458e-06, - "loss": 0.83042365, - "num_input_tokens_seen": 92736755, - "step": 4292, - "time_per_iteration": 2.595710277557373 - }, - { - "auxiliary_loss_clip": 0.0115367, - "auxiliary_loss_mlp": 0.01044878, - "balance_loss_clip": 1.05773902, - "balance_loss_mlp": 1.02702022, - "epoch": 0.25810912370359235, - "flos": 33835141128960.0, - "grad_norm": 2.057533015633898, - "language_loss": 0.67592025, - "learning_rate": 3.4785424796112337e-06, - "loss": 0.69790578, - "num_input_tokens_seen": 92757655, - "step": 4293, - "time_per_iteration": 2.699570894241333 - }, - { - "auxiliary_loss_clip": 0.0110485, - "auxiliary_loss_mlp": 0.01048043, - "balance_loss_clip": 1.04971898, - "balance_loss_mlp": 1.03190207, - "epoch": 0.2581692469562603, - "flos": 25192197561600.0, - "grad_norm": 2.0097854631835217, - "language_loss": 0.75671911, - "learning_rate": 3.478280185054542e-06, - "loss": 0.77824801, - "num_input_tokens_seen": 92776100, - "step": 4294, - "time_per_iteration": 2.7217960357666016 - }, - { - "auxiliary_loss_clip": 0.01098332, - "auxiliary_loss_mlp": 0.01053556, - "balance_loss_clip": 1.0444684, - "balance_loss_mlp": 1.03404188, - "epoch": 0.2582293702089283, - "flos": 34932212271360.0, - "grad_norm": 1.7798433628760433, - "language_loss": 0.8047998, - "learning_rate": 3.478017834441318e-06, - "loss": 0.82631868, - "num_input_tokens_seen": 92798880, - "step": 4295, - "time_per_iteration": 2.871460437774658 - }, - { - "auxiliary_loss_clip": 0.01055358, - "auxiliary_loss_mlp": 0.01044188, - "balance_loss_clip": 1.04843688, - "balance_loss_mlp": 1.0256989, - "epoch": 0.2582894934615963, - "flos": 26833746038400.0, - "grad_norm": 2.1012913939780753, - "language_loss": 0.72843397, - "learning_rate": 3.4777554277815096e-06, - "loss": 0.74942946, - "num_input_tokens_seen": 92817750, - "step": 4296, - "time_per_iteration": 3.173367738723755 - }, - { - "auxiliary_loss_clip": 0.01091622, - "auxiliary_loss_mlp": 0.01038465, - "balance_loss_clip": 1.05392241, - "balance_loss_mlp": 1.02106011, - "epoch": 0.25834961671426426, - "flos": 23515241253120.0, - "grad_norm": 1.5772062283828172, - "language_loss": 0.86928564, - "learning_rate": 3.477492965085067e-06, - "loss": 0.8905865, - "num_input_tokens_seen": 92837995, - "step": 4297, - "time_per_iteration": 3.1598868370056152 - }, - { - "auxiliary_loss_clip": 0.01149748, - "auxiliary_loss_mlp": 0.01047412, - "balance_loss_clip": 1.05517435, - "balance_loss_mlp": 1.03090191, - "epoch": 0.25840973996693223, - "flos": 22451028076800.0, - "grad_norm": 1.8030727150796175, - "language_loss": 0.84720427, - "learning_rate": 3.477230446361943e-06, - "loss": 0.86917591, - "num_input_tokens_seen": 92857245, - "step": 4298, - "time_per_iteration": 2.632448196411133 - }, - { - "auxiliary_loss_clip": 0.01135108, - "auxiliary_loss_mlp": 0.00775458, - "balance_loss_clip": 1.05262494, - "balance_loss_mlp": 1.00111055, - "epoch": 0.2584698632196002, - "flos": 11290854366720.0, - "grad_norm": 2.0124667048247686, - "language_loss": 0.83514953, - "learning_rate": 3.4769678716220927e-06, - "loss": 0.8542552, - "num_input_tokens_seen": 92873265, - "step": 4299, - "time_per_iteration": 2.631248950958252 - }, - { - "auxiliary_loss_clip": 0.01117485, - "auxiliary_loss_mlp": 0.0103505, - "balance_loss_clip": 1.05216849, - "balance_loss_mlp": 1.01868308, - "epoch": 0.25852998647226816, - "flos": 17929982839680.0, - "grad_norm": 2.419754138344463, - "language_loss": 0.82422709, - "learning_rate": 3.4767052408754726e-06, - "loss": 0.84575242, - "num_input_tokens_seen": 92890880, - "step": 4300, - "time_per_iteration": 2.650834083557129 - }, - { - "auxiliary_loss_clip": 0.0113846, - "auxiliary_loss_mlp": 0.01041208, - "balance_loss_clip": 1.0535903, - "balance_loss_mlp": 1.02343392, - "epoch": 0.2585901097249361, - "flos": 33256117889280.0, - "grad_norm": 2.971673559214411, - "language_loss": 0.66949177, - "learning_rate": 3.4764425541320417e-06, - "loss": 0.69128841, - "num_input_tokens_seen": 92910770, - "step": 4301, - "time_per_iteration": 2.729519844055176 - }, - { - "auxiliary_loss_clip": 0.01139778, - "auxiliary_loss_mlp": 0.01040158, - "balance_loss_clip": 1.05335701, - "balance_loss_mlp": 1.02245533, - "epoch": 0.2586502329776041, - "flos": 18441278985600.0, - "grad_norm": 2.29820997177689, - "language_loss": 0.81177735, - "learning_rate": 3.4761798114017617e-06, - "loss": 0.83357668, - "num_input_tokens_seen": 92929520, - "step": 4302, - "time_per_iteration": 2.5496692657470703 - }, - { - "auxiliary_loss_clip": 0.01105433, - "auxiliary_loss_mlp": 0.01042423, - "balance_loss_clip": 1.05242491, - "balance_loss_mlp": 1.02542388, - "epoch": 0.25871035623027205, - "flos": 17968120104960.0, - "grad_norm": 1.8036447001063776, - "language_loss": 0.92147923, - "learning_rate": 3.475917012694595e-06, - "loss": 0.94295776, - "num_input_tokens_seen": 92947890, - "step": 4303, - "time_per_iteration": 2.686222791671753 - }, - { - "auxiliary_loss_clip": 0.01141887, - "auxiliary_loss_mlp": 0.01040139, - "balance_loss_clip": 1.05643094, - "balance_loss_mlp": 1.02322304, - "epoch": 0.25877047948294, - "flos": 27777729415680.0, - "grad_norm": 2.7085759571044368, - "language_loss": 0.67138135, - "learning_rate": 3.475654158020507e-06, - "loss": 0.69320166, - "num_input_tokens_seen": 92967690, - "step": 4304, - "time_per_iteration": 2.665797472000122 - }, - { - "auxiliary_loss_clip": 0.01113882, - "auxiliary_loss_mlp": 0.01041979, - "balance_loss_clip": 1.0509342, - "balance_loss_mlp": 1.02498007, - "epoch": 0.258830602735608, - "flos": 27125843437440.0, - "grad_norm": 2.126938769919949, - "language_loss": 0.72085559, - "learning_rate": 3.4753912473894657e-06, - "loss": 0.74241412, - "num_input_tokens_seen": 92986830, - "step": 4305, - "time_per_iteration": 2.7514076232910156 - }, - { - "auxiliary_loss_clip": 0.01103045, - "auxiliary_loss_mlp": 0.00775987, - "balance_loss_clip": 1.04804707, - "balance_loss_mlp": 1.00122118, - "epoch": 0.25889072598827595, - "flos": 17891486438400.0, - "grad_norm": 6.414506312387852, - "language_loss": 0.76175749, - "learning_rate": 3.4751282808114403e-06, - "loss": 0.78054774, - "num_input_tokens_seen": 93002740, - "step": 4306, - "time_per_iteration": 2.7326161861419678 - }, - { - "auxiliary_loss_clip": 0.01049461, - "auxiliary_loss_mlp": 0.0102188, - "balance_loss_clip": 1.03476799, - "balance_loss_mlp": 1.01943636, - "epoch": 0.2589508492409439, - "flos": 53934955724160.0, - "grad_norm": 0.8427062291747792, - "language_loss": 0.57128024, - "learning_rate": 3.474865258296403e-06, - "loss": 0.59199357, - "num_input_tokens_seen": 93058645, - "step": 4307, - "time_per_iteration": 3.1499595642089844 - }, - { - "auxiliary_loss_clip": 0.01123356, - "auxiliary_loss_mlp": 0.01045032, - "balance_loss_clip": 1.0514828, - "balance_loss_mlp": 1.02858078, - "epoch": 0.2590109724936119, - "flos": 22125785402880.0, - "grad_norm": 1.5299746109283647, - "language_loss": 0.71727359, - "learning_rate": 3.474602179854327e-06, - "loss": 0.73895752, - "num_input_tokens_seen": 93077140, - "step": 4308, - "time_per_iteration": 2.6824283599853516 - }, - { - "auxiliary_loss_clip": 0.01152705, - "auxiliary_loss_mlp": 0.01046843, - "balance_loss_clip": 1.05659723, - "balance_loss_mlp": 1.02976048, - "epoch": 0.2590710957462799, - "flos": 13474294398720.0, - "grad_norm": 1.8339599204524273, - "language_loss": 0.83940542, - "learning_rate": 3.4743390454951886e-06, - "loss": 0.86140084, - "num_input_tokens_seen": 93093580, - "step": 4309, - "time_per_iteration": 2.560194253921509 - }, - { - "auxiliary_loss_clip": 0.01137306, - "auxiliary_loss_mlp": 0.01044025, - "balance_loss_clip": 1.05587196, - "balance_loss_mlp": 1.02815771, - "epoch": 0.25913121899894787, - "flos": 22307098279680.0, - "grad_norm": 1.5397823214091813, - "language_loss": 0.84657532, - "learning_rate": 3.474075855228966e-06, - "loss": 0.86838865, - "num_input_tokens_seen": 93112345, - "step": 4310, - "time_per_iteration": 2.627716064453125 - }, - { - "auxiliary_loss_clip": 0.01143598, - "auxiliary_loss_mlp": 0.01047667, - "balance_loss_clip": 1.05802059, - "balance_loss_mlp": 1.03141904, - "epoch": 0.25919134225161583, - "flos": 25811728364160.0, - "grad_norm": 2.0190220849922094, - "language_loss": 0.77145267, - "learning_rate": 3.473812609065639e-06, - "loss": 0.79336536, - "num_input_tokens_seen": 93131545, - "step": 4311, - "time_per_iteration": 2.694856643676758 - }, - { - "auxiliary_loss_clip": 0.01110239, - "auxiliary_loss_mlp": 0.01052381, - "balance_loss_clip": 1.04629123, - "balance_loss_mlp": 1.03498793, - "epoch": 0.2592514655042838, - "flos": 31212262108800.0, - "grad_norm": 1.9233367952735905, - "language_loss": 0.72848439, - "learning_rate": 3.4735493070151904e-06, - "loss": 0.75011057, - "num_input_tokens_seen": 93150730, - "step": 4312, - "time_per_iteration": 2.7577714920043945 - }, - { - "auxiliary_loss_clip": 0.01150768, - "auxiliary_loss_mlp": 0.01044439, - "balance_loss_clip": 1.05618715, - "balance_loss_mlp": 1.02845287, - "epoch": 0.25931158875695176, - "flos": 18474998878080.0, - "grad_norm": 1.8485738044524733, - "language_loss": 0.70193493, - "learning_rate": 3.4732859490876044e-06, - "loss": 0.72388697, - "num_input_tokens_seen": 93167895, - "step": 4313, - "time_per_iteration": 2.6447813510894775 - }, - { - "auxiliary_loss_clip": 0.01150117, - "auxiliary_loss_mlp": 0.01054192, - "balance_loss_clip": 1.05624926, - "balance_loss_mlp": 1.03845656, - "epoch": 0.2593717120096197, - "flos": 19207935895680.0, - "grad_norm": 1.8538125013537565, - "language_loss": 0.80462205, - "learning_rate": 3.473022535292867e-06, - "loss": 0.82666522, - "num_input_tokens_seen": 93187650, - "step": 4314, - "time_per_iteration": 2.6073296070098877 - }, - { - "auxiliary_loss_clip": 0.01110006, - "auxiliary_loss_mlp": 0.01049511, - "balance_loss_clip": 1.04867387, - "balance_loss_mlp": 1.03253555, - "epoch": 0.2594318352622877, - "flos": 31248100903680.0, - "grad_norm": 2.061113629574459, - "language_loss": 0.670748, - "learning_rate": 3.472759065640968e-06, - "loss": 0.69234318, - "num_input_tokens_seen": 93207370, - "step": 4315, - "time_per_iteration": 6.427948236465454 - }, - { - "auxiliary_loss_clip": 0.01096074, - "auxiliary_loss_mlp": 0.01056601, - "balance_loss_clip": 1.04853845, - "balance_loss_mlp": 1.0407939, - "epoch": 0.25949195851495566, - "flos": 22237144542720.0, - "grad_norm": 2.0096953575355125, - "language_loss": 0.79649067, - "learning_rate": 3.4724955401418976e-06, - "loss": 0.81801736, - "num_input_tokens_seen": 93227925, - "step": 4316, - "time_per_iteration": 2.7463796138763428 - }, - { - "auxiliary_loss_clip": 0.01096584, - "auxiliary_loss_mlp": 0.01048328, - "balance_loss_clip": 1.0487628, - "balance_loss_mlp": 1.03112638, - "epoch": 0.2595520817676236, - "flos": 28075716645120.0, - "grad_norm": 3.2727308584132584, - "language_loss": 0.77498394, - "learning_rate": 3.4722319588056487e-06, - "loss": 0.79643309, - "num_input_tokens_seen": 93250020, - "step": 4317, - "time_per_iteration": 4.658867359161377 - }, - { - "auxiliary_loss_clip": 0.01155612, - "auxiliary_loss_mlp": 0.01054128, - "balance_loss_clip": 1.05959845, - "balance_loss_mlp": 1.03734958, - "epoch": 0.2596122050202916, - "flos": 20190954378240.0, - "grad_norm": 2.117435309152476, - "language_loss": 0.77656054, - "learning_rate": 3.4719683216422163e-06, - "loss": 0.79865795, - "num_input_tokens_seen": 93269070, - "step": 4318, - "time_per_iteration": 2.5934906005859375 - }, - { - "auxiliary_loss_clip": 0.01146449, - "auxiliary_loss_mlp": 0.01045441, - "balance_loss_clip": 1.0530901, - "balance_loss_mlp": 1.02733302, - "epoch": 0.25967232827295955, - "flos": 22527949052160.0, - "grad_norm": 1.6144223240331488, - "language_loss": 0.76362926, - "learning_rate": 3.471704628661598e-06, - "loss": 0.78554815, - "num_input_tokens_seen": 93290250, - "step": 4319, - "time_per_iteration": 2.607649564743042 - }, - { - "auxiliary_loss_clip": 0.01125042, - "auxiliary_loss_mlp": 0.01041624, - "balance_loss_clip": 1.05419481, - "balance_loss_mlp": 1.02587628, - "epoch": 0.2597324515256275, - "flos": 21068252156160.0, - "grad_norm": 1.6090277746740278, - "language_loss": 0.76549125, - "learning_rate": 3.4714408798737925e-06, - "loss": 0.78715789, - "num_input_tokens_seen": 93310090, - "step": 4320, - "time_per_iteration": 2.722574472427368 - }, - { - "auxiliary_loss_clip": 0.01116281, - "auxiliary_loss_mlp": 0.01042709, - "balance_loss_clip": 1.05157554, - "balance_loss_mlp": 1.02546, - "epoch": 0.2597925747782955, - "flos": 22050013662720.0, - "grad_norm": 1.6564648175426406, - "language_loss": 0.71067965, - "learning_rate": 3.471177075288801e-06, - "loss": 0.73226953, - "num_input_tokens_seen": 93329570, - "step": 4321, - "time_per_iteration": 4.276093244552612 - }, - { - "auxiliary_loss_clip": 0.01125031, - "auxiliary_loss_mlp": 0.01055033, - "balance_loss_clip": 1.05191207, - "balance_loss_mlp": 1.03549457, - "epoch": 0.2598526980309635, - "flos": 19536949497600.0, - "grad_norm": 1.9031382952841078, - "language_loss": 0.74805915, - "learning_rate": 3.4709132149166277e-06, - "loss": 0.76985979, - "num_input_tokens_seen": 93347920, - "step": 4322, - "time_per_iteration": 2.6573097705841064 - }, - { - "auxiliary_loss_clip": 0.0111558, - "auxiliary_loss_mlp": 0.0104757, - "balance_loss_clip": 1.05213332, - "balance_loss_mlp": 1.03004622, - "epoch": 0.25991282128363147, - "flos": 24495207079680.0, - "grad_norm": 1.8978708709823064, - "language_loss": 0.73837054, - "learning_rate": 3.470649298767278e-06, - "loss": 0.76000202, - "num_input_tokens_seen": 93367145, - "step": 4323, - "time_per_iteration": 2.75765061378479 - }, - { - "auxiliary_loss_clip": 0.01139686, - "auxiliary_loss_mlp": 0.00775622, - "balance_loss_clip": 1.0509938, - "balance_loss_mlp": 1.00099182, - "epoch": 0.25997294453629943, - "flos": 24201457655040.0, - "grad_norm": 2.107506603705316, - "language_loss": 0.67186093, - "learning_rate": 3.4703853268507597e-06, - "loss": 0.69101399, - "num_input_tokens_seen": 93386555, - "step": 4324, - "time_per_iteration": 2.752307891845703 - }, - { - "auxiliary_loss_clip": 0.0109649, - "auxiliary_loss_mlp": 0.01045367, - "balance_loss_clip": 1.05030632, - "balance_loss_mlp": 1.03026319, - "epoch": 0.2600330677889674, - "flos": 31431460855680.0, - "grad_norm": 2.121769328280442, - "language_loss": 0.71064055, - "learning_rate": 3.470121299177082e-06, - "loss": 0.732059, - "num_input_tokens_seen": 93405590, - "step": 4325, - "time_per_iteration": 2.824281692504883 - }, - { - "auxiliary_loss_clip": 0.01134613, - "auxiliary_loss_mlp": 0.01035571, - "balance_loss_clip": 1.04941416, - "balance_loss_mlp": 1.01839304, - "epoch": 0.26009319104163536, - "flos": 32266527217920.0, - "grad_norm": 1.8496839878379767, - "language_loss": 0.73106551, - "learning_rate": 3.469857215756257e-06, - "loss": 0.75276732, - "num_input_tokens_seen": 93424750, - "step": 4326, - "time_per_iteration": 2.7235658168792725 - }, - { - "auxiliary_loss_clip": 0.01118123, - "auxiliary_loss_mlp": 0.00776184, - "balance_loss_clip": 1.05001175, - "balance_loss_mlp": 1.00100303, - "epoch": 0.26015331429430333, - "flos": 26286754752000.0, - "grad_norm": 1.7229255626307804, - "language_loss": 0.86908734, - "learning_rate": 3.4695930765982997e-06, - "loss": 0.88803041, - "num_input_tokens_seen": 93443465, - "step": 4327, - "time_per_iteration": 2.7072155475616455 - }, - { - "auxiliary_loss_clip": 0.01153995, - "auxiliary_loss_mlp": 0.00775932, - "balance_loss_clip": 1.05640841, - "balance_loss_mlp": 1.0008533, - "epoch": 0.2602134375469713, - "flos": 21142335957120.0, - "grad_norm": 1.4664721830580452, - "language_loss": 0.80265766, - "learning_rate": 3.4693288817132255e-06, - "loss": 0.82195687, - "num_input_tokens_seen": 93462580, - "step": 4328, - "time_per_iteration": 2.6463024616241455 - }, - { - "auxiliary_loss_clip": 0.0111992, - "auxiliary_loss_mlp": 0.00774533, - "balance_loss_clip": 1.04837036, - "balance_loss_mlp": 1.00092077, - "epoch": 0.26027356079963926, - "flos": 25921327737600.0, - "grad_norm": 1.6317826670237516, - "language_loss": 0.88094193, - "learning_rate": 3.4690646311110525e-06, - "loss": 0.89988649, - "num_input_tokens_seen": 93482790, - "step": 4329, - "time_per_iteration": 2.7130861282348633 - }, - { - "auxiliary_loss_clip": 0.011478, - "auxiliary_loss_mlp": 0.01040633, - "balance_loss_clip": 1.05545115, - "balance_loss_mlp": 1.02431321, - "epoch": 0.2603336840523072, - "flos": 26359222440960.0, - "grad_norm": 1.8335620949826397, - "language_loss": 0.77834195, - "learning_rate": 3.468800324801802e-06, - "loss": 0.80022621, - "num_input_tokens_seen": 93498795, - "step": 4330, - "time_per_iteration": 2.6223180294036865 - }, - { - "auxiliary_loss_clip": 0.01148961, - "auxiliary_loss_mlp": 0.01047898, - "balance_loss_clip": 1.0536809, - "balance_loss_mlp": 1.03081572, - "epoch": 0.2603938073049752, - "flos": 23513661054720.0, - "grad_norm": 1.5875829464999673, - "language_loss": 0.75683081, - "learning_rate": 3.4685359627954958e-06, - "loss": 0.77879941, - "num_input_tokens_seen": 93518335, - "step": 4331, - "time_per_iteration": 2.6383559703826904 - }, - { - "auxiliary_loss_clip": 0.01130325, - "auxiliary_loss_mlp": 0.01042577, - "balance_loss_clip": 1.05964541, - "balance_loss_mlp": 1.0261023, - "epoch": 0.26045393055764315, - "flos": 25374300537600.0, - "grad_norm": 1.3798785286413686, - "language_loss": 0.69174874, - "learning_rate": 3.4682715451021584e-06, - "loss": 0.71347773, - "num_input_tokens_seen": 93539170, - "step": 4332, - "time_per_iteration": 2.675203800201416 - }, - { - "auxiliary_loss_clip": 0.01117119, - "auxiliary_loss_mlp": 0.01048864, - "balance_loss_clip": 1.04849494, - "balance_loss_mlp": 1.03203201, - "epoch": 0.2605140538103111, - "flos": 27635272076160.0, - "grad_norm": 6.1371153370044915, - "language_loss": 0.79897749, - "learning_rate": 3.4680070717318174e-06, - "loss": 0.82063735, - "num_input_tokens_seen": 93558480, - "step": 4333, - "time_per_iteration": 2.7595479488372803 - }, - { - "auxiliary_loss_clip": 0.01144159, - "auxiliary_loss_mlp": 0.01039411, - "balance_loss_clip": 1.05260658, - "balance_loss_mlp": 1.02317452, - "epoch": 0.2605741770629791, - "flos": 13769839503360.0, - "grad_norm": 1.9478362516602954, - "language_loss": 0.80919975, - "learning_rate": 3.467742542694501e-06, - "loss": 0.83103544, - "num_input_tokens_seen": 93575220, - "step": 4334, - "time_per_iteration": 2.585676670074463 - }, - { - "auxiliary_loss_clip": 0.01121127, - "auxiliary_loss_mlp": 0.0103772, - "balance_loss_clip": 1.04868293, - "balance_loss_mlp": 1.02051783, - "epoch": 0.26063430031564705, - "flos": 26031681296640.0, - "grad_norm": 1.8490049893982383, - "language_loss": 0.8027274, - "learning_rate": 3.46747795800024e-06, - "loss": 0.82431591, - "num_input_tokens_seen": 93597015, - "step": 4335, - "time_per_iteration": 2.730853796005249 - }, - { - "auxiliary_loss_clip": 0.01060862, - "auxiliary_loss_mlp": 0.01054521, - "balance_loss_clip": 1.03598261, - "balance_loss_mlp": 1.05267298, - "epoch": 0.26069442356831507, - "flos": 62443809820800.0, - "grad_norm": 1.1166557113782816, - "language_loss": 0.60850358, - "learning_rate": 3.467213317659068e-06, - "loss": 0.62965739, - "num_input_tokens_seen": 93657775, - "step": 4336, - "time_per_iteration": 3.1322128772735596 - }, - { - "auxiliary_loss_clip": 0.01111016, - "auxiliary_loss_mlp": 0.01046835, - "balance_loss_clip": 1.05039525, - "balance_loss_mlp": 1.02976441, - "epoch": 0.26075454682098304, - "flos": 13626376583040.0, - "grad_norm": 2.784557437613843, - "language_loss": 0.7679469, - "learning_rate": 3.46694862168102e-06, - "loss": 0.78952539, - "num_input_tokens_seen": 93676145, - "step": 4337, - "time_per_iteration": 2.704305410385132 - }, - { - "auxiliary_loss_clip": 0.0112146, - "auxiliary_loss_mlp": 0.01045064, - "balance_loss_clip": 1.04997659, - "balance_loss_mlp": 1.02728987, - "epoch": 0.260814670073651, - "flos": 12126531260160.0, - "grad_norm": 2.7677016823816976, - "language_loss": 0.74653983, - "learning_rate": 3.4666838700761334e-06, - "loss": 0.76820505, - "num_input_tokens_seen": 93692480, - "step": 4338, - "time_per_iteration": 2.652679204940796 - }, - { - "auxiliary_loss_clip": 0.01140171, - "auxiliary_loss_mlp": 0.01040507, - "balance_loss_clip": 1.05246329, - "balance_loss_mlp": 1.02314997, - "epoch": 0.26087479332631897, - "flos": 15122522805120.0, - "grad_norm": 2.378816803290104, - "language_loss": 0.81061137, - "learning_rate": 3.466419062854447e-06, - "loss": 0.8324182, - "num_input_tokens_seen": 93710165, - "step": 4339, - "time_per_iteration": 2.7237682342529297 - }, - { - "auxiliary_loss_clip": 0.01090328, - "auxiliary_loss_mlp": 0.01040213, - "balance_loss_clip": 1.04649866, - "balance_loss_mlp": 1.02436984, - "epoch": 0.26093491657898693, - "flos": 24680937329280.0, - "grad_norm": 1.6860698424881835, - "language_loss": 0.76643449, - "learning_rate": 3.4661542000260033e-06, - "loss": 0.78773987, - "num_input_tokens_seen": 93730185, - "step": 4340, - "time_per_iteration": 2.817647695541382 - }, - { - "auxiliary_loss_clip": 0.01082903, - "auxiliary_loss_mlp": 0.01040837, - "balance_loss_clip": 1.04781985, - "balance_loss_mlp": 1.02381396, - "epoch": 0.2609950398316549, - "flos": 25116138512640.0, - "grad_norm": 1.954971477972507, - "language_loss": 0.82689369, - "learning_rate": 3.465889281600845e-06, - "loss": 0.84813106, - "num_input_tokens_seen": 93747690, - "step": 4341, - "time_per_iteration": 2.822387218475342 - }, - { - "auxiliary_loss_clip": 0.01148407, - "auxiliary_loss_mlp": 0.0104134, - "balance_loss_clip": 1.0550344, - "balance_loss_mlp": 1.02387536, - "epoch": 0.26105516308432286, - "flos": 28548588216960.0, - "grad_norm": 2.3225619433460083, - "language_loss": 0.76828772, - "learning_rate": 3.4656243075890183e-06, - "loss": 0.79018521, - "num_input_tokens_seen": 93767405, - "step": 4342, - "time_per_iteration": 2.7091987133026123 - }, - { - "auxiliary_loss_clip": 0.01137117, - "auxiliary_loss_mlp": 0.01036127, - "balance_loss_clip": 1.05262113, - "balance_loss_mlp": 1.01837635, - "epoch": 0.2611152863369908, - "flos": 39530609447040.0, - "grad_norm": 1.8380809165191976, - "language_loss": 0.66072762, - "learning_rate": 3.4653592780005707e-06, - "loss": 0.68246007, - "num_input_tokens_seen": 93789950, - "step": 4343, - "time_per_iteration": 2.7885191440582275 - }, - { - "auxiliary_loss_clip": 0.01076135, - "auxiliary_loss_mlp": 0.01045298, - "balance_loss_clip": 1.04419374, - "balance_loss_mlp": 1.02715397, - "epoch": 0.2611754095896588, - "flos": 13735329511680.0, - "grad_norm": 1.9033089414913282, - "language_loss": 0.73626471, - "learning_rate": 3.465094192845553e-06, - "loss": 0.75747907, - "num_input_tokens_seen": 93807835, - "step": 4344, - "time_per_iteration": 2.7622575759887695 - }, - { - "auxiliary_loss_clip": 0.01150726, - "auxiliary_loss_mlp": 0.01042349, - "balance_loss_clip": 1.05625904, - "balance_loss_mlp": 1.02560019, - "epoch": 0.26123553284232676, - "flos": 21506649649920.0, - "grad_norm": 2.7815673216786045, - "language_loss": 0.86820161, - "learning_rate": 3.4648290521340165e-06, - "loss": 0.89013231, - "num_input_tokens_seen": 93825670, - "step": 4345, - "time_per_iteration": 2.615021228790283 - }, - { - "auxiliary_loss_clip": 0.01121997, - "auxiliary_loss_mlp": 0.01036853, - "balance_loss_clip": 1.05178094, - "balance_loss_mlp": 1.02056956, - "epoch": 0.2612956560949947, - "flos": 21139786091520.0, - "grad_norm": 1.9109970692142244, - "language_loss": 0.76235008, - "learning_rate": 3.464563855876015e-06, - "loss": 0.78393853, - "num_input_tokens_seen": 93844045, - "step": 4346, - "time_per_iteration": 2.660766363143921 - }, - { - "auxiliary_loss_clip": 0.01140284, - "auxiliary_loss_mlp": 0.01045855, - "balance_loss_clip": 1.05571795, - "balance_loss_mlp": 1.02870095, - "epoch": 0.2613557793476627, - "flos": 25119011600640.0, - "grad_norm": 1.6628741865434964, - "language_loss": 0.75995654, - "learning_rate": 3.464298604081606e-06, - "loss": 0.78181791, - "num_input_tokens_seen": 93864380, - "step": 4347, - "time_per_iteration": 2.6985979080200195 - }, - { - "auxiliary_loss_clip": 0.0110699, - "auxiliary_loss_mlp": 0.01041742, - "balance_loss_clip": 1.05063343, - "balance_loss_mlp": 1.02501726, - "epoch": 0.26141590260033065, - "flos": 26067699659520.0, - "grad_norm": 1.7474860409603998, - "language_loss": 0.73196864, - "learning_rate": 3.4640332967608476e-06, - "loss": 0.75345594, - "num_input_tokens_seen": 93885475, - "step": 4348, - "time_per_iteration": 2.7511887550354004 - }, - { - "auxiliary_loss_clip": 0.01110529, - "auxiliary_loss_mlp": 0.01045849, - "balance_loss_clip": 1.05199265, - "balance_loss_mlp": 1.0290519, - "epoch": 0.2614760258529987, - "flos": 25701518459520.0, - "grad_norm": 2.6377025292028944, - "language_loss": 0.91262084, - "learning_rate": 3.463767933923799e-06, - "loss": 0.93418467, - "num_input_tokens_seen": 93905545, - "step": 4349, - "time_per_iteration": 2.720240354537964 - }, - { - "auxiliary_loss_clip": 0.0113714, - "auxiliary_loss_mlp": 0.01048228, - "balance_loss_clip": 1.05569661, - "balance_loss_mlp": 1.03184831, - "epoch": 0.26153614910566664, - "flos": 17457147181440.0, - "grad_norm": 1.7232851278977876, - "language_loss": 0.80046499, - "learning_rate": 3.463502515580524e-06, - "loss": 0.82231867, - "num_input_tokens_seen": 93924185, - "step": 4350, - "time_per_iteration": 2.652054786682129 - }, - { - "auxiliary_loss_clip": 0.0113538, - "auxiliary_loss_mlp": 0.01049567, - "balance_loss_clip": 1.05652642, - "balance_loss_mlp": 1.03299654, - "epoch": 0.2615962723583346, - "flos": 17712831168000.0, - "grad_norm": 10.816271600027287, - "language_loss": 0.62736505, - "learning_rate": 3.4632370417410866e-06, - "loss": 0.64921451, - "num_input_tokens_seen": 93942825, - "step": 4351, - "time_per_iteration": 2.6674954891204834 - }, - { - "auxiliary_loss_clip": 0.01138265, - "auxiliary_loss_mlp": 0.01048518, - "balance_loss_clip": 1.05201697, - "balance_loss_mlp": 1.03168559, - "epoch": 0.26165639561100257, - "flos": 23257725672960.0, - "grad_norm": 1.9014393183165526, - "language_loss": 0.84131002, - "learning_rate": 3.462971512415555e-06, - "loss": 0.86317784, - "num_input_tokens_seen": 93962045, - "step": 4352, - "time_per_iteration": 2.8033063411712646 - }, - { - "auxiliary_loss_clip": 0.01065372, - "auxiliary_loss_mlp": 0.0102292, - "balance_loss_clip": 1.04145527, - "balance_loss_mlp": 1.02078664, - "epoch": 0.26171651886367053, - "flos": 66737970800640.0, - "grad_norm": 0.8050815788583346, - "language_loss": 0.70591724, - "learning_rate": 3.462705927613996e-06, - "loss": 0.7268002, - "num_input_tokens_seen": 94021175, - "step": 4353, - "time_per_iteration": 3.101954936981201 - }, - { - "auxiliary_loss_clip": 0.01115948, - "auxiliary_loss_mlp": 0.01069336, - "balance_loss_clip": 1.04858005, - "balance_loss_mlp": 1.05013168, - "epoch": 0.2617766421163385, - "flos": 22349581090560.0, - "grad_norm": 1.6494861832481549, - "language_loss": 0.77562749, - "learning_rate": 3.4624402873464816e-06, - "loss": 0.79748034, - "num_input_tokens_seen": 94043370, - "step": 4354, - "time_per_iteration": 2.772723436355591 - }, - { - "auxiliary_loss_clip": 0.01089887, - "auxiliary_loss_mlp": 0.01058882, - "balance_loss_clip": 1.04805279, - "balance_loss_mlp": 1.04082203, - "epoch": 0.26183676536900646, - "flos": 26067125041920.0, - "grad_norm": 1.8339738923409379, - "language_loss": 0.68351537, - "learning_rate": 3.462174591623085e-06, - "loss": 0.70500308, - "num_input_tokens_seen": 94063510, - "step": 4355, - "time_per_iteration": 5.908639430999756 - }, - { - "auxiliary_loss_clip": 0.01094509, - "auxiliary_loss_mlp": 0.01039879, - "balance_loss_clip": 1.0486095, - "balance_loss_mlp": 1.02164054, - "epoch": 0.26189688862167443, - "flos": 20996466825600.0, - "grad_norm": 1.9440617828376934, - "language_loss": 0.67573452, - "learning_rate": 3.4619088404538815e-06, - "loss": 0.69707847, - "num_input_tokens_seen": 94083865, - "step": 4356, - "time_per_iteration": 4.351539611816406 - }, - { - "auxiliary_loss_clip": 0.01057297, - "auxiliary_loss_mlp": 0.0100707, - "balance_loss_clip": 1.03335488, - "balance_loss_mlp": 1.00484037, - "epoch": 0.2619570118743424, - "flos": 65798261141760.0, - "grad_norm": 0.6809064288126679, - "language_loss": 0.53124392, - "learning_rate": 3.4616430338489487e-06, - "loss": 0.55188763, - "num_input_tokens_seen": 94144095, - "step": 4357, - "time_per_iteration": 3.0896964073181152 - }, - { - "auxiliary_loss_clip": 0.01139918, - "auxiliary_loss_mlp": 0.0104768, - "balance_loss_clip": 1.05365348, - "balance_loss_mlp": 1.03106248, - "epoch": 0.26201713512701036, - "flos": 28766817296640.0, - "grad_norm": 1.8814759411194193, - "language_loss": 0.84233022, - "learning_rate": 3.4613771718183654e-06, - "loss": 0.86420614, - "num_input_tokens_seen": 94163035, - "step": 4358, - "time_per_iteration": 2.723057746887207 - }, - { - "auxiliary_loss_clip": 0.01127273, - "auxiliary_loss_mlp": 0.01043309, - "balance_loss_clip": 1.04886353, - "balance_loss_mlp": 1.02411628, - "epoch": 0.2620772583796783, - "flos": 26432516142720.0, - "grad_norm": 2.354545555797757, - "language_loss": 0.67324048, - "learning_rate": 3.4611112543722127e-06, - "loss": 0.69494629, - "num_input_tokens_seen": 94182520, - "step": 4359, - "time_per_iteration": 2.7128403186798096 - }, - { - "auxiliary_loss_clip": 0.01118602, - "auxiliary_loss_mlp": 0.01045018, - "balance_loss_clip": 1.04637527, - "balance_loss_mlp": 1.02880526, - "epoch": 0.2621373816323463, - "flos": 20156552127360.0, - "grad_norm": 1.8862311303010293, - "language_loss": 0.78726596, - "learning_rate": 3.4608452815205757e-06, - "loss": 0.80890214, - "num_input_tokens_seen": 94201795, - "step": 4360, - "time_per_iteration": 4.41027569770813 - }, - { - "auxiliary_loss_clip": 0.01119481, - "auxiliary_loss_mlp": 0.01042435, - "balance_loss_clip": 1.04831719, - "balance_loss_mlp": 1.02640164, - "epoch": 0.26219750488501425, - "flos": 28621235473920.0, - "grad_norm": 1.8399079957082187, - "language_loss": 0.67980468, - "learning_rate": 3.4605792532735387e-06, - "loss": 0.70142382, - "num_input_tokens_seen": 94222390, - "step": 4361, - "time_per_iteration": 2.7642054557800293 - }, - { - "auxiliary_loss_clip": 0.01139509, - "auxiliary_loss_mlp": 0.01055985, - "balance_loss_clip": 1.05313993, - "balance_loss_mlp": 1.03842545, - "epoch": 0.2622576281376823, - "flos": 15042549173760.0, - "grad_norm": 2.1489496912575166, - "language_loss": 0.84068632, - "learning_rate": 3.46031316964119e-06, - "loss": 0.86264122, - "num_input_tokens_seen": 94239980, - "step": 4362, - "time_per_iteration": 2.6152050495147705 - }, - { - "auxiliary_loss_clip": 0.01105407, - "auxiliary_loss_mlp": 0.01046107, - "balance_loss_clip": 1.04752779, - "balance_loss_mlp": 1.02867842, - "epoch": 0.26231775139035024, - "flos": 26396174557440.0, - "grad_norm": 2.0545933935481835, - "language_loss": 0.65068752, - "learning_rate": 3.4600470306336197e-06, - "loss": 0.67220271, - "num_input_tokens_seen": 94260715, - "step": 4363, - "time_per_iteration": 2.7297046184539795 - }, - { - "auxiliary_loss_clip": 0.01040739, - "auxiliary_loss_mlp": 0.01017272, - "balance_loss_clip": 1.02776587, - "balance_loss_mlp": 1.01506662, - "epoch": 0.2623778746430182, - "flos": 65408918647680.0, - "grad_norm": 0.9195643121956573, - "language_loss": 0.61104208, - "learning_rate": 3.4597808362609194e-06, - "loss": 0.6316222, - "num_input_tokens_seen": 94321285, - "step": 4364, - "time_per_iteration": 3.3122286796569824 - }, - { - "auxiliary_loss_clip": 0.01151556, - "auxiliary_loss_mlp": 0.01050336, - "balance_loss_clip": 1.0550462, - "balance_loss_mlp": 1.03201365, - "epoch": 0.26243799789568617, - "flos": 12604215254400.0, - "grad_norm": 2.6922753747731387, - "language_loss": 0.7223357, - "learning_rate": 3.459514586533184e-06, - "loss": 0.74435461, - "num_input_tokens_seen": 94335420, - "step": 4365, - "time_per_iteration": 2.588611364364624 - }, - { - "auxiliary_loss_clip": 0.01123747, - "auxiliary_loss_mlp": 0.00776591, - "balance_loss_clip": 1.05296087, - "balance_loss_mlp": 1.00093484, - "epoch": 0.26249812114835414, - "flos": 28623821253120.0, - "grad_norm": 1.9684942716361389, - "language_loss": 0.77178609, - "learning_rate": 3.459248281460509e-06, - "loss": 0.79078948, - "num_input_tokens_seen": 94357440, - "step": 4366, - "time_per_iteration": 2.7489407062530518 - }, - { - "auxiliary_loss_clip": 0.01149499, - "auxiliary_loss_mlp": 0.0104305, - "balance_loss_clip": 1.05433846, - "balance_loss_mlp": 1.02652764, - "epoch": 0.2625582444010221, - "flos": 14465393441280.0, - "grad_norm": 1.9587652436204308, - "language_loss": 0.76205176, - "learning_rate": 3.4589819210529927e-06, - "loss": 0.78397727, - "num_input_tokens_seen": 94375690, - "step": 4367, - "time_per_iteration": 2.63778018951416 - }, - { - "auxiliary_loss_clip": 0.01136158, - "auxiliary_loss_mlp": 0.01045138, - "balance_loss_clip": 1.0523572, - "balance_loss_mlp": 1.02903318, - "epoch": 0.26261836765369007, - "flos": 16613174246400.0, - "grad_norm": 2.055472748506688, - "language_loss": 0.69400585, - "learning_rate": 3.458715505320736e-06, - "loss": 0.71581888, - "num_input_tokens_seen": 94393190, - "step": 4368, - "time_per_iteration": 2.6515018939971924 - }, - { - "auxiliary_loss_clip": 0.01123905, - "auxiliary_loss_mlp": 0.01045619, - "balance_loss_clip": 1.05272579, - "balance_loss_mlp": 1.02791643, - "epoch": 0.26267849090635803, - "flos": 20519932066560.0, - "grad_norm": 1.8794244148025279, - "language_loss": 0.79255176, - "learning_rate": 3.458449034273841e-06, - "loss": 0.81424701, - "num_input_tokens_seen": 94410975, - "step": 4369, - "time_per_iteration": 2.717142343521118 - }, - { - "auxiliary_loss_clip": 0.01119662, - "auxiliary_loss_mlp": 0.01040752, - "balance_loss_clip": 1.05190969, - "balance_loss_mlp": 1.02344334, - "epoch": 0.262738614159026, - "flos": 21323936142720.0, - "grad_norm": 4.796099217910503, - "language_loss": 0.83591807, - "learning_rate": 3.4581825079224133e-06, - "loss": 0.85752219, - "num_input_tokens_seen": 94429985, - "step": 4370, - "time_per_iteration": 2.742966890335083 - }, - { - "auxiliary_loss_clip": 0.01137822, - "auxiliary_loss_mlp": 0.01053822, - "balance_loss_clip": 1.05178714, - "balance_loss_mlp": 1.0345341, - "epoch": 0.26279873741169396, - "flos": 17603590930560.0, - "grad_norm": 1.7275848609842401, - "language_loss": 0.71854705, - "learning_rate": 3.4579159262765575e-06, - "loss": 0.7404635, - "num_input_tokens_seen": 94448660, - "step": 4371, - "time_per_iteration": 2.691899538040161 - }, - { - "auxiliary_loss_clip": 0.01062293, - "auxiliary_loss_mlp": 0.01003561, - "balance_loss_clip": 1.02797341, - "balance_loss_mlp": 1.00147498, - "epoch": 0.2628588606643619, - "flos": 60949746587520.0, - "grad_norm": 0.6802377941963699, - "language_loss": 0.56387627, - "learning_rate": 3.457649289346384e-06, - "loss": 0.58453482, - "num_input_tokens_seen": 94515630, - "step": 4372, - "time_per_iteration": 3.279158115386963 - }, - { - "auxiliary_loss_clip": 0.01124406, - "auxiliary_loss_mlp": 0.01038838, - "balance_loss_clip": 1.05295706, - "balance_loss_mlp": 1.02169585, - "epoch": 0.2629189839170299, - "flos": 27016315891200.0, - "grad_norm": 1.9842369613103452, - "language_loss": 0.77777553, - "learning_rate": 3.4573825971420042e-06, - "loss": 0.79940796, - "num_input_tokens_seen": 94535385, - "step": 4373, - "time_per_iteration": 2.8367159366607666 - }, - { - "auxiliary_loss_clip": 0.01104424, - "auxiliary_loss_mlp": 0.01039426, - "balance_loss_clip": 1.05070519, - "balance_loss_mlp": 1.02314186, - "epoch": 0.26297910716969786, - "flos": 17019863009280.0, - "grad_norm": 7.588420148526772, - "language_loss": 0.71397603, - "learning_rate": 3.4571158496735294e-06, - "loss": 0.73541456, - "num_input_tokens_seen": 94552650, - "step": 4374, - "time_per_iteration": 2.722332239151001 - }, - { - "auxiliary_loss_clip": 0.0112606, - "auxiliary_loss_mlp": 0.01045748, - "balance_loss_clip": 1.05836225, - "balance_loss_mlp": 1.02748489, - "epoch": 0.2630392304223659, - "flos": 24897370728960.0, - "grad_norm": 1.8414201938467747, - "language_loss": 0.81212163, - "learning_rate": 3.4568490469510756e-06, - "loss": 0.83383965, - "num_input_tokens_seen": 94574075, - "step": 4375, - "time_per_iteration": 2.7654781341552734 - }, - { - "auxiliary_loss_clip": 0.01118996, - "auxiliary_loss_mlp": 0.01045139, - "balance_loss_clip": 1.04959798, - "balance_loss_mlp": 1.02901626, - "epoch": 0.26309935367503384, - "flos": 32854026067200.0, - "grad_norm": 1.6461571134793078, - "language_loss": 0.6613251, - "learning_rate": 3.4565821889847603e-06, - "loss": 0.68296647, - "num_input_tokens_seen": 94594255, - "step": 4376, - "time_per_iteration": 2.778731107711792 - }, - { - "auxiliary_loss_clip": 0.01096695, - "auxiliary_loss_mlp": 0.0106417, - "balance_loss_clip": 1.04752398, - "balance_loss_mlp": 1.04587138, - "epoch": 0.2631594769277018, - "flos": 15887958652800.0, - "grad_norm": 1.7628322447974545, - "language_loss": 0.69351411, - "learning_rate": 3.4563152757847026e-06, - "loss": 0.71512282, - "num_input_tokens_seen": 94611410, - "step": 4377, - "time_per_iteration": 2.7606706619262695 - }, - { - "auxiliary_loss_clip": 0.01141095, - "auxiliary_loss_mlp": 0.01043033, - "balance_loss_clip": 1.0561285, - "balance_loss_mlp": 1.02606952, - "epoch": 0.2632196001803698, - "flos": 50804943557760.0, - "grad_norm": 2.1982489321824352, - "language_loss": 0.79961169, - "learning_rate": 3.4560483073610233e-06, - "loss": 0.82145292, - "num_input_tokens_seen": 94636575, - "step": 4378, - "time_per_iteration": 2.9000468254089355 - }, - { - "auxiliary_loss_clip": 0.01127331, - "auxiliary_loss_mlp": 0.01045659, - "balance_loss_clip": 1.05713558, - "balance_loss_mlp": 1.03063893, - "epoch": 0.26327972343303774, - "flos": 13733031041280.0, - "grad_norm": 1.912468890890116, - "language_loss": 0.76285684, - "learning_rate": 3.455781283723846e-06, - "loss": 0.78458679, - "num_input_tokens_seen": 94654345, - "step": 4379, - "time_per_iteration": 2.6757192611694336 - }, - { - "auxiliary_loss_clip": 0.01114814, - "auxiliary_loss_mlp": 0.01043, - "balance_loss_clip": 1.05360019, - "balance_loss_mlp": 1.02465415, - "epoch": 0.2633398466857057, - "flos": 23769057732480.0, - "grad_norm": 1.982346793660648, - "language_loss": 0.77895945, - "learning_rate": 3.4555142048832975e-06, - "loss": 0.80053759, - "num_input_tokens_seen": 94673985, - "step": 4380, - "time_per_iteration": 2.745392084121704 - }, - { - "auxiliary_loss_clip": 0.01125918, - "auxiliary_loss_mlp": 0.01040915, - "balance_loss_clip": 1.04945278, - "balance_loss_mlp": 1.02351093, - "epoch": 0.26339996993837367, - "flos": 27600223380480.0, - "grad_norm": 2.2040025999375215, - "language_loss": 0.64148676, - "learning_rate": 3.4552470708495036e-06, - "loss": 0.66315508, - "num_input_tokens_seen": 94693145, - "step": 4381, - "time_per_iteration": 2.8020689487457275 - }, - { - "auxiliary_loss_clip": 0.01136752, - "auxiliary_loss_mlp": 0.01038794, - "balance_loss_clip": 1.05113709, - "balance_loss_mlp": 1.02225995, - "epoch": 0.26346009319104163, - "flos": 16946317912320.0, - "grad_norm": 1.9675616702193486, - "language_loss": 0.82470775, - "learning_rate": 3.454979881632595e-06, - "loss": 0.8464632, - "num_input_tokens_seen": 94710185, - "step": 4382, - "time_per_iteration": 2.66001558303833 - }, - { - "auxiliary_loss_clip": 0.01106019, - "auxiliary_loss_mlp": 0.01045742, - "balance_loss_clip": 1.04899645, - "balance_loss_mlp": 1.02726483, - "epoch": 0.2635202164437096, - "flos": 37232218915200.0, - "grad_norm": 4.511875880791621, - "language_loss": 0.70333207, - "learning_rate": 3.4547126372427035e-06, - "loss": 0.7248497, - "num_input_tokens_seen": 94730280, - "step": 4383, - "time_per_iteration": 2.851227045059204 - }, - { - "auxiliary_loss_clip": 0.01136676, - "auxiliary_loss_mlp": 0.01039697, - "balance_loss_clip": 1.05237031, - "balance_loss_mlp": 1.0239253, - "epoch": 0.26358033969637756, - "flos": 20996359084800.0, - "grad_norm": 3.019496854013466, - "language_loss": 0.69455528, - "learning_rate": 3.4544453376899638e-06, - "loss": 0.71631902, - "num_input_tokens_seen": 94748560, - "step": 4384, - "time_per_iteration": 2.670023202896118 - }, - { - "auxiliary_loss_clip": 0.01135763, - "auxiliary_loss_mlp": 0.01039573, - "balance_loss_clip": 1.05114567, - "balance_loss_mlp": 1.02275276, - "epoch": 0.26364046294904553, - "flos": 27746092512000.0, - "grad_norm": 2.2712502599605036, - "language_loss": 0.70067525, - "learning_rate": 3.45417798298451e-06, - "loss": 0.72242868, - "num_input_tokens_seen": 94767570, - "step": 4385, - "time_per_iteration": 2.7232449054718018 - }, - { - "auxiliary_loss_clip": 0.01112529, - "auxiliary_loss_mlp": 0.0104946, - "balance_loss_clip": 1.04893148, - "balance_loss_mlp": 1.03190076, - "epoch": 0.2637005862017135, - "flos": 22893088757760.0, - "grad_norm": 1.8128608655109948, - "language_loss": 0.85684925, - "learning_rate": 3.453910573136482e-06, - "loss": 0.87846911, - "num_input_tokens_seen": 94784985, - "step": 4386, - "time_per_iteration": 2.727924108505249 - }, - { - "auxiliary_loss_clip": 0.01126521, - "auxiliary_loss_mlp": 0.01046433, - "balance_loss_clip": 1.0510478, - "balance_loss_mlp": 1.02955282, - "epoch": 0.26376070945438146, - "flos": 15048834053760.0, - "grad_norm": 2.174412940978395, - "language_loss": 0.7796396, - "learning_rate": 3.4536431081560196e-06, - "loss": 0.80136907, - "num_input_tokens_seen": 94802545, - "step": 4387, - "time_per_iteration": 2.666287660598755 - }, - { - "auxiliary_loss_clip": 0.01134058, - "auxiliary_loss_mlp": 0.01041407, - "balance_loss_clip": 1.05609179, - "balance_loss_mlp": 1.02537298, - "epoch": 0.2638208327070494, - "flos": 21141833166720.0, - "grad_norm": 2.003302761742054, - "language_loss": 0.76126039, - "learning_rate": 3.453375588053264e-06, - "loss": 0.78301507, - "num_input_tokens_seen": 94820730, - "step": 4388, - "time_per_iteration": 2.6321358680725098 - }, - { - "auxiliary_loss_clip": 0.01148944, - "auxiliary_loss_mlp": 0.01036978, - "balance_loss_clip": 1.05455542, - "balance_loss_mlp": 1.02002645, - "epoch": 0.26388095595971744, - "flos": 21725597001600.0, - "grad_norm": 2.534815675842734, - "language_loss": 0.86675179, - "learning_rate": 3.4531080128383617e-06, - "loss": 0.88861108, - "num_input_tokens_seen": 94839175, - "step": 4389, - "time_per_iteration": 2.6122422218322754 - }, - { - "auxiliary_loss_clip": 0.01048602, - "auxiliary_loss_mlp": 0.01002085, - "balance_loss_clip": 1.03000987, - "balance_loss_mlp": 0.99961758, - "epoch": 0.2639410792123854, - "flos": 65515537192320.0, - "grad_norm": 0.8388510572165676, - "language_loss": 0.60285747, - "learning_rate": 3.452840382521457e-06, - "loss": 0.62336433, - "num_input_tokens_seen": 94898865, - "step": 4390, - "time_per_iteration": 3.1867401599884033 - }, - { - "auxiliary_loss_clip": 0.01128567, - "auxiliary_loss_mlp": 0.01040305, - "balance_loss_clip": 1.05022383, - "balance_loss_mlp": 1.02319825, - "epoch": 0.2640012024650534, - "flos": 23948574929280.0, - "grad_norm": 1.6144448841655068, - "language_loss": 0.77730125, - "learning_rate": 3.4525726971127e-06, - "loss": 0.79899001, - "num_input_tokens_seen": 94917490, - "step": 4391, - "time_per_iteration": 2.707310676574707 - }, - { - "auxiliary_loss_clip": 0.01031384, - "auxiliary_loss_mlp": 0.00755302, - "balance_loss_clip": 1.02553821, - "balance_loss_mlp": 1.00244236, - "epoch": 0.26406132571772134, - "flos": 56441163369600.0, - "grad_norm": 0.8840896383522404, - "language_loss": 0.58758044, - "learning_rate": 3.45230495662224e-06, - "loss": 0.60544735, - "num_input_tokens_seen": 94969065, - "step": 4392, - "time_per_iteration": 3.211859941482544 - }, - { - "auxiliary_loss_clip": 0.01136937, - "auxiliary_loss_mlp": 0.0105019, - "balance_loss_clip": 1.05295539, - "balance_loss_mlp": 1.03322649, - "epoch": 0.2641214489703893, - "flos": 22090557139200.0, - "grad_norm": 1.9286153229889427, - "language_loss": 0.68954027, - "learning_rate": 3.4520371610602306e-06, - "loss": 0.71141154, - "num_input_tokens_seen": 94988540, - "step": 4393, - "time_per_iteration": 2.6483278274536133 - }, - { - "auxiliary_loss_clip": 0.01140079, - "auxiliary_loss_mlp": 0.01041521, - "balance_loss_clip": 1.05395103, - "balance_loss_mlp": 1.02398562, - "epoch": 0.26418157222305727, - "flos": 16544764794240.0, - "grad_norm": 2.0454829511435193, - "language_loss": 0.84071863, - "learning_rate": 3.4517693104368267e-06, - "loss": 0.86253464, - "num_input_tokens_seen": 95004810, - "step": 4394, - "time_per_iteration": 4.3396079540252686 - }, - { - "auxiliary_loss_clip": 0.01124083, - "auxiliary_loss_mlp": 0.01045374, - "balance_loss_clip": 1.04999089, - "balance_loss_mlp": 1.02661061, - "epoch": 0.26424169547572524, - "flos": 18002486442240.0, - "grad_norm": 2.096391063208514, - "language_loss": 0.70044839, - "learning_rate": 3.4515014047621856e-06, - "loss": 0.72214299, - "num_input_tokens_seen": 95024085, - "step": 4395, - "time_per_iteration": 2.8730056285858154 - }, - { - "auxiliary_loss_clip": 0.01110387, - "auxiliary_loss_mlp": 0.01037389, - "balance_loss_clip": 1.04736662, - "balance_loss_mlp": 1.02071214, - "epoch": 0.2643018187283932, - "flos": 16983162288000.0, - "grad_norm": 2.1761517020490606, - "language_loss": 0.86876452, - "learning_rate": 3.4512334440464655e-06, - "loss": 0.89024228, - "num_input_tokens_seen": 95042515, - "step": 4396, - "time_per_iteration": 4.384250640869141 - }, - { - "auxiliary_loss_clip": 0.01010716, - "auxiliary_loss_mlp": 0.01021406, - "balance_loss_clip": 1.02197146, - "balance_loss_mlp": 1.01856887, - "epoch": 0.26436194198106117, - "flos": 59664359416320.0, - "grad_norm": 0.7957760850485174, - "language_loss": 0.55022657, - "learning_rate": 3.4509654282998277e-06, - "loss": 0.57054776, - "num_input_tokens_seen": 95094835, - "step": 4397, - "time_per_iteration": 3.0656893253326416 - }, - { - "auxiliary_loss_clip": 0.01132938, - "auxiliary_loss_mlp": 0.01050463, - "balance_loss_clip": 1.0485754, - "balance_loss_mlp": 1.03357744, - "epoch": 0.26442206523372913, - "flos": 32921322197760.0, - "grad_norm": 1.9110208887501443, - "language_loss": 0.77881467, - "learning_rate": 3.450697357532435e-06, - "loss": 0.80064869, - "num_input_tokens_seen": 95113480, - "step": 4398, - "time_per_iteration": 2.740917444229126 - }, - { - "auxiliary_loss_clip": 0.01139914, - "auxiliary_loss_mlp": 0.01040709, - "balance_loss_clip": 1.05469537, - "balance_loss_mlp": 1.02347112, - "epoch": 0.2644821884863971, - "flos": 21031300039680.0, - "grad_norm": 1.7657486248278176, - "language_loss": 0.67534482, - "learning_rate": 3.4504292317544534e-06, - "loss": 0.69715106, - "num_input_tokens_seen": 95132580, - "step": 4399, - "time_per_iteration": 4.305487871170044 - }, - { - "auxiliary_loss_clip": 0.01097219, - "auxiliary_loss_mlp": 0.01042048, - "balance_loss_clip": 1.04840231, - "balance_loss_mlp": 1.02503681, - "epoch": 0.26454231173906506, - "flos": 20776801201920.0, - "grad_norm": 1.6309197312133479, - "language_loss": 0.86614597, - "learning_rate": 3.4501610509760504e-06, - "loss": 0.88753855, - "num_input_tokens_seen": 95152375, - "step": 4400, - "time_per_iteration": 2.695883274078369 - }, - { - "auxiliary_loss_clip": 0.01119339, - "auxiliary_loss_mlp": 0.01039987, - "balance_loss_clip": 1.0483284, - "balance_loss_mlp": 1.0226419, - "epoch": 0.264602434991733, - "flos": 16618669027200.0, - "grad_norm": 3.1942141071602546, - "language_loss": 0.76518428, - "learning_rate": 3.4498928152073944e-06, - "loss": 0.78677756, - "num_input_tokens_seen": 95170265, - "step": 4401, - "time_per_iteration": 2.69415545463562 - }, - { - "auxiliary_loss_clip": 0.01100665, - "auxiliary_loss_mlp": 0.01046326, - "balance_loss_clip": 1.04473615, - "balance_loss_mlp": 1.02758598, - "epoch": 0.26466255824440105, - "flos": 19062677295360.0, - "grad_norm": 2.336049134907364, - "language_loss": 0.88363832, - "learning_rate": 3.4496245244586577e-06, - "loss": 0.90510821, - "num_input_tokens_seen": 95188655, - "step": 4402, - "time_per_iteration": 2.7073450088500977 - }, - { - "auxiliary_loss_clip": 0.01105803, - "auxiliary_loss_mlp": 0.01040704, - "balance_loss_clip": 1.04894042, - "balance_loss_mlp": 1.02327585, - "epoch": 0.264722681497069, - "flos": 22638554006400.0, - "grad_norm": 1.7301089969072252, - "language_loss": 0.7811445, - "learning_rate": 3.4493561787400137e-06, - "loss": 0.80260956, - "num_input_tokens_seen": 95209615, - "step": 4403, - "time_per_iteration": 2.7213027477264404 - }, - { - "auxiliary_loss_clip": 0.01128649, - "auxiliary_loss_mlp": 0.01038032, - "balance_loss_clip": 1.04674816, - "balance_loss_mlp": 1.02050877, - "epoch": 0.264782804749737, - "flos": 22492253911680.0, - "grad_norm": 2.1369132533571604, - "language_loss": 0.88594282, - "learning_rate": 3.4490877780616387e-06, - "loss": 0.90760964, - "num_input_tokens_seen": 95227810, - "step": 4404, - "time_per_iteration": 2.6888909339904785 - }, - { - "auxiliary_loss_clip": 0.01123789, - "auxiliary_loss_mlp": 0.01040593, - "balance_loss_clip": 1.04607344, - "balance_loss_mlp": 1.02416539, - "epoch": 0.26484292800240494, - "flos": 16800269212800.0, - "grad_norm": 1.7519644069859235, - "language_loss": 0.76134694, - "learning_rate": 3.448819322433709e-06, - "loss": 0.78299075, - "num_input_tokens_seen": 95245890, - "step": 4405, - "time_per_iteration": 2.7172482013702393 - }, - { - "auxiliary_loss_clip": 0.01148976, - "auxiliary_loss_mlp": 0.01040198, - "balance_loss_clip": 1.05348206, - "balance_loss_mlp": 1.02266204, - "epoch": 0.2649030512550729, - "flos": 20449583280000.0, - "grad_norm": 1.711457274305917, - "language_loss": 0.69873697, - "learning_rate": 3.4485508118664066e-06, - "loss": 0.72062874, - "num_input_tokens_seen": 95264955, - "step": 4406, - "time_per_iteration": 2.584300994873047 - }, - { - "auxiliary_loss_clip": 0.01121151, - "auxiliary_loss_mlp": 0.01050453, - "balance_loss_clip": 1.05182838, - "balance_loss_mlp": 1.03432453, - "epoch": 0.2649631745077409, - "flos": 22416123035520.0, - "grad_norm": 1.7200250795424956, - "language_loss": 0.83956587, - "learning_rate": 3.448282246369912e-06, - "loss": 0.86128193, - "num_input_tokens_seen": 95284245, - "step": 4407, - "time_per_iteration": 2.731316328048706 - }, - { - "auxiliary_loss_clip": 0.01108599, - "auxiliary_loss_mlp": 0.01031757, - "balance_loss_clip": 1.04695201, - "balance_loss_mlp": 1.01501989, - "epoch": 0.26502329776040884, - "flos": 35116110927360.0, - "grad_norm": 1.8896460113896294, - "language_loss": 0.7597363, - "learning_rate": 3.4480136259544084e-06, - "loss": 0.78113985, - "num_input_tokens_seen": 95307125, - "step": 4408, - "time_per_iteration": 2.8600730895996094 - }, - { - "auxiliary_loss_clip": 0.01091919, - "auxiliary_loss_mlp": 0.01044721, - "balance_loss_clip": 1.04267502, - "balance_loss_mlp": 1.02679181, - "epoch": 0.2650834210130768, - "flos": 38687498438400.0, - "grad_norm": 1.7769050714437231, - "language_loss": 0.70612216, - "learning_rate": 3.447744950630084e-06, - "loss": 0.72748852, - "num_input_tokens_seen": 95329150, - "step": 4409, - "time_per_iteration": 2.936380386352539 - }, - { - "auxiliary_loss_clip": 0.01131548, - "auxiliary_loss_mlp": 0.01040186, - "balance_loss_clip": 1.04774857, - "balance_loss_mlp": 1.02218497, - "epoch": 0.26514354426574477, - "flos": 24716847951360.0, - "grad_norm": 1.7357795205395667, - "language_loss": 0.7337513, - "learning_rate": 3.4474762204071253e-06, - "loss": 0.75546867, - "num_input_tokens_seen": 95349880, - "step": 4410, - "time_per_iteration": 2.7315077781677246 - }, - { - "auxiliary_loss_clip": 0.01141374, - "auxiliary_loss_mlp": 0.0104966, - "balance_loss_clip": 1.05183268, - "balance_loss_mlp": 1.03216028, - "epoch": 0.26520366751841273, - "flos": 20340055733760.0, - "grad_norm": 1.8886288474708937, - "language_loss": 0.73828322, - "learning_rate": 3.4472074352957244e-06, - "loss": 0.76019359, - "num_input_tokens_seen": 95368570, - "step": 4411, - "time_per_iteration": 2.641920566558838 - }, - { - "auxiliary_loss_clip": 0.01099594, - "auxiliary_loss_mlp": 0.01041576, - "balance_loss_clip": 1.04986739, - "balance_loss_mlp": 1.02431464, - "epoch": 0.2652637907710807, - "flos": 22343870828160.0, - "grad_norm": 1.9943391034693418, - "language_loss": 0.82447588, - "learning_rate": 3.446938595306071e-06, - "loss": 0.84588754, - "num_input_tokens_seen": 95387065, - "step": 4412, - "time_per_iteration": 2.8344247341156006 - }, - { - "auxiliary_loss_clip": 0.01135402, - "auxiliary_loss_mlp": 0.01052016, - "balance_loss_clip": 1.05143464, - "balance_loss_mlp": 1.03544593, - "epoch": 0.26532391402374866, - "flos": 19354235990400.0, - "grad_norm": 1.775443234311944, - "language_loss": 0.7446382, - "learning_rate": 3.4466697004483622e-06, - "loss": 0.76651239, - "num_input_tokens_seen": 95406345, - "step": 4413, - "time_per_iteration": 2.657975196838379 - }, - { - "auxiliary_loss_clip": 0.01056582, - "auxiliary_loss_mlp": 0.01008584, - "balance_loss_clip": 1.03258443, - "balance_loss_mlp": 1.00659275, - "epoch": 0.26538403727641663, - "flos": 44787611422080.0, - "grad_norm": 0.873557285042922, - "language_loss": 0.56965125, - "learning_rate": 3.446400750732793e-06, - "loss": 0.59030288, - "num_input_tokens_seen": 95463595, - "step": 4414, - "time_per_iteration": 3.1158244609832764 - }, - { - "auxiliary_loss_clip": 0.01107803, - "auxiliary_loss_mlp": 0.01046612, - "balance_loss_clip": 1.04481411, - "balance_loss_mlp": 1.03048313, - "epoch": 0.26544416052908465, - "flos": 28182119708160.0, - "grad_norm": 1.5786807831647507, - "language_loss": 0.74238014, - "learning_rate": 3.4461317461695625e-06, - "loss": 0.76392424, - "num_input_tokens_seen": 95484115, - "step": 4415, - "time_per_iteration": 2.7223031520843506 - }, - { - "auxiliary_loss_clip": 0.01095743, - "auxiliary_loss_mlp": 0.01044325, - "balance_loss_clip": 1.04215193, - "balance_loss_mlp": 1.02402353, - "epoch": 0.2655042837817526, - "flos": 17565274097280.0, - "grad_norm": 2.5102345694159016, - "language_loss": 0.86855936, - "learning_rate": 3.4458626867688707e-06, - "loss": 0.88996005, - "num_input_tokens_seen": 95501435, - "step": 4416, - "time_per_iteration": 2.7001683712005615 - }, - { - "auxiliary_loss_clip": 0.01141467, - "auxiliary_loss_mlp": 0.01046153, - "balance_loss_clip": 1.05359149, - "balance_loss_mlp": 1.02761602, - "epoch": 0.2655644070344206, - "flos": 23404636298880.0, - "grad_norm": 1.6343137061510633, - "language_loss": 0.76870787, - "learning_rate": 3.4455935725409217e-06, - "loss": 0.79058409, - "num_input_tokens_seen": 95520135, - "step": 4417, - "time_per_iteration": 2.662196397781372 - }, - { - "auxiliary_loss_clip": 0.01119441, - "auxiliary_loss_mlp": 0.01041503, - "balance_loss_clip": 1.04989183, - "balance_loss_mlp": 1.02242982, - "epoch": 0.26562453028708854, - "flos": 26468462678400.0, - "grad_norm": 1.6334113226277946, - "language_loss": 0.80320108, - "learning_rate": 3.4453244034959196e-06, - "loss": 0.82481045, - "num_input_tokens_seen": 95541705, - "step": 4418, - "time_per_iteration": 2.7742624282836914 - }, - { - "auxiliary_loss_clip": 0.0113892, - "auxiliary_loss_mlp": 0.01045476, - "balance_loss_clip": 1.05182683, - "balance_loss_mlp": 1.02721274, - "epoch": 0.2656846535397565, - "flos": 19207576759680.0, - "grad_norm": 2.164903581235647, - "language_loss": 0.67788607, - "learning_rate": 3.445055179644071e-06, - "loss": 0.69972998, - "num_input_tokens_seen": 95560300, - "step": 4419, - "time_per_iteration": 2.6437718868255615 - }, - { - "auxiliary_loss_clip": 0.01149692, - "auxiliary_loss_mlp": 0.01046258, - "balance_loss_clip": 1.05360699, - "balance_loss_mlp": 1.02711296, - "epoch": 0.2657447767924245, - "flos": 30551325903360.0, - "grad_norm": 1.9366129468869788, - "language_loss": 0.79625547, - "learning_rate": 3.444785900995585e-06, - "loss": 0.81821501, - "num_input_tokens_seen": 95580150, - "step": 4420, - "time_per_iteration": 2.6594905853271484 - }, - { - "auxiliary_loss_clip": 0.01126984, - "auxiliary_loss_mlp": 0.01053725, - "balance_loss_clip": 1.05294895, - "balance_loss_mlp": 1.03368592, - "epoch": 0.26580490004509244, - "flos": 20922742160640.0, - "grad_norm": 1.9122536358412747, - "language_loss": 0.81690109, - "learning_rate": 3.444516567560673e-06, - "loss": 0.83870822, - "num_input_tokens_seen": 95597570, - "step": 4421, - "time_per_iteration": 2.681410551071167 - }, - { - "auxiliary_loss_clip": 0.0113176, - "auxiliary_loss_mlp": 0.01046737, - "balance_loss_clip": 1.05015123, - "balance_loss_mlp": 1.02904677, - "epoch": 0.2658650232977604, - "flos": 43945682584320.0, - "grad_norm": 1.6112293393448585, - "language_loss": 0.65704989, - "learning_rate": 3.444247179349548e-06, - "loss": 0.6788348, - "num_input_tokens_seen": 95619415, - "step": 4422, - "time_per_iteration": 2.8766117095947266 - }, - { - "auxiliary_loss_clip": 0.01130944, - "auxiliary_loss_mlp": 0.01047224, - "balance_loss_clip": 1.04903376, - "balance_loss_mlp": 1.03039181, - "epoch": 0.26592514655042837, - "flos": 29716439109120.0, - "grad_norm": 2.1017056533749896, - "language_loss": 0.74229872, - "learning_rate": 3.4439777363724252e-06, - "loss": 0.76408041, - "num_input_tokens_seen": 95639155, - "step": 4423, - "time_per_iteration": 2.6983659267425537 - }, - { - "auxiliary_loss_clip": 0.01130559, - "auxiliary_loss_mlp": 0.01057709, - "balance_loss_clip": 1.04790974, - "balance_loss_mlp": 1.03822982, - "epoch": 0.26598526980309634, - "flos": 46677730014720.0, - "grad_norm": 1.6865310965149165, - "language_loss": 0.77855694, - "learning_rate": 3.443708238639522e-06, - "loss": 0.80043966, - "num_input_tokens_seen": 95663320, - "step": 4424, - "time_per_iteration": 2.900214433670044 - }, - { - "auxiliary_loss_clip": 0.01132339, - "auxiliary_loss_mlp": 0.01049395, - "balance_loss_clip": 1.04963291, - "balance_loss_mlp": 1.03181148, - "epoch": 0.2660453930557643, - "flos": 11509442582400.0, - "grad_norm": 2.0755220631041684, - "language_loss": 0.78940654, - "learning_rate": 3.4434386861610573e-06, - "loss": 0.81122386, - "num_input_tokens_seen": 95680260, - "step": 4425, - "time_per_iteration": 2.6266820430755615 - }, - { - "auxiliary_loss_clip": 0.01123867, - "auxiliary_loss_mlp": 0.01043959, - "balance_loss_clip": 1.05143404, - "balance_loss_mlp": 1.02767467, - "epoch": 0.26610551630843227, - "flos": 24791578197120.0, - "grad_norm": 1.5673316066045293, - "language_loss": 0.80135047, - "learning_rate": 3.4431690789472532e-06, - "loss": 0.82302874, - "num_input_tokens_seen": 95701140, - "step": 4426, - "time_per_iteration": 2.7015280723571777 - }, - { - "auxiliary_loss_clip": 0.01150747, - "auxiliary_loss_mlp": 0.0104448, - "balance_loss_clip": 1.0554285, - "balance_loss_mlp": 1.02678883, - "epoch": 0.26616563956110023, - "flos": 27636385397760.0, - "grad_norm": 1.617839398314704, - "language_loss": 0.77174348, - "learning_rate": 3.442899417008333e-06, - "loss": 0.79369569, - "num_input_tokens_seen": 95722060, - "step": 4427, - "time_per_iteration": 2.6438984870910645 - }, - { - "auxiliary_loss_clip": 0.01112968, - "auxiliary_loss_mlp": 0.01037518, - "balance_loss_clip": 1.05125654, - "balance_loss_mlp": 1.02069747, - "epoch": 0.26622576281376825, - "flos": 28362893880960.0, - "grad_norm": 1.5634759975385293, - "language_loss": 0.76754683, - "learning_rate": 3.4426297003545227e-06, - "loss": 0.78905165, - "num_input_tokens_seen": 95742495, - "step": 4428, - "time_per_iteration": 2.7695741653442383 - }, - { - "auxiliary_loss_clip": 0.01114899, - "auxiliary_loss_mlp": 0.00775922, - "balance_loss_clip": 1.04922283, - "balance_loss_mlp": 1.0008111, - "epoch": 0.2662858860664362, - "flos": 18041341979520.0, - "grad_norm": 1.815928660217762, - "language_loss": 0.82900071, - "learning_rate": 3.4423599289960495e-06, - "loss": 0.84790885, - "num_input_tokens_seen": 95761510, - "step": 4429, - "time_per_iteration": 2.764183282852173 - }, - { - "auxiliary_loss_clip": 0.01106492, - "auxiliary_loss_mlp": 0.01039033, - "balance_loss_clip": 1.05041027, - "balance_loss_mlp": 1.02201009, - "epoch": 0.2663460093191042, - "flos": 22745818995840.0, - "grad_norm": 1.6463341595476202, - "language_loss": 0.71996218, - "learning_rate": 3.442090102943143e-06, - "loss": 0.74141741, - "num_input_tokens_seen": 95782385, - "step": 4430, - "time_per_iteration": 2.7244491577148438 - }, - { - "auxiliary_loss_clip": 0.01148257, - "auxiliary_loss_mlp": 0.01049268, - "balance_loss_clip": 1.05231071, - "balance_loss_mlp": 1.03068352, - "epoch": 0.26640613257177215, - "flos": 16508782344960.0, - "grad_norm": 1.9574919733512919, - "language_loss": 0.82021642, - "learning_rate": 3.441820222206035e-06, - "loss": 0.84219164, - "num_input_tokens_seen": 95800595, - "step": 4431, - "time_per_iteration": 2.5910067558288574 - }, - { - "auxiliary_loss_clip": 0.01143334, - "auxiliary_loss_mlp": 0.01050031, - "balance_loss_clip": 1.0540812, - "balance_loss_mlp": 1.03141046, - "epoch": 0.2664662558244401, - "flos": 23075945919360.0, - "grad_norm": 2.074794485495937, - "language_loss": 0.76745522, - "learning_rate": 3.44155028679496e-06, - "loss": 0.7893889, - "num_input_tokens_seen": 95818480, - "step": 4432, - "time_per_iteration": 2.6548166275024414 - }, - { - "auxiliary_loss_clip": 0.01089372, - "auxiliary_loss_mlp": 0.01052807, - "balance_loss_clip": 1.04526138, - "balance_loss_mlp": 1.03232694, - "epoch": 0.2665263790771081, - "flos": 23769273214080.0, - "grad_norm": 1.872584196626497, - "language_loss": 0.82903433, - "learning_rate": 3.441280296720154e-06, - "loss": 0.85045612, - "num_input_tokens_seen": 95837205, - "step": 4433, - "time_per_iteration": 4.2740867137908936 - }, - { - "auxiliary_loss_clip": 0.01142798, - "auxiliary_loss_mlp": 0.01045231, - "balance_loss_clip": 1.05565643, - "balance_loss_mlp": 1.02671802, - "epoch": 0.26658650232977604, - "flos": 28001273708160.0, - "grad_norm": 2.548777168378285, - "language_loss": 0.76308644, - "learning_rate": 3.441010251991854e-06, - "loss": 0.78496677, - "num_input_tokens_seen": 95858395, - "step": 4434, - "time_per_iteration": 4.203384160995483 - }, - { - "auxiliary_loss_clip": 0.0114611, - "auxiliary_loss_mlp": 0.01044925, - "balance_loss_clip": 1.05197668, - "balance_loss_mlp": 1.02772319, - "epoch": 0.266646625582444, - "flos": 22163635359360.0, - "grad_norm": 2.3452347637055393, - "language_loss": 0.82496321, - "learning_rate": 3.440740152620301e-06, - "loss": 0.84687358, - "num_input_tokens_seen": 95877875, - "step": 4435, - "time_per_iteration": 4.102782964706421 - }, - { - "auxiliary_loss_clip": 0.01104916, - "auxiliary_loss_mlp": 0.01062101, - "balance_loss_clip": 1.04567468, - "balance_loss_mlp": 1.04245555, - "epoch": 0.266706748835112, - "flos": 27853537069440.0, - "grad_norm": 1.994258420562806, - "language_loss": 0.87634504, - "learning_rate": 3.4404699986157376e-06, - "loss": 0.89801526, - "num_input_tokens_seen": 95895820, - "step": 4436, - "time_per_iteration": 2.8048155307769775 - }, - { - "auxiliary_loss_clip": 0.01121439, - "auxiliary_loss_mlp": 0.01047617, - "balance_loss_clip": 1.04637265, - "balance_loss_mlp": 1.03054643, - "epoch": 0.26676687208777994, - "flos": 25812123413760.0, - "grad_norm": 1.4763923958478316, - "language_loss": 0.787242, - "learning_rate": 3.440199789988407e-06, - "loss": 0.80893254, - "num_input_tokens_seen": 95918025, - "step": 4437, - "time_per_iteration": 2.7382607460021973 - }, - { - "auxiliary_loss_clip": 0.01093686, - "auxiliary_loss_mlp": 0.01048829, - "balance_loss_clip": 1.05000877, - "balance_loss_mlp": 1.03117394, - "epoch": 0.2668269953404479, - "flos": 36064583504640.0, - "grad_norm": 4.5178491997969115, - "language_loss": 0.63910848, - "learning_rate": 3.439929526748556e-06, - "loss": 0.66053367, - "num_input_tokens_seen": 95937725, - "step": 4438, - "time_per_iteration": 2.956014633178711 - }, - { - "auxiliary_loss_clip": 0.01080658, - "auxiliary_loss_mlp": 0.01047394, - "balance_loss_clip": 1.0432179, - "balance_loss_mlp": 1.02994168, - "epoch": 0.26688711859311587, - "flos": 26570987072640.0, - "grad_norm": 1.84569516037299, - "language_loss": 0.75897747, - "learning_rate": 3.4396592089064334e-06, - "loss": 0.78025794, - "num_input_tokens_seen": 95956335, - "step": 4439, - "time_per_iteration": 4.428173065185547 - }, - { - "auxiliary_loss_clip": 0.01089075, - "auxiliary_loss_mlp": 0.01041089, - "balance_loss_clip": 1.04845262, - "balance_loss_mlp": 1.02181315, - "epoch": 0.26694724184578383, - "flos": 26761565658240.0, - "grad_norm": 2.10654378697334, - "language_loss": 0.7172367, - "learning_rate": 3.4393888364722897e-06, - "loss": 0.73853838, - "num_input_tokens_seen": 95977135, - "step": 4440, - "time_per_iteration": 2.9196605682373047 - }, - { - "auxiliary_loss_clip": 0.01124038, - "auxiliary_loss_mlp": 0.01049644, - "balance_loss_clip": 1.04784775, - "balance_loss_mlp": 1.02931881, - "epoch": 0.2670073650984518, - "flos": 20959586536320.0, - "grad_norm": 1.869180757677473, - "language_loss": 0.66229129, - "learning_rate": 3.439118409456376e-06, - "loss": 0.68402815, - "num_input_tokens_seen": 95995435, - "step": 4441, - "time_per_iteration": 2.666428804397583 - }, - { - "auxiliary_loss_clip": 0.01137041, - "auxiliary_loss_mlp": 0.01049045, - "balance_loss_clip": 1.04973912, - "balance_loss_mlp": 1.02953053, - "epoch": 0.2670674883511198, - "flos": 28366054277760.0, - "grad_norm": 3.888081439634283, - "language_loss": 0.76102316, - "learning_rate": 3.4388479278689486e-06, - "loss": 0.78288412, - "num_input_tokens_seen": 96016340, - "step": 4442, - "time_per_iteration": 2.6413686275482178 - }, - { - "auxiliary_loss_clip": 0.0100646, - "auxiliary_loss_mlp": 0.0105848, - "balance_loss_clip": 1.02694619, - "balance_loss_mlp": 1.05538034, - "epoch": 0.2671276116037878, - "flos": 58971319430400.0, - "grad_norm": 0.9410220376713593, - "language_loss": 0.61210632, - "learning_rate": 3.4385773917202637e-06, - "loss": 0.63275576, - "num_input_tokens_seen": 96071205, - "step": 4443, - "time_per_iteration": 3.2342116832733154 - }, - { - "auxiliary_loss_clip": 0.01123665, - "auxiliary_loss_mlp": 0.01039982, - "balance_loss_clip": 1.05413401, - "balance_loss_mlp": 1.02239847, - "epoch": 0.26718773485645575, - "flos": 43945072053120.0, - "grad_norm": 1.5620381861600383, - "language_loss": 0.76195556, - "learning_rate": 3.4383068010205793e-06, - "loss": 0.78359205, - "num_input_tokens_seen": 96094240, - "step": 4444, - "time_per_iteration": 3.136178731918335 - }, - { - "auxiliary_loss_clip": 0.01142711, - "auxiliary_loss_mlp": 0.01040756, - "balance_loss_clip": 1.05331576, - "balance_loss_mlp": 1.0213964, - "epoch": 0.2672478581091237, - "flos": 25228323665280.0, - "grad_norm": 1.6750833182703528, - "language_loss": 0.80892444, - "learning_rate": 3.438036155780158e-06, - "loss": 0.83075905, - "num_input_tokens_seen": 96114105, - "step": 4445, - "time_per_iteration": 2.660952091217041 - }, - { - "auxiliary_loss_clip": 0.01124381, - "auxiliary_loss_mlp": 0.01048514, - "balance_loss_clip": 1.05190587, - "balance_loss_mlp": 1.02901077, - "epoch": 0.2673079813617917, - "flos": 15268176455040.0, - "grad_norm": 2.1125172985353533, - "language_loss": 0.89060926, - "learning_rate": 3.43776545600926e-06, - "loss": 0.9123382, - "num_input_tokens_seen": 96132140, - "step": 4446, - "time_per_iteration": 2.6609115600585938 - }, - { - "auxiliary_loss_clip": 0.011447, - "auxiliary_loss_mlp": 0.01053132, - "balance_loss_clip": 1.05528426, - "balance_loss_mlp": 1.03541803, - "epoch": 0.26736810461445965, - "flos": 25812733944960.0, - "grad_norm": 2.4310086382368783, - "language_loss": 0.67756736, - "learning_rate": 3.437494701718153e-06, - "loss": 0.69954574, - "num_input_tokens_seen": 96152090, - "step": 4447, - "time_per_iteration": 2.6696949005126953 - }, - { - "auxiliary_loss_clip": 0.01144309, - "auxiliary_loss_mlp": 0.0104489, - "balance_loss_clip": 1.05496442, - "balance_loss_mlp": 1.02572155, - "epoch": 0.2674282278671276, - "flos": 24312709054080.0, - "grad_norm": 1.9687667134305082, - "language_loss": 0.830899, - "learning_rate": 3.4372238929171026e-06, - "loss": 0.85279107, - "num_input_tokens_seen": 96170015, - "step": 4448, - "time_per_iteration": 2.639463424682617 - }, - { - "auxiliary_loss_clip": 0.0111564, - "auxiliary_loss_mlp": 0.01054364, - "balance_loss_clip": 1.05101895, - "balance_loss_mlp": 1.03557646, - "epoch": 0.2674883511197956, - "flos": 22815521337600.0, - "grad_norm": 1.479052407292424, - "language_loss": 0.84231561, - "learning_rate": 3.436953029616378e-06, - "loss": 0.8640157, - "num_input_tokens_seen": 96188065, - "step": 4449, - "time_per_iteration": 2.812290906906128 - }, - { - "auxiliary_loss_clip": 0.0113237, - "auxiliary_loss_mlp": 0.01055905, - "balance_loss_clip": 1.05103493, - "balance_loss_mlp": 1.03552055, - "epoch": 0.26754847437246354, - "flos": 25370170473600.0, - "grad_norm": 1.7379167843341312, - "language_loss": 0.84231997, - "learning_rate": 3.4366821118262506e-06, - "loss": 0.86420268, - "num_input_tokens_seen": 96205780, - "step": 4450, - "time_per_iteration": 2.7598626613616943 - }, - { - "auxiliary_loss_clip": 0.01109743, - "auxiliary_loss_mlp": 0.01057779, - "balance_loss_clip": 1.04833305, - "balance_loss_mlp": 1.04044628, - "epoch": 0.2676085976251315, - "flos": 20230420446720.0, - "grad_norm": 8.035146429526597, - "language_loss": 0.80842566, - "learning_rate": 3.4364111395569937e-06, - "loss": 0.83010095, - "num_input_tokens_seen": 96224990, - "step": 4451, - "time_per_iteration": 2.7467129230499268 - }, - { - "auxiliary_loss_clip": 0.01141732, - "auxiliary_loss_mlp": 0.01055516, - "balance_loss_clip": 1.0553689, - "balance_loss_mlp": 1.0379324, - "epoch": 0.26766872087779947, - "flos": 28038225824640.0, - "grad_norm": 1.6378235408468254, - "language_loss": 0.86285019, - "learning_rate": 3.436140112818882e-06, - "loss": 0.88482267, - "num_input_tokens_seen": 96245345, - "step": 4452, - "time_per_iteration": 2.7442660331726074 - }, - { - "auxiliary_loss_clip": 0.01134475, - "auxiliary_loss_mlp": 0.01047993, - "balance_loss_clip": 1.05496478, - "balance_loss_mlp": 1.02926481, - "epoch": 0.26772884413046744, - "flos": 18325179250560.0, - "grad_norm": 2.119384740597093, - "language_loss": 0.83521158, - "learning_rate": 3.435869031622194e-06, - "loss": 0.85703623, - "num_input_tokens_seen": 96259000, - "step": 4453, - "time_per_iteration": 2.659623146057129 - }, - { - "auxiliary_loss_clip": 0.01141347, - "auxiliary_loss_mlp": 0.01063496, - "balance_loss_clip": 1.05624223, - "balance_loss_mlp": 1.04485118, - "epoch": 0.2677889673831354, - "flos": 22127509255680.0, - "grad_norm": 1.8460317519144305, - "language_loss": 0.79565918, - "learning_rate": 3.435597895977208e-06, - "loss": 0.8177076, - "num_input_tokens_seen": 96277000, - "step": 4454, - "time_per_iteration": 2.6458942890167236 - }, - { - "auxiliary_loss_clip": 0.01130641, - "auxiliary_loss_mlp": 0.01056871, - "balance_loss_clip": 1.05338597, - "balance_loss_mlp": 1.03869116, - "epoch": 0.2678490906358034, - "flos": 23729699404800.0, - "grad_norm": 1.5255880946203295, - "language_loss": 0.7241919, - "learning_rate": 3.435326705894206e-06, - "loss": 0.74606699, - "num_input_tokens_seen": 96297010, - "step": 4455, - "time_per_iteration": 2.7328429222106934 - }, - { - "auxiliary_loss_clip": 0.01112613, - "auxiliary_loss_mlp": 0.01052208, - "balance_loss_clip": 1.04858243, - "balance_loss_mlp": 1.03508949, - "epoch": 0.2679092138884714, - "flos": 21762872340480.0, - "grad_norm": 1.5657028408886426, - "language_loss": 0.74017322, - "learning_rate": 3.435055461383471e-06, - "loss": 0.76182139, - "num_input_tokens_seen": 96315780, - "step": 4456, - "time_per_iteration": 2.700190544128418 - }, - { - "auxiliary_loss_clip": 0.0114232, - "auxiliary_loss_mlp": 0.01048809, - "balance_loss_clip": 1.05394006, - "balance_loss_mlp": 1.03033149, - "epoch": 0.26796933714113935, - "flos": 19861186590720.0, - "grad_norm": 2.4373070589767774, - "language_loss": 0.70647967, - "learning_rate": 3.4347841624552896e-06, - "loss": 0.72839093, - "num_input_tokens_seen": 96333465, - "step": 4457, - "time_per_iteration": 2.6334941387176514 - }, - { - "auxiliary_loss_clip": 0.01112923, - "auxiliary_loss_mlp": 0.01063608, - "balance_loss_clip": 1.05205595, - "balance_loss_mlp": 1.04513049, - "epoch": 0.2680294603938073, - "flos": 20047886507520.0, - "grad_norm": 1.8228045543818674, - "language_loss": 0.7903617, - "learning_rate": 3.4345128091199493e-06, - "loss": 0.81212699, - "num_input_tokens_seen": 96352005, - "step": 4458, - "time_per_iteration": 2.7377572059631348 - }, - { - "auxiliary_loss_clip": 0.01030327, - "auxiliary_loss_mlp": 0.01043883, - "balance_loss_clip": 1.0366354, - "balance_loss_mlp": 1.0414269, - "epoch": 0.2680895836464753, - "flos": 72113763052800.0, - "grad_norm": 0.9600198584891941, - "language_loss": 0.58691025, - "learning_rate": 3.434241401387739e-06, - "loss": 0.60765231, - "num_input_tokens_seen": 96406265, - "step": 4459, - "time_per_iteration": 3.2385354042053223 - }, - { - "auxiliary_loss_clip": 0.0108842, - "auxiliary_loss_mlp": 0.01056025, - "balance_loss_clip": 1.04306948, - "balance_loss_mlp": 1.0379889, - "epoch": 0.26814970689914325, - "flos": 20449044576000.0, - "grad_norm": 2.1196386888642382, - "language_loss": 0.84988648, - "learning_rate": 3.4339699392689507e-06, - "loss": 0.87133086, - "num_input_tokens_seen": 96425225, - "step": 4460, - "time_per_iteration": 2.767054319381714 - }, - { - "auxiliary_loss_clip": 0.01134128, - "auxiliary_loss_mlp": 0.01059054, - "balance_loss_clip": 1.0525527, - "balance_loss_mlp": 1.03916979, - "epoch": 0.2682098301518112, - "flos": 17566674727680.0, - "grad_norm": 1.6839260392555548, - "language_loss": 0.68334675, - "learning_rate": 3.4336984227738796e-06, - "loss": 0.70527858, - "num_input_tokens_seen": 96443780, - "step": 4461, - "time_per_iteration": 2.7217342853546143 - }, - { - "auxiliary_loss_clip": 0.0111525, - "auxiliary_loss_mlp": 0.01054739, - "balance_loss_clip": 1.05045152, - "balance_loss_mlp": 1.03649962, - "epoch": 0.2682699534044792, - "flos": 18333259810560.0, - "grad_norm": 1.7146103847032579, - "language_loss": 0.67240328, - "learning_rate": 3.43342685191282e-06, - "loss": 0.69410318, - "num_input_tokens_seen": 96464530, - "step": 4462, - "time_per_iteration": 2.730682134628296 - }, - { - "auxiliary_loss_clip": 0.01116667, - "auxiliary_loss_mlp": 0.01046675, - "balance_loss_clip": 1.05230319, - "balance_loss_mlp": 1.02710128, - "epoch": 0.26833007665714714, - "flos": 25301294144640.0, - "grad_norm": 1.7796857642272712, - "language_loss": 0.69503593, - "learning_rate": 3.4331552266960705e-06, - "loss": 0.71666932, - "num_input_tokens_seen": 96483345, - "step": 4463, - "time_per_iteration": 2.738046407699585 - }, - { - "auxiliary_loss_clip": 0.01118676, - "auxiliary_loss_mlp": 0.01049589, - "balance_loss_clip": 1.0492326, - "balance_loss_mlp": 1.02862048, - "epoch": 0.2683901999098151, - "flos": 16099759198080.0, - "grad_norm": 2.5866232358274277, - "language_loss": 0.77943784, - "learning_rate": 3.432883547133931e-06, - "loss": 0.80112046, - "num_input_tokens_seen": 96498305, - "step": 4464, - "time_per_iteration": 2.6794681549072266 - }, - { - "auxiliary_loss_clip": 0.01133564, - "auxiliary_loss_mlp": 0.01042879, - "balance_loss_clip": 1.05244994, - "balance_loss_mlp": 1.02410388, - "epoch": 0.2684503231624831, - "flos": 27308054154240.0, - "grad_norm": 2.2986867036088285, - "language_loss": 0.71375966, - "learning_rate": 3.432611813236704e-06, - "loss": 0.73552406, - "num_input_tokens_seen": 96519740, - "step": 4465, - "time_per_iteration": 2.699575662612915 - }, - { - "auxiliary_loss_clip": 0.01042347, - "auxiliary_loss_mlp": 0.01001834, - "balance_loss_clip": 1.02813911, - "balance_loss_mlp": 0.9993788, - "epoch": 0.26851044641515104, - "flos": 71858007239040.0, - "grad_norm": 0.7242654721351415, - "language_loss": 0.53150702, - "learning_rate": 3.4323400250146943e-06, - "loss": 0.5519489, - "num_input_tokens_seen": 96588870, - "step": 4466, - "time_per_iteration": 3.3984062671661377 - }, - { - "auxiliary_loss_clip": 0.01118674, - "auxiliary_loss_mlp": 0.0105552, - "balance_loss_clip": 1.04732478, - "balance_loss_mlp": 1.03381157, - "epoch": 0.268570569667819, - "flos": 18733771434240.0, - "grad_norm": 2.1738333593055796, - "language_loss": 0.74038142, - "learning_rate": 3.4320681824782057e-06, - "loss": 0.76212335, - "num_input_tokens_seen": 96605100, - "step": 4467, - "time_per_iteration": 2.6631343364715576 - }, - { - "auxiliary_loss_clip": 0.01126618, - "auxiliary_loss_mlp": 0.00777879, - "balance_loss_clip": 1.05088973, - "balance_loss_mlp": 1.00093102, - "epoch": 0.268630692920487, - "flos": 18178376365440.0, - "grad_norm": 3.586661477808892, - "language_loss": 0.80481976, - "learning_rate": 3.4317962856375493e-06, - "loss": 0.82386476, - "num_input_tokens_seen": 96621410, - "step": 4468, - "time_per_iteration": 2.64806866645813 - }, - { - "auxiliary_loss_clip": 0.01059326, - "auxiliary_loss_mlp": 0.01006331, - "balance_loss_clip": 1.02527809, - "balance_loss_mlp": 1.0036248, - "epoch": 0.268690816173155, - "flos": 68731768978560.0, - "grad_norm": 0.8399316740346766, - "language_loss": 0.59498715, - "learning_rate": 3.4315243345030334e-06, - "loss": 0.61564374, - "num_input_tokens_seen": 96684810, - "step": 4469, - "time_per_iteration": 3.1989517211914062 - }, - { - "auxiliary_loss_clip": 0.01156531, - "auxiliary_loss_mlp": 0.01048741, - "balance_loss_clip": 1.05689096, - "balance_loss_mlp": 1.02854705, - "epoch": 0.26875093942582295, - "flos": 23293636295040.0, - "grad_norm": 2.165956170420043, - "language_loss": 0.82055074, - "learning_rate": 3.431252329084972e-06, - "loss": 0.84260345, - "num_input_tokens_seen": 96701920, - "step": 4470, - "time_per_iteration": 2.6167352199554443 - }, - { - "auxiliary_loss_clip": 0.01117064, - "auxiliary_loss_mlp": 0.01054605, - "balance_loss_clip": 1.04794455, - "balance_loss_mlp": 1.03563929, - "epoch": 0.2688110626784909, - "flos": 21543458112000.0, - "grad_norm": 1.6543166375172473, - "language_loss": 0.82841349, - "learning_rate": 3.4309802693936786e-06, - "loss": 0.8501302, - "num_input_tokens_seen": 96721260, - "step": 4471, - "time_per_iteration": 4.177881956100464 - }, - { - "auxiliary_loss_clip": 0.01133274, - "auxiliary_loss_mlp": 0.01045934, - "balance_loss_clip": 1.05339766, - "balance_loss_mlp": 1.02762365, - "epoch": 0.2688711859311589, - "flos": 28400600183040.0, - "grad_norm": 2.017001756898941, - "language_loss": 0.69309431, - "learning_rate": 3.43070815543947e-06, - "loss": 0.71488637, - "num_input_tokens_seen": 96740385, - "step": 4472, - "time_per_iteration": 2.6611149311065674 - }, - { - "auxiliary_loss_clip": 0.01150636, - "auxiliary_loss_mlp": 0.01046679, - "balance_loss_clip": 1.05448234, - "balance_loss_mlp": 1.02882099, - "epoch": 0.26893130918382685, - "flos": 25994944661760.0, - "grad_norm": 1.889152474147147, - "language_loss": 0.67809618, - "learning_rate": 3.4304359872326656e-06, - "loss": 0.70006931, - "num_input_tokens_seen": 96761860, - "step": 4473, - "time_per_iteration": 2.6570448875427246 - }, - { - "auxiliary_loss_clip": 0.01123821, - "auxiliary_loss_mlp": 0.01056077, - "balance_loss_clip": 1.05778623, - "balance_loss_mlp": 1.03800452, - "epoch": 0.2689914324364948, - "flos": 20339624770560.0, - "grad_norm": 2.20378943201051, - "language_loss": 0.82835853, - "learning_rate": 3.4301637647835843e-06, - "loss": 0.8501575, - "num_input_tokens_seen": 96781890, - "step": 4474, - "time_per_iteration": 5.79376220703125 - }, - { - "auxiliary_loss_clip": 0.01138349, - "auxiliary_loss_mlp": 0.01055982, - "balance_loss_clip": 1.05353034, - "balance_loss_mlp": 1.03841054, - "epoch": 0.2690515556891628, - "flos": 19464553635840.0, - "grad_norm": 2.404484364093812, - "language_loss": 0.71004206, - "learning_rate": 3.4298914881025494e-06, - "loss": 0.73198539, - "num_input_tokens_seen": 96800390, - "step": 4475, - "time_per_iteration": 2.5969674587249756 - }, - { - "auxiliary_loss_clip": 0.01112288, - "auxiliary_loss_mlp": 0.00776382, - "balance_loss_clip": 1.05001771, - "balance_loss_mlp": 1.00081563, - "epoch": 0.26911167894183075, - "flos": 18146631720960.0, - "grad_norm": 1.8574153972172647, - "language_loss": 0.73638999, - "learning_rate": 3.4296191571998863e-06, - "loss": 0.75527668, - "num_input_tokens_seen": 96816685, - "step": 4476, - "time_per_iteration": 2.70358943939209 - }, - { - "auxiliary_loss_clip": 0.01119256, - "auxiliary_loss_mlp": 0.01043783, - "balance_loss_clip": 1.05050373, - "balance_loss_mlp": 1.02605665, - "epoch": 0.2691718021944987, - "flos": 19975131509760.0, - "grad_norm": 1.5040704863343832, - "language_loss": 0.80439913, - "learning_rate": 3.429346772085922e-06, - "loss": 0.82602954, - "num_input_tokens_seen": 96836285, - "step": 4477, - "time_per_iteration": 4.313180208206177 - }, - { - "auxiliary_loss_clip": 0.01097359, - "auxiliary_loss_mlp": 0.0104976, - "balance_loss_clip": 1.04965031, - "balance_loss_mlp": 1.0309844, - "epoch": 0.2692319254471667, - "flos": 37447215770880.0, - "grad_norm": 1.7971929656919947, - "language_loss": 0.65181434, - "learning_rate": 3.429074332770984e-06, - "loss": 0.67328548, - "num_input_tokens_seen": 96857745, - "step": 4478, - "time_per_iteration": 2.8882603645324707 - }, - { - "auxiliary_loss_clip": 0.01130488, - "auxiliary_loss_mlp": 0.01050401, - "balance_loss_clip": 1.04841042, - "balance_loss_mlp": 1.03163743, - "epoch": 0.26929204869983464, - "flos": 22127796564480.0, - "grad_norm": 1.933707281531851, - "language_loss": 0.80987537, - "learning_rate": 3.4288018392654047e-06, - "loss": 0.83168429, - "num_input_tokens_seen": 96877295, - "step": 4479, - "time_per_iteration": 2.670370578765869 - }, - { - "auxiliary_loss_clip": 0.01127626, - "auxiliary_loss_mlp": 0.00776143, - "balance_loss_clip": 1.05010593, - "balance_loss_mlp": 1.0010041, - "epoch": 0.2693521719525026, - "flos": 19792813052160.0, - "grad_norm": 16.364114673072947, - "language_loss": 0.81205857, - "learning_rate": 3.4285292915795166e-06, - "loss": 0.83109629, - "num_input_tokens_seen": 96896160, - "step": 4480, - "time_per_iteration": 2.687922954559326 - }, - { - "auxiliary_loss_clip": 0.01098242, - "auxiliary_loss_mlp": 0.01051142, - "balance_loss_clip": 1.04720628, - "balance_loss_mlp": 1.03243792, - "epoch": 0.2694122952051706, - "flos": 20994383836800.0, - "grad_norm": 1.5167677573266813, - "language_loss": 0.77982032, - "learning_rate": 3.4282566897236543e-06, - "loss": 0.80131412, - "num_input_tokens_seen": 96915410, - "step": 4481, - "time_per_iteration": 2.783400058746338 - }, - { - "auxiliary_loss_clip": 0.01138325, - "auxiliary_loss_mlp": 0.01055373, - "balance_loss_clip": 1.05098486, - "balance_loss_mlp": 1.03693104, - "epoch": 0.2694724184578386, - "flos": 25849291011840.0, - "grad_norm": 1.817845708033507, - "language_loss": 0.74072635, - "learning_rate": 3.4279840337081547e-06, - "loss": 0.76266336, - "num_input_tokens_seen": 96937865, - "step": 4482, - "time_per_iteration": 2.704923629760742 - }, - { - "auxiliary_loss_clip": 0.01124372, - "auxiliary_loss_mlp": 0.01046467, - "balance_loss_clip": 1.05258846, - "balance_loss_mlp": 1.02826333, - "epoch": 0.26953254171050656, - "flos": 21726961718400.0, - "grad_norm": 2.016330221700464, - "language_loss": 0.72562164, - "learning_rate": 3.4277113235433584e-06, - "loss": 0.74733007, - "num_input_tokens_seen": 96957710, - "step": 4483, - "time_per_iteration": 2.697889804840088 - }, - { - "auxiliary_loss_clip": 0.0113896, - "auxiliary_loss_mlp": 0.01056121, - "balance_loss_clip": 1.04867983, - "balance_loss_mlp": 1.03658295, - "epoch": 0.2695926649631745, - "flos": 19682926369920.0, - "grad_norm": 2.3663265895203356, - "language_loss": 0.86904967, - "learning_rate": 3.427438559239605e-06, - "loss": 0.89100051, - "num_input_tokens_seen": 96975890, - "step": 4484, - "time_per_iteration": 2.6893441677093506 - }, - { - "auxiliary_loss_clip": 0.01139698, - "auxiliary_loss_mlp": 0.01049025, - "balance_loss_clip": 1.05224931, - "balance_loss_mlp": 1.03148949, - "epoch": 0.2696527882158425, - "flos": 32886596724480.0, - "grad_norm": 1.783447205979712, - "language_loss": 0.6663093, - "learning_rate": 3.427165740807239e-06, - "loss": 0.68819648, - "num_input_tokens_seen": 96998595, - "step": 4485, - "time_per_iteration": 2.795172929763794 - }, - { - "auxiliary_loss_clip": 0.01112833, - "auxiliary_loss_mlp": 0.01053324, - "balance_loss_clip": 1.04507363, - "balance_loss_mlp": 1.03475094, - "epoch": 0.26971291146851045, - "flos": 12124843320960.0, - "grad_norm": 2.5437851063433743, - "language_loss": 0.73155308, - "learning_rate": 3.426892868256604e-06, - "loss": 0.75321472, - "num_input_tokens_seen": 97013715, - "step": 4486, - "time_per_iteration": 2.6854116916656494 - }, - { - "auxiliary_loss_clip": 0.01156209, - "auxiliary_loss_mlp": 0.01047906, - "balance_loss_clip": 1.05688012, - "balance_loss_mlp": 1.03062034, - "epoch": 0.2697730347211784, - "flos": 22634459856000.0, - "grad_norm": 2.2389379935408456, - "language_loss": 0.84326887, - "learning_rate": 3.4266199415980495e-06, - "loss": 0.86531007, - "num_input_tokens_seen": 97031570, - "step": 4487, - "time_per_iteration": 2.6117801666259766 - }, - { - "auxiliary_loss_clip": 0.01127332, - "auxiliary_loss_mlp": 0.0105083, - "balance_loss_clip": 1.05733204, - "balance_loss_mlp": 1.03228104, - "epoch": 0.2698331579738464, - "flos": 23513050523520.0, - "grad_norm": 2.345170862120161, - "language_loss": 0.7189706, - "learning_rate": 3.4263469608419234e-06, - "loss": 0.74075222, - "num_input_tokens_seen": 97049815, - "step": 4488, - "time_per_iteration": 2.7384660243988037 - }, - { - "auxiliary_loss_clip": 0.01074601, - "auxiliary_loss_mlp": 0.01061378, - "balance_loss_clip": 1.0494225, - "balance_loss_mlp": 1.04040885, - "epoch": 0.26989328122651435, - "flos": 24641040297600.0, - "grad_norm": 1.6359957516545125, - "language_loss": 0.83725536, - "learning_rate": 3.426073925998578e-06, - "loss": 0.85861516, - "num_input_tokens_seen": 97067570, - "step": 4489, - "time_per_iteration": 2.9274613857269287 - }, - { - "auxiliary_loss_clip": 0.01129648, - "auxiliary_loss_mlp": 0.01061235, - "balance_loss_clip": 1.05630314, - "balance_loss_mlp": 1.04203057, - "epoch": 0.2699534044791823, - "flos": 10772555068800.0, - "grad_norm": 2.6678463269995785, - "language_loss": 0.90056908, - "learning_rate": 3.4258008370783656e-06, - "loss": 0.9224779, - "num_input_tokens_seen": 97082180, - "step": 4490, - "time_per_iteration": 2.9096486568450928 - }, - { - "auxiliary_loss_clip": 0.01075397, - "auxiliary_loss_mlp": 0.01052666, - "balance_loss_clip": 1.04493999, - "balance_loss_mlp": 1.03319883, - "epoch": 0.2700135277318503, - "flos": 36171597098880.0, - "grad_norm": 2.0876908666200573, - "language_loss": 0.73380542, - "learning_rate": 3.4255276940916434e-06, - "loss": 0.75508606, - "num_input_tokens_seen": 97103470, - "step": 4491, - "time_per_iteration": 2.9016802310943604 - }, - { - "auxiliary_loss_clip": 0.01156852, - "auxiliary_loss_mlp": 0.01052294, - "balance_loss_clip": 1.05944943, - "balance_loss_mlp": 1.03453195, - "epoch": 0.27007365098451824, - "flos": 17418614866560.0, - "grad_norm": 2.7575700534068783, - "language_loss": 0.74795783, - "learning_rate": 3.4252544970487676e-06, - "loss": 0.77004933, - "num_input_tokens_seen": 97118100, - "step": 4492, - "time_per_iteration": 2.6685187816619873 - }, - { - "auxiliary_loss_clip": 0.01130467, - "auxiliary_loss_mlp": 0.01050253, - "balance_loss_clip": 1.05300546, - "balance_loss_mlp": 1.03205013, - "epoch": 0.2701337742371862, - "flos": 23185688947200.0, - "grad_norm": 3.551039047250381, - "language_loss": 0.89015245, - "learning_rate": 3.4249812459600986e-06, - "loss": 0.91195965, - "num_input_tokens_seen": 97136765, - "step": 4493, - "time_per_iteration": 2.7044742107391357 - }, - { - "auxiliary_loss_clip": 0.01142037, - "auxiliary_loss_mlp": 0.0104825, - "balance_loss_clip": 1.05408192, - "balance_loss_mlp": 1.03079772, - "epoch": 0.2701938974898542, - "flos": 24389450461440.0, - "grad_norm": 1.665337194117132, - "language_loss": 0.71139705, - "learning_rate": 3.424707940835998e-06, - "loss": 0.73329991, - "num_input_tokens_seen": 97157470, - "step": 4494, - "time_per_iteration": 2.6299519538879395 - }, - { - "auxiliary_loss_clip": 0.01120214, - "auxiliary_loss_mlp": 0.01045805, - "balance_loss_clip": 1.05193532, - "balance_loss_mlp": 1.02893662, - "epoch": 0.2702540207425222, - "flos": 26214322976640.0, - "grad_norm": 2.4718809008283045, - "language_loss": 0.8642354, - "learning_rate": 3.42443458168683e-06, - "loss": 0.88589561, - "num_input_tokens_seen": 97176905, - "step": 4495, - "time_per_iteration": 2.627389907836914 - }, - { - "auxiliary_loss_clip": 0.01151814, - "auxiliary_loss_mlp": 0.0105053, - "balance_loss_clip": 1.05591631, - "balance_loss_mlp": 1.03308964, - "epoch": 0.27031414399519016, - "flos": 22926377687040.0, - "grad_norm": 2.1521214825296844, - "language_loss": 0.76781964, - "learning_rate": 3.424161168522959e-06, - "loss": 0.78984308, - "num_input_tokens_seen": 97196380, - "step": 4496, - "time_per_iteration": 2.5360703468322754 - }, - { - "auxiliary_loss_clip": 0.01064272, - "auxiliary_loss_mlp": 0.01049575, - "balance_loss_clip": 1.03151321, - "balance_loss_mlp": 1.04716671, - "epoch": 0.2703742672478581, - "flos": 63019780404480.0, - "grad_norm": 0.7153442156657138, - "language_loss": 0.50134224, - "learning_rate": 3.423887701354754e-06, - "loss": 0.52248067, - "num_input_tokens_seen": 97260100, - "step": 4497, - "time_per_iteration": 3.1133949756622314 - }, - { - "auxiliary_loss_clip": 0.01106563, - "auxiliary_loss_mlp": 0.01051954, - "balance_loss_clip": 1.05492568, - "balance_loss_mlp": 1.03482318, - "epoch": 0.2704343905005261, - "flos": 18840820942080.0, - "grad_norm": 2.421164292554959, - "language_loss": 0.72386497, - "learning_rate": 3.4236141801925847e-06, - "loss": 0.74545014, - "num_input_tokens_seen": 97277935, - "step": 4498, - "time_per_iteration": 2.7409775257110596 - }, - { - "auxiliary_loss_clip": 0.01038432, - "auxiliary_loss_mlp": 0.01028244, - "balance_loss_clip": 1.0322926, - "balance_loss_mlp": 1.02582395, - "epoch": 0.27049451375319405, - "flos": 71233412618880.0, - "grad_norm": 0.7537228186848703, - "language_loss": 0.5917033, - "learning_rate": 3.4233406050468237e-06, - "loss": 0.61237001, - "num_input_tokens_seen": 97338845, - "step": 4499, - "time_per_iteration": 3.2331602573394775 - }, - { - "auxiliary_loss_clip": 0.01124574, - "auxiliary_loss_mlp": 0.01044613, - "balance_loss_clip": 1.05154204, - "balance_loss_mlp": 1.02593243, - "epoch": 0.270554637005862, - "flos": 24278594112000.0, - "grad_norm": 2.1159538878254756, - "language_loss": 0.73629957, - "learning_rate": 3.4230669759278438e-06, - "loss": 0.75799143, - "num_input_tokens_seen": 97356640, - "step": 4500, - "time_per_iteration": 2.7513487339019775 - }, - { - "auxiliary_loss_clip": 0.01116688, - "auxiliary_loss_mlp": 0.01047016, - "balance_loss_clip": 1.04657793, - "balance_loss_mlp": 1.02878881, - "epoch": 0.27061476025853, - "flos": 17632318832640.0, - "grad_norm": 2.8997006330289925, - "language_loss": 0.81041664, - "learning_rate": 3.4227932928460215e-06, - "loss": 0.83205366, - "num_input_tokens_seen": 97372585, - "step": 4501, - "time_per_iteration": 2.703014850616455 - }, - { - "auxiliary_loss_clip": 0.01104056, - "auxiliary_loss_mlp": 0.01053779, - "balance_loss_clip": 1.04828477, - "balance_loss_mlp": 1.03331053, - "epoch": 0.27067488351119795, - "flos": 22710123855360.0, - "grad_norm": 4.2139696132912565, - "language_loss": 0.7261312, - "learning_rate": 3.422519555811735e-06, - "loss": 0.74770957, - "num_input_tokens_seen": 97393315, - "step": 4502, - "time_per_iteration": 2.732167959213257 - }, - { - "auxiliary_loss_clip": 0.01129704, - "auxiliary_loss_mlp": 0.01047167, - "balance_loss_clip": 1.04821455, - "balance_loss_mlp": 1.0268774, - "epoch": 0.2707350067638659, - "flos": 41719616087040.0, - "grad_norm": 1.748421457410976, - "language_loss": 0.67973912, - "learning_rate": 3.4222457648353642e-06, - "loss": 0.70150787, - "num_input_tokens_seen": 97417860, - "step": 4503, - "time_per_iteration": 2.7950186729431152 - }, - { - "auxiliary_loss_clip": 0.01100008, - "auxiliary_loss_mlp": 0.01051668, - "balance_loss_clip": 1.04750037, - "balance_loss_mlp": 1.03180754, - "epoch": 0.2707951300165339, - "flos": 20193037367040.0, - "grad_norm": 1.847411158173202, - "language_loss": 0.67971921, - "learning_rate": 3.4219719199272918e-06, - "loss": 0.70123595, - "num_input_tokens_seen": 97436780, - "step": 4504, - "time_per_iteration": 2.7830374240875244 - }, - { - "auxiliary_loss_clip": 0.01142201, - "auxiliary_loss_mlp": 0.01052204, - "balance_loss_clip": 1.05604792, - "balance_loss_mlp": 1.03451371, - "epoch": 0.27085525326920185, - "flos": 21433966479360.0, - "grad_norm": 1.4870002594081857, - "language_loss": 0.75395846, - "learning_rate": 3.421698021097902e-06, - "loss": 0.77590245, - "num_input_tokens_seen": 97456190, - "step": 4505, - "time_per_iteration": 2.6758666038513184 - }, - { - "auxiliary_loss_clip": 0.01155407, - "auxiliary_loss_mlp": 0.01064618, - "balance_loss_clip": 1.05439496, - "balance_loss_mlp": 1.04436409, - "epoch": 0.2709153765218698, - "flos": 17675232606720.0, - "grad_norm": 2.0635482699578254, - "language_loss": 0.73474276, - "learning_rate": 3.42142406835758e-06, - "loss": 0.75694299, - "num_input_tokens_seen": 97474545, - "step": 4506, - "time_per_iteration": 2.652395009994507 - }, - { - "auxiliary_loss_clip": 0.01130629, - "auxiliary_loss_mlp": 0.01053462, - "balance_loss_clip": 1.05147469, - "balance_loss_mlp": 1.0338285, - "epoch": 0.2709754997745378, - "flos": 24456243801600.0, - "grad_norm": 2.6352592870517144, - "language_loss": 0.80730569, - "learning_rate": 3.421150061716715e-06, - "loss": 0.82914662, - "num_input_tokens_seen": 97494520, - "step": 4507, - "time_per_iteration": 2.7858307361602783 - }, - { - "auxiliary_loss_clip": 0.01041671, - "auxiliary_loss_mlp": 0.010698, - "balance_loss_clip": 1.0261147, - "balance_loss_mlp": 1.0667243, - "epoch": 0.2710356230272058, - "flos": 65210798206080.0, - "grad_norm": 0.7655673562950965, - "language_loss": 0.5085085, - "learning_rate": 3.420876001185698e-06, - "loss": 0.52962321, - "num_input_tokens_seen": 97552455, - "step": 4508, - "time_per_iteration": 3.144418716430664 - }, - { - "auxiliary_loss_clip": 0.01072779, - "auxiliary_loss_mlp": 0.01046589, - "balance_loss_clip": 1.04359698, - "balance_loss_mlp": 1.02843356, - "epoch": 0.27109574627987376, - "flos": 25484438615040.0, - "grad_norm": 1.9710162430227722, - "language_loss": 0.74710357, - "learning_rate": 3.4206018867749197e-06, - "loss": 0.76829731, - "num_input_tokens_seen": 97572650, - "step": 4509, - "time_per_iteration": 2.8052053451538086 - }, - { - "auxiliary_loss_clip": 0.01130819, - "auxiliary_loss_mlp": 0.01042284, - "balance_loss_clip": 1.05107474, - "balance_loss_mlp": 1.0254159, - "epoch": 0.2711558695325417, - "flos": 19682782715520.0, - "grad_norm": 2.0468089657674353, - "language_loss": 0.70937192, - "learning_rate": 3.4203277184947757e-06, - "loss": 0.73110294, - "num_input_tokens_seen": 97591150, - "step": 4510, - "time_per_iteration": 2.6244139671325684 - }, - { - "auxiliary_loss_clip": 0.01135912, - "auxiliary_loss_mlp": 0.0103914, - "balance_loss_clip": 1.05330467, - "balance_loss_mlp": 1.02156901, - "epoch": 0.2712159927852097, - "flos": 18587758648320.0, - "grad_norm": 2.4701723872261256, - "language_loss": 0.70409644, - "learning_rate": 3.4200534963556627e-06, - "loss": 0.72584701, - "num_input_tokens_seen": 97607410, - "step": 4511, - "time_per_iteration": 4.112820863723755 - }, - { - "auxiliary_loss_clip": 0.0112023, - "auxiliary_loss_mlp": 0.01049105, - "balance_loss_clip": 1.048491, - "balance_loss_mlp": 1.03115225, - "epoch": 0.27127611603787766, - "flos": 25630235919360.0, - "grad_norm": 6.028868725677894, - "language_loss": 0.81324005, - "learning_rate": 3.419779220367979e-06, - "loss": 0.83493352, - "num_input_tokens_seen": 97626870, - "step": 4512, - "time_per_iteration": 4.285844087600708 - }, - { - "auxiliary_loss_clip": 0.01147816, - "auxiliary_loss_mlp": 0.01038614, - "balance_loss_clip": 1.05365086, - "balance_loss_mlp": 1.02323616, - "epoch": 0.2713362392905456, - "flos": 23148952312320.0, - "grad_norm": 2.7707983308205053, - "language_loss": 0.80467856, - "learning_rate": 3.419504890542124e-06, - "loss": 0.82654285, - "num_input_tokens_seen": 97646595, - "step": 4513, - "time_per_iteration": 4.415290117263794 - }, - { - "auxiliary_loss_clip": 0.01119685, - "auxiliary_loss_mlp": 0.01044412, - "balance_loss_clip": 1.04594898, - "balance_loss_mlp": 1.02709103, - "epoch": 0.2713963625432136, - "flos": 18366045949440.0, - "grad_norm": 1.8005970142501413, - "language_loss": 0.88150048, - "learning_rate": 3.4192305068885026e-06, - "loss": 0.90314144, - "num_input_tokens_seen": 97665485, - "step": 4514, - "time_per_iteration": 2.691697835922241 - }, - { - "auxiliary_loss_clip": 0.01129072, - "auxiliary_loss_mlp": 0.01051817, - "balance_loss_clip": 1.05358005, - "balance_loss_mlp": 1.03337574, - "epoch": 0.27145648579588155, - "flos": 22491751121280.0, - "grad_norm": 1.6419144417830658, - "language_loss": 0.91461927, - "learning_rate": 3.418956069417517e-06, - "loss": 0.93642819, - "num_input_tokens_seen": 97683800, - "step": 4515, - "time_per_iteration": 2.6709890365600586 - }, - { - "auxiliary_loss_clip": 0.01100451, - "auxiliary_loss_mlp": 0.01057835, - "balance_loss_clip": 1.04920852, - "balance_loss_mlp": 1.03761721, - "epoch": 0.2715166090485495, - "flos": 19239177749760.0, - "grad_norm": 2.0250040358395944, - "language_loss": 0.74093282, - "learning_rate": 3.4186815781395756e-06, - "loss": 0.76251566, - "num_input_tokens_seen": 97700505, - "step": 4516, - "time_per_iteration": 2.7001607418060303 - }, - { - "auxiliary_loss_clip": 0.01136738, - "auxiliary_loss_mlp": 0.01052795, - "balance_loss_clip": 1.05046439, - "balance_loss_mlp": 1.03483033, - "epoch": 0.2715767323012175, - "flos": 17709598944000.0, - "grad_norm": 2.811509606055916, - "language_loss": 0.75989574, - "learning_rate": 3.4184070330650866e-06, - "loss": 0.78179109, - "num_input_tokens_seen": 97717410, - "step": 4517, - "time_per_iteration": 4.207966089248657 - }, - { - "auxiliary_loss_clip": 0.01097642, - "auxiliary_loss_mlp": 0.01058771, - "balance_loss_clip": 1.04378986, - "balance_loss_mlp": 1.03962636, - "epoch": 0.27163685555388545, - "flos": 22382834106240.0, - "grad_norm": 2.3161178488466097, - "language_loss": 0.77046895, - "learning_rate": 3.4181324342044607e-06, - "loss": 0.79203308, - "num_input_tokens_seen": 97734545, - "step": 4518, - "time_per_iteration": 2.754009246826172 - }, - { - "auxiliary_loss_clip": 0.01118909, - "auxiliary_loss_mlp": 0.01047823, - "balance_loss_clip": 1.05136919, - "balance_loss_mlp": 1.03077579, - "epoch": 0.2716969788065534, - "flos": 22346708002560.0, - "grad_norm": 2.717268994046331, - "language_loss": 0.68388188, - "learning_rate": 3.41785778156811e-06, - "loss": 0.70554924, - "num_input_tokens_seen": 97754000, - "step": 4519, - "time_per_iteration": 2.7800872325897217 - }, - { - "auxiliary_loss_clip": 0.01134075, - "auxiliary_loss_mlp": 0.01053278, - "balance_loss_clip": 1.05009973, - "balance_loss_mlp": 1.03611171, - "epoch": 0.2717571020592214, - "flos": 25228467319680.0, - "grad_norm": 2.367483937305651, - "language_loss": 0.75572526, - "learning_rate": 3.417583075166451e-06, - "loss": 0.7775988, - "num_input_tokens_seen": 97772080, - "step": 4520, - "time_per_iteration": 2.694591760635376 - }, - { - "auxiliary_loss_clip": 0.01138275, - "auxiliary_loss_mlp": 0.0106095, - "balance_loss_clip": 1.05209494, - "balance_loss_mlp": 1.04226971, - "epoch": 0.2718172253118894, - "flos": 20189769229440.0, - "grad_norm": 3.3698654303080935, - "language_loss": 0.76434267, - "learning_rate": 3.4173083150099e-06, - "loss": 0.78633487, - "num_input_tokens_seen": 97789370, - "step": 4521, - "time_per_iteration": 2.675443649291992 - }, - { - "auxiliary_loss_clip": 0.01117262, - "auxiliary_loss_mlp": 0.0106414, - "balance_loss_clip": 1.04636955, - "balance_loss_mlp": 1.04578209, - "epoch": 0.27187734856455736, - "flos": 14319129260160.0, - "grad_norm": 2.1933848209734936, - "language_loss": 0.75041616, - "learning_rate": 3.417033501108875e-06, - "loss": 0.77223015, - "num_input_tokens_seen": 97807385, - "step": 4522, - "time_per_iteration": 2.769519329071045 - }, - { - "auxiliary_loss_clip": 0.01151707, - "auxiliary_loss_mlp": 0.01045506, - "balance_loss_clip": 1.05433989, - "balance_loss_mlp": 1.02813768, - "epoch": 0.27193747181722533, - "flos": 21107682311040.0, - "grad_norm": 1.9328965147806931, - "language_loss": 0.73074079, - "learning_rate": 3.416758633473798e-06, - "loss": 0.75271285, - "num_input_tokens_seen": 97827930, - "step": 4523, - "time_per_iteration": 2.6642134189605713 - }, - { - "auxiliary_loss_clip": 0.01120278, - "auxiliary_loss_mlp": 0.01048373, - "balance_loss_clip": 1.05034256, - "balance_loss_mlp": 1.03014588, - "epoch": 0.2719975950698933, - "flos": 19682782715520.0, - "grad_norm": 1.3899676528871532, - "language_loss": 0.74113363, - "learning_rate": 3.4164837121150915e-06, - "loss": 0.76282012, - "num_input_tokens_seen": 97847440, - "step": 4524, - "time_per_iteration": 2.6365647315979004 - }, - { - "auxiliary_loss_clip": 0.0115251, - "auxiliary_loss_mlp": 0.01059779, - "balance_loss_clip": 1.05642283, - "balance_loss_mlp": 1.04233861, - "epoch": 0.27205771832256126, - "flos": 24754482426240.0, - "grad_norm": 1.6567279945506783, - "language_loss": 0.7639389, - "learning_rate": 3.4162087370431803e-06, - "loss": 0.78606176, - "num_input_tokens_seen": 97867620, - "step": 4525, - "time_per_iteration": 2.7116904258728027 - }, - { - "auxiliary_loss_clip": 0.01133976, - "auxiliary_loss_mlp": 0.01063183, - "balance_loss_clip": 1.05110538, - "balance_loss_mlp": 1.0458858, - "epoch": 0.2721178415752292, - "flos": 21755581879680.0, - "grad_norm": 1.8049087044415455, - "language_loss": 0.81449121, - "learning_rate": 3.4159337082684926e-06, - "loss": 0.8364628, - "num_input_tokens_seen": 97884345, - "step": 4526, - "time_per_iteration": 2.583151340484619 - }, - { - "auxiliary_loss_clip": 0.01150721, - "auxiliary_loss_mlp": 0.01050593, - "balance_loss_clip": 1.05157495, - "balance_loss_mlp": 1.03235435, - "epoch": 0.2721779648278972, - "flos": 12676826597760.0, - "grad_norm": 2.689071598576449, - "language_loss": 0.77230763, - "learning_rate": 3.4156586258014566e-06, - "loss": 0.79432082, - "num_input_tokens_seen": 97901500, - "step": 4527, - "time_per_iteration": 2.6060924530029297 - }, - { - "auxiliary_loss_clip": 0.01109469, - "auxiliary_loss_mlp": 0.00777538, - "balance_loss_clip": 1.04898691, - "balance_loss_mlp": 1.00073338, - "epoch": 0.27223808808056515, - "flos": 16253206099200.0, - "grad_norm": 2.5564103940467313, - "language_loss": 0.8187297, - "learning_rate": 3.415383489652503e-06, - "loss": 0.83759975, - "num_input_tokens_seen": 97917800, - "step": 4528, - "time_per_iteration": 2.697845458984375 - }, - { - "auxiliary_loss_clip": 0.01116518, - "auxiliary_loss_mlp": 0.01058829, - "balance_loss_clip": 1.05005443, - "balance_loss_mlp": 1.04094744, - "epoch": 0.2722982113332331, - "flos": 27745805203200.0, - "grad_norm": 1.774189879269534, - "language_loss": 0.77156031, - "learning_rate": 3.4151082998320666e-06, - "loss": 0.7933138, - "num_input_tokens_seen": 97937225, - "step": 4529, - "time_per_iteration": 2.75425124168396 - }, - { - "auxiliary_loss_clip": 0.01123493, - "auxiliary_loss_mlp": 0.01053103, - "balance_loss_clip": 1.0518961, - "balance_loss_mlp": 1.03634179, - "epoch": 0.2723583345859011, - "flos": 21726243446400.0, - "grad_norm": 2.104422440945624, - "language_loss": 0.82359695, - "learning_rate": 3.4148330563505805e-06, - "loss": 0.84536296, - "num_input_tokens_seen": 97956845, - "step": 4530, - "time_per_iteration": 2.6822023391723633 - }, - { - "auxiliary_loss_clip": 0.01136812, - "auxiliary_loss_mlp": 0.01047087, - "balance_loss_clip": 1.05334496, - "balance_loss_mlp": 1.02971828, - "epoch": 0.27241845783856905, - "flos": 17347260499200.0, - "grad_norm": 2.321764638586046, - "language_loss": 0.91554427, - "learning_rate": 3.4145577592184838e-06, - "loss": 0.93738323, - "num_input_tokens_seen": 97972465, - "step": 4531, - "time_per_iteration": 2.6979331970214844 - }, - { - "auxiliary_loss_clip": 0.01138188, - "auxiliary_loss_mlp": 0.01046663, - "balance_loss_clip": 1.05187678, - "balance_loss_mlp": 1.02856672, - "epoch": 0.272478581091237, - "flos": 24754302858240.0, - "grad_norm": 1.9110068503115385, - "language_loss": 0.76398945, - "learning_rate": 3.4142824084462155e-06, - "loss": 0.78583801, - "num_input_tokens_seen": 97990770, - "step": 4532, - "time_per_iteration": 2.6663877964019775 - }, - { - "auxiliary_loss_clip": 0.01113354, - "auxiliary_loss_mlp": 0.01040904, - "balance_loss_clip": 1.05224109, - "balance_loss_mlp": 1.02386856, - "epoch": 0.272538704343905, - "flos": 17890624512000.0, - "grad_norm": 2.311201731752709, - "language_loss": 0.88514459, - "learning_rate": 3.4140070040442162e-06, - "loss": 0.90668714, - "num_input_tokens_seen": 98005775, - "step": 4533, - "time_per_iteration": 2.693161725997925 - }, - { - "auxiliary_loss_clip": 0.01122748, - "auxiliary_loss_mlp": 0.01040937, - "balance_loss_clip": 1.05127299, - "balance_loss_mlp": 1.02398562, - "epoch": 0.272598827596573, - "flos": 22932016122240.0, - "grad_norm": 2.2174577403643245, - "language_loss": 0.71288157, - "learning_rate": 3.413731546022929e-06, - "loss": 0.73451841, - "num_input_tokens_seen": 98025750, - "step": 4534, - "time_per_iteration": 2.7371840476989746 - }, - { - "auxiliary_loss_clip": 0.01121649, - "auxiliary_loss_mlp": 0.01040323, - "balance_loss_clip": 1.05089378, - "balance_loss_mlp": 1.02177453, - "epoch": 0.27265895084924097, - "flos": 24238409771520.0, - "grad_norm": 1.6997646677502514, - "language_loss": 0.91605014, - "learning_rate": 3.4134560343928005e-06, - "loss": 0.93766987, - "num_input_tokens_seen": 98044955, - "step": 4535, - "time_per_iteration": 2.72127103805542 - }, - { - "auxiliary_loss_clip": 0.0113065, - "auxiliary_loss_mlp": 0.01045251, - "balance_loss_clip": 1.05495596, - "balance_loss_mlp": 1.02739298, - "epoch": 0.27271907410190893, - "flos": 27013155494400.0, - "grad_norm": 1.6448383128638457, - "language_loss": 0.72919363, - "learning_rate": 3.4131804691642778e-06, - "loss": 0.7509526, - "num_input_tokens_seen": 98065860, - "step": 4536, - "time_per_iteration": 2.778991460800171 - }, - { - "auxiliary_loss_clip": 0.01137601, - "auxiliary_loss_mlp": 0.01044231, - "balance_loss_clip": 1.05134857, - "balance_loss_mlp": 1.02601612, - "epoch": 0.2727791973545769, - "flos": 34452588942720.0, - "grad_norm": 1.7760428855271044, - "language_loss": 0.71682841, - "learning_rate": 3.41290485034781e-06, - "loss": 0.73864675, - "num_input_tokens_seen": 98085450, - "step": 4537, - "time_per_iteration": 2.7746009826660156 - }, - { - "auxiliary_loss_clip": 0.01119602, - "auxiliary_loss_mlp": 0.01042982, - "balance_loss_clip": 1.04899096, - "balance_loss_mlp": 1.02455187, - "epoch": 0.27283932060724486, - "flos": 15041723160960.0, - "grad_norm": 2.103574663853892, - "language_loss": 0.77419543, - "learning_rate": 3.4126291779538485e-06, - "loss": 0.79582125, - "num_input_tokens_seen": 98099115, - "step": 4538, - "time_per_iteration": 2.6432113647460938 - }, - { - "auxiliary_loss_clip": 0.011333, - "auxiliary_loss_mlp": 0.01044735, - "balance_loss_clip": 1.05075216, - "balance_loss_mlp": 1.02784324, - "epoch": 0.2728994438599128, - "flos": 21652411040640.0, - "grad_norm": 1.824827492408775, - "language_loss": 0.90160263, - "learning_rate": 3.412353451992847e-06, - "loss": 0.923383, - "num_input_tokens_seen": 98118415, - "step": 4539, - "time_per_iteration": 2.620088815689087 - }, - { - "auxiliary_loss_clip": 0.0112346, - "auxiliary_loss_mlp": 0.01044264, - "balance_loss_clip": 1.04970992, - "balance_loss_mlp": 1.0250001, - "epoch": 0.2729595671125808, - "flos": 17488424949120.0, - "grad_norm": 1.7778813807473632, - "language_loss": 0.88033229, - "learning_rate": 3.4120776724752607e-06, - "loss": 0.90200949, - "num_input_tokens_seen": 98136300, - "step": 4540, - "time_per_iteration": 2.7115092277526855 - }, - { - "auxiliary_loss_clip": 0.01139055, - "auxiliary_loss_mlp": 0.00775653, - "balance_loss_clip": 1.0515871, - "balance_loss_mlp": 1.00068974, - "epoch": 0.27301969036524876, - "flos": 19318145800320.0, - "grad_norm": 3.2240434674097758, - "language_loss": 0.82471287, - "learning_rate": 3.4118018394115476e-06, - "loss": 0.84385997, - "num_input_tokens_seen": 98154580, - "step": 4541, - "time_per_iteration": 2.6112682819366455 - }, - { - "auxiliary_loss_clip": 0.01123955, - "auxiliary_loss_mlp": 0.01045117, - "balance_loss_clip": 1.05166435, - "balance_loss_mlp": 1.02798617, - "epoch": 0.2730798136179167, - "flos": 21065666376960.0, - "grad_norm": 2.102491799578544, - "language_loss": 0.79535306, - "learning_rate": 3.4115259528121678e-06, - "loss": 0.81704378, - "num_input_tokens_seen": 98173115, - "step": 4542, - "time_per_iteration": 2.7202932834625244 - }, - { - "auxiliary_loss_clip": 0.01130053, - "auxiliary_loss_mlp": 0.0103993, - "balance_loss_clip": 1.05406725, - "balance_loss_mlp": 1.02263296, - "epoch": 0.2731399368705847, - "flos": 19171737964800.0, - "grad_norm": 1.955696716620197, - "language_loss": 0.89326978, - "learning_rate": 3.411250012687582e-06, - "loss": 0.91496956, - "num_input_tokens_seen": 98190260, - "step": 4543, - "time_per_iteration": 2.6846654415130615 - }, - { - "auxiliary_loss_clip": 0.01118776, - "auxiliary_loss_mlp": 0.00776653, - "balance_loss_clip": 1.04913735, - "balance_loss_mlp": 1.00080073, - "epoch": 0.27320006012325265, - "flos": 18290130554880.0, - "grad_norm": 2.4410785724718997, - "language_loss": 0.64012986, - "learning_rate": 3.410974019048255e-06, - "loss": 0.65908414, - "num_input_tokens_seen": 98207115, - "step": 4544, - "time_per_iteration": 2.6373775005340576 - }, - { - "auxiliary_loss_clip": 0.01123945, - "auxiliary_loss_mlp": 0.01044578, - "balance_loss_clip": 1.05455351, - "balance_loss_mlp": 1.02582633, - "epoch": 0.2732601833759206, - "flos": 34860929731200.0, - "grad_norm": 3.5876362405970643, - "language_loss": 0.69788039, - "learning_rate": 3.410697971904651e-06, - "loss": 0.71956557, - "num_input_tokens_seen": 98230610, - "step": 4545, - "time_per_iteration": 2.7943291664123535 - }, - { - "auxiliary_loss_clip": 0.0103839, - "auxiliary_loss_mlp": 0.01023664, - "balance_loss_clip": 1.02576709, - "balance_loss_mlp": 1.02123213, - "epoch": 0.2733203066285886, - "flos": 53910824762880.0, - "grad_norm": 0.7314456658795918, - "language_loss": 0.61636353, - "learning_rate": 3.4104218712672383e-06, - "loss": 0.63698411, - "num_input_tokens_seen": 98293585, - "step": 4546, - "time_per_iteration": 3.2244455814361572 - }, - { - "auxiliary_loss_clip": 0.0105925, - "auxiliary_loss_mlp": 0.01053726, - "balance_loss_clip": 1.04915786, - "balance_loss_mlp": 1.03472424, - "epoch": 0.2733804298812566, - "flos": 20660378244480.0, - "grad_norm": 1.905103737754333, - "language_loss": 0.6467241, - "learning_rate": 3.410145717146488e-06, - "loss": 0.66785389, - "num_input_tokens_seen": 98311680, - "step": 4547, - "time_per_iteration": 2.7815287113189697 - }, - { - "auxiliary_loss_clip": 0.01123347, - "auxiliary_loss_mlp": 0.00774125, - "balance_loss_clip": 1.05267262, - "balance_loss_mlp": 1.00081313, - "epoch": 0.27344055313392457, - "flos": 25884339707520.0, - "grad_norm": 1.90846373489731, - "language_loss": 0.77248073, - "learning_rate": 3.4098695095528694e-06, - "loss": 0.79145551, - "num_input_tokens_seen": 98330770, - "step": 4548, - "time_per_iteration": 2.8113017082214355 - }, - { - "auxiliary_loss_clip": 0.01122557, - "auxiliary_loss_mlp": 0.01050902, - "balance_loss_clip": 1.05430245, - "balance_loss_mlp": 1.03526139, - "epoch": 0.27350067638659253, - "flos": 22929753565440.0, - "grad_norm": 1.9713428286290122, - "language_loss": 0.82792878, - "learning_rate": 3.4095932484968585e-06, - "loss": 0.84966338, - "num_input_tokens_seen": 98349860, - "step": 4549, - "time_per_iteration": 2.6938650608062744 - }, - { - "auxiliary_loss_clip": 0.01135405, - "auxiliary_loss_mlp": 0.01048728, - "balance_loss_clip": 1.04898036, - "balance_loss_mlp": 1.02902281, - "epoch": 0.2735607996392605, - "flos": 16574821499520.0, - "grad_norm": 3.4543610040263655, - "language_loss": 0.71193838, - "learning_rate": 3.4093169339889305e-06, - "loss": 0.73377967, - "num_input_tokens_seen": 98367040, - "step": 4550, - "time_per_iteration": 2.638643503189087 - }, - { - "auxiliary_loss_clip": 0.01107347, - "auxiliary_loss_mlp": 0.01042242, - "balance_loss_clip": 1.05066109, - "balance_loss_mlp": 1.02569556, - "epoch": 0.27362092289192846, - "flos": 19645291895040.0, - "grad_norm": 3.3050607953849576, - "language_loss": 0.78899491, - "learning_rate": 3.409040566039563e-06, - "loss": 0.81049079, - "num_input_tokens_seen": 98384010, - "step": 4551, - "time_per_iteration": 4.352613210678101 - }, - { - "auxiliary_loss_clip": 0.01107945, - "auxiliary_loss_mlp": 0.01052105, - "balance_loss_clip": 1.04898548, - "balance_loss_mlp": 1.03342533, - "epoch": 0.27368104614459643, - "flos": 17639142416640.0, - "grad_norm": 2.480443972085862, - "language_loss": 0.71220398, - "learning_rate": 3.4087641446592362e-06, - "loss": 0.73380452, - "num_input_tokens_seen": 98399625, - "step": 4552, - "time_per_iteration": 4.194540739059448 - }, - { - "auxiliary_loss_clip": 0.01123037, - "auxiliary_loss_mlp": 0.01045225, - "balance_loss_clip": 1.05144608, - "balance_loss_mlp": 1.0275104, - "epoch": 0.2737411693972644, - "flos": 21580015178880.0, - "grad_norm": 2.1026303213651967, - "language_loss": 0.71636003, - "learning_rate": 3.408487669858431e-06, - "loss": 0.73804259, - "num_input_tokens_seen": 98417310, - "step": 4553, - "time_per_iteration": 2.7323882579803467 - }, - { - "auxiliary_loss_clip": 0.01134032, - "auxiliary_loss_mlp": 0.01045217, - "balance_loss_clip": 1.05039358, - "balance_loss_mlp": 1.02658415, - "epoch": 0.27380129264993236, - "flos": 25484043565440.0, - "grad_norm": 1.7325126580228065, - "language_loss": 0.58917797, - "learning_rate": 3.4082111416476337e-06, - "loss": 0.6109705, - "num_input_tokens_seen": 98438670, - "step": 4554, - "time_per_iteration": 2.7384533882141113 - }, - { - "auxiliary_loss_clip": 0.01129927, - "auxiliary_loss_mlp": 0.01042216, - "balance_loss_clip": 1.05440903, - "balance_loss_mlp": 1.02400088, - "epoch": 0.2738614159026003, - "flos": 18661196004480.0, - "grad_norm": 1.7915916386168997, - "language_loss": 0.73645991, - "learning_rate": 3.4079345600373275e-06, - "loss": 0.75818133, - "num_input_tokens_seen": 98456060, - "step": 4555, - "time_per_iteration": 2.742417335510254 - }, - { - "auxiliary_loss_clip": 0.01141373, - "auxiliary_loss_mlp": 0.01039158, - "balance_loss_clip": 1.0561738, - "balance_loss_mlp": 1.02152658, - "epoch": 0.2739215391552683, - "flos": 23477139901440.0, - "grad_norm": 2.8904145278515303, - "language_loss": 0.77755523, - "learning_rate": 3.407657925038002e-06, - "loss": 0.79936051, - "num_input_tokens_seen": 98473765, - "step": 4556, - "time_per_iteration": 4.419378280639648 - }, - { - "auxiliary_loss_clip": 0.01150896, - "auxiliary_loss_mlp": 0.01049261, - "balance_loss_clip": 1.05645621, - "balance_loss_mlp": 1.02959132, - "epoch": 0.27398166240793626, - "flos": 17128636369920.0, - "grad_norm": 7.460972643049535, - "language_loss": 0.82236463, - "learning_rate": 3.4073812366601473e-06, - "loss": 0.84436619, - "num_input_tokens_seen": 98490590, - "step": 4557, - "time_per_iteration": 2.6087756156921387 - }, - { - "auxiliary_loss_clip": 0.01089746, - "auxiliary_loss_mlp": 0.01046447, - "balance_loss_clip": 1.04229808, - "balance_loss_mlp": 1.02811229, - "epoch": 0.2740417856606042, - "flos": 23404744039680.0, - "grad_norm": 2.034332886344347, - "language_loss": 0.7293033, - "learning_rate": 3.4071044949142547e-06, - "loss": 0.75066525, - "num_input_tokens_seen": 98510590, - "step": 4558, - "time_per_iteration": 2.7908921241760254 - }, - { - "auxiliary_loss_clip": 0.0112554, - "auxiliary_loss_mlp": 0.01051481, - "balance_loss_clip": 1.05215442, - "balance_loss_mlp": 1.03334939, - "epoch": 0.2741019089132722, - "flos": 12780428400000.0, - "grad_norm": 2.134307291688894, - "language_loss": 0.67842996, - "learning_rate": 3.406827699810819e-06, - "loss": 0.70020014, - "num_input_tokens_seen": 98527875, - "step": 4559, - "time_per_iteration": 2.7246246337890625 - }, - { - "auxiliary_loss_clip": 0.01121642, - "auxiliary_loss_mlp": 0.01055203, - "balance_loss_clip": 1.04958165, - "balance_loss_mlp": 1.03646374, - "epoch": 0.27416203216594015, - "flos": 20631542601600.0, - "grad_norm": 2.095192605103166, - "language_loss": 0.7249226, - "learning_rate": 3.4065508513603353e-06, - "loss": 0.74669105, - "num_input_tokens_seen": 98547575, - "step": 4560, - "time_per_iteration": 2.634526252746582 - }, - { - "auxiliary_loss_clip": 0.01131443, - "auxiliary_loss_mlp": 0.01049928, - "balance_loss_clip": 1.05592251, - "balance_loss_mlp": 1.03115225, - "epoch": 0.27422215541860817, - "flos": 26541576812160.0, - "grad_norm": 2.095026193088577, - "language_loss": 0.81413525, - "learning_rate": 3.406273949573303e-06, - "loss": 0.83594894, - "num_input_tokens_seen": 98566290, - "step": 4561, - "time_per_iteration": 2.711106538772583 - }, - { - "auxiliary_loss_clip": 0.01156737, - "auxiliary_loss_mlp": 0.01043903, - "balance_loss_clip": 1.05919766, - "balance_loss_mlp": 1.02688003, - "epoch": 0.27428227867127614, - "flos": 23331163029120.0, - "grad_norm": 1.7066421621801435, - "language_loss": 0.75436246, - "learning_rate": 3.4059969944602214e-06, - "loss": 0.77636886, - "num_input_tokens_seen": 98586255, - "step": 4562, - "time_per_iteration": 2.699544668197632 - }, - { - "auxiliary_loss_clip": 0.01155238, - "auxiliary_loss_mlp": 0.01038722, - "balance_loss_clip": 1.06035113, - "balance_loss_mlp": 1.02138865, - "epoch": 0.2743424019239441, - "flos": 23035115134080.0, - "grad_norm": 1.784616644228294, - "language_loss": 0.74751598, - "learning_rate": 3.4057199860315928e-06, - "loss": 0.76945561, - "num_input_tokens_seen": 98606030, - "step": 4563, - "time_per_iteration": 2.788313627243042 - }, - { - "auxiliary_loss_clip": 0.01119321, - "auxiliary_loss_mlp": 0.01048987, - "balance_loss_clip": 1.04918432, - "balance_loss_mlp": 1.02912664, - "epoch": 0.27440252517661207, - "flos": 21981101420160.0, - "grad_norm": 1.7657560231579414, - "language_loss": 0.63026172, - "learning_rate": 3.4054429242979213e-06, - "loss": 0.65194476, - "num_input_tokens_seen": 98625225, - "step": 4564, - "time_per_iteration": 2.810922145843506 - }, - { - "auxiliary_loss_clip": 0.01128901, - "auxiliary_loss_mlp": 0.01046032, - "balance_loss_clip": 1.05438292, - "balance_loss_mlp": 1.02732766, - "epoch": 0.27446264842928003, - "flos": 40187451502080.0, - "grad_norm": 1.9571814389681148, - "language_loss": 0.78683448, - "learning_rate": 3.4051658092697135e-06, - "loss": 0.8085838, - "num_input_tokens_seen": 98649470, - "step": 4565, - "time_per_iteration": 2.846803665161133 - }, - { - "auxiliary_loss_clip": 0.01095875, - "auxiliary_loss_mlp": 0.01050978, - "balance_loss_clip": 1.04981828, - "balance_loss_mlp": 1.03370428, - "epoch": 0.274522771681948, - "flos": 13479681438720.0, - "grad_norm": 2.4708024317398003, - "language_loss": 0.68715227, - "learning_rate": 3.404888640957477e-06, - "loss": 0.70862079, - "num_input_tokens_seen": 98666915, - "step": 4566, - "time_per_iteration": 2.714352607727051 - }, - { - "auxiliary_loss_clip": 0.01142259, - "auxiliary_loss_mlp": 0.01049797, - "balance_loss_clip": 1.05835438, - "balance_loss_mlp": 1.03326273, - "epoch": 0.27458289493461596, - "flos": 28622133313920.0, - "grad_norm": 2.1203833431876435, - "language_loss": 0.60966527, - "learning_rate": 3.404611419371723e-06, - "loss": 0.63158584, - "num_input_tokens_seen": 98688240, - "step": 4567, - "time_per_iteration": 2.71791934967041 - }, - { - "auxiliary_loss_clip": 0.01135855, - "auxiliary_loss_mlp": 0.01047435, - "balance_loss_clip": 1.05527198, - "balance_loss_mlp": 1.02756321, - "epoch": 0.2746430181872839, - "flos": 20119815492480.0, - "grad_norm": 4.134990661591929, - "language_loss": 0.82529241, - "learning_rate": 3.4043341445229627e-06, - "loss": 0.84712529, - "num_input_tokens_seen": 98708245, - "step": 4568, - "time_per_iteration": 2.6779236793518066 - }, - { - "auxiliary_loss_clip": 0.01141648, - "auxiliary_loss_mlp": 0.01037451, - "balance_loss_clip": 1.06012177, - "balance_loss_mlp": 1.01916456, - "epoch": 0.2747031414399519, - "flos": 20193468330240.0, - "grad_norm": 2.0524329167860254, - "language_loss": 0.68425417, - "learning_rate": 3.4040568164217117e-06, - "loss": 0.70604521, - "num_input_tokens_seen": 98724575, - "step": 4569, - "time_per_iteration": 2.6595280170440674 - }, - { - "auxiliary_loss_clip": 0.0111585, - "auxiliary_loss_mlp": 0.01047943, - "balance_loss_clip": 1.04627442, - "balance_loss_mlp": 1.02938235, - "epoch": 0.27476326469261986, - "flos": 13516346246400.0, - "grad_norm": 2.9457223850766283, - "language_loss": 0.70966327, - "learning_rate": 3.4037794350784848e-06, - "loss": 0.73130119, - "num_input_tokens_seen": 98740700, - "step": 4570, - "time_per_iteration": 2.7404215335845947 - }, - { - "auxiliary_loss_clip": 0.01035018, - "auxiliary_loss_mlp": 0.01027544, - "balance_loss_clip": 1.03062916, - "balance_loss_mlp": 1.02521896, - "epoch": 0.2748233879452878, - "flos": 65937127121280.0, - "grad_norm": 0.7294499123437721, - "language_loss": 0.55835986, - "learning_rate": 3.4035020005038014e-06, - "loss": 0.57898545, - "num_input_tokens_seen": 98803030, - "step": 4571, - "time_per_iteration": 3.369403123855591 - }, - { - "auxiliary_loss_clip": 0.01096573, - "auxiliary_loss_mlp": 0.0104917, - "balance_loss_clip": 1.0493505, - "balance_loss_mlp": 1.03134847, - "epoch": 0.2748835111979558, - "flos": 17384212615680.0, - "grad_norm": 2.8212366896407772, - "language_loss": 0.78388298, - "learning_rate": 3.4032245127081812e-06, - "loss": 0.80534041, - "num_input_tokens_seen": 98820505, - "step": 4572, - "time_per_iteration": 2.835817813873291 - }, - { - "auxiliary_loss_clip": 0.01145371, - "auxiliary_loss_mlp": 0.01038852, - "balance_loss_clip": 1.05474758, - "balance_loss_mlp": 1.02365255, - "epoch": 0.27494363445062375, - "flos": 23587565287680.0, - "grad_norm": 3.882915196153325, - "language_loss": 0.8126958, - "learning_rate": 3.402946971702147e-06, - "loss": 0.83453798, - "num_input_tokens_seen": 98842150, - "step": 4573, - "time_per_iteration": 2.709415912628174 - }, - { - "auxiliary_loss_clip": 0.01135124, - "auxiliary_loss_mlp": 0.01042886, - "balance_loss_clip": 1.0529685, - "balance_loss_mlp": 1.0252434, - "epoch": 0.2750037577032918, - "flos": 17164582905600.0, - "grad_norm": 1.740498780022663, - "language_loss": 0.79043669, - "learning_rate": 3.402669377496223e-06, - "loss": 0.81221676, - "num_input_tokens_seen": 98861050, - "step": 4574, - "time_per_iteration": 2.651921272277832 - }, - { - "auxiliary_loss_clip": 0.01104251, - "auxiliary_loss_mlp": 0.01052183, - "balance_loss_clip": 1.05164313, - "balance_loss_mlp": 1.03518367, - "epoch": 0.27506388095595974, - "flos": 24491903028480.0, - "grad_norm": 2.03666793953709, - "language_loss": 0.74517256, - "learning_rate": 3.402391730100936e-06, - "loss": 0.76673687, - "num_input_tokens_seen": 98879695, - "step": 4575, - "time_per_iteration": 2.7622992992401123 - }, - { - "auxiliary_loss_clip": 0.01126178, - "auxiliary_loss_mlp": 0.01042992, - "balance_loss_clip": 1.05188203, - "balance_loss_mlp": 1.02700627, - "epoch": 0.2751240042086277, - "flos": 38764706722560.0, - "grad_norm": 2.5671977719319745, - "language_loss": 0.71951419, - "learning_rate": 3.402114029526814e-06, - "loss": 0.74120593, - "num_input_tokens_seen": 98902035, - "step": 4576, - "time_per_iteration": 2.85740065574646 - }, - { - "auxiliary_loss_clip": 0.01102681, - "auxiliary_loss_mlp": 0.00778132, - "balance_loss_clip": 1.0506314, - "balance_loss_mlp": 1.00075579, - "epoch": 0.27518412746129567, - "flos": 26907039740160.0, - "grad_norm": 1.8050360629969575, - "language_loss": 0.73217857, - "learning_rate": 3.4018362757843866e-06, - "loss": 0.7509867, - "num_input_tokens_seen": 98921835, - "step": 4577, - "time_per_iteration": 2.9024770259857178 - }, - { - "auxiliary_loss_clip": 0.01130618, - "auxiliary_loss_mlp": 0.01043838, - "balance_loss_clip": 1.05657601, - "balance_loss_mlp": 1.02571797, - "epoch": 0.27524425071396363, - "flos": 24900531125760.0, - "grad_norm": 1.7818656930434014, - "language_loss": 0.76073247, - "learning_rate": 3.401558468884188e-06, - "loss": 0.78247702, - "num_input_tokens_seen": 98939610, - "step": 4578, - "time_per_iteration": 2.7173874378204346 - }, - { - "auxiliary_loss_clip": 0.01120877, - "auxiliary_loss_mlp": 0.01047646, - "balance_loss_clip": 1.05252147, - "balance_loss_mlp": 1.02741659, - "epoch": 0.2753043739666316, - "flos": 26288047641600.0, - "grad_norm": 2.6134371594901773, - "language_loss": 0.66563278, - "learning_rate": 3.4012806088367516e-06, - "loss": 0.68731803, - "num_input_tokens_seen": 98962250, - "step": 4579, - "time_per_iteration": 2.730104446411133 - }, - { - "auxiliary_loss_clip": 0.01113502, - "auxiliary_loss_mlp": 0.01058443, - "balance_loss_clip": 1.04683816, - "balance_loss_mlp": 1.03911948, - "epoch": 0.27536449721929956, - "flos": 24206772867840.0, - "grad_norm": 1.8779975195253575, - "language_loss": 0.80174518, - "learning_rate": 3.4010026956526137e-06, - "loss": 0.82346463, - "num_input_tokens_seen": 98981845, - "step": 4580, - "time_per_iteration": 2.8395349979400635 - }, - { - "auxiliary_loss_clip": 0.01141995, - "auxiliary_loss_mlp": 0.01050029, - "balance_loss_clip": 1.05684924, - "balance_loss_mlp": 1.02942991, - "epoch": 0.27542462047196753, - "flos": 19537272720000.0, - "grad_norm": 1.5301552660019138, - "language_loss": 0.67242241, - "learning_rate": 3.4007247293423137e-06, - "loss": 0.69434267, - "num_input_tokens_seen": 99001855, - "step": 4581, - "time_per_iteration": 2.788644552230835 - }, - { - "auxiliary_loss_clip": 0.01132258, - "auxiliary_loss_mlp": 0.0104746, - "balance_loss_clip": 1.0560689, - "balance_loss_mlp": 1.03050864, - "epoch": 0.2754847437246355, - "flos": 14319165173760.0, - "grad_norm": 1.785645052077455, - "language_loss": 0.77915615, - "learning_rate": 3.400446709916392e-06, - "loss": 0.80095327, - "num_input_tokens_seen": 99019880, - "step": 4582, - "time_per_iteration": 2.730393409729004 - }, - { - "auxiliary_loss_clip": 0.0110084, - "auxiliary_loss_mlp": 0.01042256, - "balance_loss_clip": 1.05119133, - "balance_loss_mlp": 1.02575767, - "epoch": 0.27554486697730346, - "flos": 18838773866880.0, - "grad_norm": 1.737971373642785, - "language_loss": 0.84479475, - "learning_rate": 3.4001686373853895e-06, - "loss": 0.86622572, - "num_input_tokens_seen": 99037570, - "step": 4583, - "time_per_iteration": 2.7274270057678223 - }, - { - "auxiliary_loss_clip": 0.01139632, - "auxiliary_loss_mlp": 0.01044098, - "balance_loss_clip": 1.05364764, - "balance_loss_mlp": 1.02693176, - "epoch": 0.2756049902299714, - "flos": 22382295402240.0, - "grad_norm": 1.6883560409679848, - "language_loss": 0.67007428, - "learning_rate": 3.3998905117598528e-06, - "loss": 0.69191158, - "num_input_tokens_seen": 99056875, - "step": 4584, - "time_per_iteration": 2.643176794052124 - }, - { - "auxiliary_loss_clip": 0.01080495, - "auxiliary_loss_mlp": 0.01054092, - "balance_loss_clip": 1.04106402, - "balance_loss_mlp": 1.03475666, - "epoch": 0.2756651134826394, - "flos": 19573901614080.0, - "grad_norm": 1.8352571769398758, - "language_loss": 0.77349764, - "learning_rate": 3.399612333050327e-06, - "loss": 0.79484355, - "num_input_tokens_seen": 99074685, - "step": 4585, - "time_per_iteration": 2.6824886798858643 - }, - { - "auxiliary_loss_clip": 0.01142822, - "auxiliary_loss_mlp": 0.00775816, - "balance_loss_clip": 1.05703616, - "balance_loss_mlp": 1.00084651, - "epoch": 0.27572523673530736, - "flos": 23586559706880.0, - "grad_norm": 1.697985370469672, - "language_loss": 0.7201665, - "learning_rate": 3.399334101267362e-06, - "loss": 0.73935288, - "num_input_tokens_seen": 99095300, - "step": 4586, - "time_per_iteration": 2.672872304916382 - }, - { - "auxiliary_loss_clip": 0.01125604, - "auxiliary_loss_mlp": 0.01038583, - "balance_loss_clip": 1.05329537, - "balance_loss_mlp": 1.02184618, - "epoch": 0.2757853599879754, - "flos": 22820118278400.0, - "grad_norm": 2.166019285475688, - "language_loss": 0.80385983, - "learning_rate": 3.3990558164215073e-06, - "loss": 0.82550168, - "num_input_tokens_seen": 99115965, - "step": 4587, - "time_per_iteration": 2.716212272644043 - }, - { - "auxiliary_loss_clip": 0.01139286, - "auxiliary_loss_mlp": 0.0104661, - "balance_loss_clip": 1.05435753, - "balance_loss_mlp": 1.02916992, - "epoch": 0.27584548324064334, - "flos": 18551704371840.0, - "grad_norm": 3.416975868515595, - "language_loss": 0.83000016, - "learning_rate": 3.398777478523316e-06, - "loss": 0.85185915, - "num_input_tokens_seen": 99134265, - "step": 4588, - "time_per_iteration": 2.6104485988616943 - }, - { - "auxiliary_loss_clip": 0.01109827, - "auxiliary_loss_mlp": 0.01042868, - "balance_loss_clip": 1.04756808, - "balance_loss_mlp": 1.02567828, - "epoch": 0.2759056064933113, - "flos": 23769883745280.0, - "grad_norm": 1.3306263403060763, - "language_loss": 0.75309169, - "learning_rate": 3.398499087583342e-06, - "loss": 0.77461863, - "num_input_tokens_seen": 99156185, - "step": 4589, - "time_per_iteration": 4.333514928817749 - }, - { - "auxiliary_loss_clip": 0.01138237, - "auxiliary_loss_mlp": 0.01046648, - "balance_loss_clip": 1.0555464, - "balance_loss_mlp": 1.02944636, - "epoch": 0.27596572974597927, - "flos": 24281898163200.0, - "grad_norm": 1.9812216556422375, - "language_loss": 0.8860873, - "learning_rate": 3.398220643612143e-06, - "loss": 0.90793616, - "num_input_tokens_seen": 99176735, - "step": 4590, - "time_per_iteration": 4.256460428237915 - }, - { - "auxiliary_loss_clip": 0.01132985, - "auxiliary_loss_mlp": 0.01048634, - "balance_loss_clip": 1.05280411, - "balance_loss_mlp": 1.03025222, - "epoch": 0.27602585299864724, - "flos": 35040985632000.0, - "grad_norm": 1.594737426944321, - "language_loss": 0.71265185, - "learning_rate": 3.397942146620277e-06, - "loss": 0.7344681, - "num_input_tokens_seen": 99199765, - "step": 4591, - "time_per_iteration": 2.8263018131256104 - }, - { - "auxiliary_loss_clip": 0.01114882, - "auxiliary_loss_mlp": 0.01048296, - "balance_loss_clip": 1.05395412, - "balance_loss_mlp": 1.0301044, - "epoch": 0.2760859762513152, - "flos": 24309405002880.0, - "grad_norm": 3.793452037579163, - "language_loss": 0.80017495, - "learning_rate": 3.3976635966183046e-06, - "loss": 0.82180673, - "num_input_tokens_seen": 99218435, - "step": 4592, - "time_per_iteration": 4.289790153503418 - }, - { - "auxiliary_loss_clip": 0.01051224, - "auxiliary_loss_mlp": 0.00755885, - "balance_loss_clip": 1.02655387, - "balance_loss_mlp": 1.00253439, - "epoch": 0.27614609950398317, - "flos": 71260739890560.0, - "grad_norm": 0.710408868807485, - "language_loss": 0.61613023, - "learning_rate": 3.3973849936167886e-06, - "loss": 0.63420129, - "num_input_tokens_seen": 99276200, - "step": 4593, - "time_per_iteration": 3.201831817626953 - }, - { - "auxiliary_loss_clip": 0.01130969, - "auxiliary_loss_mlp": 0.01042983, - "balance_loss_clip": 1.05307889, - "balance_loss_mlp": 1.02640104, - "epoch": 0.27620622275665113, - "flos": 29674854138240.0, - "grad_norm": 1.9659750468178385, - "language_loss": 0.778301, - "learning_rate": 3.3971063376262937e-06, - "loss": 0.80004054, - "num_input_tokens_seen": 99297625, - "step": 4594, - "time_per_iteration": 2.7222111225128174 - }, - { - "auxiliary_loss_clip": 0.0113791, - "auxiliary_loss_mlp": 0.01038649, - "balance_loss_clip": 1.05557215, - "balance_loss_mlp": 1.02168524, - "epoch": 0.2762663460093191, - "flos": 15378063137280.0, - "grad_norm": 1.5118783378909677, - "language_loss": 0.91944981, - "learning_rate": 3.3968276286573866e-06, - "loss": 0.9412154, - "num_input_tokens_seen": 99315790, - "step": 4595, - "time_per_iteration": 4.290736198425293 - }, - { - "auxiliary_loss_clip": 0.01134891, - "auxiliary_loss_mlp": 0.01052323, - "balance_loss_clip": 1.05374146, - "balance_loss_mlp": 1.03413117, - "epoch": 0.27632646926198706, - "flos": 20704082117760.0, - "grad_norm": 1.7744098894398055, - "language_loss": 0.69208467, - "learning_rate": 3.3965488667206353e-06, - "loss": 0.71395689, - "num_input_tokens_seen": 99334615, - "step": 4596, - "time_per_iteration": 2.7178540229797363 - }, - { - "auxiliary_loss_clip": 0.01125254, - "auxiliary_loss_mlp": 0.01048102, - "balance_loss_clip": 1.05075955, - "balance_loss_mlp": 1.02977943, - "epoch": 0.276386592514655, - "flos": 32813374849920.0, - "grad_norm": 1.7305541104386353, - "language_loss": 0.63536781, - "learning_rate": 3.3962700518266113e-06, - "loss": 0.65710139, - "num_input_tokens_seen": 99356685, - "step": 4597, - "time_per_iteration": 2.7713348865509033 - }, - { - "auxiliary_loss_clip": 0.01150233, - "auxiliary_loss_mlp": 0.01046127, - "balance_loss_clip": 1.05762243, - "balance_loss_mlp": 1.02949786, - "epoch": 0.276446715767323, - "flos": 18551704371840.0, - "grad_norm": 2.077440653118394, - "language_loss": 0.86298984, - "learning_rate": 3.395991183985887e-06, - "loss": 0.8849535, - "num_input_tokens_seen": 99374810, - "step": 4598, - "time_per_iteration": 2.6077804565429688 - }, - { - "auxiliary_loss_clip": 0.01151532, - "auxiliary_loss_mlp": 0.01046218, - "balance_loss_clip": 1.0559516, - "balance_loss_mlp": 1.02790797, - "epoch": 0.27650683901999096, - "flos": 22819615488000.0, - "grad_norm": 2.6195813063936493, - "language_loss": 0.79957914, - "learning_rate": 3.395712263209037e-06, - "loss": 0.82155669, - "num_input_tokens_seen": 99391290, - "step": 4599, - "time_per_iteration": 2.67372989654541 - }, - { - "auxiliary_loss_clip": 0.01127397, - "auxiliary_loss_mlp": 0.01049332, - "balance_loss_clip": 1.04922533, - "balance_loss_mlp": 1.03152239, - "epoch": 0.276566962272659, - "flos": 21361534704000.0, - "grad_norm": 1.7492576371751551, - "language_loss": 0.78788924, - "learning_rate": 3.395433289506639e-06, - "loss": 0.80965656, - "num_input_tokens_seen": 99409120, - "step": 4600, - "time_per_iteration": 2.7197396755218506 - }, - { - "auxiliary_loss_clip": 0.01119636, - "auxiliary_loss_mlp": 0.01049981, - "balance_loss_clip": 1.05458808, - "balance_loss_mlp": 1.03226674, - "epoch": 0.27662708552532694, - "flos": 17710604524800.0, - "grad_norm": 2.9827767838021906, - "language_loss": 0.7372371, - "learning_rate": 3.3951542628892694e-06, - "loss": 0.75893331, - "num_input_tokens_seen": 99426180, - "step": 4601, - "time_per_iteration": 2.7212698459625244 - }, - { - "auxiliary_loss_clip": 0.01137986, - "auxiliary_loss_mlp": 0.01053484, - "balance_loss_clip": 1.05503917, - "balance_loss_mlp": 1.03514934, - "epoch": 0.2766872087779949, - "flos": 21252725429760.0, - "grad_norm": 1.7018676665174548, - "language_loss": 0.80055201, - "learning_rate": 3.3948751833675113e-06, - "loss": 0.82246667, - "num_input_tokens_seen": 99447720, - "step": 4602, - "time_per_iteration": 2.6929776668548584 - }, - { - "auxiliary_loss_clip": 0.01131471, - "auxiliary_loss_mlp": 0.01060998, - "balance_loss_clip": 1.05209374, - "balance_loss_mlp": 1.04194784, - "epoch": 0.2767473320306629, - "flos": 12931900053120.0, - "grad_norm": 2.3561631161543986, - "language_loss": 0.77018148, - "learning_rate": 3.3945960509519455e-06, - "loss": 0.79210615, - "num_input_tokens_seen": 99464720, - "step": 4603, - "time_per_iteration": 2.7761597633361816 - }, - { - "auxiliary_loss_clip": 0.01118804, - "auxiliary_loss_mlp": 0.01044782, - "balance_loss_clip": 1.05331254, - "balance_loss_mlp": 1.02858686, - "epoch": 0.27680745528333084, - "flos": 15012851604480.0, - "grad_norm": 1.686999686787164, - "language_loss": 0.81469357, - "learning_rate": 3.3943168656531585e-06, - "loss": 0.83632934, - "num_input_tokens_seen": 99482310, - "step": 4604, - "time_per_iteration": 2.6715614795684814 - }, - { - "auxiliary_loss_clip": 0.01096642, - "auxiliary_loss_mlp": 0.0104217, - "balance_loss_clip": 1.04733086, - "balance_loss_mlp": 1.02428889, - "epoch": 0.2768675785359988, - "flos": 22637835734400.0, - "grad_norm": 1.8500484413544072, - "language_loss": 0.7021662, - "learning_rate": 3.3940376274817363e-06, - "loss": 0.72355425, - "num_input_tokens_seen": 99501255, - "step": 4605, - "time_per_iteration": 2.824810266494751 - }, - { - "auxiliary_loss_clip": 0.01051326, - "auxiliary_loss_mlp": 0.01005015, - "balance_loss_clip": 1.02826095, - "balance_loss_mlp": 1.00244009, - "epoch": 0.27692770178866677, - "flos": 66130542881280.0, - "grad_norm": 0.7013581781305706, - "language_loss": 0.57222801, - "learning_rate": 3.3937583364482673e-06, - "loss": 0.59279138, - "num_input_tokens_seen": 99568925, - "step": 4606, - "time_per_iteration": 3.288269519805908 - }, - { - "auxiliary_loss_clip": 0.01125032, - "auxiliary_loss_mlp": 0.01050719, - "balance_loss_clip": 1.05177283, - "balance_loss_mlp": 1.03280139, - "epoch": 0.27698782504133473, - "flos": 26464979059200.0, - "grad_norm": 1.9503980757161308, - "language_loss": 0.69579148, - "learning_rate": 3.3934789925633424e-06, - "loss": 0.71754897, - "num_input_tokens_seen": 99588455, - "step": 4607, - "time_per_iteration": 2.7865042686462402 - }, - { - "auxiliary_loss_clip": 0.0113039, - "auxiliary_loss_mlp": 0.01040949, - "balance_loss_clip": 1.05402029, - "balance_loss_mlp": 1.0242002, - "epoch": 0.2770479482940027, - "flos": 25884806584320.0, - "grad_norm": 1.5552750364168406, - "language_loss": 0.69727945, - "learning_rate": 3.393199595837555e-06, - "loss": 0.71899283, - "num_input_tokens_seen": 99609355, - "step": 4608, - "time_per_iteration": 2.7139909267425537 - }, - { - "auxiliary_loss_clip": 0.0109619, - "auxiliary_loss_mlp": 0.01041619, - "balance_loss_clip": 1.04789758, - "balance_loss_mlp": 1.024894, - "epoch": 0.27710807154667066, - "flos": 22857249962880.0, - "grad_norm": 1.922338327624115, - "language_loss": 0.73170602, - "learning_rate": 3.392920146281499e-06, - "loss": 0.75308412, - "num_input_tokens_seen": 99628780, - "step": 4609, - "time_per_iteration": 2.8674490451812744 - }, - { - "auxiliary_loss_clip": 0.01105896, - "auxiliary_loss_mlp": 0.01054215, - "balance_loss_clip": 1.04444993, - "balance_loss_mlp": 1.03615475, - "epoch": 0.27716819479933863, - "flos": 17711071401600.0, - "grad_norm": 2.284482242639661, - "language_loss": 0.84028268, - "learning_rate": 3.3926406439057714e-06, - "loss": 0.86188376, - "num_input_tokens_seen": 99644545, - "step": 4610, - "time_per_iteration": 2.6861605644226074 - }, - { - "auxiliary_loss_clip": 0.01074905, - "auxiliary_loss_mlp": 0.00781444, - "balance_loss_clip": 1.04093325, - "balance_loss_mlp": 1.00102568, - "epoch": 0.2772283180520066, - "flos": 19646046080640.0, - "grad_norm": 2.0943450829127044, - "language_loss": 0.68915951, - "learning_rate": 3.3923610887209705e-06, - "loss": 0.70772296, - "num_input_tokens_seen": 99663125, - "step": 4611, - "time_per_iteration": 2.799345016479492 - }, - { - "auxiliary_loss_clip": 0.01144902, - "auxiliary_loss_mlp": 0.01042567, - "balance_loss_clip": 1.05466819, - "balance_loss_mlp": 1.02591395, - "epoch": 0.27728844130467456, - "flos": 21032628842880.0, - "grad_norm": 2.6988182686748785, - "language_loss": 0.73646772, - "learning_rate": 3.392081480737698e-06, - "loss": 0.75834239, - "num_input_tokens_seen": 99682645, - "step": 4612, - "time_per_iteration": 2.643157720565796 - }, - { - "auxiliary_loss_clip": 0.01139286, - "auxiliary_loss_mlp": 0.00775997, - "balance_loss_clip": 1.05283117, - "balance_loss_mlp": 1.00099993, - "epoch": 0.2773485645573425, - "flos": 18989204025600.0, - "grad_norm": 2.0654093622255436, - "language_loss": 0.66356897, - "learning_rate": 3.3918018199665563e-06, - "loss": 0.68272179, - "num_input_tokens_seen": 99700520, - "step": 4613, - "time_per_iteration": 2.6685144901275635 - }, - { - "auxiliary_loss_clip": 0.01096758, - "auxiliary_loss_mlp": 0.01051618, - "balance_loss_clip": 1.04526055, - "balance_loss_mlp": 1.03354573, - "epoch": 0.27740868781001055, - "flos": 21468440557440.0, - "grad_norm": 1.5160858700983233, - "language_loss": 0.79385912, - "learning_rate": 3.39152210641815e-06, - "loss": 0.8153429, - "num_input_tokens_seen": 99720355, - "step": 4614, - "time_per_iteration": 2.82061505317688 - }, - { - "auxiliary_loss_clip": 0.01129896, - "auxiliary_loss_mlp": 0.01047714, - "balance_loss_clip": 1.04873419, - "balance_loss_mlp": 1.02978539, - "epoch": 0.2774688110626785, - "flos": 19827825834240.0, - "grad_norm": 2.763943164845673, - "language_loss": 0.80632633, - "learning_rate": 3.3912423401030865e-06, - "loss": 0.82810241, - "num_input_tokens_seen": 99736090, - "step": 4615, - "time_per_iteration": 2.607448101043701 - }, - { - "auxiliary_loss_clip": 0.01114657, - "auxiliary_loss_mlp": 0.01051705, - "balance_loss_clip": 1.04532576, - "balance_loss_mlp": 1.03447962, - "epoch": 0.2775289343153465, - "flos": 18216226321920.0, - "grad_norm": 2.3373471978129543, - "language_loss": 0.646945, - "learning_rate": 3.3909625210319735e-06, - "loss": 0.66860855, - "num_input_tokens_seen": 99751805, - "step": 4616, - "time_per_iteration": 2.693556308746338 - }, - { - "auxiliary_loss_clip": 0.01133374, - "auxiliary_loss_mlp": 0.01047225, - "balance_loss_clip": 1.0536505, - "balance_loss_mlp": 1.03001153, - "epoch": 0.27758905756801444, - "flos": 16472476673280.0, - "grad_norm": 2.175848824107301, - "language_loss": 0.82324976, - "learning_rate": 3.3906826492154226e-06, - "loss": 0.84505582, - "num_input_tokens_seen": 99770610, - "step": 4617, - "time_per_iteration": 2.64677357673645 - }, - { - "auxiliary_loss_clip": 0.01147475, - "auxiliary_loss_mlp": 0.01049438, - "balance_loss_clip": 1.05210304, - "balance_loss_mlp": 1.03261721, - "epoch": 0.2776491808206824, - "flos": 18728240739840.0, - "grad_norm": 2.8579401527932236, - "language_loss": 0.77031851, - "learning_rate": 3.3904027246640458e-06, - "loss": 0.79228759, - "num_input_tokens_seen": 99787305, - "step": 4618, - "time_per_iteration": 2.555001735687256 - }, - { - "auxiliary_loss_clip": 0.01151182, - "auxiliary_loss_mlp": 0.01042958, - "balance_loss_clip": 1.05599475, - "balance_loss_mlp": 1.0268048, - "epoch": 0.27770930407335037, - "flos": 28038189911040.0, - "grad_norm": 1.6850470881083441, - "language_loss": 0.85102153, - "learning_rate": 3.390122747388459e-06, - "loss": 0.87296283, - "num_input_tokens_seen": 99808940, - "step": 4619, - "time_per_iteration": 2.753230094909668 - }, - { - "auxiliary_loss_clip": 0.01121872, - "auxiliary_loss_mlp": 0.01041506, - "balance_loss_clip": 1.05075216, - "balance_loss_mlp": 1.02592564, - "epoch": 0.27776942732601834, - "flos": 23549823072000.0, - "grad_norm": 1.6763124645732197, - "language_loss": 0.7707957, - "learning_rate": 3.3898427173992778e-06, - "loss": 0.79242951, - "num_input_tokens_seen": 99829575, - "step": 4620, - "time_per_iteration": 2.7764816284179688 - }, - { - "auxiliary_loss_clip": 0.01091863, - "auxiliary_loss_mlp": 0.01042513, - "balance_loss_clip": 1.04290819, - "balance_loss_mlp": 1.02517962, - "epoch": 0.2778295505786863, - "flos": 23908713811200.0, - "grad_norm": 1.985202794634515, - "language_loss": 0.78144193, - "learning_rate": 3.389562634707122e-06, - "loss": 0.80278563, - "num_input_tokens_seen": 99847575, - "step": 4621, - "time_per_iteration": 2.740419387817383 - }, - { - "auxiliary_loss_clip": 0.01113871, - "auxiliary_loss_mlp": 0.01054223, - "balance_loss_clip": 1.04857588, - "balance_loss_mlp": 1.03642535, - "epoch": 0.27788967383135427, - "flos": 25554571920000.0, - "grad_norm": 2.864120631038579, - "language_loss": 0.87357259, - "learning_rate": 3.389282499322611e-06, - "loss": 0.89525354, - "num_input_tokens_seen": 99864995, - "step": 4622, - "time_per_iteration": 2.8351151943206787 - }, - { - "auxiliary_loss_clip": 0.01096216, - "auxiliary_loss_mlp": 0.01052098, - "balance_loss_clip": 1.0477345, - "balance_loss_mlp": 1.0349195, - "epoch": 0.27794979708402223, - "flos": 16252631481600.0, - "grad_norm": 1.7857472181098575, - "language_loss": 0.81315404, - "learning_rate": 3.389002311256369e-06, - "loss": 0.83463717, - "num_input_tokens_seen": 99881540, - "step": 4623, - "time_per_iteration": 2.7112133502960205 - }, - { - "auxiliary_loss_clip": 0.01119674, - "auxiliary_loss_mlp": 0.01043259, - "balance_loss_clip": 1.05434608, - "balance_loss_mlp": 1.02628374, - "epoch": 0.2780099203366902, - "flos": 20667632791680.0, - "grad_norm": 2.1551340516102897, - "language_loss": 0.80889726, - "learning_rate": 3.3887220705190204e-06, - "loss": 0.83052659, - "num_input_tokens_seen": 99899595, - "step": 4624, - "time_per_iteration": 2.6492481231689453 - }, - { - "auxiliary_loss_clip": 0.01112812, - "auxiliary_loss_mlp": 0.0077763, - "balance_loss_clip": 1.05008531, - "balance_loss_mlp": 1.00092447, - "epoch": 0.27807004358935816, - "flos": 17739583822080.0, - "grad_norm": 2.21671742511245, - "language_loss": 0.76949263, - "learning_rate": 3.388441777121191e-06, - "loss": 0.78839707, - "num_input_tokens_seen": 99913020, - "step": 4625, - "time_per_iteration": 2.6312057971954346 - }, - { - "auxiliary_loss_clip": 0.01106879, - "auxiliary_loss_mlp": 0.01046687, - "balance_loss_clip": 1.04205859, - "balance_loss_mlp": 1.02767277, - "epoch": 0.2781301668420261, - "flos": 16727119165440.0, - "grad_norm": 1.790813282848893, - "language_loss": 0.69947815, - "learning_rate": 3.388161431073511e-06, - "loss": 0.72101378, - "num_input_tokens_seen": 99931405, - "step": 4626, - "time_per_iteration": 2.7656819820404053 - }, - { - "auxiliary_loss_clip": 0.0110548, - "auxiliary_loss_mlp": 0.01041917, - "balance_loss_clip": 1.04827905, - "balance_loss_mlp": 1.02385652, - "epoch": 0.27819029009469415, - "flos": 13844749317120.0, - "grad_norm": 2.1086116607571546, - "language_loss": 0.92367601, - "learning_rate": 3.38788103238661e-06, - "loss": 0.94515002, - "num_input_tokens_seen": 99948100, - "step": 4627, - "time_per_iteration": 2.8608667850494385 - }, - { - "auxiliary_loss_clip": 0.01149683, - "auxiliary_loss_mlp": 0.01040775, - "balance_loss_clip": 1.05388021, - "balance_loss_mlp": 1.0248611, - "epoch": 0.2782504133473621, - "flos": 27089286370560.0, - "grad_norm": 1.7290354122756755, - "language_loss": 0.85490036, - "learning_rate": 3.387600581071121e-06, - "loss": 0.87680495, - "num_input_tokens_seen": 99966470, - "step": 4628, - "time_per_iteration": 2.6468069553375244 - }, - { - "auxiliary_loss_clip": 0.01114712, - "auxiliary_loss_mlp": 0.0104202, - "balance_loss_clip": 1.0482378, - "balance_loss_mlp": 1.02509212, - "epoch": 0.2783105366000301, - "flos": 21068826773760.0, - "grad_norm": 1.5106040860694088, - "language_loss": 0.79246545, - "learning_rate": 3.387320077137679e-06, - "loss": 0.81403273, - "num_input_tokens_seen": 99985930, - "step": 4629, - "time_per_iteration": 5.656833648681641 - }, - { - "auxiliary_loss_clip": 0.01100825, - "auxiliary_loss_mlp": 0.01040328, - "balance_loss_clip": 1.04602218, - "balance_loss_mlp": 1.02339983, - "epoch": 0.27837065985269804, - "flos": 26501823434880.0, - "grad_norm": 1.5125577415085874, - "language_loss": 0.84574991, - "learning_rate": 3.3870395205969208e-06, - "loss": 0.86716145, - "num_input_tokens_seen": 100006235, - "step": 4630, - "time_per_iteration": 2.70917010307312 - }, - { - "auxiliary_loss_clip": 0.01123828, - "auxiliary_loss_mlp": 0.01038547, - "balance_loss_clip": 1.04848623, - "balance_loss_mlp": 1.02099967, - "epoch": 0.278430783105366, - "flos": 20223201813120.0, - "grad_norm": 2.1016222667741857, - "language_loss": 0.81134796, - "learning_rate": 3.386758911459485e-06, - "loss": 0.83297169, - "num_input_tokens_seen": 100023655, - "step": 4631, - "time_per_iteration": 4.19342041015625 - }, - { - "auxiliary_loss_clip": 0.01149092, - "auxiliary_loss_mlp": 0.01049428, - "balance_loss_clip": 1.05402875, - "balance_loss_mlp": 1.03257155, - "epoch": 0.278490906358034, - "flos": 25592888753280.0, - "grad_norm": 3.9436500565538295, - "language_loss": 0.71196103, - "learning_rate": 3.3864782497360126e-06, - "loss": 0.7339462, - "num_input_tokens_seen": 100043280, - "step": 4632, - "time_per_iteration": 2.620439291000366 - }, - { - "auxiliary_loss_clip": 0.01132813, - "auxiliary_loss_mlp": 0.01044268, - "balance_loss_clip": 1.05435467, - "balance_loss_mlp": 1.02798355, - "epoch": 0.27855102961070194, - "flos": 16171544528640.0, - "grad_norm": 1.8243983980851597, - "language_loss": 0.82563186, - "learning_rate": 3.386197535437145e-06, - "loss": 0.84740269, - "num_input_tokens_seen": 100057690, - "step": 4633, - "time_per_iteration": 2.6531693935394287 - }, - { - "auxiliary_loss_clip": 0.01122775, - "auxiliary_loss_mlp": 0.01039803, - "balance_loss_clip": 1.04714537, - "balance_loss_mlp": 1.02130151, - "epoch": 0.2786111528633699, - "flos": 22927598749440.0, - "grad_norm": 1.6667943176882647, - "language_loss": 0.87727869, - "learning_rate": 3.385916768573529e-06, - "loss": 0.89890444, - "num_input_tokens_seen": 100075875, - "step": 4634, - "time_per_iteration": 4.391691446304321 - }, - { - "auxiliary_loss_clip": 0.01118626, - "auxiliary_loss_mlp": 0.01042889, - "balance_loss_clip": 1.04900146, - "balance_loss_mlp": 1.02503181, - "epoch": 0.27867127611603787, - "flos": 23404205335680.0, - "grad_norm": 1.8664238108113964, - "language_loss": 0.7701081, - "learning_rate": 3.38563594915581e-06, - "loss": 0.79172325, - "num_input_tokens_seen": 100092930, - "step": 4635, - "time_per_iteration": 2.7107748985290527 - }, - { - "auxiliary_loss_clip": 0.01148262, - "auxiliary_loss_mlp": 0.01044984, - "balance_loss_clip": 1.05233121, - "balance_loss_mlp": 1.02705491, - "epoch": 0.27873139936870583, - "flos": 19829010983040.0, - "grad_norm": 1.6280540509164947, - "language_loss": 0.65174443, - "learning_rate": 3.385355077194637e-06, - "loss": 0.67367697, - "num_input_tokens_seen": 100110790, - "step": 4636, - "time_per_iteration": 2.660099744796753 - }, - { - "auxiliary_loss_clip": 0.01134021, - "auxiliary_loss_mlp": 0.01042528, - "balance_loss_clip": 1.048437, - "balance_loss_mlp": 1.0243845, - "epoch": 0.2787915226213738, - "flos": 17707659609600.0, - "grad_norm": 2.8501862977667667, - "language_loss": 0.83485681, - "learning_rate": 3.3850741527006604e-06, - "loss": 0.85662234, - "num_input_tokens_seen": 100126970, - "step": 4637, - "time_per_iteration": 2.6234302520751953 - }, - { - "auxiliary_loss_clip": 0.01117465, - "auxiliary_loss_mlp": 0.01043194, - "balance_loss_clip": 1.04580319, - "balance_loss_mlp": 1.02658796, - "epoch": 0.27885164587404176, - "flos": 22090557139200.0, - "grad_norm": 1.4481958644660236, - "language_loss": 0.75996393, - "learning_rate": 3.384793175684533e-06, - "loss": 0.78157055, - "num_input_tokens_seen": 100146720, - "step": 4638, - "time_per_iteration": 2.6488263607025146 - }, - { - "auxiliary_loss_clip": 0.0113367, - "auxiliary_loss_mlp": 0.01047522, - "balance_loss_clip": 1.04905438, - "balance_loss_mlp": 1.02935445, - "epoch": 0.27891176912670973, - "flos": 19207684500480.0, - "grad_norm": 1.973043880665722, - "language_loss": 0.71658665, - "learning_rate": 3.38451214615691e-06, - "loss": 0.73839855, - "num_input_tokens_seen": 100165920, - "step": 4639, - "time_per_iteration": 2.606290817260742 - }, - { - "auxiliary_loss_clip": 0.01134631, - "auxiliary_loss_mlp": 0.01040486, - "balance_loss_clip": 1.04905224, - "balance_loss_mlp": 1.02213931, - "epoch": 0.27897189237937775, - "flos": 27600007898880.0, - "grad_norm": 1.9413688357819885, - "language_loss": 0.6546669, - "learning_rate": 3.384231064128447e-06, - "loss": 0.67641807, - "num_input_tokens_seen": 100185525, - "step": 4640, - "time_per_iteration": 2.670572280883789 - }, - { - "auxiliary_loss_clip": 0.01134835, - "auxiliary_loss_mlp": 0.01040753, - "balance_loss_clip": 1.05033112, - "balance_loss_mlp": 1.02394438, - "epoch": 0.2790320156320457, - "flos": 21178210665600.0, - "grad_norm": 2.0528630099938385, - "language_loss": 0.72150993, - "learning_rate": 3.383949929609804e-06, - "loss": 0.74326581, - "num_input_tokens_seen": 100204850, - "step": 4641, - "time_per_iteration": 2.693377733230591 - }, - { - "auxiliary_loss_clip": 0.01112862, - "auxiliary_loss_mlp": 0.01043132, - "balance_loss_clip": 1.05076349, - "balance_loss_mlp": 1.02322423, - "epoch": 0.2790921388847137, - "flos": 22783920347520.0, - "grad_norm": 1.7365449070814052, - "language_loss": 0.74695385, - "learning_rate": 3.383668742611641e-06, - "loss": 0.7685138, - "num_input_tokens_seen": 100224520, - "step": 4642, - "time_per_iteration": 2.7462241649627686 - }, - { - "auxiliary_loss_clip": 0.0111075, - "auxiliary_loss_mlp": 0.01045242, - "balance_loss_clip": 1.04543257, - "balance_loss_mlp": 1.02603781, - "epoch": 0.27915226213738165, - "flos": 23400649889280.0, - "grad_norm": 1.8272594017764643, - "language_loss": 0.85924351, - "learning_rate": 3.3833875031446205e-06, - "loss": 0.88080341, - "num_input_tokens_seen": 100243935, - "step": 4643, - "time_per_iteration": 2.725135564804077 - }, - { - "auxiliary_loss_clip": 0.01105223, - "auxiliary_loss_mlp": 0.01045051, - "balance_loss_clip": 1.04933143, - "balance_loss_mlp": 1.02697933, - "epoch": 0.2792123853900496, - "flos": 22747794243840.0, - "grad_norm": 1.7474380366240072, - "language_loss": 0.83161986, - "learning_rate": 3.383106211219407e-06, - "loss": 0.85312265, - "num_input_tokens_seen": 100262290, - "step": 4644, - "time_per_iteration": 2.7356133460998535 - }, - { - "auxiliary_loss_clip": 0.01135825, - "auxiliary_loss_mlp": 0.01044339, - "balance_loss_clip": 1.04996896, - "balance_loss_mlp": 1.02672005, - "epoch": 0.2792725086427176, - "flos": 15049372757760.0, - "grad_norm": 1.8326156585035789, - "language_loss": 0.79077673, - "learning_rate": 3.3828248668466673e-06, - "loss": 0.81257844, - "num_input_tokens_seen": 100280015, - "step": 4645, - "time_per_iteration": 2.6605966091156006 - }, - { - "auxiliary_loss_clip": 0.01043101, - "auxiliary_loss_mlp": 0.01005168, - "balance_loss_clip": 1.02972245, - "balance_loss_mlp": 1.00273657, - "epoch": 0.27933263189538554, - "flos": 62544861757440.0, - "grad_norm": 0.7804050577208047, - "language_loss": 0.62298429, - "learning_rate": 3.3825434700370705e-06, - "loss": 0.64346695, - "num_input_tokens_seen": 100338935, - "step": 4646, - "time_per_iteration": 3.203944206237793 - }, - { - "auxiliary_loss_clip": 0.01116876, - "auxiliary_loss_mlp": 0.01036795, - "balance_loss_clip": 1.05170095, - "balance_loss_mlp": 1.02054703, - "epoch": 0.2793927551480535, - "flos": 25118365155840.0, - "grad_norm": 1.6679902986930268, - "language_loss": 0.89280778, - "learning_rate": 3.3822620208012865e-06, - "loss": 0.91434449, - "num_input_tokens_seen": 100359905, - "step": 4647, - "time_per_iteration": 2.829617500305176 - }, - { - "auxiliary_loss_clip": 0.0113911, - "auxiliary_loss_mlp": 0.01047084, - "balance_loss_clip": 1.05125523, - "balance_loss_mlp": 1.02880919, - "epoch": 0.27945287840072147, - "flos": 21324582587520.0, - "grad_norm": 1.8012650128540075, - "language_loss": 0.86784112, - "learning_rate": 3.381980519149988e-06, - "loss": 0.88970304, - "num_input_tokens_seen": 100376955, - "step": 4648, - "time_per_iteration": 2.632321357727051 - }, - { - "auxiliary_loss_clip": 0.01134603, - "auxiliary_loss_mlp": 0.01044893, - "balance_loss_clip": 1.05110133, - "balance_loss_mlp": 1.02733302, - "epoch": 0.27951300165338944, - "flos": 27450547407360.0, - "grad_norm": 2.0026822782024705, - "language_loss": 0.73003638, - "learning_rate": 3.38169896509385e-06, - "loss": 0.75183129, - "num_input_tokens_seen": 100397545, - "step": 4649, - "time_per_iteration": 2.7211172580718994 - }, - { - "auxiliary_loss_clip": 0.01111127, - "auxiliary_loss_mlp": 0.01044981, - "balance_loss_clip": 1.04752195, - "balance_loss_mlp": 1.02557421, - "epoch": 0.2795731249060574, - "flos": 15159008044800.0, - "grad_norm": 2.1164331968139325, - "language_loss": 0.80629992, - "learning_rate": 3.381417358643549e-06, - "loss": 0.82786095, - "num_input_tokens_seen": 100415080, - "step": 4650, - "time_per_iteration": 2.7502310276031494 - }, - { - "auxiliary_loss_clip": 0.01039445, - "auxiliary_loss_mlp": 0.00754956, - "balance_loss_clip": 1.03124094, - "balance_loss_mlp": 1.00203133, - "epoch": 0.27963324815872537, - "flos": 60120103178880.0, - "grad_norm": 0.8151234776797575, - "language_loss": 0.58806145, - "learning_rate": 3.3811356998097624e-06, - "loss": 0.60600549, - "num_input_tokens_seen": 100471105, - "step": 4651, - "time_per_iteration": 3.2224526405334473 - }, - { - "auxiliary_loss_clip": 0.01135312, - "auxiliary_loss_mlp": 0.01047398, - "balance_loss_clip": 1.04708123, - "balance_loss_mlp": 1.02753818, - "epoch": 0.27969337141139333, - "flos": 21765960910080.0, - "grad_norm": 1.7351399642666463, - "language_loss": 0.74332011, - "learning_rate": 3.3808539886031726e-06, - "loss": 0.76514727, - "num_input_tokens_seen": 100492520, - "step": 4652, - "time_per_iteration": 2.685736894607544 - }, - { - "auxiliary_loss_clip": 0.01148943, - "auxiliary_loss_mlp": 0.01045678, - "balance_loss_clip": 1.05235481, - "balance_loss_mlp": 1.02742696, - "epoch": 0.27975349466406135, - "flos": 39851398834560.0, - "grad_norm": 2.2003219434248633, - "language_loss": 0.79789567, - "learning_rate": 3.380572225034461e-06, - "loss": 0.81984192, - "num_input_tokens_seen": 100512870, - "step": 4653, - "time_per_iteration": 2.7558584213256836 - }, - { - "auxiliary_loss_clip": 0.01121239, - "auxiliary_loss_mlp": 0.01050268, - "balance_loss_clip": 1.04883742, - "balance_loss_mlp": 1.03280401, - "epoch": 0.2798136179167293, - "flos": 21579799697280.0, - "grad_norm": 2.080129868341082, - "language_loss": 0.78903222, - "learning_rate": 3.380290409114312e-06, - "loss": 0.81074733, - "num_input_tokens_seen": 100531655, - "step": 4654, - "time_per_iteration": 2.6496095657348633 - }, - { - "auxiliary_loss_clip": 0.01101836, - "auxiliary_loss_mlp": 0.01052085, - "balance_loss_clip": 1.04982615, - "balance_loss_mlp": 1.03267753, - "epoch": 0.2798737411693973, - "flos": 21537676022400.0, - "grad_norm": 2.0985102630300134, - "language_loss": 0.81319463, - "learning_rate": 3.3800085408534127e-06, - "loss": 0.83473378, - "num_input_tokens_seen": 100548005, - "step": 4655, - "time_per_iteration": 2.742586135864258 - }, - { - "auxiliary_loss_clip": 0.01112605, - "auxiliary_loss_mlp": 0.00776867, - "balance_loss_clip": 1.04759109, - "balance_loss_mlp": 1.00071263, - "epoch": 0.27993386442206525, - "flos": 26981051713920.0, - "grad_norm": 1.7515804597190672, - "language_loss": 0.81455064, - "learning_rate": 3.3797266202624506e-06, - "loss": 0.83344543, - "num_input_tokens_seen": 100567980, - "step": 4656, - "time_per_iteration": 2.796480894088745 - }, - { - "auxiliary_loss_clip": 0.01120191, - "auxiliary_loss_mlp": 0.01050328, - "balance_loss_clip": 1.05115008, - "balance_loss_mlp": 1.03204143, - "epoch": 0.2799939876747332, - "flos": 24349876652160.0, - "grad_norm": 2.044588364139205, - "language_loss": 0.83203471, - "learning_rate": 3.3794446473521176e-06, - "loss": 0.85373986, - "num_input_tokens_seen": 100588630, - "step": 4657, - "time_per_iteration": 2.6785871982574463 - }, - { - "auxiliary_loss_clip": 0.01111476, - "auxiliary_loss_mlp": 0.01052182, - "balance_loss_clip": 1.04937756, - "balance_loss_mlp": 1.03294206, - "epoch": 0.2800541109274012, - "flos": 33656988648960.0, - "grad_norm": 2.165484252442401, - "language_loss": 0.63694274, - "learning_rate": 3.379162622133105e-06, - "loss": 0.65857935, - "num_input_tokens_seen": 100608775, - "step": 4658, - "time_per_iteration": 2.879409074783325 - }, - { - "auxiliary_loss_clip": 0.01136248, - "auxiliary_loss_mlp": 0.010462, - "balance_loss_clip": 1.0495683, - "balance_loss_mlp": 1.02822304, - "epoch": 0.28011423418006914, - "flos": 21614417429760.0, - "grad_norm": 1.7192056687926605, - "language_loss": 0.78342974, - "learning_rate": 3.3788805446161073e-06, - "loss": 0.80525422, - "num_input_tokens_seen": 100627975, - "step": 4659, - "time_per_iteration": 2.6989047527313232 - }, - { - "auxiliary_loss_clip": 0.0111004, - "auxiliary_loss_mlp": 0.01054733, - "balance_loss_clip": 1.04974771, - "balance_loss_mlp": 1.03588593, - "epoch": 0.2801743574327371, - "flos": 23112431159040.0, - "grad_norm": 1.755148683242289, - "language_loss": 0.79341501, - "learning_rate": 3.3785984148118215e-06, - "loss": 0.8150627, - "num_input_tokens_seen": 100645430, - "step": 4660, - "time_per_iteration": 2.715477705001831 - }, - { - "auxiliary_loss_clip": 0.01108147, - "auxiliary_loss_mlp": 0.01046506, - "balance_loss_clip": 1.05007386, - "balance_loss_mlp": 1.02897, - "epoch": 0.2802344806854051, - "flos": 12641418766080.0, - "grad_norm": 2.2526204230687115, - "language_loss": 0.80604905, - "learning_rate": 3.3783162327309453e-06, - "loss": 0.82759559, - "num_input_tokens_seen": 100663775, - "step": 4661, - "time_per_iteration": 2.7715258598327637 - }, - { - "auxiliary_loss_clip": 0.01125452, - "auxiliary_loss_mlp": 0.01056292, - "balance_loss_clip": 1.05232596, - "balance_loss_mlp": 1.03836262, - "epoch": 0.28029460393807304, - "flos": 37267878142080.0, - "grad_norm": 1.5529278028038542, - "language_loss": 0.79010582, - "learning_rate": 3.3780339983841794e-06, - "loss": 0.81192333, - "num_input_tokens_seen": 100686085, - "step": 4662, - "time_per_iteration": 2.81427264213562 - }, - { - "auxiliary_loss_clip": 0.01133119, - "auxiliary_loss_mlp": 0.01052014, - "balance_loss_clip": 1.05226839, - "balance_loss_mlp": 1.03252363, - "epoch": 0.280354727190741, - "flos": 20741106061440.0, - "grad_norm": 1.6202884167711182, - "language_loss": 0.69617724, - "learning_rate": 3.377751711782227e-06, - "loss": 0.71802866, - "num_input_tokens_seen": 100705135, - "step": 4663, - "time_per_iteration": 2.697368860244751 - }, - { - "auxiliary_loss_clip": 0.01124677, - "auxiliary_loss_mlp": 0.01049339, - "balance_loss_clip": 1.05170035, - "balance_loss_mlp": 1.03104067, - "epoch": 0.28041485044340897, - "flos": 21471026336640.0, - "grad_norm": 1.9196144000248758, - "language_loss": 0.77708608, - "learning_rate": 3.377469372935791e-06, - "loss": 0.79882622, - "num_input_tokens_seen": 100724960, - "step": 4664, - "time_per_iteration": 2.7275149822235107 - }, - { - "auxiliary_loss_clip": 0.01107718, - "auxiliary_loss_mlp": 0.01048769, - "balance_loss_clip": 1.0480299, - "balance_loss_mlp": 1.03099537, - "epoch": 0.28047497369607693, - "flos": 14794263388800.0, - "grad_norm": 1.999889511399453, - "language_loss": 0.79593849, - "learning_rate": 3.377186981855578e-06, - "loss": 0.81750339, - "num_input_tokens_seen": 100741995, - "step": 4665, - "time_per_iteration": 2.710507392883301 - }, - { - "auxiliary_loss_clip": 0.01132609, - "auxiliary_loss_mlp": 0.01044622, - "balance_loss_clip": 1.04908824, - "balance_loss_mlp": 1.02724159, - "epoch": 0.2805350969487449, - "flos": 23070738447360.0, - "grad_norm": 1.8624041004678782, - "language_loss": 0.81080002, - "learning_rate": 3.3769045385522968e-06, - "loss": 0.83257234, - "num_input_tokens_seen": 100758985, - "step": 4666, - "time_per_iteration": 2.6129403114318848 - }, - { - "auxiliary_loss_clip": 0.01108409, - "auxiliary_loss_mlp": 0.01071225, - "balance_loss_clip": 1.04823136, - "balance_loss_mlp": 1.05097127, - "epoch": 0.2805952202014129, - "flos": 20479855466880.0, - "grad_norm": 2.103406835637469, - "language_loss": 0.84507895, - "learning_rate": 3.376622043036658e-06, - "loss": 0.86687529, - "num_input_tokens_seen": 100777820, - "step": 4667, - "time_per_iteration": 2.7332448959350586 - }, - { - "auxiliary_loss_clip": 0.01123034, - "auxiliary_loss_mlp": 0.00775483, - "balance_loss_clip": 1.05581784, - "balance_loss_mlp": 1.00072694, - "epoch": 0.2806553434540809, - "flos": 27417330305280.0, - "grad_norm": 3.1307253624061486, - "language_loss": 0.79295927, - "learning_rate": 3.376339495319373e-06, - "loss": 0.81194448, - "num_input_tokens_seen": 100798205, - "step": 4668, - "time_per_iteration": 5.80406928062439 - }, - { - "auxiliary_loss_clip": 0.01086886, - "auxiliary_loss_mlp": 0.01042603, - "balance_loss_clip": 1.04659402, - "balance_loss_mlp": 1.02432859, - "epoch": 0.28071546670674885, - "flos": 26505019745280.0, - "grad_norm": 1.6340052887006857, - "language_loss": 0.76323926, - "learning_rate": 3.3760568954111563e-06, - "loss": 0.7845341, - "num_input_tokens_seen": 100819800, - "step": 4669, - "time_per_iteration": 2.909986734390259 - }, - { - "auxiliary_loss_clip": 0.01135126, - "auxiliary_loss_mlp": 0.01048727, - "balance_loss_clip": 1.05091906, - "balance_loss_mlp": 1.03104806, - "epoch": 0.2807755899594168, - "flos": 20558679863040.0, - "grad_norm": 2.509610012971093, - "language_loss": 0.79246378, - "learning_rate": 3.375774243322725e-06, - "loss": 0.81430233, - "num_input_tokens_seen": 100837880, - "step": 4670, - "time_per_iteration": 4.177394866943359 - }, - { - "auxiliary_loss_clip": 0.01106377, - "auxiliary_loss_mlp": 0.01050214, - "balance_loss_clip": 1.04797912, - "balance_loss_mlp": 1.03053236, - "epoch": 0.2808357132120848, - "flos": 24313319585280.0, - "grad_norm": 2.7368773080153455, - "language_loss": 0.79247916, - "learning_rate": 3.3754915390647955e-06, - "loss": 0.81404507, - "num_input_tokens_seen": 100856350, - "step": 4671, - "time_per_iteration": 2.711390256881714 - }, - { - "auxiliary_loss_clip": 0.01127751, - "auxiliary_loss_mlp": 0.01045588, - "balance_loss_clip": 1.05121446, - "balance_loss_mlp": 1.02806473, - "epoch": 0.28089583646475275, - "flos": 26432408401920.0, - "grad_norm": 1.6750085767967255, - "language_loss": 0.74537772, - "learning_rate": 3.37520878264809e-06, - "loss": 0.76711112, - "num_input_tokens_seen": 100876135, - "step": 4672, - "time_per_iteration": 2.661121129989624 - }, - { - "auxiliary_loss_clip": 0.01124033, - "auxiliary_loss_mlp": 0.01050888, - "balance_loss_clip": 1.04696918, - "balance_loss_mlp": 1.03130245, - "epoch": 0.2809559597174207, - "flos": 23111820627840.0, - "grad_norm": 2.8450273884489805, - "language_loss": 0.75648308, - "learning_rate": 3.3749259740833286e-06, - "loss": 0.77823234, - "num_input_tokens_seen": 100894790, - "step": 4673, - "time_per_iteration": 2.672701120376587 - }, - { - "auxiliary_loss_clip": 0.0113134, - "auxiliary_loss_mlp": 0.01042591, - "balance_loss_clip": 1.04937172, - "balance_loss_mlp": 1.02492452, - "epoch": 0.2810160829700887, - "flos": 20923496346240.0, - "grad_norm": 1.8533271967959946, - "language_loss": 0.72668427, - "learning_rate": 3.374643113381237e-06, - "loss": 0.74842358, - "num_input_tokens_seen": 100915100, - "step": 4674, - "time_per_iteration": 4.2516560554504395 - }, - { - "auxiliary_loss_clip": 0.01138771, - "auxiliary_loss_mlp": 0.01046386, - "balance_loss_clip": 1.05174136, - "balance_loss_mlp": 1.02751493, - "epoch": 0.28107620622275664, - "flos": 14355901808640.0, - "grad_norm": 2.0688845921593377, - "language_loss": 0.77195638, - "learning_rate": 3.374360200552541e-06, - "loss": 0.79380798, - "num_input_tokens_seen": 100932795, - "step": 4675, - "time_per_iteration": 2.618218183517456 - }, - { - "auxiliary_loss_clip": 0.01149881, - "auxiliary_loss_mlp": 0.01047998, - "balance_loss_clip": 1.05321908, - "balance_loss_mlp": 1.02948523, - "epoch": 0.2811363294754246, - "flos": 20919078973440.0, - "grad_norm": 1.9283078401930889, - "language_loss": 0.70211101, - "learning_rate": 3.374077235607968e-06, - "loss": 0.7240898, - "num_input_tokens_seen": 100950505, - "step": 4676, - "time_per_iteration": 2.59861159324646 - }, - { - "auxiliary_loss_clip": 0.01144319, - "auxiliary_loss_mlp": 0.01042342, - "balance_loss_clip": 1.05481541, - "balance_loss_mlp": 1.02517629, - "epoch": 0.28119645272809257, - "flos": 20594841880320.0, - "grad_norm": 1.6132814643409343, - "language_loss": 0.7048012, - "learning_rate": 3.3737942185582487e-06, - "loss": 0.72666782, - "num_input_tokens_seen": 100968790, - "step": 4677, - "time_per_iteration": 2.6064453125 - }, - { - "auxiliary_loss_clip": 0.01125461, - "auxiliary_loss_mlp": 0.01047839, - "balance_loss_clip": 1.04849231, - "balance_loss_mlp": 1.02783537, - "epoch": 0.28125657598076054, - "flos": 25337420248320.0, - "grad_norm": 1.5663130673511025, - "language_loss": 0.639018, - "learning_rate": 3.3735111494141153e-06, - "loss": 0.66075099, - "num_input_tokens_seen": 100990205, - "step": 4678, - "time_per_iteration": 2.6609809398651123 - }, - { - "auxiliary_loss_clip": 0.01134563, - "auxiliary_loss_mlp": 0.01050264, - "balance_loss_clip": 1.05104351, - "balance_loss_mlp": 1.03315794, - "epoch": 0.2813166992334285, - "flos": 24827093769600.0, - "grad_norm": 5.827919401990006, - "language_loss": 0.70568973, - "learning_rate": 3.3732280281863013e-06, - "loss": 0.72753799, - "num_input_tokens_seen": 101009815, - "step": 4679, - "time_per_iteration": 2.7039310932159424 - }, - { - "auxiliary_loss_clip": 0.01134537, - "auxiliary_loss_mlp": 0.01040896, - "balance_loss_clip": 1.05048108, - "balance_loss_mlp": 1.02283621, - "epoch": 0.2813768224860965, - "flos": 21760753438080.0, - "grad_norm": 2.2073803144691255, - "language_loss": 0.74848735, - "learning_rate": 3.3729448548855422e-06, - "loss": 0.77024174, - "num_input_tokens_seen": 101026780, - "step": 4680, - "time_per_iteration": 2.6897919178009033 - }, - { - "auxiliary_loss_clip": 0.01149427, - "auxiliary_loss_mlp": 0.01039945, - "balance_loss_clip": 1.05414999, - "balance_loss_mlp": 1.02363694, - "epoch": 0.2814369457387645, - "flos": 24316803204480.0, - "grad_norm": 2.2743778704427267, - "language_loss": 0.7719292, - "learning_rate": 3.3726616295225774e-06, - "loss": 0.793823, - "num_input_tokens_seen": 101046215, - "step": 4681, - "time_per_iteration": 2.6178102493286133 - }, - { - "auxiliary_loss_clip": 0.01138594, - "auxiliary_loss_mlp": 0.01037179, - "balance_loss_clip": 1.05333447, - "balance_loss_mlp": 1.01864183, - "epoch": 0.28149706899143245, - "flos": 18515326872960.0, - "grad_norm": 2.5230258038951723, - "language_loss": 0.74197519, - "learning_rate": 3.372378352108146e-06, - "loss": 0.76373291, - "num_input_tokens_seen": 101063365, - "step": 4682, - "time_per_iteration": 2.5892751216888428 - }, - { - "auxiliary_loss_clip": 0.01145225, - "auxiliary_loss_mlp": 0.01043744, - "balance_loss_clip": 1.05250573, - "balance_loss_mlp": 1.02619636, - "epoch": 0.2815571922441004, - "flos": 24863255786880.0, - "grad_norm": 1.5493572746384299, - "language_loss": 0.81096184, - "learning_rate": 3.3720950226529894e-06, - "loss": 0.83285153, - "num_input_tokens_seen": 101083835, - "step": 4683, - "time_per_iteration": 2.6272947788238525 - }, - { - "auxiliary_loss_clip": 0.01089095, - "auxiliary_loss_mlp": 0.01048071, - "balance_loss_clip": 1.04691851, - "balance_loss_mlp": 1.02916479, - "epoch": 0.2816173154967684, - "flos": 19901622326400.0, - "grad_norm": 1.5570192452178944, - "language_loss": 0.76437271, - "learning_rate": 3.371811641167852e-06, - "loss": 0.78574431, - "num_input_tokens_seen": 101101740, - "step": 4684, - "time_per_iteration": 2.7542243003845215 - }, - { - "auxiliary_loss_clip": 0.01090035, - "auxiliary_loss_mlp": 0.01043858, - "balance_loss_clip": 1.04495156, - "balance_loss_mlp": 1.02659678, - "epoch": 0.28167743874943635, - "flos": 17491333950720.0, - "grad_norm": 3.250404845672824, - "language_loss": 0.76287019, - "learning_rate": 3.3715282076634807e-06, - "loss": 0.78420913, - "num_input_tokens_seen": 101120480, - "step": 4685, - "time_per_iteration": 2.724954843521118 - }, - { - "auxiliary_loss_clip": 0.01116834, - "auxiliary_loss_mlp": 0.01045285, - "balance_loss_clip": 1.05042076, - "balance_loss_mlp": 1.02820265, - "epoch": 0.2817375620021043, - "flos": 25302120157440.0, - "grad_norm": 1.80192319881426, - "language_loss": 0.75822544, - "learning_rate": 3.3712447221506218e-06, - "loss": 0.77984667, - "num_input_tokens_seen": 101142910, - "step": 4686, - "time_per_iteration": 2.7375218868255615 - }, - { - "auxiliary_loss_clip": 0.01113965, - "auxiliary_loss_mlp": 0.01054481, - "balance_loss_clip": 1.04542971, - "balance_loss_mlp": 1.03530002, - "epoch": 0.2817976852547723, - "flos": 18693227957760.0, - "grad_norm": 5.9534421572259095, - "language_loss": 0.62298906, - "learning_rate": 3.370961184640025e-06, - "loss": 0.64467359, - "num_input_tokens_seen": 101160030, - "step": 4687, - "time_per_iteration": 2.7273154258728027 - }, - { - "auxiliary_loss_clip": 0.01125077, - "auxiliary_loss_mlp": 0.01052662, - "balance_loss_clip": 1.05122471, - "balance_loss_mlp": 1.03501928, - "epoch": 0.28185780850744024, - "flos": 22742263549440.0, - "grad_norm": 3.512847657951686, - "language_loss": 0.76642895, - "learning_rate": 3.3706775951424433e-06, - "loss": 0.78820634, - "num_input_tokens_seen": 101177675, - "step": 4688, - "time_per_iteration": 2.6962485313415527 - }, - { - "auxiliary_loss_clip": 0.01111064, - "auxiliary_loss_mlp": 0.01038903, - "balance_loss_clip": 1.050143, - "balance_loss_mlp": 1.0222497, - "epoch": 0.2819179317601082, - "flos": 14933919467520.0, - "grad_norm": 2.029299855452059, - "language_loss": 0.78377295, - "learning_rate": 3.37039395366863e-06, - "loss": 0.80527258, - "num_input_tokens_seen": 101192225, - "step": 4689, - "time_per_iteration": 2.7611160278320312 - }, - { - "auxiliary_loss_clip": 0.01101002, - "auxiliary_loss_mlp": 0.01042004, - "balance_loss_clip": 1.044873, - "balance_loss_mlp": 1.02469492, - "epoch": 0.2819780550127762, - "flos": 23145325038720.0, - "grad_norm": 1.6619977361488503, - "language_loss": 0.78151089, - "learning_rate": 3.37011026022934e-06, - "loss": 0.80294096, - "num_input_tokens_seen": 101210870, - "step": 4690, - "time_per_iteration": 2.8166253566741943 - }, - { - "auxiliary_loss_clip": 0.01144307, - "auxiliary_loss_mlp": 0.0077562, - "balance_loss_clip": 1.04972041, - "balance_loss_mlp": 1.00065684, - "epoch": 0.28203817826544414, - "flos": 21616356764160.0, - "grad_norm": 1.8251699545436237, - "language_loss": 0.87835205, - "learning_rate": 3.369826514835332e-06, - "loss": 0.8975513, - "num_input_tokens_seen": 101229965, - "step": 4691, - "time_per_iteration": 2.755540609359741 - }, - { - "auxiliary_loss_clip": 0.01120177, - "auxiliary_loss_mlp": 0.01057161, - "balance_loss_clip": 1.0480932, - "balance_loss_mlp": 1.03866005, - "epoch": 0.2820983015181121, - "flos": 24026788794240.0, - "grad_norm": 2.0164591316320086, - "language_loss": 0.81783265, - "learning_rate": 3.3695427174973654e-06, - "loss": 0.83960605, - "num_input_tokens_seen": 101250980, - "step": 4692, - "time_per_iteration": 2.766826868057251 - }, - { - "auxiliary_loss_clip": 0.01108273, - "auxiliary_loss_mlp": 0.01044592, - "balance_loss_clip": 1.05000174, - "balance_loss_mlp": 1.02690101, - "epoch": 0.2821584247707801, - "flos": 30007925976960.0, - "grad_norm": 1.5153062693168577, - "language_loss": 0.74520338, - "learning_rate": 3.3692588682262022e-06, - "loss": 0.76673198, - "num_input_tokens_seen": 101273335, - "step": 4693, - "time_per_iteration": 2.833829402923584 - }, - { - "auxiliary_loss_clip": 0.01107692, - "auxiliary_loss_mlp": 0.01038565, - "balance_loss_clip": 1.04546356, - "balance_loss_mlp": 1.02018356, - "epoch": 0.2822185480234481, - "flos": 21396762967680.0, - "grad_norm": 1.6139880108231377, - "language_loss": 0.77396065, - "learning_rate": 3.3689749670326046e-06, - "loss": 0.79542327, - "num_input_tokens_seen": 101292110, - "step": 4694, - "time_per_iteration": 2.6783409118652344 - }, - { - "auxiliary_loss_clip": 0.01131719, - "auxiliary_loss_mlp": 0.01043428, - "balance_loss_clip": 1.05066633, - "balance_loss_mlp": 1.02610695, - "epoch": 0.28227867127611606, - "flos": 27452809964160.0, - "grad_norm": 2.1245298140537354, - "language_loss": 0.67171001, - "learning_rate": 3.3686910139273392e-06, - "loss": 0.69346148, - "num_input_tokens_seen": 101312815, - "step": 4695, - "time_per_iteration": 2.657508373260498 - }, - { - "auxiliary_loss_clip": 0.01129418, - "auxiliary_loss_mlp": 0.01047718, - "balance_loss_clip": 1.05160189, - "balance_loss_mlp": 1.02857292, - "epoch": 0.282338794528784, - "flos": 22593736811520.0, - "grad_norm": 2.1132011275006297, - "language_loss": 0.75410438, - "learning_rate": 3.3684070089211736e-06, - "loss": 0.77587581, - "num_input_tokens_seen": 101329045, - "step": 4696, - "time_per_iteration": 2.6419622898101807 - }, - { - "auxiliary_loss_clip": 0.01108873, - "auxiliary_loss_mlp": 0.01050131, - "balance_loss_clip": 1.04857826, - "balance_loss_mlp": 1.03241634, - "epoch": 0.282398917781452, - "flos": 42010923386880.0, - "grad_norm": 1.6547739374499746, - "language_loss": 0.62379837, - "learning_rate": 3.368122952024877e-06, - "loss": 0.64538848, - "num_input_tokens_seen": 101352715, - "step": 4697, - "time_per_iteration": 2.863271951675415 - }, - { - "auxiliary_loss_clip": 0.01098306, - "auxiliary_loss_mlp": 0.01038026, - "balance_loss_clip": 1.04702902, - "balance_loss_mlp": 1.0213964, - "epoch": 0.28245904103411995, - "flos": 23224724052480.0, - "grad_norm": 1.3648463295211168, - "language_loss": 0.73178887, - "learning_rate": 3.3678388432492214e-06, - "loss": 0.75315219, - "num_input_tokens_seen": 101374640, - "step": 4698, - "time_per_iteration": 2.7437515258789062 - }, - { - "auxiliary_loss_clip": 0.01138661, - "auxiliary_loss_mlp": 0.01044687, - "balance_loss_clip": 1.04783368, - "balance_loss_mlp": 1.02820039, - "epoch": 0.2825191642867879, - "flos": 25374623760000.0, - "grad_norm": 1.73143255072412, - "language_loss": 0.75260699, - "learning_rate": 3.3675546826049788e-06, - "loss": 0.77444041, - "num_input_tokens_seen": 101393595, - "step": 4699, - "time_per_iteration": 2.6352651119232178 - }, - { - "auxiliary_loss_clip": 0.01130406, - "auxiliary_loss_mlp": 0.01042781, - "balance_loss_clip": 1.04642487, - "balance_loss_mlp": 1.02379072, - "epoch": 0.2825792875394559, - "flos": 17236799199360.0, - "grad_norm": 2.939003683920128, - "language_loss": 0.80683541, - "learning_rate": 3.3672704701029265e-06, - "loss": 0.82856727, - "num_input_tokens_seen": 101409265, - "step": 4700, - "time_per_iteration": 2.597543478012085 - }, - { - "auxiliary_loss_clip": 0.01118395, - "auxiliary_loss_mlp": 0.01052226, - "balance_loss_clip": 1.05168593, - "balance_loss_mlp": 1.03699148, - "epoch": 0.28263941079212385, - "flos": 26723967096960.0, - "grad_norm": 1.8973185440197946, - "language_loss": 0.82377315, - "learning_rate": 3.3669862057538402e-06, - "loss": 0.84547931, - "num_input_tokens_seen": 101428365, - "step": 4701, - "time_per_iteration": 2.6613359451293945 - }, - { - "auxiliary_loss_clip": 0.01079732, - "auxiliary_loss_mlp": 0.01044955, - "balance_loss_clip": 1.04725862, - "balance_loss_mlp": 1.02782488, - "epoch": 0.2826995340447918, - "flos": 25921327737600.0, - "grad_norm": 2.6106451650427913, - "language_loss": 0.72911763, - "learning_rate": 3.3667018895685004e-06, - "loss": 0.75036454, - "num_input_tokens_seen": 101447280, - "step": 4702, - "time_per_iteration": 2.927156448364258 - }, - { - "auxiliary_loss_clip": 0.0114189, - "auxiliary_loss_mlp": 0.01039287, - "balance_loss_clip": 1.05118549, - "balance_loss_mlp": 1.02240694, - "epoch": 0.2827596572974598, - "flos": 22379709623040.0, - "grad_norm": 2.1110096252533754, - "language_loss": 0.78497601, - "learning_rate": 3.3664175215576886e-06, - "loss": 0.80678773, - "num_input_tokens_seen": 101465435, - "step": 4703, - "time_per_iteration": 2.603217124938965 - }, - { - "auxiliary_loss_clip": 0.01115372, - "auxiliary_loss_mlp": 0.01049407, - "balance_loss_clip": 1.04668045, - "balance_loss_mlp": 1.03100109, - "epoch": 0.28281978055012774, - "flos": 33547137880320.0, - "grad_norm": 1.6207045759516274, - "language_loss": 0.69310379, - "learning_rate": 3.3661331017321867e-06, - "loss": 0.71475154, - "num_input_tokens_seen": 101486355, - "step": 4704, - "time_per_iteration": 2.737741708755493 - }, - { - "auxiliary_loss_clip": 0.0110991, - "auxiliary_loss_mlp": 0.0104005, - "balance_loss_clip": 1.05106401, - "balance_loss_mlp": 1.02204967, - "epoch": 0.2828799038027957, - "flos": 23440870143360.0, - "grad_norm": 2.0629797483939893, - "language_loss": 0.70487976, - "learning_rate": 3.3658486301027807e-06, - "loss": 0.72637939, - "num_input_tokens_seen": 101505875, - "step": 4705, - "time_per_iteration": 2.7810943126678467 - }, - { - "auxiliary_loss_clip": 0.01051193, - "auxiliary_loss_mlp": 0.01011527, - "balance_loss_clip": 1.02885246, - "balance_loss_mlp": 1.00905895, - "epoch": 0.2829400270554637, - "flos": 69873690251520.0, - "grad_norm": 0.7331461257989402, - "language_loss": 0.59262896, - "learning_rate": 3.3655641066802577e-06, - "loss": 0.6132561, - "num_input_tokens_seen": 101565045, - "step": 4706, - "time_per_iteration": 3.223500967025757 - }, - { - "auxiliary_loss_clip": 0.01117208, - "auxiliary_loss_mlp": 0.01042955, - "balance_loss_clip": 1.04750693, - "balance_loss_mlp": 1.02711248, - "epoch": 0.2830001503081317, - "flos": 24789028331520.0, - "grad_norm": 1.4542369915695899, - "language_loss": 0.82314008, - "learning_rate": 3.365279531475407e-06, - "loss": 0.84474176, - "num_input_tokens_seen": 101585825, - "step": 4707, - "time_per_iteration": 5.995711326599121 - }, - { - "auxiliary_loss_clip": 0.0112325, - "auxiliary_loss_mlp": 0.01043198, - "balance_loss_clip": 1.04714823, - "balance_loss_mlp": 1.02451742, - "epoch": 0.28306027356079966, - "flos": 27669387018240.0, - "grad_norm": 1.6937335335925583, - "language_loss": 0.80196846, - "learning_rate": 3.36499490449902e-06, - "loss": 0.82363296, - "num_input_tokens_seen": 101606105, - "step": 4708, - "time_per_iteration": 2.730365753173828 - }, - { - "auxiliary_loss_clip": 0.01036827, - "auxiliary_loss_mlp": 0.01004906, - "balance_loss_clip": 1.0241586, - "balance_loss_mlp": 1.00274837, - "epoch": 0.2831203968134676, - "flos": 60527938199040.0, - "grad_norm": 0.8797441515413378, - "language_loss": 0.62768304, - "learning_rate": 3.3647102257618895e-06, - "loss": 0.64810038, - "num_input_tokens_seen": 101656875, - "step": 4709, - "time_per_iteration": 3.0734164714813232 - }, - { - "auxiliary_loss_clip": 0.01113275, - "auxiliary_loss_mlp": 0.01045412, - "balance_loss_clip": 1.04819441, - "balance_loss_mlp": 1.02711344, - "epoch": 0.2831805200661356, - "flos": 22054790171520.0, - "grad_norm": 1.4416556980461737, - "language_loss": 0.74092108, - "learning_rate": 3.3644254952748103e-06, - "loss": 0.76250798, - "num_input_tokens_seen": 101676225, - "step": 4710, - "time_per_iteration": 4.214928388595581 - }, - { - "auxiliary_loss_clip": 0.01108833, - "auxiliary_loss_mlp": 0.01058426, - "balance_loss_clip": 1.04568553, - "balance_loss_mlp": 1.0393765, - "epoch": 0.28324064331880355, - "flos": 22600668136320.0, - "grad_norm": 2.192994300890924, - "language_loss": 0.7857554, - "learning_rate": 3.364140713048579e-06, - "loss": 0.80742794, - "num_input_tokens_seen": 101693710, - "step": 4711, - "time_per_iteration": 2.9334824085235596 - }, - { - "auxiliary_loss_clip": 0.01135754, - "auxiliary_loss_mlp": 0.00775746, - "balance_loss_clip": 1.05244637, - "balance_loss_mlp": 1.00072622, - "epoch": 0.2833007665714715, - "flos": 30404127968640.0, - "grad_norm": 2.328121287113732, - "language_loss": 0.70832199, - "learning_rate": 3.363855879093996e-06, - "loss": 0.72743702, - "num_input_tokens_seen": 101714010, - "step": 4712, - "time_per_iteration": 2.8570704460144043 - }, - { - "auxiliary_loss_clip": 0.0114641, - "auxiliary_loss_mlp": 0.01050688, - "balance_loss_clip": 1.05171633, - "balance_loss_mlp": 1.03284216, - "epoch": 0.2833608898241395, - "flos": 23549499849600.0, - "grad_norm": 2.3843934106626157, - "language_loss": 0.81725228, - "learning_rate": 3.3635709934218605e-06, - "loss": 0.83922327, - "num_input_tokens_seen": 101732995, - "step": 4713, - "time_per_iteration": 4.343034029006958 - }, - { - "auxiliary_loss_clip": 0.01120505, - "auxiliary_loss_mlp": 0.01048075, - "balance_loss_clip": 1.05054498, - "balance_loss_mlp": 1.03044379, - "epoch": 0.28342101307680745, - "flos": 20266726118400.0, - "grad_norm": 1.7964609324305687, - "language_loss": 0.75316995, - "learning_rate": 3.3632860560429766e-06, - "loss": 0.77485573, - "num_input_tokens_seen": 101751385, - "step": 4714, - "time_per_iteration": 2.656919479370117 - }, - { - "auxiliary_loss_clip": 0.01129168, - "auxiliary_loss_mlp": 0.01051102, - "balance_loss_clip": 1.050372, - "balance_loss_mlp": 1.03424633, - "epoch": 0.2834811363294754, - "flos": 30847050576000.0, - "grad_norm": 1.4082553086863412, - "language_loss": 0.78457153, - "learning_rate": 3.3630010669681494e-06, - "loss": 0.80637431, - "num_input_tokens_seen": 101773825, - "step": 4715, - "time_per_iteration": 2.721869468688965 - }, - { - "auxiliary_loss_clip": 0.01117334, - "auxiliary_loss_mlp": 0.01046437, - "balance_loss_clip": 1.04618871, - "balance_loss_mlp": 1.0294199, - "epoch": 0.2835412595821434, - "flos": 22711021695360.0, - "grad_norm": 1.791082386208426, - "language_loss": 0.73825723, - "learning_rate": 3.3627160262081845e-06, - "loss": 0.75989497, - "num_input_tokens_seen": 101791920, - "step": 4716, - "time_per_iteration": 2.689964532852173 - }, - { - "auxiliary_loss_clip": 0.0111778, - "auxiliary_loss_mlp": 0.01054857, - "balance_loss_clip": 1.04580188, - "balance_loss_mlp": 1.03397131, - "epoch": 0.28360138283481134, - "flos": 18077719478400.0, - "grad_norm": 2.1425450832247868, - "language_loss": 0.74293232, - "learning_rate": 3.3624309337738917e-06, - "loss": 0.76465869, - "num_input_tokens_seen": 101809515, - "step": 4717, - "time_per_iteration": 2.653107166290283 - }, - { - "auxiliary_loss_clip": 0.01112398, - "auxiliary_loss_mlp": 0.01052347, - "balance_loss_clip": 1.04736984, - "balance_loss_mlp": 1.03526437, - "epoch": 0.2836615060874793, - "flos": 17854785717120.0, - "grad_norm": 1.96982951308544, - "language_loss": 0.67022157, - "learning_rate": 3.3621457896760813e-06, - "loss": 0.69186902, - "num_input_tokens_seen": 101827735, - "step": 4718, - "time_per_iteration": 2.7287323474884033 - }, - { - "auxiliary_loss_clip": 0.01119996, - "auxiliary_loss_mlp": 0.01052629, - "balance_loss_clip": 1.04606366, - "balance_loss_mlp": 1.03479528, - "epoch": 0.2837216293401473, - "flos": 25740302169600.0, - "grad_norm": 1.7409435577223806, - "language_loss": 0.72453725, - "learning_rate": 3.361860593925566e-06, - "loss": 0.7462635, - "num_input_tokens_seen": 101845970, - "step": 4719, - "time_per_iteration": 2.7101874351501465 - }, - { - "auxiliary_loss_clip": 0.01129472, - "auxiliary_loss_mlp": 0.01044, - "balance_loss_clip": 1.04724336, - "balance_loss_mlp": 1.02711964, - "epoch": 0.2837817525928153, - "flos": 20923532259840.0, - "grad_norm": 1.8163652523997504, - "language_loss": 0.80517805, - "learning_rate": 3.3615753465331605e-06, - "loss": 0.82691276, - "num_input_tokens_seen": 101865040, - "step": 4720, - "time_per_iteration": 2.630380392074585 - }, - { - "auxiliary_loss_clip": 0.01130938, - "auxiliary_loss_mlp": 0.01047274, - "balance_loss_clip": 1.04798317, - "balance_loss_mlp": 1.02935672, - "epoch": 0.28384187584548326, - "flos": 18916700423040.0, - "grad_norm": 2.340232614040239, - "language_loss": 0.79146183, - "learning_rate": 3.3612900475096817e-06, - "loss": 0.81324387, - "num_input_tokens_seen": 101883735, - "step": 4721, - "time_per_iteration": 2.6779117584228516 - }, - { - "auxiliary_loss_clip": 0.01091324, - "auxiliary_loss_mlp": 0.00778191, - "balance_loss_clip": 1.04653215, - "balance_loss_mlp": 1.00074911, - "epoch": 0.2839019990981512, - "flos": 27343964776320.0, - "grad_norm": 1.7859505861297744, - "language_loss": 0.82514244, - "learning_rate": 3.3610046968659474e-06, - "loss": 0.84383762, - "num_input_tokens_seen": 101903025, - "step": 4722, - "time_per_iteration": 2.8601412773132324 - }, - { - "auxiliary_loss_clip": 0.0114735, - "auxiliary_loss_mlp": 0.0104339, - "balance_loss_clip": 1.05396807, - "balance_loss_mlp": 1.02641416, - "epoch": 0.2839621223508192, - "flos": 18114312458880.0, - "grad_norm": 1.8976073667217488, - "language_loss": 0.70048773, - "learning_rate": 3.3607192946127785e-06, - "loss": 0.72239512, - "num_input_tokens_seen": 101922255, - "step": 4723, - "time_per_iteration": 2.6259007453918457 - }, - { - "auxiliary_loss_clip": 0.0111455, - "auxiliary_loss_mlp": 0.01051142, - "balance_loss_clip": 1.04818106, - "balance_loss_mlp": 1.03247368, - "epoch": 0.28402224560348716, - "flos": 26358360514560.0, - "grad_norm": 1.540245146059843, - "language_loss": 0.78676599, - "learning_rate": 3.360433840760998e-06, - "loss": 0.80842292, - "num_input_tokens_seen": 101943100, - "step": 4724, - "time_per_iteration": 2.7364859580993652 - }, - { - "auxiliary_loss_clip": 0.01116323, - "auxiliary_loss_mlp": 0.01063488, - "balance_loss_clip": 1.04846072, - "balance_loss_mlp": 1.04442668, - "epoch": 0.2840823688561551, - "flos": 24060795995520.0, - "grad_norm": 1.6728910575536384, - "language_loss": 0.92433345, - "learning_rate": 3.36014833532143e-06, - "loss": 0.94613159, - "num_input_tokens_seen": 101963160, - "step": 4725, - "time_per_iteration": 2.653244733810425 - }, - { - "auxiliary_loss_clip": 0.01137335, - "auxiliary_loss_mlp": 0.01047317, - "balance_loss_clip": 1.05249703, - "balance_loss_mlp": 1.02951932, - "epoch": 0.2841424921088231, - "flos": 29459821368960.0, - "grad_norm": 1.5774329387244128, - "language_loss": 0.88881439, - "learning_rate": 3.3598627783049e-06, - "loss": 0.91066098, - "num_input_tokens_seen": 101984300, - "step": 4726, - "time_per_iteration": 2.6815872192382812 - }, - { - "auxiliary_loss_clip": 0.01132666, - "auxiliary_loss_mlp": 0.01049768, - "balance_loss_clip": 1.05290008, - "balance_loss_mlp": 1.03223181, - "epoch": 0.28420261536149105, - "flos": 48100367053440.0, - "grad_norm": 2.008368257744288, - "language_loss": 0.78913373, - "learning_rate": 3.359577169722238e-06, - "loss": 0.81095803, - "num_input_tokens_seen": 102005765, - "step": 4727, - "time_per_iteration": 2.8668875694274902 - }, - { - "auxiliary_loss_clip": 0.01134036, - "auxiliary_loss_mlp": 0.01041813, - "balance_loss_clip": 1.05225933, - "balance_loss_mlp": 1.02603006, - "epoch": 0.284262738614159, - "flos": 25666146541440.0, - "grad_norm": 2.1196929739552433, - "language_loss": 0.66590458, - "learning_rate": 3.3592915095842733e-06, - "loss": 0.68766308, - "num_input_tokens_seen": 102022755, - "step": 4728, - "time_per_iteration": 2.6871252059936523 - }, - { - "auxiliary_loss_clip": 0.01111522, - "auxiliary_loss_mlp": 0.01054966, - "balance_loss_clip": 1.04948676, - "balance_loss_mlp": 1.03766847, - "epoch": 0.284322861866827, - "flos": 19718980646400.0, - "grad_norm": 1.7247901443745783, - "language_loss": 0.76369143, - "learning_rate": 3.3590057979018386e-06, - "loss": 0.78535628, - "num_input_tokens_seen": 102041850, - "step": 4729, - "time_per_iteration": 2.671739339828491 - }, - { - "auxiliary_loss_clip": 0.01121198, - "auxiliary_loss_mlp": 0.01054506, - "balance_loss_clip": 1.05166233, - "balance_loss_mlp": 1.03707767, - "epoch": 0.28438298511949495, - "flos": 23915250086400.0, - "grad_norm": 1.8284571123244682, - "language_loss": 0.67062581, - "learning_rate": 3.3587200346857674e-06, - "loss": 0.69238287, - "num_input_tokens_seen": 102059500, - "step": 4730, - "time_per_iteration": 2.6957883834838867 - }, - { - "auxiliary_loss_clip": 0.01120949, - "auxiliary_loss_mlp": 0.01040777, - "balance_loss_clip": 1.05008078, - "balance_loss_mlp": 1.02283621, - "epoch": 0.2844431083721629, - "flos": 26067340523520.0, - "grad_norm": 1.8142087038783352, - "language_loss": 0.7456513, - "learning_rate": 3.3584342199468965e-06, - "loss": 0.76726854, - "num_input_tokens_seen": 102080460, - "step": 4731, - "time_per_iteration": 2.7621212005615234 - }, - { - "auxiliary_loss_clip": 0.01100065, - "auxiliary_loss_mlp": 0.0104061, - "balance_loss_clip": 1.04959893, - "balance_loss_mlp": 1.02338386, - "epoch": 0.2845032316248309, - "flos": 25810435474560.0, - "grad_norm": 1.4533231430590194, - "language_loss": 0.83672202, - "learning_rate": 3.3581483536960638e-06, - "loss": 0.85812879, - "num_input_tokens_seen": 102100950, - "step": 4732, - "time_per_iteration": 2.807701587677002 - }, - { - "auxiliary_loss_clip": 0.01135958, - "auxiliary_loss_mlp": 0.01049006, - "balance_loss_clip": 1.05248308, - "balance_loss_mlp": 1.03040957, - "epoch": 0.2845633548774989, - "flos": 19823192979840.0, - "grad_norm": 2.88493918484894, - "language_loss": 0.78892827, - "learning_rate": 3.357862435944109e-06, - "loss": 0.8107779, - "num_input_tokens_seen": 102119345, - "step": 4733, - "time_per_iteration": 2.66524076461792 - }, - { - "auxiliary_loss_clip": 0.01153472, - "auxiliary_loss_mlp": 0.01047702, - "balance_loss_clip": 1.05533004, - "balance_loss_mlp": 1.02984452, - "epoch": 0.28462347813016686, - "flos": 23182815859200.0, - "grad_norm": 2.2364375024988776, - "language_loss": 0.71791029, - "learning_rate": 3.357576466701875e-06, - "loss": 0.73992205, - "num_input_tokens_seen": 102139050, - "step": 4734, - "time_per_iteration": 2.6941637992858887 - }, - { - "auxiliary_loss_clip": 0.01125779, - "auxiliary_loss_mlp": 0.01035132, - "balance_loss_clip": 1.05455363, - "balance_loss_mlp": 1.01766825, - "epoch": 0.2846836013828348, - "flos": 18660477732480.0, - "grad_norm": 1.8491255089189595, - "language_loss": 0.73942113, - "learning_rate": 3.3572904459802056e-06, - "loss": 0.76103032, - "num_input_tokens_seen": 102157935, - "step": 4735, - "time_per_iteration": 2.736027956008911 - }, - { - "auxiliary_loss_clip": 0.01124029, - "auxiliary_loss_mlp": 0.01048016, - "balance_loss_clip": 1.05248201, - "balance_loss_mlp": 1.03177929, - "epoch": 0.2847437246355028, - "flos": 14173511523840.0, - "grad_norm": 1.7217440703764713, - "language_loss": 0.79690897, - "learning_rate": 3.357004373789946e-06, - "loss": 0.81862932, - "num_input_tokens_seen": 102175325, - "step": 4736, - "time_per_iteration": 2.7069075107574463 - }, - { - "auxiliary_loss_clip": 0.01152237, - "auxiliary_loss_mlp": 0.01048515, - "balance_loss_clip": 1.0569663, - "balance_loss_mlp": 1.03019249, - "epoch": 0.28480384788817076, - "flos": 29278364837760.0, - "grad_norm": 2.5331890881723327, - "language_loss": 0.59956342, - "learning_rate": 3.3567182501419453e-06, - "loss": 0.62157094, - "num_input_tokens_seen": 102196625, - "step": 4737, - "time_per_iteration": 2.718904972076416 - }, - { - "auxiliary_loss_clip": 0.01131951, - "auxiliary_loss_mlp": 0.0104121, - "balance_loss_clip": 1.05099404, - "balance_loss_mlp": 1.02437758, - "epoch": 0.2848639711408387, - "flos": 22601314581120.0, - "grad_norm": 1.8696274848062555, - "language_loss": 0.86556888, - "learning_rate": 3.356432075047052e-06, - "loss": 0.88730049, - "num_input_tokens_seen": 102214975, - "step": 4738, - "time_per_iteration": 2.719223976135254 - }, - { - "auxiliary_loss_clip": 0.01127313, - "auxiliary_loss_mlp": 0.01051123, - "balance_loss_clip": 1.05986989, - "balance_loss_mlp": 1.03207278, - "epoch": 0.2849240943935067, - "flos": 17599460866560.0, - "grad_norm": 2.688438536338364, - "language_loss": 0.90028232, - "learning_rate": 3.356145848516118e-06, - "loss": 0.92206669, - "num_input_tokens_seen": 102231885, - "step": 4739, - "time_per_iteration": 2.674363851547241 - }, - { - "auxiliary_loss_clip": 0.01136036, - "auxiliary_loss_mlp": 0.01044124, - "balance_loss_clip": 1.05522013, - "balance_loss_mlp": 1.02627802, - "epoch": 0.28498421764617465, - "flos": 24862573428480.0, - "grad_norm": 1.41783833400805, - "language_loss": 0.7216897, - "learning_rate": 3.355859570559998e-06, - "loss": 0.74349129, - "num_input_tokens_seen": 102252725, - "step": 4740, - "time_per_iteration": 2.688591957092285 - }, - { - "auxiliary_loss_clip": 0.01130927, - "auxiliary_loss_mlp": 0.010392, - "balance_loss_clip": 1.05868936, - "balance_loss_mlp": 1.02229571, - "epoch": 0.2850443408988426, - "flos": 22782555630720.0, - "grad_norm": 3.325446081949271, - "language_loss": 0.77782756, - "learning_rate": 3.3555732411895477e-06, - "loss": 0.79952878, - "num_input_tokens_seen": 102271730, - "step": 4741, - "time_per_iteration": 2.6747119426727295 - }, - { - "auxiliary_loss_clip": 0.01107503, - "auxiliary_loss_mlp": 0.01048819, - "balance_loss_clip": 1.04771924, - "balance_loss_mlp": 1.03065109, - "epoch": 0.2851044641515106, - "flos": 18844053166080.0, - "grad_norm": 1.6557809034578879, - "language_loss": 0.75952959, - "learning_rate": 3.3552868604156235e-06, - "loss": 0.78109288, - "num_input_tokens_seen": 102291325, - "step": 4742, - "time_per_iteration": 2.7584095001220703 - }, - { - "auxiliary_loss_clip": 0.01151989, - "auxiliary_loss_mlp": 0.01057399, - "balance_loss_clip": 1.05341601, - "balance_loss_mlp": 1.03720486, - "epoch": 0.28516458740417855, - "flos": 18880502492160.0, - "grad_norm": 2.0538587827096713, - "language_loss": 0.57376975, - "learning_rate": 3.355000428249086e-06, - "loss": 0.59586358, - "num_input_tokens_seen": 102309000, - "step": 4743, - "time_per_iteration": 2.621572494506836 - }, - { - "auxiliary_loss_clip": 0.01116239, - "auxiliary_loss_mlp": 0.01056356, - "balance_loss_clip": 1.05067348, - "balance_loss_mlp": 1.03747356, - "epoch": 0.2852247106568465, - "flos": 25299821687040.0, - "grad_norm": 1.6259491452975234, - "language_loss": 0.74499846, - "learning_rate": 3.354713944700797e-06, - "loss": 0.76672441, - "num_input_tokens_seen": 102329240, - "step": 4744, - "time_per_iteration": 2.8029959201812744 - }, - { - "auxiliary_loss_clip": 0.01132324, - "auxiliary_loss_mlp": 0.01047205, - "balance_loss_clip": 1.05420351, - "balance_loss_mlp": 1.03014612, - "epoch": 0.2852848339095145, - "flos": 11655383541120.0, - "grad_norm": 2.4725597828733563, - "language_loss": 0.77258176, - "learning_rate": 3.3544274097816185e-06, - "loss": 0.79437709, - "num_input_tokens_seen": 102344440, - "step": 4745, - "time_per_iteration": 2.5961194038391113 - }, - { - "auxiliary_loss_clip": 0.01124474, - "auxiliary_loss_mlp": 0.01040571, - "balance_loss_clip": 1.05262041, - "balance_loss_mlp": 1.02427554, - "epoch": 0.2853449571621825, - "flos": 12933228856320.0, - "grad_norm": 1.9164884333366974, - "language_loss": 0.8275286, - "learning_rate": 3.3541408235024173e-06, - "loss": 0.84917903, - "num_input_tokens_seen": 102360985, - "step": 4746, - "time_per_iteration": 4.211855411529541 - }, - { - "auxiliary_loss_clip": 0.01101779, - "auxiliary_loss_mlp": 0.01043428, - "balance_loss_clip": 1.0488627, - "balance_loss_mlp": 1.02497482, - "epoch": 0.28540508041485046, - "flos": 20010575255040.0, - "grad_norm": 1.8281951571940926, - "language_loss": 0.79537141, - "learning_rate": 3.3538541858740604e-06, - "loss": 0.81682348, - "num_input_tokens_seen": 102380320, - "step": 4747, - "time_per_iteration": 4.276613712310791 - }, - { - "auxiliary_loss_clip": 0.01046154, - "auxiliary_loss_mlp": 0.01017989, - "balance_loss_clip": 1.02844512, - "balance_loss_mlp": 1.01572371, - "epoch": 0.28546520366751843, - "flos": 68139349966080.0, - "grad_norm": 0.7754147669680839, - "language_loss": 0.6049211, - "learning_rate": 3.3535674969074173e-06, - "loss": 0.62556255, - "num_input_tokens_seen": 102439140, - "step": 4748, - "time_per_iteration": 3.0963478088378906 - }, - { - "auxiliary_loss_clip": 0.01148062, - "auxiliary_loss_mlp": 0.01048043, - "balance_loss_clip": 1.05367923, - "balance_loss_mlp": 1.03001821, - "epoch": 0.2855253269201864, - "flos": 13251540205440.0, - "grad_norm": 2.39914017508816, - "language_loss": 0.8061412, - "learning_rate": 3.3532807566133592e-06, - "loss": 0.82810223, - "num_input_tokens_seen": 102450990, - "step": 4749, - "time_per_iteration": 4.199607610702515 - }, - { - "auxiliary_loss_clip": 0.01135936, - "auxiliary_loss_mlp": 0.01045252, - "balance_loss_clip": 1.05160487, - "balance_loss_mlp": 1.02788317, - "epoch": 0.28558545017285436, - "flos": 28620876337920.0, - "grad_norm": 1.92101956988616, - "language_loss": 0.70763719, - "learning_rate": 3.3529939650027587e-06, - "loss": 0.72944903, - "num_input_tokens_seen": 102471820, - "step": 4750, - "time_per_iteration": 2.6975722312927246 - }, - { - "auxiliary_loss_clip": 0.01132057, - "auxiliary_loss_mlp": 0.0104367, - "balance_loss_clip": 1.05308008, - "balance_loss_mlp": 1.02660573, - "epoch": 0.2856455734255223, - "flos": 34130470752000.0, - "grad_norm": 1.619747991653998, - "language_loss": 0.81983078, - "learning_rate": 3.3527071220864917e-06, - "loss": 0.84158808, - "num_input_tokens_seen": 102492625, - "step": 4751, - "time_per_iteration": 2.685194969177246 - }, - { - "auxiliary_loss_clip": 0.01146027, - "auxiliary_loss_mlp": 0.01046872, - "balance_loss_clip": 1.0541997, - "balance_loss_mlp": 1.03009951, - "epoch": 0.2857056966781903, - "flos": 39786149779200.0, - "grad_norm": 2.1857777553010203, - "language_loss": 0.80359828, - "learning_rate": 3.3524202278754353e-06, - "loss": 0.82552731, - "num_input_tokens_seen": 102514145, - "step": 4752, - "time_per_iteration": 4.363154649734497 - }, - { - "auxiliary_loss_clip": 0.01130862, - "auxiliary_loss_mlp": 0.010456, - "balance_loss_clip": 1.04920304, - "balance_loss_mlp": 1.02675319, - "epoch": 0.28576581993085826, - "flos": 21872292145920.0, - "grad_norm": 2.612706759191024, - "language_loss": 0.78674287, - "learning_rate": 3.3521332823804676e-06, - "loss": 0.8085075, - "num_input_tokens_seen": 102532365, - "step": 4753, - "time_per_iteration": 2.6128499507904053 - }, - { - "auxiliary_loss_clip": 0.0114991, - "auxiliary_loss_mlp": 0.01051658, - "balance_loss_clip": 1.05356765, - "balance_loss_mlp": 1.03166628, - "epoch": 0.2858259431835262, - "flos": 19091656592640.0, - "grad_norm": 3.5161743537336596, - "language_loss": 0.8947711, - "learning_rate": 3.3518462856124704e-06, - "loss": 0.91678679, - "num_input_tokens_seen": 102548425, - "step": 4754, - "time_per_iteration": 2.5410687923431396 - }, - { - "auxiliary_loss_clip": 0.01130155, - "auxiliary_loss_mlp": 0.010468, - "balance_loss_clip": 1.05048347, - "balance_loss_mlp": 1.03026593, - "epoch": 0.2858860664361942, - "flos": 20334309557760.0, - "grad_norm": 2.3617926288322724, - "language_loss": 0.82039523, - "learning_rate": 3.3515592375823267e-06, - "loss": 0.84216481, - "num_input_tokens_seen": 102566370, - "step": 4755, - "time_per_iteration": 2.6514527797698975 - }, - { - "auxiliary_loss_clip": 0.01098878, - "auxiliary_loss_mlp": 0.01049575, - "balance_loss_clip": 1.04732597, - "balance_loss_mlp": 1.03233767, - "epoch": 0.28594618968886215, - "flos": 24461738582400.0, - "grad_norm": 1.6385978416895255, - "language_loss": 0.83764589, - "learning_rate": 3.351272138300922e-06, - "loss": 0.8591305, - "num_input_tokens_seen": 102588715, - "step": 4756, - "time_per_iteration": 2.7975916862487793 - }, - { - "auxiliary_loss_clip": 0.01023363, - "auxiliary_loss_mlp": 0.01007772, - "balance_loss_clip": 1.01913142, - "balance_loss_mlp": 1.00524473, - "epoch": 0.2860063129415301, - "flos": 71652850709760.0, - "grad_norm": 0.8721113874523594, - "language_loss": 0.6097033, - "learning_rate": 3.350984987779142e-06, - "loss": 0.63001466, - "num_input_tokens_seen": 102656715, - "step": 4757, - "time_per_iteration": 3.406625986099243 - }, - { - "auxiliary_loss_clip": 0.01147819, - "auxiliary_loss_mlp": 0.01038916, - "balance_loss_clip": 1.05585599, - "balance_loss_mlp": 1.021595, - "epoch": 0.2860664361941981, - "flos": 20558679863040.0, - "grad_norm": 2.030913944398288, - "language_loss": 0.66206789, - "learning_rate": 3.3506977860278756e-06, - "loss": 0.68393528, - "num_input_tokens_seen": 102676545, - "step": 4758, - "time_per_iteration": 2.589768648147583 - }, - { - "auxiliary_loss_clip": 0.01133475, - "auxiliary_loss_mlp": 0.01042694, - "balance_loss_clip": 1.04988813, - "balance_loss_mlp": 1.02581418, - "epoch": 0.2861265594468661, - "flos": 35996389534080.0, - "grad_norm": 2.019963236438103, - "language_loss": 0.63374877, - "learning_rate": 3.3504105330580143e-06, - "loss": 0.65551043, - "num_input_tokens_seen": 102702875, - "step": 4759, - "time_per_iteration": 2.809325695037842 - }, - { - "auxiliary_loss_clip": 0.01129183, - "auxiliary_loss_mlp": 0.00777076, - "balance_loss_clip": 1.04924989, - "balance_loss_mlp": 1.00088644, - "epoch": 0.28618668269953407, - "flos": 20047419630720.0, - "grad_norm": 1.9693348774443893, - "language_loss": 0.74033993, - "learning_rate": 3.3501232288804496e-06, - "loss": 0.75940251, - "num_input_tokens_seen": 102723160, - "step": 4760, - "time_per_iteration": 2.6797397136688232 - }, - { - "auxiliary_loss_clip": 0.01124387, - "auxiliary_loss_mlp": 0.01045022, - "balance_loss_clip": 1.05517232, - "balance_loss_mlp": 1.02849925, - "epoch": 0.28624680595220203, - "flos": 24971849579520.0, - "grad_norm": 2.574168946313644, - "language_loss": 0.72227889, - "learning_rate": 3.3498358735060773e-06, - "loss": 0.74397296, - "num_input_tokens_seen": 102743855, - "step": 4761, - "time_per_iteration": 2.672394275665283 - }, - { - "auxiliary_loss_clip": 0.01079005, - "auxiliary_loss_mlp": 0.01049385, - "balance_loss_clip": 1.04688287, - "balance_loss_mlp": 1.03218305, - "epoch": 0.28630692920487, - "flos": 22492253911680.0, - "grad_norm": 2.095293128310336, - "language_loss": 0.74758703, - "learning_rate": 3.349548466945793e-06, - "loss": 0.76887095, - "num_input_tokens_seen": 102761370, - "step": 4762, - "time_per_iteration": 2.8573946952819824 - }, - { - "auxiliary_loss_clip": 0.01108257, - "auxiliary_loss_mlp": 0.01044255, - "balance_loss_clip": 1.05117726, - "balance_loss_mlp": 1.02725577, - "epoch": 0.28636705245753796, - "flos": 21249888255360.0, - "grad_norm": 1.4714690500952254, - "language_loss": 0.76185489, - "learning_rate": 3.349261009210496e-06, - "loss": 0.78338003, - "num_input_tokens_seen": 102780885, - "step": 4763, - "time_per_iteration": 2.7058494091033936 - }, - { - "auxiliary_loss_clip": 0.01103052, - "auxiliary_loss_mlp": 0.01041715, - "balance_loss_clip": 1.0442332, - "balance_loss_mlp": 1.0234046, - "epoch": 0.28642717571020593, - "flos": 24095772864000.0, - "grad_norm": 2.250941696220621, - "language_loss": 0.77264833, - "learning_rate": 3.348973500311086e-06, - "loss": 0.79409599, - "num_input_tokens_seen": 102801000, - "step": 4764, - "time_per_iteration": 2.7363107204437256 - }, - { - "auxiliary_loss_clip": 0.0111141, - "auxiliary_loss_mlp": 0.01044325, - "balance_loss_clip": 1.04883742, - "balance_loss_mlp": 1.02520347, - "epoch": 0.2864872989628739, - "flos": 22601386408320.0, - "grad_norm": 3.808468667851145, - "language_loss": 0.71222258, - "learning_rate": 3.348685940258466e-06, - "loss": 0.73377991, - "num_input_tokens_seen": 102820230, - "step": 4765, - "time_per_iteration": 2.7225682735443115 - }, - { - "auxiliary_loss_clip": 0.01127531, - "auxiliary_loss_mlp": 0.01037638, - "balance_loss_clip": 1.0501802, - "balance_loss_mlp": 1.02118707, - "epoch": 0.28654742221554186, - "flos": 32745073138560.0, - "grad_norm": 1.6284115173108313, - "language_loss": 0.76206756, - "learning_rate": 3.3483983290635395e-06, - "loss": 0.78371924, - "num_input_tokens_seen": 102842670, - "step": 4766, - "time_per_iteration": 2.724776268005371 - }, - { - "auxiliary_loss_clip": 0.01130255, - "auxiliary_loss_mlp": 0.01038205, - "balance_loss_clip": 1.0502758, - "balance_loss_mlp": 1.02133691, - "epoch": 0.2866075454682098, - "flos": 26981626331520.0, - "grad_norm": 1.7313176116986193, - "language_loss": 0.77457404, - "learning_rate": 3.348110666737214e-06, - "loss": 0.79625863, - "num_input_tokens_seen": 102864480, - "step": 4767, - "time_per_iteration": 2.7313742637634277 - }, - { - "auxiliary_loss_clip": 0.0114162, - "auxiliary_loss_mlp": 0.01042697, - "balance_loss_clip": 1.05109096, - "balance_loss_mlp": 1.02519727, - "epoch": 0.2866676687208778, - "flos": 23253847004160.0, - "grad_norm": 1.7818476838857593, - "language_loss": 0.65043855, - "learning_rate": 3.3478229532903956e-06, - "loss": 0.67228168, - "num_input_tokens_seen": 102883740, - "step": 4768, - "time_per_iteration": 2.6173784732818604 - }, - { - "auxiliary_loss_clip": 0.01123197, - "auxiliary_loss_mlp": 0.01041331, - "balance_loss_clip": 1.04803848, - "balance_loss_mlp": 1.02385533, - "epoch": 0.28672779197354575, - "flos": 21579727870080.0, - "grad_norm": 1.5842392137882455, - "language_loss": 0.70497799, - "learning_rate": 3.3475351887339967e-06, - "loss": 0.7266233, - "num_input_tokens_seen": 102902945, - "step": 4769, - "time_per_iteration": 2.627859115600586 - }, - { - "auxiliary_loss_clip": 0.01078118, - "auxiliary_loss_mlp": 0.01033792, - "balance_loss_clip": 1.04276228, - "balance_loss_mlp": 1.01722169, - "epoch": 0.2867879152262137, - "flos": 19865568049920.0, - "grad_norm": 1.555057890983365, - "language_loss": 0.74735439, - "learning_rate": 3.3472473730789288e-06, - "loss": 0.76847351, - "num_input_tokens_seen": 102922405, - "step": 4770, - "time_per_iteration": 2.807286262512207 - }, - { - "auxiliary_loss_clip": 0.01094623, - "auxiliary_loss_mlp": 0.01041164, - "balance_loss_clip": 1.04522562, - "balance_loss_mlp": 1.02336657, - "epoch": 0.2868480384788817, - "flos": 28213325648640.0, - "grad_norm": 2.2768786529491427, - "language_loss": 0.6760053, - "learning_rate": 3.3469595063361045e-06, - "loss": 0.6973632, - "num_input_tokens_seen": 102938980, - "step": 4771, - "time_per_iteration": 2.7709410190582275 - }, - { - "auxiliary_loss_clip": 0.01041422, - "auxiliary_loss_mlp": 0.01015109, - "balance_loss_clip": 1.01907253, - "balance_loss_mlp": 1.01243877, - "epoch": 0.2869081617315497, - "flos": 65424286690560.0, - "grad_norm": 0.770068198596698, - "language_loss": 0.56874299, - "learning_rate": 3.3466715885164414e-06, - "loss": 0.58930826, - "num_input_tokens_seen": 103000405, - "step": 4772, - "time_per_iteration": 3.0978245735168457 - }, - { - "auxiliary_loss_clip": 0.01067739, - "auxiliary_loss_mlp": 0.0077878, - "balance_loss_clip": 1.04115915, - "balance_loss_mlp": 1.00089169, - "epoch": 0.28696828498421767, - "flos": 18660729127680.0, - "grad_norm": 2.7874039039613345, - "language_loss": 0.82870376, - "learning_rate": 3.346383619630856e-06, - "loss": 0.84716898, - "num_input_tokens_seen": 103017970, - "step": 4773, - "time_per_iteration": 2.7716143131256104 - }, - { - "auxiliary_loss_clip": 0.0114188, - "auxiliary_loss_mlp": 0.01043405, - "balance_loss_clip": 1.04776216, - "balance_loss_mlp": 1.02553546, - "epoch": 0.28702840823688563, - "flos": 23659745667840.0, - "grad_norm": 11.069053071667042, - "language_loss": 0.77580261, - "learning_rate": 3.34609559969027e-06, - "loss": 0.79765546, - "num_input_tokens_seen": 103036385, - "step": 4774, - "time_per_iteration": 2.604790687561035 - }, - { - "auxiliary_loss_clip": 0.01119567, - "auxiliary_loss_mlp": 0.01042061, - "balance_loss_clip": 1.04915977, - "balance_loss_mlp": 1.02414346, - "epoch": 0.2870885314895536, - "flos": 13804744544640.0, - "grad_norm": 1.9103573283121942, - "language_loss": 0.73611873, - "learning_rate": 3.3458075287056034e-06, - "loss": 0.75773501, - "num_input_tokens_seen": 103052170, - "step": 4775, - "time_per_iteration": 2.6234211921691895 - }, - { - "auxiliary_loss_clip": 0.01133151, - "auxiliary_loss_mlp": 0.01045326, - "balance_loss_clip": 1.04905081, - "balance_loss_mlp": 1.02782607, - "epoch": 0.28714865474222157, - "flos": 17786771314560.0, - "grad_norm": 1.6535491049734306, - "language_loss": 0.88343942, - "learning_rate": 3.34551940668778e-06, - "loss": 0.9052242, - "num_input_tokens_seen": 103070510, - "step": 4776, - "time_per_iteration": 2.6941640377044678 - }, - { - "auxiliary_loss_clip": 0.01132773, - "auxiliary_loss_mlp": 0.0104327, - "balance_loss_clip": 1.05156159, - "balance_loss_mlp": 1.02712941, - "epoch": 0.28720877799488953, - "flos": 15997486199040.0, - "grad_norm": 1.7321020140737395, - "language_loss": 0.74257779, - "learning_rate": 3.345231233647726e-06, - "loss": 0.76433825, - "num_input_tokens_seen": 103089590, - "step": 4777, - "time_per_iteration": 2.645650863647461 - }, - { - "auxiliary_loss_clip": 0.01126691, - "auxiliary_loss_mlp": 0.01045293, - "balance_loss_clip": 1.05245948, - "balance_loss_mlp": 1.02812648, - "epoch": 0.2872689012475575, - "flos": 20923137210240.0, - "grad_norm": 1.9446580110028222, - "language_loss": 0.80069196, - "learning_rate": 3.3449430095963696e-06, - "loss": 0.82241178, - "num_input_tokens_seen": 103109080, - "step": 4778, - "time_per_iteration": 2.7606308460235596 - }, - { - "auxiliary_loss_clip": 0.01123482, - "auxiliary_loss_mlp": 0.01044505, - "balance_loss_clip": 1.05461526, - "balance_loss_mlp": 1.02750611, - "epoch": 0.28732902450022546, - "flos": 21325121291520.0, - "grad_norm": 1.7560492266469991, - "language_loss": 0.7396307, - "learning_rate": 3.3446547345446386e-06, - "loss": 0.76131058, - "num_input_tokens_seen": 103127755, - "step": 4779, - "time_per_iteration": 2.831167221069336 - }, - { - "auxiliary_loss_clip": 0.01122102, - "auxiliary_loss_mlp": 0.01043876, - "balance_loss_clip": 1.04866719, - "balance_loss_mlp": 1.0262928, - "epoch": 0.2873891477528934, - "flos": 20850382212480.0, - "grad_norm": 1.5882306223862566, - "language_loss": 0.76327771, - "learning_rate": 3.3443664085034656e-06, - "loss": 0.7849375, - "num_input_tokens_seen": 103147035, - "step": 4780, - "time_per_iteration": 2.6548538208007812 - }, - { - "auxiliary_loss_clip": 0.01102465, - "auxiliary_loss_mlp": 0.01042038, - "balance_loss_clip": 1.04413557, - "balance_loss_mlp": 1.02517641, - "epoch": 0.2874492710055614, - "flos": 17420051410560.0, - "grad_norm": 1.5896497572299877, - "language_loss": 0.81445092, - "learning_rate": 3.344078031483784e-06, - "loss": 0.83589596, - "num_input_tokens_seen": 103165410, - "step": 4781, - "time_per_iteration": 2.6422417163848877 - }, - { - "auxiliary_loss_clip": 0.01109573, - "auxiliary_loss_mlp": 0.01045358, - "balance_loss_clip": 1.05339658, - "balance_loss_mlp": 1.0277034, - "epoch": 0.28750939425822936, - "flos": 13406818700160.0, - "grad_norm": 1.8389370421072637, - "language_loss": 0.86738765, - "learning_rate": 3.3437896034965283e-06, - "loss": 0.888937, - "num_input_tokens_seen": 103183710, - "step": 4782, - "time_per_iteration": 2.7507951259613037 - }, - { - "auxiliary_loss_clip": 0.01113582, - "auxiliary_loss_mlp": 0.01043351, - "balance_loss_clip": 1.05343366, - "balance_loss_mlp": 1.02604771, - "epoch": 0.2875695175108973, - "flos": 21870029589120.0, - "grad_norm": 1.5283433651606986, - "language_loss": 0.71153063, - "learning_rate": 3.3435011245526357e-06, - "loss": 0.73309994, - "num_input_tokens_seen": 103203790, - "step": 4783, - "time_per_iteration": 2.7166218757629395 - }, - { - "auxiliary_loss_clip": 0.0112343, - "auxiliary_loss_mlp": 0.01047879, - "balance_loss_clip": 1.05475473, - "balance_loss_mlp": 1.030761, - "epoch": 0.2876296407635653, - "flos": 26245457089920.0, - "grad_norm": 1.6861942701171202, - "language_loss": 0.76872855, - "learning_rate": 3.343212594663047e-06, - "loss": 0.79044163, - "num_input_tokens_seen": 103223925, - "step": 4784, - "time_per_iteration": 2.693665027618408 - }, - { - "auxiliary_loss_clip": 0.01095423, - "auxiliary_loss_mlp": 0.01053931, - "balance_loss_clip": 1.04587293, - "balance_loss_mlp": 1.03514349, - "epoch": 0.28768976401623325, - "flos": 25373654092800.0, - "grad_norm": 4.596098798847224, - "language_loss": 0.75646108, - "learning_rate": 3.3429240138387015e-06, - "loss": 0.77795458, - "num_input_tokens_seen": 103244760, - "step": 4785, - "time_per_iteration": 4.380687236785889 - }, - { - "auxiliary_loss_clip": 0.01144615, - "auxiliary_loss_mlp": 0.01048905, - "balance_loss_clip": 1.0532378, - "balance_loss_mlp": 1.03213263, - "epoch": 0.28774988726890127, - "flos": 30664372982400.0, - "grad_norm": 2.434913324661012, - "language_loss": 0.83660555, - "learning_rate": 3.3426353820905425e-06, - "loss": 0.85854077, - "num_input_tokens_seen": 103261995, - "step": 4786, - "time_per_iteration": 4.138700723648071 - }, - { - "auxiliary_loss_clip": 0.01113505, - "auxiliary_loss_mlp": 0.0077478, - "balance_loss_clip": 1.05201936, - "balance_loss_mlp": 1.00095487, - "epoch": 0.28781001052156924, - "flos": 20595452411520.0, - "grad_norm": 1.8737605513707083, - "language_loss": 0.80388975, - "learning_rate": 3.342346699429516e-06, - "loss": 0.82277262, - "num_input_tokens_seen": 103279780, - "step": 4787, - "time_per_iteration": 2.7030651569366455 - }, - { - "auxiliary_loss_clip": 0.01120528, - "auxiliary_loss_mlp": 0.01039353, - "balance_loss_clip": 1.0489651, - "balance_loss_mlp": 1.02212751, - "epoch": 0.2878701337742372, - "flos": 26542330997760.0, - "grad_norm": 1.8370986188087255, - "language_loss": 0.83052301, - "learning_rate": 3.3420579658665677e-06, - "loss": 0.85212183, - "num_input_tokens_seen": 103300580, - "step": 4788, - "time_per_iteration": 2.7650442123413086 - }, - { - "auxiliary_loss_clip": 0.01110861, - "auxiliary_loss_mlp": 0.01044904, - "balance_loss_clip": 1.0567044, - "balance_loss_mlp": 1.0279882, - "epoch": 0.28793025702690517, - "flos": 28146855530880.0, - "grad_norm": 7.859878454786593, - "language_loss": 0.73045379, - "learning_rate": 3.3417691814126468e-06, - "loss": 0.75201148, - "num_input_tokens_seen": 103320430, - "step": 4789, - "time_per_iteration": 4.340694189071655 - }, - { - "auxiliary_loss_clip": 0.01123471, - "auxiliary_loss_mlp": 0.01042567, - "balance_loss_clip": 1.04852343, - "balance_loss_mlp": 1.02599669, - "epoch": 0.28799038027957313, - "flos": 23805471144960.0, - "grad_norm": 1.7615007973154742, - "language_loss": 0.84425223, - "learning_rate": 3.341480346078704e-06, - "loss": 0.86591256, - "num_input_tokens_seen": 103337695, - "step": 4790, - "time_per_iteration": 2.6953821182250977 - }, - { - "auxiliary_loss_clip": 0.01136004, - "auxiliary_loss_mlp": 0.01049022, - "balance_loss_clip": 1.05240703, - "balance_loss_mlp": 1.03145027, - "epoch": 0.2880505035322411, - "flos": 22344122223360.0, - "grad_norm": 1.743209341690147, - "language_loss": 0.78031182, - "learning_rate": 3.3411914598756922e-06, - "loss": 0.80216199, - "num_input_tokens_seen": 103357010, - "step": 4791, - "time_per_iteration": 4.299259424209595 - }, - { - "auxiliary_loss_clip": 0.01120123, - "auxiliary_loss_mlp": 0.01036962, - "balance_loss_clip": 1.05015528, - "balance_loss_mlp": 1.01999843, - "epoch": 0.28811062678490906, - "flos": 18004246208640.0, - "grad_norm": 2.2148694233914474, - "language_loss": 0.70164073, - "learning_rate": 3.3409025228145654e-06, - "loss": 0.72321159, - "num_input_tokens_seen": 103375600, - "step": 4792, - "time_per_iteration": 2.646732807159424 - }, - { - "auxiliary_loss_clip": 0.01107079, - "auxiliary_loss_mlp": 0.01037734, - "balance_loss_clip": 1.05645919, - "balance_loss_mlp": 1.02149773, - "epoch": 0.28817075003757703, - "flos": 22090880361600.0, - "grad_norm": 1.9192442052106609, - "language_loss": 0.79200894, - "learning_rate": 3.3406135349062812e-06, - "loss": 0.81345713, - "num_input_tokens_seen": 103395225, - "step": 4793, - "time_per_iteration": 2.765010356903076 - }, - { - "auxiliary_loss_clip": 0.01117839, - "auxiliary_loss_mlp": 0.01038019, - "balance_loss_clip": 1.05114603, - "balance_loss_mlp": 1.02235532, - "epoch": 0.288230873290245, - "flos": 41683130847360.0, - "grad_norm": 1.7689864288971164, - "language_loss": 0.78136635, - "learning_rate": 3.340324496161797e-06, - "loss": 0.80292487, - "num_input_tokens_seen": 103417245, - "step": 4794, - "time_per_iteration": 2.868473529815674 - }, - { - "auxiliary_loss_clip": 0.01134193, - "auxiliary_loss_mlp": 0.0104583, - "balance_loss_clip": 1.05259347, - "balance_loss_mlp": 1.02856886, - "epoch": 0.28829099654291296, - "flos": 18624423456000.0, - "grad_norm": 2.1692523829597063, - "language_loss": 0.8320052, - "learning_rate": 3.340035406592074e-06, - "loss": 0.85380542, - "num_input_tokens_seen": 103435500, - "step": 4795, - "time_per_iteration": 2.6216471195220947 - }, - { - "auxiliary_loss_clip": 0.01126764, - "auxiliary_loss_mlp": 0.01043565, - "balance_loss_clip": 1.05043364, - "balance_loss_mlp": 1.0279845, - "epoch": 0.2883511197955809, - "flos": 24674832017280.0, - "grad_norm": 2.290853867887048, - "language_loss": 0.74744678, - "learning_rate": 3.339746266208074e-06, - "loss": 0.76915002, - "num_input_tokens_seen": 103451040, - "step": 4796, - "time_per_iteration": 2.6819822788238525 - }, - { - "auxiliary_loss_clip": 0.01136938, - "auxiliary_loss_mlp": 0.01040822, - "balance_loss_clip": 1.05140758, - "balance_loss_mlp": 1.02221298, - "epoch": 0.2884112430482489, - "flos": 23112143850240.0, - "grad_norm": 1.9890524806298786, - "language_loss": 0.73144913, - "learning_rate": 3.3394570750207614e-06, - "loss": 0.7532267, - "num_input_tokens_seen": 103471330, - "step": 4797, - "time_per_iteration": 2.666097640991211 - }, - { - "auxiliary_loss_clip": 0.01104454, - "auxiliary_loss_mlp": 0.00775335, - "balance_loss_clip": 1.04594803, - "balance_loss_mlp": 1.00097072, - "epoch": 0.28847136630091685, - "flos": 16873347432960.0, - "grad_norm": 1.9324008515617646, - "language_loss": 0.74650872, - "learning_rate": 3.3391678330411017e-06, - "loss": 0.76530659, - "num_input_tokens_seen": 103488060, - "step": 4798, - "time_per_iteration": 2.7281830310821533 - }, - { - "auxiliary_loss_clip": 0.0113412, - "auxiliary_loss_mlp": 0.01043523, - "balance_loss_clip": 1.04996431, - "balance_loss_mlp": 1.02463984, - "epoch": 0.2885314895535849, - "flos": 25657527277440.0, - "grad_norm": 3.037553219769834, - "language_loss": 0.66004431, - "learning_rate": 3.3388785402800642e-06, - "loss": 0.68182075, - "num_input_tokens_seen": 103503600, - "step": 4799, - "time_per_iteration": 2.6416096687316895 - }, - { - "auxiliary_loss_clip": 0.01144575, - "auxiliary_loss_mlp": 0.01049843, - "balance_loss_clip": 1.05205584, - "balance_loss_mlp": 1.03268862, - "epoch": 0.28859161280625284, - "flos": 21107251347840.0, - "grad_norm": 1.7946911133370596, - "language_loss": 0.8231616, - "learning_rate": 3.3385891967486178e-06, - "loss": 0.84510577, - "num_input_tokens_seen": 103524195, - "step": 4800, - "time_per_iteration": 2.704357624053955 - }, - { - "auxiliary_loss_clip": 0.01105166, - "auxiliary_loss_mlp": 0.01040519, - "balance_loss_clip": 1.04861474, - "balance_loss_mlp": 1.02392507, - "epoch": 0.2886517360589208, - "flos": 26469540086400.0, - "grad_norm": 1.5930665564066124, - "language_loss": 0.9080106, - "learning_rate": 3.3382998024577347e-06, - "loss": 0.92946744, - "num_input_tokens_seen": 103545235, - "step": 4801, - "time_per_iteration": 2.8163902759552 - }, - { - "auxiliary_loss_clip": 0.01119221, - "auxiliary_loss_mlp": 0.00775037, - "balance_loss_clip": 1.05178905, - "balance_loss_mlp": 1.0008862, - "epoch": 0.28871185931158877, - "flos": 25265275781760.0, - "grad_norm": 2.098995863955026, - "language_loss": 0.74342406, - "learning_rate": 3.33801035741839e-06, - "loss": 0.76236671, - "num_input_tokens_seen": 103563305, - "step": 4802, - "time_per_iteration": 2.8244271278381348 - }, - { - "auxiliary_loss_clip": 0.01029511, - "auxiliary_loss_mlp": 0.01004263, - "balance_loss_clip": 1.02472734, - "balance_loss_mlp": 1.00193822, - "epoch": 0.28877198256425674, - "flos": 66665431284480.0, - "grad_norm": 0.7780596068321518, - "language_loss": 0.62987334, - "learning_rate": 3.337720861641558e-06, - "loss": 0.65021104, - "num_input_tokens_seen": 103625025, - "step": 4803, - "time_per_iteration": 3.299269676208496 - }, - { - "auxiliary_loss_clip": 0.01083739, - "auxiliary_loss_mlp": 0.01051002, - "balance_loss_clip": 1.03981495, - "balance_loss_mlp": 1.03369915, - "epoch": 0.2888321058169247, - "flos": 20303031790080.0, - "grad_norm": 1.8528386679599225, - "language_loss": 0.71095157, - "learning_rate": 3.3374313151382165e-06, - "loss": 0.73229897, - "num_input_tokens_seen": 103644235, - "step": 4804, - "time_per_iteration": 2.762883424758911 - }, - { - "auxiliary_loss_clip": 0.01135071, - "auxiliary_loss_mlp": 0.01047534, - "balance_loss_clip": 1.05108273, - "balance_loss_mlp": 1.0289135, - "epoch": 0.28889222906959267, - "flos": 25516721963520.0, - "grad_norm": 1.926588918304246, - "language_loss": 0.67916834, - "learning_rate": 3.337141717919346e-06, - "loss": 0.70099443, - "num_input_tokens_seen": 103664700, - "step": 4805, - "time_per_iteration": 2.6848111152648926 - }, - { - "auxiliary_loss_clip": 0.01135111, - "auxiliary_loss_mlp": 0.01046638, - "balance_loss_clip": 1.05359602, - "balance_loss_mlp": 1.03029394, - "epoch": 0.28895235232226063, - "flos": 32671312560000.0, - "grad_norm": 1.4381182508216341, - "language_loss": 0.69720542, - "learning_rate": 3.3368520699959272e-06, - "loss": 0.71902293, - "num_input_tokens_seen": 103686595, - "step": 4806, - "time_per_iteration": 2.762458562850952 - }, - { - "auxiliary_loss_clip": 0.01120642, - "auxiliary_loss_mlp": 0.01052311, - "balance_loss_clip": 1.05073118, - "balance_loss_mlp": 1.03559768, - "epoch": 0.2890124755749286, - "flos": 29714679342720.0, - "grad_norm": 1.4600495853323927, - "language_loss": 0.71255589, - "learning_rate": 3.3365623713789443e-06, - "loss": 0.73428547, - "num_input_tokens_seen": 103707525, - "step": 4807, - "time_per_iteration": 2.740931987762451 - }, - { - "auxiliary_loss_clip": 0.01106054, - "auxiliary_loss_mlp": 0.01043407, - "balance_loss_clip": 1.05087459, - "balance_loss_mlp": 1.02625299, - "epoch": 0.28907259882759656, - "flos": 22674464628480.0, - "grad_norm": 1.6111027163793539, - "language_loss": 0.81489629, - "learning_rate": 3.336272622079382e-06, - "loss": 0.83639085, - "num_input_tokens_seen": 103727905, - "step": 4808, - "time_per_iteration": 2.722787380218506 - }, - { - "auxiliary_loss_clip": 0.01098162, - "auxiliary_loss_mlp": 0.01048507, - "balance_loss_clip": 1.04795146, - "balance_loss_mlp": 1.03160298, - "epoch": 0.2891327220802645, - "flos": 22566050403840.0, - "grad_norm": 1.7874609682529725, - "language_loss": 0.78304112, - "learning_rate": 3.3359828221082276e-06, - "loss": 0.80450785, - "num_input_tokens_seen": 103748335, - "step": 4809, - "time_per_iteration": 2.742063522338867 - }, - { - "auxiliary_loss_clip": 0.01091743, - "auxiliary_loss_mlp": 0.01047553, - "balance_loss_clip": 1.04519784, - "balance_loss_mlp": 1.02924204, - "epoch": 0.2891928453329325, - "flos": 21652806090240.0, - "grad_norm": 1.7709564567634208, - "language_loss": 0.78864932, - "learning_rate": 3.3356929714764714e-06, - "loss": 0.81004226, - "num_input_tokens_seen": 103767020, - "step": 4810, - "time_per_iteration": 2.7578415870666504 - }, - { - "auxiliary_loss_clip": 0.01090252, - "auxiliary_loss_mlp": 0.01039009, - "balance_loss_clip": 1.04552603, - "balance_loss_mlp": 1.02280235, - "epoch": 0.28925296858560046, - "flos": 23222102359680.0, - "grad_norm": 1.6298276151024105, - "language_loss": 0.76974982, - "learning_rate": 3.3354030701951032e-06, - "loss": 0.79104245, - "num_input_tokens_seen": 103786355, - "step": 4811, - "time_per_iteration": 2.7336831092834473 - }, - { - "auxiliary_loss_clip": 0.01132677, - "auxiliary_loss_mlp": 0.01047674, - "balance_loss_clip": 1.05356216, - "balance_loss_mlp": 1.03038859, - "epoch": 0.2893130918382685, - "flos": 28621666437120.0, - "grad_norm": 1.4740946425962824, - "language_loss": 0.77044773, - "learning_rate": 3.335113118275117e-06, - "loss": 0.79225123, - "num_input_tokens_seen": 103809345, - "step": 4812, - "time_per_iteration": 2.745115280151367 - }, - { - "auxiliary_loss_clip": 0.01024348, - "auxiliary_loss_mlp": 0.01009076, - "balance_loss_clip": 1.02794337, - "balance_loss_mlp": 1.00728762, - "epoch": 0.28937321509093644, - "flos": 72301288982400.0, - "grad_norm": 0.8337141037006477, - "language_loss": 0.60292435, - "learning_rate": 3.3348231157275085e-06, - "loss": 0.62325859, - "num_input_tokens_seen": 103871180, - "step": 4813, - "time_per_iteration": 3.3592262268066406 - }, - { - "auxiliary_loss_clip": 0.01094544, - "auxiliary_loss_mlp": 0.01044805, - "balance_loss_clip": 1.0431211, - "balance_loss_mlp": 1.02734065, - "epoch": 0.2894333383436044, - "flos": 16216397637120.0, - "grad_norm": 3.1340543474440623, - "language_loss": 0.82301223, - "learning_rate": 3.3345330625632725e-06, - "loss": 0.84440577, - "num_input_tokens_seen": 103889040, - "step": 4814, - "time_per_iteration": 2.7069244384765625 - }, - { - "auxiliary_loss_clip": 0.01101478, - "auxiliary_loss_mlp": 0.01052591, - "balance_loss_clip": 1.05051374, - "balance_loss_mlp": 1.03556752, - "epoch": 0.2894934615962724, - "flos": 24828278918400.0, - "grad_norm": 1.6672038490985601, - "language_loss": 0.73249441, - "learning_rate": 3.3342429587934094e-06, - "loss": 0.75403512, - "num_input_tokens_seen": 103910380, - "step": 4815, - "time_per_iteration": 2.764214515686035 - }, - { - "auxiliary_loss_clip": 0.01131126, - "auxiliary_loss_mlp": 0.01045124, - "balance_loss_clip": 1.05259883, - "balance_loss_mlp": 1.02997231, - "epoch": 0.28955358484894034, - "flos": 20449978329600.0, - "grad_norm": 1.9821106518618066, - "language_loss": 0.70783043, - "learning_rate": 3.3339528044289198e-06, - "loss": 0.72959292, - "num_input_tokens_seen": 103929955, - "step": 4816, - "time_per_iteration": 2.7809629440307617 - }, - { - "auxiliary_loss_clip": 0.01119261, - "auxiliary_loss_mlp": 0.01048806, - "balance_loss_clip": 1.04862189, - "balance_loss_mlp": 1.03097248, - "epoch": 0.2896137081016083, - "flos": 22565188477440.0, - "grad_norm": 2.3636227133284122, - "language_loss": 0.7445122, - "learning_rate": 3.3336625994808055e-06, - "loss": 0.76619279, - "num_input_tokens_seen": 103948020, - "step": 4817, - "time_per_iteration": 2.829183578491211 - }, - { - "auxiliary_loss_clip": 0.01108198, - "auxiliary_loss_mlp": 0.01054129, - "balance_loss_clip": 1.05107522, - "balance_loss_mlp": 1.03633142, - "epoch": 0.28967383135427627, - "flos": 26687948734080.0, - "grad_norm": 1.8479613371686012, - "language_loss": 0.76190692, - "learning_rate": 3.3333723439600723e-06, - "loss": 0.78353024, - "num_input_tokens_seen": 103968740, - "step": 4818, - "time_per_iteration": 2.827925443649292 - }, - { - "auxiliary_loss_clip": 0.01074516, - "auxiliary_loss_mlp": 0.01041914, - "balance_loss_clip": 1.04805899, - "balance_loss_mlp": 1.02477193, - "epoch": 0.28973395460694423, - "flos": 15558262692480.0, - "grad_norm": 1.9558897556763024, - "language_loss": 0.80060315, - "learning_rate": 3.3330820378777263e-06, - "loss": 0.82176751, - "num_input_tokens_seen": 103986005, - "step": 4819, - "time_per_iteration": 2.8941574096679688 - }, - { - "auxiliary_loss_clip": 0.01110223, - "auxiliary_loss_mlp": 0.01048219, - "balance_loss_clip": 1.0494163, - "balance_loss_mlp": 1.02931273, - "epoch": 0.2897940778596122, - "flos": 18697465762560.0, - "grad_norm": 1.8074124972104149, - "language_loss": 0.78504574, - "learning_rate": 3.332791681244776e-06, - "loss": 0.80663019, - "num_input_tokens_seen": 104005070, - "step": 4820, - "time_per_iteration": 2.7016515731811523 - }, - { - "auxiliary_loss_clip": 0.01096478, - "auxiliary_loss_mlp": 0.01037037, - "balance_loss_clip": 1.04924846, - "balance_loss_mlp": 1.02028775, - "epoch": 0.28985420111228016, - "flos": 18770292587520.0, - "grad_norm": 2.105369007151224, - "language_loss": 0.72925651, - "learning_rate": 3.332501274072231e-06, - "loss": 0.7505917, - "num_input_tokens_seen": 104022945, - "step": 4821, - "time_per_iteration": 2.743091583251953 - }, - { - "auxiliary_loss_clip": 0.01132782, - "auxiliary_loss_mlp": 0.01040556, - "balance_loss_clip": 1.05055594, - "balance_loss_mlp": 1.02290142, - "epoch": 0.28991432436494813, - "flos": 23069840607360.0, - "grad_norm": 2.331696646407205, - "language_loss": 0.71962738, - "learning_rate": 3.332210816371104e-06, - "loss": 0.74136078, - "num_input_tokens_seen": 104042080, - "step": 4822, - "time_per_iteration": 2.768996477127075 - }, - { - "auxiliary_loss_clip": 0.01128837, - "auxiliary_loss_mlp": 0.01048176, - "balance_loss_clip": 1.05237818, - "balance_loss_mlp": 1.03142738, - "epoch": 0.2899744476176161, - "flos": 17603195880960.0, - "grad_norm": 1.8111020118629353, - "language_loss": 0.662521, - "learning_rate": 3.3319203081524102e-06, - "loss": 0.68429112, - "num_input_tokens_seen": 104060975, - "step": 4823, - "time_per_iteration": 2.733591318130493 - }, - { - "auxiliary_loss_clip": 0.01107872, - "auxiliary_loss_mlp": 0.01042255, - "balance_loss_clip": 1.04404497, - "balance_loss_mlp": 1.02588761, - "epoch": 0.29003457087028406, - "flos": 22309360836480.0, - "grad_norm": 4.579803152663717, - "language_loss": 0.81162238, - "learning_rate": 3.331629749427164e-06, - "loss": 0.83312368, - "num_input_tokens_seen": 104081395, - "step": 4824, - "time_per_iteration": 4.278540849685669 - }, - { - "auxiliary_loss_clip": 0.01143667, - "auxiliary_loss_mlp": 0.01043888, - "balance_loss_clip": 1.05104661, - "balance_loss_mlp": 1.025828, - "epoch": 0.2900946941229521, - "flos": 21944975316480.0, - "grad_norm": 2.265114761106369, - "language_loss": 0.72592747, - "learning_rate": 3.331339140206385e-06, - "loss": 0.74780297, - "num_input_tokens_seen": 104099995, - "step": 4825, - "time_per_iteration": 4.177908658981323 - }, - { - "auxiliary_loss_clip": 0.01147795, - "auxiliary_loss_mlp": 0.01036998, - "balance_loss_clip": 1.05434549, - "balance_loss_mlp": 1.01930714, - "epoch": 0.29015481737562004, - "flos": 17932173569280.0, - "grad_norm": 2.216571865047856, - "language_loss": 0.73680669, - "learning_rate": 3.331048480501092e-06, - "loss": 0.75865459, - "num_input_tokens_seen": 104118930, - "step": 4826, - "time_per_iteration": 2.6371700763702393 - }, - { - "auxiliary_loss_clip": 0.0113072, - "auxiliary_loss_mlp": 0.01040585, - "balance_loss_clip": 1.05073726, - "balance_loss_mlp": 1.02483773, - "epoch": 0.290214940628288, - "flos": 22783525297920.0, - "grad_norm": 2.324527624383577, - "language_loss": 0.68556225, - "learning_rate": 3.3307577703223073e-06, - "loss": 0.70727527, - "num_input_tokens_seen": 104136940, - "step": 4827, - "time_per_iteration": 2.6447484493255615 - }, - { - "auxiliary_loss_clip": 0.01125924, - "auxiliary_loss_mlp": 0.0104453, - "balance_loss_clip": 1.04981911, - "balance_loss_mlp": 1.02650571, - "epoch": 0.290275063880956, - "flos": 20006481104640.0, - "grad_norm": 1.8485927197530279, - "language_loss": 0.80266023, - "learning_rate": 3.3304670096810545e-06, - "loss": 0.82436466, - "num_input_tokens_seen": 104154280, - "step": 4828, - "time_per_iteration": 4.131803274154663 - }, - { - "auxiliary_loss_clip": 0.01144317, - "auxiliary_loss_mlp": 0.01049939, - "balance_loss_clip": 1.05393863, - "balance_loss_mlp": 1.03288054, - "epoch": 0.29033518713362394, - "flos": 22053605022720.0, - "grad_norm": 1.8003854621941846, - "language_loss": 0.80658895, - "learning_rate": 3.33017619858836e-06, - "loss": 0.8285315, - "num_input_tokens_seen": 104172605, - "step": 4829, - "time_per_iteration": 2.760899066925049 - }, - { - "auxiliary_loss_clip": 0.011197, - "auxiliary_loss_mlp": 0.01044046, - "balance_loss_clip": 1.05093288, - "balance_loss_mlp": 1.02680826, - "epoch": 0.2903953103862919, - "flos": 25630056351360.0, - "grad_norm": 1.5734536519128175, - "language_loss": 0.82911146, - "learning_rate": 3.329885337055249e-06, - "loss": 0.85074902, - "num_input_tokens_seen": 104194120, - "step": 4830, - "time_per_iteration": 4.403480529785156 - }, - { - "auxiliary_loss_clip": 0.01137563, - "auxiliary_loss_mlp": 0.01048934, - "balance_loss_clip": 1.05430257, - "balance_loss_mlp": 1.03155351, - "epoch": 0.29045543363895987, - "flos": 16945851035520.0, - "grad_norm": 2.2586543311689486, - "language_loss": 0.79236752, - "learning_rate": 3.3295944250927546e-06, - "loss": 0.81423253, - "num_input_tokens_seen": 104210875, - "step": 4831, - "time_per_iteration": 2.6066412925720215 - }, - { - "auxiliary_loss_clip": 0.01143728, - "auxiliary_loss_mlp": 0.01045824, - "balance_loss_clip": 1.05470276, - "balance_loss_mlp": 1.03000546, - "epoch": 0.29051555689162784, - "flos": 26395492199040.0, - "grad_norm": 1.9694662738232038, - "language_loss": 0.7459774, - "learning_rate": 3.3293034627119055e-06, - "loss": 0.76787293, - "num_input_tokens_seen": 104229875, - "step": 4832, - "time_per_iteration": 2.8411331176757812 - }, - { - "auxiliary_loss_clip": 0.01122405, - "auxiliary_loss_mlp": 0.01037758, - "balance_loss_clip": 1.05429769, - "balance_loss_mlp": 1.02335787, - "epoch": 0.2905756801442958, - "flos": 21103875469440.0, - "grad_norm": 1.979215737756815, - "language_loss": 0.76150024, - "learning_rate": 3.329012449923736e-06, - "loss": 0.78310186, - "num_input_tokens_seen": 104250405, - "step": 4833, - "time_per_iteration": 2.7510006427764893 - }, - { - "auxiliary_loss_clip": 0.01107016, - "auxiliary_loss_mlp": 0.01040024, - "balance_loss_clip": 1.04580688, - "balance_loss_mlp": 1.02383542, - "epoch": 0.29063580339696377, - "flos": 15706071158400.0, - "grad_norm": 1.7715964188803632, - "language_loss": 0.64404124, - "learning_rate": 3.3287213867392813e-06, - "loss": 0.66551173, - "num_input_tokens_seen": 104269185, - "step": 4834, - "time_per_iteration": 2.6475064754486084 - }, - { - "auxiliary_loss_clip": 0.01117159, - "auxiliary_loss_mlp": 0.01032155, - "balance_loss_clip": 1.05111325, - "balance_loss_mlp": 1.01724815, - "epoch": 0.29069592664963173, - "flos": 24644990793600.0, - "grad_norm": 1.4640588842294755, - "language_loss": 0.71717769, - "learning_rate": 3.3284302731695783e-06, - "loss": 0.73867083, - "num_input_tokens_seen": 104289400, - "step": 4835, - "time_per_iteration": 2.6991324424743652 - }, - { - "auxiliary_loss_clip": 0.01117393, - "auxiliary_loss_mlp": 0.01037314, - "balance_loss_clip": 1.04881835, - "balance_loss_mlp": 1.02187634, - "epoch": 0.2907560499022997, - "flos": 24973753000320.0, - "grad_norm": 1.657223137158586, - "language_loss": 0.79492378, - "learning_rate": 3.3281391092256668e-06, - "loss": 0.81647086, - "num_input_tokens_seen": 104310485, - "step": 4836, - "time_per_iteration": 2.7060084342956543 - }, - { - "auxiliary_loss_clip": 0.01107347, - "auxiliary_loss_mlp": 0.01045193, - "balance_loss_clip": 1.05334711, - "balance_loss_mlp": 1.02744293, - "epoch": 0.29081617315496766, - "flos": 18657496903680.0, - "grad_norm": 1.9442300400082562, - "language_loss": 0.81372344, - "learning_rate": 3.3278478949185865e-06, - "loss": 0.83524883, - "num_input_tokens_seen": 104327330, - "step": 4837, - "time_per_iteration": 2.640610933303833 - }, - { - "auxiliary_loss_clip": 0.01116355, - "auxiliary_loss_mlp": 0.01039398, - "balance_loss_clip": 1.04938102, - "balance_loss_mlp": 1.0233283, - "epoch": 0.2908762964076356, - "flos": 35331035955840.0, - "grad_norm": 6.209911556378307, - "language_loss": 0.67358792, - "learning_rate": 3.327556630259381e-06, - "loss": 0.69514549, - "num_input_tokens_seen": 104350350, - "step": 4838, - "time_per_iteration": 2.758422374725342 - }, - { - "auxiliary_loss_clip": 0.01147958, - "auxiliary_loss_mlp": 0.00775113, - "balance_loss_clip": 1.05402315, - "balance_loss_mlp": 1.00096607, - "epoch": 0.29093641966030365, - "flos": 23076305055360.0, - "grad_norm": 1.5628414298261506, - "language_loss": 0.71139944, - "learning_rate": 3.327265315259095e-06, - "loss": 0.73063016, - "num_input_tokens_seen": 104369995, - "step": 4839, - "time_per_iteration": 2.683349132537842 - }, - { - "auxiliary_loss_clip": 0.0114095, - "auxiliary_loss_mlp": 0.01036937, - "balance_loss_clip": 1.04966319, - "balance_loss_mlp": 1.02147555, - "epoch": 0.2909965429129716, - "flos": 35955415094400.0, - "grad_norm": 1.9403130873020338, - "language_loss": 0.7539593, - "learning_rate": 3.326973949928776e-06, - "loss": 0.77573812, - "num_input_tokens_seen": 104392285, - "step": 4840, - "time_per_iteration": 2.696808099746704 - }, - { - "auxiliary_loss_clip": 0.01093571, - "auxiliary_loss_mlp": 0.01045095, - "balance_loss_clip": 1.04470551, - "balance_loss_mlp": 1.02825069, - "epoch": 0.2910566661656396, - "flos": 30880231764480.0, - "grad_norm": 1.7841334294021773, - "language_loss": 0.60546595, - "learning_rate": 3.326682534279471e-06, - "loss": 0.62685257, - "num_input_tokens_seen": 104412640, - "step": 4841, - "time_per_iteration": 2.74575138092041 - }, - { - "auxiliary_loss_clip": 0.01120271, - "auxiliary_loss_mlp": 0.01039624, - "balance_loss_clip": 1.04983509, - "balance_loss_mlp": 1.02288651, - "epoch": 0.29111678941830754, - "flos": 30010188533760.0, - "grad_norm": 1.408353605568525, - "language_loss": 0.71321762, - "learning_rate": 3.326391068322232e-06, - "loss": 0.73481655, - "num_input_tokens_seen": 104435245, - "step": 4842, - "time_per_iteration": 2.7568962574005127 - }, - { - "auxiliary_loss_clip": 0.01130885, - "auxiliary_loss_mlp": 0.01037088, - "balance_loss_clip": 1.05042899, - "balance_loss_mlp": 1.02191257, - "epoch": 0.2911769126709755, - "flos": 22857393617280.0, - "grad_norm": 2.1183002067983585, - "language_loss": 0.73610562, - "learning_rate": 3.3260995520681098e-06, - "loss": 0.75778532, - "num_input_tokens_seen": 104455395, - "step": 4843, - "time_per_iteration": 2.6703171730041504 - }, - { - "auxiliary_loss_clip": 0.0108851, - "auxiliary_loss_mlp": 0.01036244, - "balance_loss_clip": 1.04775739, - "balance_loss_mlp": 1.02058005, - "epoch": 0.2912370359236435, - "flos": 21650507619840.0, - "grad_norm": 4.868884277111801, - "language_loss": 0.58445942, - "learning_rate": 3.3258079855281602e-06, - "loss": 0.60570699, - "num_input_tokens_seen": 104473350, - "step": 4844, - "time_per_iteration": 2.7461965084075928 - }, - { - "auxiliary_loss_clip": 0.01138917, - "auxiliary_loss_mlp": 0.01039428, - "balance_loss_clip": 1.05586743, - "balance_loss_mlp": 1.0222863, - "epoch": 0.29129715917631144, - "flos": 22893340152960.0, - "grad_norm": 1.9200815982611392, - "language_loss": 0.86459565, - "learning_rate": 3.3255163687134396e-06, - "loss": 0.88637912, - "num_input_tokens_seen": 104492265, - "step": 4845, - "time_per_iteration": 2.711101770401001 - }, - { - "auxiliary_loss_clip": 0.01115849, - "auxiliary_loss_mlp": 0.01052584, - "balance_loss_clip": 1.05018926, - "balance_loss_mlp": 1.03505993, - "epoch": 0.2913572824289794, - "flos": 22674464628480.0, - "grad_norm": 1.7226223126663984, - "language_loss": 0.67067879, - "learning_rate": 3.3252247016350046e-06, - "loss": 0.69236308, - "num_input_tokens_seen": 104510755, - "step": 4846, - "time_per_iteration": 2.698076009750366 - }, - { - "auxiliary_loss_clip": 0.01120746, - "auxiliary_loss_mlp": 0.01040428, - "balance_loss_clip": 1.05198884, - "balance_loss_mlp": 1.02457917, - "epoch": 0.29141740568164737, - "flos": 23107403255040.0, - "grad_norm": 1.9884880347168128, - "language_loss": 0.70629871, - "learning_rate": 3.3249329843039166e-06, - "loss": 0.7279104, - "num_input_tokens_seen": 104530830, - "step": 4847, - "time_per_iteration": 2.6693859100341797 - }, - { - "auxiliary_loss_clip": 0.01129385, - "auxiliary_loss_mlp": 0.01036362, - "balance_loss_clip": 1.0490911, - "balance_loss_mlp": 1.02048314, - "epoch": 0.29147752893431533, - "flos": 23587026583680.0, - "grad_norm": 1.4444788582363046, - "language_loss": 0.73975939, - "learning_rate": 3.324641216731237e-06, - "loss": 0.76141691, - "num_input_tokens_seen": 104550115, - "step": 4848, - "time_per_iteration": 2.779012680053711 - }, - { - "auxiliary_loss_clip": 0.0112526, - "auxiliary_loss_mlp": 0.01051811, - "balance_loss_clip": 1.04831481, - "balance_loss_mlp": 1.03391802, - "epoch": 0.2915376521869833, - "flos": 20591968792320.0, - "grad_norm": 3.067540232947916, - "language_loss": 0.76738584, - "learning_rate": 3.3243493989280295e-06, - "loss": 0.7891565, - "num_input_tokens_seen": 104566255, - "step": 4849, - "time_per_iteration": 2.6103999614715576 - }, - { - "auxiliary_loss_clip": 0.01124372, - "auxiliary_loss_mlp": 0.01041862, - "balance_loss_clip": 1.04718697, - "balance_loss_mlp": 1.02541125, - "epoch": 0.29159777543965126, - "flos": 20811490761600.0, - "grad_norm": 1.7266499063872853, - "language_loss": 0.78276592, - "learning_rate": 3.3240575309053596e-06, - "loss": 0.80442822, - "num_input_tokens_seen": 104585235, - "step": 4850, - "time_per_iteration": 2.6395609378814697 - }, - { - "auxiliary_loss_clip": 0.01111964, - "auxiliary_loss_mlp": 0.01038044, - "balance_loss_clip": 1.04907775, - "balance_loss_mlp": 1.0209378, - "epoch": 0.29165789869231923, - "flos": 24244155947520.0, - "grad_norm": 1.8024770323318549, - "language_loss": 0.7657702, - "learning_rate": 3.323765612674296e-06, - "loss": 0.78727031, - "num_input_tokens_seen": 104605315, - "step": 4851, - "time_per_iteration": 2.7265985012054443 - }, - { - "auxiliary_loss_clip": 0.01132156, - "auxiliary_loss_mlp": 0.01045641, - "balance_loss_clip": 1.052459, - "balance_loss_mlp": 1.03083527, - "epoch": 0.29171802194498725, - "flos": 28949925853440.0, - "grad_norm": 1.3639310788782566, - "language_loss": 0.77680421, - "learning_rate": 3.3234736442459078e-06, - "loss": 0.7985822, - "num_input_tokens_seen": 104626055, - "step": 4852, - "time_per_iteration": 2.7161712646484375 - }, - { - "auxiliary_loss_clip": 0.01120344, - "auxiliary_loss_mlp": 0.01051407, - "balance_loss_clip": 1.05108476, - "balance_loss_mlp": 1.03523064, - "epoch": 0.2917781451976552, - "flos": 22598226011520.0, - "grad_norm": 1.6397145219173752, - "language_loss": 0.7816534, - "learning_rate": 3.3231816256312665e-06, - "loss": 0.80337089, - "num_input_tokens_seen": 104646005, - "step": 4853, - "time_per_iteration": 2.748053789138794 - }, - { - "auxiliary_loss_clip": 0.01108012, - "auxiliary_loss_mlp": 0.01041349, - "balance_loss_clip": 1.04923177, - "balance_loss_mlp": 1.02535105, - "epoch": 0.2918382684503232, - "flos": 21574448570880.0, - "grad_norm": 2.273586870261815, - "language_loss": 0.8791436, - "learning_rate": 3.322889556841445e-06, - "loss": 0.90063715, - "num_input_tokens_seen": 104661620, - "step": 4854, - "time_per_iteration": 2.7663791179656982 - }, - { - "auxiliary_loss_clip": 0.01128591, - "auxiliary_loss_mlp": 0.01054226, - "balance_loss_clip": 1.05255818, - "balance_loss_mlp": 1.03502131, - "epoch": 0.29189839170299114, - "flos": 24353503925760.0, - "grad_norm": 1.7143523369489482, - "language_loss": 0.86374146, - "learning_rate": 3.322597437887519e-06, - "loss": 0.88556957, - "num_input_tokens_seen": 104681445, - "step": 4855, - "time_per_iteration": 2.613903284072876 - }, - { - "auxiliary_loss_clip": 0.01039808, - "auxiliary_loss_mlp": 0.01005184, - "balance_loss_clip": 1.02170599, - "balance_loss_mlp": 1.00303864, - "epoch": 0.2919585149556591, - "flos": 71316726215040.0, - "grad_norm": 0.7954079009769616, - "language_loss": 0.60148996, - "learning_rate": 3.322305268780566e-06, - "loss": 0.6219399, - "num_input_tokens_seen": 104747945, - "step": 4856, - "time_per_iteration": 3.273501396179199 - }, - { - "auxiliary_loss_clip": 0.01115701, - "auxiliary_loss_mlp": 0.00774991, - "balance_loss_clip": 1.04708552, - "balance_loss_mlp": 1.00107539, - "epoch": 0.2920186382083271, - "flos": 15633208419840.0, - "grad_norm": 1.7540806356878256, - "language_loss": 0.6825304, - "learning_rate": 3.322013049531664e-06, - "loss": 0.70143735, - "num_input_tokens_seen": 104766225, - "step": 4857, - "time_per_iteration": 2.6799964904785156 - }, - { - "auxiliary_loss_clip": 0.01129839, - "auxiliary_loss_mlp": 0.00774071, - "balance_loss_clip": 1.05058599, - "balance_loss_mlp": 1.00106227, - "epoch": 0.29207876146099504, - "flos": 28366018364160.0, - "grad_norm": 1.9069678720023968, - "language_loss": 0.83446503, - "learning_rate": 3.321720780151895e-06, - "loss": 0.85350412, - "num_input_tokens_seen": 104785345, - "step": 4858, - "time_per_iteration": 2.7004997730255127 - }, - { - "auxiliary_loss_clip": 0.01143419, - "auxiliary_loss_mlp": 0.01047414, - "balance_loss_clip": 1.05265319, - "balance_loss_mlp": 1.03119004, - "epoch": 0.292138884713663, - "flos": 21870963342720.0, - "grad_norm": 1.7162042036272904, - "language_loss": 0.77357888, - "learning_rate": 3.321428460652342e-06, - "loss": 0.79548717, - "num_input_tokens_seen": 104804560, - "step": 4859, - "time_per_iteration": 2.5901620388031006 - }, - { - "auxiliary_loss_clip": 0.01105726, - "auxiliary_loss_mlp": 0.01044957, - "balance_loss_clip": 1.05237806, - "balance_loss_mlp": 1.02816057, - "epoch": 0.29219900796633097, - "flos": 20992552243200.0, - "grad_norm": 2.2554676354860246, - "language_loss": 0.68046212, - "learning_rate": 3.3211360910440885e-06, - "loss": 0.70196903, - "num_input_tokens_seen": 104821105, - "step": 4860, - "time_per_iteration": 2.7831058502197266 - }, - { - "auxiliary_loss_clip": 0.01117304, - "auxiliary_loss_mlp": 0.01041096, - "balance_loss_clip": 1.05229402, - "balance_loss_mlp": 1.02662396, - "epoch": 0.29225913121899894, - "flos": 35004608133120.0, - "grad_norm": 2.539974445673703, - "language_loss": 0.75258791, - "learning_rate": 3.320843671338222e-06, - "loss": 0.77417195, - "num_input_tokens_seen": 104841440, - "step": 4861, - "time_per_iteration": 2.7506070137023926 - }, - { - "auxiliary_loss_clip": 0.01128031, - "auxiliary_loss_mlp": 0.0105121, - "balance_loss_clip": 1.04845262, - "balance_loss_mlp": 1.03620112, - "epoch": 0.2923192544716669, - "flos": 13515663888000.0, - "grad_norm": 3.0942357088370245, - "language_loss": 0.91498685, - "learning_rate": 3.320551201545832e-06, - "loss": 0.93677926, - "num_input_tokens_seen": 104858210, - "step": 4862, - "time_per_iteration": 2.589700937271118 - }, - { - "auxiliary_loss_clip": 0.01131947, - "auxiliary_loss_mlp": 0.01042917, - "balance_loss_clip": 1.05090141, - "balance_loss_mlp": 1.02786124, - "epoch": 0.29237937772433487, - "flos": 19463512141440.0, - "grad_norm": 2.2124063953391464, - "language_loss": 0.73112279, - "learning_rate": 3.320258681678008e-06, - "loss": 0.75287139, - "num_input_tokens_seen": 104875620, - "step": 4863, - "time_per_iteration": 4.142335653305054 - }, - { - "auxiliary_loss_clip": 0.01061699, - "auxiliary_loss_mlp": 0.01044676, - "balance_loss_clip": 1.04478168, - "balance_loss_mlp": 1.02934611, - "epoch": 0.29243950097700283, - "flos": 20850597694080.0, - "grad_norm": 1.893468710780351, - "language_loss": 0.77841508, - "learning_rate": 3.319966111745842e-06, - "loss": 0.79947883, - "num_input_tokens_seen": 104894600, - "step": 4864, - "time_per_iteration": 4.309613943099976 - }, - { - "auxiliary_loss_clip": 0.01102707, - "auxiliary_loss_mlp": 0.01050983, - "balance_loss_clip": 1.04593945, - "balance_loss_mlp": 1.03424644, - "epoch": 0.29249962422967085, - "flos": 23584225322880.0, - "grad_norm": 1.5703024458168264, - "language_loss": 0.81861019, - "learning_rate": 3.319673491760429e-06, - "loss": 0.84014714, - "num_input_tokens_seen": 104914530, - "step": 4865, - "time_per_iteration": 2.762397527694702 - }, - { - "auxiliary_loss_clip": 0.0109576, - "auxiliary_loss_mlp": 0.01046651, - "balance_loss_clip": 1.05265307, - "balance_loss_mlp": 1.02924657, - "epoch": 0.2925597474823388, - "flos": 22273342473600.0, - "grad_norm": 2.2072447614425554, - "language_loss": 0.85522473, - "learning_rate": 3.3193808217328645e-06, - "loss": 0.87664878, - "num_input_tokens_seen": 104933460, - "step": 4866, - "time_per_iteration": 2.8033764362335205 - }, - { - "auxiliary_loss_clip": 0.01110933, - "auxiliary_loss_mlp": 0.01039812, - "balance_loss_clip": 1.04811919, - "balance_loss_mlp": 1.02410054, - "epoch": 0.2926198707350068, - "flos": 34456108475520.0, - "grad_norm": 1.7213351696608077, - "language_loss": 0.75498515, - "learning_rate": 3.3190881016742476e-06, - "loss": 0.7764926, - "num_input_tokens_seen": 104954495, - "step": 4867, - "time_per_iteration": 4.2950732707977295 - }, - { - "auxiliary_loss_clip": 0.01083116, - "auxiliary_loss_mlp": 0.01052463, - "balance_loss_clip": 1.04825687, - "balance_loss_mlp": 1.03576183, - "epoch": 0.29267999398767475, - "flos": 20704153944960.0, - "grad_norm": 1.9203033465249189, - "language_loss": 0.73236179, - "learning_rate": 3.3187953315956776e-06, - "loss": 0.75371754, - "num_input_tokens_seen": 104971915, - "step": 4868, - "time_per_iteration": 2.775538921356201 - }, - { - "auxiliary_loss_clip": 0.01091396, - "auxiliary_loss_mlp": 0.01045538, - "balance_loss_clip": 1.04888034, - "balance_loss_mlp": 1.02836001, - "epoch": 0.2927401172403427, - "flos": 18368667642240.0, - "grad_norm": 1.663889887662616, - "language_loss": 0.74540651, - "learning_rate": 3.3185025115082566e-06, - "loss": 0.76677585, - "num_input_tokens_seen": 104991335, - "step": 4869, - "time_per_iteration": 2.734683036804199 - }, - { - "auxiliary_loss_clip": 0.01116568, - "auxiliary_loss_mlp": 0.01040323, - "balance_loss_clip": 1.050179, - "balance_loss_mlp": 1.02405143, - "epoch": 0.2928002404930107, - "flos": 26104041244800.0, - "grad_norm": 1.5721867242720646, - "language_loss": 0.76492888, - "learning_rate": 3.318209641423088e-06, - "loss": 0.78649783, - "num_input_tokens_seen": 105012015, - "step": 4870, - "time_per_iteration": 4.413575649261475 - }, - { - "auxiliary_loss_clip": 0.01133789, - "auxiliary_loss_mlp": 0.0105055, - "balance_loss_clip": 1.05237079, - "balance_loss_mlp": 1.0328114, - "epoch": 0.29286036374567864, - "flos": 21324726241920.0, - "grad_norm": 2.0174334678237655, - "language_loss": 0.6773119, - "learning_rate": 3.3179167213512777e-06, - "loss": 0.69915527, - "num_input_tokens_seen": 105031460, - "step": 4871, - "time_per_iteration": 2.68796706199646 - }, - { - "auxiliary_loss_clip": 0.01112736, - "auxiliary_loss_mlp": 0.01051475, - "balance_loss_clip": 1.04638386, - "balance_loss_mlp": 1.03515494, - "epoch": 0.2929204869983466, - "flos": 29569492569600.0, - "grad_norm": 4.945083241782643, - "language_loss": 0.77463269, - "learning_rate": 3.317623751303933e-06, - "loss": 0.79627478, - "num_input_tokens_seen": 105052965, - "step": 4872, - "time_per_iteration": 2.7679827213287354 - }, - { - "auxiliary_loss_clip": 0.01078644, - "auxiliary_loss_mlp": 0.01045822, - "balance_loss_clip": 1.0468123, - "balance_loss_mlp": 1.0273211, - "epoch": 0.2929806102510146, - "flos": 19058259922560.0, - "grad_norm": 1.9468785945114855, - "language_loss": 0.72814691, - "learning_rate": 3.317330731292164e-06, - "loss": 0.74939156, - "num_input_tokens_seen": 105071840, - "step": 4873, - "time_per_iteration": 2.8704919815063477 - }, - { - "auxiliary_loss_clip": 0.01135073, - "auxiliary_loss_mlp": 0.01044722, - "balance_loss_clip": 1.0525651, - "balance_loss_mlp": 1.02705503, - "epoch": 0.29304073350368254, - "flos": 21944221130880.0, - "grad_norm": 1.9420707280566882, - "language_loss": 0.78093398, - "learning_rate": 3.3170376613270812e-06, - "loss": 0.80273187, - "num_input_tokens_seen": 105089445, - "step": 4874, - "time_per_iteration": 2.6573073863983154 - }, - { - "auxiliary_loss_clip": 0.01093774, - "auxiliary_loss_mlp": 0.01045077, - "balance_loss_clip": 1.05151463, - "balance_loss_mlp": 1.02790475, - "epoch": 0.2931008567563505, - "flos": 15450818135040.0, - "grad_norm": 1.8901262824755785, - "language_loss": 0.77336359, - "learning_rate": 3.3167445414197985e-06, - "loss": 0.794752, - "num_input_tokens_seen": 105106210, - "step": 4875, - "time_per_iteration": 2.6960959434509277 - }, - { - "auxiliary_loss_clip": 0.01141436, - "auxiliary_loss_mlp": 0.01038673, - "balance_loss_clip": 1.05718327, - "balance_loss_mlp": 1.02218604, - "epoch": 0.29316098000901847, - "flos": 16983162288000.0, - "grad_norm": 1.556341262673854, - "language_loss": 0.69037539, - "learning_rate": 3.316451371581431e-06, - "loss": 0.71217644, - "num_input_tokens_seen": 105124200, - "step": 4876, - "time_per_iteration": 2.6719844341278076 - }, - { - "auxiliary_loss_clip": 0.01121768, - "auxiliary_loss_mlp": 0.01047732, - "balance_loss_clip": 1.04729414, - "balance_loss_mlp": 1.03105509, - "epoch": 0.29322110326168643, - "flos": 16357705741440.0, - "grad_norm": 2.0371531421747466, - "language_loss": 0.82111382, - "learning_rate": 3.316158151823096e-06, - "loss": 0.84280884, - "num_input_tokens_seen": 105140400, - "step": 4877, - "time_per_iteration": 2.632293462753296 - }, - { - "auxiliary_loss_clip": 0.01139233, - "auxiliary_loss_mlp": 0.01040634, - "balance_loss_clip": 1.05428672, - "balance_loss_mlp": 1.02392054, - "epoch": 0.29328122651435445, - "flos": 13990869843840.0, - "grad_norm": 3.614839551588232, - "language_loss": 0.67366385, - "learning_rate": 3.315864882155911e-06, - "loss": 0.69546252, - "num_input_tokens_seen": 105157535, - "step": 4878, - "time_per_iteration": 2.5839362144470215 - }, - { - "auxiliary_loss_clip": 0.01100237, - "auxiliary_loss_mlp": 0.01045253, - "balance_loss_clip": 1.04628241, - "balance_loss_mlp": 1.02817595, - "epoch": 0.2933413497670224, - "flos": 25264593423360.0, - "grad_norm": 2.0985622071445063, - "language_loss": 0.73632258, - "learning_rate": 3.3155715625909982e-06, - "loss": 0.75777751, - "num_input_tokens_seen": 105175185, - "step": 4879, - "time_per_iteration": 2.738429307937622 - }, - { - "auxiliary_loss_clip": 0.01104776, - "auxiliary_loss_mlp": 0.00776504, - "balance_loss_clip": 1.05266857, - "balance_loss_mlp": 1.00116253, - "epoch": 0.2934014730196904, - "flos": 32123746656000.0, - "grad_norm": 1.8172867500477656, - "language_loss": 0.66441375, - "learning_rate": 3.3152781931394803e-06, - "loss": 0.68322659, - "num_input_tokens_seen": 105194540, - "step": 4880, - "time_per_iteration": 2.7889339923858643 - }, - { - "auxiliary_loss_clip": 0.01130875, - "auxiliary_loss_mlp": 0.01049004, - "balance_loss_clip": 1.05021453, - "balance_loss_mlp": 1.03249359, - "epoch": 0.29346159627235835, - "flos": 24352498344960.0, - "grad_norm": 1.9971358437235982, - "language_loss": 0.70130688, - "learning_rate": 3.314984773812481e-06, - "loss": 0.72310567, - "num_input_tokens_seen": 105213215, - "step": 4881, - "time_per_iteration": 2.705906629562378 - }, - { - "auxiliary_loss_clip": 0.01112418, - "auxiliary_loss_mlp": 0.00775734, - "balance_loss_clip": 1.04823685, - "balance_loss_mlp": 1.00119698, - "epoch": 0.2935217195250263, - "flos": 22746752749440.0, - "grad_norm": 1.8949601379230998, - "language_loss": 0.83497417, - "learning_rate": 3.314691304621127e-06, - "loss": 0.85385573, - "num_input_tokens_seen": 105231585, - "step": 4882, - "time_per_iteration": 2.715853691101074 - }, - { - "auxiliary_loss_clip": 0.01148283, - "auxiliary_loss_mlp": 0.01045596, - "balance_loss_clip": 1.05350292, - "balance_loss_mlp": 1.02825117, - "epoch": 0.2935818427776943, - "flos": 21725561088000.0, - "grad_norm": 2.6750396503443827, - "language_loss": 0.71433568, - "learning_rate": 3.314397785576548e-06, - "loss": 0.73627448, - "num_input_tokens_seen": 105250120, - "step": 4883, - "time_per_iteration": 2.629642963409424 - }, - { - "auxiliary_loss_clip": 0.01123143, - "auxiliary_loss_mlp": 0.01040743, - "balance_loss_clip": 1.05262315, - "balance_loss_mlp": 1.0230521, - "epoch": 0.29364196603036224, - "flos": 23804968354560.0, - "grad_norm": 2.1262053984109226, - "language_loss": 0.92650437, - "learning_rate": 3.3141042166898726e-06, - "loss": 0.94814324, - "num_input_tokens_seen": 105266065, - "step": 4884, - "time_per_iteration": 2.727379322052002 - }, - { - "auxiliary_loss_clip": 0.01138638, - "auxiliary_loss_mlp": 0.01039707, - "balance_loss_clip": 1.05512667, - "balance_loss_mlp": 1.0232085, - "epoch": 0.2937020892830302, - "flos": 23470064922240.0, - "grad_norm": 2.19754538449792, - "language_loss": 0.73535883, - "learning_rate": 3.313810597972234e-06, - "loss": 0.75714231, - "num_input_tokens_seen": 105282155, - "step": 4885, - "time_per_iteration": 2.706212043762207 - }, - { - "auxiliary_loss_clip": 0.01124089, - "auxiliary_loss_mlp": 0.01045234, - "balance_loss_clip": 1.04882109, - "balance_loss_mlp": 1.02791286, - "epoch": 0.2937622125356982, - "flos": 24272740195200.0, - "grad_norm": 2.8259058407064566, - "language_loss": 0.84815478, - "learning_rate": 3.3135169294347655e-06, - "loss": 0.86984795, - "num_input_tokens_seen": 105299225, - "step": 4886, - "time_per_iteration": 2.651383876800537 - }, - { - "auxiliary_loss_clip": 0.01112051, - "auxiliary_loss_mlp": 0.01040147, - "balance_loss_clip": 1.04674077, - "balance_loss_mlp": 1.023839, - "epoch": 0.29382233578836614, - "flos": 20662461233280.0, - "grad_norm": 2.312079302728887, - "language_loss": 0.77030611, - "learning_rate": 3.313223211088603e-06, - "loss": 0.7918281, - "num_input_tokens_seen": 105315710, - "step": 4887, - "time_per_iteration": 2.8299317359924316 - }, - { - "auxiliary_loss_clip": 0.01121167, - "auxiliary_loss_mlp": 0.01044419, - "balance_loss_clip": 1.05137563, - "balance_loss_mlp": 1.02809978, - "epoch": 0.2938824590410341, - "flos": 16545052103040.0, - "grad_norm": 4.814706857660641, - "language_loss": 0.79822707, - "learning_rate": 3.3129294429448855e-06, - "loss": 0.81988299, - "num_input_tokens_seen": 105333505, - "step": 4888, - "time_per_iteration": 2.6942543983459473 - }, - { - "auxiliary_loss_clip": 0.01114672, - "auxiliary_loss_mlp": 0.01035208, - "balance_loss_clip": 1.05101824, - "balance_loss_mlp": 1.01886487, - "epoch": 0.29394258229370207, - "flos": 37925474382720.0, - "grad_norm": 1.8060574020422921, - "language_loss": 0.55514884, - "learning_rate": 3.3126356250147517e-06, - "loss": 0.57664764, - "num_input_tokens_seen": 105355605, - "step": 4889, - "time_per_iteration": 2.838529586791992 - }, - { - "auxiliary_loss_clip": 0.01136079, - "auxiliary_loss_mlp": 0.01040242, - "balance_loss_clip": 1.05230045, - "balance_loss_mlp": 1.02257514, - "epoch": 0.29400270554637004, - "flos": 20044690197120.0, - "grad_norm": 1.9006309093473746, - "language_loss": 0.84414017, - "learning_rate": 3.3123417573093434e-06, - "loss": 0.86590338, - "num_input_tokens_seen": 105374225, - "step": 4890, - "time_per_iteration": 2.653601884841919 - }, - { - "auxiliary_loss_clip": 0.01138833, - "auxiliary_loss_mlp": 0.01044226, - "balance_loss_clip": 1.05449104, - "balance_loss_mlp": 1.02767992, - "epoch": 0.294062828799038, - "flos": 15266380775040.0, - "grad_norm": 2.3284792525221625, - "language_loss": 0.72417939, - "learning_rate": 3.3120478398398046e-06, - "loss": 0.74600995, - "num_input_tokens_seen": 105391565, - "step": 4891, - "time_per_iteration": 2.6499764919281006 - }, - { - "auxiliary_loss_clip": 0.01148906, - "auxiliary_loss_mlp": 0.01046245, - "balance_loss_clip": 1.05517375, - "balance_loss_mlp": 1.02797008, - "epoch": 0.294122952051706, - "flos": 22747147799040.0, - "grad_norm": 1.6858898954482169, - "language_loss": 0.77310836, - "learning_rate": 3.3117538726172797e-06, - "loss": 0.7950598, - "num_input_tokens_seen": 105409840, - "step": 4892, - "time_per_iteration": 2.6123669147491455 - }, - { - "auxiliary_loss_clip": 0.01143283, - "auxiliary_loss_mlp": 0.01036481, - "balance_loss_clip": 1.05147183, - "balance_loss_mlp": 1.01932704, - "epoch": 0.294183075304374, - "flos": 24972891073920.0, - "grad_norm": 1.8056938004749827, - "language_loss": 0.77826709, - "learning_rate": 3.3114598556529164e-06, - "loss": 0.80006474, - "num_input_tokens_seen": 105428645, - "step": 4893, - "time_per_iteration": 2.6142194271087646 - }, - { - "auxiliary_loss_clip": 0.01106286, - "auxiliary_loss_mlp": 0.01045871, - "balance_loss_clip": 1.0508399, - "balance_loss_mlp": 1.02912164, - "epoch": 0.29424319855704195, - "flos": 30952986762240.0, - "grad_norm": 3.6552959609210944, - "language_loss": 0.85032988, - "learning_rate": 3.311165788957864e-06, - "loss": 0.87185144, - "num_input_tokens_seen": 105447480, - "step": 4894, - "time_per_iteration": 2.837883234024048 - }, - { - "auxiliary_loss_clip": 0.01131513, - "auxiliary_loss_mlp": 0.01038131, - "balance_loss_clip": 1.05098557, - "balance_loss_mlp": 1.02169216, - "epoch": 0.2943033218097099, - "flos": 15231583474560.0, - "grad_norm": 3.570255241204836, - "language_loss": 0.90650308, - "learning_rate": 3.310871672543274e-06, - "loss": 0.92819947, - "num_input_tokens_seen": 105464600, - "step": 4895, - "time_per_iteration": 2.588153839111328 - }, - { - "auxiliary_loss_clip": 0.01138224, - "auxiliary_loss_mlp": 0.01045554, - "balance_loss_clip": 1.05338621, - "balance_loss_mlp": 1.02777958, - "epoch": 0.2943634450623779, - "flos": 21725884310400.0, - "grad_norm": 1.7548452829513195, - "language_loss": 0.86612183, - "learning_rate": 3.3105775064202982e-06, - "loss": 0.88795966, - "num_input_tokens_seen": 105481510, - "step": 4896, - "time_per_iteration": 2.6405279636383057 - }, - { - "auxiliary_loss_clip": 0.01142594, - "auxiliary_loss_mlp": 0.01053714, - "balance_loss_clip": 1.05662429, - "balance_loss_mlp": 1.03620195, - "epoch": 0.29442356831504585, - "flos": 22602104680320.0, - "grad_norm": 2.0549220420715906, - "language_loss": 0.73394442, - "learning_rate": 3.3102832906000924e-06, - "loss": 0.75590742, - "num_input_tokens_seen": 105501390, - "step": 4897, - "time_per_iteration": 2.6669554710388184 - }, - { - "auxiliary_loss_clip": 0.01128563, - "auxiliary_loss_mlp": 0.01050668, - "balance_loss_clip": 1.04556203, - "balance_loss_mlp": 1.03214252, - "epoch": 0.2944836915677138, - "flos": 20011401267840.0, - "grad_norm": 2.0814872266581426, - "language_loss": 0.74344778, - "learning_rate": 3.309989025093813e-06, - "loss": 0.76524007, - "num_input_tokens_seen": 105519600, - "step": 4898, - "time_per_iteration": 2.6286890506744385 - }, - { - "auxiliary_loss_clip": 0.01140269, - "auxiliary_loss_mlp": 0.01047883, - "balance_loss_clip": 1.05775058, - "balance_loss_mlp": 1.02880955, - "epoch": 0.2945438148203818, - "flos": 20045875345920.0, - "grad_norm": 2.610474436320842, - "language_loss": 0.70560962, - "learning_rate": 3.309694709912618e-06, - "loss": 0.72749114, - "num_input_tokens_seen": 105535970, - "step": 4899, - "time_per_iteration": 2.6050777435302734 - }, - { - "auxiliary_loss_clip": 0.01122842, - "auxiliary_loss_mlp": 0.00775757, - "balance_loss_clip": 1.05115175, - "balance_loss_mlp": 1.00110114, - "epoch": 0.29460393807304974, - "flos": 23733542160000.0, - "grad_norm": 2.6981557529788587, - "language_loss": 0.78938496, - "learning_rate": 3.3094003450676685e-06, - "loss": 0.80837095, - "num_input_tokens_seen": 105556735, - "step": 4900, - "time_per_iteration": 2.7517058849334717 - }, - { - "auxiliary_loss_clip": 0.0110429, - "auxiliary_loss_mlp": 0.01059395, - "balance_loss_clip": 1.04257679, - "balance_loss_mlp": 1.03992808, - "epoch": 0.2946640613257177, - "flos": 14976079056000.0, - "grad_norm": 1.7286923709762618, - "language_loss": 0.80861294, - "learning_rate": 3.3091059305701268e-06, - "loss": 0.83024979, - "num_input_tokens_seen": 105574875, - "step": 4901, - "time_per_iteration": 2.58297061920166 - }, - { - "auxiliary_loss_clip": 0.01114064, - "auxiliary_loss_mlp": 0.01035256, - "balance_loss_clip": 1.05081403, - "balance_loss_mlp": 1.01993775, - "epoch": 0.2947241845783857, - "flos": 24243904552320.0, - "grad_norm": 2.2236242529025954, - "language_loss": 0.57768303, - "learning_rate": 3.308811466431157e-06, - "loss": 0.59917623, - "num_input_tokens_seen": 105594225, - "step": 4902, - "time_per_iteration": 2.6765553951263428 - }, - { - "auxiliary_loss_clip": 0.01122886, - "auxiliary_loss_mlp": 0.01044406, - "balance_loss_clip": 1.05165744, - "balance_loss_mlp": 1.02809834, - "epoch": 0.29478430783105364, - "flos": 19938394874880.0, - "grad_norm": 1.6365628527843905, - "language_loss": 0.7553789, - "learning_rate": 3.308516952661925e-06, - "loss": 0.77705181, - "num_input_tokens_seen": 105614000, - "step": 4903, - "time_per_iteration": 5.72201132774353 - }, - { - "auxiliary_loss_clip": 0.01117125, - "auxiliary_loss_mlp": 0.01054328, - "balance_loss_clip": 1.05058551, - "balance_loss_mlp": 1.03506362, - "epoch": 0.2948444310837216, - "flos": 27381347856000.0, - "grad_norm": 1.79479894391178, - "language_loss": 0.62782186, - "learning_rate": 3.3082223892736e-06, - "loss": 0.64953631, - "num_input_tokens_seen": 105634575, - "step": 4904, - "time_per_iteration": 2.7290875911712646 - }, - { - "auxiliary_loss_clip": 0.01135143, - "auxiliary_loss_mlp": 0.01043669, - "balance_loss_clip": 1.05146813, - "balance_loss_mlp": 1.02669382, - "epoch": 0.2949045543363896, - "flos": 23405462311680.0, - "grad_norm": 1.4755442774564356, - "language_loss": 0.73145443, - "learning_rate": 3.3079277762773496e-06, - "loss": 0.75324261, - "num_input_tokens_seen": 105654385, - "step": 4905, - "time_per_iteration": 2.6482555866241455 - }, - { - "auxiliary_loss_clip": 0.01112476, - "auxiliary_loss_mlp": 0.01046266, - "balance_loss_clip": 1.05017638, - "balance_loss_mlp": 1.028265, - "epoch": 0.2949646775890576, - "flos": 23951483930880.0, - "grad_norm": 1.7800977730713317, - "language_loss": 0.8199898, - "learning_rate": 3.3076331136843476e-06, - "loss": 0.84157723, - "num_input_tokens_seen": 105673570, - "step": 4906, - "time_per_iteration": 2.737182378768921 - }, - { - "auxiliary_loss_clip": 0.01094663, - "auxiliary_loss_mlp": 0.01040505, - "balance_loss_clip": 1.04579425, - "balance_loss_mlp": 1.02372003, - "epoch": 0.29502480084172555, - "flos": 22784315397120.0, - "grad_norm": 2.8763815934933867, - "language_loss": 0.87373984, - "learning_rate": 3.3073384015057667e-06, - "loss": 0.89509153, - "num_input_tokens_seen": 105691940, - "step": 4907, - "time_per_iteration": 4.367825746536255 - }, - { - "auxiliary_loss_clip": 0.01149393, - "auxiliary_loss_mlp": 0.01043671, - "balance_loss_clip": 1.05400407, - "balance_loss_mlp": 1.02501488, - "epoch": 0.2950849240943935, - "flos": 19646656611840.0, - "grad_norm": 2.047818146937445, - "language_loss": 0.81910521, - "learning_rate": 3.307043639752782e-06, - "loss": 0.84103584, - "num_input_tokens_seen": 105709825, - "step": 4908, - "time_per_iteration": 2.578582525253296 - }, - { - "auxiliary_loss_clip": 0.01055582, - "auxiliary_loss_mlp": 0.01003419, - "balance_loss_clip": 1.02453518, - "balance_loss_mlp": 1.00138056, - "epoch": 0.2951450473470615, - "flos": 71002829260800.0, - "grad_norm": 0.7982723827999523, - "language_loss": 0.57287854, - "learning_rate": 3.3067488284365728e-06, - "loss": 0.59346855, - "num_input_tokens_seen": 105766880, - "step": 4909, - "time_per_iteration": 4.640491247177124 - }, - { - "auxiliary_loss_clip": 0.01135445, - "auxiliary_loss_mlp": 0.00774301, - "balance_loss_clip": 1.05580318, - "balance_loss_mlp": 1.00097156, - "epoch": 0.29520517059972945, - "flos": 22966310632320.0, - "grad_norm": 1.756295161453336, - "language_loss": 0.87018639, - "learning_rate": 3.3064539675683163e-06, - "loss": 0.88928384, - "num_input_tokens_seen": 105786875, - "step": 4910, - "time_per_iteration": 2.642312526702881 - }, - { - "auxiliary_loss_clip": 0.01131096, - "auxiliary_loss_mlp": 0.0104303, - "balance_loss_clip": 1.05359542, - "balance_loss_mlp": 1.02744913, - "epoch": 0.2952652938523974, - "flos": 20485673470080.0, - "grad_norm": 1.692596753939278, - "language_loss": 0.73332304, - "learning_rate": 3.3061590571591946e-06, - "loss": 0.75506431, - "num_input_tokens_seen": 105805315, - "step": 4911, - "time_per_iteration": 2.6130573749542236 - }, - { - "auxiliary_loss_clip": 0.01132917, - "auxiliary_loss_mlp": 0.01038473, - "balance_loss_clip": 1.05330253, - "balance_loss_mlp": 1.02193832, - "epoch": 0.2953254171050654, - "flos": 19646584784640.0, - "grad_norm": 1.8009313294920104, - "language_loss": 0.89653587, - "learning_rate": 3.3058640972203904e-06, - "loss": 0.91824973, - "num_input_tokens_seen": 105825125, - "step": 4912, - "time_per_iteration": 2.660090684890747 - }, - { - "auxiliary_loss_clip": 0.01114053, - "auxiliary_loss_mlp": 0.010529, - "balance_loss_clip": 1.0482899, - "balance_loss_mlp": 1.03503084, - "epoch": 0.29538554035773334, - "flos": 22747973811840.0, - "grad_norm": 1.3579869674800176, - "language_loss": 0.83175462, - "learning_rate": 3.3055690877630894e-06, - "loss": 0.85342413, - "num_input_tokens_seen": 105846085, - "step": 4913, - "time_per_iteration": 2.743364095687866 - }, - { - "auxiliary_loss_clip": 0.01142468, - "auxiliary_loss_mlp": 0.01043093, - "balance_loss_clip": 1.04977608, - "balance_loss_mlp": 1.02690446, - "epoch": 0.2954456636104013, - "flos": 21871861182720.0, - "grad_norm": 1.9704695859403116, - "language_loss": 0.76919919, - "learning_rate": 3.3052740287984765e-06, - "loss": 0.79105484, - "num_input_tokens_seen": 105865400, - "step": 4914, - "time_per_iteration": 2.6778385639190674 - }, - { - "auxiliary_loss_clip": 0.01121315, - "auxiliary_loss_mlp": 0.01045386, - "balance_loss_clip": 1.05064511, - "balance_loss_mlp": 1.02818418, - "epoch": 0.2955057868630693, - "flos": 40442560871040.0, - "grad_norm": 1.678810736285401, - "language_loss": 0.81829619, - "learning_rate": 3.3049789203377424e-06, - "loss": 0.8399632, - "num_input_tokens_seen": 105887920, - "step": 4915, - "time_per_iteration": 2.9347212314605713 - }, - { - "auxiliary_loss_clip": 0.01068117, - "auxiliary_loss_mlp": 0.01044435, - "balance_loss_clip": 1.04405856, - "balance_loss_mlp": 1.02722168, - "epoch": 0.29556591011573724, - "flos": 22564506119040.0, - "grad_norm": 2.129336551193515, - "language_loss": 0.84701812, - "learning_rate": 3.3046837623920772e-06, - "loss": 0.86814368, - "num_input_tokens_seen": 105904035, - "step": 4916, - "time_per_iteration": 2.9183273315429688 - }, - { - "auxiliary_loss_clip": 0.01125851, - "auxiliary_loss_mlp": 0.01036694, - "balance_loss_clip": 1.04655123, - "balance_loss_mlp": 1.01975429, - "epoch": 0.2956260333684052, - "flos": 22089300163200.0, - "grad_norm": 2.1082729468541683, - "language_loss": 0.69490808, - "learning_rate": 3.3043885549726723e-06, - "loss": 0.71653348, - "num_input_tokens_seen": 105922685, - "step": 4917, - "time_per_iteration": 2.7400357723236084 - }, - { - "auxiliary_loss_clip": 0.01123659, - "auxiliary_loss_mlp": 0.01038633, - "balance_loss_clip": 1.05140972, - "balance_loss_mlp": 1.02214622, - "epoch": 0.2956861566210732, - "flos": 16435488643200.0, - "grad_norm": 2.699189623646437, - "language_loss": 0.91076934, - "learning_rate": 3.3040932980907226e-06, - "loss": 0.93239224, - "num_input_tokens_seen": 105940425, - "step": 4918, - "time_per_iteration": 2.7343270778656006 - }, - { - "auxiliary_loss_clip": 0.01147937, - "auxiliary_loss_mlp": 0.01043258, - "balance_loss_clip": 1.0551039, - "balance_loss_mlp": 1.02629495, - "epoch": 0.2957462798737412, - "flos": 25812087500160.0, - "grad_norm": 1.9388581576792214, - "language_loss": 0.72399175, - "learning_rate": 3.303797991757425e-06, - "loss": 0.74590373, - "num_input_tokens_seen": 105960550, - "step": 4919, - "time_per_iteration": 2.718583822250366 - }, - { - "auxiliary_loss_clip": 0.01119627, - "auxiliary_loss_mlp": 0.01045651, - "balance_loss_clip": 1.04843163, - "balance_loss_mlp": 1.02838945, - "epoch": 0.29580640312640916, - "flos": 16690849407360.0, - "grad_norm": 1.8826298231205452, - "language_loss": 0.75919485, - "learning_rate": 3.3035026359839763e-06, - "loss": 0.78084767, - "num_input_tokens_seen": 105978820, - "step": 4920, - "time_per_iteration": 2.7425734996795654 - }, - { - "auxiliary_loss_clip": 0.01121739, - "auxiliary_loss_mlp": 0.01052293, - "balance_loss_clip": 1.05511427, - "balance_loss_mlp": 1.03449547, - "epoch": 0.2958665263790771, - "flos": 23945594100480.0, - "grad_norm": 5.307541834842734, - "language_loss": 0.69020098, - "learning_rate": 3.3032072307815774e-06, - "loss": 0.71194124, - "num_input_tokens_seen": 105997545, - "step": 4921, - "time_per_iteration": 2.7755305767059326 - }, - { - "auxiliary_loss_clip": 0.01120164, - "auxiliary_loss_mlp": 0.01043, - "balance_loss_clip": 1.05075121, - "balance_loss_mlp": 1.02453458, - "epoch": 0.2959266496317451, - "flos": 18478410670080.0, - "grad_norm": 1.8488664920888758, - "language_loss": 0.7462194, - "learning_rate": 3.3029117761614298e-06, - "loss": 0.767851, - "num_input_tokens_seen": 106015320, - "step": 4922, - "time_per_iteration": 2.740687131881714 - }, - { - "auxiliary_loss_clip": 0.01152013, - "auxiliary_loss_mlp": 0.00775382, - "balance_loss_clip": 1.05429566, - "balance_loss_mlp": 1.00129843, - "epoch": 0.29598677288441305, - "flos": 25957489754880.0, - "grad_norm": 1.7662799143188246, - "language_loss": 0.77148855, - "learning_rate": 3.302616272134737e-06, - "loss": 0.79076254, - "num_input_tokens_seen": 106034555, - "step": 4923, - "time_per_iteration": 2.664875030517578 - }, - { - "auxiliary_loss_clip": 0.01117655, - "auxiliary_loss_mlp": 0.01042537, - "balance_loss_clip": 1.05065989, - "balance_loss_mlp": 1.0247035, - "epoch": 0.296046896137081, - "flos": 25155999630720.0, - "grad_norm": 1.7775190737024398, - "language_loss": 0.86232758, - "learning_rate": 3.3023207187127042e-06, - "loss": 0.88392955, - "num_input_tokens_seen": 106054200, - "step": 4924, - "time_per_iteration": 2.7413501739501953 - }, - { - "auxiliary_loss_clip": 0.01132544, - "auxiliary_loss_mlp": 0.01038356, - "balance_loss_clip": 1.05098939, - "balance_loss_mlp": 1.02114248, - "epoch": 0.296107019389749, - "flos": 21761148487680.0, - "grad_norm": 1.479657736715748, - "language_loss": 0.82050943, - "learning_rate": 3.3020251159065396e-06, - "loss": 0.84221852, - "num_input_tokens_seen": 106074700, - "step": 4925, - "time_per_iteration": 2.676556348800659 - }, - { - "auxiliary_loss_clip": 0.01078547, - "auxiliary_loss_mlp": 0.01051683, - "balance_loss_clip": 1.04153097, - "balance_loss_mlp": 1.03283572, - "epoch": 0.29616714264241695, - "flos": 17960039544960.0, - "grad_norm": 2.5440905583969697, - "language_loss": 0.86138272, - "learning_rate": 3.301729463727452e-06, - "loss": 0.88268495, - "num_input_tokens_seen": 106091415, - "step": 4926, - "time_per_iteration": 2.675780773162842 - }, - { - "auxiliary_loss_clip": 0.01108502, - "auxiliary_loss_mlp": 0.01035423, - "balance_loss_clip": 1.04910469, - "balance_loss_mlp": 1.0193243, - "epoch": 0.2962272658950849, - "flos": 15012779777280.0, - "grad_norm": 2.332235960138756, - "language_loss": 0.85897464, - "learning_rate": 3.3014337621866527e-06, - "loss": 0.88041389, - "num_input_tokens_seen": 106109135, - "step": 4927, - "time_per_iteration": 2.7407169342041016 - }, - { - "auxiliary_loss_clip": 0.01131541, - "auxiliary_loss_mlp": 0.01039363, - "balance_loss_clip": 1.05158448, - "balance_loss_mlp": 1.02312613, - "epoch": 0.2962873891477529, - "flos": 14720861946240.0, - "grad_norm": 3.581765820174834, - "language_loss": 0.80772752, - "learning_rate": 3.3011380112953553e-06, - "loss": 0.8294366, - "num_input_tokens_seen": 106125750, - "step": 4928, - "time_per_iteration": 2.6719777584075928 - }, - { - "auxiliary_loss_clip": 0.01123889, - "auxiliary_loss_mlp": 0.01043191, - "balance_loss_clip": 1.04852009, - "balance_loss_mlp": 1.02346206, - "epoch": 0.29634751240042084, - "flos": 26723787528960.0, - "grad_norm": 2.79065826833615, - "language_loss": 0.7313869, - "learning_rate": 3.300842211064773e-06, - "loss": 0.75305772, - "num_input_tokens_seen": 106142835, - "step": 4929, - "time_per_iteration": 2.75266695022583 - }, - { - "auxiliary_loss_clip": 0.0112132, - "auxiliary_loss_mlp": 0.01054118, - "balance_loss_clip": 1.0495156, - "balance_loss_mlp": 1.03481805, - "epoch": 0.2964076356530888, - "flos": 14571293713920.0, - "grad_norm": 2.360375509218164, - "language_loss": 0.71534413, - "learning_rate": 3.3005463615061246e-06, - "loss": 0.73709846, - "num_input_tokens_seen": 106160680, - "step": 4930, - "time_per_iteration": 2.799149990081787 - }, - { - "auxiliary_loss_clip": 0.01028509, - "auxiliary_loss_mlp": 0.01003992, - "balance_loss_clip": 1.03094876, - "balance_loss_mlp": 1.00229919, - "epoch": 0.29646775890575683, - "flos": 63104315063040.0, - "grad_norm": 0.8053244370028285, - "language_loss": 0.6061247, - "learning_rate": 3.3002504626306275e-06, - "loss": 0.6264497, - "num_input_tokens_seen": 106224415, - "step": 4931, - "time_per_iteration": 3.218900442123413 - }, - { - "auxiliary_loss_clip": 0.01007041, - "auxiliary_loss_mlp": 0.01005936, - "balance_loss_clip": 1.02247667, - "balance_loss_mlp": 1.00395727, - "epoch": 0.2965278821584248, - "flos": 63067686168960.0, - "grad_norm": 0.7408573754586586, - "language_loss": 0.52380091, - "learning_rate": 3.2999545144495023e-06, - "loss": 0.54393071, - "num_input_tokens_seen": 106279140, - "step": 4932, - "time_per_iteration": 3.26432728767395 - }, - { - "auxiliary_loss_clip": 0.01129633, - "auxiliary_loss_mlp": 0.01042438, - "balance_loss_clip": 1.04917526, - "balance_loss_mlp": 1.02584457, - "epoch": 0.29658800541109276, - "flos": 23768734510080.0, - "grad_norm": 2.012094119717185, - "language_loss": 0.81540775, - "learning_rate": 3.299658516973972e-06, - "loss": 0.83712846, - "num_input_tokens_seen": 106298190, - "step": 4933, - "time_per_iteration": 2.804293155670166 - }, - { - "auxiliary_loss_clip": 0.01092845, - "auxiliary_loss_mlp": 0.01036901, - "balance_loss_clip": 1.04405773, - "balance_loss_mlp": 1.01966333, - "epoch": 0.2966481286637607, - "flos": 23988543788160.0, - "grad_norm": 1.916542141573101, - "language_loss": 0.75165296, - "learning_rate": 3.299362470215261e-06, - "loss": 0.77295041, - "num_input_tokens_seen": 106319065, - "step": 4934, - "time_per_iteration": 2.797697067260742 - }, - { - "auxiliary_loss_clip": 0.01126398, - "auxiliary_loss_mlp": 0.01047716, - "balance_loss_clip": 1.04985118, - "balance_loss_mlp": 1.03013301, - "epoch": 0.2967082519164287, - "flos": 17165157523200.0, - "grad_norm": 1.8491505675561635, - "language_loss": 0.62093496, - "learning_rate": 3.299066374184594e-06, - "loss": 0.64267612, - "num_input_tokens_seen": 106338040, - "step": 4935, - "time_per_iteration": 2.6466407775878906 - }, - { - "auxiliary_loss_clip": 0.01129018, - "auxiliary_loss_mlp": 0.01041652, - "balance_loss_clip": 1.05052114, - "balance_loss_mlp": 1.02452123, - "epoch": 0.29676837516909665, - "flos": 29387712816000.0, - "grad_norm": 1.4269626202910053, - "language_loss": 0.79485404, - "learning_rate": 3.2987702288932e-06, - "loss": 0.81656075, - "num_input_tokens_seen": 106358900, - "step": 4936, - "time_per_iteration": 2.7333009243011475 - }, - { - "auxiliary_loss_clip": 0.01100808, - "auxiliary_loss_mlp": 0.01048756, - "balance_loss_clip": 1.04970682, - "balance_loss_mlp": 1.03040934, - "epoch": 0.2968284984217646, - "flos": 34751222616960.0, - "grad_norm": 1.5951903019521643, - "language_loss": 0.73993498, - "learning_rate": 3.298474034352309e-06, - "loss": 0.76143062, - "num_input_tokens_seen": 106381805, - "step": 4937, - "time_per_iteration": 2.853935718536377 - }, - { - "auxiliary_loss_clip": 0.01094789, - "auxiliary_loss_mlp": 0.01038743, - "balance_loss_clip": 1.05060768, - "balance_loss_mlp": 1.0209924, - "epoch": 0.2968886216744326, - "flos": 21544104556800.0, - "grad_norm": 1.654578873057457, - "language_loss": 0.78373563, - "learning_rate": 3.2981777905731526e-06, - "loss": 0.80507094, - "num_input_tokens_seen": 106402365, - "step": 4938, - "time_per_iteration": 2.803147077560425 - }, - { - "auxiliary_loss_clip": 0.0111878, - "auxiliary_loss_mlp": 0.01048023, - "balance_loss_clip": 1.05193913, - "balance_loss_mlp": 1.02931857, - "epoch": 0.29694874492710055, - "flos": 12787323811200.0, - "grad_norm": 2.4827377035181013, - "language_loss": 0.76842266, - "learning_rate": 3.297881497566964e-06, - "loss": 0.79009068, - "num_input_tokens_seen": 106419800, - "step": 4939, - "time_per_iteration": 2.8867270946502686 - }, - { - "auxiliary_loss_clip": 0.0111051, - "auxiliary_loss_mlp": 0.01041172, - "balance_loss_clip": 1.04666841, - "balance_loss_mlp": 1.02361226, - "epoch": 0.2970088681797685, - "flos": 24569973239040.0, - "grad_norm": 1.8055035581570296, - "language_loss": 0.78354549, - "learning_rate": 3.297585155344979e-06, - "loss": 0.80506229, - "num_input_tokens_seen": 106440300, - "step": 4940, - "time_per_iteration": 2.783046245574951 - }, - { - "auxiliary_loss_clip": 0.01117762, - "auxiliary_loss_mlp": 0.01037936, - "balance_loss_clip": 1.0486958, - "balance_loss_mlp": 1.01876736, - "epoch": 0.2970689914324365, - "flos": 23659171050240.0, - "grad_norm": 1.6305550110852276, - "language_loss": 0.75628781, - "learning_rate": 3.297288763918435e-06, - "loss": 0.77784479, - "num_input_tokens_seen": 106460035, - "step": 4941, - "time_per_iteration": 2.74379825592041 - }, - { - "auxiliary_loss_clip": 0.01138083, - "auxiliary_loss_mlp": 0.01051629, - "balance_loss_clip": 1.05272233, - "balance_loss_mlp": 1.03276968, - "epoch": 0.29712911468510445, - "flos": 39670301439360.0, - "grad_norm": 2.3053326725865313, - "language_loss": 0.74158287, - "learning_rate": 3.2969923232985712e-06, - "loss": 0.76347995, - "num_input_tokens_seen": 106481095, - "step": 4942, - "time_per_iteration": 4.468350410461426 - }, - { - "auxiliary_loss_clip": 0.01111068, - "auxiliary_loss_mlp": 0.0104429, - "balance_loss_clip": 1.05172181, - "balance_loss_mlp": 1.02589595, - "epoch": 0.2971892379377724, - "flos": 26395312631040.0, - "grad_norm": 2.42728921351593, - "language_loss": 0.702492, - "learning_rate": 3.2966958334966287e-06, - "loss": 0.72404563, - "num_input_tokens_seen": 106501590, - "step": 4943, - "time_per_iteration": 4.2555251121521 - }, - { - "auxiliary_loss_clip": 0.01124177, - "auxiliary_loss_mlp": 0.01041442, - "balance_loss_clip": 1.04988825, - "balance_loss_mlp": 1.02360821, - "epoch": 0.2972493611904404, - "flos": 17603195880960.0, - "grad_norm": 2.221197725988377, - "language_loss": 0.795506, - "learning_rate": 3.2963992945238497e-06, - "loss": 0.81716216, - "num_input_tokens_seen": 106519430, - "step": 4944, - "time_per_iteration": 2.6572201251983643 - }, - { - "auxiliary_loss_clip": 0.0111705, - "auxiliary_loss_mlp": 0.01041351, - "balance_loss_clip": 1.04914248, - "balance_loss_mlp": 1.02521038, - "epoch": 0.2973094844431084, - "flos": 20412774817920.0, - "grad_norm": 2.187472317578873, - "language_loss": 0.83260202, - "learning_rate": 3.2961027063914795e-06, - "loss": 0.85418606, - "num_input_tokens_seen": 106535870, - "step": 4945, - "time_per_iteration": 2.6700363159179688 - }, - { - "auxiliary_loss_clip": 0.01090371, - "auxiliary_loss_mlp": 0.01039575, - "balance_loss_clip": 1.04623246, - "balance_loss_mlp": 1.02256417, - "epoch": 0.29736960769577636, - "flos": 17493488766720.0, - "grad_norm": 1.8830005833778707, - "language_loss": 0.67067397, - "learning_rate": 3.2958060691107654e-06, - "loss": 0.69197345, - "num_input_tokens_seen": 106553560, - "step": 4946, - "time_per_iteration": 4.29357385635376 - }, - { - "auxiliary_loss_clip": 0.01127819, - "auxiliary_loss_mlp": 0.00777134, - "balance_loss_clip": 1.04997563, - "balance_loss_mlp": 1.00115252, - "epoch": 0.2974297309484443, - "flos": 26103969417600.0, - "grad_norm": 1.879721590970614, - "language_loss": 0.73877805, - "learning_rate": 3.2955093826929547e-06, - "loss": 0.75782764, - "num_input_tokens_seen": 106574115, - "step": 4947, - "time_per_iteration": 2.657038450241089 - }, - { - "auxiliary_loss_clip": 0.01109701, - "auxiliary_loss_mlp": 0.01045546, - "balance_loss_clip": 1.04896843, - "balance_loss_mlp": 1.02705622, - "epoch": 0.2974898542011123, - "flos": 25666433850240.0, - "grad_norm": 2.0989098852090633, - "language_loss": 0.73522758, - "learning_rate": 3.2952126471492985e-06, - "loss": 0.75678003, - "num_input_tokens_seen": 106593070, - "step": 4948, - "time_per_iteration": 4.4359636306762695 - }, - { - "auxiliary_loss_clip": 0.01139863, - "auxiliary_loss_mlp": 0.01040301, - "balance_loss_clip": 1.04885721, - "balance_loss_mlp": 1.02332592, - "epoch": 0.29754997745378026, - "flos": 18661339658880.0, - "grad_norm": 2.06615582769113, - "language_loss": 0.8397494, - "learning_rate": 3.2949158624910497e-06, - "loss": 0.86155105, - "num_input_tokens_seen": 106610695, - "step": 4949, - "time_per_iteration": 2.6052157878875732 - }, - { - "auxiliary_loss_clip": 0.01128522, - "auxiliary_loss_mlp": 0.01041578, - "balance_loss_clip": 1.04901218, - "balance_loss_mlp": 1.02459633, - "epoch": 0.2976101007064482, - "flos": 22274599449600.0, - "grad_norm": 2.2184783420455814, - "language_loss": 0.71360326, - "learning_rate": 3.2946190287294603e-06, - "loss": 0.73530424, - "num_input_tokens_seen": 106631300, - "step": 4950, - "time_per_iteration": 2.678953170776367 - }, - { - "auxiliary_loss_clip": 0.01095366, - "auxiliary_loss_mlp": 0.01039981, - "balance_loss_clip": 1.04944646, - "balance_loss_mlp": 1.0239712, - "epoch": 0.2976702239591162, - "flos": 21945657674880.0, - "grad_norm": 3.098719098855731, - "language_loss": 0.82645297, - "learning_rate": 3.294322145875789e-06, - "loss": 0.84780639, - "num_input_tokens_seen": 106650065, - "step": 4951, - "time_per_iteration": 2.7566003799438477 - }, - { - "auxiliary_loss_clip": 0.01118264, - "auxiliary_loss_mlp": 0.01039186, - "balance_loss_clip": 1.04655933, - "balance_loss_mlp": 1.02190065, - "epoch": 0.29773034721178415, - "flos": 24637197542400.0, - "grad_norm": 15.690000260498868, - "language_loss": 0.74144769, - "learning_rate": 3.2940252139412912e-06, - "loss": 0.76302218, - "num_input_tokens_seen": 106668230, - "step": 4952, - "time_per_iteration": 2.7019882202148438 - }, - { - "auxiliary_loss_clip": 0.01063128, - "auxiliary_loss_mlp": 0.01049349, - "balance_loss_clip": 1.0433315, - "balance_loss_mlp": 1.03133702, - "epoch": 0.2977904704644521, - "flos": 20557566541440.0, - "grad_norm": 1.6701113978494808, - "language_loss": 0.84251344, - "learning_rate": 3.293728232937228e-06, - "loss": 0.86363828, - "num_input_tokens_seen": 106687785, - "step": 4953, - "time_per_iteration": 2.9622793197631836 - }, - { - "auxiliary_loss_clip": 0.01120636, - "auxiliary_loss_mlp": 0.01040588, - "balance_loss_clip": 1.04966831, - "balance_loss_mlp": 1.02428031, - "epoch": 0.2978505937171201, - "flos": 18916449027840.0, - "grad_norm": 2.301918041259246, - "language_loss": 0.74366152, - "learning_rate": 3.2934312028748597e-06, - "loss": 0.76527375, - "num_input_tokens_seen": 106706875, - "step": 4954, - "time_per_iteration": 2.767455577850342 - }, - { - "auxiliary_loss_clip": 0.01138563, - "auxiliary_loss_mlp": 0.01036281, - "balance_loss_clip": 1.04899216, - "balance_loss_mlp": 1.02028275, - "epoch": 0.29791071696978805, - "flos": 19317750750720.0, - "grad_norm": 2.0603039788066155, - "language_loss": 0.75687683, - "learning_rate": 3.293134123765452e-06, - "loss": 0.77862525, - "num_input_tokens_seen": 106725105, - "step": 4955, - "time_per_iteration": 2.638389825820923 - }, - { - "auxiliary_loss_clip": 0.01094257, - "auxiliary_loss_mlp": 0.01042355, - "balance_loss_clip": 1.04760742, - "balance_loss_mlp": 1.02505171, - "epoch": 0.297970840222456, - "flos": 18806813740800.0, - "grad_norm": 2.358195616275362, - "language_loss": 0.72600436, - "learning_rate": 3.2928369956202684e-06, - "loss": 0.74737054, - "num_input_tokens_seen": 106744780, - "step": 4956, - "time_per_iteration": 2.777873992919922 - }, - { - "auxiliary_loss_clip": 0.01134603, - "auxiliary_loss_mlp": 0.0104754, - "balance_loss_clip": 1.04957581, - "balance_loss_mlp": 1.02930105, - "epoch": 0.298030963475124, - "flos": 22852760762880.0, - "grad_norm": 2.0297274127598435, - "language_loss": 0.79068756, - "learning_rate": 3.2925398184505754e-06, - "loss": 0.81250894, - "num_input_tokens_seen": 106764670, - "step": 4957, - "time_per_iteration": 2.719581365585327 - }, - { - "auxiliary_loss_clip": 0.01134843, - "auxiliary_loss_mlp": 0.01041974, - "balance_loss_clip": 1.05054235, - "balance_loss_mlp": 1.02383018, - "epoch": 0.298091086727792, - "flos": 21868485304320.0, - "grad_norm": 1.706880580606115, - "language_loss": 0.70570725, - "learning_rate": 3.2922425922676437e-06, - "loss": 0.7274754, - "num_input_tokens_seen": 106783695, - "step": 4958, - "time_per_iteration": 2.613697052001953 - }, - { - "auxiliary_loss_clip": 0.01108077, - "auxiliary_loss_mlp": 0.0104267, - "balance_loss_clip": 1.05166888, - "balance_loss_mlp": 1.0253129, - "epoch": 0.29815120998045996, - "flos": 21175014355200.0, - "grad_norm": 1.5383051389102413, - "language_loss": 0.78736448, - "learning_rate": 3.291945317082743e-06, - "loss": 0.80887192, - "num_input_tokens_seen": 106803150, - "step": 4959, - "time_per_iteration": 2.751455545425415 - }, - { - "auxiliary_loss_clip": 0.01129828, - "auxiliary_loss_mlp": 0.01045919, - "balance_loss_clip": 1.04906321, - "balance_loss_mlp": 1.0290029, - "epoch": 0.29821133323312793, - "flos": 19896271200000.0, - "grad_norm": 1.6624120752671379, - "language_loss": 0.79747117, - "learning_rate": 3.291647992907147e-06, - "loss": 0.81922865, - "num_input_tokens_seen": 106820705, - "step": 4960, - "time_per_iteration": 2.6345505714416504 - }, - { - "auxiliary_loss_clip": 0.01110987, - "auxiliary_loss_mlp": 0.01052912, - "balance_loss_clip": 1.04863763, - "balance_loss_mlp": 1.03449416, - "epoch": 0.2982714564857959, - "flos": 12750766744320.0, - "grad_norm": 2.376132196895137, - "language_loss": 0.73364639, - "learning_rate": 3.291350619752129e-06, - "loss": 0.75528538, - "num_input_tokens_seen": 106837335, - "step": 4961, - "time_per_iteration": 2.725008010864258 - }, - { - "auxiliary_loss_clip": 0.01130001, - "auxiliary_loss_mlp": 0.0104294, - "balance_loss_clip": 1.04824948, - "balance_loss_mlp": 1.02640533, - "epoch": 0.29833157973846386, - "flos": 22271905929600.0, - "grad_norm": 2.036560430862295, - "language_loss": 0.62106621, - "learning_rate": 3.291053197628967e-06, - "loss": 0.64279556, - "num_input_tokens_seen": 106856250, - "step": 4962, - "time_per_iteration": 2.690870523452759 - }, - { - "auxiliary_loss_clip": 0.01128362, - "auxiliary_loss_mlp": 0.01051341, - "balance_loss_clip": 1.05034256, - "balance_loss_mlp": 1.03310251, - "epoch": 0.2983917029911318, - "flos": 15372999319680.0, - "grad_norm": 2.046461333274312, - "language_loss": 0.82866591, - "learning_rate": 3.2907557265489375e-06, - "loss": 0.85046291, - "num_input_tokens_seen": 106873370, - "step": 4963, - "time_per_iteration": 2.637723207473755 - }, - { - "auxiliary_loss_clip": 0.01112844, - "auxiliary_loss_mlp": 0.01044675, - "balance_loss_clip": 1.05338502, - "balance_loss_mlp": 1.0272826, - "epoch": 0.2984518262437998, - "flos": 15377632174080.0, - "grad_norm": 2.580714695656121, - "language_loss": 0.65933317, - "learning_rate": 3.290458206523322e-06, - "loss": 0.68090838, - "num_input_tokens_seen": 106890330, - "step": 4964, - "time_per_iteration": 2.7210114002227783 - }, - { - "auxiliary_loss_clip": 0.01128428, - "auxiliary_loss_mlp": 0.01039216, - "balance_loss_clip": 1.04990005, - "balance_loss_mlp": 1.02345669, - "epoch": 0.29851194949646775, - "flos": 18108458542080.0, - "grad_norm": 1.8191471944851214, - "language_loss": 0.71093529, - "learning_rate": 3.2901606375634015e-06, - "loss": 0.73261172, - "num_input_tokens_seen": 106909190, - "step": 4965, - "time_per_iteration": 2.7070064544677734 - }, - { - "auxiliary_loss_clip": 0.01151396, - "auxiliary_loss_mlp": 0.01056357, - "balance_loss_clip": 1.05813003, - "balance_loss_mlp": 1.03827357, - "epoch": 0.2985720727491357, - "flos": 22018233104640.0, - "grad_norm": 2.164601494744612, - "language_loss": 0.65952027, - "learning_rate": 3.289863019680461e-06, - "loss": 0.68159783, - "num_input_tokens_seen": 106927825, - "step": 4966, - "time_per_iteration": 2.5820860862731934 - }, - { - "auxiliary_loss_clip": 0.01148496, - "auxiliary_loss_mlp": 0.01042183, - "balance_loss_clip": 1.05610132, - "balance_loss_mlp": 1.02496934, - "epoch": 0.2986321960018037, - "flos": 13041355772160.0, - "grad_norm": 5.631297794621363, - "language_loss": 0.73553479, - "learning_rate": 3.289565352885785e-06, - "loss": 0.75744158, - "num_input_tokens_seen": 106943155, - "step": 4967, - "time_per_iteration": 2.558378219604492 - }, - { - "auxiliary_loss_clip": 0.01110231, - "auxiliary_loss_mlp": 0.01041561, - "balance_loss_clip": 1.04339898, - "balance_loss_mlp": 1.02440643, - "epoch": 0.29869231925447165, - "flos": 14465034305280.0, - "grad_norm": 2.07351823246568, - "language_loss": 0.71246195, - "learning_rate": 3.2892676371906614e-06, - "loss": 0.73397982, - "num_input_tokens_seen": 106960295, - "step": 4968, - "time_per_iteration": 2.663163900375366 - }, - { - "auxiliary_loss_clip": 0.01124763, - "auxiliary_loss_mlp": 0.01043588, - "balance_loss_clip": 1.04864979, - "balance_loss_mlp": 1.02545607, - "epoch": 0.2987524425071396, - "flos": 31650228639360.0, - "grad_norm": 2.159507035183752, - "language_loss": 0.76744419, - "learning_rate": 3.2889698726063805e-06, - "loss": 0.78912771, - "num_input_tokens_seen": 106982870, - "step": 4969, - "time_per_iteration": 2.729922294616699 - }, - { - "auxiliary_loss_clip": 0.0114364, - "auxiliary_loss_mlp": 0.01036255, - "balance_loss_clip": 1.05239987, - "balance_loss_mlp": 1.02054322, - "epoch": 0.2988125657598076, - "flos": 21433427775360.0, - "grad_norm": 2.2724385668179936, - "language_loss": 0.69836891, - "learning_rate": 3.2886720591442327e-06, - "loss": 0.72016788, - "num_input_tokens_seen": 107002405, - "step": 4970, - "time_per_iteration": 2.6299381256103516 - }, - { - "auxiliary_loss_clip": 0.01135061, - "auxiliary_loss_mlp": 0.01048009, - "balance_loss_clip": 1.05199289, - "balance_loss_mlp": 1.02973413, - "epoch": 0.2988726890124756, - "flos": 18076965292800.0, - "grad_norm": 2.0648779209654258, - "language_loss": 0.85228848, - "learning_rate": 3.2883741968155103e-06, - "loss": 0.87411916, - "num_input_tokens_seen": 107017310, - "step": 4971, - "time_per_iteration": 2.6508536338806152 - }, - { - "auxiliary_loss_clip": 0.01112297, - "auxiliary_loss_mlp": 0.01054091, - "balance_loss_clip": 1.04895663, - "balance_loss_mlp": 1.03510106, - "epoch": 0.29893281226514357, - "flos": 21755653706880.0, - "grad_norm": 2.125047221260382, - "language_loss": 0.79404521, - "learning_rate": 3.2880762856315107e-06, - "loss": 0.81570905, - "num_input_tokens_seen": 107034645, - "step": 4972, - "time_per_iteration": 2.7924270629882812 - }, - { - "auxiliary_loss_clip": 0.01145651, - "auxiliary_loss_mlp": 0.01050789, - "balance_loss_clip": 1.05367875, - "balance_loss_mlp": 1.03427887, - "epoch": 0.29899293551781153, - "flos": 16836718538880.0, - "grad_norm": 2.200462139835186, - "language_loss": 0.85242772, - "learning_rate": 3.2877783256035285e-06, - "loss": 0.87439215, - "num_input_tokens_seen": 107051125, - "step": 4973, - "time_per_iteration": 2.5249850749969482 - }, - { - "auxiliary_loss_clip": 0.011108, - "auxiliary_loss_mlp": 0.0104405, - "balance_loss_clip": 1.04758012, - "balance_loss_mlp": 1.02664554, - "epoch": 0.2990530587704795, - "flos": 11729215946880.0, - "grad_norm": 2.0029664307268664, - "language_loss": 0.77612329, - "learning_rate": 3.287480316742863e-06, - "loss": 0.79767179, - "num_input_tokens_seen": 107068815, - "step": 4974, - "time_per_iteration": 2.6555633544921875 - }, - { - "auxiliary_loss_clip": 0.01115732, - "auxiliary_loss_mlp": 0.00779073, - "balance_loss_clip": 1.04864824, - "balance_loss_mlp": 1.00132942, - "epoch": 0.29911318202314746, - "flos": 28039877850240.0, - "grad_norm": 1.735885031779611, - "language_loss": 0.72557616, - "learning_rate": 3.287182259060815e-06, - "loss": 0.74452424, - "num_input_tokens_seen": 107090420, - "step": 4975, - "time_per_iteration": 2.826773166656494 - }, - { - "auxiliary_loss_clip": 0.01137332, - "auxiliary_loss_mlp": 0.01043625, - "balance_loss_clip": 1.05628741, - "balance_loss_mlp": 1.02561235, - "epoch": 0.2991733052758154, - "flos": 18733555952640.0, - "grad_norm": 2.282255680734404, - "language_loss": 0.76357341, - "learning_rate": 3.286884152568687e-06, - "loss": 0.78538299, - "num_input_tokens_seen": 107107255, - "step": 4976, - "time_per_iteration": 2.7506988048553467 - }, - { - "auxiliary_loss_clip": 0.01130399, - "auxiliary_loss_mlp": 0.01046525, - "balance_loss_clip": 1.0515976, - "balance_loss_mlp": 1.02988303, - "epoch": 0.2992334285284834, - "flos": 15559160532480.0, - "grad_norm": 2.005019372487673, - "language_loss": 0.86173046, - "learning_rate": 3.2865859972777827e-06, - "loss": 0.88349968, - "num_input_tokens_seen": 107123840, - "step": 4977, - "time_per_iteration": 2.665029764175415 - }, - { - "auxiliary_loss_clip": 0.01118345, - "auxiliary_loss_mlp": 0.01041325, - "balance_loss_clip": 1.05032945, - "balance_loss_mlp": 1.02443314, - "epoch": 0.29929355178115136, - "flos": 21797561900160.0, - "grad_norm": 1.7658271873172786, - "language_loss": 0.68290305, - "learning_rate": 3.2862877931994088e-06, - "loss": 0.70449972, - "num_input_tokens_seen": 107143475, - "step": 4978, - "time_per_iteration": 2.8401222229003906 - }, - { - "auxiliary_loss_clip": 0.011259, - "auxiliary_loss_mlp": 0.0104045, - "balance_loss_clip": 1.05556107, - "balance_loss_mlp": 1.02268767, - "epoch": 0.2993536750338193, - "flos": 21178533888000.0, - "grad_norm": 2.254262103488659, - "language_loss": 0.76281357, - "learning_rate": 3.2859895403448726e-06, - "loss": 0.78447711, - "num_input_tokens_seen": 107161725, - "step": 4979, - "time_per_iteration": 2.7814600467681885 - }, - { - "auxiliary_loss_clip": 0.01090165, - "auxiliary_loss_mlp": 0.0104942, - "balance_loss_clip": 1.04378402, - "balance_loss_mlp": 1.03001285, - "epoch": 0.2994137982864873, - "flos": 32122130544000.0, - "grad_norm": 2.1261514095664253, - "language_loss": 0.68627954, - "learning_rate": 3.285691238725484e-06, - "loss": 0.70767546, - "num_input_tokens_seen": 107183935, - "step": 4980, - "time_per_iteration": 2.891620635986328 - }, - { - "auxiliary_loss_clip": 0.01130184, - "auxiliary_loss_mlp": 0.00774942, - "balance_loss_clip": 1.0525018, - "balance_loss_mlp": 1.00121665, - "epoch": 0.29947392153915525, - "flos": 21105419754240.0, - "grad_norm": 2.1372298066204114, - "language_loss": 0.73153281, - "learning_rate": 3.285392888352555e-06, - "loss": 0.75058407, - "num_input_tokens_seen": 107204285, - "step": 4981, - "time_per_iteration": 5.394481420516968 - }, - { - "auxiliary_loss_clip": 0.01131964, - "auxiliary_loss_mlp": 0.0103921, - "balance_loss_clip": 1.0491364, - "balance_loss_mlp": 1.02280653, - "epoch": 0.2995340447918232, - "flos": 21542632099200.0, - "grad_norm": 1.6530173596529, - "language_loss": 0.86516619, - "learning_rate": 3.2850944892373987e-06, - "loss": 0.88687789, - "num_input_tokens_seen": 107225265, - "step": 4982, - "time_per_iteration": 4.269104480743408 - }, - { - "auxiliary_loss_clip": 0.01122605, - "auxiliary_loss_mlp": 0.01045235, - "balance_loss_clip": 1.05186415, - "balance_loss_mlp": 1.02632844, - "epoch": 0.2995941680444912, - "flos": 16725143917440.0, - "grad_norm": 2.446225936700185, - "language_loss": 0.86517423, - "learning_rate": 3.2847960413913307e-06, - "loss": 0.88685262, - "num_input_tokens_seen": 107241335, - "step": 4983, - "time_per_iteration": 2.844748020172119 - }, - { - "auxiliary_loss_clip": 0.01127565, - "auxiliary_loss_mlp": 0.01041992, - "balance_loss_clip": 1.05255556, - "balance_loss_mlp": 1.02594662, - "epoch": 0.2996542912971592, - "flos": 20923496346240.0, - "grad_norm": 2.024163877740881, - "language_loss": 0.78712893, - "learning_rate": 3.284497544825668e-06, - "loss": 0.80882448, - "num_input_tokens_seen": 107259375, - "step": 4984, - "time_per_iteration": 2.6945550441741943 - }, - { - "auxiliary_loss_clip": 0.01110139, - "auxiliary_loss_mlp": 0.01046002, - "balance_loss_clip": 1.0492574, - "balance_loss_mlp": 1.02761972, - "epoch": 0.29971441454982717, - "flos": 25079868754560.0, - "grad_norm": 1.5529534411437271, - "language_loss": 0.78736818, - "learning_rate": 3.2841989995517303e-06, - "loss": 0.8089295, - "num_input_tokens_seen": 107279890, - "step": 4985, - "time_per_iteration": 2.8082690238952637 - }, - { - "auxiliary_loss_clip": 0.01083189, - "auxiliary_loss_mlp": 0.01050178, - "balance_loss_clip": 1.04330277, - "balance_loss_mlp": 1.02925658, - "epoch": 0.29977453780249513, - "flos": 52555911840000.0, - "grad_norm": 2.2301347819864112, - "language_loss": 0.72089684, - "learning_rate": 3.283900405580837e-06, - "loss": 0.74223053, - "num_input_tokens_seen": 107303430, - "step": 4986, - "time_per_iteration": 4.54891562461853 - }, - { - "auxiliary_loss_clip": 0.01119419, - "auxiliary_loss_mlp": 0.01047564, - "balance_loss_clip": 1.04838538, - "balance_loss_mlp": 1.03007603, - "epoch": 0.2998346610551631, - "flos": 22237144542720.0, - "grad_norm": 2.1453051702670787, - "language_loss": 0.73143345, - "learning_rate": 3.283601762924312e-06, - "loss": 0.75310332, - "num_input_tokens_seen": 107323700, - "step": 4987, - "time_per_iteration": 4.324375152587891 - }, - { - "auxiliary_loss_clip": 0.01111213, - "auxiliary_loss_mlp": 0.01039103, - "balance_loss_clip": 1.04803324, - "balance_loss_mlp": 1.0233314, - "epoch": 0.29989478430783106, - "flos": 16873203778560.0, - "grad_norm": 2.095598578062247, - "language_loss": 0.80221194, - "learning_rate": 3.2833030715934793e-06, - "loss": 0.82371509, - "num_input_tokens_seen": 107341965, - "step": 4988, - "time_per_iteration": 2.772221565246582 - }, - { - "auxiliary_loss_clip": 0.01114945, - "auxiliary_loss_mlp": 0.00777889, - "balance_loss_clip": 1.04905486, - "balance_loss_mlp": 1.0013597, - "epoch": 0.29995490756049903, - "flos": 23768878164480.0, - "grad_norm": 1.6966696236855432, - "language_loss": 0.70858777, - "learning_rate": 3.2830043315996658e-06, - "loss": 0.72751617, - "num_input_tokens_seen": 107362615, - "step": 4989, - "time_per_iteration": 2.7470130920410156 - }, - { - "auxiliary_loss_clip": 0.0110827, - "auxiliary_loss_mlp": 0.01046589, - "balance_loss_clip": 1.0506041, - "balance_loss_mlp": 1.02906489, - "epoch": 0.300015030813167, - "flos": 14465321614080.0, - "grad_norm": 1.9545100728262668, - "language_loss": 0.85589516, - "learning_rate": 3.282705542954199e-06, - "loss": 0.87744367, - "num_input_tokens_seen": 107378980, - "step": 4990, - "time_per_iteration": 2.808276414871216 - }, - { - "auxiliary_loss_clip": 0.01133569, - "auxiliary_loss_mlp": 0.0103974, - "balance_loss_clip": 1.05172086, - "balance_loss_mlp": 1.02152538, - "epoch": 0.30007515406583496, - "flos": 25191982080000.0, - "grad_norm": 1.8023870470649808, - "language_loss": 0.67019355, - "learning_rate": 3.28240670566841e-06, - "loss": 0.69192666, - "num_input_tokens_seen": 107397640, - "step": 4991, - "time_per_iteration": 2.7097268104553223 - }, - { - "auxiliary_loss_clip": 0.0112021, - "auxiliary_loss_mlp": 0.01041383, - "balance_loss_clip": 1.04660511, - "balance_loss_mlp": 1.02248883, - "epoch": 0.3001352773185029, - "flos": 19391188106880.0, - "grad_norm": 1.684252307124257, - "language_loss": 0.78640115, - "learning_rate": 3.28210781975363e-06, - "loss": 0.80801708, - "num_input_tokens_seen": 107416020, - "step": 4992, - "time_per_iteration": 2.66925311088562 - }, - { - "auxiliary_loss_clip": 0.01143243, - "auxiliary_loss_mlp": 0.01041924, - "balance_loss_clip": 1.05240428, - "balance_loss_mlp": 1.02457952, - "epoch": 0.3001954005711709, - "flos": 21543853161600.0, - "grad_norm": 2.3134173579188175, - "language_loss": 0.82057947, - "learning_rate": 3.281808885221193e-06, - "loss": 0.84243113, - "num_input_tokens_seen": 107436340, - "step": 4993, - "time_per_iteration": 2.613849639892578 - }, - { - "auxiliary_loss_clip": 0.01096023, - "auxiliary_loss_mlp": 0.01048917, - "balance_loss_clip": 1.04667079, - "balance_loss_mlp": 1.02997458, - "epoch": 0.30025552382383885, - "flos": 17384320356480.0, - "grad_norm": 2.1042579138834197, - "language_loss": 0.86142659, - "learning_rate": 3.2815099020824345e-06, - "loss": 0.88287598, - "num_input_tokens_seen": 107454585, - "step": 4994, - "time_per_iteration": 2.703126907348633 - }, - { - "auxiliary_loss_clip": 0.01118329, - "auxiliary_loss_mlp": 0.01041975, - "balance_loss_clip": 1.05592799, - "balance_loss_mlp": 1.02504694, - "epoch": 0.3003156470765068, - "flos": 29533330552320.0, - "grad_norm": 1.5905866784601752, - "language_loss": 0.80834931, - "learning_rate": 3.2812108703486924e-06, - "loss": 0.82995236, - "num_input_tokens_seen": 107477180, - "step": 4995, - "time_per_iteration": 2.8100333213806152 - }, - { - "auxiliary_loss_clip": 0.01117939, - "auxiliary_loss_mlp": 0.01043612, - "balance_loss_clip": 1.05073023, - "balance_loss_mlp": 1.02623129, - "epoch": 0.3003757703291748, - "flos": 43646402465280.0, - "grad_norm": 1.9490007813217745, - "language_loss": 0.67086798, - "learning_rate": 3.2809117900313055e-06, - "loss": 0.69248348, - "num_input_tokens_seen": 107500250, - "step": 4996, - "time_per_iteration": 2.989062786102295 - }, - { - "auxiliary_loss_clip": 0.01114657, - "auxiliary_loss_mlp": 0.01042055, - "balance_loss_clip": 1.04888701, - "balance_loss_mlp": 1.02449584, - "epoch": 0.30043589358184275, - "flos": 22528380015360.0, - "grad_norm": 4.4692930536610245, - "language_loss": 0.75825363, - "learning_rate": 3.280612661141615e-06, - "loss": 0.7798208, - "num_input_tokens_seen": 107520070, - "step": 4997, - "time_per_iteration": 2.733402967453003 - }, - { - "auxiliary_loss_clip": 0.01131118, - "auxiliary_loss_mlp": 0.0104737, - "balance_loss_clip": 1.05176449, - "balance_loss_mlp": 1.03149128, - "epoch": 0.30049601683451077, - "flos": 20995892208000.0, - "grad_norm": 2.0588160995259197, - "language_loss": 0.78425241, - "learning_rate": 3.2803134836909646e-06, - "loss": 0.80603731, - "num_input_tokens_seen": 107539285, - "step": 4998, - "time_per_iteration": 2.7973837852478027 - }, - { - "auxiliary_loss_clip": 0.011392, - "auxiliary_loss_mlp": 0.01044927, - "balance_loss_clip": 1.05180395, - "balance_loss_mlp": 1.0287745, - "epoch": 0.30055614008717874, - "flos": 23916004272000.0, - "grad_norm": 18.871291300313036, - "language_loss": 0.73622382, - "learning_rate": 3.2800142576906985e-06, - "loss": 0.7580651, - "num_input_tokens_seen": 107560260, - "step": 4999, - "time_per_iteration": 2.7197916507720947 - }, - { - "auxiliary_loss_clip": 0.01131684, - "auxiliary_loss_mlp": 0.01044515, - "balance_loss_clip": 1.05033612, - "balance_loss_mlp": 1.02750361, - "epoch": 0.3006162633398467, - "flos": 19169798630400.0, - "grad_norm": 1.6090337016392804, - "language_loss": 0.75454789, - "learning_rate": 3.2797149831521626e-06, - "loss": 0.77630985, - "num_input_tokens_seen": 107579260, - "step": 5000, - "time_per_iteration": 2.688054323196411 - }, - { - "auxiliary_loss_clip": 0.01138443, - "auxiliary_loss_mlp": 0.01041074, - "balance_loss_clip": 1.0505259, - "balance_loss_mlp": 1.02564812, - "epoch": 0.30067638659251467, - "flos": 14679241061760.0, - "grad_norm": 1.7985326326547535, - "language_loss": 0.81841409, - "learning_rate": 3.2794156600867073e-06, - "loss": 0.84020931, - "num_input_tokens_seen": 107595245, - "step": 5001, - "time_per_iteration": 2.6519837379455566 - }, - { - "auxiliary_loss_clip": 0.01128756, - "auxiliary_loss_mlp": 0.01048602, - "balance_loss_clip": 1.05139947, - "balance_loss_mlp": 1.03068447, - "epoch": 0.30073650984518263, - "flos": 23368007404800.0, - "grad_norm": 1.8684342377814658, - "language_loss": 0.7999261, - "learning_rate": 3.2791162885056815e-06, - "loss": 0.82169974, - "num_input_tokens_seen": 107613985, - "step": 5002, - "time_per_iteration": 2.6749327182769775 - }, - { - "auxiliary_loss_clip": 0.01091983, - "auxiliary_loss_mlp": 0.0104282, - "balance_loss_clip": 1.04869151, - "balance_loss_mlp": 1.02431834, - "epoch": 0.3007966330978506, - "flos": 22966633854720.0, - "grad_norm": 1.9577039368374018, - "language_loss": 0.70993537, - "learning_rate": 3.2788168684204376e-06, - "loss": 0.73128337, - "num_input_tokens_seen": 107631435, - "step": 5003, - "time_per_iteration": 2.908494472503662 - }, - { - "auxiliary_loss_clip": 0.01110546, - "auxiliary_loss_mlp": 0.01043883, - "balance_loss_clip": 1.05014396, - "balance_loss_mlp": 1.02643037, - "epoch": 0.30085675635051856, - "flos": 27818452460160.0, - "grad_norm": 1.956987555909332, - "language_loss": 0.70556092, - "learning_rate": 3.27851739984233e-06, - "loss": 0.72710526, - "num_input_tokens_seen": 107650530, - "step": 5004, - "time_per_iteration": 2.8064236640930176 - }, - { - "auxiliary_loss_clip": 0.01119172, - "auxiliary_loss_mlp": 0.01045143, - "balance_loss_clip": 1.05067444, - "balance_loss_mlp": 1.02800083, - "epoch": 0.3009168796031865, - "flos": 10882729059840.0, - "grad_norm": 2.8453259041050805, - "language_loss": 0.81459486, - "learning_rate": 3.278217882782715e-06, - "loss": 0.83623803, - "num_input_tokens_seen": 107662240, - "step": 5005, - "time_per_iteration": 2.633951425552368 - }, - { - "auxiliary_loss_clip": 0.01130639, - "auxiliary_loss_mlp": 0.01043853, - "balance_loss_clip": 1.0514015, - "balance_loss_mlp": 1.02742577, - "epoch": 0.3009770028558545, - "flos": 23805399317760.0, - "grad_norm": 3.7156546302240043, - "language_loss": 0.74672973, - "learning_rate": 3.2779183172529497e-06, - "loss": 0.76847464, - "num_input_tokens_seen": 107680330, - "step": 5006, - "time_per_iteration": 2.7556662559509277 - }, - { - "auxiliary_loss_clip": 0.01101239, - "auxiliary_loss_mlp": 0.00775371, - "balance_loss_clip": 1.04850578, - "balance_loss_mlp": 1.00104856, - "epoch": 0.30103712610852246, - "flos": 26468211283200.0, - "grad_norm": 2.0504029481480153, - "language_loss": 0.71090448, - "learning_rate": 3.2776187032643932e-06, - "loss": 0.72967064, - "num_input_tokens_seen": 107700020, - "step": 5007, - "time_per_iteration": 2.83591365814209 - }, - { - "auxiliary_loss_clip": 0.01129575, - "auxiliary_loss_mlp": 0.01038114, - "balance_loss_clip": 1.05173922, - "balance_loss_mlp": 1.0206027, - "epoch": 0.3010972493611904, - "flos": 22856459863680.0, - "grad_norm": 2.302333802055736, - "language_loss": 0.76504552, - "learning_rate": 3.2773190408284075e-06, - "loss": 0.78672242, - "num_input_tokens_seen": 107718575, - "step": 5008, - "time_per_iteration": 2.7624082565307617 - }, - { - "auxiliary_loss_clip": 0.0112694, - "auxiliary_loss_mlp": 0.01039735, - "balance_loss_clip": 1.05119205, - "balance_loss_mlp": 1.02284265, - "epoch": 0.3011573726138584, - "flos": 24053685102720.0, - "grad_norm": 1.840633361886899, - "language_loss": 0.84215975, - "learning_rate": 3.2770193299563564e-06, - "loss": 0.86382657, - "num_input_tokens_seen": 107738635, - "step": 5009, - "time_per_iteration": 2.7053475379943848 - }, - { - "auxiliary_loss_clip": 0.01135722, - "auxiliary_loss_mlp": 0.0104281, - "balance_loss_clip": 1.05079174, - "balance_loss_mlp": 1.02389145, - "epoch": 0.30121749586652635, - "flos": 20259687052800.0, - "grad_norm": 1.970244045667646, - "language_loss": 0.83804011, - "learning_rate": 3.276719570659604e-06, - "loss": 0.85982549, - "num_input_tokens_seen": 107753415, - "step": 5010, - "time_per_iteration": 2.677002429962158 - }, - { - "auxiliary_loss_clip": 0.01108582, - "auxiliary_loss_mlp": 0.01038214, - "balance_loss_clip": 1.04942024, - "balance_loss_mlp": 1.02294374, - "epoch": 0.3012776191191944, - "flos": 26943058103040.0, - "grad_norm": 2.3216326772862246, - "language_loss": 0.85401523, - "learning_rate": 3.2764197629495176e-06, - "loss": 0.87548327, - "num_input_tokens_seen": 107773840, - "step": 5011, - "time_per_iteration": 2.807887077331543 - }, - { - "auxiliary_loss_clip": 0.01119452, - "auxiliary_loss_mlp": 0.01044648, - "balance_loss_clip": 1.04522014, - "balance_loss_mlp": 1.02680194, - "epoch": 0.30133774237186234, - "flos": 20412307941120.0, - "grad_norm": 2.58081844210284, - "language_loss": 0.72122502, - "learning_rate": 3.2761199068374656e-06, - "loss": 0.74286604, - "num_input_tokens_seen": 107792020, - "step": 5012, - "time_per_iteration": 2.689375400543213 - }, - { - "auxiliary_loss_clip": 0.01127162, - "auxiliary_loss_mlp": 0.01042946, - "balance_loss_clip": 1.04826403, - "balance_loss_mlp": 1.02628016, - "epoch": 0.3013978656245303, - "flos": 19792453916160.0, - "grad_norm": 2.871668468467944, - "language_loss": 0.88278735, - "learning_rate": 3.275820002334819e-06, - "loss": 0.90448833, - "num_input_tokens_seen": 107809595, - "step": 5013, - "time_per_iteration": 2.6482350826263428 - }, - { - "auxiliary_loss_clip": 0.01110184, - "auxiliary_loss_mlp": 0.01050326, - "balance_loss_clip": 1.04318821, - "balance_loss_mlp": 1.0286417, - "epoch": 0.30145798887719827, - "flos": 16249650652800.0, - "grad_norm": 1.8756845710135603, - "language_loss": 0.82593644, - "learning_rate": 3.2755200494529496e-06, - "loss": 0.84754151, - "num_input_tokens_seen": 107827230, - "step": 5014, - "time_per_iteration": 2.6681008338928223 - }, - { - "auxiliary_loss_clip": 0.01092673, - "auxiliary_loss_mlp": 0.01047692, - "balance_loss_clip": 1.04461288, - "balance_loss_mlp": 1.03045392, - "epoch": 0.30151811212986623, - "flos": 24571733005440.0, - "grad_norm": 1.7101695757694795, - "language_loss": 0.68239003, - "learning_rate": 3.2752200482032323e-06, - "loss": 0.7037937, - "num_input_tokens_seen": 107847195, - "step": 5015, - "time_per_iteration": 2.725411891937256 - }, - { - "auxiliary_loss_clip": 0.01110447, - "auxiliary_loss_mlp": 0.01043819, - "balance_loss_clip": 1.0448432, - "balance_loss_mlp": 1.02652168, - "epoch": 0.3015782353825342, - "flos": 21872076664320.0, - "grad_norm": 2.2766913154728625, - "language_loss": 0.74497074, - "learning_rate": 3.2749199985970436e-06, - "loss": 0.76651341, - "num_input_tokens_seen": 107866420, - "step": 5016, - "time_per_iteration": 2.710721492767334 - }, - { - "auxiliary_loss_clip": 0.01133464, - "auxiliary_loss_mlp": 0.01041604, - "balance_loss_clip": 1.05026031, - "balance_loss_mlp": 1.02444994, - "epoch": 0.30163835863520216, - "flos": 28769331248640.0, - "grad_norm": 1.7847015072033203, - "language_loss": 0.65504754, - "learning_rate": 3.2746199006457603e-06, - "loss": 0.67679822, - "num_input_tokens_seen": 107889090, - "step": 5017, - "time_per_iteration": 2.7239317893981934 - }, - { - "auxiliary_loss_clip": 0.01091977, - "auxiliary_loss_mlp": 0.01057247, - "balance_loss_clip": 1.04233074, - "balance_loss_mlp": 1.03813791, - "epoch": 0.30169848188787013, - "flos": 22966202891520.0, - "grad_norm": 2.1696927992492783, - "language_loss": 0.68739498, - "learning_rate": 3.2743197543607628e-06, - "loss": 0.70888722, - "num_input_tokens_seen": 107907520, - "step": 5018, - "time_per_iteration": 2.6655359268188477 - }, - { - "auxiliary_loss_clip": 0.01135218, - "auxiliary_loss_mlp": 0.01042787, - "balance_loss_clip": 1.0482893, - "balance_loss_mlp": 1.02783799, - "epoch": 0.3017586051405381, - "flos": 21835268202240.0, - "grad_norm": 1.9457029488983892, - "language_loss": 0.78853333, - "learning_rate": 3.2740195597534327e-06, - "loss": 0.8103134, - "num_input_tokens_seen": 107925650, - "step": 5019, - "time_per_iteration": 2.669679641723633 - }, - { - "auxiliary_loss_clip": 0.01112458, - "auxiliary_loss_mlp": 0.01044161, - "balance_loss_clip": 1.04863656, - "balance_loss_mlp": 1.02766263, - "epoch": 0.30181872839320606, - "flos": 22160403135360.0, - "grad_norm": 3.674249330665847, - "language_loss": 0.70038712, - "learning_rate": 3.2737193168351527e-06, - "loss": 0.72195333, - "num_input_tokens_seen": 107943975, - "step": 5020, - "time_per_iteration": 2.704000234603882 - }, - { - "auxiliary_loss_clip": 0.01143422, - "auxiliary_loss_mlp": 0.01049684, - "balance_loss_clip": 1.05071819, - "balance_loss_mlp": 1.03320909, - "epoch": 0.301878851645874, - "flos": 18114168804480.0, - "grad_norm": 5.641410405732297, - "language_loss": 0.78549969, - "learning_rate": 3.2734190256173085e-06, - "loss": 0.80743068, - "num_input_tokens_seen": 107962950, - "step": 5021, - "time_per_iteration": 4.521278142929077 - }, - { - "auxiliary_loss_clip": 0.01129372, - "auxiliary_loss_mlp": 0.01031797, - "balance_loss_clip": 1.04859924, - "balance_loss_mlp": 1.01572752, - "epoch": 0.301938974898542, - "flos": 17602226213760.0, - "grad_norm": 3.308202374048827, - "language_loss": 0.75482392, - "learning_rate": 3.2731186861112877e-06, - "loss": 0.77643561, - "num_input_tokens_seen": 107979700, - "step": 5022, - "time_per_iteration": 4.1478235721588135 - }, - { - "auxiliary_loss_clip": 0.01141828, - "auxiliary_loss_mlp": 0.01043797, - "balance_loss_clip": 1.04905522, - "balance_loss_mlp": 1.02676249, - "epoch": 0.30199909815120995, - "flos": 11181219079680.0, - "grad_norm": 1.7715139184612991, - "language_loss": 0.69534874, - "learning_rate": 3.2728182983284793e-06, - "loss": 0.71720505, - "num_input_tokens_seen": 107996645, - "step": 5023, - "time_per_iteration": 2.582491636276245 - }, - { - "auxiliary_loss_clip": 0.01112614, - "auxiliary_loss_mlp": 0.01040881, - "balance_loss_clip": 1.04434311, - "balance_loss_mlp": 1.02471602, - "epoch": 0.302059221403878, - "flos": 21907843632000.0, - "grad_norm": 4.128865002464027, - "language_loss": 0.71400636, - "learning_rate": 3.2725178622802724e-06, - "loss": 0.73554134, - "num_input_tokens_seen": 108015020, - "step": 5024, - "time_per_iteration": 2.6789708137512207 - }, - { - "auxiliary_loss_clip": 0.01125475, - "auxiliary_loss_mlp": 0.01051317, - "balance_loss_clip": 1.04789031, - "balance_loss_mlp": 1.03441346, - "epoch": 0.30211934465654594, - "flos": 26396390039040.0, - "grad_norm": 2.5352325664815396, - "language_loss": 0.73949707, - "learning_rate": 3.272217377978061e-06, - "loss": 0.76126498, - "num_input_tokens_seen": 108036430, - "step": 5025, - "time_per_iteration": 2.7021281719207764 - }, - { - "auxiliary_loss_clip": 0.01129438, - "auxiliary_loss_mlp": 0.01049255, - "balance_loss_clip": 1.05115628, - "balance_loss_mlp": 1.03333473, - "epoch": 0.3021794679092139, - "flos": 23400470321280.0, - "grad_norm": 1.5312912087399582, - "language_loss": 0.67339373, - "learning_rate": 3.2719168454332387e-06, - "loss": 0.69518065, - "num_input_tokens_seen": 108054250, - "step": 5026, - "time_per_iteration": 4.172817230224609 - }, - { - "auxiliary_loss_clip": 0.01131398, - "auxiliary_loss_mlp": 0.01045765, - "balance_loss_clip": 1.05058789, - "balance_loss_mlp": 1.02871835, - "epoch": 0.30223959116188187, - "flos": 20260979942400.0, - "grad_norm": 1.8656003857402752, - "language_loss": 0.84821522, - "learning_rate": 3.2716162646572034e-06, - "loss": 0.86998689, - "num_input_tokens_seen": 108071495, - "step": 5027, - "time_per_iteration": 2.66186785697937 - }, - { - "auxiliary_loss_clip": 0.01104085, - "auxiliary_loss_mlp": 0.01045706, - "balance_loss_clip": 1.04686451, - "balance_loss_mlp": 1.03030431, - "epoch": 0.30229971441454984, - "flos": 26687840993280.0, - "grad_norm": 1.633485895123786, - "language_loss": 0.78574622, - "learning_rate": 3.271315635661351e-06, - "loss": 0.80724418, - "num_input_tokens_seen": 108092135, - "step": 5028, - "time_per_iteration": 4.454678297042847 - }, - { - "auxiliary_loss_clip": 0.01113383, - "auxiliary_loss_mlp": 0.01048022, - "balance_loss_clip": 1.04682207, - "balance_loss_mlp": 1.03115392, - "epoch": 0.3023598376672178, - "flos": 34345323953280.0, - "grad_norm": 1.9340935936746968, - "language_loss": 0.77085543, - "learning_rate": 3.2710149584570826e-06, - "loss": 0.79246956, - "num_input_tokens_seen": 108112945, - "step": 5029, - "time_per_iteration": 2.841707229614258 - }, - { - "auxiliary_loss_clip": 0.01111921, - "auxiliary_loss_mlp": 0.01048937, - "balance_loss_clip": 1.04846191, - "balance_loss_mlp": 1.02920818, - "epoch": 0.30241996091988577, - "flos": 23112143850240.0, - "grad_norm": 2.1432001376374257, - "language_loss": 0.8240397, - "learning_rate": 3.2707142330557993e-06, - "loss": 0.84564829, - "num_input_tokens_seen": 108130325, - "step": 5030, - "time_per_iteration": 2.8557751178741455 - }, - { - "auxiliary_loss_clip": 0.01090897, - "auxiliary_loss_mlp": 0.00775419, - "balance_loss_clip": 1.04519463, - "balance_loss_mlp": 1.00112486, - "epoch": 0.30248008417255373, - "flos": 19390002958080.0, - "grad_norm": 2.2374457582531098, - "language_loss": 0.6987617, - "learning_rate": 3.270413459468905e-06, - "loss": 0.71742487, - "num_input_tokens_seen": 108150300, - "step": 5031, - "time_per_iteration": 2.7827746868133545 - }, - { - "auxiliary_loss_clip": 0.01121676, - "auxiliary_loss_mlp": 0.01044463, - "balance_loss_clip": 1.04549253, - "balance_loss_mlp": 1.02800059, - "epoch": 0.3025402074252217, - "flos": 23769704177280.0, - "grad_norm": 1.8685207024800563, - "language_loss": 0.82324117, - "learning_rate": 3.2701126377078047e-06, - "loss": 0.84490258, - "num_input_tokens_seen": 108170330, - "step": 5032, - "time_per_iteration": 2.6529927253723145 - }, - { - "auxiliary_loss_clip": 0.01104945, - "auxiliary_loss_mlp": 0.01059072, - "balance_loss_clip": 1.05129266, - "balance_loss_mlp": 1.03951025, - "epoch": 0.30260033067788966, - "flos": 25994118648960.0, - "grad_norm": 2.130148669813867, - "language_loss": 0.73156881, - "learning_rate": 3.269811767783906e-06, - "loss": 0.75320899, - "num_input_tokens_seen": 108191265, - "step": 5033, - "time_per_iteration": 2.7259597778320312 - }, - { - "auxiliary_loss_clip": 0.01124221, - "auxiliary_loss_mlp": 0.01049397, - "balance_loss_clip": 1.04687023, - "balance_loss_mlp": 1.03221893, - "epoch": 0.3026604539305576, - "flos": 25374551932800.0, - "grad_norm": 1.564237149834404, - "language_loss": 0.74164939, - "learning_rate": 3.2695108497086185e-06, - "loss": 0.76338559, - "num_input_tokens_seen": 108211615, - "step": 5034, - "time_per_iteration": 2.674745798110962 - }, - { - "auxiliary_loss_clip": 0.01140313, - "auxiliary_loss_mlp": 0.01039121, - "balance_loss_clip": 1.04939198, - "balance_loss_mlp": 1.02224064, - "epoch": 0.3027205771832256, - "flos": 25812733944960.0, - "grad_norm": 1.8295549596836873, - "language_loss": 0.72133434, - "learning_rate": 3.269209883493352e-06, - "loss": 0.74312872, - "num_input_tokens_seen": 108231080, - "step": 5035, - "time_per_iteration": 2.6429855823516846 - }, - { - "auxiliary_loss_clip": 0.01123118, - "auxiliary_loss_mlp": 0.01038432, - "balance_loss_clip": 1.04499483, - "balance_loss_mlp": 1.02267289, - "epoch": 0.30278070043589356, - "flos": 27344539393920.0, - "grad_norm": 2.468501372591198, - "language_loss": 0.86918867, - "learning_rate": 3.2689088691495196e-06, - "loss": 0.89080417, - "num_input_tokens_seen": 108251125, - "step": 5036, - "time_per_iteration": 2.6735007762908936 - }, - { - "auxiliary_loss_clip": 0.01097642, - "auxiliary_loss_mlp": 0.01051442, - "balance_loss_clip": 1.04504728, - "balance_loss_mlp": 1.0331912, - "epoch": 0.3028408236885616, - "flos": 24786227070720.0, - "grad_norm": 2.859596651876304, - "language_loss": 0.77406383, - "learning_rate": 3.268607806688536e-06, - "loss": 0.79555464, - "num_input_tokens_seen": 108272545, - "step": 5037, - "time_per_iteration": 2.7311182022094727 - }, - { - "auxiliary_loss_clip": 0.01102304, - "auxiliary_loss_mlp": 0.01044604, - "balance_loss_clip": 1.0462358, - "balance_loss_mlp": 1.02683008, - "epoch": 0.30290094694122954, - "flos": 12932474670720.0, - "grad_norm": 2.32450780354164, - "language_loss": 0.77307165, - "learning_rate": 3.268306696121816e-06, - "loss": 0.79454064, - "num_input_tokens_seen": 108289725, - "step": 5038, - "time_per_iteration": 2.677525043487549 - }, - { - "auxiliary_loss_clip": 0.01113965, - "auxiliary_loss_mlp": 0.01037105, - "balance_loss_clip": 1.04819584, - "balance_loss_mlp": 1.02067804, - "epoch": 0.3029610701938975, - "flos": 25916443488000.0, - "grad_norm": 2.1234468188232976, - "language_loss": 0.74140579, - "learning_rate": 3.2680055374607804e-06, - "loss": 0.76291645, - "num_input_tokens_seen": 108310690, - "step": 5039, - "time_per_iteration": 2.7086853981018066 - }, - { - "auxiliary_loss_clip": 0.01137739, - "auxiliary_loss_mlp": 0.00774651, - "balance_loss_clip": 1.05068994, - "balance_loss_mlp": 1.00113058, - "epoch": 0.3030211934465655, - "flos": 21980993679360.0, - "grad_norm": 2.3826017374700372, - "language_loss": 0.79777801, - "learning_rate": 3.267704330716847e-06, - "loss": 0.81690192, - "num_input_tokens_seen": 108328905, - "step": 5040, - "time_per_iteration": 2.665175199508667 - }, - { - "auxiliary_loss_clip": 0.01114198, - "auxiliary_loss_mlp": 0.01038229, - "balance_loss_clip": 1.04937124, - "balance_loss_mlp": 1.02279687, - "epoch": 0.30308131669923344, - "flos": 20991977625600.0, - "grad_norm": 1.7800027985776907, - "language_loss": 0.81872481, - "learning_rate": 3.267403075901438e-06, - "loss": 0.84024912, - "num_input_tokens_seen": 108346680, - "step": 5041, - "time_per_iteration": 2.6471712589263916 - }, - { - "auxiliary_loss_clip": 0.01018002, - "auxiliary_loss_mlp": 0.01004656, - "balance_loss_clip": 1.0244385, - "balance_loss_mlp": 1.00277221, - "epoch": 0.3031414399519014, - "flos": 60548875827840.0, - "grad_norm": 0.7715538683836823, - "language_loss": 0.59505904, - "learning_rate": 3.267101773025978e-06, - "loss": 0.61528552, - "num_input_tokens_seen": 108413885, - "step": 5042, - "time_per_iteration": 3.3167309761047363 - }, - { - "auxiliary_loss_clip": 0.0114486, - "auxiliary_loss_mlp": 0.01036647, - "balance_loss_clip": 1.05319929, - "balance_loss_mlp": 1.01940918, - "epoch": 0.30320156320456937, - "flos": 21907664064000.0, - "grad_norm": 1.838538817411587, - "language_loss": 0.71149278, - "learning_rate": 3.266800422101892e-06, - "loss": 0.73330784, - "num_input_tokens_seen": 108433640, - "step": 5043, - "time_per_iteration": 2.6266753673553467 - }, - { - "auxiliary_loss_clip": 0.01095086, - "auxiliary_loss_mlp": 0.01036293, - "balance_loss_clip": 1.04519725, - "balance_loss_mlp": 1.01948404, - "epoch": 0.30326168645723733, - "flos": 21652770176640.0, - "grad_norm": 3.620919115388089, - "language_loss": 0.69573802, - "learning_rate": 3.266499023140606e-06, - "loss": 0.71705186, - "num_input_tokens_seen": 108452640, - "step": 5044, - "time_per_iteration": 2.7561492919921875 - }, - { - "auxiliary_loss_clip": 0.01127659, - "auxiliary_loss_mlp": 0.01039805, - "balance_loss_clip": 1.05019724, - "balance_loss_mlp": 1.02335382, - "epoch": 0.3033218097099053, - "flos": 21871286565120.0, - "grad_norm": 1.3797061223764004, - "language_loss": 0.77188826, - "learning_rate": 3.2661975761535513e-06, - "loss": 0.79356289, - "num_input_tokens_seen": 108472470, - "step": 5045, - "time_per_iteration": 2.6529667377471924 - }, - { - "auxiliary_loss_clip": 0.01141388, - "auxiliary_loss_mlp": 0.00775246, - "balance_loss_clip": 1.05165195, - "balance_loss_mlp": 1.00136316, - "epoch": 0.30338193296257326, - "flos": 27089717333760.0, - "grad_norm": 1.772786200303907, - "language_loss": 0.72473782, - "learning_rate": 3.2658960811521564e-06, - "loss": 0.74390417, - "num_input_tokens_seen": 108493025, - "step": 5046, - "time_per_iteration": 2.8433380126953125 - }, - { - "auxiliary_loss_clip": 0.01131475, - "auxiliary_loss_mlp": 0.01040342, - "balance_loss_clip": 1.04979491, - "balance_loss_mlp": 1.02119732, - "epoch": 0.30344205621524123, - "flos": 19534363718400.0, - "grad_norm": 1.7729778222487513, - "language_loss": 0.81406343, - "learning_rate": 3.2655945381478564e-06, - "loss": 0.83578163, - "num_input_tokens_seen": 108513480, - "step": 5047, - "time_per_iteration": 2.6653506755828857 - }, - { - "auxiliary_loss_clip": 0.01078955, - "auxiliary_loss_mlp": 0.01042974, - "balance_loss_clip": 1.04126537, - "balance_loss_mlp": 1.02565265, - "epoch": 0.3035021794679092, - "flos": 23910976368000.0, - "grad_norm": 2.0012909108595287, - "language_loss": 0.7191782, - "learning_rate": 3.265292947152084e-06, - "loss": 0.74039751, - "num_input_tokens_seen": 108533155, - "step": 5048, - "time_per_iteration": 2.7198410034179688 - }, - { - "auxiliary_loss_clip": 0.01117557, - "auxiliary_loss_mlp": 0.01037944, - "balance_loss_clip": 1.04860258, - "balance_loss_mlp": 1.02263796, - "epoch": 0.30356230272057716, - "flos": 16143606725760.0, - "grad_norm": 1.6260333435769418, - "language_loss": 0.75220919, - "learning_rate": 3.2649913081762763e-06, - "loss": 0.77376425, - "num_input_tokens_seen": 108551900, - "step": 5049, - "time_per_iteration": 2.6649906635284424 - }, - { - "auxiliary_loss_clip": 0.01131404, - "auxiliary_loss_mlp": 0.01035526, - "balance_loss_clip": 1.04947305, - "balance_loss_mlp": 1.01907563, - "epoch": 0.3036224259732452, - "flos": 28914697589760.0, - "grad_norm": 1.5855456549340856, - "language_loss": 0.82088244, - "learning_rate": 3.2646896212318717e-06, - "loss": 0.84255171, - "num_input_tokens_seen": 108574005, - "step": 5050, - "time_per_iteration": 2.657400131225586 - }, - { - "auxiliary_loss_clip": 0.01106158, - "auxiliary_loss_mlp": 0.0103828, - "balance_loss_clip": 1.05031502, - "balance_loss_mlp": 1.02079201, - "epoch": 0.30368254922591315, - "flos": 21105599322240.0, - "grad_norm": 2.7844840544166436, - "language_loss": 0.74196702, - "learning_rate": 3.2643878863303106e-06, - "loss": 0.7634114, - "num_input_tokens_seen": 108592715, - "step": 5051, - "time_per_iteration": 2.8018569946289062 - }, - { - "auxiliary_loss_clip": 0.01079332, - "auxiliary_loss_mlp": 0.00775567, - "balance_loss_clip": 1.04338145, - "balance_loss_mlp": 1.00118661, - "epoch": 0.3037426724785811, - "flos": 23002293081600.0, - "grad_norm": 1.6849730779493737, - "language_loss": 0.76015687, - "learning_rate": 3.264086103483033e-06, - "loss": 0.77870589, - "num_input_tokens_seen": 108611770, - "step": 5052, - "time_per_iteration": 2.9220657348632812 - }, - { - "auxiliary_loss_clip": 0.01143047, - "auxiliary_loss_mlp": 0.01043624, - "balance_loss_clip": 1.0504849, - "balance_loss_mlp": 1.02656555, - "epoch": 0.3038027957312491, - "flos": 15632705629440.0, - "grad_norm": 2.421175308310746, - "language_loss": 0.82370055, - "learning_rate": 3.2637842727014836e-06, - "loss": 0.84556723, - "num_input_tokens_seen": 108629070, - "step": 5053, - "time_per_iteration": 2.5955326557159424 - }, - { - "auxiliary_loss_clip": 0.01113702, - "auxiliary_loss_mlp": 0.01042002, - "balance_loss_clip": 1.0471338, - "balance_loss_mlp": 1.02475214, - "epoch": 0.30386291898391704, - "flos": 12713994195840.0, - "grad_norm": 1.8307418288785484, - "language_loss": 0.70979112, - "learning_rate": 3.2634823939971083e-06, - "loss": 0.73134822, - "num_input_tokens_seen": 108646315, - "step": 5054, - "time_per_iteration": 2.7001569271087646 - }, - { - "auxiliary_loss_clip": 0.01140964, - "auxiliary_loss_mlp": 0.01039805, - "balance_loss_clip": 1.05088401, - "balance_loss_mlp": 1.0225668, - "epoch": 0.303923042236585, - "flos": 26359437922560.0, - "grad_norm": 2.314538095600907, - "language_loss": 0.69049591, - "learning_rate": 3.2631804673813545e-06, - "loss": 0.71230358, - "num_input_tokens_seen": 108665920, - "step": 5055, - "time_per_iteration": 2.6685287952423096 - }, - { - "auxiliary_loss_clip": 0.01113325, - "auxiliary_loss_mlp": 0.01036352, - "balance_loss_clip": 1.04871488, - "balance_loss_mlp": 1.01880479, - "epoch": 0.30398316548925297, - "flos": 19719232041600.0, - "grad_norm": 1.959915959447654, - "language_loss": 0.67298615, - "learning_rate": 3.2628784928656707e-06, - "loss": 0.69448292, - "num_input_tokens_seen": 108683485, - "step": 5056, - "time_per_iteration": 2.6933648586273193 - }, - { - "auxiliary_loss_clip": 0.01110454, - "auxiliary_loss_mlp": 0.01043223, - "balance_loss_clip": 1.04604077, - "balance_loss_mlp": 1.02673686, - "epoch": 0.30404328874192094, - "flos": 24239846315520.0, - "grad_norm": 1.7045430221851803, - "language_loss": 0.82544303, - "learning_rate": 3.262576470461507e-06, - "loss": 0.84697986, - "num_input_tokens_seen": 108702700, - "step": 5057, - "time_per_iteration": 2.740187406539917 - }, - { - "auxiliary_loss_clip": 0.01115402, - "auxiliary_loss_mlp": 0.01039139, - "balance_loss_clip": 1.04719019, - "balance_loss_mlp": 1.0222472, - "epoch": 0.3041034119945889, - "flos": 24498942094080.0, - "grad_norm": 1.8459128585017135, - "language_loss": 0.88849652, - "learning_rate": 3.2622744001803176e-06, - "loss": 0.91004193, - "num_input_tokens_seen": 108721860, - "step": 5058, - "time_per_iteration": 2.7015340328216553 - }, - { - "auxiliary_loss_clip": 0.01102971, - "auxiliary_loss_mlp": 0.01047692, - "balance_loss_clip": 1.04598641, - "balance_loss_mlp": 1.03040063, - "epoch": 0.30416353524725687, - "flos": 28288881907200.0, - "grad_norm": 7.837576661900421, - "language_loss": 0.71809238, - "learning_rate": 3.2619722820335564e-06, - "loss": 0.73959899, - "num_input_tokens_seen": 108743215, - "step": 5059, - "time_per_iteration": 2.7542827129364014 - }, - { - "auxiliary_loss_clip": 0.01083101, - "auxiliary_loss_mlp": 0.01042605, - "balance_loss_clip": 1.04435182, - "balance_loss_mlp": 1.02670228, - "epoch": 0.30422365849992483, - "flos": 23660392112640.0, - "grad_norm": 2.424944175434462, - "language_loss": 0.73316336, - "learning_rate": 3.26167011603268e-06, - "loss": 0.7544204, - "num_input_tokens_seen": 108765505, - "step": 5060, - "time_per_iteration": 4.655209541320801 - }, - { - "auxiliary_loss_clip": 0.01140365, - "auxiliary_loss_mlp": 0.01038221, - "balance_loss_clip": 1.05072367, - "balance_loss_mlp": 1.02234221, - "epoch": 0.3042837817525928, - "flos": 22998773548800.0, - "grad_norm": 2.6284704346086, - "language_loss": 0.77279079, - "learning_rate": 3.2613679021891463e-06, - "loss": 0.79457664, - "num_input_tokens_seen": 108783370, - "step": 5061, - "time_per_iteration": 4.1857099533081055 - }, - { - "auxiliary_loss_clip": 0.01105214, - "auxiliary_loss_mlp": 0.01039505, - "balance_loss_clip": 1.05216312, - "balance_loss_mlp": 1.02225542, - "epoch": 0.30434390500526076, - "flos": 22082332924800.0, - "grad_norm": 1.9238999634605745, - "language_loss": 0.81891274, - "learning_rate": 3.261065640514415e-06, - "loss": 0.84035993, - "num_input_tokens_seen": 108797430, - "step": 5062, - "time_per_iteration": 2.7250373363494873 - }, - { - "auxiliary_loss_clip": 0.01132809, - "auxiliary_loss_mlp": 0.01036348, - "balance_loss_clip": 1.04662633, - "balance_loss_mlp": 1.02098203, - "epoch": 0.3044040282579287, - "flos": 25483504861440.0, - "grad_norm": 1.8479376829176948, - "language_loss": 0.74707627, - "learning_rate": 3.2607633310199483e-06, - "loss": 0.76876783, - "num_input_tokens_seen": 108816945, - "step": 5063, - "time_per_iteration": 2.6387155055999756 - }, - { - "auxiliary_loss_clip": 0.01126143, - "auxiliary_loss_mlp": 0.00775405, - "balance_loss_clip": 1.04923415, - "balance_loss_mlp": 1.00135541, - "epoch": 0.30446415151059675, - "flos": 21945478106880.0, - "grad_norm": 1.691336757602503, - "language_loss": 0.84400523, - "learning_rate": 3.26046097371721e-06, - "loss": 0.86302078, - "num_input_tokens_seen": 108836615, - "step": 5064, - "time_per_iteration": 2.645256519317627 - }, - { - "auxiliary_loss_clip": 0.01125608, - "auxiliary_loss_mlp": 0.01040172, - "balance_loss_clip": 1.04725182, - "balance_loss_mlp": 1.02311337, - "epoch": 0.3045242747632647, - "flos": 16435416816000.0, - "grad_norm": 2.198572989748056, - "language_loss": 0.76257896, - "learning_rate": 3.2601585686176655e-06, - "loss": 0.78423673, - "num_input_tokens_seen": 108855165, - "step": 5065, - "time_per_iteration": 4.119553565979004 - }, - { - "auxiliary_loss_clip": 0.01110206, - "auxiliary_loss_mlp": 0.01043438, - "balance_loss_clip": 1.04441273, - "balance_loss_mlp": 1.0260098, - "epoch": 0.3045843980159327, - "flos": 31540341957120.0, - "grad_norm": 1.985168773674731, - "language_loss": 0.62328786, - "learning_rate": 3.2598561157327814e-06, - "loss": 0.64482433, - "num_input_tokens_seen": 108874690, - "step": 5066, - "time_per_iteration": 4.380331516265869 - }, - { - "auxiliary_loss_clip": 0.01112307, - "auxiliary_loss_mlp": 0.0104907, - "balance_loss_clip": 1.04790235, - "balance_loss_mlp": 1.03186774, - "epoch": 0.30464452126860064, - "flos": 17853636481920.0, - "grad_norm": 2.188592288059769, - "language_loss": 0.83193344, - "learning_rate": 3.2595536150740265e-06, - "loss": 0.85354722, - "num_input_tokens_seen": 108893140, - "step": 5067, - "time_per_iteration": 2.628598213195801 - }, - { - "auxiliary_loss_clip": 0.01136833, - "auxiliary_loss_mlp": 0.01045137, - "balance_loss_clip": 1.04994464, - "balance_loss_mlp": 1.02904344, - "epoch": 0.3047046445212686, - "flos": 20631398947200.0, - "grad_norm": 4.883769852075586, - "language_loss": 0.62878895, - "learning_rate": 3.259251066652873e-06, - "loss": 0.65060866, - "num_input_tokens_seen": 108911880, - "step": 5068, - "time_per_iteration": 2.583193302154541 - }, - { - "auxiliary_loss_clip": 0.01127244, - "auxiliary_loss_mlp": 0.01039272, - "balance_loss_clip": 1.04866779, - "balance_loss_mlp": 1.02316117, - "epoch": 0.3047647677739366, - "flos": 21287594557440.0, - "grad_norm": 4.297243307498397, - "language_loss": 0.74780715, - "learning_rate": 3.258948470480793e-06, - "loss": 0.7694723, - "num_input_tokens_seen": 108930440, - "step": 5069, - "time_per_iteration": 2.643608570098877 - }, - { - "auxiliary_loss_clip": 0.01103787, - "auxiliary_loss_mlp": 0.01045252, - "balance_loss_clip": 1.04608154, - "balance_loss_mlp": 1.02922475, - "epoch": 0.30482489102660454, - "flos": 20995928121600.0, - "grad_norm": 1.9753352797934713, - "language_loss": 0.75726902, - "learning_rate": 3.258645826569261e-06, - "loss": 0.77875942, - "num_input_tokens_seen": 108949125, - "step": 5070, - "time_per_iteration": 2.715672016143799 - }, - { - "auxiliary_loss_clip": 0.01140483, - "auxiliary_loss_mlp": 0.0077507, - "balance_loss_clip": 1.04843533, - "balance_loss_mlp": 1.0012939, - "epoch": 0.3048850142792725, - "flos": 26290812988800.0, - "grad_norm": 1.7281078039111346, - "language_loss": 0.81636953, - "learning_rate": 3.2583431349297527e-06, - "loss": 0.83552504, - "num_input_tokens_seen": 108972190, - "step": 5071, - "time_per_iteration": 2.635542869567871 - }, - { - "auxiliary_loss_clip": 0.01108476, - "auxiliary_loss_mlp": 0.01045674, - "balance_loss_clip": 1.04286063, - "balance_loss_mlp": 1.02776885, - "epoch": 0.30494513753194047, - "flos": 22346241125760.0, - "grad_norm": 2.0085610287172173, - "language_loss": 0.76208484, - "learning_rate": 3.2580403955737467e-06, - "loss": 0.78362632, - "num_input_tokens_seen": 108990325, - "step": 5072, - "time_per_iteration": 2.6662180423736572 - }, - { - "auxiliary_loss_clip": 0.01099158, - "auxiliary_loss_mlp": 0.01044752, - "balance_loss_clip": 1.04694605, - "balance_loss_mlp": 1.02821743, - "epoch": 0.30500526078460843, - "flos": 19537667769600.0, - "grad_norm": 1.8424983506970039, - "language_loss": 0.70873296, - "learning_rate": 3.257737608512723e-06, - "loss": 0.7301721, - "num_input_tokens_seen": 109009505, - "step": 5073, - "time_per_iteration": 2.815281867980957 - }, - { - "auxiliary_loss_clip": 0.01133011, - "auxiliary_loss_mlp": 0.01055026, - "balance_loss_clip": 1.05032837, - "balance_loss_mlp": 1.03757334, - "epoch": 0.3050653840372764, - "flos": 14465321614080.0, - "grad_norm": 2.0666195830085434, - "language_loss": 0.76370406, - "learning_rate": 3.257434773758163e-06, - "loss": 0.78558439, - "num_input_tokens_seen": 109026350, - "step": 5074, - "time_per_iteration": 2.748568534851074 - }, - { - "auxiliary_loss_clip": 0.01115721, - "auxiliary_loss_mlp": 0.01037599, - "balance_loss_clip": 1.04921389, - "balance_loss_mlp": 1.02149391, - "epoch": 0.30512550728994436, - "flos": 24243796811520.0, - "grad_norm": 1.8649350467458667, - "language_loss": 0.74393201, - "learning_rate": 3.25713189132155e-06, - "loss": 0.76546526, - "num_input_tokens_seen": 109044165, - "step": 5075, - "time_per_iteration": 2.7015154361724854 - }, - { - "auxiliary_loss_clip": 0.01141745, - "auxiliary_loss_mlp": 0.01047345, - "balance_loss_clip": 1.0498178, - "balance_loss_mlp": 1.02825916, - "epoch": 0.30518563054261233, - "flos": 16360542915840.0, - "grad_norm": 2.030111139920667, - "language_loss": 0.75904357, - "learning_rate": 3.2568289612143703e-06, - "loss": 0.78093445, - "num_input_tokens_seen": 109060665, - "step": 5076, - "time_per_iteration": 2.5811965465545654 - }, - { - "auxiliary_loss_clip": 0.01116901, - "auxiliary_loss_mlp": 0.01040641, - "balance_loss_clip": 1.04864156, - "balance_loss_mlp": 1.02466679, - "epoch": 0.30524575379528035, - "flos": 21579584215680.0, - "grad_norm": 1.6479970241835653, - "language_loss": 0.79240596, - "learning_rate": 3.25652598344811e-06, - "loss": 0.81398141, - "num_input_tokens_seen": 109080035, - "step": 5077, - "time_per_iteration": 2.680205821990967 - }, - { - "auxiliary_loss_clip": 0.01087088, - "auxiliary_loss_mlp": 0.01033699, - "balance_loss_clip": 1.04356635, - "balance_loss_mlp": 1.01881564, - "epoch": 0.3053058770479483, - "flos": 16545231671040.0, - "grad_norm": 1.6765288024346336, - "language_loss": 0.74525034, - "learning_rate": 3.256222958034259e-06, - "loss": 0.76645821, - "num_input_tokens_seen": 109097385, - "step": 5078, - "time_per_iteration": 2.7247111797332764 - }, - { - "auxiliary_loss_clip": 0.01085086, - "auxiliary_loss_mlp": 0.01054049, - "balance_loss_clip": 1.04356313, - "balance_loss_mlp": 1.03728211, - "epoch": 0.3053660003006163, - "flos": 12312907954560.0, - "grad_norm": 1.7442741256404064, - "language_loss": 0.66648543, - "learning_rate": 3.255919884984307e-06, - "loss": 0.68787676, - "num_input_tokens_seen": 109115495, - "step": 5079, - "time_per_iteration": 2.746490716934204 - }, - { - "auxiliary_loss_clip": 0.01127155, - "auxiliary_loss_mlp": 0.01040504, - "balance_loss_clip": 1.04811811, - "balance_loss_mlp": 1.0248282, - "epoch": 0.30542612355328425, - "flos": 23112287504640.0, - "grad_norm": 2.3583709354228213, - "language_loss": 0.79841697, - "learning_rate": 3.2556167643097477e-06, - "loss": 0.82009357, - "num_input_tokens_seen": 109134235, - "step": 5080, - "time_per_iteration": 2.7156612873077393 - }, - { - "auxiliary_loss_clip": 0.01124116, - "auxiliary_loss_mlp": 0.00772863, - "balance_loss_clip": 1.04919219, - "balance_loss_mlp": 1.00125837, - "epoch": 0.3054862468059522, - "flos": 24389450461440.0, - "grad_norm": 2.2636550763480074, - "language_loss": 0.81280053, - "learning_rate": 3.255313596022074e-06, - "loss": 0.8317703, - "num_input_tokens_seen": 109152760, - "step": 5081, - "time_per_iteration": 2.6763248443603516 - }, - { - "auxiliary_loss_clip": 0.01120003, - "auxiliary_loss_mlp": 0.01044443, - "balance_loss_clip": 1.04644883, - "balance_loss_mlp": 1.02843297, - "epoch": 0.3055463700586202, - "flos": 29386096704000.0, - "grad_norm": 7.924214405919456, - "language_loss": 0.71839154, - "learning_rate": 3.255010380132783e-06, - "loss": 0.74003601, - "num_input_tokens_seen": 109173925, - "step": 5082, - "time_per_iteration": 2.7159903049468994 - }, - { - "auxiliary_loss_clip": 0.0112721, - "auxiliary_loss_mlp": 0.01043614, - "balance_loss_clip": 1.04611564, - "balance_loss_mlp": 1.02554226, - "epoch": 0.30560649331128814, - "flos": 25591775431680.0, - "grad_norm": 2.25447896755926, - "language_loss": 0.73108822, - "learning_rate": 3.2547071166533736e-06, - "loss": 0.75279647, - "num_input_tokens_seen": 109192510, - "step": 5083, - "time_per_iteration": 2.646739959716797 - }, - { - "auxiliary_loss_clip": 0.01107487, - "auxiliary_loss_mlp": 0.00775151, - "balance_loss_clip": 1.04263341, - "balance_loss_mlp": 1.00127327, - "epoch": 0.3056666165639561, - "flos": 19128321400320.0, - "grad_norm": 1.7470718607902291, - "language_loss": 0.71378291, - "learning_rate": 3.254403805595344e-06, - "loss": 0.73260927, - "num_input_tokens_seen": 109210885, - "step": 5084, - "time_per_iteration": 2.6846230030059814 - }, - { - "auxiliary_loss_clip": 0.01099017, - "auxiliary_loss_mlp": 0.01047221, - "balance_loss_clip": 1.04366112, - "balance_loss_mlp": 1.02929187, - "epoch": 0.30572673981662407, - "flos": 15523860441600.0, - "grad_norm": 1.8852357422602322, - "language_loss": 0.78966236, - "learning_rate": 3.2541004469701962e-06, - "loss": 0.81112474, - "num_input_tokens_seen": 109229180, - "step": 5085, - "time_per_iteration": 2.7193636894226074 - }, - { - "auxiliary_loss_clip": 0.01130512, - "auxiliary_loss_mlp": 0.01034677, - "balance_loss_clip": 1.04483652, - "balance_loss_mlp": 1.01910806, - "epoch": 0.30578686306929204, - "flos": 21506541909120.0, - "grad_norm": 1.9742516674355037, - "language_loss": 0.78476739, - "learning_rate": 3.2537970407894342e-06, - "loss": 0.80641937, - "num_input_tokens_seen": 109249510, - "step": 5086, - "time_per_iteration": 2.5860135555267334 - }, - { - "auxiliary_loss_clip": 0.01103374, - "auxiliary_loss_mlp": 0.01052848, - "balance_loss_clip": 1.04314184, - "balance_loss_mlp": 1.03509736, - "epoch": 0.30584698632196, - "flos": 20954271323520.0, - "grad_norm": 1.8682002339545791, - "language_loss": 0.76727784, - "learning_rate": 3.253493587064563e-06, - "loss": 0.78884006, - "num_input_tokens_seen": 109268200, - "step": 5087, - "time_per_iteration": 2.732639789581299 - }, - { - "auxiliary_loss_clip": 0.01125241, - "auxiliary_loss_mlp": 0.01041401, - "balance_loss_clip": 1.04509556, - "balance_loss_mlp": 1.02450943, - "epoch": 0.30590710957462797, - "flos": 24681116897280.0, - "grad_norm": 2.048016576932303, - "language_loss": 0.72534674, - "learning_rate": 3.2531900858070885e-06, - "loss": 0.74701315, - "num_input_tokens_seen": 109288370, - "step": 5088, - "time_per_iteration": 2.66654109954834 - }, - { - "auxiliary_loss_clip": 0.01128516, - "auxiliary_loss_mlp": 0.01043444, - "balance_loss_clip": 1.04584277, - "balance_loss_mlp": 1.02587295, - "epoch": 0.30596723282729593, - "flos": 17086907744640.0, - "grad_norm": 2.359735204382993, - "language_loss": 0.79327172, - "learning_rate": 3.252886537028521e-06, - "loss": 0.8149913, - "num_input_tokens_seen": 109306730, - "step": 5089, - "time_per_iteration": 2.613231897354126 - }, - { - "auxiliary_loss_clip": 0.01110444, - "auxiliary_loss_mlp": 0.01041514, - "balance_loss_clip": 1.04634953, - "balance_loss_mlp": 1.02470577, - "epoch": 0.30602735607996395, - "flos": 22857106308480.0, - "grad_norm": 1.8271327477144206, - "language_loss": 0.77158219, - "learning_rate": 3.2525829407403703e-06, - "loss": 0.79310179, - "num_input_tokens_seen": 109327360, - "step": 5090, - "time_per_iteration": 2.7469358444213867 - }, - { - "auxiliary_loss_clip": 0.01116264, - "auxiliary_loss_mlp": 0.01050158, - "balance_loss_clip": 1.04506445, - "balance_loss_mlp": 1.03317034, - "epoch": 0.3060874793326319, - "flos": 29861482227840.0, - "grad_norm": 1.7853121536190235, - "language_loss": 0.76108491, - "learning_rate": 3.2522792969541488e-06, - "loss": 0.78274912, - "num_input_tokens_seen": 109348135, - "step": 5091, - "time_per_iteration": 2.7344727516174316 - }, - { - "auxiliary_loss_clip": 0.01076722, - "auxiliary_loss_mlp": 0.01049007, - "balance_loss_clip": 1.04582906, - "balance_loss_mlp": 1.02905178, - "epoch": 0.3061476025852999, - "flos": 20448577699200.0, - "grad_norm": 1.9985396703734173, - "language_loss": 0.71938324, - "learning_rate": 3.2519756056813705e-06, - "loss": 0.74064058, - "num_input_tokens_seen": 109366220, - "step": 5092, - "time_per_iteration": 2.767212390899658 - }, - { - "auxiliary_loss_clip": 0.01114871, - "auxiliary_loss_mlp": 0.01040516, - "balance_loss_clip": 1.04740167, - "balance_loss_mlp": 1.0246855, - "epoch": 0.30620772583796785, - "flos": 19391475415680.0, - "grad_norm": 3.231748461445431, - "language_loss": 0.82655406, - "learning_rate": 3.2516718669335522e-06, - "loss": 0.84810787, - "num_input_tokens_seen": 109385260, - "step": 5093, - "time_per_iteration": 2.705643892288208 - }, - { - "auxiliary_loss_clip": 0.01136927, - "auxiliary_loss_mlp": 0.00773786, - "balance_loss_clip": 1.04842925, - "balance_loss_mlp": 1.00142932, - "epoch": 0.3062678490906358, - "flos": 24024562151040.0, - "grad_norm": 1.6185046249293755, - "language_loss": 0.75340986, - "learning_rate": 3.2513680807222114e-06, - "loss": 0.77251703, - "num_input_tokens_seen": 109405025, - "step": 5094, - "time_per_iteration": 2.6171963214874268 - }, - { - "auxiliary_loss_clip": 0.01112613, - "auxiliary_loss_mlp": 0.01042135, - "balance_loss_clip": 1.04798305, - "balance_loss_mlp": 1.02639914, - "epoch": 0.3063279723433038, - "flos": 19754639873280.0, - "grad_norm": 2.1053112950674824, - "language_loss": 0.75988996, - "learning_rate": 3.251064247058868e-06, - "loss": 0.7814374, - "num_input_tokens_seen": 109422465, - "step": 5095, - "time_per_iteration": 2.7002673149108887 - }, - { - "auxiliary_loss_clip": 0.0112272, - "auxiliary_loss_mlp": 0.01043966, - "balance_loss_clip": 1.04654729, - "balance_loss_mlp": 1.0278492, - "epoch": 0.30638809559597174, - "flos": 22450022496000.0, - "grad_norm": 8.237851994820396, - "language_loss": 0.80608332, - "learning_rate": 3.250760365955042e-06, - "loss": 0.82775021, - "num_input_tokens_seen": 109440575, - "step": 5096, - "time_per_iteration": 2.675551414489746 - }, - { - "auxiliary_loss_clip": 0.01125431, - "auxiliary_loss_mlp": 0.01036388, - "balance_loss_clip": 1.04639602, - "balance_loss_mlp": 1.02030659, - "epoch": 0.3064482188486397, - "flos": 17165157523200.0, - "grad_norm": 3.1166257890970566, - "language_loss": 0.81695235, - "learning_rate": 3.250456437422258e-06, - "loss": 0.83857059, - "num_input_tokens_seen": 109459050, - "step": 5097, - "time_per_iteration": 2.6616358757019043 - }, - { - "auxiliary_loss_clip": 0.01138165, - "auxiliary_loss_mlp": 0.01042971, - "balance_loss_clip": 1.04782009, - "balance_loss_mlp": 1.02522099, - "epoch": 0.3065083421013077, - "flos": 23768483114880.0, - "grad_norm": 2.1722798378639663, - "language_loss": 0.78152639, - "learning_rate": 3.250152461472041e-06, - "loss": 0.80333775, - "num_input_tokens_seen": 109475860, - "step": 5098, - "time_per_iteration": 2.581339120864868 - }, - { - "auxiliary_loss_clip": 0.01093696, - "auxiliary_loss_mlp": 0.01039814, - "balance_loss_clip": 1.04763365, - "balance_loss_mlp": 1.02302897, - "epoch": 0.30656846535397564, - "flos": 26431833784320.0, - "grad_norm": 1.8342329708039284, - "language_loss": 0.84488571, - "learning_rate": 3.249848438115917e-06, - "loss": 0.86622083, - "num_input_tokens_seen": 109494760, - "step": 5099, - "time_per_iteration": 2.761580467224121 - }, - { - "auxiliary_loss_clip": 0.0113763, - "auxiliary_loss_mlp": 0.01044142, - "balance_loss_clip": 1.04598331, - "balance_loss_mlp": 1.02683902, - "epoch": 0.3066285886066436, - "flos": 26651786716800.0, - "grad_norm": 1.7645297710058767, - "language_loss": 0.85650218, - "learning_rate": 3.2495443673654148e-06, - "loss": 0.87831986, - "num_input_tokens_seen": 109516480, - "step": 5100, - "time_per_iteration": 4.130753517150879 - }, - { - "auxiliary_loss_clip": 0.01099546, - "auxiliary_loss_mlp": 0.01040494, - "balance_loss_clip": 1.04097986, - "balance_loss_mlp": 1.02268374, - "epoch": 0.30668871185931157, - "flos": 15049947375360.0, - "grad_norm": 1.8121599631247622, - "language_loss": 0.78980827, - "learning_rate": 3.249240249232065e-06, - "loss": 0.81120867, - "num_input_tokens_seen": 109534615, - "step": 5101, - "time_per_iteration": 4.324965000152588 - }, - { - "auxiliary_loss_clip": 0.01102347, - "auxiliary_loss_mlp": 0.01054476, - "balance_loss_clip": 1.04654586, - "balance_loss_mlp": 1.03549778, - "epoch": 0.30674883511197953, - "flos": 20082109190400.0, - "grad_norm": 3.103169454759946, - "language_loss": 0.8002606, - "learning_rate": 3.2489360837273998e-06, - "loss": 0.82182884, - "num_input_tokens_seen": 109554040, - "step": 5102, - "time_per_iteration": 2.6799395084381104 - }, - { - "auxiliary_loss_clip": 0.01142197, - "auxiliary_loss_mlp": 0.01041215, - "balance_loss_clip": 1.05097044, - "balance_loss_mlp": 1.02254653, - "epoch": 0.30680895836464755, - "flos": 22893807029760.0, - "grad_norm": 2.1213785434731416, - "language_loss": 0.88774347, - "learning_rate": 3.2486318708629532e-06, - "loss": 0.90957761, - "num_input_tokens_seen": 109574345, - "step": 5103, - "time_per_iteration": 2.65173077583313 - }, - { - "auxiliary_loss_clip": 0.01117159, - "auxiliary_loss_mlp": 0.01047865, - "balance_loss_clip": 1.04379106, - "balance_loss_mlp": 1.03051972, - "epoch": 0.3068690816173155, - "flos": 23696159080320.0, - "grad_norm": 1.7904968866721789, - "language_loss": 0.73977435, - "learning_rate": 3.2483276106502607e-06, - "loss": 0.7614246, - "num_input_tokens_seen": 109593670, - "step": 5104, - "time_per_iteration": 4.15887975692749 - }, - { - "auxiliary_loss_clip": 0.01124364, - "auxiliary_loss_mlp": 0.00776702, - "balance_loss_clip": 1.04378068, - "balance_loss_mlp": 1.00128829, - "epoch": 0.3069292048699835, - "flos": 23551044134400.0, - "grad_norm": 3.7241561762804496, - "language_loss": 0.72777617, - "learning_rate": 3.2480233031008605e-06, - "loss": 0.74678683, - "num_input_tokens_seen": 109613385, - "step": 5105, - "time_per_iteration": 2.657212972640991 - }, - { - "auxiliary_loss_clip": 0.01112354, - "auxiliary_loss_mlp": 0.01041782, - "balance_loss_clip": 1.0451684, - "balance_loss_mlp": 1.02401972, - "epoch": 0.30698932812265145, - "flos": 24531656405760.0, - "grad_norm": 1.9297281358185925, - "language_loss": 0.87290782, - "learning_rate": 3.2477189482262916e-06, - "loss": 0.89444917, - "num_input_tokens_seen": 109632395, - "step": 5106, - "time_per_iteration": 4.409428119659424 - }, - { - "auxiliary_loss_clip": 0.0110831, - "auxiliary_loss_mlp": 0.01052851, - "balance_loss_clip": 1.04540682, - "balance_loss_mlp": 1.03390849, - "epoch": 0.3070494513753194, - "flos": 20996430912000.0, - "grad_norm": 2.254355123120303, - "language_loss": 0.71420276, - "learning_rate": 3.2474145460380945e-06, - "loss": 0.73581433, - "num_input_tokens_seen": 109651380, - "step": 5107, - "time_per_iteration": 2.7320871353149414 - }, - { - "auxiliary_loss_clip": 0.01101295, - "auxiliary_loss_mlp": 0.0104767, - "balance_loss_clip": 1.04618347, - "balance_loss_mlp": 1.03034878, - "epoch": 0.3071095746279874, - "flos": 19025940660480.0, - "grad_norm": 2.1230574515432705, - "language_loss": 0.72282934, - "learning_rate": 3.247110096547814e-06, - "loss": 0.74431896, - "num_input_tokens_seen": 109670240, - "step": 5108, - "time_per_iteration": 2.720196485519409 - }, - { - "auxiliary_loss_clip": 0.01112658, - "auxiliary_loss_mlp": 0.01040837, - "balance_loss_clip": 1.04619241, - "balance_loss_mlp": 1.02325416, - "epoch": 0.30716969788065535, - "flos": 21215521918080.0, - "grad_norm": 3.0053852764205695, - "language_loss": 0.8601433, - "learning_rate": 3.2468055997669926e-06, - "loss": 0.88167822, - "num_input_tokens_seen": 109690810, - "step": 5109, - "time_per_iteration": 2.715580940246582 - }, - { - "auxiliary_loss_clip": 0.01109383, - "auxiliary_loss_mlp": 0.01036759, - "balance_loss_clip": 1.04432368, - "balance_loss_mlp": 1.02017736, - "epoch": 0.3072298211333233, - "flos": 25772765086080.0, - "grad_norm": 1.7463183423202828, - "language_loss": 0.67169911, - "learning_rate": 3.2465010557071788e-06, - "loss": 0.69316053, - "num_input_tokens_seen": 109711145, - "step": 5110, - "time_per_iteration": 2.7133336067199707 - }, - { - "auxiliary_loss_clip": 0.01126653, - "auxiliary_loss_mlp": 0.01033414, - "balance_loss_clip": 1.04854119, - "balance_loss_mlp": 1.01736796, - "epoch": 0.3072899443859913, - "flos": 25848931875840.0, - "grad_norm": 1.4548971516988844, - "language_loss": 0.76673061, - "learning_rate": 3.246196464379919e-06, - "loss": 0.78833127, - "num_input_tokens_seen": 109731425, - "step": 5111, - "time_per_iteration": 2.692505121231079 - }, - { - "auxiliary_loss_clip": 0.01140411, - "auxiliary_loss_mlp": 0.0103997, - "balance_loss_clip": 1.04979658, - "balance_loss_mlp": 1.02360249, - "epoch": 0.30735006763865924, - "flos": 25922800195200.0, - "grad_norm": 3.7694679470365244, - "language_loss": 0.67143333, - "learning_rate": 3.245891825796765e-06, - "loss": 0.69323719, - "num_input_tokens_seen": 109752720, - "step": 5112, - "time_per_iteration": 2.6441125869750977 - }, - { - "auxiliary_loss_clip": 0.01133822, - "auxiliary_loss_mlp": 0.01044497, - "balance_loss_clip": 1.05147326, - "balance_loss_mlp": 1.02482784, - "epoch": 0.3074101908913272, - "flos": 30917004312960.0, - "grad_norm": 2.062737517485213, - "language_loss": 0.79524493, - "learning_rate": 3.2455871399692678e-06, - "loss": 0.81702805, - "num_input_tokens_seen": 109772840, - "step": 5113, - "time_per_iteration": 2.7166647911071777 - }, - { - "auxiliary_loss_clip": 0.01102438, - "auxiliary_loss_mlp": 0.00774651, - "balance_loss_clip": 1.04638815, - "balance_loss_mlp": 1.00138378, - "epoch": 0.30747031414399517, - "flos": 18401058731520.0, - "grad_norm": 2.08885217843665, - "language_loss": 0.76926446, - "learning_rate": 3.2452824069089815e-06, - "loss": 0.78803539, - "num_input_tokens_seen": 109790150, - "step": 5114, - "time_per_iteration": 2.6842217445373535 - }, - { - "auxiliary_loss_clip": 0.01100955, - "auxiliary_loss_mlp": 0.01034415, - "balance_loss_clip": 1.0446732, - "balance_loss_mlp": 1.01589036, - "epoch": 0.30753043739666314, - "flos": 22633166966400.0, - "grad_norm": 2.179333764681939, - "language_loss": 0.62607706, - "learning_rate": 3.2449776266274623e-06, - "loss": 0.64743078, - "num_input_tokens_seen": 109807985, - "step": 5115, - "time_per_iteration": 2.7709848880767822 - }, - { - "auxiliary_loss_clip": 0.0113067, - "auxiliary_loss_mlp": 0.01041883, - "balance_loss_clip": 1.04829907, - "balance_loss_mlp": 1.02557516, - "epoch": 0.3075905606493311, - "flos": 27344072517120.0, - "grad_norm": 2.4707888757665684, - "language_loss": 0.82835108, - "learning_rate": 3.2446727991362657e-06, - "loss": 0.85007656, - "num_input_tokens_seen": 109825920, - "step": 5116, - "time_per_iteration": 2.6891255378723145 - }, - { - "auxiliary_loss_clip": 0.01115169, - "auxiliary_loss_mlp": 0.01050095, - "balance_loss_clip": 1.04928303, - "balance_loss_mlp": 1.03291702, - "epoch": 0.3076506839019991, - "flos": 22090808534400.0, - "grad_norm": 1.792550086960714, - "language_loss": 0.75943851, - "learning_rate": 3.244367924446952e-06, - "loss": 0.78109109, - "num_input_tokens_seen": 109846220, - "step": 5117, - "time_per_iteration": 2.6685919761657715 - }, - { - "auxiliary_loss_clip": 0.01096356, - "auxiliary_loss_mlp": 0.010422, - "balance_loss_clip": 1.04583359, - "balance_loss_mlp": 1.02309084, - "epoch": 0.3077108071546671, - "flos": 21289533891840.0, - "grad_norm": 2.509228810910763, - "language_loss": 0.71450555, - "learning_rate": 3.2440630025710826e-06, - "loss": 0.7358911, - "num_input_tokens_seen": 109863870, - "step": 5118, - "time_per_iteration": 2.7360472679138184 - }, - { - "auxiliary_loss_clip": 0.0109679, - "auxiliary_loss_mlp": 0.01040047, - "balance_loss_clip": 1.05069757, - "balance_loss_mlp": 1.02279758, - "epoch": 0.30777093040733505, - "flos": 21430985650560.0, - "grad_norm": 1.6950758291291428, - "language_loss": 0.74499059, - "learning_rate": 3.243758033520219e-06, - "loss": 0.76635897, - "num_input_tokens_seen": 109883500, - "step": 5119, - "time_per_iteration": 2.7963552474975586 - }, - { - "auxiliary_loss_clip": 0.01133391, - "auxiliary_loss_mlp": 0.01054336, - "balance_loss_clip": 1.05088997, - "balance_loss_mlp": 1.03520322, - "epoch": 0.307831053660003, - "flos": 23149275534720.0, - "grad_norm": 2.3083726349779785, - "language_loss": 0.79968077, - "learning_rate": 3.243453017305926e-06, - "loss": 0.821558, - "num_input_tokens_seen": 109904620, - "step": 5120, - "time_per_iteration": 2.7600536346435547 - }, - { - "auxiliary_loss_clip": 0.01127117, - "auxiliary_loss_mlp": 0.01045491, - "balance_loss_clip": 1.04772663, - "balance_loss_mlp": 1.02994657, - "epoch": 0.307891176912671, - "flos": 17019755268480.0, - "grad_norm": 1.7119475154385397, - "language_loss": 0.79864663, - "learning_rate": 3.24314795393977e-06, - "loss": 0.8203727, - "num_input_tokens_seen": 109922275, - "step": 5121, - "time_per_iteration": 2.6204211711883545 - }, - { - "auxiliary_loss_clip": 0.01105091, - "auxiliary_loss_mlp": 0.01039616, - "balance_loss_clip": 1.04669154, - "balance_loss_mlp": 1.02292657, - "epoch": 0.30795130016533895, - "flos": 27705046245120.0, - "grad_norm": 1.4682711249191758, - "language_loss": 0.82526803, - "learning_rate": 3.242842843433319e-06, - "loss": 0.84671509, - "num_input_tokens_seen": 109944265, - "step": 5122, - "time_per_iteration": 2.7210805416107178 - }, - { - "auxiliary_loss_clip": 0.01052784, - "auxiliary_loss_mlp": 0.01010188, - "balance_loss_clip": 1.03048515, - "balance_loss_mlp": 1.00826919, - "epoch": 0.3080114234180069, - "flos": 69058699591680.0, - "grad_norm": 0.7449761063336078, - "language_loss": 0.58609217, - "learning_rate": 3.242537685798143e-06, - "loss": 0.60672188, - "num_input_tokens_seen": 110014160, - "step": 5123, - "time_per_iteration": 3.303093433380127 - }, - { - "auxiliary_loss_clip": 0.01133855, - "auxiliary_loss_mlp": 0.00776294, - "balance_loss_clip": 1.04937184, - "balance_loss_mlp": 1.00136161, - "epoch": 0.3080715466706749, - "flos": 24060221377920.0, - "grad_norm": 1.5927838238117058, - "language_loss": 0.83550704, - "learning_rate": 3.242232481045813e-06, - "loss": 0.85460854, - "num_input_tokens_seen": 110034865, - "step": 5124, - "time_per_iteration": 2.7226438522338867 - }, - { - "auxiliary_loss_clip": 0.01143185, - "auxiliary_loss_mlp": 0.01038734, - "balance_loss_clip": 1.05123234, - "balance_loss_mlp": 1.02206898, - "epoch": 0.30813166992334284, - "flos": 25848680480640.0, - "grad_norm": 2.0767599752543657, - "language_loss": 0.79332423, - "learning_rate": 3.2419272291879035e-06, - "loss": 0.81514347, - "num_input_tokens_seen": 110052930, - "step": 5125, - "time_per_iteration": 2.6514153480529785 - }, - { - "auxiliary_loss_clip": 0.01125892, - "auxiliary_loss_mlp": 0.01035278, - "balance_loss_clip": 1.04636812, - "balance_loss_mlp": 1.01694369, - "epoch": 0.3081917931760108, - "flos": 20449619193600.0, - "grad_norm": 1.764828299724452, - "language_loss": 0.64689863, - "learning_rate": 3.241621930235989e-06, - "loss": 0.66851032, - "num_input_tokens_seen": 110071765, - "step": 5126, - "time_per_iteration": 2.6408963203430176 - }, - { - "auxiliary_loss_clip": 0.01099238, - "auxiliary_loss_mlp": 0.01044536, - "balance_loss_clip": 1.05009556, - "balance_loss_mlp": 1.02698874, - "epoch": 0.3082519164286788, - "flos": 22166257052160.0, - "grad_norm": 1.5302214532460006, - "language_loss": 0.86800975, - "learning_rate": 3.241316584201646e-06, - "loss": 0.88944745, - "num_input_tokens_seen": 110092660, - "step": 5127, - "time_per_iteration": 2.793318748474121 - }, - { - "auxiliary_loss_clip": 0.01086461, - "auxiliary_loss_mlp": 0.01045743, - "balance_loss_clip": 1.04368591, - "balance_loss_mlp": 1.02862501, - "epoch": 0.30831203968134674, - "flos": 28913404700160.0, - "grad_norm": 1.6968110238499217, - "language_loss": 0.69155616, - "learning_rate": 3.2410111910964538e-06, - "loss": 0.71287817, - "num_input_tokens_seen": 110114960, - "step": 5128, - "time_per_iteration": 2.777060031890869 - }, - { - "auxiliary_loss_clip": 0.01130807, - "auxiliary_loss_mlp": 0.00775186, - "balance_loss_clip": 1.05044532, - "balance_loss_mlp": 1.00153518, - "epoch": 0.3083721629340147, - "flos": 25667726739840.0, - "grad_norm": 1.7900045405252538, - "language_loss": 0.71075535, - "learning_rate": 3.240705750931993e-06, - "loss": 0.7298153, - "num_input_tokens_seen": 110135750, - "step": 5129, - "time_per_iteration": 2.7317588329315186 - }, - { - "auxiliary_loss_clip": 0.01030892, - "auxiliary_loss_mlp": 0.01007708, - "balance_loss_clip": 1.0286324, - "balance_loss_mlp": 1.00588405, - "epoch": 0.3084322861866827, - "flos": 68212679581440.0, - "grad_norm": 0.8221299931057983, - "language_loss": 0.59160221, - "learning_rate": 3.240400263719846e-06, - "loss": 0.61198819, - "num_input_tokens_seen": 110189480, - "step": 5130, - "time_per_iteration": 3.2141849994659424 - }, - { - "auxiliary_loss_clip": 0.01115906, - "auxiliary_loss_mlp": 0.01041214, - "balance_loss_clip": 1.04513061, - "balance_loss_mlp": 1.02297497, - "epoch": 0.3084924094393507, - "flos": 20296495514880.0, - "grad_norm": 2.986922621878904, - "language_loss": 0.73292506, - "learning_rate": 3.2400947294715957e-06, - "loss": 0.75449622, - "num_input_tokens_seen": 110206445, - "step": 5131, - "time_per_iteration": 2.6520204544067383 - }, - { - "auxiliary_loss_clip": 0.01099541, - "auxiliary_loss_mlp": 0.010345, - "balance_loss_clip": 1.04438055, - "balance_loss_mlp": 1.01822817, - "epoch": 0.30855253269201866, - "flos": 23949831905280.0, - "grad_norm": 1.569237882810685, - "language_loss": 0.71420097, - "learning_rate": 3.2397891481988303e-06, - "loss": 0.73554134, - "num_input_tokens_seen": 110226845, - "step": 5132, - "time_per_iteration": 2.8439948558807373 - }, - { - "auxiliary_loss_clip": 0.01134935, - "auxiliary_loss_mlp": 0.00774998, - "balance_loss_clip": 1.04922795, - "balance_loss_mlp": 1.00131333, - "epoch": 0.3086126559446866, - "flos": 19281876042240.0, - "grad_norm": 1.9070570981004293, - "language_loss": 0.89846021, - "learning_rate": 3.239483519913136e-06, - "loss": 0.91755956, - "num_input_tokens_seen": 110244095, - "step": 5133, - "time_per_iteration": 2.5872273445129395 - }, - { - "auxiliary_loss_clip": 0.01122429, - "auxiliary_loss_mlp": 0.01043613, - "balance_loss_clip": 1.04856205, - "balance_loss_mlp": 1.02580321, - "epoch": 0.3086727791973546, - "flos": 33760770019200.0, - "grad_norm": 1.7209646054950307, - "language_loss": 0.67267555, - "learning_rate": 3.239177844626102e-06, - "loss": 0.69433594, - "num_input_tokens_seen": 110264240, - "step": 5134, - "time_per_iteration": 2.7872183322906494 - }, - { - "auxiliary_loss_clip": 0.01124541, - "auxiliary_loss_mlp": 0.01041364, - "balance_loss_clip": 1.04777277, - "balance_loss_mlp": 1.02393556, - "epoch": 0.30873290245002255, - "flos": 16034151006720.0, - "grad_norm": 1.9145067593542924, - "language_loss": 0.82794344, - "learning_rate": 3.2388721223493197e-06, - "loss": 0.84960246, - "num_input_tokens_seen": 110282450, - "step": 5135, - "time_per_iteration": 2.6355140209198 - }, - { - "auxiliary_loss_clip": 0.01026512, - "auxiliary_loss_mlp": 0.01003035, - "balance_loss_clip": 1.02417064, - "balance_loss_mlp": 1.00113988, - "epoch": 0.3087930257026905, - "flos": 65048304055680.0, - "grad_norm": 0.6923211570832432, - "language_loss": 0.55314827, - "learning_rate": 3.2385663530943824e-06, - "loss": 0.57344365, - "num_input_tokens_seen": 110343715, - "step": 5136, - "time_per_iteration": 3.31300687789917 - }, - { - "auxiliary_loss_clip": 0.01118007, - "auxiliary_loss_mlp": 0.00775624, - "balance_loss_clip": 1.04826593, - "balance_loss_mlp": 1.00124264, - "epoch": 0.3088531489553585, - "flos": 74738829824640.0, - "grad_norm": 2.038560176689262, - "language_loss": 0.76524079, - "learning_rate": 3.2382605368728852e-06, - "loss": 0.78417706, - "num_input_tokens_seen": 110368430, - "step": 5137, - "time_per_iteration": 3.1237831115722656 - }, - { - "auxiliary_loss_clip": 0.01102933, - "auxiliary_loss_mlp": 0.010362, - "balance_loss_clip": 1.04592168, - "balance_loss_mlp": 1.02058411, - "epoch": 0.30891327220802645, - "flos": 21142300043520.0, - "grad_norm": 1.655645044155811, - "language_loss": 0.80083114, - "learning_rate": 3.237954673696424e-06, - "loss": 0.82222247, - "num_input_tokens_seen": 110386735, - "step": 5138, - "time_per_iteration": 2.775902509689331 - }, - { - "auxiliary_loss_clip": 0.01078807, - "auxiliary_loss_mlp": 0.0104514, - "balance_loss_clip": 1.03953338, - "balance_loss_mlp": 1.02583957, - "epoch": 0.3089733954606944, - "flos": 25664494515840.0, - "grad_norm": 1.3823165076112356, - "language_loss": 0.81288958, - "learning_rate": 3.2376487635765983e-06, - "loss": 0.8341291, - "num_input_tokens_seen": 110406820, - "step": 5139, - "time_per_iteration": 4.48141074180603 - }, - { - "auxiliary_loss_clip": 0.01127056, - "auxiliary_loss_mlp": 0.01044845, - "balance_loss_clip": 1.04565382, - "balance_loss_mlp": 1.02575994, - "epoch": 0.3090335187133624, - "flos": 19427350124160.0, - "grad_norm": 2.1511159973406593, - "language_loss": 0.77260494, - "learning_rate": 3.2373428065250067e-06, - "loss": 0.79432398, - "num_input_tokens_seen": 110424225, - "step": 5140, - "time_per_iteration": 4.1141037940979 - }, - { - "auxiliary_loss_clip": 0.01099157, - "auxiliary_loss_mlp": 0.01048812, - "balance_loss_clip": 1.04282403, - "balance_loss_mlp": 1.03233695, - "epoch": 0.30909364196603034, - "flos": 20011329440640.0, - "grad_norm": 1.77105935640331, - "language_loss": 0.78806967, - "learning_rate": 3.237036802553252e-06, - "loss": 0.80954939, - "num_input_tokens_seen": 110443310, - "step": 5141, - "time_per_iteration": 2.6497676372528076 - }, - { - "auxiliary_loss_clip": 0.01119702, - "auxiliary_loss_mlp": 0.0104967, - "balance_loss_clip": 1.04679799, - "balance_loss_mlp": 1.03138292, - "epoch": 0.3091537652186983, - "flos": 19677575243520.0, - "grad_norm": 2.261971688212118, - "language_loss": 0.86853915, - "learning_rate": 3.2367307516729377e-06, - "loss": 0.89023286, - "num_input_tokens_seen": 110460215, - "step": 5142, - "time_per_iteration": 2.635495662689209 - }, - { - "auxiliary_loss_clip": 0.01127738, - "auxiliary_loss_mlp": 0.01048033, - "balance_loss_clip": 1.04709148, - "balance_loss_mlp": 1.03136778, - "epoch": 0.3092138884713663, - "flos": 17020042577280.0, - "grad_norm": 1.7222677689082588, - "language_loss": 0.79352587, - "learning_rate": 3.23642465389567e-06, - "loss": 0.81528366, - "num_input_tokens_seen": 110479385, - "step": 5143, - "time_per_iteration": 2.672196388244629 - }, - { - "auxiliary_loss_clip": 0.01108121, - "auxiliary_loss_mlp": 0.01046466, - "balance_loss_clip": 1.04830873, - "balance_loss_mlp": 1.02858496, - "epoch": 0.3092740117240343, - "flos": 25009986844800.0, - "grad_norm": 1.849759687088619, - "language_loss": 0.72079581, - "learning_rate": 3.236118509233055e-06, - "loss": 0.7423417, - "num_input_tokens_seen": 110499885, - "step": 5144, - "time_per_iteration": 4.2138121128082275 - }, - { - "auxiliary_loss_clip": 0.01130266, - "auxiliary_loss_mlp": 0.0105055, - "balance_loss_clip": 1.04617548, - "balance_loss_mlp": 1.03297877, - "epoch": 0.30933413497670226, - "flos": 25590410714880.0, - "grad_norm": 1.9804845877808144, - "language_loss": 0.74328083, - "learning_rate": 3.235812317696702e-06, - "loss": 0.76508898, - "num_input_tokens_seen": 110519690, - "step": 5145, - "time_per_iteration": 4.315273761749268 - }, - { - "auxiliary_loss_clip": 0.01110927, - "auxiliary_loss_mlp": 0.01045527, - "balance_loss_clip": 1.04372048, - "balance_loss_mlp": 1.02788365, - "epoch": 0.3093942582293702, - "flos": 24389665943040.0, - "grad_norm": 1.6657569174801012, - "language_loss": 0.76391518, - "learning_rate": 3.2355060792982224e-06, - "loss": 0.78547978, - "num_input_tokens_seen": 110540520, - "step": 5146, - "time_per_iteration": 2.7259135246276855 - }, - { - "auxiliary_loss_clip": 0.0111122, - "auxiliary_loss_mlp": 0.01042459, - "balance_loss_clip": 1.04380584, - "balance_loss_mlp": 1.02553141, - "epoch": 0.3094543814820382, - "flos": 19646441130240.0, - "grad_norm": 2.148705061921787, - "language_loss": 0.66899967, - "learning_rate": 3.2351997940492286e-06, - "loss": 0.6905365, - "num_input_tokens_seen": 110557950, - "step": 5147, - "time_per_iteration": 2.6804444789886475 - }, - { - "auxiliary_loss_clip": 0.01132642, - "auxiliary_loss_mlp": 0.0104049, - "balance_loss_clip": 1.04998684, - "balance_loss_mlp": 1.0238843, - "epoch": 0.30951450473470615, - "flos": 25663812157440.0, - "grad_norm": 2.0634223914225585, - "language_loss": 0.74823105, - "learning_rate": 3.2348934619613346e-06, - "loss": 0.76996237, - "num_input_tokens_seen": 110578215, - "step": 5148, - "time_per_iteration": 2.637509346008301 - }, - { - "auxiliary_loss_clip": 0.0113505, - "auxiliary_loss_mlp": 0.01047495, - "balance_loss_clip": 1.0492146, - "balance_loss_mlp": 1.02901721, - "epoch": 0.3095746279873741, - "flos": 12020415505920.0, - "grad_norm": 2.1367843023537287, - "language_loss": 0.73082036, - "learning_rate": 3.2345870830461567e-06, - "loss": 0.75264585, - "num_input_tokens_seen": 110592990, - "step": 5149, - "time_per_iteration": 2.6134157180786133 - }, - { - "auxiliary_loss_clip": 0.01097892, - "auxiliary_loss_mlp": 0.0104428, - "balance_loss_clip": 1.04601955, - "balance_loss_mlp": 1.02615988, - "epoch": 0.3096347512400421, - "flos": 23623044946560.0, - "grad_norm": 2.0797901111423274, - "language_loss": 0.845025, - "learning_rate": 3.2342806573153132e-06, - "loss": 0.86644673, - "num_input_tokens_seen": 110612130, - "step": 5150, - "time_per_iteration": 2.7804181575775146 - }, - { - "auxiliary_loss_clip": 0.01086512, - "auxiliary_loss_mlp": 0.01047133, - "balance_loss_clip": 1.04168093, - "balance_loss_mlp": 1.02820301, - "epoch": 0.30969487449271005, - "flos": 22529313768960.0, - "grad_norm": 1.8768941622145223, - "language_loss": 0.78431082, - "learning_rate": 3.233974184780424e-06, - "loss": 0.80564725, - "num_input_tokens_seen": 110632045, - "step": 5151, - "time_per_iteration": 2.7539470195770264 - }, - { - "auxiliary_loss_clip": 0.01131879, - "auxiliary_loss_mlp": 0.01041443, - "balance_loss_clip": 1.04880977, - "balance_loss_mlp": 1.02362132, - "epoch": 0.309754997745378, - "flos": 15267925059840.0, - "grad_norm": 1.9606136965084777, - "language_loss": 0.67416716, - "learning_rate": 3.2336676654531084e-06, - "loss": 0.69590038, - "num_input_tokens_seen": 110649340, - "step": 5152, - "time_per_iteration": 2.579238176345825 - }, - { - "auxiliary_loss_clip": 0.01080518, - "auxiliary_loss_mlp": 0.01045921, - "balance_loss_clip": 1.04402971, - "balance_loss_mlp": 1.02807546, - "epoch": 0.309815120998046, - "flos": 26979291947520.0, - "grad_norm": 5.6670540450328355, - "language_loss": 0.8251189, - "learning_rate": 3.2333610993449926e-06, - "loss": 0.84638333, - "num_input_tokens_seen": 110668450, - "step": 5153, - "time_per_iteration": 2.792285203933716 - }, - { - "auxiliary_loss_clip": 0.01113849, - "auxiliary_loss_mlp": 0.00775793, - "balance_loss_clip": 1.04663801, - "balance_loss_mlp": 1.00127769, - "epoch": 0.30987524425071394, - "flos": 21143161969920.0, - "grad_norm": 1.937189485762574, - "language_loss": 0.73793215, - "learning_rate": 3.2330544864676997e-06, - "loss": 0.75682855, - "num_input_tokens_seen": 110689410, - "step": 5154, - "time_per_iteration": 2.678454875946045 - }, - { - "auxiliary_loss_clip": 0.01132509, - "auxiliary_loss_mlp": 0.0103738, - "balance_loss_clip": 1.0507983, - "balance_loss_mlp": 1.02009416, - "epoch": 0.3099353675033819, - "flos": 15268284195840.0, - "grad_norm": 2.1601099672999586, - "language_loss": 0.76069349, - "learning_rate": 3.232747826832858e-06, - "loss": 0.78239238, - "num_input_tokens_seen": 110707350, - "step": 5155, - "time_per_iteration": 2.577634334564209 - }, - { - "auxiliary_loss_clip": 0.01131155, - "auxiliary_loss_mlp": 0.01040429, - "balance_loss_clip": 1.05483913, - "balance_loss_mlp": 1.02283418, - "epoch": 0.30999549075604993, - "flos": 15413794191360.0, - "grad_norm": 2.044896457109867, - "language_loss": 0.79096609, - "learning_rate": 3.232441120452094e-06, - "loss": 0.81268191, - "num_input_tokens_seen": 110724910, - "step": 5156, - "time_per_iteration": 2.628363609313965 - }, - { - "auxiliary_loss_clip": 0.01127429, - "auxiliary_loss_mlp": 0.01047381, - "balance_loss_clip": 1.04775023, - "balance_loss_mlp": 1.02779543, - "epoch": 0.3100556140087179, - "flos": 23184539712000.0, - "grad_norm": 2.468311845454126, - "language_loss": 0.74950963, - "learning_rate": 3.23213436733704e-06, - "loss": 0.77125776, - "num_input_tokens_seen": 110744010, - "step": 5157, - "time_per_iteration": 2.6231181621551514 - }, - { - "auxiliary_loss_clip": 0.01108321, - "auxiliary_loss_mlp": 0.01042715, - "balance_loss_clip": 1.04868615, - "balance_loss_mlp": 1.02634752, - "epoch": 0.31011573726138586, - "flos": 25742169676800.0, - "grad_norm": 1.6453166696914168, - "language_loss": 0.69648343, - "learning_rate": 3.231827567499327e-06, - "loss": 0.71799374, - "num_input_tokens_seen": 110765835, - "step": 5158, - "time_per_iteration": 2.734889030456543 - }, - { - "auxiliary_loss_clip": 0.01095116, - "auxiliary_loss_mlp": 0.01046106, - "balance_loss_clip": 1.04443944, - "balance_loss_mlp": 1.0301435, - "epoch": 0.3101758605140538, - "flos": 20011329440640.0, - "grad_norm": 1.9329481500014836, - "language_loss": 0.84861457, - "learning_rate": 3.2315207209505896e-06, - "loss": 0.87002677, - "num_input_tokens_seen": 110784655, - "step": 5159, - "time_per_iteration": 2.665311813354492 - }, - { - "auxiliary_loss_clip": 0.01116498, - "auxiliary_loss_mlp": 0.01046065, - "balance_loss_clip": 1.04710639, - "balance_loss_mlp": 1.02877951, - "epoch": 0.3102359837667218, - "flos": 19135683688320.0, - "grad_norm": 1.9614748869944683, - "language_loss": 0.85129201, - "learning_rate": 3.231213827702462e-06, - "loss": 0.87291765, - "num_input_tokens_seen": 110802545, - "step": 5160, - "time_per_iteration": 2.597130298614502 - }, - { - "auxiliary_loss_clip": 0.01133056, - "auxiliary_loss_mlp": 0.01042602, - "balance_loss_clip": 1.0520395, - "balance_loss_mlp": 1.02582884, - "epoch": 0.31029610701938976, - "flos": 22265405568000.0, - "grad_norm": 1.9459577302566504, - "language_loss": 0.75555152, - "learning_rate": 3.230906887766584e-06, - "loss": 0.77730811, - "num_input_tokens_seen": 110820265, - "step": 5161, - "time_per_iteration": 2.583240032196045 - }, - { - "auxiliary_loss_clip": 0.0113313, - "auxiliary_loss_mlp": 0.01045414, - "balance_loss_clip": 1.05046988, - "balance_loss_mlp": 1.02797401, - "epoch": 0.3103562302720577, - "flos": 20805349536000.0, - "grad_norm": 1.9938857241338979, - "language_loss": 0.8156144, - "learning_rate": 3.2305999011545924e-06, - "loss": 0.83739984, - "num_input_tokens_seen": 110836195, - "step": 5162, - "time_per_iteration": 2.495689630508423 - }, - { - "auxiliary_loss_clip": 0.01128762, - "auxiliary_loss_mlp": 0.01039959, - "balance_loss_clip": 1.04903293, - "balance_loss_mlp": 1.02450919, - "epoch": 0.3104163535247257, - "flos": 22344158136960.0, - "grad_norm": 1.777649785974679, - "language_loss": 0.82892883, - "learning_rate": 3.2302928678781295e-06, - "loss": 0.85061604, - "num_input_tokens_seen": 110856420, - "step": 5163, - "time_per_iteration": 2.591036081314087 - }, - { - "auxiliary_loss_clip": 0.01147486, - "auxiliary_loss_mlp": 0.01044526, - "balance_loss_clip": 1.05307984, - "balance_loss_mlp": 1.0273242, - "epoch": 0.31047647677739365, - "flos": 21689363157120.0, - "grad_norm": 1.875247009463239, - "language_loss": 0.76131678, - "learning_rate": 3.2299857879488376e-06, - "loss": 0.78323686, - "num_input_tokens_seen": 110876650, - "step": 5164, - "time_per_iteration": 2.5745677947998047 - }, - { - "auxiliary_loss_clip": 0.01103275, - "auxiliary_loss_mlp": 0.01046349, - "balance_loss_clip": 1.04969811, - "balance_loss_mlp": 1.02880108, - "epoch": 0.3105366000300616, - "flos": 18917275040640.0, - "grad_norm": 3.462886730904856, - "language_loss": 0.74514711, - "learning_rate": 3.2296786613783626e-06, - "loss": 0.7666434, - "num_input_tokens_seen": 110894445, - "step": 5165, - "time_per_iteration": 2.724846124649048 - }, - { - "auxiliary_loss_clip": 0.01100578, - "auxiliary_loss_mlp": 0.01057021, - "balance_loss_clip": 1.04695523, - "balance_loss_mlp": 1.03841233, - "epoch": 0.3105967232827296, - "flos": 18260397072000.0, - "grad_norm": 1.6273273492295701, - "language_loss": 0.75827682, - "learning_rate": 3.229371488178348e-06, - "loss": 0.77985275, - "num_input_tokens_seen": 110912855, - "step": 5166, - "time_per_iteration": 2.7309961318969727 - }, - { - "auxiliary_loss_clip": 0.01121318, - "auxiliary_loss_mlp": 0.01043526, - "balance_loss_clip": 1.04969096, - "balance_loss_mlp": 1.02665818, - "epoch": 0.31065684653539755, - "flos": 17672144037120.0, - "grad_norm": 2.1635307284170833, - "language_loss": 0.73621917, - "learning_rate": 3.229064268360444e-06, - "loss": 0.75786763, - "num_input_tokens_seen": 110928025, - "step": 5167, - "time_per_iteration": 2.623375654220581 - }, - { - "auxiliary_loss_clip": 0.01007539, - "auxiliary_loss_mlp": 0.01008435, - "balance_loss_clip": 1.02476823, - "balance_loss_mlp": 1.0059557, - "epoch": 0.3107169697880655, - "flos": 68531996511360.0, - "grad_norm": 0.7113763854018822, - "language_loss": 0.53030008, - "learning_rate": 3.2287570019362997e-06, - "loss": 0.55045986, - "num_input_tokens_seen": 110992215, - "step": 5168, - "time_per_iteration": 3.3115129470825195 - }, - { - "auxiliary_loss_clip": 0.01138497, - "auxiliary_loss_mlp": 0.01050074, - "balance_loss_clip": 1.05561399, - "balance_loss_mlp": 1.03151321, - "epoch": 0.3107770930407335, - "flos": 13188733274880.0, - "grad_norm": 3.621905149464154, - "language_loss": 0.79032969, - "learning_rate": 3.2284496889175668e-06, - "loss": 0.81221539, - "num_input_tokens_seen": 111010400, - "step": 5169, - "time_per_iteration": 2.595463514328003 - }, - { - "auxiliary_loss_clip": 0.01121822, - "auxiliary_loss_mlp": 0.01047209, - "balance_loss_clip": 1.04804373, - "balance_loss_mlp": 1.02937579, - "epoch": 0.3108372162934015, - "flos": 31580849520000.0, - "grad_norm": 1.57130024638105, - "language_loss": 0.64071, - "learning_rate": 3.2281423293158986e-06, - "loss": 0.66240036, - "num_input_tokens_seen": 111033960, - "step": 5170, - "time_per_iteration": 2.746469497680664 - }, - { - "auxiliary_loss_clip": 0.0110491, - "auxiliary_loss_mlp": 0.00776539, - "balance_loss_clip": 1.04874384, - "balance_loss_mlp": 1.00120461, - "epoch": 0.31089733954606946, - "flos": 28729829266560.0, - "grad_norm": 2.172069963879317, - "language_loss": 0.7723515, - "learning_rate": 3.22783492314295e-06, - "loss": 0.79116607, - "num_input_tokens_seen": 111053265, - "step": 5171, - "time_per_iteration": 2.776974678039551 - }, - { - "auxiliary_loss_clip": 0.01100832, - "auxiliary_loss_mlp": 0.01048172, - "balance_loss_clip": 1.049088, - "balance_loss_mlp": 1.03055298, - "epoch": 0.3109574627987374, - "flos": 19683249592320.0, - "grad_norm": 1.830523579545495, - "language_loss": 0.84020013, - "learning_rate": 3.2275274704103785e-06, - "loss": 0.86169016, - "num_input_tokens_seen": 111071130, - "step": 5172, - "time_per_iteration": 2.718118906021118 - }, - { - "auxiliary_loss_clip": 0.01091688, - "auxiliary_loss_mlp": 0.01045541, - "balance_loss_clip": 1.04622412, - "balance_loss_mlp": 1.02706313, - "epoch": 0.3110175860514054, - "flos": 14683981656960.0, - "grad_norm": 1.9540355263753015, - "language_loss": 0.83730888, - "learning_rate": 3.227219971129842e-06, - "loss": 0.8586812, - "num_input_tokens_seen": 111089560, - "step": 5173, - "time_per_iteration": 2.735163927078247 - }, - { - "auxiliary_loss_clip": 0.01145239, - "auxiliary_loss_mlp": 0.01042621, - "balance_loss_clip": 1.05589437, - "balance_loss_mlp": 1.02656341, - "epoch": 0.31107770930407336, - "flos": 25739655724800.0, - "grad_norm": 3.2612368513370495, - "language_loss": 0.83354348, - "learning_rate": 3.226912425313001e-06, - "loss": 0.85542202, - "num_input_tokens_seen": 111109960, - "step": 5174, - "time_per_iteration": 2.65226411819458 - }, - { - "auxiliary_loss_clip": 0.01122854, - "auxiliary_loss_mlp": 0.01046101, - "balance_loss_clip": 1.05162597, - "balance_loss_mlp": 1.02928042, - "epoch": 0.3111378325567413, - "flos": 19208259118080.0, - "grad_norm": 1.9777752297496725, - "language_loss": 0.85181922, - "learning_rate": 3.2266048329715183e-06, - "loss": 0.87350869, - "num_input_tokens_seen": 111127960, - "step": 5175, - "time_per_iteration": 2.6930692195892334 - }, - { - "auxiliary_loss_clip": 0.01087659, - "auxiliary_loss_mlp": 0.01044685, - "balance_loss_clip": 1.04638839, - "balance_loss_mlp": 1.02623129, - "epoch": 0.3111979558094093, - "flos": 23696374561920.0, - "grad_norm": 1.845729409399547, - "language_loss": 0.82990116, - "learning_rate": 3.2262971941170575e-06, - "loss": 0.8512246, - "num_input_tokens_seen": 111146730, - "step": 5176, - "time_per_iteration": 2.7975289821624756 - }, - { - "auxiliary_loss_clip": 0.01126555, - "auxiliary_loss_mlp": 0.01042513, - "balance_loss_clip": 1.04662132, - "balance_loss_mlp": 1.02361798, - "epoch": 0.31125807906207725, - "flos": 21033023892480.0, - "grad_norm": 1.9258407965023028, - "language_loss": 0.8096348, - "learning_rate": 3.2259895087612837e-06, - "loss": 0.83132547, - "num_input_tokens_seen": 111166295, - "step": 5177, - "time_per_iteration": 2.6275687217712402 - }, - { - "auxiliary_loss_clip": 0.01134117, - "auxiliary_loss_mlp": 0.0077682, - "balance_loss_clip": 1.05381465, - "balance_loss_mlp": 1.00119591, - "epoch": 0.3113182023147452, - "flos": 23076628277760.0, - "grad_norm": 1.6855068015846089, - "language_loss": 0.80707169, - "learning_rate": 3.2256817769158657e-06, - "loss": 0.82618099, - "num_input_tokens_seen": 111185665, - "step": 5178, - "time_per_iteration": 4.142611742019653 - }, - { - "auxiliary_loss_clip": 0.01119942, - "auxiliary_loss_mlp": 0.01047667, - "balance_loss_clip": 1.05289316, - "balance_loss_mlp": 1.03076327, - "epoch": 0.3113783255674132, - "flos": 11838994888320.0, - "grad_norm": 2.5880769767242633, - "language_loss": 0.80990803, - "learning_rate": 3.225373998592471e-06, - "loss": 0.83158416, - "num_input_tokens_seen": 111201615, - "step": 5179, - "time_per_iteration": 2.6429331302642822 - }, - { - "auxiliary_loss_clip": 0.01112505, - "auxiliary_loss_mlp": 0.01048581, - "balance_loss_clip": 1.05353093, - "balance_loss_mlp": 1.03139079, - "epoch": 0.31143844882008115, - "flos": 16289547684480.0, - "grad_norm": 2.4201759029551813, - "language_loss": 0.78532577, - "learning_rate": 3.2250661738027715e-06, - "loss": 0.80693662, - "num_input_tokens_seen": 111220515, - "step": 5180, - "time_per_iteration": 4.1918723583221436 - }, - { - "auxiliary_loss_clip": 0.01107686, - "auxiliary_loss_mlp": 0.01037212, - "balance_loss_clip": 1.05114985, - "balance_loss_mlp": 1.02011788, - "epoch": 0.3114985720727491, - "flos": 23217792727680.0, - "grad_norm": 1.6775849826612523, - "language_loss": 0.83088589, - "learning_rate": 3.22475830255844e-06, - "loss": 0.85233486, - "num_input_tokens_seen": 111240395, - "step": 5181, - "time_per_iteration": 2.760340929031372 - }, - { - "auxiliary_loss_clip": 0.01110614, - "auxiliary_loss_mlp": 0.01044232, - "balance_loss_clip": 1.04879427, - "balance_loss_mlp": 1.02881861, - "epoch": 0.3115586953254171, - "flos": 30044626698240.0, - "grad_norm": 1.766790552230027, - "language_loss": 0.74396992, - "learning_rate": 3.2244503848711516e-06, - "loss": 0.76551843, - "num_input_tokens_seen": 111261100, - "step": 5182, - "time_per_iteration": 2.7501730918884277 - }, - { - "auxiliary_loss_clip": 0.01093489, - "auxiliary_loss_mlp": 0.00776946, - "balance_loss_clip": 1.04811049, - "balance_loss_mlp": 1.00152898, - "epoch": 0.3116188185780851, - "flos": 25666326109440.0, - "grad_norm": 2.03695228940596, - "language_loss": 0.70169222, - "learning_rate": 3.2241424207525815e-06, - "loss": 0.72039658, - "num_input_tokens_seen": 111281320, - "step": 5183, - "time_per_iteration": 4.26041579246521 - }, - { - "auxiliary_loss_clip": 0.01017812, - "auxiliary_loss_mlp": 0.01006564, - "balance_loss_clip": 1.01984847, - "balance_loss_mlp": 1.00418019, - "epoch": 0.31167894183075306, - "flos": 69510058917120.0, - "grad_norm": 0.9394459872440335, - "language_loss": 0.59573013, - "learning_rate": 3.223834410214408e-06, - "loss": 0.61597383, - "num_input_tokens_seen": 111341405, - "step": 5184, - "time_per_iteration": 4.992337226867676 - }, - { - "auxiliary_loss_clip": 0.01115495, - "auxiliary_loss_mlp": 0.01050891, - "balance_loss_clip": 1.04588842, - "balance_loss_mlp": 1.03422523, - "epoch": 0.31173906508342103, - "flos": 14939845211520.0, - "grad_norm": 2.48453112640368, - "language_loss": 0.70156622, - "learning_rate": 3.223526353268311e-06, - "loss": 0.72323, - "num_input_tokens_seen": 111358975, - "step": 5185, - "time_per_iteration": 2.6406824588775635 - }, - { - "auxiliary_loss_clip": 0.01122412, - "auxiliary_loss_mlp": 0.01051261, - "balance_loss_clip": 1.05447555, - "balance_loss_mlp": 1.03405905, - "epoch": 0.311799188336089, - "flos": 16176033728640.0, - "grad_norm": 2.8983279272522853, - "language_loss": 0.63588691, - "learning_rate": 3.2232182499259725e-06, - "loss": 0.65762365, - "num_input_tokens_seen": 111375845, - "step": 5186, - "time_per_iteration": 2.683971881866455 - }, - { - "auxiliary_loss_clip": 0.01126858, - "auxiliary_loss_mlp": 0.01049881, - "balance_loss_clip": 1.05240881, - "balance_loss_mlp": 1.03145099, - "epoch": 0.31185931158875696, - "flos": 25009627708800.0, - "grad_norm": 2.2127415604209335, - "language_loss": 0.86427295, - "learning_rate": 3.2229101001990747e-06, - "loss": 0.88604033, - "num_input_tokens_seen": 111394150, - "step": 5187, - "time_per_iteration": 2.6983299255371094 - }, - { - "auxiliary_loss_clip": 0.01146114, - "auxiliary_loss_mlp": 0.0077496, - "balance_loss_clip": 1.05417776, - "balance_loss_mlp": 1.00131774, - "epoch": 0.3119194348414249, - "flos": 37232901273600.0, - "grad_norm": 1.653121843679143, - "language_loss": 0.63481069, - "learning_rate": 3.2226019040993036e-06, - "loss": 0.6540215, - "num_input_tokens_seen": 111418355, - "step": 5188, - "time_per_iteration": 2.6974728107452393 - }, - { - "auxiliary_loss_clip": 0.01106256, - "auxiliary_loss_mlp": 0.01044626, - "balance_loss_clip": 1.05064225, - "balance_loss_mlp": 1.02799582, - "epoch": 0.3119795580940929, - "flos": 15012779777280.0, - "grad_norm": 2.578497111530561, - "language_loss": 0.83241487, - "learning_rate": 3.222293661638346e-06, - "loss": 0.85392368, - "num_input_tokens_seen": 111435445, - "step": 5189, - "time_per_iteration": 2.6956889629364014 - }, - { - "auxiliary_loss_clip": 0.01031008, - "auxiliary_loss_mlp": 0.01045956, - "balance_loss_clip": 1.03804195, - "balance_loss_mlp": 1.02812243, - "epoch": 0.31203968134676086, - "flos": 15998168557440.0, - "grad_norm": 1.8156368008577992, - "language_loss": 0.79266763, - "learning_rate": 3.22198537282789e-06, - "loss": 0.81343722, - "num_input_tokens_seen": 111453430, - "step": 5190, - "time_per_iteration": 3.0180671215057373 - }, - { - "auxiliary_loss_clip": 0.01086186, - "auxiliary_loss_mlp": 0.01053443, - "balance_loss_clip": 1.04333639, - "balance_loss_mlp": 1.03413141, - "epoch": 0.3120998045994288, - "flos": 23837359443840.0, - "grad_norm": 1.571307617405072, - "language_loss": 0.75174087, - "learning_rate": 3.2216770376796262e-06, - "loss": 0.77313721, - "num_input_tokens_seen": 111475325, - "step": 5191, - "time_per_iteration": 3.0170204639434814 - }, - { - "auxiliary_loss_clip": 0.01043661, - "auxiliary_loss_mlp": 0.00755081, - "balance_loss_clip": 1.02154636, - "balance_loss_mlp": 1.00261629, - "epoch": 0.3121599278520968, - "flos": 69184205712000.0, - "grad_norm": 0.8534965117798614, - "language_loss": 0.63942307, - "learning_rate": 3.221368656205247e-06, - "loss": 0.6574105, - "num_input_tokens_seen": 111533960, - "step": 5192, - "time_per_iteration": 3.288938045501709 - }, - { - "auxiliary_loss_clip": 0.01133662, - "auxiliary_loss_mlp": 0.01043466, - "balance_loss_clip": 1.05246997, - "balance_loss_mlp": 1.02569187, - "epoch": 0.31222005110476475, - "flos": 23806368984960.0, - "grad_norm": 1.9226654053779162, - "language_loss": 0.7976644, - "learning_rate": 3.221060228416446e-06, - "loss": 0.81943566, - "num_input_tokens_seen": 111554055, - "step": 5193, - "time_per_iteration": 2.758859157562256 - }, - { - "auxiliary_loss_clip": 0.01117628, - "auxiliary_loss_mlp": 0.01054751, - "balance_loss_clip": 1.04916263, - "balance_loss_mlp": 1.03508139, - "epoch": 0.3122801743574327, - "flos": 25226132935680.0, - "grad_norm": 2.5170295869133024, - "language_loss": 0.72488689, - "learning_rate": 3.2207517543249183e-06, - "loss": 0.74661064, - "num_input_tokens_seen": 111574305, - "step": 5194, - "time_per_iteration": 2.69765567779541 - }, - { - "auxiliary_loss_clip": 0.01144699, - "auxiliary_loss_mlp": 0.01044476, - "balance_loss_clip": 1.05394197, - "balance_loss_mlp": 1.02819204, - "epoch": 0.3123402976101007, - "flos": 22966490200320.0, - "grad_norm": 1.775027795968239, - "language_loss": 0.76423192, - "learning_rate": 3.2204432339423616e-06, - "loss": 0.78612363, - "num_input_tokens_seen": 111595680, - "step": 5195, - "time_per_iteration": 2.665656566619873 - }, - { - "auxiliary_loss_clip": 0.01144607, - "auxiliary_loss_mlp": 0.01042079, - "balance_loss_clip": 1.05148935, - "balance_loss_mlp": 1.02544916, - "epoch": 0.3124004208627687, - "flos": 25192089820800.0, - "grad_norm": 1.4414001308378115, - "language_loss": 0.78089559, - "learning_rate": 3.220134667280476e-06, - "loss": 0.80276251, - "num_input_tokens_seen": 111618135, - "step": 5196, - "time_per_iteration": 2.682476282119751 - }, - { - "auxiliary_loss_clip": 0.01032618, - "auxiliary_loss_mlp": 0.00755246, - "balance_loss_clip": 1.02237272, - "balance_loss_mlp": 1.00273037, - "epoch": 0.31246054411543667, - "flos": 67485165517440.0, - "grad_norm": 0.794984063014186, - "language_loss": 0.54770386, - "learning_rate": 3.2198260543509613e-06, - "loss": 0.56558245, - "num_input_tokens_seen": 111682220, - "step": 5197, - "time_per_iteration": 3.24509334564209 - }, - { - "auxiliary_loss_clip": 0.01144094, - "auxiliary_loss_mlp": 0.01042495, - "balance_loss_clip": 1.0547365, - "balance_loss_mlp": 1.02586555, - "epoch": 0.31252066736810463, - "flos": 17858520731520.0, - "grad_norm": 1.8260094290654212, - "language_loss": 0.66137004, - "learning_rate": 3.21951739516552e-06, - "loss": 0.68323588, - "num_input_tokens_seen": 111700815, - "step": 5198, - "time_per_iteration": 2.5970942974090576 - }, - { - "auxiliary_loss_clip": 0.01102297, - "auxiliary_loss_mlp": 0.01047482, - "balance_loss_clip": 1.0459094, - "balance_loss_mlp": 1.02898037, - "epoch": 0.3125807906207726, - "flos": 18475034791680.0, - "grad_norm": 2.530729988117139, - "language_loss": 0.6949119, - "learning_rate": 3.219208689735857e-06, - "loss": 0.71640968, - "num_input_tokens_seen": 111718195, - "step": 5199, - "time_per_iteration": 2.6682288646698 - }, - { - "auxiliary_loss_clip": 0.01132634, - "auxiliary_loss_mlp": 0.01050152, - "balance_loss_clip": 1.04906189, - "balance_loss_mlp": 1.03258061, - "epoch": 0.31264091387344056, - "flos": 18946541646720.0, - "grad_norm": 1.8087592578592666, - "language_loss": 0.78480452, - "learning_rate": 3.2188999380736785e-06, - "loss": 0.8066324, - "num_input_tokens_seen": 111734440, - "step": 5200, - "time_per_iteration": 2.6664814949035645 - }, - { - "auxiliary_loss_clip": 0.01132139, - "auxiliary_loss_mlp": 0.01037041, - "balance_loss_clip": 1.05233109, - "balance_loss_mlp": 1.02036345, - "epoch": 0.3127010371261085, - "flos": 21468512384640.0, - "grad_norm": 2.0480479984687214, - "language_loss": 0.83231741, - "learning_rate": 3.2185911401906917e-06, - "loss": 0.85400921, - "num_input_tokens_seen": 111751960, - "step": 5201, - "time_per_iteration": 2.674558401107788 - }, - { - "auxiliary_loss_clip": 0.01144703, - "auxiliary_loss_mlp": 0.01045083, - "balance_loss_clip": 1.05244124, - "balance_loss_mlp": 1.02697527, - "epoch": 0.3127611603787765, - "flos": 15336047203200.0, - "grad_norm": 3.6217323271444037, - "language_loss": 0.6910159, - "learning_rate": 3.2182822960986072e-06, - "loss": 0.71291375, - "num_input_tokens_seen": 111769585, - "step": 5202, - "time_per_iteration": 2.563164710998535 - }, - { - "auxiliary_loss_clip": 0.01146715, - "auxiliary_loss_mlp": 0.01041598, - "balance_loss_clip": 1.05293012, - "balance_loss_mlp": 1.02608871, - "epoch": 0.31282128363144446, - "flos": 17602980399360.0, - "grad_norm": 1.898082303559049, - "language_loss": 0.84124672, - "learning_rate": 3.2179734058091358e-06, - "loss": 0.86312985, - "num_input_tokens_seen": 111787880, - "step": 5203, - "time_per_iteration": 2.6024506092071533 - }, - { - "auxiliary_loss_clip": 0.01086755, - "auxiliary_loss_mlp": 0.01049344, - "balance_loss_clip": 1.04461396, - "balance_loss_mlp": 1.03139079, - "epoch": 0.3128814068841124, - "flos": 26756753235840.0, - "grad_norm": 2.246749233698224, - "language_loss": 0.61165982, - "learning_rate": 3.2176644693339913e-06, - "loss": 0.63302082, - "num_input_tokens_seen": 111805950, - "step": 5204, - "time_per_iteration": 2.748486042022705 - }, - { - "auxiliary_loss_clip": 0.01105223, - "auxiliary_loss_mlp": 0.01043537, - "balance_loss_clip": 1.04439998, - "balance_loss_mlp": 1.02722907, - "epoch": 0.3129415301367804, - "flos": 22272372806400.0, - "grad_norm": 1.6432390116063589, - "language_loss": 0.65875763, - "learning_rate": 3.217355486684887e-06, - "loss": 0.68024528, - "num_input_tokens_seen": 111826135, - "step": 5205, - "time_per_iteration": 2.717499256134033 - }, - { - "auxiliary_loss_clip": 0.01134026, - "auxiliary_loss_mlp": 0.01046734, - "balance_loss_clip": 1.05126929, - "balance_loss_mlp": 1.02849531, - "epoch": 0.31300165338944835, - "flos": 26464907232000.0, - "grad_norm": 1.6106510494401134, - "language_loss": 0.76811433, - "learning_rate": 3.2170464578735414e-06, - "loss": 0.78992188, - "num_input_tokens_seen": 111844700, - "step": 5206, - "time_per_iteration": 2.642439603805542 - }, - { - "auxiliary_loss_clip": 0.01140688, - "auxiliary_loss_mlp": 0.01041131, - "balance_loss_clip": 1.04956853, - "balance_loss_mlp": 1.02448893, - "epoch": 0.3130617766421163, - "flos": 21944652094080.0, - "grad_norm": 2.214530025407602, - "language_loss": 0.83204615, - "learning_rate": 3.216737382911672e-06, - "loss": 0.85386431, - "num_input_tokens_seen": 111861585, - "step": 5207, - "time_per_iteration": 2.616652727127075 - }, - { - "auxiliary_loss_clip": 0.01127002, - "auxiliary_loss_mlp": 0.0104831, - "balance_loss_clip": 1.0502398, - "balance_loss_mlp": 1.0328126, - "epoch": 0.3131218998947843, - "flos": 23292774368640.0, - "grad_norm": 1.5207985149404841, - "language_loss": 0.71359724, - "learning_rate": 3.216428261810999e-06, - "loss": 0.73535037, - "num_input_tokens_seen": 111882950, - "step": 5208, - "time_per_iteration": 2.674813747406006 - }, - { - "auxiliary_loss_clip": 0.01120564, - "auxiliary_loss_mlp": 0.01045064, - "balance_loss_clip": 1.04862344, - "balance_loss_mlp": 1.02827978, - "epoch": 0.3131820231474523, - "flos": 21139642437120.0, - "grad_norm": 1.848256205390157, - "language_loss": 0.74558908, - "learning_rate": 3.2161190945832445e-06, - "loss": 0.76724535, - "num_input_tokens_seen": 111901640, - "step": 5209, - "time_per_iteration": 2.7193644046783447 - }, - { - "auxiliary_loss_clip": 0.01140035, - "auxiliary_loss_mlp": 0.01045727, - "balance_loss_clip": 1.04733396, - "balance_loss_mlp": 1.02937174, - "epoch": 0.31324214640012027, - "flos": 23909863046400.0, - "grad_norm": 2.0633998475681135, - "language_loss": 0.77254915, - "learning_rate": 3.2158098812401325e-06, - "loss": 0.79440677, - "num_input_tokens_seen": 111919615, - "step": 5210, - "time_per_iteration": 2.6212270259857178 - }, - { - "auxiliary_loss_clip": 0.01125553, - "auxiliary_loss_mlp": 0.01039925, - "balance_loss_clip": 1.047261, - "balance_loss_mlp": 1.02385592, - "epoch": 0.31330226965278823, - "flos": 22236929061120.0, - "grad_norm": 1.9577389211395706, - "language_loss": 0.79128736, - "learning_rate": 3.2155006217933874e-06, - "loss": 0.81294215, - "num_input_tokens_seen": 111938485, - "step": 5211, - "time_per_iteration": 2.6618316173553467 - }, - { - "auxiliary_loss_clip": 0.01132257, - "auxiliary_loss_mlp": 0.01042587, - "balance_loss_clip": 1.05107522, - "balance_loss_mlp": 1.02768588, - "epoch": 0.3133623929054562, - "flos": 19753993428480.0, - "grad_norm": 2.4581961413264195, - "language_loss": 0.79612064, - "learning_rate": 3.2151913162547367e-06, - "loss": 0.81786901, - "num_input_tokens_seen": 111956425, - "step": 5212, - "time_per_iteration": 2.81793475151062 - }, - { - "auxiliary_loss_clip": 0.01125931, - "auxiliary_loss_mlp": 0.01053393, - "balance_loss_clip": 1.05156052, - "balance_loss_mlp": 1.03576159, - "epoch": 0.31342251615812416, - "flos": 27162256849920.0, - "grad_norm": 2.69561664367352, - "language_loss": 0.71024299, - "learning_rate": 3.2148819646359097e-06, - "loss": 0.73203623, - "num_input_tokens_seen": 111975915, - "step": 5213, - "time_per_iteration": 2.6739485263824463 - }, - { - "auxiliary_loss_clip": 0.01132672, - "auxiliary_loss_mlp": 0.01045903, - "balance_loss_clip": 1.05284989, - "balance_loss_mlp": 1.02961898, - "epoch": 0.31348263941079213, - "flos": 20229809915520.0, - "grad_norm": 1.9828215257111186, - "language_loss": 0.77684069, - "learning_rate": 3.2145725669486374e-06, - "loss": 0.79862642, - "num_input_tokens_seen": 111995055, - "step": 5214, - "time_per_iteration": 2.6108171939849854 - }, - { - "auxiliary_loss_clip": 0.01099316, - "auxiliary_loss_mlp": 0.01038553, - "balance_loss_clip": 1.0522778, - "balance_loss_mlp": 1.02317524, - "epoch": 0.3135427626634601, - "flos": 24607643627520.0, - "grad_norm": 2.2634840816113075, - "language_loss": 0.8300609, - "learning_rate": 3.2142631232046517e-06, - "loss": 0.8514396, - "num_input_tokens_seen": 112015830, - "step": 5215, - "time_per_iteration": 2.77897047996521 - }, - { - "auxiliary_loss_clip": 0.01131919, - "auxiliary_loss_mlp": 0.01040929, - "balance_loss_clip": 1.05089617, - "balance_loss_mlp": 1.02375078, - "epoch": 0.31360288591612806, - "flos": 20959873845120.0, - "grad_norm": 2.280765330466862, - "language_loss": 0.79540187, - "learning_rate": 3.213953633415686e-06, - "loss": 0.81713033, - "num_input_tokens_seen": 112035065, - "step": 5216, - "time_per_iteration": 2.675492763519287 - }, - { - "auxiliary_loss_clip": 0.01119434, - "auxiliary_loss_mlp": 0.01049814, - "balance_loss_clip": 1.04817545, - "balance_loss_mlp": 1.03174222, - "epoch": 0.313663009168796, - "flos": 26980513009920.0, - "grad_norm": 1.97082305961493, - "language_loss": 0.69007474, - "learning_rate": 3.213644097593477e-06, - "loss": 0.7117672, - "num_input_tokens_seen": 112058405, - "step": 5217, - "time_per_iteration": 2.7360196113586426 - }, - { - "auxiliary_loss_clip": 0.01121348, - "auxiliary_loss_mlp": 0.01038659, - "balance_loss_clip": 1.04833519, - "balance_loss_mlp": 1.02275062, - "epoch": 0.313723132421464, - "flos": 18040911016320.0, - "grad_norm": 1.7253432561329243, - "language_loss": 0.81228399, - "learning_rate": 3.2133345157497624e-06, - "loss": 0.83388406, - "num_input_tokens_seen": 112076420, - "step": 5218, - "time_per_iteration": 4.393778562545776 - }, - { - "auxiliary_loss_clip": 0.01139073, - "auxiliary_loss_mlp": 0.01041023, - "balance_loss_clip": 1.04819143, - "balance_loss_mlp": 1.02422082, - "epoch": 0.31378325567413196, - "flos": 22488913946880.0, - "grad_norm": 2.6452768271158167, - "language_loss": 0.69128895, - "learning_rate": 3.2130248878962813e-06, - "loss": 0.71308994, - "num_input_tokens_seen": 112090775, - "step": 5219, - "time_per_iteration": 4.162578344345093 - }, - { - "auxiliary_loss_clip": 0.01117748, - "auxiliary_loss_mlp": 0.01044298, - "balance_loss_clip": 1.04879618, - "balance_loss_mlp": 1.0287652, - "epoch": 0.3138433789267999, - "flos": 22419247518720.0, - "grad_norm": 5.057996341652072, - "language_loss": 0.80019122, - "learning_rate": 3.2127152140447747e-06, - "loss": 0.82181168, - "num_input_tokens_seen": 112110980, - "step": 5220, - "time_per_iteration": 2.693300247192383 - }, - { - "auxiliary_loss_clip": 0.01133002, - "auxiliary_loss_mlp": 0.01038024, - "balance_loss_clip": 1.05214572, - "balance_loss_mlp": 1.0220139, - "epoch": 0.3139035021794679, - "flos": 13005912026880.0, - "grad_norm": 1.7918234828134079, - "language_loss": 0.72575235, - "learning_rate": 3.212405494206986e-06, - "loss": 0.74746263, - "num_input_tokens_seen": 112129020, - "step": 5221, - "time_per_iteration": 2.6918861865997314 - }, - { - "auxiliary_loss_clip": 0.01105754, - "auxiliary_loss_mlp": 0.0104005, - "balance_loss_clip": 1.04538214, - "balance_loss_mlp": 1.02435017, - "epoch": 0.31396362543213585, - "flos": 16945994689920.0, - "grad_norm": 1.7850671432610508, - "language_loss": 0.82097268, - "learning_rate": 3.2120957283946588e-06, - "loss": 0.84243071, - "num_input_tokens_seen": 112147865, - "step": 5222, - "time_per_iteration": 4.193262100219727 - }, - { - "auxiliary_loss_clip": 0.01136096, - "auxiliary_loss_mlp": 0.01044943, - "balance_loss_clip": 1.05302894, - "balance_loss_mlp": 1.02764595, - "epoch": 0.31402374868480387, - "flos": 20156731695360.0, - "grad_norm": 2.3946225731958073, - "language_loss": 0.70159894, - "learning_rate": 3.2117859166195407e-06, - "loss": 0.7234093, - "num_input_tokens_seen": 112166745, - "step": 5223, - "time_per_iteration": 2.642608642578125 - }, - { - "auxiliary_loss_clip": 0.01120375, - "auxiliary_loss_mlp": 0.00773089, - "balance_loss_clip": 1.04545665, - "balance_loss_mlp": 1.0012387, - "epoch": 0.31408387193747184, - "flos": 21251073404160.0, - "grad_norm": 1.5662600408509175, - "language_loss": 0.80818307, - "learning_rate": 3.211476058893379e-06, - "loss": 0.82711768, - "num_input_tokens_seen": 112185895, - "step": 5224, - "time_per_iteration": 4.334134101867676 - }, - { - "auxiliary_loss_clip": 0.0113849, - "auxiliary_loss_mlp": 0.01044903, - "balance_loss_clip": 1.05376673, - "balance_loss_mlp": 1.02807033, - "epoch": 0.3141439951901398, - "flos": 27484267299840.0, - "grad_norm": 2.581635190586104, - "language_loss": 0.57647121, - "learning_rate": 3.2111661552279243e-06, - "loss": 0.59830517, - "num_input_tokens_seen": 112204465, - "step": 5225, - "time_per_iteration": 2.680227041244507 - }, - { - "auxiliary_loss_clip": 0.01086502, - "auxiliary_loss_mlp": 0.01032759, - "balance_loss_clip": 1.04252625, - "balance_loss_mlp": 1.0179472, - "epoch": 0.31420411844280777, - "flos": 17852235851520.0, - "grad_norm": 2.0500851879408577, - "language_loss": 0.81726074, - "learning_rate": 3.2108562056349273e-06, - "loss": 0.83845341, - "num_input_tokens_seen": 112221635, - "step": 5226, - "time_per_iteration": 2.8080878257751465 - }, - { - "auxiliary_loss_clip": 0.01123539, - "auxiliary_loss_mlp": 0.01053238, - "balance_loss_clip": 1.04718053, - "balance_loss_mlp": 1.03557122, - "epoch": 0.31426424169547573, - "flos": 21616967295360.0, - "grad_norm": 1.8156350578732643, - "language_loss": 0.7435357, - "learning_rate": 3.210546210126141e-06, - "loss": 0.76530349, - "num_input_tokens_seen": 112241240, - "step": 5227, - "time_per_iteration": 2.6420040130615234 - }, - { - "auxiliary_loss_clip": 0.01128154, - "auxiliary_loss_mlp": 0.01036288, - "balance_loss_clip": 1.05315053, - "balance_loss_mlp": 1.01981306, - "epoch": 0.3143243649481437, - "flos": 30920631586560.0, - "grad_norm": 1.9798889840887306, - "language_loss": 0.6779027, - "learning_rate": 3.2102361687133213e-06, - "loss": 0.69954711, - "num_input_tokens_seen": 112262350, - "step": 5228, - "time_per_iteration": 2.6904454231262207 - }, - { - "auxiliary_loss_clip": 0.01116854, - "auxiliary_loss_mlp": 0.01042698, - "balance_loss_clip": 1.04812217, - "balance_loss_mlp": 1.02755868, - "epoch": 0.31438448820081166, - "flos": 22821411168000.0, - "grad_norm": 2.2592581290101648, - "language_loss": 0.802086, - "learning_rate": 3.2099260814082254e-06, - "loss": 0.82368147, - "num_input_tokens_seen": 112283710, - "step": 5229, - "time_per_iteration": 2.720972776412964 - }, - { - "auxiliary_loss_clip": 0.01116185, - "auxiliary_loss_mlp": 0.01034979, - "balance_loss_clip": 1.04888391, - "balance_loss_mlp": 1.01917148, - "epoch": 0.3144446114534796, - "flos": 23292127923840.0, - "grad_norm": 2.206396959728329, - "language_loss": 0.69972271, - "learning_rate": 3.209615948222611e-06, - "loss": 0.72123438, - "num_input_tokens_seen": 112304285, - "step": 5230, - "time_per_iteration": 2.69555401802063 - }, - { - "auxiliary_loss_clip": 0.01094216, - "auxiliary_loss_mlp": 0.01051308, - "balance_loss_clip": 1.042889, - "balance_loss_mlp": 1.03331971, - "epoch": 0.3145047347061476, - "flos": 31355976424320.0, - "grad_norm": 11.083232715551919, - "language_loss": 0.79441226, - "learning_rate": 3.209305769168239e-06, - "loss": 0.81586754, - "num_input_tokens_seen": 112325110, - "step": 5231, - "time_per_iteration": 2.742414712905884 - }, - { - "auxiliary_loss_clip": 0.01111136, - "auxiliary_loss_mlp": 0.01044032, - "balance_loss_clip": 1.05004621, - "balance_loss_mlp": 1.02751017, - "epoch": 0.31456485795881556, - "flos": 10889552643840.0, - "grad_norm": 68.21693219117104, - "language_loss": 0.84846044, - "learning_rate": 3.2089955442568704e-06, - "loss": 0.87001216, - "num_input_tokens_seen": 112339855, - "step": 5232, - "time_per_iteration": 2.681541919708252 - }, - { - "auxiliary_loss_clip": 0.01082351, - "auxiliary_loss_mlp": 0.01063678, - "balance_loss_clip": 1.04169703, - "balance_loss_mlp": 1.04589176, - "epoch": 0.3146249812114835, - "flos": 17092438439040.0, - "grad_norm": 1.732593505271442, - "language_loss": 0.79899549, - "learning_rate": 3.2086852735002692e-06, - "loss": 0.82045579, - "num_input_tokens_seen": 112358480, - "step": 5233, - "time_per_iteration": 2.7261524200439453 - }, - { - "auxiliary_loss_clip": 0.01095476, - "auxiliary_loss_mlp": 0.01043701, - "balance_loss_clip": 1.04795146, - "balance_loss_mlp": 1.02775121, - "epoch": 0.3146851044641515, - "flos": 55291442889600.0, - "grad_norm": 1.8884411146751285, - "language_loss": 0.71124369, - "learning_rate": 3.2083749569102024e-06, - "loss": 0.73263544, - "num_input_tokens_seen": 112382350, - "step": 5234, - "time_per_iteration": 3.0071427822113037 - }, - { - "auxiliary_loss_clip": 0.01105209, - "auxiliary_loss_mlp": 0.01036666, - "balance_loss_clip": 1.05008078, - "balance_loss_mlp": 1.02060878, - "epoch": 0.31474522771681945, - "flos": 27015884928000.0, - "grad_norm": 2.1537517260325396, - "language_loss": 0.72106552, - "learning_rate": 3.2080645944984356e-06, - "loss": 0.74248433, - "num_input_tokens_seen": 112400260, - "step": 5235, - "time_per_iteration": 2.7347464561462402 - }, - { - "auxiliary_loss_clip": 0.011281, - "auxiliary_loss_mlp": 0.0103842, - "balance_loss_clip": 1.0479089, - "balance_loss_mlp": 1.0225656, - "epoch": 0.3148053509694875, - "flos": 21251935330560.0, - "grad_norm": 2.047935998004664, - "language_loss": 0.78640145, - "learning_rate": 3.2077541862767384e-06, - "loss": 0.80806667, - "num_input_tokens_seen": 112419400, - "step": 5236, - "time_per_iteration": 2.6480181217193604 - }, - { - "auxiliary_loss_clip": 0.01142531, - "auxiliary_loss_mlp": 0.0104222, - "balance_loss_clip": 1.04929006, - "balance_loss_mlp": 1.02536416, - "epoch": 0.31486547422215544, - "flos": 31248675521280.0, - "grad_norm": 1.8469097199945863, - "language_loss": 0.75903904, - "learning_rate": 3.207443732256881e-06, - "loss": 0.78088653, - "num_input_tokens_seen": 112440825, - "step": 5237, - "time_per_iteration": 2.7113847732543945 - }, - { - "auxiliary_loss_clip": 0.01133953, - "auxiliary_loss_mlp": 0.01035749, - "balance_loss_clip": 1.04817045, - "balance_loss_mlp": 1.02128255, - "epoch": 0.3149255974748234, - "flos": 19828615933440.0, - "grad_norm": 2.176202072112168, - "language_loss": 0.79725033, - "learning_rate": 3.2071332324506372e-06, - "loss": 0.81894737, - "num_input_tokens_seen": 112459180, - "step": 5238, - "time_per_iteration": 2.649968147277832 - }, - { - "auxiliary_loss_clip": 0.01046118, - "auxiliary_loss_mlp": 0.01018852, - "balance_loss_clip": 1.02561212, - "balance_loss_mlp": 1.01676548, - "epoch": 0.31498572072749137, - "flos": 67683965339520.0, - "grad_norm": 0.8324046464960934, - "language_loss": 0.67913729, - "learning_rate": 3.2068226868697795e-06, - "loss": 0.69978696, - "num_input_tokens_seen": 112516680, - "step": 5239, - "time_per_iteration": 3.130643606185913 - }, - { - "auxiliary_loss_clip": 0.01121581, - "auxiliary_loss_mlp": 0.01043617, - "balance_loss_clip": 1.04828835, - "balance_loss_mlp": 1.02528274, - "epoch": 0.31504584398015933, - "flos": 19793136274560.0, - "grad_norm": 2.4702861290170235, - "language_loss": 0.82906926, - "learning_rate": 3.2065120955260846e-06, - "loss": 0.85072124, - "num_input_tokens_seen": 112535895, - "step": 5240, - "time_per_iteration": 2.6314027309417725 - }, - { - "auxiliary_loss_clip": 0.0111196, - "auxiliary_loss_mlp": 0.0077379, - "balance_loss_clip": 1.04708409, - "balance_loss_mlp": 1.00132334, - "epoch": 0.3151059672328273, - "flos": 26615409217920.0, - "grad_norm": 1.6854261536361361, - "language_loss": 0.81405544, - "learning_rate": 3.2062014584313302e-06, - "loss": 0.83291298, - "num_input_tokens_seen": 112557490, - "step": 5241, - "time_per_iteration": 2.7245657444000244 - }, - { - "auxiliary_loss_clip": 0.01138561, - "auxiliary_loss_mlp": 0.01038584, - "balance_loss_clip": 1.05094576, - "balance_loss_mlp": 1.0230633, - "epoch": 0.31516609048549526, - "flos": 24204438483840.0, - "grad_norm": 1.7554610875937957, - "language_loss": 0.74513441, - "learning_rate": 3.2058907755972956e-06, - "loss": 0.7669059, - "num_input_tokens_seen": 112577075, - "step": 5242, - "time_per_iteration": 2.5925803184509277 - }, - { - "auxiliary_loss_clip": 0.01106752, - "auxiliary_loss_mlp": 0.01039069, - "balance_loss_clip": 1.04686832, - "balance_loss_mlp": 1.02230775, - "epoch": 0.31522621373816323, - "flos": 25958710817280.0, - "grad_norm": 12.905078117761404, - "language_loss": 0.73457384, - "learning_rate": 3.2055800470357626e-06, - "loss": 0.75603199, - "num_input_tokens_seen": 112597620, - "step": 5243, - "time_per_iteration": 2.721261739730835 - }, - { - "auxiliary_loss_clip": 0.01126602, - "auxiliary_loss_mlp": 0.01041378, - "balance_loss_clip": 1.04783881, - "balance_loss_mlp": 1.02524936, - "epoch": 0.3152863369908312, - "flos": 21908813299200.0, - "grad_norm": 2.079273463581607, - "language_loss": 0.6462577, - "learning_rate": 3.205269272758513e-06, - "loss": 0.66793752, - "num_input_tokens_seen": 112617150, - "step": 5244, - "time_per_iteration": 2.6753153800964355 - }, - { - "auxiliary_loss_clip": 0.01087107, - "auxiliary_loss_mlp": 0.01037472, - "balance_loss_clip": 1.04454994, - "balance_loss_mlp": 1.02158141, - "epoch": 0.31534646024349916, - "flos": 16281072074880.0, - "grad_norm": 2.126512737541558, - "language_loss": 0.91117549, - "learning_rate": 3.2049584527773313e-06, - "loss": 0.93242127, - "num_input_tokens_seen": 112631090, - "step": 5245, - "time_per_iteration": 2.717316150665283 - }, - { - "auxiliary_loss_clip": 0.01129236, - "auxiliary_loss_mlp": 0.01046116, - "balance_loss_clip": 1.04892504, - "balance_loss_mlp": 1.02911687, - "epoch": 0.3154065834961671, - "flos": 24717243000960.0, - "grad_norm": 2.0341104694483296, - "language_loss": 0.75199413, - "learning_rate": 3.2046475871040048e-06, - "loss": 0.77374756, - "num_input_tokens_seen": 112651220, - "step": 5246, - "time_per_iteration": 2.738969564437866 - }, - { - "auxiliary_loss_clip": 0.01139621, - "auxiliary_loss_mlp": 0.01044826, - "balance_loss_clip": 1.04860735, - "balance_loss_mlp": 1.027946, - "epoch": 0.3154667067488351, - "flos": 35371148469120.0, - "grad_norm": 1.7161631839732394, - "language_loss": 0.61524433, - "learning_rate": 3.204336675750321e-06, - "loss": 0.63708878, - "num_input_tokens_seen": 112671560, - "step": 5247, - "time_per_iteration": 2.714258909225464 - }, - { - "auxiliary_loss_clip": 0.01129569, - "auxiliary_loss_mlp": 0.0104508, - "balance_loss_clip": 1.04842138, - "balance_loss_mlp": 1.0283072, - "epoch": 0.31552683000150306, - "flos": 17456464823040.0, - "grad_norm": 2.438581052681848, - "language_loss": 0.82096362, - "learning_rate": 3.2040257187280693e-06, - "loss": 0.84271014, - "num_input_tokens_seen": 112689790, - "step": 5248, - "time_per_iteration": 2.6235198974609375 - }, - { - "auxiliary_loss_clip": 0.01121718, - "auxiliary_loss_mlp": 0.01047358, - "balance_loss_clip": 1.04964209, - "balance_loss_mlp": 1.0292145, - "epoch": 0.3155869532541711, - "flos": 18405763413120.0, - "grad_norm": 5.654706808285272, - "language_loss": 0.84601712, - "learning_rate": 3.2037147160490423e-06, - "loss": 0.86770785, - "num_input_tokens_seen": 112708265, - "step": 5249, - "time_per_iteration": 2.664454698562622 - }, - { - "auxiliary_loss_clip": 0.01105599, - "auxiliary_loss_mlp": 0.01040266, - "balance_loss_clip": 1.04724038, - "balance_loss_mlp": 1.02252758, - "epoch": 0.31564707650683904, - "flos": 21579763783680.0, - "grad_norm": 2.1333510394712034, - "language_loss": 0.85412121, - "learning_rate": 3.2034036677250322e-06, - "loss": 0.87557989, - "num_input_tokens_seen": 112727820, - "step": 5250, - "time_per_iteration": 2.7892768383026123 - }, - { - "auxiliary_loss_clip": 0.01110748, - "auxiliary_loss_mlp": 0.01044305, - "balance_loss_clip": 1.04626083, - "balance_loss_mlp": 1.02721059, - "epoch": 0.315707199759507, - "flos": 21030976817280.0, - "grad_norm": 3.250818956981283, - "language_loss": 0.68651402, - "learning_rate": 3.203092573767835e-06, - "loss": 0.70806456, - "num_input_tokens_seen": 112743140, - "step": 5251, - "time_per_iteration": 2.660738468170166 - }, - { - "auxiliary_loss_clip": 0.01141131, - "auxiliary_loss_mlp": 0.01040852, - "balance_loss_clip": 1.05063367, - "balance_loss_mlp": 1.02374566, - "epoch": 0.31576732301217497, - "flos": 26828861788800.0, - "grad_norm": 1.6959923935223091, - "language_loss": 0.79367268, - "learning_rate": 3.202781434189246e-06, - "loss": 0.81549257, - "num_input_tokens_seen": 112764705, - "step": 5252, - "time_per_iteration": 2.6600146293640137 - }, - { - "auxiliary_loss_clip": 0.01123952, - "auxiliary_loss_mlp": 0.01055554, - "balance_loss_clip": 1.04919744, - "balance_loss_mlp": 1.03742182, - "epoch": 0.31582744626484294, - "flos": 22711165349760.0, - "grad_norm": 1.5850214403847396, - "language_loss": 0.74167955, - "learning_rate": 3.202470249001066e-06, - "loss": 0.76347458, - "num_input_tokens_seen": 112785310, - "step": 5253, - "time_per_iteration": 2.6831557750701904 - }, - { - "auxiliary_loss_clip": 0.01117625, - "auxiliary_loss_mlp": 0.01042879, - "balance_loss_clip": 1.04685211, - "balance_loss_mlp": 1.02571261, - "epoch": 0.3158875695175109, - "flos": 23951914894080.0, - "grad_norm": 1.8578399335985847, - "language_loss": 0.73295557, - "learning_rate": 3.2021590182150924e-06, - "loss": 0.75456059, - "num_input_tokens_seen": 112802905, - "step": 5254, - "time_per_iteration": 2.664445161819458 - }, - { - "auxiliary_loss_clip": 0.0112999, - "auxiliary_loss_mlp": 0.0104166, - "balance_loss_clip": 1.04998255, - "balance_loss_mlp": 1.02442837, - "epoch": 0.31594769277017887, - "flos": 13261883322240.0, - "grad_norm": 1.9116991379626416, - "language_loss": 0.77497417, - "learning_rate": 3.201847741843128e-06, - "loss": 0.7966907, - "num_input_tokens_seen": 112820305, - "step": 5255, - "time_per_iteration": 2.5817084312438965 - }, - { - "auxiliary_loss_clip": 0.01116092, - "auxiliary_loss_mlp": 0.01045862, - "balance_loss_clip": 1.0481391, - "balance_loss_mlp": 1.02718151, - "epoch": 0.31600781602284683, - "flos": 23368258800000.0, - "grad_norm": 2.396272573281143, - "language_loss": 0.7821492, - "learning_rate": 3.2015364198969772e-06, - "loss": 0.80376875, - "num_input_tokens_seen": 112841185, - "step": 5256, - "time_per_iteration": 2.6798577308654785 - }, - { - "auxiliary_loss_clip": 0.0109858, - "auxiliary_loss_mlp": 0.01042238, - "balance_loss_clip": 1.04874921, - "balance_loss_mlp": 1.02676511, - "epoch": 0.3160679392755148, - "flos": 19828580019840.0, - "grad_norm": 1.575034121408654, - "language_loss": 0.71175283, - "learning_rate": 3.2012250523884453e-06, - "loss": 0.73316103, - "num_input_tokens_seen": 112860570, - "step": 5257, - "time_per_iteration": 4.252342462539673 - }, - { - "auxiliary_loss_clip": 0.01132481, - "auxiliary_loss_mlp": 0.01043271, - "balance_loss_clip": 1.05120182, - "balance_loss_mlp": 1.02524674, - "epoch": 0.31612806252818276, - "flos": 20193216935040.0, - "grad_norm": 2.0196036815267036, - "language_loss": 0.76539034, - "learning_rate": 3.2009136393293393e-06, - "loss": 0.78714788, - "num_input_tokens_seen": 112877975, - "step": 5258, - "time_per_iteration": 4.240477085113525 - }, - { - "auxiliary_loss_clip": 0.01110908, - "auxiliary_loss_mlp": 0.01047088, - "balance_loss_clip": 1.04727268, - "balance_loss_mlp": 1.02917099, - "epoch": 0.31618818578085073, - "flos": 24235967646720.0, - "grad_norm": 3.2354010090655403, - "language_loss": 0.72901475, - "learning_rate": 3.200602180731467e-06, - "loss": 0.75059474, - "num_input_tokens_seen": 112896170, - "step": 5259, - "time_per_iteration": 2.726944923400879 - }, - { - "auxiliary_loss_clip": 0.01117115, - "auxiliary_loss_mlp": 0.00776982, - "balance_loss_clip": 1.04983401, - "balance_loss_mlp": 1.0013001, - "epoch": 0.3162483090335187, - "flos": 25081844002560.0, - "grad_norm": 2.1961272089612307, - "language_loss": 0.66124642, - "learning_rate": 3.20029067660664e-06, - "loss": 0.68018734, - "num_input_tokens_seen": 112916180, - "step": 5260, - "time_per_iteration": 2.7605621814727783 - }, - { - "auxiliary_loss_clip": 0.01130372, - "auxiliary_loss_mlp": 0.01037108, - "balance_loss_clip": 1.04645884, - "balance_loss_mlp": 1.02016842, - "epoch": 0.31630843228618666, - "flos": 26323383646080.0, - "grad_norm": 1.8277182943015604, - "language_loss": 0.71989, - "learning_rate": 3.1999791269666706e-06, - "loss": 0.74156475, - "num_input_tokens_seen": 112936745, - "step": 5261, - "time_per_iteration": 4.231431484222412 - }, - { - "auxiliary_loss_clip": 0.01044321, - "auxiliary_loss_mlp": 0.01007323, - "balance_loss_clip": 1.02311194, - "balance_loss_mlp": 1.00424767, - "epoch": 0.3163685555388547, - "flos": 66758441552640.0, - "grad_norm": 0.7429950107461195, - "language_loss": 0.50646758, - "learning_rate": 3.1996675318233716e-06, - "loss": 0.5269841, - "num_input_tokens_seen": 112994845, - "step": 5262, - "time_per_iteration": 3.232384443283081 - }, - { - "auxiliary_loss_clip": 0.01131333, - "auxiliary_loss_mlp": 0.01046761, - "balance_loss_clip": 1.05222106, - "balance_loss_mlp": 1.02932084, - "epoch": 0.31642867879152264, - "flos": 25995662933760.0, - "grad_norm": 1.5863649349069382, - "language_loss": 0.85187083, - "learning_rate": 3.19935589118856e-06, - "loss": 0.8736518, - "num_input_tokens_seen": 113015125, - "step": 5263, - "time_per_iteration": 4.33522629737854 - }, - { - "auxiliary_loss_clip": 0.01112644, - "auxiliary_loss_mlp": 0.01048382, - "balance_loss_clip": 1.04875994, - "balance_loss_mlp": 1.03256297, - "epoch": 0.3164888020441906, - "flos": 25774955815680.0, - "grad_norm": 1.550008856477613, - "language_loss": 0.81648135, - "learning_rate": 3.1990442050740535e-06, - "loss": 0.83809161, - "num_input_tokens_seen": 113035535, - "step": 5264, - "time_per_iteration": 2.8155312538146973 - }, - { - "auxiliary_loss_clip": 0.01121259, - "auxiliary_loss_mlp": 0.0104222, - "balance_loss_clip": 1.04812968, - "balance_loss_mlp": 1.02431464, - "epoch": 0.3165489252968586, - "flos": 19756220071680.0, - "grad_norm": 2.234025317189389, - "language_loss": 0.78969181, - "learning_rate": 3.19873247349167e-06, - "loss": 0.81132656, - "num_input_tokens_seen": 113052720, - "step": 5265, - "time_per_iteration": 2.6533524990081787 - }, - { - "auxiliary_loss_clip": 0.0113452, - "auxiliary_loss_mlp": 0.01049591, - "balance_loss_clip": 1.05209899, - "balance_loss_mlp": 1.03144741, - "epoch": 0.31660904854952654, - "flos": 23183929180800.0, - "grad_norm": 1.789116232573577, - "language_loss": 0.74705631, - "learning_rate": 3.1984206964532307e-06, - "loss": 0.76889741, - "num_input_tokens_seen": 113071435, - "step": 5266, - "time_per_iteration": 2.66683292388916 - }, - { - "auxiliary_loss_clip": 0.01108402, - "auxiliary_loss_mlp": 0.0104338, - "balance_loss_clip": 1.04636073, - "balance_loss_mlp": 1.02660751, - "epoch": 0.3166691718021945, - "flos": 20408501099520.0, - "grad_norm": 2.507852328081816, - "language_loss": 0.79178059, - "learning_rate": 3.1981088739705585e-06, - "loss": 0.81329834, - "num_input_tokens_seen": 113088645, - "step": 5267, - "time_per_iteration": 2.6870310306549072 - }, - { - "auxiliary_loss_clip": 0.0103642, - "auxiliary_loss_mlp": 0.01002482, - "balance_loss_clip": 1.02563763, - "balance_loss_mlp": 1.00002623, - "epoch": 0.31672929505486247, - "flos": 70144781172480.0, - "grad_norm": 0.7343006553516018, - "language_loss": 0.57840127, - "learning_rate": 3.197797006055478e-06, - "loss": 0.59879029, - "num_input_tokens_seen": 113152775, - "step": 5268, - "time_per_iteration": 3.211494207382202 - }, - { - "auxiliary_loss_clip": 0.01144761, - "auxiliary_loss_mlp": 0.01044165, - "balance_loss_clip": 1.0517385, - "balance_loss_mlp": 1.02729666, - "epoch": 0.31678941830753043, - "flos": 14355758154240.0, - "grad_norm": 2.2657818682072146, - "language_loss": 0.73009932, - "learning_rate": 3.197485092719815e-06, - "loss": 0.75198865, - "num_input_tokens_seen": 113171410, - "step": 5269, - "time_per_iteration": 2.5840115547180176 - }, - { - "auxiliary_loss_clip": 0.01108492, - "auxiliary_loss_mlp": 0.01049824, - "balance_loss_clip": 1.0489136, - "balance_loss_mlp": 1.03283644, - "epoch": 0.3168495415601984, - "flos": 22747722416640.0, - "grad_norm": 2.2273308320264995, - "language_loss": 0.79972744, - "learning_rate": 3.1971731339753973e-06, - "loss": 0.82131052, - "num_input_tokens_seen": 113189965, - "step": 5270, - "time_per_iteration": 2.858154535293579 - }, - { - "auxiliary_loss_clip": 0.01146892, - "auxiliary_loss_mlp": 0.01050124, - "balance_loss_clip": 1.05206418, - "balance_loss_mlp": 1.03207529, - "epoch": 0.31690966481286637, - "flos": 20115254465280.0, - "grad_norm": 9.25747726986636, - "language_loss": 0.7941646, - "learning_rate": 3.1968611298340545e-06, - "loss": 0.81613475, - "num_input_tokens_seen": 113206355, - "step": 5271, - "time_per_iteration": 2.6510884761810303 - }, - { - "auxiliary_loss_clip": 0.01144344, - "auxiliary_loss_mlp": 0.01040088, - "balance_loss_clip": 1.05230093, - "balance_loss_mlp": 1.02269578, - "epoch": 0.31696978806553433, - "flos": 21178928937600.0, - "grad_norm": 1.806612869692892, - "language_loss": 0.72429144, - "learning_rate": 3.1965490803076173e-06, - "loss": 0.74613577, - "num_input_tokens_seen": 113225440, - "step": 5272, - "time_per_iteration": 2.6807363033294678 - }, - { - "auxiliary_loss_clip": 0.01123855, - "auxiliary_loss_mlp": 0.01052611, - "balance_loss_clip": 1.04942703, - "balance_loss_mlp": 1.03365636, - "epoch": 0.3170299113182023, - "flos": 42997030439040.0, - "grad_norm": 2.241731745129767, - "language_loss": 0.69146693, - "learning_rate": 3.1962369854079194e-06, - "loss": 0.71323156, - "num_input_tokens_seen": 113248840, - "step": 5273, - "time_per_iteration": 2.9202728271484375 - }, - { - "auxiliary_loss_clip": 0.01128467, - "auxiliary_loss_mlp": 0.00775845, - "balance_loss_clip": 1.04869509, - "balance_loss_mlp": 1.00146461, - "epoch": 0.31709003457087026, - "flos": 24460158384000.0, - "grad_norm": 1.872718303622414, - "language_loss": 0.67764306, - "learning_rate": 3.195924845146795e-06, - "loss": 0.69668615, - "num_input_tokens_seen": 113269630, - "step": 5274, - "time_per_iteration": 2.6541714668273926 - }, - { - "auxiliary_loss_clip": 0.01092683, - "auxiliary_loss_mlp": 0.0106112, - "balance_loss_clip": 1.04346347, - "balance_loss_mlp": 1.04305935, - "epoch": 0.3171501578235382, - "flos": 24135310759680.0, - "grad_norm": 1.7402048894999724, - "language_loss": 0.80815518, - "learning_rate": 3.195612659536081e-06, - "loss": 0.8296932, - "num_input_tokens_seen": 113291200, - "step": 5275, - "time_per_iteration": 2.840696096420288 - }, - { - "auxiliary_loss_clip": 0.0113287, - "auxiliary_loss_mlp": 0.01047853, - "balance_loss_clip": 1.04862475, - "balance_loss_mlp": 1.02979279, - "epoch": 0.31721028107620625, - "flos": 18879712392960.0, - "grad_norm": 2.28886723118271, - "language_loss": 0.72418922, - "learning_rate": 3.1953004285876147e-06, - "loss": 0.74599648, - "num_input_tokens_seen": 113310170, - "step": 5276, - "time_per_iteration": 2.6426591873168945 - }, - { - "auxiliary_loss_clip": 0.01122606, - "auxiliary_loss_mlp": 0.01041381, - "balance_loss_clip": 1.05439019, - "balance_loss_mlp": 1.02588356, - "epoch": 0.3172704043288742, - "flos": 23147874904320.0, - "grad_norm": 1.4542936031710312, - "language_loss": 0.77923822, - "learning_rate": 3.194988152313236e-06, - "loss": 0.80087811, - "num_input_tokens_seen": 113331140, - "step": 5277, - "time_per_iteration": 2.7192864418029785 - }, - { - "auxiliary_loss_clip": 0.01113098, - "auxiliary_loss_mlp": 0.01054598, - "balance_loss_clip": 1.04708886, - "balance_loss_mlp": 1.03432024, - "epoch": 0.3173305275815422, - "flos": 17858520731520.0, - "grad_norm": 2.071832444797603, - "language_loss": 0.79029107, - "learning_rate": 3.1946758307247878e-06, - "loss": 0.81196797, - "num_input_tokens_seen": 113350030, - "step": 5278, - "time_per_iteration": 2.606973648071289 - }, - { - "auxiliary_loss_clip": 0.01041198, - "auxiliary_loss_mlp": 0.01006121, - "balance_loss_clip": 1.02207565, - "balance_loss_mlp": 1.00391531, - "epoch": 0.31739065083421014, - "flos": 59973476883840.0, - "grad_norm": 0.8783580735908582, - "language_loss": 0.62817574, - "learning_rate": 3.1943634638341114e-06, - "loss": 0.64864898, - "num_input_tokens_seen": 113395820, - "step": 5279, - "time_per_iteration": 2.998594284057617 - }, - { - "auxiliary_loss_clip": 0.01146927, - "auxiliary_loss_mlp": 0.01055699, - "balance_loss_clip": 1.05080009, - "balance_loss_mlp": 1.03651857, - "epoch": 0.3174507740868781, - "flos": 23800981944960.0, - "grad_norm": 1.4881688285488497, - "language_loss": 0.80855167, - "learning_rate": 3.194051051653053e-06, - "loss": 0.83057791, - "num_input_tokens_seen": 113416835, - "step": 5280, - "time_per_iteration": 2.662240743637085 - }, - { - "auxiliary_loss_clip": 0.0110603, - "auxiliary_loss_mlp": 0.01050191, - "balance_loss_clip": 1.04850507, - "balance_loss_mlp": 1.0339663, - "epoch": 0.31751089733954607, - "flos": 27638899349760.0, - "grad_norm": 1.6411021360183768, - "language_loss": 0.77964067, - "learning_rate": 3.19373859419346e-06, - "loss": 0.80120289, - "num_input_tokens_seen": 113440850, - "step": 5281, - "time_per_iteration": 2.8303840160369873 - }, - { - "auxiliary_loss_clip": 0.01119054, - "auxiliary_loss_mlp": 0.0103955, - "balance_loss_clip": 1.04812443, - "balance_loss_mlp": 1.02194262, - "epoch": 0.31757102059221404, - "flos": 23769273214080.0, - "grad_norm": 2.6184534699054116, - "language_loss": 0.78539747, - "learning_rate": 3.193426091467179e-06, - "loss": 0.80698353, - "num_input_tokens_seen": 113461000, - "step": 5282, - "time_per_iteration": 2.75915265083313 - }, - { - "auxiliary_loss_clip": 0.01122553, - "auxiliary_loss_mlp": 0.01050996, - "balance_loss_clip": 1.0517695, - "balance_loss_mlp": 1.03284001, - "epoch": 0.317631143844882, - "flos": 25264521596160.0, - "grad_norm": 1.8901773671102746, - "language_loss": 0.67857707, - "learning_rate": 3.193113543486061e-06, - "loss": 0.70031261, - "num_input_tokens_seen": 113480820, - "step": 5283, - "time_per_iteration": 2.710601329803467 - }, - { - "auxiliary_loss_clip": 0.01039071, - "auxiliary_loss_mlp": 0.01003581, - "balance_loss_clip": 1.02084279, - "balance_loss_mlp": 1.00145948, - "epoch": 0.31769126709754997, - "flos": 55825939221120.0, - "grad_norm": 0.7284643981615322, - "language_loss": 0.52787578, - "learning_rate": 3.192800950261958e-06, - "loss": 0.54830229, - "num_input_tokens_seen": 113536910, - "step": 5284, - "time_per_iteration": 3.1312994956970215 - }, - { - "auxiliary_loss_clip": 0.01123508, - "auxiliary_loss_mlp": 0.01041652, - "balance_loss_clip": 1.05256152, - "balance_loss_mlp": 1.02529633, - "epoch": 0.31775139035021793, - "flos": 16690562098560.0, - "grad_norm": 1.6358492252526933, - "language_loss": 0.70703542, - "learning_rate": 3.1924883118067235e-06, - "loss": 0.72868699, - "num_input_tokens_seen": 113555480, - "step": 5285, - "time_per_iteration": 2.66414213180542 - }, - { - "auxiliary_loss_clip": 0.01051594, - "auxiliary_loss_mlp": 0.01001353, - "balance_loss_clip": 1.02112103, - "balance_loss_mlp": 0.99919558, - "epoch": 0.3178115136028859, - "flos": 64227241019520.0, - "grad_norm": 0.8795363824150627, - "language_loss": 0.60495377, - "learning_rate": 3.1921756281322123e-06, - "loss": 0.62548316, - "num_input_tokens_seen": 113616790, - "step": 5286, - "time_per_iteration": 3.1636195182800293 - }, - { - "auxiliary_loss_clip": 0.01145219, - "auxiliary_loss_mlp": 0.01047411, - "balance_loss_clip": 1.05137587, - "balance_loss_mlp": 1.02995849, - "epoch": 0.31787163685555386, - "flos": 18697465762560.0, - "grad_norm": 10.257300688850748, - "language_loss": 0.72160053, - "learning_rate": 3.1918628992502826e-06, - "loss": 0.74352682, - "num_input_tokens_seen": 113635320, - "step": 5287, - "time_per_iteration": 2.628863573074341 - }, - { - "auxiliary_loss_clip": 0.01132987, - "auxiliary_loss_mlp": 0.0105662, - "balance_loss_clip": 1.04966712, - "balance_loss_mlp": 1.03823805, - "epoch": 0.31793176010822183, - "flos": 21324762155520.0, - "grad_norm": 2.3229849512265126, - "language_loss": 0.75706261, - "learning_rate": 3.191550125172792e-06, - "loss": 0.77895868, - "num_input_tokens_seen": 113654000, - "step": 5288, - "time_per_iteration": 2.7565319538116455 - }, - { - "auxiliary_loss_clip": 0.01128698, - "auxiliary_loss_mlp": 0.01037369, - "balance_loss_clip": 1.04913831, - "balance_loss_mlp": 1.02223587, - "epoch": 0.31799188336088985, - "flos": 20958688696320.0, - "grad_norm": 3.550043827117326, - "language_loss": 0.87827504, - "learning_rate": 3.1912373059116007e-06, - "loss": 0.89993572, - "num_input_tokens_seen": 113672375, - "step": 5289, - "time_per_iteration": 2.6671485900878906 - }, - { - "auxiliary_loss_clip": 0.01126628, - "auxiliary_loss_mlp": 0.01039655, - "balance_loss_clip": 1.05225897, - "balance_loss_mlp": 1.02443218, - "epoch": 0.3180520066135578, - "flos": 22491930689280.0, - "grad_norm": 1.767762146387748, - "language_loss": 0.68103814, - "learning_rate": 3.190924441478572e-06, - "loss": 0.70270097, - "num_input_tokens_seen": 113692385, - "step": 5290, - "time_per_iteration": 2.6986947059631348 - }, - { - "auxiliary_loss_clip": 0.01120385, - "auxiliary_loss_mlp": 0.01046806, - "balance_loss_clip": 1.04791737, - "balance_loss_mlp": 1.02924609, - "epoch": 0.3181121298662258, - "flos": 27235335070080.0, - "grad_norm": 2.1353951835610303, - "language_loss": 0.80298805, - "learning_rate": 3.1906115318855687e-06, - "loss": 0.82465994, - "num_input_tokens_seen": 113712145, - "step": 5291, - "time_per_iteration": 2.67692494392395 - }, - { - "auxiliary_loss_clip": 0.01112404, - "auxiliary_loss_mlp": 0.01038285, - "balance_loss_clip": 1.05768418, - "balance_loss_mlp": 1.02066636, - "epoch": 0.31817225311889374, - "flos": 23180158252800.0, - "grad_norm": 4.0426741537939614, - "language_loss": 0.79877901, - "learning_rate": 3.1902985771444577e-06, - "loss": 0.82028592, - "num_input_tokens_seen": 113731435, - "step": 5292, - "time_per_iteration": 2.8386974334716797 - }, - { - "auxiliary_loss_clip": 0.01126783, - "auxiliary_loss_mlp": 0.01037968, - "balance_loss_clip": 1.05076253, - "balance_loss_mlp": 1.0233407, - "epoch": 0.3182323763715617, - "flos": 23258803080960.0, - "grad_norm": 1.5696258430885255, - "language_loss": 0.74754488, - "learning_rate": 3.1899855772671043e-06, - "loss": 0.7691924, - "num_input_tokens_seen": 113750825, - "step": 5293, - "time_per_iteration": 2.651566982269287 - }, - { - "auxiliary_loss_clip": 0.01129161, - "auxiliary_loss_mlp": 0.01045458, - "balance_loss_clip": 1.05253696, - "balance_loss_mlp": 1.03027081, - "epoch": 0.3182924996242297, - "flos": 29016683280000.0, - "grad_norm": 1.9205945835079516, - "language_loss": 0.74100351, - "learning_rate": 3.189672532265379e-06, - "loss": 0.76274973, - "num_input_tokens_seen": 113770010, - "step": 5294, - "time_per_iteration": 2.6593024730682373 - }, - { - "auxiliary_loss_clip": 0.01145372, - "auxiliary_loss_mlp": 0.01038723, - "balance_loss_clip": 1.05254447, - "balance_loss_mlp": 1.02166462, - "epoch": 0.31835262287689764, - "flos": 20449188230400.0, - "grad_norm": 3.618714545146935, - "language_loss": 0.76019043, - "learning_rate": 3.189359442151152e-06, - "loss": 0.78203136, - "num_input_tokens_seen": 113788640, - "step": 5295, - "time_per_iteration": 2.597567558288574 - }, - { - "auxiliary_loss_clip": 0.01110615, - "auxiliary_loss_mlp": 0.01046432, - "balance_loss_clip": 1.04994202, - "balance_loss_mlp": 1.02979052, - "epoch": 0.3184127461295656, - "flos": 25119478477440.0, - "grad_norm": 2.278908740959458, - "language_loss": 0.69146252, - "learning_rate": 3.189046306936296e-06, - "loss": 0.71303296, - "num_input_tokens_seen": 113809515, - "step": 5296, - "time_per_iteration": 4.286029100418091 - }, - { - "auxiliary_loss_clip": 0.01115954, - "auxiliary_loss_mlp": 0.01043279, - "balance_loss_clip": 1.04866266, - "balance_loss_mlp": 1.02709007, - "epoch": 0.31847286938223357, - "flos": 25551231955200.0, - "grad_norm": 1.7786470593469696, - "language_loss": 0.77374327, - "learning_rate": 3.1887331266326846e-06, - "loss": 0.79533565, - "num_input_tokens_seen": 113829770, - "step": 5297, - "time_per_iteration": 4.164870023727417 - }, - { - "auxiliary_loss_clip": 0.0111312, - "auxiliary_loss_mlp": 0.01036407, - "balance_loss_clip": 1.05341816, - "balance_loss_mlp": 1.01857328, - "epoch": 0.31853299263490154, - "flos": 27782470010880.0, - "grad_norm": 2.4185702861431104, - "language_loss": 0.79294181, - "learning_rate": 3.1884199012521942e-06, - "loss": 0.81443709, - "num_input_tokens_seen": 113849320, - "step": 5298, - "time_per_iteration": 2.761035919189453 - }, - { - "auxiliary_loss_clip": 0.01127152, - "auxiliary_loss_mlp": 0.01052383, - "balance_loss_clip": 1.05250955, - "balance_loss_mlp": 1.0361588, - "epoch": 0.3185931158875695, - "flos": 22706747976960.0, - "grad_norm": 2.109744523678234, - "language_loss": 0.74082595, - "learning_rate": 3.1881066308067016e-06, - "loss": 0.76262128, - "num_input_tokens_seen": 113867860, - "step": 5299, - "time_per_iteration": 2.6674296855926514 - }, - { - "auxiliary_loss_clip": 0.01133842, - "auxiliary_loss_mlp": 0.01048899, - "balance_loss_clip": 1.05652189, - "balance_loss_mlp": 1.03213775, - "epoch": 0.31865323914023747, - "flos": 24571517523840.0, - "grad_norm": 2.0125699214837627, - "language_loss": 0.78636098, - "learning_rate": 3.1877933153080873e-06, - "loss": 0.80818832, - "num_input_tokens_seen": 113886375, - "step": 5300, - "time_per_iteration": 2.721202850341797 - }, - { - "auxiliary_loss_clip": 0.01119633, - "auxiliary_loss_mlp": 0.01050293, - "balance_loss_clip": 1.04830885, - "balance_loss_mlp": 1.03297138, - "epoch": 0.31871336239290543, - "flos": 18186564666240.0, - "grad_norm": 1.8639511619571896, - "language_loss": 0.83660495, - "learning_rate": 3.1874799547682304e-06, - "loss": 0.8583042, - "num_input_tokens_seen": 113904065, - "step": 5301, - "time_per_iteration": 4.22704291343689 - }, - { - "auxiliary_loss_clip": 0.01131996, - "auxiliary_loss_mlp": 0.01049945, - "balance_loss_clip": 1.05371821, - "balance_loss_mlp": 1.03263569, - "epoch": 0.31877348564557345, - "flos": 21826756679040.0, - "grad_norm": 2.3173946845583444, - "language_loss": 0.77328432, - "learning_rate": 3.187166549199015e-06, - "loss": 0.79510373, - "num_input_tokens_seen": 113918415, - "step": 5302, - "time_per_iteration": 2.6678919792175293 - }, - { - "auxiliary_loss_clip": 0.011364, - "auxiliary_loss_mlp": 0.01039827, - "balance_loss_clip": 1.04891157, - "balance_loss_mlp": 1.02270818, - "epoch": 0.3188336088982414, - "flos": 22015252275840.0, - "grad_norm": 2.352282677018458, - "language_loss": 0.79816842, - "learning_rate": 3.1868530986123255e-06, - "loss": 0.81993073, - "num_input_tokens_seen": 113938135, - "step": 5303, - "time_per_iteration": 4.289660453796387 - }, - { - "auxiliary_loss_clip": 0.0113563, - "auxiliary_loss_mlp": 0.01045445, - "balance_loss_clip": 1.05256605, - "balance_loss_mlp": 1.02739668, - "epoch": 0.3188937321509094, - "flos": 20047886507520.0, - "grad_norm": 2.03328242361333, - "language_loss": 0.72914493, - "learning_rate": 3.186539603020047e-06, - "loss": 0.7509557, - "num_input_tokens_seen": 113957125, - "step": 5304, - "time_per_iteration": 2.6123225688934326 - }, - { - "auxiliary_loss_clip": 0.01106707, - "auxiliary_loss_mlp": 0.01038113, - "balance_loss_clip": 1.04701817, - "balance_loss_mlp": 1.02234125, - "epoch": 0.31895385540357735, - "flos": 25848105863040.0, - "grad_norm": 2.816339992135166, - "language_loss": 0.71918428, - "learning_rate": 3.186226062434068e-06, - "loss": 0.74063241, - "num_input_tokens_seen": 113974875, - "step": 5305, - "time_per_iteration": 2.7341108322143555 - }, - { - "auxiliary_loss_clip": 0.01120594, - "auxiliary_loss_mlp": 0.01042646, - "balance_loss_clip": 1.05007052, - "balance_loss_mlp": 1.0271126, - "epoch": 0.3190139786562453, - "flos": 23477714519040.0, - "grad_norm": 2.1368418928112067, - "language_loss": 0.64082253, - "learning_rate": 3.1859124768662778e-06, - "loss": 0.66245496, - "num_input_tokens_seen": 113994450, - "step": 5306, - "time_per_iteration": 2.678497791290283 - }, - { - "auxiliary_loss_clip": 0.01113987, - "auxiliary_loss_mlp": 0.01046306, - "balance_loss_clip": 1.04777002, - "balance_loss_mlp": 1.02913976, - "epoch": 0.3190741019089133, - "flos": 29095543589760.0, - "grad_norm": 2.249856956834014, - "language_loss": 0.7981708, - "learning_rate": 3.1855988463285678e-06, - "loss": 0.81977379, - "num_input_tokens_seen": 114013945, - "step": 5307, - "time_per_iteration": 2.684825897216797 - }, - { - "auxiliary_loss_clip": 0.01110939, - "auxiliary_loss_mlp": 0.01046246, - "balance_loss_clip": 1.04708028, - "balance_loss_mlp": 1.02869821, - "epoch": 0.31913422516158124, - "flos": 17129534209920.0, - "grad_norm": 1.891192054321282, - "language_loss": 0.77413881, - "learning_rate": 3.1852851708328308e-06, - "loss": 0.79571068, - "num_input_tokens_seen": 114031375, - "step": 5308, - "time_per_iteration": 2.62485408782959 - }, - { - "auxiliary_loss_clip": 0.01142071, - "auxiliary_loss_mlp": 0.01050679, - "balance_loss_clip": 1.05399549, - "balance_loss_mlp": 1.03109312, - "epoch": 0.3191943484142492, - "flos": 16069846147200.0, - "grad_norm": 3.6914677983836586, - "language_loss": 0.73960984, - "learning_rate": 3.184971450390961e-06, - "loss": 0.76153737, - "num_input_tokens_seen": 114048465, - "step": 5309, - "time_per_iteration": 2.6268463134765625 - }, - { - "auxiliary_loss_clip": 0.01134349, - "auxiliary_loss_mlp": 0.01035267, - "balance_loss_clip": 1.05286658, - "balance_loss_mlp": 1.01932931, - "epoch": 0.3192544716669172, - "flos": 22966166977920.0, - "grad_norm": 1.9182514579370458, - "language_loss": 0.82652342, - "learning_rate": 3.184657685014856e-06, - "loss": 0.84821963, - "num_input_tokens_seen": 114068415, - "step": 5310, - "time_per_iteration": 2.649099111557007 - }, - { - "auxiliary_loss_clip": 0.01116653, - "auxiliary_loss_mlp": 0.01039176, - "balance_loss_clip": 1.04808259, - "balance_loss_mlp": 1.02340484, - "epoch": 0.31931459491958514, - "flos": 26870339018880.0, - "grad_norm": 2.200225110342558, - "language_loss": 0.78296745, - "learning_rate": 3.184343874716412e-06, - "loss": 0.80452585, - "num_input_tokens_seen": 114088565, - "step": 5311, - "time_per_iteration": 2.7054250240325928 - }, - { - "auxiliary_loss_clip": 0.01106724, - "auxiliary_loss_mlp": 0.01036895, - "balance_loss_clip": 1.04822886, - "balance_loss_mlp": 1.01952648, - "epoch": 0.3193747181722531, - "flos": 21836525178240.0, - "grad_norm": 2.0057857548781883, - "language_loss": 0.84169972, - "learning_rate": 3.1840300195075295e-06, - "loss": 0.86313581, - "num_input_tokens_seen": 114107160, - "step": 5312, - "time_per_iteration": 2.749263048171997 - }, - { - "auxiliary_loss_clip": 0.01093899, - "auxiliary_loss_mlp": 0.01053441, - "balance_loss_clip": 1.04266024, - "balance_loss_mlp": 1.03477311, - "epoch": 0.31943484142492107, - "flos": 18324999682560.0, - "grad_norm": 3.6700749085790063, - "language_loss": 0.78648412, - "learning_rate": 3.1837161194001102e-06, - "loss": 0.80795753, - "num_input_tokens_seen": 114123420, - "step": 5313, - "time_per_iteration": 2.720930814743042 - }, - { - "auxiliary_loss_clip": 0.01130677, - "auxiliary_loss_mlp": 0.01038161, - "balance_loss_clip": 1.05141878, - "balance_loss_mlp": 1.0219605, - "epoch": 0.31949496467758903, - "flos": 21615818060160.0, - "grad_norm": 2.386195329240294, - "language_loss": 0.86217451, - "learning_rate": 3.183402174406057e-06, - "loss": 0.88386285, - "num_input_tokens_seen": 114139230, - "step": 5314, - "time_per_iteration": 2.6785764694213867 - }, - { - "auxiliary_loss_clip": 0.01116655, - "auxiliary_loss_mlp": 0.01050856, - "balance_loss_clip": 1.04983997, - "balance_loss_mlp": 1.03231871, - "epoch": 0.31955508793025705, - "flos": 21760214734080.0, - "grad_norm": 1.996028492072791, - "language_loss": 0.79866767, - "learning_rate": 3.1830881845372747e-06, - "loss": 0.82034278, - "num_input_tokens_seen": 114159290, - "step": 5315, - "time_per_iteration": 2.723097085952759 - }, - { - "auxiliary_loss_clip": 0.0110521, - "auxiliary_loss_mlp": 0.01063258, - "balance_loss_clip": 1.04667854, - "balance_loss_mlp": 1.04386258, - "epoch": 0.319615211182925, - "flos": 17164331510400.0, - "grad_norm": 2.2633227615123275, - "language_loss": 0.67312729, - "learning_rate": 3.18277414980567e-06, - "loss": 0.69481194, - "num_input_tokens_seen": 114177655, - "step": 5316, - "time_per_iteration": 2.7841827869415283 - }, - { - "auxiliary_loss_clip": 0.01131119, - "auxiliary_loss_mlp": 0.01046731, - "balance_loss_clip": 1.05015874, - "balance_loss_mlp": 1.03126907, - "epoch": 0.319675334435593, - "flos": 28112812416000.0, - "grad_norm": 1.540647016415601, - "language_loss": 0.69375229, - "learning_rate": 3.1824600702231515e-06, - "loss": 0.71553081, - "num_input_tokens_seen": 114200880, - "step": 5317, - "time_per_iteration": 2.7080705165863037 - }, - { - "auxiliary_loss_clip": 0.01036788, - "auxiliary_loss_mlp": 0.01033442, - "balance_loss_clip": 1.02571428, - "balance_loss_mlp": 1.03117692, - "epoch": 0.31973545768826095, - "flos": 69501119408640.0, - "grad_norm": 0.7974882454120521, - "language_loss": 0.53049421, - "learning_rate": 3.182145945801628e-06, - "loss": 0.55119646, - "num_input_tokens_seen": 114267145, - "step": 5318, - "time_per_iteration": 3.5072765350341797 - }, - { - "auxiliary_loss_clip": 0.0114058, - "auxiliary_loss_mlp": 0.01041014, - "balance_loss_clip": 1.05322218, - "balance_loss_mlp": 1.02509975, - "epoch": 0.3197955809409289, - "flos": 13699203408000.0, - "grad_norm": 3.679429868734815, - "language_loss": 0.84239668, - "learning_rate": 3.181831776553012e-06, - "loss": 0.86421257, - "num_input_tokens_seen": 114284630, - "step": 5319, - "time_per_iteration": 2.6148228645324707 - }, - { - "auxiliary_loss_clip": 0.0112589, - "auxiliary_loss_mlp": 0.01041338, - "balance_loss_clip": 1.04876614, - "balance_loss_mlp": 1.02552485, - "epoch": 0.3198557041935969, - "flos": 33218124278400.0, - "grad_norm": 1.684363339069699, - "language_loss": 0.63463295, - "learning_rate": 3.1815175624892165e-06, - "loss": 0.65630519, - "num_input_tokens_seen": 114305830, - "step": 5320, - "time_per_iteration": 2.7444913387298584 - }, - { - "auxiliary_loss_clip": 0.01120865, - "auxiliary_loss_mlp": 0.01042926, - "balance_loss_clip": 1.05072045, - "balance_loss_mlp": 1.02682114, - "epoch": 0.31991582744626484, - "flos": 23732033788800.0, - "grad_norm": 2.113040492667506, - "language_loss": 0.70552826, - "learning_rate": 3.1812033036221567e-06, - "loss": 0.72716618, - "num_input_tokens_seen": 114325165, - "step": 5321, - "time_per_iteration": 2.7078404426574707 - }, - { - "auxiliary_loss_clip": 0.01151862, - "auxiliary_loss_mlp": 0.00776802, - "balance_loss_clip": 1.05639851, - "balance_loss_mlp": 1.00126243, - "epoch": 0.3199759506989328, - "flos": 18550842445440.0, - "grad_norm": 2.699319417691227, - "language_loss": 0.8659147, - "learning_rate": 3.180888999963749e-06, - "loss": 0.88520133, - "num_input_tokens_seen": 114341310, - "step": 5322, - "time_per_iteration": 2.5562047958374023 - }, - { - "auxiliary_loss_clip": 0.01119411, - "auxiliary_loss_mlp": 0.01038951, - "balance_loss_clip": 1.05106568, - "balance_loss_mlp": 1.02265561, - "epoch": 0.3200360739516008, - "flos": 22418888382720.0, - "grad_norm": 1.7451682184714292, - "language_loss": 0.83021653, - "learning_rate": 3.1805746515259123e-06, - "loss": 0.85180014, - "num_input_tokens_seen": 114360355, - "step": 5323, - "time_per_iteration": 2.6323180198669434 - }, - { - "auxiliary_loss_clip": 0.01129356, - "auxiliary_loss_mlp": 0.01041616, - "balance_loss_clip": 1.05092812, - "balance_loss_mlp": 1.02440214, - "epoch": 0.32009619720426874, - "flos": 20595236929920.0, - "grad_norm": 1.6785162629315, - "language_loss": 0.77686846, - "learning_rate": 3.1802602583205663e-06, - "loss": 0.79857814, - "num_input_tokens_seen": 114379220, - "step": 5324, - "time_per_iteration": 2.6361289024353027 - }, - { - "auxiliary_loss_clip": 0.01115575, - "auxiliary_loss_mlp": 0.01035772, - "balance_loss_clip": 1.04754376, - "balance_loss_mlp": 1.01861751, - "epoch": 0.3201563204569367, - "flos": 18147637301760.0, - "grad_norm": 1.9010400542588533, - "language_loss": 0.80500418, - "learning_rate": 3.1799458203596333e-06, - "loss": 0.82651764, - "num_input_tokens_seen": 114396365, - "step": 5325, - "time_per_iteration": 2.681349277496338 - }, - { - "auxiliary_loss_clip": 0.01133585, - "auxiliary_loss_mlp": 0.01039966, - "balance_loss_clip": 1.05378425, - "balance_loss_mlp": 1.02394414, - "epoch": 0.32021644370960467, - "flos": 31684235840640.0, - "grad_norm": 1.7412856997403743, - "language_loss": 0.74817789, - "learning_rate": 3.179631337655037e-06, - "loss": 0.76991343, - "num_input_tokens_seen": 114416780, - "step": 5326, - "time_per_iteration": 2.6932616233825684 - }, - { - "auxiliary_loss_clip": 0.01103829, - "auxiliary_loss_mlp": 0.0104309, - "balance_loss_clip": 1.05045807, - "balance_loss_mlp": 1.02659154, - "epoch": 0.32027656696227264, - "flos": 26865921646080.0, - "grad_norm": 1.642662123916105, - "language_loss": 0.80796289, - "learning_rate": 3.179316810218701e-06, - "loss": 0.82943213, - "num_input_tokens_seen": 114437405, - "step": 5327, - "time_per_iteration": 2.7527899742126465 - }, - { - "auxiliary_loss_clip": 0.01115203, - "auxiliary_loss_mlp": 0.01038297, - "balance_loss_clip": 1.05185604, - "balance_loss_mlp": 1.02162015, - "epoch": 0.32033669021494066, - "flos": 24169928492160.0, - "grad_norm": 1.846540372387515, - "language_loss": 0.77796161, - "learning_rate": 3.179002238062554e-06, - "loss": 0.79949659, - "num_input_tokens_seen": 114458505, - "step": 5328, - "time_per_iteration": 2.7631096839904785 - }, - { - "auxiliary_loss_clip": 0.01087281, - "auxiliary_loss_mlp": 0.01043102, - "balance_loss_clip": 1.0453198, - "balance_loss_mlp": 1.0245527, - "epoch": 0.3203968134676086, - "flos": 24460768915200.0, - "grad_norm": 1.6837826518335735, - "language_loss": 0.74184239, - "learning_rate": 3.178687621198524e-06, - "loss": 0.76314622, - "num_input_tokens_seen": 114479050, - "step": 5329, - "time_per_iteration": 2.7749221324920654 - }, - { - "auxiliary_loss_clip": 0.01110066, - "auxiliary_loss_mlp": 0.01036662, - "balance_loss_clip": 1.04650402, - "balance_loss_mlp": 1.02133203, - "epoch": 0.3204569367202766, - "flos": 18004713085440.0, - "grad_norm": 1.7163505659405243, - "language_loss": 0.71138644, - "learning_rate": 3.1783729596385415e-06, - "loss": 0.73285371, - "num_input_tokens_seen": 114497415, - "step": 5330, - "time_per_iteration": 2.655578136444092 - }, - { - "auxiliary_loss_clip": 0.01093261, - "auxiliary_loss_mlp": 0.01053955, - "balance_loss_clip": 1.05082417, - "balance_loss_mlp": 1.03379714, - "epoch": 0.32051705997294455, - "flos": 30589678650240.0, - "grad_norm": 1.6854796065505788, - "language_loss": 0.80175424, - "learning_rate": 3.1780582533945376e-06, - "loss": 0.82322645, - "num_input_tokens_seen": 114518785, - "step": 5331, - "time_per_iteration": 2.851639747619629 - }, - { - "auxiliary_loss_clip": 0.01040347, - "auxiliary_loss_mlp": 0.01008357, - "balance_loss_clip": 1.02573299, - "balance_loss_mlp": 1.0059495, - "epoch": 0.3205771832256125, - "flos": 68417979765120.0, - "grad_norm": 0.8321512232204817, - "language_loss": 0.57821107, - "learning_rate": 3.177743502478447e-06, - "loss": 0.59869808, - "num_input_tokens_seen": 114577710, - "step": 5332, - "time_per_iteration": 3.1104307174682617 - }, - { - "auxiliary_loss_clip": 0.01104131, - "auxiliary_loss_mlp": 0.01038271, - "balance_loss_clip": 1.04842329, - "balance_loss_mlp": 1.02194548, - "epoch": 0.3206373064782805, - "flos": 30443953173120.0, - "grad_norm": 1.7127909178457088, - "language_loss": 0.72918129, - "learning_rate": 3.177428706902205e-06, - "loss": 0.75060534, - "num_input_tokens_seen": 114598640, - "step": 5333, - "time_per_iteration": 2.7683963775634766 - }, - { - "auxiliary_loss_clip": 0.01118957, - "auxiliary_loss_mlp": 0.01043487, - "balance_loss_clip": 1.04778981, - "balance_loss_mlp": 1.02685761, - "epoch": 0.32069742973094845, - "flos": 22054502862720.0, - "grad_norm": 2.1728626414536767, - "language_loss": 0.70592654, - "learning_rate": 3.1771138666777485e-06, - "loss": 0.72755098, - "num_input_tokens_seen": 114618780, - "step": 5334, - "time_per_iteration": 2.6861116886138916 - }, - { - "auxiliary_loss_clip": 0.01100969, - "auxiliary_loss_mlp": 0.01041644, - "balance_loss_clip": 1.04742825, - "balance_loss_mlp": 1.02536023, - "epoch": 0.3207575529836164, - "flos": 22054000072320.0, - "grad_norm": 2.526978692505362, - "language_loss": 0.77161503, - "learning_rate": 3.1767989818170156e-06, - "loss": 0.79304117, - "num_input_tokens_seen": 114637525, - "step": 5335, - "time_per_iteration": 4.33164381980896 - }, - { - "auxiliary_loss_clip": 0.01130469, - "auxiliary_loss_mlp": 0.01038297, - "balance_loss_clip": 1.05087018, - "balance_loss_mlp": 1.02213204, - "epoch": 0.3208176762362844, - "flos": 34057536186240.0, - "grad_norm": 1.6997548644452432, - "language_loss": 0.68414462, - "learning_rate": 3.1764840523319477e-06, - "loss": 0.7058323, - "num_input_tokens_seen": 114659705, - "step": 5336, - "time_per_iteration": 2.840373992919922 - }, - { - "auxiliary_loss_clip": 0.01102432, - "auxiliary_loss_mlp": 0.01055244, - "balance_loss_clip": 1.04495001, - "balance_loss_mlp": 1.03862596, - "epoch": 0.32087779948895234, - "flos": 21798711135360.0, - "grad_norm": 1.733261513029939, - "language_loss": 0.78828537, - "learning_rate": 3.176169078234487e-06, - "loss": 0.8098622, - "num_input_tokens_seen": 114678340, - "step": 5337, - "time_per_iteration": 4.268811464309692 - }, - { - "auxiliary_loss_clip": 0.01121282, - "auxiliary_loss_mlp": 0.01039712, - "balance_loss_clip": 1.04696417, - "balance_loss_mlp": 1.02512085, - "epoch": 0.3209379227416203, - "flos": 21434110133760.0, - "grad_norm": 2.1583979373304194, - "language_loss": 0.74322718, - "learning_rate": 3.1758540595365766e-06, - "loss": 0.76483715, - "num_input_tokens_seen": 114696980, - "step": 5338, - "time_per_iteration": 2.6442766189575195 - }, - { - "auxiliary_loss_clip": 0.01119062, - "auxiliary_loss_mlp": 0.01047297, - "balance_loss_clip": 1.04633641, - "balance_loss_mlp": 1.03078675, - "epoch": 0.3209980459942883, - "flos": 25849075530240.0, - "grad_norm": 2.118549362741933, - "language_loss": 0.62622869, - "learning_rate": 3.1755389962501626e-06, - "loss": 0.64789224, - "num_input_tokens_seen": 114717330, - "step": 5339, - "time_per_iteration": 2.684843063354492 - }, - { - "auxiliary_loss_clip": 0.01141698, - "auxiliary_loss_mlp": 0.01046177, - "balance_loss_clip": 1.05127931, - "balance_loss_mlp": 1.02954674, - "epoch": 0.32105816924695624, - "flos": 19099162535040.0, - "grad_norm": 2.480509085809345, - "language_loss": 0.81685597, - "learning_rate": 3.175223888387192e-06, - "loss": 0.83873475, - "num_input_tokens_seen": 114736320, - "step": 5340, - "time_per_iteration": 4.130942344665527 - }, - { - "auxiliary_loss_clip": 0.01110441, - "auxiliary_loss_mlp": 0.01050741, - "balance_loss_clip": 1.04820514, - "balance_loss_mlp": 1.03462362, - "epoch": 0.3211182924996242, - "flos": 16581860565120.0, - "grad_norm": 2.326860742494733, - "language_loss": 0.76571834, - "learning_rate": 3.1749087359596137e-06, - "loss": 0.78733015, - "num_input_tokens_seen": 114754575, - "step": 5341, - "time_per_iteration": 2.7302300930023193 - }, - { - "auxiliary_loss_clip": 0.01101828, - "auxiliary_loss_mlp": 0.01044591, - "balance_loss_clip": 1.04797173, - "balance_loss_mlp": 1.02840281, - "epoch": 0.3211784157522922, - "flos": 22672202071680.0, - "grad_norm": 1.680960149410583, - "language_loss": 0.79268491, - "learning_rate": 3.1745935389793786e-06, - "loss": 0.81414914, - "num_input_tokens_seen": 114773590, - "step": 5342, - "time_per_iteration": 4.462036609649658 - }, - { - "auxiliary_loss_clip": 0.01118478, - "auxiliary_loss_mlp": 0.01045941, - "balance_loss_clip": 1.05000186, - "balance_loss_mlp": 1.02876329, - "epoch": 0.3212385390049602, - "flos": 20558787603840.0, - "grad_norm": 3.232512085646521, - "language_loss": 0.74449253, - "learning_rate": 3.174278297458438e-06, - "loss": 0.76613677, - "num_input_tokens_seen": 114790775, - "step": 5343, - "time_per_iteration": 2.7057244777679443 - }, - { - "auxiliary_loss_clip": 0.01080228, - "auxiliary_loss_mlp": 0.0104431, - "balance_loss_clip": 1.04317784, - "balance_loss_mlp": 1.02704811, - "epoch": 0.32129866225762815, - "flos": 24791147233920.0, - "grad_norm": 1.672847320129023, - "language_loss": 0.82661629, - "learning_rate": 3.173963011408748e-06, - "loss": 0.84786165, - "num_input_tokens_seen": 114809835, - "step": 5344, - "time_per_iteration": 2.801013231277466 - }, - { - "auxiliary_loss_clip": 0.01088811, - "auxiliary_loss_mlp": 0.01042568, - "balance_loss_clip": 1.04556143, - "balance_loss_mlp": 1.02565217, - "epoch": 0.3213587855102961, - "flos": 18366871962240.0, - "grad_norm": 22.33494793204904, - "language_loss": 0.79863501, - "learning_rate": 3.173647680842262e-06, - "loss": 0.81994879, - "num_input_tokens_seen": 114826505, - "step": 5345, - "time_per_iteration": 2.743778944015503 - }, - { - "auxiliary_loss_clip": 0.01114864, - "auxiliary_loss_mlp": 0.01041047, - "balance_loss_clip": 1.04774046, - "balance_loss_mlp": 1.02507281, - "epoch": 0.3214189087629641, - "flos": 27015992668800.0, - "grad_norm": 2.095379605818748, - "language_loss": 0.83340824, - "learning_rate": 3.1733323057709384e-06, - "loss": 0.85496742, - "num_input_tokens_seen": 114846140, - "step": 5346, - "time_per_iteration": 2.8187026977539062 - }, - { - "auxiliary_loss_clip": 0.01110187, - "auxiliary_loss_mlp": 0.01045041, - "balance_loss_clip": 1.04783988, - "balance_loss_mlp": 1.02797008, - "epoch": 0.32147903201563205, - "flos": 23148269953920.0, - "grad_norm": 1.6371928172660764, - "language_loss": 0.81853002, - "learning_rate": 3.1730168862067366e-06, - "loss": 0.84008235, - "num_input_tokens_seen": 114866660, - "step": 5347, - "time_per_iteration": 2.724003553390503 - }, - { - "auxiliary_loss_clip": 0.0112676, - "auxiliary_loss_mlp": 0.01047135, - "balance_loss_clip": 1.048388, - "balance_loss_mlp": 1.02891994, - "epoch": 0.3215391552683, - "flos": 16580747243520.0, - "grad_norm": 4.152516057334243, - "language_loss": 0.80263776, - "learning_rate": 3.1727014221616164e-06, - "loss": 0.8243767, - "num_input_tokens_seen": 114882820, - "step": 5348, - "time_per_iteration": 2.6249122619628906 - }, - { - "auxiliary_loss_clip": 0.01113488, - "auxiliary_loss_mlp": 0.0105622, - "balance_loss_clip": 1.04640627, - "balance_loss_mlp": 1.03931606, - "epoch": 0.321599278520968, - "flos": 17821820010240.0, - "grad_norm": 2.570277900111974, - "language_loss": 0.85020632, - "learning_rate": 3.172385913647542e-06, - "loss": 0.87190342, - "num_input_tokens_seen": 114900745, - "step": 5349, - "time_per_iteration": 2.6685211658477783 - }, - { - "auxiliary_loss_clip": 0.01113139, - "auxiliary_loss_mlp": 0.0104332, - "balance_loss_clip": 1.04840457, - "balance_loss_mlp": 1.02644002, - "epoch": 0.32165940177363594, - "flos": 16251769555200.0, - "grad_norm": 2.7209437086115282, - "language_loss": 0.80619532, - "learning_rate": 3.172070360676475e-06, - "loss": 0.82775992, - "num_input_tokens_seen": 114917940, - "step": 5350, - "time_per_iteration": 2.6857874393463135 - }, - { - "auxiliary_loss_clip": 0.01128309, - "auxiliary_loss_mlp": 0.01045442, - "balance_loss_clip": 1.05025196, - "balance_loss_mlp": 1.02955103, - "epoch": 0.3217195250263039, - "flos": 27599900158080.0, - "grad_norm": 5.5112684101117395, - "language_loss": 0.80060112, - "learning_rate": 3.1717547632603828e-06, - "loss": 0.82233858, - "num_input_tokens_seen": 114937735, - "step": 5351, - "time_per_iteration": 2.68406081199646 - }, - { - "auxiliary_loss_clip": 0.01104774, - "auxiliary_loss_mlp": 0.01045518, - "balance_loss_clip": 1.04905438, - "balance_loss_mlp": 1.02811348, - "epoch": 0.3217796482789719, - "flos": 21470595373440.0, - "grad_norm": 2.189681121413186, - "language_loss": 0.75826663, - "learning_rate": 3.1714391214112326e-06, - "loss": 0.7797696, - "num_input_tokens_seen": 114956630, - "step": 5352, - "time_per_iteration": 2.7035396099090576 - }, - { - "auxiliary_loss_clip": 0.0109763, - "auxiliary_loss_mlp": 0.01043305, - "balance_loss_clip": 1.04897571, - "balance_loss_mlp": 1.02579308, - "epoch": 0.32183977153163984, - "flos": 21215593745280.0, - "grad_norm": 2.4508783518814807, - "language_loss": 0.81992233, - "learning_rate": 3.1711234351409933e-06, - "loss": 0.84133166, - "num_input_tokens_seen": 114976470, - "step": 5353, - "time_per_iteration": 2.731339931488037 - }, - { - "auxiliary_loss_clip": 0.01074627, - "auxiliary_loss_mlp": 0.0104331, - "balance_loss_clip": 1.04917347, - "balance_loss_mlp": 1.02605999, - "epoch": 0.3218998947843078, - "flos": 24608182331520.0, - "grad_norm": 2.2390857397461246, - "language_loss": 0.73474252, - "learning_rate": 3.1708077044616365e-06, - "loss": 0.75592184, - "num_input_tokens_seen": 114996710, - "step": 5354, - "time_per_iteration": 2.8337595462799072 - }, - { - "auxiliary_loss_clip": 0.01103547, - "auxiliary_loss_mlp": 0.01039731, - "balance_loss_clip": 1.04475546, - "balance_loss_mlp": 1.02428102, - "epoch": 0.3219600180369758, - "flos": 22270577126400.0, - "grad_norm": 1.8690515367544651, - "language_loss": 0.83792925, - "learning_rate": 3.1704919293851334e-06, - "loss": 0.85936201, - "num_input_tokens_seen": 115015775, - "step": 5355, - "time_per_iteration": 2.7299652099609375 - }, - { - "auxiliary_loss_clip": 0.01146025, - "auxiliary_loss_mlp": 0.01046795, - "balance_loss_clip": 1.05450225, - "balance_loss_mlp": 1.03032064, - "epoch": 0.3220201412896438, - "flos": 14939126939520.0, - "grad_norm": 1.9705527058452093, - "language_loss": 0.70895493, - "learning_rate": 3.1701761099234597e-06, - "loss": 0.73088312, - "num_input_tokens_seen": 115034265, - "step": 5356, - "time_per_iteration": 2.638268232345581 - }, - { - "auxiliary_loss_clip": 0.01102103, - "auxiliary_loss_mlp": 0.01040751, - "balance_loss_clip": 1.04954576, - "balance_loss_mlp": 1.02245283, - "epoch": 0.32208026454231176, - "flos": 22667389649280.0, - "grad_norm": 2.5241040535813095, - "language_loss": 0.67760962, - "learning_rate": 3.1698602460885903e-06, - "loss": 0.69903815, - "num_input_tokens_seen": 115051945, - "step": 5357, - "time_per_iteration": 2.7816576957702637 - }, - { - "auxiliary_loss_clip": 0.01037625, - "auxiliary_loss_mlp": 0.01029071, - "balance_loss_clip": 1.0279882, - "balance_loss_mlp": 1.02722347, - "epoch": 0.3221403877949797, - "flos": 64605130053120.0, - "grad_norm": 0.7244200234208643, - "language_loss": 0.58319688, - "learning_rate": 3.1695443378925035e-06, - "loss": 0.60386384, - "num_input_tokens_seen": 115119090, - "step": 5358, - "time_per_iteration": 3.3341448307037354 - }, - { - "auxiliary_loss_clip": 0.01076802, - "auxiliary_loss_mlp": 0.01044493, - "balance_loss_clip": 1.04142976, - "balance_loss_mlp": 1.0270052, - "epoch": 0.3222005110476477, - "flos": 20157019004160.0, - "grad_norm": 2.2322811787478427, - "language_loss": 0.83184302, - "learning_rate": 3.1692283853471777e-06, - "loss": 0.85305595, - "num_input_tokens_seen": 115137755, - "step": 5359, - "time_per_iteration": 2.836543083190918 - }, - { - "auxiliary_loss_clip": 0.01129966, - "auxiliary_loss_mlp": 0.01035598, - "balance_loss_clip": 1.04800034, - "balance_loss_mlp": 1.01938617, - "epoch": 0.32226063430031565, - "flos": 22674177319680.0, - "grad_norm": 2.0261007556732964, - "language_loss": 0.79563689, - "learning_rate": 3.168912388464595e-06, - "loss": 0.81729257, - "num_input_tokens_seen": 115158150, - "step": 5360, - "time_per_iteration": 2.66043758392334 - }, - { - "auxiliary_loss_clip": 0.01045199, - "auxiliary_loss_mlp": 0.01009155, - "balance_loss_clip": 1.02352595, - "balance_loss_mlp": 1.00706911, - "epoch": 0.3223207575529836, - "flos": 63828525075840.0, - "grad_norm": 0.6569282603798298, - "language_loss": 0.56928504, - "learning_rate": 3.168596347256737e-06, - "loss": 0.58982855, - "num_input_tokens_seen": 115212755, - "step": 5361, - "time_per_iteration": 3.007119655609131 - }, - { - "auxiliary_loss_clip": 0.01078785, - "auxiliary_loss_mlp": 0.01049092, - "balance_loss_clip": 1.04366553, - "balance_loss_mlp": 1.03166366, - "epoch": 0.3223808808056516, - "flos": 26870123537280.0, - "grad_norm": 3.2787914187636495, - "language_loss": 0.71563178, - "learning_rate": 3.168280261735588e-06, - "loss": 0.73691058, - "num_input_tokens_seen": 115233090, - "step": 5362, - "time_per_iteration": 2.8345048427581787 - }, - { - "auxiliary_loss_clip": 0.0112485, - "auxiliary_loss_mlp": 0.01053523, - "balance_loss_clip": 1.04899716, - "balance_loss_mlp": 1.03670287, - "epoch": 0.32244100405831955, - "flos": 26761350176640.0, - "grad_norm": 2.1292104037374773, - "language_loss": 0.74106693, - "learning_rate": 3.167964131913135e-06, - "loss": 0.76285076, - "num_input_tokens_seen": 115252645, - "step": 5363, - "time_per_iteration": 2.70552659034729 - }, - { - "auxiliary_loss_clip": 0.01134941, - "auxiliary_loss_mlp": 0.01042612, - "balance_loss_clip": 1.05024791, - "balance_loss_mlp": 1.02637601, - "epoch": 0.3225011273109875, - "flos": 23803029020160.0, - "grad_norm": 3.812297759050374, - "language_loss": 0.77379405, - "learning_rate": 3.167647957801365e-06, - "loss": 0.7955696, - "num_input_tokens_seen": 115269085, - "step": 5364, - "time_per_iteration": 2.66058087348938 - }, - { - "auxiliary_loss_clip": 0.01120766, - "auxiliary_loss_mlp": 0.01042612, - "balance_loss_clip": 1.05058861, - "balance_loss_mlp": 1.02468252, - "epoch": 0.3225612505636555, - "flos": 17274505501440.0, - "grad_norm": 3.514939630870356, - "language_loss": 0.76727009, - "learning_rate": 3.1673317394122672e-06, - "loss": 0.78890389, - "num_input_tokens_seen": 115286470, - "step": 5365, - "time_per_iteration": 2.6493194103240967 - }, - { - "auxiliary_loss_clip": 0.01124156, - "auxiliary_loss_mlp": 0.01048476, - "balance_loss_clip": 1.05429566, - "balance_loss_mlp": 1.03201342, - "epoch": 0.32262137381632344, - "flos": 23366247638400.0, - "grad_norm": 7.419360933702927, - "language_loss": 0.76938248, - "learning_rate": 3.1670154767578333e-06, - "loss": 0.79110885, - "num_input_tokens_seen": 115307000, - "step": 5366, - "time_per_iteration": 2.6984689235687256 - }, - { - "auxiliary_loss_clip": 0.01110868, - "auxiliary_loss_mlp": 0.01044399, - "balance_loss_clip": 1.04554594, - "balance_loss_mlp": 1.02792382, - "epoch": 0.3226814970689914, - "flos": 23258803080960.0, - "grad_norm": 2.2843777844497453, - "language_loss": 0.71972823, - "learning_rate": 3.166699169850055e-06, - "loss": 0.74128091, - "num_input_tokens_seen": 115325925, - "step": 5367, - "time_per_iteration": 2.6944496631622314 - }, - { - "auxiliary_loss_clip": 0.01138096, - "auxiliary_loss_mlp": 0.01043716, - "balance_loss_clip": 1.05035067, - "balance_loss_mlp": 1.0286001, - "epoch": 0.32274162032165943, - "flos": 16395196561920.0, - "grad_norm": 13.04054524246424, - "language_loss": 0.74414504, - "learning_rate": 3.1663828187009274e-06, - "loss": 0.76596308, - "num_input_tokens_seen": 115343705, - "step": 5368, - "time_per_iteration": 2.670567750930786 - }, - { - "auxiliary_loss_clip": 0.01103298, - "auxiliary_loss_mlp": 0.01049074, - "balance_loss_clip": 1.04370904, - "balance_loss_mlp": 1.0322659, - "epoch": 0.3228017435743274, - "flos": 27855081354240.0, - "grad_norm": 1.655769512058306, - "language_loss": 0.78693509, - "learning_rate": 3.1660664233224467e-06, - "loss": 0.80845881, - "num_input_tokens_seen": 115364170, - "step": 5369, - "time_per_iteration": 2.777437448501587 - }, - { - "auxiliary_loss_clip": 0.01099309, - "auxiliary_loss_mlp": 0.01037821, - "balance_loss_clip": 1.04874706, - "balance_loss_mlp": 1.0222764, - "epoch": 0.32286186682699536, - "flos": 19608770741760.0, - "grad_norm": 13.189929997499553, - "language_loss": 0.83189309, - "learning_rate": 3.16574998372661e-06, - "loss": 0.85326445, - "num_input_tokens_seen": 115382495, - "step": 5370, - "time_per_iteration": 2.734342336654663 - }, - { - "auxiliary_loss_clip": 0.01141788, - "auxiliary_loss_mlp": 0.01044735, - "balance_loss_clip": 1.05202413, - "balance_loss_mlp": 1.0291779, - "epoch": 0.3229219900796633, - "flos": 24134017870080.0, - "grad_norm": 3.3293058605981614, - "language_loss": 0.8288244, - "learning_rate": 3.1654334999254177e-06, - "loss": 0.85068965, - "num_input_tokens_seen": 115399450, - "step": 5371, - "time_per_iteration": 2.620091676712036 - }, - { - "auxiliary_loss_clip": 0.01133164, - "auxiliary_loss_mlp": 0.00776239, - "balance_loss_clip": 1.05046356, - "balance_loss_mlp": 1.00122416, - "epoch": 0.3229821133323313, - "flos": 17748705876480.0, - "grad_norm": 3.1117013800624993, - "language_loss": 0.8852632, - "learning_rate": 3.1651169719308695e-06, - "loss": 0.90435725, - "num_input_tokens_seen": 115417700, - "step": 5372, - "time_per_iteration": 2.673567056655884 - }, - { - "auxiliary_loss_clip": 0.01140269, - "auxiliary_loss_mlp": 0.01049295, - "balance_loss_clip": 1.05098414, - "balance_loss_mlp": 1.03341591, - "epoch": 0.32304223658499925, - "flos": 22346025644160.0, - "grad_norm": 2.7114986433136727, - "language_loss": 0.73388374, - "learning_rate": 3.1648003997549694e-06, - "loss": 0.75577939, - "num_input_tokens_seen": 115435840, - "step": 5373, - "time_per_iteration": 2.6910293102264404 - }, - { - "auxiliary_loss_clip": 0.0110976, - "auxiliary_loss_mlp": 0.01044756, - "balance_loss_clip": 1.04653084, - "balance_loss_mlp": 1.02873468, - "epoch": 0.3231023598376672, - "flos": 18478302929280.0, - "grad_norm": 2.3161305262959573, - "language_loss": 0.81114149, - "learning_rate": 3.1644837834097214e-06, - "loss": 0.83268672, - "num_input_tokens_seen": 115454210, - "step": 5374, - "time_per_iteration": 2.666707992553711 - }, - { - "auxiliary_loss_clip": 0.01095169, - "auxiliary_loss_mlp": 0.01038679, - "balance_loss_clip": 1.0438931, - "balance_loss_mlp": 1.02254975, - "epoch": 0.3231624830903352, - "flos": 27636313570560.0, - "grad_norm": 2.1309099752285863, - "language_loss": 0.87817222, - "learning_rate": 3.1641671229071317e-06, - "loss": 0.89951062, - "num_input_tokens_seen": 115471785, - "step": 5375, - "time_per_iteration": 4.252593994140625 - }, - { - "auxiliary_loss_clip": 0.01140942, - "auxiliary_loss_mlp": 0.01036182, - "balance_loss_clip": 1.04865098, - "balance_loss_mlp": 1.01960015, - "epoch": 0.32322260634300315, - "flos": 21726423014400.0, - "grad_norm": 2.12002794330764, - "language_loss": 0.75837636, - "learning_rate": 3.1638504182592076e-06, - "loss": 0.78014749, - "num_input_tokens_seen": 115491405, - "step": 5376, - "time_per_iteration": 2.64569091796875 - }, - { - "auxiliary_loss_clip": 0.01100111, - "auxiliary_loss_mlp": 0.01037893, - "balance_loss_clip": 1.04745007, - "balance_loss_mlp": 1.0227654, - "epoch": 0.3232827295956711, - "flos": 22637656166400.0, - "grad_norm": 16.356053535517315, - "language_loss": 0.66570163, - "learning_rate": 3.1635336694779594e-06, - "loss": 0.68708175, - "num_input_tokens_seen": 115511555, - "step": 5377, - "time_per_iteration": 4.228315591812134 - }, - { - "auxiliary_loss_clip": 0.01103406, - "auxiliary_loss_mlp": 0.01059488, - "balance_loss_clip": 1.04591548, - "balance_loss_mlp": 1.04070055, - "epoch": 0.3233428528483391, - "flos": 26322593546880.0, - "grad_norm": 1.5026052482517693, - "language_loss": 0.72276354, - "learning_rate": 3.1632168765753982e-06, - "loss": 0.74439251, - "num_input_tokens_seen": 115532860, - "step": 5378, - "time_per_iteration": 2.7754812240600586 - }, - { - "auxiliary_loss_clip": 0.0112205, - "auxiliary_loss_mlp": 0.0103656, - "balance_loss_clip": 1.04869092, - "balance_loss_mlp": 1.0214678, - "epoch": 0.32340297610100704, - "flos": 28585217111040.0, - "grad_norm": 2.7898138283200344, - "language_loss": 0.82221997, - "learning_rate": 3.1629000395635357e-06, - "loss": 0.84380603, - "num_input_tokens_seen": 115553850, - "step": 5379, - "time_per_iteration": 2.672743320465088 - }, - { - "auxiliary_loss_clip": 0.01130962, - "auxiliary_loss_mlp": 0.01035985, - "balance_loss_clip": 1.04864693, - "balance_loss_mlp": 1.02083325, - "epoch": 0.323463099353675, - "flos": 30773792787840.0, - "grad_norm": 1.5555457678220286, - "language_loss": 0.78895414, - "learning_rate": 3.162583158454388e-06, - "loss": 0.81062359, - "num_input_tokens_seen": 115575530, - "step": 5380, - "time_per_iteration": 4.130786180496216 - }, - { - "auxiliary_loss_clip": 0.01124956, - "auxiliary_loss_mlp": 0.01044026, - "balance_loss_clip": 1.04988194, - "balance_loss_mlp": 1.0286541, - "epoch": 0.32352322260634303, - "flos": 25228610974080.0, - "grad_norm": 1.7365933554134192, - "language_loss": 0.76877856, - "learning_rate": 3.1622662332599697e-06, - "loss": 0.79046834, - "num_input_tokens_seen": 115594885, - "step": 5381, - "time_per_iteration": 2.6297740936279297 - }, - { - "auxiliary_loss_clip": 0.01122723, - "auxiliary_loss_mlp": 0.0103758, - "balance_loss_clip": 1.0485673, - "balance_loss_mlp": 1.02333474, - "epoch": 0.323583345859011, - "flos": 23330480670720.0, - "grad_norm": 1.9510545380996942, - "language_loss": 0.71868116, - "learning_rate": 3.1619492639922998e-06, - "loss": 0.7402842, - "num_input_tokens_seen": 115614080, - "step": 5382, - "time_per_iteration": 4.239168167114258 - }, - { - "auxiliary_loss_clip": 0.01114051, - "auxiliary_loss_mlp": 0.01051511, - "balance_loss_clip": 1.0454843, - "balance_loss_mlp": 1.03392792, - "epoch": 0.32364346911167896, - "flos": 26207499392640.0, - "grad_norm": 2.5669193665709815, - "language_loss": 0.70947385, - "learning_rate": 3.1616322506633964e-06, - "loss": 0.73112947, - "num_input_tokens_seen": 115632820, - "step": 5383, - "time_per_iteration": 2.701462507247925 - }, - { - "auxiliary_loss_clip": 0.01123558, - "auxiliary_loss_mlp": 0.01038956, - "balance_loss_clip": 1.04770291, - "balance_loss_mlp": 1.02382779, - "epoch": 0.3237035923643469, - "flos": 23695764030720.0, - "grad_norm": 1.9442688765107798, - "language_loss": 0.78333974, - "learning_rate": 3.161315193285283e-06, - "loss": 0.8049649, - "num_input_tokens_seen": 115652860, - "step": 5384, - "time_per_iteration": 2.6939637660980225 - }, - { - "auxiliary_loss_clip": 0.01078749, - "auxiliary_loss_mlp": 0.01050129, - "balance_loss_clip": 1.04298878, - "balance_loss_mlp": 1.03203273, - "epoch": 0.3237637156170149, - "flos": 14428728633600.0, - "grad_norm": 2.1298780259276575, - "language_loss": 0.75396919, - "learning_rate": 3.16099809186998e-06, - "loss": 0.77525795, - "num_input_tokens_seen": 115670940, - "step": 5385, - "time_per_iteration": 2.7813403606414795 - }, - { - "auxiliary_loss_clip": 0.0111287, - "auxiliary_loss_mlp": 0.01040739, - "balance_loss_clip": 1.04995322, - "balance_loss_mlp": 1.0248363, - "epoch": 0.32382383886968286, - "flos": 31062981185280.0, - "grad_norm": 2.042597717530735, - "language_loss": 0.71488941, - "learning_rate": 3.1606809464295145e-06, - "loss": 0.73642552, - "num_input_tokens_seen": 115691155, - "step": 5386, - "time_per_iteration": 2.754636526107788 - }, - { - "auxiliary_loss_clip": 0.01142583, - "auxiliary_loss_mlp": 0.01040273, - "balance_loss_clip": 1.0499016, - "balance_loss_mlp": 1.02334547, - "epoch": 0.3238839621223508, - "flos": 23256935573760.0, - "grad_norm": 5.057227062214219, - "language_loss": 0.94889075, - "learning_rate": 3.1603637569759095e-06, - "loss": 0.97071928, - "num_input_tokens_seen": 115710340, - "step": 5387, - "time_per_iteration": 2.6547048091888428 - }, - { - "auxiliary_loss_clip": 0.01133488, - "auxiliary_loss_mlp": 0.01044118, - "balance_loss_clip": 1.05193102, - "balance_loss_mlp": 1.02696419, - "epoch": 0.3239440853750188, - "flos": 22964658606720.0, - "grad_norm": 10.717385990424205, - "language_loss": 0.77620786, - "learning_rate": 3.1600465235211956e-06, - "loss": 0.79798394, - "num_input_tokens_seen": 115726745, - "step": 5388, - "time_per_iteration": 2.657205820083618 - }, - { - "auxiliary_loss_clip": 0.01111832, - "auxiliary_loss_mlp": 0.01036701, - "balance_loss_clip": 1.04523969, - "balance_loss_mlp": 1.01978493, - "epoch": 0.32400420862768675, - "flos": 36246614653440.0, - "grad_norm": 2.237731185409586, - "language_loss": 0.71233571, - "learning_rate": 3.1597292460774006e-06, - "loss": 0.73382103, - "num_input_tokens_seen": 115749385, - "step": 5389, - "time_per_iteration": 2.799731731414795 - }, - { - "auxiliary_loss_clip": 0.01099836, - "auxiliary_loss_mlp": 0.01038996, - "balance_loss_clip": 1.04759645, - "balance_loss_mlp": 1.02302158, - "epoch": 0.3240643318803547, - "flos": 21616500418560.0, - "grad_norm": 1.8547230503773184, - "language_loss": 0.80461568, - "learning_rate": 3.159411924656557e-06, - "loss": 0.82600403, - "num_input_tokens_seen": 115768105, - "step": 5390, - "time_per_iteration": 2.703913450241089 - }, - { - "auxiliary_loss_clip": 0.01112322, - "auxiliary_loss_mlp": 0.01050073, - "balance_loss_clip": 1.04881656, - "balance_loss_mlp": 1.0330621, - "epoch": 0.3241244551330227, - "flos": 23295611543040.0, - "grad_norm": 4.514534114801655, - "language_loss": 0.72674775, - "learning_rate": 3.1590945592706967e-06, - "loss": 0.74837172, - "num_input_tokens_seen": 115787340, - "step": 5391, - "time_per_iteration": 2.8789660930633545 - }, - { - "auxiliary_loss_clip": 0.01110171, - "auxiliary_loss_mlp": 0.01040459, - "balance_loss_clip": 1.04422975, - "balance_loss_mlp": 1.02517664, - "epoch": 0.32418457838569065, - "flos": 14097236993280.0, - "grad_norm": 2.092129040046021, - "language_loss": 0.77347648, - "learning_rate": 3.158777149931855e-06, - "loss": 0.79498285, - "num_input_tokens_seen": 115805565, - "step": 5392, - "time_per_iteration": 2.6689188480377197 - }, - { - "auxiliary_loss_clip": 0.01112252, - "auxiliary_loss_mlp": 0.01051929, - "balance_loss_clip": 1.04517519, - "balance_loss_mlp": 1.03289127, - "epoch": 0.3242447016383586, - "flos": 29752672953600.0, - "grad_norm": 1.9207699243041063, - "language_loss": 0.62606925, - "learning_rate": 3.158459696652067e-06, - "loss": 0.6477111, - "num_input_tokens_seen": 115826725, - "step": 5393, - "time_per_iteration": 2.758423328399658 - }, - { - "auxiliary_loss_clip": 0.01122257, - "auxiliary_loss_mlp": 0.01043934, - "balance_loss_clip": 1.04730856, - "balance_loss_mlp": 1.02770925, - "epoch": 0.3243048248910266, - "flos": 24351205455360.0, - "grad_norm": 1.583732116281239, - "language_loss": 0.82284617, - "learning_rate": 3.158142199443371e-06, - "loss": 0.84450811, - "num_input_tokens_seen": 115846955, - "step": 5394, - "time_per_iteration": 2.6715636253356934 - }, - { - "auxiliary_loss_clip": 0.01111969, - "auxiliary_loss_mlp": 0.01045824, - "balance_loss_clip": 1.04729748, - "balance_loss_mlp": 1.03120947, - "epoch": 0.3243649481436946, - "flos": 24353037048960.0, - "grad_norm": 1.873068954405441, - "language_loss": 0.817029, - "learning_rate": 3.1578246583178076e-06, - "loss": 0.83860689, - "num_input_tokens_seen": 115865975, - "step": 5395, - "time_per_iteration": 2.7120518684387207 - }, - { - "auxiliary_loss_clip": 0.01126983, - "auxiliary_loss_mlp": 0.01039478, - "balance_loss_clip": 1.0519104, - "balance_loss_mlp": 1.02413607, - "epoch": 0.32442507139636256, - "flos": 22925228451840.0, - "grad_norm": 1.8441183317386671, - "language_loss": 0.83172363, - "learning_rate": 3.157507073287417e-06, - "loss": 0.85338825, - "num_input_tokens_seen": 115884950, - "step": 5396, - "time_per_iteration": 2.6589252948760986 - }, - { - "auxiliary_loss_clip": 0.0110371, - "auxiliary_loss_mlp": 0.01053141, - "balance_loss_clip": 1.04818082, - "balance_loss_mlp": 1.03462827, - "epoch": 0.32448519464903053, - "flos": 22200192426240.0, - "grad_norm": 2.3735724483298553, - "language_loss": 0.75721765, - "learning_rate": 3.1571894443642414e-06, - "loss": 0.77878618, - "num_input_tokens_seen": 115904170, - "step": 5397, - "time_per_iteration": 2.7118513584136963 - }, - { - "auxiliary_loss_clip": 0.01104001, - "auxiliary_loss_mlp": 0.0104059, - "balance_loss_clip": 1.04970932, - "balance_loss_mlp": 1.02504468, - "epoch": 0.3245453179016985, - "flos": 18838450644480.0, - "grad_norm": 7.349892433890134, - "language_loss": 0.67359912, - "learning_rate": 3.1568717715603263e-06, - "loss": 0.69504505, - "num_input_tokens_seen": 115919255, - "step": 5398, - "time_per_iteration": 2.690317153930664 - }, - { - "auxiliary_loss_clip": 0.01111486, - "auxiliary_loss_mlp": 0.01033579, - "balance_loss_clip": 1.04846239, - "balance_loss_mlp": 1.01784301, - "epoch": 0.32460544115436646, - "flos": 21178390233600.0, - "grad_norm": 1.692830304346276, - "language_loss": 0.73074687, - "learning_rate": 3.156554054887718e-06, - "loss": 0.7521975, - "num_input_tokens_seen": 115938535, - "step": 5399, - "time_per_iteration": 2.754539728164673 - }, - { - "auxiliary_loss_clip": 0.01101582, - "auxiliary_loss_mlp": 0.01036858, - "balance_loss_clip": 1.04522848, - "balance_loss_mlp": 1.02056217, - "epoch": 0.3246655644070344, - "flos": 21981137333760.0, - "grad_norm": 2.780796864612311, - "language_loss": 0.71580744, - "learning_rate": 3.1562362943584645e-06, - "loss": 0.7371918, - "num_input_tokens_seen": 115955005, - "step": 5400, - "time_per_iteration": 2.707712173461914 - }, - { - "auxiliary_loss_clip": 0.01127225, - "auxiliary_loss_mlp": 0.01040347, - "balance_loss_clip": 1.0472424, - "balance_loss_mlp": 1.02469516, - "epoch": 0.3247256876597024, - "flos": 32159729105280.0, - "grad_norm": 2.1905750946262805, - "language_loss": 0.79769576, - "learning_rate": 3.155918489984614e-06, - "loss": 0.81937146, - "num_input_tokens_seen": 115975305, - "step": 5401, - "time_per_iteration": 2.7813303470611572 - }, - { - "auxiliary_loss_clip": 0.01109499, - "auxiliary_loss_mlp": 0.01041329, - "balance_loss_clip": 1.04414558, - "balance_loss_mlp": 1.02341187, - "epoch": 0.32478581091237035, - "flos": 20997544233600.0, - "grad_norm": 4.743153882711402, - "language_loss": 0.87785316, - "learning_rate": 3.1556006417782196e-06, - "loss": 0.89936143, - "num_input_tokens_seen": 115994810, - "step": 5402, - "time_per_iteration": 2.7685606479644775 - }, - { - "auxiliary_loss_clip": 0.01078796, - "auxiliary_loss_mlp": 0.01044786, - "balance_loss_clip": 1.03948891, - "balance_loss_mlp": 1.02792931, - "epoch": 0.3248459341650383, - "flos": 17924990849280.0, - "grad_norm": 4.964706141121962, - "language_loss": 0.84572911, - "learning_rate": 3.155282749751332e-06, - "loss": 0.86696494, - "num_input_tokens_seen": 116011095, - "step": 5403, - "time_per_iteration": 2.7299063205718994 - }, - { - "auxiliary_loss_clip": 0.01104053, - "auxiliary_loss_mlp": 0.01045074, - "balance_loss_clip": 1.04597795, - "balance_loss_mlp": 1.03049469, - "epoch": 0.3249060574177063, - "flos": 24535606901760.0, - "grad_norm": 3.7265891750540785, - "language_loss": 0.87614954, - "learning_rate": 3.154964813916007e-06, - "loss": 0.89764082, - "num_input_tokens_seen": 116028805, - "step": 5404, - "time_per_iteration": 2.7740931510925293 - }, - { - "auxiliary_loss_clip": 0.01125798, - "auxiliary_loss_mlp": 0.01043439, - "balance_loss_clip": 1.04930234, - "balance_loss_mlp": 1.02685738, - "epoch": 0.32496618067037425, - "flos": 25994765093760.0, - "grad_norm": 2.5497237434599964, - "language_loss": 0.72717422, - "learning_rate": 3.1546468342843008e-06, - "loss": 0.74886656, - "num_input_tokens_seen": 116047765, - "step": 5405, - "time_per_iteration": 2.6756839752197266 - }, - { - "auxiliary_loss_clip": 0.01098309, - "auxiliary_loss_mlp": 0.01039466, - "balance_loss_clip": 1.04964566, - "balance_loss_mlp": 1.02390265, - "epoch": 0.3250263039230422, - "flos": 19573757959680.0, - "grad_norm": 1.6968031771183532, - "language_loss": 0.82927752, - "learning_rate": 3.1543288108682707e-06, - "loss": 0.8506552, - "num_input_tokens_seen": 116068385, - "step": 5406, - "time_per_iteration": 2.728217124938965 - }, - { - "auxiliary_loss_clip": 0.01136878, - "auxiliary_loss_mlp": 0.01032192, - "balance_loss_clip": 1.05117011, - "balance_loss_mlp": 1.01728487, - "epoch": 0.3250864271757102, - "flos": 16763640318720.0, - "grad_norm": 1.9312900503750694, - "language_loss": 0.87836796, - "learning_rate": 3.1540107436799764e-06, - "loss": 0.90005869, - "num_input_tokens_seen": 116085350, - "step": 5407, - "time_per_iteration": 2.5519261360168457 - }, - { - "auxiliary_loss_clip": 0.01112002, - "auxiliary_loss_mlp": 0.01040482, - "balance_loss_clip": 1.04575169, - "balance_loss_mlp": 1.02506793, - "epoch": 0.3251465504283782, - "flos": 27819458040960.0, - "grad_norm": 1.6044550363094983, - "language_loss": 0.69804603, - "learning_rate": 3.153692632731479e-06, - "loss": 0.71957088, - "num_input_tokens_seen": 116107560, - "step": 5408, - "time_per_iteration": 2.7141807079315186 - }, - { - "auxiliary_loss_clip": 0.01131975, - "auxiliary_loss_mlp": 0.01035871, - "balance_loss_clip": 1.05021083, - "balance_loss_mlp": 1.01977742, - "epoch": 0.32520667368104617, - "flos": 19063144172160.0, - "grad_norm": 10.423580562540607, - "language_loss": 0.77558911, - "learning_rate": 3.153374478034841e-06, - "loss": 0.79726762, - "num_input_tokens_seen": 116125980, - "step": 5409, - "time_per_iteration": 2.644792318344116 - }, - { - "auxiliary_loss_clip": 0.01079567, - "auxiliary_loss_mlp": 0.01043858, - "balance_loss_clip": 1.03893065, - "balance_loss_mlp": 1.0280745, - "epoch": 0.32526679693371413, - "flos": 29382146208000.0, - "grad_norm": 2.0524453166640146, - "language_loss": 0.83282518, - "learning_rate": 3.1530562796021285e-06, - "loss": 0.85405946, - "num_input_tokens_seen": 116146530, - "step": 5410, - "time_per_iteration": 2.846480131149292 - }, - { - "auxiliary_loss_clip": 0.01086095, - "auxiliary_loss_mlp": 0.01037636, - "balance_loss_clip": 1.04789686, - "balance_loss_mlp": 1.02272296, - "epoch": 0.3253269201863821, - "flos": 20704513080960.0, - "grad_norm": 1.6475099523255856, - "language_loss": 0.7081182, - "learning_rate": 3.152738037445405e-06, - "loss": 0.72935545, - "num_input_tokens_seen": 116165695, - "step": 5411, - "time_per_iteration": 2.779330253601074 - }, - { - "auxiliary_loss_clip": 0.0108148, - "auxiliary_loss_mlp": 0.01041588, - "balance_loss_clip": 1.04331398, - "balance_loss_mlp": 1.02688956, - "epoch": 0.32538704343905006, - "flos": 29094142959360.0, - "grad_norm": 1.6354124554173295, - "language_loss": 0.82894456, - "learning_rate": 3.1524197515767403e-06, - "loss": 0.85017526, - "num_input_tokens_seen": 116185375, - "step": 5412, - "time_per_iteration": 2.7841992378234863 - }, - { - "auxiliary_loss_clip": 0.01106895, - "auxiliary_loss_mlp": 0.01041599, - "balance_loss_clip": 1.04730868, - "balance_loss_mlp": 1.02430189, - "epoch": 0.325447166691718, - "flos": 24676124906880.0, - "grad_norm": 1.867437266565155, - "language_loss": 0.80913842, - "learning_rate": 3.152101422008203e-06, - "loss": 0.83062339, - "num_input_tokens_seen": 116204335, - "step": 5413, - "time_per_iteration": 2.7533957958221436 - }, - { - "auxiliary_loss_clip": 0.01115005, - "auxiliary_loss_mlp": 0.0103855, - "balance_loss_clip": 1.04923081, - "balance_loss_mlp": 1.02155089, - "epoch": 0.325507289944386, - "flos": 21543134889600.0, - "grad_norm": 3.355430774898342, - "language_loss": 0.76891947, - "learning_rate": 3.151783048751864e-06, - "loss": 0.79045498, - "num_input_tokens_seen": 116222840, - "step": 5414, - "time_per_iteration": 4.331217527389526 - }, - { - "auxiliary_loss_clip": 0.01030644, - "auxiliary_loss_mlp": 0.01012699, - "balance_loss_clip": 1.02726388, - "balance_loss_mlp": 1.01063681, - "epoch": 0.32556741319705396, - "flos": 71518722347520.0, - "grad_norm": 0.9066964616955783, - "language_loss": 0.63865513, - "learning_rate": 3.1514646318197965e-06, - "loss": 0.65908855, - "num_input_tokens_seen": 116274940, - "step": 5415, - "time_per_iteration": 3.172816753387451 - }, - { - "auxiliary_loss_clip": 0.01088465, - "auxiliary_loss_mlp": 0.01038606, - "balance_loss_clip": 1.04119301, - "balance_loss_mlp": 1.02279866, - "epoch": 0.3256275364497219, - "flos": 23732428838400.0, - "grad_norm": 1.52454367487569, - "language_loss": 0.74014068, - "learning_rate": 3.151146171224075e-06, - "loss": 0.76141143, - "num_input_tokens_seen": 116297300, - "step": 5416, - "time_per_iteration": 4.326166868209839 - }, - { - "auxiliary_loss_clip": 0.01062287, - "auxiliary_loss_mlp": 0.0100407, - "balance_loss_clip": 1.03045964, - "balance_loss_mlp": 1.00160217, - "epoch": 0.3256876597023899, - "flos": 67289199891840.0, - "grad_norm": 0.7686966052914506, - "language_loss": 0.57851374, - "learning_rate": 3.1508276669767757e-06, - "loss": 0.59917736, - "num_input_tokens_seen": 116362370, - "step": 5417, - "time_per_iteration": 3.2102463245391846 - }, - { - "auxiliary_loss_clip": 0.01040835, - "auxiliary_loss_mlp": 0.01012103, - "balance_loss_clip": 1.02768993, - "balance_loss_mlp": 1.00975466, - "epoch": 0.32574778295505785, - "flos": 71282323964160.0, - "grad_norm": 0.7997987203444133, - "language_loss": 0.63392216, - "learning_rate": 3.150509119089975e-06, - "loss": 0.65445155, - "num_input_tokens_seen": 116430365, - "step": 5418, - "time_per_iteration": 4.847350120544434 - }, - { - "auxiliary_loss_clip": 0.01110249, - "auxiliary_loss_mlp": 0.01043458, - "balance_loss_clip": 1.05171919, - "balance_loss_mlp": 1.02794838, - "epoch": 0.3258079062077258, - "flos": 20776370238720.0, - "grad_norm": 2.0985111563442325, - "language_loss": 0.69086784, - "learning_rate": 3.1501905275757537e-06, - "loss": 0.71240497, - "num_input_tokens_seen": 116447525, - "step": 5419, - "time_per_iteration": 2.6837174892425537 - }, - { - "auxiliary_loss_clip": 0.0112744, - "auxiliary_loss_mlp": 0.01037157, - "balance_loss_clip": 1.05152702, - "balance_loss_mlp": 1.02099252, - "epoch": 0.3258680294603938, - "flos": 22235456603520.0, - "grad_norm": 1.6553118170887535, - "language_loss": 0.77041519, - "learning_rate": 3.1498718924461926e-06, - "loss": 0.79206121, - "num_input_tokens_seen": 116466310, - "step": 5420, - "time_per_iteration": 2.690243721008301 - }, - { - "auxiliary_loss_clip": 0.01124221, - "auxiliary_loss_mlp": 0.00774579, - "balance_loss_clip": 1.04583097, - "balance_loss_mlp": 1.00118852, - "epoch": 0.3259281527130618, - "flos": 26979974305920.0, - "grad_norm": 1.6758047570714483, - "language_loss": 0.8033973, - "learning_rate": 3.1495532137133736e-06, - "loss": 0.82238531, - "num_input_tokens_seen": 116487825, - "step": 5421, - "time_per_iteration": 4.346652984619141 - }, - { - "auxiliary_loss_clip": 0.01133401, - "auxiliary_loss_mlp": 0.0103494, - "balance_loss_clip": 1.04982162, - "balance_loss_mlp": 1.0212909, - "epoch": 0.32598827596572977, - "flos": 26214251149440.0, - "grad_norm": 1.7368751669124027, - "language_loss": 0.75101721, - "learning_rate": 3.149234491389381e-06, - "loss": 0.77270067, - "num_input_tokens_seen": 116509950, - "step": 5422, - "time_per_iteration": 2.698486566543579 - }, - { - "auxiliary_loss_clip": 0.01104722, - "auxiliary_loss_mlp": 0.00773675, - "balance_loss_clip": 1.04894829, - "balance_loss_mlp": 1.00120938, - "epoch": 0.32604839921839773, - "flos": 17639752947840.0, - "grad_norm": 2.1580318636917384, - "language_loss": 0.63323581, - "learning_rate": 3.1489157254863026e-06, - "loss": 0.65201974, - "num_input_tokens_seen": 116527695, - "step": 5423, - "time_per_iteration": 2.7364964485168457 - }, - { - "auxiliary_loss_clip": 0.01098661, - "auxiliary_loss_mlp": 0.01032454, - "balance_loss_clip": 1.04357564, - "balance_loss_mlp": 1.01884615, - "epoch": 0.3261085224710657, - "flos": 23622721724160.0, - "grad_norm": 1.5676988826806029, - "language_loss": 0.74530792, - "learning_rate": 3.148596916016224e-06, - "loss": 0.76661909, - "num_input_tokens_seen": 116547800, - "step": 5424, - "time_per_iteration": 2.695530652999878 - }, - { - "auxiliary_loss_clip": 0.0110482, - "auxiliary_loss_mlp": 0.01035713, - "balance_loss_clip": 1.04803681, - "balance_loss_mlp": 1.02199221, - "epoch": 0.32616864572373366, - "flos": 23260455106560.0, - "grad_norm": 1.6667522289255576, - "language_loss": 0.77194774, - "learning_rate": 3.1482780629912355e-06, - "loss": 0.79335308, - "num_input_tokens_seen": 116568460, - "step": 5425, - "time_per_iteration": 2.6649699211120605 - }, - { - "auxiliary_loss_clip": 0.01106187, - "auxiliary_loss_mlp": 0.01040306, - "balance_loss_clip": 1.04740202, - "balance_loss_mlp": 1.02368808, - "epoch": 0.32622876897640163, - "flos": 25593427457280.0, - "grad_norm": 2.8883064562409744, - "language_loss": 0.78262472, - "learning_rate": 3.147959166423428e-06, - "loss": 0.80408967, - "num_input_tokens_seen": 116588705, - "step": 5426, - "time_per_iteration": 2.7820892333984375 - }, - { - "auxiliary_loss_clip": 0.01088898, - "auxiliary_loss_mlp": 0.01035243, - "balance_loss_clip": 1.04331303, - "balance_loss_mlp": 1.01889908, - "epoch": 0.3262888922290696, - "flos": 22418996123520.0, - "grad_norm": 1.9267107865215556, - "language_loss": 0.74485052, - "learning_rate": 3.147640226324893e-06, - "loss": 0.76609194, - "num_input_tokens_seen": 116608845, - "step": 5427, - "time_per_iteration": 2.7831003665924072 - }, - { - "auxiliary_loss_clip": 0.01103791, - "auxiliary_loss_mlp": 0.01041786, - "balance_loss_clip": 1.04539597, - "balance_loss_mlp": 1.02549028, - "epoch": 0.32634901548173756, - "flos": 19718908819200.0, - "grad_norm": 6.869638277775165, - "language_loss": 0.79136658, - "learning_rate": 3.1473212427077266e-06, - "loss": 0.81282234, - "num_input_tokens_seen": 116628145, - "step": 5428, - "time_per_iteration": 2.7186481952667236 - }, - { - "auxiliary_loss_clip": 0.01121911, - "auxiliary_loss_mlp": 0.01040908, - "balance_loss_clip": 1.04629314, - "balance_loss_mlp": 1.02576876, - "epoch": 0.3264091387344055, - "flos": 16142924367360.0, - "grad_norm": 5.016107817785842, - "language_loss": 0.71130025, - "learning_rate": 3.147002215584023e-06, - "loss": 0.7329284, - "num_input_tokens_seen": 116646920, - "step": 5429, - "time_per_iteration": 2.6733968257904053 - }, - { - "auxiliary_loss_clip": 0.01098408, - "auxiliary_loss_mlp": 0.01035827, - "balance_loss_clip": 1.04658663, - "balance_loss_mlp": 1.0212121, - "epoch": 0.3264692619870735, - "flos": 16399075230720.0, - "grad_norm": 1.7379615094125744, - "language_loss": 0.78620625, - "learning_rate": 3.146683144965881e-06, - "loss": 0.80754858, - "num_input_tokens_seen": 116665100, - "step": 5430, - "time_per_iteration": 2.7313849925994873 - }, - { - "auxiliary_loss_clip": 0.01084979, - "auxiliary_loss_mlp": 0.01043143, - "balance_loss_clip": 1.04809749, - "balance_loss_mlp": 1.02660871, - "epoch": 0.32652938523974145, - "flos": 22382331315840.0, - "grad_norm": 3.4420441965814477, - "language_loss": 0.84279943, - "learning_rate": 3.146364030865399e-06, - "loss": 0.86408061, - "num_input_tokens_seen": 116682205, - "step": 5431, - "time_per_iteration": 2.720797300338745 - }, - { - "auxiliary_loss_clip": 0.01117845, - "auxiliary_loss_mlp": 0.01034908, - "balance_loss_clip": 1.04730058, - "balance_loss_mlp": 1.02067482, - "epoch": 0.3265895084924094, - "flos": 21908059113600.0, - "grad_norm": 1.9482899767939774, - "language_loss": 0.70736587, - "learning_rate": 3.146044873294678e-06, - "loss": 0.7288934, - "num_input_tokens_seen": 116702575, - "step": 5432, - "time_per_iteration": 2.6805124282836914 - }, - { - "auxiliary_loss_clip": 0.01073417, - "auxiliary_loss_mlp": 0.01042634, - "balance_loss_clip": 1.04051948, - "balance_loss_mlp": 1.02625418, - "epoch": 0.3266496317450774, - "flos": 16067152627200.0, - "grad_norm": 1.6263283854003907, - "language_loss": 0.84160507, - "learning_rate": 3.1457256722658203e-06, - "loss": 0.86276555, - "num_input_tokens_seen": 116720885, - "step": 5433, - "time_per_iteration": 2.733450174331665 - }, - { - "auxiliary_loss_clip": 0.01110224, - "auxiliary_loss_mlp": 0.01031776, - "balance_loss_clip": 1.04831946, - "balance_loss_mlp": 1.01733375, - "epoch": 0.3267097549977454, - "flos": 22528236360960.0, - "grad_norm": 1.8752055231309104, - "language_loss": 0.860237, - "learning_rate": 3.145406427790931e-06, - "loss": 0.881657, - "num_input_tokens_seen": 116740395, - "step": 5434, - "time_per_iteration": 2.6711690425872803 - }, - { - "auxiliary_loss_clip": 0.01115762, - "auxiliary_loss_mlp": 0.0104022, - "balance_loss_clip": 1.04894018, - "balance_loss_mlp": 1.02460361, - "epoch": 0.32676987825041337, - "flos": 27270419679360.0, - "grad_norm": 2.089345873834278, - "language_loss": 0.87845808, - "learning_rate": 3.1450871398821147e-06, - "loss": 0.90001786, - "num_input_tokens_seen": 116758870, - "step": 5435, - "time_per_iteration": 2.7342183589935303 - }, - { - "auxiliary_loss_clip": 0.01137287, - "auxiliary_loss_mlp": 0.01037617, - "balance_loss_clip": 1.05190301, - "balance_loss_mlp": 1.02256095, - "epoch": 0.32683000150308134, - "flos": 11508257433600.0, - "grad_norm": 3.0926239838125595, - "language_loss": 0.7645883, - "learning_rate": 3.144767808551479e-06, - "loss": 0.78633732, - "num_input_tokens_seen": 116773440, - "step": 5436, - "time_per_iteration": 2.648062229156494 - }, - { - "auxiliary_loss_clip": 0.01137346, - "auxiliary_loss_mlp": 0.01034933, - "balance_loss_clip": 1.0532552, - "balance_loss_mlp": 1.02046728, - "epoch": 0.3268901247557493, - "flos": 25630200005760.0, - "grad_norm": 1.7720337367532448, - "language_loss": 0.71802473, - "learning_rate": 3.144448433811134e-06, - "loss": 0.73974752, - "num_input_tokens_seen": 116794375, - "step": 5437, - "time_per_iteration": 2.680525541305542 - }, - { - "auxiliary_loss_clip": 0.01095966, - "auxiliary_loss_mlp": 0.0104222, - "balance_loss_clip": 1.04542243, - "balance_loss_mlp": 1.02445781, - "epoch": 0.32695024800841727, - "flos": 24860849575680.0, - "grad_norm": 1.7134236857074348, - "language_loss": 0.63728261, - "learning_rate": 3.144129015673189e-06, - "loss": 0.65866441, - "num_input_tokens_seen": 116815095, - "step": 5438, - "time_per_iteration": 2.7343454360961914 - }, - { - "auxiliary_loss_clip": 0.01128746, - "auxiliary_loss_mlp": 0.01039734, - "balance_loss_clip": 1.05383801, - "balance_loss_mlp": 1.02468967, - "epoch": 0.32701037126108523, - "flos": 28839249072000.0, - "grad_norm": 3.854723832885701, - "language_loss": 0.74629039, - "learning_rate": 3.1438095541497576e-06, - "loss": 0.76797515, - "num_input_tokens_seen": 116836630, - "step": 5439, - "time_per_iteration": 2.6859002113342285 - }, - { - "auxiliary_loss_clip": 0.0113034, - "auxiliary_loss_mlp": 0.0104413, - "balance_loss_clip": 1.05407321, - "balance_loss_mlp": 1.02773881, - "epoch": 0.3270704945137532, - "flos": 27965075777280.0, - "grad_norm": 3.9922367032947634, - "language_loss": 0.74743968, - "learning_rate": 3.1434900492529527e-06, - "loss": 0.76918435, - "num_input_tokens_seen": 116856880, - "step": 5440, - "time_per_iteration": 2.6785733699798584 - }, - { - "auxiliary_loss_clip": 0.01124529, - "auxiliary_loss_mlp": 0.00773254, - "balance_loss_clip": 1.05180979, - "balance_loss_mlp": 1.00108397, - "epoch": 0.32713061776642116, - "flos": 23690700213120.0, - "grad_norm": 2.2888111794693033, - "language_loss": 0.84642965, - "learning_rate": 3.1431705009948914e-06, - "loss": 0.86540747, - "num_input_tokens_seen": 116873770, - "step": 5441, - "time_per_iteration": 2.692375421524048 - }, - { - "auxiliary_loss_clip": 0.01126517, - "auxiliary_loss_mlp": 0.01042941, - "balance_loss_clip": 1.05065203, - "balance_loss_mlp": 1.02715778, - "epoch": 0.3271907410190891, - "flos": 22455625017600.0, - "grad_norm": 3.048730330719705, - "language_loss": 0.86782062, - "learning_rate": 3.1428509093876897e-06, - "loss": 0.88951516, - "num_input_tokens_seen": 116891225, - "step": 5442, - "time_per_iteration": 2.6678872108459473 - }, - { - "auxiliary_loss_clip": 0.01105154, - "auxiliary_loss_mlp": 0.01041235, - "balance_loss_clip": 1.05088091, - "balance_loss_mlp": 1.02450991, - "epoch": 0.3272508642717571, - "flos": 22820118278400.0, - "grad_norm": 2.240879974234663, - "language_loss": 0.77471602, - "learning_rate": 3.1425312744434668e-06, - "loss": 0.79617989, - "num_input_tokens_seen": 116912300, - "step": 5443, - "time_per_iteration": 2.715407133102417 - }, - { - "auxiliary_loss_clip": 0.01109692, - "auxiliary_loss_mlp": 0.00773391, - "balance_loss_clip": 1.05144906, - "balance_loss_mlp": 1.00102162, - "epoch": 0.32731098752442506, - "flos": 11801360413440.0, - "grad_norm": 2.595112113661144, - "language_loss": 0.81782895, - "learning_rate": 3.142211596174343e-06, - "loss": 0.83665979, - "num_input_tokens_seen": 116929425, - "step": 5444, - "time_per_iteration": 2.7483620643615723 - }, - { - "auxiliary_loss_clip": 0.0109768, - "auxiliary_loss_mlp": 0.01042359, - "balance_loss_clip": 1.05127132, - "balance_loss_mlp": 1.02671897, - "epoch": 0.327371110777093, - "flos": 21027780506880.0, - "grad_norm": 2.0540771727134786, - "language_loss": 0.59668452, - "learning_rate": 3.1418918745924423e-06, - "loss": 0.61808491, - "num_input_tokens_seen": 116948255, - "step": 5445, - "time_per_iteration": 2.7937049865722656 - }, - { - "auxiliary_loss_clip": 0.01134371, - "auxiliary_loss_mlp": 0.01045479, - "balance_loss_clip": 1.05779314, - "balance_loss_mlp": 1.02935553, - "epoch": 0.327431234029761, - "flos": 19062102677760.0, - "grad_norm": 2.705344105300375, - "language_loss": 0.88343978, - "learning_rate": 3.1415721097098865e-06, - "loss": 0.90523833, - "num_input_tokens_seen": 116964905, - "step": 5446, - "time_per_iteration": 2.586451292037964 - }, - { - "auxiliary_loss_clip": 0.01135097, - "auxiliary_loss_mlp": 0.01041409, - "balance_loss_clip": 1.0612191, - "balance_loss_mlp": 1.02387285, - "epoch": 0.32749135728242895, - "flos": 25849219184640.0, - "grad_norm": 2.2697780368090883, - "language_loss": 0.79279661, - "learning_rate": 3.141252301538802e-06, - "loss": 0.81456167, - "num_input_tokens_seen": 116983650, - "step": 5447, - "time_per_iteration": 2.744072198867798 - }, - { - "auxiliary_loss_clip": 0.01107571, - "auxiliary_loss_mlp": 0.00773964, - "balance_loss_clip": 1.04747021, - "balance_loss_mlp": 1.00110793, - "epoch": 0.327551480535097, - "flos": 20120533764480.0, - "grad_norm": 1.8015667711206929, - "language_loss": 0.73182315, - "learning_rate": 3.1409324500913157e-06, - "loss": 0.75063848, - "num_input_tokens_seen": 117003265, - "step": 5448, - "time_per_iteration": 2.6825077533721924 - }, - { - "auxiliary_loss_clip": 0.01142648, - "auxiliary_loss_mlp": 0.01042295, - "balance_loss_clip": 1.05620432, - "balance_loss_mlp": 1.02694106, - "epoch": 0.32761160378776494, - "flos": 28803553931520.0, - "grad_norm": 1.4660761852129829, - "language_loss": 0.67103487, - "learning_rate": 3.1406125553795567e-06, - "loss": 0.69288433, - "num_input_tokens_seen": 117025370, - "step": 5449, - "time_per_iteration": 2.682499885559082 - }, - { - "auxiliary_loss_clip": 0.0110995, - "auxiliary_loss_mlp": 0.010411, - "balance_loss_clip": 1.0542469, - "balance_loss_mlp": 1.02627623, - "epoch": 0.3276717270404329, - "flos": 26937778803840.0, - "grad_norm": 3.4023702964270943, - "language_loss": 0.65110958, - "learning_rate": 3.1402926174156556e-06, - "loss": 0.67262006, - "num_input_tokens_seen": 117044350, - "step": 5450, - "time_per_iteration": 2.7582857608795166 - }, - { - "auxiliary_loss_clip": 0.0113136, - "auxiliary_loss_mlp": 0.01045713, - "balance_loss_clip": 1.05517817, - "balance_loss_mlp": 1.03021002, - "epoch": 0.32773185029310087, - "flos": 25338425829120.0, - "grad_norm": 1.5880234750249043, - "language_loss": 0.77630055, - "learning_rate": 3.1399726362117437e-06, - "loss": 0.79807132, - "num_input_tokens_seen": 117064450, - "step": 5451, - "time_per_iteration": 2.6543071269989014 - }, - { - "auxiliary_loss_clip": 0.01131184, - "auxiliary_loss_mlp": 0.01044056, - "balance_loss_clip": 1.05428064, - "balance_loss_mlp": 1.02809358, - "epoch": 0.32779197354576883, - "flos": 26391721271040.0, - "grad_norm": 1.913131066587778, - "language_loss": 0.70510584, - "learning_rate": 3.1396526117799555e-06, - "loss": 0.7268582, - "num_input_tokens_seen": 117083060, - "step": 5452, - "time_per_iteration": 2.6963608264923096 - }, - { - "auxiliary_loss_clip": 0.01112229, - "auxiliary_loss_mlp": 0.01036592, - "balance_loss_clip": 1.048841, - "balance_loss_mlp": 1.02223349, - "epoch": 0.3278520967984368, - "flos": 24899381890560.0, - "grad_norm": 2.6287596248848013, - "language_loss": 0.78730083, - "learning_rate": 3.1393325441324256e-06, - "loss": 0.80878907, - "num_input_tokens_seen": 117101860, - "step": 5453, - "time_per_iteration": 4.197263479232788 - }, - { - "auxiliary_loss_clip": 0.01130585, - "auxiliary_loss_mlp": 0.01035536, - "balance_loss_clip": 1.0526675, - "balance_loss_mlp": 1.02026486, - "epoch": 0.32791222005110476, - "flos": 29752996176000.0, - "grad_norm": 5.184832608635382, - "language_loss": 0.75771177, - "learning_rate": 3.1390124332812916e-06, - "loss": 0.77937293, - "num_input_tokens_seen": 117123100, - "step": 5454, - "time_per_iteration": 2.7643721103668213 - }, - { - "auxiliary_loss_clip": 0.01070253, - "auxiliary_loss_mlp": 0.01047697, - "balance_loss_clip": 1.03818846, - "balance_loss_mlp": 1.03363037, - "epoch": 0.32797234330377273, - "flos": 16508064072960.0, - "grad_norm": 2.8017119157252703, - "language_loss": 0.76891404, - "learning_rate": 3.1386922792386924e-06, - "loss": 0.79009354, - "num_input_tokens_seen": 117140515, - "step": 5455, - "time_per_iteration": 4.402290105819702 - }, - { - "auxiliary_loss_clip": 0.01131084, - "auxiliary_loss_mlp": 0.01042542, - "balance_loss_clip": 1.05241477, - "balance_loss_mlp": 1.02624655, - "epoch": 0.3280324665564407, - "flos": 26577918397440.0, - "grad_norm": 1.6426536912861747, - "language_loss": 0.74021912, - "learning_rate": 3.138372082016768e-06, - "loss": 0.76195538, - "num_input_tokens_seen": 117161485, - "step": 5456, - "time_per_iteration": 2.821965217590332 - }, - { - "auxiliary_loss_clip": 0.01140062, - "auxiliary_loss_mlp": 0.01047408, - "balance_loss_clip": 1.05334985, - "balance_loss_mlp": 1.03212523, - "epoch": 0.32809258980910866, - "flos": 22929969047040.0, - "grad_norm": 1.7597936582740754, - "language_loss": 0.78038168, - "learning_rate": 3.1380518416276596e-06, - "loss": 0.80225635, - "num_input_tokens_seen": 117181870, - "step": 5457, - "time_per_iteration": 2.703756093978882 - }, - { - "auxiliary_loss_clip": 0.01104649, - "auxiliary_loss_mlp": 0.01042509, - "balance_loss_clip": 1.04943132, - "balance_loss_mlp": 1.02752471, - "epoch": 0.3281527130617766, - "flos": 22783848520320.0, - "grad_norm": 5.102364490559591, - "language_loss": 0.79493362, - "learning_rate": 3.1377315580835115e-06, - "loss": 0.81640518, - "num_input_tokens_seen": 117201380, - "step": 5458, - "time_per_iteration": 4.307415962219238 - }, - { - "auxiliary_loss_clip": 0.01124323, - "auxiliary_loss_mlp": 0.01039216, - "balance_loss_clip": 1.05467916, - "balance_loss_mlp": 1.02362311, - "epoch": 0.3282128363144446, - "flos": 21250678354560.0, - "grad_norm": 1.6160363150508943, - "language_loss": 0.73029429, - "learning_rate": 3.1374112313964686e-06, - "loss": 0.7519297, - "num_input_tokens_seen": 117221040, - "step": 5459, - "time_per_iteration": 2.678131341934204 - }, - { - "auxiliary_loss_clip": 0.01118921, - "auxiliary_loss_mlp": 0.01041188, - "balance_loss_clip": 1.05190325, - "balance_loss_mlp": 1.02591753, - "epoch": 0.32827295956711255, - "flos": 30843064166400.0, - "grad_norm": 2.011905165126453, - "language_loss": 0.84018445, - "learning_rate": 3.1370908615786783e-06, - "loss": 0.86178553, - "num_input_tokens_seen": 117241395, - "step": 5460, - "time_per_iteration": 5.767046213150024 - }, - { - "auxiliary_loss_clip": 0.01138817, - "auxiliary_loss_mlp": 0.01035204, - "balance_loss_clip": 1.05174541, - "balance_loss_mlp": 1.02029121, - "epoch": 0.3283330828197806, - "flos": 25915006944000.0, - "grad_norm": 1.9959413021835115, - "language_loss": 0.76553524, - "learning_rate": 3.136770448642288e-06, - "loss": 0.78727543, - "num_input_tokens_seen": 117259340, - "step": 5461, - "time_per_iteration": 2.673659086227417 - }, - { - "auxiliary_loss_clip": 0.01121607, - "auxiliary_loss_mlp": 0.01042243, - "balance_loss_clip": 1.05065536, - "balance_loss_mlp": 1.02489805, - "epoch": 0.32839320607244854, - "flos": 38582065042560.0, - "grad_norm": 2.148112131584704, - "language_loss": 0.62898672, - "learning_rate": 3.1364499925994484e-06, - "loss": 0.65062523, - "num_input_tokens_seen": 117282375, - "step": 5462, - "time_per_iteration": 2.789217472076416 - }, - { - "auxiliary_loss_clip": 0.01136727, - "auxiliary_loss_mlp": 0.0077334, - "balance_loss_clip": 1.05279326, - "balance_loss_mlp": 1.00113511, - "epoch": 0.3284533293251165, - "flos": 26650888876800.0, - "grad_norm": 2.4415591889879056, - "language_loss": 0.7805075, - "learning_rate": 3.1361294934623115e-06, - "loss": 0.79960817, - "num_input_tokens_seen": 117303830, - "step": 5463, - "time_per_iteration": 2.6797146797180176 - }, - { - "auxiliary_loss_clip": 0.01109773, - "auxiliary_loss_mlp": 0.01040868, - "balance_loss_clip": 1.05036163, - "balance_loss_mlp": 1.02523983, - "epoch": 0.32851345257778447, - "flos": 15304158904320.0, - "grad_norm": 1.8407799027990368, - "language_loss": 0.70095646, - "learning_rate": 3.1358089512430303e-06, - "loss": 0.72246289, - "num_input_tokens_seen": 117320665, - "step": 5464, - "time_per_iteration": 2.7286477088928223 - }, - { - "auxiliary_loss_clip": 0.01130175, - "auxiliary_loss_mlp": 0.01038523, - "balance_loss_clip": 1.05659711, - "balance_loss_mlp": 1.02327609, - "epoch": 0.32857357583045244, - "flos": 23513732881920.0, - "grad_norm": 1.976060055551124, - "language_loss": 0.72474623, - "learning_rate": 3.1354883659537594e-06, - "loss": 0.74643314, - "num_input_tokens_seen": 117339795, - "step": 5465, - "time_per_iteration": 2.6666364669799805 - }, - { - "auxiliary_loss_clip": 0.01113042, - "auxiliary_loss_mlp": 0.01049431, - "balance_loss_clip": 1.05094242, - "balance_loss_mlp": 1.03334332, - "epoch": 0.3286336990831204, - "flos": 20995209849600.0, - "grad_norm": 1.953344541818443, - "language_loss": 0.832214, - "learning_rate": 3.1351677376066567e-06, - "loss": 0.8538388, - "num_input_tokens_seen": 117359525, - "step": 5466, - "time_per_iteration": 2.7432901859283447 - }, - { - "auxiliary_loss_clip": 0.01113455, - "auxiliary_loss_mlp": 0.01041029, - "balance_loss_clip": 1.04729056, - "balance_loss_mlp": 1.02577055, - "epoch": 0.32869382233578837, - "flos": 23658811914240.0, - "grad_norm": 1.7893036060845653, - "language_loss": 0.79221183, - "learning_rate": 3.134847066213879e-06, - "loss": 0.8137567, - "num_input_tokens_seen": 117380320, - "step": 5467, - "time_per_iteration": 2.701490879058838 - }, - { - "auxiliary_loss_clip": 0.0111678, - "auxiliary_loss_mlp": 0.0103291, - "balance_loss_clip": 1.05045676, - "balance_loss_mlp": 1.01759124, - "epoch": 0.32875394558845633, - "flos": 25336522408320.0, - "grad_norm": 1.5411251384559923, - "language_loss": 0.74338531, - "learning_rate": 3.134526351787587e-06, - "loss": 0.76488233, - "num_input_tokens_seen": 117400695, - "step": 5468, - "time_per_iteration": 2.6820507049560547 - }, - { - "auxiliary_loss_clip": 0.0111552, - "auxiliary_loss_mlp": 0.01042549, - "balance_loss_clip": 1.05065966, - "balance_loss_mlp": 1.02476263, - "epoch": 0.3288140688411243, - "flos": 14903108576640.0, - "grad_norm": 1.9818058078172698, - "language_loss": 0.7869612, - "learning_rate": 3.134205594339942e-06, - "loss": 0.80854189, - "num_input_tokens_seen": 117418800, - "step": 5469, - "time_per_iteration": 2.6281590461730957 - }, - { - "auxiliary_loss_clip": 0.01104752, - "auxiliary_loss_mlp": 0.01033111, - "balance_loss_clip": 1.04863441, - "balance_loss_mlp": 1.01838851, - "epoch": 0.32887419209379226, - "flos": 18551345235840.0, - "grad_norm": 1.9383846382167882, - "language_loss": 0.81744516, - "learning_rate": 3.133884793883107e-06, - "loss": 0.8388238, - "num_input_tokens_seen": 117438220, - "step": 5470, - "time_per_iteration": 2.8643784523010254 - }, - { - "auxiliary_loss_clip": 0.01140563, - "auxiliary_loss_mlp": 0.01045939, - "balance_loss_clip": 1.05232358, - "balance_loss_mlp": 1.03021562, - "epoch": 0.3289343153464602, - "flos": 48105610439040.0, - "grad_norm": 2.0914054865715768, - "language_loss": 0.67699564, - "learning_rate": 3.1335639504292478e-06, - "loss": 0.69886065, - "num_input_tokens_seen": 117462560, - "step": 5471, - "time_per_iteration": 2.851717948913574 - }, - { - "auxiliary_loss_clip": 0.01148136, - "auxiliary_loss_mlp": 0.01043561, - "balance_loss_clip": 1.05701339, - "balance_loss_mlp": 1.02594161, - "epoch": 0.3289944385991282, - "flos": 27600295207680.0, - "grad_norm": 2.097557855250848, - "language_loss": 0.64926231, - "learning_rate": 3.1332430639905288e-06, - "loss": 0.67117929, - "num_input_tokens_seen": 117483665, - "step": 5472, - "time_per_iteration": 2.6586108207702637 - }, - { - "auxiliary_loss_clip": 0.01128351, - "auxiliary_loss_mlp": 0.01045454, - "balance_loss_clip": 1.05333138, - "balance_loss_mlp": 1.02850199, - "epoch": 0.32905456185179616, - "flos": 20120318282880.0, - "grad_norm": 3.4668570750263155, - "language_loss": 0.88257217, - "learning_rate": 3.13292213457912e-06, - "loss": 0.90431023, - "num_input_tokens_seen": 117503565, - "step": 5473, - "time_per_iteration": 2.6792144775390625 - }, - { - "auxiliary_loss_clip": 0.01103479, - "auxiliary_loss_mlp": 0.01038881, - "balance_loss_clip": 1.04814398, - "balance_loss_mlp": 1.02123809, - "epoch": 0.3291146851044642, - "flos": 23180230080000.0, - "grad_norm": 1.8710184691373295, - "language_loss": 0.78193343, - "learning_rate": 3.1326011622071903e-06, - "loss": 0.80335701, - "num_input_tokens_seen": 117521460, - "step": 5474, - "time_per_iteration": 2.739057779312134 - }, - { - "auxiliary_loss_clip": 0.01038022, - "auxiliary_loss_mlp": 0.01029239, - "balance_loss_clip": 1.02788568, - "balance_loss_mlp": 1.02673554, - "epoch": 0.32917480835713214, - "flos": 67621912594560.0, - "grad_norm": 0.8109823017171686, - "language_loss": 0.6018818, - "learning_rate": 3.132280146886911e-06, - "loss": 0.62255442, - "num_input_tokens_seen": 117580550, - "step": 5475, - "time_per_iteration": 3.196384906768799 - }, - { - "auxiliary_loss_clip": 0.01091837, - "auxiliary_loss_mlp": 0.01057279, - "balance_loss_clip": 1.04454446, - "balance_loss_mlp": 1.03726411, - "epoch": 0.3292349316098001, - "flos": 27964537073280.0, - "grad_norm": 4.962450920257536, - "language_loss": 0.76504046, - "learning_rate": 3.131959088630455e-06, - "loss": 0.78653169, - "num_input_tokens_seen": 117600645, - "step": 5476, - "time_per_iteration": 2.7369961738586426 - }, - { - "auxiliary_loss_clip": 0.01100541, - "auxiliary_loss_mlp": 0.01044762, - "balance_loss_clip": 1.04824603, - "balance_loss_mlp": 1.02946782, - "epoch": 0.3292950548624681, - "flos": 20263673462400.0, - "grad_norm": 2.5019671735892937, - "language_loss": 0.74746907, - "learning_rate": 3.131637987449997e-06, - "loss": 0.76892209, - "num_input_tokens_seen": 117618880, - "step": 5477, - "time_per_iteration": 2.814467430114746 - }, - { - "auxiliary_loss_clip": 0.01135692, - "auxiliary_loss_mlp": 0.01042652, - "balance_loss_clip": 1.05235898, - "balance_loss_mlp": 1.02838814, - "epoch": 0.32935517811513604, - "flos": 20812999132800.0, - "grad_norm": 3.9065130557825234, - "language_loss": 0.75539625, - "learning_rate": 3.131316843357713e-06, - "loss": 0.77717972, - "num_input_tokens_seen": 117636445, - "step": 5478, - "time_per_iteration": 2.730445384979248 - }, - { - "auxiliary_loss_clip": 0.0112467, - "auxiliary_loss_mlp": 0.01042056, - "balance_loss_clip": 1.04921985, - "balance_loss_mlp": 1.02750051, - "epoch": 0.329415301367804, - "flos": 18441853603200.0, - "grad_norm": 2.855777191383278, - "language_loss": 0.80462509, - "learning_rate": 3.1309956563657807e-06, - "loss": 0.82629234, - "num_input_tokens_seen": 117653105, - "step": 5479, - "time_per_iteration": 2.6443796157836914 - }, - { - "auxiliary_loss_clip": 0.01037863, - "auxiliary_loss_mlp": 0.01000413, - "balance_loss_clip": 1.02671266, - "balance_loss_mlp": 0.99823159, - "epoch": 0.32947542462047197, - "flos": 66323024887680.0, - "grad_norm": 0.7530723778079996, - "language_loss": 0.56519568, - "learning_rate": 3.1306744264863804e-06, - "loss": 0.58557844, - "num_input_tokens_seen": 117719225, - "step": 5480, - "time_per_iteration": 3.213240146636963 - }, - { - "auxiliary_loss_clip": 0.01124019, - "auxiliary_loss_mlp": 0.00774449, - "balance_loss_clip": 1.04898739, - "balance_loss_mlp": 1.00116146, - "epoch": 0.32953554787313993, - "flos": 23221599569280.0, - "grad_norm": 1.7923941739082951, - "language_loss": 0.77444887, - "learning_rate": 3.1303531537316915e-06, - "loss": 0.79343355, - "num_input_tokens_seen": 117738725, - "step": 5481, - "time_per_iteration": 2.6905598640441895 - }, - { - "auxiliary_loss_clip": 0.01119194, - "auxiliary_loss_mlp": 0.01050738, - "balance_loss_clip": 1.05167091, - "balance_loss_mlp": 1.03557408, - "epoch": 0.3295956711258079, - "flos": 27009492307200.0, - "grad_norm": 1.5874205685036498, - "language_loss": 0.78222132, - "learning_rate": 3.130031838113899e-06, - "loss": 0.80392069, - "num_input_tokens_seen": 117757765, - "step": 5482, - "time_per_iteration": 2.765235424041748 - }, - { - "auxiliary_loss_clip": 0.01130055, - "auxiliary_loss_mlp": 0.01052605, - "balance_loss_clip": 1.05121589, - "balance_loss_mlp": 1.03674388, - "epoch": 0.32965579437847586, - "flos": 19171702051200.0, - "grad_norm": 2.9405789595849385, - "language_loss": 0.73674762, - "learning_rate": 3.129710479645185e-06, - "loss": 0.75857425, - "num_input_tokens_seen": 117776810, - "step": 5483, - "time_per_iteration": 2.624969005584717 - }, - { - "auxiliary_loss_clip": 0.01122896, - "auxiliary_loss_mlp": 0.01054419, - "balance_loss_clip": 1.05069685, - "balance_loss_mlp": 1.03886831, - "epoch": 0.32971591763114383, - "flos": 30482521401600.0, - "grad_norm": 1.8706124903497952, - "language_loss": 0.75649381, - "learning_rate": 3.1293890783377366e-06, - "loss": 0.77826691, - "num_input_tokens_seen": 117797730, - "step": 5484, - "time_per_iteration": 2.7650864124298096 - }, - { - "auxiliary_loss_clip": 0.01141223, - "auxiliary_loss_mlp": 0.01053478, - "balance_loss_clip": 1.05515027, - "balance_loss_mlp": 1.03807664, - "epoch": 0.3297760408838118, - "flos": 16289583598080.0, - "grad_norm": 72.4202789440072, - "language_loss": 0.71719176, - "learning_rate": 3.129067634203742e-06, - "loss": 0.73913872, - "num_input_tokens_seen": 117815365, - "step": 5485, - "time_per_iteration": 2.603039264678955 - }, - { - "auxiliary_loss_clip": 0.01081054, - "auxiliary_loss_mlp": 0.01052335, - "balance_loss_clip": 1.04921818, - "balance_loss_mlp": 1.03822041, - "epoch": 0.32983616413647976, - "flos": 29530924341120.0, - "grad_norm": 1.6108204077161399, - "language_loss": 0.80275488, - "learning_rate": 3.128746147255388e-06, - "loss": 0.82408869, - "num_input_tokens_seen": 117836095, - "step": 5486, - "time_per_iteration": 2.8364202976226807 - }, - { - "auxiliary_loss_clip": 0.01106188, - "auxiliary_loss_mlp": 0.01053006, - "balance_loss_clip": 1.04739475, - "balance_loss_mlp": 1.03650784, - "epoch": 0.3298962873891478, - "flos": 20631398947200.0, - "grad_norm": 2.173231613182175, - "language_loss": 0.84374005, - "learning_rate": 3.1284246175048683e-06, - "loss": 0.86533195, - "num_input_tokens_seen": 117854655, - "step": 5487, - "time_per_iteration": 2.7796428203582764 - }, - { - "auxiliary_loss_clip": 0.01087509, - "auxiliary_loss_mlp": 0.01055173, - "balance_loss_clip": 1.04317069, - "balance_loss_mlp": 1.0379355, - "epoch": 0.32995641064181574, - "flos": 14976007228800.0, - "grad_norm": 2.633362688401157, - "language_loss": 0.74667275, - "learning_rate": 3.1281030449643735e-06, - "loss": 0.76809955, - "num_input_tokens_seen": 117873300, - "step": 5488, - "time_per_iteration": 2.7173233032226562 - }, - { - "auxiliary_loss_clip": 0.01143363, - "auxiliary_loss_mlp": 0.01051325, - "balance_loss_clip": 1.05679107, - "balance_loss_mlp": 1.03563726, - "epoch": 0.3300165338944837, - "flos": 18661447399680.0, - "grad_norm": 2.518818086418956, - "language_loss": 0.71718305, - "learning_rate": 3.127781429646098e-06, - "loss": 0.7391299, - "num_input_tokens_seen": 117891540, - "step": 5489, - "time_per_iteration": 2.6647188663482666 - }, - { - "auxiliary_loss_clip": 0.01137372, - "auxiliary_loss_mlp": 0.01044261, - "balance_loss_clip": 1.05154073, - "balance_loss_mlp": 1.02973497, - "epoch": 0.3300766571471517, - "flos": 25583730785280.0, - "grad_norm": 6.067113992727344, - "language_loss": 0.88346136, - "learning_rate": 3.127459771562238e-06, - "loss": 0.90527773, - "num_input_tokens_seen": 117907690, - "step": 5490, - "time_per_iteration": 2.594193696975708 - }, - { - "auxiliary_loss_clip": 0.01127009, - "auxiliary_loss_mlp": 0.0103878, - "balance_loss_clip": 1.05081856, - "balance_loss_mlp": 1.02396214, - "epoch": 0.33013678039981964, - "flos": 11363501623680.0, - "grad_norm": 5.091693260582257, - "language_loss": 0.83396459, - "learning_rate": 3.1271380707249907e-06, - "loss": 0.85562241, - "num_input_tokens_seen": 117925640, - "step": 5491, - "time_per_iteration": 2.6124439239501953 - }, - { - "auxiliary_loss_clip": 0.01111643, - "auxiliary_loss_mlp": 0.01048849, - "balance_loss_clip": 1.05066538, - "balance_loss_mlp": 1.03372788, - "epoch": 0.3301969036524876, - "flos": 24821203939200.0, - "grad_norm": 1.9936853829327341, - "language_loss": 0.77453989, - "learning_rate": 3.126816327146554e-06, - "loss": 0.79614484, - "num_input_tokens_seen": 117944525, - "step": 5492, - "time_per_iteration": 4.26681923866272 - }, - { - "auxiliary_loss_clip": 0.01144384, - "auxiliary_loss_mlp": 0.01046422, - "balance_loss_clip": 1.05559993, - "balance_loss_mlp": 1.02987576, - "epoch": 0.33025702690515557, - "flos": 15961144613760.0, - "grad_norm": 2.586093125227841, - "language_loss": 0.74295127, - "learning_rate": 3.12649454083913e-06, - "loss": 0.76485932, - "num_input_tokens_seen": 117962515, - "step": 5493, - "time_per_iteration": 2.572657585144043 - }, - { - "auxiliary_loss_clip": 0.01007495, - "auxiliary_loss_mlp": 0.01051184, - "balance_loss_clip": 1.0238874, - "balance_loss_mlp": 1.0491215, - "epoch": 0.33031715015782354, - "flos": 59416755989760.0, - "grad_norm": 0.7952972655943692, - "language_loss": 0.53981996, - "learning_rate": 3.12617271181492e-06, - "loss": 0.5604068, - "num_input_tokens_seen": 118018780, - "step": 5494, - "time_per_iteration": 3.2123944759368896 - }, - { - "auxiliary_loss_clip": 0.01114646, - "auxiliary_loss_mlp": 0.0103786, - "balance_loss_clip": 1.04879999, - "balance_loss_mlp": 1.02241075, - "epoch": 0.3303772734104915, - "flos": 23184360144000.0, - "grad_norm": 1.4867113292626302, - "language_loss": 0.87236047, - "learning_rate": 3.1258508400861276e-06, - "loss": 0.89388549, - "num_input_tokens_seen": 118038610, - "step": 5495, - "time_per_iteration": 4.180245637893677 - }, - { - "auxiliary_loss_clip": 0.01104415, - "auxiliary_loss_mlp": 0.0104461, - "balance_loss_clip": 1.0520072, - "balance_loss_mlp": 1.02813482, - "epoch": 0.33043739666315947, - "flos": 33071896010880.0, - "grad_norm": 2.0634169818588157, - "language_loss": 0.73468459, - "learning_rate": 3.1255289256649587e-06, - "loss": 0.7561748, - "num_input_tokens_seen": 118055905, - "step": 5496, - "time_per_iteration": 2.816849946975708 - }, - { - "auxiliary_loss_clip": 0.01107244, - "auxiliary_loss_mlp": 0.01039897, - "balance_loss_clip": 1.04852057, - "balance_loss_mlp": 1.02469766, - "epoch": 0.33049751991582743, - "flos": 24895431394560.0, - "grad_norm": 2.430684839051296, - "language_loss": 0.72464252, - "learning_rate": 3.1252069685636196e-06, - "loss": 0.74611384, - "num_input_tokens_seen": 118073695, - "step": 5497, - "time_per_iteration": 4.314718961715698 - }, - { - "auxiliary_loss_clip": 0.01111966, - "auxiliary_loss_mlp": 0.01038015, - "balance_loss_clip": 1.05051875, - "balance_loss_mlp": 1.02313733, - "epoch": 0.3305576431684954, - "flos": 29460575554560.0, - "grad_norm": 1.9082848646705384, - "language_loss": 0.804672, - "learning_rate": 3.124884968794321e-06, - "loss": 0.82617176, - "num_input_tokens_seen": 118094030, - "step": 5498, - "time_per_iteration": 2.831347942352295 - }, - { - "auxiliary_loss_clip": 0.01121599, - "auxiliary_loss_mlp": 0.01041664, - "balance_loss_clip": 1.04826963, - "balance_loss_mlp": 1.02467656, - "epoch": 0.33061776642116336, - "flos": 22632305040000.0, - "grad_norm": 2.0593804502858823, - "language_loss": 0.75822198, - "learning_rate": 3.12456292636927e-06, - "loss": 0.77985466, - "num_input_tokens_seen": 118111665, - "step": 5499, - "time_per_iteration": 4.880478858947754 - }, - { - "auxiliary_loss_clip": 0.01119724, - "auxiliary_loss_mlp": 0.01035684, - "balance_loss_clip": 1.05307007, - "balance_loss_mlp": 1.02016318, - "epoch": 0.3306778896738313, - "flos": 25776320532480.0, - "grad_norm": 2.088317081581358, - "language_loss": 0.78981787, - "learning_rate": 3.124240841300681e-06, - "loss": 0.81137192, - "num_input_tokens_seen": 118132435, - "step": 5500, - "time_per_iteration": 2.7601048946380615 - }, - { - "auxiliary_loss_clip": 0.01131843, - "auxiliary_loss_mlp": 0.0103364, - "balance_loss_clip": 1.0540576, - "balance_loss_mlp": 1.01751041, - "epoch": 0.33073801292649935, - "flos": 36940552479360.0, - "grad_norm": 8.499573931934933, - "language_loss": 0.6655246, - "learning_rate": 3.1239187136007665e-06, - "loss": 0.68717939, - "num_input_tokens_seen": 118155255, - "step": 5501, - "time_per_iteration": 2.7880568504333496 - }, - { - "auxiliary_loss_clip": 0.01130024, - "auxiliary_loss_mlp": 0.01044854, - "balance_loss_clip": 1.05215073, - "balance_loss_mlp": 1.02766418, - "epoch": 0.3307981361791673, - "flos": 12967738848000.0, - "grad_norm": 2.417495150038941, - "language_loss": 0.77221018, - "learning_rate": 3.1235965432817417e-06, - "loss": 0.79395902, - "num_input_tokens_seen": 118169865, - "step": 5502, - "time_per_iteration": 2.621891736984253 - }, - { - "auxiliary_loss_clip": 0.01120279, - "auxiliary_loss_mlp": 0.01041312, - "balance_loss_clip": 1.05816746, - "balance_loss_mlp": 1.02508807, - "epoch": 0.3308582594318353, - "flos": 25374372364800.0, - "grad_norm": 1.6870244228079128, - "language_loss": 0.72882998, - "learning_rate": 3.123274330355824e-06, - "loss": 0.75044584, - "num_input_tokens_seen": 118190760, - "step": 5503, - "time_per_iteration": 2.731391191482544 - }, - { - "auxiliary_loss_clip": 0.01107126, - "auxiliary_loss_mlp": 0.01042991, - "balance_loss_clip": 1.04483843, - "balance_loss_mlp": 1.02543116, - "epoch": 0.33091838268450324, - "flos": 26468570419200.0, - "grad_norm": 1.6983408951831631, - "language_loss": 0.75341403, - "learning_rate": 3.12295207483523e-06, - "loss": 0.77491516, - "num_input_tokens_seen": 118213620, - "step": 5504, - "time_per_iteration": 2.734440565109253 - }, - { - "auxiliary_loss_clip": 0.01116159, - "auxiliary_loss_mlp": 0.01038384, - "balance_loss_clip": 1.05076432, - "balance_loss_mlp": 1.02267826, - "epoch": 0.3309785059371712, - "flos": 24971167221120.0, - "grad_norm": 1.5921827086772462, - "language_loss": 0.69537103, - "learning_rate": 3.1226297767321816e-06, - "loss": 0.71691644, - "num_input_tokens_seen": 118235010, - "step": 5505, - "time_per_iteration": 2.7224769592285156 - }, - { - "auxiliary_loss_clip": 0.0112242, - "auxiliary_loss_mlp": 0.01050735, - "balance_loss_clip": 1.04997373, - "balance_loss_mlp": 1.03454661, - "epoch": 0.3310386291898392, - "flos": 20446710192000.0, - "grad_norm": 1.6566524839278514, - "language_loss": 0.81701219, - "learning_rate": 3.122307436058899e-06, - "loss": 0.83874375, - "num_input_tokens_seen": 118255820, - "step": 5506, - "time_per_iteration": 2.6608633995056152 - }, - { - "auxiliary_loss_clip": 0.01126393, - "auxiliary_loss_mlp": 0.01036938, - "balance_loss_clip": 1.05129898, - "balance_loss_mlp": 1.02032042, - "epoch": 0.33109875244250714, - "flos": 23182672204800.0, - "grad_norm": 2.1165262291534663, - "language_loss": 0.7961843, - "learning_rate": 3.121985052827606e-06, - "loss": 0.81781757, - "num_input_tokens_seen": 118274160, - "step": 5507, - "time_per_iteration": 2.6279826164245605 - }, - { - "auxiliary_loss_clip": 0.01115407, - "auxiliary_loss_mlp": 0.0104488, - "balance_loss_clip": 1.04948068, - "balance_loss_mlp": 1.02901316, - "epoch": 0.3311588756951751, - "flos": 24168384207360.0, - "grad_norm": 1.8252383106416188, - "language_loss": 0.71632457, - "learning_rate": 3.1216626270505274e-06, - "loss": 0.73792744, - "num_input_tokens_seen": 118294385, - "step": 5508, - "time_per_iteration": 2.666274070739746 - }, - { - "auxiliary_loss_clip": 0.01105407, - "auxiliary_loss_mlp": 0.01035431, - "balance_loss_clip": 1.04841506, - "balance_loss_mlp": 1.02048194, - "epoch": 0.33121899894784307, - "flos": 28145742209280.0, - "grad_norm": 2.0681023318662053, - "language_loss": 0.71877921, - "learning_rate": 3.12134015873989e-06, - "loss": 0.74018759, - "num_input_tokens_seen": 118313105, - "step": 5509, - "time_per_iteration": 2.9805185794830322 - }, - { - "auxiliary_loss_clip": 0.01123913, - "auxiliary_loss_mlp": 0.01035754, - "balance_loss_clip": 1.05431342, - "balance_loss_mlp": 1.02019095, - "epoch": 0.33127912220051103, - "flos": 29567660976000.0, - "grad_norm": 1.690455092128618, - "language_loss": 0.72850806, - "learning_rate": 3.121017647907921e-06, - "loss": 0.75010473, - "num_input_tokens_seen": 118335250, - "step": 5510, - "time_per_iteration": 2.7012648582458496 - }, - { - "auxiliary_loss_clip": 0.01097101, - "auxiliary_loss_mlp": 0.01036395, - "balance_loss_clip": 1.04754674, - "balance_loss_mlp": 1.02099323, - "epoch": 0.331339245453179, - "flos": 14428836374400.0, - "grad_norm": 2.529653220973509, - "language_loss": 0.87842733, - "learning_rate": 3.1206950945668508e-06, - "loss": 0.89976227, - "num_input_tokens_seen": 118351470, - "step": 5511, - "time_per_iteration": 2.699303150177002 - }, - { - "auxiliary_loss_clip": 0.01077351, - "auxiliary_loss_mlp": 0.0103825, - "balance_loss_clip": 1.04569423, - "balance_loss_mlp": 1.0232892, - "epoch": 0.33139936870584696, - "flos": 20887118847360.0, - "grad_norm": 2.0800696693803404, - "language_loss": 0.73301774, - "learning_rate": 3.12037249872891e-06, - "loss": 0.7541737, - "num_input_tokens_seen": 118370970, - "step": 5512, - "time_per_iteration": 2.773071765899658 - }, - { - "auxiliary_loss_clip": 0.01092657, - "auxiliary_loss_mlp": 0.01037164, - "balance_loss_clip": 1.04608238, - "balance_loss_mlp": 1.02226281, - "epoch": 0.33145949195851493, - "flos": 36284356869120.0, - "grad_norm": 28.686212163123738, - "language_loss": 0.7188127, - "learning_rate": 3.1200498604063317e-06, - "loss": 0.74011087, - "num_input_tokens_seen": 118393125, - "step": 5513, - "time_per_iteration": 2.832712411880493 - }, - { - "auxiliary_loss_clip": 0.0110331, - "auxiliary_loss_mlp": 0.01037016, - "balance_loss_clip": 1.0480994, - "balance_loss_mlp": 1.02052951, - "epoch": 0.33151961521118295, - "flos": 14279735018880.0, - "grad_norm": 1.9100766123367274, - "language_loss": 0.68260789, - "learning_rate": 3.1197271796113507e-06, - "loss": 0.70401114, - "num_input_tokens_seen": 118410860, - "step": 5514, - "time_per_iteration": 2.62347674369812 - }, - { - "auxiliary_loss_clip": 0.01111479, - "auxiliary_loss_mlp": 0.01042546, - "balance_loss_clip": 1.04936767, - "balance_loss_mlp": 1.02481997, - "epoch": 0.3315797384638509, - "flos": 20774323163520.0, - "grad_norm": 1.9179680687741931, - "language_loss": 0.65994096, - "learning_rate": 3.1194044563562026e-06, - "loss": 0.68148118, - "num_input_tokens_seen": 118429570, - "step": 5515, - "time_per_iteration": 2.6913952827453613 - }, - { - "auxiliary_loss_clip": 0.01121539, - "auxiliary_loss_mlp": 0.01039988, - "balance_loss_clip": 1.04903245, - "balance_loss_mlp": 1.02393019, - "epoch": 0.3316398617165189, - "flos": 24679464871680.0, - "grad_norm": 1.8088538037879305, - "language_loss": 0.69273043, - "learning_rate": 3.1190816906531257e-06, - "loss": 0.71434575, - "num_input_tokens_seen": 118450285, - "step": 5516, - "time_per_iteration": 2.6469173431396484 - }, - { - "auxiliary_loss_clip": 0.011287, - "auxiliary_loss_mlp": 0.01039737, - "balance_loss_clip": 1.05089724, - "balance_loss_mlp": 1.02339315, - "epoch": 0.33169998496918685, - "flos": 18587974129920.0, - "grad_norm": 3.871010712989623, - "language_loss": 0.79914033, - "learning_rate": 3.118758882514359e-06, - "loss": 0.82082474, - "num_input_tokens_seen": 118468270, - "step": 5517, - "time_per_iteration": 2.6387667655944824 - }, - { - "auxiliary_loss_clip": 0.01113973, - "auxiliary_loss_mlp": 0.01040442, - "balance_loss_clip": 1.04587924, - "balance_loss_mlp": 1.02412271, - "epoch": 0.3317601082218548, - "flos": 20193647898240.0, - "grad_norm": 1.7856922866156533, - "language_loss": 0.74043357, - "learning_rate": 3.118436031952143e-06, - "loss": 0.76197767, - "num_input_tokens_seen": 118486615, - "step": 5518, - "time_per_iteration": 2.6136653423309326 - }, - { - "auxiliary_loss_clip": 0.01035845, - "auxiliary_loss_mlp": 0.0100663, - "balance_loss_clip": 1.02549803, - "balance_loss_mlp": 1.00447261, - "epoch": 0.3318202314745228, - "flos": 68974703637120.0, - "grad_norm": 0.6165261089589951, - "language_loss": 0.54330659, - "learning_rate": 3.1181131389787206e-06, - "loss": 0.56373143, - "num_input_tokens_seen": 118553580, - "step": 5519, - "time_per_iteration": 3.3124027252197266 - }, - { - "auxiliary_loss_clip": 0.01129225, - "auxiliary_loss_mlp": 0.01042237, - "balance_loss_clip": 1.05353975, - "balance_loss_mlp": 1.02483273, - "epoch": 0.33188035472719074, - "flos": 21500113374720.0, - "grad_norm": 2.4445902922344342, - "language_loss": 0.78693354, - "learning_rate": 3.117790203606336e-06, - "loss": 0.80864823, - "num_input_tokens_seen": 118570280, - "step": 5520, - "time_per_iteration": 2.680413246154785 - }, - { - "auxiliary_loss_clip": 0.0111174, - "auxiliary_loss_mlp": 0.01034453, - "balance_loss_clip": 1.04981971, - "balance_loss_mlp": 1.01946807, - "epoch": 0.3319404779798587, - "flos": 28870490926080.0, - "grad_norm": 2.1205551001068645, - "language_loss": 0.76597643, - "learning_rate": 3.1174672258472344e-06, - "loss": 0.78743839, - "num_input_tokens_seen": 118590455, - "step": 5521, - "time_per_iteration": 2.7977516651153564 - }, - { - "auxiliary_loss_clip": 0.01128356, - "auxiliary_loss_mlp": 0.0104906, - "balance_loss_clip": 1.0500772, - "balance_loss_mlp": 1.0320611, - "epoch": 0.33200060123252667, - "flos": 23076915586560.0, - "grad_norm": 5.546447388917159, - "language_loss": 0.70404172, - "learning_rate": 3.117144205713664e-06, - "loss": 0.72581589, - "num_input_tokens_seen": 118609495, - "step": 5522, - "time_per_iteration": 2.7343335151672363 - }, - { - "auxiliary_loss_clip": 0.01112615, - "auxiliary_loss_mlp": 0.01039333, - "balance_loss_clip": 1.04872596, - "balance_loss_mlp": 1.02413392, - "epoch": 0.33206072448519464, - "flos": 21142479611520.0, - "grad_norm": 2.5717643633026133, - "language_loss": 0.7406925, - "learning_rate": 3.1168211432178735e-06, - "loss": 0.76221192, - "num_input_tokens_seen": 118628720, - "step": 5523, - "time_per_iteration": 2.6910529136657715 - }, - { - "auxiliary_loss_clip": 0.01108522, - "auxiliary_loss_mlp": 0.01039859, - "balance_loss_clip": 1.04778576, - "balance_loss_mlp": 1.02415287, - "epoch": 0.3321208477378626, - "flos": 13079097987840.0, - "grad_norm": 1.7441145490896364, - "language_loss": 0.82432246, - "learning_rate": 3.116498038372114e-06, - "loss": 0.8458063, - "num_input_tokens_seen": 118645955, - "step": 5524, - "time_per_iteration": 2.747279405593872 - }, - { - "auxiliary_loss_clip": 0.01094215, - "auxiliary_loss_mlp": 0.00773366, - "balance_loss_clip": 1.04763544, - "balance_loss_mlp": 1.000983, - "epoch": 0.33218097099053057, - "flos": 21215414177280.0, - "grad_norm": 1.8821817398487202, - "language_loss": 0.83040905, - "learning_rate": 3.116174891188636e-06, - "loss": 0.84908485, - "num_input_tokens_seen": 118665605, - "step": 5525, - "time_per_iteration": 2.7802865505218506 - }, - { - "auxiliary_loss_clip": 0.01051991, - "auxiliary_loss_mlp": 0.01009126, - "balance_loss_clip": 1.02309918, - "balance_loss_mlp": 1.00730228, - "epoch": 0.33224109424319853, - "flos": 64348979189760.0, - "grad_norm": 0.7599038914172829, - "language_loss": 0.52588648, - "learning_rate": 3.1158517016796945e-06, - "loss": 0.54649764, - "num_input_tokens_seen": 118728155, - "step": 5526, - "time_per_iteration": 3.1430625915527344 - }, - { - "auxiliary_loss_clip": 0.01100912, - "auxiliary_loss_mlp": 0.00775153, - "balance_loss_clip": 1.05235875, - "balance_loss_mlp": 1.00101066, - "epoch": 0.33230121749586655, - "flos": 17346003523200.0, - "grad_norm": 1.9434005693126541, - "language_loss": 0.77540255, - "learning_rate": 3.1155284698575445e-06, - "loss": 0.79416323, - "num_input_tokens_seen": 118743955, - "step": 5527, - "time_per_iteration": 2.779862403869629 - }, - { - "auxiliary_loss_clip": 0.01095485, - "auxiliary_loss_mlp": 0.01045396, - "balance_loss_clip": 1.05338502, - "balance_loss_mlp": 1.02997637, - "epoch": 0.3323613407485345, - "flos": 20997041443200.0, - "grad_norm": 2.507974613956182, - "language_loss": 0.7222321, - "learning_rate": 3.1152051957344434e-06, - "loss": 0.7436409, - "num_input_tokens_seen": 118763275, - "step": 5528, - "time_per_iteration": 2.7340548038482666 - }, - { - "auxiliary_loss_clip": 0.01112677, - "auxiliary_loss_mlp": 0.01037789, - "balance_loss_clip": 1.04796624, - "balance_loss_mlp": 1.02333462, - "epoch": 0.3324214640012025, - "flos": 13152535344000.0, - "grad_norm": 1.86583443755271, - "language_loss": 0.82796729, - "learning_rate": 3.1148818793226497e-06, - "loss": 0.84947193, - "num_input_tokens_seen": 118781110, - "step": 5529, - "time_per_iteration": 2.6532175540924072 - }, - { - "auxiliary_loss_clip": 0.01113738, - "auxiliary_loss_mlp": 0.00775289, - "balance_loss_clip": 1.04990721, - "balance_loss_mlp": 1.00095487, - "epoch": 0.33248158725387045, - "flos": 22273522041600.0, - "grad_norm": 2.91854332756289, - "language_loss": 0.69676769, - "learning_rate": 3.114558520634423e-06, - "loss": 0.71565795, - "num_input_tokens_seen": 118800620, - "step": 5530, - "time_per_iteration": 2.708841323852539 - }, - { - "auxiliary_loss_clip": 0.01126266, - "auxiliary_loss_mlp": 0.01050268, - "balance_loss_clip": 1.05040276, - "balance_loss_mlp": 1.03394794, - "epoch": 0.3325417105065384, - "flos": 20740998320640.0, - "grad_norm": 2.896961644373142, - "language_loss": 0.75989115, - "learning_rate": 3.1142351196820256e-06, - "loss": 0.7816565, - "num_input_tokens_seen": 118818725, - "step": 5531, - "time_per_iteration": 2.672736167907715 - }, - { - "auxiliary_loss_clip": 0.01118495, - "auxiliary_loss_mlp": 0.0104264, - "balance_loss_clip": 1.05284333, - "balance_loss_mlp": 1.0260222, - "epoch": 0.3326018337592064, - "flos": 24790536702720.0, - "grad_norm": 2.0175366752259465, - "language_loss": 0.73189509, - "learning_rate": 3.1139116764777206e-06, - "loss": 0.75350642, - "num_input_tokens_seen": 118839390, - "step": 5532, - "time_per_iteration": 4.367426156997681 - }, - { - "auxiliary_loss_clip": 0.0111545, - "auxiliary_loss_mlp": 0.0103097, - "balance_loss_clip": 1.0523479, - "balance_loss_mlp": 1.01623583, - "epoch": 0.33266195701187434, - "flos": 14501699112960.0, - "grad_norm": 2.031596721272471, - "language_loss": 0.65847003, - "learning_rate": 3.1135881910337735e-06, - "loss": 0.67993426, - "num_input_tokens_seen": 118856275, - "step": 5533, - "time_per_iteration": 2.66029691696167 - }, - { - "auxiliary_loss_clip": 0.01080696, - "auxiliary_loss_mlp": 0.01037858, - "balance_loss_clip": 1.04513919, - "balance_loss_mlp": 1.02147257, - "epoch": 0.3327220802645423, - "flos": 15304410299520.0, - "grad_norm": 2.349847054242377, - "language_loss": 0.71297956, - "learning_rate": 3.113264663362451e-06, - "loss": 0.73416501, - "num_input_tokens_seen": 118873830, - "step": 5534, - "time_per_iteration": 4.27457070350647 - }, - { - "auxiliary_loss_clip": 0.0109151, - "auxiliary_loss_mlp": 0.01041219, - "balance_loss_clip": 1.04982436, - "balance_loss_mlp": 1.02534652, - "epoch": 0.3327822035172103, - "flos": 23477534951040.0, - "grad_norm": 2.0777718313633997, - "language_loss": 0.6718514, - "learning_rate": 3.1129410934760204e-06, - "loss": 0.69317865, - "num_input_tokens_seen": 118891560, - "step": 5535, - "time_per_iteration": 2.774434804916382 - }, - { - "auxiliary_loss_clip": 0.01126643, - "auxiliary_loss_mlp": 0.00774026, - "balance_loss_clip": 1.04974341, - "balance_loss_mlp": 1.00099397, - "epoch": 0.33284232676987824, - "flos": 25374516019200.0, - "grad_norm": 4.4518317449354905, - "language_loss": 0.72757089, - "learning_rate": 3.1126174813867517e-06, - "loss": 0.74657756, - "num_input_tokens_seen": 118910260, - "step": 5536, - "time_per_iteration": 4.211881399154663 - }, - { - "auxiliary_loss_clip": 0.0112639, - "auxiliary_loss_mlp": 0.01042922, - "balance_loss_clip": 1.05097485, - "balance_loss_mlp": 1.02740741, - "epoch": 0.3329024500225462, - "flos": 23694363400320.0, - "grad_norm": 1.6494647990025764, - "language_loss": 0.81951326, - "learning_rate": 3.112293827106917e-06, - "loss": 0.84120637, - "num_input_tokens_seen": 118929985, - "step": 5537, - "time_per_iteration": 2.723938465118408 - }, - { - "auxiliary_loss_clip": 0.01130953, - "auxiliary_loss_mlp": 0.01041699, - "balance_loss_clip": 1.05334187, - "balance_loss_mlp": 1.02568924, - "epoch": 0.33296257327521417, - "flos": 31723163205120.0, - "grad_norm": 2.0361349610506987, - "language_loss": 0.71549797, - "learning_rate": 3.111970130648789e-06, - "loss": 0.73722446, - "num_input_tokens_seen": 118951355, - "step": 5538, - "time_per_iteration": 4.913949489593506 - }, - { - "auxiliary_loss_clip": 0.01120461, - "auxiliary_loss_mlp": 0.01037376, - "balance_loss_clip": 1.04746032, - "balance_loss_mlp": 1.02189124, - "epoch": 0.33302269652788213, - "flos": 22744705674240.0, - "grad_norm": 1.8849765474814903, - "language_loss": 0.74648041, - "learning_rate": 3.1116463920246424e-06, - "loss": 0.76805872, - "num_input_tokens_seen": 118970910, - "step": 5539, - "time_per_iteration": 2.7290310859680176 - }, - { - "auxiliary_loss_clip": 0.01142521, - "auxiliary_loss_mlp": 0.01045266, - "balance_loss_clip": 1.05175686, - "balance_loss_mlp": 1.02844524, - "epoch": 0.33308281978055015, - "flos": 11473747441920.0, - "grad_norm": 1.7887365250144445, - "language_loss": 0.71008205, - "learning_rate": 3.1113226112467527e-06, - "loss": 0.73195994, - "num_input_tokens_seen": 118989200, - "step": 5540, - "time_per_iteration": 2.6340630054473877 - }, - { - "auxiliary_loss_clip": 0.01121672, - "auxiliary_loss_mlp": 0.01037813, - "balance_loss_clip": 1.04614174, - "balance_loss_mlp": 1.02212477, - "epoch": 0.3331429430332181, - "flos": 38213693112960.0, - "grad_norm": 2.2050863595265535, - "language_loss": 0.60332179, - "learning_rate": 3.1109987883273983e-06, - "loss": 0.62491661, - "num_input_tokens_seen": 119011030, - "step": 5541, - "time_per_iteration": 2.9001681804656982 - }, - { - "auxiliary_loss_clip": 0.01116142, - "auxiliary_loss_mlp": 0.01045386, - "balance_loss_clip": 1.04896498, - "balance_loss_mlp": 1.02827907, - "epoch": 0.3332030662858861, - "flos": 22528667324160.0, - "grad_norm": 1.8682676496278656, - "language_loss": 0.68843257, - "learning_rate": 3.1106749232788584e-06, - "loss": 0.7100479, - "num_input_tokens_seen": 119030620, - "step": 5542, - "time_per_iteration": 2.7336552143096924 - }, - { - "auxiliary_loss_clip": 0.01125827, - "auxiliary_loss_mlp": 0.01039479, - "balance_loss_clip": 1.04983997, - "balance_loss_mlp": 1.0241369, - "epoch": 0.33326318953855405, - "flos": 15997773507840.0, - "grad_norm": 1.7424785130645766, - "language_loss": 0.75545055, - "learning_rate": 3.110351016113414e-06, - "loss": 0.7771036, - "num_input_tokens_seen": 119048015, - "step": 5543, - "time_per_iteration": 2.7098708152770996 - }, - { - "auxiliary_loss_clip": 0.01059952, - "auxiliary_loss_mlp": 0.01049723, - "balance_loss_clip": 1.04679465, - "balance_loss_mlp": 1.03153133, - "epoch": 0.333323312791222, - "flos": 25593535198080.0, - "grad_norm": 1.720313350609618, - "language_loss": 0.75207818, - "learning_rate": 3.110027066843348e-06, - "loss": 0.77317488, - "num_input_tokens_seen": 119066280, - "step": 5544, - "time_per_iteration": 2.8580381870269775 - }, - { - "auxiliary_loss_clip": 0.01131382, - "auxiliary_loss_mlp": 0.01034467, - "balance_loss_clip": 1.0470835, - "balance_loss_mlp": 1.01900601, - "epoch": 0.33338343604389, - "flos": 25119550304640.0, - "grad_norm": 1.8195187872515122, - "language_loss": 0.70631826, - "learning_rate": 3.1097030754809456e-06, - "loss": 0.7279768, - "num_input_tokens_seen": 119087680, - "step": 5545, - "time_per_iteration": 2.6675262451171875 - }, - { - "auxiliary_loss_clip": 0.01090227, - "auxiliary_loss_mlp": 0.01038197, - "balance_loss_clip": 1.04591393, - "balance_loss_mlp": 1.0225687, - "epoch": 0.33344355929655795, - "flos": 16947287579520.0, - "grad_norm": 2.0475528286172615, - "language_loss": 0.68962657, - "learning_rate": 3.1093790420384894e-06, - "loss": 0.7109108, - "num_input_tokens_seen": 119105820, - "step": 5546, - "time_per_iteration": 2.6620733737945557 - }, - { - "auxiliary_loss_clip": 0.01099462, - "auxiliary_loss_mlp": 0.01039292, - "balance_loss_clip": 1.04328573, - "balance_loss_mlp": 1.02330589, - "epoch": 0.3335036825492259, - "flos": 27889591345920.0, - "grad_norm": 1.6439201248410251, - "language_loss": 0.64893299, - "learning_rate": 3.1090549665282702e-06, - "loss": 0.67032051, - "num_input_tokens_seen": 119126630, - "step": 5547, - "time_per_iteration": 2.7897326946258545 - }, - { - "auxiliary_loss_clip": 0.0111514, - "auxiliary_loss_mlp": 0.0103407, - "balance_loss_clip": 1.05108774, - "balance_loss_mlp": 1.01957989, - "epoch": 0.3335638058018939, - "flos": 16179553261440.0, - "grad_norm": 2.7266915889905765, - "language_loss": 0.85475278, - "learning_rate": 3.1087308489625742e-06, - "loss": 0.8762449, - "num_input_tokens_seen": 119143375, - "step": 5548, - "time_per_iteration": 2.691776990890503 - }, - { - "auxiliary_loss_clip": 0.0112443, - "auxiliary_loss_mlp": 0.01038689, - "balance_loss_clip": 1.04759526, - "balance_loss_mlp": 1.02190423, - "epoch": 0.33362392905456184, - "flos": 39896108288640.0, - "grad_norm": 2.1593805374763466, - "language_loss": 0.74996036, - "learning_rate": 3.1084066893536945e-06, - "loss": 0.77159154, - "num_input_tokens_seen": 119166450, - "step": 5549, - "time_per_iteration": 2.778918743133545 - }, - { - "auxiliary_loss_clip": 0.01129114, - "auxiliary_loss_mlp": 0.01040153, - "balance_loss_clip": 1.0509795, - "balance_loss_mlp": 1.02330887, - "epoch": 0.3336840523072298, - "flos": 44271212567040.0, - "grad_norm": 2.0942861782322577, - "language_loss": 0.6826036, - "learning_rate": 3.108082487713921e-06, - "loss": 0.70429623, - "num_input_tokens_seen": 119189645, - "step": 5550, - "time_per_iteration": 2.8417065143585205 - }, - { - "auxiliary_loss_clip": 0.01094461, - "auxiliary_loss_mlp": 0.01050862, - "balance_loss_clip": 1.04752803, - "balance_loss_mlp": 1.03398156, - "epoch": 0.33374417555989777, - "flos": 15085678429440.0, - "grad_norm": 3.079168539029832, - "language_loss": 0.60630679, - "learning_rate": 3.1077582440555495e-06, - "loss": 0.62776005, - "num_input_tokens_seen": 119208045, - "step": 5551, - "time_per_iteration": 2.7206614017486572 - }, - { - "auxiliary_loss_clip": 0.01096001, - "auxiliary_loss_mlp": 0.01040976, - "balance_loss_clip": 1.04871941, - "balance_loss_mlp": 1.02429891, - "epoch": 0.33380429881256574, - "flos": 15849174942720.0, - "grad_norm": 5.115117677651213, - "language_loss": 0.70642906, - "learning_rate": 3.1074339583908746e-06, - "loss": 0.72779882, - "num_input_tokens_seen": 119224910, - "step": 5552, - "time_per_iteration": 2.7452614307403564 - }, - { - "auxiliary_loss_clip": 0.0109902, - "auxiliary_loss_mlp": 0.01036983, - "balance_loss_clip": 1.04360175, - "balance_loss_mlp": 1.02150989, - "epoch": 0.33386442206523376, - "flos": 13480327883520.0, - "grad_norm": 2.544991024269762, - "language_loss": 0.82464319, - "learning_rate": 3.107109630732192e-06, - "loss": 0.84600323, - "num_input_tokens_seen": 119243290, - "step": 5553, - "time_per_iteration": 2.755664110183716 - }, - { - "auxiliary_loss_clip": 0.01115353, - "auxiliary_loss_mlp": 0.00774656, - "balance_loss_clip": 1.05034745, - "balance_loss_mlp": 1.00092673, - "epoch": 0.3339245453179017, - "flos": 16690669839360.0, - "grad_norm": 2.0139615227647343, - "language_loss": 0.80920005, - "learning_rate": 3.1067852610918017e-06, - "loss": 0.82810014, - "num_input_tokens_seen": 119261195, - "step": 5554, - "time_per_iteration": 2.701960563659668 - }, - { - "auxiliary_loss_clip": 0.01127546, - "auxiliary_loss_mlp": 0.01043388, - "balance_loss_clip": 1.05171227, - "balance_loss_mlp": 1.02820015, - "epoch": 0.3339846685705697, - "flos": 24610624456320.0, - "grad_norm": 1.6473304910242343, - "language_loss": 0.81187713, - "learning_rate": 3.1064608494820032e-06, - "loss": 0.83358645, - "num_input_tokens_seen": 119282845, - "step": 5555, - "time_per_iteration": 2.697605609893799 - }, - { - "auxiliary_loss_clip": 0.01120953, - "auxiliary_loss_mlp": 0.01039289, - "balance_loss_clip": 1.04721272, - "balance_loss_mlp": 1.02425706, - "epoch": 0.33404479182323765, - "flos": 30953812775040.0, - "grad_norm": 1.6543240081497628, - "language_loss": 0.74369228, - "learning_rate": 3.106136395915099e-06, - "loss": 0.76529467, - "num_input_tokens_seen": 119304430, - "step": 5556, - "time_per_iteration": 2.7341341972351074 - }, - { - "auxiliary_loss_clip": 0.01124745, - "auxiliary_loss_mlp": 0.0103615, - "balance_loss_clip": 1.05016208, - "balance_loss_mlp": 1.02102232, - "epoch": 0.3341049150759056, - "flos": 23513301918720.0, - "grad_norm": 1.6367363007204896, - "language_loss": 0.82058722, - "learning_rate": 3.105811900403391e-06, - "loss": 0.84219617, - "num_input_tokens_seen": 119323830, - "step": 5557, - "time_per_iteration": 2.6798059940338135 - }, - { - "auxiliary_loss_clip": 0.01115524, - "auxiliary_loss_mlp": 0.01038861, - "balance_loss_clip": 1.04990697, - "balance_loss_mlp": 1.02333987, - "epoch": 0.3341650383285736, - "flos": 24026824707840.0, - "grad_norm": 1.4529426900334401, - "language_loss": 0.80220526, - "learning_rate": 3.1054873629591855e-06, - "loss": 0.82374907, - "num_input_tokens_seen": 119346340, - "step": 5558, - "time_per_iteration": 2.760270118713379 - }, - { - "auxiliary_loss_clip": 0.01108428, - "auxiliary_loss_mlp": 0.01040994, - "balance_loss_clip": 1.04822016, - "balance_loss_mlp": 1.02628982, - "epoch": 0.33422516158124155, - "flos": 24901967669760.0, - "grad_norm": 1.5625296304307381, - "language_loss": 0.8137213, - "learning_rate": 3.105162783594788e-06, - "loss": 0.83521557, - "num_input_tokens_seen": 119367285, - "step": 5559, - "time_per_iteration": 2.7685365676879883 - }, - { - "auxiliary_loss_clip": 0.01096895, - "auxiliary_loss_mlp": 0.01042951, - "balance_loss_clip": 1.04609013, - "balance_loss_mlp": 1.02726293, - "epoch": 0.3342852848339095, - "flos": 18333403464960.0, - "grad_norm": 2.3834321283612003, - "language_loss": 0.7164095, - "learning_rate": 3.1048381623225074e-06, - "loss": 0.73780799, - "num_input_tokens_seen": 119385370, - "step": 5560, - "time_per_iteration": 2.721888780593872 - }, - { - "auxiliary_loss_clip": 0.011201, - "auxiliary_loss_mlp": 0.01043409, - "balance_loss_clip": 1.05215085, - "balance_loss_mlp": 1.02716064, - "epoch": 0.3343454080865775, - "flos": 30046530119040.0, - "grad_norm": 2.1203222418546015, - "language_loss": 0.75029516, - "learning_rate": 3.1045134991546526e-06, - "loss": 0.77193022, - "num_input_tokens_seen": 119409150, - "step": 5561, - "time_per_iteration": 2.8445487022399902 - }, - { - "auxiliary_loss_clip": 0.01115063, - "auxiliary_loss_mlp": 0.01036711, - "balance_loss_clip": 1.05170679, - "balance_loss_mlp": 1.02177453, - "epoch": 0.33440553133924544, - "flos": 16398823835520.0, - "grad_norm": 1.6036143049019338, - "language_loss": 0.69467896, - "learning_rate": 3.1041887941035355e-06, - "loss": 0.71619672, - "num_input_tokens_seen": 119426475, - "step": 5562, - "time_per_iteration": 2.664062023162842 - }, - { - "auxiliary_loss_clip": 0.01125323, - "auxiliary_loss_mlp": 0.01042082, - "balance_loss_clip": 1.05125499, - "balance_loss_mlp": 1.02763367, - "epoch": 0.3344656545919134, - "flos": 24242072958720.0, - "grad_norm": 3.5139835262543504, - "language_loss": 0.65094876, - "learning_rate": 3.1038640471814685e-06, - "loss": 0.67262286, - "num_input_tokens_seen": 119446900, - "step": 5563, - "time_per_iteration": 2.70878529548645 - }, - { - "auxiliary_loss_clip": 0.01078552, - "auxiliary_loss_mlp": 0.01045974, - "balance_loss_clip": 1.04751515, - "balance_loss_mlp": 1.0296303, - "epoch": 0.3345257778445814, - "flos": 52118843149440.0, - "grad_norm": 1.4983314251487456, - "language_loss": 0.74106556, - "learning_rate": 3.103539258400766e-06, - "loss": 0.76231086, - "num_input_tokens_seen": 119470945, - "step": 5564, - "time_per_iteration": 3.0751025676727295 - }, - { - "auxiliary_loss_clip": 0.01035298, - "auxiliary_loss_mlp": 0.01009529, - "balance_loss_clip": 1.03294694, - "balance_loss_mlp": 1.00762165, - "epoch": 0.33458590109724934, - "flos": 68048602254720.0, - "grad_norm": 0.7758359845819034, - "language_loss": 0.555296, - "learning_rate": 3.103214427773745e-06, - "loss": 0.57574433, - "num_input_tokens_seen": 119529925, - "step": 5565, - "time_per_iteration": 3.2246947288513184 - }, - { - "auxiliary_loss_clip": 0.01134316, - "auxiliary_loss_mlp": 0.01036162, - "balance_loss_clip": 1.05123055, - "balance_loss_mlp": 1.02145183, - "epoch": 0.3346460243499173, - "flos": 37414788768000.0, - "grad_norm": 2.332924120890769, - "language_loss": 0.65000319, - "learning_rate": 3.102889555312721e-06, - "loss": 0.67170799, - "num_input_tokens_seen": 119550700, - "step": 5566, - "time_per_iteration": 2.8920817375183105 - }, - { - "auxiliary_loss_clip": 0.01115876, - "auxiliary_loss_mlp": 0.0103757, - "balance_loss_clip": 1.05134845, - "balance_loss_mlp": 1.02252626, - "epoch": 0.3347061476025853, - "flos": 18697358021760.0, - "grad_norm": 2.3005222539878436, - "language_loss": 0.77525175, - "learning_rate": 3.102564641030016e-06, - "loss": 0.79678619, - "num_input_tokens_seen": 119569295, - "step": 5567, - "time_per_iteration": 2.82244610786438 - }, - { - "auxiliary_loss_clip": 0.01112911, - "auxiliary_loss_mlp": 0.01037105, - "balance_loss_clip": 1.0479182, - "balance_loss_mlp": 1.02079725, - "epoch": 0.3347662708552533, - "flos": 13917827537280.0, - "grad_norm": 1.7148039320536435, - "language_loss": 0.76432139, - "learning_rate": 3.102239684937949e-06, - "loss": 0.78582156, - "num_input_tokens_seen": 119587375, - "step": 5568, - "time_per_iteration": 2.689354181289673 - }, - { - "auxiliary_loss_clip": 0.01099358, - "auxiliary_loss_mlp": 0.01048314, - "balance_loss_clip": 1.04898834, - "balance_loss_mlp": 1.03163624, - "epoch": 0.33482639410792125, - "flos": 19750402068480.0, - "grad_norm": 3.260707250765708, - "language_loss": 0.70965171, - "learning_rate": 3.101914687048842e-06, - "loss": 0.73112851, - "num_input_tokens_seen": 119604530, - "step": 5569, - "time_per_iteration": 2.747023344039917 - }, - { - "auxiliary_loss_clip": 0.01099669, - "auxiliary_loss_mlp": 0.01034787, - "balance_loss_clip": 1.04569411, - "balance_loss_mlp": 1.01819277, - "epoch": 0.3348865173605892, - "flos": 16102991422080.0, - "grad_norm": 2.127450904564192, - "language_loss": 0.89788258, - "learning_rate": 3.10158964737502e-06, - "loss": 0.91922712, - "num_input_tokens_seen": 119621025, - "step": 5570, - "time_per_iteration": 2.810328960418701 - }, - { - "auxiliary_loss_clip": 0.01098742, - "auxiliary_loss_mlp": 0.01034906, - "balance_loss_clip": 1.04593182, - "balance_loss_mlp": 1.01970696, - "epoch": 0.3349466406132572, - "flos": 25008945350400.0, - "grad_norm": 2.0196203016458245, - "language_loss": 0.79848439, - "learning_rate": 3.101264565928808e-06, - "loss": 0.81982088, - "num_input_tokens_seen": 119641725, - "step": 5571, - "time_per_iteration": 4.5300047397613525 - }, - { - "auxiliary_loss_clip": 0.01052126, - "auxiliary_loss_mlp": 0.00754598, - "balance_loss_clip": 1.02251923, - "balance_loss_mlp": 1.0014987, - "epoch": 0.33500676386592515, - "flos": 54319991564160.0, - "grad_norm": 0.8956854098175919, - "language_loss": 0.5596205, - "learning_rate": 3.1009394427225335e-06, - "loss": 0.57768774, - "num_input_tokens_seen": 119693560, - "step": 5572, - "time_per_iteration": 3.0931503772735596 - }, - { - "auxiliary_loss_clip": 0.01137277, - "auxiliary_loss_mlp": 0.01047626, - "balance_loss_clip": 1.05220318, - "balance_loss_mlp": 1.03196192, - "epoch": 0.3350668871185931, - "flos": 26797332625920.0, - "grad_norm": 2.019282888464976, - "language_loss": 0.78090006, - "learning_rate": 3.1006142777685257e-06, - "loss": 0.8027491, - "num_input_tokens_seen": 119712935, - "step": 5573, - "time_per_iteration": 2.710340738296509 - }, - { - "auxiliary_loss_clip": 0.01105804, - "auxiliary_loss_mlp": 0.01046551, - "balance_loss_clip": 1.05004358, - "balance_loss_mlp": 1.02974284, - "epoch": 0.3351270103712611, - "flos": 33510508986240.0, - "grad_norm": 3.3664569303363834, - "language_loss": 0.7253201, - "learning_rate": 3.1002890710791133e-06, - "loss": 0.74684364, - "num_input_tokens_seen": 119731680, - "step": 5574, - "time_per_iteration": 4.390132427215576 - }, - { - "auxiliary_loss_clip": 0.01119913, - "auxiliary_loss_mlp": 0.01033586, - "balance_loss_clip": 1.04622221, - "balance_loss_mlp": 1.01882839, - "epoch": 0.33518713362392905, - "flos": 26506240807680.0, - "grad_norm": 1.806126996337021, - "language_loss": 0.87605375, - "learning_rate": 3.0999638226666287e-06, - "loss": 0.89758873, - "num_input_tokens_seen": 119752155, - "step": 5575, - "time_per_iteration": 2.6650984287261963 - }, - { - "auxiliary_loss_clip": 0.01119423, - "auxiliary_loss_mlp": 0.01044892, - "balance_loss_clip": 1.05073953, - "balance_loss_mlp": 1.02783298, - "epoch": 0.335247256876597, - "flos": 17232345912960.0, - "grad_norm": 2.5292682388354404, - "language_loss": 0.82834053, - "learning_rate": 3.0996385325434063e-06, - "loss": 0.84998369, - "num_input_tokens_seen": 119769195, - "step": 5576, - "time_per_iteration": 4.143759727478027 - }, - { - "auxiliary_loss_clip": 0.01126035, - "auxiliary_loss_mlp": 0.01042249, - "balance_loss_clip": 1.04928613, - "balance_loss_mlp": 1.02584612, - "epoch": 0.335307380129265, - "flos": 25629373992960.0, - "grad_norm": 2.62081807641563, - "language_loss": 0.72970062, - "learning_rate": 3.0993132007217806e-06, - "loss": 0.75138342, - "num_input_tokens_seen": 119786810, - "step": 5577, - "time_per_iteration": 4.264250755310059 - }, - { - "auxiliary_loss_clip": 0.01102749, - "auxiliary_loss_mlp": 0.01040193, - "balance_loss_clip": 1.05250812, - "balance_loss_mlp": 1.02409935, - "epoch": 0.33536750338193294, - "flos": 19680089195520.0, - "grad_norm": 2.2461501835528255, - "language_loss": 0.8147049, - "learning_rate": 3.0989878272140883e-06, - "loss": 0.83613431, - "num_input_tokens_seen": 119805395, - "step": 5578, - "time_per_iteration": 2.748187780380249 - }, - { - "auxiliary_loss_clip": 0.01072311, - "auxiliary_loss_mlp": 0.0077377, - "balance_loss_clip": 1.04737353, - "balance_loss_mlp": 1.00086129, - "epoch": 0.3354276266346009, - "flos": 18332613365760.0, - "grad_norm": 2.081067644088489, - "language_loss": 0.72135395, - "learning_rate": 3.0986624120326676e-06, - "loss": 0.73981476, - "num_input_tokens_seen": 119823135, - "step": 5579, - "time_per_iteration": 2.797891616821289 - }, - { - "auxiliary_loss_clip": 0.0108369, - "auxiliary_loss_mlp": 0.01042635, - "balance_loss_clip": 1.04664183, - "balance_loss_mlp": 1.02608919, - "epoch": 0.3354877498872689, - "flos": 17858556645120.0, - "grad_norm": 2.1516301629227255, - "language_loss": 0.81264424, - "learning_rate": 3.0983369551898573e-06, - "loss": 0.83390749, - "num_input_tokens_seen": 119842265, - "step": 5580, - "time_per_iteration": 2.76359224319458 - }, - { - "auxiliary_loss_clip": 0.01112891, - "auxiliary_loss_mlp": 0.01034758, - "balance_loss_clip": 1.04777932, - "balance_loss_mlp": 1.01918936, - "epoch": 0.3355478731399369, - "flos": 24717745791360.0, - "grad_norm": 1.787418199208594, - "language_loss": 0.78071463, - "learning_rate": 3.0980114566980003e-06, - "loss": 0.80219114, - "num_input_tokens_seen": 119862500, - "step": 5581, - "time_per_iteration": 2.6893699169158936 - }, - { - "auxiliary_loss_clip": 0.01102381, - "auxiliary_loss_mlp": 0.01044533, - "balance_loss_clip": 1.04555583, - "balance_loss_mlp": 1.02674723, - "epoch": 0.33560799639260486, - "flos": 16873886136960.0, - "grad_norm": 3.5541134032025528, - "language_loss": 0.74734783, - "learning_rate": 3.0976859165694384e-06, - "loss": 0.76881701, - "num_input_tokens_seen": 119880160, - "step": 5582, - "time_per_iteration": 2.750110149383545 - }, - { - "auxiliary_loss_clip": 0.01109205, - "auxiliary_loss_mlp": 0.0104468, - "balance_loss_clip": 1.04334664, - "balance_loss_mlp": 1.02793145, - "epoch": 0.3356681196452728, - "flos": 18333511205760.0, - "grad_norm": 2.0738327777636574, - "language_loss": 0.82039702, - "learning_rate": 3.0973603348165166e-06, - "loss": 0.84193587, - "num_input_tokens_seen": 119899040, - "step": 5583, - "time_per_iteration": 2.629065990447998 - }, - { - "auxiliary_loss_clip": 0.01113126, - "auxiliary_loss_mlp": 0.01047702, - "balance_loss_clip": 1.04719925, - "balance_loss_mlp": 1.0322051, - "epoch": 0.3357282428979408, - "flos": 34750612085760.0, - "grad_norm": 2.1437775006956814, - "language_loss": 0.77524137, - "learning_rate": 3.097034711451581e-06, - "loss": 0.79684973, - "num_input_tokens_seen": 119921120, - "step": 5584, - "time_per_iteration": 2.9303438663482666 - }, - { - "auxiliary_loss_clip": 0.01115168, - "auxiliary_loss_mlp": 0.01043431, - "balance_loss_clip": 1.04803944, - "balance_loss_mlp": 1.02755225, - "epoch": 0.33578836615060875, - "flos": 21580087006080.0, - "grad_norm": 1.8068970963649096, - "language_loss": 0.76473475, - "learning_rate": 3.0967090464869795e-06, - "loss": 0.78632081, - "num_input_tokens_seen": 119940165, - "step": 5585, - "time_per_iteration": 2.7168867588043213 - }, - { - "auxiliary_loss_clip": 0.01120824, - "auxiliary_loss_mlp": 0.01040676, - "balance_loss_clip": 1.04579937, - "balance_loss_mlp": 1.02442741, - "epoch": 0.3358484894032767, - "flos": 24530291688960.0, - "grad_norm": 1.8490215812193886, - "language_loss": 0.77754235, - "learning_rate": 3.0963833399350608e-06, - "loss": 0.79915732, - "num_input_tokens_seen": 119959730, - "step": 5586, - "time_per_iteration": 2.88452410697937 - }, - { - "auxiliary_loss_clip": 0.01100333, - "auxiliary_loss_mlp": 0.01057166, - "balance_loss_clip": 1.0484302, - "balance_loss_mlp": 1.03673398, - "epoch": 0.3359086126559447, - "flos": 22455589104000.0, - "grad_norm": 1.6698470723885088, - "language_loss": 0.810045, - "learning_rate": 3.0960575918081756e-06, - "loss": 0.8316201, - "num_input_tokens_seen": 119979315, - "step": 5587, - "time_per_iteration": 2.7335522174835205 - }, - { - "auxiliary_loss_clip": 0.01130777, - "auxiliary_loss_mlp": 0.01042735, - "balance_loss_clip": 1.04809558, - "balance_loss_mlp": 1.02837586, - "epoch": 0.33596873590861265, - "flos": 16543687386240.0, - "grad_norm": 1.8626695130182664, - "language_loss": 0.67307252, - "learning_rate": 3.095731802118677e-06, - "loss": 0.69480765, - "num_input_tokens_seen": 119996140, - "step": 5588, - "time_per_iteration": 2.5910611152648926 - }, - { - "auxiliary_loss_clip": 0.01113468, - "auxiliary_loss_mlp": 0.00774774, - "balance_loss_clip": 1.04702032, - "balance_loss_mlp": 1.0007664, - "epoch": 0.3360288591612806, - "flos": 31175812782720.0, - "grad_norm": 2.758181662666948, - "language_loss": 0.70459288, - "learning_rate": 3.095405970878919e-06, - "loss": 0.72347522, - "num_input_tokens_seen": 120017720, - "step": 5589, - "time_per_iteration": 2.7966625690460205 - }, - { - "auxiliary_loss_clip": 0.01110605, - "auxiliary_loss_mlp": 0.01046945, - "balance_loss_clip": 1.04478765, - "balance_loss_mlp": 1.02951634, - "epoch": 0.3360889824139486, - "flos": 23696913265920.0, - "grad_norm": 6.820816752821097, - "language_loss": 0.6717155, - "learning_rate": 3.0950800981012567e-06, - "loss": 0.69329101, - "num_input_tokens_seen": 120036335, - "step": 5590, - "time_per_iteration": 2.804384231567383 - }, - { - "auxiliary_loss_clip": 0.01107091, - "auxiliary_loss_mlp": 0.01044113, - "balance_loss_clip": 1.05176187, - "balance_loss_mlp": 1.02741194, - "epoch": 0.33614910566661654, - "flos": 19318109886720.0, - "grad_norm": 2.108159500929249, - "language_loss": 0.731767, - "learning_rate": 3.094754183798047e-06, - "loss": 0.75327909, - "num_input_tokens_seen": 120056120, - "step": 5591, - "time_per_iteration": 2.7423245906829834 - }, - { - "auxiliary_loss_clip": 0.01132777, - "auxiliary_loss_mlp": 0.01043438, - "balance_loss_clip": 1.04753232, - "balance_loss_mlp": 1.02802432, - "epoch": 0.3362092289192845, - "flos": 16472261191680.0, - "grad_norm": 2.4812698890164238, - "language_loss": 0.6978277, - "learning_rate": 3.0944282279816493e-06, - "loss": 0.71958983, - "num_input_tokens_seen": 120073650, - "step": 5592, - "time_per_iteration": 2.624565362930298 - }, - { - "auxiliary_loss_clip": 0.01109265, - "auxiliary_loss_mlp": 0.01035799, - "balance_loss_clip": 1.0459764, - "balance_loss_mlp": 1.02034986, - "epoch": 0.33626935217195253, - "flos": 24243581329920.0, - "grad_norm": 2.2034044743639676, - "language_loss": 0.76362681, - "learning_rate": 3.094102230664423e-06, - "loss": 0.78507739, - "num_input_tokens_seen": 120093260, - "step": 5593, - "time_per_iteration": 2.7709946632385254 - }, - { - "auxiliary_loss_clip": 0.01100555, - "auxiliary_loss_mlp": 0.00775613, - "balance_loss_clip": 1.04247713, - "balance_loss_mlp": 1.00074506, - "epoch": 0.3363294754246205, - "flos": 19718765164800.0, - "grad_norm": 2.2856177577930876, - "language_loss": 0.7229932, - "learning_rate": 3.093776191858731e-06, - "loss": 0.74175489, - "num_input_tokens_seen": 120111830, - "step": 5594, - "time_per_iteration": 2.7880120277404785 - }, - { - "auxiliary_loss_clip": 0.01079557, - "auxiliary_loss_mlp": 0.00778898, - "balance_loss_clip": 1.04157269, - "balance_loss_mlp": 1.00079668, - "epoch": 0.33638959867728846, - "flos": 22596286677120.0, - "grad_norm": 3.2295215673950293, - "language_loss": 0.79940557, - "learning_rate": 3.0934501115769363e-06, - "loss": 0.81799006, - "num_input_tokens_seen": 120130470, - "step": 5595, - "time_per_iteration": 2.8623924255371094 - }, - { - "auxiliary_loss_clip": 0.01111225, - "auxiliary_loss_mlp": 0.01039348, - "balance_loss_clip": 1.04694319, - "balance_loss_mlp": 1.02456045, - "epoch": 0.3364497219299564, - "flos": 20994742972800.0, - "grad_norm": 3.201033356603963, - "language_loss": 0.81473815, - "learning_rate": 3.0931239898314037e-06, - "loss": 0.83624387, - "num_input_tokens_seen": 120150735, - "step": 5596, - "time_per_iteration": 2.900319814682007 - }, - { - "auxiliary_loss_clip": 0.01113286, - "auxiliary_loss_mlp": 0.01044516, - "balance_loss_clip": 1.04682481, - "balance_loss_mlp": 1.02877986, - "epoch": 0.3365098451826244, - "flos": 25228610974080.0, - "grad_norm": 1.642499178477658, - "language_loss": 0.75647599, - "learning_rate": 3.0927978266344995e-06, - "loss": 0.778054, - "num_input_tokens_seen": 120173230, - "step": 5597, - "time_per_iteration": 2.8402984142303467 - }, - { - "auxiliary_loss_clip": 0.0112326, - "auxiliary_loss_mlp": 0.01034747, - "balance_loss_clip": 1.04734445, - "balance_loss_mlp": 1.01902318, - "epoch": 0.33656996843529235, - "flos": 24571697091840.0, - "grad_norm": 1.910742765655482, - "language_loss": 0.78611934, - "learning_rate": 3.0924716219985916e-06, - "loss": 0.80769938, - "num_input_tokens_seen": 120191860, - "step": 5598, - "time_per_iteration": 2.7380945682525635 - }, - { - "auxiliary_loss_clip": 0.01141013, - "auxiliary_loss_mlp": 0.01041333, - "balance_loss_clip": 1.04969454, - "balance_loss_mlp": 1.0235827, - "epoch": 0.3366300916879603, - "flos": 44091120752640.0, - "grad_norm": 1.511676842650176, - "language_loss": 0.6446076, - "learning_rate": 3.0921453759360514e-06, - "loss": 0.66643113, - "num_input_tokens_seen": 120219195, - "step": 5599, - "time_per_iteration": 2.845017433166504 - }, - { - "auxiliary_loss_clip": 0.01103042, - "auxiliary_loss_mlp": 0.01054079, - "balance_loss_clip": 1.04571164, - "balance_loss_mlp": 1.03408813, - "epoch": 0.3366902149406283, - "flos": 13879869840000.0, - "grad_norm": 3.0475721260430486, - "language_loss": 0.8262403, - "learning_rate": 3.091819088459249e-06, - "loss": 0.84781146, - "num_input_tokens_seen": 120232950, - "step": 5600, - "time_per_iteration": 2.690335512161255 - }, - { - "auxiliary_loss_clip": 0.01128117, - "auxiliary_loss_mlp": 0.01045257, - "balance_loss_clip": 1.04780042, - "balance_loss_mlp": 1.02822232, - "epoch": 0.33675033819329625, - "flos": 16253098358400.0, - "grad_norm": 2.4530209101601037, - "language_loss": 0.83457136, - "learning_rate": 3.0914927595805573e-06, - "loss": 0.856305, - "num_input_tokens_seen": 120248865, - "step": 5601, - "time_per_iteration": 2.760735034942627 - }, - { - "auxiliary_loss_clip": 0.01122256, - "auxiliary_loss_mlp": 0.0103673, - "balance_loss_clip": 1.04873729, - "balance_loss_mlp": 1.02092862, - "epoch": 0.3368104614459642, - "flos": 17055809544960.0, - "grad_norm": 2.1704904083215903, - "language_loss": 0.83173311, - "learning_rate": 3.0911663893123507e-06, - "loss": 0.85332292, - "num_input_tokens_seen": 120267820, - "step": 5602, - "time_per_iteration": 2.6818981170654297 - }, - { - "auxiliary_loss_clip": 0.0113558, - "auxiliary_loss_mlp": 0.01053921, - "balance_loss_clip": 1.04765427, - "balance_loss_mlp": 1.03756535, - "epoch": 0.3368705846986322, - "flos": 17858628472320.0, - "grad_norm": 3.8525391607572477, - "language_loss": 0.69046748, - "learning_rate": 3.0908399776670048e-06, - "loss": 0.71236247, - "num_input_tokens_seen": 120286540, - "step": 5603, - "time_per_iteration": 2.6086158752441406 - }, - { - "auxiliary_loss_clip": 0.01116527, - "auxiliary_loss_mlp": 0.01042678, - "balance_loss_clip": 1.04876411, - "balance_loss_mlp": 1.02617979, - "epoch": 0.33693070795130015, - "flos": 22929502170240.0, - "grad_norm": 1.5388557517073465, - "language_loss": 0.83146536, - "learning_rate": 3.090513524656898e-06, - "loss": 0.85305738, - "num_input_tokens_seen": 120307305, - "step": 5604, - "time_per_iteration": 2.7269375324249268 - }, - { - "auxiliary_loss_clip": 0.01095396, - "auxiliary_loss_mlp": 0.01043597, - "balance_loss_clip": 1.04384422, - "balance_loss_mlp": 1.02708673, - "epoch": 0.3369908312039681, - "flos": 22017443005440.0, - "grad_norm": 1.634462052702842, - "language_loss": 0.73473096, - "learning_rate": 3.090187030294409e-06, - "loss": 0.75612092, - "num_input_tokens_seen": 120327845, - "step": 5605, - "time_per_iteration": 2.712197780609131 - }, - { - "auxiliary_loss_clip": 0.0111786, - "auxiliary_loss_mlp": 0.01038834, - "balance_loss_clip": 1.04761815, - "balance_loss_mlp": 1.02235925, - "epoch": 0.33705095445663613, - "flos": 11801970944640.0, - "grad_norm": 3.8834830456250913, - "language_loss": 0.83444858, - "learning_rate": 3.089860494591919e-06, - "loss": 0.85601556, - "num_input_tokens_seen": 120343255, - "step": 5606, - "time_per_iteration": 2.6680989265441895 - }, - { - "auxiliary_loss_clip": 0.01108557, - "auxiliary_loss_mlp": 0.01039061, - "balance_loss_clip": 1.04293787, - "balance_loss_mlp": 1.02370059, - "epoch": 0.3371110777093041, - "flos": 25046400257280.0, - "grad_norm": 2.0409696956182946, - "language_loss": 0.67694759, - "learning_rate": 3.089533917561809e-06, - "loss": 0.69842374, - "num_input_tokens_seen": 120361745, - "step": 5607, - "time_per_iteration": 2.8172407150268555 - }, - { - "auxiliary_loss_clip": 0.01121964, - "auxiliary_loss_mlp": 0.01053243, - "balance_loss_clip": 1.04604626, - "balance_loss_mlp": 1.03458667, - "epoch": 0.33717120096197206, - "flos": 26579031719040.0, - "grad_norm": 1.9822534609557965, - "language_loss": 0.70618403, - "learning_rate": 3.089207299216464e-06, - "loss": 0.72793615, - "num_input_tokens_seen": 120380565, - "step": 5608, - "time_per_iteration": 2.669027090072632 - }, - { - "auxiliary_loss_clip": 0.01055328, - "auxiliary_loss_mlp": 0.01040575, - "balance_loss_clip": 1.03931713, - "balance_loss_mlp": 1.02449393, - "epoch": 0.33723132421464, - "flos": 15158541168000.0, - "grad_norm": 1.931960515128334, - "language_loss": 0.79290974, - "learning_rate": 3.088880639568269e-06, - "loss": 0.81386876, - "num_input_tokens_seen": 120399235, - "step": 5609, - "time_per_iteration": 2.7859673500061035 - }, - { - "auxiliary_loss_clip": 0.01124996, - "auxiliary_loss_mlp": 0.01041459, - "balance_loss_clip": 1.04914641, - "balance_loss_mlp": 1.02387619, - "epoch": 0.337291447467308, - "flos": 23436093634560.0, - "grad_norm": 1.7580059679361764, - "language_loss": 0.82490408, - "learning_rate": 3.0885539386296114e-06, - "loss": 0.8465687, - "num_input_tokens_seen": 120420095, - "step": 5610, - "time_per_iteration": 4.319208145141602 - }, - { - "auxiliary_loss_clip": 0.01123032, - "auxiliary_loss_mlp": 0.0104256, - "balance_loss_clip": 1.0486002, - "balance_loss_mlp": 1.02448845, - "epoch": 0.33735157071997596, - "flos": 17238163916160.0, - "grad_norm": 2.0228863025134824, - "language_loss": 0.82122159, - "learning_rate": 3.088227196412879e-06, - "loss": 0.84287751, - "num_input_tokens_seen": 120437690, - "step": 5611, - "time_per_iteration": 2.6127841472625732 - }, - { - "auxiliary_loss_clip": 0.01116485, - "auxiliary_loss_mlp": 0.01045036, - "balance_loss_clip": 1.04920387, - "balance_loss_mlp": 1.02683246, - "epoch": 0.3374116939726439, - "flos": 28257388657920.0, - "grad_norm": 2.0856936331065037, - "language_loss": 0.79704899, - "learning_rate": 3.0879004129304626e-06, - "loss": 0.81866419, - "num_input_tokens_seen": 120459240, - "step": 5612, - "time_per_iteration": 2.7237493991851807 - }, - { - "auxiliary_loss_clip": 0.01076712, - "auxiliary_loss_mlp": 0.01040315, - "balance_loss_clip": 1.04079247, - "balance_loss_mlp": 1.02410221, - "epoch": 0.3374718172253119, - "flos": 35919396731520.0, - "grad_norm": 2.390785367991082, - "language_loss": 0.70200634, - "learning_rate": 3.087573588194753e-06, - "loss": 0.7231766, - "num_input_tokens_seen": 120481090, - "step": 5613, - "time_per_iteration": 4.43415379524231 - }, - { - "auxiliary_loss_clip": 0.01118495, - "auxiliary_loss_mlp": 0.01037291, - "balance_loss_clip": 1.04903054, - "balance_loss_mlp": 1.02097178, - "epoch": 0.33753194047797985, - "flos": 18186672407040.0, - "grad_norm": 2.1929626699857585, - "language_loss": 0.79407388, - "learning_rate": 3.087246722218144e-06, - "loss": 0.81563175, - "num_input_tokens_seen": 120500045, - "step": 5614, - "time_per_iteration": 2.6484436988830566 - }, - { - "auxiliary_loss_clip": 0.01105902, - "auxiliary_loss_mlp": 0.01046863, - "balance_loss_clip": 1.04512811, - "balance_loss_mlp": 1.02796841, - "epoch": 0.3375920637306478, - "flos": 23148916398720.0, - "grad_norm": 1.967540834348034, - "language_loss": 0.91201901, - "learning_rate": 3.086919815013031e-06, - "loss": 0.93354666, - "num_input_tokens_seen": 120521125, - "step": 5615, - "time_per_iteration": 4.486853361129761 - }, - { - "auxiliary_loss_clip": 0.01119294, - "auxiliary_loss_mlp": 0.01042109, - "balance_loss_clip": 1.04542458, - "balance_loss_mlp": 1.0265168, - "epoch": 0.3376521869833158, - "flos": 23112215677440.0, - "grad_norm": 2.688104519924193, - "language_loss": 0.80865037, - "learning_rate": 3.086592866591809e-06, - "loss": 0.83026439, - "num_input_tokens_seen": 120539180, - "step": 5616, - "time_per_iteration": 2.693419933319092 - }, - { - "auxiliary_loss_clip": 0.01132102, - "auxiliary_loss_mlp": 0.00776249, - "balance_loss_clip": 1.04987526, - "balance_loss_mlp": 1.00074387, - "epoch": 0.33771231023598375, - "flos": 19274585581440.0, - "grad_norm": 5.641479508637021, - "language_loss": 0.83967853, - "learning_rate": 3.0862658769668774e-06, - "loss": 0.85876203, - "num_input_tokens_seen": 120556280, - "step": 5617, - "time_per_iteration": 4.261611461639404 - }, - { - "auxiliary_loss_clip": 0.01065047, - "auxiliary_loss_mlp": 0.01048039, - "balance_loss_clip": 1.0423851, - "balance_loss_mlp": 1.030074, - "epoch": 0.3377724334886517, - "flos": 18150187167360.0, - "grad_norm": 2.2609860925126117, - "language_loss": 0.80159199, - "learning_rate": 3.0859388461506343e-06, - "loss": 0.82272285, - "num_input_tokens_seen": 120575395, - "step": 5618, - "time_per_iteration": 2.8115389347076416 - }, - { - "auxiliary_loss_clip": 0.01092947, - "auxiliary_loss_mlp": 0.01037796, - "balance_loss_clip": 1.04605365, - "balance_loss_mlp": 1.02121353, - "epoch": 0.3378325567413197, - "flos": 25775997310080.0, - "grad_norm": 1.9598490702889584, - "language_loss": 0.7111814, - "learning_rate": 3.085611774155481e-06, - "loss": 0.73248887, - "num_input_tokens_seen": 120596075, - "step": 5619, - "time_per_iteration": 2.86958909034729 - }, - { - "auxiliary_loss_clip": 0.01116213, - "auxiliary_loss_mlp": 0.01047745, - "balance_loss_clip": 1.04749656, - "balance_loss_mlp": 1.03167593, - "epoch": 0.3378926799939877, - "flos": 21317112558720.0, - "grad_norm": 2.630730252639156, - "language_loss": 0.70144761, - "learning_rate": 3.085284660993821e-06, - "loss": 0.72308713, - "num_input_tokens_seen": 120614195, - "step": 5620, - "time_per_iteration": 2.6953368186950684 - }, - { - "auxiliary_loss_clip": 0.01136416, - "auxiliary_loss_mlp": 0.01047216, - "balance_loss_clip": 1.05076015, - "balance_loss_mlp": 1.03201699, - "epoch": 0.33795280324665566, - "flos": 24900028335360.0, - "grad_norm": 1.8373178803043773, - "language_loss": 0.67899036, - "learning_rate": 3.084957506678058e-06, - "loss": 0.70082676, - "num_input_tokens_seen": 120634475, - "step": 5621, - "time_per_iteration": 2.6531872749328613 - }, - { - "auxiliary_loss_clip": 0.0110792, - "auxiliary_loss_mlp": 0.01044445, - "balance_loss_clip": 1.04716897, - "balance_loss_mlp": 1.02814865, - "epoch": 0.33801292649932363, - "flos": 24753943722240.0, - "grad_norm": 1.7693089540657438, - "language_loss": 0.82862681, - "learning_rate": 3.0846303112205975e-06, - "loss": 0.85015041, - "num_input_tokens_seen": 120654980, - "step": 5622, - "time_per_iteration": 2.7764267921447754 - }, - { - "auxiliary_loss_clip": 0.01097036, - "auxiliary_loss_mlp": 0.01041227, - "balance_loss_clip": 1.043239, - "balance_loss_mlp": 1.02565813, - "epoch": 0.3380730497519916, - "flos": 26723967096960.0, - "grad_norm": 7.015051283901371, - "language_loss": 0.73815429, - "learning_rate": 3.0843030746338464e-06, - "loss": 0.75953692, - "num_input_tokens_seen": 120676245, - "step": 5623, - "time_per_iteration": 2.7962961196899414 - }, - { - "auxiliary_loss_clip": 0.0104645, - "auxiliary_loss_mlp": 0.01031816, - "balance_loss_clip": 1.03514934, - "balance_loss_mlp": 1.0298605, - "epoch": 0.33813317300465956, - "flos": 70035756416640.0, - "grad_norm": 0.757644747116446, - "language_loss": 0.55002284, - "learning_rate": 3.083975796930215e-06, - "loss": 0.57080543, - "num_input_tokens_seen": 120741965, - "step": 5624, - "time_per_iteration": 3.3495559692382812 - }, - { - "auxiliary_loss_clip": 0.01091887, - "auxiliary_loss_mlp": 0.01055525, - "balance_loss_clip": 1.04508519, - "balance_loss_mlp": 1.03704786, - "epoch": 0.3381932962573275, - "flos": 24097317148800.0, - "grad_norm": 3.1490866232839876, - "language_loss": 0.73299229, - "learning_rate": 3.083648478122111e-06, - "loss": 0.75446641, - "num_input_tokens_seen": 120760410, - "step": 5625, - "time_per_iteration": 2.7474253177642822 - }, - { - "auxiliary_loss_clip": 0.01127839, - "auxiliary_loss_mlp": 0.01045252, - "balance_loss_clip": 1.04838002, - "balance_loss_mlp": 1.02828884, - "epoch": 0.3382534195099955, - "flos": 19278248768640.0, - "grad_norm": 5.828984180477566, - "language_loss": 0.70578009, - "learning_rate": 3.0833211182219497e-06, - "loss": 0.72751105, - "num_input_tokens_seen": 120777705, - "step": 5626, - "time_per_iteration": 2.6597115993499756 - }, - { - "auxiliary_loss_clip": 0.01108172, - "auxiliary_loss_mlp": 0.01041744, - "balance_loss_clip": 1.04509664, - "balance_loss_mlp": 1.02605569, - "epoch": 0.33831354276266346, - "flos": 25226240676480.0, - "grad_norm": 3.2927176036830574, - "language_loss": 0.80853224, - "learning_rate": 3.0829937172421425e-06, - "loss": 0.83003139, - "num_input_tokens_seen": 120798660, - "step": 5627, - "time_per_iteration": 2.730774402618408 - }, - { - "auxiliary_loss_clip": 0.01131612, - "auxiliary_loss_mlp": 0.0077564, - "balance_loss_clip": 1.05286694, - "balance_loss_mlp": 1.00064421, - "epoch": 0.3383736660153314, - "flos": 23112000195840.0, - "grad_norm": 2.306116347111899, - "language_loss": 0.80454439, - "learning_rate": 3.0826662751951055e-06, - "loss": 0.82361686, - "num_input_tokens_seen": 120816705, - "step": 5628, - "time_per_iteration": 2.691471576690674 - }, - { - "auxiliary_loss_clip": 0.01080566, - "auxiliary_loss_mlp": 0.01046147, - "balance_loss_clip": 1.04250276, - "balance_loss_mlp": 1.02787185, - "epoch": 0.3384337892679994, - "flos": 23477139901440.0, - "grad_norm": 3.64262689820424, - "language_loss": 0.77174091, - "learning_rate": 3.082338792093254e-06, - "loss": 0.79300809, - "num_input_tokens_seen": 120835375, - "step": 5629, - "time_per_iteration": 2.7564992904663086 - }, - { - "auxiliary_loss_clip": 0.01116368, - "auxiliary_loss_mlp": 0.01046104, - "balance_loss_clip": 1.04699719, - "balance_loss_mlp": 1.02819836, - "epoch": 0.33849391252066735, - "flos": 19425805839360.0, - "grad_norm": 4.669184863549949, - "language_loss": 0.84738326, - "learning_rate": 3.0820112679490074e-06, - "loss": 0.86900795, - "num_input_tokens_seen": 120854260, - "step": 5630, - "time_per_iteration": 2.7284910678863525 - }, - { - "auxiliary_loss_clip": 0.0108732, - "auxiliary_loss_mlp": 0.01055965, - "balance_loss_clip": 1.04692125, - "balance_loss_mlp": 1.03889382, - "epoch": 0.3385540357733353, - "flos": 21064840364160.0, - "grad_norm": 2.0951078731071204, - "language_loss": 0.71627271, - "learning_rate": 3.0816837027747857e-06, - "loss": 0.73770559, - "num_input_tokens_seen": 120871590, - "step": 5631, - "time_per_iteration": 2.7423501014709473 - }, - { - "auxiliary_loss_clip": 0.01036653, - "auxiliary_loss_mlp": 0.01008716, - "balance_loss_clip": 1.02691352, - "balance_loss_mlp": 1.00683236, - "epoch": 0.3386141590260033, - "flos": 69208013450880.0, - "grad_norm": 0.8383263502294551, - "language_loss": 0.56103444, - "learning_rate": 3.0813560965830084e-06, - "loss": 0.58148813, - "num_input_tokens_seen": 120925550, - "step": 5632, - "time_per_iteration": 3.24780535697937 - }, - { - "auxiliary_loss_clip": 0.01122742, - "auxiliary_loss_mlp": 0.01038822, - "balance_loss_clip": 1.05064476, - "balance_loss_mlp": 1.02198935, - "epoch": 0.3386742822786713, - "flos": 25519487310720.0, - "grad_norm": 1.5341010429525646, - "language_loss": 0.80410492, - "learning_rate": 3.0810284493861005e-06, - "loss": 0.82572055, - "num_input_tokens_seen": 120947620, - "step": 5633, - "time_per_iteration": 2.6492738723754883 - }, - { - "auxiliary_loss_clip": 0.01099799, - "auxiliary_loss_mlp": 0.01044702, - "balance_loss_clip": 1.04435778, - "balance_loss_mlp": 1.02854943, - "epoch": 0.33873440553133927, - "flos": 23623116773760.0, - "grad_norm": 2.1401050060877997, - "language_loss": 0.59013391, - "learning_rate": 3.0807007611964855e-06, - "loss": 0.61157894, - "num_input_tokens_seen": 120965205, - "step": 5634, - "time_per_iteration": 2.7261369228363037 - }, - { - "auxiliary_loss_clip": 0.01106157, - "auxiliary_loss_mlp": 0.01040516, - "balance_loss_clip": 1.04877985, - "balance_loss_mlp": 1.02482784, - "epoch": 0.33879452878400723, - "flos": 17088882992640.0, - "grad_norm": 1.8243057386875807, - "language_loss": 0.92440355, - "learning_rate": 3.080373032026589e-06, - "loss": 0.94587028, - "num_input_tokens_seen": 120983560, - "step": 5635, - "time_per_iteration": 2.627788782119751 - }, - { - "auxiliary_loss_clip": 0.01091476, - "auxiliary_loss_mlp": 0.01039192, - "balance_loss_clip": 1.05005646, - "balance_loss_mlp": 1.02288401, - "epoch": 0.3388546520366752, - "flos": 15742053607680.0, - "grad_norm": 2.00681285666687, - "language_loss": 0.75539577, - "learning_rate": 3.0800452618888386e-06, - "loss": 0.7767024, - "num_input_tokens_seen": 121001400, - "step": 5636, - "time_per_iteration": 2.706772565841675 - }, - { - "auxiliary_loss_clip": 0.0112617, - "auxiliary_loss_mlp": 0.01044921, - "balance_loss_clip": 1.05089188, - "balance_loss_mlp": 1.02866137, - "epoch": 0.33891477528934316, - "flos": 22418744728320.0, - "grad_norm": 1.7127540900641318, - "language_loss": 0.83448696, - "learning_rate": 3.0797174507956637e-06, - "loss": 0.85619783, - "num_input_tokens_seen": 121021760, - "step": 5637, - "time_per_iteration": 2.6864166259765625 - }, - { - "auxiliary_loss_clip": 0.0109052, - "auxiliary_loss_mlp": 0.01051499, - "balance_loss_clip": 1.04899251, - "balance_loss_mlp": 1.03193665, - "epoch": 0.3389748985420111, - "flos": 17274828723840.0, - "grad_norm": 1.650296659926583, - "language_loss": 0.70123053, - "learning_rate": 3.079389598759495e-06, - "loss": 0.72265071, - "num_input_tokens_seen": 121041070, - "step": 5638, - "time_per_iteration": 2.7513418197631836 - }, - { - "auxiliary_loss_clip": 0.01107421, - "auxiliary_loss_mlp": 0.01049541, - "balance_loss_clip": 1.0486834, - "balance_loss_mlp": 1.0325892, - "epoch": 0.3390350217946791, - "flos": 27744979190400.0, - "grad_norm": 3.471125425253904, - "language_loss": 0.80819786, - "learning_rate": 3.079061705792765e-06, - "loss": 0.82976747, - "num_input_tokens_seen": 121060890, - "step": 5639, - "time_per_iteration": 2.8025810718536377 - }, - { - "auxiliary_loss_clip": 0.01143398, - "auxiliary_loss_mlp": 0.01048836, - "balance_loss_clip": 1.0533762, - "balance_loss_mlp": 1.03158689, - "epoch": 0.33909514504734706, - "flos": 20339804338560.0, - "grad_norm": 8.162571098362656, - "language_loss": 0.67619336, - "learning_rate": 3.078733771907907e-06, - "loss": 0.69811565, - "num_input_tokens_seen": 121079135, - "step": 5640, - "time_per_iteration": 2.662127733230591 - }, - { - "auxiliary_loss_clip": 0.01114186, - "auxiliary_loss_mlp": 0.01038526, - "balance_loss_clip": 1.04930854, - "balance_loss_mlp": 1.02196789, - "epoch": 0.339155268300015, - "flos": 14830030356480.0, - "grad_norm": 1.6687164879604648, - "language_loss": 0.69589841, - "learning_rate": 3.0784057971173554e-06, - "loss": 0.71742553, - "num_input_tokens_seen": 121097685, - "step": 5641, - "time_per_iteration": 2.6596109867095947 - }, - { - "auxiliary_loss_clip": 0.01142481, - "auxiliary_loss_mlp": 0.0104296, - "balance_loss_clip": 1.05451512, - "balance_loss_mlp": 1.02698565, - "epoch": 0.339215391552683, - "flos": 26067951054720.0, - "grad_norm": 2.4357287647671266, - "language_loss": 0.87591994, - "learning_rate": 3.0780777814335483e-06, - "loss": 0.89777428, - "num_input_tokens_seen": 121115640, - "step": 5642, - "time_per_iteration": 2.6347198486328125 - }, - { - "auxiliary_loss_clip": 0.01117312, - "auxiliary_loss_mlp": 0.01034931, - "balance_loss_clip": 1.04759669, - "balance_loss_mlp": 1.02112639, - "epoch": 0.33927551480535095, - "flos": 14574705505920.0, - "grad_norm": 1.860184080586481, - "language_loss": 0.83900917, - "learning_rate": 3.077749724868924e-06, - "loss": 0.86053157, - "num_input_tokens_seen": 121132485, - "step": 5643, - "time_per_iteration": 2.678086042404175 - }, - { - "auxiliary_loss_clip": 0.01107188, - "auxiliary_loss_mlp": 0.01049417, - "balance_loss_clip": 1.04616475, - "balance_loss_mlp": 1.03295422, - "epoch": 0.3393356380580189, - "flos": 23805578885760.0, - "grad_norm": 4.293096130940915, - "language_loss": 0.76897138, - "learning_rate": 3.077421627435922e-06, - "loss": 0.79053748, - "num_input_tokens_seen": 121152935, - "step": 5644, - "time_per_iteration": 2.6681976318359375 - }, - { - "auxiliary_loss_clip": 0.01123, - "auxiliary_loss_mlp": 0.01046638, - "balance_loss_clip": 1.05055666, - "balance_loss_mlp": 1.02978194, - "epoch": 0.3393957613106869, - "flos": 17347871030400.0, - "grad_norm": 8.889141309374795, - "language_loss": 0.62855232, - "learning_rate": 3.0770934891469832e-06, - "loss": 0.65024871, - "num_input_tokens_seen": 121169835, - "step": 5645, - "time_per_iteration": 2.5976576805114746 - }, - { - "auxiliary_loss_clip": 0.01123901, - "auxiliary_loss_mlp": 0.01042398, - "balance_loss_clip": 1.04963613, - "balance_loss_mlp": 1.0272944, - "epoch": 0.3394558845633549, - "flos": 28433960939520.0, - "grad_norm": 1.8158202042065192, - "language_loss": 0.76223624, - "learning_rate": 3.076765310014552e-06, - "loss": 0.78389925, - "num_input_tokens_seen": 121190290, - "step": 5646, - "time_per_iteration": 2.674058437347412 - }, - { - "auxiliary_loss_clip": 0.01128511, - "auxiliary_loss_mlp": 0.01049927, - "balance_loss_clip": 1.05314088, - "balance_loss_mlp": 1.03245091, - "epoch": 0.33951600781602287, - "flos": 22086929865600.0, - "grad_norm": 2.6597837481337256, - "language_loss": 0.78888249, - "learning_rate": 3.0764370900510727e-06, - "loss": 0.81066692, - "num_input_tokens_seen": 121209060, - "step": 5647, - "time_per_iteration": 2.636462688446045 - }, - { - "auxiliary_loss_clip": 0.01113432, - "auxiliary_loss_mlp": 0.0077397, - "balance_loss_clip": 1.05254745, - "balance_loss_mlp": 1.00053275, - "epoch": 0.33957613106869083, - "flos": 23878262056320.0, - "grad_norm": 2.0563114900155037, - "language_loss": 0.77694631, - "learning_rate": 3.0761088292689904e-06, - "loss": 0.7958203, - "num_input_tokens_seen": 121227480, - "step": 5648, - "time_per_iteration": 2.704535484313965 - }, - { - "auxiliary_loss_clip": 0.00999132, - "auxiliary_loss_mlp": 0.01023587, - "balance_loss_clip": 1.03748918, - "balance_loss_mlp": 1.02168012, - "epoch": 0.3396362543213588, - "flos": 71242642414080.0, - "grad_norm": 0.7822172669689142, - "language_loss": 0.56281364, - "learning_rate": 3.075780527680754e-06, - "loss": 0.58304083, - "num_input_tokens_seen": 121291305, - "step": 5649, - "time_per_iteration": 3.6428561210632324 - }, - { - "auxiliary_loss_clip": 0.01109513, - "auxiliary_loss_mlp": 0.00776659, - "balance_loss_clip": 1.04886901, - "balance_loss_mlp": 1.00053644, - "epoch": 0.33969637757402676, - "flos": 25921615046400.0, - "grad_norm": 1.4990429944851429, - "language_loss": 0.85522908, - "learning_rate": 3.0754521852988117e-06, - "loss": 0.87409085, - "num_input_tokens_seen": 121312740, - "step": 5650, - "time_per_iteration": 4.6250996589660645 - }, - { - "auxiliary_loss_clip": 0.01125063, - "auxiliary_loss_mlp": 0.01029114, - "balance_loss_clip": 1.04845572, - "balance_loss_mlp": 1.01392674, - "epoch": 0.33975650082669473, - "flos": 35261728663680.0, - "grad_norm": 1.7009103293103713, - "language_loss": 0.70462626, - "learning_rate": 3.0751238021356152e-06, - "loss": 0.7261681, - "num_input_tokens_seen": 121334220, - "step": 5651, - "time_per_iteration": 3.0873425006866455 - }, - { - "auxiliary_loss_clip": 0.01088353, - "auxiliary_loss_mlp": 0.01041459, - "balance_loss_clip": 1.04718101, - "balance_loss_mlp": 1.02539587, - "epoch": 0.3398166240793627, - "flos": 16647001879680.0, - "grad_norm": 2.657059560006321, - "language_loss": 0.80932343, - "learning_rate": 3.074795378203616e-06, - "loss": 0.83062148, - "num_input_tokens_seen": 121351870, - "step": 5652, - "time_per_iteration": 2.957105875015259 - }, - { - "auxiliary_loss_clip": 0.01143187, - "auxiliary_loss_mlp": 0.0104477, - "balance_loss_clip": 1.05543184, - "balance_loss_mlp": 1.0275445, - "epoch": 0.33987674733203066, - "flos": 24062196625920.0, - "grad_norm": 2.181969038816262, - "language_loss": 0.76847494, - "learning_rate": 3.0744669135152685e-06, - "loss": 0.79035449, - "num_input_tokens_seen": 121373400, - "step": 5653, - "time_per_iteration": 4.277743816375732 - }, - { - "auxiliary_loss_clip": 0.01117346, - "auxiliary_loss_mlp": 0.01041107, - "balance_loss_clip": 1.04708898, - "balance_loss_mlp": 1.02475142, - "epoch": 0.3399368705846986, - "flos": 13250678279040.0, - "grad_norm": 2.9108557214850217, - "language_loss": 0.85412633, - "learning_rate": 3.0741384080830278e-06, - "loss": 0.8757109, - "num_input_tokens_seen": 121385225, - "step": 5654, - "time_per_iteration": 4.243285179138184 - }, - { - "auxiliary_loss_clip": 0.01118111, - "auxiliary_loss_mlp": 0.01041226, - "balance_loss_clip": 1.04521537, - "balance_loss_mlp": 1.02490664, - "epoch": 0.3399969938373666, - "flos": 27012832272000.0, - "grad_norm": 5.5024852924346765, - "language_loss": 0.64919531, - "learning_rate": 3.073809861919351e-06, - "loss": 0.67078876, - "num_input_tokens_seen": 121404735, - "step": 5655, - "time_per_iteration": 2.793121576309204 - }, - { - "auxiliary_loss_clip": 0.01129599, - "auxiliary_loss_mlp": 0.01043607, - "balance_loss_clip": 1.05404055, - "balance_loss_mlp": 1.02828872, - "epoch": 0.34005711709003456, - "flos": 28550096588160.0, - "grad_norm": 1.7231624830718477, - "language_loss": 0.7624622, - "learning_rate": 3.073481275036697e-06, - "loss": 0.78419423, - "num_input_tokens_seen": 121426780, - "step": 5656, - "time_per_iteration": 2.739227056503296 - }, - { - "auxiliary_loss_clip": 0.01102847, - "auxiliary_loss_mlp": 0.01040319, - "balance_loss_clip": 1.0458467, - "balance_loss_mlp": 1.02364159, - "epoch": 0.3401172403427025, - "flos": 21617003208960.0, - "grad_norm": 8.964185236965056, - "language_loss": 0.82842731, - "learning_rate": 3.073152647447525e-06, - "loss": 0.849859, - "num_input_tokens_seen": 121447245, - "step": 5657, - "time_per_iteration": 5.179774761199951 - }, - { - "auxiliary_loss_clip": 0.01113742, - "auxiliary_loss_mlp": 0.01048481, - "balance_loss_clip": 1.05169284, - "balance_loss_mlp": 1.03313899, - "epoch": 0.3401773635953705, - "flos": 25885776251520.0, - "grad_norm": 1.8385093437954252, - "language_loss": 0.85050905, - "learning_rate": 3.0728239791642976e-06, - "loss": 0.87213123, - "num_input_tokens_seen": 121468165, - "step": 5658, - "time_per_iteration": 2.776137351989746 - }, - { - "auxiliary_loss_clip": 0.01053106, - "auxiliary_loss_mlp": 0.01016184, - "balance_loss_clip": 1.03449082, - "balance_loss_mlp": 1.01424086, - "epoch": 0.3402374868480385, - "flos": 65507995336320.0, - "grad_norm": 0.825209949556337, - "language_loss": 0.59988189, - "learning_rate": 3.072495270199477e-06, - "loss": 0.62057471, - "num_input_tokens_seen": 121523795, - "step": 5659, - "time_per_iteration": 3.272684335708618 - }, - { - "auxiliary_loss_clip": 0.01137862, - "auxiliary_loss_mlp": 0.01036085, - "balance_loss_clip": 1.05531621, - "balance_loss_mlp": 1.02102888, - "epoch": 0.34029761010070647, - "flos": 24060580513920.0, - "grad_norm": 2.521681543348545, - "language_loss": 0.67763948, - "learning_rate": 3.0721665205655284e-06, - "loss": 0.69937897, - "num_input_tokens_seen": 121542950, - "step": 5660, - "time_per_iteration": 2.699267864227295 - }, - { - "auxiliary_loss_clip": 0.01142235, - "auxiliary_loss_mlp": 0.010443, - "balance_loss_clip": 1.05695057, - "balance_loss_mlp": 1.02787328, - "epoch": 0.34035773335337444, - "flos": 27599720590080.0, - "grad_norm": 1.9299535220965447, - "language_loss": 0.67668259, - "learning_rate": 3.071837730274918e-06, - "loss": 0.69854796, - "num_input_tokens_seen": 121562765, - "step": 5661, - "time_per_iteration": 2.647101402282715 - }, - { - "auxiliary_loss_clip": 0.01119112, - "auxiliary_loss_mlp": 0.01041902, - "balance_loss_clip": 1.05479288, - "balance_loss_mlp": 1.02634561, - "epoch": 0.3404178566060424, - "flos": 20812783651200.0, - "grad_norm": 2.0521689983251954, - "language_loss": 0.78806192, - "learning_rate": 3.071508899340113e-06, - "loss": 0.80967206, - "num_input_tokens_seen": 121581610, - "step": 5662, - "time_per_iteration": 2.847168207168579 - }, - { - "auxiliary_loss_clip": 0.01103563, - "auxiliary_loss_mlp": 0.01041962, - "balance_loss_clip": 1.05163002, - "balance_loss_mlp": 1.02498698, - "epoch": 0.34047797985871037, - "flos": 26833566470400.0, - "grad_norm": 2.226848836482441, - "language_loss": 0.73531127, - "learning_rate": 3.0711800277735833e-06, - "loss": 0.75676656, - "num_input_tokens_seen": 121601885, - "step": 5663, - "time_per_iteration": 2.8581340312957764 - }, - { - "auxiliary_loss_clip": 0.01090462, - "auxiliary_loss_mlp": 0.01035271, - "balance_loss_clip": 1.04631042, - "balance_loss_mlp": 1.02079868, - "epoch": 0.34053810311137833, - "flos": 19682639061120.0, - "grad_norm": 1.7108226041633658, - "language_loss": 0.86297357, - "learning_rate": 3.0708511155877997e-06, - "loss": 0.88423085, - "num_input_tokens_seen": 121621335, - "step": 5664, - "time_per_iteration": 2.778038501739502 - }, - { - "auxiliary_loss_clip": 0.01139377, - "auxiliary_loss_mlp": 0.0103938, - "balance_loss_clip": 1.05399597, - "balance_loss_mlp": 1.0245564, - "epoch": 0.3405982263640463, - "flos": 21725740656000.0, - "grad_norm": 2.2398696420560675, - "language_loss": 0.68712831, - "learning_rate": 3.070522162795235e-06, - "loss": 0.70891583, - "num_input_tokens_seen": 121641310, - "step": 5665, - "time_per_iteration": 2.688643217086792 - }, - { - "auxiliary_loss_clip": 0.01138662, - "auxiliary_loss_mlp": 0.01039766, - "balance_loss_clip": 1.05278993, - "balance_loss_mlp": 1.0229218, - "epoch": 0.34065834961671426, - "flos": 18041629288320.0, - "grad_norm": 2.716291820837314, - "language_loss": 0.73084486, - "learning_rate": 3.0701931694083626e-06, - "loss": 0.7526291, - "num_input_tokens_seen": 121659625, - "step": 5666, - "time_per_iteration": 2.7325544357299805 - }, - { - "auxiliary_loss_clip": 0.01128915, - "auxiliary_loss_mlp": 0.01039671, - "balance_loss_clip": 1.05135012, - "balance_loss_mlp": 1.0244832, - "epoch": 0.3407184728693822, - "flos": 21397337585280.0, - "grad_norm": 2.363121461769924, - "language_loss": 0.72947341, - "learning_rate": 3.0698641354396576e-06, - "loss": 0.75115931, - "num_input_tokens_seen": 121679205, - "step": 5667, - "time_per_iteration": 2.7143874168395996 - }, - { - "auxiliary_loss_clip": 0.01042137, - "auxiliary_loss_mlp": 0.01008076, - "balance_loss_clip": 1.02401757, - "balance_loss_mlp": 1.00638342, - "epoch": 0.3407785961220502, - "flos": 68688101018880.0, - "grad_norm": 0.8313790259289849, - "language_loss": 0.63259363, - "learning_rate": 3.069535060901597e-06, - "loss": 0.65309572, - "num_input_tokens_seen": 121751085, - "step": 5668, - "time_per_iteration": 3.3907217979431152 - }, - { - "auxiliary_loss_clip": 0.01036989, - "auxiliary_loss_mlp": 0.01045108, - "balance_loss_clip": 1.03961444, - "balance_loss_mlp": 1.02808475, - "epoch": 0.34083871937471816, - "flos": 14064379027200.0, - "grad_norm": 2.2447075161594365, - "language_loss": 0.71795446, - "learning_rate": 3.0692059458066596e-06, - "loss": 0.73877549, - "num_input_tokens_seen": 121768565, - "step": 5669, - "time_per_iteration": 2.941349983215332 - }, - { - "auxiliary_loss_clip": 0.0110323, - "auxiliary_loss_mlp": 0.00773367, - "balance_loss_clip": 1.04966998, - "balance_loss_mlp": 1.00054646, - "epoch": 0.3408988426273861, - "flos": 17085435287040.0, - "grad_norm": 1.973306725053756, - "language_loss": 0.80678529, - "learning_rate": 3.0688767901673265e-06, - "loss": 0.82555127, - "num_input_tokens_seen": 121784925, - "step": 5670, - "time_per_iteration": 2.8877930641174316 - }, - { - "auxiliary_loss_clip": 0.01088488, - "auxiliary_loss_mlp": 0.01037182, - "balance_loss_clip": 1.04484558, - "balance_loss_mlp": 1.02111244, - "epoch": 0.3409589658800541, - "flos": 24024562151040.0, - "grad_norm": 1.926244069219147, - "language_loss": 0.77521646, - "learning_rate": 3.068547593996078e-06, - "loss": 0.79647315, - "num_input_tokens_seen": 121804425, - "step": 5671, - "time_per_iteration": 2.886425256729126 - }, - { - "auxiliary_loss_clip": 0.01138739, - "auxiliary_loss_mlp": 0.0077388, - "balance_loss_clip": 1.05301285, - "balance_loss_mlp": 1.00052333, - "epoch": 0.34101908913272205, - "flos": 21142012734720.0, - "grad_norm": 3.7152219569219427, - "language_loss": 0.74220848, - "learning_rate": 3.0682183573053974e-06, - "loss": 0.76133466, - "num_input_tokens_seen": 121825145, - "step": 5672, - "time_per_iteration": 2.751692056655884 - }, - { - "auxiliary_loss_clip": 0.01121109, - "auxiliary_loss_mlp": 0.01047405, - "balance_loss_clip": 1.04886246, - "balance_loss_mlp": 1.03089476, - "epoch": 0.3410792123853901, - "flos": 15702012921600.0, - "grad_norm": 1.8011032028958165, - "language_loss": 0.73721337, - "learning_rate": 3.06788908010777e-06, - "loss": 0.7588985, - "num_input_tokens_seen": 121842185, - "step": 5673, - "time_per_iteration": 2.6628050804138184 - }, - { - "auxiliary_loss_clip": 0.01126244, - "auxiliary_loss_mlp": 0.01038975, - "balance_loss_clip": 1.05143654, - "balance_loss_mlp": 1.02362132, - "epoch": 0.34113933563805804, - "flos": 23036012974080.0, - "grad_norm": 1.7591090628800392, - "language_loss": 0.79972708, - "learning_rate": 3.067559762415682e-06, - "loss": 0.8213793, - "num_input_tokens_seen": 121862260, - "step": 5674, - "time_per_iteration": 2.6803476810455322 - }, - { - "auxiliary_loss_clip": 0.01054856, - "auxiliary_loss_mlp": 0.01001466, - "balance_loss_clip": 1.0258925, - "balance_loss_mlp": 0.9994635, - "epoch": 0.341199458890726, - "flos": 69614235336960.0, - "grad_norm": 0.7875282266281167, - "language_loss": 0.56080592, - "learning_rate": 3.0672304042416198e-06, - "loss": 0.5813691, - "num_input_tokens_seen": 121923560, - "step": 5675, - "time_per_iteration": 3.3068313598632812 - }, - { - "auxiliary_loss_clip": 0.01115956, - "auxiliary_loss_mlp": 0.00773448, - "balance_loss_clip": 1.052145, - "balance_loss_mlp": 1.0006851, - "epoch": 0.34125958214339397, - "flos": 22346348866560.0, - "grad_norm": 1.6444328441844458, - "language_loss": 0.78795338, - "learning_rate": 3.0669010055980734e-06, - "loss": 0.80684733, - "num_input_tokens_seen": 121943515, - "step": 5676, - "time_per_iteration": 2.7983739376068115 - }, - { - "auxiliary_loss_clip": 0.01120251, - "auxiliary_loss_mlp": 0.01036846, - "balance_loss_clip": 1.04593658, - "balance_loss_mlp": 1.02024043, - "epoch": 0.34131970539606193, - "flos": 21871933009920.0, - "grad_norm": 1.8897537275348075, - "language_loss": 0.85468972, - "learning_rate": 3.0665715664975357e-06, - "loss": 0.8762607, - "num_input_tokens_seen": 121962540, - "step": 5677, - "time_per_iteration": 2.698751449584961 - }, - { - "auxiliary_loss_clip": 0.01109896, - "auxiliary_loss_mlp": 0.01042182, - "balance_loss_clip": 1.04772925, - "balance_loss_mlp": 1.02586842, - "epoch": 0.3413798286487299, - "flos": 24935723475840.0, - "grad_norm": 1.7514589696636707, - "language_loss": 0.79352021, - "learning_rate": 3.0662420869524966e-06, - "loss": 0.81504107, - "num_input_tokens_seen": 121979830, - "step": 5678, - "time_per_iteration": 2.731834650039673 - }, - { - "auxiliary_loss_clip": 0.01123477, - "auxiliary_loss_mlp": 0.01033453, - "balance_loss_clip": 1.04799783, - "balance_loss_mlp": 1.01833677, - "epoch": 0.34143995190139786, - "flos": 25374372364800.0, - "grad_norm": 1.8765190883227818, - "language_loss": 0.74821675, - "learning_rate": 3.0659125669754506e-06, - "loss": 0.76978606, - "num_input_tokens_seen": 121999055, - "step": 5679, - "time_per_iteration": 2.7362489700317383 - }, - { - "auxiliary_loss_clip": 0.01044772, - "auxiliary_loss_mlp": 0.01004164, - "balance_loss_clip": 1.02617037, - "balance_loss_mlp": 1.00210214, - "epoch": 0.34150007515406583, - "flos": 67782578129280.0, - "grad_norm": 0.716476818724812, - "language_loss": 0.59445524, - "learning_rate": 3.0655830065788923e-06, - "loss": 0.61494464, - "num_input_tokens_seen": 122067015, - "step": 5680, - "time_per_iteration": 3.241750955581665 - }, - { - "auxiliary_loss_clip": 0.01108333, - "auxiliary_loss_mlp": 0.01032851, - "balance_loss_clip": 1.04563892, - "balance_loss_mlp": 1.01804543, - "epoch": 0.3415601984067338, - "flos": 20302421258880.0, - "grad_norm": 1.760771174406363, - "language_loss": 0.72054088, - "learning_rate": 3.0652534057753206e-06, - "loss": 0.74195278, - "num_input_tokens_seen": 122085295, - "step": 5681, - "time_per_iteration": 2.7306556701660156 - }, - { - "auxiliary_loss_clip": 0.01109003, - "auxiliary_loss_mlp": 0.0104301, - "balance_loss_clip": 1.0462265, - "balance_loss_mlp": 1.02786994, - "epoch": 0.34162032165940176, - "flos": 26031178506240.0, - "grad_norm": 2.2327180896030443, - "language_loss": 0.71463466, - "learning_rate": 3.064923764577233e-06, - "loss": 0.73615474, - "num_input_tokens_seen": 122104020, - "step": 5682, - "time_per_iteration": 2.825296640396118 - }, - { - "auxiliary_loss_clip": 0.01132395, - "auxiliary_loss_mlp": 0.0104079, - "balance_loss_clip": 1.04721618, - "balance_loss_mlp": 1.02507806, - "epoch": 0.3416804449120697, - "flos": 28803338449920.0, - "grad_norm": 1.5426603390069147, - "language_loss": 0.84101224, - "learning_rate": 3.0645940829971295e-06, - "loss": 0.86274409, - "num_input_tokens_seen": 122125080, - "step": 5683, - "time_per_iteration": 2.6654412746429443 - }, - { - "auxiliary_loss_clip": 0.01112942, - "auxiliary_loss_mlp": 0.01047099, - "balance_loss_clip": 1.04768562, - "balance_loss_mlp": 1.03113699, - "epoch": 0.3417405681647377, - "flos": 22601601889920.0, - "grad_norm": 4.046428716645244, - "language_loss": 0.70964772, - "learning_rate": 3.0642643610475116e-06, - "loss": 0.73124808, - "num_input_tokens_seen": 122146350, - "step": 5684, - "time_per_iteration": 2.724592924118042 - }, - { - "auxiliary_loss_clip": 0.01132202, - "auxiliary_loss_mlp": 0.01038054, - "balance_loss_clip": 1.04905093, - "balance_loss_mlp": 1.02367699, - "epoch": 0.34180069141740566, - "flos": 24716237420160.0, - "grad_norm": 1.9204482618269598, - "language_loss": 0.74832582, - "learning_rate": 3.0639345987408823e-06, - "loss": 0.77002841, - "num_input_tokens_seen": 122168085, - "step": 5685, - "time_per_iteration": 2.7046890258789062 - }, - { - "auxiliary_loss_clip": 0.01114777, - "auxiliary_loss_mlp": 0.0104831, - "balance_loss_clip": 1.04522872, - "balance_loss_mlp": 1.03261042, - "epoch": 0.3418608146700737, - "flos": 30518755246080.0, - "grad_norm": 1.9200820074556442, - "language_loss": 0.70611888, - "learning_rate": 3.0636047960897468e-06, - "loss": 0.72774971, - "num_input_tokens_seen": 122191040, - "step": 5686, - "time_per_iteration": 2.7390410900115967 - }, - { - "auxiliary_loss_clip": 0.01123208, - "auxiliary_loss_mlp": 0.01044107, - "balance_loss_clip": 1.04809284, - "balance_loss_mlp": 1.02819252, - "epoch": 0.34192093792274164, - "flos": 15122343237120.0, - "grad_norm": 2.0197354521106563, - "language_loss": 0.77240539, - "learning_rate": 3.06327495310661e-06, - "loss": 0.79407853, - "num_input_tokens_seen": 122209225, - "step": 5687, - "time_per_iteration": 2.6381263732910156 - }, - { - "auxiliary_loss_clip": 0.01106353, - "auxiliary_loss_mlp": 0.01040255, - "balance_loss_clip": 1.04849195, - "balance_loss_mlp": 1.02412593, - "epoch": 0.3419810611754096, - "flos": 13187799521280.0, - "grad_norm": 3.7332163528162385, - "language_loss": 0.8676976, - "learning_rate": 3.062945069803981e-06, - "loss": 0.88916373, - "num_input_tokens_seen": 122226160, - "step": 5688, - "time_per_iteration": 2.647320508956909 - }, - { - "auxiliary_loss_clip": 0.01119843, - "auxiliary_loss_mlp": 0.01042145, - "balance_loss_clip": 1.04928863, - "balance_loss_mlp": 1.0255394, - "epoch": 0.34204118442807757, - "flos": 19536267139200.0, - "grad_norm": 1.870477619822585, - "language_loss": 0.79564822, - "learning_rate": 3.0626151461943684e-06, - "loss": 0.81726807, - "num_input_tokens_seen": 122243115, - "step": 5689, - "time_per_iteration": 4.1660990715026855 - }, - { - "auxiliary_loss_clip": 0.0112576, - "auxiliary_loss_mlp": 0.01042306, - "balance_loss_clip": 1.04875994, - "balance_loss_mlp": 1.02580786, - "epoch": 0.34210130768074554, - "flos": 15194846839680.0, - "grad_norm": 1.7530560995380315, - "language_loss": 0.73215616, - "learning_rate": 3.0622851822902834e-06, - "loss": 0.75383675, - "num_input_tokens_seen": 122261105, - "step": 5690, - "time_per_iteration": 2.699846029281616 - }, - { - "auxiliary_loss_clip": 0.01115188, - "auxiliary_loss_mlp": 0.01047594, - "balance_loss_clip": 1.04381919, - "balance_loss_mlp": 1.03121471, - "epoch": 0.3421614309334135, - "flos": 24936226266240.0, - "grad_norm": 2.1339055209058184, - "language_loss": 0.76036334, - "learning_rate": 3.061955178104237e-06, - "loss": 0.78199112, - "num_input_tokens_seen": 122279995, - "step": 5691, - "time_per_iteration": 2.707598924636841 - }, - { - "auxiliary_loss_clip": 0.01119412, - "auxiliary_loss_mlp": 0.01042889, - "balance_loss_clip": 1.04769242, - "balance_loss_mlp": 1.02878046, - "epoch": 0.34222155418608147, - "flos": 21908633731200.0, - "grad_norm": 1.9419180569645556, - "language_loss": 0.68321705, - "learning_rate": 3.0616251336487447e-06, - "loss": 0.70484006, - "num_input_tokens_seen": 122299070, - "step": 5692, - "time_per_iteration": 2.6876816749572754 - }, - { - "auxiliary_loss_clip": 0.01123804, - "auxiliary_loss_mlp": 0.01042902, - "balance_loss_clip": 1.0481621, - "balance_loss_mlp": 1.02660608, - "epoch": 0.34228167743874943, - "flos": 18114061063680.0, - "grad_norm": 2.8342834288415504, - "language_loss": 0.72458065, - "learning_rate": 3.06129504893632e-06, - "loss": 0.74624765, - "num_input_tokens_seen": 122316800, - "step": 5693, - "time_per_iteration": 5.672837018966675 - }, - { - "auxiliary_loss_clip": 0.01090312, - "auxiliary_loss_mlp": 0.01043466, - "balance_loss_clip": 1.0433774, - "balance_loss_mlp": 1.02832651, - "epoch": 0.3423418006914174, - "flos": 21288600138240.0, - "grad_norm": 1.9009541760697364, - "language_loss": 0.75556326, - "learning_rate": 3.0609649239794813e-06, - "loss": 0.77690107, - "num_input_tokens_seen": 122335275, - "step": 5694, - "time_per_iteration": 2.713236093521118 - }, - { - "auxiliary_loss_clip": 0.01093804, - "auxiliary_loss_mlp": 0.01036832, - "balance_loss_clip": 1.04769742, - "balance_loss_mlp": 1.02205038, - "epoch": 0.34240192394408536, - "flos": 19823480288640.0, - "grad_norm": 2.1810058063417608, - "language_loss": 0.79590774, - "learning_rate": 3.060634758790747e-06, - "loss": 0.81721413, - "num_input_tokens_seen": 122353215, - "step": 5695, - "time_per_iteration": 2.7206506729125977 - }, - { - "auxiliary_loss_clip": 0.01077977, - "auxiliary_loss_mlp": 0.01043311, - "balance_loss_clip": 1.04183137, - "balance_loss_mlp": 1.02764642, - "epoch": 0.3424620471967533, - "flos": 24535535074560.0, - "grad_norm": 1.8643380844369803, - "language_loss": 0.73428202, - "learning_rate": 3.060304553382635e-06, - "loss": 0.75549489, - "num_input_tokens_seen": 122372495, - "step": 5696, - "time_per_iteration": 4.777001857757568 - }, - { - "auxiliary_loss_clip": 0.01088152, - "auxiliary_loss_mlp": 0.01052674, - "balance_loss_clip": 1.0424118, - "balance_loss_mlp": 1.03569841, - "epoch": 0.3425221704494213, - "flos": 25848895962240.0, - "grad_norm": 5.815439398629578, - "language_loss": 0.71460104, - "learning_rate": 3.0599743077676685e-06, - "loss": 0.73600936, - "num_input_tokens_seen": 122394600, - "step": 5697, - "time_per_iteration": 2.7620668411254883 - }, - { - "auxiliary_loss_clip": 0.01108783, - "auxiliary_loss_mlp": 0.01032533, - "balance_loss_clip": 1.04925871, - "balance_loss_mlp": 1.01740503, - "epoch": 0.34258229370208926, - "flos": 21540513196800.0, - "grad_norm": 2.6993537181180316, - "language_loss": 0.82170486, - "learning_rate": 3.05964402195837e-06, - "loss": 0.84311801, - "num_input_tokens_seen": 122414700, - "step": 5698, - "time_per_iteration": 2.6930580139160156 - }, - { - "auxiliary_loss_clip": 0.01077965, - "auxiliary_loss_mlp": 0.01049711, - "balance_loss_clip": 1.0451839, - "balance_loss_mlp": 1.03073311, - "epoch": 0.3426424169547573, - "flos": 23652778429440.0, - "grad_norm": 2.492082875954734, - "language_loss": 0.68941295, - "learning_rate": 3.0593136959672645e-06, - "loss": 0.71068972, - "num_input_tokens_seen": 122432760, - "step": 5699, - "time_per_iteration": 2.8604705333709717 - }, - { - "auxiliary_loss_clip": 0.01113381, - "auxiliary_loss_mlp": 0.01042187, - "balance_loss_clip": 1.05009818, - "balance_loss_mlp": 1.02698755, - "epoch": 0.34270254020742524, - "flos": 24644883052800.0, - "grad_norm": 2.4799642493365046, - "language_loss": 0.72708368, - "learning_rate": 3.058983329806877e-06, - "loss": 0.74863935, - "num_input_tokens_seen": 122449105, - "step": 5700, - "time_per_iteration": 2.721219301223755 - }, - { - "auxiliary_loss_clip": 0.01107869, - "auxiliary_loss_mlp": 0.01033632, - "balance_loss_clip": 1.05173492, - "balance_loss_mlp": 1.01942825, - "epoch": 0.3427626634600932, - "flos": 20996754134400.0, - "grad_norm": 1.8907099352771195, - "language_loss": 0.81771016, - "learning_rate": 3.0586529234897354e-06, - "loss": 0.83912516, - "num_input_tokens_seen": 122468700, - "step": 5701, - "time_per_iteration": 2.668776273727417 - }, - { - "auxiliary_loss_clip": 0.01122749, - "auxiliary_loss_mlp": 0.01036444, - "balance_loss_clip": 1.05318427, - "balance_loss_mlp": 1.02137566, - "epoch": 0.3428227867127612, - "flos": 21433786911360.0, - "grad_norm": 1.8540703451937275, - "language_loss": 0.71611702, - "learning_rate": 3.0583224770283694e-06, - "loss": 0.73770893, - "num_input_tokens_seen": 122488160, - "step": 5702, - "time_per_iteration": 2.7413434982299805 - }, - { - "auxiliary_loss_clip": 0.01034072, - "auxiliary_loss_mlp": 0.0102117, - "balance_loss_clip": 1.02648544, - "balance_loss_mlp": 1.01936996, - "epoch": 0.34288290996542914, - "flos": 55731782695680.0, - "grad_norm": 0.8291151185510042, - "language_loss": 0.57455015, - "learning_rate": 3.057991990435309e-06, - "loss": 0.59510255, - "num_input_tokens_seen": 122542890, - "step": 5703, - "time_per_iteration": 3.123619318008423 - }, - { - "auxiliary_loss_clip": 0.01125899, - "auxiliary_loss_mlp": 0.01044546, - "balance_loss_clip": 1.05167961, - "balance_loss_mlp": 1.02754664, - "epoch": 0.3429430332180971, - "flos": 20156803522560.0, - "grad_norm": 2.054859273280662, - "language_loss": 0.75049305, - "learning_rate": 3.057661463723086e-06, - "loss": 0.77219748, - "num_input_tokens_seen": 122561770, - "step": 5704, - "time_per_iteration": 2.786344051361084 - }, - { - "auxiliary_loss_clip": 0.01103715, - "auxiliary_loss_mlp": 0.01039493, - "balance_loss_clip": 1.05234969, - "balance_loss_mlp": 1.02506232, - "epoch": 0.34300315647076507, - "flos": 17965857548160.0, - "grad_norm": 1.921400910299184, - "language_loss": 0.72367042, - "learning_rate": 3.0573308969042346e-06, - "loss": 0.74510252, - "num_input_tokens_seen": 122580580, - "step": 5705, - "time_per_iteration": 2.7464826107025146 - }, - { - "auxiliary_loss_clip": 0.01099266, - "auxiliary_loss_mlp": 0.01035276, - "balance_loss_clip": 1.05201912, - "balance_loss_mlp": 1.01980281, - "epoch": 0.34306327972343303, - "flos": 22086822124800.0, - "grad_norm": 2.585473080189318, - "language_loss": 0.80016834, - "learning_rate": 3.057000289991289e-06, - "loss": 0.82151377, - "num_input_tokens_seen": 122599810, - "step": 5706, - "time_per_iteration": 2.83493971824646 - }, - { - "auxiliary_loss_clip": 0.01126183, - "auxiliary_loss_mlp": 0.01037399, - "balance_loss_clip": 1.05822873, - "balance_loss_mlp": 1.02111542, - "epoch": 0.343123402976101, - "flos": 18442679616000.0, - "grad_norm": 2.833985332828215, - "language_loss": 0.83001584, - "learning_rate": 3.056669642996787e-06, - "loss": 0.85165167, - "num_input_tokens_seen": 122616035, - "step": 5707, - "time_per_iteration": 2.6888725757598877 - }, - { - "auxiliary_loss_clip": 0.01130807, - "auxiliary_loss_mlp": 0.01038349, - "balance_loss_clip": 1.05664158, - "balance_loss_mlp": 1.02264881, - "epoch": 0.34318352622876896, - "flos": 17163685065600.0, - "grad_norm": 1.6733576562987098, - "language_loss": 0.75313264, - "learning_rate": 3.056338955933266e-06, - "loss": 0.7748242, - "num_input_tokens_seen": 122633785, - "step": 5708, - "time_per_iteration": 2.655061960220337 - }, - { - "auxiliary_loss_clip": 0.01105586, - "auxiliary_loss_mlp": 0.01039807, - "balance_loss_clip": 1.05063939, - "balance_loss_mlp": 1.02357078, - "epoch": 0.34324364948143693, - "flos": 26688164215680.0, - "grad_norm": 1.6008558791331946, - "language_loss": 0.81187862, - "learning_rate": 3.0560082288132662e-06, - "loss": 0.83333254, - "num_input_tokens_seen": 122652100, - "step": 5709, - "time_per_iteration": 2.7354934215545654 - }, - { - "auxiliary_loss_clip": 0.01119071, - "auxiliary_loss_mlp": 0.01043385, - "balance_loss_clip": 1.0550828, - "balance_loss_mlp": 1.02581382, - "epoch": 0.3433037727341049, - "flos": 21251576194560.0, - "grad_norm": 2.1605529243452297, - "language_loss": 0.79441178, - "learning_rate": 3.055677461649329e-06, - "loss": 0.81603634, - "num_input_tokens_seen": 122669720, - "step": 5710, - "time_per_iteration": 2.757321834564209 - }, - { - "auxiliary_loss_clip": 0.01130524, - "auxiliary_loss_mlp": 0.01039861, - "balance_loss_clip": 1.05363941, - "balance_loss_mlp": 1.02329111, - "epoch": 0.34336389598677286, - "flos": 20629423699200.0, - "grad_norm": 1.8403881586839854, - "language_loss": 0.70303786, - "learning_rate": 3.055346654453996e-06, - "loss": 0.7247417, - "num_input_tokens_seen": 122688715, - "step": 5711, - "time_per_iteration": 2.6535775661468506 - }, - { - "auxiliary_loss_clip": 0.01106817, - "auxiliary_loss_mlp": 0.00774858, - "balance_loss_clip": 1.05299044, - "balance_loss_mlp": 1.00072622, - "epoch": 0.3434240192394409, - "flos": 14538579402240.0, - "grad_norm": 1.8401630077009354, - "language_loss": 0.67124939, - "learning_rate": 3.055015807239812e-06, - "loss": 0.69006616, - "num_input_tokens_seen": 122706970, - "step": 5712, - "time_per_iteration": 2.7115519046783447 - }, - { - "auxiliary_loss_clip": 0.01051163, - "auxiliary_loss_mlp": 0.01005713, - "balance_loss_clip": 1.0511148, - "balance_loss_mlp": 1.00409162, - "epoch": 0.34348414249210885, - "flos": 58051538841600.0, - "grad_norm": 0.846630151399307, - "language_loss": 0.58072996, - "learning_rate": 3.0546849200193226e-06, - "loss": 0.60129869, - "num_input_tokens_seen": 122758095, - "step": 5713, - "time_per_iteration": 3.3988189697265625 - }, - { - "auxiliary_loss_clip": 0.01142007, - "auxiliary_loss_mlp": 0.01043862, - "balance_loss_clip": 1.05782688, - "balance_loss_mlp": 1.02813852, - "epoch": 0.3435442657447768, - "flos": 20704441253760.0, - "grad_norm": 1.6506449407169241, - "language_loss": 0.8079257, - "learning_rate": 3.054353992805076e-06, - "loss": 0.82978439, - "num_input_tokens_seen": 122777815, - "step": 5714, - "time_per_iteration": 2.682537078857422 - }, - { - "auxiliary_loss_clip": 0.01142274, - "auxiliary_loss_mlp": 0.01042249, - "balance_loss_clip": 1.0581255, - "balance_loss_mlp": 1.02628696, - "epoch": 0.3436043889974448, - "flos": 22930256355840.0, - "grad_norm": 2.1462767477025055, - "language_loss": 0.72059911, - "learning_rate": 3.05402302560962e-06, - "loss": 0.74244434, - "num_input_tokens_seen": 122797555, - "step": 5715, - "time_per_iteration": 2.6535134315490723 - }, - { - "auxiliary_loss_clip": 0.01070037, - "auxiliary_loss_mlp": 0.01002865, - "balance_loss_clip": 1.0577507, - "balance_loss_mlp": 1.00051689, - "epoch": 0.34366451225011274, - "flos": 58403285752320.0, - "grad_norm": 0.9103705044251069, - "language_loss": 0.65885556, - "learning_rate": 3.053692018445505e-06, - "loss": 0.67958462, - "num_input_tokens_seen": 122863955, - "step": 5716, - "time_per_iteration": 3.205113172531128 - }, - { - "auxiliary_loss_clip": 0.01124236, - "auxiliary_loss_mlp": 0.0104266, - "balance_loss_clip": 1.05416417, - "balance_loss_mlp": 1.02718663, - "epoch": 0.3437246355027807, - "flos": 15596292216960.0, - "grad_norm": 2.101112668121384, - "language_loss": 0.74272031, - "learning_rate": 3.0533609713252838e-06, - "loss": 0.76438928, - "num_input_tokens_seen": 122883000, - "step": 5717, - "time_per_iteration": 2.60300350189209 - }, - { - "auxiliary_loss_clip": 0.01084832, - "auxiliary_loss_mlp": 0.01039269, - "balance_loss_clip": 1.05195725, - "balance_loss_mlp": 1.02437937, - "epoch": 0.34378475875544867, - "flos": 27672260106240.0, - "grad_norm": 1.8405555467441777, - "language_loss": 0.75446129, - "learning_rate": 3.0530298842615077e-06, - "loss": 0.7757023, - "num_input_tokens_seen": 122903265, - "step": 5718, - "time_per_iteration": 2.787687301635742 - }, - { - "auxiliary_loss_clip": 0.01097103, - "auxiliary_loss_mlp": 0.01043125, - "balance_loss_clip": 1.04837775, - "balance_loss_mlp": 1.02739501, - "epoch": 0.34384488200811664, - "flos": 31431496769280.0, - "grad_norm": 1.9369525419747404, - "language_loss": 0.63647246, - "learning_rate": 3.052698757266734e-06, - "loss": 0.65787476, - "num_input_tokens_seen": 122923860, - "step": 5719, - "time_per_iteration": 2.8138949871063232 - }, - { - "auxiliary_loss_clip": 0.01098152, - "auxiliary_loss_mlp": 0.01040429, - "balance_loss_clip": 1.05234158, - "balance_loss_mlp": 1.02310777, - "epoch": 0.3439050052607846, - "flos": 24899920594560.0, - "grad_norm": 1.8182809721987367, - "language_loss": 0.73785692, - "learning_rate": 3.0523675903535183e-06, - "loss": 0.75924277, - "num_input_tokens_seen": 122945305, - "step": 5720, - "time_per_iteration": 2.761371612548828 - }, - { - "auxiliary_loss_clip": 0.01127909, - "auxiliary_loss_mlp": 0.01052147, - "balance_loss_clip": 1.056463, - "balance_loss_mlp": 1.03434944, - "epoch": 0.34396512851345257, - "flos": 18150079426560.0, - "grad_norm": 2.2267988645125896, - "language_loss": 0.74087942, - "learning_rate": 3.0520363835344173e-06, - "loss": 0.76267999, - "num_input_tokens_seen": 122962535, - "step": 5721, - "time_per_iteration": 2.6139280796051025 - }, - { - "auxiliary_loss_clip": 0.0111919, - "auxiliary_loss_mlp": 0.0077563, - "balance_loss_clip": 1.05647993, - "balance_loss_mlp": 1.00063252, - "epoch": 0.34402525176612053, - "flos": 16034438315520.0, - "grad_norm": 2.313932715754647, - "language_loss": 0.80464351, - "learning_rate": 3.051705136821992e-06, - "loss": 0.82359171, - "num_input_tokens_seen": 122979750, - "step": 5722, - "time_per_iteration": 2.6886982917785645 - }, - { - "auxiliary_loss_clip": 0.01092207, - "auxiliary_loss_mlp": 0.01038868, - "balance_loss_clip": 1.05326557, - "balance_loss_mlp": 1.02348995, - "epoch": 0.3440853750187885, - "flos": 21178641628800.0, - "grad_norm": 2.5095280683984984, - "language_loss": 0.81647789, - "learning_rate": 3.051373850228801e-06, - "loss": 0.83778864, - "num_input_tokens_seen": 122998955, - "step": 5723, - "time_per_iteration": 2.7464921474456787 - }, - { - "auxiliary_loss_clip": 0.01099736, - "auxiliary_loss_mlp": 0.0105726, - "balance_loss_clip": 1.0488528, - "balance_loss_mlp": 1.04023743, - "epoch": 0.34414549827145646, - "flos": 12677868092160.0, - "grad_norm": 1.9897062128640133, - "language_loss": 0.81431544, - "learning_rate": 3.0510425237674096e-06, - "loss": 0.83588541, - "num_input_tokens_seen": 123016165, - "step": 5724, - "time_per_iteration": 2.7447471618652344 - }, - { - "auxiliary_loss_clip": 0.01112954, - "auxiliary_loss_mlp": 0.01047765, - "balance_loss_clip": 1.05231178, - "balance_loss_mlp": 1.03056324, - "epoch": 0.3442056215241244, - "flos": 31284514316160.0, - "grad_norm": 1.858960952495153, - "language_loss": 0.68913317, - "learning_rate": 3.05071115745038e-06, - "loss": 0.71074033, - "num_input_tokens_seen": 123036900, - "step": 5725, - "time_per_iteration": 2.798987627029419 - }, - { - "auxiliary_loss_clip": 0.01132971, - "auxiliary_loss_mlp": 0.0105182, - "balance_loss_clip": 1.05775714, - "balance_loss_mlp": 1.03379524, - "epoch": 0.34426574477679245, - "flos": 23367289132800.0, - "grad_norm": 1.4701315954442116, - "language_loss": 0.6946882, - "learning_rate": 3.0503797512902773e-06, - "loss": 0.71653616, - "num_input_tokens_seen": 123057480, - "step": 5726, - "time_per_iteration": 2.663766622543335 - }, - { - "auxiliary_loss_clip": 0.01111868, - "auxiliary_loss_mlp": 0.01038496, - "balance_loss_clip": 1.05667615, - "balance_loss_mlp": 1.02374983, - "epoch": 0.3443258680294604, - "flos": 24535427333760.0, - "grad_norm": 2.4860883718983873, - "language_loss": 0.73317868, - "learning_rate": 3.0500483052996703e-06, - "loss": 0.7546823, - "num_input_tokens_seen": 123076890, - "step": 5727, - "time_per_iteration": 2.8002336025238037 - }, - { - "auxiliary_loss_clip": 0.01097058, - "auxiliary_loss_mlp": 0.01052204, - "balance_loss_clip": 1.05053401, - "balance_loss_mlp": 1.03590822, - "epoch": 0.3443859912821284, - "flos": 20230133137920.0, - "grad_norm": 2.2067060616784815, - "language_loss": 0.88451493, - "learning_rate": 3.0497168194911257e-06, - "loss": 0.90600753, - "num_input_tokens_seen": 123092530, - "step": 5728, - "time_per_iteration": 2.703842878341675 - }, - { - "auxiliary_loss_clip": 0.01089582, - "auxiliary_loss_mlp": 0.01048379, - "balance_loss_clip": 1.04858351, - "balance_loss_mlp": 1.03266144, - "epoch": 0.34444611453479634, - "flos": 24316515895680.0, - "grad_norm": 2.2135571419735904, - "language_loss": 0.70018214, - "learning_rate": 3.0493852938772143e-06, - "loss": 0.72156173, - "num_input_tokens_seen": 123110560, - "step": 5729, - "time_per_iteration": 4.360877275466919 - }, - { - "auxiliary_loss_clip": 0.01124088, - "auxiliary_loss_mlp": 0.01037772, - "balance_loss_clip": 1.0525502, - "balance_loss_mlp": 1.02208424, - "epoch": 0.3445062377874643, - "flos": 16983413683200.0, - "grad_norm": 1.9483871766944658, - "language_loss": 0.7435137, - "learning_rate": 3.0490537284705078e-06, - "loss": 0.76513231, - "num_input_tokens_seen": 123128655, - "step": 5730, - "time_per_iteration": 2.6021499633789062 - }, - { - "auxiliary_loss_clip": 0.01099617, - "auxiliary_loss_mlp": 0.0105823, - "balance_loss_clip": 1.04880106, - "balance_loss_mlp": 1.04053974, - "epoch": 0.3445663610401323, - "flos": 20302708567680.0, - "grad_norm": 2.1142556114368314, - "language_loss": 0.7952323, - "learning_rate": 3.048722123283578e-06, - "loss": 0.81681079, - "num_input_tokens_seen": 123145130, - "step": 5731, - "time_per_iteration": 4.273399114608765 - }, - { - "auxiliary_loss_clip": 0.01130567, - "auxiliary_loss_mlp": 0.01043537, - "balance_loss_clip": 1.05617356, - "balance_loss_mlp": 1.02793896, - "epoch": 0.34462648429280024, - "flos": 15888102307200.0, - "grad_norm": 2.0299111477971334, - "language_loss": 0.78609502, - "learning_rate": 3.0483904783290006e-06, - "loss": 0.80783606, - "num_input_tokens_seen": 123162265, - "step": 5732, - "time_per_iteration": 4.672218322753906 - }, - { - "auxiliary_loss_clip": 0.01037769, - "auxiliary_loss_mlp": 0.0101237, - "balance_loss_clip": 1.03788018, - "balance_loss_mlp": 1.0106411, - "epoch": 0.3446866075454682, - "flos": 59311035285120.0, - "grad_norm": 0.7456337544046427, - "language_loss": 0.53537595, - "learning_rate": 3.0480587936193505e-06, - "loss": 0.55587733, - "num_input_tokens_seen": 123218620, - "step": 5733, - "time_per_iteration": 3.322802782058716 - }, - { - "auxiliary_loss_clip": 0.01122514, - "auxiliary_loss_mlp": 0.01042066, - "balance_loss_clip": 1.05675018, - "balance_loss_mlp": 1.02577019, - "epoch": 0.34474673079813617, - "flos": 22343799000960.0, - "grad_norm": 1.936820728476944, - "language_loss": 0.832178, - "learning_rate": 3.047727069167207e-06, - "loss": 0.85382378, - "num_input_tokens_seen": 123237325, - "step": 5734, - "time_per_iteration": 2.7426953315734863 - }, - { - "auxiliary_loss_clip": 0.01120142, - "auxiliary_loss_mlp": 0.0103601, - "balance_loss_clip": 1.05517805, - "balance_loss_mlp": 1.01988125, - "epoch": 0.34480685405080413, - "flos": 27670141203840.0, - "grad_norm": 2.7764640699074077, - "language_loss": 0.92655241, - "learning_rate": 3.0473953049851478e-06, - "loss": 0.94811392, - "num_input_tokens_seen": 123258650, - "step": 5735, - "time_per_iteration": 4.536838054656982 - }, - { - "auxiliary_loss_clip": 0.0110302, - "auxiliary_loss_mlp": 0.01041265, - "balance_loss_clip": 1.05774188, - "balance_loss_mlp": 1.02492189, - "epoch": 0.3448669773034721, - "flos": 22456020067200.0, - "grad_norm": 1.7508294751665012, - "language_loss": 0.76571405, - "learning_rate": 3.0470635010857533e-06, - "loss": 0.78715694, - "num_input_tokens_seen": 123277155, - "step": 5736, - "time_per_iteration": 2.784958600997925 - }, - { - "auxiliary_loss_clip": 0.01122912, - "auxiliary_loss_mlp": 0.0104053, - "balance_loss_clip": 1.05683184, - "balance_loss_mlp": 1.02396011, - "epoch": 0.34492710055614006, - "flos": 24936190352640.0, - "grad_norm": 1.7983696926456887, - "language_loss": 0.78327668, - "learning_rate": 3.0467316574816064e-06, - "loss": 0.80491114, - "num_input_tokens_seen": 123297640, - "step": 5737, - "time_per_iteration": 2.709786891937256 - }, - { - "auxiliary_loss_clip": 0.01083721, - "auxiliary_loss_mlp": 0.0104406, - "balance_loss_clip": 1.04379368, - "balance_loss_mlp": 1.02520096, - "epoch": 0.34498722380880803, - "flos": 20120821073280.0, - "grad_norm": 2.0055780284948375, - "language_loss": 0.71544027, - "learning_rate": 3.0463997741852893e-06, - "loss": 0.73671806, - "num_input_tokens_seen": 123314370, - "step": 5738, - "time_per_iteration": 2.779651165008545 - }, - { - "auxiliary_loss_clip": 0.0110112, - "auxiliary_loss_mlp": 0.01042892, - "balance_loss_clip": 1.04991913, - "balance_loss_mlp": 1.02520132, - "epoch": 0.34504734706147605, - "flos": 28438126917120.0, - "grad_norm": 2.7751951344870562, - "language_loss": 0.82324719, - "learning_rate": 3.046067851209389e-06, - "loss": 0.84468728, - "num_input_tokens_seen": 123336085, - "step": 5739, - "time_per_iteration": 2.7953522205352783 - }, - { - "auxiliary_loss_clip": 0.01104482, - "auxiliary_loss_mlp": 0.01037335, - "balance_loss_clip": 1.05071819, - "balance_loss_mlp": 1.02132511, - "epoch": 0.345107470314144, - "flos": 22674464628480.0, - "grad_norm": 1.8186717226973075, - "language_loss": 0.83071041, - "learning_rate": 3.0457358885664898e-06, - "loss": 0.85212862, - "num_input_tokens_seen": 123354460, - "step": 5740, - "time_per_iteration": 2.7530486583709717 - }, - { - "auxiliary_loss_clip": 0.01130478, - "auxiliary_loss_mlp": 0.01035685, - "balance_loss_clip": 1.05699897, - "balance_loss_mlp": 1.01901984, - "epoch": 0.345167593566812, - "flos": 20630716588800.0, - "grad_norm": 2.1971165557092656, - "language_loss": 0.7704618, - "learning_rate": 3.045403886269181e-06, - "loss": 0.79212344, - "num_input_tokens_seen": 123373420, - "step": 5741, - "time_per_iteration": 2.6488983631134033 - }, - { - "auxiliary_loss_clip": 0.01116686, - "auxiliary_loss_mlp": 0.01038328, - "balance_loss_clip": 1.05202794, - "balance_loss_mlp": 1.02271724, - "epoch": 0.34522771681947995, - "flos": 26214358890240.0, - "grad_norm": 1.629760829576741, - "language_loss": 0.76972193, - "learning_rate": 3.045071844330053e-06, - "loss": 0.7912721, - "num_input_tokens_seen": 123394730, - "step": 5742, - "time_per_iteration": 2.7333807945251465 - }, - { - "auxiliary_loss_clip": 0.01133631, - "auxiliary_loss_mlp": 0.01040013, - "balance_loss_clip": 1.05862427, - "balance_loss_mlp": 1.02371693, - "epoch": 0.3452878400721479, - "flos": 19062354072960.0, - "grad_norm": 2.2460068376984523, - "language_loss": 0.76135588, - "learning_rate": 3.0447397627616955e-06, - "loss": 0.78309238, - "num_input_tokens_seen": 123412895, - "step": 5743, - "time_per_iteration": 2.677682638168335 - }, - { - "auxiliary_loss_clip": 0.01128893, - "auxiliary_loss_mlp": 0.01037178, - "balance_loss_clip": 1.05570602, - "balance_loss_mlp": 1.02171636, - "epoch": 0.3453479633248159, - "flos": 27929739772800.0, - "grad_norm": 2.0501405423310097, - "language_loss": 0.70481914, - "learning_rate": 3.0444076415767016e-06, - "loss": 0.72647989, - "num_input_tokens_seen": 123432320, - "step": 5744, - "time_per_iteration": 2.7430574893951416 - }, - { - "auxiliary_loss_clip": 0.01140382, - "auxiliary_loss_mlp": 0.01036281, - "balance_loss_clip": 1.05727339, - "balance_loss_mlp": 1.01959133, - "epoch": 0.34540808657748384, - "flos": 19606113135360.0, - "grad_norm": 2.271690731291802, - "language_loss": 0.79658759, - "learning_rate": 3.044075480787665e-06, - "loss": 0.81835419, - "num_input_tokens_seen": 123450980, - "step": 5745, - "time_per_iteration": 2.6587865352630615 - }, - { - "auxiliary_loss_clip": 0.01092128, - "auxiliary_loss_mlp": 0.01041398, - "balance_loss_clip": 1.0486573, - "balance_loss_mlp": 1.02435148, - "epoch": 0.3454682098301518, - "flos": 20411661496320.0, - "grad_norm": 1.8194779915280654, - "language_loss": 0.89049339, - "learning_rate": 3.043743280407182e-06, - "loss": 0.91182864, - "num_input_tokens_seen": 123469365, - "step": 5746, - "time_per_iteration": 2.7314908504486084 - }, - { - "auxiliary_loss_clip": 0.01133638, - "auxiliary_loss_mlp": 0.01038455, - "balance_loss_clip": 1.05554819, - "balance_loss_mlp": 1.02101421, - "epoch": 0.34552833308281977, - "flos": 21325121291520.0, - "grad_norm": 2.5554958969654136, - "language_loss": 0.64851058, - "learning_rate": 3.043411040447849e-06, - "loss": 0.67023152, - "num_input_tokens_seen": 123489425, - "step": 5747, - "time_per_iteration": 2.6858277320861816 - }, - { - "auxiliary_loss_clip": 0.01119459, - "auxiliary_loss_mlp": 0.01035118, - "balance_loss_clip": 1.05213308, - "balance_loss_mlp": 1.01928735, - "epoch": 0.34558845633548774, - "flos": 36243633824640.0, - "grad_norm": 1.5633023430662023, - "language_loss": 0.72855747, - "learning_rate": 3.043078760922264e-06, - "loss": 0.75010324, - "num_input_tokens_seen": 123509970, - "step": 5748, - "time_per_iteration": 2.805250406265259 - }, - { - "auxiliary_loss_clip": 0.01084714, - "auxiliary_loss_mlp": 0.01032651, - "balance_loss_clip": 1.05246413, - "balance_loss_mlp": 1.01832819, - "epoch": 0.3456485795881557, - "flos": 22450561200000.0, - "grad_norm": 1.6861475272665256, - "language_loss": 0.7584126, - "learning_rate": 3.042746441843029e-06, - "loss": 0.7795862, - "num_input_tokens_seen": 123531055, - "step": 5749, - "time_per_iteration": 2.8886258602142334 - }, - { - "auxiliary_loss_clip": 0.01061531, - "auxiliary_loss_mlp": 0.01002064, - "balance_loss_clip": 1.05058503, - "balance_loss_mlp": 1.00045478, - "epoch": 0.34570870284082367, - "flos": 62004299005440.0, - "grad_norm": 0.8852783380527953, - "language_loss": 0.62715566, - "learning_rate": 3.0424140832227437e-06, - "loss": 0.64779162, - "num_input_tokens_seen": 123584720, - "step": 5750, - "time_per_iteration": 3.1283066272735596 - }, - { - "auxiliary_loss_clip": 0.01110881, - "auxiliary_loss_mlp": 0.01037788, - "balance_loss_clip": 1.05210388, - "balance_loss_mlp": 1.02242184, - "epoch": 0.34576882609349163, - "flos": 22782196494720.0, - "grad_norm": 2.239830827663745, - "language_loss": 0.80332017, - "learning_rate": 3.042081685074012e-06, - "loss": 0.82480681, - "num_input_tokens_seen": 123604465, - "step": 5751, - "time_per_iteration": 2.721344470977783 - }, - { - "auxiliary_loss_clip": 0.01135561, - "auxiliary_loss_mlp": 0.01045926, - "balance_loss_clip": 1.0536952, - "balance_loss_mlp": 1.03101254, - "epoch": 0.34582894934615965, - "flos": 12348818576640.0, - "grad_norm": 2.3847713847020744, - "language_loss": 0.84148252, - "learning_rate": 3.041749247409439e-06, - "loss": 0.86329746, - "num_input_tokens_seen": 123622320, - "step": 5752, - "time_per_iteration": 2.578984260559082 - }, - { - "auxiliary_loss_clip": 0.01047286, - "auxiliary_loss_mlp": 0.00754976, - "balance_loss_clip": 1.0380801, - "balance_loss_mlp": 1.00148225, - "epoch": 0.3458890725988276, - "flos": 70167691071360.0, - "grad_norm": 0.7284359747550926, - "language_loss": 0.6310631, - "learning_rate": 3.0414167702416296e-06, - "loss": 0.64908576, - "num_input_tokens_seen": 123678010, - "step": 5753, - "time_per_iteration": 3.0907819271087646 - }, - { - "auxiliary_loss_clip": 0.01112695, - "auxiliary_loss_mlp": 0.01035981, - "balance_loss_clip": 1.05358505, - "balance_loss_mlp": 1.01956582, - "epoch": 0.3459491958514956, - "flos": 17092582093440.0, - "grad_norm": 1.9590865283999213, - "language_loss": 0.71000856, - "learning_rate": 3.0410842535831914e-06, - "loss": 0.73149538, - "num_input_tokens_seen": 123696830, - "step": 5754, - "time_per_iteration": 2.7031564712524414 - }, - { - "auxiliary_loss_clip": 0.01127989, - "auxiliary_loss_mlp": 0.01038041, - "balance_loss_clip": 1.05300486, - "balance_loss_mlp": 1.02251959, - "epoch": 0.34600931910416355, - "flos": 16650952375680.0, - "grad_norm": 2.56305874029915, - "language_loss": 0.73286581, - "learning_rate": 3.0407516974467343e-06, - "loss": 0.75452608, - "num_input_tokens_seen": 123714360, - "step": 5755, - "time_per_iteration": 2.656804084777832 - }, - { - "auxiliary_loss_clip": 0.01122508, - "auxiliary_loss_mlp": 0.01033304, - "balance_loss_clip": 1.0504849, - "balance_loss_mlp": 1.01791406, - "epoch": 0.3460694423568315, - "flos": 38546190334080.0, - "grad_norm": 1.7746130503339408, - "language_loss": 0.7232182, - "learning_rate": 3.040419101844869e-06, - "loss": 0.74477637, - "num_input_tokens_seen": 123739250, - "step": 5756, - "time_per_iteration": 2.8805603981018066 - }, - { - "auxiliary_loss_clip": 0.01055943, - "auxiliary_loss_mlp": 0.01012753, - "balance_loss_clip": 1.03647125, - "balance_loss_mlp": 1.01088166, - "epoch": 0.3461295656094995, - "flos": 72081479704320.0, - "grad_norm": 0.7176054236110851, - "language_loss": 0.62659568, - "learning_rate": 3.040086466790207e-06, - "loss": 0.64728266, - "num_input_tokens_seen": 123802845, - "step": 5757, - "time_per_iteration": 3.21248197555542 - }, - { - "auxiliary_loss_clip": 0.0103445, - "auxiliary_loss_mlp": 0.00755471, - "balance_loss_clip": 1.03495657, - "balance_loss_mlp": 1.0016396, - "epoch": 0.34618968886216744, - "flos": 65460089571840.0, - "grad_norm": 0.8171010225304897, - "language_loss": 0.59206927, - "learning_rate": 3.039753792295362e-06, - "loss": 0.60996854, - "num_input_tokens_seen": 123861805, - "step": 5758, - "time_per_iteration": 3.2514266967773438 - }, - { - "auxiliary_loss_clip": 0.01122832, - "auxiliary_loss_mlp": 0.01042223, - "balance_loss_clip": 1.05849838, - "balance_loss_mlp": 1.02783418, - "epoch": 0.3462498121148354, - "flos": 23472542960640.0, - "grad_norm": 1.8827972101732287, - "language_loss": 0.71806967, - "learning_rate": 3.0394210783729487e-06, - "loss": 0.73972023, - "num_input_tokens_seen": 123881820, - "step": 5759, - "time_per_iteration": 2.943061351776123 - }, - { - "auxiliary_loss_clip": 0.0108272, - "auxiliary_loss_mlp": 0.01061154, - "balance_loss_clip": 1.0455631, - "balance_loss_mlp": 1.04352307, - "epoch": 0.3463099353675034, - "flos": 24170790418560.0, - "grad_norm": 1.9206924983950955, - "language_loss": 0.83097923, - "learning_rate": 3.0390883250355836e-06, - "loss": 0.85241801, - "num_input_tokens_seen": 123903700, - "step": 5760, - "time_per_iteration": 2.8922929763793945 - }, - { - "auxiliary_loss_clip": 0.01029416, - "auxiliary_loss_mlp": 0.01010127, - "balance_loss_clip": 1.02909803, - "balance_loss_mlp": 1.00855386, - "epoch": 0.34637005862017134, - "flos": 63700609766400.0, - "grad_norm": 0.8149802448400086, - "language_loss": 0.56472003, - "learning_rate": 3.0387555322958865e-06, - "loss": 0.58511543, - "num_input_tokens_seen": 123960075, - "step": 5761, - "time_per_iteration": 3.274470567703247 - }, - { - "auxiliary_loss_clip": 0.01122229, - "auxiliary_loss_mlp": 0.00773416, - "balance_loss_clip": 1.04931128, - "balance_loss_mlp": 1.00069964, - "epoch": 0.3464301818728393, - "flos": 13145532192000.0, - "grad_norm": 2.486389460519204, - "language_loss": 0.94996566, - "learning_rate": 3.038422700166474e-06, - "loss": 0.96892214, - "num_input_tokens_seen": 123975805, - "step": 5762, - "time_per_iteration": 2.636906623840332 - }, - { - "auxiliary_loss_clip": 0.01106692, - "auxiliary_loss_mlp": 0.0104127, - "balance_loss_clip": 1.04844642, - "balance_loss_mlp": 1.02467608, - "epoch": 0.34649030512550727, - "flos": 29315173299840.0, - "grad_norm": 1.8335548533403485, - "language_loss": 0.69540495, - "learning_rate": 3.0380898286599692e-06, - "loss": 0.71688455, - "num_input_tokens_seen": 123997530, - "step": 5763, - "time_per_iteration": 2.8476505279541016 - }, - { - "auxiliary_loss_clip": 0.01125911, - "auxiliary_loss_mlp": 0.01051478, - "balance_loss_clip": 1.04963946, - "balance_loss_mlp": 1.03319085, - "epoch": 0.34655042837817523, - "flos": 23730884553600.0, - "grad_norm": 2.0043623648961195, - "language_loss": 0.83985734, - "learning_rate": 3.0377569177889945e-06, - "loss": 0.86163127, - "num_input_tokens_seen": 124016375, - "step": 5764, - "time_per_iteration": 2.693847417831421 - }, - { - "auxiliary_loss_clip": 0.01103367, - "auxiliary_loss_mlp": 0.01039514, - "balance_loss_clip": 1.04989028, - "balance_loss_mlp": 1.02363563, - "epoch": 0.34661055163084326, - "flos": 22054215553920.0, - "grad_norm": 2.2905956292147045, - "language_loss": 0.6769501, - "learning_rate": 3.0374239675661722e-06, - "loss": 0.69837892, - "num_input_tokens_seen": 124033975, - "step": 5765, - "time_per_iteration": 2.7656123638153076 - }, - { - "auxiliary_loss_clip": 0.01108658, - "auxiliary_loss_mlp": 0.01045242, - "balance_loss_clip": 1.05017447, - "balance_loss_mlp": 1.0279808, - "epoch": 0.3466706748835112, - "flos": 21799213925760.0, - "grad_norm": 2.7236728572511653, - "language_loss": 0.77394044, - "learning_rate": 3.03709097800413e-06, - "loss": 0.79547942, - "num_input_tokens_seen": 124051930, - "step": 5766, - "time_per_iteration": 2.7095906734466553 - }, - { - "auxiliary_loss_clip": 0.01078684, - "auxiliary_loss_mlp": 0.01035923, - "balance_loss_clip": 1.04552221, - "balance_loss_mlp": 1.02113521, - "epoch": 0.3467307981361792, - "flos": 19461680547840.0, - "grad_norm": 1.6543575607114767, - "language_loss": 0.73547316, - "learning_rate": 3.0367579491154943e-06, - "loss": 0.75661922, - "num_input_tokens_seen": 124071220, - "step": 5767, - "time_per_iteration": 2.8161730766296387 - }, - { - "auxiliary_loss_clip": 0.01111822, - "auxiliary_loss_mlp": 0.01043875, - "balance_loss_clip": 1.05307102, - "balance_loss_mlp": 1.02734113, - "epoch": 0.34679092138884715, - "flos": 24827452905600.0, - "grad_norm": 2.2530154082607776, - "language_loss": 0.7832194, - "learning_rate": 3.036424880912893e-06, - "loss": 0.80477637, - "num_input_tokens_seen": 124090140, - "step": 5768, - "time_per_iteration": 4.265673875808716 - }, - { - "auxiliary_loss_clip": 0.01050543, - "auxiliary_loss_mlp": 0.01012109, - "balance_loss_clip": 1.0320363, - "balance_loss_mlp": 1.0104636, - "epoch": 0.3468510446415151, - "flos": 63236070149760.0, - "grad_norm": 0.7741250202123364, - "language_loss": 0.57502627, - "learning_rate": 3.036091773408956e-06, - "loss": 0.59565282, - "num_input_tokens_seen": 124152025, - "step": 5769, - "time_per_iteration": 3.2264139652252197 - }, - { - "auxiliary_loss_clip": 0.01107195, - "auxiliary_loss_mlp": 0.01044629, - "balance_loss_clip": 1.04818511, - "balance_loss_mlp": 1.02630615, - "epoch": 0.3469111678941831, - "flos": 12120713256960.0, - "grad_norm": 2.34841523993127, - "language_loss": 0.85575318, - "learning_rate": 3.0357586266163154e-06, - "loss": 0.87727135, - "num_input_tokens_seen": 124165795, - "step": 5770, - "time_per_iteration": 2.7029645442962646 - }, - { - "auxiliary_loss_clip": 0.01034922, - "auxiliary_loss_mlp": 0.01007496, - "balance_loss_clip": 1.02998519, - "balance_loss_mlp": 1.00527906, - "epoch": 0.34697129114685105, - "flos": 65934110378880.0, - "grad_norm": 0.7677707974310557, - "language_loss": 0.59758615, - "learning_rate": 3.0354254405476036e-06, - "loss": 0.6180104, - "num_input_tokens_seen": 124222925, - "step": 5771, - "time_per_iteration": 4.5523951053619385 - }, - { - "auxiliary_loss_clip": 0.01127175, - "auxiliary_loss_mlp": 0.01049141, - "balance_loss_clip": 1.05249262, - "balance_loss_mlp": 1.03320241, - "epoch": 0.347031414399519, - "flos": 34454205054720.0, - "grad_norm": 1.9048919633537342, - "language_loss": 0.71560407, - "learning_rate": 3.0350922152154557e-06, - "loss": 0.73736715, - "num_input_tokens_seen": 124240915, - "step": 5772, - "time_per_iteration": 2.8108439445495605 - }, - { - "auxiliary_loss_clip": 0.01108886, - "auxiliary_loss_mlp": 0.0077423, - "balance_loss_clip": 1.05118012, - "balance_loss_mlp": 1.00077164, - "epoch": 0.347091537652187, - "flos": 26944135511040.0, - "grad_norm": 1.679823492532721, - "language_loss": 0.764898, - "learning_rate": 3.034758950632507e-06, - "loss": 0.78372908, - "num_input_tokens_seen": 124262770, - "step": 5773, - "time_per_iteration": 2.813775062561035 - }, - { - "auxiliary_loss_clip": 0.01128178, - "auxiliary_loss_mlp": 0.01043067, - "balance_loss_clip": 1.05019748, - "balance_loss_mlp": 1.02674699, - "epoch": 0.34715166090485494, - "flos": 21142228216320.0, - "grad_norm": 5.389351496516036, - "language_loss": 0.70094979, - "learning_rate": 3.034425646811396e-06, - "loss": 0.72266221, - "num_input_tokens_seen": 124280950, - "step": 5774, - "time_per_iteration": 4.167816162109375 - }, - { - "auxiliary_loss_clip": 0.01113209, - "auxiliary_loss_mlp": 0.00774032, - "balance_loss_clip": 1.05024052, - "balance_loss_mlp": 1.00071549, - "epoch": 0.3472117841575229, - "flos": 23478001827840.0, - "grad_norm": 1.6687380405540382, - "language_loss": 0.76013231, - "learning_rate": 3.0340923037647602e-06, - "loss": 0.77900469, - "num_input_tokens_seen": 124299540, - "step": 5775, - "time_per_iteration": 2.739729404449463 - }, - { - "auxiliary_loss_clip": 0.01114926, - "auxiliary_loss_mlp": 0.01046919, - "balance_loss_clip": 1.0480268, - "balance_loss_mlp": 1.02965736, - "epoch": 0.34727190741019087, - "flos": 17492806408320.0, - "grad_norm": 2.598065011523741, - "language_loss": 0.77565503, - "learning_rate": 3.0337589215052404e-06, - "loss": 0.79727352, - "num_input_tokens_seen": 124316285, - "step": 5776, - "time_per_iteration": 2.7339272499084473 - }, - { - "auxiliary_loss_clip": 0.01036494, - "auxiliary_loss_mlp": 0.01014475, - "balance_loss_clip": 1.02741766, - "balance_loss_mlp": 1.01280594, - "epoch": 0.34733203066285884, - "flos": 65265491640960.0, - "grad_norm": 0.8358378555600092, - "language_loss": 0.63272905, - "learning_rate": 3.033425500045478e-06, - "loss": 0.65323877, - "num_input_tokens_seen": 124376650, - "step": 5777, - "time_per_iteration": 3.257993459701538 - }, - { - "auxiliary_loss_clip": 0.01098381, - "auxiliary_loss_mlp": 0.01045801, - "balance_loss_clip": 1.04933393, - "balance_loss_mlp": 1.02975535, - "epoch": 0.3473921539155268, - "flos": 28658726294400.0, - "grad_norm": 3.5330364681008755, - "language_loss": 0.6504612, - "learning_rate": 3.033092039398119e-06, - "loss": 0.67190301, - "num_input_tokens_seen": 124396475, - "step": 5778, - "time_per_iteration": 2.775846481323242 - }, - { - "auxiliary_loss_clip": 0.01113961, - "auxiliary_loss_mlp": 0.01054607, - "balance_loss_clip": 1.04786038, - "balance_loss_mlp": 1.03903246, - "epoch": 0.3474522771681948, - "flos": 40836895355520.0, - "grad_norm": 2.3967507755094064, - "language_loss": 0.71278334, - "learning_rate": 3.0327585395758046e-06, - "loss": 0.73446906, - "num_input_tokens_seen": 124416480, - "step": 5779, - "time_per_iteration": 2.7915873527526855 - }, - { - "auxiliary_loss_clip": 0.01142932, - "auxiliary_loss_mlp": 0.01053692, - "balance_loss_clip": 1.05395269, - "balance_loss_mlp": 1.03762269, - "epoch": 0.3475124004208628, - "flos": 24608577381120.0, - "grad_norm": 2.0452202029673043, - "language_loss": 0.62873107, - "learning_rate": 3.0324250005911837e-06, - "loss": 0.65069735, - "num_input_tokens_seen": 124435950, - "step": 5780, - "time_per_iteration": 2.6743876934051514 - }, - { - "auxiliary_loss_clip": 0.01095736, - "auxiliary_loss_mlp": 0.01050069, - "balance_loss_clip": 1.04648292, - "balance_loss_mlp": 1.03446484, - "epoch": 0.34757252367353075, - "flos": 22711309004160.0, - "grad_norm": 1.6009150193459345, - "language_loss": 0.72167897, - "learning_rate": 3.0320914224569033e-06, - "loss": 0.743137, - "num_input_tokens_seen": 124455410, - "step": 5781, - "time_per_iteration": 2.749302625656128 - }, - { - "auxiliary_loss_clip": 0.01073898, - "auxiliary_loss_mlp": 0.01052117, - "balance_loss_clip": 1.040519, - "balance_loss_mlp": 1.03405714, - "epoch": 0.3476326469261987, - "flos": 19828184970240.0, - "grad_norm": 2.5507599846278644, - "language_loss": 0.76966107, - "learning_rate": 3.031757805185612e-06, - "loss": 0.79092121, - "num_input_tokens_seen": 124474870, - "step": 5782, - "time_per_iteration": 2.801867723464966 - }, - { - "auxiliary_loss_clip": 0.01108825, - "auxiliary_loss_mlp": 0.01037018, - "balance_loss_clip": 1.05032897, - "balance_loss_mlp": 1.02193785, - "epoch": 0.3476927701788667, - "flos": 19938107566080.0, - "grad_norm": 2.367934041085959, - "language_loss": 0.62506068, - "learning_rate": 3.0314241487899622e-06, - "loss": 0.64651906, - "num_input_tokens_seen": 124494105, - "step": 5783, - "time_per_iteration": 2.709778070449829 - }, - { - "auxiliary_loss_clip": 0.01092863, - "auxiliary_loss_mlp": 0.01031024, - "balance_loss_clip": 1.04997683, - "balance_loss_mlp": 1.0163672, - "epoch": 0.34775289343153465, - "flos": 20735108490240.0, - "grad_norm": 1.7498214415914104, - "language_loss": 0.88513505, - "learning_rate": 3.031090453282605e-06, - "loss": 0.90637398, - "num_input_tokens_seen": 124512030, - "step": 5784, - "time_per_iteration": 2.769317150115967 - }, - { - "auxiliary_loss_clip": 0.01089006, - "auxiliary_loss_mlp": 0.01036783, - "balance_loss_clip": 1.05206084, - "balance_loss_mlp": 1.02097547, - "epoch": 0.3478130166842026, - "flos": 19354846521600.0, - "grad_norm": 1.703369857104052, - "language_loss": 0.81740022, - "learning_rate": 3.0307567186761946e-06, - "loss": 0.83865809, - "num_input_tokens_seen": 124530980, - "step": 5785, - "time_per_iteration": 2.791860818862915 - }, - { - "auxiliary_loss_clip": 0.01106676, - "auxiliary_loss_mlp": 0.01040592, - "balance_loss_clip": 1.04747128, - "balance_loss_mlp": 1.02563095, - "epoch": 0.3478731399368706, - "flos": 22051198811520.0, - "grad_norm": 1.689422515624071, - "language_loss": 0.80540836, - "learning_rate": 3.0304229449833862e-06, - "loss": 0.82688099, - "num_input_tokens_seen": 124549330, - "step": 5786, - "time_per_iteration": 2.7547576427459717 - }, - { - "auxiliary_loss_clip": 0.0113505, - "auxiliary_loss_mlp": 0.00773369, - "balance_loss_clip": 1.05242872, - "balance_loss_mlp": 1.00073981, - "epoch": 0.34793326318953854, - "flos": 18041449720320.0, - "grad_norm": 2.7072955912962686, - "language_loss": 0.74945676, - "learning_rate": 3.030089132216836e-06, - "loss": 0.76854098, - "num_input_tokens_seen": 124567200, - "step": 5787, - "time_per_iteration": 2.592688798904419 - }, - { - "auxiliary_loss_clip": 0.01102822, - "auxiliary_loss_mlp": 0.00773627, - "balance_loss_clip": 1.04294109, - "balance_loss_mlp": 1.00074553, - "epoch": 0.3479933864422065, - "flos": 29314670509440.0, - "grad_norm": 1.9068485918966191, - "language_loss": 0.81542754, - "learning_rate": 3.029755280389203e-06, - "loss": 0.83419204, - "num_input_tokens_seen": 124587025, - "step": 5788, - "time_per_iteration": 2.84395694732666 - }, - { - "auxiliary_loss_clip": 0.01144785, - "auxiliary_loss_mlp": 0.01037478, - "balance_loss_clip": 1.0562067, - "balance_loss_mlp": 1.02140832, - "epoch": 0.3480535096948745, - "flos": 20120713332480.0, - "grad_norm": 2.2432452775203964, - "language_loss": 0.85701168, - "learning_rate": 3.029421389513147e-06, - "loss": 0.87883425, - "num_input_tokens_seen": 124605860, - "step": 5789, - "time_per_iteration": 2.630535125732422 - }, - { - "auxiliary_loss_clip": 0.01130136, - "auxiliary_loss_mlp": 0.01056162, - "balance_loss_clip": 1.05231345, - "balance_loss_mlp": 1.04007459, - "epoch": 0.34811363294754244, - "flos": 18548974938240.0, - "grad_norm": 5.008598067350991, - "language_loss": 0.8502599, - "learning_rate": 3.029087459601328e-06, - "loss": 0.87212288, - "num_input_tokens_seen": 124624270, - "step": 5790, - "time_per_iteration": 2.6052823066711426 - }, - { - "auxiliary_loss_clip": 0.01130643, - "auxiliary_loss_mlp": 0.01044731, - "balance_loss_clip": 1.05373776, - "balance_loss_mlp": 1.02904904, - "epoch": 0.3481737562002104, - "flos": 26870303105280.0, - "grad_norm": 1.9264082121319324, - "language_loss": 0.80832046, - "learning_rate": 3.0287534906664097e-06, - "loss": 0.83007419, - "num_input_tokens_seen": 124644005, - "step": 5791, - "time_per_iteration": 2.7190260887145996 - }, - { - "auxiliary_loss_clip": 0.01125872, - "auxiliary_loss_mlp": 0.0104286, - "balance_loss_clip": 1.04968619, - "balance_loss_mlp": 1.02690983, - "epoch": 0.3482338794528784, - "flos": 28908664104960.0, - "grad_norm": 2.4373031068755022, - "language_loss": 0.77855796, - "learning_rate": 3.028419482721056e-06, - "loss": 0.80024529, - "num_input_tokens_seen": 124663020, - "step": 5792, - "time_per_iteration": 2.7223403453826904 - }, - { - "auxiliary_loss_clip": 0.01108923, - "auxiliary_loss_mlp": 0.01034893, - "balance_loss_clip": 1.04401517, - "balance_loss_mlp": 1.01922882, - "epoch": 0.3482940027055464, - "flos": 22200767043840.0, - "grad_norm": 1.6684091148270528, - "language_loss": 0.81824791, - "learning_rate": 3.0280854357779325e-06, - "loss": 0.8396861, - "num_input_tokens_seen": 124682975, - "step": 5793, - "time_per_iteration": 2.84191632270813 - }, - { - "auxiliary_loss_clip": 0.01124823, - "auxiliary_loss_mlp": 0.01055766, - "balance_loss_clip": 1.05077863, - "balance_loss_mlp": 1.0392313, - "epoch": 0.34835412595821436, - "flos": 20302708567680.0, - "grad_norm": 1.8786694421525794, - "language_loss": 0.7607373, - "learning_rate": 3.027751349849706e-06, - "loss": 0.78254318, - "num_input_tokens_seen": 124701340, - "step": 5794, - "time_per_iteration": 2.707648515701294 - }, - { - "auxiliary_loss_clip": 0.01123664, - "auxiliary_loss_mlp": 0.01044013, - "balance_loss_clip": 1.04820764, - "balance_loss_mlp": 1.02735913, - "epoch": 0.3484142492108823, - "flos": 20449691020800.0, - "grad_norm": 2.79979085265216, - "language_loss": 0.57190084, - "learning_rate": 3.0274172249490456e-06, - "loss": 0.59357756, - "num_input_tokens_seen": 124719165, - "step": 5795, - "time_per_iteration": 2.6533401012420654 - }, - { - "auxiliary_loss_clip": 0.01106011, - "auxiliary_loss_mlp": 0.0103693, - "balance_loss_clip": 1.04720807, - "balance_loss_mlp": 1.02177811, - "epoch": 0.3484743724635503, - "flos": 24352929308160.0, - "grad_norm": 2.0564463844351546, - "language_loss": 0.82218957, - "learning_rate": 3.0270830610886213e-06, - "loss": 0.84361899, - "num_input_tokens_seen": 124738670, - "step": 5796, - "time_per_iteration": 2.6823246479034424 - }, - { - "auxiliary_loss_clip": 0.01120404, - "auxiliary_loss_mlp": 0.01034067, - "balance_loss_clip": 1.04927754, - "balance_loss_mlp": 1.0192616, - "epoch": 0.34853449571621825, - "flos": 24353001135360.0, - "grad_norm": 1.9927036097023587, - "language_loss": 0.83429003, - "learning_rate": 3.0267488582811033e-06, - "loss": 0.85583472, - "num_input_tokens_seen": 124758760, - "step": 5797, - "time_per_iteration": 2.7048346996307373 - }, - { - "auxiliary_loss_clip": 0.01132676, - "auxiliary_loss_mlp": 0.01037057, - "balance_loss_clip": 1.05049801, - "balance_loss_mlp": 1.02151191, - "epoch": 0.3485946189688862, - "flos": 27267690245760.0, - "grad_norm": 1.9361964581914621, - "language_loss": 0.73449033, - "learning_rate": 3.026414616539167e-06, - "loss": 0.75618768, - "num_input_tokens_seen": 124777765, - "step": 5798, - "time_per_iteration": 2.6807782649993896 - }, - { - "auxiliary_loss_clip": 0.01135458, - "auxiliary_loss_mlp": 0.01044729, - "balance_loss_clip": 1.04995012, - "balance_loss_mlp": 1.02815914, - "epoch": 0.3486547422215542, - "flos": 20156695781760.0, - "grad_norm": 2.5738259800272725, - "language_loss": 0.76111758, - "learning_rate": 3.026080335875485e-06, - "loss": 0.78291941, - "num_input_tokens_seen": 124796775, - "step": 5799, - "time_per_iteration": 2.629671096801758 - }, - { - "auxiliary_loss_clip": 0.01073192, - "auxiliary_loss_mlp": 0.01035978, - "balance_loss_clip": 1.05208993, - "balance_loss_mlp": 1.02083826, - "epoch": 0.34871486547422215, - "flos": 20230348619520.0, - "grad_norm": 2.242229362705527, - "language_loss": 0.75801086, - "learning_rate": 3.025746016302734e-06, - "loss": 0.77910256, - "num_input_tokens_seen": 124815825, - "step": 5800, - "time_per_iteration": 3.047725200653076 - }, - { - "auxiliary_loss_clip": 0.01112927, - "auxiliary_loss_mlp": 0.00774006, - "balance_loss_clip": 1.04720354, - "balance_loss_mlp": 1.00079536, - "epoch": 0.3487749887268901, - "flos": 44053234882560.0, - "grad_norm": 2.6257316922509286, - "language_loss": 0.67468953, - "learning_rate": 3.025411657833591e-06, - "loss": 0.69355887, - "num_input_tokens_seen": 124838420, - "step": 5801, - "time_per_iteration": 3.2364816665649414 - }, - { - "auxiliary_loss_clip": 0.01103773, - "auxiliary_loss_mlp": 0.010448, - "balance_loss_clip": 1.04506934, - "balance_loss_mlp": 1.028754, - "epoch": 0.3488351119795581, - "flos": 23295144666240.0, - "grad_norm": 1.8428676315803219, - "language_loss": 0.76738638, - "learning_rate": 3.025077260480735e-06, - "loss": 0.78887206, - "num_input_tokens_seen": 124857320, - "step": 5802, - "time_per_iteration": 2.7959024906158447 - }, - { - "auxiliary_loss_clip": 0.01053855, - "auxiliary_loss_mlp": 0.01037371, - "balance_loss_clip": 1.03989601, - "balance_loss_mlp": 1.02219605, - "epoch": 0.34889523523222604, - "flos": 19934839428480.0, - "grad_norm": 1.7816673584343024, - "language_loss": 0.78991377, - "learning_rate": 3.0247428242568474e-06, - "loss": 0.81082606, - "num_input_tokens_seen": 124875685, - "step": 5803, - "time_per_iteration": 2.8440747261047363 - }, - { - "auxiliary_loss_clip": 0.01111548, - "auxiliary_loss_mlp": 0.00774436, - "balance_loss_clip": 1.04601288, - "balance_loss_mlp": 1.00073576, - "epoch": 0.348955358484894, - "flos": 30446179816320.0, - "grad_norm": 6.169621760932873, - "language_loss": 0.67899323, - "learning_rate": 3.0244083491746085e-06, - "loss": 0.69785309, - "num_input_tokens_seen": 124895960, - "step": 5804, - "time_per_iteration": 2.8011341094970703 - }, - { - "auxiliary_loss_clip": 0.01109039, - "auxiliary_loss_mlp": 0.01046207, - "balance_loss_clip": 1.05153811, - "balance_loss_mlp": 1.0306263, - "epoch": 0.349015481737562, - "flos": 17999972490240.0, - "grad_norm": 1.9366950093174176, - "language_loss": 0.75972986, - "learning_rate": 3.024073835246702e-06, - "loss": 0.78128237, - "num_input_tokens_seen": 124914140, - "step": 5805, - "time_per_iteration": 2.735410213470459 - }, - { - "auxiliary_loss_clip": 0.01085261, - "auxiliary_loss_mlp": 0.0103851, - "balance_loss_clip": 1.040416, - "balance_loss_mlp": 1.0230304, - "epoch": 0.34907560499023, - "flos": 27198490694400.0, - "grad_norm": 2.3089286954803194, - "language_loss": 0.67154014, - "learning_rate": 3.023739282485814e-06, - "loss": 0.69277781, - "num_input_tokens_seen": 124934180, - "step": 5806, - "time_per_iteration": 2.793893575668335 - }, - { - "auxiliary_loss_clip": 0.01122813, - "auxiliary_loss_mlp": 0.0104012, - "balance_loss_clip": 1.05324221, - "balance_loss_mlp": 1.02445614, - "epoch": 0.34913572824289796, - "flos": 30226873328640.0, - "grad_norm": 1.5212397526739, - "language_loss": 0.71703929, - "learning_rate": 3.023404690904629e-06, - "loss": 0.73866862, - "num_input_tokens_seen": 124956060, - "step": 5807, - "time_per_iteration": 2.7225730419158936 - }, - { - "auxiliary_loss_clip": 0.01135343, - "auxiliary_loss_mlp": 0.0103686, - "balance_loss_clip": 1.04923332, - "balance_loss_mlp": 1.02102923, - "epoch": 0.3491958514955659, - "flos": 29971907614080.0, - "grad_norm": 2.9062872704377125, - "language_loss": 0.7383548, - "learning_rate": 3.0230700605158364e-06, - "loss": 0.76007676, - "num_input_tokens_seen": 124976070, - "step": 5808, - "time_per_iteration": 4.38737154006958 - }, - { - "auxiliary_loss_clip": 0.01133483, - "auxiliary_loss_mlp": 0.01047071, - "balance_loss_clip": 1.05228174, - "balance_loss_mlp": 1.03241384, - "epoch": 0.3492559747482339, - "flos": 22783273902720.0, - "grad_norm": 1.513097370663534, - "language_loss": 0.84501046, - "learning_rate": 3.0227353913321238e-06, - "loss": 0.86681598, - "num_input_tokens_seen": 124996995, - "step": 5809, - "time_per_iteration": 2.629246711730957 - }, - { - "auxiliary_loss_clip": 0.01106316, - "auxiliary_loss_mlp": 0.01034055, - "balance_loss_clip": 1.04668331, - "balance_loss_mlp": 1.01995289, - "epoch": 0.34931609800090185, - "flos": 26068022881920.0, - "grad_norm": 2.856878325415132, - "language_loss": 0.80759805, - "learning_rate": 3.0224006833661835e-06, - "loss": 0.82900178, - "num_input_tokens_seen": 125015600, - "step": 5810, - "time_per_iteration": 2.815232276916504 - }, - { - "auxiliary_loss_clip": 0.01134295, - "auxiliary_loss_mlp": 0.01039591, - "balance_loss_clip": 1.05105019, - "balance_loss_mlp": 1.02539277, - "epoch": 0.3493762212535698, - "flos": 29242023252480.0, - "grad_norm": 1.9587859815348794, - "language_loss": 0.75694251, - "learning_rate": 3.0220659366307057e-06, - "loss": 0.7786814, - "num_input_tokens_seen": 125035290, - "step": 5811, - "time_per_iteration": 4.295617580413818 - }, - { - "auxiliary_loss_clip": 0.0111498, - "auxiliary_loss_mlp": 0.01040701, - "balance_loss_clip": 1.04791081, - "balance_loss_mlp": 1.02616942, - "epoch": 0.3494363445062378, - "flos": 27126058919040.0, - "grad_norm": 1.5951936061604581, - "language_loss": 0.80199474, - "learning_rate": 3.021731151138386e-06, - "loss": 0.82355154, - "num_input_tokens_seen": 125057130, - "step": 5812, - "time_per_iteration": 2.8571486473083496 - }, - { - "auxiliary_loss_clip": 0.0106966, - "auxiliary_loss_mlp": 0.01038506, - "balance_loss_clip": 1.04193187, - "balance_loss_mlp": 1.02299738, - "epoch": 0.34949646775890575, - "flos": 12276207233280.0, - "grad_norm": 1.932575417997546, - "language_loss": 0.69221139, - "learning_rate": 3.021396326901918e-06, - "loss": 0.71329308, - "num_input_tokens_seen": 125073720, - "step": 5813, - "time_per_iteration": 4.446147441864014 - }, - { - "auxiliary_loss_clip": 0.01101223, - "auxiliary_loss_mlp": 0.00772918, - "balance_loss_clip": 1.04168797, - "balance_loss_mlp": 1.00074911, - "epoch": 0.3495565910115737, - "flos": 17165516659200.0, - "grad_norm": 2.168508070197816, - "language_loss": 0.76586467, - "learning_rate": 3.0210614639339998e-06, - "loss": 0.7846061, - "num_input_tokens_seen": 125090635, - "step": 5814, - "time_per_iteration": 2.698594331741333 - }, - { - "auxiliary_loss_clip": 0.01114737, - "auxiliary_loss_mlp": 0.00773337, - "balance_loss_clip": 1.05010188, - "balance_loss_mlp": 1.00060046, - "epoch": 0.3496167142642417, - "flos": 26465661417600.0, - "grad_norm": 1.9777422761312171, - "language_loss": 0.84760284, - "learning_rate": 3.020726562247328e-06, - "loss": 0.86648357, - "num_input_tokens_seen": 125110070, - "step": 5815, - "time_per_iteration": 2.7839486598968506 - }, - { - "auxiliary_loss_clip": 0.01117022, - "auxiliary_loss_mlp": 0.01031007, - "balance_loss_clip": 1.04850423, - "balance_loss_mlp": 1.01695168, - "epoch": 0.34967683751690964, - "flos": 17414843938560.0, - "grad_norm": 2.1137892099104674, - "language_loss": 0.77541941, - "learning_rate": 3.0203916218546024e-06, - "loss": 0.79689968, - "num_input_tokens_seen": 125125730, - "step": 5816, - "time_per_iteration": 2.6244633197784424 - }, - { - "auxiliary_loss_clip": 0.01122041, - "auxiliary_loss_mlp": 0.01042966, - "balance_loss_clip": 1.05198002, - "balance_loss_mlp": 1.0282141, - "epoch": 0.3497369607695776, - "flos": 22600021691520.0, - "grad_norm": 2.2643435778821246, - "language_loss": 0.5898062, - "learning_rate": 3.0200566427685246e-06, - "loss": 0.61145627, - "num_input_tokens_seen": 125146195, - "step": 5817, - "time_per_iteration": 2.676058530807495 - }, - { - "auxiliary_loss_clip": 0.01065616, - "auxiliary_loss_mlp": 0.01004328, - "balance_loss_clip": 1.03704262, - "balance_loss_mlp": 1.00290895, - "epoch": 0.34979708402224563, - "flos": 68529374818560.0, - "grad_norm": 0.8661744616347857, - "language_loss": 0.59915632, - "learning_rate": 3.0197216250017975e-06, - "loss": 0.61985576, - "num_input_tokens_seen": 125207790, - "step": 5818, - "time_per_iteration": 3.2298331260681152 - }, - { - "auxiliary_loss_clip": 0.0109396, - "auxiliary_loss_mlp": 0.01044055, - "balance_loss_clip": 1.04599476, - "balance_loss_mlp": 1.02892733, - "epoch": 0.3498572072749136, - "flos": 18989634988800.0, - "grad_norm": 2.0582091611638713, - "language_loss": 0.83473527, - "learning_rate": 3.019386568567123e-06, - "loss": 0.85611546, - "num_input_tokens_seen": 125226220, - "step": 5819, - "time_per_iteration": 2.6558237075805664 - }, - { - "auxiliary_loss_clip": 0.01106439, - "auxiliary_loss_mlp": 0.01034351, - "balance_loss_clip": 1.04502416, - "balance_loss_mlp": 1.01987886, - "epoch": 0.34991733052758156, - "flos": 27818883423360.0, - "grad_norm": 1.848700539441483, - "language_loss": 0.7078613, - "learning_rate": 3.0190514734772083e-06, - "loss": 0.72926915, - "num_input_tokens_seen": 125247485, - "step": 5820, - "time_per_iteration": 2.703023672103882 - }, - { - "auxiliary_loss_clip": 0.01122902, - "auxiliary_loss_mlp": 0.01036767, - "balance_loss_clip": 1.04821718, - "balance_loss_mlp": 1.02288496, - "epoch": 0.3499774537802495, - "flos": 33584197737600.0, - "grad_norm": 1.691680241057735, - "language_loss": 0.70418453, - "learning_rate": 3.018716339744759e-06, - "loss": 0.7257812, - "num_input_tokens_seen": 125268625, - "step": 5821, - "time_per_iteration": 2.7258172035217285 - }, - { - "auxiliary_loss_clip": 0.01128016, - "auxiliary_loss_mlp": 0.01045237, - "balance_loss_clip": 1.05040097, - "balance_loss_mlp": 1.02945328, - "epoch": 0.3500375770329175, - "flos": 23476744851840.0, - "grad_norm": 3.022669367007059, - "language_loss": 0.73552108, - "learning_rate": 3.0183811673824842e-06, - "loss": 0.75725359, - "num_input_tokens_seen": 125287530, - "step": 5822, - "time_per_iteration": 2.6288442611694336 - }, - { - "auxiliary_loss_clip": 0.01111612, - "auxiliary_loss_mlp": 0.01034787, - "balance_loss_clip": 1.04867673, - "balance_loss_mlp": 1.0193131, - "epoch": 0.35009770028558546, - "flos": 19026048401280.0, - "grad_norm": 13.86145468617928, - "language_loss": 0.78286207, - "learning_rate": 3.018045956403094e-06, - "loss": 0.80432606, - "num_input_tokens_seen": 125307020, - "step": 5823, - "time_per_iteration": 2.585644245147705 - }, - { - "auxiliary_loss_clip": 0.01050549, - "auxiliary_loss_mlp": 0.01002993, - "balance_loss_clip": 1.03169346, - "balance_loss_mlp": 1.00141954, - "epoch": 0.3501578235382534, - "flos": 68351868783360.0, - "grad_norm": 0.7268668465066358, - "language_loss": 0.59232962, - "learning_rate": 3.017710706819298e-06, - "loss": 0.61286497, - "num_input_tokens_seen": 125370445, - "step": 5824, - "time_per_iteration": 3.2155251502990723 - }, - { - "auxiliary_loss_clip": 0.01110681, - "auxiliary_loss_mlp": 0.01041197, - "balance_loss_clip": 1.04737854, - "balance_loss_mlp": 1.02561092, - "epoch": 0.3502179467909214, - "flos": 21250893836160.0, - "grad_norm": 3.9873136748139126, - "language_loss": 0.84533477, - "learning_rate": 3.017375418643811e-06, - "loss": 0.86685359, - "num_input_tokens_seen": 125388900, - "step": 5825, - "time_per_iteration": 2.687849998474121 - }, - { - "auxiliary_loss_clip": 0.01123129, - "auxiliary_loss_mlp": 0.00772852, - "balance_loss_clip": 1.04982102, - "balance_loss_mlp": 1.00084817, - "epoch": 0.35027807004358935, - "flos": 11942955826560.0, - "grad_norm": 3.7970216760931654, - "language_loss": 0.83272213, - "learning_rate": 3.0170400918893464e-06, - "loss": 0.85168195, - "num_input_tokens_seen": 125402675, - "step": 5826, - "time_per_iteration": 2.623713970184326 - }, - { - "auxiliary_loss_clip": 0.01108751, - "auxiliary_loss_mlp": 0.01045941, - "balance_loss_clip": 1.04680669, - "balance_loss_mlp": 1.0308249, - "epoch": 0.3503381932962573, - "flos": 21470918595840.0, - "grad_norm": 1.799644232020304, - "language_loss": 0.8068707, - "learning_rate": 3.0167047265686186e-06, - "loss": 0.82841766, - "num_input_tokens_seen": 125421360, - "step": 5827, - "time_per_iteration": 2.7149739265441895 - }, - { - "auxiliary_loss_clip": 0.01080927, - "auxiliary_loss_mlp": 0.01041383, - "balance_loss_clip": 1.04276204, - "balance_loss_mlp": 1.02641606, - "epoch": 0.3503983165489253, - "flos": 21251109317760.0, - "grad_norm": 3.105536532024743, - "language_loss": 0.71077561, - "learning_rate": 3.0163693226943467e-06, - "loss": 0.73199868, - "num_input_tokens_seen": 125440000, - "step": 5828, - "time_per_iteration": 2.7468550205230713 - }, - { - "auxiliary_loss_clip": 0.01126682, - "auxiliary_loss_mlp": 0.01050267, - "balance_loss_clip": 1.05060673, - "balance_loss_mlp": 1.0323143, - "epoch": 0.35045843980159325, - "flos": 27815723026560.0, - "grad_norm": 2.750124615693701, - "language_loss": 0.79695857, - "learning_rate": 3.016033880279248e-06, - "loss": 0.81872809, - "num_input_tokens_seen": 125460390, - "step": 5829, - "time_per_iteration": 2.6937646865844727 - }, - { - "auxiliary_loss_clip": 0.01096574, - "auxiliary_loss_mlp": 0.01044418, - "balance_loss_clip": 1.0481379, - "balance_loss_mlp": 1.02766919, - "epoch": 0.3505185630542612, - "flos": 25921148169600.0, - "grad_norm": 1.9090298023730403, - "language_loss": 0.72606629, - "learning_rate": 3.0156983993360417e-06, - "loss": 0.74747616, - "num_input_tokens_seen": 125478410, - "step": 5830, - "time_per_iteration": 2.7369346618652344 - }, - { - "auxiliary_loss_clip": 0.01090166, - "auxiliary_loss_mlp": 0.01037306, - "balance_loss_clip": 1.04190445, - "balance_loss_mlp": 1.02131414, - "epoch": 0.35057868630692923, - "flos": 20521763660160.0, - "grad_norm": 2.5268343856675437, - "language_loss": 0.88473773, - "learning_rate": 3.0153628798774513e-06, - "loss": 0.90601242, - "num_input_tokens_seen": 125495975, - "step": 5831, - "time_per_iteration": 2.716801166534424 - }, - { - "auxiliary_loss_clip": 0.01076431, - "auxiliary_loss_mlp": 0.01046131, - "balance_loss_clip": 1.04348278, - "balance_loss_mlp": 1.03036547, - "epoch": 0.3506388095595972, - "flos": 20448649526400.0, - "grad_norm": 2.8335622037275052, - "language_loss": 0.78706706, - "learning_rate": 3.0150273219161985e-06, - "loss": 0.80829263, - "num_input_tokens_seen": 125515035, - "step": 5832, - "time_per_iteration": 2.719874143600464 - }, - { - "auxiliary_loss_clip": 0.01096023, - "auxiliary_loss_mlp": 0.01049214, - "balance_loss_clip": 1.04483593, - "balance_loss_mlp": 1.0303669, - "epoch": 0.35069893281226516, - "flos": 23109665811840.0, - "grad_norm": 2.771771323399588, - "language_loss": 0.71084702, - "learning_rate": 3.014691725465008e-06, - "loss": 0.73229945, - "num_input_tokens_seen": 125535555, - "step": 5833, - "time_per_iteration": 2.729029655456543 - }, - { - "auxiliary_loss_clip": 0.0111933, - "auxiliary_loss_mlp": 0.01035784, - "balance_loss_clip": 1.04690456, - "balance_loss_mlp": 1.02119827, - "epoch": 0.35075905606493313, - "flos": 27271999877760.0, - "grad_norm": 1.4652984704802052, - "language_loss": 0.80866987, - "learning_rate": 3.014356090536606e-06, - "loss": 0.830221, - "num_input_tokens_seen": 125558195, - "step": 5834, - "time_per_iteration": 2.6999855041503906 - }, - { - "auxiliary_loss_clip": 0.01086162, - "auxiliary_loss_mlp": 0.01041057, - "balance_loss_clip": 1.05142856, - "balance_loss_mlp": 1.02516639, - "epoch": 0.3508191793176011, - "flos": 19128608709120.0, - "grad_norm": 2.24398587431922, - "language_loss": 0.84067535, - "learning_rate": 3.0140204171437183e-06, - "loss": 0.86194754, - "num_input_tokens_seen": 125575375, - "step": 5835, - "time_per_iteration": 2.7401607036590576 - }, - { - "auxiliary_loss_clip": 0.01072219, - "auxiliary_loss_mlp": 0.0104369, - "balance_loss_clip": 1.04324877, - "balance_loss_mlp": 1.02816927, - "epoch": 0.35087930257026906, - "flos": 25557588662400.0, - "grad_norm": 1.6286460178957367, - "language_loss": 0.76643491, - "learning_rate": 3.0136847052990754e-06, - "loss": 0.78759408, - "num_input_tokens_seen": 125596745, - "step": 5836, - "time_per_iteration": 2.767824649810791 - }, - { - "auxiliary_loss_clip": 0.01095252, - "auxiliary_loss_mlp": 0.01044499, - "balance_loss_clip": 1.04785156, - "balance_loss_mlp": 1.02751756, - "epoch": 0.350939425822937, - "flos": 18004246208640.0, - "grad_norm": 2.0145924652365945, - "language_loss": 0.77402902, - "learning_rate": 3.0133489550154074e-06, - "loss": 0.79542655, - "num_input_tokens_seen": 125613980, - "step": 5837, - "time_per_iteration": 2.684300661087036 - }, - { - "auxiliary_loss_clip": 0.01122261, - "auxiliary_loss_mlp": 0.01044889, - "balance_loss_clip": 1.04895687, - "balance_loss_mlp": 1.02941537, - "epoch": 0.350999549075605, - "flos": 22273198819200.0, - "grad_norm": 2.68275803808264, - "language_loss": 0.67695981, - "learning_rate": 3.0130131663054442e-06, - "loss": 0.69863135, - "num_input_tokens_seen": 125632100, - "step": 5838, - "time_per_iteration": 2.6679129600524902 - }, - { - "auxiliary_loss_clip": 0.01133084, - "auxiliary_loss_mlp": 0.01041419, - "balance_loss_clip": 1.04808521, - "balance_loss_mlp": 1.02538526, - "epoch": 0.35105967232827295, - "flos": 14392279307520.0, - "grad_norm": 2.478699358378921, - "language_loss": 0.83575064, - "learning_rate": 3.0126773391819215e-06, - "loss": 0.85749567, - "num_input_tokens_seen": 125649190, - "step": 5839, - "time_per_iteration": 2.7186849117279053 - }, - { - "auxiliary_loss_clip": 0.01125827, - "auxiliary_loss_mlp": 0.01045138, - "balance_loss_clip": 1.0484879, - "balance_loss_mlp": 1.02930689, - "epoch": 0.3511197955809409, - "flos": 25082346792960.0, - "grad_norm": 2.56286420283892, - "language_loss": 0.58882701, - "learning_rate": 3.012341473657572e-06, - "loss": 0.61053669, - "num_input_tokens_seen": 125668680, - "step": 5840, - "time_per_iteration": 2.7048165798187256 - }, - { - "auxiliary_loss_clip": 0.01093858, - "auxiliary_loss_mlp": 0.01043209, - "balance_loss_clip": 1.0449121, - "balance_loss_mlp": 1.02719963, - "epoch": 0.3511799188336089, - "flos": 25884160139520.0, - "grad_norm": 2.762376787670534, - "language_loss": 0.87442869, - "learning_rate": 3.0120055697451322e-06, - "loss": 0.89579934, - "num_input_tokens_seen": 125686935, - "step": 5841, - "time_per_iteration": 2.763007402420044 - }, - { - "auxiliary_loss_clip": 0.01116677, - "auxiliary_loss_mlp": 0.01038697, - "balance_loss_clip": 1.04990196, - "balance_loss_mlp": 1.02083993, - "epoch": 0.35124004208627685, - "flos": 20083725302400.0, - "grad_norm": 1.9868500880648916, - "language_loss": 0.75116056, - "learning_rate": 3.0116696274573406e-06, - "loss": 0.77271438, - "num_input_tokens_seen": 125707180, - "step": 5842, - "time_per_iteration": 2.703010082244873 - }, - { - "auxiliary_loss_clip": 0.01124735, - "auxiliary_loss_mlp": 0.01045785, - "balance_loss_clip": 1.04863322, - "balance_loss_mlp": 1.0302043, - "epoch": 0.3513001653389448, - "flos": 17783431349760.0, - "grad_norm": 2.134458584945634, - "language_loss": 0.68687361, - "learning_rate": 3.0113336468069346e-06, - "loss": 0.70857882, - "num_input_tokens_seen": 125722780, - "step": 5843, - "time_per_iteration": 2.6459767818450928 - }, - { - "auxiliary_loss_clip": 0.01135637, - "auxiliary_loss_mlp": 0.01046534, - "balance_loss_clip": 1.05054379, - "balance_loss_mlp": 1.0305481, - "epoch": 0.3513602885916128, - "flos": 29387138198400.0, - "grad_norm": 2.0610262324560984, - "language_loss": 0.65392244, - "learning_rate": 3.010997627806655e-06, - "loss": 0.67574418, - "num_input_tokens_seen": 125742110, - "step": 5844, - "time_per_iteration": 2.6542131900787354 - }, - { - "auxiliary_loss_clip": 0.01119986, - "auxiliary_loss_mlp": 0.01042575, - "balance_loss_clip": 1.04791713, - "balance_loss_mlp": 1.02620745, - "epoch": 0.3514204118442808, - "flos": 16179876483840.0, - "grad_norm": 2.0120705985466394, - "language_loss": 0.75180912, - "learning_rate": 3.010661570469245e-06, - "loss": 0.77343476, - "num_input_tokens_seen": 125759980, - "step": 5845, - "time_per_iteration": 2.686753511428833 - }, - { - "auxiliary_loss_clip": 0.01122626, - "auxiliary_loss_mlp": 0.01043989, - "balance_loss_clip": 1.0485301, - "balance_loss_mlp": 1.02835488, - "epoch": 0.35148053509694877, - "flos": 23834665923840.0, - "grad_norm": 4.021226487899694, - "language_loss": 0.73548663, - "learning_rate": 3.0103254748074465e-06, - "loss": 0.7571528, - "num_input_tokens_seen": 125772660, - "step": 5846, - "time_per_iteration": 2.67868971824646 - }, - { - "auxiliary_loss_clip": 0.01094187, - "auxiliary_loss_mlp": 0.01044379, - "balance_loss_clip": 1.04565465, - "balance_loss_mlp": 1.02834511, - "epoch": 0.35154065834961673, - "flos": 20991295267200.0, - "grad_norm": 1.687499817432144, - "language_loss": 0.756024, - "learning_rate": 3.0099893408340046e-06, - "loss": 0.77740967, - "num_input_tokens_seen": 125791935, - "step": 5847, - "time_per_iteration": 2.749495267868042 - }, - { - "auxiliary_loss_clip": 0.011087, - "auxiliary_loss_mlp": 0.01034036, - "balance_loss_clip": 1.04465413, - "balance_loss_mlp": 1.01871789, - "epoch": 0.3516007816022847, - "flos": 33255471444480.0, - "grad_norm": 2.8847551511625675, - "language_loss": 0.71752924, - "learning_rate": 3.009653168561666e-06, - "loss": 0.73895657, - "num_input_tokens_seen": 125813455, - "step": 5848, - "time_per_iteration": 4.367843151092529 - }, - { - "auxiliary_loss_clip": 0.0111724, - "auxiliary_loss_mlp": 0.01051356, - "balance_loss_clip": 1.04754996, - "balance_loss_mlp": 1.03528619, - "epoch": 0.35166090485495266, - "flos": 11726953390080.0, - "grad_norm": 2.1303857634409455, - "language_loss": 0.89211285, - "learning_rate": 3.009316958003178e-06, - "loss": 0.91379881, - "num_input_tokens_seen": 125827660, - "step": 5849, - "time_per_iteration": 2.720156192779541 - }, - { - "auxiliary_loss_clip": 0.01112345, - "auxiliary_loss_mlp": 0.01035199, - "balance_loss_clip": 1.04670548, - "balance_loss_mlp": 1.01948714, - "epoch": 0.3517210281076206, - "flos": 22638446265600.0, - "grad_norm": 5.671837642447228, - "language_loss": 0.74645329, - "learning_rate": 3.0089807091712897e-06, - "loss": 0.76792872, - "num_input_tokens_seen": 125846655, - "step": 5850, - "time_per_iteration": 5.769666910171509 - }, - { - "auxiliary_loss_clip": 0.01124277, - "auxiliary_loss_mlp": 0.01039165, - "balance_loss_clip": 1.05061293, - "balance_loss_mlp": 1.02304828, - "epoch": 0.3517811513602886, - "flos": 21322750993920.0, - "grad_norm": 4.453824391316201, - "language_loss": 0.75497609, - "learning_rate": 3.0086444220787515e-06, - "loss": 0.77661049, - "num_input_tokens_seen": 125866290, - "step": 5851, - "time_per_iteration": 2.6903436183929443 - }, - { - "auxiliary_loss_clip": 0.01109028, - "auxiliary_loss_mlp": 0.01043585, - "balance_loss_clip": 1.047647, - "balance_loss_mlp": 1.02581048, - "epoch": 0.35184127461295656, - "flos": 21032880238080.0, - "grad_norm": 2.6842208339362714, - "language_loss": 0.8711859, - "learning_rate": 3.0083080967383165e-06, - "loss": 0.892712, - "num_input_tokens_seen": 125884620, - "step": 5852, - "time_per_iteration": 4.37211275100708 - }, - { - "auxiliary_loss_clip": 0.01134086, - "auxiliary_loss_mlp": 0.01034974, - "balance_loss_clip": 1.05088282, - "balance_loss_mlp": 1.02020407, - "epoch": 0.3519013978656245, - "flos": 22455265881600.0, - "grad_norm": 4.894656899057391, - "language_loss": 0.67756367, - "learning_rate": 3.007971733162737e-06, - "loss": 0.69925427, - "num_input_tokens_seen": 125902430, - "step": 5853, - "time_per_iteration": 2.6657445430755615 - }, - { - "auxiliary_loss_clip": 0.0110992, - "auxiliary_loss_mlp": 0.01035315, - "balance_loss_clip": 1.04499912, - "balance_loss_mlp": 1.01943672, - "epoch": 0.3519615211182925, - "flos": 13115295918720.0, - "grad_norm": 1.9396695842158058, - "language_loss": 0.80834955, - "learning_rate": 3.0076353313647686e-06, - "loss": 0.82980192, - "num_input_tokens_seen": 125920570, - "step": 5854, - "time_per_iteration": 2.741804361343384 - }, - { - "auxiliary_loss_clip": 0.0111683, - "auxiliary_loss_mlp": 0.01035573, - "balance_loss_clip": 1.05230534, - "balance_loss_mlp": 1.02117872, - "epoch": 0.35202164437096045, - "flos": 19135144984320.0, - "grad_norm": 2.236186864476635, - "language_loss": 0.73234653, - "learning_rate": 3.0072988913571666e-06, - "loss": 0.75387061, - "num_input_tokens_seen": 125939800, - "step": 5855, - "time_per_iteration": 2.730731725692749 - }, - { - "auxiliary_loss_clip": 0.0113392, - "auxiliary_loss_mlp": 0.01038425, - "balance_loss_clip": 1.05024409, - "balance_loss_mlp": 1.02407861, - "epoch": 0.3520817676236284, - "flos": 26542187343360.0, - "grad_norm": 2.4482136775911427, - "language_loss": 0.71000826, - "learning_rate": 3.006962413152691e-06, - "loss": 0.73173165, - "num_input_tokens_seen": 125958720, - "step": 5856, - "time_per_iteration": 2.632906436920166 - }, - { - "auxiliary_loss_clip": 0.01121339, - "auxiliary_loss_mlp": 0.01047265, - "balance_loss_clip": 1.0479008, - "balance_loss_mlp": 1.03056359, - "epoch": 0.3521418908762964, - "flos": 44893472803200.0, - "grad_norm": 1.9582827204032656, - "language_loss": 0.61505377, - "learning_rate": 3.0066258967640987e-06, - "loss": 0.63673985, - "num_input_tokens_seen": 125984310, - "step": 5857, - "time_per_iteration": 2.8992249965667725 - }, - { - "auxiliary_loss_clip": 0.01126198, - "auxiliary_loss_mlp": 0.0103782, - "balance_loss_clip": 1.05141187, - "balance_loss_mlp": 1.02197754, - "epoch": 0.3522020141289644, - "flos": 20187398931840.0, - "grad_norm": 2.047463358229584, - "language_loss": 0.73246485, - "learning_rate": 3.006289342204152e-06, - "loss": 0.75410509, - "num_input_tokens_seen": 126002410, - "step": 5858, - "time_per_iteration": 2.6754567623138428 - }, - { - "auxiliary_loss_clip": 0.01139705, - "auxiliary_loss_mlp": 0.01044718, - "balance_loss_clip": 1.05193448, - "balance_loss_mlp": 1.028947, - "epoch": 0.35226213738163237, - "flos": 27563917708800.0, - "grad_norm": 1.8174320112537778, - "language_loss": 0.7662344, - "learning_rate": 3.0059527494856126e-06, - "loss": 0.78807867, - "num_input_tokens_seen": 126022490, - "step": 5859, - "time_per_iteration": 2.6464414596557617 - }, - { - "auxiliary_loss_clip": 0.01123734, - "auxiliary_loss_mlp": 0.0104748, - "balance_loss_clip": 1.05600715, - "balance_loss_mlp": 1.03037381, - "epoch": 0.35232226063430033, - "flos": 22966310632320.0, - "grad_norm": 2.0728265984729974, - "language_loss": 0.71452159, - "learning_rate": 3.0056161186212435e-06, - "loss": 0.73623371, - "num_input_tokens_seen": 126042895, - "step": 5860, - "time_per_iteration": 2.7567954063415527 - }, - { - "auxiliary_loss_clip": 0.01107752, - "auxiliary_loss_mlp": 0.01042463, - "balance_loss_clip": 1.04505348, - "balance_loss_mlp": 1.02517724, - "epoch": 0.3523823838869683, - "flos": 19168290259200.0, - "grad_norm": 2.4820154826508896, - "language_loss": 0.66456246, - "learning_rate": 3.005279449623811e-06, - "loss": 0.6860646, - "num_input_tokens_seen": 126060130, - "step": 5861, - "time_per_iteration": 2.6954853534698486 - }, - { - "auxiliary_loss_clip": 0.01114832, - "auxiliary_loss_mlp": 0.01037396, - "balance_loss_clip": 1.05085611, - "balance_loss_mlp": 1.0220778, - "epoch": 0.35244250713963626, - "flos": 17930988420480.0, - "grad_norm": 2.552495084661914, - "language_loss": 0.66833258, - "learning_rate": 3.0049427425060815e-06, - "loss": 0.68985492, - "num_input_tokens_seen": 126077850, - "step": 5862, - "time_per_iteration": 2.758626699447632 - }, - { - "auxiliary_loss_clip": 0.01111543, - "auxiliary_loss_mlp": 0.01046885, - "balance_loss_clip": 1.04932082, - "balance_loss_mlp": 1.02999306, - "epoch": 0.35250263039230423, - "flos": 21432529935360.0, - "grad_norm": 2.001922070828984, - "language_loss": 0.77027225, - "learning_rate": 3.0046059972808215e-06, - "loss": 0.79185653, - "num_input_tokens_seen": 126095985, - "step": 5863, - "time_per_iteration": 2.692974328994751 - }, - { - "auxiliary_loss_clip": 0.01124448, - "auxiliary_loss_mlp": 0.01041257, - "balance_loss_clip": 1.05029762, - "balance_loss_mlp": 1.02602828, - "epoch": 0.3525627536449722, - "flos": 27416863428480.0, - "grad_norm": 2.204178263750967, - "language_loss": 0.75406265, - "learning_rate": 3.0042692139608024e-06, - "loss": 0.77571976, - "num_input_tokens_seen": 126116070, - "step": 5864, - "time_per_iteration": 2.7303273677825928 - }, - { - "auxiliary_loss_clip": 0.01124417, - "auxiliary_loss_mlp": 0.01048097, - "balance_loss_clip": 1.04847336, - "balance_loss_mlp": 1.03237331, - "epoch": 0.35262287689764016, - "flos": 24789818430720.0, - "grad_norm": 2.3571129928423713, - "language_loss": 0.79312253, - "learning_rate": 3.003932392558793e-06, - "loss": 0.81484771, - "num_input_tokens_seen": 126135205, - "step": 5865, - "time_per_iteration": 2.6439075469970703 - }, - { - "auxiliary_loss_clip": 0.01136688, - "auxiliary_loss_mlp": 0.01047929, - "balance_loss_clip": 1.05626893, - "balance_loss_mlp": 1.03143001, - "epoch": 0.3526830001503081, - "flos": 17821604528640.0, - "grad_norm": 2.261768767041389, - "language_loss": 0.81215894, - "learning_rate": 3.0035955330875677e-06, - "loss": 0.83400512, - "num_input_tokens_seen": 126151895, - "step": 5866, - "time_per_iteration": 2.649991035461426 - }, - { - "auxiliary_loss_clip": 0.01095064, - "auxiliary_loss_mlp": 0.01040513, - "balance_loss_clip": 1.04940605, - "balance_loss_mlp": 1.0227983, - "epoch": 0.3527431234029761, - "flos": 18078114528000.0, - "grad_norm": 2.4092573216113182, - "language_loss": 0.84224141, - "learning_rate": 3.0032586355598986e-06, - "loss": 0.86359721, - "num_input_tokens_seen": 126168515, - "step": 5867, - "time_per_iteration": 2.7634172439575195 - }, - { - "auxiliary_loss_clip": 0.01142449, - "auxiliary_loss_mlp": 0.01051484, - "balance_loss_clip": 1.05421114, - "balance_loss_mlp": 1.03525996, - "epoch": 0.35280324665564405, - "flos": 19427350124160.0, - "grad_norm": 1.8115003163784764, - "language_loss": 0.74367464, - "learning_rate": 3.0029216999885613e-06, - "loss": 0.76561391, - "num_input_tokens_seen": 126186460, - "step": 5868, - "time_per_iteration": 2.5986721515655518 - }, - { - "auxiliary_loss_clip": 0.01131163, - "auxiliary_loss_mlp": 0.01040977, - "balance_loss_clip": 1.05391645, - "balance_loss_mlp": 1.02457356, - "epoch": 0.352863369908312, - "flos": 21504027957120.0, - "grad_norm": 1.9536193185751474, - "language_loss": 0.6105355, - "learning_rate": 3.0025847263863327e-06, - "loss": 0.63225693, - "num_input_tokens_seen": 126206170, - "step": 5869, - "time_per_iteration": 2.6737887859344482 - }, - { - "auxiliary_loss_clip": 0.0112854, - "auxiliary_loss_mlp": 0.01048512, - "balance_loss_clip": 1.05128717, - "balance_loss_mlp": 1.03254998, - "epoch": 0.35292349316098, - "flos": 22309504490880.0, - "grad_norm": 2.4234624332717347, - "language_loss": 0.74279565, - "learning_rate": 3.0022477147659917e-06, - "loss": 0.76456618, - "num_input_tokens_seen": 126225605, - "step": 5870, - "time_per_iteration": 2.6921114921569824 - }, - { - "auxiliary_loss_clip": 0.01126478, - "auxiliary_loss_mlp": 0.01039703, - "balance_loss_clip": 1.05037582, - "balance_loss_mlp": 1.02376485, - "epoch": 0.352983616413648, - "flos": 33109745967360.0, - "grad_norm": 1.6641276231491144, - "language_loss": 0.71796882, - "learning_rate": 3.001910665140316e-06, - "loss": 0.73963058, - "num_input_tokens_seen": 126250230, - "step": 5871, - "time_per_iteration": 2.8457682132720947 - }, - { - "auxiliary_loss_clip": 0.01120204, - "auxiliary_loss_mlp": 0.01040363, - "balance_loss_clip": 1.04829907, - "balance_loss_mlp": 1.02547359, - "epoch": 0.35304373966631597, - "flos": 18696603836160.0, - "grad_norm": 2.0001362497177233, - "language_loss": 0.73279023, - "learning_rate": 3.0015735775220873e-06, - "loss": 0.75439584, - "num_input_tokens_seen": 126268315, - "step": 5872, - "time_per_iteration": 2.6763055324554443 - }, - { - "auxiliary_loss_clip": 0.01114426, - "auxiliary_loss_mlp": 0.0077352, - "balance_loss_clip": 1.04808497, - "balance_loss_mlp": 1.00056779, - "epoch": 0.35310386291898394, - "flos": 23364954748800.0, - "grad_norm": 1.9067005964756008, - "language_loss": 0.82472706, - "learning_rate": 3.001236451924089e-06, - "loss": 0.84360659, - "num_input_tokens_seen": 126288390, - "step": 5873, - "time_per_iteration": 2.7487120628356934 - }, - { - "auxiliary_loss_clip": 0.0111852, - "auxiliary_loss_mlp": 0.01055173, - "balance_loss_clip": 1.04805684, - "balance_loss_mlp": 1.03743458, - "epoch": 0.3531639861716519, - "flos": 24461954064000.0, - "grad_norm": 2.0747562837168956, - "language_loss": 0.65867126, - "learning_rate": 3.000899288359104e-06, - "loss": 0.68040824, - "num_input_tokens_seen": 126305750, - "step": 5874, - "time_per_iteration": 2.717100143432617 - }, - { - "auxiliary_loss_clip": 0.01065517, - "auxiliary_loss_mlp": 0.01018804, - "balance_loss_clip": 1.04397154, - "balance_loss_mlp": 1.01712346, - "epoch": 0.35322410942431987, - "flos": 70312446881280.0, - "grad_norm": 0.7718710282270123, - "language_loss": 0.61513722, - "learning_rate": 3.000562086839917e-06, - "loss": 0.63598049, - "num_input_tokens_seen": 126362495, - "step": 5875, - "time_per_iteration": 3.1768009662628174 - }, - { - "auxiliary_loss_clip": 0.0106968, - "auxiliary_loss_mlp": 0.01053019, - "balance_loss_clip": 1.04069328, - "balance_loss_mlp": 1.03722405, - "epoch": 0.35328423267698783, - "flos": 19820894509440.0, - "grad_norm": 1.9274751499515825, - "language_loss": 0.79748046, - "learning_rate": 3.0002248473793163e-06, - "loss": 0.81870747, - "num_input_tokens_seen": 126378320, - "step": 5876, - "time_per_iteration": 2.7911314964294434 - }, - { - "auxiliary_loss_clip": 0.01038976, - "auxiliary_loss_mlp": 0.00753375, - "balance_loss_clip": 1.03853297, - "balance_loss_mlp": 1.00146759, - "epoch": 0.3533443559296558, - "flos": 60826356391680.0, - "grad_norm": 0.6715924709851474, - "language_loss": 0.56771934, - "learning_rate": 2.999887569990088e-06, - "loss": 0.58564281, - "num_input_tokens_seen": 126442735, - "step": 5877, - "time_per_iteration": 3.3190126419067383 - }, - { - "auxiliary_loss_clip": 0.01106988, - "auxiliary_loss_mlp": 0.0103768, - "balance_loss_clip": 1.04755747, - "balance_loss_mlp": 1.02150357, - "epoch": 0.35340447918232376, - "flos": 24755775315840.0, - "grad_norm": 2.262624772342981, - "language_loss": 0.72041059, - "learning_rate": 2.999550254685024e-06, - "loss": 0.74185729, - "num_input_tokens_seen": 126463090, - "step": 5878, - "time_per_iteration": 2.769482135772705 - }, - { - "auxiliary_loss_clip": 0.01111223, - "auxiliary_loss_mlp": 0.01039233, - "balance_loss_clip": 1.0494144, - "balance_loss_mlp": 1.02333045, - "epoch": 0.3534646024349917, - "flos": 21796304924160.0, - "grad_norm": 1.9529875004972157, - "language_loss": 0.78282005, - "learning_rate": 2.9992129014769136e-06, - "loss": 0.80432463, - "num_input_tokens_seen": 126482105, - "step": 5879, - "time_per_iteration": 2.7066614627838135 - }, - { - "auxiliary_loss_clip": 0.01111375, - "auxiliary_loss_mlp": 0.01046843, - "balance_loss_clip": 1.05344558, - "balance_loss_mlp": 1.0287354, - "epoch": 0.3535247256876597, - "flos": 20012119539840.0, - "grad_norm": 2.4774809869114547, - "language_loss": 0.63312674, - "learning_rate": 2.9988755103785493e-06, - "loss": 0.65470898, - "num_input_tokens_seen": 126502125, - "step": 5880, - "time_per_iteration": 2.87187123298645 - }, - { - "auxiliary_loss_clip": 0.01116729, - "auxiliary_loss_mlp": 0.01037267, - "balance_loss_clip": 1.05014002, - "balance_loss_mlp": 1.02067327, - "epoch": 0.35358484894032766, - "flos": 18187929383040.0, - "grad_norm": 2.079670586085082, - "language_loss": 0.65503716, - "learning_rate": 2.998538081402727e-06, - "loss": 0.67657715, - "num_input_tokens_seen": 126521950, - "step": 5881, - "time_per_iteration": 2.701570510864258 - }, - { - "auxiliary_loss_clip": 0.01119778, - "auxiliary_loss_mlp": 0.01035576, - "balance_loss_clip": 1.05182576, - "balance_loss_mlp": 1.02047253, - "epoch": 0.3536449721929956, - "flos": 22820369673600.0, - "grad_norm": 1.437925300063569, - "language_loss": 0.75797737, - "learning_rate": 2.998200614562239e-06, - "loss": 0.77953088, - "num_input_tokens_seen": 126542445, - "step": 5882, - "time_per_iteration": 2.713350772857666 - }, - { - "auxiliary_loss_clip": 0.01112568, - "auxiliary_loss_mlp": 0.01044857, - "balance_loss_clip": 1.0485872, - "balance_loss_mlp": 1.02591491, - "epoch": 0.3537050954456636, - "flos": 26432336574720.0, - "grad_norm": 2.160470372067537, - "language_loss": 0.70095098, - "learning_rate": 2.9978631098698847e-06, - "loss": 0.72252524, - "num_input_tokens_seen": 126560690, - "step": 5883, - "time_per_iteration": 2.77695631980896 - }, - { - "auxiliary_loss_clip": 0.01107169, - "auxiliary_loss_mlp": 0.01040706, - "balance_loss_clip": 1.04937398, - "balance_loss_mlp": 1.02364671, - "epoch": 0.3537652186983316, - "flos": 17197153562880.0, - "grad_norm": 3.3935912100169117, - "language_loss": 0.78052664, - "learning_rate": 2.9975255673384614e-06, - "loss": 0.80200535, - "num_input_tokens_seen": 126577620, - "step": 5884, - "time_per_iteration": 2.8704800605773926 - }, - { - "auxiliary_loss_clip": 0.0111409, - "auxiliary_loss_mlp": 0.01036742, - "balance_loss_clip": 1.05093837, - "balance_loss_mlp": 1.02157819, - "epoch": 0.3538253419509996, - "flos": 19536769929600.0, - "grad_norm": 1.9052381201351025, - "language_loss": 0.7519542, - "learning_rate": 2.9971879869807673e-06, - "loss": 0.77346253, - "num_input_tokens_seen": 126596235, - "step": 5885, - "time_per_iteration": 2.74930477142334 - }, - { - "auxiliary_loss_clip": 0.01088229, - "auxiliary_loss_mlp": 0.01040915, - "balance_loss_clip": 1.04355764, - "balance_loss_mlp": 1.02321255, - "epoch": 0.35388546520366754, - "flos": 12128578335360.0, - "grad_norm": 3.360136520151105, - "language_loss": 0.83904099, - "learning_rate": 2.996850368809606e-06, - "loss": 0.86033243, - "num_input_tokens_seen": 126612830, - "step": 5886, - "time_per_iteration": 2.9362361431121826 - }, - { - "auxiliary_loss_clip": 0.01139122, - "auxiliary_loss_mlp": 0.01039479, - "balance_loss_clip": 1.05223978, - "balance_loss_mlp": 1.02178788, - "epoch": 0.3539455884563355, - "flos": 19678149861120.0, - "grad_norm": 2.3342407880968765, - "language_loss": 0.78239143, - "learning_rate": 2.9965127128377787e-06, - "loss": 0.8041774, - "num_input_tokens_seen": 126630910, - "step": 5887, - "time_per_iteration": 4.157519340515137 - }, - { - "auxiliary_loss_clip": 0.01079386, - "auxiliary_loss_mlp": 0.01047635, - "balance_loss_clip": 1.04380405, - "balance_loss_mlp": 1.03155398, - "epoch": 0.35400571170900347, - "flos": 18072045129600.0, - "grad_norm": 3.4693260211189614, - "language_loss": 0.65532601, - "learning_rate": 2.996175019078089e-06, - "loss": 0.67659628, - "num_input_tokens_seen": 126648365, - "step": 5888, - "time_per_iteration": 2.7693519592285156 - }, - { - "auxiliary_loss_clip": 0.01108859, - "auxiliary_loss_mlp": 0.01038745, - "balance_loss_clip": 1.04853678, - "balance_loss_mlp": 1.02278328, - "epoch": 0.35406583496167143, - "flos": 26068058795520.0, - "grad_norm": 2.324375134725136, - "language_loss": 0.77100271, - "learning_rate": 2.9958372875433437e-06, - "loss": 0.7924788, - "num_input_tokens_seen": 126667500, - "step": 5889, - "time_per_iteration": 4.211338996887207 - }, - { - "auxiliary_loss_clip": 0.0110217, - "auxiliary_loss_mlp": 0.01041504, - "balance_loss_clip": 1.05017257, - "balance_loss_mlp": 1.0262332, - "epoch": 0.3541259582143394, - "flos": 19792453916160.0, - "grad_norm": 2.074151752869495, - "language_loss": 0.81132901, - "learning_rate": 2.9954995182463478e-06, - "loss": 0.83276576, - "num_input_tokens_seen": 126686820, - "step": 5890, - "time_per_iteration": 4.248823642730713 - }, - { - "auxiliary_loss_clip": 0.01112591, - "auxiliary_loss_mlp": 0.01034659, - "balance_loss_clip": 1.04692972, - "balance_loss_mlp": 1.01979923, - "epoch": 0.35418608146700736, - "flos": 24022084112640.0, - "grad_norm": 1.8036187380252735, - "language_loss": 0.79384875, - "learning_rate": 2.99516171119991e-06, - "loss": 0.81532121, - "num_input_tokens_seen": 126706965, - "step": 5891, - "time_per_iteration": 4.335815668106079 - }, - { - "auxiliary_loss_clip": 0.01099264, - "auxiliary_loss_mlp": 0.01046084, - "balance_loss_clip": 1.04669261, - "balance_loss_mlp": 1.0285244, - "epoch": 0.35424620471967533, - "flos": 12385770693120.0, - "grad_norm": 2.015603194975926, - "language_loss": 0.73404211, - "learning_rate": 2.9948238664168415e-06, - "loss": 0.75549555, - "num_input_tokens_seen": 126724015, - "step": 5892, - "time_per_iteration": 2.760498046875 - }, - { - "auxiliary_loss_clip": 0.01112321, - "auxiliary_loss_mlp": 0.01041472, - "balance_loss_clip": 1.04650092, - "balance_loss_mlp": 1.02434158, - "epoch": 0.3543063279723433, - "flos": 19673624747520.0, - "grad_norm": 2.094655212929219, - "language_loss": 0.6720162, - "learning_rate": 2.9944859839099518e-06, - "loss": 0.6935541, - "num_input_tokens_seen": 126737565, - "step": 5893, - "time_per_iteration": 2.671706199645996 - }, - { - "auxiliary_loss_clip": 0.01084647, - "auxiliary_loss_mlp": 0.01041527, - "balance_loss_clip": 1.04317796, - "balance_loss_mlp": 1.02440834, - "epoch": 0.35436645122501126, - "flos": 21909208348800.0, - "grad_norm": 1.9115541405313234, - "language_loss": 0.69860309, - "learning_rate": 2.9941480636920533e-06, - "loss": 0.71986485, - "num_input_tokens_seen": 126756095, - "step": 5894, - "time_per_iteration": 2.720066785812378 - }, - { - "auxiliary_loss_clip": 0.01111006, - "auxiliary_loss_mlp": 0.00773076, - "balance_loss_clip": 1.04764175, - "balance_loss_mlp": 1.00055242, - "epoch": 0.3544265744776792, - "flos": 21719527603200.0, - "grad_norm": 1.7998653616668008, - "language_loss": 0.74833035, - "learning_rate": 2.9938101057759615e-06, - "loss": 0.76717114, - "num_input_tokens_seen": 126775455, - "step": 5895, - "time_per_iteration": 2.8295304775238037 - }, - { - "auxiliary_loss_clip": 0.011052, - "auxiliary_loss_mlp": 0.01040742, - "balance_loss_clip": 1.04288006, - "balance_loss_mlp": 1.02485108, - "epoch": 0.3544866977303472, - "flos": 21213223447680.0, - "grad_norm": 2.053997857318945, - "language_loss": 0.83762395, - "learning_rate": 2.993472110174491e-06, - "loss": 0.85908329, - "num_input_tokens_seen": 126792320, - "step": 5896, - "time_per_iteration": 2.723158836364746 - }, - { - "auxiliary_loss_clip": 0.01111237, - "auxiliary_loss_mlp": 0.00773671, - "balance_loss_clip": 1.04756641, - "balance_loss_mlp": 1.0005331, - "epoch": 0.35454682098301515, - "flos": 29311402371840.0, - "grad_norm": 1.7709518935889355, - "language_loss": 0.70033729, - "learning_rate": 2.9931340769004576e-06, - "loss": 0.71918637, - "num_input_tokens_seen": 126813680, - "step": 5897, - "time_per_iteration": 2.744617223739624 - }, - { - "auxiliary_loss_clip": 0.01111293, - "auxiliary_loss_mlp": 0.01046033, - "balance_loss_clip": 1.04829669, - "balance_loss_mlp": 1.02830625, - "epoch": 0.3546069442356832, - "flos": 24316587722880.0, - "grad_norm": 3.0934933528513344, - "language_loss": 0.81546402, - "learning_rate": 2.9927960059666816e-06, - "loss": 0.83703721, - "num_input_tokens_seen": 126834395, - "step": 5898, - "time_per_iteration": 2.77911376953125 - }, - { - "auxiliary_loss_clip": 0.0113395, - "auxiliary_loss_mlp": 0.01037456, - "balance_loss_clip": 1.04943967, - "balance_loss_mlp": 1.02232838, - "epoch": 0.35466706748835114, - "flos": 22857285876480.0, - "grad_norm": 5.100417261000322, - "language_loss": 0.73975331, - "learning_rate": 2.9924578973859804e-06, - "loss": 0.7614674, - "num_input_tokens_seen": 126855145, - "step": 5899, - "time_per_iteration": 2.6566851139068604 - }, - { - "auxiliary_loss_clip": 0.0113747, - "auxiliary_loss_mlp": 0.00772565, - "balance_loss_clip": 1.04971743, - "balance_loss_mlp": 1.00056052, - "epoch": 0.3547271907410191, - "flos": 28330107742080.0, - "grad_norm": 1.7615083390778834, - "language_loss": 0.79458243, - "learning_rate": 2.9921197511711763e-06, - "loss": 0.81368273, - "num_input_tokens_seen": 126873790, - "step": 5900, - "time_per_iteration": 2.6658642292022705 - }, - { - "auxiliary_loss_clip": 0.0111331, - "auxiliary_loss_mlp": 0.01044824, - "balance_loss_clip": 1.04659319, - "balance_loss_mlp": 1.0288384, - "epoch": 0.35478731399368707, - "flos": 23514092017920.0, - "grad_norm": 2.160550694830747, - "language_loss": 0.81303531, - "learning_rate": 2.991781567335093e-06, - "loss": 0.83461666, - "num_input_tokens_seen": 126892865, - "step": 5901, - "time_per_iteration": 2.711568593978882 - }, - { - "auxiliary_loss_clip": 0.01125037, - "auxiliary_loss_mlp": 0.00772744, - "balance_loss_clip": 1.05092883, - "balance_loss_mlp": 1.00049663, - "epoch": 0.35484743724635504, - "flos": 18624315715200.0, - "grad_norm": 2.0558354102165373, - "language_loss": 0.75869077, - "learning_rate": 2.9914433458905525e-06, - "loss": 0.7776686, - "num_input_tokens_seen": 126911935, - "step": 5902, - "time_per_iteration": 2.6833012104034424 - }, - { - "auxiliary_loss_clip": 0.01123978, - "auxiliary_loss_mlp": 0.01036322, - "balance_loss_clip": 1.04852581, - "balance_loss_mlp": 1.02142096, - "epoch": 0.354907560499023, - "flos": 17384499924480.0, - "grad_norm": 2.534328384273088, - "language_loss": 0.70550704, - "learning_rate": 2.991105086850381e-06, - "loss": 0.72711003, - "num_input_tokens_seen": 126930040, - "step": 5903, - "time_per_iteration": 2.689303159713745 - }, - { - "auxiliary_loss_clip": 0.01128401, - "auxiliary_loss_mlp": 0.01036477, - "balance_loss_clip": 1.05025887, - "balance_loss_mlp": 1.02051437, - "epoch": 0.35496768375169097, - "flos": 19208546426880.0, - "grad_norm": 3.3775979872187203, - "language_loss": 0.7448622, - "learning_rate": 2.9907667902274053e-06, - "loss": 0.76651096, - "num_input_tokens_seen": 126948390, - "step": 5904, - "time_per_iteration": 2.6360747814178467 - }, - { - "auxiliary_loss_clip": 0.01113034, - "auxiliary_loss_mlp": 0.00772738, - "balance_loss_clip": 1.04721618, - "balance_loss_mlp": 1.000543, - "epoch": 0.35502780700435893, - "flos": 18332792933760.0, - "grad_norm": 3.051840518778985, - "language_loss": 0.78653091, - "learning_rate": 2.9904284560344536e-06, - "loss": 0.80538863, - "num_input_tokens_seen": 126964905, - "step": 5905, - "time_per_iteration": 2.8539419174194336 - }, - { - "auxiliary_loss_clip": 0.01101916, - "auxiliary_loss_mlp": 0.01038927, - "balance_loss_clip": 1.04842138, - "balance_loss_mlp": 1.02486014, - "epoch": 0.3550879302570269, - "flos": 15448555578240.0, - "grad_norm": 18.846860460510154, - "language_loss": 0.72740704, - "learning_rate": 2.990090084284356e-06, - "loss": 0.74881542, - "num_input_tokens_seen": 126982000, - "step": 5906, - "time_per_iteration": 2.7013392448425293 - }, - { - "auxiliary_loss_clip": 0.01109726, - "auxiliary_loss_mlp": 0.01039804, - "balance_loss_clip": 1.04908431, - "balance_loss_mlp": 1.02265012, - "epoch": 0.35514805350969486, - "flos": 21979197999360.0, - "grad_norm": 1.821131131528883, - "language_loss": 0.74746358, - "learning_rate": 2.9897516749899426e-06, - "loss": 0.76895893, - "num_input_tokens_seen": 126998390, - "step": 5907, - "time_per_iteration": 2.7603847980499268 - }, - { - "auxiliary_loss_clip": 0.01062812, - "auxiliary_loss_mlp": 0.01042872, - "balance_loss_clip": 1.03682017, - "balance_loss_mlp": 1.02463293, - "epoch": 0.3552081767623628, - "flos": 29861949104640.0, - "grad_norm": 3.0473905008627775, - "language_loss": 0.7563526, - "learning_rate": 2.989413228164047e-06, - "loss": 0.77740943, - "num_input_tokens_seen": 127020220, - "step": 5908, - "time_per_iteration": 2.8653454780578613 - }, - { - "auxiliary_loss_clip": 0.01114185, - "auxiliary_loss_mlp": 0.01042445, - "balance_loss_clip": 1.05034626, - "balance_loss_mlp": 1.02736473, - "epoch": 0.3552683000150308, - "flos": 26432264747520.0, - "grad_norm": 2.926995842336842, - "language_loss": 0.68243527, - "learning_rate": 2.989074743819502e-06, - "loss": 0.70400161, - "num_input_tokens_seen": 127038585, - "step": 5909, - "time_per_iteration": 2.6967928409576416 - }, - { - "auxiliary_loss_clip": 0.01120713, - "auxiliary_loss_mlp": 0.01037454, - "balance_loss_clip": 1.0503571, - "balance_loss_mlp": 1.02271986, - "epoch": 0.35532842326769876, - "flos": 19785989468160.0, - "grad_norm": 2.2169711344959864, - "language_loss": 0.78605235, - "learning_rate": 2.988736221969144e-06, - "loss": 0.807634, - "num_input_tokens_seen": 127056215, - "step": 5910, - "time_per_iteration": 2.65592885017395 - }, - { - "auxiliary_loss_clip": 0.01111825, - "auxiliary_loss_mlp": 0.01044022, - "balance_loss_clip": 1.04383612, - "balance_loss_mlp": 1.02745175, - "epoch": 0.3553885465203668, - "flos": 17239277237760.0, - "grad_norm": 4.097628076705993, - "language_loss": 0.71322721, - "learning_rate": 2.98839766262581e-06, - "loss": 0.73478568, - "num_input_tokens_seen": 127075825, - "step": 5911, - "time_per_iteration": 2.6958134174346924 - }, - { - "auxiliary_loss_clip": 0.01122761, - "auxiliary_loss_mlp": 0.01041881, - "balance_loss_clip": 1.04820287, - "balance_loss_mlp": 1.02711153, - "epoch": 0.35544866977303474, - "flos": 14934350430720.0, - "grad_norm": 2.592685980990988, - "language_loss": 0.86703777, - "learning_rate": 2.9880590658023366e-06, - "loss": 0.88868415, - "num_input_tokens_seen": 127091205, - "step": 5912, - "time_per_iteration": 2.615788221359253 - }, - { - "auxiliary_loss_clip": 0.01113661, - "auxiliary_loss_mlp": 0.01038659, - "balance_loss_clip": 1.04849911, - "balance_loss_mlp": 1.02413917, - "epoch": 0.3555087930257027, - "flos": 19756040503680.0, - "grad_norm": 1.9602305341473392, - "language_loss": 0.76948488, - "learning_rate": 2.9877204315115646e-06, - "loss": 0.79100811, - "num_input_tokens_seen": 127109210, - "step": 5913, - "time_per_iteration": 2.7827799320220947 - }, - { - "auxiliary_loss_clip": 0.01098195, - "auxiliary_loss_mlp": 0.01036489, - "balance_loss_clip": 1.04796672, - "balance_loss_mlp": 1.02183783, - "epoch": 0.3555689162783707, - "flos": 21068252156160.0, - "grad_norm": 1.6272917241322848, - "language_loss": 0.82545209, - "learning_rate": 2.9873817597663353e-06, - "loss": 0.8467989, - "num_input_tokens_seen": 127128400, - "step": 5914, - "time_per_iteration": 2.7242603302001953 - }, - { - "auxiliary_loss_clip": 0.01137835, - "auxiliary_loss_mlp": 0.01037677, - "balance_loss_clip": 1.05178475, - "balance_loss_mlp": 1.02247739, - "epoch": 0.35562903953103864, - "flos": 33069633454080.0, - "grad_norm": 2.9034799926536, - "language_loss": 0.70664769, - "learning_rate": 2.98704305057949e-06, - "loss": 0.72840279, - "num_input_tokens_seen": 127149965, - "step": 5915, - "time_per_iteration": 2.6785290241241455 - }, - { - "auxiliary_loss_clip": 0.01124956, - "auxiliary_loss_mlp": 0.01042738, - "balance_loss_clip": 1.04884696, - "balance_loss_mlp": 1.02823067, - "epoch": 0.3556891627837066, - "flos": 20557853850240.0, - "grad_norm": 1.7433450554379117, - "language_loss": 0.76387751, - "learning_rate": 2.9867043039638737e-06, - "loss": 0.78555447, - "num_input_tokens_seen": 127169865, - "step": 5916, - "time_per_iteration": 2.646141529083252 - }, - { - "auxiliary_loss_clip": 0.01103991, - "auxiliary_loss_mlp": 0.01039438, - "balance_loss_clip": 1.04549897, - "balance_loss_mlp": 1.02451277, - "epoch": 0.35574928603637457, - "flos": 20703327932160.0, - "grad_norm": 1.7213233773991115, - "language_loss": 0.88551259, - "learning_rate": 2.986365519932332e-06, - "loss": 0.9069469, - "num_input_tokens_seen": 127188075, - "step": 5917, - "time_per_iteration": 2.735424757003784 - }, - { - "auxiliary_loss_clip": 0.01057648, - "auxiliary_loss_mlp": 0.01050179, - "balance_loss_clip": 1.03888357, - "balance_loss_mlp": 1.03190458, - "epoch": 0.35580940928904253, - "flos": 15194595444480.0, - "grad_norm": 2.1986231946039916, - "language_loss": 0.74800515, - "learning_rate": 2.98602669849771e-06, - "loss": 0.76908338, - "num_input_tokens_seen": 127206065, - "step": 5918, - "time_per_iteration": 2.759612798690796 - }, - { - "auxiliary_loss_clip": 0.01046226, - "auxiliary_loss_mlp": 0.01004318, - "balance_loss_clip": 1.03416467, - "balance_loss_mlp": 1.00212467, - "epoch": 0.3558695325417105, - "flos": 58639145431680.0, - "grad_norm": 0.9523078238877629, - "language_loss": 0.63871694, - "learning_rate": 2.985687839672857e-06, - "loss": 0.65922242, - "num_input_tokens_seen": 127257885, - "step": 5919, - "time_per_iteration": 2.974400281906128 - }, - { - "auxiliary_loss_clip": 0.01125949, - "auxiliary_loss_mlp": 0.01037737, - "balance_loss_clip": 1.05126309, - "balance_loss_mlp": 1.02168, - "epoch": 0.35592965579437846, - "flos": 22018233104640.0, - "grad_norm": 2.3466450300124952, - "language_loss": 0.73515332, - "learning_rate": 2.9853489434706223e-06, - "loss": 0.75679016, - "num_input_tokens_seen": 127275550, - "step": 5920, - "time_per_iteration": 2.6402368545532227 - }, - { - "auxiliary_loss_clip": 0.01092607, - "auxiliary_loss_mlp": 0.01035798, - "balance_loss_clip": 1.0452888, - "balance_loss_mlp": 1.02082539, - "epoch": 0.35598977904704643, - "flos": 23367684182400.0, - "grad_norm": 2.020155019062759, - "language_loss": 0.76745147, - "learning_rate": 2.985010009903857e-06, - "loss": 0.78873557, - "num_input_tokens_seen": 127295110, - "step": 5921, - "time_per_iteration": 2.7224855422973633 - }, - { - "auxiliary_loss_clip": 0.01112186, - "auxiliary_loss_mlp": 0.01038012, - "balance_loss_clip": 1.04887438, - "balance_loss_mlp": 1.0231111, - "epoch": 0.3560499022997144, - "flos": 17785334770560.0, - "grad_norm": 2.0978128065546717, - "language_loss": 0.68095905, - "learning_rate": 2.9846710389854133e-06, - "loss": 0.702461, - "num_input_tokens_seen": 127312865, - "step": 5922, - "time_per_iteration": 2.6849706172943115 - }, - { - "auxiliary_loss_clip": 0.01120912, - "auxiliary_loss_mlp": 0.01035687, - "balance_loss_clip": 1.04752564, - "balance_loss_mlp": 1.02032125, - "epoch": 0.35611002555238236, - "flos": 20740459616640.0, - "grad_norm": 3.470851899346702, - "language_loss": 0.79121947, - "learning_rate": 2.9843320307281454e-06, - "loss": 0.81278539, - "num_input_tokens_seen": 127331710, - "step": 5923, - "time_per_iteration": 2.659977436065674 - }, - { - "auxiliary_loss_clip": 0.01118161, - "auxiliary_loss_mlp": 0.01042419, - "balance_loss_clip": 1.0530231, - "balance_loss_mlp": 1.02770221, - "epoch": 0.3561701488050504, - "flos": 19462219251840.0, - "grad_norm": 2.2084385051152946, - "language_loss": 0.85266459, - "learning_rate": 2.983992985144908e-06, - "loss": 0.87427044, - "num_input_tokens_seen": 127350950, - "step": 5924, - "time_per_iteration": 2.680994987487793 - }, - { - "auxiliary_loss_clip": 0.01109604, - "auxiliary_loss_mlp": 0.01046078, - "balance_loss_clip": 1.04669881, - "balance_loss_mlp": 1.02974653, - "epoch": 0.35623027205771834, - "flos": 30774942023040.0, - "grad_norm": 3.12021389910605, - "language_loss": 0.77619767, - "learning_rate": 2.9836539022485578e-06, - "loss": 0.79775453, - "num_input_tokens_seen": 127369385, - "step": 5925, - "time_per_iteration": 2.854043960571289 - }, - { - "auxiliary_loss_clip": 0.01078608, - "auxiliary_loss_mlp": 0.01047631, - "balance_loss_clip": 1.04546142, - "balance_loss_mlp": 1.03274155, - "epoch": 0.3562903953103863, - "flos": 16981079299200.0, - "grad_norm": 2.0406100546628108, - "language_loss": 0.75402963, - "learning_rate": 2.9833147820519535e-06, - "loss": 0.77529198, - "num_input_tokens_seen": 127386965, - "step": 5926, - "time_per_iteration": 4.347430467605591 - }, - { - "auxiliary_loss_clip": 0.01110536, - "auxiliary_loss_mlp": 0.00773423, - "balance_loss_clip": 1.04907203, - "balance_loss_mlp": 1.00041842, - "epoch": 0.3563505185630543, - "flos": 23839837482240.0, - "grad_norm": 2.7011184644215254, - "language_loss": 0.69563019, - "learning_rate": 2.9829756245679544e-06, - "loss": 0.71446979, - "num_input_tokens_seen": 127406075, - "step": 5927, - "time_per_iteration": 2.8237216472625732 - }, - { - "auxiliary_loss_clip": 0.01136293, - "auxiliary_loss_mlp": 0.01040585, - "balance_loss_clip": 1.05083871, - "balance_loss_mlp": 1.0256958, - "epoch": 0.35641064181572224, - "flos": 22273450214400.0, - "grad_norm": 2.594343371199836, - "language_loss": 0.79681075, - "learning_rate": 2.9826364298094212e-06, - "loss": 0.81857955, - "num_input_tokens_seen": 127425350, - "step": 5928, - "time_per_iteration": 4.171353340148926 - }, - { - "auxiliary_loss_clip": 0.01139765, - "auxiliary_loss_mlp": 0.01040338, - "balance_loss_clip": 1.05304861, - "balance_loss_mlp": 1.02473354, - "epoch": 0.3564707650683902, - "flos": 23001251587200.0, - "grad_norm": 1.4355701611092584, - "language_loss": 0.81758744, - "learning_rate": 2.982297197789215e-06, - "loss": 0.83938849, - "num_input_tokens_seen": 127446335, - "step": 5929, - "time_per_iteration": 4.3162572383880615 - }, - { - "auxiliary_loss_clip": 0.01120871, - "auxiliary_loss_mlp": 0.01037566, - "balance_loss_clip": 1.04776335, - "balance_loss_mlp": 1.02304602, - "epoch": 0.35653088832105817, - "flos": 14684268965760.0, - "grad_norm": 1.9323399136404307, - "language_loss": 0.70277226, - "learning_rate": 2.981957928520201e-06, - "loss": 0.72435665, - "num_input_tokens_seen": 127462795, - "step": 5930, - "time_per_iteration": 2.6527109146118164 - }, - { - "auxiliary_loss_clip": 0.01131875, - "auxiliary_loss_mlp": 0.01045641, - "balance_loss_clip": 1.05533779, - "balance_loss_mlp": 1.02960742, - "epoch": 0.35659101157372614, - "flos": 23477068074240.0, - "grad_norm": 2.2535070260025147, - "language_loss": 0.6758765, - "learning_rate": 2.981618622015244e-06, - "loss": 0.69765162, - "num_input_tokens_seen": 127482675, - "step": 5931, - "time_per_iteration": 4.3453147411346436 - }, - { - "auxiliary_loss_clip": 0.0112554, - "auxiliary_loss_mlp": 0.01040124, - "balance_loss_clip": 1.04992425, - "balance_loss_mlp": 1.02531803, - "epoch": 0.3566511348263941, - "flos": 26578672583040.0, - "grad_norm": 1.9436277425022137, - "language_loss": 0.67792088, - "learning_rate": 2.981279278287211e-06, - "loss": 0.69957745, - "num_input_tokens_seen": 127502275, - "step": 5932, - "time_per_iteration": 2.700096368789673 - }, - { - "auxiliary_loss_clip": 0.01082532, - "auxiliary_loss_mlp": 0.01033095, - "balance_loss_clip": 1.04578543, - "balance_loss_mlp": 1.01849222, - "epoch": 0.35671125807906207, - "flos": 13115008609920.0, - "grad_norm": 5.160615382495107, - "language_loss": 0.78454852, - "learning_rate": 2.980939897348969e-06, - "loss": 0.80570471, - "num_input_tokens_seen": 127520195, - "step": 5933, - "time_per_iteration": 2.6900391578674316 - }, - { - "auxiliary_loss_clip": 0.01121777, - "auxiliary_loss_mlp": 0.01052933, - "balance_loss_clip": 1.0480361, - "balance_loss_mlp": 1.03600574, - "epoch": 0.35677138133173003, - "flos": 33000577557120.0, - "grad_norm": 1.6861574442761758, - "language_loss": 0.69256425, - "learning_rate": 2.980600479213388e-06, - "loss": 0.7143113, - "num_input_tokens_seen": 127544495, - "step": 5934, - "time_per_iteration": 2.7415738105773926 - }, - { - "auxiliary_loss_clip": 0.01117054, - "auxiliary_loss_mlp": 0.0077763, - "balance_loss_clip": 1.05076528, - "balance_loss_mlp": 1.00057197, - "epoch": 0.356831504584398, - "flos": 20777842696320.0, - "grad_norm": 1.9577931058258786, - "language_loss": 0.70848507, - "learning_rate": 2.9802610238933384e-06, - "loss": 0.72743189, - "num_input_tokens_seen": 127563810, - "step": 5935, - "time_per_iteration": 2.689974069595337 - }, - { - "auxiliary_loss_clip": 0.01105553, - "auxiliary_loss_mlp": 0.01040367, - "balance_loss_clip": 1.04790044, - "balance_loss_mlp": 1.02414298, - "epoch": 0.35689162783706596, - "flos": 12165566365440.0, - "grad_norm": 2.8406009493899567, - "language_loss": 0.7755211, - "learning_rate": 2.979921531401692e-06, - "loss": 0.79698032, - "num_input_tokens_seen": 127579065, - "step": 5936, - "time_per_iteration": 2.741913318634033 - }, - { - "auxiliary_loss_clip": 0.0112859, - "auxiliary_loss_mlp": 0.00773213, - "balance_loss_clip": 1.05281317, - "balance_loss_mlp": 1.00073922, - "epoch": 0.356951751089734, - "flos": 23841489507840.0, - "grad_norm": 1.4219917851433757, - "language_loss": 0.64282179, - "learning_rate": 2.9795820017513242e-06, - "loss": 0.66183978, - "num_input_tokens_seen": 127599105, - "step": 5937, - "time_per_iteration": 2.698432207107544 - }, - { - "auxiliary_loss_clip": 0.011437, - "auxiliary_loss_mlp": 0.00773044, - "balance_loss_clip": 1.05475211, - "balance_loss_mlp": 1.00064254, - "epoch": 0.35701187434240195, - "flos": 11722176881280.0, - "grad_norm": 3.0634993604384744, - "language_loss": 0.78483748, - "learning_rate": 2.9792424349551073e-06, - "loss": 0.80400497, - "num_input_tokens_seen": 127614940, - "step": 5938, - "time_per_iteration": 2.617074489593506 - }, - { - "auxiliary_loss_clip": 0.01104152, - "auxiliary_loss_mlp": 0.01042471, - "balance_loss_clip": 1.05522823, - "balance_loss_mlp": 1.0276773, - "epoch": 0.3570719975950699, - "flos": 24898879100160.0, - "grad_norm": 1.4921508018011957, - "language_loss": 0.8058449, - "learning_rate": 2.9789028310259202e-06, - "loss": 0.82731104, - "num_input_tokens_seen": 127634960, - "step": 5939, - "time_per_iteration": 2.805285930633545 - }, - { - "auxiliary_loss_clip": 0.01119857, - "auxiliary_loss_mlp": 0.01039048, - "balance_loss_clip": 1.05386829, - "balance_loss_mlp": 1.02343178, - "epoch": 0.3571321208477379, - "flos": 25994836920960.0, - "grad_norm": 2.412769849050775, - "language_loss": 0.79263425, - "learning_rate": 2.9785631899766395e-06, - "loss": 0.81422341, - "num_input_tokens_seen": 127654545, - "step": 5940, - "time_per_iteration": 2.729759693145752 - }, - { - "auxiliary_loss_clip": 0.01122797, - "auxiliary_loss_mlp": 0.0103573, - "balance_loss_clip": 1.05434561, - "balance_loss_mlp": 1.01836729, - "epoch": 0.35719224410040584, - "flos": 14501663199360.0, - "grad_norm": 2.99992676537861, - "language_loss": 0.72561693, - "learning_rate": 2.9782235118201443e-06, - "loss": 0.74720228, - "num_input_tokens_seen": 127672320, - "step": 5941, - "time_per_iteration": 2.7407357692718506 - }, - { - "auxiliary_loss_clip": 0.01131761, - "auxiliary_loss_mlp": 0.01043456, - "balance_loss_clip": 1.0537883, - "balance_loss_mlp": 1.02636182, - "epoch": 0.3572523673530738, - "flos": 31175453646720.0, - "grad_norm": 4.524453853263744, - "language_loss": 0.64234614, - "learning_rate": 2.9778837965693154e-06, - "loss": 0.66409832, - "num_input_tokens_seen": 127693315, - "step": 5942, - "time_per_iteration": 2.693835735321045 - }, - { - "auxiliary_loss_clip": 0.01125006, - "auxiliary_loss_mlp": 0.0104058, - "balance_loss_clip": 1.05074191, - "balance_loss_mlp": 1.02442718, - "epoch": 0.3573124906057418, - "flos": 15851976203520.0, - "grad_norm": 1.88999720959261, - "language_loss": 0.7433207, - "learning_rate": 2.9775440442370354e-06, - "loss": 0.76497656, - "num_input_tokens_seen": 127711570, - "step": 5943, - "time_per_iteration": 2.6655383110046387 - }, - { - "auxiliary_loss_clip": 0.0107084, - "auxiliary_loss_mlp": 0.01002098, - "balance_loss_clip": 1.04128122, - "balance_loss_mlp": 1.000512, - "epoch": 0.35737261385840974, - "flos": 60822729118080.0, - "grad_norm": 0.7930578325967097, - "language_loss": 0.60739905, - "learning_rate": 2.9772042548361867e-06, - "loss": 0.62812841, - "num_input_tokens_seen": 127772475, - "step": 5944, - "time_per_iteration": 3.257052421569824 - }, - { - "auxiliary_loss_clip": 0.01113544, - "auxiliary_loss_mlp": 0.01038819, - "balance_loss_clip": 1.05017304, - "balance_loss_mlp": 1.02329779, - "epoch": 0.3574327371110777, - "flos": 18843765857280.0, - "grad_norm": 2.0176419730945554, - "language_loss": 0.72310007, - "learning_rate": 2.976864428379655e-06, - "loss": 0.74462366, - "num_input_tokens_seen": 127790940, - "step": 5945, - "time_per_iteration": 2.6320457458496094 - }, - { - "auxiliary_loss_clip": 0.01113199, - "auxiliary_loss_mlp": 0.00773448, - "balance_loss_clip": 1.04710388, - "balance_loss_mlp": 1.00053716, - "epoch": 0.35749286036374567, - "flos": 23549679417600.0, - "grad_norm": 2.1873404124300655, - "language_loss": 0.81147355, - "learning_rate": 2.976524564880326e-06, - "loss": 0.83034003, - "num_input_tokens_seen": 127808275, - "step": 5946, - "time_per_iteration": 2.7045581340789795 - }, - { - "auxiliary_loss_clip": 0.01142015, - "auxiliary_loss_mlp": 0.01041839, - "balance_loss_clip": 1.05382085, - "balance_loss_mlp": 1.02568626, - "epoch": 0.35755298361641363, - "flos": 21105491581440.0, - "grad_norm": 1.5286248167474699, - "language_loss": 0.68842459, - "learning_rate": 2.9761846643510882e-06, - "loss": 0.71026313, - "num_input_tokens_seen": 127828840, - "step": 5947, - "time_per_iteration": 2.6360325813293457 - }, - { - "auxiliary_loss_clip": 0.01107164, - "auxiliary_loss_mlp": 0.01039633, - "balance_loss_clip": 1.04598188, - "balance_loss_mlp": 1.02426696, - "epoch": 0.3576131068690816, - "flos": 19245031666560.0, - "grad_norm": 4.061535671212192, - "language_loss": 0.76024956, - "learning_rate": 2.9758447268048297e-06, - "loss": 0.78171754, - "num_input_tokens_seen": 127846240, - "step": 5948, - "time_per_iteration": 2.6968884468078613 - }, - { - "auxiliary_loss_clip": 0.01081903, - "auxiliary_loss_mlp": 0.01043894, - "balance_loss_clip": 1.04692364, - "balance_loss_mlp": 1.0291121, - "epoch": 0.35767323012174956, - "flos": 28654703971200.0, - "grad_norm": 1.8353415788349725, - "language_loss": 0.70553362, - "learning_rate": 2.9755047522544415e-06, - "loss": 0.72679162, - "num_input_tokens_seen": 127866880, - "step": 5949, - "time_per_iteration": 2.8849079608917236 - }, - { - "auxiliary_loss_clip": 0.01113321, - "auxiliary_loss_mlp": 0.01041031, - "balance_loss_clip": 1.04892492, - "balance_loss_mlp": 1.02688098, - "epoch": 0.35773335337441753, - "flos": 17085363459840.0, - "grad_norm": 2.820547719587591, - "language_loss": 0.77489066, - "learning_rate": 2.9751647407128154e-06, - "loss": 0.79643422, - "num_input_tokens_seen": 127883560, - "step": 5950, - "time_per_iteration": 2.6595206260681152 - }, - { - "auxiliary_loss_clip": 0.0112732, - "auxiliary_loss_mlp": 0.01041981, - "balance_loss_clip": 1.04834211, - "balance_loss_mlp": 1.02592397, - "epoch": 0.35779347662708555, - "flos": 15888605097600.0, - "grad_norm": 1.7233867228761917, - "language_loss": 0.72746027, - "learning_rate": 2.9748246921928445e-06, - "loss": 0.74915326, - "num_input_tokens_seen": 127902330, - "step": 5951, - "time_per_iteration": 2.6544554233551025 - }, - { - "auxiliary_loss_clip": 0.01129333, - "auxiliary_loss_mlp": 0.01041471, - "balance_loss_clip": 1.05047357, - "balance_loss_mlp": 1.0256753, - "epoch": 0.3578535998797535, - "flos": 28658834035200.0, - "grad_norm": 2.2344429074284693, - "language_loss": 0.69326741, - "learning_rate": 2.9744846067074236e-06, - "loss": 0.71497542, - "num_input_tokens_seen": 127922325, - "step": 5952, - "time_per_iteration": 2.7666146755218506 - }, - { - "auxiliary_loss_clip": 0.01080716, - "auxiliary_loss_mlp": 0.01049645, - "balance_loss_clip": 1.04122877, - "balance_loss_mlp": 1.03411233, - "epoch": 0.3579137231324215, - "flos": 37852432076160.0, - "grad_norm": 4.791743787800428, - "language_loss": 0.69651616, - "learning_rate": 2.974144484269449e-06, - "loss": 0.71781975, - "num_input_tokens_seen": 127942635, - "step": 5953, - "time_per_iteration": 2.900196075439453 - }, - { - "auxiliary_loss_clip": 0.01113192, - "auxiliary_loss_mlp": 0.01034652, - "balance_loss_clip": 1.0476222, - "balance_loss_mlp": 1.0198822, - "epoch": 0.35797384638508944, - "flos": 22346851656960.0, - "grad_norm": 2.3015234956442394, - "language_loss": 0.6670965, - "learning_rate": 2.9738043248918175e-06, - "loss": 0.68857497, - "num_input_tokens_seen": 127962520, - "step": 5954, - "time_per_iteration": 2.7609100341796875 - }, - { - "auxiliary_loss_clip": 0.011102, - "auxiliary_loss_mlp": 0.01040434, - "balance_loss_clip": 1.04845512, - "balance_loss_mlp": 1.02633798, - "epoch": 0.3580339696377574, - "flos": 13589711775360.0, - "grad_norm": 1.9332002852280215, - "language_loss": 0.74798024, - "learning_rate": 2.9734641285874282e-06, - "loss": 0.76948655, - "num_input_tokens_seen": 127981180, - "step": 5955, - "time_per_iteration": 2.727787733078003 - }, - { - "auxiliary_loss_clip": 0.01114534, - "auxiliary_loss_mlp": 0.01039755, - "balance_loss_clip": 1.04827058, - "balance_loss_mlp": 1.02546179, - "epoch": 0.3580940928904254, - "flos": 23768231719680.0, - "grad_norm": 1.745052650810224, - "language_loss": 0.75871193, - "learning_rate": 2.973123895369182e-06, - "loss": 0.78025484, - "num_input_tokens_seen": 127999725, - "step": 5956, - "time_per_iteration": 2.685006856918335 - }, - { - "auxiliary_loss_clip": 0.01133387, - "auxiliary_loss_mlp": 0.01035002, - "balance_loss_clip": 1.05088747, - "balance_loss_mlp": 1.0211376, - "epoch": 0.35815421614309334, - "flos": 19463871277440.0, - "grad_norm": 4.15447674959345, - "language_loss": 0.73543882, - "learning_rate": 2.9727836252499805e-06, - "loss": 0.75712276, - "num_input_tokens_seen": 128018885, - "step": 5957, - "time_per_iteration": 2.6640098094940186 - }, - { - "auxiliary_loss_clip": 0.01113163, - "auxiliary_loss_mlp": 0.01037962, - "balance_loss_clip": 1.04958355, - "balance_loss_mlp": 1.02395511, - "epoch": 0.3582143393957613, - "flos": 23368186972800.0, - "grad_norm": 3.3283201757671037, - "language_loss": 0.70960939, - "learning_rate": 2.972443318242726e-06, - "loss": 0.73112065, - "num_input_tokens_seen": 128037875, - "step": 5958, - "time_per_iteration": 2.6962838172912598 - }, - { - "auxiliary_loss_clip": 0.01093969, - "auxiliary_loss_mlp": 0.01038485, - "balance_loss_clip": 1.04454029, - "balance_loss_mlp": 1.02435875, - "epoch": 0.35827446264842927, - "flos": 26323275905280.0, - "grad_norm": 2.5438119471533494, - "language_loss": 0.88630176, - "learning_rate": 2.972102974360324e-06, - "loss": 0.90762633, - "num_input_tokens_seen": 128056045, - "step": 5959, - "time_per_iteration": 2.713508129119873 - }, - { - "auxiliary_loss_clip": 0.0113447, - "auxiliary_loss_mlp": 0.010399, - "balance_loss_clip": 1.05009389, - "balance_loss_mlp": 1.02511787, - "epoch": 0.35833458590109724, - "flos": 30446610779520.0, - "grad_norm": 2.2010810744211486, - "language_loss": 0.58033586, - "learning_rate": 2.971762593615679e-06, - "loss": 0.60207957, - "num_input_tokens_seen": 128077815, - "step": 5960, - "time_per_iteration": 2.685009479522705 - }, - { - "auxiliary_loss_clip": 0.0113445, - "auxiliary_loss_mlp": 0.01041748, - "balance_loss_clip": 1.04900908, - "balance_loss_mlp": 1.0255897, - "epoch": 0.3583947091537652, - "flos": 14829886702080.0, - "grad_norm": 2.9088839798225035, - "language_loss": 0.75860739, - "learning_rate": 2.9714221760216993e-06, - "loss": 0.7803694, - "num_input_tokens_seen": 128095460, - "step": 5961, - "time_per_iteration": 2.591665506362915 - }, - { - "auxiliary_loss_clip": 0.01103629, - "auxiliary_loss_mlp": 0.01037452, - "balance_loss_clip": 1.04985154, - "balance_loss_mlp": 1.022223, - "epoch": 0.35845483240643317, - "flos": 34240644743040.0, - "grad_norm": 1.7962139278871543, - "language_loss": 0.70392656, - "learning_rate": 2.971081721591294e-06, - "loss": 0.72533739, - "num_input_tokens_seen": 128118605, - "step": 5962, - "time_per_iteration": 2.78696346282959 - }, - { - "auxiliary_loss_clip": 0.01116632, - "auxiliary_loss_mlp": 0.01038106, - "balance_loss_clip": 1.0513072, - "balance_loss_mlp": 1.02532077, - "epoch": 0.35851495565910113, - "flos": 20960089326720.0, - "grad_norm": 3.937600501619356, - "language_loss": 0.75052911, - "learning_rate": 2.9707412303373716e-06, - "loss": 0.77207649, - "num_input_tokens_seen": 128139205, - "step": 5963, - "time_per_iteration": 2.779210090637207 - }, - { - "auxiliary_loss_clip": 0.01136067, - "auxiliary_loss_mlp": 0.01044967, - "balance_loss_clip": 1.05189323, - "balance_loss_mlp": 1.03017306, - "epoch": 0.35857507891176915, - "flos": 22309863626880.0, - "grad_norm": 3.7087256254692305, - "language_loss": 0.78717148, - "learning_rate": 2.9704007022728447e-06, - "loss": 0.80898178, - "num_input_tokens_seen": 128158765, - "step": 5964, - "time_per_iteration": 2.598621368408203 - }, - { - "auxiliary_loss_clip": 0.01112011, - "auxiliary_loss_mlp": 0.01041333, - "balance_loss_clip": 1.05019569, - "balance_loss_mlp": 1.02534723, - "epoch": 0.3586352021644371, - "flos": 23367863750400.0, - "grad_norm": 2.0226045347569857, - "language_loss": 0.66572571, - "learning_rate": 2.970060137410626e-06, - "loss": 0.6872592, - "num_input_tokens_seen": 128177850, - "step": 5965, - "time_per_iteration": 2.684847116470337 - }, - { - "auxiliary_loss_clip": 0.01132652, - "auxiliary_loss_mlp": 0.0077213, - "balance_loss_clip": 1.04819942, - "balance_loss_mlp": 1.00052619, - "epoch": 0.3586953254171051, - "flos": 27849227437440.0, - "grad_norm": 2.180178648475794, - "language_loss": 0.79150963, - "learning_rate": 2.9697195357636294e-06, - "loss": 0.81055743, - "num_input_tokens_seen": 128196925, - "step": 5966, - "time_per_iteration": 4.321925163269043 - }, - { - "auxiliary_loss_clip": 0.01076497, - "auxiliary_loss_mlp": 0.01042048, - "balance_loss_clip": 1.04272628, - "balance_loss_mlp": 1.02573991, - "epoch": 0.35875544866977305, - "flos": 19500500171520.0, - "grad_norm": 2.3639555115609663, - "language_loss": 0.91201752, - "learning_rate": 2.9693788973447715e-06, - "loss": 0.93320298, - "num_input_tokens_seen": 128213955, - "step": 5967, - "time_per_iteration": 2.7455573081970215 - }, - { - "auxiliary_loss_clip": 0.01101026, - "auxiliary_loss_mlp": 0.01053293, - "balance_loss_clip": 1.04794097, - "balance_loss_mlp": 1.03494644, - "epoch": 0.358815571922441, - "flos": 21471134077440.0, - "grad_norm": 5.4514250686274695, - "language_loss": 0.80356693, - "learning_rate": 2.9690382221669682e-06, - "loss": 0.82511014, - "num_input_tokens_seen": 128232980, - "step": 5968, - "time_per_iteration": 4.176758766174316 - }, - { - "auxiliary_loss_clip": 0.01109306, - "auxiliary_loss_mlp": 0.01052187, - "balance_loss_clip": 1.04507756, - "balance_loss_mlp": 1.03602266, - "epoch": 0.358875695175109, - "flos": 21835411856640.0, - "grad_norm": 2.18425096992674, - "language_loss": 0.8341769, - "learning_rate": 2.9686975102431384e-06, - "loss": 0.85579193, - "num_input_tokens_seen": 128252795, - "step": 5969, - "time_per_iteration": 4.278231382369995 - }, - { - "auxiliary_loss_clip": 0.01089525, - "auxiliary_loss_mlp": 0.01034474, - "balance_loss_clip": 1.04389262, - "balance_loss_mlp": 1.0201571, - "epoch": 0.35893581842777694, - "flos": 32011633330560.0, - "grad_norm": 2.040075228447558, - "language_loss": 0.72608048, - "learning_rate": 2.968356761586202e-06, - "loss": 0.74732047, - "num_input_tokens_seen": 128273115, - "step": 5970, - "time_per_iteration": 2.7784154415130615 - }, - { - "auxiliary_loss_clip": 0.01110616, - "auxiliary_loss_mlp": 0.01033542, - "balance_loss_clip": 1.04673791, - "balance_loss_mlp": 1.01868832, - "epoch": 0.3589959416804449, - "flos": 20485817124480.0, - "grad_norm": 1.7975318028216438, - "language_loss": 0.79562962, - "learning_rate": 2.9680159762090805e-06, - "loss": 0.8170712, - "num_input_tokens_seen": 128292220, - "step": 5971, - "time_per_iteration": 4.519066333770752 - }, - { - "auxiliary_loss_clip": 0.01098267, - "auxiliary_loss_mlp": 0.01043063, - "balance_loss_clip": 1.04956031, - "balance_loss_mlp": 1.02766144, - "epoch": 0.3590560649331129, - "flos": 16180666583040.0, - "grad_norm": 1.754965992567408, - "language_loss": 0.78217793, - "learning_rate": 2.967675154124696e-06, - "loss": 0.80359125, - "num_input_tokens_seen": 128310305, - "step": 5972, - "time_per_iteration": 2.7724227905273438 - }, - { - "auxiliary_loss_clip": 0.01092509, - "auxiliary_loss_mlp": 0.01035503, - "balance_loss_clip": 1.04198921, - "balance_loss_mlp": 1.02043509, - "epoch": 0.35911618818578084, - "flos": 20375391738240.0, - "grad_norm": 2.4812117519320287, - "language_loss": 0.8120966, - "learning_rate": 2.9673342953459722e-06, - "loss": 0.83337677, - "num_input_tokens_seen": 128328305, - "step": 5973, - "time_per_iteration": 2.8266379833221436 - }, - { - "auxiliary_loss_clip": 0.01042329, - "auxiliary_loss_mlp": 0.01005341, - "balance_loss_clip": 1.03088689, - "balance_loss_mlp": 1.0036602, - "epoch": 0.3591763114384488, - "flos": 41236691685120.0, - "grad_norm": 0.9056618080123127, - "language_loss": 0.56743383, - "learning_rate": 2.9669933998858355e-06, - "loss": 0.58791053, - "num_input_tokens_seen": 128378380, - "step": 5974, - "time_per_iteration": 3.0758044719696045 - }, - { - "auxiliary_loss_clip": 0.01126274, - "auxiliary_loss_mlp": 0.01037404, - "balance_loss_clip": 1.04946434, - "balance_loss_mlp": 1.02339661, - "epoch": 0.35923643469111677, - "flos": 18695454600960.0, - "grad_norm": 2.5569125412900022, - "language_loss": 0.68787563, - "learning_rate": 2.9666524677572114e-06, - "loss": 0.70951241, - "num_input_tokens_seen": 128394315, - "step": 5975, - "time_per_iteration": 2.657576084136963 - }, - { - "auxiliary_loss_clip": 0.01134392, - "auxiliary_loss_mlp": 0.01038612, - "balance_loss_clip": 1.04914975, - "balance_loss_mlp": 1.02426553, - "epoch": 0.35929655794378473, - "flos": 25009950931200.0, - "grad_norm": 1.804443520579843, - "language_loss": 0.79982442, - "learning_rate": 2.96631149897303e-06, - "loss": 0.82155442, - "num_input_tokens_seen": 128414515, - "step": 5976, - "time_per_iteration": 2.6197311878204346 - }, - { - "auxiliary_loss_clip": 0.01074524, - "auxiliary_loss_mlp": 0.01040105, - "balance_loss_clip": 1.04337287, - "balance_loss_mlp": 1.02404785, - "epoch": 0.35935668119645275, - "flos": 14975576265600.0, - "grad_norm": 1.9714674470262432, - "language_loss": 0.78818405, - "learning_rate": 2.9659704935462194e-06, - "loss": 0.8093304, - "num_input_tokens_seen": 128430615, - "step": 5977, - "time_per_iteration": 2.735844612121582 - }, - { - "auxiliary_loss_clip": 0.01094647, - "auxiliary_loss_mlp": 0.01041851, - "balance_loss_clip": 1.04511654, - "balance_loss_mlp": 1.02789736, - "epoch": 0.3594168044491207, - "flos": 21178138838400.0, - "grad_norm": 2.560014574379112, - "language_loss": 0.79859221, - "learning_rate": 2.9656294514897102e-06, - "loss": 0.8199572, - "num_input_tokens_seen": 128449480, - "step": 5978, - "time_per_iteration": 2.704134941101074 - }, - { - "auxiliary_loss_clip": 0.01135434, - "auxiliary_loss_mlp": 0.00773692, - "balance_loss_clip": 1.04890609, - "balance_loss_mlp": 1.00073409, - "epoch": 0.3594769277017887, - "flos": 27672152365440.0, - "grad_norm": 4.868201977342703, - "language_loss": 0.68310702, - "learning_rate": 2.965288372816436e-06, - "loss": 0.70219827, - "num_input_tokens_seen": 128471465, - "step": 5979, - "time_per_iteration": 2.667222499847412 - }, - { - "auxiliary_loss_clip": 0.01105596, - "auxiliary_loss_mlp": 0.01033841, - "balance_loss_clip": 1.04548645, - "balance_loss_mlp": 1.01876652, - "epoch": 0.35953705095445665, - "flos": 23002328995200.0, - "grad_norm": 6.298210491387724, - "language_loss": 0.67445302, - "learning_rate": 2.9649472575393296e-06, - "loss": 0.69584739, - "num_input_tokens_seen": 128490645, - "step": 5980, - "time_per_iteration": 2.6262974739074707 - }, - { - "auxiliary_loss_clip": 0.01113802, - "auxiliary_loss_mlp": 0.01040029, - "balance_loss_clip": 1.04725266, - "balance_loss_mlp": 1.02324414, - "epoch": 0.3595971742071246, - "flos": 25513992529920.0, - "grad_norm": 1.8251567017824133, - "language_loss": 0.71328801, - "learning_rate": 2.964606105671327e-06, - "loss": 0.73482633, - "num_input_tokens_seen": 128510225, - "step": 5981, - "time_per_iteration": 2.696676254272461 - }, - { - "auxiliary_loss_clip": 0.01109039, - "auxiliary_loss_mlp": 0.01041685, - "balance_loss_clip": 1.04872131, - "balance_loss_mlp": 1.02498353, - "epoch": 0.3596572974597926, - "flos": 29862559635840.0, - "grad_norm": 2.0089481436352767, - "language_loss": 0.71294796, - "learning_rate": 2.9642649172253635e-06, - "loss": 0.73445523, - "num_input_tokens_seen": 128530195, - "step": 5982, - "time_per_iteration": 2.7264244556427 - }, - { - "auxiliary_loss_clip": 0.01114107, - "auxiliary_loss_mlp": 0.01046667, - "balance_loss_clip": 1.04542398, - "balance_loss_mlp": 1.03115773, - "epoch": 0.35971742071246054, - "flos": 23112538899840.0, - "grad_norm": 1.8520970942870048, - "language_loss": 0.75614822, - "learning_rate": 2.9639236922143786e-06, - "loss": 0.77775598, - "num_input_tokens_seen": 128549990, - "step": 5983, - "time_per_iteration": 2.6827449798583984 - }, - { - "auxiliary_loss_clip": 0.01140239, - "auxiliary_loss_mlp": 0.01042697, - "balance_loss_clip": 1.0510025, - "balance_loss_mlp": 1.02626991, - "epoch": 0.3597775439651285, - "flos": 16725359399040.0, - "grad_norm": 17.088734777986428, - "language_loss": 0.76256114, - "learning_rate": 2.96358243065131e-06, - "loss": 0.78439057, - "num_input_tokens_seen": 128567925, - "step": 5984, - "time_per_iteration": 2.695389747619629 - }, - { - "auxiliary_loss_clip": 0.01117847, - "auxiliary_loss_mlp": 0.00772256, - "balance_loss_clip": 1.04583967, - "balance_loss_mlp": 1.00047541, - "epoch": 0.3598376672177965, - "flos": 19719483436800.0, - "grad_norm": 1.8513392555770956, - "language_loss": 0.86111921, - "learning_rate": 2.9632411325490993e-06, - "loss": 0.88002026, - "num_input_tokens_seen": 128585655, - "step": 5985, - "time_per_iteration": 2.6440985202789307 - }, - { - "auxiliary_loss_clip": 0.01117958, - "auxiliary_loss_mlp": 0.01045892, - "balance_loss_clip": 1.04564977, - "balance_loss_mlp": 1.03012037, - "epoch": 0.35989779047046444, - "flos": 17311529445120.0, - "grad_norm": 2.5721307867834406, - "language_loss": 0.72770452, - "learning_rate": 2.9628997979206884e-06, - "loss": 0.74934304, - "num_input_tokens_seen": 128604820, - "step": 5986, - "time_per_iteration": 2.6169698238372803 - }, - { - "auxiliary_loss_clip": 0.01100506, - "auxiliary_loss_mlp": 0.01039862, - "balance_loss_clip": 1.04264784, - "balance_loss_mlp": 1.02473474, - "epoch": 0.3599579137231324, - "flos": 22711237176960.0, - "grad_norm": 2.1943162754876497, - "language_loss": 0.73883474, - "learning_rate": 2.9625584267790204e-06, - "loss": 0.76023847, - "num_input_tokens_seen": 128623070, - "step": 5987, - "time_per_iteration": 2.72385573387146 - }, - { - "auxiliary_loss_clip": 0.0114047, - "auxiliary_loss_mlp": 0.01040262, - "balance_loss_clip": 1.05135727, - "balance_loss_mlp": 1.02456188, - "epoch": 0.36001803697580037, - "flos": 20959873845120.0, - "grad_norm": 2.225645474388546, - "language_loss": 0.69665354, - "learning_rate": 2.9622170191370404e-06, - "loss": 0.71846086, - "num_input_tokens_seen": 128642430, - "step": 5988, - "time_per_iteration": 2.6040101051330566 - }, - { - "auxiliary_loss_clip": 0.01127132, - "auxiliary_loss_mlp": 0.01043358, - "balance_loss_clip": 1.04819822, - "balance_loss_mlp": 1.0278132, - "epoch": 0.36007816022846834, - "flos": 20485565729280.0, - "grad_norm": 2.281223653114012, - "language_loss": 0.73300481, - "learning_rate": 2.9618755750076953e-06, - "loss": 0.75470972, - "num_input_tokens_seen": 128661285, - "step": 5989, - "time_per_iteration": 2.6532981395721436 - }, - { - "auxiliary_loss_clip": 0.01089891, - "auxiliary_loss_mlp": 0.01037817, - "balance_loss_clip": 1.04161119, - "balance_loss_mlp": 1.02237916, - "epoch": 0.36013828348113636, - "flos": 28001237794560.0, - "grad_norm": 3.1935134184936156, - "language_loss": 0.79950285, - "learning_rate": 2.961534094403931e-06, - "loss": 0.82077992, - "num_input_tokens_seen": 128682210, - "step": 5990, - "time_per_iteration": 2.785142421722412 - }, - { - "auxiliary_loss_clip": 0.01123339, - "auxiliary_loss_mlp": 0.0103344, - "balance_loss_clip": 1.04714704, - "balance_loss_mlp": 1.01775789, - "epoch": 0.3601984067338043, - "flos": 20082181017600.0, - "grad_norm": 2.506195073342272, - "language_loss": 0.83875644, - "learning_rate": 2.961192577338698e-06, - "loss": 0.86032414, - "num_input_tokens_seen": 128700445, - "step": 5991, - "time_per_iteration": 2.6310808658599854 - }, - { - "auxiliary_loss_clip": 0.01111044, - "auxiliary_loss_mlp": 0.01045829, - "balance_loss_clip": 1.04896092, - "balance_loss_mlp": 1.03068912, - "epoch": 0.3602585299864723, - "flos": 18617599872000.0, - "grad_norm": 2.314320245159203, - "language_loss": 0.75628942, - "learning_rate": 2.9608510238249463e-06, - "loss": 0.77785814, - "num_input_tokens_seen": 128716855, - "step": 5992, - "time_per_iteration": 2.6698272228240967 - }, - { - "auxiliary_loss_clip": 0.01134951, - "auxiliary_loss_mlp": 0.01039412, - "balance_loss_clip": 1.04993188, - "balance_loss_mlp": 1.02385557, - "epoch": 0.36031865323914025, - "flos": 19573003774080.0, - "grad_norm": 2.1820524355734072, - "language_loss": 0.76886415, - "learning_rate": 2.960509433875627e-06, - "loss": 0.79060775, - "num_input_tokens_seen": 128735835, - "step": 5993, - "time_per_iteration": 2.5999341011047363 - }, - { - "auxiliary_loss_clip": 0.01111748, - "auxiliary_loss_mlp": 0.01054388, - "balance_loss_clip": 1.04750419, - "balance_loss_mlp": 1.03762674, - "epoch": 0.3603787764918082, - "flos": 17490615678720.0, - "grad_norm": 1.8546706349055275, - "language_loss": 0.74672681, - "learning_rate": 2.9601678075036943e-06, - "loss": 0.76838815, - "num_input_tokens_seen": 128752465, - "step": 5994, - "time_per_iteration": 2.6691155433654785 - }, - { - "auxiliary_loss_clip": 0.01095118, - "auxiliary_loss_mlp": 0.01038312, - "balance_loss_clip": 1.0480628, - "balance_loss_mlp": 1.02331567, - "epoch": 0.3604388997444762, - "flos": 15523393564800.0, - "grad_norm": 2.7696142346579666, - "language_loss": 0.68887782, - "learning_rate": 2.9598261447221024e-06, - "loss": 0.71021217, - "num_input_tokens_seen": 128770865, - "step": 5995, - "time_per_iteration": 2.7497267723083496 - }, - { - "auxiliary_loss_clip": 0.01104395, - "auxiliary_loss_mlp": 0.01046311, - "balance_loss_clip": 1.04338932, - "balance_loss_mlp": 1.03031349, - "epoch": 0.36049902299714415, - "flos": 17310883000320.0, - "grad_norm": 2.2305093143222248, - "language_loss": 0.82564914, - "learning_rate": 2.9594844455438057e-06, - "loss": 0.84715617, - "num_input_tokens_seen": 128789730, - "step": 5996, - "time_per_iteration": 2.7227983474731445 - }, - { - "auxiliary_loss_clip": 0.01135369, - "auxiliary_loss_mlp": 0.0103828, - "balance_loss_clip": 1.04974842, - "balance_loss_mlp": 1.02300954, - "epoch": 0.3605591462498121, - "flos": 17056025026560.0, - "grad_norm": 2.068995609090248, - "language_loss": 0.73795009, - "learning_rate": 2.959142709981763e-06, - "loss": 0.75968659, - "num_input_tokens_seen": 128806610, - "step": 5997, - "time_per_iteration": 2.572842836380005 - }, - { - "auxiliary_loss_clip": 0.01121916, - "auxiliary_loss_mlp": 0.01036628, - "balance_loss_clip": 1.0482775, - "balance_loss_mlp": 1.0226686, - "epoch": 0.3606192695024801, - "flos": 16836862193280.0, - "grad_norm": 2.7116535757300215, - "language_loss": 0.69209671, - "learning_rate": 2.9588009380489337e-06, - "loss": 0.71368217, - "num_input_tokens_seen": 128824830, - "step": 5998, - "time_per_iteration": 2.604459047317505 - }, - { - "auxiliary_loss_clip": 0.01085406, - "auxiliary_loss_mlp": 0.01041904, - "balance_loss_clip": 1.04395008, - "balance_loss_mlp": 1.02565587, - "epoch": 0.36067939275514804, - "flos": 12129655743360.0, - "grad_norm": 2.6293691676304745, - "language_loss": 0.76580822, - "learning_rate": 2.9584591297582758e-06, - "loss": 0.78708136, - "num_input_tokens_seen": 128838170, - "step": 5999, - "time_per_iteration": 2.6671667098999023 - }, - { - "auxiliary_loss_clip": 0.01098137, - "auxiliary_loss_mlp": 0.01040783, - "balance_loss_clip": 1.04674315, - "balance_loss_mlp": 1.02590609, - "epoch": 0.360739516007816, - "flos": 18041449720320.0, - "grad_norm": 1.8157116334206203, - "language_loss": 0.78264523, - "learning_rate": 2.9581172851227516e-06, - "loss": 0.80403441, - "num_input_tokens_seen": 128855625, - "step": 6000, - "time_per_iteration": 2.743117332458496 - }, - { - "auxiliary_loss_clip": 0.01095162, - "auxiliary_loss_mlp": 0.01036289, - "balance_loss_clip": 1.04705954, - "balance_loss_mlp": 1.02203155, - "epoch": 0.360799639260484, - "flos": 18549800951040.0, - "grad_norm": 1.8701006971713747, - "language_loss": 0.78316295, - "learning_rate": 2.9577754041553243e-06, - "loss": 0.80447751, - "num_input_tokens_seen": 128873540, - "step": 6001, - "time_per_iteration": 2.7342417240142822 - }, - { - "auxiliary_loss_clip": 0.01130356, - "auxiliary_loss_mlp": 0.0077146, - "balance_loss_clip": 1.04727733, - "balance_loss_mlp": 1.00072694, - "epoch": 0.36085976251315194, - "flos": 19682028529920.0, - "grad_norm": 3.3927220139250056, - "language_loss": 0.83151853, - "learning_rate": 2.9574334868689575e-06, - "loss": 0.8505367, - "num_input_tokens_seen": 128889925, - "step": 6002, - "time_per_iteration": 2.6884238719940186 - }, - { - "auxiliary_loss_clip": 0.01101804, - "auxiliary_loss_mlp": 0.01033284, - "balance_loss_clip": 1.04249346, - "balance_loss_mlp": 1.02011156, - "epoch": 0.3609198857658199, - "flos": 24198943703040.0, - "grad_norm": 2.135208430409031, - "language_loss": 0.90677911, - "learning_rate": 2.9570915332766165e-06, - "loss": 0.92812997, - "num_input_tokens_seen": 128906890, - "step": 6003, - "time_per_iteration": 2.666738986968994 - }, - { - "auxiliary_loss_clip": 0.01036783, - "auxiliary_loss_mlp": 0.0101378, - "balance_loss_clip": 1.03707922, - "balance_loss_mlp": 1.01194429, - "epoch": 0.3609800090184879, - "flos": 57115995160320.0, - "grad_norm": 0.8844533830179444, - "language_loss": 0.53396428, - "learning_rate": 2.9567495433912693e-06, - "loss": 0.55446988, - "num_input_tokens_seen": 128965940, - "step": 6004, - "time_per_iteration": 3.1421444416046143 - }, - { - "auxiliary_loss_clip": 0.01112391, - "auxiliary_loss_mlp": 0.00772771, - "balance_loss_clip": 1.04665363, - "balance_loss_mlp": 1.00050342, - "epoch": 0.3610401322711559, - "flos": 20811239366400.0, - "grad_norm": 2.085214899207264, - "language_loss": 0.77743608, - "learning_rate": 2.956407517225883e-06, - "loss": 0.79628766, - "num_input_tokens_seen": 128985835, - "step": 6005, - "time_per_iteration": 4.196998596191406 - }, - { - "auxiliary_loss_clip": 0.01114373, - "auxiliary_loss_mlp": 0.01043264, - "balance_loss_clip": 1.04545391, - "balance_loss_mlp": 1.02866125, - "epoch": 0.36110025552382385, - "flos": 13699167494400.0, - "grad_norm": 1.984756598411705, - "language_loss": 0.78795588, - "learning_rate": 2.956065454793429e-06, - "loss": 0.80953228, - "num_input_tokens_seen": 129003120, - "step": 6006, - "time_per_iteration": 2.642446517944336 - }, - { - "auxiliary_loss_clip": 0.01135515, - "auxiliary_loss_mlp": 0.01037404, - "balance_loss_clip": 1.04913247, - "balance_loss_mlp": 1.02116823, - "epoch": 0.3611603787764918, - "flos": 22455014486400.0, - "grad_norm": 3.6522767524231248, - "language_loss": 0.84766537, - "learning_rate": 2.955723356106876e-06, - "loss": 0.86939454, - "num_input_tokens_seen": 129021645, - "step": 6007, - "time_per_iteration": 4.38408637046814 - }, - { - "auxiliary_loss_clip": 0.01120706, - "auxiliary_loss_mlp": 0.01035853, - "balance_loss_clip": 1.05059266, - "balance_loss_mlp": 1.01940203, - "epoch": 0.3612205020291598, - "flos": 20886651970560.0, - "grad_norm": 2.20663208121776, - "language_loss": 0.72179425, - "learning_rate": 2.955381221179198e-06, - "loss": 0.7433598, - "num_input_tokens_seen": 129038375, - "step": 6008, - "time_per_iteration": 4.262283802032471 - }, - { - "auxiliary_loss_clip": 0.01118211, - "auxiliary_loss_mlp": 0.0103587, - "balance_loss_clip": 1.04345882, - "balance_loss_mlp": 1.02150559, - "epoch": 0.36128062528182775, - "flos": 15741981780480.0, - "grad_norm": 7.815944525258205, - "language_loss": 0.83056295, - "learning_rate": 2.955039050023368e-06, - "loss": 0.85210377, - "num_input_tokens_seen": 129056235, - "step": 6009, - "time_per_iteration": 2.643824577331543 - }, - { - "auxiliary_loss_clip": 0.01105662, - "auxiliary_loss_mlp": 0.01045676, - "balance_loss_clip": 1.04862237, - "balance_loss_mlp": 1.03013086, - "epoch": 0.3613407485344957, - "flos": 16764502245120.0, - "grad_norm": 2.1132167438001166, - "language_loss": 0.7616573, - "learning_rate": 2.954696842652362e-06, - "loss": 0.7831707, - "num_input_tokens_seen": 129072405, - "step": 6010, - "time_per_iteration": 4.361377000808716 - }, - { - "auxiliary_loss_clip": 0.01104786, - "auxiliary_loss_mlp": 0.01035576, - "balance_loss_clip": 1.04665053, - "balance_loss_mlp": 1.02091312, - "epoch": 0.3614008717871637, - "flos": 20371189847040.0, - "grad_norm": 1.759609272436165, - "language_loss": 0.83214396, - "learning_rate": 2.9543545990791554e-06, - "loss": 0.85354757, - "num_input_tokens_seen": 129090225, - "step": 6011, - "time_per_iteration": 2.679145574569702 - }, - { - "auxiliary_loss_clip": 0.01141696, - "auxiliary_loss_mlp": 0.01041601, - "balance_loss_clip": 1.05070031, - "balance_loss_mlp": 1.02562666, - "epoch": 0.36146099503983165, - "flos": 22776665800320.0, - "grad_norm": 2.194420173883677, - "language_loss": 0.62446111, - "learning_rate": 2.954012319316727e-06, - "loss": 0.64629406, - "num_input_tokens_seen": 129107685, - "step": 6012, - "time_per_iteration": 2.6012516021728516 - }, - { - "auxiliary_loss_clip": 0.01106556, - "auxiliary_loss_mlp": 0.01038245, - "balance_loss_clip": 1.04518831, - "balance_loss_mlp": 1.02368951, - "epoch": 0.3615211182924996, - "flos": 22996654646400.0, - "grad_norm": 1.831524666449312, - "language_loss": 0.8381623, - "learning_rate": 2.9536700033780565e-06, - "loss": 0.85961026, - "num_input_tokens_seen": 129125315, - "step": 6013, - "time_per_iteration": 2.7191901206970215 - }, - { - "auxiliary_loss_clip": 0.01131608, - "auxiliary_loss_mlp": 0.01040321, - "balance_loss_clip": 1.04590511, - "balance_loss_mlp": 1.02466893, - "epoch": 0.3615812415451676, - "flos": 16648079287680.0, - "grad_norm": 3.6755742539930285, - "language_loss": 0.91541535, - "learning_rate": 2.9533276512761228e-06, - "loss": 0.93713462, - "num_input_tokens_seen": 129141600, - "step": 6014, - "time_per_iteration": 2.714121103286743 - }, - { - "auxiliary_loss_clip": 0.01131507, - "auxiliary_loss_mlp": 0.01042414, - "balance_loss_clip": 1.0463829, - "balance_loss_mlp": 1.0268693, - "epoch": 0.36164136479783554, - "flos": 21320093387520.0, - "grad_norm": 2.2181121985150094, - "language_loss": 0.73578274, - "learning_rate": 2.95298526302391e-06, - "loss": 0.75752199, - "num_input_tokens_seen": 129160665, - "step": 6015, - "time_per_iteration": 2.668600082397461 - }, - { - "auxiliary_loss_clip": 0.0105036, - "auxiliary_loss_mlp": 0.01047702, - "balance_loss_clip": 1.03610015, - "balance_loss_mlp": 1.02980912, - "epoch": 0.3617014880505035, - "flos": 24169569356160.0, - "grad_norm": 2.2662955263586158, - "language_loss": 0.64756966, - "learning_rate": 2.9526428386344e-06, - "loss": 0.66855025, - "num_input_tokens_seen": 129179220, - "step": 6016, - "time_per_iteration": 2.8753597736358643 - }, - { - "auxiliary_loss_clip": 0.01127577, - "auxiliary_loss_mlp": 0.01039172, - "balance_loss_clip": 1.05000329, - "balance_loss_mlp": 1.02170801, - "epoch": 0.3617616113031715, - "flos": 39014824101120.0, - "grad_norm": 2.0483319793753343, - "language_loss": 0.71927178, - "learning_rate": 2.9523003781205785e-06, - "loss": 0.74093938, - "num_input_tokens_seen": 129200385, - "step": 6017, - "time_per_iteration": 2.8195903301239014 - }, - { - "auxiliary_loss_clip": 0.01123165, - "auxiliary_loss_mlp": 0.01043013, - "balance_loss_clip": 1.04506993, - "balance_loss_mlp": 1.02724147, - "epoch": 0.3618217345558395, - "flos": 12130840892160.0, - "grad_norm": 2.196881428409859, - "language_loss": 0.73543239, - "learning_rate": 2.9519578814954307e-06, - "loss": 0.7570942, - "num_input_tokens_seen": 129217395, - "step": 6018, - "time_per_iteration": 2.6454639434814453 - }, - { - "auxiliary_loss_clip": 0.01088616, - "auxiliary_loss_mlp": 0.01036025, - "balance_loss_clip": 1.0470562, - "balance_loss_mlp": 1.02079058, - "epoch": 0.36188185780850746, - "flos": 24935005203840.0, - "grad_norm": 2.8373114264415222, - "language_loss": 0.69157374, - "learning_rate": 2.9516153487719448e-06, - "loss": 0.71282017, - "num_input_tokens_seen": 129238940, - "step": 6019, - "time_per_iteration": 2.824361801147461 - }, - { - "auxiliary_loss_clip": 0.0111438, - "auxiliary_loss_mlp": 0.0103897, - "balance_loss_clip": 1.04542887, - "balance_loss_mlp": 1.02275765, - "epoch": 0.3619419810611754, - "flos": 20958832350720.0, - "grad_norm": 3.405770043894724, - "language_loss": 0.76428473, - "learning_rate": 2.95127277996311e-06, - "loss": 0.78581828, - "num_input_tokens_seen": 129258240, - "step": 6020, - "time_per_iteration": 2.6757993698120117 - }, - { - "auxiliary_loss_clip": 0.01124662, - "auxiliary_loss_mlp": 0.01041506, - "balance_loss_clip": 1.04899478, - "balance_loss_mlp": 1.02512705, - "epoch": 0.3620021043138434, - "flos": 22528882805760.0, - "grad_norm": 2.1413312386751606, - "language_loss": 0.73802006, - "learning_rate": 2.9509301750819156e-06, - "loss": 0.7596817, - "num_input_tokens_seen": 129279040, - "step": 6021, - "time_per_iteration": 2.6422386169433594 - }, - { - "auxiliary_loss_clip": 0.01094575, - "auxiliary_loss_mlp": 0.01036086, - "balance_loss_clip": 1.04502845, - "balance_loss_mlp": 1.02170944, - "epoch": 0.36206222756651135, - "flos": 15596687266560.0, - "grad_norm": 8.65046906858069, - "language_loss": 0.80683851, - "learning_rate": 2.9505875341413533e-06, - "loss": 0.82814515, - "num_input_tokens_seen": 129295415, - "step": 6022, - "time_per_iteration": 2.7069809436798096 - }, - { - "auxiliary_loss_clip": 0.0112144, - "auxiliary_loss_mlp": 0.01034482, - "balance_loss_clip": 1.04967427, - "balance_loss_mlp": 1.02036762, - "epoch": 0.3621223508191793, - "flos": 23587170238080.0, - "grad_norm": 1.6359940708258738, - "language_loss": 0.81630391, - "learning_rate": 2.950244857154417e-06, - "loss": 0.83786309, - "num_input_tokens_seen": 129312620, - "step": 6023, - "time_per_iteration": 2.676196575164795 - }, - { - "auxiliary_loss_clip": 0.01115391, - "auxiliary_loss_mlp": 0.01037931, - "balance_loss_clip": 1.04994166, - "balance_loss_mlp": 1.02266037, - "epoch": 0.3621824740718473, - "flos": 22309899540480.0, - "grad_norm": 2.238629896510925, - "language_loss": 0.79401833, - "learning_rate": 2.9499021441341e-06, - "loss": 0.81555158, - "num_input_tokens_seen": 129331825, - "step": 6024, - "time_per_iteration": 2.6479294300079346 - }, - { - "auxiliary_loss_clip": 0.01098352, - "auxiliary_loss_mlp": 0.01041698, - "balance_loss_clip": 1.04168642, - "balance_loss_mlp": 1.02567625, - "epoch": 0.36224259732451525, - "flos": 16763640318720.0, - "grad_norm": 2.1016508822119517, - "language_loss": 0.74409318, - "learning_rate": 2.9495593950933997e-06, - "loss": 0.76549369, - "num_input_tokens_seen": 129350400, - "step": 6025, - "time_per_iteration": 2.720113515853882 - }, - { - "auxiliary_loss_clip": 0.01121634, - "auxiliary_loss_mlp": 0.00772492, - "balance_loss_clip": 1.04758501, - "balance_loss_mlp": 1.00045466, - "epoch": 0.3623027205771832, - "flos": 23149742411520.0, - "grad_norm": 1.7192758683210898, - "language_loss": 0.72363192, - "learning_rate": 2.9492166100453107e-06, - "loss": 0.74257314, - "num_input_tokens_seen": 129371155, - "step": 6026, - "time_per_iteration": 2.647515296936035 - }, - { - "auxiliary_loss_clip": 0.01130763, - "auxiliary_loss_mlp": 0.01045791, - "balance_loss_clip": 1.05090141, - "balance_loss_mlp": 1.0300554, - "epoch": 0.3623628438298512, - "flos": 28549162834560.0, - "grad_norm": 3.1509295844270166, - "language_loss": 0.79584157, - "learning_rate": 2.948873789002833e-06, - "loss": 0.81760705, - "num_input_tokens_seen": 129391230, - "step": 6027, - "time_per_iteration": 2.666778802871704 - }, - { - "auxiliary_loss_clip": 0.01112806, - "auxiliary_loss_mlp": 0.01044567, - "balance_loss_clip": 1.04690945, - "balance_loss_mlp": 1.02730584, - "epoch": 0.36242296708251914, - "flos": 25484941405440.0, - "grad_norm": 2.036912075012155, - "language_loss": 0.67857373, - "learning_rate": 2.9485309319789667e-06, - "loss": 0.70014751, - "num_input_tokens_seen": 129410065, - "step": 6028, - "time_per_iteration": 2.721635103225708 - }, - { - "auxiliary_loss_clip": 0.01093428, - "auxiliary_loss_mlp": 0.01039806, - "balance_loss_clip": 1.04534137, - "balance_loss_mlp": 1.02493429, - "epoch": 0.3624830903351871, - "flos": 16290373697280.0, - "grad_norm": 2.040296243102333, - "language_loss": 0.85588348, - "learning_rate": 2.9481880389867117e-06, - "loss": 0.87721586, - "num_input_tokens_seen": 129428655, - "step": 6029, - "time_per_iteration": 2.768638849258423 - }, - { - "auxiliary_loss_clip": 0.01097178, - "auxiliary_loss_mlp": 0.01040472, - "balance_loss_clip": 1.04583371, - "balance_loss_mlp": 1.02534389, - "epoch": 0.36254321358785513, - "flos": 18296307694080.0, - "grad_norm": 1.826841085229912, - "language_loss": 0.72638077, - "learning_rate": 2.9478451100390714e-06, - "loss": 0.74775726, - "num_input_tokens_seen": 129447845, - "step": 6030, - "time_per_iteration": 2.6222145557403564 - }, - { - "auxiliary_loss_clip": 0.01111443, - "auxiliary_loss_mlp": 0.0104401, - "balance_loss_clip": 1.0471518, - "balance_loss_mlp": 1.02635479, - "epoch": 0.3626033368405231, - "flos": 14865294533760.0, - "grad_norm": 2.682823168265615, - "language_loss": 0.74219912, - "learning_rate": 2.94750214514905e-06, - "loss": 0.76375365, - "num_input_tokens_seen": 129463275, - "step": 6031, - "time_per_iteration": 2.62003493309021 - }, - { - "auxiliary_loss_clip": 0.01090216, - "auxiliary_loss_mlp": 0.01046109, - "balance_loss_clip": 1.04174352, - "balance_loss_mlp": 1.03031349, - "epoch": 0.36266346009319106, - "flos": 22306595489280.0, - "grad_norm": 2.122404426395552, - "language_loss": 0.72930032, - "learning_rate": 2.9471591443296516e-06, - "loss": 0.75066358, - "num_input_tokens_seen": 129483205, - "step": 6032, - "time_per_iteration": 2.7382266521453857 - }, - { - "auxiliary_loss_clip": 0.01089342, - "auxiliary_loss_mlp": 0.0104871, - "balance_loss_clip": 1.0457828, - "balance_loss_mlp": 1.03320134, - "epoch": 0.362723583345859, - "flos": 18222331633920.0, - "grad_norm": 2.0052695882675895, - "language_loss": 0.77577424, - "learning_rate": 2.946816107593884e-06, - "loss": 0.79715478, - "num_input_tokens_seen": 129499885, - "step": 6033, - "time_per_iteration": 2.712574005126953 - }, - { - "auxiliary_loss_clip": 0.01011518, - "auxiliary_loss_mlp": 0.01010455, - "balance_loss_clip": 1.02346182, - "balance_loss_mlp": 1.00881004, - "epoch": 0.362783706598527, - "flos": 68499174458880.0, - "grad_norm": 0.775881514372135, - "language_loss": 0.6472615, - "learning_rate": 2.9464730349547547e-06, - "loss": 0.66748118, - "num_input_tokens_seen": 129561885, - "step": 6034, - "time_per_iteration": 3.33389949798584 - }, - { - "auxiliary_loss_clip": 0.0111586, - "auxiliary_loss_mlp": 0.01039589, - "balance_loss_clip": 1.04362679, - "balance_loss_mlp": 1.02373409, - "epoch": 0.36284382985119495, - "flos": 26576589594240.0, - "grad_norm": 2.348469757016237, - "language_loss": 0.89869213, - "learning_rate": 2.946129926425273e-06, - "loss": 0.9202466, - "num_input_tokens_seen": 129582325, - "step": 6035, - "time_per_iteration": 2.661137580871582 - }, - { - "auxiliary_loss_clip": 0.01112128, - "auxiliary_loss_mlp": 0.01040682, - "balance_loss_clip": 1.04810882, - "balance_loss_mlp": 1.02445734, - "epoch": 0.3629039531038629, - "flos": 20156767608960.0, - "grad_norm": 1.7965494412259506, - "language_loss": 0.73480749, - "learning_rate": 2.9457867820184496e-06, - "loss": 0.75633562, - "num_input_tokens_seen": 129600350, - "step": 6036, - "time_per_iteration": 2.627746105194092 - }, - { - "auxiliary_loss_clip": 0.01118939, - "auxiliary_loss_mlp": 0.01034203, - "balance_loss_clip": 1.0476563, - "balance_loss_mlp": 1.01825309, - "epoch": 0.3629640763565309, - "flos": 18625716345600.0, - "grad_norm": 2.247638401714898, - "language_loss": 0.75895989, - "learning_rate": 2.945443601747297e-06, - "loss": 0.78049135, - "num_input_tokens_seen": 129618425, - "step": 6037, - "time_per_iteration": 2.6763134002685547 - }, - { - "auxiliary_loss_clip": 0.01117432, - "auxiliary_loss_mlp": 0.0105958, - "balance_loss_clip": 1.04722893, - "balance_loss_mlp": 1.04149556, - "epoch": 0.36302419960919885, - "flos": 19571459489280.0, - "grad_norm": 1.7641921793444904, - "language_loss": 0.78425813, - "learning_rate": 2.945100385624828e-06, - "loss": 0.80602825, - "num_input_tokens_seen": 129636750, - "step": 6038, - "time_per_iteration": 2.6576154232025146 - }, - { - "auxiliary_loss_clip": 0.01042272, - "auxiliary_loss_mlp": 0.01000075, - "balance_loss_clip": 1.02576721, - "balance_loss_mlp": 0.99842948, - "epoch": 0.3630843228618668, - "flos": 63797606444160.0, - "grad_norm": 0.8328343708327894, - "language_loss": 0.63371962, - "learning_rate": 2.9447571336640573e-06, - "loss": 0.6541431, - "num_input_tokens_seen": 129699030, - "step": 6039, - "time_per_iteration": 3.268035650253296 - }, - { - "auxiliary_loss_clip": 0.01108663, - "auxiliary_loss_mlp": 0.01052032, - "balance_loss_clip": 1.04687905, - "balance_loss_mlp": 1.03485394, - "epoch": 0.3631444461145348, - "flos": 21835160461440.0, - "grad_norm": 2.83972356132426, - "language_loss": 0.71349055, - "learning_rate": 2.944413845878002e-06, - "loss": 0.73509747, - "num_input_tokens_seen": 129717135, - "step": 6040, - "time_per_iteration": 2.7468066215515137 - }, - { - "auxiliary_loss_clip": 0.01129452, - "auxiliary_loss_mlp": 0.01039721, - "balance_loss_clip": 1.05027485, - "balance_loss_mlp": 1.02372289, - "epoch": 0.36320456936720275, - "flos": 21722041555200.0, - "grad_norm": 1.6017927687359714, - "language_loss": 0.81615877, - "learning_rate": 2.9440705222796783e-06, - "loss": 0.83785057, - "num_input_tokens_seen": 129735940, - "step": 6041, - "time_per_iteration": 2.6624767780303955 - }, - { - "auxiliary_loss_clip": 0.01116373, - "auxiliary_loss_mlp": 0.01037475, - "balance_loss_clip": 1.04789138, - "balance_loss_mlp": 1.02039289, - "epoch": 0.3632646926198707, - "flos": 17019072910080.0, - "grad_norm": 6.335898198250863, - "language_loss": 0.83848882, - "learning_rate": 2.943727162882107e-06, - "loss": 0.86002731, - "num_input_tokens_seen": 129752790, - "step": 6042, - "time_per_iteration": 2.6279616355895996 - }, - { - "auxiliary_loss_clip": 0.01113831, - "auxiliary_loss_mlp": 0.01045895, - "balance_loss_clip": 1.04817295, - "balance_loss_mlp": 1.03020668, - "epoch": 0.36332481587253873, - "flos": 23331163029120.0, - "grad_norm": 1.8194124872693949, - "language_loss": 0.78401059, - "learning_rate": 2.9433837676983064e-06, - "loss": 0.80560786, - "num_input_tokens_seen": 129773655, - "step": 6043, - "time_per_iteration": 4.221862077713013 - }, - { - "auxiliary_loss_clip": 0.01111193, - "auxiliary_loss_mlp": 0.01036813, - "balance_loss_clip": 1.05454051, - "balance_loss_mlp": 1.02078581, - "epoch": 0.3633849391252067, - "flos": 10743539857920.0, - "grad_norm": 2.743973887678544, - "language_loss": 0.65664518, - "learning_rate": 2.943040336741298e-06, - "loss": 0.67812526, - "num_input_tokens_seen": 129791605, - "step": 6044, - "time_per_iteration": 2.7301173210144043 - }, - { - "auxiliary_loss_clip": 0.01109397, - "auxiliary_loss_mlp": 0.01034976, - "balance_loss_clip": 1.04838157, - "balance_loss_mlp": 1.02035475, - "epoch": 0.36344506237787466, - "flos": 25849147357440.0, - "grad_norm": 2.5365479968338187, - "language_loss": 0.81149542, - "learning_rate": 2.9426968700241066e-06, - "loss": 0.83293915, - "num_input_tokens_seen": 129811075, - "step": 6045, - "time_per_iteration": 2.6896753311157227 - }, - { - "auxiliary_loss_clip": 0.0110304, - "auxiliary_loss_mlp": 0.01045503, - "balance_loss_clip": 1.04706383, - "balance_loss_mlp": 1.02923083, - "epoch": 0.3635051856305426, - "flos": 30154046503680.0, - "grad_norm": 2.400629400498793, - "language_loss": 0.65010375, - "learning_rate": 2.942353367559755e-06, - "loss": 0.67158914, - "num_input_tokens_seen": 129833755, - "step": 6046, - "time_per_iteration": 2.800321578979492 - }, - { - "auxiliary_loss_clip": 0.01102544, - "auxiliary_loss_mlp": 0.01038937, - "balance_loss_clip": 1.0467155, - "balance_loss_mlp": 1.02399993, - "epoch": 0.3635653088832106, - "flos": 22198396746240.0, - "grad_norm": 2.172977049503826, - "language_loss": 0.77142686, - "learning_rate": 2.9420098293612692e-06, - "loss": 0.79284167, - "num_input_tokens_seen": 129854475, - "step": 6047, - "time_per_iteration": 4.274283170700073 - }, - { - "auxiliary_loss_clip": 0.01137356, - "auxiliary_loss_mlp": 0.01047564, - "balance_loss_clip": 1.05142486, - "balance_loss_mlp": 1.02983761, - "epoch": 0.36362543213587856, - "flos": 24787053083520.0, - "grad_norm": 1.922622021112015, - "language_loss": 0.79610157, - "learning_rate": 2.9416662554416767e-06, - "loss": 0.81795079, - "num_input_tokens_seen": 129873530, - "step": 6048, - "time_per_iteration": 4.283480644226074 - }, - { - "auxiliary_loss_clip": 0.01037942, - "auxiliary_loss_mlp": 0.01005664, - "balance_loss_clip": 1.01860034, - "balance_loss_mlp": 1.00387573, - "epoch": 0.3636855553885465, - "flos": 62526369231360.0, - "grad_norm": 0.749844121463454, - "language_loss": 0.52550006, - "learning_rate": 2.9413226458140054e-06, - "loss": 0.54593611, - "num_input_tokens_seen": 129940400, - "step": 6049, - "time_per_iteration": 3.2647299766540527 - }, - { - "auxiliary_loss_clip": 0.01105759, - "auxiliary_loss_mlp": 0.01042028, - "balance_loss_clip": 1.04831481, - "balance_loss_mlp": 1.02467084, - "epoch": 0.3637456786412145, - "flos": 24060652341120.0, - "grad_norm": 9.722138117523357, - "language_loss": 0.8628068, - "learning_rate": 2.9409790004912845e-06, - "loss": 0.88428462, - "num_input_tokens_seen": 129958635, - "step": 6050, - "time_per_iteration": 2.744236469268799 - }, - { - "auxiliary_loss_clip": 0.01120328, - "auxiliary_loss_mlp": 0.00772785, - "balance_loss_clip": 1.04944158, - "balance_loss_mlp": 1.0004611, - "epoch": 0.36380580189388245, - "flos": 16691495852160.0, - "grad_norm": 3.109361789309709, - "language_loss": 0.78116536, - "learning_rate": 2.940635319486546e-06, - "loss": 0.80009651, - "num_input_tokens_seen": 129977685, - "step": 6051, - "time_per_iteration": 2.6305320262908936 - }, - { - "auxiliary_loss_clip": 0.01127196, - "auxiliary_loss_mlp": 0.01040856, - "balance_loss_clip": 1.04900503, - "balance_loss_mlp": 1.02559745, - "epoch": 0.3638659251465504, - "flos": 25114091437440.0, - "grad_norm": 1.9275322741448784, - "language_loss": 0.82526582, - "learning_rate": 2.940291602812822e-06, - "loss": 0.84694636, - "num_input_tokens_seen": 129997530, - "step": 6052, - "time_per_iteration": 2.711794853210449 - }, - { - "auxiliary_loss_clip": 0.01100415, - "auxiliary_loss_mlp": 0.01036967, - "balance_loss_clip": 1.04675376, - "balance_loss_mlp": 1.02270949, - "epoch": 0.3639260483992184, - "flos": 23003011353600.0, - "grad_norm": 1.7820298413079305, - "language_loss": 0.72085792, - "learning_rate": 2.939947850483145e-06, - "loss": 0.74223173, - "num_input_tokens_seen": 130017955, - "step": 6053, - "time_per_iteration": 2.725600481033325 - }, - { - "auxiliary_loss_clip": 0.01015406, - "auxiliary_loss_mlp": 0.01003631, - "balance_loss_clip": 1.0300014, - "balance_loss_mlp": 1.00155663, - "epoch": 0.36398617165188635, - "flos": 70716011160960.0, - "grad_norm": 0.7712310074836012, - "language_loss": 0.61214095, - "learning_rate": 2.9396040625105532e-06, - "loss": 0.63233131, - "num_input_tokens_seen": 130074275, - "step": 6054, - "time_per_iteration": 3.3252007961273193 - }, - { - "auxiliary_loss_clip": 0.0111079, - "auxiliary_loss_mlp": 0.01038999, - "balance_loss_clip": 1.04735899, - "balance_loss_mlp": 1.02214301, - "epoch": 0.3640462949045543, - "flos": 22235456603520.0, - "grad_norm": 2.93078334140581, - "language_loss": 0.75820959, - "learning_rate": 2.9392602389080802e-06, - "loss": 0.77970749, - "num_input_tokens_seen": 130091375, - "step": 6055, - "time_per_iteration": 2.656001091003418 - }, - { - "auxiliary_loss_clip": 0.0113529, - "auxiliary_loss_mlp": 0.01041525, - "balance_loss_clip": 1.04910016, - "balance_loss_mlp": 1.02581286, - "epoch": 0.3641064181572223, - "flos": 21543529939200.0, - "grad_norm": 1.6734377169093124, - "language_loss": 0.7533145, - "learning_rate": 2.938916379688765e-06, - "loss": 0.77508265, - "num_input_tokens_seen": 130111595, - "step": 6056, - "time_per_iteration": 2.654418468475342 - }, - { - "auxiliary_loss_clip": 0.01121707, - "auxiliary_loss_mlp": 0.01038714, - "balance_loss_clip": 1.055071, - "balance_loss_mlp": 1.02337217, - "epoch": 0.3641665414098903, - "flos": 22273306560000.0, - "grad_norm": 2.035168503846255, - "language_loss": 0.80473512, - "learning_rate": 2.9385724848656468e-06, - "loss": 0.82633936, - "num_input_tokens_seen": 130131440, - "step": 6057, - "time_per_iteration": 2.7347753047943115 - }, - { - "auxiliary_loss_clip": 0.01107128, - "auxiliary_loss_mlp": 0.01039802, - "balance_loss_clip": 1.04495037, - "balance_loss_mlp": 1.02438855, - "epoch": 0.36422666466255826, - "flos": 28329676778880.0, - "grad_norm": 2.043030499006847, - "language_loss": 0.80264485, - "learning_rate": 2.9382285544517647e-06, - "loss": 0.8241142, - "num_input_tokens_seen": 130151375, - "step": 6058, - "time_per_iteration": 2.695674180984497 - }, - { - "auxiliary_loss_clip": 0.01102831, - "auxiliary_loss_mlp": 0.00772601, - "balance_loss_clip": 1.04357934, - "balance_loss_mlp": 1.00046432, - "epoch": 0.36428678791522623, - "flos": 24170503109760.0, - "grad_norm": 2.032310914115462, - "language_loss": 0.84994543, - "learning_rate": 2.9378845884601636e-06, - "loss": 0.86869979, - "num_input_tokens_seen": 130169960, - "step": 6059, - "time_per_iteration": 2.6912410259246826 - }, - { - "auxiliary_loss_clip": 0.01093721, - "auxiliary_loss_mlp": 0.01039242, - "balance_loss_clip": 1.04318213, - "balance_loss_mlp": 1.02287483, - "epoch": 0.3643469111678942, - "flos": 22528451842560.0, - "grad_norm": 5.903326132338396, - "language_loss": 0.87806225, - "learning_rate": 2.937540586903884e-06, - "loss": 0.89939183, - "num_input_tokens_seen": 130189800, - "step": 6060, - "time_per_iteration": 2.713115692138672 - }, - { - "auxiliary_loss_clip": 0.01125791, - "auxiliary_loss_mlp": 0.01040312, - "balance_loss_clip": 1.0498302, - "balance_loss_mlp": 1.02388453, - "epoch": 0.36440703442056216, - "flos": 19426595938560.0, - "grad_norm": 2.3521788015610805, - "language_loss": 0.66954017, - "learning_rate": 2.937196549795971e-06, - "loss": 0.69120121, - "num_input_tokens_seen": 130206370, - "step": 6061, - "time_per_iteration": 2.8435866832733154 - }, - { - "auxiliary_loss_clip": 0.0111942, - "auxiliary_loss_mlp": 0.01038694, - "balance_loss_clip": 1.05207086, - "balance_loss_mlp": 1.02260041, - "epoch": 0.3644671576732301, - "flos": 18040515966720.0, - "grad_norm": 2.5119296796020354, - "language_loss": 0.75012159, - "learning_rate": 2.9368524771494718e-06, - "loss": 0.77170277, - "num_input_tokens_seen": 130224445, - "step": 6062, - "time_per_iteration": 2.659853935241699 - }, - { - "auxiliary_loss_clip": 0.01108402, - "auxiliary_loss_mlp": 0.01034157, - "balance_loss_clip": 1.04851866, - "balance_loss_mlp": 1.01628149, - "epoch": 0.3645272809258981, - "flos": 21542811667200.0, - "grad_norm": 2.568706719167558, - "language_loss": 0.72070628, - "learning_rate": 2.936508368977432e-06, - "loss": 0.74213189, - "num_input_tokens_seen": 130245380, - "step": 6063, - "time_per_iteration": 2.7098159790039062 - }, - { - "auxiliary_loss_clip": 0.01118768, - "auxiliary_loss_mlp": 0.010373, - "balance_loss_clip": 1.04472148, - "balance_loss_mlp": 1.02187479, - "epoch": 0.36458740417856605, - "flos": 22746860490240.0, - "grad_norm": 2.3511982692020936, - "language_loss": 0.68179435, - "learning_rate": 2.936164225292901e-06, - "loss": 0.70335501, - "num_input_tokens_seen": 130265575, - "step": 6064, - "time_per_iteration": 2.6513044834136963 - }, - { - "auxiliary_loss_clip": 0.01116627, - "auxiliary_loss_mlp": 0.01045789, - "balance_loss_clip": 1.04925466, - "balance_loss_mlp": 1.02988076, - "epoch": 0.364647527431234, - "flos": 26140670138880.0, - "grad_norm": 1.9840367281230236, - "language_loss": 0.74147421, - "learning_rate": 2.9358200461089297e-06, - "loss": 0.76309836, - "num_input_tokens_seen": 130286195, - "step": 6065, - "time_per_iteration": 2.764556407928467 - }, - { - "auxiliary_loss_clip": 0.0111688, - "auxiliary_loss_mlp": 0.01040465, - "balance_loss_clip": 1.04924774, - "balance_loss_mlp": 1.02306008, - "epoch": 0.364707650683902, - "flos": 31029907737600.0, - "grad_norm": 2.0108238901766042, - "language_loss": 0.75444913, - "learning_rate": 2.9354758314385676e-06, - "loss": 0.77602255, - "num_input_tokens_seen": 130306095, - "step": 6066, - "time_per_iteration": 2.749293088912964 - }, - { - "auxiliary_loss_clip": 0.01121102, - "auxiliary_loss_mlp": 0.01034674, - "balance_loss_clip": 1.04859555, - "balance_loss_mlp": 1.02010643, - "epoch": 0.36476777393656995, - "flos": 19572896033280.0, - "grad_norm": 2.8385875288429587, - "language_loss": 0.76480901, - "learning_rate": 2.9351315812948684e-06, - "loss": 0.78636676, - "num_input_tokens_seen": 130324685, - "step": 6067, - "time_per_iteration": 2.619833469390869 - }, - { - "auxiliary_loss_clip": 0.01135088, - "auxiliary_loss_mlp": 0.0103807, - "balance_loss_clip": 1.05067635, - "balance_loss_mlp": 1.02401567, - "epoch": 0.3648278971892379, - "flos": 17748849530880.0, - "grad_norm": 2.2214902441228563, - "language_loss": 0.71036232, - "learning_rate": 2.934787295690886e-06, - "loss": 0.73209393, - "num_input_tokens_seen": 130343855, - "step": 6068, - "time_per_iteration": 2.633678674697876 - }, - { - "auxiliary_loss_clip": 0.01119276, - "auxiliary_loss_mlp": 0.01039471, - "balance_loss_clip": 1.0432384, - "balance_loss_mlp": 1.02402711, - "epoch": 0.3648880204419059, - "flos": 17931167988480.0, - "grad_norm": 2.184109901605664, - "language_loss": 0.74421692, - "learning_rate": 2.9344429746396755e-06, - "loss": 0.76580441, - "num_input_tokens_seen": 130362320, - "step": 6069, - "time_per_iteration": 2.6463425159454346 - }, - { - "auxiliary_loss_clip": 0.01115147, - "auxiliary_loss_mlp": 0.0103807, - "balance_loss_clip": 1.04814148, - "balance_loss_mlp": 1.02237022, - "epoch": 0.3649481436945739, - "flos": 22638266697600.0, - "grad_norm": 1.8874088651190308, - "language_loss": 0.66247845, - "learning_rate": 2.9340986181542945e-06, - "loss": 0.68401062, - "num_input_tokens_seen": 130383165, - "step": 6070, - "time_per_iteration": 2.70835280418396 - }, - { - "auxiliary_loss_clip": 0.01118852, - "auxiliary_loss_mlp": 0.01036547, - "balance_loss_clip": 1.04837227, - "balance_loss_mlp": 1.02161574, - "epoch": 0.36500826694724187, - "flos": 21579656042880.0, - "grad_norm": 1.882521473859371, - "language_loss": 0.74406028, - "learning_rate": 2.9337542262477994e-06, - "loss": 0.76561427, - "num_input_tokens_seen": 130402425, - "step": 6071, - "time_per_iteration": 2.6479921340942383 - }, - { - "auxiliary_loss_clip": 0.0112348, - "auxiliary_loss_mlp": 0.01037332, - "balance_loss_clip": 1.04683149, - "balance_loss_mlp": 1.02142978, - "epoch": 0.36506839019990983, - "flos": 13772533023360.0, - "grad_norm": 1.9443656652026238, - "language_loss": 0.88592315, - "learning_rate": 2.9334097989332506e-06, - "loss": 0.9075312, - "num_input_tokens_seen": 130419440, - "step": 6072, - "time_per_iteration": 2.641340732574463 - }, - { - "auxiliary_loss_clip": 0.01122637, - "auxiliary_loss_mlp": 0.01036427, - "balance_loss_clip": 1.0495832, - "balance_loss_mlp": 1.02225924, - "epoch": 0.3651285134525778, - "flos": 17274972378240.0, - "grad_norm": 2.382408041683643, - "language_loss": 0.72436309, - "learning_rate": 2.9330653362237094e-06, - "loss": 0.7459538, - "num_input_tokens_seen": 130438495, - "step": 6073, - "time_per_iteration": 2.6814513206481934 - }, - { - "auxiliary_loss_clip": 0.01067321, - "auxiliary_loss_mlp": 0.01042007, - "balance_loss_clip": 1.04483008, - "balance_loss_mlp": 1.0249722, - "epoch": 0.36518863670524576, - "flos": 21907987286400.0, - "grad_norm": 3.1332797030940913, - "language_loss": 0.66850221, - "learning_rate": 2.932720838132236e-06, - "loss": 0.68959546, - "num_input_tokens_seen": 130455575, - "step": 6074, - "time_per_iteration": 2.7943460941314697 - }, - { - "auxiliary_loss_clip": 0.01103652, - "auxiliary_loss_mlp": 0.01037343, - "balance_loss_clip": 1.04833269, - "balance_loss_mlp": 1.02238262, - "epoch": 0.3652487599579137, - "flos": 27122180250240.0, - "grad_norm": 1.5371260958261816, - "language_loss": 0.72812623, - "learning_rate": 2.9323763046718954e-06, - "loss": 0.74953616, - "num_input_tokens_seen": 130476385, - "step": 6075, - "time_per_iteration": 2.7581374645233154 - }, - { - "auxiliary_loss_clip": 0.01100578, - "auxiliary_loss_mlp": 0.01046604, - "balance_loss_clip": 1.04679585, - "balance_loss_mlp": 1.03011715, - "epoch": 0.3653088832105817, - "flos": 19755573626880.0, - "grad_norm": 2.1248471900324186, - "language_loss": 0.89377797, - "learning_rate": 2.9320317358557524e-06, - "loss": 0.91524976, - "num_input_tokens_seen": 130493630, - "step": 6076, - "time_per_iteration": 2.7085182666778564 - }, - { - "auxiliary_loss_clip": 0.01125287, - "auxiliary_loss_mlp": 0.01043945, - "balance_loss_clip": 1.0504595, - "balance_loss_mlp": 1.02784586, - "epoch": 0.36536900646324966, - "flos": 13115008609920.0, - "grad_norm": 2.218138292044272, - "language_loss": 0.69377828, - "learning_rate": 2.931687131696872e-06, - "loss": 0.71547067, - "num_input_tokens_seen": 130510735, - "step": 6077, - "time_per_iteration": 2.6516926288604736 - }, - { - "auxiliary_loss_clip": 0.01063406, - "auxiliary_loss_mlp": 0.01003112, - "balance_loss_clip": 1.03200221, - "balance_loss_mlp": 1.00121677, - "epoch": 0.3654291297159176, - "flos": 71100472383360.0, - "grad_norm": 0.7484778409156561, - "language_loss": 0.61802375, - "learning_rate": 2.9313424922083224e-06, - "loss": 0.63868892, - "num_input_tokens_seen": 130577050, - "step": 6078, - "time_per_iteration": 3.2192225456237793 - }, - { - "auxiliary_loss_clip": 0.01105852, - "auxiliary_loss_mlp": 0.01053011, - "balance_loss_clip": 1.04234397, - "balance_loss_mlp": 1.03565383, - "epoch": 0.3654892529685856, - "flos": 23617478338560.0, - "grad_norm": 2.6620805395927283, - "language_loss": 0.78445792, - "learning_rate": 2.930997817403173e-06, - "loss": 0.80604661, - "num_input_tokens_seen": 130593780, - "step": 6079, - "time_per_iteration": 2.6616902351379395 - }, - { - "auxiliary_loss_clip": 0.01129934, - "auxiliary_loss_mlp": 0.01040158, - "balance_loss_clip": 1.05226243, - "balance_loss_mlp": 1.02386224, - "epoch": 0.36554937622125355, - "flos": 43470799850880.0, - "grad_norm": 2.4767906644356037, - "language_loss": 0.62662333, - "learning_rate": 2.9306531072944913e-06, - "loss": 0.64832425, - "num_input_tokens_seen": 130615510, - "step": 6080, - "time_per_iteration": 2.8651509284973145 - }, - { - "auxiliary_loss_clip": 0.01108292, - "auxiliary_loss_mlp": 0.01042236, - "balance_loss_clip": 1.04737091, - "balance_loss_mlp": 1.02529645, - "epoch": 0.3656094994739215, - "flos": 23294641875840.0, - "grad_norm": 3.1314387429818327, - "language_loss": 0.67686033, - "learning_rate": 2.930308361895352e-06, - "loss": 0.69836557, - "num_input_tokens_seen": 130635410, - "step": 6081, - "time_per_iteration": 2.707031011581421 - }, - { - "auxiliary_loss_clip": 0.01112746, - "auxiliary_loss_mlp": 0.00773158, - "balance_loss_clip": 1.04989302, - "balance_loss_mlp": 1.00033236, - "epoch": 0.3656696227265895, - "flos": 24571984400640.0, - "grad_norm": 1.5854068035466964, - "language_loss": 0.74755692, - "learning_rate": 2.9299635812188257e-06, - "loss": 0.76641595, - "num_input_tokens_seen": 130657725, - "step": 6082, - "time_per_iteration": 2.7261881828308105 - }, - { - "auxiliary_loss_clip": 0.01072732, - "auxiliary_loss_mlp": 0.00772597, - "balance_loss_clip": 1.04222691, - "balance_loss_mlp": 1.00042963, - "epoch": 0.3657297459792575, - "flos": 27928375056000.0, - "grad_norm": 2.051480252043875, - "language_loss": 0.82956016, - "learning_rate": 2.929618765277987e-06, - "loss": 0.8480134, - "num_input_tokens_seen": 130678360, - "step": 6083, - "time_per_iteration": 4.360748529434204 - }, - { - "auxiliary_loss_clip": 0.01041394, - "auxiliary_loss_mlp": 0.01001412, - "balance_loss_clip": 1.02900386, - "balance_loss_mlp": 0.99936181, - "epoch": 0.36578986923192547, - "flos": 67392622126080.0, - "grad_norm": 0.8163771270511553, - "language_loss": 0.59314513, - "learning_rate": 2.9292739140859125e-06, - "loss": 0.61357319, - "num_input_tokens_seen": 130742110, - "step": 6084, - "time_per_iteration": 3.3273561000823975 - }, - { - "auxiliary_loss_clip": 0.0109183, - "auxiliary_loss_mlp": 0.0104143, - "balance_loss_clip": 1.04496968, - "balance_loss_mlp": 1.02570593, - "epoch": 0.36584999248459343, - "flos": 20227511445120.0, - "grad_norm": 3.4329037043843478, - "language_loss": 0.72791892, - "learning_rate": 2.9289290276556767e-06, - "loss": 0.74925154, - "num_input_tokens_seen": 130759870, - "step": 6085, - "time_per_iteration": 2.7221856117248535 - }, - { - "auxiliary_loss_clip": 0.01101549, - "auxiliary_loss_mlp": 0.01038512, - "balance_loss_clip": 1.04982924, - "balance_loss_mlp": 1.02383745, - "epoch": 0.3659101157372614, - "flos": 19062461813760.0, - "grad_norm": 2.636651052815632, - "language_loss": 0.77860379, - "learning_rate": 2.9285841060003604e-06, - "loss": 0.80000436, - "num_input_tokens_seen": 130778510, - "step": 6086, - "time_per_iteration": 4.265977621078491 - }, - { - "auxiliary_loss_clip": 0.0111591, - "auxiliary_loss_mlp": 0.01032554, - "balance_loss_clip": 1.04616153, - "balance_loss_mlp": 1.01771855, - "epoch": 0.36597023898992936, - "flos": 30810708990720.0, - "grad_norm": 1.8562986050024126, - "language_loss": 0.76759315, - "learning_rate": 2.9282391491330416e-06, - "loss": 0.78907776, - "num_input_tokens_seen": 130798535, - "step": 6087, - "time_per_iteration": 4.227373123168945 - }, - { - "auxiliary_loss_clip": 0.01081855, - "auxiliary_loss_mlp": 0.01042282, - "balance_loss_clip": 1.04556108, - "balance_loss_mlp": 1.02589023, - "epoch": 0.36603036224259733, - "flos": 20521799573760.0, - "grad_norm": 2.2476274891892474, - "language_loss": 0.71063232, - "learning_rate": 2.9278941570668002e-06, - "loss": 0.73187363, - "num_input_tokens_seen": 130816655, - "step": 6088, - "time_per_iteration": 4.3080058097839355 - }, - { - "auxiliary_loss_clip": 0.01136094, - "auxiliary_loss_mlp": 0.01039702, - "balance_loss_clip": 1.05314517, - "balance_loss_mlp": 1.02267289, - "epoch": 0.3660904854952653, - "flos": 38329397798400.0, - "grad_norm": 1.6318023186273214, - "language_loss": 0.79717827, - "learning_rate": 2.92754912981472e-06, - "loss": 0.81893623, - "num_input_tokens_seen": 130841225, - "step": 6089, - "time_per_iteration": 2.782954216003418 - }, - { - "auxiliary_loss_clip": 0.01099767, - "auxiliary_loss_mlp": 0.01036428, - "balance_loss_clip": 1.04514015, - "balance_loss_mlp": 1.02220643, - "epoch": 0.36615060874793326, - "flos": 21835555511040.0, - "grad_norm": 2.0312735397290043, - "language_loss": 0.71617413, - "learning_rate": 2.927204067389884e-06, - "loss": 0.73753607, - "num_input_tokens_seen": 130861050, - "step": 6090, - "time_per_iteration": 2.7414958477020264 - }, - { - "auxiliary_loss_clip": 0.01105933, - "auxiliary_loss_mlp": 0.01047805, - "balance_loss_clip": 1.05133104, - "balance_loss_mlp": 1.03305852, - "epoch": 0.3662107320006012, - "flos": 16581537342720.0, - "grad_norm": 2.037307676788604, - "language_loss": 0.74434924, - "learning_rate": 2.9268589698053763e-06, - "loss": 0.7658866, - "num_input_tokens_seen": 130879775, - "step": 6091, - "time_per_iteration": 2.628554344177246 - }, - { - "auxiliary_loss_clip": 0.01076087, - "auxiliary_loss_mlp": 0.01042935, - "balance_loss_clip": 1.04836047, - "balance_loss_mlp": 1.02728868, - "epoch": 0.3662708552532692, - "flos": 20958365473920.0, - "grad_norm": 2.1960531931019682, - "language_loss": 0.73387206, - "learning_rate": 2.926513837074284e-06, - "loss": 0.75506234, - "num_input_tokens_seen": 130898070, - "step": 6092, - "time_per_iteration": 2.7320556640625 - }, - { - "auxiliary_loss_clip": 0.01127006, - "auxiliary_loss_mlp": 0.01044139, - "balance_loss_clip": 1.04809344, - "balance_loss_mlp": 1.02796876, - "epoch": 0.36633097850593715, - "flos": 21902707987200.0, - "grad_norm": 1.9967925590844784, - "language_loss": 0.77662504, - "learning_rate": 2.9261686692096942e-06, - "loss": 0.79833645, - "num_input_tokens_seen": 130915250, - "step": 6093, - "time_per_iteration": 2.721311092376709 - }, - { - "auxiliary_loss_clip": 0.01124005, - "auxiliary_loss_mlp": 0.01042053, - "balance_loss_clip": 1.04696584, - "balance_loss_mlp": 1.02686548, - "epoch": 0.3663911017586051, - "flos": 32854133808000.0, - "grad_norm": 1.926436620767835, - "language_loss": 0.7455743, - "learning_rate": 2.925823466224696e-06, - "loss": 0.76723486, - "num_input_tokens_seen": 130936995, - "step": 6094, - "time_per_iteration": 2.767188310623169 - }, - { - "auxiliary_loss_clip": 0.01142303, - "auxiliary_loss_mlp": 0.01055832, - "balance_loss_clip": 1.05334711, - "balance_loss_mlp": 1.03969133, - "epoch": 0.3664512250112731, - "flos": 27271748482560.0, - "grad_norm": 1.743331442809004, - "language_loss": 0.79444361, - "learning_rate": 2.9254782281323785e-06, - "loss": 0.81642497, - "num_input_tokens_seen": 130957970, - "step": 6095, - "time_per_iteration": 2.718632459640503 - }, - { - "auxiliary_loss_clip": 0.01118218, - "auxiliary_loss_mlp": 0.00774719, - "balance_loss_clip": 1.05141842, - "balance_loss_mlp": 1.00037265, - "epoch": 0.3665113482639411, - "flos": 17784436930560.0, - "grad_norm": 3.4988865885900178, - "language_loss": 0.73592722, - "learning_rate": 2.925132954945834e-06, - "loss": 0.75485659, - "num_input_tokens_seen": 130974915, - "step": 6096, - "time_per_iteration": 2.674382448196411 - }, - { - "auxiliary_loss_clip": 0.01099743, - "auxiliary_loss_mlp": 0.01038971, - "balance_loss_clip": 1.04458702, - "balance_loss_mlp": 1.02355742, - "epoch": 0.36657147151660907, - "flos": 27854614477440.0, - "grad_norm": 2.41624095312735, - "language_loss": 0.67081815, - "learning_rate": 2.924787646678155e-06, - "loss": 0.69220531, - "num_input_tokens_seen": 130995745, - "step": 6097, - "time_per_iteration": 2.789118766784668 - }, - { - "auxiliary_loss_clip": 0.01077673, - "auxiliary_loss_mlp": 0.01038362, - "balance_loss_clip": 1.04489172, - "balance_loss_mlp": 1.02268624, - "epoch": 0.36663159476927704, - "flos": 25374013228800.0, - "grad_norm": 1.4796406838499911, - "language_loss": 0.77679402, - "learning_rate": 2.9244423033424365e-06, - "loss": 0.79795432, - "num_input_tokens_seen": 131015545, - "step": 6098, - "time_per_iteration": 2.7803733348846436 - }, - { - "auxiliary_loss_clip": 0.01122346, - "auxiliary_loss_mlp": 0.01045291, - "balance_loss_clip": 1.04734826, - "balance_loss_mlp": 1.02987766, - "epoch": 0.366691718021945, - "flos": 21357225072000.0, - "grad_norm": 1.744595499322522, - "language_loss": 0.73707491, - "learning_rate": 2.9240969249517723e-06, - "loss": 0.75875127, - "num_input_tokens_seen": 131033990, - "step": 6099, - "time_per_iteration": 2.6809163093566895 - }, - { - "auxiliary_loss_clip": 0.01111202, - "auxiliary_loss_mlp": 0.01044256, - "balance_loss_clip": 1.04759586, - "balance_loss_mlp": 1.02931285, - "epoch": 0.36675184127461297, - "flos": 16800376953600.0, - "grad_norm": 1.8475933970370078, - "language_loss": 0.84773195, - "learning_rate": 2.9237515115192602e-06, - "loss": 0.86928654, - "num_input_tokens_seen": 131050710, - "step": 6100, - "time_per_iteration": 2.6730356216430664 - }, - { - "auxiliary_loss_clip": 0.01102438, - "auxiliary_loss_mlp": 0.01037575, - "balance_loss_clip": 1.04448223, - "balance_loss_mlp": 1.02181566, - "epoch": 0.36681196452728093, - "flos": 21906514828800.0, - "grad_norm": 3.9532097547953104, - "language_loss": 0.70893979, - "learning_rate": 2.9234060630579992e-06, - "loss": 0.73033994, - "num_input_tokens_seen": 131071435, - "step": 6101, - "time_per_iteration": 2.7369589805603027 - }, - { - "auxiliary_loss_clip": 0.01111262, - "auxiliary_loss_mlp": 0.01052791, - "balance_loss_clip": 1.05096185, - "balance_loss_mlp": 1.0361371, - "epoch": 0.3668720877799489, - "flos": 17712436118400.0, - "grad_norm": 2.286737474315047, - "language_loss": 0.76634502, - "learning_rate": 2.9230605795810865e-06, - "loss": 0.7879855, - "num_input_tokens_seen": 131088775, - "step": 6102, - "time_per_iteration": 2.7081708908081055 - }, - { - "auxiliary_loss_clip": 0.01131629, - "auxiliary_loss_mlp": 0.01037373, - "balance_loss_clip": 1.0524683, - "balance_loss_mlp": 1.02050483, - "epoch": 0.36693221103261686, - "flos": 47045455499520.0, - "grad_norm": 4.369253140908342, - "language_loss": 0.70019859, - "learning_rate": 2.922715061101625e-06, - "loss": 0.72188866, - "num_input_tokens_seen": 131112800, - "step": 6103, - "time_per_iteration": 2.8610281944274902 - }, - { - "auxiliary_loss_clip": 0.01093091, - "auxiliary_loss_mlp": 0.0103895, - "balance_loss_clip": 1.04730344, - "balance_loss_mlp": 1.02283263, - "epoch": 0.3669923342852848, - "flos": 15960929132160.0, - "grad_norm": 3.0883152470965842, - "language_loss": 0.72272754, - "learning_rate": 2.922369507632716e-06, - "loss": 0.744048, - "num_input_tokens_seen": 131131150, - "step": 6104, - "time_per_iteration": 2.7520432472229004 - }, - { - "auxiliary_loss_clip": 0.01127975, - "auxiliary_loss_mlp": 0.01036046, - "balance_loss_clip": 1.05017686, - "balance_loss_mlp": 1.01940393, - "epoch": 0.3670524575379528, - "flos": 19974485064960.0, - "grad_norm": 2.1608886453477947, - "language_loss": 0.81461251, - "learning_rate": 2.9220239191874617e-06, - "loss": 0.83625269, - "num_input_tokens_seen": 131150365, - "step": 6105, - "time_per_iteration": 2.7565362453460693 - }, - { - "auxiliary_loss_clip": 0.0114363, - "auxiliary_loss_mlp": 0.01041522, - "balance_loss_clip": 1.05170739, - "balance_loss_mlp": 1.02526236, - "epoch": 0.36711258079062076, - "flos": 25702955003520.0, - "grad_norm": 1.7202629897198451, - "language_loss": 0.81035495, - "learning_rate": 2.9216782957789692e-06, - "loss": 0.83220649, - "num_input_tokens_seen": 131169310, - "step": 6106, - "time_per_iteration": 2.73502779006958 - }, - { - "auxiliary_loss_clip": 0.01035121, - "auxiliary_loss_mlp": 0.00753905, - "balance_loss_clip": 1.03131676, - "balance_loss_mlp": 1.00104892, - "epoch": 0.3671727040432887, - "flos": 60772743342720.0, - "grad_norm": 0.6921927745874564, - "language_loss": 0.59176284, - "learning_rate": 2.9213326374203426e-06, - "loss": 0.60965312, - "num_input_tokens_seen": 131232900, - "step": 6107, - "time_per_iteration": 3.2754647731781006 - }, - { - "auxiliary_loss_clip": 0.01111272, - "auxiliary_loss_mlp": 0.01035704, - "balance_loss_clip": 1.04770529, - "balance_loss_mlp": 1.02058864, - "epoch": 0.3672328272959567, - "flos": 18661303745280.0, - "grad_norm": 1.8102661289525128, - "language_loss": 0.74492711, - "learning_rate": 2.92098694412469e-06, - "loss": 0.76639688, - "num_input_tokens_seen": 131250920, - "step": 6108, - "time_per_iteration": 2.730562448501587 - }, - { - "auxiliary_loss_clip": 0.01129123, - "auxiliary_loss_mlp": 0.01037704, - "balance_loss_clip": 1.04957151, - "balance_loss_mlp": 1.02196801, - "epoch": 0.3672929505486247, - "flos": 15049049535360.0, - "grad_norm": 2.04949693656995, - "language_loss": 0.72790694, - "learning_rate": 2.9206412159051213e-06, - "loss": 0.7495752, - "num_input_tokens_seen": 131267910, - "step": 6109, - "time_per_iteration": 2.6488542556762695 - }, - { - "auxiliary_loss_clip": 0.01065451, - "auxiliary_loss_mlp": 0.01040533, - "balance_loss_clip": 1.04156637, - "balance_loss_mlp": 1.02426052, - "epoch": 0.3673530738012927, - "flos": 20589347099520.0, - "grad_norm": 4.856830375229604, - "language_loss": 0.53295934, - "learning_rate": 2.920295452774744e-06, - "loss": 0.55401909, - "num_input_tokens_seen": 131287150, - "step": 6110, - "time_per_iteration": 2.8366596698760986 - }, - { - "auxiliary_loss_clip": 0.01123878, - "auxiliary_loss_mlp": 0.01039006, - "balance_loss_clip": 1.04783487, - "balance_loss_mlp": 1.02253747, - "epoch": 0.36741319705396064, - "flos": 21689830033920.0, - "grad_norm": 1.6516494205850427, - "language_loss": 0.80507129, - "learning_rate": 2.919949654746672e-06, - "loss": 0.82670015, - "num_input_tokens_seen": 131308225, - "step": 6111, - "time_per_iteration": 2.7537708282470703 - }, - { - "auxiliary_loss_clip": 0.01083524, - "auxiliary_loss_mlp": 0.01044306, - "balance_loss_clip": 1.04381704, - "balance_loss_mlp": 1.02897525, - "epoch": 0.3674733203066286, - "flos": 29862200499840.0, - "grad_norm": 1.7980410764958656, - "language_loss": 0.72401643, - "learning_rate": 2.9196038218340163e-06, - "loss": 0.74529469, - "num_input_tokens_seen": 131332115, - "step": 6112, - "time_per_iteration": 2.80513858795166 - }, - { - "auxiliary_loss_clip": 0.0112775, - "auxiliary_loss_mlp": 0.01046215, - "balance_loss_clip": 1.05025816, - "balance_loss_mlp": 1.03102732, - "epoch": 0.36753344355929657, - "flos": 18257021193600.0, - "grad_norm": 1.6179233760027578, - "language_loss": 0.8539387, - "learning_rate": 2.919257954049892e-06, - "loss": 0.8756783, - "num_input_tokens_seen": 131351885, - "step": 6113, - "time_per_iteration": 2.6997315883636475 - }, - { - "auxiliary_loss_clip": 0.01128342, - "auxiliary_loss_mlp": 0.01041644, - "balance_loss_clip": 1.04813516, - "balance_loss_mlp": 1.02512193, - "epoch": 0.36759356681196453, - "flos": 25301150490240.0, - "grad_norm": 2.2420277636185872, - "language_loss": 0.78542709, - "learning_rate": 2.918912051407413e-06, - "loss": 0.807127, - "num_input_tokens_seen": 131370245, - "step": 6114, - "time_per_iteration": 2.694831609725952 - }, - { - "auxiliary_loss_clip": 0.01133627, - "auxiliary_loss_mlp": 0.01044455, - "balance_loss_clip": 1.05145383, - "balance_loss_mlp": 1.02612031, - "epoch": 0.3676536900646325, - "flos": 21032952065280.0, - "grad_norm": 1.6750895304816946, - "language_loss": 0.67368686, - "learning_rate": 2.918566113919698e-06, - "loss": 0.69546771, - "num_input_tokens_seen": 131388115, - "step": 6115, - "time_per_iteration": 2.6966724395751953 - }, - { - "auxiliary_loss_clip": 0.01104674, - "auxiliary_loss_mlp": 0.01037383, - "balance_loss_clip": 1.04332471, - "balance_loss_mlp": 1.02229142, - "epoch": 0.36771381331730046, - "flos": 16288506190080.0, - "grad_norm": 3.500949938115168, - "language_loss": 0.76685899, - "learning_rate": 2.9182201415998636e-06, - "loss": 0.78827953, - "num_input_tokens_seen": 131404595, - "step": 6116, - "time_per_iteration": 2.6796109676361084 - }, - { - "auxiliary_loss_clip": 0.01088778, - "auxiliary_loss_mlp": 0.01043047, - "balance_loss_clip": 1.04433835, - "balance_loss_mlp": 1.02729988, - "epoch": 0.36777393656996843, - "flos": 22309971367680.0, - "grad_norm": 1.7533988300226562, - "language_loss": 0.62997502, - "learning_rate": 2.9178741344610286e-06, - "loss": 0.65129328, - "num_input_tokens_seen": 131423760, - "step": 6117, - "time_per_iteration": 2.7784011363983154 - }, - { - "auxiliary_loss_clip": 0.01103848, - "auxiliary_loss_mlp": 0.01037351, - "balance_loss_clip": 1.04275632, - "balance_loss_mlp": 1.0210557, - "epoch": 0.3678340598226364, - "flos": 26834069260800.0, - "grad_norm": 1.9867834860772036, - "language_loss": 0.73087811, - "learning_rate": 2.9175280925163156e-06, - "loss": 0.75229007, - "num_input_tokens_seen": 131444955, - "step": 6118, - "time_per_iteration": 2.734731674194336 - }, - { - "auxiliary_loss_clip": 0.01132746, - "auxiliary_loss_mlp": 0.01043898, - "balance_loss_clip": 1.05198336, - "balance_loss_mlp": 1.0266242, - "epoch": 0.36789418307530436, - "flos": 21761723105280.0, - "grad_norm": 2.319960114880422, - "language_loss": 0.72638988, - "learning_rate": 2.9171820157788445e-06, - "loss": 0.74815631, - "num_input_tokens_seen": 131465720, - "step": 6119, - "time_per_iteration": 2.7073371410369873 - }, - { - "auxiliary_loss_clip": 0.0111183, - "auxiliary_loss_mlp": 0.01037904, - "balance_loss_clip": 1.04830384, - "balance_loss_mlp": 1.02101171, - "epoch": 0.3679543063279723, - "flos": 15924192497280.0, - "grad_norm": 1.9587818101138383, - "language_loss": 0.80524689, - "learning_rate": 2.9168359042617404e-06, - "loss": 0.8267442, - "num_input_tokens_seen": 131483080, - "step": 6120, - "time_per_iteration": 2.679933547973633 - }, - { - "auxiliary_loss_clip": 0.01093981, - "auxiliary_loss_mlp": 0.0104441, - "balance_loss_clip": 1.04785204, - "balance_loss_mlp": 1.02894819, - "epoch": 0.3680144295806403, - "flos": 24275541456000.0, - "grad_norm": 2.4092121945194496, - "language_loss": 0.64745319, - "learning_rate": 2.916489757978126e-06, - "loss": 0.66883707, - "num_input_tokens_seen": 131502545, - "step": 6121, - "time_per_iteration": 2.7067880630493164 - }, - { - "auxiliary_loss_clip": 0.01126101, - "auxiliary_loss_mlp": 0.01043212, - "balance_loss_clip": 1.05021691, - "balance_loss_mlp": 1.02735114, - "epoch": 0.36807455283330826, - "flos": 26104148985600.0, - "grad_norm": 1.774708172393826, - "language_loss": 0.71686751, - "learning_rate": 2.9161435769411286e-06, - "loss": 0.73856068, - "num_input_tokens_seen": 131522155, - "step": 6122, - "time_per_iteration": 4.026647329330444 - }, - { - "auxiliary_loss_clip": 0.01106964, - "auxiliary_loss_mlp": 0.01043545, - "balance_loss_clip": 1.04859734, - "balance_loss_mlp": 1.0265938, - "epoch": 0.3681346760859763, - "flos": 24644990793600.0, - "grad_norm": 5.6855406070233245, - "language_loss": 0.69653022, - "learning_rate": 2.915797361163875e-06, - "loss": 0.71803534, - "num_input_tokens_seen": 131543865, - "step": 6123, - "time_per_iteration": 2.7548627853393555 - }, - { - "auxiliary_loss_clip": 0.01128204, - "auxiliary_loss_mlp": 0.01040578, - "balance_loss_clip": 1.04822993, - "balance_loss_mlp": 1.02251744, - "epoch": 0.36819479933864424, - "flos": 23878369797120.0, - "grad_norm": 7.022932421262019, - "language_loss": 0.73640841, - "learning_rate": 2.9154511106594933e-06, - "loss": 0.75809622, - "num_input_tokens_seen": 131562155, - "step": 6124, - "time_per_iteration": 2.6710870265960693 - }, - { - "auxiliary_loss_clip": 0.01116833, - "auxiliary_loss_mlp": 0.01045789, - "balance_loss_clip": 1.04977059, - "balance_loss_mlp": 1.02809882, - "epoch": 0.3682549225913122, - "flos": 25553997302400.0, - "grad_norm": 1.931714997280456, - "language_loss": 0.74334198, - "learning_rate": 2.915104825441114e-06, - "loss": 0.76496822, - "num_input_tokens_seen": 131581695, - "step": 6125, - "time_per_iteration": 4.175686359405518 - }, - { - "auxiliary_loss_clip": 0.01132649, - "auxiliary_loss_mlp": 0.01053205, - "balance_loss_clip": 1.05193818, - "balance_loss_mlp": 1.03514445, - "epoch": 0.36831504584398017, - "flos": 16946605221120.0, - "grad_norm": 1.8318884745506827, - "language_loss": 0.78127813, - "learning_rate": 2.9147585055218686e-06, - "loss": 0.80313659, - "num_input_tokens_seen": 131599465, - "step": 6126, - "time_per_iteration": 2.6783266067504883 - }, - { - "auxiliary_loss_clip": 0.01128437, - "auxiliary_loss_mlp": 0.01045021, - "balance_loss_clip": 1.0490706, - "balance_loss_mlp": 1.02659082, - "epoch": 0.36837516909664814, - "flos": 19865065259520.0, - "grad_norm": 2.7490159956422575, - "language_loss": 0.66118228, - "learning_rate": 2.914412150914888e-06, - "loss": 0.68291688, - "num_input_tokens_seen": 131618330, - "step": 6127, - "time_per_iteration": 4.20530891418457 - }, - { - "auxiliary_loss_clip": 0.01120142, - "auxiliary_loss_mlp": 0.01046706, - "balance_loss_clip": 1.05205703, - "balance_loss_mlp": 1.02980185, - "epoch": 0.3684352923493161, - "flos": 37626984362880.0, - "grad_norm": 1.8515813315176315, - "language_loss": 0.70152593, - "learning_rate": 2.9140657616333074e-06, - "loss": 0.72319436, - "num_input_tokens_seen": 131638960, - "step": 6128, - "time_per_iteration": 4.498606204986572 - }, - { - "auxiliary_loss_clip": 0.0112131, - "auxiliary_loss_mlp": 0.01046424, - "balance_loss_clip": 1.05264103, - "balance_loss_mlp": 1.02957964, - "epoch": 0.36849541560198407, - "flos": 14465501182080.0, - "grad_norm": 2.3245894967836698, - "language_loss": 0.75067866, - "learning_rate": 2.9137193376902614e-06, - "loss": 0.77235603, - "num_input_tokens_seen": 131657440, - "step": 6129, - "time_per_iteration": 2.6874284744262695 - }, - { - "auxiliary_loss_clip": 0.01118674, - "auxiliary_loss_mlp": 0.01040759, - "balance_loss_clip": 1.04533887, - "balance_loss_mlp": 1.02403355, - "epoch": 0.36855553885465203, - "flos": 25770753924480.0, - "grad_norm": 1.6533761140504426, - "language_loss": 0.84758681, - "learning_rate": 2.9133728790988868e-06, - "loss": 0.86918116, - "num_input_tokens_seen": 131678035, - "step": 6130, - "time_per_iteration": 2.729963541030884 - }, - { - "auxiliary_loss_clip": 0.0102639, - "auxiliary_loss_mlp": 0.01017875, - "balance_loss_clip": 1.02295637, - "balance_loss_mlp": 1.01620567, - "epoch": 0.36861566210732, - "flos": 65049417377280.0, - "grad_norm": 0.8481176099425293, - "language_loss": 0.60254776, - "learning_rate": 2.913026385872321e-06, - "loss": 0.62299049, - "num_input_tokens_seen": 131742470, - "step": 6131, - "time_per_iteration": 3.2806124687194824 - }, - { - "auxiliary_loss_clip": 0.01097122, - "auxiliary_loss_mlp": 0.01035652, - "balance_loss_clip": 1.04542315, - "balance_loss_mlp": 1.01914179, - "epoch": 0.36867578535998796, - "flos": 30954495133440.0, - "grad_norm": 1.5587449528822306, - "language_loss": 0.73085582, - "learning_rate": 2.9126798580237034e-06, - "loss": 0.75218356, - "num_input_tokens_seen": 131764570, - "step": 6132, - "time_per_iteration": 2.781385898590088 - }, - { - "auxiliary_loss_clip": 0.01127214, - "auxiliary_loss_mlp": 0.01039387, - "balance_loss_clip": 1.04795551, - "balance_loss_mlp": 1.02187514, - "epoch": 0.3687359086126559, - "flos": 28837956182400.0, - "grad_norm": 1.9292425463255205, - "language_loss": 0.74192035, - "learning_rate": 2.9123332955661736e-06, - "loss": 0.76358628, - "num_input_tokens_seen": 131785720, - "step": 6133, - "time_per_iteration": 2.718660831451416 - }, - { - "auxiliary_loss_clip": 0.01072831, - "auxiliary_loss_mlp": 0.01049093, - "balance_loss_clip": 1.041502, - "balance_loss_mlp": 1.03042495, - "epoch": 0.3687960318653239, - "flos": 21396798881280.0, - "grad_norm": 1.8863128538280483, - "language_loss": 0.71522588, - "learning_rate": 2.911986698512874e-06, - "loss": 0.73644507, - "num_input_tokens_seen": 131804430, - "step": 6134, - "time_per_iteration": 2.8003294467926025 - }, - { - "auxiliary_loss_clip": 0.01102901, - "auxiliary_loss_mlp": 0.01034768, - "balance_loss_clip": 1.0472008, - "balance_loss_mlp": 1.01838863, - "epoch": 0.36885615511799186, - "flos": 20266043760000.0, - "grad_norm": 1.6906065874809195, - "language_loss": 0.75386798, - "learning_rate": 2.9116400668769477e-06, - "loss": 0.77524465, - "num_input_tokens_seen": 131822060, - "step": 6135, - "time_per_iteration": 2.7916624546051025 - }, - { - "auxiliary_loss_clip": 0.01030435, - "auxiliary_loss_mlp": 0.01019879, - "balance_loss_clip": 1.0281316, - "balance_loss_mlp": 1.01760185, - "epoch": 0.3689162783706599, - "flos": 63088836301440.0, - "grad_norm": 0.8159837123545765, - "language_loss": 0.58766222, - "learning_rate": 2.9112934006715376e-06, - "loss": 0.60816532, - "num_input_tokens_seen": 131880715, - "step": 6136, - "time_per_iteration": 3.2766408920288086 - }, - { - "auxiliary_loss_clip": 0.01106354, - "auxiliary_loss_mlp": 0.01043903, - "balance_loss_clip": 1.04497695, - "balance_loss_mlp": 1.02723718, - "epoch": 0.36897640162332784, - "flos": 10961984419200.0, - "grad_norm": 2.3780452593473393, - "language_loss": 0.79126394, - "learning_rate": 2.9109466999097918e-06, - "loss": 0.81276655, - "num_input_tokens_seen": 131895850, - "step": 6137, - "time_per_iteration": 2.8411052227020264 - }, - { - "auxiliary_loss_clip": 0.011261, - "auxiliary_loss_mlp": 0.01043272, - "balance_loss_clip": 1.04803205, - "balance_loss_mlp": 1.02645159, - "epoch": 0.3690365248759958, - "flos": 20704297599360.0, - "grad_norm": 2.0312275113078337, - "language_loss": 0.7454071, - "learning_rate": 2.9105999646048552e-06, - "loss": 0.76710081, - "num_input_tokens_seen": 131915775, - "step": 6138, - "time_per_iteration": 2.7210230827331543 - }, - { - "auxiliary_loss_clip": 0.01090918, - "auxiliary_loss_mlp": 0.01042472, - "balance_loss_clip": 1.04320955, - "balance_loss_mlp": 1.0259856, - "epoch": 0.3690966481286638, - "flos": 31826369957760.0, - "grad_norm": 2.0947758027881767, - "language_loss": 0.64676917, - "learning_rate": 2.9102531947698764e-06, - "loss": 0.66810304, - "num_input_tokens_seen": 131935715, - "step": 6139, - "time_per_iteration": 2.8667304515838623 - }, - { - "auxiliary_loss_clip": 0.01095075, - "auxiliary_loss_mlp": 0.01042873, - "balance_loss_clip": 1.04443955, - "balance_loss_mlp": 1.02646971, - "epoch": 0.36915677138133174, - "flos": 13114936782720.0, - "grad_norm": 2.1146776737326998, - "language_loss": 0.71764016, - "learning_rate": 2.909906390418006e-06, - "loss": 0.73901963, - "num_input_tokens_seen": 131954120, - "step": 6140, - "time_per_iteration": 2.718100070953369 - }, - { - "auxiliary_loss_clip": 0.01017799, - "auxiliary_loss_mlp": 0.01004631, - "balance_loss_clip": 1.02079976, - "balance_loss_mlp": 1.00281894, - "epoch": 0.3692168946339997, - "flos": 68686879956480.0, - "grad_norm": 0.7503567012350645, - "language_loss": 0.59252203, - "learning_rate": 2.9095595515623934e-06, - "loss": 0.61274636, - "num_input_tokens_seen": 132017485, - "step": 6141, - "time_per_iteration": 3.3003833293914795 - }, - { - "auxiliary_loss_clip": 0.01122088, - "auxiliary_loss_mlp": 0.01040836, - "balance_loss_clip": 1.04716861, - "balance_loss_mlp": 1.02458787, - "epoch": 0.36927701788666767, - "flos": 22017873968640.0, - "grad_norm": 1.900744005055956, - "language_loss": 0.75374687, - "learning_rate": 2.909212678216192e-06, - "loss": 0.77537608, - "num_input_tokens_seen": 132036760, - "step": 6142, - "time_per_iteration": 2.707676410675049 - }, - { - "auxiliary_loss_clip": 0.01122008, - "auxiliary_loss_mlp": 0.01037683, - "balance_loss_clip": 1.04708242, - "balance_loss_mlp": 1.02276349, - "epoch": 0.36933714113933563, - "flos": 21835591424640.0, - "grad_norm": 2.0868371024046346, - "language_loss": 0.77474618, - "learning_rate": 2.908865770392555e-06, - "loss": 0.79634303, - "num_input_tokens_seen": 132056935, - "step": 6143, - "time_per_iteration": 2.6308929920196533 - }, - { - "auxiliary_loss_clip": 0.01122961, - "auxiliary_loss_mlp": 0.01033227, - "balance_loss_clip": 1.04840302, - "balance_loss_mlp": 1.01860011, - "epoch": 0.3693972643920036, - "flos": 23691705793920.0, - "grad_norm": 2.7754530777388555, - "language_loss": 0.82127941, - "learning_rate": 2.9085188281046364e-06, - "loss": 0.84284127, - "num_input_tokens_seen": 132077285, - "step": 6144, - "time_per_iteration": 2.7094409465789795 - }, - { - "auxiliary_loss_clip": 0.01126238, - "auxiliary_loss_mlp": 0.01040495, - "balance_loss_clip": 1.0479883, - "balance_loss_mlp": 1.02547419, - "epoch": 0.36945738764467156, - "flos": 22856747172480.0, - "grad_norm": 2.260022101229928, - "language_loss": 0.774791, - "learning_rate": 2.908171851365593e-06, - "loss": 0.79645836, - "num_input_tokens_seen": 132095520, - "step": 6145, - "time_per_iteration": 2.6951241493225098 - }, - { - "auxiliary_loss_clip": 0.01120499, - "auxiliary_loss_mlp": 0.01030806, - "balance_loss_clip": 1.04903388, - "balance_loss_mlp": 1.01503491, - "epoch": 0.36951751089733953, - "flos": 16615939593600.0, - "grad_norm": 2.2611713814894423, - "language_loss": 0.76861286, - "learning_rate": 2.9078248401885815e-06, - "loss": 0.79012597, - "num_input_tokens_seen": 132112810, - "step": 6146, - "time_per_iteration": 2.6205246448516846 - }, - { - "auxiliary_loss_clip": 0.0110988, - "auxiliary_loss_mlp": 0.01042802, - "balance_loss_clip": 1.04717457, - "balance_loss_mlp": 1.02518249, - "epoch": 0.3695776341500075, - "flos": 18914545607040.0, - "grad_norm": 3.3549376840260394, - "language_loss": 0.80945081, - "learning_rate": 2.907477794586761e-06, - "loss": 0.83097762, - "num_input_tokens_seen": 132131615, - "step": 6147, - "time_per_iteration": 2.7176942825317383 - }, - { - "auxiliary_loss_clip": 0.01108097, - "auxiliary_loss_mlp": 0.00773519, - "balance_loss_clip": 1.05041718, - "balance_loss_mlp": 1.00029731, - "epoch": 0.36963775740267546, - "flos": 20808474019200.0, - "grad_norm": 1.8104892137163535, - "language_loss": 0.83325249, - "learning_rate": 2.9071307145732926e-06, - "loss": 0.85206866, - "num_input_tokens_seen": 132149585, - "step": 6148, - "time_per_iteration": 2.7764229774475098 - }, - { - "auxiliary_loss_clip": 0.01121751, - "auxiliary_loss_mlp": 0.01033697, - "balance_loss_clip": 1.04946411, - "balance_loss_mlp": 1.01843238, - "epoch": 0.3696978806553435, - "flos": 26061881656320.0, - "grad_norm": 2.472295207741171, - "language_loss": 0.74167144, - "learning_rate": 2.9067836001613357e-06, - "loss": 0.76322597, - "num_input_tokens_seen": 132165555, - "step": 6149, - "time_per_iteration": 2.729785680770874 - }, - { - "auxiliary_loss_clip": 0.01141043, - "auxiliary_loss_mlp": 0.01040796, - "balance_loss_clip": 1.0524776, - "balance_loss_mlp": 1.02347541, - "epoch": 0.36975800390801145, - "flos": 26833925606400.0, - "grad_norm": 2.18045381202803, - "language_loss": 0.71229833, - "learning_rate": 2.906436451364054e-06, - "loss": 0.73411667, - "num_input_tokens_seen": 132185100, - "step": 6150, - "time_per_iteration": 2.6558914184570312 - }, - { - "auxiliary_loss_clip": 0.01112432, - "auxiliary_loss_mlp": 0.0104236, - "balance_loss_clip": 1.04834723, - "balance_loss_mlp": 1.02634454, - "epoch": 0.3698181271606794, - "flos": 21142623265920.0, - "grad_norm": 2.1283605732632487, - "language_loss": 0.82001126, - "learning_rate": 2.906089268194611e-06, - "loss": 0.84155917, - "num_input_tokens_seen": 132203930, - "step": 6151, - "time_per_iteration": 2.811908483505249 - }, - { - "auxiliary_loss_clip": 0.0104085, - "auxiliary_loss_mlp": 0.01012111, - "balance_loss_clip": 1.02895284, - "balance_loss_mlp": 1.01035905, - "epoch": 0.3698782504133474, - "flos": 66742639568640.0, - "grad_norm": 0.8434423047890295, - "language_loss": 0.63103437, - "learning_rate": 2.9057420506661726e-06, - "loss": 0.651564, - "num_input_tokens_seen": 132263845, - "step": 6152, - "time_per_iteration": 3.283348798751831 - }, - { - "auxiliary_loss_clip": 0.01083912, - "auxiliary_loss_mlp": 0.01046371, - "balance_loss_clip": 1.04603028, - "balance_loss_mlp": 1.02939606, - "epoch": 0.36993837366601534, - "flos": 24311523905280.0, - "grad_norm": 2.101714417244525, - "language_loss": 0.70249707, - "learning_rate": 2.9053947987919044e-06, - "loss": 0.72379988, - "num_input_tokens_seen": 132282350, - "step": 6153, - "time_per_iteration": 2.776003837585449 - }, - { - "auxiliary_loss_clip": 0.01126735, - "auxiliary_loss_mlp": 0.01038393, - "balance_loss_clip": 1.04984677, - "balance_loss_mlp": 1.02176309, - "epoch": 0.3699984969186833, - "flos": 24349194293760.0, - "grad_norm": 1.5983560512083512, - "language_loss": 0.72364891, - "learning_rate": 2.9050475125849755e-06, - "loss": 0.74530017, - "num_input_tokens_seen": 132301930, - "step": 6154, - "time_per_iteration": 2.7031455039978027 - }, - { - "auxiliary_loss_clip": 0.01108862, - "auxiliary_loss_mlp": 0.01038947, - "balance_loss_clip": 1.04792106, - "balance_loss_mlp": 1.02376008, - "epoch": 0.37005862017135127, - "flos": 19829154637440.0, - "grad_norm": 1.6579101756116525, - "language_loss": 0.67716074, - "learning_rate": 2.9047001920585534e-06, - "loss": 0.6986388, - "num_input_tokens_seen": 132320915, - "step": 6155, - "time_per_iteration": 2.7716591358184814 - }, - { - "auxiliary_loss_clip": 0.01124062, - "auxiliary_loss_mlp": 0.01032665, - "balance_loss_clip": 1.04789114, - "balance_loss_mlp": 1.0171442, - "epoch": 0.37011874342401924, - "flos": 19573793873280.0, - "grad_norm": 1.797024775246088, - "language_loss": 0.68048114, - "learning_rate": 2.9043528372258097e-06, - "loss": 0.70204842, - "num_input_tokens_seen": 132340415, - "step": 6156, - "time_per_iteration": 2.7830615043640137 - }, - { - "auxiliary_loss_clip": 0.01109781, - "auxiliary_loss_mlp": 0.0103684, - "balance_loss_clip": 1.04603815, - "balance_loss_mlp": 1.02202225, - "epoch": 0.3701788666766872, - "flos": 20374350243840.0, - "grad_norm": 1.8485807917443284, - "language_loss": 0.82232833, - "learning_rate": 2.904005448099916e-06, - "loss": 0.84379458, - "num_input_tokens_seen": 132358600, - "step": 6157, - "time_per_iteration": 2.676429033279419 - }, - { - "auxiliary_loss_clip": 0.01087924, - "auxiliary_loss_mlp": 0.01042208, - "balance_loss_clip": 1.04360199, - "balance_loss_mlp": 1.02474344, - "epoch": 0.37023898992935517, - "flos": 15340931452800.0, - "grad_norm": 2.2992188770836175, - "language_loss": 0.76899838, - "learning_rate": 2.9036580246940444e-06, - "loss": 0.79029977, - "num_input_tokens_seen": 132373160, - "step": 6158, - "time_per_iteration": 2.7764365673065186 - }, - { - "auxiliary_loss_clip": 0.01138492, - "auxiliary_loss_mlp": 0.01037057, - "balance_loss_clip": 1.0489651, - "balance_loss_mlp": 1.01997483, - "epoch": 0.37029911318202313, - "flos": 19573937527680.0, - "grad_norm": 2.8360595009252196, - "language_loss": 0.68930852, - "learning_rate": 2.9033105670213708e-06, - "loss": 0.71106398, - "num_input_tokens_seen": 132392345, - "step": 6159, - "time_per_iteration": 2.664858818054199 - }, - { - "auxiliary_loss_clip": 0.01110756, - "auxiliary_loss_mlp": 0.01035031, - "balance_loss_clip": 1.049088, - "balance_loss_mlp": 1.02067792, - "epoch": 0.3703592364346911, - "flos": 26213353309440.0, - "grad_norm": 2.9956624327703523, - "language_loss": 0.71067882, - "learning_rate": 2.9029630750950697e-06, - "loss": 0.73213673, - "num_input_tokens_seen": 132412620, - "step": 6160, - "time_per_iteration": 2.757081985473633 - }, - { - "auxiliary_loss_clip": 0.01106906, - "auxiliary_loss_mlp": 0.01033059, - "balance_loss_clip": 1.04698467, - "balance_loss_mlp": 1.01918936, - "epoch": 0.37041935968735906, - "flos": 20048317470720.0, - "grad_norm": 2.0439504076987403, - "language_loss": 0.79205775, - "learning_rate": 2.9026155489283176e-06, - "loss": 0.81345737, - "num_input_tokens_seen": 132431570, - "step": 6161, - "time_per_iteration": 2.8008711338043213 - }, - { - "auxiliary_loss_clip": 0.01136197, - "auxiliary_loss_mlp": 0.01038947, - "balance_loss_clip": 1.04960537, - "balance_loss_mlp": 1.02284193, - "epoch": 0.3704794829400271, - "flos": 24133802388480.0, - "grad_norm": 2.0425786778899058, - "language_loss": 0.79665029, - "learning_rate": 2.902267988534295e-06, - "loss": 0.81840169, - "num_input_tokens_seen": 132451525, - "step": 6162, - "time_per_iteration": 4.2554450035095215 - }, - { - "auxiliary_loss_clip": 0.01107039, - "auxiliary_loss_mlp": 0.00773743, - "balance_loss_clip": 1.0442729, - "balance_loss_mlp": 1.00038123, - "epoch": 0.37053960619269505, - "flos": 14866874732160.0, - "grad_norm": 2.0272159369395193, - "language_loss": 0.79314882, - "learning_rate": 2.9019203939261783e-06, - "loss": 0.81195664, - "num_input_tokens_seen": 132469875, - "step": 6163, - "time_per_iteration": 2.753324508666992 - }, - { - "auxiliary_loss_clip": 0.0112147, - "auxiliary_loss_mlp": 0.01039825, - "balance_loss_clip": 1.04676855, - "balance_loss_mlp": 1.02351689, - "epoch": 0.370599729445363, - "flos": 21361498790400.0, - "grad_norm": 1.847799951808159, - "language_loss": 0.67843366, - "learning_rate": 2.9015727651171507e-06, - "loss": 0.7000466, - "num_input_tokens_seen": 132488360, - "step": 6164, - "time_per_iteration": 2.7885541915893555 - }, - { - "auxiliary_loss_clip": 0.01109766, - "auxiliary_loss_mlp": 0.01045808, - "balance_loss_clip": 1.04918885, - "balance_loss_mlp": 1.02877307, - "epoch": 0.370659852698031, - "flos": 26829041356800.0, - "grad_norm": 2.0007288653084334, - "language_loss": 0.83441198, - "learning_rate": 2.9012251021203935e-06, - "loss": 0.85596776, - "num_input_tokens_seen": 132508630, - "step": 6165, - "time_per_iteration": 4.3637871742248535 - }, - { - "auxiliary_loss_clip": 0.01115767, - "auxiliary_loss_mlp": 0.01037848, - "balance_loss_clip": 1.0473845, - "balance_loss_mlp": 1.02026439, - "epoch": 0.37071997595069894, - "flos": 19099018880640.0, - "grad_norm": 1.7502292049636352, - "language_loss": 0.69057518, - "learning_rate": 2.9008774049490896e-06, - "loss": 0.71211129, - "num_input_tokens_seen": 132527465, - "step": 6166, - "time_per_iteration": 2.6754019260406494 - }, - { - "auxiliary_loss_clip": 0.01032616, - "auxiliary_loss_mlp": 0.01025464, - "balance_loss_clip": 1.03081024, - "balance_loss_mlp": 1.02362847, - "epoch": 0.3707800992033669, - "flos": 52178384920320.0, - "grad_norm": 0.8028866408552083, - "language_loss": 0.5688796, - "learning_rate": 2.9005296736164244e-06, - "loss": 0.58946037, - "num_input_tokens_seen": 132579940, - "step": 6167, - "time_per_iteration": 6.357440233230591 - }, - { - "auxiliary_loss_clip": 0.01110244, - "auxiliary_loss_mlp": 0.01037896, - "balance_loss_clip": 1.04592001, - "balance_loss_mlp": 1.02284551, - "epoch": 0.3708402224560349, - "flos": 19901837808000.0, - "grad_norm": 2.0812394742982203, - "language_loss": 0.75159574, - "learning_rate": 2.900181908135584e-06, - "loss": 0.77307719, - "num_input_tokens_seen": 132598390, - "step": 6168, - "time_per_iteration": 2.7107198238372803 - }, - { - "auxiliary_loss_clip": 0.01117658, - "auxiliary_loss_mlp": 0.00773774, - "balance_loss_clip": 1.04381216, - "balance_loss_mlp": 1.00029826, - "epoch": 0.37090034570870284, - "flos": 20007630339840.0, - "grad_norm": 2.166706099657804, - "language_loss": 0.73690271, - "learning_rate": 2.899834108519755e-06, - "loss": 0.755817, - "num_input_tokens_seen": 132616920, - "step": 6169, - "time_per_iteration": 2.743741035461426 - }, - { - "auxiliary_loss_clip": 0.0113208, - "auxiliary_loss_mlp": 0.01038383, - "balance_loss_clip": 1.0476737, - "balance_loss_mlp": 1.02352989, - "epoch": 0.3709604689613708, - "flos": 24134700228480.0, - "grad_norm": 1.6724632615545945, - "language_loss": 0.79498589, - "learning_rate": 2.899486274782127e-06, - "loss": 0.81669056, - "num_input_tokens_seen": 132637660, - "step": 6170, - "time_per_iteration": 2.738492727279663 - }, - { - "auxiliary_loss_clip": 0.01122253, - "auxiliary_loss_mlp": 0.01045679, - "balance_loss_clip": 1.04780805, - "balance_loss_mlp": 1.02913237, - "epoch": 0.37102059221403877, - "flos": 23876071326720.0, - "grad_norm": 1.739457755704792, - "language_loss": 0.76506341, - "learning_rate": 2.8991384069358885e-06, - "loss": 0.78674281, - "num_input_tokens_seen": 132657635, - "step": 6171, - "time_per_iteration": 2.6531472206115723 - }, - { - "auxiliary_loss_clip": 0.01112543, - "auxiliary_loss_mlp": 0.01041865, - "balance_loss_clip": 1.05081654, - "balance_loss_mlp": 1.02546144, - "epoch": 0.37108071546670673, - "flos": 14501268149760.0, - "grad_norm": 2.0084032146250608, - "language_loss": 0.80705774, - "learning_rate": 2.898790504994232e-06, - "loss": 0.82860184, - "num_input_tokens_seen": 132674455, - "step": 6172, - "time_per_iteration": 2.6587960720062256 - }, - { - "auxiliary_loss_clip": 0.01125694, - "auxiliary_loss_mlp": 0.01044257, - "balance_loss_clip": 1.0475564, - "balance_loss_mlp": 1.02747262, - "epoch": 0.3711408387193747, - "flos": 34562619279360.0, - "grad_norm": 2.410153405618026, - "language_loss": 0.59260982, - "learning_rate": 2.89844256897035e-06, - "loss": 0.61430931, - "num_input_tokens_seen": 132695140, - "step": 6173, - "time_per_iteration": 2.738430976867676 - }, - { - "auxiliary_loss_clip": 0.01110933, - "auxiliary_loss_mlp": 0.01044385, - "balance_loss_clip": 1.04549873, - "balance_loss_mlp": 1.02885222, - "epoch": 0.37120096197204266, - "flos": 17310703432320.0, - "grad_norm": 1.954423749693878, - "language_loss": 0.80869365, - "learning_rate": 2.898094598877435e-06, - "loss": 0.83024681, - "num_input_tokens_seen": 132712470, - "step": 6174, - "time_per_iteration": 2.7166690826416016 - }, - { - "auxiliary_loss_clip": 0.01129522, - "auxiliary_loss_mlp": 0.01045042, - "balance_loss_clip": 1.04628158, - "balance_loss_mlp": 1.03025961, - "epoch": 0.37126108522471063, - "flos": 30664049760000.0, - "grad_norm": 2.1592050046005, - "language_loss": 0.79910219, - "learning_rate": 2.8977465947286826e-06, - "loss": 0.82084787, - "num_input_tokens_seen": 132732945, - "step": 6175, - "time_per_iteration": 2.6746280193328857 - }, - { - "auxiliary_loss_clip": 0.011267, - "auxiliary_loss_mlp": 0.01053826, - "balance_loss_clip": 1.05173898, - "balance_loss_mlp": 1.0380547, - "epoch": 0.37132120847737865, - "flos": 25155640494720.0, - "grad_norm": 2.2578092376668315, - "language_loss": 0.88735723, - "learning_rate": 2.89739855653729e-06, - "loss": 0.90916252, - "num_input_tokens_seen": 132752470, - "step": 6176, - "time_per_iteration": 2.6791093349456787 - }, - { - "auxiliary_loss_clip": 0.01124216, - "auxiliary_loss_mlp": 0.01042973, - "balance_loss_clip": 1.04811859, - "balance_loss_mlp": 1.02713037, - "epoch": 0.3713813317300466, - "flos": 21213474842880.0, - "grad_norm": 1.5716198978013565, - "language_loss": 0.73431349, - "learning_rate": 2.8970504843164546e-06, - "loss": 0.75598538, - "num_input_tokens_seen": 132771485, - "step": 6177, - "time_per_iteration": 2.6808605194091797 - }, - { - "auxiliary_loss_clip": 0.01102086, - "auxiliary_loss_mlp": 0.01051929, - "balance_loss_clip": 1.04524541, - "balance_loss_mlp": 1.03575838, - "epoch": 0.3714414549827146, - "flos": 21616644072960.0, - "grad_norm": 2.0030850547718915, - "language_loss": 0.75349051, - "learning_rate": 2.896702378079374e-06, - "loss": 0.77503073, - "num_input_tokens_seen": 132791465, - "step": 6178, - "time_per_iteration": 2.7112066745758057 - }, - { - "auxiliary_loss_clip": 0.0107122, - "auxiliary_loss_mlp": 0.01050415, - "balance_loss_clip": 1.04323864, - "balance_loss_mlp": 1.03208089, - "epoch": 0.37150157823538255, - "flos": 19972294335360.0, - "grad_norm": 2.0305314414463136, - "language_loss": 0.72141892, - "learning_rate": 2.8963542378392502e-06, - "loss": 0.74263525, - "num_input_tokens_seen": 132810160, - "step": 6179, - "time_per_iteration": 2.7965877056121826 - }, - { - "auxiliary_loss_clip": 0.01137504, - "auxiliary_loss_mlp": 0.01046799, - "balance_loss_clip": 1.05008841, - "balance_loss_mlp": 1.03018165, - "epoch": 0.3715617014880505, - "flos": 24860562266880.0, - "grad_norm": 2.387630814732786, - "language_loss": 0.6993162, - "learning_rate": 2.896006063609283e-06, - "loss": 0.72115916, - "num_input_tokens_seen": 132831265, - "step": 6180, - "time_per_iteration": 2.695232391357422 - }, - { - "auxiliary_loss_clip": 0.01113448, - "auxiliary_loss_mlp": 0.01037109, - "balance_loss_clip": 1.04914021, - "balance_loss_mlp": 1.02208257, - "epoch": 0.3716218247407185, - "flos": 20449080489600.0, - "grad_norm": 2.1080005695464243, - "language_loss": 0.77920252, - "learning_rate": 2.8956578554026767e-06, - "loss": 0.80070812, - "num_input_tokens_seen": 132850005, - "step": 6181, - "time_per_iteration": 2.7087795734405518 - }, - { - "auxiliary_loss_clip": 0.01123157, - "auxiliary_loss_mlp": 0.01041815, - "balance_loss_clip": 1.05016994, - "balance_loss_mlp": 1.02525139, - "epoch": 0.37168194799338644, - "flos": 24133479166080.0, - "grad_norm": 2.570629027716188, - "language_loss": 0.79222846, - "learning_rate": 2.8953096132326343e-06, - "loss": 0.81387818, - "num_input_tokens_seen": 132865790, - "step": 6182, - "time_per_iteration": 2.6541473865509033 - }, - { - "auxiliary_loss_clip": 0.01041849, - "auxiliary_loss_mlp": 0.01016945, - "balance_loss_clip": 1.03053021, - "balance_loss_mlp": 1.01533604, - "epoch": 0.3717420712460544, - "flos": 67408926900480.0, - "grad_norm": 0.7830434308203498, - "language_loss": 0.57445002, - "learning_rate": 2.894961337112362e-06, - "loss": 0.59503794, - "num_input_tokens_seen": 132921775, - "step": 6183, - "time_per_iteration": 3.191969633102417 - }, - { - "auxiliary_loss_clip": 0.01126783, - "auxiliary_loss_mlp": 0.00775242, - "balance_loss_clip": 1.04496169, - "balance_loss_mlp": 1.00043631, - "epoch": 0.37180219449872237, - "flos": 22376908362240.0, - "grad_norm": 1.9647478507461604, - "language_loss": 0.76617277, - "learning_rate": 2.894613027055066e-06, - "loss": 0.78519297, - "num_input_tokens_seen": 132941060, - "step": 6184, - "time_per_iteration": 2.7096588611602783 - }, - { - "auxiliary_loss_clip": 0.01090654, - "auxiliary_loss_mlp": 0.01039062, - "balance_loss_clip": 1.04084587, - "balance_loss_mlp": 1.02344596, - "epoch": 0.37186231775139034, - "flos": 21869885934720.0, - "grad_norm": 2.1021072738728717, - "language_loss": 0.7217713, - "learning_rate": 2.894264683073954e-06, - "loss": 0.74306846, - "num_input_tokens_seen": 132961850, - "step": 6185, - "time_per_iteration": 2.739130735397339 - }, - { - "auxiliary_loss_clip": 0.01081138, - "auxiliary_loss_mlp": 0.01034498, - "balance_loss_clip": 1.04156423, - "balance_loss_mlp": 1.01805878, - "epoch": 0.3719224410040583, - "flos": 22415225195520.0, - "grad_norm": 2.1871647895832496, - "language_loss": 0.76805776, - "learning_rate": 2.8939163051822363e-06, - "loss": 0.78921413, - "num_input_tokens_seen": 132981625, - "step": 6186, - "time_per_iteration": 2.779510259628296 - }, - { - "auxiliary_loss_clip": 0.01131414, - "auxiliary_loss_mlp": 0.01042221, - "balance_loss_clip": 1.05090106, - "balance_loss_mlp": 1.02491212, - "epoch": 0.37198256425672627, - "flos": 25151223121920.0, - "grad_norm": 1.8929067887672733, - "language_loss": 0.84037393, - "learning_rate": 2.8935678933931224e-06, - "loss": 0.86211032, - "num_input_tokens_seen": 133001225, - "step": 6187, - "time_per_iteration": 2.67541241645813 - }, - { - "auxiliary_loss_clip": 0.01120953, - "auxiliary_loss_mlp": 0.01040882, - "balance_loss_clip": 1.04474545, - "balance_loss_mlp": 1.02553999, - "epoch": 0.37204268750939423, - "flos": 21138313633920.0, - "grad_norm": 1.7194664317181616, - "language_loss": 0.84831274, - "learning_rate": 2.893219447719824e-06, - "loss": 0.86993104, - "num_input_tokens_seen": 133018820, - "step": 6188, - "time_per_iteration": 2.6241226196289062 - }, - { - "auxiliary_loss_clip": 0.01108827, - "auxiliary_loss_mlp": 0.01040814, - "balance_loss_clip": 1.04934168, - "balance_loss_mlp": 1.02501917, - "epoch": 0.37210281076206225, - "flos": 21506829217920.0, - "grad_norm": 2.498329305558477, - "language_loss": 0.65702367, - "learning_rate": 2.8928709681755548e-06, - "loss": 0.67852014, - "num_input_tokens_seen": 133040205, - "step": 6189, - "time_per_iteration": 2.724707841873169 - }, - { - "auxiliary_loss_clip": 0.01112219, - "auxiliary_loss_mlp": 0.0104713, - "balance_loss_clip": 1.0451889, - "balance_loss_mlp": 1.03045225, - "epoch": 0.3721629340147302, - "flos": 17347835116800.0, - "grad_norm": 1.9571366893805608, - "language_loss": 0.84120989, - "learning_rate": 2.8925224547735293e-06, - "loss": 0.86280334, - "num_input_tokens_seen": 133058095, - "step": 6190, - "time_per_iteration": 2.719454050064087 - }, - { - "auxiliary_loss_clip": 0.01109992, - "auxiliary_loss_mlp": 0.01041587, - "balance_loss_clip": 1.0465343, - "balance_loss_mlp": 1.02571416, - "epoch": 0.3722230572673982, - "flos": 16432400073600.0, - "grad_norm": 4.021000090429005, - "language_loss": 0.87807733, - "learning_rate": 2.8921739075269633e-06, - "loss": 0.89959311, - "num_input_tokens_seen": 133071530, - "step": 6191, - "time_per_iteration": 2.7081027030944824 - }, - { - "auxiliary_loss_clip": 0.0108777, - "auxiliary_loss_mlp": 0.01037991, - "balance_loss_clip": 1.04300189, - "balance_loss_mlp": 1.01962125, - "epoch": 0.37228318052006615, - "flos": 22674716023680.0, - "grad_norm": 3.7199150853096508, - "language_loss": 0.74228656, - "learning_rate": 2.891825326449073e-06, - "loss": 0.7635442, - "num_input_tokens_seen": 133091410, - "step": 6192, - "time_per_iteration": 2.8161356449127197 - }, - { - "auxiliary_loss_clip": 0.01134777, - "auxiliary_loss_mlp": 0.0104013, - "balance_loss_clip": 1.04818201, - "balance_loss_mlp": 1.02497888, - "epoch": 0.3723433037727341, - "flos": 25265491263360.0, - "grad_norm": 2.31871347399746, - "language_loss": 0.80621845, - "learning_rate": 2.8914767115530766e-06, - "loss": 0.82796752, - "num_input_tokens_seen": 133110365, - "step": 6193, - "time_per_iteration": 2.661550760269165 - }, - { - "auxiliary_loss_clip": 0.01101478, - "auxiliary_loss_mlp": 0.01041083, - "balance_loss_clip": 1.04354334, - "balance_loss_mlp": 1.02522826, - "epoch": 0.3724034270254021, - "flos": 10524664333440.0, - "grad_norm": 2.475173523724827, - "language_loss": 0.84729886, - "learning_rate": 2.891128062852194e-06, - "loss": 0.86872447, - "num_input_tokens_seen": 133128255, - "step": 6194, - "time_per_iteration": 2.711531400680542 - }, - { - "auxiliary_loss_clip": 0.0111161, - "auxiliary_loss_mlp": 0.010372, - "balance_loss_clip": 1.04650784, - "balance_loss_mlp": 1.02142286, - "epoch": 0.37246355027807004, - "flos": 20266223328000.0, - "grad_norm": 9.44838101604173, - "language_loss": 0.77016377, - "learning_rate": 2.890779380359646e-06, - "loss": 0.79165184, - "num_input_tokens_seen": 133143975, - "step": 6195, - "time_per_iteration": 2.6527512073516846 - }, - { - "auxiliary_loss_clip": 0.01112195, - "auxiliary_loss_mlp": 0.0103539, - "balance_loss_clip": 1.0468967, - "balance_loss_mlp": 1.02030444, - "epoch": 0.372523673530738, - "flos": 19500571998720.0, - "grad_norm": 1.7021548935758455, - "language_loss": 0.79216856, - "learning_rate": 2.890430664088655e-06, - "loss": 0.81364441, - "num_input_tokens_seen": 133162935, - "step": 6196, - "time_per_iteration": 2.6642892360687256 - }, - { - "auxiliary_loss_clip": 0.01124648, - "auxiliary_loss_mlp": 0.01038359, - "balance_loss_clip": 1.04975688, - "balance_loss_mlp": 1.0240953, - "epoch": 0.372583796783406, - "flos": 16764250849920.0, - "grad_norm": 2.570886031241156, - "language_loss": 0.83998835, - "learning_rate": 2.890081914052443e-06, - "loss": 0.8616184, - "num_input_tokens_seen": 133181180, - "step": 6197, - "time_per_iteration": 2.627305030822754 - }, - { - "auxiliary_loss_clip": 0.01131102, - "auxiliary_loss_mlp": 0.01040963, - "balance_loss_clip": 1.04697967, - "balance_loss_mlp": 1.02488184, - "epoch": 0.37264392003607394, - "flos": 22637979388800.0, - "grad_norm": 1.697216275583005, - "language_loss": 0.64450538, - "learning_rate": 2.889733130264237e-06, - "loss": 0.66622603, - "num_input_tokens_seen": 133199615, - "step": 6198, - "time_per_iteration": 2.606621503829956 - }, - { - "auxiliary_loss_clip": 0.01120059, - "auxiliary_loss_mlp": 0.01044451, - "balance_loss_clip": 1.04676938, - "balance_loss_mlp": 1.02959776, - "epoch": 0.3727040432887419, - "flos": 19973120348160.0, - "grad_norm": 1.4273324893736263, - "language_loss": 0.737185, - "learning_rate": 2.889384312737261e-06, - "loss": 0.75883007, - "num_input_tokens_seen": 133219650, - "step": 6199, - "time_per_iteration": 2.78157901763916 - }, - { - "auxiliary_loss_clip": 0.01105963, - "auxiliary_loss_mlp": 0.01037053, - "balance_loss_clip": 1.04564095, - "balance_loss_mlp": 1.02154374, - "epoch": 0.37276416654140987, - "flos": 63899122279680.0, - "grad_norm": 2.2948998309451905, - "language_loss": 0.80481982, - "learning_rate": 2.889035461484742e-06, - "loss": 0.82624996, - "num_input_tokens_seen": 133245675, - "step": 6200, - "time_per_iteration": 3.0623533725738525 - }, - { - "auxiliary_loss_clip": 0.0109608, - "auxiliary_loss_mlp": 0.01045798, - "balance_loss_clip": 1.04552174, - "balance_loss_mlp": 1.03016961, - "epoch": 0.37282428979407783, - "flos": 39785970211200.0, - "grad_norm": 2.0774746879263746, - "language_loss": 0.60494614, - "learning_rate": 2.88868657651991e-06, - "loss": 0.62636495, - "num_input_tokens_seen": 133266905, - "step": 6201, - "time_per_iteration": 2.8960700035095215 - }, - { - "auxiliary_loss_clip": 0.01125447, - "auxiliary_loss_mlp": 0.01039384, - "balance_loss_clip": 1.0489639, - "balance_loss_mlp": 1.02346373, - "epoch": 0.37288441304674586, - "flos": 22709046447360.0, - "grad_norm": 1.870117482164085, - "language_loss": 0.72692698, - "learning_rate": 2.8883376578559934e-06, - "loss": 0.74857527, - "num_input_tokens_seen": 133286865, - "step": 6202, - "time_per_iteration": 4.202298402786255 - }, - { - "auxiliary_loss_clip": 0.01110741, - "auxiliary_loss_mlp": 0.01033326, - "balance_loss_clip": 1.04642594, - "balance_loss_mlp": 1.01800799, - "epoch": 0.3729445362994138, - "flos": 18770292587520.0, - "grad_norm": 2.0679450432005666, - "language_loss": 0.74148834, - "learning_rate": 2.8879887055062243e-06, - "loss": 0.76292896, - "num_input_tokens_seen": 133305295, - "step": 6203, - "time_per_iteration": 2.7268033027648926 - }, - { - "auxiliary_loss_clip": 0.01106859, - "auxiliary_loss_mlp": 0.01038826, - "balance_loss_clip": 1.04595554, - "balance_loss_mlp": 1.02524805, - "epoch": 0.3730046595520818, - "flos": 22456199635200.0, - "grad_norm": 1.649450499506288, - "language_loss": 0.81921744, - "learning_rate": 2.8876397194838353e-06, - "loss": 0.84067428, - "num_input_tokens_seen": 133324625, - "step": 6204, - "time_per_iteration": 4.347074747085571 - }, - { - "auxiliary_loss_clip": 0.01123916, - "auxiliary_loss_mlp": 0.01044159, - "balance_loss_clip": 1.04827762, - "balance_loss_mlp": 1.02794707, - "epoch": 0.37306478280474975, - "flos": 24316372241280.0, - "grad_norm": 1.675399556922802, - "language_loss": 0.74961317, - "learning_rate": 2.8872906998020577e-06, - "loss": 0.77129394, - "num_input_tokens_seen": 133344625, - "step": 6205, - "time_per_iteration": 2.66701602935791 - }, - { - "auxiliary_loss_clip": 0.01117233, - "auxiliary_loss_mlp": 0.01045323, - "balance_loss_clip": 1.04337549, - "balance_loss_mlp": 1.02857447, - "epoch": 0.3731249060574177, - "flos": 15815167741440.0, - "grad_norm": 1.8318607259579, - "language_loss": 0.7815854, - "learning_rate": 2.886941646474128e-06, - "loss": 0.80321097, - "num_input_tokens_seen": 133363605, - "step": 6206, - "time_per_iteration": 4.202580451965332 - }, - { - "auxiliary_loss_clip": 0.01134488, - "auxiliary_loss_mlp": 0.01039926, - "balance_loss_clip": 1.04804325, - "balance_loss_mlp": 1.02317739, - "epoch": 0.3731850293100857, - "flos": 19828077229440.0, - "grad_norm": 2.3232535418166256, - "language_loss": 0.93322426, - "learning_rate": 2.886592559513283e-06, - "loss": 0.95496845, - "num_input_tokens_seen": 133379405, - "step": 6207, - "time_per_iteration": 4.318574666976929 - }, - { - "auxiliary_loss_clip": 0.01105421, - "auxiliary_loss_mlp": 0.0103386, - "balance_loss_clip": 1.0478878, - "balance_loss_mlp": 1.01876843, - "epoch": 0.37324515256275365, - "flos": 19062354072960.0, - "grad_norm": 3.0736568130228363, - "language_loss": 0.82651198, - "learning_rate": 2.886243438932759e-06, - "loss": 0.8479048, - "num_input_tokens_seen": 133397585, - "step": 6208, - "time_per_iteration": 2.749662160873413 - }, - { - "auxiliary_loss_clip": 0.01122225, - "auxiliary_loss_mlp": 0.0103968, - "balance_loss_clip": 1.04488516, - "balance_loss_mlp": 1.0223707, - "epoch": 0.3733052758154216, - "flos": 20704333512960.0, - "grad_norm": 2.0157740087962845, - "language_loss": 0.73122764, - "learning_rate": 2.8858942847457953e-06, - "loss": 0.75284666, - "num_input_tokens_seen": 133415365, - "step": 6209, - "time_per_iteration": 2.6315791606903076 - }, - { - "auxiliary_loss_clip": 0.01095649, - "auxiliary_loss_mlp": 0.01037134, - "balance_loss_clip": 1.04820108, - "balance_loss_mlp": 1.02065969, - "epoch": 0.3733653990680896, - "flos": 20193504243840.0, - "grad_norm": 1.9650719997143145, - "language_loss": 0.70413053, - "learning_rate": 2.8855450969656305e-06, - "loss": 0.72545838, - "num_input_tokens_seen": 133435700, - "step": 6210, - "time_per_iteration": 2.7484405040740967 - }, - { - "auxiliary_loss_clip": 0.01072484, - "auxiliary_loss_mlp": 0.01045611, - "balance_loss_clip": 1.03769457, - "balance_loss_mlp": 1.02674007, - "epoch": 0.37342552232075754, - "flos": 20339660684160.0, - "grad_norm": 2.0510282142916427, - "language_loss": 0.77773547, - "learning_rate": 2.8851958756055073e-06, - "loss": 0.79891646, - "num_input_tokens_seen": 133455180, - "step": 6211, - "time_per_iteration": 2.706294536590576 - }, - { - "auxiliary_loss_clip": 0.01122999, - "auxiliary_loss_mlp": 0.01042393, - "balance_loss_clip": 1.04602683, - "balance_loss_mlp": 1.02645469, - "epoch": 0.3734856455734255, - "flos": 35517879527040.0, - "grad_norm": 1.675173432335243, - "language_loss": 0.73258781, - "learning_rate": 2.884846620678668e-06, - "loss": 0.7542417, - "num_input_tokens_seen": 133476715, - "step": 6212, - "time_per_iteration": 2.788787841796875 - }, - { - "auxiliary_loss_clip": 0.01131124, - "auxiliary_loss_mlp": 0.01047595, - "balance_loss_clip": 1.05055571, - "balance_loss_mlp": 1.03106034, - "epoch": 0.37354576882609347, - "flos": 21142300043520.0, - "grad_norm": 1.9808770110660865, - "language_loss": 0.81656909, - "learning_rate": 2.884497332198356e-06, - "loss": 0.83835626, - "num_input_tokens_seen": 133494550, - "step": 6213, - "time_per_iteration": 2.6829304695129395 - }, - { - "auxiliary_loss_clip": 0.01089374, - "auxiliary_loss_mlp": 0.01046172, - "balance_loss_clip": 1.0412662, - "balance_loss_mlp": 1.02843404, - "epoch": 0.37360589207876144, - "flos": 21506793304320.0, - "grad_norm": 2.223600899112558, - "language_loss": 0.78999674, - "learning_rate": 2.8841480101778167e-06, - "loss": 0.81135225, - "num_input_tokens_seen": 133512640, - "step": 6214, - "time_per_iteration": 2.674373149871826 - }, - { - "auxiliary_loss_clip": 0.01109052, - "auxiliary_loss_mlp": 0.01044175, - "balance_loss_clip": 1.04420567, - "balance_loss_mlp": 1.02827835, - "epoch": 0.37366601533142946, - "flos": 38435800861440.0, - "grad_norm": 1.9266500277332215, - "language_loss": 0.84611148, - "learning_rate": 2.883798654630296e-06, - "loss": 0.86764371, - "num_input_tokens_seen": 133535540, - "step": 6215, - "time_per_iteration": 2.8276026248931885 - }, - { - "auxiliary_loss_clip": 0.01100197, - "auxiliary_loss_mlp": 0.01039814, - "balance_loss_clip": 1.04435837, - "balance_loss_mlp": 1.02298141, - "epoch": 0.3737261385840974, - "flos": 18441171244800.0, - "grad_norm": 1.8731663372997254, - "language_loss": 0.67690969, - "learning_rate": 2.8834492655690423e-06, - "loss": 0.69830984, - "num_input_tokens_seen": 133555795, - "step": 6216, - "time_per_iteration": 2.724090576171875 - }, - { - "auxiliary_loss_clip": 0.01111654, - "auxiliary_loss_mlp": 0.01042601, - "balance_loss_clip": 1.045977, - "balance_loss_mlp": 1.02578092, - "epoch": 0.3737862618367654, - "flos": 22929861306240.0, - "grad_norm": 2.3172976096058853, - "language_loss": 0.65993899, - "learning_rate": 2.883099843007303e-06, - "loss": 0.68148154, - "num_input_tokens_seen": 133575905, - "step": 6217, - "time_per_iteration": 2.7126269340515137 - }, - { - "auxiliary_loss_clip": 0.01115905, - "auxiliary_loss_mlp": 0.01039702, - "balance_loss_clip": 1.0483315, - "balance_loss_mlp": 1.02264857, - "epoch": 0.37384638508943335, - "flos": 15409664127360.0, - "grad_norm": 2.0273109551694777, - "language_loss": 0.80449212, - "learning_rate": 2.88275038695833e-06, - "loss": 0.82604814, - "num_input_tokens_seen": 133592585, - "step": 6218, - "time_per_iteration": 2.680894374847412 - }, - { - "auxiliary_loss_clip": 0.01115539, - "auxiliary_loss_mlp": 0.0103289, - "balance_loss_clip": 1.04488862, - "balance_loss_mlp": 1.01760781, - "epoch": 0.3739065083421013, - "flos": 24280820755200.0, - "grad_norm": 1.5960804840892617, - "language_loss": 0.78692639, - "learning_rate": 2.8824008974353736e-06, - "loss": 0.80841064, - "num_input_tokens_seen": 133615070, - "step": 6219, - "time_per_iteration": 2.6683976650238037 - }, - { - "auxiliary_loss_clip": 0.01107805, - "auxiliary_loss_mlp": 0.01040758, - "balance_loss_clip": 1.04602623, - "balance_loss_mlp": 1.0247364, - "epoch": 0.3739666315947693, - "flos": 23002831785600.0, - "grad_norm": 1.8103875982928064, - "language_loss": 0.77023458, - "learning_rate": 2.8820513744516866e-06, - "loss": 0.79172027, - "num_input_tokens_seen": 133633490, - "step": 6220, - "time_per_iteration": 2.670686960220337 - }, - { - "auxiliary_loss_clip": 0.01105245, - "auxiliary_loss_mlp": 0.01041158, - "balance_loss_clip": 1.04717016, - "balance_loss_mlp": 1.02473164, - "epoch": 0.37402675484743725, - "flos": 19391116279680.0, - "grad_norm": 3.4153989861378204, - "language_loss": 0.8298834, - "learning_rate": 2.8817018180205235e-06, - "loss": 0.85134745, - "num_input_tokens_seen": 133653425, - "step": 6221, - "time_per_iteration": 2.730738401412964 - }, - { - "auxiliary_loss_clip": 0.01108391, - "auxiliary_loss_mlp": 0.01043965, - "balance_loss_clip": 1.04499435, - "balance_loss_mlp": 1.02825367, - "epoch": 0.3740868781001052, - "flos": 17126158331520.0, - "grad_norm": 1.9668982067313725, - "language_loss": 0.75944567, - "learning_rate": 2.8813522281551387e-06, - "loss": 0.78096926, - "num_input_tokens_seen": 133670220, - "step": 6222, - "time_per_iteration": 2.62052321434021 - }, - { - "auxiliary_loss_clip": 0.01103117, - "auxiliary_loss_mlp": 0.00772891, - "balance_loss_clip": 1.04785156, - "balance_loss_mlp": 1.00029564, - "epoch": 0.3741470013527732, - "flos": 20043505048320.0, - "grad_norm": 1.8881600065301847, - "language_loss": 0.70621789, - "learning_rate": 2.881002604868789e-06, - "loss": 0.72497797, - "num_input_tokens_seen": 133688910, - "step": 6223, - "time_per_iteration": 2.7686285972595215 - }, - { - "auxiliary_loss_clip": 0.01104752, - "auxiliary_loss_mlp": 0.01035203, - "balance_loss_clip": 1.05155015, - "balance_loss_mlp": 1.02057576, - "epoch": 0.37420712460544114, - "flos": 36897279569280.0, - "grad_norm": 2.1852519558340644, - "language_loss": 0.6875304, - "learning_rate": 2.8806529481747325e-06, - "loss": 0.7089299, - "num_input_tokens_seen": 133708690, - "step": 6224, - "time_per_iteration": 2.817263126373291 - }, - { - "auxiliary_loss_clip": 0.01091747, - "auxiliary_loss_mlp": 0.01036393, - "balance_loss_clip": 1.04859614, - "balance_loss_mlp": 1.02059817, - "epoch": 0.3742672478581091, - "flos": 22201198007040.0, - "grad_norm": 2.246642459489035, - "language_loss": 0.70192593, - "learning_rate": 2.880303258086228e-06, - "loss": 0.72320735, - "num_input_tokens_seen": 133728095, - "step": 6225, - "time_per_iteration": 2.785083532333374 - }, - { - "auxiliary_loss_clip": 0.01088757, - "auxiliary_loss_mlp": 0.01048544, - "balance_loss_clip": 1.04366183, - "balance_loss_mlp": 1.03175974, - "epoch": 0.3743273711107771, - "flos": 24681547860480.0, - "grad_norm": 2.1768682992812236, - "language_loss": 0.7896018, - "learning_rate": 2.879953534616536e-06, - "loss": 0.81097472, - "num_input_tokens_seen": 133745590, - "step": 6226, - "time_per_iteration": 2.7403974533081055 - }, - { - "auxiliary_loss_clip": 0.01105293, - "auxiliary_loss_mlp": 0.01039029, - "balance_loss_clip": 1.04631484, - "balance_loss_mlp": 1.02303696, - "epoch": 0.37438749436344504, - "flos": 24459619680000.0, - "grad_norm": 1.7825799805329443, - "language_loss": 0.67965841, - "learning_rate": 2.879603777778917e-06, - "loss": 0.70110166, - "num_input_tokens_seen": 133766155, - "step": 6227, - "time_per_iteration": 2.6975693702697754 - }, - { - "auxiliary_loss_clip": 0.01099252, - "auxiliary_loss_mlp": 0.01034493, - "balance_loss_clip": 1.04493213, - "balance_loss_mlp": 1.01890039, - "epoch": 0.374447617616113, - "flos": 21798747048960.0, - "grad_norm": 1.9005486801766094, - "language_loss": 0.829476, - "learning_rate": 2.879253987586635e-06, - "loss": 0.85081351, - "num_input_tokens_seen": 133783185, - "step": 6228, - "time_per_iteration": 2.7754271030426025 - }, - { - "auxiliary_loss_clip": 0.01090082, - "auxiliary_loss_mlp": 0.01048677, - "balance_loss_clip": 1.04396605, - "balance_loss_mlp": 1.03159404, - "epoch": 0.374507740868781, - "flos": 17968191932160.0, - "grad_norm": 1.6406992237121778, - "language_loss": 0.74450547, - "learning_rate": 2.8789041640529535e-06, - "loss": 0.76589304, - "num_input_tokens_seen": 133800975, - "step": 6229, - "time_per_iteration": 2.6378824710845947 - }, - { - "auxiliary_loss_clip": 0.0109707, - "auxiliary_loss_mlp": 0.01035996, - "balance_loss_clip": 1.0470053, - "balance_loss_mlp": 1.01971197, - "epoch": 0.374567864121449, - "flos": 16105828596480.0, - "grad_norm": 2.127994694324029, - "language_loss": 0.83782691, - "learning_rate": 2.8785543071911383e-06, - "loss": 0.85915756, - "num_input_tokens_seen": 133818020, - "step": 6230, - "time_per_iteration": 2.6857657432556152 - }, - { - "auxiliary_loss_clip": 0.0112393, - "auxiliary_loss_mlp": 0.01041627, - "balance_loss_clip": 1.04905128, - "balance_loss_mlp": 1.02556968, - "epoch": 0.37462798737411696, - "flos": 25773160135680.0, - "grad_norm": 2.8382818326589145, - "language_loss": 0.735865, - "learning_rate": 2.878204417014456e-06, - "loss": 0.75752056, - "num_input_tokens_seen": 133840690, - "step": 6231, - "time_per_iteration": 2.7082016468048096 - }, - { - "auxiliary_loss_clip": 0.0112579, - "auxiliary_loss_mlp": 0.01046917, - "balance_loss_clip": 1.05376148, - "balance_loss_mlp": 1.03075266, - "epoch": 0.3746881106267849, - "flos": 16654507822080.0, - "grad_norm": 2.9683381932525665, - "language_loss": 0.7412858, - "learning_rate": 2.8778544935361735e-06, - "loss": 0.76301289, - "num_input_tokens_seen": 133858350, - "step": 6232, - "time_per_iteration": 2.5764057636260986 - }, - { - "auxiliary_loss_clip": 0.01106131, - "auxiliary_loss_mlp": 0.01039245, - "balance_loss_clip": 1.04461622, - "balance_loss_mlp": 1.02237701, - "epoch": 0.3747482338794529, - "flos": 26177981391360.0, - "grad_norm": 2.121427790242168, - "language_loss": 0.77296579, - "learning_rate": 2.877504536769561e-06, - "loss": 0.79441959, - "num_input_tokens_seen": 133879775, - "step": 6233, - "time_per_iteration": 2.692286252975464 - }, - { - "auxiliary_loss_clip": 0.01118513, - "auxiliary_loss_mlp": 0.01040639, - "balance_loss_clip": 1.05093503, - "balance_loss_mlp": 1.024593, - "epoch": 0.37480835713212085, - "flos": 12021061950720.0, - "grad_norm": 1.8446337373318833, - "language_loss": 0.69493848, - "learning_rate": 2.8771545467278883e-06, - "loss": 0.71652997, - "num_input_tokens_seen": 133898295, - "step": 6234, - "time_per_iteration": 2.658332586288452 - }, - { - "auxiliary_loss_clip": 0.01123531, - "auxiliary_loss_mlp": 0.01042963, - "balance_loss_clip": 1.04885483, - "balance_loss_mlp": 1.02833033, - "epoch": 0.3748684803847888, - "flos": 19679263182720.0, - "grad_norm": 1.9015387878630694, - "language_loss": 0.82462788, - "learning_rate": 2.8768045234244276e-06, - "loss": 0.84629285, - "num_input_tokens_seen": 133915230, - "step": 6235, - "time_per_iteration": 2.591198682785034 - }, - { - "auxiliary_loss_clip": 0.01140927, - "auxiliary_loss_mlp": 0.0103602, - "balance_loss_clip": 1.05301189, - "balance_loss_mlp": 1.02021289, - "epoch": 0.3749286036374568, - "flos": 20521189042560.0, - "grad_norm": 1.8869628373328378, - "language_loss": 0.78439927, - "learning_rate": 2.8764544668724517e-06, - "loss": 0.80616879, - "num_input_tokens_seen": 133934110, - "step": 6236, - "time_per_iteration": 2.6754372119903564 - }, - { - "auxiliary_loss_clip": 0.01118225, - "auxiliary_loss_mlp": 0.01050242, - "balance_loss_clip": 1.04519606, - "balance_loss_mlp": 1.03202713, - "epoch": 0.37498872689012475, - "flos": 20704620821760.0, - "grad_norm": 2.0770406770017242, - "language_loss": 0.74357057, - "learning_rate": 2.876104377085234e-06, - "loss": 0.76525521, - "num_input_tokens_seen": 133952395, - "step": 6237, - "time_per_iteration": 2.6760342121124268 - }, - { - "auxiliary_loss_clip": 0.01114513, - "auxiliary_loss_mlp": 0.00773766, - "balance_loss_clip": 1.04626942, - "balance_loss_mlp": 1.00036037, - "epoch": 0.3750488501427927, - "flos": 21574843620480.0, - "grad_norm": 2.0699756536584633, - "language_loss": 0.93258965, - "learning_rate": 2.8757542540760508e-06, - "loss": 0.95147252, - "num_input_tokens_seen": 133969635, - "step": 6238, - "time_per_iteration": 2.6805243492126465 - }, - { - "auxiliary_loss_clip": 0.01137619, - "auxiliary_loss_mlp": 0.01037341, - "balance_loss_clip": 1.04995167, - "balance_loss_mlp": 1.02081275, - "epoch": 0.3751089733954607, - "flos": 15923869274880.0, - "grad_norm": 2.3841921025147284, - "language_loss": 0.70885909, - "learning_rate": 2.8754040978581777e-06, - "loss": 0.73060858, - "num_input_tokens_seen": 133987215, - "step": 6239, - "time_per_iteration": 2.548285961151123 - }, - { - "auxiliary_loss_clip": 0.01068531, - "auxiliary_loss_mlp": 0.01040031, - "balance_loss_clip": 1.04656243, - "balance_loss_mlp": 1.02303219, - "epoch": 0.37516909664812864, - "flos": 36284644177920.0, - "grad_norm": 1.601808094344726, - "language_loss": 0.65752542, - "learning_rate": 2.875053908444895e-06, - "loss": 0.67861104, - "num_input_tokens_seen": 134009250, - "step": 6240, - "time_per_iteration": 3.016897201538086 - }, - { - "auxiliary_loss_clip": 0.01101858, - "auxiliary_loss_mlp": 0.00773445, - "balance_loss_clip": 1.04618907, - "balance_loss_mlp": 1.00033951, - "epoch": 0.3752292199007966, - "flos": 13515915283200.0, - "grad_norm": 2.721418670308367, - "language_loss": 0.75816065, - "learning_rate": 2.8747036858494795e-06, - "loss": 0.7769137, - "num_input_tokens_seen": 134026875, - "step": 6241, - "time_per_iteration": 4.402552843093872 - }, - { - "auxiliary_loss_clip": 0.01103844, - "auxiliary_loss_mlp": 0.01044119, - "balance_loss_clip": 1.04654765, - "balance_loss_mlp": 1.0276264, - "epoch": 0.3752893431534646, - "flos": 27198095644800.0, - "grad_norm": 2.108703330368865, - "language_loss": 0.83791685, - "learning_rate": 2.874353430085213e-06, - "loss": 0.85939646, - "num_input_tokens_seen": 134047185, - "step": 6242, - "time_per_iteration": 2.7508704662323 - }, - { - "auxiliary_loss_clip": 0.01110348, - "auxiliary_loss_mlp": 0.01048171, - "balance_loss_clip": 1.04799628, - "balance_loss_mlp": 1.03319848, - "epoch": 0.3753494664061326, - "flos": 30007674581760.0, - "grad_norm": 2.4924519814208774, - "language_loss": 0.68438506, - "learning_rate": 2.8740031411653766e-06, - "loss": 0.70597029, - "num_input_tokens_seen": 134067330, - "step": 6243, - "time_per_iteration": 2.7814478874206543 - }, - { - "auxiliary_loss_clip": 0.01056696, - "auxiliary_loss_mlp": 0.00776554, - "balance_loss_clip": 1.04175019, - "balance_loss_mlp": 1.00038528, - "epoch": 0.37540958965880056, - "flos": 24461954064000.0, - "grad_norm": 1.7699519682943652, - "language_loss": 0.84165168, - "learning_rate": 2.8736528191032535e-06, - "loss": 0.85998416, - "num_input_tokens_seen": 134085525, - "step": 6244, - "time_per_iteration": 4.510041952133179 - }, - { - "auxiliary_loss_clip": 0.01074238, - "auxiliary_loss_mlp": 0.01042872, - "balance_loss_clip": 1.03981614, - "balance_loss_mlp": 1.02712417, - "epoch": 0.3754697129114685, - "flos": 16508387295360.0, - "grad_norm": 2.7453088605805616, - "language_loss": 0.82679987, - "learning_rate": 2.8733024639121277e-06, - "loss": 0.84797096, - "num_input_tokens_seen": 134101855, - "step": 6245, - "time_per_iteration": 4.745215654373169 - }, - { - "auxiliary_loss_clip": 0.01096909, - "auxiliary_loss_mlp": 0.0104658, - "balance_loss_clip": 1.04049206, - "balance_loss_mlp": 1.0296756, - "epoch": 0.3755298361641365, - "flos": 19390900798080.0, - "grad_norm": 8.46557879021872, - "language_loss": 0.63902843, - "learning_rate": 2.8729520756052853e-06, - "loss": 0.66046333, - "num_input_tokens_seen": 134119360, - "step": 6246, - "time_per_iteration": 4.33053731918335 - }, - { - "auxiliary_loss_clip": 0.01112093, - "auxiliary_loss_mlp": 0.0104355, - "balance_loss_clip": 1.04961443, - "balance_loss_mlp": 1.0264082, - "epoch": 0.37558995941680445, - "flos": 14720395069440.0, - "grad_norm": 2.0508038288587183, - "language_loss": 0.74467009, - "learning_rate": 2.8726016541960124e-06, - "loss": 0.76622653, - "num_input_tokens_seen": 134137475, - "step": 6247, - "time_per_iteration": 2.688081979751587 - }, - { - "auxiliary_loss_clip": 0.01126872, - "auxiliary_loss_mlp": 0.01037368, - "balance_loss_clip": 1.05022037, - "balance_loss_mlp": 1.02133489, - "epoch": 0.3756500826694724, - "flos": 21689901861120.0, - "grad_norm": 2.703785960910372, - "language_loss": 0.5497098, - "learning_rate": 2.872251199697598e-06, - "loss": 0.57135224, - "num_input_tokens_seen": 134154580, - "step": 6248, - "time_per_iteration": 2.6308822631835938 - }, - { - "auxiliary_loss_clip": 0.01117073, - "auxiliary_loss_mlp": 0.01036379, - "balance_loss_clip": 1.04465234, - "balance_loss_mlp": 1.0200597, - "epoch": 0.3757102059221404, - "flos": 26505666190080.0, - "grad_norm": 4.209721572066423, - "language_loss": 0.84492457, - "learning_rate": 2.8719007121233297e-06, - "loss": 0.86645913, - "num_input_tokens_seen": 134174285, - "step": 6249, - "time_per_iteration": 2.6539809703826904 - }, - { - "auxiliary_loss_clip": 0.01107733, - "auxiliary_loss_mlp": 0.01035495, - "balance_loss_clip": 1.04784632, - "balance_loss_mlp": 1.01956248, - "epoch": 0.37577032917480835, - "flos": 37338083274240.0, - "grad_norm": 1.546160982958922, - "language_loss": 0.67701882, - "learning_rate": 2.8715501914864993e-06, - "loss": 0.69845104, - "num_input_tokens_seen": 134195940, - "step": 6250, - "time_per_iteration": 2.787398338317871 - }, - { - "auxiliary_loss_clip": 0.01117019, - "auxiliary_loss_mlp": 0.01044359, - "balance_loss_clip": 1.04946029, - "balance_loss_mlp": 1.0293386, - "epoch": 0.3758304524274763, - "flos": 21908597817600.0, - "grad_norm": 1.960309683567346, - "language_loss": 0.77824795, - "learning_rate": 2.8711996378003987e-06, - "loss": 0.79986179, - "num_input_tokens_seen": 134212235, - "step": 6251, - "time_per_iteration": 2.7143123149871826 - }, - { - "auxiliary_loss_clip": 0.01121024, - "auxiliary_loss_mlp": 0.01039102, - "balance_loss_clip": 1.04994178, - "balance_loss_mlp": 1.0236522, - "epoch": 0.3758905756801443, - "flos": 36569343375360.0, - "grad_norm": 2.527016245081176, - "language_loss": 0.58002663, - "learning_rate": 2.8708490510783203e-06, - "loss": 0.60162789, - "num_input_tokens_seen": 134233810, - "step": 6252, - "time_per_iteration": 2.716597557067871 - }, - { - "auxiliary_loss_clip": 0.01116459, - "auxiliary_loss_mlp": 0.01042556, - "balance_loss_clip": 1.05007291, - "balance_loss_mlp": 1.0260098, - "epoch": 0.37595069893281224, - "flos": 24528783317760.0, - "grad_norm": 4.856643583290163, - "language_loss": 0.89482141, - "learning_rate": 2.8704984313335584e-06, - "loss": 0.91641152, - "num_input_tokens_seen": 134252020, - "step": 6253, - "time_per_iteration": 2.701361894607544 - }, - { - "auxiliary_loss_clip": 0.01098154, - "auxiliary_loss_mlp": 0.01040398, - "balance_loss_clip": 1.04815936, - "balance_loss_mlp": 1.02562761, - "epoch": 0.3760108221854802, - "flos": 16435021766400.0, - "grad_norm": 2.218099502464204, - "language_loss": 0.76568806, - "learning_rate": 2.8701477785794097e-06, - "loss": 0.78707361, - "num_input_tokens_seen": 134269495, - "step": 6254, - "time_per_iteration": 2.6995303630828857 - }, - { - "auxiliary_loss_clip": 0.01096995, - "auxiliary_loss_mlp": 0.01043484, - "balance_loss_clip": 1.04379475, - "balance_loss_mlp": 1.02628207, - "epoch": 0.37607094543814823, - "flos": 13771742924160.0, - "grad_norm": 2.131769376763656, - "language_loss": 0.6180023, - "learning_rate": 2.869797092829169e-06, - "loss": 0.6394071, - "num_input_tokens_seen": 134287035, - "step": 6255, - "time_per_iteration": 2.7164864540100098 - }, - { - "auxiliary_loss_clip": 0.01127282, - "auxiliary_loss_mlp": 0.01036673, - "balance_loss_clip": 1.04883361, - "balance_loss_mlp": 1.02017426, - "epoch": 0.3761310686908162, - "flos": 19857918453120.0, - "grad_norm": 2.6629341180561545, - "language_loss": 0.74404681, - "learning_rate": 2.869446374096135e-06, - "loss": 0.76568639, - "num_input_tokens_seen": 134304840, - "step": 6256, - "time_per_iteration": 2.588169574737549 - }, - { - "auxiliary_loss_clip": 0.01127124, - "auxiliary_loss_mlp": 0.01046358, - "balance_loss_clip": 1.04913831, - "balance_loss_mlp": 1.02977645, - "epoch": 0.37619119194348416, - "flos": 12750802657920.0, - "grad_norm": 2.3087979716808937, - "language_loss": 0.702447, - "learning_rate": 2.8690956223936088e-06, - "loss": 0.72418177, - "num_input_tokens_seen": 134323180, - "step": 6257, - "time_per_iteration": 2.701555013656616 - }, - { - "auxiliary_loss_clip": 0.01110787, - "auxiliary_loss_mlp": 0.01033343, - "balance_loss_clip": 1.04812109, - "balance_loss_mlp": 1.01796508, - "epoch": 0.3762513151961521, - "flos": 17530548624000.0, - "grad_norm": 1.673769537318751, - "language_loss": 0.84842372, - "learning_rate": 2.868744837734889e-06, - "loss": 0.86986494, - "num_input_tokens_seen": 134341390, - "step": 6258, - "time_per_iteration": 2.6336703300476074 - }, - { - "auxiliary_loss_clip": 0.01091689, - "auxiliary_loss_mlp": 0.01041654, - "balance_loss_clip": 1.04571128, - "balance_loss_mlp": 1.0271697, - "epoch": 0.3763114384488201, - "flos": 23617406511360.0, - "grad_norm": 1.4940028515654036, - "language_loss": 0.80920124, - "learning_rate": 2.868394020133277e-06, - "loss": 0.83053464, - "num_input_tokens_seen": 134360425, - "step": 6259, - "time_per_iteration": 2.752392053604126 - }, - { - "auxiliary_loss_clip": 0.01093234, - "auxiliary_loss_mlp": 0.01046443, - "balance_loss_clip": 1.04547083, - "balance_loss_mlp": 1.02969444, - "epoch": 0.37637156170148806, - "flos": 25406978935680.0, - "grad_norm": 2.4951694968605627, - "language_loss": 0.71285564, - "learning_rate": 2.8680431696020783e-06, - "loss": 0.73425239, - "num_input_tokens_seen": 134379775, - "step": 6260, - "time_per_iteration": 2.782561779022217 - }, - { - "auxiliary_loss_clip": 0.01107136, - "auxiliary_loss_mlp": 0.01039319, - "balance_loss_clip": 1.04386747, - "balance_loss_mlp": 1.02305889, - "epoch": 0.376431684954156, - "flos": 23440906056960.0, - "grad_norm": 1.627422352949978, - "language_loss": 0.78342533, - "learning_rate": 2.867692286154594e-06, - "loss": 0.80488986, - "num_input_tokens_seen": 134400315, - "step": 6261, - "time_per_iteration": 2.6978867053985596 - }, - { - "auxiliary_loss_clip": 0.01112259, - "auxiliary_loss_mlp": 0.01048861, - "balance_loss_clip": 1.04744315, - "balance_loss_mlp": 1.0312773, - "epoch": 0.376491808206824, - "flos": 34204482725760.0, - "grad_norm": 2.418447947297228, - "language_loss": 0.80871278, - "learning_rate": 2.867341369804132e-06, - "loss": 0.83032399, - "num_input_tokens_seen": 134422875, - "step": 6262, - "time_per_iteration": 2.852675437927246 - }, - { - "auxiliary_loss_clip": 0.01115101, - "auxiliary_loss_mlp": 0.01038136, - "balance_loss_clip": 1.04584765, - "balance_loss_mlp": 1.02277565, - "epoch": 0.37655193145949195, - "flos": 35185669614720.0, - "grad_norm": 2.9875520790285774, - "language_loss": 0.80295742, - "learning_rate": 2.866990420563998e-06, - "loss": 0.82448983, - "num_input_tokens_seen": 134443025, - "step": 6263, - "time_per_iteration": 2.785395622253418 - }, - { - "auxiliary_loss_clip": 0.01140252, - "auxiliary_loss_mlp": 0.01045838, - "balance_loss_clip": 1.05247605, - "balance_loss_mlp": 1.0300312, - "epoch": 0.3766120547121599, - "flos": 16761844638720.0, - "grad_norm": 2.896352989954936, - "language_loss": 0.79601765, - "learning_rate": 2.866639438447501e-06, - "loss": 0.81787854, - "num_input_tokens_seen": 134460945, - "step": 6264, - "time_per_iteration": 2.581125497817993 - }, - { - "auxiliary_loss_clip": 0.01133548, - "auxiliary_loss_mlp": 0.0105155, - "balance_loss_clip": 1.04770851, - "balance_loss_mlp": 1.03557551, - "epoch": 0.3766721779648279, - "flos": 23550361776000.0, - "grad_norm": 2.0921625870578913, - "language_loss": 0.73808366, - "learning_rate": 2.8662884234679497e-06, - "loss": 0.75993466, - "num_input_tokens_seen": 134480440, - "step": 6265, - "time_per_iteration": 2.6998226642608643 - }, - { - "auxiliary_loss_clip": 0.01123221, - "auxiliary_loss_mlp": 0.0103937, - "balance_loss_clip": 1.05005145, - "balance_loss_mlp": 1.02543402, - "epoch": 0.37673230121749585, - "flos": 29129191655040.0, - "grad_norm": 1.9744000825782282, - "language_loss": 0.68550873, - "learning_rate": 2.865937375638654e-06, - "loss": 0.70713472, - "num_input_tokens_seen": 134501110, - "step": 6266, - "time_per_iteration": 2.6934731006622314 - }, - { - "auxiliary_loss_clip": 0.01128105, - "auxiliary_loss_mlp": 0.01041187, - "balance_loss_clip": 1.04846668, - "balance_loss_mlp": 1.02536833, - "epoch": 0.3767924244701638, - "flos": 28146783703680.0, - "grad_norm": 3.437883319374573, - "language_loss": 0.63078731, - "learning_rate": 2.8655862949729264e-06, - "loss": 0.65248024, - "num_input_tokens_seen": 134522460, - "step": 6267, - "time_per_iteration": 2.7006735801696777 - }, - { - "auxiliary_loss_clip": 0.01050407, - "auxiliary_loss_mlp": 0.01011452, - "balance_loss_clip": 1.02822745, - "balance_loss_mlp": 1.00960469, - "epoch": 0.37685254772283183, - "flos": 60797197526400.0, - "grad_norm": 0.7198108741876666, - "language_loss": 0.58852816, - "learning_rate": 2.8652351814840795e-06, - "loss": 0.60914677, - "num_input_tokens_seen": 134589545, - "step": 6268, - "time_per_iteration": 3.355120897293091 - }, - { - "auxiliary_loss_clip": 0.011375, - "auxiliary_loss_mlp": 0.01043603, - "balance_loss_clip": 1.05033755, - "balance_loss_mlp": 1.02698505, - "epoch": 0.3769126709754998, - "flos": 26032543223040.0, - "grad_norm": 2.34128493463531, - "language_loss": 0.65263468, - "learning_rate": 2.8648840351854283e-06, - "loss": 0.67444575, - "num_input_tokens_seen": 134610550, - "step": 6269, - "time_per_iteration": 2.656585931777954 - }, - { - "auxiliary_loss_clip": 0.01099912, - "auxiliary_loss_mlp": 0.01041008, - "balance_loss_clip": 1.04970932, - "balance_loss_mlp": 1.02536798, - "epoch": 0.37697279422816776, - "flos": 23579879777280.0, - "grad_norm": 1.5250715006737088, - "language_loss": 0.7069717, - "learning_rate": 2.8645328560902874e-06, - "loss": 0.72838092, - "num_input_tokens_seen": 134630485, - "step": 6270, - "time_per_iteration": 2.7498419284820557 - }, - { - "auxiliary_loss_clip": 0.01059818, - "auxiliary_loss_mlp": 0.01007405, - "balance_loss_clip": 1.02900875, - "balance_loss_mlp": 1.00581956, - "epoch": 0.3770329174808357, - "flos": 64745935367040.0, - "grad_norm": 0.7193704591933474, - "language_loss": 0.56122422, - "learning_rate": 2.8641816442119746e-06, - "loss": 0.58189648, - "num_input_tokens_seen": 134693510, - "step": 6271, - "time_per_iteration": 3.1569089889526367 - }, - { - "auxiliary_loss_clip": 0.01121208, - "auxiliary_loss_mlp": 0.01042721, - "balance_loss_clip": 1.04645681, - "balance_loss_mlp": 1.02609181, - "epoch": 0.3770930407335037, - "flos": 21835304115840.0, - "grad_norm": 2.1611051517344246, - "language_loss": 0.79855239, - "learning_rate": 2.8638303995638066e-06, - "loss": 0.82019162, - "num_input_tokens_seen": 134713115, - "step": 6272, - "time_per_iteration": 2.628180742263794 - }, - { - "auxiliary_loss_clip": 0.01118748, - "auxiliary_loss_mlp": 0.01033695, - "balance_loss_clip": 1.0451988, - "balance_loss_mlp": 1.01934206, - "epoch": 0.37715316398617166, - "flos": 22747901984640.0, - "grad_norm": 2.0954681641544304, - "language_loss": 0.73789483, - "learning_rate": 2.863479122159103e-06, - "loss": 0.75941932, - "num_input_tokens_seen": 134732635, - "step": 6273, - "time_per_iteration": 2.7064390182495117 - }, - { - "auxiliary_loss_clip": 0.01117899, - "auxiliary_loss_mlp": 0.01044408, - "balance_loss_clip": 1.04745209, - "balance_loss_mlp": 1.02905381, - "epoch": 0.3772132872388396, - "flos": 18914581520640.0, - "grad_norm": 1.6440580648783938, - "language_loss": 0.71867502, - "learning_rate": 2.8631278120111858e-06, - "loss": 0.74029803, - "num_input_tokens_seen": 134750695, - "step": 6274, - "time_per_iteration": 2.650559186935425 - }, - { - "auxiliary_loss_clip": 0.01105418, - "auxiliary_loss_mlp": 0.01040714, - "balance_loss_clip": 1.04509926, - "balance_loss_mlp": 1.02567029, - "epoch": 0.3772734104915076, - "flos": 17346219004800.0, - "grad_norm": 1.9251108643001593, - "language_loss": 0.83620244, - "learning_rate": 2.8627764691333742e-06, - "loss": 0.85766381, - "num_input_tokens_seen": 134768935, - "step": 6275, - "time_per_iteration": 2.662346839904785 - }, - { - "auxiliary_loss_clip": 0.01077547, - "auxiliary_loss_mlp": 0.01035941, - "balance_loss_clip": 1.04383206, - "balance_loss_mlp": 1.02238655, - "epoch": 0.37733353374417555, - "flos": 32342370785280.0, - "grad_norm": 1.4850375213112275, - "language_loss": 0.75779188, - "learning_rate": 2.8624250935389935e-06, - "loss": 0.77892679, - "num_input_tokens_seen": 134791260, - "step": 6276, - "time_per_iteration": 2.824374198913574 - }, - { - "auxiliary_loss_clip": 0.01109985, - "auxiliary_loss_mlp": 0.01039728, - "balance_loss_clip": 1.04301822, - "balance_loss_mlp": 1.02318192, - "epoch": 0.3773936569968435, - "flos": 23360681030400.0, - "grad_norm": 1.996464283971086, - "language_loss": 0.85758084, - "learning_rate": 2.862073685241366e-06, - "loss": 0.87907803, - "num_input_tokens_seen": 134808350, - "step": 6277, - "time_per_iteration": 2.6880812644958496 - }, - { - "auxiliary_loss_clip": 0.01123239, - "auxiliary_loss_mlp": 0.01035838, - "balance_loss_clip": 1.04981339, - "balance_loss_mlp": 1.02147365, - "epoch": 0.3774537802495115, - "flos": 21466788531840.0, - "grad_norm": 2.8692620956149613, - "language_loss": 0.78788501, - "learning_rate": 2.861722244253818e-06, - "loss": 0.80947578, - "num_input_tokens_seen": 134826005, - "step": 6278, - "time_per_iteration": 2.6566152572631836 - }, - { - "auxiliary_loss_clip": 0.01104603, - "auxiliary_loss_mlp": 0.01044359, - "balance_loss_clip": 1.04592609, - "balance_loss_mlp": 1.02740717, - "epoch": 0.37751390350217945, - "flos": 24973717086720.0, - "grad_norm": 2.420687530183356, - "language_loss": 0.8289634, - "learning_rate": 2.8613707705896767e-06, - "loss": 0.85045302, - "num_input_tokens_seen": 134844995, - "step": 6279, - "time_per_iteration": 2.732966899871826 - }, - { - "auxiliary_loss_clip": 0.01110227, - "auxiliary_loss_mlp": 0.01039275, - "balance_loss_clip": 1.04498839, - "balance_loss_mlp": 1.02520263, - "epoch": 0.3775740267548474, - "flos": 27819098904960.0, - "grad_norm": 5.36242068768128, - "language_loss": 0.74968797, - "learning_rate": 2.861019264262269e-06, - "loss": 0.77118295, - "num_input_tokens_seen": 134865285, - "step": 6280, - "time_per_iteration": 4.266780376434326 - }, - { - "auxiliary_loss_clip": 0.01130932, - "auxiliary_loss_mlp": 0.01036032, - "balance_loss_clip": 1.04845715, - "balance_loss_mlp": 1.02235854, - "epoch": 0.3776341500075154, - "flos": 22565224391040.0, - "grad_norm": 1.4530407212668277, - "language_loss": 0.76169163, - "learning_rate": 2.8606677252849242e-06, - "loss": 0.7833612, - "num_input_tokens_seen": 134886535, - "step": 6281, - "time_per_iteration": 2.649930477142334 - }, - { - "auxiliary_loss_clip": 0.01101629, - "auxiliary_loss_mlp": 0.01040327, - "balance_loss_clip": 1.04291892, - "balance_loss_mlp": 1.02471018, - "epoch": 0.3776942732601834, - "flos": 23077238808960.0, - "grad_norm": 2.430303484367767, - "language_loss": 0.83814883, - "learning_rate": 2.860316153670974e-06, - "loss": 0.85956836, - "num_input_tokens_seen": 134907435, - "step": 6282, - "time_per_iteration": 2.6882312297821045 - }, - { - "auxiliary_loss_clip": 0.0111945, - "auxiliary_loss_mlp": 0.0103679, - "balance_loss_clip": 1.04452085, - "balance_loss_mlp": 1.02134025, - "epoch": 0.37775439651285136, - "flos": 21724411852800.0, - "grad_norm": 2.5787880774083725, - "language_loss": 0.698241, - "learning_rate": 2.8599645494337484e-06, - "loss": 0.71980345, - "num_input_tokens_seen": 134925360, - "step": 6283, - "time_per_iteration": 4.2020978927612305 - }, - { - "auxiliary_loss_clip": 0.01072442, - "auxiliary_loss_mlp": 0.01052062, - "balance_loss_clip": 1.04226279, - "balance_loss_mlp": 1.03394175, - "epoch": 0.37781451976551933, - "flos": 23987753688960.0, - "grad_norm": 2.007181392308561, - "language_loss": 0.76503819, - "learning_rate": 2.859612912586581e-06, - "loss": 0.78628325, - "num_input_tokens_seen": 134944205, - "step": 6284, - "time_per_iteration": 4.349794387817383 - }, - { - "auxiliary_loss_clip": 0.01142581, - "auxiliary_loss_mlp": 0.01033355, - "balance_loss_clip": 1.05249381, - "balance_loss_mlp": 1.01713097, - "epoch": 0.3778746430181873, - "flos": 13727967223680.0, - "grad_norm": 2.7318562260547554, - "language_loss": 0.85677552, - "learning_rate": 2.8592612431428055e-06, - "loss": 0.87853491, - "num_input_tokens_seen": 134960255, - "step": 6285, - "time_per_iteration": 2.6949870586395264 - }, - { - "auxiliary_loss_clip": 0.01111269, - "auxiliary_loss_mlp": 0.01042933, - "balance_loss_clip": 1.04731882, - "balance_loss_mlp": 1.02694702, - "epoch": 0.37793476627085526, - "flos": 19460495399040.0, - "grad_norm": 1.8544385642750592, - "language_loss": 0.84419537, - "learning_rate": 2.858909541115758e-06, - "loss": 0.86573738, - "num_input_tokens_seen": 134978605, - "step": 6286, - "time_per_iteration": 4.541024684906006 - }, - { - "auxiliary_loss_clip": 0.01120151, - "auxiliary_loss_mlp": 0.01043503, - "balance_loss_clip": 1.05024576, - "balance_loss_mlp": 1.0280652, - "epoch": 0.3779948895235232, - "flos": 10707018704640.0, - "grad_norm": 2.400905995704231, - "language_loss": 0.81738019, - "learning_rate": 2.858557806518775e-06, - "loss": 0.83901674, - "num_input_tokens_seen": 134995020, - "step": 6287, - "time_per_iteration": 2.6611125469207764 - }, - { - "auxiliary_loss_clip": 0.01118978, - "auxiliary_loss_mlp": 0.01041796, - "balance_loss_clip": 1.04537022, - "balance_loss_mlp": 1.02645934, - "epoch": 0.3780550127761912, - "flos": 22310007281280.0, - "grad_norm": 3.0671932020533133, - "language_loss": 0.73071134, - "learning_rate": 2.8582060393651927e-06, - "loss": 0.7523191, - "num_input_tokens_seen": 135012620, - "step": 6288, - "time_per_iteration": 2.6759073734283447 - }, - { - "auxiliary_loss_clip": 0.01124666, - "auxiliary_loss_mlp": 0.01036773, - "balance_loss_clip": 1.05113983, - "balance_loss_mlp": 1.02115071, - "epoch": 0.37811513602885916, - "flos": 28950644125440.0, - "grad_norm": 1.9644960153972613, - "language_loss": 0.75616127, - "learning_rate": 2.857854239668352e-06, - "loss": 0.77777576, - "num_input_tokens_seen": 135033365, - "step": 6289, - "time_per_iteration": 2.656367778778076 - }, - { - "auxiliary_loss_clip": 0.0112159, - "auxiliary_loss_mlp": 0.01035617, - "balance_loss_clip": 1.04737473, - "balance_loss_mlp": 1.02025056, - "epoch": 0.3781752592815271, - "flos": 23112933949440.0, - "grad_norm": 1.7941331023092641, - "language_loss": 0.73271513, - "learning_rate": 2.857502407441593e-06, - "loss": 0.75428718, - "num_input_tokens_seen": 135052185, - "step": 6290, - "time_per_iteration": 2.740370512008667 - }, - { - "auxiliary_loss_clip": 0.01098389, - "auxiliary_loss_mlp": 0.01041015, - "balance_loss_clip": 1.04425681, - "balance_loss_mlp": 1.023193, - "epoch": 0.3782353825341951, - "flos": 19755932762880.0, - "grad_norm": 8.943174604406142, - "language_loss": 0.79843229, - "learning_rate": 2.8571505426982566e-06, - "loss": 0.81982636, - "num_input_tokens_seen": 135070425, - "step": 6291, - "time_per_iteration": 2.729116916656494 - }, - { - "auxiliary_loss_clip": 0.01101536, - "auxiliary_loss_mlp": 0.01032627, - "balance_loss_clip": 1.04736066, - "balance_loss_mlp": 1.01611638, - "epoch": 0.37829550578686305, - "flos": 22050839675520.0, - "grad_norm": 2.1381581001103203, - "language_loss": 0.76017123, - "learning_rate": 2.8567986454516854e-06, - "loss": 0.78151298, - "num_input_tokens_seen": 135090525, - "step": 6292, - "time_per_iteration": 2.7115557193756104 - }, - { - "auxiliary_loss_clip": 0.0111659, - "auxiliary_loss_mlp": 0.01045333, - "balance_loss_clip": 1.04599166, - "balance_loss_mlp": 1.02922773, - "epoch": 0.378355629039531, - "flos": 16470357770880.0, - "grad_norm": 2.0329947363530616, - "language_loss": 0.69857049, - "learning_rate": 2.856446715715224e-06, - "loss": 0.72018969, - "num_input_tokens_seen": 135109575, - "step": 6293, - "time_per_iteration": 2.6687965393066406 - }, - { - "auxiliary_loss_clip": 0.01133204, - "auxiliary_loss_mlp": 0.01039264, - "balance_loss_clip": 1.04852223, - "balance_loss_mlp": 1.02307534, - "epoch": 0.378415752292199, - "flos": 19974844200960.0, - "grad_norm": 2.030259976194038, - "language_loss": 0.70870757, - "learning_rate": 2.8560947535022173e-06, - "loss": 0.73043227, - "num_input_tokens_seen": 135127000, - "step": 6294, - "time_per_iteration": 2.600249767303467 - }, - { - "auxiliary_loss_clip": 0.01115678, - "auxiliary_loss_mlp": 0.01040569, - "balance_loss_clip": 1.04706097, - "balance_loss_mlp": 1.02365303, - "epoch": 0.378475875544867, - "flos": 14647388676480.0, - "grad_norm": 4.788626069957177, - "language_loss": 0.82803214, - "learning_rate": 2.855742758826011e-06, - "loss": 0.84959471, - "num_input_tokens_seen": 135145285, - "step": 6295, - "time_per_iteration": 2.656090497970581 - }, - { - "auxiliary_loss_clip": 0.0111937, - "auxiliary_loss_mlp": 0.0103653, - "balance_loss_clip": 1.04782999, - "balance_loss_mlp": 1.02058005, - "epoch": 0.37853599879753497, - "flos": 26650996617600.0, - "grad_norm": 9.577751233202987, - "language_loss": 0.71744889, - "learning_rate": 2.8553907316999547e-06, - "loss": 0.73900783, - "num_input_tokens_seen": 135165240, - "step": 6296, - "time_per_iteration": 2.6698925495147705 - }, - { - "auxiliary_loss_clip": 0.01134516, - "auxiliary_loss_mlp": 0.01043376, - "balance_loss_clip": 1.05133939, - "balance_loss_mlp": 1.02771211, - "epoch": 0.37859612205020293, - "flos": 17311960408320.0, - "grad_norm": 3.288847845161644, - "language_loss": 0.76889098, - "learning_rate": 2.855038672137396e-06, - "loss": 0.79066986, - "num_input_tokens_seen": 135184045, - "step": 6297, - "time_per_iteration": 2.629037380218506 - }, - { - "auxiliary_loss_clip": 0.01109354, - "auxiliary_loss_mlp": 0.01038115, - "balance_loss_clip": 1.04526067, - "balance_loss_mlp": 1.02226055, - "epoch": 0.3786562453028709, - "flos": 18220392299520.0, - "grad_norm": 1.9191527971099975, - "language_loss": 0.79743183, - "learning_rate": 2.854686580151684e-06, - "loss": 0.81890655, - "num_input_tokens_seen": 135202365, - "step": 6298, - "time_per_iteration": 2.673081874847412 - }, - { - "auxiliary_loss_clip": 0.01075918, - "auxiliary_loss_mlp": 0.01051187, - "balance_loss_clip": 1.04113722, - "balance_loss_mlp": 1.03267384, - "epoch": 0.37871636855553886, - "flos": 21214875473280.0, - "grad_norm": 1.8248163373816215, - "language_loss": 0.84369445, - "learning_rate": 2.8543344557561722e-06, - "loss": 0.86496556, - "num_input_tokens_seen": 135220955, - "step": 6299, - "time_per_iteration": 2.748072862625122 - }, - { - "auxiliary_loss_clip": 0.01104171, - "auxiliary_loss_mlp": 0.01036156, - "balance_loss_clip": 1.0473597, - "balance_loss_mlp": 1.02021194, - "epoch": 0.3787764918082068, - "flos": 20952727038720.0, - "grad_norm": 2.2683019346862587, - "language_loss": 0.76286763, - "learning_rate": 2.8539822989642116e-06, - "loss": 0.78427088, - "num_input_tokens_seen": 135239715, - "step": 6300, - "time_per_iteration": 2.742335796356201 - }, - { - "auxiliary_loss_clip": 0.01118244, - "auxiliary_loss_mlp": 0.01037884, - "balance_loss_clip": 1.04743147, - "balance_loss_mlp": 1.01999068, - "epoch": 0.3788366150608748, - "flos": 17308009912320.0, - "grad_norm": 2.2544575031135863, - "language_loss": 0.82409781, - "learning_rate": 2.8536301097891577e-06, - "loss": 0.84565908, - "num_input_tokens_seen": 135257035, - "step": 6301, - "time_per_iteration": 2.6785736083984375 - }, - { - "auxiliary_loss_clip": 0.01120863, - "auxiliary_loss_mlp": 0.01039969, - "balance_loss_clip": 1.04765666, - "balance_loss_mlp": 1.02410781, - "epoch": 0.37889673831354276, - "flos": 24311092942080.0, - "grad_norm": 2.7886341766039466, - "language_loss": 0.67584914, - "learning_rate": 2.8532778882443636e-06, - "loss": 0.69745743, - "num_input_tokens_seen": 135275720, - "step": 6302, - "time_per_iteration": 2.677690029144287 - }, - { - "auxiliary_loss_clip": 0.01090953, - "auxiliary_loss_mlp": 0.01043064, - "balance_loss_clip": 1.04460323, - "balance_loss_mlp": 1.02736425, - "epoch": 0.3789568615662107, - "flos": 26683603188480.0, - "grad_norm": 1.752291551629032, - "language_loss": 0.68745166, - "learning_rate": 2.8529256343431867e-06, - "loss": 0.70879185, - "num_input_tokens_seen": 135294140, - "step": 6303, - "time_per_iteration": 2.8387813568115234 - }, - { - "auxiliary_loss_clip": 0.01133092, - "auxiliary_loss_mlp": 0.01039166, - "balance_loss_clip": 1.04745388, - "balance_loss_mlp": 1.02412772, - "epoch": 0.3790169848188787, - "flos": 23585194990080.0, - "grad_norm": 1.8875159078783896, - "language_loss": 0.77695227, - "learning_rate": 2.8525733480989846e-06, - "loss": 0.79867482, - "num_input_tokens_seen": 135314845, - "step": 6304, - "time_per_iteration": 2.673499584197998 - }, - { - "auxiliary_loss_clip": 0.01145067, - "auxiliary_loss_mlp": 0.01040581, - "balance_loss_clip": 1.05417812, - "balance_loss_mlp": 1.02412987, - "epoch": 0.37907710807154665, - "flos": 18437436230400.0, - "grad_norm": 2.779181085633227, - "language_loss": 0.79659361, - "learning_rate": 2.8522210295251146e-06, - "loss": 0.81845009, - "num_input_tokens_seen": 135333055, - "step": 6305, - "time_per_iteration": 2.5770838260650635 - }, - { - "auxiliary_loss_clip": 0.01046795, - "auxiliary_loss_mlp": 0.01001141, - "balance_loss_clip": 1.02554131, - "balance_loss_mlp": 0.99954396, - "epoch": 0.3791372313242146, - "flos": 50107165954560.0, - "grad_norm": 0.9814261912828969, - "language_loss": 0.64473259, - "learning_rate": 2.8518686786349387e-06, - "loss": 0.66521198, - "num_input_tokens_seen": 135387865, - "step": 6306, - "time_per_iteration": 3.0782721042633057 - }, - { - "auxiliary_loss_clip": 0.01111605, - "auxiliary_loss_mlp": 0.01058558, - "balance_loss_clip": 1.04987538, - "balance_loss_mlp": 1.03932941, - "epoch": 0.3791973545768826, - "flos": 24316551809280.0, - "grad_norm": 3.4757923579383343, - "language_loss": 0.73271245, - "learning_rate": 2.851516295441817e-06, - "loss": 0.75441408, - "num_input_tokens_seen": 135409095, - "step": 6307, - "time_per_iteration": 2.756335973739624 - }, - { - "auxiliary_loss_clip": 0.01112868, - "auxiliary_loss_mlp": 0.01041837, - "balance_loss_clip": 1.04757965, - "balance_loss_mlp": 1.02545738, - "epoch": 0.3792574778295506, - "flos": 21579907438080.0, - "grad_norm": 1.5984922637838355, - "language_loss": 0.78426826, - "learning_rate": 2.851163879959112e-06, - "loss": 0.80581522, - "num_input_tokens_seen": 135429585, - "step": 6308, - "time_per_iteration": 2.7782399654388428 - }, - { - "auxiliary_loss_clip": 0.01099815, - "auxiliary_loss_mlp": 0.01047567, - "balance_loss_clip": 1.04646075, - "balance_loss_mlp": 1.03061557, - "epoch": 0.37931760108221857, - "flos": 22272731942400.0, - "grad_norm": 30.20771720098995, - "language_loss": 0.72349942, - "learning_rate": 2.8508114322001876e-06, - "loss": 0.74497324, - "num_input_tokens_seen": 135446320, - "step": 6309, - "time_per_iteration": 2.779332399368286 - }, - { - "auxiliary_loss_clip": 0.0107726, - "auxiliary_loss_mlp": 0.01047463, - "balance_loss_clip": 1.04217935, - "balance_loss_mlp": 1.03061867, - "epoch": 0.37937772433488653, - "flos": 19682998197120.0, - "grad_norm": 1.3823910789919382, - "language_loss": 0.78832853, - "learning_rate": 2.8504589521784083e-06, - "loss": 0.8095758, - "num_input_tokens_seen": 135465720, - "step": 6310, - "time_per_iteration": 2.771423101425171 - }, - { - "auxiliary_loss_clip": 0.01125039, - "auxiliary_loss_mlp": 0.0077385, - "balance_loss_clip": 1.04667282, - "balance_loss_mlp": 1.00038886, - "epoch": 0.3794378475875545, - "flos": 19099378016640.0, - "grad_norm": 2.0276391959107687, - "language_loss": 0.76350379, - "learning_rate": 2.8501064399071403e-06, - "loss": 0.78249264, - "num_input_tokens_seen": 135485155, - "step": 6311, - "time_per_iteration": 2.6458020210266113 - }, - { - "auxiliary_loss_clip": 0.01111162, - "auxiliary_loss_mlp": 0.01038798, - "balance_loss_clip": 1.04782593, - "balance_loss_mlp": 1.02345526, - "epoch": 0.37949797084022246, - "flos": 20339660684160.0, - "grad_norm": 1.662830094695082, - "language_loss": 0.7082535, - "learning_rate": 2.8497538953997504e-06, - "loss": 0.72975308, - "num_input_tokens_seen": 135502675, - "step": 6312, - "time_per_iteration": 2.719555377960205 - }, - { - "auxiliary_loss_clip": 0.01023104, - "auxiliary_loss_mlp": 0.01013837, - "balance_loss_clip": 1.02154779, - "balance_loss_mlp": 1.0123291, - "epoch": 0.37955809409289043, - "flos": 63972203477760.0, - "grad_norm": 0.7865225154891, - "language_loss": 0.56087357, - "learning_rate": 2.849401318669608e-06, - "loss": 0.58124298, - "num_input_tokens_seen": 135562005, - "step": 6313, - "time_per_iteration": 3.2287843227386475 - }, - { - "auxiliary_loss_clip": 0.01096229, - "auxiliary_loss_mlp": 0.01051812, - "balance_loss_clip": 1.04299724, - "balance_loss_mlp": 1.03592694, - "epoch": 0.3796182173455584, - "flos": 31540665179520.0, - "grad_norm": 1.6673731637282567, - "language_loss": 0.71260917, - "learning_rate": 2.849048709730083e-06, - "loss": 0.73408955, - "num_input_tokens_seen": 135582600, - "step": 6314, - "time_per_iteration": 2.7842931747436523 - }, - { - "auxiliary_loss_clip": 0.01129376, - "auxiliary_loss_mlp": 0.01048605, - "balance_loss_clip": 1.04880047, - "balance_loss_mlp": 1.03201127, - "epoch": 0.37967834059822636, - "flos": 12130804978560.0, - "grad_norm": 2.0299747539506408, - "language_loss": 0.73270208, - "learning_rate": 2.848696068594545e-06, - "loss": 0.75448191, - "num_input_tokens_seen": 135600280, - "step": 6315, - "time_per_iteration": 2.6785545349121094 - }, - { - "auxiliary_loss_clip": 0.01122054, - "auxiliary_loss_mlp": 0.01048691, - "balance_loss_clip": 1.0479691, - "balance_loss_mlp": 1.03326535, - "epoch": 0.3797384638508943, - "flos": 39348578298240.0, - "grad_norm": 2.0273248392275645, - "language_loss": 0.71108794, - "learning_rate": 2.8483433952763677e-06, - "loss": 0.73279542, - "num_input_tokens_seen": 135621560, - "step": 6316, - "time_per_iteration": 2.7634074687957764 - }, - { - "auxiliary_loss_clip": 0.01099766, - "auxiliary_loss_mlp": 0.01041876, - "balance_loss_clip": 1.04686475, - "balance_loss_mlp": 1.02733219, - "epoch": 0.3797985871035623, - "flos": 34054016653440.0, - "grad_norm": 6.091183487708486, - "language_loss": 0.6551193, - "learning_rate": 2.847990689788923e-06, - "loss": 0.67653567, - "num_input_tokens_seen": 135641745, - "step": 6317, - "time_per_iteration": 2.8334715366363525 - }, - { - "auxiliary_loss_clip": 0.01119227, - "auxiliary_loss_mlp": 0.01036315, - "balance_loss_clip": 1.04556906, - "balance_loss_mlp": 1.02204525, - "epoch": 0.37985871035623026, - "flos": 23222174186880.0, - "grad_norm": 2.5588148844770364, - "language_loss": 0.85254991, - "learning_rate": 2.8476379521455877e-06, - "loss": 0.87410533, - "num_input_tokens_seen": 135660650, - "step": 6318, - "time_per_iteration": 2.6611499786376953 - }, - { - "auxiliary_loss_clip": 0.01113843, - "auxiliary_loss_mlp": 0.01046062, - "balance_loss_clip": 1.04669976, - "balance_loss_mlp": 1.02933645, - "epoch": 0.3799188336088982, - "flos": 18114958903680.0, - "grad_norm": 2.5013130494780254, - "language_loss": 0.75813186, - "learning_rate": 2.8472851823597354e-06, - "loss": 0.77973092, - "num_input_tokens_seen": 135679980, - "step": 6319, - "time_per_iteration": 2.643206834793091 - }, - { - "auxiliary_loss_clip": 0.01136645, - "auxiliary_loss_mlp": 0.01043703, - "balance_loss_clip": 1.04961717, - "balance_loss_mlp": 1.02813435, - "epoch": 0.3799789568615662, - "flos": 21871897096320.0, - "grad_norm": 1.6614251909537696, - "language_loss": 0.64298296, - "learning_rate": 2.846932380444744e-06, - "loss": 0.66478646, - "num_input_tokens_seen": 135699400, - "step": 6320, - "time_per_iteration": 4.031519174575806 - }, - { - "auxiliary_loss_clip": 0.01102323, - "auxiliary_loss_mlp": 0.01046665, - "balance_loss_clip": 1.05175698, - "balance_loss_mlp": 1.03132319, - "epoch": 0.3800390801142342, - "flos": 32962943082240.0, - "grad_norm": 2.289587921641626, - "language_loss": 0.713642, - "learning_rate": 2.846579546413992e-06, - "loss": 0.73513186, - "num_input_tokens_seen": 135723455, - "step": 6321, - "time_per_iteration": 2.8465514183044434 - }, - { - "auxiliary_loss_clip": 0.01096183, - "auxiliary_loss_mlp": 0.01042053, - "balance_loss_clip": 1.04067016, - "balance_loss_mlp": 1.02673435, - "epoch": 0.38009920336690217, - "flos": 26907075653760.0, - "grad_norm": 1.7413772853733611, - "language_loss": 0.74461544, - "learning_rate": 2.846226680280859e-06, - "loss": 0.76599777, - "num_input_tokens_seen": 135744335, - "step": 6322, - "time_per_iteration": 4.407487630844116 - }, - { - "auxiliary_loss_clip": 0.01122719, - "auxiliary_loss_mlp": 0.01040835, - "balance_loss_clip": 1.0462966, - "balance_loss_mlp": 1.02587986, - "epoch": 0.38015932661957014, - "flos": 22488913946880.0, - "grad_norm": 3.5770930684707527, - "language_loss": 0.84908414, - "learning_rate": 2.845873782058725e-06, - "loss": 0.87071967, - "num_input_tokens_seen": 135761440, - "step": 6323, - "time_per_iteration": 2.6349892616271973 - }, - { - "auxiliary_loss_clip": 0.01111414, - "auxiliary_loss_mlp": 0.01037556, - "balance_loss_clip": 1.04454303, - "balance_loss_mlp": 1.02075982, - "epoch": 0.3802194498722381, - "flos": 21980993679360.0, - "grad_norm": 5.3693824839272954, - "language_loss": 0.73171353, - "learning_rate": 2.845520851760973e-06, - "loss": 0.75320327, - "num_input_tokens_seen": 135779955, - "step": 6324, - "time_per_iteration": 4.240839958190918 - }, - { - "auxiliary_loss_clip": 0.01105568, - "auxiliary_loss_mlp": 0.01038696, - "balance_loss_clip": 1.04704404, - "balance_loss_mlp": 1.02263856, - "epoch": 0.38027957312490607, - "flos": 21324869896320.0, - "grad_norm": 1.716026134262254, - "language_loss": 0.83859229, - "learning_rate": 2.8451678894009847e-06, - "loss": 0.86003488, - "num_input_tokens_seen": 135799840, - "step": 6325, - "time_per_iteration": 2.72074818611145 - }, - { - "auxiliary_loss_clip": 0.01110489, - "auxiliary_loss_mlp": 0.01035658, - "balance_loss_clip": 1.04811895, - "balance_loss_mlp": 1.02094209, - "epoch": 0.38033969637757403, - "flos": 16691244456960.0, - "grad_norm": 2.0321742163093264, - "language_loss": 0.80093408, - "learning_rate": 2.8448148949921465e-06, - "loss": 0.82239556, - "num_input_tokens_seen": 135817880, - "step": 6326, - "time_per_iteration": 4.313997030258179 - }, - { - "auxiliary_loss_clip": 0.01119893, - "auxiliary_loss_mlp": 0.01038876, - "balance_loss_clip": 1.04593146, - "balance_loss_mlp": 1.02497053, - "epoch": 0.380399819630242, - "flos": 36210847685760.0, - "grad_norm": 1.80559395505396, - "language_loss": 0.72578084, - "learning_rate": 2.844461868547842e-06, - "loss": 0.74736857, - "num_input_tokens_seen": 135838940, - "step": 6327, - "time_per_iteration": 2.7500593662261963 - }, - { - "auxiliary_loss_clip": 0.01134332, - "auxiliary_loss_mlp": 0.00772576, - "balance_loss_clip": 1.04898763, - "balance_loss_mlp": 1.00039506, - "epoch": 0.38045994288290996, - "flos": 21288851533440.0, - "grad_norm": 1.9791898832174752, - "language_loss": 0.83074433, - "learning_rate": 2.844108810081459e-06, - "loss": 0.84981334, - "num_input_tokens_seen": 135858325, - "step": 6328, - "time_per_iteration": 2.7503418922424316 - }, - { - "auxiliary_loss_clip": 0.01119735, - "auxiliary_loss_mlp": 0.01029986, - "balance_loss_clip": 1.04522514, - "balance_loss_mlp": 1.01522779, - "epoch": 0.38052006613557793, - "flos": 20922885815040.0, - "grad_norm": 1.5313878449465446, - "language_loss": 0.61713332, - "learning_rate": 2.843755719606385e-06, - "loss": 0.63863051, - "num_input_tokens_seen": 135878430, - "step": 6329, - "time_per_iteration": 2.682016134262085 - }, - { - "auxiliary_loss_clip": 0.01103557, - "auxiliary_loss_mlp": 0.01040275, - "balance_loss_clip": 1.04332185, - "balance_loss_mlp": 1.02436066, - "epoch": 0.3805801893882459, - "flos": 20990720649600.0, - "grad_norm": 1.9096594726999414, - "language_loss": 0.56007183, - "learning_rate": 2.8434025971360104e-06, - "loss": 0.58151013, - "num_input_tokens_seen": 135894755, - "step": 6330, - "time_per_iteration": 2.6704044342041016 - }, - { - "auxiliary_loss_clip": 0.01088801, - "auxiliary_loss_mlp": 0.01035148, - "balance_loss_clip": 1.04801345, - "balance_loss_mlp": 1.02142704, - "epoch": 0.38064031264091386, - "flos": 25558594243200.0, - "grad_norm": 3.9882905607247046, - "language_loss": 0.65945244, - "learning_rate": 2.8430494426837243e-06, - "loss": 0.6806919, - "num_input_tokens_seen": 135918275, - "step": 6331, - "time_per_iteration": 2.750293731689453 - }, - { - "auxiliary_loss_clip": 0.01120934, - "auxiliary_loss_mlp": 0.01042908, - "balance_loss_clip": 1.05122471, - "balance_loss_mlp": 1.02723169, - "epoch": 0.3807004358935818, - "flos": 15085857997440.0, - "grad_norm": 2.769340057272882, - "language_loss": 0.7601527, - "learning_rate": 2.842696256262919e-06, - "loss": 0.78179109, - "num_input_tokens_seen": 135937430, - "step": 6332, - "time_per_iteration": 2.64774227142334 - }, - { - "auxiliary_loss_clip": 0.01073508, - "auxiliary_loss_mlp": 0.00772959, - "balance_loss_clip": 1.04594767, - "balance_loss_mlp": 1.00029111, - "epoch": 0.3807605591462498, - "flos": 16399398453120.0, - "grad_norm": 2.059894273755589, - "language_loss": 0.8224051, - "learning_rate": 2.842343037886987e-06, - "loss": 0.84086972, - "num_input_tokens_seen": 135954210, - "step": 6333, - "time_per_iteration": 2.7650275230407715 - }, - { - "auxiliary_loss_clip": 0.01121534, - "auxiliary_loss_mlp": 0.01033205, - "balance_loss_clip": 1.04730785, - "balance_loss_mlp": 1.01878643, - "epoch": 0.3808206823989178, - "flos": 29057083102080.0, - "grad_norm": 1.5368445040683132, - "language_loss": 0.8620519, - "learning_rate": 2.8419897875693226e-06, - "loss": 0.88359934, - "num_input_tokens_seen": 135974425, - "step": 6334, - "time_per_iteration": 2.7348363399505615 - }, - { - "auxiliary_loss_clip": 0.01123412, - "auxiliary_loss_mlp": 0.01038067, - "balance_loss_clip": 1.04626036, - "balance_loss_mlp": 1.02280819, - "epoch": 0.3808808056515858, - "flos": 15705855676800.0, - "grad_norm": 1.7714454860846107, - "language_loss": 0.79359698, - "learning_rate": 2.841636505323321e-06, - "loss": 0.81521177, - "num_input_tokens_seen": 135991985, - "step": 6335, - "time_per_iteration": 2.7020695209503174 - }, - { - "auxiliary_loss_clip": 0.01121693, - "auxiliary_loss_mlp": 0.01033758, - "balance_loss_clip": 1.04490542, - "balance_loss_mlp": 1.01847494, - "epoch": 0.38094092890425374, - "flos": 20704584908160.0, - "grad_norm": 1.872444579903983, - "language_loss": 0.72939491, - "learning_rate": 2.8412831911623795e-06, - "loss": 0.75094938, - "num_input_tokens_seen": 136010015, - "step": 6336, - "time_per_iteration": 2.7088463306427 - }, - { - "auxiliary_loss_clip": 0.01117324, - "auxiliary_loss_mlp": 0.01033417, - "balance_loss_clip": 1.04605365, - "balance_loss_mlp": 1.01930285, - "epoch": 0.3810010521569217, - "flos": 20667956014080.0, - "grad_norm": 2.014308937626889, - "language_loss": 0.69164217, - "learning_rate": 2.840929845099894e-06, - "loss": 0.71314949, - "num_input_tokens_seen": 136028440, - "step": 6337, - "time_per_iteration": 2.6832611560821533 - }, - { - "auxiliary_loss_clip": 0.01111033, - "auxiliary_loss_mlp": 0.01036513, - "balance_loss_clip": 1.04483473, - "balance_loss_mlp": 1.02133763, - "epoch": 0.38106117540958967, - "flos": 31827626933760.0, - "grad_norm": 1.9800177042646252, - "language_loss": 0.63416338, - "learning_rate": 2.8405764671492652e-06, - "loss": 0.65563887, - "num_input_tokens_seen": 136048360, - "step": 6338, - "time_per_iteration": 2.8045074939727783 - }, - { - "auxiliary_loss_clip": 0.01112594, - "auxiliary_loss_mlp": 0.01041591, - "balance_loss_clip": 1.04514265, - "balance_loss_mlp": 1.02520001, - "epoch": 0.38112129866225763, - "flos": 16902757693440.0, - "grad_norm": 2.42049576026076, - "language_loss": 0.69146717, - "learning_rate": 2.8402230573238923e-06, - "loss": 0.713009, - "num_input_tokens_seen": 136065500, - "step": 6339, - "time_per_iteration": 2.6873764991760254 - }, - { - "auxiliary_loss_clip": 0.01107753, - "auxiliary_loss_mlp": 0.01047128, - "balance_loss_clip": 1.04493856, - "balance_loss_mlp": 1.03165436, - "epoch": 0.3811814219149256, - "flos": 20887226588160.0, - "grad_norm": 2.484915003961603, - "language_loss": 0.68283296, - "learning_rate": 2.839869615637177e-06, - "loss": 0.70438182, - "num_input_tokens_seen": 136084060, - "step": 6340, - "time_per_iteration": 2.730966567993164 - }, - { - "auxiliary_loss_clip": 0.01098909, - "auxiliary_loss_mlp": 0.01040765, - "balance_loss_clip": 1.0444243, - "balance_loss_mlp": 1.02449322, - "epoch": 0.38124154516759357, - "flos": 16690813493760.0, - "grad_norm": 2.645956512625022, - "language_loss": 0.89689833, - "learning_rate": 2.839516142102522e-06, - "loss": 0.91829509, - "num_input_tokens_seen": 136102310, - "step": 6341, - "time_per_iteration": 2.7552878856658936 - }, - { - "auxiliary_loss_clip": 0.01127861, - "auxiliary_loss_mlp": 0.01042909, - "balance_loss_clip": 1.04863834, - "balance_loss_mlp": 1.02668464, - "epoch": 0.38130166842026153, - "flos": 19681956702720.0, - "grad_norm": 2.1539523414578103, - "language_loss": 0.75359344, - "learning_rate": 2.83916263673333e-06, - "loss": 0.7753011, - "num_input_tokens_seen": 136120725, - "step": 6342, - "time_per_iteration": 2.6937670707702637 - }, - { - "auxiliary_loss_clip": 0.01109868, - "auxiliary_loss_mlp": 0.01035797, - "balance_loss_clip": 1.04506934, - "balance_loss_mlp": 1.02071738, - "epoch": 0.3813617916729295, - "flos": 22198432659840.0, - "grad_norm": 1.797512240627555, - "language_loss": 0.8348105, - "learning_rate": 2.838809099543007e-06, - "loss": 0.85626709, - "num_input_tokens_seen": 136139105, - "step": 6343, - "time_per_iteration": 2.6647467613220215 - }, - { - "auxiliary_loss_clip": 0.01073856, - "auxiliary_loss_mlp": 0.01047466, - "balance_loss_clip": 1.04339314, - "balance_loss_mlp": 1.03099144, - "epoch": 0.38142191492559746, - "flos": 19096899978240.0, - "grad_norm": 1.8507846773973766, - "language_loss": 0.76930642, - "learning_rate": 2.838455530544959e-06, - "loss": 0.7905196, - "num_input_tokens_seen": 136158265, - "step": 6344, - "time_per_iteration": 2.807464838027954 - }, - { - "auxiliary_loss_clip": 0.01099031, - "auxiliary_loss_mlp": 0.01049913, - "balance_loss_clip": 1.04580665, - "balance_loss_mlp": 1.03225255, - "epoch": 0.3814820381782654, - "flos": 24097748112000.0, - "grad_norm": 2.0591822661314847, - "language_loss": 0.73010087, - "learning_rate": 2.838101929752593e-06, - "loss": 0.75159037, - "num_input_tokens_seen": 136176100, - "step": 6345, - "time_per_iteration": 2.756462574005127 - }, - { - "auxiliary_loss_clip": 0.01094565, - "auxiliary_loss_mlp": 0.00771987, - "balance_loss_clip": 1.04568338, - "balance_loss_mlp": 1.00028944, - "epoch": 0.3815421614309334, - "flos": 15778502933760.0, - "grad_norm": 1.8320535118847152, - "language_loss": 0.69709373, - "learning_rate": 2.8377482971793187e-06, - "loss": 0.71575922, - "num_input_tokens_seen": 136195125, - "step": 6346, - "time_per_iteration": 2.7221782207489014 - }, - { - "auxiliary_loss_clip": 0.01124746, - "auxiliary_loss_mlp": 0.01038046, - "balance_loss_clip": 1.04819, - "balance_loss_mlp": 1.02297819, - "epoch": 0.38160228468360136, - "flos": 19899754819200.0, - "grad_norm": 1.9952986193352877, - "language_loss": 0.75480664, - "learning_rate": 2.8373946328385437e-06, - "loss": 0.77643454, - "num_input_tokens_seen": 136213885, - "step": 6347, - "time_per_iteration": 2.646730422973633 - }, - { - "auxiliary_loss_clip": 0.0112204, - "auxiliary_loss_mlp": 0.01039786, - "balance_loss_clip": 1.04638994, - "balance_loss_mlp": 1.0253861, - "epoch": 0.3816624079362694, - "flos": 19281050029440.0, - "grad_norm": 3.670871038619067, - "language_loss": 0.74398822, - "learning_rate": 2.8370409367436813e-06, - "loss": 0.76560652, - "num_input_tokens_seen": 136232700, - "step": 6348, - "time_per_iteration": 2.651153802871704 - }, - { - "auxiliary_loss_clip": 0.01109969, - "auxiliary_loss_mlp": 0.01037685, - "balance_loss_clip": 1.04792547, - "balance_loss_mlp": 1.0233444, - "epoch": 0.38172253118893734, - "flos": 21177564220800.0, - "grad_norm": 2.7978232906816665, - "language_loss": 0.87172502, - "learning_rate": 2.836687208908142e-06, - "loss": 0.89320159, - "num_input_tokens_seen": 136248975, - "step": 6349, - "time_per_iteration": 2.693459987640381 - }, - { - "auxiliary_loss_clip": 0.0112098, - "auxiliary_loss_mlp": 0.01037146, - "balance_loss_clip": 1.04788637, - "balance_loss_mlp": 1.02244771, - "epoch": 0.3817826544416053, - "flos": 17529219820800.0, - "grad_norm": 1.7341599494512197, - "language_loss": 0.76554048, - "learning_rate": 2.836333449345341e-06, - "loss": 0.78712171, - "num_input_tokens_seen": 136266710, - "step": 6350, - "time_per_iteration": 2.6194076538085938 - }, - { - "auxiliary_loss_clip": 0.01104228, - "auxiliary_loss_mlp": 0.01032221, - "balance_loss_clip": 1.04922175, - "balance_loss_mlp": 1.01640153, - "epoch": 0.38184277769427327, - "flos": 16326535714560.0, - "grad_norm": 2.525722230514251, - "language_loss": 0.75608248, - "learning_rate": 2.8359796580686907e-06, - "loss": 0.77744693, - "num_input_tokens_seen": 136284445, - "step": 6351, - "time_per_iteration": 2.723487138748169 - }, - { - "auxiliary_loss_clip": 0.01122109, - "auxiliary_loss_mlp": 0.01037028, - "balance_loss_clip": 1.04607773, - "balance_loss_mlp": 1.02048135, - "epoch": 0.38190290094694124, - "flos": 30443450382720.0, - "grad_norm": 2.201358799690427, - "language_loss": 0.74001205, - "learning_rate": 2.8356258350916085e-06, - "loss": 0.76160336, - "num_input_tokens_seen": 136305730, - "step": 6352, - "time_per_iteration": 2.6779909133911133 - }, - { - "auxiliary_loss_clip": 0.01093469, - "auxiliary_loss_mlp": 0.01035075, - "balance_loss_clip": 1.04185915, - "balance_loss_mlp": 1.02093625, - "epoch": 0.3819630241996092, - "flos": 14209924936320.0, - "grad_norm": 1.7014377772216425, - "language_loss": 0.64249897, - "learning_rate": 2.8352719804275104e-06, - "loss": 0.66378438, - "num_input_tokens_seen": 136323850, - "step": 6353, - "time_per_iteration": 2.731860399246216 - }, - { - "auxiliary_loss_clip": 0.01133265, - "auxiliary_loss_mlp": 0.01039549, - "balance_loss_clip": 1.04809213, - "balance_loss_mlp": 1.02529204, - "epoch": 0.38202314745227717, - "flos": 25009699536000.0, - "grad_norm": 2.7523604394748644, - "language_loss": 0.83447051, - "learning_rate": 2.834918094089816e-06, - "loss": 0.85619861, - "num_input_tokens_seen": 136344880, - "step": 6354, - "time_per_iteration": 2.665891170501709 - }, - { - "auxiliary_loss_clip": 0.01132291, - "auxiliary_loss_mlp": 0.01034862, - "balance_loss_clip": 1.04866302, - "balance_loss_mlp": 1.02162409, - "epoch": 0.38208327070494513, - "flos": 20814507504000.0, - "grad_norm": 16.091226432139102, - "language_loss": 0.80633152, - "learning_rate": 2.834564176091943e-06, - "loss": 0.82800299, - "num_input_tokens_seen": 136366060, - "step": 6355, - "time_per_iteration": 2.6580965518951416 - }, - { - "auxiliary_loss_clip": 0.01092469, - "auxiliary_loss_mlp": 0.01037027, - "balance_loss_clip": 1.04551625, - "balance_loss_mlp": 1.02263832, - "epoch": 0.3821433939576131, - "flos": 22637727993600.0, - "grad_norm": 1.8508447811900344, - "language_loss": 0.75970227, - "learning_rate": 2.8342102264473125e-06, - "loss": 0.78099722, - "num_input_tokens_seen": 136385625, - "step": 6356, - "time_per_iteration": 2.7381057739257812 - }, - { - "auxiliary_loss_clip": 0.01123851, - "auxiliary_loss_mlp": 0.00772749, - "balance_loss_clip": 1.04802036, - "balance_loss_mlp": 1.00034022, - "epoch": 0.38220351721028106, - "flos": 26869872142080.0, - "grad_norm": 2.3854964939919188, - "language_loss": 0.81208009, - "learning_rate": 2.833856245169348e-06, - "loss": 0.8310461, - "num_input_tokens_seen": 136405750, - "step": 6357, - "time_per_iteration": 2.8209376335144043 - }, - { - "auxiliary_loss_clip": 0.01118527, - "auxiliary_loss_mlp": 0.01044748, - "balance_loss_clip": 1.05246222, - "balance_loss_mlp": 1.02842796, - "epoch": 0.38226364046294903, - "flos": 23367468700800.0, - "grad_norm": 2.215929075758269, - "language_loss": 0.77378345, - "learning_rate": 2.8335022322714695e-06, - "loss": 0.79541618, - "num_input_tokens_seen": 136426085, - "step": 6358, - "time_per_iteration": 2.7004640102386475 - }, - { - "auxiliary_loss_clip": 0.01115504, - "auxiliary_loss_mlp": 0.01047061, - "balance_loss_clip": 1.0469476, - "balance_loss_mlp": 1.03118849, - "epoch": 0.382323763715617, - "flos": 19646225648640.0, - "grad_norm": 3.6635579737055837, - "language_loss": 0.78477705, - "learning_rate": 2.8331481877671036e-06, - "loss": 0.80640268, - "num_input_tokens_seen": 136442670, - "step": 6359, - "time_per_iteration": 4.184551954269409 - }, - { - "auxiliary_loss_clip": 0.01065181, - "auxiliary_loss_mlp": 0.01052018, - "balance_loss_clip": 1.03820515, - "balance_loss_mlp": 1.03462481, - "epoch": 0.38238388696828496, - "flos": 54124741232640.0, - "grad_norm": 1.6779400536158158, - "language_loss": 0.69735414, - "learning_rate": 2.8327941116696754e-06, - "loss": 0.71852612, - "num_input_tokens_seen": 136465730, - "step": 6360, - "time_per_iteration": 3.1072845458984375 - }, - { - "auxiliary_loss_clip": 0.01102455, - "auxiliary_loss_mlp": 0.01037366, - "balance_loss_clip": 1.04502857, - "balance_loss_mlp": 1.02189279, - "epoch": 0.382444010220953, - "flos": 24936190352640.0, - "grad_norm": 1.5790785802582266, - "language_loss": 0.79362941, - "learning_rate": 2.83244000399261e-06, - "loss": 0.81502759, - "num_input_tokens_seen": 136487215, - "step": 6361, - "time_per_iteration": 4.285314559936523 - }, - { - "auxiliary_loss_clip": 0.01111113, - "auxiliary_loss_mlp": 0.01043827, - "balance_loss_clip": 1.04649949, - "balance_loss_mlp": 1.02906859, - "epoch": 0.38250413347362094, - "flos": 42337351209600.0, - "grad_norm": 1.9067122847602551, - "language_loss": 0.65606177, - "learning_rate": 2.832085864749337e-06, - "loss": 0.67761117, - "num_input_tokens_seen": 136510365, - "step": 6362, - "time_per_iteration": 2.8447117805480957 - }, - { - "auxiliary_loss_clip": 0.0113439, - "auxiliary_loss_mlp": 0.01035947, - "balance_loss_clip": 1.0483737, - "balance_loss_mlp": 1.01978207, - "epoch": 0.3825642567262889, - "flos": 16289224462080.0, - "grad_norm": 2.3383155012254284, - "language_loss": 0.82138497, - "learning_rate": 2.8317316939532848e-06, - "loss": 0.84308833, - "num_input_tokens_seen": 136527100, - "step": 6363, - "time_per_iteration": 4.166736602783203 - }, - { - "auxiliary_loss_clip": 0.01075728, - "auxiliary_loss_mlp": 0.01042552, - "balance_loss_clip": 1.04349709, - "balance_loss_mlp": 1.02707291, - "epoch": 0.3826243799789569, - "flos": 45654778586880.0, - "grad_norm": 2.1311203141010835, - "language_loss": 0.59044886, - "learning_rate": 2.8313774916178825e-06, - "loss": 0.61163169, - "num_input_tokens_seen": 136550870, - "step": 6364, - "time_per_iteration": 3.006801128387451 - }, - { - "auxiliary_loss_clip": 0.01122076, - "auxiliary_loss_mlp": 0.01041213, - "balance_loss_clip": 1.05097353, - "balance_loss_mlp": 1.02542353, - "epoch": 0.38268450323162484, - "flos": 25301581453440.0, - "grad_norm": 1.9239689491626994, - "language_loss": 0.68903065, - "learning_rate": 2.8310232577565635e-06, - "loss": 0.7106635, - "num_input_tokens_seen": 136569895, - "step": 6365, - "time_per_iteration": 2.695068597793579 - }, - { - "auxiliary_loss_clip": 0.01123716, - "auxiliary_loss_mlp": 0.01039809, - "balance_loss_clip": 1.04955769, - "balance_loss_mlp": 1.02366817, - "epoch": 0.3827446264842928, - "flos": 21836022387840.0, - "grad_norm": 2.0334034116186137, - "language_loss": 0.73193848, - "learning_rate": 2.830668992382758e-06, - "loss": 0.75357372, - "num_input_tokens_seen": 136588585, - "step": 6366, - "time_per_iteration": 4.418980598449707 - }, - { - "auxiliary_loss_clip": 0.01115964, - "auxiliary_loss_mlp": 0.01038347, - "balance_loss_clip": 1.04846239, - "balance_loss_mlp": 1.02265882, - "epoch": 0.38280474973696077, - "flos": 25734591907200.0, - "grad_norm": 2.4539991484931645, - "language_loss": 0.68623614, - "learning_rate": 2.830314695509902e-06, - "loss": 0.70777929, - "num_input_tokens_seen": 136606640, - "step": 6367, - "time_per_iteration": 2.6878082752227783 - }, - { - "auxiliary_loss_clip": 0.01125961, - "auxiliary_loss_mlp": 0.01037618, - "balance_loss_clip": 1.05120409, - "balance_loss_mlp": 1.02256823, - "epoch": 0.38286487298962874, - "flos": 24895934184960.0, - "grad_norm": 2.196344444241347, - "language_loss": 0.64423102, - "learning_rate": 2.82996036715143e-06, - "loss": 0.66586685, - "num_input_tokens_seen": 136624940, - "step": 6368, - "time_per_iteration": 2.6698646545410156 - }, - { - "auxiliary_loss_clip": 0.01139795, - "auxiliary_loss_mlp": 0.01040116, - "balance_loss_clip": 1.05269098, - "balance_loss_mlp": 1.02390361, - "epoch": 0.3829249962422967, - "flos": 28543703967360.0, - "grad_norm": 1.346024597035963, - "language_loss": 0.684017, - "learning_rate": 2.8296060073207763e-06, - "loss": 0.70581615, - "num_input_tokens_seen": 136645540, - "step": 6369, - "time_per_iteration": 2.7156169414520264 - }, - { - "auxiliary_loss_clip": 0.01084469, - "auxiliary_loss_mlp": 0.01039929, - "balance_loss_clip": 1.04267466, - "balance_loss_mlp": 1.02391946, - "epoch": 0.38298511949496467, - "flos": 21471205904640.0, - "grad_norm": 1.7824237306329542, - "language_loss": 0.78701794, - "learning_rate": 2.8292516160313804e-06, - "loss": 0.80826187, - "num_input_tokens_seen": 136664530, - "step": 6370, - "time_per_iteration": 2.7351901531219482 - }, - { - "auxiliary_loss_clip": 0.01121027, - "auxiliary_loss_mlp": 0.01050163, - "balance_loss_clip": 1.04909503, - "balance_loss_mlp": 1.03279376, - "epoch": 0.38304524274763263, - "flos": 31679998035840.0, - "grad_norm": 2.5095706519371794, - "language_loss": 0.65098304, - "learning_rate": 2.8288971932966805e-06, - "loss": 0.67269492, - "num_input_tokens_seen": 136682315, - "step": 6371, - "time_per_iteration": 2.739689350128174 - }, - { - "auxiliary_loss_clip": 0.01110581, - "auxiliary_loss_mlp": 0.01041968, - "balance_loss_clip": 1.04938042, - "balance_loss_mlp": 1.02471852, - "epoch": 0.3831053660003006, - "flos": 25076816098560.0, - "grad_norm": 3.269308088463154, - "language_loss": 0.7304002, - "learning_rate": 2.8285427391301155e-06, - "loss": 0.75192571, - "num_input_tokens_seen": 136701185, - "step": 6372, - "time_per_iteration": 2.7497966289520264 - }, - { - "auxiliary_loss_clip": 0.01127864, - "auxiliary_loss_mlp": 0.01034223, - "balance_loss_clip": 1.05050421, - "balance_loss_mlp": 1.01848698, - "epoch": 0.38316548925296856, - "flos": 23259018562560.0, - "grad_norm": 1.83316702621751, - "language_loss": 0.8491025, - "learning_rate": 2.8281882535451266e-06, - "loss": 0.87072337, - "num_input_tokens_seen": 136721265, - "step": 6373, - "time_per_iteration": 2.6510777473449707 - }, - { - "auxiliary_loss_clip": 0.01084717, - "auxiliary_loss_mlp": 0.01048262, - "balance_loss_clip": 1.0416218, - "balance_loss_mlp": 1.0316565, - "epoch": 0.3832256125056366, - "flos": 34423465991040.0, - "grad_norm": 2.287485479433922, - "language_loss": 0.74893212, - "learning_rate": 2.8278337365551567e-06, - "loss": 0.770262, - "num_input_tokens_seen": 136741885, - "step": 6374, - "time_per_iteration": 2.8658056259155273 - }, - { - "auxiliary_loss_clip": 0.01130215, - "auxiliary_loss_mlp": 0.01042427, - "balance_loss_clip": 1.05264366, - "balance_loss_mlp": 1.02613068, - "epoch": 0.38328573575830455, - "flos": 21762764599680.0, - "grad_norm": 7.5426595342284735, - "language_loss": 0.75737238, - "learning_rate": 2.8274791881736485e-06, - "loss": 0.77909875, - "num_input_tokens_seen": 136760905, - "step": 6375, - "time_per_iteration": 2.6622958183288574 - }, - { - "auxiliary_loss_clip": 0.01126708, - "auxiliary_loss_mlp": 0.01039776, - "balance_loss_clip": 1.05043924, - "balance_loss_mlp": 1.0244453, - "epoch": 0.3833458590109725, - "flos": 17380010724480.0, - "grad_norm": 2.1246389624552435, - "language_loss": 0.72777182, - "learning_rate": 2.8271246084140457e-06, - "loss": 0.74943662, - "num_input_tokens_seen": 136777240, - "step": 6376, - "time_per_iteration": 2.6562421321868896 - }, - { - "auxiliary_loss_clip": 0.01122147, - "auxiliary_loss_mlp": 0.01039822, - "balance_loss_clip": 1.04791379, - "balance_loss_mlp": 1.02381194, - "epoch": 0.3834059822636405, - "flos": 29424557191680.0, - "grad_norm": 1.7414598633373413, - "language_loss": 0.67441249, - "learning_rate": 2.826769997289796e-06, - "loss": 0.69603217, - "num_input_tokens_seen": 136801040, - "step": 6377, - "time_per_iteration": 2.779766798019409 - }, - { - "auxiliary_loss_clip": 0.01110002, - "auxiliary_loss_mlp": 0.01041228, - "balance_loss_clip": 1.05152845, - "balance_loss_mlp": 1.02421689, - "epoch": 0.38346610551630844, - "flos": 21470739027840.0, - "grad_norm": 2.377659826482013, - "language_loss": 0.73287642, - "learning_rate": 2.826415354814344e-06, - "loss": 0.75438869, - "num_input_tokens_seen": 136819495, - "step": 6378, - "time_per_iteration": 2.7345829010009766 - }, - { - "auxiliary_loss_clip": 0.01085335, - "auxiliary_loss_mlp": 0.01042694, - "balance_loss_clip": 1.0479784, - "balance_loss_mlp": 1.02707767, - "epoch": 0.3835262287689764, - "flos": 27561224188800.0, - "grad_norm": 2.283576437082984, - "language_loss": 0.69473612, - "learning_rate": 2.8260606810011396e-06, - "loss": 0.71601641, - "num_input_tokens_seen": 136838840, - "step": 6379, - "time_per_iteration": 2.7592358589172363 - }, - { - "auxiliary_loss_clip": 0.01124706, - "auxiliary_loss_mlp": 0.01036177, - "balance_loss_clip": 1.0516969, - "balance_loss_mlp": 1.02094209, - "epoch": 0.3835863520216444, - "flos": 15523716787200.0, - "grad_norm": 1.8393672130560537, - "language_loss": 0.83356249, - "learning_rate": 2.8257059758636315e-06, - "loss": 0.85517132, - "num_input_tokens_seen": 136854425, - "step": 6380, - "time_per_iteration": 2.6572370529174805 - }, - { - "auxiliary_loss_clip": 0.01135434, - "auxiliary_loss_mlp": 0.01035321, - "balance_loss_clip": 1.05187774, - "balance_loss_mlp": 1.02010989, - "epoch": 0.38364647527431234, - "flos": 21904934630400.0, - "grad_norm": 1.5891747666862521, - "language_loss": 0.8141042, - "learning_rate": 2.8253512394152697e-06, - "loss": 0.83581179, - "num_input_tokens_seen": 136874355, - "step": 6381, - "time_per_iteration": 2.7251663208007812 - }, - { - "auxiliary_loss_clip": 0.01057344, - "auxiliary_loss_mlp": 0.01005901, - "balance_loss_clip": 1.02759361, - "balance_loss_mlp": 1.00418437, - "epoch": 0.3837065985269803, - "flos": 65534927558400.0, - "grad_norm": 0.7954141143291842, - "language_loss": 0.60376751, - "learning_rate": 2.8249964716695068e-06, - "loss": 0.62440002, - "num_input_tokens_seen": 136937475, - "step": 6382, - "time_per_iteration": 3.1750948429107666 - }, - { - "auxiliary_loss_clip": 0.01139607, - "auxiliary_loss_mlp": 0.0103679, - "balance_loss_clip": 1.05060625, - "balance_loss_mlp": 1.02099442, - "epoch": 0.38376672177964827, - "flos": 28256598558720.0, - "grad_norm": 3.8324285625149925, - "language_loss": 0.66432369, - "learning_rate": 2.824641672639794e-06, - "loss": 0.68608773, - "num_input_tokens_seen": 136955805, - "step": 6383, - "time_per_iteration": 2.7543957233428955 - }, - { - "auxiliary_loss_clip": 0.01103794, - "auxiliary_loss_mlp": 0.01039577, - "balance_loss_clip": 1.04783142, - "balance_loss_mlp": 1.02375221, - "epoch": 0.38382684503231623, - "flos": 20631363033600.0, - "grad_norm": 2.110615575498957, - "language_loss": 0.75144917, - "learning_rate": 2.824286842339587e-06, - "loss": 0.77288288, - "num_input_tokens_seen": 136975240, - "step": 6384, - "time_per_iteration": 2.7796735763549805 - }, - { - "auxiliary_loss_clip": 0.01122869, - "auxiliary_loss_mlp": 0.01040365, - "balance_loss_clip": 1.05156231, - "balance_loss_mlp": 1.02510643, - "epoch": 0.3838869682849842, - "flos": 19605825826560.0, - "grad_norm": 1.5394774946197278, - "language_loss": 0.76096714, - "learning_rate": 2.823931980782341e-06, - "loss": 0.78259945, - "num_input_tokens_seen": 136994985, - "step": 6385, - "time_per_iteration": 2.6831300258636475 - }, - { - "auxiliary_loss_clip": 0.01046831, - "auxiliary_loss_mlp": 0.01001133, - "balance_loss_clip": 1.02648735, - "balance_loss_mlp": 0.99943984, - "epoch": 0.38394709153765216, - "flos": 56556110891520.0, - "grad_norm": 0.9063295744618779, - "language_loss": 0.66955769, - "learning_rate": 2.82357708798151e-06, - "loss": 0.69003725, - "num_input_tokens_seen": 137046290, - "step": 6386, - "time_per_iteration": 3.0693411827087402 - }, - { - "auxiliary_loss_clip": 0.0109652, - "auxiliary_loss_mlp": 0.01041859, - "balance_loss_clip": 1.04551756, - "balance_loss_mlp": 1.02686286, - "epoch": 0.3840072147903202, - "flos": 15888748752000.0, - "grad_norm": 1.7986188221191803, - "language_loss": 0.7215755, - "learning_rate": 2.8232221639505547e-06, - "loss": 0.74295932, - "num_input_tokens_seen": 137064725, - "step": 6387, - "time_per_iteration": 2.736774206161499 - }, - { - "auxiliary_loss_clip": 0.01134624, - "auxiliary_loss_mlp": 0.01044946, - "balance_loss_clip": 1.05156994, - "balance_loss_mlp": 1.03039086, - "epoch": 0.38406733804298815, - "flos": 28218030330240.0, - "grad_norm": 1.6374516085838389, - "language_loss": 0.8088249, - "learning_rate": 2.822867208702932e-06, - "loss": 0.83062065, - "num_input_tokens_seen": 137086030, - "step": 6388, - "time_per_iteration": 2.782958507537842 - }, - { - "auxiliary_loss_clip": 0.01103471, - "auxiliary_loss_mlp": 0.01047592, - "balance_loss_clip": 1.04727554, - "balance_loss_mlp": 1.03298843, - "epoch": 0.3841274612956561, - "flos": 18223588609920.0, - "grad_norm": 1.7872750649564642, - "language_loss": 0.76085746, - "learning_rate": 2.8225122222521026e-06, - "loss": 0.78236812, - "num_input_tokens_seen": 137105400, - "step": 6389, - "time_per_iteration": 2.6644833087921143 - }, - { - "auxiliary_loss_clip": 0.01119906, - "auxiliary_loss_mlp": 0.0104877, - "balance_loss_clip": 1.05389404, - "balance_loss_mlp": 1.03203344, - "epoch": 0.3841875845483241, - "flos": 19792884879360.0, - "grad_norm": 4.9507505317589775, - "language_loss": 0.76550084, - "learning_rate": 2.8221572046115273e-06, - "loss": 0.78718758, - "num_input_tokens_seen": 137124985, - "step": 6390, - "time_per_iteration": 2.825714588165283 - }, - { - "auxiliary_loss_clip": 0.01090482, - "auxiliary_loss_mlp": 0.01048203, - "balance_loss_clip": 1.04517913, - "balance_loss_mlp": 1.03196096, - "epoch": 0.38424770780099204, - "flos": 29898829393920.0, - "grad_norm": 1.7614871223783444, - "language_loss": 0.70377523, - "learning_rate": 2.821802155794668e-06, - "loss": 0.72516215, - "num_input_tokens_seen": 137146745, - "step": 6391, - "time_per_iteration": 2.918065309524536 - }, - { - "auxiliary_loss_clip": 0.01125443, - "auxiliary_loss_mlp": 0.01036977, - "balance_loss_clip": 1.04874265, - "balance_loss_mlp": 1.02158153, - "epoch": 0.38430783105366, - "flos": 20813717404800.0, - "grad_norm": 1.7948670510085722, - "language_loss": 0.84005457, - "learning_rate": 2.8214470758149884e-06, - "loss": 0.86167878, - "num_input_tokens_seen": 137163195, - "step": 6392, - "time_per_iteration": 2.679427146911621 - }, - { - "auxiliary_loss_clip": 0.01122701, - "auxiliary_loss_mlp": 0.01037128, - "balance_loss_clip": 1.04846168, - "balance_loss_mlp": 1.0227809, - "epoch": 0.384367954306328, - "flos": 10998577399680.0, - "grad_norm": 2.3141685884805145, - "language_loss": 0.6062203, - "learning_rate": 2.8210919646859536e-06, - "loss": 0.62781858, - "num_input_tokens_seen": 137179330, - "step": 6393, - "time_per_iteration": 2.6622374057769775 - }, - { - "auxiliary_loss_clip": 0.01110672, - "auxiliary_loss_mlp": 0.01036894, - "balance_loss_clip": 1.04954767, - "balance_loss_mlp": 1.02025223, - "epoch": 0.38442807755899594, - "flos": 25338030779520.0, - "grad_norm": 1.7908313499382054, - "language_loss": 0.70639426, - "learning_rate": 2.820736822421029e-06, - "loss": 0.72786993, - "num_input_tokens_seen": 137198655, - "step": 6394, - "time_per_iteration": 2.7460365295410156 - }, - { - "auxiliary_loss_clip": 0.01123613, - "auxiliary_loss_mlp": 0.01035163, - "balance_loss_clip": 1.04763663, - "balance_loss_mlp": 1.01871169, - "epoch": 0.3844882008116639, - "flos": 21069760527360.0, - "grad_norm": 2.646318489707099, - "language_loss": 0.81774974, - "learning_rate": 2.8203816490336822e-06, - "loss": 0.83933747, - "num_input_tokens_seen": 137217120, - "step": 6395, - "time_per_iteration": 2.676023006439209 - }, - { - "auxiliary_loss_clip": 0.01129196, - "auxiliary_loss_mlp": 0.01046949, - "balance_loss_clip": 1.05485177, - "balance_loss_mlp": 1.03209007, - "epoch": 0.38454832406433187, - "flos": 17963235855360.0, - "grad_norm": 1.9755185808990787, - "language_loss": 0.71031433, - "learning_rate": 2.8200264445373813e-06, - "loss": 0.73207581, - "num_input_tokens_seen": 137234410, - "step": 6396, - "time_per_iteration": 2.7082455158233643 - }, - { - "auxiliary_loss_clip": 0.01044031, - "auxiliary_loss_mlp": 0.0100801, - "balance_loss_clip": 1.02689695, - "balance_loss_mlp": 1.00657308, - "epoch": 0.38460844731699984, - "flos": 67924999555200.0, - "grad_norm": 0.8839433118134116, - "language_loss": 0.59671199, - "learning_rate": 2.8196712089455954e-06, - "loss": 0.61723238, - "num_input_tokens_seen": 137294940, - "step": 6397, - "time_per_iteration": 3.2412428855895996 - }, - { - "auxiliary_loss_clip": 0.01137376, - "auxiliary_loss_mlp": 0.01035554, - "balance_loss_clip": 1.05209756, - "balance_loss_mlp": 1.02044976, - "epoch": 0.3846685705696678, - "flos": 25849075530240.0, - "grad_norm": 2.648974669995796, - "language_loss": 0.85017276, - "learning_rate": 2.819315942271794e-06, - "loss": 0.87190199, - "num_input_tokens_seen": 137315035, - "step": 6398, - "time_per_iteration": 2.7374656200408936 - }, - { - "auxiliary_loss_clip": 0.01136492, - "auxiliary_loss_mlp": 0.01030698, - "balance_loss_clip": 1.0517211, - "balance_loss_mlp": 1.0165, - "epoch": 0.38472869382233577, - "flos": 16290194129280.0, - "grad_norm": 2.1032431430060075, - "language_loss": 0.79989493, - "learning_rate": 2.8189606445294515e-06, - "loss": 0.82156688, - "num_input_tokens_seen": 137333155, - "step": 6399, - "time_per_iteration": 4.446218729019165 - }, - { - "auxiliary_loss_clip": 0.0113807, - "auxiliary_loss_mlp": 0.00773562, - "balance_loss_clip": 1.05109119, - "balance_loss_mlp": 1.00025833, - "epoch": 0.38478881707500373, - "flos": 19353122668800.0, - "grad_norm": 3.0376300513317416, - "language_loss": 0.67328328, - "learning_rate": 2.818605315732038e-06, - "loss": 0.69239962, - "num_input_tokens_seen": 137351515, - "step": 6400, - "time_per_iteration": 2.6920905113220215 - }, - { - "auxiliary_loss_clip": 0.01122811, - "auxiliary_loss_mlp": 0.01042029, - "balance_loss_clip": 1.05546772, - "balance_loss_mlp": 1.0264008, - "epoch": 0.38484894032767175, - "flos": 24860849575680.0, - "grad_norm": 11.158483612058907, - "language_loss": 0.73623443, - "learning_rate": 2.81824995589303e-06, - "loss": 0.75788283, - "num_input_tokens_seen": 137371255, - "step": 6401, - "time_per_iteration": 4.2371673583984375 - }, - { - "auxiliary_loss_clip": 0.01102005, - "auxiliary_loss_mlp": 0.01039851, - "balance_loss_clip": 1.04852486, - "balance_loss_mlp": 1.02387738, - "epoch": 0.3849090635803397, - "flos": 14501806853760.0, - "grad_norm": 2.0006804524577233, - "language_loss": 0.72059876, - "learning_rate": 2.8178945650259012e-06, - "loss": 0.74201727, - "num_input_tokens_seen": 137388980, - "step": 6402, - "time_per_iteration": 2.686413288116455 - }, - { - "auxiliary_loss_clip": 0.0113478, - "auxiliary_loss_mlp": 0.01035082, - "balance_loss_clip": 1.05094552, - "balance_loss_mlp": 1.02016854, - "epoch": 0.3849691868330077, - "flos": 18515865576960.0, - "grad_norm": 2.094788133183166, - "language_loss": 0.82884681, - "learning_rate": 2.817539143144128e-06, - "loss": 0.85054541, - "num_input_tokens_seen": 137406885, - "step": 6403, - "time_per_iteration": 4.234680891036987 - }, - { - "auxiliary_loss_clip": 0.01078109, - "auxiliary_loss_mlp": 0.01040581, - "balance_loss_clip": 1.04205656, - "balance_loss_mlp": 1.02466702, - "epoch": 0.38502931008567565, - "flos": 21616392677760.0, - "grad_norm": 4.587008789601206, - "language_loss": 0.82845348, - "learning_rate": 2.817183690261189e-06, - "loss": 0.84964037, - "num_input_tokens_seen": 137425535, - "step": 6404, - "time_per_iteration": 2.777756452560425 - }, - { - "auxiliary_loss_clip": 0.0111195, - "auxiliary_loss_mlp": 0.01034861, - "balance_loss_clip": 1.04970074, - "balance_loss_mlp": 1.02046084, - "epoch": 0.3850894333383436, - "flos": 25415346804480.0, - "grad_norm": 2.6287869212560646, - "language_loss": 0.69417107, - "learning_rate": 2.816828206390563e-06, - "loss": 0.71563923, - "num_input_tokens_seen": 137447700, - "step": 6405, - "time_per_iteration": 4.478301286697388 - }, - { - "auxiliary_loss_clip": 0.01102381, - "auxiliary_loss_mlp": 0.01038086, - "balance_loss_clip": 1.0438571, - "balance_loss_mlp": 1.02414417, - "epoch": 0.3851495565910116, - "flos": 20227870581120.0, - "grad_norm": 1.9306681180439358, - "language_loss": 0.79248095, - "learning_rate": 2.816472691545729e-06, - "loss": 0.81388557, - "num_input_tokens_seen": 137462245, - "step": 6406, - "time_per_iteration": 2.7157816886901855 - }, - { - "auxiliary_loss_clip": 0.01129296, - "auxiliary_loss_mlp": 0.01040841, - "balance_loss_clip": 1.05465746, - "balance_loss_mlp": 1.02483082, - "epoch": 0.38520967984367954, - "flos": 16508459122560.0, - "grad_norm": 5.929375109580111, - "language_loss": 0.84107637, - "learning_rate": 2.8161171457401694e-06, - "loss": 0.86277771, - "num_input_tokens_seen": 137476455, - "step": 6407, - "time_per_iteration": 2.6058037281036377 - }, - { - "auxiliary_loss_clip": 0.01049614, - "auxiliary_loss_mlp": 0.00999678, - "balance_loss_clip": 1.03001904, - "balance_loss_mlp": 0.99828893, - "epoch": 0.3852698030963475, - "flos": 61313772971520.0, - "grad_norm": 0.845548946049954, - "language_loss": 0.64919412, - "learning_rate": 2.815761568987365e-06, - "loss": 0.66968703, - "num_input_tokens_seen": 137539845, - "step": 6408, - "time_per_iteration": 3.2015879154205322 - }, - { - "auxiliary_loss_clip": 0.01110915, - "auxiliary_loss_mlp": 0.01042045, - "balance_loss_clip": 1.05201948, - "balance_loss_mlp": 1.02547526, - "epoch": 0.3853299263490155, - "flos": 22893016930560.0, - "grad_norm": 1.5734517214124462, - "language_loss": 0.73444313, - "learning_rate": 2.8154059613008e-06, - "loss": 0.75597274, - "num_input_tokens_seen": 137559880, - "step": 6409, - "time_per_iteration": 2.683310031890869 - }, - { - "auxiliary_loss_clip": 0.01099042, - "auxiliary_loss_mlp": 0.01052587, - "balance_loss_clip": 1.05162942, - "balance_loss_mlp": 1.03458679, - "epoch": 0.38539004960168344, - "flos": 20047491457920.0, - "grad_norm": 3.095928763270071, - "language_loss": 0.70505756, - "learning_rate": 2.81505032269396e-06, - "loss": 0.72657388, - "num_input_tokens_seen": 137578225, - "step": 6410, - "time_per_iteration": 2.7694053649902344 - }, - { - "auxiliary_loss_clip": 0.01018797, - "auxiliary_loss_mlp": 0.00754046, - "balance_loss_clip": 1.02754462, - "balance_loss_mlp": 1.00070059, - "epoch": 0.3854501728543514, - "flos": 68730691570560.0, - "grad_norm": 0.6824056349925876, - "language_loss": 0.6019417, - "learning_rate": 2.81469465318033e-06, - "loss": 0.61967015, - "num_input_tokens_seen": 137645770, - "step": 6411, - "time_per_iteration": 3.3692543506622314 - }, - { - "auxiliary_loss_clip": 0.01091571, - "auxiliary_loss_mlp": 0.01029185, - "balance_loss_clip": 1.04337883, - "balance_loss_mlp": 1.01451063, - "epoch": 0.38551029610701937, - "flos": 20485027025280.0, - "grad_norm": 2.4386958956664344, - "language_loss": 0.78219938, - "learning_rate": 2.814338952773397e-06, - "loss": 0.80340695, - "num_input_tokens_seen": 137664090, - "step": 6412, - "time_per_iteration": 2.7462196350097656 - }, - { - "auxiliary_loss_clip": 0.01097982, - "auxiliary_loss_mlp": 0.01037754, - "balance_loss_clip": 1.04309821, - "balance_loss_mlp": 1.01995587, - "epoch": 0.38557041935968733, - "flos": 23471788775040.0, - "grad_norm": 2.0249224045322802, - "language_loss": 0.78112727, - "learning_rate": 2.8139832214866493e-06, - "loss": 0.80248463, - "num_input_tokens_seen": 137683190, - "step": 6413, - "time_per_iteration": 2.768624782562256 - }, - { - "auxiliary_loss_clip": 0.01056912, - "auxiliary_loss_mlp": 0.01003998, - "balance_loss_clip": 1.02733278, - "balance_loss_mlp": 1.00254369, - "epoch": 0.38563054261235535, - "flos": 63966636869760.0, - "grad_norm": 0.8082958368118873, - "language_loss": 0.61342072, - "learning_rate": 2.813627459333576e-06, - "loss": 0.63402981, - "num_input_tokens_seen": 137737315, - "step": 6414, - "time_per_iteration": 2.983466625213623 - }, - { - "auxiliary_loss_clip": 0.01103716, - "auxiliary_loss_mlp": 0.01038577, - "balance_loss_clip": 1.05065155, - "balance_loss_mlp": 1.02302015, - "epoch": 0.3856906658650233, - "flos": 23987789602560.0, - "grad_norm": 2.2111312580879106, - "language_loss": 0.77225536, - "learning_rate": 2.8132716663276685e-06, - "loss": 0.79367828, - "num_input_tokens_seen": 137753535, - "step": 6415, - "time_per_iteration": 2.7486205101013184 - }, - { - "auxiliary_loss_clip": 0.01109368, - "auxiliary_loss_mlp": 0.01030786, - "balance_loss_clip": 1.04894936, - "balance_loss_mlp": 1.01676726, - "epoch": 0.3857507891176913, - "flos": 25007436979200.0, - "grad_norm": 1.644505635534703, - "language_loss": 0.80036473, - "learning_rate": 2.8129158424824173e-06, - "loss": 0.82176626, - "num_input_tokens_seen": 137773405, - "step": 6416, - "time_per_iteration": 2.709200859069824 - }, - { - "auxiliary_loss_clip": 0.0112133, - "auxiliary_loss_mlp": 0.00771665, - "balance_loss_clip": 1.04777813, - "balance_loss_mlp": 1.00020468, - "epoch": 0.38581091237035925, - "flos": 21536778182400.0, - "grad_norm": 1.8974153334913886, - "language_loss": 0.78746861, - "learning_rate": 2.8125599878113155e-06, - "loss": 0.80639857, - "num_input_tokens_seen": 137790810, - "step": 6417, - "time_per_iteration": 2.6839869022369385 - }, - { - "auxiliary_loss_clip": 0.01106617, - "auxiliary_loss_mlp": 0.0103795, - "balance_loss_clip": 1.04771507, - "balance_loss_mlp": 1.02424121, - "epoch": 0.3858710356230272, - "flos": 17383889393280.0, - "grad_norm": 1.8492847143532247, - "language_loss": 0.80066824, - "learning_rate": 2.8122041023278583e-06, - "loss": 0.82211387, - "num_input_tokens_seen": 137810265, - "step": 6418, - "time_per_iteration": 2.709463119506836 - }, - { - "auxiliary_loss_clip": 0.01106426, - "auxiliary_loss_mlp": 0.01035927, - "balance_loss_clip": 1.04606509, - "balance_loss_mlp": 1.02115691, - "epoch": 0.3859311588756952, - "flos": 20339588856960.0, - "grad_norm": 2.0121704661475524, - "language_loss": 0.79591382, - "learning_rate": 2.8118481860455407e-06, - "loss": 0.81733727, - "num_input_tokens_seen": 137828580, - "step": 6419, - "time_per_iteration": 2.687030553817749 - }, - { - "auxiliary_loss_clip": 0.01109367, - "auxiliary_loss_mlp": 0.01035627, - "balance_loss_clip": 1.04662013, - "balance_loss_mlp": 1.0194031, - "epoch": 0.38599128212836314, - "flos": 26321157002880.0, - "grad_norm": 2.202509680177809, - "language_loss": 0.67581224, - "learning_rate": 2.8114922389778573e-06, - "loss": 0.69726223, - "num_input_tokens_seen": 137846145, - "step": 6420, - "time_per_iteration": 2.7517049312591553 - }, - { - "auxiliary_loss_clip": 0.01089731, - "auxiliary_loss_mlp": 0.01053637, - "balance_loss_clip": 1.04479241, - "balance_loss_mlp": 1.03771043, - "epoch": 0.3860514053810311, - "flos": 13553837066880.0, - "grad_norm": 2.406147976104497, - "language_loss": 0.81137526, - "learning_rate": 2.8111362611383076e-06, - "loss": 0.83280897, - "num_input_tokens_seen": 137863705, - "step": 6421, - "time_per_iteration": 2.970040798187256 - }, - { - "auxiliary_loss_clip": 0.01108309, - "auxiliary_loss_mlp": 0.01040046, - "balance_loss_clip": 1.04625583, - "balance_loss_mlp": 1.02510345, - "epoch": 0.3861115286336991, - "flos": 20954271323520.0, - "grad_norm": 2.6092074943148797, - "language_loss": 0.71989834, - "learning_rate": 2.8107802525403886e-06, - "loss": 0.74138188, - "num_input_tokens_seen": 137880285, - "step": 6422, - "time_per_iteration": 2.690490961074829 - }, - { - "auxiliary_loss_clip": 0.01104575, - "auxiliary_loss_mlp": 0.0104152, - "balance_loss_clip": 1.04663455, - "balance_loss_mlp": 1.02759588, - "epoch": 0.38617165188636704, - "flos": 16362697731840.0, - "grad_norm": 1.6942063430957965, - "language_loss": 0.66644311, - "learning_rate": 2.8104242131976025e-06, - "loss": 0.687904, - "num_input_tokens_seen": 137898335, - "step": 6423, - "time_per_iteration": 2.6189329624176025 - }, - { - "auxiliary_loss_clip": 0.01128312, - "auxiliary_loss_mlp": 0.01042786, - "balance_loss_clip": 1.05139875, - "balance_loss_mlp": 1.02860618, - "epoch": 0.386231775139035, - "flos": 34787276893440.0, - "grad_norm": 2.1536039728580394, - "language_loss": 0.68359423, - "learning_rate": 2.810068143123449e-06, - "loss": 0.70530522, - "num_input_tokens_seen": 137918605, - "step": 6424, - "time_per_iteration": 2.7609992027282715 - }, - { - "auxiliary_loss_clip": 0.01098796, - "auxiliary_loss_mlp": 0.01038309, - "balance_loss_clip": 1.04750848, - "balance_loss_mlp": 1.02387285, - "epoch": 0.38629189839170297, - "flos": 21726171619200.0, - "grad_norm": 1.4481478329406698, - "language_loss": 0.72367114, - "learning_rate": 2.809712042331429e-06, - "loss": 0.7450422, - "num_input_tokens_seen": 137938245, - "step": 6425, - "time_per_iteration": 2.7069387435913086 - }, - { - "auxiliary_loss_clip": 0.01099551, - "auxiliary_loss_mlp": 0.00773141, - "balance_loss_clip": 1.0428803, - "balance_loss_mlp": 1.00013173, - "epoch": 0.38635202164437094, - "flos": 27923634460800.0, - "grad_norm": 2.52438881915832, - "language_loss": 0.80258477, - "learning_rate": 2.8093559108350484e-06, - "loss": 0.82131171, - "num_input_tokens_seen": 137956770, - "step": 6426, - "time_per_iteration": 2.8976056575775146 - }, - { - "auxiliary_loss_clip": 0.01125602, - "auxiliary_loss_mlp": 0.0103515, - "balance_loss_clip": 1.04929447, - "balance_loss_mlp": 1.02013016, - "epoch": 0.38641214489703896, - "flos": 23586631534080.0, - "grad_norm": 2.2578291383073825, - "language_loss": 0.7536087, - "learning_rate": 2.80899974864781e-06, - "loss": 0.77521622, - "num_input_tokens_seen": 137977040, - "step": 6427, - "time_per_iteration": 2.7281436920166016 - }, - { - "auxiliary_loss_clip": 0.01075932, - "auxiliary_loss_mlp": 0.01057335, - "balance_loss_clip": 1.04142189, - "balance_loss_mlp": 1.04013276, - "epoch": 0.3864722681497069, - "flos": 12641239198080.0, - "grad_norm": 2.0875975256988055, - "language_loss": 0.69435054, - "learning_rate": 2.8086435557832203e-06, - "loss": 0.71568322, - "num_input_tokens_seen": 137993545, - "step": 6428, - "time_per_iteration": 2.7289116382598877 - }, - { - "auxiliary_loss_clip": 0.01113154, - "auxiliary_loss_mlp": 0.01042018, - "balance_loss_clip": 1.04947257, - "balance_loss_mlp": 1.02729535, - "epoch": 0.3865323914023749, - "flos": 17598922162560.0, - "grad_norm": 2.847119477349317, - "language_loss": 0.8444519, - "learning_rate": 2.8082873322547863e-06, - "loss": 0.86600363, - "num_input_tokens_seen": 138010140, - "step": 6429, - "time_per_iteration": 2.7385170459747314 - }, - { - "auxiliary_loss_clip": 0.01110797, - "auxiliary_loss_mlp": 0.01038599, - "balance_loss_clip": 1.04555535, - "balance_loss_mlp": 1.02423429, - "epoch": 0.38659251465504285, - "flos": 18478949374080.0, - "grad_norm": 2.174010980525696, - "language_loss": 0.80673695, - "learning_rate": 2.807931078076015e-06, - "loss": 0.82823092, - "num_input_tokens_seen": 138028880, - "step": 6430, - "time_per_iteration": 2.660228967666626 - }, - { - "auxiliary_loss_clip": 0.0102628, - "auxiliary_loss_mlp": 0.01015101, - "balance_loss_clip": 1.02508974, - "balance_loss_mlp": 1.01382565, - "epoch": 0.3866526379077108, - "flos": 64165726978560.0, - "grad_norm": 0.719429045650031, - "language_loss": 0.58803207, - "learning_rate": 2.807574793260416e-06, - "loss": 0.60844588, - "num_input_tokens_seen": 138098090, - "step": 6431, - "time_per_iteration": 3.2772469520568848 - }, - { - "auxiliary_loss_clip": 0.01086398, - "auxiliary_loss_mlp": 0.01039293, - "balance_loss_clip": 1.04541588, - "balance_loss_mlp": 1.02296114, - "epoch": 0.3867127611603788, - "flos": 14388292897920.0, - "grad_norm": 2.1660589654497424, - "language_loss": 0.79041815, - "learning_rate": 2.8072184778215004e-06, - "loss": 0.81167507, - "num_input_tokens_seen": 138114735, - "step": 6432, - "time_per_iteration": 2.7949061393737793 - }, - { - "auxiliary_loss_clip": 0.01125593, - "auxiliary_loss_mlp": 0.01048624, - "balance_loss_clip": 1.04708362, - "balance_loss_mlp": 1.03231645, - "epoch": 0.38677288441304675, - "flos": 20010754823040.0, - "grad_norm": 2.0695366497364294, - "language_loss": 0.80186564, - "learning_rate": 2.806862131772779e-06, - "loss": 0.82360786, - "num_input_tokens_seen": 138130480, - "step": 6433, - "time_per_iteration": 2.6526312828063965 - }, - { - "auxiliary_loss_clip": 0.01111087, - "auxiliary_loss_mlp": 0.01037922, - "balance_loss_clip": 1.04934025, - "balance_loss_mlp": 1.02162611, - "epoch": 0.3868330076657147, - "flos": 22236893147520.0, - "grad_norm": 1.6267030007711512, - "language_loss": 0.70441496, - "learning_rate": 2.806505755127765e-06, - "loss": 0.72590506, - "num_input_tokens_seen": 138150640, - "step": 6434, - "time_per_iteration": 2.6985394954681396 - }, - { - "auxiliary_loss_clip": 0.01097728, - "auxiliary_loss_mlp": 0.01047403, - "balance_loss_clip": 1.04536152, - "balance_loss_mlp": 1.03008235, - "epoch": 0.3868931309183827, - "flos": 16727442387840.0, - "grad_norm": 1.7348790517482282, - "language_loss": 0.77462173, - "learning_rate": 2.806149347899972e-06, - "loss": 0.79607308, - "num_input_tokens_seen": 138169700, - "step": 6435, - "time_per_iteration": 2.7326719760894775 - }, - { - "auxiliary_loss_clip": 0.01119609, - "auxiliary_loss_mlp": 0.01035834, - "balance_loss_clip": 1.04651809, - "balance_loss_mlp": 1.0208497, - "epoch": 0.38695325417105064, - "flos": 22674716023680.0, - "grad_norm": 2.3575842278582813, - "language_loss": 0.79599082, - "learning_rate": 2.805792910102915e-06, - "loss": 0.81754529, - "num_input_tokens_seen": 138185835, - "step": 6436, - "time_per_iteration": 2.6643154621124268 - }, - { - "auxiliary_loss_clip": 0.01107099, - "auxiliary_loss_mlp": 0.01036499, - "balance_loss_clip": 1.04809546, - "balance_loss_mlp": 1.0215621, - "epoch": 0.3870133774237186, - "flos": 23112036109440.0, - "grad_norm": 1.9038851888933561, - "language_loss": 0.76043606, - "learning_rate": 2.8054364417501093e-06, - "loss": 0.78187203, - "num_input_tokens_seen": 138204080, - "step": 6437, - "time_per_iteration": 2.701834201812744 - }, - { - "auxiliary_loss_clip": 0.01110073, - "auxiliary_loss_mlp": 0.01037115, - "balance_loss_clip": 1.04696321, - "balance_loss_mlp": 1.02374589, - "epoch": 0.3870735006763866, - "flos": 17675699483520.0, - "grad_norm": 2.022501790448194, - "language_loss": 0.81817484, - "learning_rate": 2.805079942855074e-06, - "loss": 0.8396467, - "num_input_tokens_seen": 138220710, - "step": 6438, - "time_per_iteration": 4.327820539474487 - }, - { - "auxiliary_loss_clip": 0.01111326, - "auxiliary_loss_mlp": 0.0077319, - "balance_loss_clip": 1.04504764, - "balance_loss_mlp": 1.00027561, - "epoch": 0.38713362392905454, - "flos": 23295791111040.0, - "grad_norm": 1.7517226143139228, - "language_loss": 0.75388491, - "learning_rate": 2.804723413431326e-06, - "loss": 0.77273011, - "num_input_tokens_seen": 138241720, - "step": 6439, - "time_per_iteration": 2.797830104827881 - }, - { - "auxiliary_loss_clip": 0.01131277, - "auxiliary_loss_mlp": 0.01037901, - "balance_loss_clip": 1.04915833, - "balance_loss_mlp": 1.0235002, - "epoch": 0.38719374718172256, - "flos": 21031192298880.0, - "grad_norm": 1.7565856090832077, - "language_loss": 0.74071443, - "learning_rate": 2.8043668534923855e-06, - "loss": 0.76240611, - "num_input_tokens_seen": 138261885, - "step": 6440, - "time_per_iteration": 4.2160422801971436 - }, - { - "auxiliary_loss_clip": 0.01125111, - "auxiliary_loss_mlp": 0.01034995, - "balance_loss_clip": 1.04927301, - "balance_loss_mlp": 1.01949763, - "epoch": 0.3872538704343905, - "flos": 19609776322560.0, - "grad_norm": 2.101028456947384, - "language_loss": 0.82017142, - "learning_rate": 2.804010263051774e-06, - "loss": 0.84177244, - "num_input_tokens_seen": 138280255, - "step": 6441, - "time_per_iteration": 4.199851036071777 - }, - { - "auxiliary_loss_clip": 0.0113476, - "auxiliary_loss_mlp": 0.01039285, - "balance_loss_clip": 1.05011272, - "balance_loss_mlp": 1.02490842, - "epoch": 0.3873139936870585, - "flos": 17530045833600.0, - "grad_norm": 2.8802239922493147, - "language_loss": 0.80824792, - "learning_rate": 2.8036536421230118e-06, - "loss": 0.82998842, - "num_input_tokens_seen": 138296675, - "step": 6442, - "time_per_iteration": 2.6942524909973145 - }, - { - "auxiliary_loss_clip": 0.01090073, - "auxiliary_loss_mlp": 0.01032275, - "balance_loss_clip": 1.04431343, - "balance_loss_mlp": 1.01747537, - "epoch": 0.38737411693972645, - "flos": 17786555832960.0, - "grad_norm": 2.1593394156288044, - "language_loss": 0.84054118, - "learning_rate": 2.803296990719624e-06, - "loss": 0.86176467, - "num_input_tokens_seen": 138314985, - "step": 6443, - "time_per_iteration": 2.6660094261169434 - }, - { - "auxiliary_loss_clip": 0.01033878, - "auxiliary_loss_mlp": 0.01000185, - "balance_loss_clip": 1.02513885, - "balance_loss_mlp": 0.99879646, - "epoch": 0.3874342401923944, - "flos": 58304637048960.0, - "grad_norm": 0.7605185654135588, - "language_loss": 0.50208193, - "learning_rate": 2.8029403088551327e-06, - "loss": 0.52242255, - "num_input_tokens_seen": 138373275, - "step": 6444, - "time_per_iteration": 4.807433128356934 - }, - { - "auxiliary_loss_clip": 0.01086333, - "auxiliary_loss_mlp": 0.00773648, - "balance_loss_clip": 1.04187298, - "balance_loss_mlp": 1.00033963, - "epoch": 0.3874943634450624, - "flos": 17711933328000.0, - "grad_norm": 1.4666177781563792, - "language_loss": 0.78874767, - "learning_rate": 2.802583596543065e-06, - "loss": 0.80734754, - "num_input_tokens_seen": 138391145, - "step": 6445, - "time_per_iteration": 2.689142942428589 - }, - { - "auxiliary_loss_clip": 0.0111426, - "auxiliary_loss_mlp": 0.0103959, - "balance_loss_clip": 1.04754841, - "balance_loss_mlp": 1.02445602, - "epoch": 0.38755448669773035, - "flos": 19244852098560.0, - "grad_norm": 2.4274750437973958, - "language_loss": 0.81207073, - "learning_rate": 2.8022268537969474e-06, - "loss": 0.83360916, - "num_input_tokens_seen": 138409875, - "step": 6446, - "time_per_iteration": 2.6582860946655273 - }, - { - "auxiliary_loss_clip": 0.01107394, - "auxiliary_loss_mlp": 0.01037275, - "balance_loss_clip": 1.04530001, - "balance_loss_mlp": 1.02277923, - "epoch": 0.3876146099503983, - "flos": 20594267262720.0, - "grad_norm": 3.0137556994939887, - "language_loss": 0.77366996, - "learning_rate": 2.801870080630306e-06, - "loss": 0.79511666, - "num_input_tokens_seen": 138428965, - "step": 6447, - "time_per_iteration": 2.727285146713257 - }, - { - "auxiliary_loss_clip": 0.01108854, - "auxiliary_loss_mlp": 0.01037807, - "balance_loss_clip": 1.04590762, - "balance_loss_mlp": 1.02378821, - "epoch": 0.3876747332030663, - "flos": 19281121856640.0, - "grad_norm": 2.4450461903172562, - "language_loss": 0.76364803, - "learning_rate": 2.801513277056671e-06, - "loss": 0.78511459, - "num_input_tokens_seen": 138448090, - "step": 6448, - "time_per_iteration": 2.663989543914795 - }, - { - "auxiliary_loss_clip": 0.01102873, - "auxiliary_loss_mlp": 0.01038866, - "balance_loss_clip": 1.04449654, - "balance_loss_mlp": 1.02322626, - "epoch": 0.38773485645573424, - "flos": 18945895201920.0, - "grad_norm": 1.6490971101368535, - "language_loss": 0.76146352, - "learning_rate": 2.8011564430895725e-06, - "loss": 0.7828809, - "num_input_tokens_seen": 138466105, - "step": 6449, - "time_per_iteration": 2.806537628173828 - }, - { - "auxiliary_loss_clip": 0.01098531, - "auxiliary_loss_mlp": 0.00772575, - "balance_loss_clip": 1.04406381, - "balance_loss_mlp": 1.00027394, - "epoch": 0.3877949797084022, - "flos": 23071348978560.0, - "grad_norm": 2.0995234377866985, - "language_loss": 0.78572172, - "learning_rate": 2.800799578742542e-06, - "loss": 0.80443275, - "num_input_tokens_seen": 138485160, - "step": 6450, - "time_per_iteration": 2.7541351318359375 - }, - { - "auxiliary_loss_clip": 0.01137663, - "auxiliary_loss_mlp": 0.01039948, - "balance_loss_clip": 1.04827702, - "balance_loss_mlp": 1.02452803, - "epoch": 0.3878551029610702, - "flos": 29095543589760.0, - "grad_norm": 2.5655640440870946, - "language_loss": 0.78046334, - "learning_rate": 2.8004426840291106e-06, - "loss": 0.80223942, - "num_input_tokens_seen": 138504135, - "step": 6451, - "time_per_iteration": 2.6868700981140137 - }, - { - "auxiliary_loss_clip": 0.01126689, - "auxiliary_loss_mlp": 0.01031637, - "balance_loss_clip": 1.04576159, - "balance_loss_mlp": 1.01696229, - "epoch": 0.38791522621373814, - "flos": 20996394998400.0, - "grad_norm": 2.633183178462793, - "language_loss": 0.76404589, - "learning_rate": 2.800085758962812e-06, - "loss": 0.78562915, - "num_input_tokens_seen": 138523955, - "step": 6452, - "time_per_iteration": 2.708750009536743 - }, - { - "auxiliary_loss_clip": 0.01103834, - "auxiliary_loss_mlp": 0.01042785, - "balance_loss_clip": 1.04665875, - "balance_loss_mlp": 1.0285815, - "epoch": 0.3879753494664061, - "flos": 15486836497920.0, - "grad_norm": 1.5878969811553463, - "language_loss": 0.79534453, - "learning_rate": 2.799728803557182e-06, - "loss": 0.81681073, - "num_input_tokens_seen": 138541655, - "step": 6453, - "time_per_iteration": 2.7226593494415283 - }, - { - "auxiliary_loss_clip": 0.0112782, - "auxiliary_loss_mlp": 0.0104096, - "balance_loss_clip": 1.04889584, - "balance_loss_mlp": 1.02560616, - "epoch": 0.3880354727190741, - "flos": 22053964158720.0, - "grad_norm": 19.957823861770734, - "language_loss": 0.71643323, - "learning_rate": 2.7993718178257555e-06, - "loss": 0.73812103, - "num_input_tokens_seen": 138560860, - "step": 6454, - "time_per_iteration": 2.7265548706054688 - }, - { - "auxiliary_loss_clip": 0.01137183, - "auxiliary_loss_mlp": 0.01043076, - "balance_loss_clip": 1.04976404, - "balance_loss_mlp": 1.02693522, - "epoch": 0.3880955959717421, - "flos": 20340307128960.0, - "grad_norm": 2.029110970619929, - "language_loss": 0.77489239, - "learning_rate": 2.7990148017820694e-06, - "loss": 0.79669499, - "num_input_tokens_seen": 138580200, - "step": 6455, - "time_per_iteration": 2.7688205242156982 - }, - { - "auxiliary_loss_clip": 0.01131496, - "auxiliary_loss_mlp": 0.01043781, - "balance_loss_clip": 1.04975748, - "balance_loss_mlp": 1.02897501, - "epoch": 0.38815571922441006, - "flos": 23075407215360.0, - "grad_norm": 1.8133016626985128, - "language_loss": 0.76193333, - "learning_rate": 2.798657755439662e-06, - "loss": 0.78368604, - "num_input_tokens_seen": 138598315, - "step": 6456, - "time_per_iteration": 2.6894283294677734 - }, - { - "auxiliary_loss_clip": 0.01059894, - "auxiliary_loss_mlp": 0.01038862, - "balance_loss_clip": 1.04251969, - "balance_loss_mlp": 1.02365136, - "epoch": 0.388215842477078, - "flos": 20776944856320.0, - "grad_norm": 9.416859416659493, - "language_loss": 0.59422505, - "learning_rate": 2.7983006788120726e-06, - "loss": 0.61521268, - "num_input_tokens_seen": 138615695, - "step": 6457, - "time_per_iteration": 2.8189444541931152 - }, - { - "auxiliary_loss_clip": 0.01136561, - "auxiliary_loss_mlp": 0.01039144, - "balance_loss_clip": 1.04989612, - "balance_loss_mlp": 1.02262187, - "epoch": 0.388275965729746, - "flos": 20448182649600.0, - "grad_norm": 2.336997181985419, - "language_loss": 0.79927063, - "learning_rate": 2.797943571912841e-06, - "loss": 0.82102776, - "num_input_tokens_seen": 138633180, - "step": 6458, - "time_per_iteration": 2.66198992729187 - }, - { - "auxiliary_loss_clip": 0.01081764, - "auxiliary_loss_mlp": 0.0104529, - "balance_loss_clip": 1.04428816, - "balance_loss_mlp": 1.02855277, - "epoch": 0.38833608898241395, - "flos": 27892392606720.0, - "grad_norm": 2.218973373394608, - "language_loss": 0.81497735, - "learning_rate": 2.797586434755509e-06, - "loss": 0.83624792, - "num_input_tokens_seen": 138654785, - "step": 6459, - "time_per_iteration": 2.780120611190796 - }, - { - "auxiliary_loss_clip": 0.01105714, - "auxiliary_loss_mlp": 0.01037251, - "balance_loss_clip": 1.04633725, - "balance_loss_mlp": 1.0236907, - "epoch": 0.3883962122350819, - "flos": 18076390675200.0, - "grad_norm": 1.942341955564712, - "language_loss": 0.62001127, - "learning_rate": 2.7972292673536202e-06, - "loss": 0.64144087, - "num_input_tokens_seen": 138673330, - "step": 6460, - "time_per_iteration": 2.625399112701416 - }, - { - "auxiliary_loss_clip": 0.01120569, - "auxiliary_loss_mlp": 0.01032391, - "balance_loss_clip": 1.04955411, - "balance_loss_mlp": 1.01920033, - "epoch": 0.3884563354877499, - "flos": 23622254847360.0, - "grad_norm": 1.928823237011181, - "language_loss": 0.86226058, - "learning_rate": 2.796872069720717e-06, - "loss": 0.88379019, - "num_input_tokens_seen": 138694185, - "step": 6461, - "time_per_iteration": 2.6901583671569824 - }, - { - "auxiliary_loss_clip": 0.0111976, - "auxiliary_loss_mlp": 0.01038779, - "balance_loss_clip": 1.04810238, - "balance_loss_mlp": 1.0244205, - "epoch": 0.38851645874041785, - "flos": 27453528236160.0, - "grad_norm": 4.963760229824091, - "language_loss": 0.70659202, - "learning_rate": 2.7965148418703456e-06, - "loss": 0.72817743, - "num_input_tokens_seen": 138714625, - "step": 6462, - "time_per_iteration": 2.7463371753692627 - }, - { - "auxiliary_loss_clip": 0.01086013, - "auxiliary_loss_mlp": 0.01043745, - "balance_loss_clip": 1.04045033, - "balance_loss_mlp": 1.02786636, - "epoch": 0.3885765819930858, - "flos": 25228072270080.0, - "grad_norm": 2.747031306466439, - "language_loss": 0.76228201, - "learning_rate": 2.796157583816052e-06, - "loss": 0.78357965, - "num_input_tokens_seen": 138733585, - "step": 6463, - "time_per_iteration": 2.7231578826904297 - }, - { - "auxiliary_loss_clip": 0.01103201, - "auxiliary_loss_mlp": 0.0104459, - "balance_loss_clip": 1.05013013, - "balance_loss_mlp": 1.02841353, - "epoch": 0.3886367052457538, - "flos": 16946605221120.0, - "grad_norm": 3.605418601568306, - "language_loss": 0.70244539, - "learning_rate": 2.795800295571382e-06, - "loss": 0.72392333, - "num_input_tokens_seen": 138752335, - "step": 6464, - "time_per_iteration": 2.773066759109497 - }, - { - "auxiliary_loss_clip": 0.01110861, - "auxiliary_loss_mlp": 0.01037039, - "balance_loss_clip": 1.04950786, - "balance_loss_mlp": 1.02211452, - "epoch": 0.38869682849842174, - "flos": 27154140376320.0, - "grad_norm": 2.8184770761764777, - "language_loss": 0.69632983, - "learning_rate": 2.7954429771498858e-06, - "loss": 0.71780872, - "num_input_tokens_seen": 138768450, - "step": 6465, - "time_per_iteration": 2.7013487815856934 - }, - { - "auxiliary_loss_clip": 0.01097351, - "auxiliary_loss_mlp": 0.01041748, - "balance_loss_clip": 1.04837847, - "balance_loss_mlp": 1.02645373, - "epoch": 0.3887569517510897, - "flos": 21063619301760.0, - "grad_norm": 2.665243237814177, - "language_loss": 0.78489739, - "learning_rate": 2.7950856285651117e-06, - "loss": 0.80628836, - "num_input_tokens_seen": 138786775, - "step": 6466, - "time_per_iteration": 2.736819267272949 - }, - { - "auxiliary_loss_clip": 0.01095374, - "auxiliary_loss_mlp": 0.01037568, - "balance_loss_clip": 1.0463171, - "balance_loss_mlp": 1.02242851, - "epoch": 0.38881707500375773, - "flos": 29497384016640.0, - "grad_norm": 1.6522613533538497, - "language_loss": 0.69341898, - "learning_rate": 2.794728249830611e-06, - "loss": 0.71474838, - "num_input_tokens_seen": 138810100, - "step": 6467, - "time_per_iteration": 2.778083324432373 - }, - { - "auxiliary_loss_clip": 0.01098114, - "auxiliary_loss_mlp": 0.01048152, - "balance_loss_clip": 1.04706931, - "balance_loss_mlp": 1.0326246, - "epoch": 0.3888771982564257, - "flos": 17488281294720.0, - "grad_norm": 3.2276382920067817, - "language_loss": 0.84199375, - "learning_rate": 2.794370840959936e-06, - "loss": 0.86345637, - "num_input_tokens_seen": 138825140, - "step": 6468, - "time_per_iteration": 2.6842098236083984 - }, - { - "auxiliary_loss_clip": 0.01108569, - "auxiliary_loss_mlp": 0.01036235, - "balance_loss_clip": 1.048172, - "balance_loss_mlp": 1.0227766, - "epoch": 0.38893732150909366, - "flos": 21942425450880.0, - "grad_norm": 1.8219377355536144, - "language_loss": 0.84232908, - "learning_rate": 2.7940134019666383e-06, - "loss": 0.86377716, - "num_input_tokens_seen": 138844115, - "step": 6469, - "time_per_iteration": 2.7538135051727295 - }, - { - "auxiliary_loss_clip": 0.0109067, - "auxiliary_loss_mlp": 0.01048288, - "balance_loss_clip": 1.04416847, - "balance_loss_mlp": 1.03205132, - "epoch": 0.3889974447617616, - "flos": 24276367468800.0, - "grad_norm": 2.339210402911935, - "language_loss": 0.75173676, - "learning_rate": 2.793655932864273e-06, - "loss": 0.7731263, - "num_input_tokens_seen": 138860860, - "step": 6470, - "time_per_iteration": 2.7425949573516846 - }, - { - "auxiliary_loss_clip": 0.01095528, - "auxiliary_loss_mlp": 0.00772188, - "balance_loss_clip": 1.0480423, - "balance_loss_mlp": 1.00016475, - "epoch": 0.3890575680144296, - "flos": 25667116208640.0, - "grad_norm": 1.5943716760052937, - "language_loss": 0.74977577, - "learning_rate": 2.7932984336663953e-06, - "loss": 0.76845288, - "num_input_tokens_seen": 138881910, - "step": 6471, - "time_per_iteration": 2.8880369663238525 - }, - { - "auxiliary_loss_clip": 0.01077518, - "auxiliary_loss_mlp": 0.01049277, - "balance_loss_clip": 1.03879571, - "balance_loss_mlp": 1.03336215, - "epoch": 0.38911769126709755, - "flos": 22855274714880.0, - "grad_norm": 2.421548050110463, - "language_loss": 0.67984551, - "learning_rate": 2.792940904386562e-06, - "loss": 0.70111346, - "num_input_tokens_seen": 138900975, - "step": 6472, - "time_per_iteration": 2.7776875495910645 - }, - { - "auxiliary_loss_clip": 0.01103596, - "auxiliary_loss_mlp": 0.01043152, - "balance_loss_clip": 1.04819107, - "balance_loss_mlp": 1.02974129, - "epoch": 0.3891778145197655, - "flos": 25447522412160.0, - "grad_norm": 1.8102352941433608, - "language_loss": 0.76068687, - "learning_rate": 2.7925833450383293e-06, - "loss": 0.78215432, - "num_input_tokens_seen": 138920795, - "step": 6473, - "time_per_iteration": 2.7568469047546387 - }, - { - "auxiliary_loss_clip": 0.01113975, - "auxiliary_loss_mlp": 0.01046096, - "balance_loss_clip": 1.05217087, - "balance_loss_mlp": 1.03031242, - "epoch": 0.3892379377724335, - "flos": 14027965614720.0, - "grad_norm": 2.045216735434868, - "language_loss": 0.70959115, - "learning_rate": 2.792225755635257e-06, - "loss": 0.73119187, - "num_input_tokens_seen": 138938770, - "step": 6474, - "time_per_iteration": 2.6930696964263916 - }, - { - "auxiliary_loss_clip": 0.01135028, - "auxiliary_loss_mlp": 0.01042055, - "balance_loss_clip": 1.05145836, - "balance_loss_mlp": 1.02861369, - "epoch": 0.38929806102510145, - "flos": 20157449967360.0, - "grad_norm": 1.5519949793695216, - "language_loss": 0.69049072, - "learning_rate": 2.7918681361909046e-06, - "loss": 0.71226156, - "num_input_tokens_seen": 138958880, - "step": 6475, - "time_per_iteration": 2.670830011367798 - }, - { - "auxiliary_loss_clip": 0.01110637, - "auxiliary_loss_mlp": 0.01057592, - "balance_loss_clip": 1.04578567, - "balance_loss_mlp": 1.03981757, - "epoch": 0.3893581842777694, - "flos": 22163958581760.0, - "grad_norm": 1.9596553320764234, - "language_loss": 0.75820196, - "learning_rate": 2.7915104867188332e-06, - "loss": 0.77988434, - "num_input_tokens_seen": 138977240, - "step": 6476, - "time_per_iteration": 2.683980941772461 - }, - { - "auxiliary_loss_clip": 0.01039888, - "auxiliary_loss_mlp": 0.01002183, - "balance_loss_clip": 1.02862918, - "balance_loss_mlp": 1.00084782, - "epoch": 0.3894183075304374, - "flos": 67301877392640.0, - "grad_norm": 0.7759740468574157, - "language_loss": 0.58146399, - "learning_rate": 2.7911528072326055e-06, - "loss": 0.60188472, - "num_input_tokens_seen": 139039035, - "step": 6477, - "time_per_iteration": 3.2430496215820312 - }, - { - "auxiliary_loss_clip": 0.01092497, - "auxiliary_loss_mlp": 0.01040603, - "balance_loss_clip": 1.04780793, - "balance_loss_mlp": 1.02428961, - "epoch": 0.38947843078310534, - "flos": 18547502480640.0, - "grad_norm": 1.9073891309950948, - "language_loss": 0.78554142, - "learning_rate": 2.7907950977457832e-06, - "loss": 0.80687243, - "num_input_tokens_seen": 139055560, - "step": 6478, - "time_per_iteration": 4.241156339645386 - }, - { - "auxiliary_loss_clip": 0.01116081, - "auxiliary_loss_mlp": 0.0103975, - "balance_loss_clip": 1.04505491, - "balance_loss_mlp": 1.02545047, - "epoch": 0.3895385540357733, - "flos": 14605875532800.0, - "grad_norm": 2.6992371438810783, - "language_loss": 0.82647753, - "learning_rate": 2.7904373582719317e-06, - "loss": 0.84803581, - "num_input_tokens_seen": 139071865, - "step": 6479, - "time_per_iteration": 4.1569294929504395 - }, - { - "auxiliary_loss_clip": 0.01131381, - "auxiliary_loss_mlp": 0.01036344, - "balance_loss_clip": 1.04886651, - "balance_loss_mlp": 1.02161551, - "epoch": 0.38959867728844133, - "flos": 19975203336960.0, - "grad_norm": 2.334048099077096, - "language_loss": 0.79657412, - "learning_rate": 2.790079588824617e-06, - "loss": 0.81825137, - "num_input_tokens_seen": 139089640, - "step": 6480, - "time_per_iteration": 4.170635938644409 - }, - { - "auxiliary_loss_clip": 0.0110471, - "auxiliary_loss_mlp": 0.01032466, - "balance_loss_clip": 1.04561472, - "balance_loss_mlp": 1.01822066, - "epoch": 0.3896588005411093, - "flos": 22672130244480.0, - "grad_norm": 6.364109786330533, - "language_loss": 0.83021134, - "learning_rate": 2.7897217894174038e-06, - "loss": 0.85158312, - "num_input_tokens_seen": 139109365, - "step": 6481, - "time_per_iteration": 2.638821840286255 - }, - { - "auxiliary_loss_clip": 0.01102815, - "auxiliary_loss_mlp": 0.01038843, - "balance_loss_clip": 1.04740214, - "balance_loss_mlp": 1.02503228, - "epoch": 0.38971892379377726, - "flos": 20996035862400.0, - "grad_norm": 1.7002276765936415, - "language_loss": 0.75389051, - "learning_rate": 2.789363960063863e-06, - "loss": 0.77530706, - "num_input_tokens_seen": 139128260, - "step": 6482, - "time_per_iteration": 2.5737624168395996 - }, - { - "auxiliary_loss_clip": 0.01100553, - "auxiliary_loss_mlp": 0.01035815, - "balance_loss_clip": 1.04781246, - "balance_loss_mlp": 1.02164662, - "epoch": 0.3897790470464452, - "flos": 22528487756160.0, - "grad_norm": 2.0703094316503554, - "language_loss": 0.78786725, - "learning_rate": 2.78900610077756e-06, - "loss": 0.80923092, - "num_input_tokens_seen": 139147315, - "step": 6483, - "time_per_iteration": 2.6177117824554443 - }, - { - "auxiliary_loss_clip": 0.01121516, - "auxiliary_loss_mlp": 0.01030702, - "balance_loss_clip": 1.04790664, - "balance_loss_mlp": 1.01487088, - "epoch": 0.3898391702991132, - "flos": 26209905603840.0, - "grad_norm": 1.6677367088018817, - "language_loss": 0.79871929, - "learning_rate": 2.788648211572067e-06, - "loss": 0.82024151, - "num_input_tokens_seen": 139167270, - "step": 6484, - "time_per_iteration": 4.221461534500122 - }, - { - "auxiliary_loss_clip": 0.01119394, - "auxiliary_loss_mlp": 0.01051487, - "balance_loss_clip": 1.05063844, - "balance_loss_mlp": 1.03472662, - "epoch": 0.38989929355178116, - "flos": 21065558636160.0, - "grad_norm": 2.1008000508061104, - "language_loss": 0.77901775, - "learning_rate": 2.7882902924609557e-06, - "loss": 0.80072653, - "num_input_tokens_seen": 139185970, - "step": 6485, - "time_per_iteration": 2.664097785949707 - }, - { - "auxiliary_loss_clip": 0.01085813, - "auxiliary_loss_mlp": 0.01036912, - "balance_loss_clip": 1.0427084, - "balance_loss_mlp": 1.02207613, - "epoch": 0.3899594168044491, - "flos": 25484115392640.0, - "grad_norm": 6.223818029706007, - "language_loss": 0.85190272, - "learning_rate": 2.7879323434577965e-06, - "loss": 0.87312996, - "num_input_tokens_seen": 139203730, - "step": 6486, - "time_per_iteration": 2.8325467109680176 - }, - { - "auxiliary_loss_clip": 0.01111569, - "auxiliary_loss_mlp": 0.01033392, - "balance_loss_clip": 1.04786611, - "balance_loss_mlp": 1.01883638, - "epoch": 0.3900195400571171, - "flos": 31139363456640.0, - "grad_norm": 2.4250185390770618, - "language_loss": 0.85333234, - "learning_rate": 2.7875743645761645e-06, - "loss": 0.87478197, - "num_input_tokens_seen": 139222560, - "step": 6487, - "time_per_iteration": 2.8390486240386963 - }, - { - "auxiliary_loss_clip": 0.01103222, - "auxiliary_loss_mlp": 0.01032994, - "balance_loss_clip": 1.04449213, - "balance_loss_mlp": 1.01793766, - "epoch": 0.39007966330978505, - "flos": 20229917656320.0, - "grad_norm": 1.5390409302603854, - "language_loss": 0.72954559, - "learning_rate": 2.787216355829633e-06, - "loss": 0.75090778, - "num_input_tokens_seen": 139242165, - "step": 6488, - "time_per_iteration": 2.7613236904144287 - }, - { - "auxiliary_loss_clip": 0.01096805, - "auxiliary_loss_mlp": 0.01044873, - "balance_loss_clip": 1.04673266, - "balance_loss_mlp": 1.02771914, - "epoch": 0.390139786562453, - "flos": 22528739151360.0, - "grad_norm": 2.6420160637986383, - "language_loss": 0.68467176, - "learning_rate": 2.786858317231779e-06, - "loss": 0.70608854, - "num_input_tokens_seen": 139262525, - "step": 6489, - "time_per_iteration": 2.746307849884033 - }, - { - "auxiliary_loss_clip": 0.01108111, - "auxiliary_loss_mlp": 0.01041602, - "balance_loss_clip": 1.04793715, - "balance_loss_mlp": 1.02673674, - "epoch": 0.390199909815121, - "flos": 26432911192320.0, - "grad_norm": 1.6912118236512272, - "language_loss": 0.80629271, - "learning_rate": 2.7865002487961788e-06, - "loss": 0.82778984, - "num_input_tokens_seen": 139282835, - "step": 6490, - "time_per_iteration": 2.7116847038269043 - }, - { - "auxiliary_loss_clip": 0.01124963, - "auxiliary_loss_mlp": 0.01033045, - "balance_loss_clip": 1.04856181, - "balance_loss_mlp": 1.0187161, - "epoch": 0.39026003306778895, - "flos": 17274577328640.0, - "grad_norm": 3.073568327903315, - "language_loss": 0.89115125, - "learning_rate": 2.7861421505364104e-06, - "loss": 0.91273135, - "num_input_tokens_seen": 139299490, - "step": 6491, - "time_per_iteration": 2.6211190223693848 - }, - { - "auxiliary_loss_clip": 0.01092029, - "auxiliary_loss_mlp": 0.01045074, - "balance_loss_clip": 1.04406416, - "balance_loss_mlp": 1.02952874, - "epoch": 0.3903201563204569, - "flos": 24532841554560.0, - "grad_norm": 1.8064559635296407, - "language_loss": 0.78637981, - "learning_rate": 2.7857840224660523e-06, - "loss": 0.80775088, - "num_input_tokens_seen": 139317865, - "step": 6492, - "time_per_iteration": 2.7505667209625244 - }, - { - "auxiliary_loss_clip": 0.01108778, - "auxiliary_loss_mlp": 0.01041967, - "balance_loss_clip": 1.04486537, - "balance_loss_mlp": 1.02735257, - "epoch": 0.39038027957312493, - "flos": 23767944410880.0, - "grad_norm": 1.7227367696506604, - "language_loss": 0.74431908, - "learning_rate": 2.7854258645986857e-06, - "loss": 0.76582652, - "num_input_tokens_seen": 139339840, - "step": 6493, - "time_per_iteration": 2.7200233936309814 - }, - { - "auxiliary_loss_clip": 0.01091358, - "auxiliary_loss_mlp": 0.01040258, - "balance_loss_clip": 1.04613161, - "balance_loss_mlp": 1.02549398, - "epoch": 0.3904404028257929, - "flos": 14100612871680.0, - "grad_norm": 2.9656676182999395, - "language_loss": 0.7637316, - "learning_rate": 2.7850676769478916e-06, - "loss": 0.78504777, - "num_input_tokens_seen": 139357555, - "step": 6494, - "time_per_iteration": 2.6818442344665527 - }, - { - "auxiliary_loss_clip": 0.01131498, - "auxiliary_loss_mlp": 0.01048378, - "balance_loss_clip": 1.0500524, - "balance_loss_mlp": 1.03182006, - "epoch": 0.39050052607846086, - "flos": 16910048154240.0, - "grad_norm": 2.1152980497113782, - "language_loss": 0.74208486, - "learning_rate": 2.7847094595272525e-06, - "loss": 0.76388359, - "num_input_tokens_seen": 139374455, - "step": 6495, - "time_per_iteration": 2.6432337760925293 - }, - { - "auxiliary_loss_clip": 0.01137243, - "auxiliary_loss_mlp": 0.01045454, - "balance_loss_clip": 1.05153751, - "balance_loss_mlp": 1.02913451, - "epoch": 0.39056064933112883, - "flos": 25915761129600.0, - "grad_norm": 2.402575660392066, - "language_loss": 0.67757058, - "learning_rate": 2.784351212350352e-06, - "loss": 0.69939756, - "num_input_tokens_seen": 139394770, - "step": 6496, - "time_per_iteration": 2.762009859085083 - }, - { - "auxiliary_loss_clip": 0.01023856, - "auxiliary_loss_mlp": 0.01010625, - "balance_loss_clip": 1.02393842, - "balance_loss_mlp": 1.00925446, - "epoch": 0.3906207725837968, - "flos": 60028421713920.0, - "grad_norm": 0.6655460592599327, - "language_loss": 0.53920811, - "learning_rate": 2.783992935430775e-06, - "loss": 0.55955297, - "num_input_tokens_seen": 139454760, - "step": 6497, - "time_per_iteration": 3.351006507873535 - }, - { - "auxiliary_loss_clip": 0.01094838, - "auxiliary_loss_mlp": 0.00772151, - "balance_loss_clip": 1.0476501, - "balance_loss_mlp": 1.00038421, - "epoch": 0.39068089583646476, - "flos": 21068683119360.0, - "grad_norm": 2.7558428999232847, - "language_loss": 0.6865977, - "learning_rate": 2.7836346287821068e-06, - "loss": 0.70526755, - "num_input_tokens_seen": 139472645, - "step": 6498, - "time_per_iteration": 2.7838692665100098 - }, - { - "auxiliary_loss_clip": 0.01022021, - "auxiliary_loss_mlp": 0.01009741, - "balance_loss_clip": 1.02064919, - "balance_loss_mlp": 1.00839996, - "epoch": 0.3907410190891327, - "flos": 70445677403520.0, - "grad_norm": 0.7248596102007157, - "language_loss": 0.51767612, - "learning_rate": 2.783276292417936e-06, - "loss": 0.53799379, - "num_input_tokens_seen": 139536730, - "step": 6499, - "time_per_iteration": 3.2980377674102783 - }, - { - "auxiliary_loss_clip": 0.01122618, - "auxiliary_loss_mlp": 0.01044387, - "balance_loss_clip": 1.04676056, - "balance_loss_mlp": 1.02793658, - "epoch": 0.3908011423418007, - "flos": 27962454084480.0, - "grad_norm": 1.973185164339423, - "language_loss": 0.73842579, - "learning_rate": 2.7829179263518487e-06, - "loss": 0.76009583, - "num_input_tokens_seen": 139557540, - "step": 6500, - "time_per_iteration": 2.7239198684692383 - }, - { - "auxiliary_loss_clip": 0.01125366, - "auxiliary_loss_mlp": 0.01037256, - "balance_loss_clip": 1.05035591, - "balance_loss_mlp": 1.02246249, - "epoch": 0.39086126559446865, - "flos": 24462097718400.0, - "grad_norm": 2.6021512056662814, - "language_loss": 0.68837166, - "learning_rate": 2.7825595305974354e-06, - "loss": 0.70999795, - "num_input_tokens_seen": 139576875, - "step": 6501, - "time_per_iteration": 2.6926429271698 - }, - { - "auxiliary_loss_clip": 0.01122637, - "auxiliary_loss_mlp": 0.0103859, - "balance_loss_clip": 1.04832482, - "balance_loss_mlp": 1.02442181, - "epoch": 0.3909213888471366, - "flos": 16941541403520.0, - "grad_norm": 2.1384909443348246, - "language_loss": 0.78875881, - "learning_rate": 2.782201105168287e-06, - "loss": 0.8103711, - "num_input_tokens_seen": 139594295, - "step": 6502, - "time_per_iteration": 2.647021770477295 - }, - { - "auxiliary_loss_clip": 0.01109811, - "auxiliary_loss_mlp": 0.01035328, - "balance_loss_clip": 1.04876852, - "balance_loss_mlp": 1.02171457, - "epoch": 0.3909815120998046, - "flos": 29278400751360.0, - "grad_norm": 3.671996146003432, - "language_loss": 0.80537987, - "learning_rate": 2.7818426500779932e-06, - "loss": 0.82683128, - "num_input_tokens_seen": 139614080, - "step": 6503, - "time_per_iteration": 2.7318384647369385 - }, - { - "auxiliary_loss_clip": 0.0110371, - "auxiliary_loss_mlp": 0.01031248, - "balance_loss_clip": 1.04387689, - "balance_loss_mlp": 1.01760423, - "epoch": 0.39104163535247255, - "flos": 18951246328320.0, - "grad_norm": 1.848076786389183, - "language_loss": 0.71439689, - "learning_rate": 2.7814841653401485e-06, - "loss": 0.7357465, - "num_input_tokens_seen": 139632755, - "step": 6504, - "time_per_iteration": 2.6983554363250732 - }, - { - "auxiliary_loss_clip": 0.01130195, - "auxiliary_loss_mlp": 0.01034576, - "balance_loss_clip": 1.0459981, - "balance_loss_mlp": 1.0199374, - "epoch": 0.3911017586051405, - "flos": 26323347732480.0, - "grad_norm": 1.4848516480735832, - "language_loss": 0.83245611, - "learning_rate": 2.7811256509683454e-06, - "loss": 0.8541038, - "num_input_tokens_seen": 139654205, - "step": 6505, - "time_per_iteration": 2.6663267612457275 - }, - { - "auxiliary_loss_clip": 0.01131259, - "auxiliary_loss_mlp": 0.01036964, - "balance_loss_clip": 1.04880178, - "balance_loss_mlp": 1.02123427, - "epoch": 0.3911618818578085, - "flos": 21835770992640.0, - "grad_norm": 1.9330872564568533, - "language_loss": 0.71352887, - "learning_rate": 2.7807671069761797e-06, - "loss": 0.73521107, - "num_input_tokens_seen": 139673595, - "step": 6506, - "time_per_iteration": 2.6168534755706787 - }, - { - "auxiliary_loss_clip": 0.01105925, - "auxiliary_loss_mlp": 0.01036209, - "balance_loss_clip": 1.04536867, - "balance_loss_mlp": 1.02267289, - "epoch": 0.3912220051104765, - "flos": 16359680989440.0, - "grad_norm": 2.106647299507305, - "language_loss": 0.75086504, - "learning_rate": 2.7804085333772477e-06, - "loss": 0.77228636, - "num_input_tokens_seen": 139690565, - "step": 6507, - "time_per_iteration": 2.8207101821899414 - }, - { - "auxiliary_loss_clip": 0.01053146, - "auxiliary_loss_mlp": 0.01002126, - "balance_loss_clip": 1.02403712, - "balance_loss_mlp": 1.00068331, - "epoch": 0.39128212836314447, - "flos": 71050986420480.0, - "grad_norm": 0.9386901837185221, - "language_loss": 0.56488812, - "learning_rate": 2.7800499301851446e-06, - "loss": 0.58544087, - "num_input_tokens_seen": 139749420, - "step": 6508, - "time_per_iteration": 3.3985793590545654 - }, - { - "auxiliary_loss_clip": 0.01121659, - "auxiliary_loss_mlp": 0.01038464, - "balance_loss_clip": 1.05045915, - "balance_loss_mlp": 1.02476096, - "epoch": 0.39134225161581243, - "flos": 20331975173760.0, - "grad_norm": 2.0207920703954936, - "language_loss": 0.76855135, - "learning_rate": 2.779691297413471e-06, - "loss": 0.79015261, - "num_input_tokens_seen": 139766265, - "step": 6509, - "time_per_iteration": 2.6667048931121826 - }, - { - "auxiliary_loss_clip": 0.01101334, - "auxiliary_loss_mlp": 0.01043985, - "balance_loss_clip": 1.04298568, - "balance_loss_mlp": 1.02731967, - "epoch": 0.3914023748684804, - "flos": 17018390551680.0, - "grad_norm": 5.905968065437354, - "language_loss": 0.82739937, - "learning_rate": 2.779332635075825e-06, - "loss": 0.84885252, - "num_input_tokens_seen": 139782400, - "step": 6510, - "time_per_iteration": 2.933931589126587 - }, - { - "auxiliary_loss_clip": 0.0112259, - "auxiliary_loss_mlp": 0.01038677, - "balance_loss_clip": 1.04712081, - "balance_loss_mlp": 1.02406788, - "epoch": 0.39146249812114836, - "flos": 18405224709120.0, - "grad_norm": 5.781106582003857, - "language_loss": 0.76999253, - "learning_rate": 2.7789739431858073e-06, - "loss": 0.79160517, - "num_input_tokens_seen": 139801435, - "step": 6511, - "time_per_iteration": 2.6926233768463135 - }, - { - "auxiliary_loss_clip": 0.01035867, - "auxiliary_loss_mlp": 0.01006458, - "balance_loss_clip": 1.02583003, - "balance_loss_mlp": 1.00515223, - "epoch": 0.3915226213738163, - "flos": 67637355442560.0, - "grad_norm": 0.716551875912138, - "language_loss": 0.57749176, - "learning_rate": 2.7786152217570196e-06, - "loss": 0.59791505, - "num_input_tokens_seen": 139869700, - "step": 6512, - "time_per_iteration": 3.3695731163024902 - }, - { - "auxiliary_loss_clip": 0.01135844, - "auxiliary_loss_mlp": 0.01035397, - "balance_loss_clip": 1.05013657, - "balance_loss_mlp": 1.02001858, - "epoch": 0.3915827446264843, - "flos": 26359330181760.0, - "grad_norm": 1.8014676974175234, - "language_loss": 0.69625974, - "learning_rate": 2.7782564708030647e-06, - "loss": 0.71797216, - "num_input_tokens_seen": 139890140, - "step": 6513, - "time_per_iteration": 2.8037526607513428 - }, - { - "auxiliary_loss_clip": 0.01095461, - "auxiliary_loss_mlp": 0.01038913, - "balance_loss_clip": 1.04791474, - "balance_loss_mlp": 1.02376771, - "epoch": 0.39164286787915226, - "flos": 21943897908480.0, - "grad_norm": 8.577901504868834, - "language_loss": 0.75566119, - "learning_rate": 2.7778976903375464e-06, - "loss": 0.77700496, - "num_input_tokens_seen": 139908020, - "step": 6514, - "time_per_iteration": 2.8419485092163086 - }, - { - "auxiliary_loss_clip": 0.01094835, - "auxiliary_loss_mlp": 0.01040327, - "balance_loss_clip": 1.04639578, - "balance_loss_mlp": 1.02636766, - "epoch": 0.3917029911318202, - "flos": 16399829416320.0, - "grad_norm": 2.188170768945522, - "language_loss": 0.77334291, - "learning_rate": 2.7775388803740693e-06, - "loss": 0.79469454, - "num_input_tokens_seen": 139926180, - "step": 6515, - "time_per_iteration": 2.7894155979156494 - }, - { - "auxiliary_loss_clip": 0.01087017, - "auxiliary_loss_mlp": 0.0105158, - "balance_loss_clip": 1.03979194, - "balance_loss_mlp": 1.03763223, - "epoch": 0.3917631143844882, - "flos": 26211701283840.0, - "grad_norm": 1.5088395946363757, - "language_loss": 0.79678488, - "learning_rate": 2.7771800409262406e-06, - "loss": 0.81817091, - "num_input_tokens_seen": 139947420, - "step": 6516, - "time_per_iteration": 2.902660608291626 - }, - { - "auxiliary_loss_clip": 0.01092649, - "auxiliary_loss_mlp": 0.01042434, - "balance_loss_clip": 1.04691982, - "balance_loss_mlp": 1.02799749, - "epoch": 0.39182323763715615, - "flos": 18548364407040.0, - "grad_norm": 1.9539461980584907, - "language_loss": 0.70539331, - "learning_rate": 2.7768211720076665e-06, - "loss": 0.72674412, - "num_input_tokens_seen": 139965800, - "step": 6517, - "time_per_iteration": 4.275412082672119 - }, - { - "auxiliary_loss_clip": 0.0108795, - "auxiliary_loss_mlp": 0.01045392, - "balance_loss_clip": 1.04107618, - "balance_loss_mlp": 1.03034759, - "epoch": 0.3918833608898241, - "flos": 34313543395200.0, - "grad_norm": 1.7270068216094292, - "language_loss": 0.72215492, - "learning_rate": 2.776462273631956e-06, - "loss": 0.74348831, - "num_input_tokens_seen": 139988140, - "step": 6518, - "time_per_iteration": 4.390907287597656 - }, - { - "auxiliary_loss_clip": 0.01124647, - "auxiliary_loss_mlp": 0.0104138, - "balance_loss_clip": 1.05179489, - "balance_loss_mlp": 1.02679503, - "epoch": 0.3919434841424921, - "flos": 36939582812160.0, - "grad_norm": 1.8265438315676477, - "language_loss": 0.61835045, - "learning_rate": 2.7761033458127177e-06, - "loss": 0.64001071, - "num_input_tokens_seen": 140010060, - "step": 6519, - "time_per_iteration": 4.281017780303955 - }, - { - "auxiliary_loss_clip": 0.01142133, - "auxiliary_loss_mlp": 0.01043415, - "balance_loss_clip": 1.05199361, - "balance_loss_mlp": 1.02807307, - "epoch": 0.3920036073951601, - "flos": 23508956373120.0, - "grad_norm": 2.723028929016538, - "language_loss": 0.67084813, - "learning_rate": 2.775744388563563e-06, - "loss": 0.6927036, - "num_input_tokens_seen": 140029400, - "step": 6520, - "time_per_iteration": 2.6971800327301025 - }, - { - "auxiliary_loss_clip": 0.01130641, - "auxiliary_loss_mlp": 0.01040483, - "balance_loss_clip": 1.04749501, - "balance_loss_mlp": 1.02648759, - "epoch": 0.39206373064782807, - "flos": 18406086635520.0, - "grad_norm": 1.8214273138880266, - "language_loss": 0.78716481, - "learning_rate": 2.775385401898104e-06, - "loss": 0.80887604, - "num_input_tokens_seen": 140048940, - "step": 6521, - "time_per_iteration": 2.69966459274292 - }, - { - "auxiliary_loss_clip": 0.01128458, - "auxiliary_loss_mlp": 0.01040156, - "balance_loss_clip": 1.05050826, - "balance_loss_mlp": 1.02289462, - "epoch": 0.39212385390049603, - "flos": 12313051608960.0, - "grad_norm": 2.9673341059873897, - "language_loss": 0.70119011, - "learning_rate": 2.775026385829952e-06, - "loss": 0.72287625, - "num_input_tokens_seen": 140066380, - "step": 6522, - "time_per_iteration": 2.7100417613983154 - }, - { - "auxiliary_loss_clip": 0.0110971, - "auxiliary_loss_mlp": 0.01035612, - "balance_loss_clip": 1.0467701, - "balance_loss_mlp": 1.02100325, - "epoch": 0.392183977153164, - "flos": 19719160214400.0, - "grad_norm": 2.0481488550445595, - "language_loss": 0.76847959, - "learning_rate": 2.774667340372722e-06, - "loss": 0.78993279, - "num_input_tokens_seen": 140085275, - "step": 6523, - "time_per_iteration": 4.336375713348389 - }, - { - "auxiliary_loss_clip": 0.01111577, - "auxiliary_loss_mlp": 0.01040964, - "balance_loss_clip": 1.04617906, - "balance_loss_mlp": 1.02597904, - "epoch": 0.39224410040583196, - "flos": 33144902403840.0, - "grad_norm": 2.4064695780458254, - "language_loss": 0.62052447, - "learning_rate": 2.7743082655400293e-06, - "loss": 0.64204991, - "num_input_tokens_seen": 140105105, - "step": 6524, - "time_per_iteration": 2.861999750137329 - }, - { - "auxiliary_loss_clip": 0.0113421, - "auxiliary_loss_mlp": 0.01041444, - "balance_loss_clip": 1.04792655, - "balance_loss_mlp": 1.02591681, - "epoch": 0.39230422365849993, - "flos": 27782434097280.0, - "grad_norm": 3.311294983146634, - "language_loss": 0.74027938, - "learning_rate": 2.773949161345489e-06, - "loss": 0.76203597, - "num_input_tokens_seen": 140125645, - "step": 6525, - "time_per_iteration": 2.6660265922546387 - }, - { - "auxiliary_loss_clip": 0.01111123, - "auxiliary_loss_mlp": 0.01038937, - "balance_loss_clip": 1.04621911, - "balance_loss_mlp": 1.02488267, - "epoch": 0.3923643469111679, - "flos": 17931634865280.0, - "grad_norm": 1.9378599466790423, - "language_loss": 0.81101322, - "learning_rate": 2.773590027802719e-06, - "loss": 0.83251387, - "num_input_tokens_seen": 140141925, - "step": 6526, - "time_per_iteration": 2.6949198246002197 - }, - { - "auxiliary_loss_clip": 0.01122115, - "auxiliary_loss_mlp": 0.01043128, - "balance_loss_clip": 1.04750228, - "balance_loss_mlp": 1.02844119, - "epoch": 0.39242447016383586, - "flos": 24059539019520.0, - "grad_norm": 2.21390394508072, - "language_loss": 0.69860446, - "learning_rate": 2.7732308649253383e-06, - "loss": 0.72025692, - "num_input_tokens_seen": 140160965, - "step": 6527, - "time_per_iteration": 2.648738384246826 - }, - { - "auxiliary_loss_clip": 0.01093845, - "auxiliary_loss_mlp": 0.01034532, - "balance_loss_clip": 1.04563034, - "balance_loss_mlp": 1.01990485, - "epoch": 0.3924845934165038, - "flos": 10664069016960.0, - "grad_norm": 2.870547931880311, - "language_loss": 0.82659566, - "learning_rate": 2.772871672726965e-06, - "loss": 0.84787941, - "num_input_tokens_seen": 140177780, - "step": 6528, - "time_per_iteration": 2.7436537742614746 - }, - { - "auxiliary_loss_clip": 0.01105744, - "auxiliary_loss_mlp": 0.01032675, - "balance_loss_clip": 1.04709864, - "balance_loss_mlp": 1.01909113, - "epoch": 0.3925447166691718, - "flos": 31245910174080.0, - "grad_norm": 1.7012894335018593, - "language_loss": 0.68846285, - "learning_rate": 2.7725124512212205e-06, - "loss": 0.70984709, - "num_input_tokens_seen": 140201660, - "step": 6529, - "time_per_iteration": 2.7932794094085693 - }, - { - "auxiliary_loss_clip": 0.01112194, - "auxiliary_loss_mlp": 0.01035865, - "balance_loss_clip": 1.04500198, - "balance_loss_mlp": 1.02043366, - "epoch": 0.39260483992183975, - "flos": 29415040087680.0, - "grad_norm": 2.4176127237752145, - "language_loss": 0.80461496, - "learning_rate": 2.7721532004217267e-06, - "loss": 0.82609558, - "num_input_tokens_seen": 140218585, - "step": 6530, - "time_per_iteration": 2.7094242572784424 - }, - { - "auxiliary_loss_clip": 0.01119536, - "auxiliary_loss_mlp": 0.0104093, - "balance_loss_clip": 1.04586959, - "balance_loss_mlp": 1.0264107, - "epoch": 0.3926649631745077, - "flos": 22857788666880.0, - "grad_norm": 1.6828565274400475, - "language_loss": 0.75680822, - "learning_rate": 2.7717939203421063e-06, - "loss": 0.77841288, - "num_input_tokens_seen": 140239905, - "step": 6531, - "time_per_iteration": 2.7238411903381348 - }, - { - "auxiliary_loss_clip": 0.01058847, - "auxiliary_loss_mlp": 0.01008064, - "balance_loss_clip": 1.03009987, - "balance_loss_mlp": 1.00663972, - "epoch": 0.3927250864271757, - "flos": 63893881872000.0, - "grad_norm": 0.8211432271778524, - "language_loss": 0.60317427, - "learning_rate": 2.7714346109959822e-06, - "loss": 0.62384337, - "num_input_tokens_seen": 140293820, - "step": 6532, - "time_per_iteration": 3.047954797744751 - }, - { - "auxiliary_loss_clip": 0.01037233, - "auxiliary_loss_mlp": 0.01004719, - "balance_loss_clip": 1.02873898, - "balance_loss_mlp": 1.00334251, - "epoch": 0.3927852096798437, - "flos": 68909741890560.0, - "grad_norm": 0.7803139799058858, - "language_loss": 0.55459583, - "learning_rate": 2.771075272396981e-06, - "loss": 0.57501537, - "num_input_tokens_seen": 140360420, - "step": 6533, - "time_per_iteration": 3.306561231613159 - }, - { - "auxiliary_loss_clip": 0.01112553, - "auxiliary_loss_mlp": 0.01040733, - "balance_loss_clip": 1.04983759, - "balance_loss_mlp": 1.02614141, - "epoch": 0.39284533293251167, - "flos": 29715972232320.0, - "grad_norm": 2.2467248181922232, - "language_loss": 0.75955313, - "learning_rate": 2.7707159045587284e-06, - "loss": 0.78108597, - "num_input_tokens_seen": 140381950, - "step": 6534, - "time_per_iteration": 2.7788329124450684 - }, - { - "auxiliary_loss_clip": 0.0112134, - "auxiliary_loss_mlp": 0.01045716, - "balance_loss_clip": 1.04698312, - "balance_loss_mlp": 1.02866912, - "epoch": 0.39290545618517964, - "flos": 18552027594240.0, - "grad_norm": 2.2080736338994944, - "language_loss": 0.78123498, - "learning_rate": 2.770356507494851e-06, - "loss": 0.80290556, - "num_input_tokens_seen": 140399410, - "step": 6535, - "time_per_iteration": 2.6949005126953125 - }, - { - "auxiliary_loss_clip": 0.0109337, - "auxiliary_loss_mlp": 0.0103265, - "balance_loss_clip": 1.04779291, - "balance_loss_mlp": 1.01950169, - "epoch": 0.3929655794378476, - "flos": 26249479413120.0, - "grad_norm": 1.9769476518607105, - "language_loss": 0.686719, - "learning_rate": 2.769997081218978e-06, - "loss": 0.7079792, - "num_input_tokens_seen": 140419055, - "step": 6536, - "time_per_iteration": 2.7684245109558105 - }, - { - "auxiliary_loss_clip": 0.01104946, - "auxiliary_loss_mlp": 0.01037851, - "balance_loss_clip": 1.04767156, - "balance_loss_mlp": 1.02469027, - "epoch": 0.39302570269051557, - "flos": 29277933874560.0, - "grad_norm": 1.8012856746153256, - "language_loss": 0.69048655, - "learning_rate": 2.769637625744738e-06, - "loss": 0.71191454, - "num_input_tokens_seen": 140438800, - "step": 6537, - "time_per_iteration": 2.7638440132141113 - }, - { - "auxiliary_loss_clip": 0.01122897, - "auxiliary_loss_mlp": 0.01040751, - "balance_loss_clip": 1.05155134, - "balance_loss_mlp": 1.02624357, - "epoch": 0.39308582594318353, - "flos": 17347440067200.0, - "grad_norm": 1.7514361880438423, - "language_loss": 0.78990901, - "learning_rate": 2.769278141085763e-06, - "loss": 0.81154549, - "num_input_tokens_seen": 140456880, - "step": 6538, - "time_per_iteration": 2.635075807571411 - }, - { - "auxiliary_loss_clip": 0.01003397, - "auxiliary_loss_mlp": 0.01017351, - "balance_loss_clip": 1.02259159, - "balance_loss_mlp": 1.01596797, - "epoch": 0.3931459491958515, - "flos": 61007094650880.0, - "grad_norm": 0.8098068956453415, - "language_loss": 0.6190061, - "learning_rate": 2.768918627255683e-06, - "loss": 0.63921356, - "num_input_tokens_seen": 140507510, - "step": 6539, - "time_per_iteration": 3.0673203468322754 - }, - { - "auxiliary_loss_clip": 0.01104217, - "auxiliary_loss_mlp": 0.0103537, - "balance_loss_clip": 1.04730296, - "balance_loss_mlp": 1.0206002, - "epoch": 0.39320607244851946, - "flos": 39016009249920.0, - "grad_norm": 3.0347619755245248, - "language_loss": 0.68405032, - "learning_rate": 2.7685590842681315e-06, - "loss": 0.70544618, - "num_input_tokens_seen": 140528740, - "step": 6540, - "time_per_iteration": 2.7993643283843994 - }, - { - "auxiliary_loss_clip": 0.01105128, - "auxiliary_loss_mlp": 0.01030736, - "balance_loss_clip": 1.04439306, - "balance_loss_mlp": 1.01638293, - "epoch": 0.3932661957011874, - "flos": 24679752180480.0, - "grad_norm": 1.8325322608278536, - "language_loss": 0.7276125, - "learning_rate": 2.7681995121367433e-06, - "loss": 0.74897116, - "num_input_tokens_seen": 140547560, - "step": 6541, - "time_per_iteration": 2.659224510192871 - }, - { - "auxiliary_loss_clip": 0.01054751, - "auxiliary_loss_mlp": 0.01009472, - "balance_loss_clip": 1.02648139, - "balance_loss_mlp": 1.0080775, - "epoch": 0.3933263189538554, - "flos": 70096552185600.0, - "grad_norm": 0.8313029932067456, - "language_loss": 0.60319722, - "learning_rate": 2.7678399108751516e-06, - "loss": 0.6238395, - "num_input_tokens_seen": 140601175, - "step": 6542, - "time_per_iteration": 2.968062400817871 - }, - { - "auxiliary_loss_clip": 0.01121623, - "auxiliary_loss_mlp": 0.01038302, - "balance_loss_clip": 1.04764903, - "balance_loss_mlp": 1.0243547, - "epoch": 0.39338644220652336, - "flos": 22929071207040.0, - "grad_norm": 1.6209695943494522, - "language_loss": 0.82034504, - "learning_rate": 2.7674802804969947e-06, - "loss": 0.84194422, - "num_input_tokens_seen": 140622200, - "step": 6543, - "time_per_iteration": 2.638796806335449 - }, - { - "auxiliary_loss_clip": 0.01103923, - "auxiliary_loss_mlp": 0.01034902, - "balance_loss_clip": 1.04355097, - "balance_loss_mlp": 1.02045417, - "epoch": 0.3934465654591913, - "flos": 30848163897600.0, - "grad_norm": 3.743075543188527, - "language_loss": 0.69100285, - "learning_rate": 2.767120621015908e-06, - "loss": 0.71239114, - "num_input_tokens_seen": 140643125, - "step": 6544, - "time_per_iteration": 2.7180936336517334 - }, - { - "auxiliary_loss_clip": 0.01112442, - "auxiliary_loss_mlp": 0.01047198, - "balance_loss_clip": 1.04659534, - "balance_loss_mlp": 1.0316174, - "epoch": 0.3935066887118593, - "flos": 29236528471680.0, - "grad_norm": 2.0996268311737976, - "language_loss": 0.76072371, - "learning_rate": 2.76676093244553e-06, - "loss": 0.78232014, - "num_input_tokens_seen": 140662500, - "step": 6545, - "time_per_iteration": 2.7429869174957275 - }, - { - "auxiliary_loss_clip": 0.01091051, - "auxiliary_loss_mlp": 0.01033724, - "balance_loss_clip": 1.04633403, - "balance_loss_mlp": 1.02104044, - "epoch": 0.3935668119645273, - "flos": 19135288638720.0, - "grad_norm": 1.7673371756448844, - "language_loss": 0.74672133, - "learning_rate": 2.7664012147995015e-06, - "loss": 0.76796907, - "num_input_tokens_seen": 140681960, - "step": 6546, - "time_per_iteration": 2.6785295009613037 - }, - { - "auxiliary_loss_clip": 0.01109428, - "auxiliary_loss_mlp": 0.0103425, - "balance_loss_clip": 1.04903293, - "balance_loss_mlp": 1.01946843, - "epoch": 0.3936269352171953, - "flos": 18516116972160.0, - "grad_norm": 1.9230817449169166, - "language_loss": 0.81627518, - "learning_rate": 2.7660414680914617e-06, - "loss": 0.83771199, - "num_input_tokens_seen": 140699170, - "step": 6547, - "time_per_iteration": 2.638214588165283 - }, - { - "auxiliary_loss_clip": 0.01114598, - "auxiliary_loss_mlp": 0.00772919, - "balance_loss_clip": 1.04404151, - "balance_loss_mlp": 1.00032711, - "epoch": 0.39368705846986324, - "flos": 15632813370240.0, - "grad_norm": 1.9821442562566327, - "language_loss": 0.84406352, - "learning_rate": 2.7656816923350525e-06, - "loss": 0.86293864, - "num_input_tokens_seen": 140714920, - "step": 6548, - "time_per_iteration": 2.6490747928619385 - }, - { - "auxiliary_loss_clip": 0.01118074, - "auxiliary_loss_mlp": 0.00771091, - "balance_loss_clip": 1.04686236, - "balance_loss_mlp": 1.00034189, - "epoch": 0.3937471817225312, - "flos": 21325839563520.0, - "grad_norm": 1.7617733187332765, - "language_loss": 0.7311933, - "learning_rate": 2.7653218875439174e-06, - "loss": 0.75008494, - "num_input_tokens_seen": 140734595, - "step": 6549, - "time_per_iteration": 2.635380983352661 - }, - { - "auxiliary_loss_clip": 0.01071621, - "auxiliary_loss_mlp": 0.01042928, - "balance_loss_clip": 1.0444963, - "balance_loss_mlp": 1.0259527, - "epoch": 0.39380730497519917, - "flos": 20776693461120.0, - "grad_norm": 2.774519883144605, - "language_loss": 0.77592897, - "learning_rate": 2.764962053731699e-06, - "loss": 0.7970745, - "num_input_tokens_seen": 140754050, - "step": 6550, - "time_per_iteration": 2.733921527862549 - }, - { - "auxiliary_loss_clip": 0.01095205, - "auxiliary_loss_mlp": 0.01030728, - "balance_loss_clip": 1.04455531, - "balance_loss_mlp": 1.01674485, - "epoch": 0.39386742822786713, - "flos": 21609784575360.0, - "grad_norm": 3.1837220930493517, - "language_loss": 0.81144142, - "learning_rate": 2.7646021909120434e-06, - "loss": 0.83270073, - "num_input_tokens_seen": 140771440, - "step": 6551, - "time_per_iteration": 2.851475238800049 - }, - { - "auxiliary_loss_clip": 0.01117625, - "auxiliary_loss_mlp": 0.01036299, - "balance_loss_clip": 1.0443331, - "balance_loss_mlp": 1.02188659, - "epoch": 0.3939275514805351, - "flos": 12414642249600.0, - "grad_norm": 12.177431380433415, - "language_loss": 0.80449802, - "learning_rate": 2.764242299098596e-06, - "loss": 0.82603723, - "num_input_tokens_seen": 140786715, - "step": 6552, - "time_per_iteration": 2.667344570159912 - }, - { - "auxiliary_loss_clip": 0.01133223, - "auxiliary_loss_mlp": 0.01043273, - "balance_loss_clip": 1.04791522, - "balance_loss_mlp": 1.02883697, - "epoch": 0.39398767473320306, - "flos": 18552027594240.0, - "grad_norm": 2.002210962432939, - "language_loss": 0.71199149, - "learning_rate": 2.763882378305003e-06, - "loss": 0.73375642, - "num_input_tokens_seen": 140804950, - "step": 6553, - "time_per_iteration": 2.6329705715179443 - }, - { - "auxiliary_loss_clip": 0.0111827, - "auxiliary_loss_mlp": 0.0077145, - "balance_loss_clip": 1.04818738, - "balance_loss_mlp": 1.00036502, - "epoch": 0.39404779798587103, - "flos": 29308888419840.0, - "grad_norm": 4.200797737547303, - "language_loss": 0.64058566, - "learning_rate": 2.7635224285449144e-06, - "loss": 0.65948284, - "num_input_tokens_seen": 140822800, - "step": 6554, - "time_per_iteration": 2.7190303802490234 - }, - { - "auxiliary_loss_clip": 0.01109713, - "auxiliary_loss_mlp": 0.01041117, - "balance_loss_clip": 1.04655266, - "balance_loss_mlp": 1.02747416, - "epoch": 0.394107921238539, - "flos": 34897055834880.0, - "grad_norm": 2.186636266316066, - "language_loss": 0.78957009, - "learning_rate": 2.7631624498319796e-06, - "loss": 0.81107843, - "num_input_tokens_seen": 140842940, - "step": 6555, - "time_per_iteration": 2.7675819396972656 - }, - { - "auxiliary_loss_clip": 0.01102424, - "auxiliary_loss_mlp": 0.0104302, - "balance_loss_clip": 1.04469514, - "balance_loss_mlp": 1.02758873, - "epoch": 0.39416804449120696, - "flos": 25081413039360.0, - "grad_norm": 1.7945119387028163, - "language_loss": 0.71689165, - "learning_rate": 2.7628024421798473e-06, - "loss": 0.7383461, - "num_input_tokens_seen": 140863060, - "step": 6556, - "time_per_iteration": 4.261122703552246 - }, - { - "auxiliary_loss_clip": 0.01129248, - "auxiliary_loss_mlp": 0.01031706, - "balance_loss_clip": 1.0445503, - "balance_loss_mlp": 1.01749015, - "epoch": 0.3942281677438749, - "flos": 32306639731200.0, - "grad_norm": 1.7970895618407805, - "language_loss": 0.84080362, - "learning_rate": 2.7624424056021705e-06, - "loss": 0.86241317, - "num_input_tokens_seen": 140883795, - "step": 6557, - "time_per_iteration": 2.7031610012054443 - }, - { - "auxiliary_loss_clip": 0.01116561, - "auxiliary_loss_mlp": 0.01032116, - "balance_loss_clip": 1.04790783, - "balance_loss_mlp": 1.01810956, - "epoch": 0.3942882909965429, - "flos": 24936621315840.0, - "grad_norm": 3.8501140650976238, - "language_loss": 0.806759, - "learning_rate": 2.7620823401126004e-06, - "loss": 0.82824582, - "num_input_tokens_seen": 140903055, - "step": 6558, - "time_per_iteration": 5.6523637771606445 - }, - { - "auxiliary_loss_clip": 0.01130051, - "auxiliary_loss_mlp": 0.01035884, - "balance_loss_clip": 1.04807055, - "balance_loss_mlp": 1.02238965, - "epoch": 0.39434841424921085, - "flos": 11874797769600.0, - "grad_norm": 1.8974962376031472, - "language_loss": 0.70930403, - "learning_rate": 2.761722245724792e-06, - "loss": 0.73096335, - "num_input_tokens_seen": 140920685, - "step": 6559, - "time_per_iteration": 2.6645302772521973 - }, - { - "auxiliary_loss_clip": 0.01113668, - "auxiliary_loss_mlp": 0.0104073, - "balance_loss_clip": 1.04660964, - "balance_loss_mlp": 1.02452326, - "epoch": 0.3944085375018789, - "flos": 16361620323840.0, - "grad_norm": 2.3002241644217865, - "language_loss": 0.80355662, - "learning_rate": 2.7613621224524003e-06, - "loss": 0.82510054, - "num_input_tokens_seen": 140937320, - "step": 6560, - "time_per_iteration": 2.8372745513916016 - }, - { - "auxiliary_loss_clip": 0.01109469, - "auxiliary_loss_mlp": 0.0103941, - "balance_loss_clip": 1.04681468, - "balance_loss_mlp": 1.02334619, - "epoch": 0.39446866075454684, - "flos": 10633365866880.0, - "grad_norm": 2.2192317359233034, - "language_loss": 0.828062, - "learning_rate": 2.7610019703090803e-06, - "loss": 0.84955078, - "num_input_tokens_seen": 140954855, - "step": 6561, - "time_per_iteration": 2.6724014282226562 - }, - { - "auxiliary_loss_clip": 0.01119263, - "auxiliary_loss_mlp": 0.01043889, - "balance_loss_clip": 1.04620779, - "balance_loss_mlp": 1.02972126, - "epoch": 0.3945287840072148, - "flos": 18187498419840.0, - "grad_norm": 2.478683034492453, - "language_loss": 0.80985552, - "learning_rate": 2.7606417893084887e-06, - "loss": 0.83148706, - "num_input_tokens_seen": 140973250, - "step": 6562, - "time_per_iteration": 4.211291074752808 - }, - { - "auxiliary_loss_clip": 0.01100981, - "auxiliary_loss_mlp": 0.01040375, - "balance_loss_clip": 1.04367661, - "balance_loss_mlp": 1.02568245, - "epoch": 0.39458890725988277, - "flos": 23039891642880.0, - "grad_norm": 1.8396668644534004, - "language_loss": 0.81574059, - "learning_rate": 2.7602815794642853e-06, - "loss": 0.83715415, - "num_input_tokens_seen": 140993050, - "step": 6563, - "time_per_iteration": 2.6933205127716064 - }, - { - "auxiliary_loss_clip": 0.01078578, - "auxiliary_loss_mlp": 0.01052866, - "balance_loss_clip": 1.03979552, - "balance_loss_mlp": 1.03385234, - "epoch": 0.39464903051255074, - "flos": 17159052211200.0, - "grad_norm": 2.4284687703059276, - "language_loss": 0.69678622, - "learning_rate": 2.759921340790127e-06, - "loss": 0.71810067, - "num_input_tokens_seen": 141010815, - "step": 6564, - "time_per_iteration": 2.7754619121551514 - }, - { - "auxiliary_loss_clip": 0.01119553, - "auxiliary_loss_mlp": 0.01037847, - "balance_loss_clip": 1.04547322, - "balance_loss_mlp": 1.02260029, - "epoch": 0.3947091537652187, - "flos": 15889000147200.0, - "grad_norm": 2.342409184231709, - "language_loss": 0.82842124, - "learning_rate": 2.759561073299676e-06, - "loss": 0.84999526, - "num_input_tokens_seen": 141028720, - "step": 6565, - "time_per_iteration": 2.652029037475586 - }, - { - "auxiliary_loss_clip": 0.01091527, - "auxiliary_loss_mlp": 0.01044097, - "balance_loss_clip": 1.04201448, - "balance_loss_mlp": 1.02794445, - "epoch": 0.39476927701788667, - "flos": 18545491319040.0, - "grad_norm": 1.8313066364371182, - "language_loss": 0.83458865, - "learning_rate": 2.7592007770065937e-06, - "loss": 0.85594487, - "num_input_tokens_seen": 141046025, - "step": 6566, - "time_per_iteration": 2.6853299140930176 - }, - { - "auxiliary_loss_clip": 0.01137834, - "auxiliary_loss_mlp": 0.01036947, - "balance_loss_clip": 1.04882693, - "balance_loss_mlp": 1.02146816, - "epoch": 0.39482940027055463, - "flos": 22275712771200.0, - "grad_norm": 2.7953182854439973, - "language_loss": 0.77462149, - "learning_rate": 2.7588404519245403e-06, - "loss": 0.79636931, - "num_input_tokens_seen": 141066865, - "step": 6567, - "time_per_iteration": 2.6695878505706787 - }, - { - "auxiliary_loss_clip": 0.01114738, - "auxiliary_loss_mlp": 0.01037774, - "balance_loss_clip": 1.04457474, - "balance_loss_mlp": 1.0235877, - "epoch": 0.3948895235232226, - "flos": 14757634494720.0, - "grad_norm": 3.2000391748281065, - "language_loss": 0.80752146, - "learning_rate": 2.758480098067182e-06, - "loss": 0.82904655, - "num_input_tokens_seen": 141084210, - "step": 6568, - "time_per_iteration": 2.6126980781555176 - }, - { - "auxiliary_loss_clip": 0.01100656, - "auxiliary_loss_mlp": 0.01035941, - "balance_loss_clip": 1.04693437, - "balance_loss_mlp": 1.02142143, - "epoch": 0.39494964677589056, - "flos": 22565763095040.0, - "grad_norm": 3.155903507973262, - "language_loss": 0.846977, - "learning_rate": 2.7581197154481816e-06, - "loss": 0.868343, - "num_input_tokens_seen": 141103895, - "step": 6569, - "time_per_iteration": 2.731241464614868 - }, - { - "auxiliary_loss_clip": 0.01076285, - "auxiliary_loss_mlp": 0.01045444, - "balance_loss_clip": 1.046417, - "balance_loss_mlp": 1.03076911, - "epoch": 0.3950097700285585, - "flos": 22963186149120.0, - "grad_norm": 2.966787083651573, - "language_loss": 0.74931526, - "learning_rate": 2.7577593040812066e-06, - "loss": 0.77053261, - "num_input_tokens_seen": 141124000, - "step": 6570, - "time_per_iteration": 2.816168785095215 - }, - { - "auxiliary_loss_clip": 0.01093382, - "auxiliary_loss_mlp": 0.01037489, - "balance_loss_clip": 1.04271865, - "balance_loss_mlp": 1.02224803, - "epoch": 0.3950698932812265, - "flos": 20595236929920.0, - "grad_norm": 3.807643490882315, - "language_loss": 0.80009687, - "learning_rate": 2.757398863979922e-06, - "loss": 0.82140559, - "num_input_tokens_seen": 141142535, - "step": 6571, - "time_per_iteration": 2.7444143295288086 - }, - { - "auxiliary_loss_clip": 0.0110309, - "auxiliary_loss_mlp": 0.01042438, - "balance_loss_clip": 1.046592, - "balance_loss_mlp": 1.02792382, - "epoch": 0.39513001653389446, - "flos": 20375786787840.0, - "grad_norm": 2.0513494110156105, - "language_loss": 0.77667749, - "learning_rate": 2.757038395157997e-06, - "loss": 0.79813272, - "num_input_tokens_seen": 141161575, - "step": 6572, - "time_per_iteration": 2.787951946258545 - }, - { - "auxiliary_loss_clip": 0.01096298, - "auxiliary_loss_mlp": 0.01039178, - "balance_loss_clip": 1.04524946, - "balance_loss_mlp": 1.02422285, - "epoch": 0.3951901397865625, - "flos": 26463650256000.0, - "grad_norm": 2.2233910711840092, - "language_loss": 0.74710405, - "learning_rate": 2.7566778976291002e-06, - "loss": 0.76845872, - "num_input_tokens_seen": 141181150, - "step": 6573, - "time_per_iteration": 2.8065271377563477 - }, - { - "auxiliary_loss_clip": 0.01119667, - "auxiliary_loss_mlp": 0.01033875, - "balance_loss_clip": 1.04583275, - "balance_loss_mlp": 1.02073228, - "epoch": 0.39525026303923044, - "flos": 43838345767680.0, - "grad_norm": 1.5702623893020402, - "language_loss": 0.681665, - "learning_rate": 2.7563173714069017e-06, - "loss": 0.7032004, - "num_input_tokens_seen": 141206310, - "step": 6574, - "time_per_iteration": 2.917938470840454 - }, - { - "auxiliary_loss_clip": 0.01066027, - "auxiliary_loss_mlp": 0.01046829, - "balance_loss_clip": 1.03601551, - "balance_loss_mlp": 1.02941298, - "epoch": 0.3953103862918984, - "flos": 18040803275520.0, - "grad_norm": 11.51359836007049, - "language_loss": 0.71934754, - "learning_rate": 2.755956816505072e-06, - "loss": 0.74047613, - "num_input_tokens_seen": 141223925, - "step": 6575, - "time_per_iteration": 2.8125574588775635 - }, - { - "auxiliary_loss_clip": 0.01106625, - "auxiliary_loss_mlp": 0.01044084, - "balance_loss_clip": 1.04328454, - "balance_loss_mlp": 1.02871156, - "epoch": 0.3953705095445664, - "flos": 16976015481600.0, - "grad_norm": 2.3082458130711276, - "language_loss": 0.73497486, - "learning_rate": 2.7555962329372845e-06, - "loss": 0.75648189, - "num_input_tokens_seen": 141239010, - "step": 6576, - "time_per_iteration": 2.7072994709014893 - }, - { - "auxiliary_loss_clip": 0.01131853, - "auxiliary_loss_mlp": 0.01038072, - "balance_loss_clip": 1.04721868, - "balance_loss_mlp": 1.02482772, - "epoch": 0.39543063279723434, - "flos": 17411144837760.0, - "grad_norm": 2.584581612312142, - "language_loss": 0.83806884, - "learning_rate": 2.7552356207172124e-06, - "loss": 0.85976809, - "num_input_tokens_seen": 141252255, - "step": 6577, - "time_per_iteration": 2.673980236053467 - }, - { - "auxiliary_loss_clip": 0.01108115, - "auxiliary_loss_mlp": 0.01038249, - "balance_loss_clip": 1.04473734, - "balance_loss_mlp": 1.02394366, - "epoch": 0.3954907560499023, - "flos": 22784207656320.0, - "grad_norm": 3.282604232183532, - "language_loss": 0.90597945, - "learning_rate": 2.75487497985853e-06, - "loss": 0.92744309, - "num_input_tokens_seen": 141269325, - "step": 6578, - "time_per_iteration": 2.8357715606689453 - }, - { - "auxiliary_loss_clip": 0.01113431, - "auxiliary_loss_mlp": 0.01038042, - "balance_loss_clip": 1.04971015, - "balance_loss_mlp": 1.0215559, - "epoch": 0.39555087930257027, - "flos": 21944400698880.0, - "grad_norm": 1.9328811386040925, - "language_loss": 0.77836883, - "learning_rate": 2.7545143103749117e-06, - "loss": 0.7998836, - "num_input_tokens_seen": 141288505, - "step": 6579, - "time_per_iteration": 2.78900146484375 - }, - { - "auxiliary_loss_clip": 0.01080071, - "auxiliary_loss_mlp": 0.01037596, - "balance_loss_clip": 1.04296517, - "balance_loss_mlp": 1.02181292, - "epoch": 0.39561100255523823, - "flos": 20404622430720.0, - "grad_norm": 2.0515288557813705, - "language_loss": 0.68375254, - "learning_rate": 2.754153612280037e-06, - "loss": 0.70492923, - "num_input_tokens_seen": 141303680, - "step": 6580, - "time_per_iteration": 2.796602249145508 - }, - { - "auxiliary_loss_clip": 0.01119101, - "auxiliary_loss_mlp": 0.01031775, - "balance_loss_clip": 1.04687381, - "balance_loss_mlp": 1.01770234, - "epoch": 0.3956711258079062, - "flos": 27964572986880.0, - "grad_norm": 5.6192422063497425, - "language_loss": 0.58592093, - "learning_rate": 2.7537928855875797e-06, - "loss": 0.60742974, - "num_input_tokens_seen": 141324090, - "step": 6581, - "time_per_iteration": 2.738732099533081 - }, - { - "auxiliary_loss_clip": 0.0110807, - "auxiliary_loss_mlp": 0.01047889, - "balance_loss_clip": 1.04554892, - "balance_loss_mlp": 1.03111625, - "epoch": 0.39573124906057416, - "flos": 14428297670400.0, - "grad_norm": 1.840388254222325, - "language_loss": 0.69687581, - "learning_rate": 2.7534321303112224e-06, - "loss": 0.71843535, - "num_input_tokens_seen": 141342235, - "step": 6582, - "time_per_iteration": 2.74564790725708 - }, - { - "auxiliary_loss_clip": 0.01132763, - "auxiliary_loss_mlp": 0.0077198, - "balance_loss_clip": 1.04670966, - "balance_loss_mlp": 1.00066948, - "epoch": 0.39579137231324213, - "flos": 18733699607040.0, - "grad_norm": 2.093309053458098, - "language_loss": 0.76838243, - "learning_rate": 2.753071346464642e-06, - "loss": 0.78742981, - "num_input_tokens_seen": 141361195, - "step": 6583, - "time_per_iteration": 2.6127665042877197 - }, - { - "auxiliary_loss_clip": 0.01084294, - "auxiliary_loss_mlp": 0.00772199, - "balance_loss_clip": 1.04135418, - "balance_loss_mlp": 1.00058353, - "epoch": 0.3958514955659101, - "flos": 17676417755520.0, - "grad_norm": 2.422087879109688, - "language_loss": 0.66005278, - "learning_rate": 2.7527105340615207e-06, - "loss": 0.67861772, - "num_input_tokens_seen": 141378275, - "step": 6584, - "time_per_iteration": 2.8412790298461914 - }, - { - "auxiliary_loss_clip": 0.0109769, - "auxiliary_loss_mlp": 0.01042803, - "balance_loss_clip": 1.04634333, - "balance_loss_mlp": 1.02687716, - "epoch": 0.39591161881857806, - "flos": 29309103901440.0, - "grad_norm": 7.452692779947077, - "language_loss": 0.72775561, - "learning_rate": 2.7523496931155413e-06, - "loss": 0.74916053, - "num_input_tokens_seen": 141396960, - "step": 6585, - "time_per_iteration": 2.8504436016082764 - }, - { - "auxiliary_loss_clip": 0.0109915, - "auxiliary_loss_mlp": 0.01041099, - "balance_loss_clip": 1.04335117, - "balance_loss_mlp": 1.02628136, - "epoch": 0.3959717420712461, - "flos": 25771831332480.0, - "grad_norm": 1.8603715362450812, - "language_loss": 0.73381901, - "learning_rate": 2.7519888236403856e-06, - "loss": 0.75522149, - "num_input_tokens_seen": 141417320, - "step": 6586, - "time_per_iteration": 2.8426311016082764 - }, - { - "auxiliary_loss_clip": 0.01101854, - "auxiliary_loss_mlp": 0.0103792, - "balance_loss_clip": 1.04255629, - "balance_loss_mlp": 1.02266693, - "epoch": 0.39603186532391405, - "flos": 20923783655040.0, - "grad_norm": 2.174728382433504, - "language_loss": 0.71447468, - "learning_rate": 2.7516279256497382e-06, - "loss": 0.73587245, - "num_input_tokens_seen": 141435985, - "step": 6587, - "time_per_iteration": 2.7798478603363037 - }, - { - "auxiliary_loss_clip": 0.01007869, - "auxiliary_loss_mlp": 0.01003214, - "balance_loss_clip": 1.02249742, - "balance_loss_mlp": 1.00195026, - "epoch": 0.396091988576582, - "flos": 54880986176640.0, - "grad_norm": 0.9478406102040471, - "language_loss": 0.61186492, - "learning_rate": 2.751266999157285e-06, - "loss": 0.63197577, - "num_input_tokens_seen": 141486075, - "step": 6588, - "time_per_iteration": 3.1663742065429688 - }, - { - "auxiliary_loss_clip": 0.0110963, - "auxiliary_loss_mlp": 0.00772247, - "balance_loss_clip": 1.04547548, - "balance_loss_mlp": 1.0006907, - "epoch": 0.39615211182925, - "flos": 20702896968960.0, - "grad_norm": 3.1004380492305206, - "language_loss": 0.81686854, - "learning_rate": 2.7509060441767115e-06, - "loss": 0.8356874, - "num_input_tokens_seen": 141505280, - "step": 6589, - "time_per_iteration": 2.7711055278778076 - }, - { - "auxiliary_loss_clip": 0.01106228, - "auxiliary_loss_mlp": 0.01038149, - "balance_loss_clip": 1.04562962, - "balance_loss_mlp": 1.02241325, - "epoch": 0.39621223508191794, - "flos": 20994312009600.0, - "grad_norm": 2.2429889858322802, - "language_loss": 0.69913912, - "learning_rate": 2.7505450607217057e-06, - "loss": 0.72058284, - "num_input_tokens_seen": 141523930, - "step": 6590, - "time_per_iteration": 2.793330669403076 - }, - { - "auxiliary_loss_clip": 0.01117633, - "auxiliary_loss_mlp": 0.01056421, - "balance_loss_clip": 1.04669666, - "balance_loss_mlp": 1.03980339, - "epoch": 0.3962723583345859, - "flos": 23368833417600.0, - "grad_norm": 1.772211549409949, - "language_loss": 0.75809395, - "learning_rate": 2.750184048805956e-06, - "loss": 0.77983451, - "num_input_tokens_seen": 141541320, - "step": 6591, - "time_per_iteration": 2.7317981719970703 - }, - { - "auxiliary_loss_clip": 0.01043506, - "auxiliary_loss_mlp": 0.01049181, - "balance_loss_clip": 1.03802264, - "balance_loss_mlp": 1.03364813, - "epoch": 0.39633248158725387, - "flos": 25115599808640.0, - "grad_norm": 2.064980952243903, - "language_loss": 0.78466719, - "learning_rate": 2.749823008443152e-06, - "loss": 0.80559409, - "num_input_tokens_seen": 141561880, - "step": 6592, - "time_per_iteration": 3.192194700241089 - }, - { - "auxiliary_loss_clip": 0.01059924, - "auxiliary_loss_mlp": 0.01033868, - "balance_loss_clip": 1.03984666, - "balance_loss_mlp": 1.01872826, - "epoch": 0.39639260483992184, - "flos": 39787622236800.0, - "grad_norm": 1.9568402514544967, - "language_loss": 0.69690341, - "learning_rate": 2.7494619396469843e-06, - "loss": 0.71784127, - "num_input_tokens_seen": 141586460, - "step": 6593, - "time_per_iteration": 3.365752696990967 - }, - { - "auxiliary_loss_clip": 0.01059564, - "auxiliary_loss_mlp": 0.01046377, - "balance_loss_clip": 1.03668404, - "balance_loss_mlp": 1.03035569, - "epoch": 0.3964527280925898, - "flos": 17347045017600.0, - "grad_norm": 1.624713370756075, - "language_loss": 0.77905881, - "learning_rate": 2.7491008424311452e-06, - "loss": 0.80011821, - "num_input_tokens_seen": 141605955, - "step": 6594, - "time_per_iteration": 2.890626907348633 - }, - { - "auxiliary_loss_clip": 0.01025812, - "auxiliary_loss_mlp": 0.01003509, - "balance_loss_clip": 1.02550435, - "balance_loss_mlp": 1.00200129, - "epoch": 0.39651285134525777, - "flos": 71717848369920.0, - "grad_norm": 0.9363100872746896, - "language_loss": 0.6304667, - "learning_rate": 2.7487397168093265e-06, - "loss": 0.65075988, - "num_input_tokens_seen": 141673140, - "step": 6595, - "time_per_iteration": 3.3955094814300537 - }, - { - "auxiliary_loss_clip": 0.01096586, - "auxiliary_loss_mlp": 0.01055368, - "balance_loss_clip": 1.0442034, - "balance_loss_mlp": 1.03774858, - "epoch": 0.39657297459792573, - "flos": 25775710001280.0, - "grad_norm": 2.5609780352809732, - "language_loss": 0.63787287, - "learning_rate": 2.748378562795223e-06, - "loss": 0.65939242, - "num_input_tokens_seen": 141692955, - "step": 6596, - "time_per_iteration": 4.60092568397522 - }, - { - "auxiliary_loss_clip": 0.01120147, - "auxiliary_loss_mlp": 0.010422, - "balance_loss_clip": 1.04657853, - "balance_loss_mlp": 1.02747798, - "epoch": 0.3966330978505937, - "flos": 20266115587200.0, - "grad_norm": 2.0315739566024567, - "language_loss": 0.79006839, - "learning_rate": 2.7480173804025293e-06, - "loss": 0.81169188, - "num_input_tokens_seen": 141710680, - "step": 6597, - "time_per_iteration": 5.807824373245239 - }, - { - "auxiliary_loss_clip": 0.01099639, - "auxiliary_loss_mlp": 0.00773402, - "balance_loss_clip": 1.04352951, - "balance_loss_mlp": 1.00076032, - "epoch": 0.39669322110326166, - "flos": 20631183465600.0, - "grad_norm": 2.966898609781474, - "language_loss": 0.6772182, - "learning_rate": 2.747656169644941e-06, - "loss": 0.69594866, - "num_input_tokens_seen": 141729860, - "step": 6598, - "time_per_iteration": 2.786884307861328 - }, - { - "auxiliary_loss_clip": 0.01129462, - "auxiliary_loss_mlp": 0.01041455, - "balance_loss_clip": 1.04473436, - "balance_loss_mlp": 1.02785325, - "epoch": 0.3967533443559297, - "flos": 21726063878400.0, - "grad_norm": 2.1804433985902247, - "language_loss": 0.79342777, - "learning_rate": 2.747294930536157e-06, - "loss": 0.81513697, - "num_input_tokens_seen": 141749060, - "step": 6599, - "time_per_iteration": 2.6758370399475098 - }, - { - "auxiliary_loss_clip": 0.01091573, - "auxiliary_loss_mlp": 0.01041619, - "balance_loss_clip": 1.04314208, - "balance_loss_mlp": 1.02487051, - "epoch": 0.39681346760859765, - "flos": 25484151306240.0, - "grad_norm": 2.279505844275463, - "language_loss": 0.72878486, - "learning_rate": 2.7469336630898737e-06, - "loss": 0.75011677, - "num_input_tokens_seen": 141769860, - "step": 6600, - "time_per_iteration": 2.7616889476776123 - }, - { - "auxiliary_loss_clip": 0.01083152, - "auxiliary_loss_mlp": 0.01037422, - "balance_loss_clip": 1.03626251, - "balance_loss_mlp": 1.0220201, - "epoch": 0.3968735908612656, - "flos": 20959586536320.0, - "grad_norm": 2.0515710245603938, - "language_loss": 0.85973942, - "learning_rate": 2.746572367319791e-06, - "loss": 0.88094509, - "num_input_tokens_seen": 141788465, - "step": 6601, - "time_per_iteration": 2.755791664123535 - }, - { - "auxiliary_loss_clip": 0.01095713, - "auxiliary_loss_mlp": 0.01041964, - "balance_loss_clip": 1.0429877, - "balance_loss_mlp": 1.02468467, - "epoch": 0.3969337141139336, - "flos": 10707090531840.0, - "grad_norm": 2.240549855963289, - "language_loss": 0.70372766, - "learning_rate": 2.7462110432396095e-06, - "loss": 0.72510445, - "num_input_tokens_seen": 141804955, - "step": 6602, - "time_per_iteration": 4.643726348876953 - }, - { - "auxiliary_loss_clip": 0.01133428, - "auxiliary_loss_mlp": 0.01047809, - "balance_loss_clip": 1.04658508, - "balance_loss_mlp": 1.03230548, - "epoch": 0.39699383736660154, - "flos": 17593714690560.0, - "grad_norm": 3.7711392584572896, - "language_loss": 0.83248609, - "learning_rate": 2.7458496908630305e-06, - "loss": 0.85429847, - "num_input_tokens_seen": 141820025, - "step": 6603, - "time_per_iteration": 2.8909716606140137 - }, - { - "auxiliary_loss_clip": 0.01112282, - "auxiliary_loss_mlp": 0.01034339, - "balance_loss_clip": 1.04651403, - "balance_loss_mlp": 1.02003431, - "epoch": 0.3970539606192695, - "flos": 17785945301760.0, - "grad_norm": 1.9508498227264648, - "language_loss": 0.73302728, - "learning_rate": 2.7454883102037563e-06, - "loss": 0.75449347, - "num_input_tokens_seen": 141838735, - "step": 6604, - "time_per_iteration": 2.828908920288086 - }, - { - "auxiliary_loss_clip": 0.01105132, - "auxiliary_loss_mlp": 0.01038476, - "balance_loss_clip": 1.04384422, - "balance_loss_mlp": 1.02364659, - "epoch": 0.3971140838719375, - "flos": 24789495208320.0, - "grad_norm": 1.769953580879433, - "language_loss": 0.82582277, - "learning_rate": 2.745126901275491e-06, - "loss": 0.84725887, - "num_input_tokens_seen": 141858090, - "step": 6605, - "time_per_iteration": 2.6773502826690674 - }, - { - "auxiliary_loss_clip": 0.01128613, - "auxiliary_loss_mlp": 0.01033129, - "balance_loss_clip": 1.04549098, - "balance_loss_mlp": 1.01968801, - "epoch": 0.39717420712460544, - "flos": 24243581329920.0, - "grad_norm": 1.4871941413504006, - "language_loss": 0.73511499, - "learning_rate": 2.7447654640919383e-06, - "loss": 0.75673246, - "num_input_tokens_seen": 141877540, - "step": 6606, - "time_per_iteration": 2.632805347442627 - }, - { - "auxiliary_loss_clip": 0.01089285, - "auxiliary_loss_mlp": 0.01048599, - "balance_loss_clip": 1.0436089, - "balance_loss_mlp": 1.03198171, - "epoch": 0.3972343303772734, - "flos": 25884698843520.0, - "grad_norm": 2.092571399644939, - "language_loss": 0.74296981, - "learning_rate": 2.744403998666805e-06, - "loss": 0.76434863, - "num_input_tokens_seen": 141897315, - "step": 6607, - "time_per_iteration": 2.7277770042419434 - }, - { - "auxiliary_loss_clip": 0.01124169, - "auxiliary_loss_mlp": 0.01037393, - "balance_loss_clip": 1.04697132, - "balance_loss_mlp": 1.02267027, - "epoch": 0.39729445362994137, - "flos": 45623716300800.0, - "grad_norm": 1.5196847129379933, - "language_loss": 0.6787042, - "learning_rate": 2.744042505013797e-06, - "loss": 0.70031989, - "num_input_tokens_seen": 141919580, - "step": 6608, - "time_per_iteration": 2.8229119777679443 - }, - { - "auxiliary_loss_clip": 0.01094928, - "auxiliary_loss_mlp": 0.01054175, - "balance_loss_clip": 1.04091311, - "balance_loss_mlp": 1.03580451, - "epoch": 0.39735457688260933, - "flos": 20193971120640.0, - "grad_norm": 7.314681050409252, - "language_loss": 0.74670005, - "learning_rate": 2.7436809831466233e-06, - "loss": 0.7681911, - "num_input_tokens_seen": 141937045, - "step": 6609, - "time_per_iteration": 2.7502245903015137 - }, - { - "auxiliary_loss_clip": 0.01107217, - "auxiliary_loss_mlp": 0.01036058, - "balance_loss_clip": 1.04354, - "balance_loss_mlp": 1.02056026, - "epoch": 0.3974147001352773, - "flos": 23331163029120.0, - "grad_norm": 1.742454501323656, - "language_loss": 0.713238, - "learning_rate": 2.7433194330789927e-06, - "loss": 0.73467076, - "num_input_tokens_seen": 141956695, - "step": 6610, - "time_per_iteration": 2.7225286960601807 - }, - { - "auxiliary_loss_clip": 0.01105851, - "auxiliary_loss_mlp": 0.01030068, - "balance_loss_clip": 1.03818822, - "balance_loss_mlp": 1.01509547, - "epoch": 0.39747482338794526, - "flos": 21688644885120.0, - "grad_norm": 1.7960063460415152, - "language_loss": 0.78151029, - "learning_rate": 2.7429578548246133e-06, - "loss": 0.8028695, - "num_input_tokens_seen": 141975935, - "step": 6611, - "time_per_iteration": 2.6464622020721436 - }, - { - "auxiliary_loss_clip": 0.01121213, - "auxiliary_loss_mlp": 0.01038273, - "balance_loss_clip": 1.04614162, - "balance_loss_mlp": 1.0235095, - "epoch": 0.3975349466406133, - "flos": 30988717816320.0, - "grad_norm": 1.7937788130001704, - "language_loss": 0.7921629, - "learning_rate": 2.7425962483971985e-06, - "loss": 0.81375778, - "num_input_tokens_seen": 141995750, - "step": 6612, - "time_per_iteration": 2.734950304031372 - }, - { - "auxiliary_loss_clip": 0.01018209, - "auxiliary_loss_mlp": 0.0100828, - "balance_loss_clip": 1.02113628, - "balance_loss_mlp": 1.00702214, - "epoch": 0.39759506989328125, - "flos": 63683948833920.0, - "grad_norm": 0.8423760762856193, - "language_loss": 0.64935088, - "learning_rate": 2.742234613810459e-06, - "loss": 0.66961575, - "num_input_tokens_seen": 142057655, - "step": 6613, - "time_per_iteration": 3.1294105052948 - }, - { - "auxiliary_loss_clip": 0.01097901, - "auxiliary_loss_mlp": 0.01042526, - "balance_loss_clip": 1.03916883, - "balance_loss_mlp": 1.02507401, - "epoch": 0.3976551931459492, - "flos": 23695835857920.0, - "grad_norm": 3.0444472336295636, - "language_loss": 0.71508956, - "learning_rate": 2.741872951078109e-06, - "loss": 0.73649383, - "num_input_tokens_seen": 142076020, - "step": 6614, - "time_per_iteration": 2.6479976177215576 - }, - { - "auxiliary_loss_clip": 0.01116106, - "auxiliary_loss_mlp": 0.01035284, - "balance_loss_clip": 1.04503131, - "balance_loss_mlp": 1.02034712, - "epoch": 0.3977153163986172, - "flos": 15669657745920.0, - "grad_norm": 2.2927333729520885, - "language_loss": 0.81362098, - "learning_rate": 2.741511260213862e-06, - "loss": 0.83513486, - "num_input_tokens_seen": 142093790, - "step": 6615, - "time_per_iteration": 2.6567723751068115 - }, - { - "auxiliary_loss_clip": 0.01094954, - "auxiliary_loss_mlp": 0.01034601, - "balance_loss_clip": 1.04491544, - "balance_loss_mlp": 1.02023649, - "epoch": 0.39777543965128515, - "flos": 14064702249600.0, - "grad_norm": 2.01024859105405, - "language_loss": 0.67510247, - "learning_rate": 2.741149541231434e-06, - "loss": 0.69639802, - "num_input_tokens_seen": 142110545, - "step": 6616, - "time_per_iteration": 2.6675400733947754 - }, - { - "auxiliary_loss_clip": 0.01133654, - "auxiliary_loss_mlp": 0.01043633, - "balance_loss_clip": 1.04658771, - "balance_loss_mlp": 1.02765918, - "epoch": 0.3978355629039531, - "flos": 23367468700800.0, - "grad_norm": 2.3086733420735785, - "language_loss": 0.83678514, - "learning_rate": 2.740787794144541e-06, - "loss": 0.85855806, - "num_input_tokens_seen": 142128695, - "step": 6617, - "time_per_iteration": 2.5879552364349365 - }, - { - "auxiliary_loss_clip": 0.01126085, - "auxiliary_loss_mlp": 0.01039432, - "balance_loss_clip": 1.04570735, - "balance_loss_mlp": 1.02563334, - "epoch": 0.3978956861566211, - "flos": 19062785036160.0, - "grad_norm": 1.7795732635152253, - "language_loss": 0.72766519, - "learning_rate": 2.7404260189669e-06, - "loss": 0.74932027, - "num_input_tokens_seen": 142148375, - "step": 6618, - "time_per_iteration": 2.613162040710449 - }, - { - "auxiliary_loss_clip": 0.01111951, - "auxiliary_loss_mlp": 0.01041983, - "balance_loss_clip": 1.04827428, - "balance_loss_mlp": 1.02544832, - "epoch": 0.39795580940928904, - "flos": 30227699341440.0, - "grad_norm": 1.6960793445061386, - "language_loss": 0.65858316, - "learning_rate": 2.740064215712231e-06, - "loss": 0.68012249, - "num_input_tokens_seen": 142169735, - "step": 6619, - "time_per_iteration": 2.7474000453948975 - }, - { - "auxiliary_loss_clip": 0.01052495, - "auxiliary_loss_mlp": 0.01004058, - "balance_loss_clip": 1.0230546, - "balance_loss_mlp": 1.00270545, - "epoch": 0.398015932661957, - "flos": 69847224906240.0, - "grad_norm": 0.7704475067145287, - "language_loss": 0.58246851, - "learning_rate": 2.7397023843942527e-06, - "loss": 0.60303402, - "num_input_tokens_seen": 142229520, - "step": 6620, - "time_per_iteration": 3.1400091648101807 - }, - { - "auxiliary_loss_clip": 0.01113547, - "auxiliary_loss_mlp": 0.0103675, - "balance_loss_clip": 1.04998422, - "balance_loss_mlp": 1.02314794, - "epoch": 0.39807605591462497, - "flos": 20157773189760.0, - "grad_norm": 1.821199996328267, - "language_loss": 0.7925806, - "learning_rate": 2.739340525026686e-06, - "loss": 0.81408358, - "num_input_tokens_seen": 142247660, - "step": 6621, - "time_per_iteration": 2.7389161586761475 - }, - { - "auxiliary_loss_clip": 0.0110802, - "auxiliary_loss_mlp": 0.01034956, - "balance_loss_clip": 1.04590595, - "balance_loss_mlp": 1.02088952, - "epoch": 0.39813617916729294, - "flos": 21141761339520.0, - "grad_norm": 1.899291394170355, - "language_loss": 0.77800381, - "learning_rate": 2.738978637623252e-06, - "loss": 0.79943347, - "num_input_tokens_seen": 142266990, - "step": 6622, - "time_per_iteration": 2.7175779342651367 - }, - { - "auxiliary_loss_clip": 0.01101638, - "auxiliary_loss_mlp": 0.01038721, - "balance_loss_clip": 1.04108417, - "balance_loss_mlp": 1.02377844, - "epoch": 0.3981963024199609, - "flos": 18988485753600.0, - "grad_norm": 1.6278701941081761, - "language_loss": 0.7497921, - "learning_rate": 2.738616722197674e-06, - "loss": 0.77119565, - "num_input_tokens_seen": 142287170, - "step": 6623, - "time_per_iteration": 2.682567596435547 - }, - { - "auxiliary_loss_clip": 0.01088304, - "auxiliary_loss_mlp": 0.01040759, - "balance_loss_clip": 1.04280734, - "balance_loss_mlp": 1.02590537, - "epoch": 0.39825642567262887, - "flos": 16575108808320.0, - "grad_norm": 2.4968757127264465, - "language_loss": 0.79733497, - "learning_rate": 2.7382547787636766e-06, - "loss": 0.81862563, - "num_input_tokens_seen": 142305405, - "step": 6624, - "time_per_iteration": 2.6878697872161865 - }, - { - "auxiliary_loss_clip": 0.01135858, - "auxiliary_loss_mlp": 0.01043783, - "balance_loss_clip": 1.04792297, - "balance_loss_mlp": 1.0270462, - "epoch": 0.39831654892529683, - "flos": 22199833290240.0, - "grad_norm": 2.0557211895564884, - "language_loss": 0.83616954, - "learning_rate": 2.7378928073349832e-06, - "loss": 0.85796595, - "num_input_tokens_seen": 142322710, - "step": 6625, - "time_per_iteration": 2.5847036838531494 - }, - { - "auxiliary_loss_clip": 0.011152, - "auxiliary_loss_mlp": 0.01044436, - "balance_loss_clip": 1.04585958, - "balance_loss_mlp": 1.02948713, - "epoch": 0.39837667217796485, - "flos": 10487963612160.0, - "grad_norm": 2.4120237094780377, - "language_loss": 0.87324822, - "learning_rate": 2.737530807925321e-06, - "loss": 0.89484465, - "num_input_tokens_seen": 142338535, - "step": 6626, - "time_per_iteration": 2.5845320224761963 - }, - { - "auxiliary_loss_clip": 0.01067442, - "auxiliary_loss_mlp": 0.00775778, - "balance_loss_clip": 1.03995085, - "balance_loss_mlp": 1.00066137, - "epoch": 0.3984367954306328, - "flos": 17965282930560.0, - "grad_norm": 2.3324132294494797, - "language_loss": 0.83462882, - "learning_rate": 2.737168780548417e-06, - "loss": 0.85306096, - "num_input_tokens_seen": 142354570, - "step": 6627, - "time_per_iteration": 2.854428291320801 - }, - { - "auxiliary_loss_clip": 0.01087071, - "auxiliary_loss_mlp": 0.00771611, - "balance_loss_clip": 1.04081798, - "balance_loss_mlp": 1.00056684, - "epoch": 0.3984969186833008, - "flos": 22711057608960.0, - "grad_norm": 1.4575889504047923, - "language_loss": 0.82904339, - "learning_rate": 2.736806725217998e-06, - "loss": 0.84763026, - "num_input_tokens_seen": 142374395, - "step": 6628, - "time_per_iteration": 2.772620916366577 - }, - { - "auxiliary_loss_clip": 0.01092039, - "auxiliary_loss_mlp": 0.01062711, - "balance_loss_clip": 1.04402328, - "balance_loss_mlp": 1.04652882, - "epoch": 0.39855704193596875, - "flos": 23405785534080.0, - "grad_norm": 1.6631347026103094, - "language_loss": 0.71145642, - "learning_rate": 2.7364446419477945e-06, - "loss": 0.73300385, - "num_input_tokens_seen": 142396040, - "step": 6629, - "time_per_iteration": 2.681969165802002 - }, - { - "auxiliary_loss_clip": 0.01097676, - "auxiliary_loss_mlp": 0.01035809, - "balance_loss_clip": 1.04695797, - "balance_loss_mlp": 1.02136111, - "epoch": 0.3986171651886367, - "flos": 21251935330560.0, - "grad_norm": 1.757569665448266, - "language_loss": 0.80513418, - "learning_rate": 2.7360825307515366e-06, - "loss": 0.82646906, - "num_input_tokens_seen": 142415495, - "step": 6630, - "time_per_iteration": 2.7747275829315186 - }, - { - "auxiliary_loss_clip": 0.01072778, - "auxiliary_loss_mlp": 0.01032526, - "balance_loss_clip": 1.04495096, - "balance_loss_mlp": 1.01805389, - "epoch": 0.3986772884413047, - "flos": 12458705258880.0, - "grad_norm": 2.3833222910170857, - "language_loss": 0.74846494, - "learning_rate": 2.7357203916429555e-06, - "loss": 0.76951796, - "num_input_tokens_seen": 142431865, - "step": 6631, - "time_per_iteration": 2.8098866939544678 - }, - { - "auxiliary_loss_clip": 0.01095184, - "auxiliary_loss_mlp": 0.01040404, - "balance_loss_clip": 1.04248333, - "balance_loss_mlp": 1.02500248, - "epoch": 0.39873741169397264, - "flos": 19646117907840.0, - "grad_norm": 2.096728163981437, - "language_loss": 0.7160908, - "learning_rate": 2.735358224635783e-06, - "loss": 0.73744667, - "num_input_tokens_seen": 142450595, - "step": 6632, - "time_per_iteration": 2.81479811668396 - }, - { - "auxiliary_loss_clip": 0.01063774, - "auxiliary_loss_mlp": 0.00771132, - "balance_loss_clip": 1.04164338, - "balance_loss_mlp": 1.00057721, - "epoch": 0.3987975349466406, - "flos": 21684766216320.0, - "grad_norm": 2.0680050346702945, - "language_loss": 0.7479074, - "learning_rate": 2.7349960297437533e-06, - "loss": 0.76625645, - "num_input_tokens_seen": 142466650, - "step": 6633, - "time_per_iteration": 2.9533073902130127 - }, - { - "auxiliary_loss_clip": 0.01105798, - "auxiliary_loss_mlp": 0.01028668, - "balance_loss_clip": 1.0465138, - "balance_loss_mlp": 1.01509583, - "epoch": 0.3988576581993086, - "flos": 23914064937600.0, - "grad_norm": 1.7626671777587215, - "language_loss": 0.81420207, - "learning_rate": 2.7346338069806e-06, - "loss": 0.83554673, - "num_input_tokens_seen": 142486165, - "step": 6634, - "time_per_iteration": 2.760012626647949 - }, - { - "auxiliary_loss_clip": 0.0110458, - "auxiliary_loss_mlp": 0.0103091, - "balance_loss_clip": 1.04739153, - "balance_loss_mlp": 1.01618731, - "epoch": 0.39891778145197654, - "flos": 18149899858560.0, - "grad_norm": 2.495702621722643, - "language_loss": 0.74914795, - "learning_rate": 2.7342715563600597e-06, - "loss": 0.77050287, - "num_input_tokens_seen": 142505035, - "step": 6635, - "time_per_iteration": 4.225152015686035 - }, - { - "auxiliary_loss_clip": 0.01101511, - "auxiliary_loss_mlp": 0.01039121, - "balance_loss_clip": 1.04791617, - "balance_loss_mlp": 1.02265239, - "epoch": 0.3989779047046445, - "flos": 22595281096320.0, - "grad_norm": 28.19463582214486, - "language_loss": 0.66373086, - "learning_rate": 2.733909277895868e-06, - "loss": 0.68513715, - "num_input_tokens_seen": 142521870, - "step": 6636, - "time_per_iteration": 4.455794811248779 - }, - { - "auxiliary_loss_clip": 0.01118899, - "auxiliary_loss_mlp": 0.01041724, - "balance_loss_clip": 1.04681683, - "balance_loss_mlp": 1.02687669, - "epoch": 0.39903802795731247, - "flos": 18077216688000.0, - "grad_norm": 2.0422591411720723, - "language_loss": 0.81318372, - "learning_rate": 2.733546971601763e-06, - "loss": 0.83478993, - "num_input_tokens_seen": 142540455, - "step": 6637, - "time_per_iteration": 4.3843090534210205 - }, - { - "auxiliary_loss_clip": 0.0102804, - "auxiliary_loss_mlp": 0.01018728, - "balance_loss_clip": 1.02743387, - "balance_loss_mlp": 1.01694012, - "epoch": 0.39909815120998043, - "flos": 70441367771520.0, - "grad_norm": 0.719892771757815, - "language_loss": 0.53119934, - "learning_rate": 2.733184637491484e-06, - "loss": 0.55166698, - "num_input_tokens_seen": 142599665, - "step": 6638, - "time_per_iteration": 3.2910361289978027 - }, - { - "auxiliary_loss_clip": 0.01112783, - "auxiliary_loss_mlp": 0.00772668, - "balance_loss_clip": 1.04786587, - "balance_loss_mlp": 1.00065207, - "epoch": 0.39915827446264845, - "flos": 18549262247040.0, - "grad_norm": 1.6115719033099838, - "language_loss": 0.75487578, - "learning_rate": 2.732822275578769e-06, - "loss": 0.77373028, - "num_input_tokens_seen": 142618845, - "step": 6639, - "time_per_iteration": 2.7083969116210938 - }, - { - "auxiliary_loss_clip": 0.0105821, - "auxiliary_loss_mlp": 0.01036909, - "balance_loss_clip": 1.03856301, - "balance_loss_mlp": 1.022264, - "epoch": 0.3992183977153164, - "flos": 29897249195520.0, - "grad_norm": 2.505539025941121, - "language_loss": 0.76163709, - "learning_rate": 2.7324598858773603e-06, - "loss": 0.78258824, - "num_input_tokens_seen": 142640885, - "step": 6640, - "time_per_iteration": 2.8801841735839844 - }, - { - "auxiliary_loss_clip": 0.01102565, - "auxiliary_loss_mlp": 0.01038995, - "balance_loss_clip": 1.04663992, - "balance_loss_mlp": 1.02430892, - "epoch": 0.3992785209679844, - "flos": 22565080736640.0, - "grad_norm": 2.779199402703341, - "language_loss": 0.81995392, - "learning_rate": 2.7320974684009996e-06, - "loss": 0.84136951, - "num_input_tokens_seen": 142659340, - "step": 6641, - "time_per_iteration": 4.346608638763428 - }, - { - "auxiliary_loss_clip": 0.01136449, - "auxiliary_loss_mlp": 0.01038781, - "balance_loss_clip": 1.05189252, - "balance_loss_mlp": 1.02393353, - "epoch": 0.39933864422065235, - "flos": 19682674974720.0, - "grad_norm": 2.1545130527280985, - "language_loss": 0.76744998, - "learning_rate": 2.7317350231634288e-06, - "loss": 0.78920233, - "num_input_tokens_seen": 142677085, - "step": 6642, - "time_per_iteration": 2.656057596206665 - }, - { - "auxiliary_loss_clip": 0.01106418, - "auxiliary_loss_mlp": 0.01034072, - "balance_loss_clip": 1.04871511, - "balance_loss_mlp": 1.0196898, - "epoch": 0.3993987674733203, - "flos": 23038491012480.0, - "grad_norm": 2.1744041926742788, - "language_loss": 0.72387367, - "learning_rate": 2.731372550178393e-06, - "loss": 0.7452786, - "num_input_tokens_seen": 142694595, - "step": 6643, - "time_per_iteration": 2.680995225906372 - }, - { - "auxiliary_loss_clip": 0.01123145, - "auxiliary_loss_mlp": 0.01040337, - "balance_loss_clip": 1.04840899, - "balance_loss_mlp": 1.02565074, - "epoch": 0.3994588907259883, - "flos": 19390828970880.0, - "grad_norm": 1.7059817149479597, - "language_loss": 0.6665355, - "learning_rate": 2.7310100494596375e-06, - "loss": 0.68817025, - "num_input_tokens_seen": 142714175, - "step": 6644, - "time_per_iteration": 2.6378324031829834 - }, - { - "auxiliary_loss_clip": 0.01130779, - "auxiliary_loss_mlp": 0.0103839, - "balance_loss_clip": 1.04629064, - "balance_loss_mlp": 1.02349472, - "epoch": 0.39951901397865625, - "flos": 13734395758080.0, - "grad_norm": 2.1425296608964937, - "language_loss": 0.78164649, - "learning_rate": 2.730647521020907e-06, - "loss": 0.80333817, - "num_input_tokens_seen": 142730955, - "step": 6645, - "time_per_iteration": 2.6268746852874756 - }, - { - "auxiliary_loss_clip": 0.0112116, - "auxiliary_loss_mlp": 0.01037104, - "balance_loss_clip": 1.04624033, - "balance_loss_mlp": 1.02252507, - "epoch": 0.3995791372313242, - "flos": 23586451966080.0, - "grad_norm": 1.7492924628724136, - "language_loss": 0.69861412, - "learning_rate": 2.73028496487595e-06, - "loss": 0.72019678, - "num_input_tokens_seen": 142751200, - "step": 6646, - "time_per_iteration": 2.7350409030914307 - }, - { - "auxiliary_loss_clip": 0.0107684, - "auxiliary_loss_mlp": 0.01037342, - "balance_loss_clip": 1.03799927, - "balance_loss_mlp": 1.02223825, - "epoch": 0.3996392604839922, - "flos": 21355896268800.0, - "grad_norm": 1.7623657715359762, - "language_loss": 0.72017872, - "learning_rate": 2.729922381038513e-06, - "loss": 0.74132061, - "num_input_tokens_seen": 142770170, - "step": 6647, - "time_per_iteration": 2.7607529163360596 - }, - { - "auxiliary_loss_clip": 0.01093143, - "auxiliary_loss_mlp": 0.01043089, - "balance_loss_clip": 1.04529011, - "balance_loss_mlp": 1.02973795, - "epoch": 0.39969938373666014, - "flos": 26032255914240.0, - "grad_norm": 1.4563496549616326, - "language_loss": 0.74217343, - "learning_rate": 2.7295597695223463e-06, - "loss": 0.7635358, - "num_input_tokens_seen": 142792680, - "step": 6648, - "time_per_iteration": 2.8048219680786133 - }, - { - "auxiliary_loss_clip": 0.01133606, - "auxiliary_loss_mlp": 0.01037616, - "balance_loss_clip": 1.04912674, - "balance_loss_mlp": 1.02281022, - "epoch": 0.3997595069893281, - "flos": 20116367786880.0, - "grad_norm": 2.0433040752683578, - "language_loss": 0.6589973, - "learning_rate": 2.7291971303412006e-06, - "loss": 0.6807096, - "num_input_tokens_seen": 142810510, - "step": 6649, - "time_per_iteration": 2.6976583003997803 - }, - { - "auxiliary_loss_clip": 0.01103049, - "auxiliary_loss_mlp": 0.01042133, - "balance_loss_clip": 1.04713392, - "balance_loss_mlp": 1.02803016, - "epoch": 0.39981963024199607, - "flos": 27783403764480.0, - "grad_norm": 1.7319771659085785, - "language_loss": 0.75106388, - "learning_rate": 2.728834463508826e-06, - "loss": 0.77251565, - "num_input_tokens_seen": 142832455, - "step": 6650, - "time_per_iteration": 2.7441325187683105 - }, - { - "auxiliary_loss_clip": 0.01132922, - "auxiliary_loss_mlp": 0.01042591, - "balance_loss_clip": 1.04873252, - "balance_loss_mlp": 1.02803564, - "epoch": 0.39987975349466404, - "flos": 21944436612480.0, - "grad_norm": 1.5673208577322473, - "language_loss": 0.72102094, - "learning_rate": 2.728471769038975e-06, - "loss": 0.74277604, - "num_input_tokens_seen": 142852590, - "step": 6651, - "time_per_iteration": 2.6027066707611084 - }, - { - "auxiliary_loss_clip": 0.01132958, - "auxiliary_loss_mlp": 0.01045235, - "balance_loss_clip": 1.04850328, - "balance_loss_mlp": 1.03093004, - "epoch": 0.39993987674733206, - "flos": 20704405340160.0, - "grad_norm": 1.8492457158027382, - "language_loss": 0.73126423, - "learning_rate": 2.728109046945403e-06, - "loss": 0.75304615, - "num_input_tokens_seen": 142870595, - "step": 6652, - "time_per_iteration": 2.5880327224731445 - }, - { - "auxiliary_loss_clip": 0.01029168, - "auxiliary_loss_mlp": 0.01002764, - "balance_loss_clip": 1.02822125, - "balance_loss_mlp": 1.00134552, - "epoch": 0.4, - "flos": 61525429862400.0, - "grad_norm": 0.8458278780382239, - "language_loss": 0.60614997, - "learning_rate": 2.727746297241862e-06, - "loss": 0.62646931, - "num_input_tokens_seen": 142925805, - "step": 6653, - "time_per_iteration": 3.1626622676849365 - }, - { - "auxiliary_loss_clip": 0.01093219, - "auxiliary_loss_mlp": 0.01039197, - "balance_loss_clip": 1.04810715, - "balance_loss_mlp": 1.02577376, - "epoch": 0.400060123252668, - "flos": 14502309644160.0, - "grad_norm": 3.0617453661279788, - "language_loss": 0.66701174, - "learning_rate": 2.7273835199421085e-06, - "loss": 0.6883359, - "num_input_tokens_seen": 142943145, - "step": 6654, - "time_per_iteration": 2.696179151535034 - }, - { - "auxiliary_loss_clip": 0.01119303, - "auxiliary_loss_mlp": 0.01043738, - "balance_loss_clip": 1.04738593, - "balance_loss_mlp": 1.03145993, - "epoch": 0.40012024650533595, - "flos": 19093308618240.0, - "grad_norm": 2.461149956206156, - "language_loss": 0.89818919, - "learning_rate": 2.7270207150599e-06, - "loss": 0.91981959, - "num_input_tokens_seen": 142956925, - "step": 6655, - "time_per_iteration": 2.601891279220581 - }, - { - "auxiliary_loss_clip": 0.01100614, - "auxiliary_loss_mlp": 0.01040322, - "balance_loss_clip": 1.04367936, - "balance_loss_mlp": 1.02693462, - "epoch": 0.4001803697580039, - "flos": 29351012094720.0, - "grad_norm": 1.7913118709828861, - "language_loss": 0.73551166, - "learning_rate": 2.7266578826089917e-06, - "loss": 0.75692105, - "num_input_tokens_seen": 142978040, - "step": 6656, - "time_per_iteration": 2.705662727355957 - }, - { - "auxiliary_loss_clip": 0.01131953, - "auxiliary_loss_mlp": 0.01046856, - "balance_loss_clip": 1.04838896, - "balance_loss_mlp": 1.03224063, - "epoch": 0.4002404930106719, - "flos": 20920048640640.0, - "grad_norm": 1.6512050463613386, - "language_loss": 0.73344004, - "learning_rate": 2.726295022603144e-06, - "loss": 0.75522816, - "num_input_tokens_seen": 142998390, - "step": 6657, - "time_per_iteration": 2.7595558166503906 - }, - { - "auxiliary_loss_clip": 0.0113267, - "auxiliary_loss_mlp": 0.01046679, - "balance_loss_clip": 1.04887247, - "balance_loss_mlp": 1.03145635, - "epoch": 0.40030061626333985, - "flos": 28405735827840.0, - "grad_norm": 1.7318374723338787, - "language_loss": 0.79715288, - "learning_rate": 2.725932135056117e-06, - "loss": 0.81894636, - "num_input_tokens_seen": 143021505, - "step": 6658, - "time_per_iteration": 2.6718270778656006 - }, - { - "auxiliary_loss_clip": 0.01115521, - "auxiliary_loss_mlp": 0.01042275, - "balance_loss_clip": 1.04249525, - "balance_loss_mlp": 1.02865553, - "epoch": 0.4003607395160078, - "flos": 25921615046400.0, - "grad_norm": 2.0999446343296317, - "language_loss": 0.77464151, - "learning_rate": 2.72556921998167e-06, - "loss": 0.79621947, - "num_input_tokens_seen": 143041375, - "step": 6659, - "time_per_iteration": 2.7160539627075195 - }, - { - "auxiliary_loss_clip": 0.01118822, - "auxiliary_loss_mlp": 0.01028418, - "balance_loss_clip": 1.04276848, - "balance_loss_mlp": 1.01649117, - "epoch": 0.4004208627686758, - "flos": 20768648814720.0, - "grad_norm": 1.6781351315554156, - "language_loss": 0.72410327, - "learning_rate": 2.7252062773935662e-06, - "loss": 0.74557567, - "num_input_tokens_seen": 143058725, - "step": 6660, - "time_per_iteration": 2.636833429336548 - }, - { - "auxiliary_loss_clip": 0.01101229, - "auxiliary_loss_mlp": 0.01041317, - "balance_loss_clip": 1.04196119, - "balance_loss_mlp": 1.02828765, - "epoch": 0.40048098602134374, - "flos": 24681224638080.0, - "grad_norm": 1.813091564393644, - "language_loss": 0.71008015, - "learning_rate": 2.7248433073055674e-06, - "loss": 0.73150557, - "num_input_tokens_seen": 143076995, - "step": 6661, - "time_per_iteration": 2.6956517696380615 - }, - { - "auxiliary_loss_clip": 0.0113437, - "auxiliary_loss_mlp": 0.01042051, - "balance_loss_clip": 1.0506804, - "balance_loss_mlp": 1.02825832, - "epoch": 0.4005411092740117, - "flos": 23185688947200.0, - "grad_norm": 1.8086148623568068, - "language_loss": 0.75526643, - "learning_rate": 2.724480309731437e-06, - "loss": 0.77703071, - "num_input_tokens_seen": 143096780, - "step": 6662, - "time_per_iteration": 2.6232621669769287 - }, - { - "auxiliary_loss_clip": 0.01115634, - "auxiliary_loss_mlp": 0.01036997, - "balance_loss_clip": 1.04385805, - "balance_loss_mlp": 1.02194118, - "epoch": 0.4006012325266797, - "flos": 17522324409600.0, - "grad_norm": 2.00646115239694, - "language_loss": 0.66450548, - "learning_rate": 2.7241172846849417e-06, - "loss": 0.68603182, - "num_input_tokens_seen": 143112590, - "step": 6663, - "time_per_iteration": 2.622520923614502 - }, - { - "auxiliary_loss_clip": 0.01112804, - "auxiliary_loss_mlp": 0.01042686, - "balance_loss_clip": 1.04327071, - "balance_loss_mlp": 1.02767718, - "epoch": 0.40066135577934764, - "flos": 19857200181120.0, - "grad_norm": 2.069962140682172, - "language_loss": 0.86383915, - "learning_rate": 2.7237542321798455e-06, - "loss": 0.88539398, - "num_input_tokens_seen": 143130220, - "step": 6664, - "time_per_iteration": 2.575124979019165 - }, - { - "auxiliary_loss_clip": 0.01119355, - "auxiliary_loss_mlp": 0.01036584, - "balance_loss_clip": 1.04696763, - "balance_loss_mlp": 1.0227679, - "epoch": 0.40072147903201566, - "flos": 18150007599360.0, - "grad_norm": 16.441358853078547, - "language_loss": 0.84723455, - "learning_rate": 2.723391152229917e-06, - "loss": 0.86879396, - "num_input_tokens_seen": 143147160, - "step": 6665, - "time_per_iteration": 2.671715259552002 - }, - { - "auxiliary_loss_clip": 0.01119739, - "auxiliary_loss_mlp": 0.01037355, - "balance_loss_clip": 1.04976356, - "balance_loss_mlp": 1.02249575, - "epoch": 0.4007816022846836, - "flos": 18661267831680.0, - "grad_norm": 1.8896907519127706, - "language_loss": 0.78118432, - "learning_rate": 2.7230280448489236e-06, - "loss": 0.80275524, - "num_input_tokens_seen": 143164605, - "step": 6666, - "time_per_iteration": 2.606566905975342 - }, - { - "auxiliary_loss_clip": 0.01120664, - "auxiliary_loss_mlp": 0.01038255, - "balance_loss_clip": 1.0485028, - "balance_loss_mlp": 1.02380657, - "epoch": 0.4008417255373516, - "flos": 25703170485120.0, - "grad_norm": 1.7955817814438895, - "language_loss": 0.73301423, - "learning_rate": 2.7226649100506333e-06, - "loss": 0.75460339, - "num_input_tokens_seen": 143183965, - "step": 6667, - "time_per_iteration": 2.652503490447998 - }, - { - "auxiliary_loss_clip": 0.0111465, - "auxiliary_loss_mlp": 0.01054818, - "balance_loss_clip": 1.04516435, - "balance_loss_mlp": 1.03899896, - "epoch": 0.40090184879001955, - "flos": 22858614679680.0, - "grad_norm": 1.708550182183753, - "language_loss": 0.76022822, - "learning_rate": 2.7223017478488183e-06, - "loss": 0.78192288, - "num_input_tokens_seen": 143204965, - "step": 6668, - "time_per_iteration": 2.6797566413879395 - }, - { - "auxiliary_loss_clip": 0.01096645, - "auxiliary_loss_mlp": 0.01046849, - "balance_loss_clip": 1.04792619, - "balance_loss_mlp": 1.0321629, - "epoch": 0.4009619720426875, - "flos": 29059848449280.0, - "grad_norm": 2.335244314112793, - "language_loss": 0.8221435, - "learning_rate": 2.721938558257248e-06, - "loss": 0.84357846, - "num_input_tokens_seen": 143225015, - "step": 6669, - "time_per_iteration": 2.7661361694335938 - }, - { - "auxiliary_loss_clip": 0.010311, - "auxiliary_loss_mlp": 0.01009516, - "balance_loss_clip": 1.02684975, - "balance_loss_mlp": 1.00805604, - "epoch": 0.4010220952953555, - "flos": 66059763131520.0, - "grad_norm": 0.69994773813092, - "language_loss": 0.53312683, - "learning_rate": 2.721575341289695e-06, - "loss": 0.55353302, - "num_input_tokens_seen": 143294925, - "step": 6670, - "time_per_iteration": 3.5547046661376953 - }, - { - "auxiliary_loss_clip": 0.01083638, - "auxiliary_loss_mlp": 0.01041448, - "balance_loss_clip": 1.04546833, - "balance_loss_mlp": 1.02720881, - "epoch": 0.40108221854802345, - "flos": 29642822184960.0, - "grad_norm": 1.626307597556219, - "language_loss": 0.88544351, - "learning_rate": 2.7212120969599333e-06, - "loss": 0.90669441, - "num_input_tokens_seen": 143314170, - "step": 6671, - "time_per_iteration": 2.9112329483032227 - }, - { - "auxiliary_loss_clip": 0.01119533, - "auxiliary_loss_mlp": 0.01036462, - "balance_loss_clip": 1.04568124, - "balance_loss_mlp": 1.02137589, - "epoch": 0.4011423418006914, - "flos": 19929560129280.0, - "grad_norm": 3.0264857347014993, - "language_loss": 0.79105932, - "learning_rate": 2.720848825281736e-06, - "loss": 0.81261927, - "num_input_tokens_seen": 143330050, - "step": 6672, - "time_per_iteration": 2.789889335632324 - }, - { - "auxiliary_loss_clip": 0.01096186, - "auxiliary_loss_mlp": 0.01045513, - "balance_loss_clip": 1.04610085, - "balance_loss_mlp": 1.03012288, - "epoch": 0.4012024650533594, - "flos": 20084299920000.0, - "grad_norm": 4.192283777131793, - "language_loss": 0.6293034, - "learning_rate": 2.72048552626888e-06, - "loss": 0.65072036, - "num_input_tokens_seen": 143348650, - "step": 6673, - "time_per_iteration": 2.796834945678711 - }, - { - "auxiliary_loss_clip": 0.011055, - "auxiliary_loss_mlp": 0.00771502, - "balance_loss_clip": 1.04474831, - "balance_loss_mlp": 1.00076985, - "epoch": 0.40126258830602735, - "flos": 21695719864320.0, - "grad_norm": 1.5776272245666931, - "language_loss": 0.79948354, - "learning_rate": 2.7201221999351402e-06, - "loss": 0.81825352, - "num_input_tokens_seen": 143370275, - "step": 6674, - "time_per_iteration": 4.298279523849487 - }, - { - "auxiliary_loss_clip": 0.0108893, - "auxiliary_loss_mlp": 0.01040876, - "balance_loss_clip": 1.04919565, - "balance_loss_mlp": 1.02610552, - "epoch": 0.4013227115586953, - "flos": 12020379592320.0, - "grad_norm": 6.494329221896898, - "language_loss": 0.82218468, - "learning_rate": 2.719758846294294e-06, - "loss": 0.84348273, - "num_input_tokens_seen": 143385390, - "step": 6675, - "time_per_iteration": 2.7607553005218506 - }, - { - "auxiliary_loss_clip": 0.01116053, - "auxiliary_loss_mlp": 0.01038994, - "balance_loss_clip": 1.04261947, - "balance_loss_mlp": 1.02364039, - "epoch": 0.4013828348113633, - "flos": 25447522412160.0, - "grad_norm": 2.205024073964141, - "language_loss": 0.93500578, - "learning_rate": 2.71939546536012e-06, - "loss": 0.95655626, - "num_input_tokens_seen": 143404215, - "step": 6676, - "time_per_iteration": 5.81420373916626 - }, - { - "auxiliary_loss_clip": 0.01126662, - "auxiliary_loss_mlp": 0.01041717, - "balance_loss_clip": 1.04832482, - "balance_loss_mlp": 1.02589226, - "epoch": 0.40144295806403124, - "flos": 18582946225920.0, - "grad_norm": 2.1287377468959727, - "language_loss": 0.79300511, - "learning_rate": 2.719032057146399e-06, - "loss": 0.81468892, - "num_input_tokens_seen": 143422245, - "step": 6677, - "time_per_iteration": 2.6485939025878906 - }, - { - "auxiliary_loss_clip": 0.01107812, - "auxiliary_loss_mlp": 0.01035756, - "balance_loss_clip": 1.04743207, - "balance_loss_mlp": 1.02122426, - "epoch": 0.4015030813166992, - "flos": 22930220442240.0, - "grad_norm": 2.404301700652251, - "language_loss": 0.83507645, - "learning_rate": 2.71866862166691e-06, - "loss": 0.85651207, - "num_input_tokens_seen": 143443130, - "step": 6678, - "time_per_iteration": 2.749229907989502 - }, - { - "auxiliary_loss_clip": 0.01127798, - "auxiliary_loss_mlp": 0.01039278, - "balance_loss_clip": 1.04660463, - "balance_loss_mlp": 1.02481759, - "epoch": 0.4015632045693672, - "flos": 20595057361920.0, - "grad_norm": 2.137342142944676, - "language_loss": 0.63547456, - "learning_rate": 2.718305158935434e-06, - "loss": 0.65714526, - "num_input_tokens_seen": 143461385, - "step": 6679, - "time_per_iteration": 4.272741794586182 - }, - { - "auxiliary_loss_clip": 0.01100371, - "auxiliary_loss_mlp": 0.01032462, - "balance_loss_clip": 1.04277802, - "balance_loss_mlp": 1.01852596, - "epoch": 0.4016233278220352, - "flos": 23438930808960.0, - "grad_norm": 2.2420809209281582, - "language_loss": 0.78955674, - "learning_rate": 2.7179416689657554e-06, - "loss": 0.81088501, - "num_input_tokens_seen": 143481750, - "step": 6680, - "time_per_iteration": 2.6541543006896973 - }, - { - "auxiliary_loss_clip": 0.01099744, - "auxiliary_loss_mlp": 0.00773185, - "balance_loss_clip": 1.04565692, - "balance_loss_mlp": 1.0009259, - "epoch": 0.40168345107470316, - "flos": 21431057477760.0, - "grad_norm": 1.5474671150398438, - "language_loss": 0.75901389, - "learning_rate": 2.7175781517716556e-06, - "loss": 0.77774316, - "num_input_tokens_seen": 143501540, - "step": 6681, - "time_per_iteration": 2.747549295425415 - }, - { - "auxiliary_loss_clip": 0.01092334, - "auxiliary_loss_mlp": 0.01031295, - "balance_loss_clip": 1.04743123, - "balance_loss_mlp": 1.01728785, - "epoch": 0.4017435743273711, - "flos": 22857214049280.0, - "grad_norm": 1.9537198932922564, - "language_loss": 0.64593118, - "learning_rate": 2.7172146073669213e-06, - "loss": 0.66716748, - "num_input_tokens_seen": 143520530, - "step": 6682, - "time_per_iteration": 2.764676094055176 - }, - { - "auxiliary_loss_clip": 0.01084656, - "auxiliary_loss_mlp": 0.01040039, - "balance_loss_clip": 1.04031992, - "balance_loss_mlp": 1.025424, - "epoch": 0.4018036975800391, - "flos": 28622312881920.0, - "grad_norm": 8.033606907615594, - "language_loss": 0.72794902, - "learning_rate": 2.716851035765337e-06, - "loss": 0.74919599, - "num_input_tokens_seen": 143540210, - "step": 6683, - "time_per_iteration": 2.9106507301330566 - }, - { - "auxiliary_loss_clip": 0.01116481, - "auxiliary_loss_mlp": 0.01043307, - "balance_loss_clip": 1.04472065, - "balance_loss_mlp": 1.02844119, - "epoch": 0.40186382083270705, - "flos": 26651212099200.0, - "grad_norm": 1.6079104273266733, - "language_loss": 0.73560667, - "learning_rate": 2.7164874369806896e-06, - "loss": 0.75720453, - "num_input_tokens_seen": 143560940, - "step": 6684, - "time_per_iteration": 2.814746141433716 - }, - { - "auxiliary_loss_clip": 0.01038178, - "auxiliary_loss_mlp": 0.01003165, - "balance_loss_clip": 1.02248073, - "balance_loss_mlp": 1.00177026, - "epoch": 0.401923944085375, - "flos": 59259969123840.0, - "grad_norm": 0.8040960642815781, - "language_loss": 0.6037817, - "learning_rate": 2.716123811026767e-06, - "loss": 0.6241951, - "num_input_tokens_seen": 143624015, - "step": 6685, - "time_per_iteration": 3.3159523010253906 - }, - { - "auxiliary_loss_clip": 0.01121727, - "auxiliary_loss_mlp": 0.0103265, - "balance_loss_clip": 1.04626095, - "balance_loss_mlp": 1.01806533, - "epoch": 0.401984067338043, - "flos": 16982803152000.0, - "grad_norm": 2.1640557725493563, - "language_loss": 0.69947135, - "learning_rate": 2.715760157917357e-06, - "loss": 0.7210151, - "num_input_tokens_seen": 143642750, - "step": 6686, - "time_per_iteration": 2.7339890003204346 - }, - { - "auxiliary_loss_clip": 0.01109024, - "auxiliary_loss_mlp": 0.01036336, - "balance_loss_clip": 1.04641056, - "balance_loss_mlp": 1.02213836, - "epoch": 0.40204419059071095, - "flos": 24972496024320.0, - "grad_norm": 1.482832144271372, - "language_loss": 0.74904519, - "learning_rate": 2.7153964776662504e-06, - "loss": 0.77049881, - "num_input_tokens_seen": 143664515, - "step": 6687, - "time_per_iteration": 2.7403111457824707 - }, - { - "auxiliary_loss_clip": 0.01110823, - "auxiliary_loss_mlp": 0.01036282, - "balance_loss_clip": 1.04890549, - "balance_loss_mlp": 1.02179182, - "epoch": 0.4021043138433789, - "flos": 23477463123840.0, - "grad_norm": 1.9109413621033529, - "language_loss": 0.71165651, - "learning_rate": 2.7150327702872385e-06, - "loss": 0.73312759, - "num_input_tokens_seen": 143683135, - "step": 6688, - "time_per_iteration": 2.7349321842193604 - }, - { - "auxiliary_loss_clip": 0.01105847, - "auxiliary_loss_mlp": 0.01043979, - "balance_loss_clip": 1.0426929, - "balance_loss_mlp": 1.02785039, - "epoch": 0.4021644370960469, - "flos": 25995806588160.0, - "grad_norm": 2.0144045301965248, - "language_loss": 0.64289308, - "learning_rate": 2.7146690357941112e-06, - "loss": 0.66439128, - "num_input_tokens_seen": 143703985, - "step": 6689, - "time_per_iteration": 2.740938186645508 - }, - { - "auxiliary_loss_clip": 0.0112261, - "auxiliary_loss_mlp": 0.01032805, - "balance_loss_clip": 1.04519129, - "balance_loss_mlp": 1.01838636, - "epoch": 0.40222456034871484, - "flos": 13587987922560.0, - "grad_norm": 2.8658666003554147, - "language_loss": 0.7358911, - "learning_rate": 2.7143052742006632e-06, - "loss": 0.75744528, - "num_input_tokens_seen": 143719245, - "step": 6690, - "time_per_iteration": 2.622920513153076 - }, - { - "auxiliary_loss_clip": 0.01099316, - "auxiliary_loss_mlp": 0.01037502, - "balance_loss_clip": 1.04444623, - "balance_loss_mlp": 1.0230422, - "epoch": 0.4022846836013828, - "flos": 24278019494400.0, - "grad_norm": 1.7112869735009542, - "language_loss": 0.74805617, - "learning_rate": 2.7139414855206872e-06, - "loss": 0.76942438, - "num_input_tokens_seen": 143739575, - "step": 6691, - "time_per_iteration": 2.704138994216919 - }, - { - "auxiliary_loss_clip": 0.0111344, - "auxiliary_loss_mlp": 0.01040266, - "balance_loss_clip": 1.0485332, - "balance_loss_mlp": 1.02509689, - "epoch": 0.40234480685405083, - "flos": 20151596050560.0, - "grad_norm": 1.5633314974955987, - "language_loss": 0.7267946, - "learning_rate": 2.7135776697679785e-06, - "loss": 0.74833167, - "num_input_tokens_seen": 143758515, - "step": 6692, - "time_per_iteration": 2.6782071590423584 - }, - { - "auxiliary_loss_clip": 0.01081716, - "auxiliary_loss_mlp": 0.0103731, - "balance_loss_clip": 1.04122448, - "balance_loss_mlp": 1.02274227, - "epoch": 0.4024049301067188, - "flos": 22930220442240.0, - "grad_norm": 2.743543242099247, - "language_loss": 0.84403068, - "learning_rate": 2.7132138269563333e-06, - "loss": 0.8652209, - "num_input_tokens_seen": 143776770, - "step": 6693, - "time_per_iteration": 2.746689558029175 - }, - { - "auxiliary_loss_clip": 0.01092043, - "auxiliary_loss_mlp": 0.0104876, - "balance_loss_clip": 1.04803836, - "balance_loss_mlp": 1.03265464, - "epoch": 0.40246505335938676, - "flos": 36028421487360.0, - "grad_norm": 2.4363636021200716, - "language_loss": 0.70996636, - "learning_rate": 2.7128499570995483e-06, - "loss": 0.73137438, - "num_input_tokens_seen": 143798450, - "step": 6694, - "time_per_iteration": 2.8071961402893066 - }, - { - "auxiliary_loss_clip": 0.01104186, - "auxiliary_loss_mlp": 0.01044295, - "balance_loss_clip": 1.04619551, - "balance_loss_mlp": 1.0292511, - "epoch": 0.4025251766120547, - "flos": 20594303176320.0, - "grad_norm": 2.4336892369471976, - "language_loss": 0.67823637, - "learning_rate": 2.7124860602114212e-06, - "loss": 0.6997211, - "num_input_tokens_seen": 143816995, - "step": 6695, - "time_per_iteration": 2.628509283065796 - }, - { - "auxiliary_loss_clip": 0.01100807, - "auxiliary_loss_mlp": 0.01043105, - "balance_loss_clip": 1.04269171, - "balance_loss_mlp": 1.0272975, - "epoch": 0.4025852998647227, - "flos": 64523932381440.0, - "grad_norm": 2.090135381279502, - "language_loss": 0.79316044, - "learning_rate": 2.7121221363057515e-06, - "loss": 0.81459951, - "num_input_tokens_seen": 143842090, - "step": 6696, - "time_per_iteration": 3.065619707107544 - }, - { - "auxiliary_loss_clip": 0.01107424, - "auxiliary_loss_mlp": 0.0105453, - "balance_loss_clip": 1.04772997, - "balance_loss_mlp": 1.03700638, - "epoch": 0.40264542311739066, - "flos": 20886292834560.0, - "grad_norm": 2.0469796766510164, - "language_loss": 0.71048194, - "learning_rate": 2.7117581853963393e-06, - "loss": 0.73210156, - "num_input_tokens_seen": 143860800, - "step": 6697, - "time_per_iteration": 2.732112169265747 - }, - { - "auxiliary_loss_clip": 0.01119834, - "auxiliary_loss_mlp": 0.01045848, - "balance_loss_clip": 1.04644823, - "balance_loss_mlp": 1.03167999, - "epoch": 0.4027055463700586, - "flos": 26250197685120.0, - "grad_norm": 2.1595912992700725, - "language_loss": 0.6184175, - "learning_rate": 2.711394207496984e-06, - "loss": 0.64007437, - "num_input_tokens_seen": 143878950, - "step": 6698, - "time_per_iteration": 2.6853909492492676 - }, - { - "auxiliary_loss_clip": 0.01122685, - "auxiliary_loss_mlp": 0.01038255, - "balance_loss_clip": 1.04787982, - "balance_loss_mlp": 1.02309155, - "epoch": 0.4027656696227266, - "flos": 20631398947200.0, - "grad_norm": 2.043260848719272, - "language_loss": 0.76455128, - "learning_rate": 2.711030202621491e-06, - "loss": 0.78616071, - "num_input_tokens_seen": 143898385, - "step": 6699, - "time_per_iteration": 2.6033456325531006 - }, - { - "auxiliary_loss_clip": 0.01093615, - "auxiliary_loss_mlp": 0.01030999, - "balance_loss_clip": 1.04446507, - "balance_loss_mlp": 1.01700354, - "epoch": 0.40282579287539455, - "flos": 22346277039360.0, - "grad_norm": 1.6890007857677205, - "language_loss": 0.80442715, - "learning_rate": 2.7106661707836605e-06, - "loss": 0.82567334, - "num_input_tokens_seen": 143918795, - "step": 6700, - "time_per_iteration": 2.777510404586792 - }, - { - "auxiliary_loss_clip": 0.01112643, - "auxiliary_loss_mlp": 0.01045016, - "balance_loss_clip": 1.04943717, - "balance_loss_mlp": 1.02808821, - "epoch": 0.4028859161280625, - "flos": 29274988959360.0, - "grad_norm": 2.176323872107602, - "language_loss": 0.74529326, - "learning_rate": 2.7103021119972977e-06, - "loss": 0.7668699, - "num_input_tokens_seen": 143938245, - "step": 6701, - "time_per_iteration": 2.7424893379211426 - }, - { - "auxiliary_loss_clip": 0.01099003, - "auxiliary_loss_mlp": 0.01037912, - "balance_loss_clip": 1.04379773, - "balance_loss_mlp": 1.02355886, - "epoch": 0.4029460393807305, - "flos": 28622312881920.0, - "grad_norm": 1.8130604516939894, - "language_loss": 0.66064012, - "learning_rate": 2.709938026276208e-06, - "loss": 0.68200922, - "num_input_tokens_seen": 143960995, - "step": 6702, - "time_per_iteration": 2.7448410987854004 - }, - { - "auxiliary_loss_clip": 0.01105222, - "auxiliary_loss_mlp": 0.01045108, - "balance_loss_clip": 1.0470736, - "balance_loss_mlp": 1.02900267, - "epoch": 0.40300616263339845, - "flos": 22601925112320.0, - "grad_norm": 1.86356350955038, - "language_loss": 0.66031915, - "learning_rate": 2.7095739136341964e-06, - "loss": 0.68182242, - "num_input_tokens_seen": 143979910, - "step": 6703, - "time_per_iteration": 2.679979085922241 - }, - { - "auxiliary_loss_clip": 0.01060539, - "auxiliary_loss_mlp": 0.01041946, - "balance_loss_clip": 1.04386449, - "balance_loss_mlp": 1.02445817, - "epoch": 0.4030662858860664, - "flos": 25520313323520.0, - "grad_norm": 2.0398618821746304, - "language_loss": 0.82689512, - "learning_rate": 2.709209774085071e-06, - "loss": 0.84792, - "num_input_tokens_seen": 144000095, - "step": 6704, - "time_per_iteration": 2.9296765327453613 - }, - { - "auxiliary_loss_clip": 0.01112771, - "auxiliary_loss_mlp": 0.01040131, - "balance_loss_clip": 1.04960763, - "balance_loss_mlp": 1.02517009, - "epoch": 0.40312640913873443, - "flos": 23586703361280.0, - "grad_norm": 1.6638111373196858, - "language_loss": 0.73759186, - "learning_rate": 2.7088456076426407e-06, - "loss": 0.75912088, - "num_input_tokens_seen": 144019695, - "step": 6705, - "time_per_iteration": 3.0039970874786377 - }, - { - "auxiliary_loss_clip": 0.0111798, - "auxiliary_loss_mlp": 0.01039471, - "balance_loss_clip": 1.04735386, - "balance_loss_mlp": 1.02541077, - "epoch": 0.4031865323914024, - "flos": 20011042131840.0, - "grad_norm": 1.7718881662691552, - "language_loss": 0.65816283, - "learning_rate": 2.708481414320713e-06, - "loss": 0.67973745, - "num_input_tokens_seen": 144038525, - "step": 6706, - "time_per_iteration": 2.6920299530029297 - }, - { - "auxiliary_loss_clip": 0.01123098, - "auxiliary_loss_mlp": 0.01039977, - "balance_loss_clip": 1.05084229, - "balance_loss_mlp": 1.02508759, - "epoch": 0.40324665564407036, - "flos": 21871430219520.0, - "grad_norm": 1.5916886093016338, - "language_loss": 0.71493578, - "learning_rate": 2.7081171941330992e-06, - "loss": 0.73656654, - "num_input_tokens_seen": 144059485, - "step": 6707, - "time_per_iteration": 2.6424286365509033 - }, - { - "auxiliary_loss_clip": 0.01104664, - "auxiliary_loss_mlp": 0.0103554, - "balance_loss_clip": 1.04652226, - "balance_loss_mlp": 1.0201261, - "epoch": 0.4033067788967383, - "flos": 23878728933120.0, - "grad_norm": 1.6049010195706548, - "language_loss": 0.79860801, - "learning_rate": 2.707752947093611e-06, - "loss": 0.82001007, - "num_input_tokens_seen": 144080265, - "step": 6708, - "time_per_iteration": 2.7476210594177246 - }, - { - "auxiliary_loss_clip": 0.01081311, - "auxiliary_loss_mlp": 0.01041497, - "balance_loss_clip": 1.04192591, - "balance_loss_mlp": 1.0254873, - "epoch": 0.4033669021494063, - "flos": 17419907756160.0, - "grad_norm": 2.2092970812397823, - "language_loss": 0.82527256, - "learning_rate": 2.70738867321606e-06, - "loss": 0.84650064, - "num_input_tokens_seen": 144098040, - "step": 6709, - "time_per_iteration": 2.6981422901153564 - }, - { - "auxiliary_loss_clip": 0.01126319, - "auxiliary_loss_mlp": 0.01037264, - "balance_loss_clip": 1.052701, - "balance_loss_mlp": 1.02168322, - "epoch": 0.40342702540207426, - "flos": 29600554855680.0, - "grad_norm": 3.462855853005799, - "language_loss": 0.71349508, - "learning_rate": 2.70702437251426e-06, - "loss": 0.73513091, - "num_input_tokens_seen": 144118265, - "step": 6710, - "time_per_iteration": 2.745234727859497 - }, - { - "auxiliary_loss_clip": 0.01100277, - "auxiliary_loss_mlp": 0.01040518, - "balance_loss_clip": 1.0461812, - "balance_loss_mlp": 1.02506852, - "epoch": 0.4034871486547422, - "flos": 11284605400320.0, - "grad_norm": 2.0008015592173285, - "language_loss": 0.8497777, - "learning_rate": 2.7066600450020236e-06, - "loss": 0.8711856, - "num_input_tokens_seen": 144133865, - "step": 6711, - "time_per_iteration": 2.6388518810272217 - }, - { - "auxiliary_loss_clip": 0.01124865, - "auxiliary_loss_mlp": 0.01037377, - "balance_loss_clip": 1.04873466, - "balance_loss_mlp": 1.02192783, - "epoch": 0.4035472719074102, - "flos": 15552839738880.0, - "grad_norm": 1.9288958482484087, - "language_loss": 0.76210845, - "learning_rate": 2.706295690693168e-06, - "loss": 0.78373086, - "num_input_tokens_seen": 144150125, - "step": 6712, - "time_per_iteration": 2.617612838745117 - }, - { - "auxiliary_loss_clip": 0.0110296, - "auxiliary_loss_mlp": 0.01042297, - "balance_loss_clip": 1.0465771, - "balance_loss_mlp": 1.02682328, - "epoch": 0.40360739516007815, - "flos": 24674365140480.0, - "grad_norm": 2.8401310029686284, - "language_loss": 0.79334903, - "learning_rate": 2.7059313096015096e-06, - "loss": 0.81480157, - "num_input_tokens_seen": 144169295, - "step": 6713, - "time_per_iteration": 4.2229533195495605 - }, - { - "auxiliary_loss_clip": 0.01096327, - "auxiliary_loss_mlp": 0.01040909, - "balance_loss_clip": 1.04259837, - "balance_loss_mlp": 1.02437484, - "epoch": 0.4036675184127461, - "flos": 17304095329920.0, - "grad_norm": 2.4269881355691867, - "language_loss": 0.88230258, - "learning_rate": 2.705566901740865e-06, - "loss": 0.90367496, - "num_input_tokens_seen": 144185790, - "step": 6714, - "time_per_iteration": 2.6861040592193604 - }, - { - "auxiliary_loss_clip": 0.0112277, - "auxiliary_loss_mlp": 0.01042461, - "balance_loss_clip": 1.04913116, - "balance_loss_mlp": 1.02755439, - "epoch": 0.4037276416654141, - "flos": 19864023765120.0, - "grad_norm": 1.685218394347131, - "language_loss": 0.69355965, - "learning_rate": 2.7052024671250527e-06, - "loss": 0.71521199, - "num_input_tokens_seen": 144205190, - "step": 6715, - "time_per_iteration": 6.05805778503418 - }, - { - "auxiliary_loss_clip": 0.01085368, - "auxiliary_loss_mlp": 0.01039127, - "balance_loss_clip": 1.03982067, - "balance_loss_mlp": 1.02422547, - "epoch": 0.40378776491808205, - "flos": 18296271780480.0, - "grad_norm": 2.4042590138214543, - "language_loss": 0.7738142, - "learning_rate": 2.704838005767892e-06, - "loss": 0.7950592, - "num_input_tokens_seen": 144222705, - "step": 6716, - "time_per_iteration": 2.874701738357544 - }, - { - "auxiliary_loss_clip": 0.01084201, - "auxiliary_loss_mlp": 0.01039901, - "balance_loss_clip": 1.04515779, - "balance_loss_mlp": 1.02554834, - "epoch": 0.40384788817075, - "flos": 15049372757760.0, - "grad_norm": 1.8822370621315767, - "language_loss": 0.7590825, - "learning_rate": 2.7044735176832037e-06, - "loss": 0.78032351, - "num_input_tokens_seen": 144239545, - "step": 6717, - "time_per_iteration": 2.806605339050293 - }, - { - "auxiliary_loss_clip": 0.01034573, - "auxiliary_loss_mlp": 0.01006348, - "balance_loss_clip": 1.03120637, - "balance_loss_mlp": 1.00481057, - "epoch": 0.40390801142341803, - "flos": 61929927895680.0, - "grad_norm": 0.9365934623644069, - "language_loss": 0.60732949, - "learning_rate": 2.7041090028848084e-06, - "loss": 0.62773865, - "num_input_tokens_seen": 144288145, - "step": 6718, - "time_per_iteration": 4.683047771453857 - }, - { - "auxiliary_loss_clip": 0.01137275, - "auxiliary_loss_mlp": 0.01039366, - "balance_loss_clip": 1.04942691, - "balance_loss_mlp": 1.02322555, - "epoch": 0.403968134676086, - "flos": 22738779930240.0, - "grad_norm": 2.360676977629441, - "language_loss": 0.74748445, - "learning_rate": 2.7037444613865306e-06, - "loss": 0.76925087, - "num_input_tokens_seen": 144302315, - "step": 6719, - "time_per_iteration": 2.6020865440368652 - }, - { - "auxiliary_loss_clip": 0.01122679, - "auxiliary_loss_mlp": 0.01042794, - "balance_loss_clip": 1.04766619, - "balance_loss_mlp": 1.02643895, - "epoch": 0.40402825792875396, - "flos": 19784409269760.0, - "grad_norm": 2.123342604077105, - "language_loss": 0.81516802, - "learning_rate": 2.7033798932021906e-06, - "loss": 0.83682275, - "num_input_tokens_seen": 144318990, - "step": 6720, - "time_per_iteration": 2.6707048416137695 - }, - { - "auxiliary_loss_clip": 0.01106407, - "auxiliary_loss_mlp": 0.01033364, - "balance_loss_clip": 1.04365981, - "balance_loss_mlp": 1.01866555, - "epoch": 0.40408838118142193, - "flos": 19609273532160.0, - "grad_norm": 2.786601864332057, - "language_loss": 0.77150661, - "learning_rate": 2.7030152983456153e-06, - "loss": 0.79290426, - "num_input_tokens_seen": 144335765, - "step": 6721, - "time_per_iteration": 2.648050546646118 - }, - { - "auxiliary_loss_clip": 0.01091711, - "auxiliary_loss_mlp": 0.0102956, - "balance_loss_clip": 1.04391122, - "balance_loss_mlp": 1.01643503, - "epoch": 0.4041485044340899, - "flos": 24426043441920.0, - "grad_norm": 2.012609049395132, - "language_loss": 0.72214961, - "learning_rate": 2.7026506768306304e-06, - "loss": 0.74336231, - "num_input_tokens_seen": 144355825, - "step": 6722, - "time_per_iteration": 2.7598764896392822 - }, - { - "auxiliary_loss_clip": 0.01117849, - "auxiliary_loss_mlp": 0.01035294, - "balance_loss_clip": 1.04649532, - "balance_loss_mlp": 1.02137017, - "epoch": 0.40420862768675786, - "flos": 16760192613120.0, - "grad_norm": 2.003025152561758, - "language_loss": 0.66099858, - "learning_rate": 2.7022860286710602e-06, - "loss": 0.68252993, - "num_input_tokens_seen": 144374320, - "step": 6723, - "time_per_iteration": 2.6525375843048096 - }, - { - "auxiliary_loss_clip": 0.0111764, - "auxiliary_loss_mlp": 0.01047962, - "balance_loss_clip": 1.04678059, - "balance_loss_mlp": 1.03247619, - "epoch": 0.4042687509394258, - "flos": 22491571553280.0, - "grad_norm": 1.6479262490520643, - "language_loss": 0.73566139, - "learning_rate": 2.701921353880734e-06, - "loss": 0.75731742, - "num_input_tokens_seen": 144394325, - "step": 6724, - "time_per_iteration": 2.6602234840393066 - }, - { - "auxiliary_loss_clip": 0.01096943, - "auxiliary_loss_mlp": 0.01034012, - "balance_loss_clip": 1.04471684, - "balance_loss_mlp": 1.02009475, - "epoch": 0.4043288741920938, - "flos": 30336149479680.0, - "grad_norm": 1.8514955948130458, - "language_loss": 0.74733102, - "learning_rate": 2.7015566524734787e-06, - "loss": 0.76864064, - "num_input_tokens_seen": 144412765, - "step": 6725, - "time_per_iteration": 2.7086737155914307 - }, - { - "auxiliary_loss_clip": 0.01116531, - "auxiliary_loss_mlp": 0.01035939, - "balance_loss_clip": 1.04757476, - "balance_loss_mlp": 1.02062047, - "epoch": 0.40438899744476176, - "flos": 46348321363200.0, - "grad_norm": 2.3229573968410766, - "language_loss": 0.76987183, - "learning_rate": 2.701191924463126e-06, - "loss": 0.7913965, - "num_input_tokens_seen": 144435400, - "step": 6726, - "time_per_iteration": 2.880244493484497 - }, - { - "auxiliary_loss_clip": 0.01102844, - "auxiliary_loss_mlp": 0.00775301, - "balance_loss_clip": 1.04148483, - "balance_loss_mlp": 1.00105536, - "epoch": 0.4044491206974297, - "flos": 13333524998400.0, - "grad_norm": 2.125548317574291, - "language_loss": 0.8180182, - "learning_rate": 2.7008271698635054e-06, - "loss": 0.83679968, - "num_input_tokens_seen": 144452925, - "step": 6727, - "time_per_iteration": 2.6953587532043457 - }, - { - "auxiliary_loss_clip": 0.01128783, - "auxiliary_loss_mlp": 0.01036901, - "balance_loss_clip": 1.04577255, - "balance_loss_mlp": 1.02264905, - "epoch": 0.4045092439500977, - "flos": 12093745121280.0, - "grad_norm": 2.0701087852414504, - "language_loss": 0.85462439, - "learning_rate": 2.700462388688447e-06, - "loss": 0.87628114, - "num_input_tokens_seen": 144470195, - "step": 6728, - "time_per_iteration": 2.5963056087493896 - }, - { - "auxiliary_loss_clip": 0.01095663, - "auxiliary_loss_mlp": 0.01043865, - "balance_loss_clip": 1.04611719, - "balance_loss_mlp": 1.029351, - "epoch": 0.40456936720276565, - "flos": 21179683123200.0, - "grad_norm": 1.739738235535384, - "language_loss": 0.81606215, - "learning_rate": 2.700097580951786e-06, - "loss": 0.83745748, - "num_input_tokens_seen": 144490320, - "step": 6729, - "time_per_iteration": 2.8157620429992676 - }, - { - "auxiliary_loss_clip": 0.01105665, - "auxiliary_loss_mlp": 0.01043945, - "balance_loss_clip": 1.0443244, - "balance_loss_mlp": 1.02993762, - "epoch": 0.4046294904554336, - "flos": 23915286000000.0, - "grad_norm": 1.917482865643355, - "language_loss": 0.73375344, - "learning_rate": 2.6997327466673533e-06, - "loss": 0.75524956, - "num_input_tokens_seen": 144508990, - "step": 6730, - "time_per_iteration": 2.67053484916687 - }, - { - "auxiliary_loss_clip": 0.01113781, - "auxiliary_loss_mlp": 0.01041271, - "balance_loss_clip": 1.04319108, - "balance_loss_mlp": 1.02674532, - "epoch": 0.4046896137081016, - "flos": 38071235773440.0, - "grad_norm": 2.5953767613834673, - "language_loss": 0.67485142, - "learning_rate": 2.699367885848985e-06, - "loss": 0.69640195, - "num_input_tokens_seen": 144529550, - "step": 6731, - "time_per_iteration": 2.8106632232666016 - }, - { - "auxiliary_loss_clip": 0.01128909, - "auxiliary_loss_mlp": 0.01038653, - "balance_loss_clip": 1.04689097, - "balance_loss_mlp": 1.02531338, - "epoch": 0.4047497369607696, - "flos": 23617262856960.0, - "grad_norm": 1.5691591770044138, - "language_loss": 0.74245793, - "learning_rate": 2.699002998510517e-06, - "loss": 0.76413357, - "num_input_tokens_seen": 144549310, - "step": 6732, - "time_per_iteration": 2.6608641147613525 - }, - { - "auxiliary_loss_clip": 0.0110044, - "auxiliary_loss_mlp": 0.00770096, - "balance_loss_clip": 1.04635525, - "balance_loss_mlp": 1.00099349, - "epoch": 0.40480986021343757, - "flos": 12823593569280.0, - "grad_norm": 1.738611378800115, - "language_loss": 0.77579916, - "learning_rate": 2.6986380846657852e-06, - "loss": 0.79450446, - "num_input_tokens_seen": 144567430, - "step": 6733, - "time_per_iteration": 2.648707151412964 - }, - { - "auxiliary_loss_clip": 0.01102753, - "auxiliary_loss_mlp": 0.01043236, - "balance_loss_clip": 1.04195142, - "balance_loss_mlp": 1.0276798, - "epoch": 0.40486998346610553, - "flos": 23768770423680.0, - "grad_norm": 1.875618790304424, - "language_loss": 0.76887047, - "learning_rate": 2.698273144328627e-06, - "loss": 0.79033035, - "num_input_tokens_seen": 144585975, - "step": 6734, - "time_per_iteration": 2.7222812175750732 - }, - { - "auxiliary_loss_clip": 0.01110956, - "auxiliary_loss_mlp": 0.01032993, - "balance_loss_clip": 1.04893517, - "balance_loss_mlp": 1.01923609, - "epoch": 0.4049301067187735, - "flos": 22856818999680.0, - "grad_norm": 2.463703641644531, - "language_loss": 0.64536786, - "learning_rate": 2.6979081775128805e-06, - "loss": 0.66680741, - "num_input_tokens_seen": 144605225, - "step": 6735, - "time_per_iteration": 2.682111978530884 - }, - { - "auxiliary_loss_clip": 0.01088904, - "auxiliary_loss_mlp": 0.01039113, - "balance_loss_clip": 1.04142201, - "balance_loss_mlp": 1.0247122, - "epoch": 0.40499022997144146, - "flos": 22783992174720.0, - "grad_norm": 1.9621030422141739, - "language_loss": 0.83120507, - "learning_rate": 2.697543184232387e-06, - "loss": 0.85248524, - "num_input_tokens_seen": 144624145, - "step": 6736, - "time_per_iteration": 2.737946033477783 - }, - { - "auxiliary_loss_clip": 0.01103471, - "auxiliary_loss_mlp": 0.00773133, - "balance_loss_clip": 1.04903114, - "balance_loss_mlp": 1.00089931, - "epoch": 0.4050503532241094, - "flos": 23039352938880.0, - "grad_norm": 1.950757015883091, - "language_loss": 0.75173002, - "learning_rate": 2.6971781645009863e-06, - "loss": 0.77049613, - "num_input_tokens_seen": 144644470, - "step": 6737, - "time_per_iteration": 2.7009494304656982 - }, - { - "auxiliary_loss_clip": 0.01119674, - "auxiliary_loss_mlp": 0.01042697, - "balance_loss_clip": 1.04876637, - "balance_loss_mlp": 1.02858806, - "epoch": 0.4051104764767774, - "flos": 16647756065280.0, - "grad_norm": 3.18955375846042, - "language_loss": 0.72142565, - "learning_rate": 2.696813118332519e-06, - "loss": 0.74304938, - "num_input_tokens_seen": 144661055, - "step": 6738, - "time_per_iteration": 2.63269305229187 - }, - { - "auxiliary_loss_clip": 0.01094776, - "auxiliary_loss_mlp": 0.01033432, - "balance_loss_clip": 1.04453516, - "balance_loss_mlp": 1.02065849, - "epoch": 0.40517059972944536, - "flos": 16358962717440.0, - "grad_norm": 1.9585661201522753, - "language_loss": 0.75113159, - "learning_rate": 2.696448045740828e-06, - "loss": 0.77241367, - "num_input_tokens_seen": 144677935, - "step": 6739, - "time_per_iteration": 2.678330421447754 - }, - { - "auxiliary_loss_clip": 0.01092708, - "auxiliary_loss_mlp": 0.01036695, - "balance_loss_clip": 1.04475963, - "balance_loss_mlp": 1.02244925, - "epoch": 0.4052307229821133, - "flos": 28803374363520.0, - "grad_norm": 2.0151914408481066, - "language_loss": 0.73516095, - "learning_rate": 2.6960829467397576e-06, - "loss": 0.75645494, - "num_input_tokens_seen": 144697725, - "step": 6740, - "time_per_iteration": 2.821165084838867 - }, - { - "auxiliary_loss_clip": 0.01111182, - "auxiliary_loss_mlp": 0.01032908, - "balance_loss_clip": 1.04380143, - "balance_loss_mlp": 1.01927674, - "epoch": 0.4052908462347813, - "flos": 21397876289280.0, - "grad_norm": 1.5447802431592257, - "language_loss": 0.77149022, - "learning_rate": 2.695717821343153e-06, - "loss": 0.79293114, - "num_input_tokens_seen": 144718805, - "step": 6741, - "time_per_iteration": 2.639744758605957 - }, - { - "auxiliary_loss_clip": 0.01132474, - "auxiliary_loss_mlp": 0.01039418, - "balance_loss_clip": 1.04797888, - "balance_loss_mlp": 1.02415919, - "epoch": 0.40535096948744925, - "flos": 22419067950720.0, - "grad_norm": 2.3470472177782584, - "language_loss": 0.71132898, - "learning_rate": 2.6953526695648577e-06, - "loss": 0.73304784, - "num_input_tokens_seen": 144737105, - "step": 6742, - "time_per_iteration": 2.566246509552002 - }, - { - "auxiliary_loss_clip": 0.01132445, - "auxiliary_loss_mlp": 0.01031417, - "balance_loss_clip": 1.04941666, - "balance_loss_mlp": 1.01739824, - "epoch": 0.4054110927401172, - "flos": 17010776868480.0, - "grad_norm": 2.3285032794047966, - "language_loss": 0.71915448, - "learning_rate": 2.6949874914187202e-06, - "loss": 0.74079311, - "num_input_tokens_seen": 144751350, - "step": 6743, - "time_per_iteration": 2.7150700092315674 - }, - { - "auxiliary_loss_clip": 0.01109405, - "auxiliary_loss_mlp": 0.01036045, - "balance_loss_clip": 1.04626715, - "balance_loss_mlp": 1.0209291, - "epoch": 0.4054712159927852, - "flos": 21614848392960.0, - "grad_norm": 2.2533363543989053, - "language_loss": 0.70529258, - "learning_rate": 2.694622286918588e-06, - "loss": 0.72674704, - "num_input_tokens_seen": 144770030, - "step": 6744, - "time_per_iteration": 2.715900421142578 - }, - { - "auxiliary_loss_clip": 0.01118115, - "auxiliary_loss_mlp": 0.01036188, - "balance_loss_clip": 1.04826701, - "balance_loss_mlp": 1.02316439, - "epoch": 0.4055313392454532, - "flos": 25812554376960.0, - "grad_norm": 1.8071994834567642, - "language_loss": 0.80102956, - "learning_rate": 2.6942570560783076e-06, - "loss": 0.82257259, - "num_input_tokens_seen": 144790965, - "step": 6745, - "time_per_iteration": 2.6989259719848633 - }, - { - "auxiliary_loss_clip": 0.01108583, - "auxiliary_loss_mlp": 0.01034972, - "balance_loss_clip": 1.04861784, - "balance_loss_mlp": 1.02049959, - "epoch": 0.40559146249812117, - "flos": 14137098111360.0, - "grad_norm": 1.8906308851954157, - "language_loss": 0.66942173, - "learning_rate": 2.693891798911731e-06, - "loss": 0.69085735, - "num_input_tokens_seen": 144807755, - "step": 6746, - "time_per_iteration": 2.7211005687713623 - }, - { - "auxiliary_loss_clip": 0.01092509, - "auxiliary_loss_mlp": 0.01033925, - "balance_loss_clip": 1.04508781, - "balance_loss_mlp": 1.02044201, - "epoch": 0.40565158575078913, - "flos": 41355481962240.0, - "grad_norm": 1.4960206584486848, - "language_loss": 0.57240731, - "learning_rate": 2.6935265154327075e-06, - "loss": 0.59367168, - "num_input_tokens_seen": 144832405, - "step": 6747, - "time_per_iteration": 2.8735926151275635 - }, - { - "auxiliary_loss_clip": 0.0109681, - "auxiliary_loss_mlp": 0.01043537, - "balance_loss_clip": 1.04770565, - "balance_loss_mlp": 1.03084731, - "epoch": 0.4057117090034571, - "flos": 28544529980160.0, - "grad_norm": 1.7545120295248704, - "language_loss": 0.8468259, - "learning_rate": 2.693161205655089e-06, - "loss": 0.86822933, - "num_input_tokens_seen": 144853890, - "step": 6748, - "time_per_iteration": 2.7470786571502686 - }, - { - "auxiliary_loss_clip": 0.01107762, - "auxiliary_loss_mlp": 0.0104113, - "balance_loss_clip": 1.05110598, - "balance_loss_mlp": 1.02695, - "epoch": 0.40577183225612506, - "flos": 18004066640640.0, - "grad_norm": 2.5881063547984398, - "language_loss": 0.81445849, - "learning_rate": 2.6927958695927287e-06, - "loss": 0.83594739, - "num_input_tokens_seen": 144871395, - "step": 6749, - "time_per_iteration": 2.677762746810913 - }, - { - "auxiliary_loss_clip": 0.01119763, - "auxiliary_loss_mlp": 0.00771508, - "balance_loss_clip": 1.04914761, - "balance_loss_mlp": 1.00084698, - "epoch": 0.40583195550879303, - "flos": 19536734016000.0, - "grad_norm": 1.7422987888005266, - "language_loss": 0.75235945, - "learning_rate": 2.6924305072594784e-06, - "loss": 0.77127212, - "num_input_tokens_seen": 144890975, - "step": 6750, - "time_per_iteration": 2.6956052780151367 - }, - { - "auxiliary_loss_clip": 0.0111553, - "auxiliary_loss_mlp": 0.01041156, - "balance_loss_clip": 1.04812646, - "balance_loss_mlp": 1.02654123, - "epoch": 0.405892078761461, - "flos": 22309468577280.0, - "grad_norm": 2.479262216129207, - "language_loss": 0.73942888, - "learning_rate": 2.692065118669195e-06, - "loss": 0.76099575, - "num_input_tokens_seen": 144908170, - "step": 6751, - "time_per_iteration": 2.6845548152923584 - }, - { - "auxiliary_loss_clip": 0.01086462, - "auxiliary_loss_mlp": 0.01042521, - "balance_loss_clip": 1.04832053, - "balance_loss_mlp": 1.02627254, - "epoch": 0.40595220201412896, - "flos": 25484402701440.0, - "grad_norm": 1.7042707146701068, - "language_loss": 0.66690767, - "learning_rate": 2.6916997038357326e-06, - "loss": 0.68819749, - "num_input_tokens_seen": 144928020, - "step": 6752, - "time_per_iteration": 4.372137784957886 - }, - { - "auxiliary_loss_clip": 0.01086822, - "auxiliary_loss_mlp": 0.0104486, - "balance_loss_clip": 1.04698646, - "balance_loss_mlp": 1.02896988, - "epoch": 0.4060123252667969, - "flos": 49856004103680.0, - "grad_norm": 2.0675680438490374, - "language_loss": 0.7062583, - "learning_rate": 2.691334262772948e-06, - "loss": 0.72757506, - "num_input_tokens_seen": 144951240, - "step": 6753, - "time_per_iteration": 2.954685688018799 - }, - { - "auxiliary_loss_clip": 0.0110904, - "auxiliary_loss_mlp": 0.01037036, - "balance_loss_clip": 1.04630709, - "balance_loss_mlp": 1.02162218, - "epoch": 0.4060724485194649, - "flos": 21135476459520.0, - "grad_norm": 1.6674578897026393, - "language_loss": 0.72053552, - "learning_rate": 2.690968795494699e-06, - "loss": 0.74199629, - "num_input_tokens_seen": 144969100, - "step": 6754, - "time_per_iteration": 5.758596420288086 - }, - { - "auxiliary_loss_clip": 0.01097183, - "auxiliary_loss_mlp": 0.01040377, - "balance_loss_clip": 1.04531932, - "balance_loss_mlp": 1.02634573, - "epoch": 0.40613257177213286, - "flos": 21758059918080.0, - "grad_norm": 1.655202233640458, - "language_loss": 0.8301084, - "learning_rate": 2.690603302014844e-06, - "loss": 0.851484, - "num_input_tokens_seen": 144987065, - "step": 6755, - "time_per_iteration": 2.7983388900756836 - }, - { - "auxiliary_loss_clip": 0.01086578, - "auxiliary_loss_mlp": 0.01041496, - "balance_loss_clip": 1.04638743, - "balance_loss_mlp": 1.02645206, - "epoch": 0.4061926950248008, - "flos": 25555074710400.0, - "grad_norm": 1.5597680276021608, - "language_loss": 0.71212381, - "learning_rate": 2.6902377823472426e-06, - "loss": 0.73340452, - "num_input_tokens_seen": 145007310, - "step": 6756, - "time_per_iteration": 2.8140816688537598 - }, - { - "auxiliary_loss_clip": 0.01071802, - "auxiliary_loss_mlp": 0.00773633, - "balance_loss_clip": 1.04193711, - "balance_loss_mlp": 1.00074661, - "epoch": 0.4062528182774688, - "flos": 23695799944320.0, - "grad_norm": 2.0528550033278075, - "language_loss": 0.79103237, - "learning_rate": 2.689872236505755e-06, - "loss": 0.80948675, - "num_input_tokens_seen": 145026210, - "step": 6757, - "time_per_iteration": 4.472316741943359 - }, - { - "auxiliary_loss_clip": 0.01112634, - "auxiliary_loss_mlp": 0.0103161, - "balance_loss_clip": 1.05197811, - "balance_loss_mlp": 1.01777542, - "epoch": 0.4063129415301368, - "flos": 21726027964800.0, - "grad_norm": 1.8573345394429819, - "language_loss": 0.78500074, - "learning_rate": 2.6895066645042437e-06, - "loss": 0.80644321, - "num_input_tokens_seen": 145045475, - "step": 6758, - "time_per_iteration": 2.732006072998047 - }, - { - "auxiliary_loss_clip": 0.01096195, - "auxiliary_loss_mlp": 0.0103788, - "balance_loss_clip": 1.05092061, - "balance_loss_mlp": 1.02355731, - "epoch": 0.40637306478280477, - "flos": 12787575206400.0, - "grad_norm": 2.1068114153090254, - "language_loss": 0.89142424, - "learning_rate": 2.6891410663565703e-06, - "loss": 0.91276503, - "num_input_tokens_seen": 145062260, - "step": 6759, - "time_per_iteration": 2.768120288848877 - }, - { - "auxiliary_loss_clip": 0.0109872, - "auxiliary_loss_mlp": 0.01036325, - "balance_loss_clip": 1.04916096, - "balance_loss_mlp": 1.02241302, - "epoch": 0.40643318803547274, - "flos": 24024490323840.0, - "grad_norm": 1.8143975866028277, - "language_loss": 0.64272439, - "learning_rate": 2.688775442076598e-06, - "loss": 0.66407484, - "num_input_tokens_seen": 145082470, - "step": 6760, - "time_per_iteration": 2.724278211593628 - }, - { - "auxiliary_loss_clip": 0.01120642, - "auxiliary_loss_mlp": 0.01035863, - "balance_loss_clip": 1.04679084, - "balance_loss_mlp": 1.02100921, - "epoch": 0.4064933112881407, - "flos": 25592421876480.0, - "grad_norm": 1.9958038926303674, - "language_loss": 0.75134486, - "learning_rate": 2.688409791678193e-06, - "loss": 0.77290988, - "num_input_tokens_seen": 145105685, - "step": 6761, - "time_per_iteration": 2.81839919090271 - }, - { - "auxiliary_loss_clip": 0.01097139, - "auxiliary_loss_mlp": 0.0103932, - "balance_loss_clip": 1.04636633, - "balance_loss_mlp": 1.02598023, - "epoch": 0.40655343454080867, - "flos": 22054323294720.0, - "grad_norm": 1.6270794268543942, - "language_loss": 0.70070893, - "learning_rate": 2.6880441151752185e-06, - "loss": 0.72207355, - "num_input_tokens_seen": 145125590, - "step": 6762, - "time_per_iteration": 2.6583070755004883 - }, - { - "auxiliary_loss_clip": 0.0111912, - "auxiliary_loss_mlp": 0.01032723, - "balance_loss_clip": 1.0519619, - "balance_loss_mlp": 1.01906157, - "epoch": 0.40661355779347663, - "flos": 26468893641600.0, - "grad_norm": 1.6183981098694702, - "language_loss": 0.73523986, - "learning_rate": 2.6876784125815433e-06, - "loss": 0.75675833, - "num_input_tokens_seen": 145146810, - "step": 6763, - "time_per_iteration": 2.674830198287964 - }, - { - "auxiliary_loss_clip": 0.01090413, - "auxiliary_loss_mlp": 0.01035278, - "balance_loss_clip": 1.04031014, - "balance_loss_mlp": 1.0199244, - "epoch": 0.4066736810461446, - "flos": 13261129136640.0, - "grad_norm": 2.065371393903723, - "language_loss": 0.68689919, - "learning_rate": 2.687312683911033e-06, - "loss": 0.70815611, - "num_input_tokens_seen": 145163130, - "step": 6764, - "time_per_iteration": 2.7424631118774414 - }, - { - "auxiliary_loss_clip": 0.01104645, - "auxiliary_loss_mlp": 0.01045832, - "balance_loss_clip": 1.0461781, - "balance_loss_mlp": 1.02930999, - "epoch": 0.40673380429881256, - "flos": 28803625758720.0, - "grad_norm": 2.4553121190783, - "language_loss": 0.91144872, - "learning_rate": 2.686946929177557e-06, - "loss": 0.93295348, - "num_input_tokens_seen": 145181420, - "step": 6765, - "time_per_iteration": 2.705754280090332 - }, - { - "auxiliary_loss_clip": 0.01121713, - "auxiliary_loss_mlp": 0.01044564, - "balance_loss_clip": 1.04742265, - "balance_loss_mlp": 1.02876294, - "epoch": 0.4067939275514805, - "flos": 12495334152960.0, - "grad_norm": 3.5832481358362673, - "language_loss": 0.78673786, - "learning_rate": 2.6865811483949855e-06, - "loss": 0.80840063, - "num_input_tokens_seen": 145198545, - "step": 6766, - "time_per_iteration": 2.6291732788085938 - }, - { - "auxiliary_loss_clip": 0.01137462, - "auxiliary_loss_mlp": 0.01042218, - "balance_loss_clip": 1.0502665, - "balance_loss_mlp": 1.02767396, - "epoch": 0.4068540508041485, - "flos": 18770508069120.0, - "grad_norm": 2.203846422574217, - "language_loss": 0.763403, - "learning_rate": 2.6862153415771867e-06, - "loss": 0.78519982, - "num_input_tokens_seen": 145215835, - "step": 6767, - "time_per_iteration": 2.583494186401367 - }, - { - "auxiliary_loss_clip": 0.01124058, - "auxiliary_loss_mlp": 0.01038086, - "balance_loss_clip": 1.0510633, - "balance_loss_mlp": 1.02363229, - "epoch": 0.40691417405681646, - "flos": 28512821249280.0, - "grad_norm": 2.5264206630573827, - "language_loss": 0.77474844, - "learning_rate": 2.685849508738034e-06, - "loss": 0.79636991, - "num_input_tokens_seen": 145236555, - "step": 6768, - "time_per_iteration": 2.6851589679718018 - }, - { - "auxiliary_loss_clip": 0.01134023, - "auxiliary_loss_mlp": 0.01032915, - "balance_loss_clip": 1.05076826, - "balance_loss_mlp": 1.01887226, - "epoch": 0.4069742973094844, - "flos": 20814040627200.0, - "grad_norm": 1.8984102670150322, - "language_loss": 0.87523651, - "learning_rate": 2.6854836498913995e-06, - "loss": 0.8969059, - "num_input_tokens_seen": 145254595, - "step": 6769, - "time_per_iteration": 2.7267651557922363 - }, - { - "auxiliary_loss_clip": 0.01105045, - "auxiliary_loss_mlp": 0.0104448, - "balance_loss_clip": 1.04947972, - "balance_loss_mlp": 1.03028178, - "epoch": 0.4070344205621524, - "flos": 21470272151040.0, - "grad_norm": 3.1498546640216234, - "language_loss": 0.80951393, - "learning_rate": 2.685117765051156e-06, - "loss": 0.83100921, - "num_input_tokens_seen": 145274005, - "step": 6770, - "time_per_iteration": 2.7272839546203613 - }, - { - "auxiliary_loss_clip": 0.01136551, - "auxiliary_loss_mlp": 0.0103334, - "balance_loss_clip": 1.05021751, - "balance_loss_mlp": 1.01781273, - "epoch": 0.4070945438148204, - "flos": 26830046937600.0, - "grad_norm": 1.9062828764414554, - "language_loss": 0.80237663, - "learning_rate": 2.6847518542311783e-06, - "loss": 0.82407558, - "num_input_tokens_seen": 145294850, - "step": 6771, - "time_per_iteration": 2.5958163738250732 - }, - { - "auxiliary_loss_clip": 0.01097968, - "auxiliary_loss_mlp": 0.01044728, - "balance_loss_clip": 1.04523098, - "balance_loss_mlp": 1.02995801, - "epoch": 0.4071546670674884, - "flos": 26354158623360.0, - "grad_norm": 1.4305431390081056, - "language_loss": 0.76077241, - "learning_rate": 2.6843859174453417e-06, - "loss": 0.78219938, - "num_input_tokens_seen": 145317050, - "step": 6772, - "time_per_iteration": 2.79603910446167 - }, - { - "auxiliary_loss_clip": 0.01110195, - "auxiliary_loss_mlp": 0.01043051, - "balance_loss_clip": 1.04724109, - "balance_loss_mlp": 1.0283401, - "epoch": 0.40721479032015634, - "flos": 17895401020800.0, - "grad_norm": 1.8845179488175254, - "language_loss": 0.81205189, - "learning_rate": 2.6840199547075218e-06, - "loss": 0.83358431, - "num_input_tokens_seen": 145334480, - "step": 6773, - "time_per_iteration": 2.699221611022949 - }, - { - "auxiliary_loss_clip": 0.01044722, - "auxiliary_loss_mlp": 0.01025696, - "balance_loss_clip": 1.03283918, - "balance_loss_mlp": 1.02369332, - "epoch": 0.4072749135728243, - "flos": 49854570537600.0, - "grad_norm": 0.9856620885651128, - "language_loss": 0.64339805, - "learning_rate": 2.683653966031597e-06, - "loss": 0.6641022, - "num_input_tokens_seen": 145388695, - "step": 6774, - "time_per_iteration": 3.147400140762329 - }, - { - "auxiliary_loss_clip": 0.01089769, - "auxiliary_loss_mlp": 0.01034709, - "balance_loss_clip": 1.04686499, - "balance_loss_mlp": 1.02041602, - "epoch": 0.40733503682549227, - "flos": 27563630400000.0, - "grad_norm": 2.273542425652267, - "language_loss": 0.72560251, - "learning_rate": 2.683287951431446e-06, - "loss": 0.74684727, - "num_input_tokens_seen": 145408240, - "step": 6775, - "time_per_iteration": 2.787423849105835 - }, - { - "auxiliary_loss_clip": 0.01105468, - "auxiliary_loss_mlp": 0.00773431, - "balance_loss_clip": 1.04828203, - "balance_loss_mlp": 1.00090027, - "epoch": 0.40739516007816023, - "flos": 22126970551680.0, - "grad_norm": 1.407391884450963, - "language_loss": 0.77802348, - "learning_rate": 2.6829219109209474e-06, - "loss": 0.79681242, - "num_input_tokens_seen": 145428395, - "step": 6776, - "time_per_iteration": 2.682548761367798 - }, - { - "auxiliary_loss_clip": 0.01126451, - "auxiliary_loss_mlp": 0.0104142, - "balance_loss_clip": 1.05063748, - "balance_loss_mlp": 1.02654302, - "epoch": 0.4074552833308282, - "flos": 23842243693440.0, - "grad_norm": 2.817997105966, - "language_loss": 0.79558617, - "learning_rate": 2.682555844513981e-06, - "loss": 0.81726491, - "num_input_tokens_seen": 145448290, - "step": 6777, - "time_per_iteration": 2.7163336277008057 - }, - { - "auxiliary_loss_clip": 0.01058602, - "auxiliary_loss_mlp": 0.01001315, - "balance_loss_clip": 1.02913916, - "balance_loss_mlp": 0.99987298, - "epoch": 0.40751540658349616, - "flos": 58000008781440.0, - "grad_norm": 0.6823534121540719, - "language_loss": 0.5315339, - "learning_rate": 2.6821897522244286e-06, - "loss": 0.55213308, - "num_input_tokens_seen": 145509785, - "step": 6778, - "time_per_iteration": 3.1687095165252686 - }, - { - "auxiliary_loss_clip": 0.01135647, - "auxiliary_loss_mlp": 0.00772948, - "balance_loss_clip": 1.05136371, - "balance_loss_mlp": 1.0008893, - "epoch": 0.40757552983616413, - "flos": 21214659991680.0, - "grad_norm": 2.33935347330558, - "language_loss": 0.82312328, - "learning_rate": 2.6818236340661718e-06, - "loss": 0.84220922, - "num_input_tokens_seen": 145528620, - "step": 6779, - "time_per_iteration": 2.584343194961548 - }, - { - "auxiliary_loss_clip": 0.0112113, - "auxiliary_loss_mlp": 0.01036902, - "balance_loss_clip": 1.04708278, - "balance_loss_mlp": 1.02178645, - "epoch": 0.4076356530888321, - "flos": 26833530556800.0, - "grad_norm": 1.5589663074171618, - "language_loss": 0.76523471, - "learning_rate": 2.6814574900530957e-06, - "loss": 0.78681505, - "num_input_tokens_seen": 145547775, - "step": 6780, - "time_per_iteration": 2.6672446727752686 - }, - { - "auxiliary_loss_clip": 0.01117549, - "auxiliary_loss_mlp": 0.01034773, - "balance_loss_clip": 1.04889798, - "balance_loss_mlp": 1.0212667, - "epoch": 0.40769577634150006, - "flos": 12203021272320.0, - "grad_norm": 2.1749592638145123, - "language_loss": 0.65482175, - "learning_rate": 2.6810913201990827e-06, - "loss": 0.67634493, - "num_input_tokens_seen": 145564465, - "step": 6781, - "time_per_iteration": 2.612326145172119 - }, - { - "auxiliary_loss_clip": 0.01107362, - "auxiliary_loss_mlp": 0.01034378, - "balance_loss_clip": 1.04472542, - "balance_loss_mlp": 1.01922643, - "epoch": 0.407755899594168, - "flos": 33655264796160.0, - "grad_norm": 1.5476514756078803, - "language_loss": 0.71028459, - "learning_rate": 2.6807251245180183e-06, - "loss": 0.73170209, - "num_input_tokens_seen": 145585965, - "step": 6782, - "time_per_iteration": 2.7483837604522705 - }, - { - "auxiliary_loss_clip": 0.01124897, - "auxiliary_loss_mlp": 0.01032941, - "balance_loss_clip": 1.04813361, - "balance_loss_mlp": 1.01833797, - "epoch": 0.407816022846836, - "flos": 20157342226560.0, - "grad_norm": 1.9402515659282311, - "language_loss": 0.82272756, - "learning_rate": 2.6803589030237897e-06, - "loss": 0.84430599, - "num_input_tokens_seen": 145605000, - "step": 6783, - "time_per_iteration": 2.6157009601593018 - }, - { - "auxiliary_loss_clip": 0.01117034, - "auxiliary_loss_mlp": 0.01038578, - "balance_loss_clip": 1.04744446, - "balance_loss_mlp": 1.0235455, - "epoch": 0.40787614609950396, - "flos": 21178821196800.0, - "grad_norm": 1.6713842677384587, - "language_loss": 0.81044209, - "learning_rate": 2.679992655730283e-06, - "loss": 0.83199817, - "num_input_tokens_seen": 145623740, - "step": 6784, - "time_per_iteration": 2.6054811477661133 - }, - { - "auxiliary_loss_clip": 0.01107175, - "auxiliary_loss_mlp": 0.01044009, - "balance_loss_clip": 1.05123401, - "balance_loss_mlp": 1.02725959, - "epoch": 0.407936269352172, - "flos": 20520650338560.0, - "grad_norm": 2.1708514595694655, - "language_loss": 0.65653902, - "learning_rate": 2.679626382651386e-06, - "loss": 0.67805088, - "num_input_tokens_seen": 145643515, - "step": 6785, - "time_per_iteration": 2.816330671310425 - }, - { - "auxiliary_loss_clip": 0.01115764, - "auxiliary_loss_mlp": 0.01038413, - "balance_loss_clip": 1.04758108, - "balance_loss_mlp": 1.02347052, - "epoch": 0.40799639260483994, - "flos": 20118809911680.0, - "grad_norm": 1.8523263348252557, - "language_loss": 0.79567587, - "learning_rate": 2.679260083800989e-06, - "loss": 0.81721765, - "num_input_tokens_seen": 145660890, - "step": 6786, - "time_per_iteration": 2.629009962081909 - }, - { - "auxiliary_loss_clip": 0.01132323, - "auxiliary_loss_mlp": 0.01042176, - "balance_loss_clip": 1.04911721, - "balance_loss_mlp": 1.02866411, - "epoch": 0.4080565158575079, - "flos": 20997328752000.0, - "grad_norm": 1.716981063220771, - "language_loss": 0.81870878, - "learning_rate": 2.678893759192982e-06, - "loss": 0.84045374, - "num_input_tokens_seen": 145680070, - "step": 6787, - "time_per_iteration": 2.6304709911346436 - }, - { - "auxiliary_loss_clip": 0.01117339, - "auxiliary_loss_mlp": 0.0103421, - "balance_loss_clip": 1.04705477, - "balance_loss_mlp": 1.02019691, - "epoch": 0.40811663911017587, - "flos": 19317714837120.0, - "grad_norm": 1.8408166150848957, - "language_loss": 0.67954206, - "learning_rate": 2.678527408841255e-06, - "loss": 0.70105749, - "num_input_tokens_seen": 145698010, - "step": 6788, - "time_per_iteration": 2.6314821243286133 - }, - { - "auxiliary_loss_clip": 0.01102044, - "auxiliary_loss_mlp": 0.01047553, - "balance_loss_clip": 1.04318452, - "balance_loss_mlp": 1.03095889, - "epoch": 0.40817676236284384, - "flos": 40625382119040.0, - "grad_norm": 2.0355882471601014, - "language_loss": 0.66265976, - "learning_rate": 2.678161032759701e-06, - "loss": 0.6841557, - "num_input_tokens_seen": 145722215, - "step": 6789, - "time_per_iteration": 2.8329808712005615 - }, - { - "auxiliary_loss_clip": 0.01084234, - "auxiliary_loss_mlp": 0.01036407, - "balance_loss_clip": 1.04236126, - "balance_loss_mlp": 1.021101, - "epoch": 0.4082368856155118, - "flos": 20522086882560.0, - "grad_norm": 1.7612282198179636, - "language_loss": 0.60220939, - "learning_rate": 2.6777946309622123e-06, - "loss": 0.62341583, - "num_input_tokens_seen": 145741090, - "step": 6790, - "time_per_iteration": 2.705007791519165 - }, - { - "auxiliary_loss_clip": 0.01115221, - "auxiliary_loss_mlp": 0.0104035, - "balance_loss_clip": 1.04814339, - "balance_loss_mlp": 1.02482939, - "epoch": 0.40829700886817977, - "flos": 11427745098240.0, - "grad_norm": 2.946877052856992, - "language_loss": 0.69406867, - "learning_rate": 2.677428203462683e-06, - "loss": 0.71562433, - "num_input_tokens_seen": 145754985, - "step": 6791, - "time_per_iteration": 2.629746675491333 - }, - { - "auxiliary_loss_clip": 0.01047663, - "auxiliary_loss_mlp": 0.01005727, - "balance_loss_clip": 1.02732182, - "balance_loss_mlp": 1.00409365, - "epoch": 0.40835713212084773, - "flos": 67330677121920.0, - "grad_norm": 0.7512190297569652, - "language_loss": 0.59569383, - "learning_rate": 2.6770617502750093e-06, - "loss": 0.61622775, - "num_input_tokens_seen": 145815260, - "step": 6792, - "time_per_iteration": 4.680825710296631 - }, - { - "auxiliary_loss_clip": 0.0113903, - "auxiliary_loss_mlp": 0.01043884, - "balance_loss_clip": 1.05271673, - "balance_loss_mlp": 1.02787423, - "epoch": 0.4084172553735157, - "flos": 21762010414080.0, - "grad_norm": 1.9475859316217028, - "language_loss": 0.80324817, - "learning_rate": 2.6766952714130857e-06, - "loss": 0.8250773, - "num_input_tokens_seen": 145832665, - "step": 6793, - "time_per_iteration": 4.095003128051758 - }, - { - "auxiliary_loss_clip": 0.01124776, - "auxiliary_loss_mlp": 0.01044052, - "balance_loss_clip": 1.04916334, - "balance_loss_mlp": 1.02811408, - "epoch": 0.40847737862618366, - "flos": 27417258478080.0, - "grad_norm": 1.8631596367030567, - "language_loss": 0.84994531, - "learning_rate": 2.6763287668908094e-06, - "loss": 0.87163359, - "num_input_tokens_seen": 145850240, - "step": 6794, - "time_per_iteration": 4.198231935501099 - }, - { - "auxiliary_loss_clip": 0.01100105, - "auxiliary_loss_mlp": 0.01040677, - "balance_loss_clip": 1.0469923, - "balance_loss_mlp": 1.02570391, - "epoch": 0.4085375018788516, - "flos": 18587255857920.0, - "grad_norm": 2.862264995792616, - "language_loss": 0.7989887, - "learning_rate": 2.6759622367220788e-06, - "loss": 0.82039654, - "num_input_tokens_seen": 145869545, - "step": 6795, - "time_per_iteration": 2.7477807998657227 - }, - { - "auxiliary_loss_clip": 0.01121705, - "auxiliary_loss_mlp": 0.01039831, - "balance_loss_clip": 1.04831719, - "balance_loss_mlp": 1.02385116, - "epoch": 0.4085976251315196, - "flos": 15411783029760.0, - "grad_norm": 2.7561951150254633, - "language_loss": 0.70605052, - "learning_rate": 2.675595680920792e-06, - "loss": 0.7276659, - "num_input_tokens_seen": 145884025, - "step": 6796, - "time_per_iteration": 4.261413335800171 - }, - { - "auxiliary_loss_clip": 0.01116135, - "auxiliary_loss_mlp": 0.0077634, - "balance_loss_clip": 1.04606998, - "balance_loss_mlp": 1.00082135, - "epoch": 0.40865774838418756, - "flos": 21252222639360.0, - "grad_norm": 1.6356766399676357, - "language_loss": 0.78218019, - "learning_rate": 2.6752290995008498e-06, - "loss": 0.80110496, - "num_input_tokens_seen": 145903210, - "step": 6797, - "time_per_iteration": 2.6453776359558105 - }, - { - "auxiliary_loss_clip": 0.01121906, - "auxiliary_loss_mlp": 0.0105076, - "balance_loss_clip": 1.04562223, - "balance_loss_mlp": 1.03619301, - "epoch": 0.4087178716368556, - "flos": 13772245714560.0, - "grad_norm": 2.2166943768421534, - "language_loss": 0.86117017, - "learning_rate": 2.6748624924761523e-06, - "loss": 0.8828969, - "num_input_tokens_seen": 145920985, - "step": 6798, - "time_per_iteration": 2.67480731010437 - }, - { - "auxiliary_loss_clip": 0.01130307, - "auxiliary_loss_mlp": 0.01042728, - "balance_loss_clip": 1.04780984, - "balance_loss_mlp": 1.02931094, - "epoch": 0.40877799488952354, - "flos": 23621752056960.0, - "grad_norm": 1.473518761352831, - "language_loss": 0.84252232, - "learning_rate": 2.674495859860601e-06, - "loss": 0.86425269, - "num_input_tokens_seen": 145940350, - "step": 6799, - "time_per_iteration": 2.6273906230926514 - }, - { - "auxiliary_loss_clip": 0.01093085, - "auxiliary_loss_mlp": 0.01052249, - "balance_loss_clip": 1.04557848, - "balance_loss_mlp": 1.03427255, - "epoch": 0.4088381181421915, - "flos": 20918791664640.0, - "grad_norm": 2.1256660898165913, - "language_loss": 0.83567548, - "learning_rate": 2.6741292016681e-06, - "loss": 0.85712886, - "num_input_tokens_seen": 145957460, - "step": 6800, - "time_per_iteration": 2.7064268589019775 - }, - { - "auxiliary_loss_clip": 0.01119062, - "auxiliary_loss_mlp": 0.01043239, - "balance_loss_clip": 1.04534221, - "balance_loss_mlp": 1.02778912, - "epoch": 0.4088982413948595, - "flos": 13297578462720.0, - "grad_norm": 2.1612690472856353, - "language_loss": 0.74336559, - "learning_rate": 2.6737625179125514e-06, - "loss": 0.76498854, - "num_input_tokens_seen": 145975285, - "step": 6801, - "time_per_iteration": 2.631030321121216 - }, - { - "auxiliary_loss_clip": 0.01122834, - "auxiliary_loss_mlp": 0.0104231, - "balance_loss_clip": 1.04511952, - "balance_loss_mlp": 1.02699137, - "epoch": 0.40895836464752744, - "flos": 15267673664640.0, - "grad_norm": 2.1715684147319907, - "language_loss": 0.80430126, - "learning_rate": 2.673395808607861e-06, - "loss": 0.82595277, - "num_input_tokens_seen": 145989150, - "step": 6802, - "time_per_iteration": 2.5802509784698486 - }, - { - "auxiliary_loss_clip": 0.0112096, - "auxiliary_loss_mlp": 0.01044934, - "balance_loss_clip": 1.04893684, - "balance_loss_mlp": 1.02843595, - "epoch": 0.4090184879001954, - "flos": 14501411804160.0, - "grad_norm": 2.2436343912353283, - "language_loss": 0.75734484, - "learning_rate": 2.673029073767934e-06, - "loss": 0.77900374, - "num_input_tokens_seen": 146006980, - "step": 6803, - "time_per_iteration": 2.609602689743042 - }, - { - "auxiliary_loss_clip": 0.0106898, - "auxiliary_loss_mlp": 0.00773774, - "balance_loss_clip": 1.04085743, - "balance_loss_mlp": 1.00086641, - "epoch": 0.40907861115286337, - "flos": 13881593692800.0, - "grad_norm": 1.8843395194203503, - "language_loss": 0.78824151, - "learning_rate": 2.6726623134066764e-06, - "loss": 0.806669, - "num_input_tokens_seen": 146025125, - "step": 6804, - "time_per_iteration": 2.7654101848602295 - }, - { - "auxiliary_loss_clip": 0.01137979, - "auxiliary_loss_mlp": 0.01045985, - "balance_loss_clip": 1.04858065, - "balance_loss_mlp": 1.03147769, - "epoch": 0.40913873440553133, - "flos": 28037615293440.0, - "grad_norm": 2.2298994676504225, - "language_loss": 0.75672269, - "learning_rate": 2.672295527537998e-06, - "loss": 0.77856231, - "num_input_tokens_seen": 146044990, - "step": 6805, - "time_per_iteration": 2.680368185043335 - }, - { - "auxiliary_loss_clip": 0.01089569, - "auxiliary_loss_mlp": 0.01047964, - "balance_loss_clip": 1.04342198, - "balance_loss_mlp": 1.03309822, - "epoch": 0.4091988576581993, - "flos": 21618188357760.0, - "grad_norm": 1.8743994628433338, - "language_loss": 0.79440027, - "learning_rate": 2.671928716175804e-06, - "loss": 0.81577563, - "num_input_tokens_seen": 146066045, - "step": 6806, - "time_per_iteration": 2.8212954998016357 - }, - { - "auxiliary_loss_clip": 0.01126847, - "auxiliary_loss_mlp": 0.01038318, - "balance_loss_clip": 1.04977083, - "balance_loss_mlp": 1.02272499, - "epoch": 0.40925898091086726, - "flos": 25224085860480.0, - "grad_norm": 1.915245819215902, - "language_loss": 0.71779263, - "learning_rate": 2.671561879334007e-06, - "loss": 0.73944426, - "num_input_tokens_seen": 146086280, - "step": 6807, - "time_per_iteration": 2.7223496437072754 - }, - { - "auxiliary_loss_clip": 0.01034248, - "auxiliary_loss_mlp": 0.01005874, - "balance_loss_clip": 1.0356338, - "balance_loss_mlp": 1.00364494, - "epoch": 0.40931910416353523, - "flos": 68930568800640.0, - "grad_norm": 0.8232207365722912, - "language_loss": 0.58807027, - "learning_rate": 2.6711950170265155e-06, - "loss": 0.60847151, - "num_input_tokens_seen": 146148840, - "step": 6808, - "time_per_iteration": 3.2951159477233887 - }, - { - "auxiliary_loss_clip": 0.01113663, - "auxiliary_loss_mlp": 0.01048693, - "balance_loss_clip": 1.04732299, - "balance_loss_mlp": 1.03419733, - "epoch": 0.4093792274162032, - "flos": 20189553747840.0, - "grad_norm": 1.705790136999867, - "language_loss": 0.54954052, - "learning_rate": 2.670828129267242e-06, - "loss": 0.57116413, - "num_input_tokens_seen": 146166195, - "step": 6809, - "time_per_iteration": 2.663210868835449 - }, - { - "auxiliary_loss_clip": 0.01108384, - "auxiliary_loss_mlp": 0.01031551, - "balance_loss_clip": 1.0446471, - "balance_loss_mlp": 1.01682281, - "epoch": 0.40943935066887116, - "flos": 25228754628480.0, - "grad_norm": 1.7788203343455933, - "language_loss": 0.83185786, - "learning_rate": 2.6704612160700983e-06, - "loss": 0.85325718, - "num_input_tokens_seen": 146185045, - "step": 6810, - "time_per_iteration": 2.683969020843506 - }, - { - "auxiliary_loss_clip": 0.01105454, - "auxiliary_loss_mlp": 0.01053382, - "balance_loss_clip": 1.0451473, - "balance_loss_mlp": 1.03608489, - "epoch": 0.4094994739215392, - "flos": 23255319461760.0, - "grad_norm": 2.954085357706404, - "language_loss": 0.77419919, - "learning_rate": 2.670094277448999e-06, - "loss": 0.79578757, - "num_input_tokens_seen": 146204655, - "step": 6811, - "time_per_iteration": 2.6727347373962402 - }, - { - "auxiliary_loss_clip": 0.01135893, - "auxiliary_loss_mlp": 0.01036603, - "balance_loss_clip": 1.04917455, - "balance_loss_mlp": 1.02042687, - "epoch": 0.40955959717420715, - "flos": 17382165540480.0, - "grad_norm": 1.6058461501005727, - "language_loss": 0.70272696, - "learning_rate": 2.669727313417857e-06, - "loss": 0.72445196, - "num_input_tokens_seen": 146222000, - "step": 6812, - "time_per_iteration": 2.6267693042755127 - }, - { - "auxiliary_loss_clip": 0.01132783, - "auxiliary_loss_mlp": 0.01048088, - "balance_loss_clip": 1.04780114, - "balance_loss_mlp": 1.03210163, - "epoch": 0.4096197204268751, - "flos": 25082418620160.0, - "grad_norm": 1.9378136524882912, - "language_loss": 0.66298044, - "learning_rate": 2.6693603239905872e-06, - "loss": 0.68478918, - "num_input_tokens_seen": 146242630, - "step": 6813, - "time_per_iteration": 2.6447062492370605 - }, - { - "auxiliary_loss_clip": 0.01117463, - "auxiliary_loss_mlp": 0.00774455, - "balance_loss_clip": 1.04784274, - "balance_loss_mlp": 1.0009681, - "epoch": 0.4096798436795431, - "flos": 30586769648640.0, - "grad_norm": 1.8922051995482987, - "language_loss": 0.73949504, - "learning_rate": 2.6689933091811087e-06, - "loss": 0.75841421, - "num_input_tokens_seen": 146263070, - "step": 6814, - "time_per_iteration": 2.7325870990753174 - }, - { - "auxiliary_loss_clip": 0.0108334, - "auxiliary_loss_mlp": 0.01038435, - "balance_loss_clip": 1.04231858, - "balance_loss_mlp": 1.02281821, - "epoch": 0.40973996693221104, - "flos": 24133622820480.0, - "grad_norm": 2.0095509453801728, - "language_loss": 0.65957761, - "learning_rate": 2.6686262690033357e-06, - "loss": 0.68079543, - "num_input_tokens_seen": 146282890, - "step": 6815, - "time_per_iteration": 2.780668258666992 - }, - { - "auxiliary_loss_clip": 0.01122383, - "auxiliary_loss_mlp": 0.01045791, - "balance_loss_clip": 1.05130887, - "balance_loss_mlp": 1.03100336, - "epoch": 0.409800090184879, - "flos": 23988974751360.0, - "grad_norm": 1.5903260932613887, - "language_loss": 0.76872814, - "learning_rate": 2.668259203471188e-06, - "loss": 0.79040992, - "num_input_tokens_seen": 146301755, - "step": 6816, - "time_per_iteration": 2.6901748180389404 - }, - { - "auxiliary_loss_clip": 0.01118517, - "auxiliary_loss_mlp": 0.0104269, - "balance_loss_clip": 1.05008173, - "balance_loss_mlp": 1.02716875, - "epoch": 0.40986021343754697, - "flos": 16143678552960.0, - "grad_norm": 2.2788575244766966, - "language_loss": 0.81621635, - "learning_rate": 2.6678921125985843e-06, - "loss": 0.8378284, - "num_input_tokens_seen": 146316835, - "step": 6817, - "time_per_iteration": 2.6194167137145996 - }, - { - "auxiliary_loss_clip": 0.01114033, - "auxiliary_loss_mlp": 0.01046853, - "balance_loss_clip": 1.04633307, - "balance_loss_mlp": 1.02987719, - "epoch": 0.40992033669021494, - "flos": 24790824011520.0, - "grad_norm": 2.698849637369061, - "language_loss": 0.8016938, - "learning_rate": 2.667524996399444e-06, - "loss": 0.82330263, - "num_input_tokens_seen": 146336650, - "step": 6818, - "time_per_iteration": 2.8449223041534424 - }, - { - "auxiliary_loss_clip": 0.0111157, - "auxiliary_loss_mlp": 0.01039221, - "balance_loss_clip": 1.05212271, - "balance_loss_mlp": 1.02459419, - "epoch": 0.4099804599428829, - "flos": 29641888431360.0, - "grad_norm": 1.781955605236185, - "language_loss": 0.66531783, - "learning_rate": 2.66715785488769e-06, - "loss": 0.68682575, - "num_input_tokens_seen": 146357640, - "step": 6819, - "time_per_iteration": 2.8016393184661865 - }, - { - "auxiliary_loss_clip": 0.01118061, - "auxiliary_loss_mlp": 0.01052321, - "balance_loss_clip": 1.05068922, - "balance_loss_mlp": 1.03429687, - "epoch": 0.41004058319555087, - "flos": 24826590979200.0, - "grad_norm": 1.7017427969889725, - "language_loss": 0.85438228, - "learning_rate": 2.6667906880772428e-06, - "loss": 0.87608612, - "num_input_tokens_seen": 146379325, - "step": 6820, - "time_per_iteration": 2.7182726860046387 - }, - { - "auxiliary_loss_clip": 0.01127803, - "auxiliary_loss_mlp": 0.01035666, - "balance_loss_clip": 1.05361152, - "balance_loss_mlp": 1.02019835, - "epoch": 0.41010070644821883, - "flos": 25737464995200.0, - "grad_norm": 1.8388824613750698, - "language_loss": 0.71235943, - "learning_rate": 2.6664234959820256e-06, - "loss": 0.73399413, - "num_input_tokens_seen": 146398635, - "step": 6821, - "time_per_iteration": 2.6716413497924805 - }, - { - "auxiliary_loss_clip": 0.01123531, - "auxiliary_loss_mlp": 0.01036959, - "balance_loss_clip": 1.05253363, - "balance_loss_mlp": 1.02228427, - "epoch": 0.4101608297008868, - "flos": 22346061557760.0, - "grad_norm": 1.9657765704612085, - "language_loss": 0.74500406, - "learning_rate": 2.6660562786159634e-06, - "loss": 0.76660895, - "num_input_tokens_seen": 146417585, - "step": 6822, - "time_per_iteration": 2.652270793914795 - }, - { - "auxiliary_loss_clip": 0.01118135, - "auxiliary_loss_mlp": 0.01038075, - "balance_loss_clip": 1.05201709, - "balance_loss_mlp": 1.02313757, - "epoch": 0.41022095295355476, - "flos": 21945083057280.0, - "grad_norm": 2.1947910409652116, - "language_loss": 0.75539672, - "learning_rate": 2.6656890359929796e-06, - "loss": 0.77695882, - "num_input_tokens_seen": 146437035, - "step": 6823, - "time_per_iteration": 2.767306327819824 - }, - { - "auxiliary_loss_clip": 0.01095631, - "auxiliary_loss_mlp": 0.01044283, - "balance_loss_clip": 1.05394316, - "balance_loss_mlp": 1.02697372, - "epoch": 0.4102810762062228, - "flos": 27450511493760.0, - "grad_norm": 2.0691169068872086, - "language_loss": 0.73186851, - "learning_rate": 2.665321768127001e-06, - "loss": 0.75326765, - "num_input_tokens_seen": 146457370, - "step": 6824, - "time_per_iteration": 2.793712615966797 - }, - { - "auxiliary_loss_clip": 0.01110429, - "auxiliary_loss_mlp": 0.0103962, - "balance_loss_clip": 1.05025351, - "balance_loss_mlp": 1.02316904, - "epoch": 0.41034119945889075, - "flos": 24499265316480.0, - "grad_norm": 2.036284375586757, - "language_loss": 0.72426587, - "learning_rate": 2.6649544750319548e-06, - "loss": 0.7457664, - "num_input_tokens_seen": 146478105, - "step": 6825, - "time_per_iteration": 2.764977216720581 - }, - { - "auxiliary_loss_clip": 0.01097265, - "auxiliary_loss_mlp": 0.01045464, - "balance_loss_clip": 1.04605746, - "balance_loss_mlp": 1.03027654, - "epoch": 0.4104013227115587, - "flos": 24352641999360.0, - "grad_norm": 1.8249289811640228, - "language_loss": 0.85226274, - "learning_rate": 2.664587156721768e-06, - "loss": 0.87369001, - "num_input_tokens_seen": 146497835, - "step": 6826, - "time_per_iteration": 2.7680137157440186 - }, - { - "auxiliary_loss_clip": 0.01115829, - "auxiliary_loss_mlp": 0.00775051, - "balance_loss_clip": 1.05372024, - "balance_loss_mlp": 1.00099707, - "epoch": 0.4104614459642267, - "flos": 23729340268800.0, - "grad_norm": 1.8772466232345664, - "language_loss": 0.66074443, - "learning_rate": 2.6642198132103696e-06, - "loss": 0.67965323, - "num_input_tokens_seen": 146517735, - "step": 6827, - "time_per_iteration": 2.791212797164917 - }, - { - "auxiliary_loss_clip": 0.01113343, - "auxiliary_loss_mlp": 0.01033945, - "balance_loss_clip": 1.04942787, - "balance_loss_mlp": 1.01910365, - "epoch": 0.41052156921689464, - "flos": 22127976132480.0, - "grad_norm": 2.0535618692070914, - "language_loss": 0.72474444, - "learning_rate": 2.663852444511689e-06, - "loss": 0.74621731, - "num_input_tokens_seen": 146537640, - "step": 6828, - "time_per_iteration": 2.6675491333007812 - }, - { - "auxiliary_loss_clip": 0.01111113, - "auxiliary_loss_mlp": 0.01048054, - "balance_loss_clip": 1.04920423, - "balance_loss_mlp": 1.03068542, - "epoch": 0.4105816924695626, - "flos": 20084371747200.0, - "grad_norm": 2.67524304617312, - "language_loss": 0.83464897, - "learning_rate": 2.6634850506396574e-06, - "loss": 0.85624069, - "num_input_tokens_seen": 146554695, - "step": 6829, - "time_per_iteration": 2.762298107147217 - }, - { - "auxiliary_loss_clip": 0.01124628, - "auxiliary_loss_mlp": 0.01039003, - "balance_loss_clip": 1.05062759, - "balance_loss_mlp": 1.02405417, - "epoch": 0.4106418157222306, - "flos": 18076785724800.0, - "grad_norm": 1.5363498208464375, - "language_loss": 0.89878875, - "learning_rate": 2.663117631608206e-06, - "loss": 0.92042506, - "num_input_tokens_seen": 146573740, - "step": 6830, - "time_per_iteration": 2.7726032733917236 - }, - { - "auxiliary_loss_clip": 0.01098336, - "auxiliary_loss_mlp": 0.01034169, - "balance_loss_clip": 1.04938424, - "balance_loss_mlp": 1.01833797, - "epoch": 0.41070193897489854, - "flos": 21647850013440.0, - "grad_norm": 1.7853690904757185, - "language_loss": 0.65810287, - "learning_rate": 2.662750187431268e-06, - "loss": 0.67942798, - "num_input_tokens_seen": 146592885, - "step": 6831, - "time_per_iteration": 4.213804244995117 - }, - { - "auxiliary_loss_clip": 0.01137663, - "auxiliary_loss_mlp": 0.01039058, - "balance_loss_clip": 1.05280805, - "balance_loss_mlp": 1.02361393, - "epoch": 0.4107620622275665, - "flos": 26648195356800.0, - "grad_norm": 1.7075421510763598, - "language_loss": 0.69710165, - "learning_rate": 2.662382718122776e-06, - "loss": 0.71886885, - "num_input_tokens_seen": 146611995, - "step": 6832, - "time_per_iteration": 4.146309852600098 - }, - { - "auxiliary_loss_clip": 0.01089843, - "auxiliary_loss_mlp": 0.01042117, - "balance_loss_clip": 1.05080116, - "balance_loss_mlp": 1.02703142, - "epoch": 0.41082218548023447, - "flos": 18734310138240.0, - "grad_norm": 2.3374205466797537, - "language_loss": 0.73910743, - "learning_rate": 2.662015223696666e-06, - "loss": 0.760427, - "num_input_tokens_seen": 146628045, - "step": 6833, - "time_per_iteration": 4.23652195930481 - }, - { - "auxiliary_loss_clip": 0.01083988, - "auxiliary_loss_mlp": 0.01045346, - "balance_loss_clip": 1.04393578, - "balance_loss_mlp": 1.02754784, - "epoch": 0.41088230873290243, - "flos": 22893771116160.0, - "grad_norm": 1.56012293193972, - "language_loss": 0.7299009, - "learning_rate": 2.6616477041668713e-06, - "loss": 0.75119424, - "num_input_tokens_seen": 146648355, - "step": 6834, - "time_per_iteration": 2.72806453704834 - }, - { - "auxiliary_loss_clip": 0.0113018, - "auxiliary_loss_mlp": 0.01049062, - "balance_loss_clip": 1.05203891, - "balance_loss_mlp": 1.03320765, - "epoch": 0.4109424319855704, - "flos": 24276978000000.0, - "grad_norm": 1.7978087117059114, - "language_loss": 0.71254998, - "learning_rate": 2.661280159547329e-06, - "loss": 0.73434246, - "num_input_tokens_seen": 146668370, - "step": 6835, - "time_per_iteration": 4.406278133392334 - }, - { - "auxiliary_loss_clip": 0.01130021, - "auxiliary_loss_mlp": 0.01043294, - "balance_loss_clip": 1.05188155, - "balance_loss_mlp": 1.02630687, - "epoch": 0.41100255523823837, - "flos": 12969139478400.0, - "grad_norm": 1.9060780079348063, - "language_loss": 0.87366456, - "learning_rate": 2.660912589851978e-06, - "loss": 0.89539772, - "num_input_tokens_seen": 146686665, - "step": 6836, - "time_per_iteration": 2.6482133865356445 - }, - { - "auxiliary_loss_clip": 0.0112613, - "auxiliary_loss_mlp": 0.01040074, - "balance_loss_clip": 1.05334806, - "balance_loss_mlp": 1.02461267, - "epoch": 0.4110626784909064, - "flos": 23145648261120.0, - "grad_norm": 6.565804686602276, - "language_loss": 0.69167227, - "learning_rate": 2.6605449950947547e-06, - "loss": 0.71333432, - "num_input_tokens_seen": 146706570, - "step": 6837, - "time_per_iteration": 2.682241916656494 - }, - { - "auxiliary_loss_clip": 0.0114114, - "auxiliary_loss_mlp": 0.01041377, - "balance_loss_clip": 1.0544312, - "balance_loss_mlp": 1.02540302, - "epoch": 0.41112280174357435, - "flos": 22747399194240.0, - "grad_norm": 1.8671169017141842, - "language_loss": 0.75408459, - "learning_rate": 2.660177375289599e-06, - "loss": 0.77590978, - "num_input_tokens_seen": 146723425, - "step": 6838, - "time_per_iteration": 2.625422239303589 - }, - { - "auxiliary_loss_clip": 0.0110141, - "auxiliary_loss_mlp": 0.01042257, - "balance_loss_clip": 1.0521034, - "balance_loss_mlp": 1.02617598, - "epoch": 0.4111829249962423, - "flos": 21102403011840.0, - "grad_norm": 2.061873935528421, - "language_loss": 0.82113552, - "learning_rate": 2.659809730450451e-06, - "loss": 0.84257221, - "num_input_tokens_seen": 146741640, - "step": 6839, - "time_per_iteration": 2.7850279808044434 - }, - { - "auxiliary_loss_clip": 0.01135439, - "auxiliary_loss_mlp": 0.0103927, - "balance_loss_clip": 1.05122948, - "balance_loss_mlp": 1.02421379, - "epoch": 0.4112430482489103, - "flos": 21505787723520.0, - "grad_norm": 5.701831641175022, - "language_loss": 0.80077577, - "learning_rate": 2.6594420605912523e-06, - "loss": 0.82252288, - "num_input_tokens_seen": 146759195, - "step": 6840, - "time_per_iteration": 2.656494140625 - }, - { - "auxiliary_loss_clip": 0.01120054, - "auxiliary_loss_mlp": 0.01035027, - "balance_loss_clip": 1.0487783, - "balance_loss_mlp": 1.02117467, - "epoch": 0.41130317150157825, - "flos": 19570022945280.0, - "grad_norm": 1.862146821875906, - "language_loss": 0.6778084, - "learning_rate": 2.6590743657259442e-06, - "loss": 0.69935924, - "num_input_tokens_seen": 146774990, - "step": 6841, - "time_per_iteration": 2.6612377166748047 - }, - { - "auxiliary_loss_clip": 0.01055489, - "auxiliary_loss_mlp": 0.01004436, - "balance_loss_clip": 1.03532803, - "balance_loss_mlp": 1.00270772, - "epoch": 0.4113632947542462, - "flos": 62383157706240.0, - "grad_norm": 0.8163554776107808, - "language_loss": 0.59717554, - "learning_rate": 2.65870664586847e-06, - "loss": 0.61777478, - "num_input_tokens_seen": 146839610, - "step": 6842, - "time_per_iteration": 3.2157862186431885 - }, - { - "auxiliary_loss_clip": 0.01120166, - "auxiliary_loss_mlp": 0.01038325, - "balance_loss_clip": 1.05330658, - "balance_loss_mlp": 1.02400184, - "epoch": 0.4114234180069142, - "flos": 13918617636480.0, - "grad_norm": 2.3538351775584156, - "language_loss": 0.70293331, - "learning_rate": 2.6583389010327742e-06, - "loss": 0.72451818, - "num_input_tokens_seen": 146857360, - "step": 6843, - "time_per_iteration": 2.6172597408294678 - }, - { - "auxiliary_loss_clip": 0.01014929, - "auxiliary_loss_mlp": 0.01002572, - "balance_loss_clip": 1.01983762, - "balance_loss_mlp": 1.00047398, - "epoch": 0.41148354125958214, - "flos": 64928505219840.0, - "grad_norm": 0.7263883634768764, - "language_loss": 0.53593683, - "learning_rate": 2.6579711312328013e-06, - "loss": 0.55611187, - "num_input_tokens_seen": 146917055, - "step": 6844, - "time_per_iteration": 3.21069598197937 - }, - { - "auxiliary_loss_clip": 0.01124589, - "auxiliary_loss_mlp": 0.01041114, - "balance_loss_clip": 1.05226612, - "balance_loss_mlp": 1.02679706, - "epoch": 0.4115436645122501, - "flos": 18728779443840.0, - "grad_norm": 1.870188515464334, - "language_loss": 0.66065252, - "learning_rate": 2.6576033364824967e-06, - "loss": 0.68230951, - "num_input_tokens_seen": 146935215, - "step": 6845, - "time_per_iteration": 2.6289329528808594 - }, - { - "auxiliary_loss_clip": 0.01134084, - "auxiliary_loss_mlp": 0.01038479, - "balance_loss_clip": 1.05250192, - "balance_loss_mlp": 1.02355433, - "epoch": 0.41160378776491807, - "flos": 16252918790400.0, - "grad_norm": 2.0932374873894655, - "language_loss": 0.70088863, - "learning_rate": 2.657235516795808e-06, - "loss": 0.72261429, - "num_input_tokens_seen": 146951970, - "step": 6846, - "time_per_iteration": 2.578780174255371 - }, - { - "auxiliary_loss_clip": 0.01111001, - "auxiliary_loss_mlp": 0.01041074, - "balance_loss_clip": 1.04926157, - "balance_loss_mlp": 1.0254035, - "epoch": 0.41166391101758604, - "flos": 27970031854080.0, - "grad_norm": 1.8006459441278344, - "language_loss": 0.65271175, - "learning_rate": 2.6568676721866826e-06, - "loss": 0.67423248, - "num_input_tokens_seen": 146975615, - "step": 6847, - "time_per_iteration": 2.7504281997680664 - }, - { - "auxiliary_loss_clip": 0.01111807, - "auxiliary_loss_mlp": 0.01046607, - "balance_loss_clip": 1.04943776, - "balance_loss_mlp": 1.03167558, - "epoch": 0.411724034270254, - "flos": 34131296764800.0, - "grad_norm": 1.371398558221349, - "language_loss": 0.70655453, - "learning_rate": 2.656499802669069e-06, - "loss": 0.72813869, - "num_input_tokens_seen": 146998855, - "step": 6848, - "time_per_iteration": 2.7842190265655518 - }, - { - "auxiliary_loss_clip": 0.01032604, - "auxiliary_loss_mlp": 0.00753743, - "balance_loss_clip": 1.02356267, - "balance_loss_mlp": 1.00076866, - "epoch": 0.41178415752292197, - "flos": 67923670752000.0, - "grad_norm": 0.9037714041830832, - "language_loss": 0.5627954, - "learning_rate": 2.6561319082569174e-06, - "loss": 0.58065879, - "num_input_tokens_seen": 147062710, - "step": 6849, - "time_per_iteration": 3.3100218772888184 - }, - { - "auxiliary_loss_clip": 0.01115279, - "auxiliary_loss_mlp": 0.0104026, - "balance_loss_clip": 1.05035055, - "balance_loss_mlp": 1.0254786, - "epoch": 0.41184428077558993, - "flos": 34313938444800.0, - "grad_norm": 2.6235370790375767, - "language_loss": 0.76318872, - "learning_rate": 2.6557639889641783e-06, - "loss": 0.78474414, - "num_input_tokens_seen": 147086075, - "step": 6850, - "time_per_iteration": 2.879258632659912 - }, - { - "auxiliary_loss_clip": 0.010812, - "auxiliary_loss_mlp": 0.01037976, - "balance_loss_clip": 1.0412885, - "balance_loss_mlp": 1.02356339, - "epoch": 0.41190440402825795, - "flos": 35444118948480.0, - "grad_norm": 1.5473555335002718, - "language_loss": 0.68093288, - "learning_rate": 2.6553960448048025e-06, - "loss": 0.70212466, - "num_input_tokens_seen": 147107590, - "step": 6851, - "time_per_iteration": 2.931530237197876 - }, - { - "auxiliary_loss_clip": 0.01101431, - "auxiliary_loss_mlp": 0.01049233, - "balance_loss_clip": 1.0504117, - "balance_loss_mlp": 1.03207839, - "epoch": 0.4119645272809259, - "flos": 20849879422080.0, - "grad_norm": 2.1361960755807634, - "language_loss": 0.79698718, - "learning_rate": 2.655028075792743e-06, - "loss": 0.81849384, - "num_input_tokens_seen": 147123715, - "step": 6852, - "time_per_iteration": 2.6807408332824707 - }, - { - "auxiliary_loss_clip": 0.01141214, - "auxiliary_loss_mlp": 0.01043074, - "balance_loss_clip": 1.05327845, - "balance_loss_mlp": 1.02688491, - "epoch": 0.4120246505335939, - "flos": 27562050201600.0, - "grad_norm": 1.901908158264802, - "language_loss": 0.77750659, - "learning_rate": 2.6546600819419537e-06, - "loss": 0.79934943, - "num_input_tokens_seen": 147144290, - "step": 6853, - "time_per_iteration": 2.699430227279663 - }, - { - "auxiliary_loss_clip": 0.01126437, - "auxiliary_loss_mlp": 0.01046106, - "balance_loss_clip": 1.04821801, - "balance_loss_mlp": 1.0298574, - "epoch": 0.41208477378626185, - "flos": 37815444046080.0, - "grad_norm": 1.8090743517086876, - "language_loss": 0.65556479, - "learning_rate": 2.6542920632663883e-06, - "loss": 0.6772902, - "num_input_tokens_seen": 147166340, - "step": 6854, - "time_per_iteration": 2.8111729621887207 - }, - { - "auxiliary_loss_clip": 0.01104516, - "auxiliary_loss_mlp": 0.01052436, - "balance_loss_clip": 1.04534888, - "balance_loss_mlp": 1.03615212, - "epoch": 0.4121448970389298, - "flos": 23440762402560.0, - "grad_norm": 2.1224683572406917, - "language_loss": 0.8348515, - "learning_rate": 2.6539240197800023e-06, - "loss": 0.85642099, - "num_input_tokens_seen": 147184025, - "step": 6855, - "time_per_iteration": 2.6698896884918213 - }, - { - "auxiliary_loss_clip": 0.01117307, - "auxiliary_loss_mlp": 0.01044081, - "balance_loss_clip": 1.04969764, - "balance_loss_mlp": 1.02976418, - "epoch": 0.4122050202915978, - "flos": 21325300859520.0, - "grad_norm": 2.1069107949142554, - "language_loss": 0.7929827, - "learning_rate": 2.6535559514967517e-06, - "loss": 0.81459653, - "num_input_tokens_seen": 147202730, - "step": 6856, - "time_per_iteration": 2.6754775047302246 - }, - { - "auxiliary_loss_clip": 0.01098846, - "auxiliary_loss_mlp": 0.01042601, - "balance_loss_clip": 1.04761338, - "balance_loss_mlp": 1.02777684, - "epoch": 0.41226514354426574, - "flos": 17306286059520.0, - "grad_norm": 2.5035417030553018, - "language_loss": 0.80352724, - "learning_rate": 2.6531878584305935e-06, - "loss": 0.82494175, - "num_input_tokens_seen": 147215315, - "step": 6857, - "time_per_iteration": 2.7415785789489746 - }, - { - "auxiliary_loss_clip": 0.01123756, - "auxiliary_loss_mlp": 0.0077359, - "balance_loss_clip": 1.04799688, - "balance_loss_mlp": 1.00088441, - "epoch": 0.4123252667969337, - "flos": 17638855107840.0, - "grad_norm": 2.1785137319374575, - "language_loss": 0.70367694, - "learning_rate": 2.6528197405954873e-06, - "loss": 0.72265041, - "num_input_tokens_seen": 147233330, - "step": 6858, - "time_per_iteration": 2.6482796669006348 - }, - { - "auxiliary_loss_clip": 0.01123125, - "auxiliary_loss_mlp": 0.01046787, - "balance_loss_clip": 1.04916668, - "balance_loss_mlp": 1.03116488, - "epoch": 0.4123853900496017, - "flos": 46424811375360.0, - "grad_norm": 2.660424997773602, - "language_loss": 0.59025121, - "learning_rate": 2.652451598005391e-06, - "loss": 0.61195034, - "num_input_tokens_seen": 147257780, - "step": 6859, - "time_per_iteration": 2.8688454627990723 - }, - { - "auxiliary_loss_clip": 0.01132817, - "auxiliary_loss_mlp": 0.0104458, - "balance_loss_clip": 1.04658365, - "balance_loss_mlp": 1.0293684, - "epoch": 0.41244551330226964, - "flos": 17675160779520.0, - "grad_norm": 2.4672414929748863, - "language_loss": 0.73583943, - "learning_rate": 2.652083430674264e-06, - "loss": 0.75761342, - "num_input_tokens_seen": 147276055, - "step": 6860, - "time_per_iteration": 2.552107572555542 - }, - { - "auxiliary_loss_clip": 0.01058973, - "auxiliary_loss_mlp": 0.01038942, - "balance_loss_clip": 1.04514742, - "balance_loss_mlp": 1.024279, - "epoch": 0.4125056365549376, - "flos": 18693730748160.0, - "grad_norm": 1.7024014286117355, - "language_loss": 0.7499401, - "learning_rate": 2.651715238616068e-06, - "loss": 0.7709192, - "num_input_tokens_seen": 147293200, - "step": 6861, - "time_per_iteration": 2.8850560188293457 - }, - { - "auxiliary_loss_clip": 0.01110545, - "auxiliary_loss_mlp": 0.01044439, - "balance_loss_clip": 1.04591155, - "balance_loss_mlp": 1.03024721, - "epoch": 0.41256575980760557, - "flos": 17895293280000.0, - "grad_norm": 2.2415523494511467, - "language_loss": 0.79298902, - "learning_rate": 2.651347021844765e-06, - "loss": 0.8145389, - "num_input_tokens_seen": 147310640, - "step": 6862, - "time_per_iteration": 2.900341510772705 - }, - { - "auxiliary_loss_clip": 0.01101386, - "auxiliary_loss_mlp": 0.01041536, - "balance_loss_clip": 1.04071999, - "balance_loss_mlp": 1.02640843, - "epoch": 0.41262588306027354, - "flos": 21981316901760.0, - "grad_norm": 1.8032442507418176, - "language_loss": 0.7571404, - "learning_rate": 2.650978780374318e-06, - "loss": 0.77856958, - "num_input_tokens_seen": 147329435, - "step": 6863, - "time_per_iteration": 2.653726100921631 - }, - { - "auxiliary_loss_clip": 0.01042253, - "auxiliary_loss_mlp": 0.0101594, - "balance_loss_clip": 1.02186918, - "balance_loss_mlp": 1.01400852, - "epoch": 0.41268600631294156, - "flos": 53350006740480.0, - "grad_norm": 0.7071869047358454, - "language_loss": 0.52727556, - "learning_rate": 2.650610514218691e-06, - "loss": 0.54785752, - "num_input_tokens_seen": 147385805, - "step": 6864, - "time_per_iteration": 3.1097042560577393 - }, - { - "auxiliary_loss_clip": 0.01138053, - "auxiliary_loss_mlp": 0.01037208, - "balance_loss_clip": 1.04946339, - "balance_loss_mlp": 1.02124572, - "epoch": 0.4127461295656095, - "flos": 24385356311040.0, - "grad_norm": 2.542549123445174, - "language_loss": 0.72281235, - "learning_rate": 2.6502422233918468e-06, - "loss": 0.74456495, - "num_input_tokens_seen": 147405160, - "step": 6865, - "time_per_iteration": 2.6489152908325195 - }, - { - "auxiliary_loss_clip": 0.01052076, - "auxiliary_loss_mlp": 0.01005202, - "balance_loss_clip": 1.02275848, - "balance_loss_mlp": 1.0035094, - "epoch": 0.4128062528182775, - "flos": 71705242696320.0, - "grad_norm": 0.9209058739863084, - "language_loss": 0.66585267, - "learning_rate": 2.649873907907753e-06, - "loss": 0.68642545, - "num_input_tokens_seen": 147460245, - "step": 6866, - "time_per_iteration": 3.062208890914917 - }, - { - "auxiliary_loss_clip": 0.01129627, - "auxiliary_loss_mlp": 0.01039004, - "balance_loss_clip": 1.04632759, - "balance_loss_mlp": 1.02420402, - "epoch": 0.41286637607094545, - "flos": 17849111368320.0, - "grad_norm": 2.3224691577841905, - "language_loss": 0.8131212, - "learning_rate": 2.649505567780375e-06, - "loss": 0.83480746, - "num_input_tokens_seen": 147476200, - "step": 6867, - "time_per_iteration": 2.6058406829833984 - }, - { - "auxiliary_loss_clip": 0.01114316, - "auxiliary_loss_mlp": 0.01036267, - "balance_loss_clip": 1.04773378, - "balance_loss_mlp": 1.02069843, - "epoch": 0.4129264993236134, - "flos": 25549544016000.0, - "grad_norm": 2.2632029728217913, - "language_loss": 0.78249037, - "learning_rate": 2.6491372030236815e-06, - "loss": 0.80399621, - "num_input_tokens_seen": 147494315, - "step": 6868, - "time_per_iteration": 2.7882273197174072 - }, - { - "auxiliary_loss_clip": 0.0104195, - "auxiliary_loss_mlp": 0.01002347, - "balance_loss_clip": 1.02322721, - "balance_loss_mlp": 1.00078535, - "epoch": 0.4129866225762814, - "flos": 65414446364160.0, - "grad_norm": 0.8559261941349585, - "language_loss": 0.57746547, - "learning_rate": 2.64876881365164e-06, - "loss": 0.59790844, - "num_input_tokens_seen": 147543665, - "step": 6869, - "time_per_iteration": 2.9020984172821045 - }, - { - "auxiliary_loss_clip": 0.01116756, - "auxiliary_loss_mlp": 0.01037209, - "balance_loss_clip": 1.04666448, - "balance_loss_mlp": 1.02235568, - "epoch": 0.41304674582894935, - "flos": 28876991287680.0, - "grad_norm": 2.064989454661501, - "language_loss": 0.74957705, - "learning_rate": 2.64840039967822e-06, - "loss": 0.77111673, - "num_input_tokens_seen": 147564870, - "step": 6870, - "time_per_iteration": 4.271910667419434 - }, - { - "auxiliary_loss_clip": 0.01102765, - "auxiliary_loss_mlp": 0.01045795, - "balance_loss_clip": 1.04849434, - "balance_loss_mlp": 1.0301609, - "epoch": 0.4131068690816173, - "flos": 22891975436160.0, - "grad_norm": 1.7132239618858751, - "language_loss": 0.83188486, - "learning_rate": 2.6480319611173912e-06, - "loss": 0.85337055, - "num_input_tokens_seen": 147584840, - "step": 6871, - "time_per_iteration": 2.7382373809814453 - }, - { - "auxiliary_loss_clip": 0.01102249, - "auxiliary_loss_mlp": 0.01042486, - "balance_loss_clip": 1.04694879, - "balance_loss_mlp": 1.02648854, - "epoch": 0.4131669923342853, - "flos": 26065185707520.0, - "grad_norm": 1.8588331523997874, - "language_loss": 0.68419731, - "learning_rate": 2.6476634979831263e-06, - "loss": 0.70564461, - "num_input_tokens_seen": 147604635, - "step": 6872, - "time_per_iteration": 2.731513738632202 - }, - { - "auxiliary_loss_clip": 0.01116452, - "auxiliary_loss_mlp": 0.0103393, - "balance_loss_clip": 1.0480907, - "balance_loss_mlp": 1.01936865, - "epoch": 0.41322711558695324, - "flos": 19244564789760.0, - "grad_norm": 2.0600406966329468, - "language_loss": 0.75857317, - "learning_rate": 2.6472950102893964e-06, - "loss": 0.78007692, - "num_input_tokens_seen": 147620700, - "step": 6873, - "time_per_iteration": 4.200350999832153 - }, - { - "auxiliary_loss_clip": 0.0110667, - "auxiliary_loss_mlp": 0.01041498, - "balance_loss_clip": 1.04465103, - "balance_loss_mlp": 1.02552366, - "epoch": 0.4132872388396212, - "flos": 22674464628480.0, - "grad_norm": 2.335780539187462, - "language_loss": 0.83409697, - "learning_rate": 2.6469264980501746e-06, - "loss": 0.85557866, - "num_input_tokens_seen": 147639490, - "step": 6874, - "time_per_iteration": 2.677481174468994 - }, - { - "auxiliary_loss_clip": 0.01095645, - "auxiliary_loss_mlp": 0.01037651, - "balance_loss_clip": 1.04236686, - "balance_loss_mlp": 1.02203512, - "epoch": 0.4133473620922892, - "flos": 20150195420160.0, - "grad_norm": 2.13686316676373, - "language_loss": 0.71832943, - "learning_rate": 2.646557961279436e-06, - "loss": 0.73966241, - "num_input_tokens_seen": 147657205, - "step": 6875, - "time_per_iteration": 4.490081548690796 - }, - { - "auxiliary_loss_clip": 0.01099487, - "auxiliary_loss_mlp": 0.0104606, - "balance_loss_clip": 1.0442456, - "balance_loss_mlp": 1.03144503, - "epoch": 0.41340748534495714, - "flos": 24242755317120.0, - "grad_norm": 2.0421788997824164, - "language_loss": 0.82396001, - "learning_rate": 2.646189399991154e-06, - "loss": 0.84541547, - "num_input_tokens_seen": 147677005, - "step": 6876, - "time_per_iteration": 2.7446470260620117 - }, - { - "auxiliary_loss_clip": 0.01120566, - "auxiliary_loss_mlp": 0.01041258, - "balance_loss_clip": 1.04677415, - "balance_loss_mlp": 1.02511716, - "epoch": 0.41346760859762516, - "flos": 14392171566720.0, - "grad_norm": 2.56742905987435, - "language_loss": 0.64847958, - "learning_rate": 2.6458208141993048e-06, - "loss": 0.67009783, - "num_input_tokens_seen": 147693435, - "step": 6877, - "time_per_iteration": 2.5988993644714355 - }, - { - "auxiliary_loss_clip": 0.01117576, - "auxiliary_loss_mlp": 0.01038622, - "balance_loss_clip": 1.04535675, - "balance_loss_mlp": 1.02366138, - "epoch": 0.4135277318502931, - "flos": 22492002516480.0, - "grad_norm": 1.9690610536683542, - "language_loss": 0.76823169, - "learning_rate": 2.6454522039178668e-06, - "loss": 0.78979367, - "num_input_tokens_seen": 147714000, - "step": 6878, - "time_per_iteration": 2.6289098262786865 - }, - { - "auxiliary_loss_clip": 0.01120186, - "auxiliary_loss_mlp": 0.0077293, - "balance_loss_clip": 1.04670906, - "balance_loss_mlp": 1.00107956, - "epoch": 0.4135878551029611, - "flos": 22418744728320.0, - "grad_norm": 1.7550266496384528, - "language_loss": 0.80281323, - "learning_rate": 2.6450835691608154e-06, - "loss": 0.82174444, - "num_input_tokens_seen": 147731010, - "step": 6879, - "time_per_iteration": 2.661945343017578 - }, - { - "auxiliary_loss_clip": 0.01130865, - "auxiliary_loss_mlp": 0.01039257, - "balance_loss_clip": 1.04709899, - "balance_loss_mlp": 1.02471972, - "epoch": 0.41364797835562905, - "flos": 27053232094080.0, - "grad_norm": 2.4786614895541312, - "language_loss": 0.84795272, - "learning_rate": 2.6447149099421315e-06, - "loss": 0.869654, - "num_input_tokens_seen": 147750880, - "step": 6880, - "time_per_iteration": 2.6188430786132812 - }, - { - "auxiliary_loss_clip": 0.01111764, - "auxiliary_loss_mlp": 0.0102976, - "balance_loss_clip": 1.04788852, - "balance_loss_mlp": 1.01497793, - "epoch": 0.413708101608297, - "flos": 22967603521920.0, - "grad_norm": 3.387576232567814, - "language_loss": 0.70222247, - "learning_rate": 2.6443462262757927e-06, - "loss": 0.72363776, - "num_input_tokens_seen": 147771360, - "step": 6881, - "time_per_iteration": 2.733462333679199 - }, - { - "auxiliary_loss_clip": 0.0112877, - "auxiliary_loss_mlp": 0.01037286, - "balance_loss_clip": 1.04717231, - "balance_loss_mlp": 1.02352309, - "epoch": 0.413768224860965, - "flos": 13333991875200.0, - "grad_norm": 2.043279627081185, - "language_loss": 0.81609744, - "learning_rate": 2.6439775181757805e-06, - "loss": 0.837758, - "num_input_tokens_seen": 147787440, - "step": 6882, - "time_per_iteration": 2.6478219032287598 - }, - { - "auxiliary_loss_clip": 0.01107335, - "auxiliary_loss_mlp": 0.0104742, - "balance_loss_clip": 1.04388988, - "balance_loss_mlp": 1.02958596, - "epoch": 0.41382834811363295, - "flos": 20813968800000.0, - "grad_norm": 2.1226762712951195, - "language_loss": 0.69825858, - "learning_rate": 2.643608785656077e-06, - "loss": 0.71980608, - "num_input_tokens_seen": 147805720, - "step": 6883, - "time_per_iteration": 2.7219526767730713 - }, - { - "auxiliary_loss_clip": 0.01117809, - "auxiliary_loss_mlp": 0.01042891, - "balance_loss_clip": 1.04390156, - "balance_loss_mlp": 1.02804899, - "epoch": 0.4138884713663009, - "flos": 20667130001280.0, - "grad_norm": 1.778769139531053, - "language_loss": 0.76219916, - "learning_rate": 2.643240028730663e-06, - "loss": 0.7838062, - "num_input_tokens_seen": 147824605, - "step": 6884, - "time_per_iteration": 2.7255208492279053 - }, - { - "auxiliary_loss_clip": 0.01095169, - "auxiliary_loss_mlp": 0.01038756, - "balance_loss_clip": 1.04337394, - "balance_loss_mlp": 1.02405715, - "epoch": 0.4139485946189689, - "flos": 29056616225280.0, - "grad_norm": 1.442860134230448, - "language_loss": 0.75787425, - "learning_rate": 2.642871247413523e-06, - "loss": 0.77921343, - "num_input_tokens_seen": 147845445, - "step": 6885, - "time_per_iteration": 2.759103775024414 - }, - { - "auxiliary_loss_clip": 0.0113157, - "auxiliary_loss_mlp": 0.01040383, - "balance_loss_clip": 1.04593658, - "balance_loss_mlp": 1.0249809, - "epoch": 0.41400871787163684, - "flos": 24425720219520.0, - "grad_norm": 2.975461049679227, - "language_loss": 0.70157146, - "learning_rate": 2.6425024417186414e-06, - "loss": 0.72329092, - "num_input_tokens_seen": 147865580, - "step": 6886, - "time_per_iteration": 2.5969202518463135 - }, - { - "auxiliary_loss_clip": 0.01130858, - "auxiliary_loss_mlp": 0.00772578, - "balance_loss_clip": 1.04714894, - "balance_loss_mlp": 1.00082159, - "epoch": 0.4140688411243048, - "flos": 19464050845440.0, - "grad_norm": 4.863732808232375, - "language_loss": 0.75765413, - "learning_rate": 2.642133611660002e-06, - "loss": 0.77668852, - "num_input_tokens_seen": 147885230, - "step": 6887, - "time_per_iteration": 2.6130294799804688 - }, - { - "auxiliary_loss_clip": 0.01115226, - "auxiliary_loss_mlp": 0.0103352, - "balance_loss_clip": 1.04343033, - "balance_loss_mlp": 1.01858318, - "epoch": 0.4141289643769728, - "flos": 19313656600320.0, - "grad_norm": 1.960325409954457, - "language_loss": 0.70337266, - "learning_rate": 2.641764757251592e-06, - "loss": 0.72486007, - "num_input_tokens_seen": 147903035, - "step": 6888, - "time_per_iteration": 2.616093635559082 - }, - { - "auxiliary_loss_clip": 0.01125875, - "auxiliary_loss_mlp": 0.01041471, - "balance_loss_clip": 1.04317069, - "balance_loss_mlp": 1.02698743, - "epoch": 0.41418908762964074, - "flos": 16726903683840.0, - "grad_norm": 2.06267801428711, - "language_loss": 0.76650596, - "learning_rate": 2.6413958785073976e-06, - "loss": 0.7881794, - "num_input_tokens_seen": 147918745, - "step": 6889, - "time_per_iteration": 2.5624022483825684 - }, - { - "auxiliary_loss_clip": 0.01098507, - "auxiliary_loss_mlp": 0.00771883, - "balance_loss_clip": 1.05070317, - "balance_loss_mlp": 1.00089312, - "epoch": 0.41424921088230876, - "flos": 25296840858240.0, - "grad_norm": 2.7156921824995224, - "language_loss": 0.80554968, - "learning_rate": 2.6410269754414074e-06, - "loss": 0.82425356, - "num_input_tokens_seen": 147938265, - "step": 6890, - "time_per_iteration": 2.796128273010254 - }, - { - "auxiliary_loss_clip": 0.0112736, - "auxiliary_loss_mlp": 0.01038801, - "balance_loss_clip": 1.04589438, - "balance_loss_mlp": 1.0235126, - "epoch": 0.4143093341349767, - "flos": 20960520289920.0, - "grad_norm": 1.7630713030967287, - "language_loss": 0.74180973, - "learning_rate": 2.6406580480676113e-06, - "loss": 0.76347136, - "num_input_tokens_seen": 147957320, - "step": 6891, - "time_per_iteration": 2.6974401473999023 - }, - { - "auxiliary_loss_clip": 0.01092037, - "auxiliary_loss_mlp": 0.01043425, - "balance_loss_clip": 1.0482198, - "balance_loss_mlp": 1.02647936, - "epoch": 0.4143694573876447, - "flos": 22017694400640.0, - "grad_norm": 1.8611116210645706, - "language_loss": 0.84570521, - "learning_rate": 2.6402890963999963e-06, - "loss": 0.86705983, - "num_input_tokens_seen": 147977045, - "step": 6892, - "time_per_iteration": 2.8065037727355957 - }, - { - "auxiliary_loss_clip": 0.01081139, - "auxiliary_loss_mlp": 0.00774401, - "balance_loss_clip": 1.04017556, - "balance_loss_mlp": 1.00088513, - "epoch": 0.41442958064031266, - "flos": 35697396723840.0, - "grad_norm": 1.7475313827364956, - "language_loss": 0.70824122, - "learning_rate": 2.6399201204525554e-06, - "loss": 0.72679669, - "num_input_tokens_seen": 147996905, - "step": 6893, - "time_per_iteration": 2.865112543106079 - }, - { - "auxiliary_loss_clip": 0.01126872, - "auxiliary_loss_mlp": 0.01033016, - "balance_loss_clip": 1.04508913, - "balance_loss_mlp": 1.01873493, - "epoch": 0.4144897038929806, - "flos": 28293766156800.0, - "grad_norm": 1.5118367219903406, - "language_loss": 0.72955495, - "learning_rate": 2.639551120239279e-06, - "loss": 0.75115383, - "num_input_tokens_seen": 148017875, - "step": 6894, - "time_per_iteration": 2.6412105560302734 - }, - { - "auxiliary_loss_clip": 0.0111867, - "auxiliary_loss_mlp": 0.01032409, - "balance_loss_clip": 1.0444473, - "balance_loss_mlp": 1.01803279, - "epoch": 0.4145498271456486, - "flos": 11648093080320.0, - "grad_norm": 2.8699191887217697, - "language_loss": 0.63006961, - "learning_rate": 2.63918209577416e-06, - "loss": 0.65158045, - "num_input_tokens_seen": 148032300, - "step": 6895, - "time_per_iteration": 2.6429762840270996 - }, - { - "auxiliary_loss_clip": 0.01084496, - "auxiliary_loss_mlp": 0.01047641, - "balance_loss_clip": 1.04230917, - "balance_loss_mlp": 1.03178644, - "epoch": 0.41460995039831655, - "flos": 27235622378880.0, - "grad_norm": 1.395247516884051, - "language_loss": 0.7072767, - "learning_rate": 2.638813047071192e-06, - "loss": 0.728598, - "num_input_tokens_seen": 148053260, - "step": 6896, - "time_per_iteration": 2.754567861557007 - }, - { - "auxiliary_loss_clip": 0.01125613, - "auxiliary_loss_mlp": 0.0104596, - "balance_loss_clip": 1.04233313, - "balance_loss_mlp": 1.03083241, - "epoch": 0.4146700736509845, - "flos": 25922369232000.0, - "grad_norm": 1.6183082189069362, - "language_loss": 0.73234701, - "learning_rate": 2.6384439741443696e-06, - "loss": 0.75406271, - "num_input_tokens_seen": 148072965, - "step": 6897, - "time_per_iteration": 2.737884759902954 - }, - { - "auxiliary_loss_clip": 0.01114786, - "auxiliary_loss_mlp": 0.01041831, - "balance_loss_clip": 1.04562593, - "balance_loss_mlp": 1.02713859, - "epoch": 0.4147301969036525, - "flos": 26833243248000.0, - "grad_norm": 1.834097351521641, - "language_loss": 0.84865111, - "learning_rate": 2.6380748770076873e-06, - "loss": 0.87021732, - "num_input_tokens_seen": 148093240, - "step": 6898, - "time_per_iteration": 2.689467430114746 - }, - { - "auxiliary_loss_clip": 0.01079261, - "auxiliary_loss_mlp": 0.01035002, - "balance_loss_clip": 1.03853178, - "balance_loss_mlp": 1.02030301, - "epoch": 0.41479032015632045, - "flos": 20298291194880.0, - "grad_norm": 1.6538444757930724, - "language_loss": 0.74696559, - "learning_rate": 2.6377057556751416e-06, - "loss": 0.76810819, - "num_input_tokens_seen": 148110925, - "step": 6899, - "time_per_iteration": 2.73575758934021 - }, - { - "auxiliary_loss_clip": 0.0109529, - "auxiliary_loss_mlp": 0.0104143, - "balance_loss_clip": 1.04097557, - "balance_loss_mlp": 1.02549219, - "epoch": 0.4148504434089884, - "flos": 25264988472960.0, - "grad_norm": 2.0028183144746254, - "language_loss": 0.75739181, - "learning_rate": 2.6373366101607306e-06, - "loss": 0.778759, - "num_input_tokens_seen": 148130670, - "step": 6900, - "time_per_iteration": 2.7304093837738037 - }, - { - "auxiliary_loss_clip": 0.01112354, - "auxiliary_loss_mlp": 0.01038142, - "balance_loss_clip": 1.04515111, - "balance_loss_mlp": 1.02218616, - "epoch": 0.4149105666616564, - "flos": 12822300679680.0, - "grad_norm": 37.61175094058464, - "language_loss": 0.79667652, - "learning_rate": 2.6369674404784503e-06, - "loss": 0.81818151, - "num_input_tokens_seen": 148148350, - "step": 6901, - "time_per_iteration": 2.6238512992858887 - }, - { - "auxiliary_loss_clip": 0.01085977, - "auxiliary_loss_mlp": 0.01046173, - "balance_loss_clip": 1.03959978, - "balance_loss_mlp": 1.0302825, - "epoch": 0.41497068991432434, - "flos": 16763891713920.0, - "grad_norm": 1.6395274695924928, - "language_loss": 0.69640017, - "learning_rate": 2.6365982466423014e-06, - "loss": 0.7177217, - "num_input_tokens_seen": 148167550, - "step": 6902, - "time_per_iteration": 2.6854305267333984 - }, - { - "auxiliary_loss_clip": 0.01097592, - "auxiliary_loss_mlp": 0.00770925, - "balance_loss_clip": 1.04278207, - "balance_loss_mlp": 1.00099885, - "epoch": 0.4150308131669923, - "flos": 18000906243840.0, - "grad_norm": 2.384025861502229, - "language_loss": 0.83949161, - "learning_rate": 2.6362290286662834e-06, - "loss": 0.85817683, - "num_input_tokens_seen": 148184740, - "step": 6903, - "time_per_iteration": 2.6454520225524902 - }, - { - "auxiliary_loss_clip": 0.01133263, - "auxiliary_loss_mlp": 0.01042035, - "balance_loss_clip": 1.04633808, - "balance_loss_mlp": 1.02569163, - "epoch": 0.41509093641966033, - "flos": 30044770352640.0, - "grad_norm": 1.9553359330266324, - "language_loss": 0.67639846, - "learning_rate": 2.6358597865643968e-06, - "loss": 0.69815147, - "num_input_tokens_seen": 148204605, - "step": 6904, - "time_per_iteration": 2.7322065830230713 - }, - { - "auxiliary_loss_clip": 0.01130567, - "auxiliary_loss_mlp": 0.0077237, - "balance_loss_clip": 1.04620719, - "balance_loss_mlp": 1.00097251, - "epoch": 0.4151510596723283, - "flos": 24279994742400.0, - "grad_norm": 1.8757192691258513, - "language_loss": 0.77572656, - "learning_rate": 2.635490520350643e-06, - "loss": 0.79475594, - "num_input_tokens_seen": 148224675, - "step": 6905, - "time_per_iteration": 2.648400068283081 - }, - { - "auxiliary_loss_clip": 0.0113062, - "auxiliary_loss_mlp": 0.01033001, - "balance_loss_clip": 1.04648256, - "balance_loss_mlp": 1.01869583, - "epoch": 0.41521118292499626, - "flos": 23476206147840.0, - "grad_norm": 1.5608092182069806, - "language_loss": 0.68316001, - "learning_rate": 2.635121230039025e-06, - "loss": 0.7047962, - "num_input_tokens_seen": 148243375, - "step": 6906, - "time_per_iteration": 2.6084086894989014 - }, - { - "auxiliary_loss_clip": 0.01104219, - "auxiliary_loss_mlp": 0.0103582, - "balance_loss_clip": 1.04238176, - "balance_loss_mlp": 1.02167583, - "epoch": 0.4152713061776642, - "flos": 22125498094080.0, - "grad_norm": 2.313429051291415, - "language_loss": 0.67982537, - "learning_rate": 2.6347519156435467e-06, - "loss": 0.70122576, - "num_input_tokens_seen": 148261140, - "step": 6907, - "time_per_iteration": 2.715506076812744 - }, - { - "auxiliary_loss_clip": 0.01100263, - "auxiliary_loss_mlp": 0.01038198, - "balance_loss_clip": 1.0479455, - "balance_loss_mlp": 1.02419686, - "epoch": 0.4153314294303322, - "flos": 21251396626560.0, - "grad_norm": 2.133321939860832, - "language_loss": 0.77338696, - "learning_rate": 2.6343825771782123e-06, - "loss": 0.79477155, - "num_input_tokens_seen": 148279655, - "step": 6908, - "time_per_iteration": 2.699028253555298 - }, - { - "auxiliary_loss_clip": 0.01035537, - "auxiliary_loss_mlp": 0.01050035, - "balance_loss_clip": 1.02502179, - "balance_loss_mlp": 1.04800892, - "epoch": 0.41539155268300015, - "flos": 57920681594880.0, - "grad_norm": 0.8023457423545532, - "language_loss": 0.64889216, - "learning_rate": 2.634013214657026e-06, - "loss": 0.66974789, - "num_input_tokens_seen": 148339005, - "step": 6909, - "time_per_iteration": 3.174577474594116 - }, - { - "auxiliary_loss_clip": 0.01096348, - "auxiliary_loss_mlp": 0.0103783, - "balance_loss_clip": 1.04794037, - "balance_loss_mlp": 1.02368009, - "epoch": 0.4154516759356681, - "flos": 21903677654400.0, - "grad_norm": 3.1710005220016293, - "language_loss": 0.8712942, - "learning_rate": 2.633643828093996e-06, - "loss": 0.89263594, - "num_input_tokens_seen": 148358715, - "step": 6910, - "time_per_iteration": 4.24171257019043 - }, - { - "auxiliary_loss_clip": 0.01040831, - "auxiliary_loss_mlp": 0.01008541, - "balance_loss_clip": 1.02141929, - "balance_loss_mlp": 1.00702703, - "epoch": 0.4155117991883361, - "flos": 67833677226240.0, - "grad_norm": 0.8180681021689019, - "language_loss": 0.62115103, - "learning_rate": 2.633274417503128e-06, - "loss": 0.64164472, - "num_input_tokens_seen": 148417280, - "step": 6911, - "time_per_iteration": 3.171510696411133 - }, - { - "auxiliary_loss_clip": 0.01138851, - "auxiliary_loss_mlp": 0.01037606, - "balance_loss_clip": 1.05016613, - "balance_loss_mlp": 1.0219059, - "epoch": 0.41557192244100405, - "flos": 14282679934080.0, - "grad_norm": 2.4116200088670845, - "language_loss": 0.87474132, - "learning_rate": 2.6329049828984312e-06, - "loss": 0.89650595, - "num_input_tokens_seen": 148432610, - "step": 6912, - "time_per_iteration": 5.576058864593506 - }, - { - "auxiliary_loss_clip": 0.01117561, - "auxiliary_loss_mlp": 0.01034627, - "balance_loss_clip": 1.04753387, - "balance_loss_mlp": 1.02098989, - "epoch": 0.415632045693672, - "flos": 24461954064000.0, - "grad_norm": 22.77173838310247, - "language_loss": 0.63224173, - "learning_rate": 2.632535524293914e-06, - "loss": 0.65376365, - "num_input_tokens_seen": 148451510, - "step": 6913, - "time_per_iteration": 2.702631711959839 - }, - { - "auxiliary_loss_clip": 0.01102511, - "auxiliary_loss_mlp": 0.00771597, - "balance_loss_clip": 1.04298615, - "balance_loss_mlp": 1.00093937, - "epoch": 0.41569216894634, - "flos": 20115290378880.0, - "grad_norm": 1.7272855093915238, - "language_loss": 0.74980754, - "learning_rate": 2.632166041703586e-06, - "loss": 0.76854861, - "num_input_tokens_seen": 148469945, - "step": 6914, - "time_per_iteration": 4.340964078903198 - }, - { - "auxiliary_loss_clip": 0.01077278, - "auxiliary_loss_mlp": 0.01044004, - "balance_loss_clip": 1.04201877, - "balance_loss_mlp": 1.02906704, - "epoch": 0.41575229219900794, - "flos": 23798827128960.0, - "grad_norm": 1.8325905436461942, - "language_loss": 0.87653631, - "learning_rate": 2.631796535141458e-06, - "loss": 0.89774919, - "num_input_tokens_seen": 148486655, - "step": 6915, - "time_per_iteration": 2.757596731185913 - }, - { - "auxiliary_loss_clip": 0.0109973, - "auxiliary_loss_mlp": 0.01041371, - "balance_loss_clip": 1.04447317, - "balance_loss_mlp": 1.02728081, - "epoch": 0.4158124154516759, - "flos": 23108229267840.0, - "grad_norm": 3.0600667343253214, - "language_loss": 0.70990372, - "learning_rate": 2.6314270046215426e-06, - "loss": 0.73131478, - "num_input_tokens_seen": 148505035, - "step": 6916, - "time_per_iteration": 2.6894583702087402 - }, - { - "auxiliary_loss_clip": 0.01135969, - "auxiliary_loss_mlp": 0.01038621, - "balance_loss_clip": 1.04934418, - "balance_loss_mlp": 1.02361822, - "epoch": 0.41587253870434393, - "flos": 24242970798720.0, - "grad_norm": 1.53910679789622, - "language_loss": 0.71859491, - "learning_rate": 2.631057450157852e-06, - "loss": 0.74034083, - "num_input_tokens_seen": 148525575, - "step": 6917, - "time_per_iteration": 2.560401439666748 - }, - { - "auxiliary_loss_clip": 0.01104226, - "auxiliary_loss_mlp": 0.01032177, - "balance_loss_clip": 1.04427075, - "balance_loss_mlp": 1.01856291, - "epoch": 0.4159326619570119, - "flos": 23881602021120.0, - "grad_norm": 1.8609084037764254, - "language_loss": 0.80841225, - "learning_rate": 2.6306878717643988e-06, - "loss": 0.82977629, - "num_input_tokens_seen": 148547270, - "step": 6918, - "time_per_iteration": 2.71455979347229 - }, - { - "auxiliary_loss_clip": 0.01122968, - "auxiliary_loss_mlp": 0.01038479, - "balance_loss_clip": 1.05033052, - "balance_loss_mlp": 1.02306533, - "epoch": 0.41599278520967986, - "flos": 40626531354240.0, - "grad_norm": 1.460873312199365, - "language_loss": 0.70399261, - "learning_rate": 2.6303182694551995e-06, - "loss": 0.72560704, - "num_input_tokens_seen": 148572100, - "step": 6919, - "time_per_iteration": 2.784090518951416 - }, - { - "auxiliary_loss_clip": 0.01108371, - "auxiliary_loss_mlp": 0.0104095, - "balance_loss_clip": 1.04570937, - "balance_loss_mlp": 1.0255723, - "epoch": 0.4160529084623478, - "flos": 18222942165120.0, - "grad_norm": 1.8818708282287906, - "language_loss": 0.81701922, - "learning_rate": 2.6299486432442677e-06, - "loss": 0.83851242, - "num_input_tokens_seen": 148591245, - "step": 6920, - "time_per_iteration": 2.644867181777954 - }, - { - "auxiliary_loss_clip": 0.01113217, - "auxiliary_loss_mlp": 0.01042119, - "balance_loss_clip": 1.04909408, - "balance_loss_mlp": 1.02627623, - "epoch": 0.4161130317150158, - "flos": 13661963982720.0, - "grad_norm": 2.168550443744471, - "language_loss": 0.65408564, - "learning_rate": 2.6295789931456195e-06, - "loss": 0.67563891, - "num_input_tokens_seen": 148607980, - "step": 6921, - "time_per_iteration": 2.647270441055298 - }, - { - "auxiliary_loss_clip": 0.01108151, - "auxiliary_loss_mlp": 0.01042421, - "balance_loss_clip": 1.04479325, - "balance_loss_mlp": 1.02768648, - "epoch": 0.41617315496768376, - "flos": 16178511767040.0, - "grad_norm": 2.3873319200859004, - "language_loss": 0.80806041, - "learning_rate": 2.629209319173274e-06, - "loss": 0.82956612, - "num_input_tokens_seen": 148624490, - "step": 6922, - "time_per_iteration": 2.6521530151367188 - }, - { - "auxiliary_loss_clip": 0.01107722, - "auxiliary_loss_mlp": 0.01037357, - "balance_loss_clip": 1.04645085, - "balance_loss_mlp": 1.02304578, - "epoch": 0.4162332782203517, - "flos": 26213317395840.0, - "grad_norm": 1.6600188367705673, - "language_loss": 0.67455506, - "learning_rate": 2.628839621341247e-06, - "loss": 0.69600594, - "num_input_tokens_seen": 148646490, - "step": 6923, - "time_per_iteration": 2.6982760429382324 - }, - { - "auxiliary_loss_clip": 0.01100761, - "auxiliary_loss_mlp": 0.01052569, - "balance_loss_clip": 1.04614723, - "balance_loss_mlp": 1.03649926, - "epoch": 0.4162934014730197, - "flos": 28183987215360.0, - "grad_norm": 2.1905305361602676, - "language_loss": 0.75802875, - "learning_rate": 2.6284698996635593e-06, - "loss": 0.77956206, - "num_input_tokens_seen": 148668580, - "step": 6924, - "time_per_iteration": 2.746675491333008 - }, - { - "auxiliary_loss_clip": 0.01134317, - "auxiliary_loss_mlp": 0.01042613, - "balance_loss_clip": 1.04869533, - "balance_loss_mlp": 1.02842665, - "epoch": 0.41635352472568765, - "flos": 19865316654720.0, - "grad_norm": 2.7384378444587774, - "language_loss": 0.73572767, - "learning_rate": 2.62810015415423e-06, - "loss": 0.75749695, - "num_input_tokens_seen": 148688410, - "step": 6925, - "time_per_iteration": 2.6443655490875244 - }, - { - "auxiliary_loss_clip": 0.01107096, - "auxiliary_loss_mlp": 0.01035039, - "balance_loss_clip": 1.04328012, - "balance_loss_mlp": 1.02092457, - "epoch": 0.4164136479783556, - "flos": 14935356011520.0, - "grad_norm": 2.2965796841293487, - "language_loss": 0.83732742, - "learning_rate": 2.6277303848272792e-06, - "loss": 0.85874879, - "num_input_tokens_seen": 148704855, - "step": 6926, - "time_per_iteration": 2.688778877258301 - }, - { - "auxiliary_loss_clip": 0.01101563, - "auxiliary_loss_mlp": 0.0104323, - "balance_loss_clip": 1.04851913, - "balance_loss_mlp": 1.03019416, - "epoch": 0.4164737712310236, - "flos": 21757593041280.0, - "grad_norm": 1.7122304152619183, - "language_loss": 0.86459213, - "learning_rate": 2.6273605916967302e-06, - "loss": 0.88604003, - "num_input_tokens_seen": 148723065, - "step": 6927, - "time_per_iteration": 2.6891677379608154 - }, - { - "auxiliary_loss_clip": 0.01123007, - "auxiliary_loss_mlp": 0.01048103, - "balance_loss_clip": 1.04902172, - "balance_loss_mlp": 1.03252852, - "epoch": 0.41653389448369155, - "flos": 20740136394240.0, - "grad_norm": 2.2496180093698555, - "language_loss": 0.72619522, - "learning_rate": 2.626990774776604e-06, - "loss": 0.74790633, - "num_input_tokens_seen": 148741780, - "step": 6928, - "time_per_iteration": 2.6853785514831543 - }, - { - "auxiliary_loss_clip": 0.01103421, - "auxiliary_loss_mlp": 0.01037571, - "balance_loss_clip": 1.04516923, - "balance_loss_mlp": 1.02305102, - "epoch": 0.4165940177363595, - "flos": 24972891073920.0, - "grad_norm": 2.3320684503004667, - "language_loss": 0.781192, - "learning_rate": 2.6266209340809254e-06, - "loss": 0.80260193, - "num_input_tokens_seen": 148759795, - "step": 6929, - "time_per_iteration": 2.675412893295288 - }, - { - "auxiliary_loss_clip": 0.01130228, - "auxiliary_loss_mlp": 0.01034459, - "balance_loss_clip": 1.04634309, - "balance_loss_mlp": 1.02042162, - "epoch": 0.41665414098902753, - "flos": 20521727746560.0, - "grad_norm": 2.2076337971053897, - "language_loss": 0.70941442, - "learning_rate": 2.6262510696237182e-06, - "loss": 0.73106134, - "num_input_tokens_seen": 148778680, - "step": 6930, - "time_per_iteration": 2.5896191596984863 - }, - { - "auxiliary_loss_clip": 0.0110378, - "auxiliary_loss_mlp": 0.01040113, - "balance_loss_clip": 1.04316616, - "balance_loss_mlp": 1.02566469, - "epoch": 0.4167142642416955, - "flos": 19682926369920.0, - "grad_norm": 1.7468000498396183, - "language_loss": 0.81265134, - "learning_rate": 2.625881181419007e-06, - "loss": 0.83409023, - "num_input_tokens_seen": 148796470, - "step": 6931, - "time_per_iteration": 2.693753719329834 - }, - { - "auxiliary_loss_clip": 0.01073611, - "auxiliary_loss_mlp": 0.01040047, - "balance_loss_clip": 1.03671885, - "balance_loss_mlp": 1.0253247, - "epoch": 0.41677438749436346, - "flos": 23763742519680.0, - "grad_norm": 1.7136797301427433, - "language_loss": 0.78969777, - "learning_rate": 2.6255112694808193e-06, - "loss": 0.81083435, - "num_input_tokens_seen": 148815300, - "step": 6932, - "time_per_iteration": 2.900186061859131 - }, - { - "auxiliary_loss_clip": 0.01110051, - "auxiliary_loss_mlp": 0.00772641, - "balance_loss_clip": 1.04659891, - "balance_loss_mlp": 1.00109386, - "epoch": 0.41683451074703143, - "flos": 30410053712640.0, - "grad_norm": 1.8812444225834188, - "language_loss": 0.81995165, - "learning_rate": 2.6251413338231813e-06, - "loss": 0.83877861, - "num_input_tokens_seen": 148834315, - "step": 6933, - "time_per_iteration": 2.815415143966675 - }, - { - "auxiliary_loss_clip": 0.01135077, - "auxiliary_loss_mlp": 0.01036525, - "balance_loss_clip": 1.04731107, - "balance_loss_mlp": 1.02077699, - "epoch": 0.4168946339996994, - "flos": 21506757390720.0, - "grad_norm": 2.9283724451949236, - "language_loss": 0.76852083, - "learning_rate": 2.624771374460121e-06, - "loss": 0.79023689, - "num_input_tokens_seen": 148852420, - "step": 6934, - "time_per_iteration": 2.7175137996673584 - }, - { - "auxiliary_loss_clip": 0.01122637, - "auxiliary_loss_mlp": 0.01034712, - "balance_loss_clip": 1.048594, - "balance_loss_mlp": 1.02038264, - "epoch": 0.41695475725236736, - "flos": 17638675539840.0, - "grad_norm": 1.7602525666099749, - "language_loss": 0.67555362, - "learning_rate": 2.624401391405668e-06, - "loss": 0.6971271, - "num_input_tokens_seen": 148869305, - "step": 6935, - "time_per_iteration": 2.740238666534424 - }, - { - "auxiliary_loss_clip": 0.01106934, - "auxiliary_loss_mlp": 0.01041015, - "balance_loss_clip": 1.04740202, - "balance_loss_mlp": 1.02606606, - "epoch": 0.4170148805050353, - "flos": 15668903560320.0, - "grad_norm": 2.0770148597671834, - "language_loss": 0.73310643, - "learning_rate": 2.6240313846738513e-06, - "loss": 0.75458586, - "num_input_tokens_seen": 148886395, - "step": 6936, - "time_per_iteration": 2.71653413772583 - }, - { - "auxiliary_loss_clip": 0.01115958, - "auxiliary_loss_mlp": 0.01036656, - "balance_loss_clip": 1.04845905, - "balance_loss_mlp": 1.02274418, - "epoch": 0.4170750037577033, - "flos": 15159151699200.0, - "grad_norm": 2.3408521316198794, - "language_loss": 0.74009961, - "learning_rate": 2.6236613542787024e-06, - "loss": 0.76162577, - "num_input_tokens_seen": 148905235, - "step": 6937, - "time_per_iteration": 2.627197265625 - }, - { - "auxiliary_loss_clip": 0.01105318, - "auxiliary_loss_mlp": 0.01038451, - "balance_loss_clip": 1.04543686, - "balance_loss_mlp": 1.02422357, - "epoch": 0.41713512701037125, - "flos": 28768289754240.0, - "grad_norm": 2.1407867738666977, - "language_loss": 0.84349155, - "learning_rate": 2.6232913002342518e-06, - "loss": 0.8649292, - "num_input_tokens_seen": 148928130, - "step": 6938, - "time_per_iteration": 2.7512307167053223 - }, - { - "auxiliary_loss_clip": 0.01107641, - "auxiliary_loss_mlp": 0.01037692, - "balance_loss_clip": 1.04718804, - "balance_loss_mlp": 1.02217638, - "epoch": 0.4171952502630392, - "flos": 28256993608320.0, - "grad_norm": 1.985550471698889, - "language_loss": 0.7437641, - "learning_rate": 2.6229212225545334e-06, - "loss": 0.76521742, - "num_input_tokens_seen": 148948790, - "step": 6939, - "time_per_iteration": 2.8480472564697266 - }, - { - "auxiliary_loss_clip": 0.01121822, - "auxiliary_loss_mlp": 0.01033365, - "balance_loss_clip": 1.0470984, - "balance_loss_mlp": 1.01803446, - "epoch": 0.4172553735157072, - "flos": 24571697091840.0, - "grad_norm": 2.560264252806934, - "language_loss": 0.74981248, - "learning_rate": 2.622551121253579e-06, - "loss": 0.77136433, - "num_input_tokens_seen": 148967690, - "step": 6940, - "time_per_iteration": 2.707803249359131 - }, - { - "auxiliary_loss_clip": 0.01132435, - "auxiliary_loss_mlp": 0.01040605, - "balance_loss_clip": 1.04839242, - "balance_loss_mlp": 1.0266397, - "epoch": 0.41731549676837515, - "flos": 27045797978880.0, - "grad_norm": 2.248952291582723, - "language_loss": 0.71683985, - "learning_rate": 2.622180996345424e-06, - "loss": 0.73857027, - "num_input_tokens_seen": 148987150, - "step": 6941, - "time_per_iteration": 2.6406352519989014 - }, - { - "auxiliary_loss_clip": 0.01119657, - "auxiliary_loss_mlp": 0.0103964, - "balance_loss_clip": 1.04871619, - "balance_loss_mlp": 1.02461994, - "epoch": 0.4173756200210431, - "flos": 28394063907840.0, - "grad_norm": 2.929963903641068, - "language_loss": 0.74062824, - "learning_rate": 2.621810847844104e-06, - "loss": 0.76222122, - "num_input_tokens_seen": 149004895, - "step": 6942, - "time_per_iteration": 2.7269139289855957 - }, - { - "auxiliary_loss_clip": 0.01096497, - "auxiliary_loss_mlp": 0.01046649, - "balance_loss_clip": 1.04605746, - "balance_loss_mlp": 1.03079462, - "epoch": 0.41743574327371114, - "flos": 22521556431360.0, - "grad_norm": 2.258418581580233, - "language_loss": 0.72607493, - "learning_rate": 2.6214406757636534e-06, - "loss": 0.74750638, - "num_input_tokens_seen": 149020970, - "step": 6943, - "time_per_iteration": 2.8146276473999023 - }, - { - "auxiliary_loss_clip": 0.01100254, - "auxiliary_loss_mlp": 0.00772502, - "balance_loss_clip": 1.04520488, - "balance_loss_mlp": 1.00081825, - "epoch": 0.4174958665263791, - "flos": 30113431200000.0, - "grad_norm": 1.7970886758223585, - "language_loss": 0.63763773, - "learning_rate": 2.621070480118111e-06, - "loss": 0.65636539, - "num_input_tokens_seen": 149041795, - "step": 6944, - "time_per_iteration": 2.7709715366363525 - }, - { - "auxiliary_loss_clip": 0.0109928, - "auxiliary_loss_mlp": 0.01037535, - "balance_loss_clip": 1.03980803, - "balance_loss_mlp": 1.02262771, - "epoch": 0.41755598977904707, - "flos": 25263444188160.0, - "grad_norm": 1.5620596317333308, - "language_loss": 0.70201832, - "learning_rate": 2.620700260921513e-06, - "loss": 0.72338641, - "num_input_tokens_seen": 149063700, - "step": 6945, - "time_per_iteration": 2.7668464183807373 - }, - { - "auxiliary_loss_clip": 0.01086028, - "auxiliary_loss_mlp": 0.01052164, - "balance_loss_clip": 1.03888953, - "balance_loss_mlp": 1.03434181, - "epoch": 0.41761611303171503, - "flos": 19828580019840.0, - "grad_norm": 3.903492543127265, - "language_loss": 0.81313473, - "learning_rate": 2.620330018187899e-06, - "loss": 0.8345167, - "num_input_tokens_seen": 149082410, - "step": 6946, - "time_per_iteration": 2.7656164169311523 - }, - { - "auxiliary_loss_clip": 0.0111906, - "auxiliary_loss_mlp": 0.01033842, - "balance_loss_clip": 1.04820168, - "balance_loss_mlp": 1.01947689, - "epoch": 0.417676236284383, - "flos": 15523249910400.0, - "grad_norm": 3.3237502950686997, - "language_loss": 0.77819085, - "learning_rate": 2.6199597519313086e-06, - "loss": 0.79971987, - "num_input_tokens_seen": 149098745, - "step": 6947, - "time_per_iteration": 2.6658904552459717 - }, - { - "auxiliary_loss_clip": 0.01131014, - "auxiliary_loss_mlp": 0.01035473, - "balance_loss_clip": 1.04678917, - "balance_loss_mlp": 1.020262, - "epoch": 0.41773635953705096, - "flos": 32524473761280.0, - "grad_norm": 4.535535573323162, - "language_loss": 0.72142154, - "learning_rate": 2.6195894621657825e-06, - "loss": 0.7430864, - "num_input_tokens_seen": 149122255, - "step": 6948, - "time_per_iteration": 2.728604316711426 - }, - { - "auxiliary_loss_clip": 0.0111373, - "auxiliary_loss_mlp": 0.01035464, - "balance_loss_clip": 1.04416013, - "balance_loss_mlp": 1.02127814, - "epoch": 0.4177964827897189, - "flos": 23440941970560.0, - "grad_norm": 1.752796472610303, - "language_loss": 0.77020466, - "learning_rate": 2.619219148905362e-06, - "loss": 0.79169655, - "num_input_tokens_seen": 149142845, - "step": 6949, - "time_per_iteration": 4.2494752407073975 - }, - { - "auxiliary_loss_clip": 0.011131, - "auxiliary_loss_mlp": 0.01040025, - "balance_loss_clip": 1.05060196, - "balance_loss_mlp": 1.02523708, - "epoch": 0.4178566060423869, - "flos": 22748907565440.0, - "grad_norm": 1.637174584956538, - "language_loss": 0.8214075, - "learning_rate": 2.6188488121640888e-06, - "loss": 0.84293878, - "num_input_tokens_seen": 149163375, - "step": 6950, - "time_per_iteration": 2.7383689880371094 - }, - { - "auxiliary_loss_clip": 0.01099413, - "auxiliary_loss_mlp": 0.00770849, - "balance_loss_clip": 1.04511857, - "balance_loss_mlp": 1.00090635, - "epoch": 0.41791672929505486, - "flos": 26032794618240.0, - "grad_norm": 1.501775844018401, - "language_loss": 0.7649653, - "learning_rate": 2.618478451956007e-06, - "loss": 0.78366792, - "num_input_tokens_seen": 149185610, - "step": 6951, - "time_per_iteration": 5.789496660232544 - }, - { - "auxiliary_loss_clip": 0.01088001, - "auxiliary_loss_mlp": 0.01034314, - "balance_loss_clip": 1.04565978, - "balance_loss_mlp": 1.01929939, - "epoch": 0.4179768525477228, - "flos": 19568694142080.0, - "grad_norm": 1.8438034417752391, - "language_loss": 0.73442549, - "learning_rate": 2.61810806829516e-06, - "loss": 0.75564867, - "num_input_tokens_seen": 149203990, - "step": 6952, - "time_per_iteration": 2.762404680252075 - }, - { - "auxiliary_loss_clip": 0.01116339, - "auxiliary_loss_mlp": 0.01038971, - "balance_loss_clip": 1.04836369, - "balance_loss_mlp": 1.0251013, - "epoch": 0.4180369758003908, - "flos": 17783826399360.0, - "grad_norm": 2.8847563198217667, - "language_loss": 0.7161783, - "learning_rate": 2.617737661195593e-06, - "loss": 0.73773146, - "num_input_tokens_seen": 149221385, - "step": 6953, - "time_per_iteration": 2.6514034271240234 - }, - { - "auxiliary_loss_clip": 0.01118442, - "auxiliary_loss_mlp": 0.01038634, - "balance_loss_clip": 1.04711723, - "balance_loss_mlp": 1.02363181, - "epoch": 0.41809709905305875, - "flos": 20960663944320.0, - "grad_norm": 1.7834717110535325, - "language_loss": 0.75982141, - "learning_rate": 2.617367230671353e-06, - "loss": 0.78139216, - "num_input_tokens_seen": 149241175, - "step": 6954, - "time_per_iteration": 4.3135082721710205 - }, - { - "auxiliary_loss_clip": 0.01092319, - "auxiliary_loss_mlp": 0.01046188, - "balance_loss_clip": 1.04647863, - "balance_loss_mlp": 1.02979708, - "epoch": 0.4181572223057267, - "flos": 22017622573440.0, - "grad_norm": 2.907950037168039, - "language_loss": 0.84492826, - "learning_rate": 2.616996776736485e-06, - "loss": 0.86631334, - "num_input_tokens_seen": 149259115, - "step": 6955, - "time_per_iteration": 2.7724356651306152 - }, - { - "auxiliary_loss_clip": 0.01121525, - "auxiliary_loss_mlp": 0.01040437, - "balance_loss_clip": 1.04870594, - "balance_loss_mlp": 1.02604234, - "epoch": 0.4182173455583947, - "flos": 26245528917120.0, - "grad_norm": 1.6794559400644542, - "language_loss": 0.83262718, - "learning_rate": 2.616626299405037e-06, - "loss": 0.8542468, - "num_input_tokens_seen": 149278705, - "step": 6956, - "time_per_iteration": 2.7260353565216064 - }, - { - "auxiliary_loss_clip": 0.01093652, - "auxiliary_loss_mlp": 0.01039325, - "balance_loss_clip": 1.04491091, - "balance_loss_mlp": 1.02423358, - "epoch": 0.4182774688110627, - "flos": 14791605782400.0, - "grad_norm": 2.3946498969788634, - "language_loss": 0.71788859, - "learning_rate": 2.616255798691059e-06, - "loss": 0.73921835, - "num_input_tokens_seen": 149294040, - "step": 6957, - "time_per_iteration": 2.6826114654541016 - }, - { - "auxiliary_loss_clip": 0.01099548, - "auxiliary_loss_mlp": 0.01043781, - "balance_loss_clip": 1.0462482, - "balance_loss_mlp": 1.02966106, - "epoch": 0.41833759206373067, - "flos": 20412020632320.0, - "grad_norm": 2.4781797095716276, - "language_loss": 0.75947559, - "learning_rate": 2.6158852746085982e-06, - "loss": 0.78090888, - "num_input_tokens_seen": 149310385, - "step": 6958, - "time_per_iteration": 2.7528226375579834 - }, - { - "auxiliary_loss_clip": 0.01083285, - "auxiliary_loss_mlp": 0.00772338, - "balance_loss_clip": 1.04087532, - "balance_loss_mlp": 1.0007602, - "epoch": 0.41839771531639863, - "flos": 23656333875840.0, - "grad_norm": 1.8764496083097535, - "language_loss": 0.7693305, - "learning_rate": 2.6155147271717066e-06, - "loss": 0.78788674, - "num_input_tokens_seen": 149328235, - "step": 6959, - "time_per_iteration": 2.7859151363372803 - }, - { - "auxiliary_loss_clip": 0.01089374, - "auxiliary_loss_mlp": 0.00772565, - "balance_loss_clip": 1.04304624, - "balance_loss_mlp": 1.00090861, - "epoch": 0.4184578385690666, - "flos": 19754137082880.0, - "grad_norm": 2.1131068778060498, - "language_loss": 0.77339065, - "learning_rate": 2.6151441563944347e-06, - "loss": 0.79201001, - "num_input_tokens_seen": 149347465, - "step": 6960, - "time_per_iteration": 2.7497265338897705 - }, - { - "auxiliary_loss_clip": 0.01098942, - "auxiliary_loss_mlp": 0.01037539, - "balance_loss_clip": 1.04735017, - "balance_loss_mlp": 1.02385998, - "epoch": 0.41851796182173456, - "flos": 20193396503040.0, - "grad_norm": 1.8404962312042226, - "language_loss": 0.75842559, - "learning_rate": 2.614773562290835e-06, - "loss": 0.7797904, - "num_input_tokens_seen": 149366685, - "step": 6961, - "time_per_iteration": 2.6800267696380615 - }, - { - "auxiliary_loss_clip": 0.01038031, - "auxiliary_loss_mlp": 0.01001682, - "balance_loss_clip": 1.03925419, - "balance_loss_mlp": 0.99970287, - "epoch": 0.41857808507440253, - "flos": 59018794231680.0, - "grad_norm": 0.7827663866056928, - "language_loss": 0.54655838, - "learning_rate": 2.61440294487496e-06, - "loss": 0.56695551, - "num_input_tokens_seen": 149422925, - "step": 6962, - "time_per_iteration": 3.1537134647369385 - }, - { - "auxiliary_loss_clip": 0.01120288, - "auxiliary_loss_mlp": 0.0104634, - "balance_loss_clip": 1.04961705, - "balance_loss_mlp": 1.0318327, - "epoch": 0.4186382083270705, - "flos": 18478805719680.0, - "grad_norm": 1.960507757786237, - "language_loss": 0.85535777, - "learning_rate": 2.614032304160864e-06, - "loss": 0.87702405, - "num_input_tokens_seen": 149440820, - "step": 6963, - "time_per_iteration": 2.5925374031066895 - }, - { - "auxiliary_loss_clip": 0.01106535, - "auxiliary_loss_mlp": 0.01041093, - "balance_loss_clip": 1.04856253, - "balance_loss_mlp": 1.02657938, - "epoch": 0.41869833157973846, - "flos": 21578758202880.0, - "grad_norm": 1.6555227491445992, - "language_loss": 0.70422602, - "learning_rate": 2.6136616401626014e-06, - "loss": 0.72570229, - "num_input_tokens_seen": 149461060, - "step": 6964, - "time_per_iteration": 2.675595760345459 - }, - { - "auxiliary_loss_clip": 0.01131013, - "auxiliary_loss_mlp": 0.01048168, - "balance_loss_clip": 1.04926276, - "balance_loss_mlp": 1.03433418, - "epoch": 0.4187584548324064, - "flos": 35517412650240.0, - "grad_norm": 2.107779734715906, - "language_loss": 0.71486962, - "learning_rate": 2.6132909528942273e-06, - "loss": 0.73666137, - "num_input_tokens_seen": 149483115, - "step": 6965, - "time_per_iteration": 2.728795289993286 - }, - { - "auxiliary_loss_clip": 0.01081273, - "auxiliary_loss_mlp": 0.01038276, - "balance_loss_clip": 1.04315698, - "balance_loss_mlp": 1.02465594, - "epoch": 0.4188185780850744, - "flos": 18655880791680.0, - "grad_norm": 1.546256806673652, - "language_loss": 0.71920437, - "learning_rate": 2.6129202423697997e-06, - "loss": 0.74039984, - "num_input_tokens_seen": 149501495, - "step": 6966, - "time_per_iteration": 2.9000282287597656 - }, - { - "auxiliary_loss_clip": 0.01127558, - "auxiliary_loss_mlp": 0.01037127, - "balance_loss_clip": 1.04965436, - "balance_loss_mlp": 1.02194023, - "epoch": 0.41887870133774235, - "flos": 40333428374400.0, - "grad_norm": 2.0539481091161664, - "language_loss": 0.71188843, - "learning_rate": 2.612549508603375e-06, - "loss": 0.73353529, - "num_input_tokens_seen": 149523170, - "step": 6967, - "time_per_iteration": 2.8494174480438232 - }, - { - "auxiliary_loss_clip": 0.01059483, - "auxiliary_loss_mlp": 0.01001432, - "balance_loss_clip": 1.039819, - "balance_loss_mlp": 0.99973947, - "epoch": 0.4189388245904103, - "flos": 61371336516480.0, - "grad_norm": 0.6719582962825281, - "language_loss": 0.46191829, - "learning_rate": 2.612178751609011e-06, - "loss": 0.48252743, - "num_input_tokens_seen": 149583955, - "step": 6968, - "time_per_iteration": 3.2362303733825684 - }, - { - "auxiliary_loss_clip": 0.01123461, - "auxiliary_loss_mlp": 0.01043061, - "balance_loss_clip": 1.04708195, - "balance_loss_mlp": 1.02722979, - "epoch": 0.4189989478430783, - "flos": 28215624119040.0, - "grad_norm": 2.2684151977061386, - "language_loss": 0.75044996, - "learning_rate": 2.6118079714007685e-06, - "loss": 0.77211517, - "num_input_tokens_seen": 149604440, - "step": 6969, - "time_per_iteration": 2.836956739425659 - }, - { - "auxiliary_loss_clip": 0.01108551, - "auxiliary_loss_mlp": 0.01045091, - "balance_loss_clip": 1.0470643, - "balance_loss_mlp": 1.03178096, - "epoch": 0.4190590710957463, - "flos": 24565879088640.0, - "grad_norm": 1.9985372976124152, - "language_loss": 0.8083396, - "learning_rate": 2.611437167992705e-06, - "loss": 0.82987607, - "num_input_tokens_seen": 149623745, - "step": 6970, - "time_per_iteration": 2.7209956645965576 - }, - { - "auxiliary_loss_clip": 0.01119916, - "auxiliary_loss_mlp": 0.0104141, - "balance_loss_clip": 1.04898238, - "balance_loss_mlp": 1.02689075, - "epoch": 0.41911919434841427, - "flos": 21726027964800.0, - "grad_norm": 2.2489196165322713, - "language_loss": 0.82699662, - "learning_rate": 2.6110663413988835e-06, - "loss": 0.84860986, - "num_input_tokens_seen": 149643025, - "step": 6971, - "time_per_iteration": 2.6844992637634277 - }, - { - "auxiliary_loss_clip": 0.01105807, - "auxiliary_loss_mlp": 0.01047014, - "balance_loss_clip": 1.0493474, - "balance_loss_mlp": 1.03207135, - "epoch": 0.41917931760108224, - "flos": 17601543855360.0, - "grad_norm": 1.6553402405348427, - "language_loss": 0.74262661, - "learning_rate": 2.6106954916333648e-06, - "loss": 0.76415479, - "num_input_tokens_seen": 149660695, - "step": 6972, - "time_per_iteration": 2.6240105628967285 - }, - { - "auxiliary_loss_clip": 0.01102199, - "auxiliary_loss_mlp": 0.01040482, - "balance_loss_clip": 1.0421176, - "balance_loss_mlp": 1.02589083, - "epoch": 0.4192394408537502, - "flos": 37816701022080.0, - "grad_norm": 1.5708676830874608, - "language_loss": 0.72811258, - "learning_rate": 2.610324618710212e-06, - "loss": 0.74953938, - "num_input_tokens_seen": 149682040, - "step": 6973, - "time_per_iteration": 2.8109309673309326 - }, - { - "auxiliary_loss_clip": 0.01101478, - "auxiliary_loss_mlp": 0.01038786, - "balance_loss_clip": 1.05107093, - "balance_loss_mlp": 1.02461183, - "epoch": 0.41929956410641817, - "flos": 23107726477440.0, - "grad_norm": 1.8294609220169469, - "language_loss": 0.74864107, - "learning_rate": 2.609953722643489e-06, - "loss": 0.77004373, - "num_input_tokens_seen": 149700855, - "step": 6974, - "time_per_iteration": 2.7036855220794678 - }, - { - "auxiliary_loss_clip": 0.01117361, - "auxiliary_loss_mlp": 0.01037617, - "balance_loss_clip": 1.04402697, - "balance_loss_mlp": 1.02359784, - "epoch": 0.41935968735908613, - "flos": 22524537260160.0, - "grad_norm": 1.843462386151443, - "language_loss": 0.7271533, - "learning_rate": 2.609582803447259e-06, - "loss": 0.748703, - "num_input_tokens_seen": 149717360, - "step": 6975, - "time_per_iteration": 2.632661819458008 - }, - { - "auxiliary_loss_clip": 0.01113766, - "auxiliary_loss_mlp": 0.01042513, - "balance_loss_clip": 1.04679942, - "balance_loss_mlp": 1.02849412, - "epoch": 0.4194198106117541, - "flos": 26870446759680.0, - "grad_norm": 1.580698900699299, - "language_loss": 0.80874467, - "learning_rate": 2.6092118611355885e-06, - "loss": 0.83030754, - "num_input_tokens_seen": 149738975, - "step": 6976, - "time_per_iteration": 2.68833327293396 - }, - { - "auxiliary_loss_clip": 0.01098184, - "auxiliary_loss_mlp": 0.01042179, - "balance_loss_clip": 1.04087496, - "balance_loss_mlp": 1.02671123, - "epoch": 0.41947993386442206, - "flos": 19902412425600.0, - "grad_norm": 4.6015574144833264, - "language_loss": 0.6767152, - "learning_rate": 2.6088408957225425e-06, - "loss": 0.69811881, - "num_input_tokens_seen": 149757055, - "step": 6977, - "time_per_iteration": 2.6453959941864014 - }, - { - "auxiliary_loss_clip": 0.01122702, - "auxiliary_loss_mlp": 0.0104277, - "balance_loss_clip": 1.04980922, - "balance_loss_mlp": 1.02926338, - "epoch": 0.41954005711709, - "flos": 17383889393280.0, - "grad_norm": 2.3946463459425966, - "language_loss": 0.80506754, - "learning_rate": 2.6084699072221898e-06, - "loss": 0.82672226, - "num_input_tokens_seen": 149772885, - "step": 6978, - "time_per_iteration": 2.596269369125366 - }, - { - "auxiliary_loss_clip": 0.01133146, - "auxiliary_loss_mlp": 0.0103908, - "balance_loss_clip": 1.04677558, - "balance_loss_mlp": 1.02459598, - "epoch": 0.419600180369758, - "flos": 25003306915200.0, - "grad_norm": 1.7226002389356767, - "language_loss": 0.82708085, - "learning_rate": 2.6080988956485964e-06, - "loss": 0.84880304, - "num_input_tokens_seen": 149791515, - "step": 6979, - "time_per_iteration": 2.588383197784424 - }, - { - "auxiliary_loss_clip": 0.01129014, - "auxiliary_loss_mlp": 0.01037351, - "balance_loss_clip": 1.04659355, - "balance_loss_mlp": 1.02302253, - "epoch": 0.41966030362242596, - "flos": 17383781652480.0, - "grad_norm": 2.4214608222579206, - "language_loss": 0.83723533, - "learning_rate": 2.6077278610158325e-06, - "loss": 0.85889894, - "num_input_tokens_seen": 149807250, - "step": 6980, - "time_per_iteration": 2.5890002250671387 - }, - { - "auxiliary_loss_clip": 0.01132913, - "auxiliary_loss_mlp": 0.01043925, - "balance_loss_clip": 1.04753232, - "balance_loss_mlp": 1.02994215, - "epoch": 0.4197204268750939, - "flos": 22156165330560.0, - "grad_norm": 2.919161771051539, - "language_loss": 0.7951659, - "learning_rate": 2.6073568033379665e-06, - "loss": 0.81693423, - "num_input_tokens_seen": 149821640, - "step": 6981, - "time_per_iteration": 2.6015915870666504 - }, - { - "auxiliary_loss_clip": 0.01096505, - "auxiliary_loss_mlp": 0.01037263, - "balance_loss_clip": 1.04636097, - "balance_loss_mlp": 1.02382243, - "epoch": 0.4197805501277619, - "flos": 22084128604800.0, - "grad_norm": 2.285836698514787, - "language_loss": 0.84386683, - "learning_rate": 2.6069857226290696e-06, - "loss": 0.86520445, - "num_input_tokens_seen": 149840545, - "step": 6982, - "time_per_iteration": 2.755657434463501 - }, - { - "auxiliary_loss_clip": 0.01120032, - "auxiliary_loss_mlp": 0.01038775, - "balance_loss_clip": 1.04708028, - "balance_loss_mlp": 1.02419019, - "epoch": 0.4198406733804299, - "flos": 26432192920320.0, - "grad_norm": 2.941579449236281, - "language_loss": 0.57212174, - "learning_rate": 2.606614618903214e-06, - "loss": 0.59370977, - "num_input_tokens_seen": 149860375, - "step": 6983, - "time_per_iteration": 2.699927568435669 - }, - { - "auxiliary_loss_clip": 0.01120799, - "auxiliary_loss_mlp": 0.01037958, - "balance_loss_clip": 1.05017662, - "balance_loss_mlp": 1.02513719, - "epoch": 0.4199007966330979, - "flos": 12531029293440.0, - "grad_norm": 1.788715678149628, - "language_loss": 0.82569104, - "learning_rate": 2.606243492174471e-06, - "loss": 0.84727859, - "num_input_tokens_seen": 149877850, - "step": 6984, - "time_per_iteration": 2.6608574390411377 - }, - { - "auxiliary_loss_clip": 0.01110821, - "auxiliary_loss_mlp": 0.01031336, - "balance_loss_clip": 1.04403567, - "balance_loss_mlp": 1.01740074, - "epoch": 0.41996091988576584, - "flos": 21762944167680.0, - "grad_norm": 1.8578510762238896, - "language_loss": 0.79251826, - "learning_rate": 2.605872342456914e-06, - "loss": 0.81393987, - "num_input_tokens_seen": 149896110, - "step": 6985, - "time_per_iteration": 2.6915009021759033 - }, - { - "auxiliary_loss_clip": 0.01134356, - "auxiliary_loss_mlp": 0.01037444, - "balance_loss_clip": 1.04694271, - "balance_loss_mlp": 1.02278078, - "epoch": 0.4200210431384338, - "flos": 26541935948160.0, - "grad_norm": 1.6735394330256788, - "language_loss": 0.78439772, - "learning_rate": 2.6055011697646173e-06, - "loss": 0.80611569, - "num_input_tokens_seen": 149916495, - "step": 6986, - "time_per_iteration": 2.6553595066070557 - }, - { - "auxiliary_loss_clip": 0.01108367, - "auxiliary_loss_mlp": 0.01032308, - "balance_loss_clip": 1.04705167, - "balance_loss_mlp": 1.01957011, - "epoch": 0.42008116639110177, - "flos": 26795824254720.0, - "grad_norm": 1.6966099884396408, - "language_loss": 0.72624969, - "learning_rate": 2.605129974111655e-06, - "loss": 0.7476564, - "num_input_tokens_seen": 149936445, - "step": 6987, - "time_per_iteration": 2.7428104877471924 - }, - { - "auxiliary_loss_clip": 0.01105896, - "auxiliary_loss_mlp": 0.00774749, - "balance_loss_clip": 1.04440594, - "balance_loss_mlp": 1.00098395, - "epoch": 0.42014128964376973, - "flos": 32087333243520.0, - "grad_norm": 1.4394465087417463, - "language_loss": 0.74992245, - "learning_rate": 2.604758755512104e-06, - "loss": 0.76872891, - "num_input_tokens_seen": 149959430, - "step": 6988, - "time_per_iteration": 4.499454975128174 - }, - { - "auxiliary_loss_clip": 0.01124153, - "auxiliary_loss_mlp": 0.01040193, - "balance_loss_clip": 1.04908502, - "balance_loss_mlp": 1.02585781, - "epoch": 0.4202014128964377, - "flos": 26467133875200.0, - "grad_norm": 1.6029470393888554, - "language_loss": 0.73995304, - "learning_rate": 2.60438751398004e-06, - "loss": 0.76159656, - "num_input_tokens_seen": 149980365, - "step": 6989, - "time_per_iteration": 2.6979968547821045 - }, - { - "auxiliary_loss_clip": 0.01109104, - "auxiliary_loss_mlp": 0.01037728, - "balance_loss_clip": 1.04531431, - "balance_loss_mlp": 1.02353013, - "epoch": 0.42026153614910566, - "flos": 13401216178560.0, - "grad_norm": 2.8939358842188043, - "language_loss": 0.70562875, - "learning_rate": 2.6040162495295404e-06, - "loss": 0.72709703, - "num_input_tokens_seen": 149997375, - "step": 6990, - "time_per_iteration": 4.2269814014434814 - }, - { - "auxiliary_loss_clip": 0.01052428, - "auxiliary_loss_mlp": 0.00753318, - "balance_loss_clip": 1.03888559, - "balance_loss_mlp": 1.00109041, - "epoch": 0.42032165940177363, - "flos": 60250457635200.0, - "grad_norm": 1.417771869116233, - "language_loss": 0.60470819, - "learning_rate": 2.603644962174685e-06, - "loss": 0.62276566, - "num_input_tokens_seen": 150051230, - "step": 6991, - "time_per_iteration": 4.600361585617065 - }, - { - "auxiliary_loss_clip": 0.01135512, - "auxiliary_loss_mlp": 0.01038632, - "balance_loss_clip": 1.05044973, - "balance_loss_mlp": 1.02417135, - "epoch": 0.4203817826544416, - "flos": 24535211852160.0, - "grad_norm": 1.4766426515770763, - "language_loss": 0.832901, - "learning_rate": 2.6032736519295517e-06, - "loss": 0.85464245, - "num_input_tokens_seen": 150071135, - "step": 6992, - "time_per_iteration": 2.688693046569824 - }, - { - "auxiliary_loss_clip": 0.01058225, - "auxiliary_loss_mlp": 0.01016781, - "balance_loss_clip": 1.02967906, - "balance_loss_mlp": 1.01523161, - "epoch": 0.42044190590710956, - "flos": 58820781530880.0, - "grad_norm": 0.8077151468791776, - "language_loss": 0.65494478, - "learning_rate": 2.6029023188082217e-06, - "loss": 0.67569482, - "num_input_tokens_seen": 150125220, - "step": 6993, - "time_per_iteration": 4.7132039070129395 - }, - { - "auxiliary_loss_clip": 0.011371, - "auxiliary_loss_mlp": 0.010386, - "balance_loss_clip": 1.04959965, - "balance_loss_mlp": 1.02267361, - "epoch": 0.4205020291597775, - "flos": 16436063260800.0, - "grad_norm": 1.948890763784571, - "language_loss": 0.83380342, - "learning_rate": 2.6025309628247746e-06, - "loss": 0.85556042, - "num_input_tokens_seen": 150142300, - "step": 6994, - "time_per_iteration": 2.5883679389953613 - }, - { - "auxiliary_loss_clip": 0.01120964, - "auxiliary_loss_mlp": 0.00771063, - "balance_loss_clip": 1.04939461, - "balance_loss_mlp": 1.00095451, - "epoch": 0.4205621524124455, - "flos": 18405655672320.0, - "grad_norm": 1.5483229522184627, - "language_loss": 0.78529471, - "learning_rate": 2.6021595839932934e-06, - "loss": 0.80421495, - "num_input_tokens_seen": 150161345, - "step": 6995, - "time_per_iteration": 2.716649055480957 - }, - { - "auxiliary_loss_clip": 0.0109323, - "auxiliary_loss_mlp": 0.01032227, - "balance_loss_clip": 1.04375339, - "balance_loss_mlp": 1.01855421, - "epoch": 0.4206222756651135, - "flos": 25520097841920.0, - "grad_norm": 1.4060831947988737, - "language_loss": 0.80397403, - "learning_rate": 2.60178818232786e-06, - "loss": 0.82522857, - "num_input_tokens_seen": 150182420, - "step": 6996, - "time_per_iteration": 2.773655891418457 - }, - { - "auxiliary_loss_clip": 0.01111456, - "auxiliary_loss_mlp": 0.00771084, - "balance_loss_clip": 1.0477984, - "balance_loss_mlp": 1.00100029, - "epoch": 0.4206823989177815, - "flos": 15304338472320.0, - "grad_norm": 1.9934224916744, - "language_loss": 0.7558648, - "learning_rate": 2.601416757842559e-06, - "loss": 0.77469015, - "num_input_tokens_seen": 150200175, - "step": 6997, - "time_per_iteration": 2.6486191749572754 - }, - { - "auxiliary_loss_clip": 0.01130573, - "auxiliary_loss_mlp": 0.01042531, - "balance_loss_clip": 1.04606771, - "balance_loss_mlp": 1.02835727, - "epoch": 0.42074252217044944, - "flos": 15554096714880.0, - "grad_norm": 3.451993658012451, - "language_loss": 0.75860173, - "learning_rate": 2.6010453105514743e-06, - "loss": 0.78033274, - "num_input_tokens_seen": 150217100, - "step": 6998, - "time_per_iteration": 2.548783540725708 - }, - { - "auxiliary_loss_clip": 0.01136566, - "auxiliary_loss_mlp": 0.01042996, - "balance_loss_clip": 1.05027032, - "balance_loss_mlp": 1.02827394, - "epoch": 0.4208026454231174, - "flos": 26145877610880.0, - "grad_norm": 1.6802908884651202, - "language_loss": 0.76294345, - "learning_rate": 2.60067384046869e-06, - "loss": 0.78473908, - "num_input_tokens_seen": 150239830, - "step": 6999, - "time_per_iteration": 2.6605780124664307 - }, - { - "auxiliary_loss_clip": 0.01082307, - "auxiliary_loss_mlp": 0.01039523, - "balance_loss_clip": 1.04213417, - "balance_loss_mlp": 1.02420449, - "epoch": 0.42086276867578537, - "flos": 23550110380800.0, - "grad_norm": 2.828142255503796, - "language_loss": 0.64361006, - "learning_rate": 2.600302347608295e-06, - "loss": 0.66482836, - "num_input_tokens_seen": 150260690, - "step": 7000, - "time_per_iteration": 2.7295126914978027 - }, - { - "auxiliary_loss_clip": 0.01089826, - "auxiliary_loss_mlp": 0.01039051, - "balance_loss_clip": 1.04259682, - "balance_loss_mlp": 1.02433491, - "epoch": 0.42092289192845334, - "flos": 18113414618880.0, - "grad_norm": 2.276209987309232, - "language_loss": 0.76550955, - "learning_rate": 2.5999308319843743e-06, - "loss": 0.78679836, - "num_input_tokens_seen": 150279885, - "step": 7001, - "time_per_iteration": 2.793407917022705 - }, - { - "auxiliary_loss_clip": 0.01091534, - "auxiliary_loss_mlp": 0.00771163, - "balance_loss_clip": 1.04483819, - "balance_loss_mlp": 1.00107491, - "epoch": 0.4209830151811213, - "flos": 20006588845440.0, - "grad_norm": 1.4928891465725471, - "language_loss": 0.86682802, - "learning_rate": 2.5995592936110154e-06, - "loss": 0.88545501, - "num_input_tokens_seen": 150297390, - "step": 7002, - "time_per_iteration": 2.719127655029297 - }, - { - "auxiliary_loss_clip": 0.0109333, - "auxiliary_loss_mlp": 0.0103625, - "balance_loss_clip": 1.04801917, - "balance_loss_mlp": 1.02297568, - "epoch": 0.42104313843378927, - "flos": 21978946604160.0, - "grad_norm": 1.8843999139097827, - "language_loss": 0.67807466, - "learning_rate": 2.5991877325023096e-06, - "loss": 0.6993705, - "num_input_tokens_seen": 150317390, - "step": 7003, - "time_per_iteration": 2.732848882675171 - }, - { - "auxiliary_loss_clip": 0.01132341, - "auxiliary_loss_mlp": 0.01035322, - "balance_loss_clip": 1.04725492, - "balance_loss_mlp": 1.02031398, - "epoch": 0.42110326168645723, - "flos": 25443966965760.0, - "grad_norm": 1.9778982096910334, - "language_loss": 0.77774739, - "learning_rate": 2.598816148672344e-06, - "loss": 0.79942405, - "num_input_tokens_seen": 150337455, - "step": 7004, - "time_per_iteration": 2.630838394165039 - }, - { - "auxiliary_loss_clip": 0.01129987, - "auxiliary_loss_mlp": 0.0103854, - "balance_loss_clip": 1.04988933, - "balance_loss_mlp": 1.02351916, - "epoch": 0.4211633849391252, - "flos": 17822574195840.0, - "grad_norm": 2.0674356984544557, - "language_loss": 0.67855948, - "learning_rate": 2.59844454213521e-06, - "loss": 0.70024478, - "num_input_tokens_seen": 150355385, - "step": 7005, - "time_per_iteration": 2.588533401489258 - }, - { - "auxiliary_loss_clip": 0.01121703, - "auxiliary_loss_mlp": 0.01033597, - "balance_loss_clip": 1.0483923, - "balance_loss_mlp": 1.01941752, - "epoch": 0.42122350819179316, - "flos": 16282436791680.0, - "grad_norm": 1.9633544911967673, - "language_loss": 0.72481513, - "learning_rate": 2.5980729129049994e-06, - "loss": 0.74636805, - "num_input_tokens_seen": 150371750, - "step": 7006, - "time_per_iteration": 2.5879828929901123 - }, - { - "auxiliary_loss_clip": 0.01133912, - "auxiliary_loss_mlp": 0.01032205, - "balance_loss_clip": 1.04963207, - "balance_loss_mlp": 1.01787031, - "epoch": 0.4212836314444611, - "flos": 19645866512640.0, - "grad_norm": 1.722108681435548, - "language_loss": 0.70495522, - "learning_rate": 2.5977012609958033e-06, - "loss": 0.72661638, - "num_input_tokens_seen": 150389955, - "step": 7007, - "time_per_iteration": 2.5199153423309326 - }, - { - "auxiliary_loss_clip": 0.0110564, - "auxiliary_loss_mlp": 0.00771949, - "balance_loss_clip": 1.04377306, - "balance_loss_mlp": 1.00098372, - "epoch": 0.4213437546971291, - "flos": 18369026778240.0, - "grad_norm": 1.772679877033185, - "language_loss": 0.82893503, - "learning_rate": 2.5973295864217166e-06, - "loss": 0.84771085, - "num_input_tokens_seen": 150405780, - "step": 7008, - "time_per_iteration": 2.6636033058166504 - }, - { - "auxiliary_loss_clip": 0.01089865, - "auxiliary_loss_mlp": 0.01039598, - "balance_loss_clip": 1.04483509, - "balance_loss_mlp": 1.02535856, - "epoch": 0.42140387794979706, - "flos": 27704507541120.0, - "grad_norm": 1.895033591472922, - "language_loss": 0.72206765, - "learning_rate": 2.596957889196831e-06, - "loss": 0.74336231, - "num_input_tokens_seen": 150425615, - "step": 7009, - "time_per_iteration": 2.738678216934204 - }, - { - "auxiliary_loss_clip": 0.01132456, - "auxiliary_loss_mlp": 0.01030814, - "balance_loss_clip": 1.04812074, - "balance_loss_mlp": 1.01674712, - "epoch": 0.4214640012024651, - "flos": 28147071012480.0, - "grad_norm": 2.558025018080716, - "language_loss": 0.66191494, - "learning_rate": 2.596586169335243e-06, - "loss": 0.68354768, - "num_input_tokens_seen": 150445765, - "step": 7010, - "time_per_iteration": 2.6812071800231934 - }, - { - "auxiliary_loss_clip": 0.01092262, - "auxiliary_loss_mlp": 0.01032367, - "balance_loss_clip": 1.0424943, - "balance_loss_mlp": 1.01774001, - "epoch": 0.42152412445513304, - "flos": 22997265177600.0, - "grad_norm": 2.024875050938184, - "language_loss": 0.72456133, - "learning_rate": 2.5962144268510477e-06, - "loss": 0.74580765, - "num_input_tokens_seen": 150464405, - "step": 7011, - "time_per_iteration": 2.741454601287842 - }, - { - "auxiliary_loss_clip": 0.01046137, - "auxiliary_loss_mlp": 0.01001201, - "balance_loss_clip": 1.02718639, - "balance_loss_mlp": 0.99971068, - "epoch": 0.421584247707801, - "flos": 63749592938880.0, - "grad_norm": 0.7906258228604641, - "language_loss": 0.54322207, - "learning_rate": 2.5958426617583417e-06, - "loss": 0.56369549, - "num_input_tokens_seen": 150520430, - "step": 7012, - "time_per_iteration": 3.1284689903259277 - }, - { - "auxiliary_loss_clip": 0.01123004, - "auxiliary_loss_mlp": 0.01031037, - "balance_loss_clip": 1.05000162, - "balance_loss_mlp": 1.01663089, - "epoch": 0.421644370960469, - "flos": 24314612474880.0, - "grad_norm": 1.3828895368097467, - "language_loss": 0.78401852, - "learning_rate": 2.5954708740712215e-06, - "loss": 0.80555892, - "num_input_tokens_seen": 150542610, - "step": 7013, - "time_per_iteration": 2.6729819774627686 - }, - { - "auxiliary_loss_clip": 0.01133162, - "auxiliary_loss_mlp": 0.01033094, - "balance_loss_clip": 1.04858398, - "balance_loss_mlp": 1.01826453, - "epoch": 0.42170449421313694, - "flos": 23440690575360.0, - "grad_norm": 1.8094728177732207, - "language_loss": 0.81603825, - "learning_rate": 2.595099063803787e-06, - "loss": 0.83770084, - "num_input_tokens_seen": 150560970, - "step": 7014, - "time_per_iteration": 2.662652015686035 - }, - { - "auxiliary_loss_clip": 0.01117627, - "auxiliary_loss_mlp": 0.0103256, - "balance_loss_clip": 1.04452634, - "balance_loss_mlp": 1.01831448, - "epoch": 0.4217646174658049, - "flos": 23695476721920.0, - "grad_norm": 1.7861369926261594, - "language_loss": 0.77908784, - "learning_rate": 2.5947272309701354e-06, - "loss": 0.80058968, - "num_input_tokens_seen": 150582615, - "step": 7015, - "time_per_iteration": 2.763761043548584 - }, - { - "auxiliary_loss_clip": 0.01132815, - "auxiliary_loss_mlp": 0.01036697, - "balance_loss_clip": 1.04966104, - "balance_loss_mlp": 1.02183151, - "epoch": 0.42182474071847287, - "flos": 24971562270720.0, - "grad_norm": 1.3268186837565954, - "language_loss": 0.82412994, - "learning_rate": 2.594355375584368e-06, - "loss": 0.84582508, - "num_input_tokens_seen": 150603640, - "step": 7016, - "time_per_iteration": 2.771812677383423 - }, - { - "auxiliary_loss_clip": 0.01091213, - "auxiliary_loss_mlp": 0.0103466, - "balance_loss_clip": 1.04072332, - "balance_loss_mlp": 1.01999736, - "epoch": 0.42188486397114083, - "flos": 22856639431680.0, - "grad_norm": 1.813350419138722, - "language_loss": 0.68270308, - "learning_rate": 2.593983497660586e-06, - "loss": 0.70396179, - "num_input_tokens_seen": 150622490, - "step": 7017, - "time_per_iteration": 2.703078508377075 - }, - { - "auxiliary_loss_clip": 0.01045206, - "auxiliary_loss_mlp": 0.01012048, - "balance_loss_clip": 1.02663231, - "balance_loss_mlp": 1.01053989, - "epoch": 0.4219449872238088, - "flos": 66975700965120.0, - "grad_norm": 0.7659311952437052, - "language_loss": 0.59381223, - "learning_rate": 2.5936115972128895e-06, - "loss": 0.61438477, - "num_input_tokens_seen": 150689545, - "step": 7018, - "time_per_iteration": 3.2514843940734863 - }, - { - "auxiliary_loss_clip": 0.01113322, - "auxiliary_loss_mlp": 0.01033039, - "balance_loss_clip": 1.04147184, - "balance_loss_mlp": 1.01840591, - "epoch": 0.42200511047647676, - "flos": 13115367745920.0, - "grad_norm": 2.3056993234384957, - "language_loss": 0.75083554, - "learning_rate": 2.593239674255382e-06, - "loss": 0.77229911, - "num_input_tokens_seen": 150707610, - "step": 7019, - "time_per_iteration": 2.6845014095306396 - }, - { - "auxiliary_loss_clip": 0.01106969, - "auxiliary_loss_mlp": 0.01035543, - "balance_loss_clip": 1.04650903, - "balance_loss_mlp": 1.02023685, - "epoch": 0.42206523372914473, - "flos": 13991193066240.0, - "grad_norm": 1.8835929197669175, - "language_loss": 0.69198954, - "learning_rate": 2.592867728802166e-06, - "loss": 0.71341467, - "num_input_tokens_seen": 150724530, - "step": 7020, - "time_per_iteration": 2.635646343231201 - }, - { - "auxiliary_loss_clip": 0.01107351, - "auxiliary_loss_mlp": 0.00771638, - "balance_loss_clip": 1.04847479, - "balance_loss_mlp": 1.00088549, - "epoch": 0.4221253569818127, - "flos": 21942317710080.0, - "grad_norm": 3.182010152232146, - "language_loss": 0.81085485, - "learning_rate": 2.592495760867347e-06, - "loss": 0.82964474, - "num_input_tokens_seen": 150742870, - "step": 7021, - "time_per_iteration": 2.712358236312866 - }, - { - "auxiliary_loss_clip": 0.0105744, - "auxiliary_loss_mlp": 0.01040763, - "balance_loss_clip": 1.03628528, - "balance_loss_mlp": 1.02439523, - "epoch": 0.42218548023448066, - "flos": 32192587071360.0, - "grad_norm": 1.7516152237568758, - "language_loss": 0.70298421, - "learning_rate": 2.5921237704650293e-06, - "loss": 0.72396624, - "num_input_tokens_seen": 150765500, - "step": 7022, - "time_per_iteration": 2.9338343143463135 - }, - { - "auxiliary_loss_clip": 0.01114774, - "auxiliary_loss_mlp": 0.01028964, - "balance_loss_clip": 1.0467478, - "balance_loss_mlp": 1.01637506, - "epoch": 0.4222456034871487, - "flos": 30118961894400.0, - "grad_norm": 1.5162864908148717, - "language_loss": 0.67418218, - "learning_rate": 2.5917517576093188e-06, - "loss": 0.69561946, - "num_input_tokens_seen": 150784945, - "step": 7023, - "time_per_iteration": 2.7014782428741455 - }, - { - "auxiliary_loss_clip": 0.01101297, - "auxiliary_loss_mlp": 0.01042754, - "balance_loss_clip": 1.0460372, - "balance_loss_mlp": 1.0259577, - "epoch": 0.42230572673981664, - "flos": 22127904305280.0, - "grad_norm": 1.6579428625462107, - "language_loss": 0.69768953, - "learning_rate": 2.591379722314322e-06, - "loss": 0.71913004, - "num_input_tokens_seen": 150803120, - "step": 7024, - "time_per_iteration": 2.8669025897979736 - }, - { - "auxiliary_loss_clip": 0.011321, - "auxiliary_loss_mlp": 0.01035188, - "balance_loss_clip": 1.04982734, - "balance_loss_mlp": 1.02107334, - "epoch": 0.4223658499924846, - "flos": 22055077480320.0, - "grad_norm": 1.7199232023790467, - "language_loss": 0.76781225, - "learning_rate": 2.591007664594147e-06, - "loss": 0.7894851, - "num_input_tokens_seen": 150823135, - "step": 7025, - "time_per_iteration": 2.696200132369995 - }, - { - "auxiliary_loss_clip": 0.01097355, - "auxiliary_loss_mlp": 0.01036622, - "balance_loss_clip": 1.04367328, - "balance_loss_mlp": 1.02268052, - "epoch": 0.4224259732451526, - "flos": 20410727742720.0, - "grad_norm": 1.6766870979897237, - "language_loss": 0.79664457, - "learning_rate": 2.5906355844629024e-06, - "loss": 0.81798434, - "num_input_tokens_seen": 150842070, - "step": 7026, - "time_per_iteration": 2.7131056785583496 - }, - { - "auxiliary_loss_clip": 0.01053, - "auxiliary_loss_mlp": 0.00999983, - "balance_loss_clip": 1.02519512, - "balance_loss_mlp": 0.9985466, - "epoch": 0.42248609649782054, - "flos": 62846655828480.0, - "grad_norm": 0.7210787168966012, - "language_loss": 0.61874068, - "learning_rate": 2.5902634819346966e-06, - "loss": 0.63927048, - "num_input_tokens_seen": 150907450, - "step": 7027, - "time_per_iteration": 3.2111167907714844 - }, - { - "auxiliary_loss_clip": 0.01131577, - "auxiliary_loss_mlp": 0.01038162, - "balance_loss_clip": 1.05022967, - "balance_loss_mlp": 1.02400613, - "epoch": 0.4225462197504885, - "flos": 26249946289920.0, - "grad_norm": 1.8872379728212205, - "language_loss": 0.71137869, - "learning_rate": 2.5898913570236414e-06, - "loss": 0.7330761, - "num_input_tokens_seen": 150928040, - "step": 7028, - "time_per_iteration": 4.185323476791382 - }, - { - "auxiliary_loss_clip": 0.01109127, - "auxiliary_loss_mlp": 0.01041278, - "balance_loss_clip": 1.04935491, - "balance_loss_mlp": 1.02702022, - "epoch": 0.42260634300315647, - "flos": 20521943228160.0, - "grad_norm": 3.7456767842675136, - "language_loss": 0.82652044, - "learning_rate": 2.589519209743846e-06, - "loss": 0.84802449, - "num_input_tokens_seen": 150945760, - "step": 7029, - "time_per_iteration": 2.617464542388916 - }, - { - "auxiliary_loss_clip": 0.01086316, - "auxiliary_loss_mlp": 0.01043345, - "balance_loss_clip": 1.04393244, - "balance_loss_mlp": 1.02826512, - "epoch": 0.42266646625582444, - "flos": 24316731377280.0, - "grad_norm": 1.852504104659585, - "language_loss": 0.75125468, - "learning_rate": 2.589147040109424e-06, - "loss": 0.7725513, - "num_input_tokens_seen": 150965665, - "step": 7030, - "time_per_iteration": 5.787954807281494 - }, - { - "auxiliary_loss_clip": 0.01129772, - "auxiliary_loss_mlp": 0.01039193, - "balance_loss_clip": 1.04772067, - "balance_loss_mlp": 1.02368367, - "epoch": 0.4227265895084924, - "flos": 24204151175040.0, - "grad_norm": 1.9107182577124318, - "language_loss": 0.86337131, - "learning_rate": 2.588774848134486e-06, - "loss": 0.88506097, - "num_input_tokens_seen": 150982260, - "step": 7031, - "time_per_iteration": 2.622174024581909 - }, - { - "auxiliary_loss_clip": 0.01120469, - "auxiliary_loss_mlp": 0.01038756, - "balance_loss_clip": 1.04873753, - "balance_loss_mlp": 1.0234381, - "epoch": 0.42278671276116037, - "flos": 16909760845440.0, - "grad_norm": 1.9974648735142886, - "language_loss": 0.73489487, - "learning_rate": 2.5884026338331473e-06, - "loss": 0.75648719, - "num_input_tokens_seen": 150999990, - "step": 7032, - "time_per_iteration": 2.681155204772949 - }, - { - "auxiliary_loss_clip": 0.01100841, - "auxiliary_loss_mlp": 0.01044575, - "balance_loss_clip": 1.04449272, - "balance_loss_mlp": 1.029531, - "epoch": 0.42284683601382833, - "flos": 25411073086080.0, - "grad_norm": 1.657781585480679, - "language_loss": 0.70232797, - "learning_rate": 2.5880303972195222e-06, - "loss": 0.72378218, - "num_input_tokens_seen": 151021105, - "step": 7033, - "time_per_iteration": 4.264399290084839 - }, - { - "auxiliary_loss_clip": 0.01105188, - "auxiliary_loss_mlp": 0.00773118, - "balance_loss_clip": 1.04417682, - "balance_loss_mlp": 1.00101566, - "epoch": 0.4229069592664963, - "flos": 23040322606080.0, - "grad_norm": 2.084860036541982, - "language_loss": 0.90209413, - "learning_rate": 2.5876581383077256e-06, - "loss": 0.92087722, - "num_input_tokens_seen": 151040665, - "step": 7034, - "time_per_iteration": 2.6903390884399414 - }, - { - "auxiliary_loss_clip": 0.01107447, - "auxiliary_loss_mlp": 0.01038024, - "balance_loss_clip": 1.04703283, - "balance_loss_mlp": 1.02456498, - "epoch": 0.42296708251916426, - "flos": 26067448264320.0, - "grad_norm": 1.854470548564886, - "language_loss": 0.77645576, - "learning_rate": 2.5872858571118723e-06, - "loss": 0.79791045, - "num_input_tokens_seen": 151061240, - "step": 7035, - "time_per_iteration": 2.839463233947754 - }, - { - "auxiliary_loss_clip": 0.01118463, - "auxiliary_loss_mlp": 0.01043438, - "balance_loss_clip": 1.04904413, - "balance_loss_mlp": 1.02879918, - "epoch": 0.4230272057718323, - "flos": 19458376496640.0, - "grad_norm": 1.8047665428966375, - "language_loss": 0.82544887, - "learning_rate": 2.5869135536460817e-06, - "loss": 0.84706789, - "num_input_tokens_seen": 151076870, - "step": 7036, - "time_per_iteration": 2.7344322204589844 - }, - { - "auxiliary_loss_clip": 0.01105244, - "auxiliary_loss_mlp": 0.01037982, - "balance_loss_clip": 1.04819334, - "balance_loss_mlp": 1.02430892, - "epoch": 0.42308732902450025, - "flos": 22383300983040.0, - "grad_norm": 1.7884357315749977, - "language_loss": 0.70379841, - "learning_rate": 2.58654122792447e-06, - "loss": 0.72523069, - "num_input_tokens_seen": 151095110, - "step": 7037, - "time_per_iteration": 2.7701706886291504 - }, - { - "auxiliary_loss_clip": 0.01088589, - "auxiliary_loss_mlp": 0.00773432, - "balance_loss_clip": 1.04192328, - "balance_loss_mlp": 1.00089622, - "epoch": 0.4231474522771682, - "flos": 20995425331200.0, - "grad_norm": 1.6174527275157642, - "language_loss": 0.78031301, - "learning_rate": 2.586168879961155e-06, - "loss": 0.79893327, - "num_input_tokens_seen": 151114355, - "step": 7038, - "time_per_iteration": 2.7142980098724365 - }, - { - "auxiliary_loss_clip": 0.01093843, - "auxiliary_loss_mlp": 0.01045553, - "balance_loss_clip": 1.04870033, - "balance_loss_mlp": 1.02938843, - "epoch": 0.4232075755298362, - "flos": 14975863574400.0, - "grad_norm": 2.472987059089125, - "language_loss": 0.67238259, - "learning_rate": 2.585796509770259e-06, - "loss": 0.69377655, - "num_input_tokens_seen": 151131505, - "step": 7039, - "time_per_iteration": 2.723700761795044 - }, - { - "auxiliary_loss_clip": 0.01126742, - "auxiliary_loss_mlp": 0.0103978, - "balance_loss_clip": 1.04828668, - "balance_loss_mlp": 1.02421153, - "epoch": 0.42326769878250414, - "flos": 24532661986560.0, - "grad_norm": 2.3861719735257627, - "language_loss": 0.75643921, - "learning_rate": 2.5854241173658996e-06, - "loss": 0.77810442, - "num_input_tokens_seen": 151151555, - "step": 7040, - "time_per_iteration": 2.6909239292144775 - }, - { - "auxiliary_loss_clip": 0.01120351, - "auxiliary_loss_mlp": 0.01033687, - "balance_loss_clip": 1.04682565, - "balance_loss_mlp": 1.01907206, - "epoch": 0.4233278220351721, - "flos": 26870303105280.0, - "grad_norm": 1.612614450493485, - "language_loss": 0.6520682, - "learning_rate": 2.5850517027621996e-06, - "loss": 0.67360854, - "num_input_tokens_seen": 151172385, - "step": 7041, - "time_per_iteration": 2.705819845199585 - }, - { - "auxiliary_loss_clip": 0.01105037, - "auxiliary_loss_mlp": 0.01044866, - "balance_loss_clip": 1.04526758, - "balance_loss_mlp": 1.02961886, - "epoch": 0.4233879452878401, - "flos": 42814927463040.0, - "grad_norm": 1.8077043446733942, - "language_loss": 0.74725586, - "learning_rate": 2.5846792659732803e-06, - "loss": 0.76875484, - "num_input_tokens_seen": 151194930, - "step": 7042, - "time_per_iteration": 2.8701279163360596 - }, - { - "auxiliary_loss_clip": 0.01118432, - "auxiliary_loss_mlp": 0.01041709, - "balance_loss_clip": 1.04900146, - "balance_loss_mlp": 1.02783322, - "epoch": 0.42344806854050804, - "flos": 25229006023680.0, - "grad_norm": 1.5999390710673906, - "language_loss": 0.82543206, - "learning_rate": 2.5843068070132643e-06, - "loss": 0.84703344, - "num_input_tokens_seen": 151217905, - "step": 7043, - "time_per_iteration": 2.7351741790771484 - }, - { - "auxiliary_loss_clip": 0.01110906, - "auxiliary_loss_mlp": 0.01054459, - "balance_loss_clip": 1.04981089, - "balance_loss_mlp": 1.0383476, - "epoch": 0.423508191793176, - "flos": 22778820616320.0, - "grad_norm": 4.941461848597107, - "language_loss": 0.64840907, - "learning_rate": 2.5839343258962763e-06, - "loss": 0.67006272, - "num_input_tokens_seen": 151234580, - "step": 7044, - "time_per_iteration": 2.729717969894409 - }, - { - "auxiliary_loss_clip": 0.01118394, - "auxiliary_loss_mlp": 0.01056481, - "balance_loss_clip": 1.04780793, - "balance_loss_mlp": 1.04023242, - "epoch": 0.42356831504584397, - "flos": 34637493179520.0, - "grad_norm": 4.901784512002612, - "language_loss": 0.75249708, - "learning_rate": 2.5835618226364393e-06, - "loss": 0.77424586, - "num_input_tokens_seen": 151254765, - "step": 7045, - "time_per_iteration": 2.768423557281494 - }, - { - "auxiliary_loss_clip": 0.0109684, - "auxiliary_loss_mlp": 0.0105935, - "balance_loss_clip": 1.04820228, - "balance_loss_mlp": 1.04277968, - "epoch": 0.42362843829851193, - "flos": 17596767346560.0, - "grad_norm": 2.3365752409002027, - "language_loss": 0.80862033, - "learning_rate": 2.5831892972478797e-06, - "loss": 0.83018219, - "num_input_tokens_seen": 151269045, - "step": 7046, - "time_per_iteration": 2.778648614883423 - }, - { - "auxiliary_loss_clip": 0.01050075, - "auxiliary_loss_mlp": 0.01043729, - "balance_loss_clip": 1.04536414, - "balance_loss_mlp": 1.02847028, - "epoch": 0.4236885615511799, - "flos": 22565691267840.0, - "grad_norm": 1.629581050390514, - "language_loss": 0.76806176, - "learning_rate": 2.5828167497447242e-06, - "loss": 0.78899974, - "num_input_tokens_seen": 151287530, - "step": 7047, - "time_per_iteration": 2.957385301589966 - }, - { - "auxiliary_loss_clip": 0.01132762, - "auxiliary_loss_mlp": 0.01044489, - "balance_loss_clip": 1.05149937, - "balance_loss_mlp": 1.03061271, - "epoch": 0.42374868480384786, - "flos": 26469216864000.0, - "grad_norm": 2.0123660706562294, - "language_loss": 0.68135488, - "learning_rate": 2.582444180141098e-06, - "loss": 0.70312738, - "num_input_tokens_seen": 151308905, - "step": 7048, - "time_per_iteration": 2.976609468460083 - }, - { - "auxiliary_loss_clip": 0.01119986, - "auxiliary_loss_mlp": 0.0104419, - "balance_loss_clip": 1.04684722, - "balance_loss_mlp": 1.02822733, - "epoch": 0.4238088080565159, - "flos": 20370220179840.0, - "grad_norm": 1.9442365727521234, - "language_loss": 0.78292572, - "learning_rate": 2.5820715884511307e-06, - "loss": 0.80456746, - "num_input_tokens_seen": 151326525, - "step": 7049, - "time_per_iteration": 2.7592408657073975 - }, - { - "auxiliary_loss_clip": 0.01128638, - "auxiliary_loss_mlp": 0.0105084, - "balance_loss_clip": 1.05336547, - "balance_loss_mlp": 1.03632045, - "epoch": 0.42386893130918385, - "flos": 21172105353600.0, - "grad_norm": 1.9473547987347861, - "language_loss": 0.82839847, - "learning_rate": 2.5816989746889504e-06, - "loss": 0.85019326, - "num_input_tokens_seen": 151344675, - "step": 7050, - "time_per_iteration": 2.70487117767334 - }, - { - "auxiliary_loss_clip": 0.01132896, - "auxiliary_loss_mlp": 0.01042146, - "balance_loss_clip": 1.04812455, - "balance_loss_mlp": 1.02791238, - "epoch": 0.4239290545618518, - "flos": 17675627656320.0, - "grad_norm": 2.6140682586064754, - "language_loss": 0.73742986, - "learning_rate": 2.581326338868687e-06, - "loss": 0.75918031, - "num_input_tokens_seen": 151360730, - "step": 7051, - "time_per_iteration": 2.6406943798065186 - }, - { - "auxiliary_loss_clip": 0.01103657, - "auxiliary_loss_mlp": 0.0104179, - "balance_loss_clip": 1.05070043, - "balance_loss_mlp": 1.02773547, - "epoch": 0.4239891778145198, - "flos": 24314504734080.0, - "grad_norm": 1.6610077810318091, - "language_loss": 0.86273873, - "learning_rate": 2.5809536810044706e-06, - "loss": 0.88419318, - "num_input_tokens_seen": 151380445, - "step": 7052, - "time_per_iteration": 2.7759416103363037 - }, - { - "auxiliary_loss_clip": 0.01106373, - "auxiliary_loss_mlp": 0.01058935, - "balance_loss_clip": 1.04475808, - "balance_loss_mlp": 1.04325902, - "epoch": 0.42404930106718774, - "flos": 20558428467840.0, - "grad_norm": 2.094212061505075, - "language_loss": 0.72460884, - "learning_rate": 2.5805810011104323e-06, - "loss": 0.74626195, - "num_input_tokens_seen": 151399325, - "step": 7053, - "time_per_iteration": 2.6969964504241943 - }, - { - "auxiliary_loss_clip": 0.0110264, - "auxiliary_loss_mlp": 0.00773448, - "balance_loss_clip": 1.05001807, - "balance_loss_mlp": 1.00098944, - "epoch": 0.4241094243198557, - "flos": 22308067946880.0, - "grad_norm": 7.333766574531878, - "language_loss": 0.82380986, - "learning_rate": 2.580208299200704e-06, - "loss": 0.84257072, - "num_input_tokens_seen": 151417240, - "step": 7054, - "time_per_iteration": 2.71956205368042 - }, - { - "auxiliary_loss_clip": 0.01052303, - "auxiliary_loss_mlp": 0.01036407, - "balance_loss_clip": 1.03336191, - "balance_loss_mlp": 1.03490484, - "epoch": 0.4241695475725237, - "flos": 70612445272320.0, - "grad_norm": 0.7897337987883358, - "language_loss": 0.60378659, - "learning_rate": 2.5798355752894183e-06, - "loss": 0.62467366, - "num_input_tokens_seen": 151476015, - "step": 7055, - "time_per_iteration": 3.155177116394043 - }, - { - "auxiliary_loss_clip": 0.01136773, - "auxiliary_loss_mlp": 0.01045155, - "balance_loss_clip": 1.05100691, - "balance_loss_mlp": 1.0298965, - "epoch": 0.42422967082519164, - "flos": 14027462824320.0, - "grad_norm": 2.6219010938669998, - "language_loss": 0.7752226, - "learning_rate": 2.5794628293907107e-06, - "loss": 0.79704189, - "num_input_tokens_seen": 151492035, - "step": 7056, - "time_per_iteration": 2.5975699424743652 - }, - { - "auxiliary_loss_clip": 0.01129986, - "auxiliary_loss_mlp": 0.01042696, - "balance_loss_clip": 1.05187988, - "balance_loss_mlp": 1.02583957, - "epoch": 0.4242897940778596, - "flos": 22345522853760.0, - "grad_norm": 2.481094371553488, - "language_loss": 0.8406778, - "learning_rate": 2.579090061518714e-06, - "loss": 0.86240464, - "num_input_tokens_seen": 151508970, - "step": 7057, - "time_per_iteration": 2.690188407897949 - }, - { - "auxiliary_loss_clip": 0.01095967, - "auxiliary_loss_mlp": 0.01043613, - "balance_loss_clip": 1.04596114, - "balance_loss_mlp": 1.02778184, - "epoch": 0.42434991733052757, - "flos": 22595855713920.0, - "grad_norm": 2.565187046091263, - "language_loss": 0.83179426, - "learning_rate": 2.5787172716875642e-06, - "loss": 0.85319012, - "num_input_tokens_seen": 151525295, - "step": 7058, - "time_per_iteration": 2.9978904724121094 - }, - { - "auxiliary_loss_clip": 0.01107732, - "auxiliary_loss_mlp": 0.0077171, - "balance_loss_clip": 1.04935992, - "balance_loss_mlp": 1.000875, - "epoch": 0.42441004058319554, - "flos": 20011437181440.0, - "grad_norm": 1.910708490679684, - "language_loss": 0.80493343, - "learning_rate": 2.5783444599113973e-06, - "loss": 0.82372791, - "num_input_tokens_seen": 151544435, - "step": 7059, - "time_per_iteration": 2.7227041721343994 - }, - { - "auxiliary_loss_clip": 0.01137284, - "auxiliary_loss_mlp": 0.01041284, - "balance_loss_clip": 1.05036783, - "balance_loss_mlp": 1.02469015, - "epoch": 0.4244701638358635, - "flos": 11144985235200.0, - "grad_norm": 2.371195034517477, - "language_loss": 0.70500332, - "learning_rate": 2.57797162620435e-06, - "loss": 0.726789, - "num_input_tokens_seen": 151559520, - "step": 7060, - "time_per_iteration": 2.6058552265167236 - }, - { - "auxiliary_loss_clip": 0.01128623, - "auxiliary_loss_mlp": 0.01038609, - "balance_loss_clip": 1.05295658, - "balance_loss_mlp": 1.02370787, - "epoch": 0.42453028708853147, - "flos": 23987753688960.0, - "grad_norm": 1.575928079295092, - "language_loss": 0.7634182, - "learning_rate": 2.577598770580562e-06, - "loss": 0.78509057, - "num_input_tokens_seen": 151579790, - "step": 7061, - "time_per_iteration": 2.6592459678649902 - }, - { - "auxiliary_loss_clip": 0.01127164, - "auxiliary_loss_mlp": 0.01039243, - "balance_loss_clip": 1.05133295, - "balance_loss_mlp": 1.02308464, - "epoch": 0.42459041034119943, - "flos": 18406338030720.0, - "grad_norm": 2.3470563522902195, - "language_loss": 0.73278493, - "learning_rate": 2.5772258930541693e-06, - "loss": 0.75444901, - "num_input_tokens_seen": 151598285, - "step": 7062, - "time_per_iteration": 2.5925838947296143 - }, - { - "auxiliary_loss_clip": 0.01110528, - "auxiliary_loss_mlp": 0.01044189, - "balance_loss_clip": 1.05038309, - "balance_loss_mlp": 1.02934098, - "epoch": 0.42465053359386745, - "flos": 20958006337920.0, - "grad_norm": 1.735369540351847, - "language_loss": 0.66238403, - "learning_rate": 2.5768529936393137e-06, - "loss": 0.68393123, - "num_input_tokens_seen": 151615430, - "step": 7063, - "time_per_iteration": 2.618459939956665 - }, - { - "auxiliary_loss_clip": 0.0109746, - "auxiliary_loss_mlp": 0.00773106, - "balance_loss_clip": 1.04320812, - "balance_loss_mlp": 1.0009284, - "epoch": 0.4247106568465354, - "flos": 33106190520960.0, - "grad_norm": 1.673900676033667, - "language_loss": 0.78570068, - "learning_rate": 2.5764800723501354e-06, - "loss": 0.80440634, - "num_input_tokens_seen": 151637030, - "step": 7064, - "time_per_iteration": 2.7396399974823 - }, - { - "auxiliary_loss_clip": 0.0113726, - "auxiliary_loss_mlp": 0.01038466, - "balance_loss_clip": 1.05053115, - "balance_loss_mlp": 1.02317119, - "epoch": 0.4247707800992034, - "flos": 20046916840320.0, - "grad_norm": 1.9847642008914126, - "language_loss": 0.75471151, - "learning_rate": 2.5761071292007736e-06, - "loss": 0.77646875, - "num_input_tokens_seen": 151655745, - "step": 7065, - "time_per_iteration": 2.532046318054199 - }, - { - "auxiliary_loss_clip": 0.01124888, - "auxiliary_loss_mlp": 0.01038463, - "balance_loss_clip": 1.05094182, - "balance_loss_mlp": 1.02257848, - "epoch": 0.42483090335187135, - "flos": 22385132576640.0, - "grad_norm": 1.3355357629490912, - "language_loss": 0.72402596, - "learning_rate": 2.5757341642053725e-06, - "loss": 0.74565947, - "num_input_tokens_seen": 151678040, - "step": 7066, - "time_per_iteration": 2.5829319953918457 - }, - { - "auxiliary_loss_clip": 0.01101493, - "auxiliary_loss_mlp": 0.01036883, - "balance_loss_clip": 1.04836977, - "balance_loss_mlp": 1.02044368, - "epoch": 0.4248910266045393, - "flos": 21356830022400.0, - "grad_norm": 2.4907013500628166, - "language_loss": 0.80009657, - "learning_rate": 2.5753611773780745e-06, - "loss": 0.82148039, - "num_input_tokens_seen": 151696410, - "step": 7067, - "time_per_iteration": 2.6051836013793945 - }, - { - "auxiliary_loss_clip": 0.01053553, - "auxiliary_loss_mlp": 0.01005501, - "balance_loss_clip": 1.02524805, - "balance_loss_mlp": 1.00387979, - "epoch": 0.4249511498572073, - "flos": 64008114099840.0, - "grad_norm": 0.9135939410418532, - "language_loss": 0.6341064, - "learning_rate": 2.574988168733022e-06, - "loss": 0.65469694, - "num_input_tokens_seen": 151756365, - "step": 7068, - "time_per_iteration": 4.699309825897217 - }, - { - "auxiliary_loss_clip": 0.0113454, - "auxiliary_loss_mlp": 0.01036767, - "balance_loss_clip": 1.04911804, - "balance_loss_mlp": 1.02070904, - "epoch": 0.42501127310987524, - "flos": 19607046888960.0, - "grad_norm": 1.9072894618048717, - "language_loss": 0.72502887, - "learning_rate": 2.574615138284361e-06, - "loss": 0.74674189, - "num_input_tokens_seen": 151775165, - "step": 7069, - "time_per_iteration": 5.814046382904053 - }, - { - "auxiliary_loss_clip": 0.01136556, - "auxiliary_loss_mlp": 0.01039486, - "balance_loss_clip": 1.05074239, - "balance_loss_mlp": 1.02286839, - "epoch": 0.4250713963625432, - "flos": 19462326992640.0, - "grad_norm": 2.348420544652142, - "language_loss": 0.79105788, - "learning_rate": 2.5742420860462364e-06, - "loss": 0.81281829, - "num_input_tokens_seen": 151792620, - "step": 7070, - "time_per_iteration": 2.6242294311523438 - }, - { - "auxiliary_loss_clip": 0.0112233, - "auxiliary_loss_mlp": 0.01033288, - "balance_loss_clip": 1.04764843, - "balance_loss_mlp": 1.01816082, - "epoch": 0.4251315196152112, - "flos": 25337707557120.0, - "grad_norm": 1.7541837021075046, - "language_loss": 0.70184052, - "learning_rate": 2.573869012032795e-06, - "loss": 0.72339666, - "num_input_tokens_seen": 151812850, - "step": 7071, - "time_per_iteration": 2.6695022583007812 - }, - { - "auxiliary_loss_clip": 0.01134965, - "auxiliary_loss_mlp": 0.01034152, - "balance_loss_clip": 1.05002129, - "balance_loss_mlp": 1.0191201, - "epoch": 0.42519164286787914, - "flos": 26359186527360.0, - "grad_norm": 2.353956848857114, - "language_loss": 0.71210682, - "learning_rate": 2.5734959162581824e-06, - "loss": 0.73379803, - "num_input_tokens_seen": 151831785, - "step": 7072, - "time_per_iteration": 2.654045581817627 - }, - { - "auxiliary_loss_clip": 0.01090703, - "auxiliary_loss_mlp": 0.01042672, - "balance_loss_clip": 1.04456139, - "balance_loss_mlp": 1.02779484, - "epoch": 0.4252517661205471, - "flos": 26031070765440.0, - "grad_norm": 1.5509538260814284, - "language_loss": 0.81704801, - "learning_rate": 2.5731227987365475e-06, - "loss": 0.83838177, - "num_input_tokens_seen": 151853885, - "step": 7073, - "time_per_iteration": 4.4267754554748535 - }, - { - "auxiliary_loss_clip": 0.01117821, - "auxiliary_loss_mlp": 0.01035489, - "balance_loss_clip": 1.04660416, - "balance_loss_mlp": 1.02130294, - "epoch": 0.42531188937321507, - "flos": 12713635059840.0, - "grad_norm": 2.6569023186466914, - "language_loss": 0.91360795, - "learning_rate": 2.5727496594820386e-06, - "loss": 0.93514109, - "num_input_tokens_seen": 151871780, - "step": 7074, - "time_per_iteration": 2.655850887298584 - }, - { - "auxiliary_loss_clip": 0.01128859, - "auxiliary_loss_mlp": 0.00774468, - "balance_loss_clip": 1.05061221, - "balance_loss_mlp": 1.0009917, - "epoch": 0.42537201262588303, - "flos": 22091670460800.0, - "grad_norm": 1.6066127617392931, - "language_loss": 0.64610291, - "learning_rate": 2.572376498508805e-06, - "loss": 0.66513622, - "num_input_tokens_seen": 151891600, - "step": 7075, - "time_per_iteration": 2.7072041034698486 - }, - { - "auxiliary_loss_clip": 0.01097292, - "auxiliary_loss_mlp": 0.01030165, - "balance_loss_clip": 1.04872322, - "balance_loss_mlp": 1.01664686, - "epoch": 0.42543213587855105, - "flos": 23003119094400.0, - "grad_norm": 1.6801281915446873, - "language_loss": 0.736256, - "learning_rate": 2.5720033158309973e-06, - "loss": 0.75753057, - "num_input_tokens_seen": 151911330, - "step": 7076, - "time_per_iteration": 2.7376084327697754 - }, - { - "auxiliary_loss_clip": 0.01107519, - "auxiliary_loss_mlp": 0.01042827, - "balance_loss_clip": 1.0442965, - "balance_loss_mlp": 1.02684128, - "epoch": 0.425492259131219, - "flos": 25082454533760.0, - "grad_norm": 2.293658429237098, - "language_loss": 0.78658164, - "learning_rate": 2.571630111462766e-06, - "loss": 0.80808508, - "num_input_tokens_seen": 151930355, - "step": 7077, - "time_per_iteration": 2.9069621562957764 - }, - { - "auxiliary_loss_clip": 0.01105315, - "auxiliary_loss_mlp": 0.01032074, - "balance_loss_clip": 1.04497409, - "balance_loss_mlp": 1.01881242, - "epoch": 0.425552382383887, - "flos": 22816850140800.0, - "grad_norm": 1.6369769525688158, - "language_loss": 0.73094088, - "learning_rate": 2.571256885418265e-06, - "loss": 0.75231481, - "num_input_tokens_seen": 151949695, - "step": 7078, - "time_per_iteration": 2.728288173675537 - }, - { - "auxiliary_loss_clip": 0.01104463, - "auxiliary_loss_mlp": 0.01040077, - "balance_loss_clip": 1.04849982, - "balance_loss_mlp": 1.02651131, - "epoch": 0.42561250563655495, - "flos": 13553585671680.0, - "grad_norm": 1.8849915988224846, - "language_loss": 0.79555357, - "learning_rate": 2.5708836377116445e-06, - "loss": 0.81699896, - "num_input_tokens_seen": 151967640, - "step": 7079, - "time_per_iteration": 2.6294121742248535 - }, - { - "auxiliary_loss_clip": 0.01125077, - "auxiliary_loss_mlp": 0.01035166, - "balance_loss_clip": 1.05348229, - "balance_loss_mlp": 1.02171898, - "epoch": 0.4256726288892229, - "flos": 46978303023360.0, - "grad_norm": 1.3719098160070018, - "language_loss": 0.71853465, - "learning_rate": 2.5705103683570592e-06, - "loss": 0.7401371, - "num_input_tokens_seen": 151994020, - "step": 7080, - "time_per_iteration": 2.8506548404693604 - }, - { - "auxiliary_loss_clip": 0.01130776, - "auxiliary_loss_mlp": 0.01033872, - "balance_loss_clip": 1.04765022, - "balance_loss_mlp": 1.02025867, - "epoch": 0.4257327521418909, - "flos": 23586451966080.0, - "grad_norm": 2.0309872529354283, - "language_loss": 0.80102706, - "learning_rate": 2.5701370773686646e-06, - "loss": 0.82267356, - "num_input_tokens_seen": 152013415, - "step": 7081, - "time_per_iteration": 2.698814868927002 - }, - { - "auxiliary_loss_clip": 0.01100197, - "auxiliary_loss_mlp": 0.01034532, - "balance_loss_clip": 1.04303122, - "balance_loss_mlp": 1.02065063, - "epoch": 0.42579287539455885, - "flos": 18989994124800.0, - "grad_norm": 1.6770375884870488, - "language_loss": 0.81524366, - "learning_rate": 2.5697637647606138e-06, - "loss": 0.83659089, - "num_input_tokens_seen": 152030860, - "step": 7082, - "time_per_iteration": 2.6388967037200928 - }, - { - "auxiliary_loss_clip": 0.01122609, - "auxiliary_loss_mlp": 0.01038264, - "balance_loss_clip": 1.05003822, - "balance_loss_mlp": 1.02411938, - "epoch": 0.4258529986472268, - "flos": 25191910252800.0, - "grad_norm": 2.777460036178925, - "language_loss": 0.70476681, - "learning_rate": 2.569390430547065e-06, - "loss": 0.72637558, - "num_input_tokens_seen": 152050395, - "step": 7083, - "time_per_iteration": 2.666609048843384 - }, - { - "auxiliary_loss_clip": 0.01045638, - "auxiliary_loss_mlp": 0.0101356, - "balance_loss_clip": 1.02604496, - "balance_loss_mlp": 1.01191545, - "epoch": 0.4259131218998948, - "flos": 69968280718080.0, - "grad_norm": 0.8664420799088798, - "language_loss": 0.6701948, - "learning_rate": 2.569017074742173e-06, - "loss": 0.69078678, - "num_input_tokens_seen": 152113555, - "step": 7084, - "time_per_iteration": 3.25407075881958 - }, - { - "auxiliary_loss_clip": 0.01120239, - "auxiliary_loss_mlp": 0.01042728, - "balance_loss_clip": 1.04841447, - "balance_loss_mlp": 1.02757668, - "epoch": 0.42597324515256274, - "flos": 18004964480640.0, - "grad_norm": 2.05020327260517, - "language_loss": 0.78917986, - "learning_rate": 2.5686436973600964e-06, - "loss": 0.81080949, - "num_input_tokens_seen": 152131575, - "step": 7085, - "time_per_iteration": 2.6294076442718506 - }, - { - "auxiliary_loss_clip": 0.01123765, - "auxiliary_loss_mlp": 0.01045859, - "balance_loss_clip": 1.05045295, - "balance_loss_mlp": 1.03036761, - "epoch": 0.4260333684052307, - "flos": 15158792563200.0, - "grad_norm": 2.015450242409387, - "language_loss": 0.76097858, - "learning_rate": 2.568270298414995e-06, - "loss": 0.78267479, - "num_input_tokens_seen": 152149435, - "step": 7086, - "time_per_iteration": 2.606201648712158 - }, - { - "auxiliary_loss_clip": 0.01107732, - "auxiliary_loss_mlp": 0.01040875, - "balance_loss_clip": 1.04528451, - "balance_loss_mlp": 1.02682662, - "epoch": 0.42609349165789867, - "flos": 14939342421120.0, - "grad_norm": 4.435400492712099, - "language_loss": 0.80159658, - "learning_rate": 2.5678968779210255e-06, - "loss": 0.82308263, - "num_input_tokens_seen": 152166860, - "step": 7087, - "time_per_iteration": 2.6517395973205566 - }, - { - "auxiliary_loss_clip": 0.01113938, - "auxiliary_loss_mlp": 0.0103375, - "balance_loss_clip": 1.04980528, - "balance_loss_mlp": 1.01878285, - "epoch": 0.42615361491056664, - "flos": 23731961961600.0, - "grad_norm": 1.6700745034234148, - "language_loss": 0.65982199, - "learning_rate": 2.5675234358923505e-06, - "loss": 0.68129885, - "num_input_tokens_seen": 152187475, - "step": 7088, - "time_per_iteration": 2.6658740043640137 - }, - { - "auxiliary_loss_clip": 0.01079891, - "auxiliary_loss_mlp": 0.01038449, - "balance_loss_clip": 1.04348373, - "balance_loss_mlp": 1.02308249, - "epoch": 0.42621373816323466, - "flos": 24936441747840.0, - "grad_norm": 2.4696048983575376, - "language_loss": 0.68491185, - "learning_rate": 2.56714997234313e-06, - "loss": 0.70609522, - "num_input_tokens_seen": 152207235, - "step": 7089, - "time_per_iteration": 2.816352128982544 - }, - { - "auxiliary_loss_clip": 0.01083453, - "auxiliary_loss_mlp": 0.01038038, - "balance_loss_clip": 1.04270887, - "balance_loss_mlp": 1.02359009, - "epoch": 0.4262738614159026, - "flos": 13552975140480.0, - "grad_norm": 2.0671888191777623, - "language_loss": 0.73030579, - "learning_rate": 2.566776487287525e-06, - "loss": 0.75152063, - "num_input_tokens_seen": 152224240, - "step": 7090, - "time_per_iteration": 2.801116704940796 - }, - { - "auxiliary_loss_clip": 0.01114766, - "auxiliary_loss_mlp": 0.0104358, - "balance_loss_clip": 1.0483079, - "balance_loss_mlp": 1.02875018, - "epoch": 0.4263339846685706, - "flos": 29748794284800.0, - "grad_norm": 1.7852421559677654, - "language_loss": 0.75632602, - "learning_rate": 2.5664029807396994e-06, - "loss": 0.77790952, - "num_input_tokens_seen": 152242595, - "step": 7091, - "time_per_iteration": 2.779731273651123 - }, - { - "auxiliary_loss_clip": 0.01081578, - "auxiliary_loss_mlp": 0.01031582, - "balance_loss_clip": 1.04725623, - "balance_loss_mlp": 1.01879716, - "epoch": 0.42639410792123855, - "flos": 16834204586880.0, - "grad_norm": 2.1009795194853567, - "language_loss": 0.82635152, - "learning_rate": 2.5660294527138156e-06, - "loss": 0.84748316, - "num_input_tokens_seen": 152260840, - "step": 7092, - "time_per_iteration": 2.7296979427337646 - }, - { - "auxiliary_loss_clip": 0.01113469, - "auxiliary_loss_mlp": 0.0104261, - "balance_loss_clip": 1.04653692, - "balance_loss_mlp": 1.02812648, - "epoch": 0.4264542311739065, - "flos": 28763118195840.0, - "grad_norm": 1.6936837646094385, - "language_loss": 0.73936713, - "learning_rate": 2.565655903224038e-06, - "loss": 0.76092792, - "num_input_tokens_seen": 152280580, - "step": 7093, - "time_per_iteration": 2.738494634628296 - }, - { - "auxiliary_loss_clip": 0.01124772, - "auxiliary_loss_mlp": 0.01037897, - "balance_loss_clip": 1.05013132, - "balance_loss_mlp": 1.02285314, - "epoch": 0.4265143544265745, - "flos": 24713615727360.0, - "grad_norm": 2.248863473367437, - "language_loss": 0.69831914, - "learning_rate": 2.565282332284532e-06, - "loss": 0.71994585, - "num_input_tokens_seen": 152298455, - "step": 7094, - "time_per_iteration": 2.696377754211426 - }, - { - "auxiliary_loss_clip": 0.01102522, - "auxiliary_loss_mlp": 0.01035266, - "balance_loss_clip": 1.05082488, - "balance_loss_mlp": 1.02069819, - "epoch": 0.42657447767924245, - "flos": 21865971352320.0, - "grad_norm": 1.593904094988334, - "language_loss": 0.8160966, - "learning_rate": 2.564908739909464e-06, - "loss": 0.83747452, - "num_input_tokens_seen": 152316995, - "step": 7095, - "time_per_iteration": 2.7906196117401123 - }, - { - "auxiliary_loss_clip": 0.01135526, - "auxiliary_loss_mlp": 0.01039866, - "balance_loss_clip": 1.05080557, - "balance_loss_mlp": 1.02575183, - "epoch": 0.4266346009319104, - "flos": 21470236237440.0, - "grad_norm": 1.8045956329002426, - "language_loss": 0.80642307, - "learning_rate": 2.5645351261129996e-06, - "loss": 0.82817698, - "num_input_tokens_seen": 152334800, - "step": 7096, - "time_per_iteration": 2.7473361492156982 - }, - { - "auxiliary_loss_clip": 0.01130201, - "auxiliary_loss_mlp": 0.01033007, - "balance_loss_clip": 1.05325663, - "balance_loss_mlp": 1.0182128, - "epoch": 0.4266947241845784, - "flos": 25519379569920.0, - "grad_norm": 2.602963129491376, - "language_loss": 0.64982784, - "learning_rate": 2.5641614909093066e-06, - "loss": 0.67145991, - "num_input_tokens_seen": 152355175, - "step": 7097, - "time_per_iteration": 2.683868408203125 - }, - { - "auxiliary_loss_clip": 0.01103674, - "auxiliary_loss_mlp": 0.01032153, - "balance_loss_clip": 1.04987097, - "balance_loss_mlp": 1.01799679, - "epoch": 0.42675484743724634, - "flos": 26541217676160.0, - "grad_norm": 1.7913732947115202, - "language_loss": 0.74682045, - "learning_rate": 2.5637878343125535e-06, - "loss": 0.76817876, - "num_input_tokens_seen": 152377245, - "step": 7098, - "time_per_iteration": 2.7669501304626465 - }, - { - "auxiliary_loss_clip": 0.0112361, - "auxiliary_loss_mlp": 0.01029914, - "balance_loss_clip": 1.05006361, - "balance_loss_mlp": 1.0165925, - "epoch": 0.4268149706899143, - "flos": 23112718467840.0, - "grad_norm": 1.7242280164199693, - "language_loss": 0.75574845, - "learning_rate": 2.5634141563369086e-06, - "loss": 0.77728367, - "num_input_tokens_seen": 152396985, - "step": 7099, - "time_per_iteration": 2.652024507522583 - }, - { - "auxiliary_loss_clip": 0.01113615, - "auxiliary_loss_mlp": 0.01044502, - "balance_loss_clip": 1.04767907, - "balance_loss_mlp": 1.02964246, - "epoch": 0.4268750939425823, - "flos": 22706532495360.0, - "grad_norm": 2.4499059435945956, - "language_loss": 0.82854998, - "learning_rate": 2.5630404569965432e-06, - "loss": 0.85013109, - "num_input_tokens_seen": 152415590, - "step": 7100, - "time_per_iteration": 2.66955304145813 - }, - { - "auxiliary_loss_clip": 0.01114994, - "auxiliary_loss_mlp": 0.01038973, - "balance_loss_clip": 1.05028403, - "balance_loss_mlp": 1.0246973, - "epoch": 0.42693521719525024, - "flos": 25374875155200.0, - "grad_norm": 1.3265740257801202, - "language_loss": 0.81932402, - "learning_rate": 2.562666736305627e-06, - "loss": 0.8408637, - "num_input_tokens_seen": 152436735, - "step": 7101, - "time_per_iteration": 2.734703540802002 - }, - { - "auxiliary_loss_clip": 0.01139197, - "auxiliary_loss_mlp": 0.01033271, - "balance_loss_clip": 1.0521878, - "balance_loss_mlp": 1.01856041, - "epoch": 0.42699534044791826, - "flos": 18150689957760.0, - "grad_norm": 6.39201802797086, - "language_loss": 0.72548246, - "learning_rate": 2.5622929942783314e-06, - "loss": 0.74720716, - "num_input_tokens_seen": 152455685, - "step": 7102, - "time_per_iteration": 2.6193687915802 - }, - { - "auxiliary_loss_clip": 0.01123058, - "auxiliary_loss_mlp": 0.01031478, - "balance_loss_clip": 1.05015755, - "balance_loss_mlp": 1.01770973, - "epoch": 0.4270554637005862, - "flos": 13698413308800.0, - "grad_norm": 2.0187490499372243, - "language_loss": 0.83425319, - "learning_rate": 2.5619192309288297e-06, - "loss": 0.8557986, - "num_input_tokens_seen": 152473500, - "step": 7103, - "time_per_iteration": 2.6151843070983887 - }, - { - "auxiliary_loss_clip": 0.01108466, - "auxiliary_loss_mlp": 0.01042825, - "balance_loss_clip": 1.04559612, - "balance_loss_mlp": 1.02617157, - "epoch": 0.4271155869532542, - "flos": 17493596507520.0, - "grad_norm": 4.588723714988328, - "language_loss": 0.74312592, - "learning_rate": 2.561545446271294e-06, - "loss": 0.76463884, - "num_input_tokens_seen": 152491320, - "step": 7104, - "time_per_iteration": 2.686087131500244 - }, - { - "auxiliary_loss_clip": 0.01118632, - "auxiliary_loss_mlp": 0.01030826, - "balance_loss_clip": 1.04769945, - "balance_loss_mlp": 1.01652098, - "epoch": 0.42717571020592215, - "flos": 32452293381120.0, - "grad_norm": 3.9751824788265226, - "language_loss": 0.7515536, - "learning_rate": 2.5611716403198987e-06, - "loss": 0.77304816, - "num_input_tokens_seen": 152511970, - "step": 7105, - "time_per_iteration": 2.69466495513916 - }, - { - "auxiliary_loss_clip": 0.01138696, - "auxiliary_loss_mlp": 0.01032922, - "balance_loss_clip": 1.05365109, - "balance_loss_mlp": 1.01949859, - "epoch": 0.4272358334585901, - "flos": 16253062444800.0, - "grad_norm": 1.828100931914864, - "language_loss": 0.77001148, - "learning_rate": 2.560797813088819e-06, - "loss": 0.79172766, - "num_input_tokens_seen": 152530515, - "step": 7106, - "time_per_iteration": 2.7470526695251465 - }, - { - "auxiliary_loss_clip": 0.01113386, - "auxiliary_loss_mlp": 0.01032071, - "balance_loss_clip": 1.05155849, - "balance_loss_mlp": 1.01898193, - "epoch": 0.4272959567112581, - "flos": 24200092938240.0, - "grad_norm": 2.105539726439896, - "language_loss": 0.79606462, - "learning_rate": 2.560423964592229e-06, - "loss": 0.81751919, - "num_input_tokens_seen": 152549295, - "step": 7107, - "time_per_iteration": 4.302187919616699 - }, - { - "auxiliary_loss_clip": 0.01084956, - "auxiliary_loss_mlp": 0.01035225, - "balance_loss_clip": 1.04738021, - "balance_loss_mlp": 1.02138472, - "epoch": 0.42735607996392605, - "flos": 27963495578880.0, - "grad_norm": 1.6344878343023064, - "language_loss": 0.67924458, - "learning_rate": 2.5600500948443075e-06, - "loss": 0.70044637, - "num_input_tokens_seen": 152570725, - "step": 7108, - "time_per_iteration": 6.044403314590454 - }, - { - "auxiliary_loss_clip": 0.01110243, - "auxiliary_loss_mlp": 0.01038292, - "balance_loss_clip": 1.05136764, - "balance_loss_mlp": 1.02539325, - "epoch": 0.427416203216594, - "flos": 20295597674880.0, - "grad_norm": 1.7692691179194058, - "language_loss": 0.71223509, - "learning_rate": 2.5596762038592294e-06, - "loss": 0.73372042, - "num_input_tokens_seen": 152588950, - "step": 7109, - "time_per_iteration": 2.6695122718811035 - }, - { - "auxiliary_loss_clip": 0.01120979, - "auxiliary_loss_mlp": 0.01033902, - "balance_loss_clip": 1.048154, - "balance_loss_mlp": 1.01738, - "epoch": 0.427476326469262, - "flos": 26943955943040.0, - "grad_norm": 2.0357298685431595, - "language_loss": 0.64665484, - "learning_rate": 2.559302291651174e-06, - "loss": 0.66820359, - "num_input_tokens_seen": 152608965, - "step": 7110, - "time_per_iteration": 2.6609907150268555 - }, - { - "auxiliary_loss_clip": 0.01132801, - "auxiliary_loss_mlp": 0.00771481, - "balance_loss_clip": 1.04796886, - "balance_loss_mlp": 1.00075054, - "epoch": 0.42753644972192995, - "flos": 25702847262720.0, - "grad_norm": 6.311104463147988, - "language_loss": 0.76556361, - "learning_rate": 2.5589283582343197e-06, - "loss": 0.7846064, - "num_input_tokens_seen": 152630220, - "step": 7111, - "time_per_iteration": 2.704688310623169 - }, - { - "auxiliary_loss_clip": 0.01111143, - "auxiliary_loss_mlp": 0.01033331, - "balance_loss_clip": 1.05706656, - "balance_loss_mlp": 1.01936615, - "epoch": 0.4275965729745979, - "flos": 18767419499520.0, - "grad_norm": 2.0174435424847084, - "language_loss": 0.72800988, - "learning_rate": 2.558554403622845e-06, - "loss": 0.74945462, - "num_input_tokens_seen": 152648835, - "step": 7112, - "time_per_iteration": 4.39399790763855 - }, - { - "auxiliary_loss_clip": 0.01107213, - "auxiliary_loss_mlp": 0.01037359, - "balance_loss_clip": 1.04838848, - "balance_loss_mlp": 1.02366805, - "epoch": 0.4276566962272659, - "flos": 23764424878080.0, - "grad_norm": 1.714295461522007, - "language_loss": 0.71427524, - "learning_rate": 2.5581804278309323e-06, - "loss": 0.73572093, - "num_input_tokens_seen": 152668375, - "step": 7113, - "time_per_iteration": 2.6834428310394287 - }, - { - "auxiliary_loss_clip": 0.01126637, - "auxiliary_loss_mlp": 0.01040655, - "balance_loss_clip": 1.05207372, - "balance_loss_mlp": 1.02700508, - "epoch": 0.42771681947993384, - "flos": 22492505306880.0, - "grad_norm": 1.6108261365545002, - "language_loss": 0.61758566, - "learning_rate": 2.5578064308727617e-06, - "loss": 0.63925862, - "num_input_tokens_seen": 152689725, - "step": 7114, - "time_per_iteration": 2.7341814041137695 - }, - { - "auxiliary_loss_clip": 0.01131369, - "auxiliary_loss_mlp": 0.01042209, - "balance_loss_clip": 1.05489218, - "balance_loss_mlp": 1.02556777, - "epoch": 0.42777694273260186, - "flos": 25044712318080.0, - "grad_norm": 1.6215320240925026, - "language_loss": 0.649822, - "learning_rate": 2.5574324127625153e-06, - "loss": 0.67155778, - "num_input_tokens_seen": 152709375, - "step": 7115, - "time_per_iteration": 2.6360361576080322 - }, - { - "auxiliary_loss_clip": 0.01110467, - "auxiliary_loss_mlp": 0.01037107, - "balance_loss_clip": 1.04954565, - "balance_loss_mlp": 1.02359438, - "epoch": 0.4278370659852698, - "flos": 18661519226880.0, - "grad_norm": 1.8869093124336491, - "language_loss": 0.74057275, - "learning_rate": 2.5570583735143753e-06, - "loss": 0.76204848, - "num_input_tokens_seen": 152727510, - "step": 7116, - "time_per_iteration": 2.701413869857788 - }, - { - "auxiliary_loss_clip": 0.01105537, - "auxiliary_loss_mlp": 0.01041231, - "balance_loss_clip": 1.04539752, - "balance_loss_mlp": 1.02783155, - "epoch": 0.4278971892379378, - "flos": 27308269635840.0, - "grad_norm": 1.8577367375008744, - "language_loss": 0.69426787, - "learning_rate": 2.5566843131425275e-06, - "loss": 0.71573555, - "num_input_tokens_seen": 152746670, - "step": 7117, - "time_per_iteration": 2.740729570388794 - }, - { - "auxiliary_loss_clip": 0.01110879, - "auxiliary_loss_mlp": 0.0103835, - "balance_loss_clip": 1.05176735, - "balance_loss_mlp": 1.02402163, - "epoch": 0.42795731249060576, - "flos": 12888698970240.0, - "grad_norm": 2.8863290375892148, - "language_loss": 0.69564569, - "learning_rate": 2.5563102316611536e-06, - "loss": 0.71713799, - "num_input_tokens_seen": 152760545, - "step": 7118, - "time_per_iteration": 2.7086899280548096 - }, - { - "auxiliary_loss_clip": 0.01092131, - "auxiliary_loss_mlp": 0.0104544, - "balance_loss_clip": 1.04521, - "balance_loss_mlp": 1.03076482, - "epoch": 0.4280174357432737, - "flos": 33401448316800.0, - "grad_norm": 2.453050871280299, - "language_loss": 0.74826419, - "learning_rate": 2.55593612908444e-06, - "loss": 0.76963991, - "num_input_tokens_seen": 152780970, - "step": 7119, - "time_per_iteration": 2.805619239807129 - }, - { - "auxiliary_loss_clip": 0.01069167, - "auxiliary_loss_mlp": 0.01038035, - "balance_loss_clip": 1.0436008, - "balance_loss_mlp": 1.02377188, - "epoch": 0.4280775589959417, - "flos": 18259104182400.0, - "grad_norm": 1.842272720773601, - "language_loss": 0.75238574, - "learning_rate": 2.555562005426573e-06, - "loss": 0.77345783, - "num_input_tokens_seen": 152798475, - "step": 7120, - "time_per_iteration": 2.8678669929504395 - }, - { - "auxiliary_loss_clip": 0.01112615, - "auxiliary_loss_mlp": 0.00770364, - "balance_loss_clip": 1.05290043, - "balance_loss_mlp": 1.00063229, - "epoch": 0.42813768224860965, - "flos": 21471277731840.0, - "grad_norm": 1.7037705311845839, - "language_loss": 0.76884449, - "learning_rate": 2.5551878607017385e-06, - "loss": 0.78767425, - "num_input_tokens_seen": 152817555, - "step": 7121, - "time_per_iteration": 2.776524305343628 - }, - { - "auxiliary_loss_clip": 0.01114442, - "auxiliary_loss_mlp": 0.01034456, - "balance_loss_clip": 1.05325198, - "balance_loss_mlp": 1.02162266, - "epoch": 0.4281978055012776, - "flos": 15669262696320.0, - "grad_norm": 1.9187062544957278, - "language_loss": 0.85698652, - "learning_rate": 2.554813694924126e-06, - "loss": 0.87847555, - "num_input_tokens_seen": 152836295, - "step": 7122, - "time_per_iteration": 2.7109732627868652 - }, - { - "auxiliary_loss_clip": 0.01083707, - "auxiliary_loss_mlp": 0.01035803, - "balance_loss_clip": 1.04868889, - "balance_loss_mlp": 1.02191544, - "epoch": 0.4282579287539456, - "flos": 17712005155200.0, - "grad_norm": 2.4146794334180632, - "language_loss": 0.81251013, - "learning_rate": 2.554439508107921e-06, - "loss": 0.83370531, - "num_input_tokens_seen": 152854950, - "step": 7123, - "time_per_iteration": 2.7866828441619873 - }, - { - "auxiliary_loss_clip": 0.01090954, - "auxiliary_loss_mlp": 0.01034043, - "balance_loss_clip": 1.04922438, - "balance_loss_mlp": 1.02011371, - "epoch": 0.42831805200661355, - "flos": 19281157770240.0, - "grad_norm": 1.7481094896376608, - "language_loss": 0.81089389, - "learning_rate": 2.5540653002673153e-06, - "loss": 0.8321439, - "num_input_tokens_seen": 152873995, - "step": 7124, - "time_per_iteration": 2.733530044555664 - }, - { - "auxiliary_loss_clip": 0.01125145, - "auxiliary_loss_mlp": 0.01037816, - "balance_loss_clip": 1.05205929, - "balance_loss_mlp": 1.02334404, - "epoch": 0.4283781752592815, - "flos": 19792633484160.0, - "grad_norm": 1.8145132685178345, - "language_loss": 0.80230892, - "learning_rate": 2.553691071416498e-06, - "loss": 0.82393849, - "num_input_tokens_seen": 152892925, - "step": 7125, - "time_per_iteration": 2.635104179382324 - }, - { - "auxiliary_loss_clip": 0.01132021, - "auxiliary_loss_mlp": 0.0076966, - "balance_loss_clip": 1.05282855, - "balance_loss_mlp": 1.00061083, - "epoch": 0.4284382985119495, - "flos": 16508064072960.0, - "grad_norm": 1.8752935538071442, - "language_loss": 0.74911773, - "learning_rate": 2.553316821569659e-06, - "loss": 0.76813453, - "num_input_tokens_seen": 152910935, - "step": 7126, - "time_per_iteration": 2.605344772338867 - }, - { - "auxiliary_loss_clip": 0.01124108, - "auxiliary_loss_mlp": 0.01031402, - "balance_loss_clip": 1.05336213, - "balance_loss_mlp": 1.01742435, - "epoch": 0.42849842176461744, - "flos": 23330767979520.0, - "grad_norm": 4.135943969267594, - "language_loss": 0.80782413, - "learning_rate": 2.5529425507409913e-06, - "loss": 0.82937926, - "num_input_tokens_seen": 152931030, - "step": 7127, - "time_per_iteration": 2.662910223007202 - }, - { - "auxiliary_loss_clip": 0.01088729, - "auxiliary_loss_mlp": 0.0104147, - "balance_loss_clip": 1.04972112, - "balance_loss_mlp": 1.02753484, - "epoch": 0.4285585450172854, - "flos": 17274433674240.0, - "grad_norm": 2.1393882563291773, - "language_loss": 0.76243544, - "learning_rate": 2.5525682589446867e-06, - "loss": 0.78373742, - "num_input_tokens_seen": 152948085, - "step": 7128, - "time_per_iteration": 2.7230868339538574 - }, - { - "auxiliary_loss_clip": 0.01089264, - "auxiliary_loss_mlp": 0.01035924, - "balance_loss_clip": 1.04796708, - "balance_loss_mlp": 1.02163041, - "epoch": 0.42861866826995343, - "flos": 24279599692800.0, - "grad_norm": 1.945213992632333, - "language_loss": 0.74079603, - "learning_rate": 2.552193946194937e-06, - "loss": 0.76204789, - "num_input_tokens_seen": 152966265, - "step": 7129, - "time_per_iteration": 2.775891065597534 - }, - { - "auxiliary_loss_clip": 0.01127944, - "auxiliary_loss_mlp": 0.00770117, - "balance_loss_clip": 1.05684757, - "balance_loss_mlp": 1.0005461, - "epoch": 0.4286787915226214, - "flos": 24353108876160.0, - "grad_norm": 1.5710338967277158, - "language_loss": 0.77974319, - "learning_rate": 2.5518196125059394e-06, - "loss": 0.79872382, - "num_input_tokens_seen": 152986775, - "step": 7130, - "time_per_iteration": 2.6977498531341553 - }, - { - "auxiliary_loss_clip": 0.01119463, - "auxiliary_loss_mlp": 0.01035523, - "balance_loss_clip": 1.05768883, - "balance_loss_mlp": 1.02184367, - "epoch": 0.42873891477528936, - "flos": 15449992122240.0, - "grad_norm": 2.320631391566952, - "language_loss": 0.73168224, - "learning_rate": 2.551445257891886e-06, - "loss": 0.75323212, - "num_input_tokens_seen": 153003595, - "step": 7131, - "time_per_iteration": 2.6973114013671875 - }, - { - "auxiliary_loss_clip": 0.01116554, - "auxiliary_loss_mlp": 0.0103667, - "balance_loss_clip": 1.05293584, - "balance_loss_mlp": 1.02260923, - "epoch": 0.4287990380279573, - "flos": 17639573379840.0, - "grad_norm": 5.223518802520722, - "language_loss": 0.77257997, - "learning_rate": 2.551070882366973e-06, - "loss": 0.79411221, - "num_input_tokens_seen": 153021960, - "step": 7132, - "time_per_iteration": 2.644556999206543 - }, - { - "auxiliary_loss_clip": 0.01097397, - "auxiliary_loss_mlp": 0.00771143, - "balance_loss_clip": 1.05195022, - "balance_loss_mlp": 1.00064743, - "epoch": 0.4288591612806253, - "flos": 27162328677120.0, - "grad_norm": 2.003525879431933, - "language_loss": 0.78719372, - "learning_rate": 2.550696485945397e-06, - "loss": 0.80587912, - "num_input_tokens_seen": 153042110, - "step": 7133, - "time_per_iteration": 2.7668325901031494 - }, - { - "auxiliary_loss_clip": 0.01111172, - "auxiliary_loss_mlp": 0.01034109, - "balance_loss_clip": 1.05302238, - "balance_loss_mlp": 1.02091813, - "epoch": 0.42891928453329325, - "flos": 17163182275200.0, - "grad_norm": 1.850568768068126, - "language_loss": 0.7449469, - "learning_rate": 2.550322068641355e-06, - "loss": 0.76639962, - "num_input_tokens_seen": 153058925, - "step": 7134, - "time_per_iteration": 2.714893341064453 - }, - { - "auxiliary_loss_clip": 0.01112422, - "auxiliary_loss_mlp": 0.01035997, - "balance_loss_clip": 1.04541016, - "balance_loss_mlp": 1.02241349, - "epoch": 0.4289794077859612, - "flos": 18187031543040.0, - "grad_norm": 1.9214467858451951, - "language_loss": 0.84098607, - "learning_rate": 2.5499476304690455e-06, - "loss": 0.86247027, - "num_input_tokens_seen": 153078070, - "step": 7135, - "time_per_iteration": 2.646799325942993 - }, - { - "auxiliary_loss_clip": 0.01060089, - "auxiliary_loss_mlp": 0.01040969, - "balance_loss_clip": 1.04197621, - "balance_loss_mlp": 1.02555561, - "epoch": 0.4290395310386292, - "flos": 28256885867520.0, - "grad_norm": 2.1625216270915493, - "language_loss": 0.75274026, - "learning_rate": 2.549573171442666e-06, - "loss": 0.77375078, - "num_input_tokens_seen": 153096680, - "step": 7136, - "time_per_iteration": 2.809598207473755 - }, - { - "auxiliary_loss_clip": 0.0112086, - "auxiliary_loss_mlp": 0.0103731, - "balance_loss_clip": 1.04999709, - "balance_loss_mlp": 1.02323103, - "epoch": 0.42909965429129715, - "flos": 16216074414720.0, - "grad_norm": 2.3663507288699743, - "language_loss": 0.79031229, - "learning_rate": 2.5491986915764175e-06, - "loss": 0.81189406, - "num_input_tokens_seen": 153113305, - "step": 7137, - "time_per_iteration": 2.5979957580566406 - }, - { - "auxiliary_loss_clip": 0.01139951, - "auxiliary_loss_mlp": 0.01034457, - "balance_loss_clip": 1.05516219, - "balance_loss_mlp": 1.02047372, - "epoch": 0.4291597775439651, - "flos": 23112862122240.0, - "grad_norm": 2.7024255480814166, - "language_loss": 0.76951313, - "learning_rate": 2.548824190884499e-06, - "loss": 0.7912572, - "num_input_tokens_seen": 153132735, - "step": 7138, - "time_per_iteration": 2.659080982208252 - }, - { - "auxiliary_loss_clip": 0.01053167, - "auxiliary_loss_mlp": 0.01001874, - "balance_loss_clip": 1.04265583, - "balance_loss_mlp": 1.000193, - "epoch": 0.4292199007966331, - "flos": 67546212681600.0, - "grad_norm": 0.770527259841848, - "language_loss": 0.56189907, - "learning_rate": 2.548449669381113e-06, - "loss": 0.58244956, - "num_input_tokens_seen": 153187925, - "step": 7139, - "time_per_iteration": 3.10082745552063 - }, - { - "auxiliary_loss_clip": 0.0113097, - "auxiliary_loss_mlp": 0.00769947, - "balance_loss_clip": 1.05131912, - "balance_loss_mlp": 1.00071657, - "epoch": 0.42928002404930105, - "flos": 22999850956800.0, - "grad_norm": 2.111862554587806, - "language_loss": 0.80871445, - "learning_rate": 2.5480751270804595e-06, - "loss": 0.82772362, - "num_input_tokens_seen": 153206990, - "step": 7140, - "time_per_iteration": 2.795779228210449 - }, - { - "auxiliary_loss_clip": 0.01122496, - "auxiliary_loss_mlp": 0.01032486, - "balance_loss_clip": 1.05028069, - "balance_loss_mlp": 1.01853812, - "epoch": 0.429340147301969, - "flos": 11544922241280.0, - "grad_norm": 1.8811141343222446, - "language_loss": 0.82105601, - "learning_rate": 2.5477005639967424e-06, - "loss": 0.84260583, - "num_input_tokens_seen": 153222345, - "step": 7141, - "time_per_iteration": 2.7634544372558594 - }, - { - "auxiliary_loss_clip": 0.0112355, - "auxiliary_loss_mlp": 0.0103971, - "balance_loss_clip": 1.05177212, - "balance_loss_mlp": 1.02569723, - "epoch": 0.42940027055463703, - "flos": 25264988472960.0, - "grad_norm": 3.1732751781519566, - "language_loss": 0.86466211, - "learning_rate": 2.547325980144166e-06, - "loss": 0.88629478, - "num_input_tokens_seen": 153240570, - "step": 7142, - "time_per_iteration": 2.73675537109375 - }, - { - "auxiliary_loss_clip": 0.01107323, - "auxiliary_loss_mlp": 0.0103324, - "balance_loss_clip": 1.05093384, - "balance_loss_mlp": 1.02018034, - "epoch": 0.429460393807305, - "flos": 23805004268160.0, - "grad_norm": 2.0666274749088704, - "language_loss": 0.78651458, - "learning_rate": 2.5469513755369323e-06, - "loss": 0.80792016, - "num_input_tokens_seen": 153259575, - "step": 7143, - "time_per_iteration": 2.704951047897339 - }, - { - "auxiliary_loss_clip": 0.01085856, - "auxiliary_loss_mlp": 0.01042533, - "balance_loss_clip": 1.04870784, - "balance_loss_mlp": 1.02862692, - "epoch": 0.42952051705997296, - "flos": 13918294414080.0, - "grad_norm": 1.8720341937391007, - "language_loss": 0.77237451, - "learning_rate": 2.5465767501892484e-06, - "loss": 0.79365838, - "num_input_tokens_seen": 153276650, - "step": 7144, - "time_per_iteration": 2.8080482482910156 - }, - { - "auxiliary_loss_clip": 0.01111048, - "auxiliary_loss_mlp": 0.01029176, - "balance_loss_clip": 1.05582607, - "balance_loss_mlp": 1.01565719, - "epoch": 0.4295806403126409, - "flos": 26760380509440.0, - "grad_norm": 2.7559580952375335, - "language_loss": 0.73788631, - "learning_rate": 2.54620210411532e-06, - "loss": 0.75928855, - "num_input_tokens_seen": 153298025, - "step": 7145, - "time_per_iteration": 2.876610040664673 - }, - { - "auxiliary_loss_clip": 0.01124065, - "auxiliary_loss_mlp": 0.01036309, - "balance_loss_clip": 1.05205083, - "balance_loss_mlp": 1.02291536, - "epoch": 0.4296407635653089, - "flos": 20952619297920.0, - "grad_norm": 2.2535739124191623, - "language_loss": 0.78997326, - "learning_rate": 2.545827437329352e-06, - "loss": 0.81157696, - "num_input_tokens_seen": 153315775, - "step": 7146, - "time_per_iteration": 4.237323999404907 - }, - { - "auxiliary_loss_clip": 0.01118325, - "auxiliary_loss_mlp": 0.01033233, - "balance_loss_clip": 1.04862475, - "balance_loss_mlp": 1.02041841, - "epoch": 0.42970088681797686, - "flos": 15852335339520.0, - "grad_norm": 2.134935554118882, - "language_loss": 0.83125973, - "learning_rate": 2.5454527498455532e-06, - "loss": 0.85277522, - "num_input_tokens_seen": 153332765, - "step": 7147, - "time_per_iteration": 4.170353412628174 - }, - { - "auxiliary_loss_clip": 0.01120236, - "auxiliary_loss_mlp": 0.01036768, - "balance_loss_clip": 1.05321455, - "balance_loss_mlp": 1.02217066, - "epoch": 0.4297610100706448, - "flos": 22382618624640.0, - "grad_norm": 2.0255914888463837, - "language_loss": 0.87308717, - "learning_rate": 2.545078041678131e-06, - "loss": 0.89465714, - "num_input_tokens_seen": 153350760, - "step": 7148, - "time_per_iteration": 4.25404167175293 - }, - { - "auxiliary_loss_clip": 0.01106949, - "auxiliary_loss_mlp": 0.0103361, - "balance_loss_clip": 1.0480504, - "balance_loss_mlp": 1.02031255, - "epoch": 0.4298211333233128, - "flos": 27925681536000.0, - "grad_norm": 1.5866853205406048, - "language_loss": 0.77782673, - "learning_rate": 2.5447033128412957e-06, - "loss": 0.79923236, - "num_input_tokens_seen": 153370765, - "step": 7149, - "time_per_iteration": 2.7506890296936035 - }, - { - "auxiliary_loss_clip": 0.01089941, - "auxiliary_loss_mlp": 0.01034354, - "balance_loss_clip": 1.04399276, - "balance_loss_mlp": 1.02023959, - "epoch": 0.42988125657598075, - "flos": 24425612478720.0, - "grad_norm": 1.8521512589115583, - "language_loss": 0.80214548, - "learning_rate": 2.544328563349256e-06, - "loss": 0.8233884, - "num_input_tokens_seen": 153390725, - "step": 7150, - "time_per_iteration": 2.7500832080841064 - }, - { - "auxiliary_loss_clip": 0.01129377, - "auxiliary_loss_mlp": 0.01039727, - "balance_loss_clip": 1.05486202, - "balance_loss_mlp": 1.02441442, - "epoch": 0.4299413798286487, - "flos": 15850180523520.0, - "grad_norm": 1.9985895227285218, - "language_loss": 0.75273871, - "learning_rate": 2.5439537932162222e-06, - "loss": 0.7744298, - "num_input_tokens_seen": 153408010, - "step": 7151, - "time_per_iteration": 5.016021251678467 - }, - { - "auxiliary_loss_clip": 0.01085345, - "auxiliary_loss_mlp": 0.01034438, - "balance_loss_clip": 1.0429914, - "balance_loss_mlp": 1.02001333, - "epoch": 0.4300015030813167, - "flos": 22309504490880.0, - "grad_norm": 2.1817188720110954, - "language_loss": 0.70050609, - "learning_rate": 2.543579002456406e-06, - "loss": 0.72170389, - "num_input_tokens_seen": 153426865, - "step": 7152, - "time_per_iteration": 2.7800815105438232 - }, - { - "auxiliary_loss_clip": 0.01111211, - "auxiliary_loss_mlp": 0.01037662, - "balance_loss_clip": 1.04997575, - "balance_loss_mlp": 1.02443016, - "epoch": 0.43006162633398465, - "flos": 34897666366080.0, - "grad_norm": 1.6446083910432685, - "language_loss": 0.71179092, - "learning_rate": 2.54320419108402e-06, - "loss": 0.73327965, - "num_input_tokens_seen": 153449410, - "step": 7153, - "time_per_iteration": 2.829648017883301 - }, - { - "auxiliary_loss_clip": 0.01119902, - "auxiliary_loss_mlp": 0.01033204, - "balance_loss_clip": 1.0488553, - "balance_loss_mlp": 1.01928604, - "epoch": 0.4301217495866526, - "flos": 15961575576960.0, - "grad_norm": 1.892610527455045, - "language_loss": 0.78175116, - "learning_rate": 2.542829359113276e-06, - "loss": 0.80328226, - "num_input_tokens_seen": 153467910, - "step": 7154, - "time_per_iteration": 2.723484516143799 - }, - { - "auxiliary_loss_clip": 0.01099683, - "auxiliary_loss_mlp": 0.01040214, - "balance_loss_clip": 1.04681695, - "balance_loss_mlp": 1.02599812, - "epoch": 0.43018187283932063, - "flos": 18770364414720.0, - "grad_norm": 1.5463056134535458, - "language_loss": 0.78802991, - "learning_rate": 2.542454506558389e-06, - "loss": 0.80942887, - "num_input_tokens_seen": 153487100, - "step": 7155, - "time_per_iteration": 2.7014451026916504 - }, - { - "auxiliary_loss_clip": 0.01105109, - "auxiliary_loss_mlp": 0.01032701, - "balance_loss_clip": 1.04913473, - "balance_loss_mlp": 1.01963592, - "epoch": 0.4302419960919886, - "flos": 20151703791360.0, - "grad_norm": 1.7272401238355637, - "language_loss": 0.88303947, - "learning_rate": 2.5420796334335723e-06, - "loss": 0.90441763, - "num_input_tokens_seen": 153505565, - "step": 7156, - "time_per_iteration": 2.696967363357544 - }, - { - "auxiliary_loss_clip": 0.01135167, - "auxiliary_loss_mlp": 0.01033618, - "balance_loss_clip": 1.05029023, - "balance_loss_mlp": 1.01970661, - "epoch": 0.43030211934465656, - "flos": 26432731624320.0, - "grad_norm": 1.8553568722970555, - "language_loss": 0.82653069, - "learning_rate": 2.541704739753042e-06, - "loss": 0.84821856, - "num_input_tokens_seen": 153526130, - "step": 7157, - "time_per_iteration": 2.706956148147583 - }, - { - "auxiliary_loss_clip": 0.01138655, - "auxiliary_loss_mlp": 0.01033446, - "balance_loss_clip": 1.05253196, - "balance_loss_mlp": 1.0191586, - "epoch": 0.43036224259732453, - "flos": 24389234979840.0, - "grad_norm": 1.8412394525159426, - "language_loss": 0.71535289, - "learning_rate": 2.5413298255310132e-06, - "loss": 0.73707396, - "num_input_tokens_seen": 153546370, - "step": 7158, - "time_per_iteration": 2.717587471008301 - }, - { - "auxiliary_loss_clip": 0.01122952, - "auxiliary_loss_mlp": 0.01034381, - "balance_loss_clip": 1.05053186, - "balance_loss_mlp": 1.02094615, - "epoch": 0.4304223658499925, - "flos": 17201714590080.0, - "grad_norm": 2.4063235591116, - "language_loss": 0.82592964, - "learning_rate": 2.5409548907817034e-06, - "loss": 0.84750295, - "num_input_tokens_seen": 153562800, - "step": 7159, - "time_per_iteration": 2.657625436782837 - }, - { - "auxiliary_loss_clip": 0.01105982, - "auxiliary_loss_mlp": 0.01034344, - "balance_loss_clip": 1.04629135, - "balance_loss_mlp": 1.02073002, - "epoch": 0.43048248910266046, - "flos": 14903000835840.0, - "grad_norm": 2.253245664419059, - "language_loss": 0.83222294, - "learning_rate": 2.54057993551933e-06, - "loss": 0.85362625, - "num_input_tokens_seen": 153578395, - "step": 7160, - "time_per_iteration": 2.6994106769561768 - }, - { - "auxiliary_loss_clip": 0.0112897, - "auxiliary_loss_mlp": 0.01040585, - "balance_loss_clip": 1.05215347, - "balance_loss_mlp": 1.02446771, - "epoch": 0.4305426123553284, - "flos": 21579835610880.0, - "grad_norm": 2.2814219127337236, - "language_loss": 0.77506208, - "learning_rate": 2.5402049597581116e-06, - "loss": 0.79675758, - "num_input_tokens_seen": 153596880, - "step": 7161, - "time_per_iteration": 2.819274425506592 - }, - { - "auxiliary_loss_clip": 0.01120227, - "auxiliary_loss_mlp": 0.0103714, - "balance_loss_clip": 1.04739952, - "balance_loss_mlp": 1.02265632, - "epoch": 0.4306027356079964, - "flos": 22601278667520.0, - "grad_norm": 2.279224529598255, - "language_loss": 0.73028505, - "learning_rate": 2.5398299635122662e-06, - "loss": 0.75185871, - "num_input_tokens_seen": 153616570, - "step": 7162, - "time_per_iteration": 2.62280011177063 - }, - { - "auxiliary_loss_clip": 0.01016488, - "auxiliary_loss_mlp": 0.00753107, - "balance_loss_clip": 1.02147388, - "balance_loss_mlp": 1.00100327, - "epoch": 0.43066285886066435, - "flos": 70672091806080.0, - "grad_norm": 0.7910606346239517, - "language_loss": 0.58986276, - "learning_rate": 2.5394549467960147e-06, - "loss": 0.60755867, - "num_input_tokens_seen": 153671450, - "step": 7163, - "time_per_iteration": 3.1325736045837402 - }, - { - "auxiliary_loss_clip": 0.01104143, - "auxiliary_loss_mlp": 0.01044649, - "balance_loss_clip": 1.04593122, - "balance_loss_mlp": 1.02948582, - "epoch": 0.4307229821133323, - "flos": 26720591218560.0, - "grad_norm": 1.8311930089659938, - "language_loss": 0.79205155, - "learning_rate": 2.5390799096235783e-06, - "loss": 0.81353945, - "num_input_tokens_seen": 153691405, - "step": 7164, - "time_per_iteration": 2.753256320953369 - }, - { - "auxiliary_loss_clip": 0.01138029, - "auxiliary_loss_mlp": 0.01040201, - "balance_loss_clip": 1.0510416, - "balance_loss_mlp": 1.02608645, - "epoch": 0.4307831053660003, - "flos": 26177119464960.0, - "grad_norm": 2.032413289263653, - "language_loss": 0.67551947, - "learning_rate": 2.538704852009177e-06, - "loss": 0.69730175, - "num_input_tokens_seen": 153711555, - "step": 7165, - "time_per_iteration": 2.719172477722168 - }, - { - "auxiliary_loss_clip": 0.01106688, - "auxiliary_loss_mlp": 0.00771886, - "balance_loss_clip": 1.05042744, - "balance_loss_mlp": 1.00068462, - "epoch": 0.43084322861866825, - "flos": 18910343715840.0, - "grad_norm": 2.1027726489364436, - "language_loss": 0.75451279, - "learning_rate": 2.538329773967034e-06, - "loss": 0.77329856, - "num_input_tokens_seen": 153730095, - "step": 7166, - "time_per_iteration": 2.710304021835327 - }, - { - "auxiliary_loss_clip": 0.01126475, - "auxiliary_loss_mlp": 0.01036095, - "balance_loss_clip": 1.05613852, - "balance_loss_mlp": 1.02310109, - "epoch": 0.4309033518713362, - "flos": 26432911192320.0, - "grad_norm": 1.6200122801673495, - "language_loss": 0.71809006, - "learning_rate": 2.537954675511372e-06, - "loss": 0.7397157, - "num_input_tokens_seen": 153749320, - "step": 7167, - "time_per_iteration": 2.676224946975708 - }, - { - "auxiliary_loss_clip": 0.01104337, - "auxiliary_loss_mlp": 0.00771035, - "balance_loss_clip": 1.04866242, - "balance_loss_mlp": 1.00059962, - "epoch": 0.43096347512400424, - "flos": 21213295274880.0, - "grad_norm": 1.6573858575043368, - "language_loss": 0.78183687, - "learning_rate": 2.537579556656414e-06, - "loss": 0.80059052, - "num_input_tokens_seen": 153767825, - "step": 7168, - "time_per_iteration": 2.8030035495758057 - }, - { - "auxiliary_loss_clip": 0.01111425, - "auxiliary_loss_mlp": 0.0104262, - "balance_loss_clip": 1.05006397, - "balance_loss_mlp": 1.02867889, - "epoch": 0.4310235983766722, - "flos": 16540131939840.0, - "grad_norm": 1.8701517899109106, - "language_loss": 0.82348084, - "learning_rate": 2.537204417416387e-06, - "loss": 0.84502125, - "num_input_tokens_seen": 153785350, - "step": 7169, - "time_per_iteration": 2.683119773864746 - }, - { - "auxiliary_loss_clip": 0.01047083, - "auxiliary_loss_mlp": 0.01001288, - "balance_loss_clip": 1.03727269, - "balance_loss_mlp": 0.99934483, - "epoch": 0.43108372162934017, - "flos": 64775704763520.0, - "grad_norm": 0.7280845280825856, - "language_loss": 0.60741472, - "learning_rate": 2.5368292578055132e-06, - "loss": 0.6278984, - "num_input_tokens_seen": 153856400, - "step": 7170, - "time_per_iteration": 3.345574140548706 - }, - { - "auxiliary_loss_clip": 0.01135698, - "auxiliary_loss_mlp": 0.01037021, - "balance_loss_clip": 1.05163968, - "balance_loss_mlp": 1.02352667, - "epoch": 0.43114384488200813, - "flos": 13444094039040.0, - "grad_norm": 1.7903297890514136, - "language_loss": 0.75776696, - "learning_rate": 2.536454077838021e-06, - "loss": 0.77949417, - "num_input_tokens_seen": 153875230, - "step": 7171, - "time_per_iteration": 2.612459897994995 - }, - { - "auxiliary_loss_clip": 0.01120974, - "auxiliary_loss_mlp": 0.01034567, - "balance_loss_clip": 1.05036652, - "balance_loss_mlp": 1.02106678, - "epoch": 0.4312039681346761, - "flos": 26286682924800.0, - "grad_norm": 3.289099345654009, - "language_loss": 0.77644551, - "learning_rate": 2.5360788775281357e-06, - "loss": 0.79800093, - "num_input_tokens_seen": 153894740, - "step": 7172, - "time_per_iteration": 2.69909930229187 - }, - { - "auxiliary_loss_clip": 0.01105721, - "auxiliary_loss_mlp": 0.010481, - "balance_loss_clip": 1.04574609, - "balance_loss_mlp": 1.03119648, - "epoch": 0.43126409138734406, - "flos": 20376684627840.0, - "grad_norm": 2.89880180493229, - "language_loss": 0.76759243, - "learning_rate": 2.535703656890086e-06, - "loss": 0.78913063, - "num_input_tokens_seen": 153913230, - "step": 7173, - "time_per_iteration": 2.6338369846343994 - }, - { - "auxiliary_loss_clip": 0.01130423, - "auxiliary_loss_mlp": 0.00772103, - "balance_loss_clip": 1.04817533, - "balance_loss_mlp": 1.00070202, - "epoch": 0.431324214640012, - "flos": 22123091882880.0, - "grad_norm": 1.4474212501027515, - "language_loss": 0.76933503, - "learning_rate": 2.5353284159381e-06, - "loss": 0.78836024, - "num_input_tokens_seen": 153933250, - "step": 7174, - "time_per_iteration": 2.809385061264038 - }, - { - "auxiliary_loss_clip": 0.01135393, - "auxiliary_loss_mlp": 0.01035645, - "balance_loss_clip": 1.0494926, - "balance_loss_mlp": 1.02004063, - "epoch": 0.43138433789268, - "flos": 15231008856960.0, - "grad_norm": 1.5868683627972313, - "language_loss": 0.8226738, - "learning_rate": 2.534953154686407e-06, - "loss": 0.84438419, - "num_input_tokens_seen": 153951325, - "step": 7175, - "time_per_iteration": 2.609368324279785 - }, - { - "auxiliary_loss_clip": 0.01092364, - "auxiliary_loss_mlp": 0.01052008, - "balance_loss_clip": 1.0459013, - "balance_loss_mlp": 1.03422189, - "epoch": 0.43144446114534796, - "flos": 18150294908160.0, - "grad_norm": 2.243705003900615, - "language_loss": 0.74261117, - "learning_rate": 2.5345778731492366e-06, - "loss": 0.76405489, - "num_input_tokens_seen": 153966975, - "step": 7176, - "time_per_iteration": 2.680771827697754 - }, - { - "auxiliary_loss_clip": 0.01122908, - "auxiliary_loss_mlp": 0.01035728, - "balance_loss_clip": 1.04637945, - "balance_loss_mlp": 1.0215838, - "epoch": 0.4315045843980159, - "flos": 22929861306240.0, - "grad_norm": 1.6403527990581428, - "language_loss": 0.73309958, - "learning_rate": 2.534202571340819e-06, - "loss": 0.754686, - "num_input_tokens_seen": 153986695, - "step": 7177, - "time_per_iteration": 2.760601758956909 - }, - { - "auxiliary_loss_clip": 0.011222, - "auxiliary_loss_mlp": 0.01043971, - "balance_loss_clip": 1.05072641, - "balance_loss_mlp": 1.02720976, - "epoch": 0.4315647076506839, - "flos": 22126862810880.0, - "grad_norm": 1.7813773885441684, - "language_loss": 0.81519645, - "learning_rate": 2.533827249275387e-06, - "loss": 0.83685815, - "num_input_tokens_seen": 154004710, - "step": 7178, - "time_per_iteration": 2.6687469482421875 - }, - { - "auxiliary_loss_clip": 0.01109607, - "auxiliary_loss_mlp": 0.01033774, - "balance_loss_clip": 1.04922378, - "balance_loss_mlp": 1.02013087, - "epoch": 0.43162483090335185, - "flos": 26871129118080.0, - "grad_norm": 32.445562208198496, - "language_loss": 0.84143358, - "learning_rate": 2.5334519069671725e-06, - "loss": 0.86286741, - "num_input_tokens_seen": 154024320, - "step": 7179, - "time_per_iteration": 2.696716547012329 - }, - { - "auxiliary_loss_clip": 0.01108857, - "auxiliary_loss_mlp": 0.010342, - "balance_loss_clip": 1.04713559, - "balance_loss_mlp": 1.0200026, - "epoch": 0.4316849541560198, - "flos": 13913122855680.0, - "grad_norm": 1.7762155940538253, - "language_loss": 0.75679082, - "learning_rate": 2.5330765444304075e-06, - "loss": 0.77822137, - "num_input_tokens_seen": 154041755, - "step": 7180, - "time_per_iteration": 2.6832194328308105 - }, - { - "auxiliary_loss_clip": 0.01104614, - "auxiliary_loss_mlp": 0.00776174, - "balance_loss_clip": 1.0417347, - "balance_loss_mlp": 1.00057638, - "epoch": 0.4317450774086878, - "flos": 16435165420800.0, - "grad_norm": 1.9971445999801452, - "language_loss": 0.81773126, - "learning_rate": 2.5327011616793274e-06, - "loss": 0.83653915, - "num_input_tokens_seen": 154056775, - "step": 7181, - "time_per_iteration": 2.6499931812286377 - }, - { - "auxiliary_loss_clip": 0.01110303, - "auxiliary_loss_mlp": 0.01040472, - "balance_loss_clip": 1.04747176, - "balance_loss_mlp": 1.02473664, - "epoch": 0.4318052006613558, - "flos": 20554980762240.0, - "grad_norm": 1.7092925782952597, - "language_loss": 0.89020073, - "learning_rate": 2.532325758728165e-06, - "loss": 0.91170847, - "num_input_tokens_seen": 154075015, - "step": 7182, - "time_per_iteration": 2.6567654609680176 - }, - { - "auxiliary_loss_clip": 0.01121856, - "auxiliary_loss_mlp": 0.00772189, - "balance_loss_clip": 1.05025744, - "balance_loss_mlp": 1.00049865, - "epoch": 0.43186532391402377, - "flos": 22820046451200.0, - "grad_norm": 1.602704996145881, - "language_loss": 0.75739694, - "learning_rate": 2.5319503355911566e-06, - "loss": 0.77633733, - "num_input_tokens_seen": 154095170, - "step": 7183, - "time_per_iteration": 2.6784613132476807 - }, - { - "auxiliary_loss_clip": 0.01123979, - "auxiliary_loss_mlp": 0.01032993, - "balance_loss_clip": 1.05125499, - "balance_loss_mlp": 1.01853919, - "epoch": 0.43192544716669173, - "flos": 25556583081600.0, - "grad_norm": 1.538308227417617, - "language_loss": 0.77589077, - "learning_rate": 2.5315748922825393e-06, - "loss": 0.7974605, - "num_input_tokens_seen": 154116895, - "step": 7184, - "time_per_iteration": 2.6501550674438477 - }, - { - "auxiliary_loss_clip": 0.01103086, - "auxiliary_loss_mlp": 0.01037708, - "balance_loss_clip": 1.04594743, - "balance_loss_mlp": 1.02377832, - "epoch": 0.4319855704193597, - "flos": 30954674701440.0, - "grad_norm": 1.7848849500644928, - "language_loss": 0.73435313, - "learning_rate": 2.5311994288165474e-06, - "loss": 0.75576103, - "num_input_tokens_seen": 154138395, - "step": 7185, - "time_per_iteration": 2.766298770904541 - }, - { - "auxiliary_loss_clip": 0.01122479, - "auxiliary_loss_mlp": 0.01042205, - "balance_loss_clip": 1.05223203, - "balance_loss_mlp": 1.02754247, - "epoch": 0.43204569367202766, - "flos": 24238732993920.0, - "grad_norm": 3.4842964823639515, - "language_loss": 0.75962853, - "learning_rate": 2.530823945207421e-06, - "loss": 0.78127533, - "num_input_tokens_seen": 154156775, - "step": 7186, - "time_per_iteration": 4.334157705307007 - }, - { - "auxiliary_loss_clip": 0.01099566, - "auxiliary_loss_mlp": 0.0103912, - "balance_loss_clip": 1.04762721, - "balance_loss_mlp": 1.02477932, - "epoch": 0.43210581692469563, - "flos": 18406948561920.0, - "grad_norm": 3.9729453010836218, - "language_loss": 0.76471615, - "learning_rate": 2.5304484414693962e-06, - "loss": 0.78610301, - "num_input_tokens_seen": 154177500, - "step": 7187, - "time_per_iteration": 5.956019401550293 - }, - { - "auxiliary_loss_clip": 0.01025499, - "auxiliary_loss_mlp": 0.01034135, - "balance_loss_clip": 1.03011787, - "balance_loss_mlp": 1.03272867, - "epoch": 0.4321659401773636, - "flos": 49832378910720.0, - "grad_norm": 0.8609493763660439, - "language_loss": 0.68115592, - "learning_rate": 2.530072917616714e-06, - "loss": 0.70175231, - "num_input_tokens_seen": 154237110, - "step": 7188, - "time_per_iteration": 3.246208667755127 - }, - { - "auxiliary_loss_clip": 0.01100014, - "auxiliary_loss_mlp": 0.01038065, - "balance_loss_clip": 1.0437665, - "balance_loss_mlp": 1.02437973, - "epoch": 0.43222606343003156, - "flos": 17128564542720.0, - "grad_norm": 1.9766532511253156, - "language_loss": 0.77875316, - "learning_rate": 2.529697373663614e-06, - "loss": 0.80013394, - "num_input_tokens_seen": 154253910, - "step": 7189, - "time_per_iteration": 2.681076765060425 - }, - { - "auxiliary_loss_clip": 0.01083825, - "auxiliary_loss_mlp": 0.01046889, - "balance_loss_clip": 1.04553795, - "balance_loss_mlp": 1.0314517, - "epoch": 0.4322861866826995, - "flos": 22749949059840.0, - "grad_norm": 1.8062049350419371, - "language_loss": 0.71379328, - "learning_rate": 2.5293218096243364e-06, - "loss": 0.73510039, - "num_input_tokens_seen": 154274770, - "step": 7190, - "time_per_iteration": 2.785278081893921 - }, - { - "auxiliary_loss_clip": 0.01109749, - "auxiliary_loss_mlp": 0.01039244, - "balance_loss_clip": 1.04681444, - "balance_loss_mlp": 1.02500999, - "epoch": 0.4323463099353675, - "flos": 27891925729920.0, - "grad_norm": 1.4390067860166444, - "language_loss": 0.79639554, - "learning_rate": 2.5289462255131223e-06, - "loss": 0.81788546, - "num_input_tokens_seen": 154295035, - "step": 7191, - "time_per_iteration": 4.571990728378296 - }, - { - "auxiliary_loss_clip": 0.0108611, - "auxiliary_loss_mlp": 0.01033322, - "balance_loss_clip": 1.04733062, - "balance_loss_mlp": 1.01954126, - "epoch": 0.43240643318803546, - "flos": 21614740652160.0, - "grad_norm": 1.5570148329267672, - "language_loss": 0.74904197, - "learning_rate": 2.5285706213442146e-06, - "loss": 0.77023631, - "num_input_tokens_seen": 154314905, - "step": 7192, - "time_per_iteration": 2.7427282333374023 - }, - { - "auxiliary_loss_clip": 0.01090847, - "auxiliary_loss_mlp": 0.01047049, - "balance_loss_clip": 1.04693365, - "balance_loss_mlp": 1.03140879, - "epoch": 0.4324665564407034, - "flos": 17558378686080.0, - "grad_norm": 2.028484656266998, - "language_loss": 0.7934891, - "learning_rate": 2.5281949971318557e-06, - "loss": 0.81486803, - "num_input_tokens_seen": 154331740, - "step": 7193, - "time_per_iteration": 2.708481550216675 - }, - { - "auxiliary_loss_clip": 0.01114828, - "auxiliary_loss_mlp": 0.0104506, - "balance_loss_clip": 1.04726183, - "balance_loss_mlp": 1.02971745, - "epoch": 0.4325266796933714, - "flos": 18402423448320.0, - "grad_norm": 1.769737496980083, - "language_loss": 0.75720823, - "learning_rate": 2.5278193528902897e-06, - "loss": 0.77880704, - "num_input_tokens_seen": 154348740, - "step": 7194, - "time_per_iteration": 2.685701608657837 - }, - { - "auxiliary_loss_clip": 0.01135356, - "auxiliary_loss_mlp": 0.01041388, - "balance_loss_clip": 1.05137146, - "balance_loss_mlp": 1.02693963, - "epoch": 0.4325868029460394, - "flos": 22564793427840.0, - "grad_norm": 3.855960133728433, - "language_loss": 0.59479225, - "learning_rate": 2.5274436886337613e-06, - "loss": 0.61655968, - "num_input_tokens_seen": 154368835, - "step": 7195, - "time_per_iteration": 2.634310483932495 - }, - { - "auxiliary_loss_clip": 0.01112701, - "auxiliary_loss_mlp": 0.01040238, - "balance_loss_clip": 1.04618812, - "balance_loss_mlp": 1.02434754, - "epoch": 0.43264692619870737, - "flos": 14605516396800.0, - "grad_norm": 2.711649843090413, - "language_loss": 0.65653574, - "learning_rate": 2.527068004376515e-06, - "loss": 0.67806506, - "num_input_tokens_seen": 154384620, - "step": 7196, - "time_per_iteration": 2.608530044555664 - }, - { - "auxiliary_loss_clip": 0.01141945, - "auxiliary_loss_mlp": 0.01038972, - "balance_loss_clip": 1.05338526, - "balance_loss_mlp": 1.02316523, - "epoch": 0.43270704945137534, - "flos": 21501657659520.0, - "grad_norm": 1.8654403969935065, - "language_loss": 0.72525519, - "learning_rate": 2.526692300132797e-06, - "loss": 0.74706435, - "num_input_tokens_seen": 154402865, - "step": 7197, - "time_per_iteration": 2.644087791442871 - }, - { - "auxiliary_loss_clip": 0.01124491, - "auxiliary_loss_mlp": 0.01040937, - "balance_loss_clip": 1.05245936, - "balance_loss_mlp": 1.02619135, - "epoch": 0.4327671727040433, - "flos": 25155891889920.0, - "grad_norm": 1.511486884186769, - "language_loss": 0.73146015, - "learning_rate": 2.5263165759168547e-06, - "loss": 0.75311446, - "num_input_tokens_seen": 154423625, - "step": 7198, - "time_per_iteration": 2.7317864894866943 - }, - { - "auxiliary_loss_clip": 0.0109556, - "auxiliary_loss_mlp": 0.01034886, - "balance_loss_clip": 1.04451466, - "balance_loss_mlp": 1.02034283, - "epoch": 0.43282729595671127, - "flos": 25447163276160.0, - "grad_norm": 1.539323937310933, - "language_loss": 0.80887341, - "learning_rate": 2.525940831742934e-06, - "loss": 0.8301779, - "num_input_tokens_seen": 154444775, - "step": 7199, - "time_per_iteration": 2.736016035079956 - }, - { - "auxiliary_loss_clip": 0.01121231, - "auxiliary_loss_mlp": 0.01034417, - "balance_loss_clip": 1.05255413, - "balance_loss_mlp": 1.0201118, - "epoch": 0.43288741920937923, - "flos": 24126116878080.0, - "grad_norm": 2.6908376787400186, - "language_loss": 0.68332666, - "learning_rate": 2.525565067625286e-06, - "loss": 0.70488322, - "num_input_tokens_seen": 154460815, - "step": 7200, - "time_per_iteration": 2.688460350036621 - }, - { - "auxiliary_loss_clip": 0.01114262, - "auxiliary_loss_mlp": 0.00772856, - "balance_loss_clip": 1.05025625, - "balance_loss_mlp": 1.00067294, - "epoch": 0.4329475424620472, - "flos": 19204955066880.0, - "grad_norm": 1.9560728888597885, - "language_loss": 0.87379515, - "learning_rate": 2.525189283578157e-06, - "loss": 0.89266634, - "num_input_tokens_seen": 154479145, - "step": 7201, - "time_per_iteration": 2.7547309398651123 - }, - { - "auxiliary_loss_clip": 0.01086041, - "auxiliary_loss_mlp": 0.01040787, - "balance_loss_clip": 1.04952443, - "balance_loss_mlp": 1.02395487, - "epoch": 0.43300766571471516, - "flos": 22638374438400.0, - "grad_norm": 2.3345355752276706, - "language_loss": 0.64547086, - "learning_rate": 2.5248134796157974e-06, - "loss": 0.66673917, - "num_input_tokens_seen": 154498905, - "step": 7202, - "time_per_iteration": 2.878486156463623 - }, - { - "auxiliary_loss_clip": 0.01082437, - "auxiliary_loss_mlp": 0.01030304, - "balance_loss_clip": 1.04730773, - "balance_loss_mlp": 1.01676202, - "epoch": 0.4330677889673831, - "flos": 22121080721280.0, - "grad_norm": 2.291722240352509, - "language_loss": 0.81795621, - "learning_rate": 2.5244376557524586e-06, - "loss": 0.83908355, - "num_input_tokens_seen": 154517270, - "step": 7203, - "time_per_iteration": 2.7338409423828125 - }, - { - "auxiliary_loss_clip": 0.01102737, - "auxiliary_loss_mlp": 0.01051208, - "balance_loss_clip": 1.04656279, - "balance_loss_mlp": 1.0357945, - "epoch": 0.4331279122200511, - "flos": 23221527742080.0, - "grad_norm": 1.8864588919547398, - "language_loss": 0.81453216, - "learning_rate": 2.5240618120023912e-06, - "loss": 0.83607161, - "num_input_tokens_seen": 154535945, - "step": 7204, - "time_per_iteration": 2.7719802856445312 - }, - { - "auxiliary_loss_clip": 0.01111895, - "auxiliary_loss_mlp": 0.01038561, - "balance_loss_clip": 1.04900229, - "balance_loss_mlp": 1.02450609, - "epoch": 0.43318803547271906, - "flos": 18259750627200.0, - "grad_norm": 2.1348551022614077, - "language_loss": 0.73979616, - "learning_rate": 2.5236859483798468e-06, - "loss": 0.76130074, - "num_input_tokens_seen": 154554935, - "step": 7205, - "time_per_iteration": 2.73463773727417 - }, - { - "auxiliary_loss_clip": 0.01139834, - "auxiliary_loss_mlp": 0.00772219, - "balance_loss_clip": 1.05782342, - "balance_loss_mlp": 1.00075722, - "epoch": 0.433248158725387, - "flos": 27418407713280.0, - "grad_norm": 1.7497294767683989, - "language_loss": 0.75183374, - "learning_rate": 2.5233100648990803e-06, - "loss": 0.77095425, - "num_input_tokens_seen": 154576065, - "step": 7206, - "time_per_iteration": 2.712897300720215 - }, - { - "auxiliary_loss_clip": 0.01082016, - "auxiliary_loss_mlp": 0.01036402, - "balance_loss_clip": 1.04904056, - "balance_loss_mlp": 1.02218044, - "epoch": 0.433308281978055, - "flos": 23218008209280.0, - "grad_norm": 5.825458886470942, - "language_loss": 0.79041201, - "learning_rate": 2.522934161574342e-06, - "loss": 0.81159621, - "num_input_tokens_seen": 154595110, - "step": 7207, - "time_per_iteration": 2.7708940505981445 - }, - { - "auxiliary_loss_clip": 0.01104721, - "auxiliary_loss_mlp": 0.01039597, - "balance_loss_clip": 1.04836667, - "balance_loss_mlp": 1.02374804, - "epoch": 0.433368405230723, - "flos": 15852407166720.0, - "grad_norm": 1.8464623058117935, - "language_loss": 0.81316662, - "learning_rate": 2.5225582384198888e-06, - "loss": 0.83460987, - "num_input_tokens_seen": 154612255, - "step": 7208, - "time_per_iteration": 2.869554281234741 - }, - { - "auxiliary_loss_clip": 0.01114033, - "auxiliary_loss_mlp": 0.01033629, - "balance_loss_clip": 1.04989004, - "balance_loss_mlp": 1.01924682, - "epoch": 0.433428528483391, - "flos": 19026084314880.0, - "grad_norm": 2.1101386955173154, - "language_loss": 0.70337081, - "learning_rate": 2.5221822954499744e-06, - "loss": 0.72484744, - "num_input_tokens_seen": 154630440, - "step": 7209, - "time_per_iteration": 2.692166805267334 - }, - { - "auxiliary_loss_clip": 0.01122508, - "auxiliary_loss_mlp": 0.0103772, - "balance_loss_clip": 1.04924512, - "balance_loss_mlp": 1.02234209, - "epoch": 0.43348865173605894, - "flos": 24718248581760.0, - "grad_norm": 1.435580666015418, - "language_loss": 0.81432891, - "learning_rate": 2.5218063326788557e-06, - "loss": 0.83593118, - "num_input_tokens_seen": 154652515, - "step": 7210, - "time_per_iteration": 2.7368991374969482 - }, - { - "auxiliary_loss_clip": 0.01111056, - "auxiliary_loss_mlp": 0.01040693, - "balance_loss_clip": 1.05043674, - "balance_loss_mlp": 1.02690065, - "epoch": 0.4335487749887269, - "flos": 22090664880000.0, - "grad_norm": 2.4268266327689005, - "language_loss": 0.82382917, - "learning_rate": 2.5214303501207885e-06, - "loss": 0.84534657, - "num_input_tokens_seen": 154670965, - "step": 7211, - "time_per_iteration": 2.6840522289276123 - }, - { - "auxiliary_loss_clip": 0.01124683, - "auxiliary_loss_mlp": 0.01036766, - "balance_loss_clip": 1.04992187, - "balance_loss_mlp": 1.02354002, - "epoch": 0.43360889824139487, - "flos": 22382941847040.0, - "grad_norm": 1.7229238689988244, - "language_loss": 0.74880648, - "learning_rate": 2.521054347790029e-06, - "loss": 0.77042103, - "num_input_tokens_seen": 154689980, - "step": 7212, - "time_per_iteration": 2.6535651683807373 - }, - { - "auxiliary_loss_clip": 0.01111992, - "auxiliary_loss_mlp": 0.01035698, - "balance_loss_clip": 1.05274439, - "balance_loss_mlp": 1.0224421, - "epoch": 0.43366902149406283, - "flos": 17528286067200.0, - "grad_norm": 1.7659929391516203, - "language_loss": 0.76887298, - "learning_rate": 2.5206783257008375e-06, - "loss": 0.7903499, - "num_input_tokens_seen": 154706570, - "step": 7213, - "time_per_iteration": 2.7639784812927246 - }, - { - "auxiliary_loss_clip": 0.01127555, - "auxiliary_loss_mlp": 0.01037213, - "balance_loss_clip": 1.05343771, - "balance_loss_mlp": 1.0235039, - "epoch": 0.4337291447467308, - "flos": 19022672522880.0, - "grad_norm": 2.352447655586991, - "language_loss": 0.64672804, - "learning_rate": 2.520302283867471e-06, - "loss": 0.66837579, - "num_input_tokens_seen": 154725210, - "step": 7214, - "time_per_iteration": 2.6546545028686523 - }, - { - "auxiliary_loss_clip": 0.01107197, - "auxiliary_loss_mlp": 0.01037553, - "balance_loss_clip": 1.04624152, - "balance_loss_mlp": 1.02401102, - "epoch": 0.43378926799939876, - "flos": 27234042180480.0, - "grad_norm": 1.8015946289097802, - "language_loss": 0.71728516, - "learning_rate": 2.519926222304191e-06, - "loss": 0.73873264, - "num_input_tokens_seen": 154745945, - "step": 7215, - "time_per_iteration": 2.7694337368011475 - }, - { - "auxiliary_loss_clip": 0.01105367, - "auxiliary_loss_mlp": 0.01038295, - "balance_loss_clip": 1.04855013, - "balance_loss_mlp": 1.02280354, - "epoch": 0.43384939125206673, - "flos": 15961108700160.0, - "grad_norm": 2.003102925000143, - "language_loss": 0.75037885, - "learning_rate": 2.519550141025255e-06, - "loss": 0.77181542, - "num_input_tokens_seen": 154763580, - "step": 7216, - "time_per_iteration": 2.725843667984009 - }, - { - "auxiliary_loss_clip": 0.01116821, - "auxiliary_loss_mlp": 0.01045067, - "balance_loss_clip": 1.05096495, - "balance_loss_mlp": 1.02885413, - "epoch": 0.4339095145047347, - "flos": 21793216354560.0, - "grad_norm": 2.430460894289381, - "language_loss": 0.75723612, - "learning_rate": 2.519174040044927e-06, - "loss": 0.77885503, - "num_input_tokens_seen": 154776825, - "step": 7217, - "time_per_iteration": 2.7089385986328125 - }, - { - "auxiliary_loss_clip": 0.01100856, - "auxiliary_loss_mlp": 0.01039272, - "balance_loss_clip": 1.04884839, - "balance_loss_mlp": 1.02414465, - "epoch": 0.43396963775740266, - "flos": 14209853109120.0, - "grad_norm": 1.9588734650761437, - "language_loss": 0.74091554, - "learning_rate": 2.5187979193774664e-06, - "loss": 0.76231682, - "num_input_tokens_seen": 154794025, - "step": 7218, - "time_per_iteration": 2.6733574867248535 - }, - { - "auxiliary_loss_clip": 0.01108387, - "auxiliary_loss_mlp": 0.01033005, - "balance_loss_clip": 1.05125904, - "balance_loss_mlp": 1.01892698, - "epoch": 0.4340297610100706, - "flos": 19719052473600.0, - "grad_norm": 1.867471044964119, - "language_loss": 0.69258481, - "learning_rate": 2.5184217790371367e-06, - "loss": 0.71399873, - "num_input_tokens_seen": 154813105, - "step": 7219, - "time_per_iteration": 2.6384527683258057 - }, - { - "auxiliary_loss_clip": 0.01103251, - "auxiliary_loss_mlp": 0.01039305, - "balance_loss_clip": 1.04848611, - "balance_loss_mlp": 1.02513123, - "epoch": 0.4340898842627386, - "flos": 18953508885120.0, - "grad_norm": 2.2592610231798274, - "language_loss": 0.77296734, - "learning_rate": 2.518045619038202e-06, - "loss": 0.79439294, - "num_input_tokens_seen": 154833525, - "step": 7220, - "time_per_iteration": 2.693434476852417 - }, - { - "auxiliary_loss_clip": 0.01068716, - "auxiliary_loss_mlp": 0.01037568, - "balance_loss_clip": 1.04492617, - "balance_loss_mlp": 1.02248216, - "epoch": 0.4341500075154066, - "flos": 22018304931840.0, - "grad_norm": 2.0152794755447183, - "language_loss": 0.6924417, - "learning_rate": 2.5176694393949243e-06, - "loss": 0.71350455, - "num_input_tokens_seen": 154853090, - "step": 7221, - "time_per_iteration": 2.8318276405334473 - }, - { - "auxiliary_loss_clip": 0.01126059, - "auxiliary_loss_mlp": 0.01040304, - "balance_loss_clip": 1.04850173, - "balance_loss_mlp": 1.02628446, - "epoch": 0.4342101307680746, - "flos": 23582465556480.0, - "grad_norm": 2.7538415889200554, - "language_loss": 0.65288424, - "learning_rate": 2.51729324012157e-06, - "loss": 0.67454779, - "num_input_tokens_seen": 154872055, - "step": 7222, - "time_per_iteration": 2.6848082542419434 - }, - { - "auxiliary_loss_clip": 0.01095727, - "auxiliary_loss_mlp": 0.0103288, - "balance_loss_clip": 1.04434943, - "balance_loss_mlp": 1.01868868, - "epoch": 0.43427025402074254, - "flos": 17967976450560.0, - "grad_norm": 2.2547341093747884, - "language_loss": 0.72800291, - "learning_rate": 2.5169170212324053e-06, - "loss": 0.74928898, - "num_input_tokens_seen": 154886645, - "step": 7223, - "time_per_iteration": 2.6691431999206543 - }, - { - "auxiliary_loss_clip": 0.0113251, - "auxiliary_loss_mlp": 0.01035818, - "balance_loss_clip": 1.04656434, - "balance_loss_mlp": 1.02130401, - "epoch": 0.4343303772734105, - "flos": 26286395616000.0, - "grad_norm": 1.8756720282844566, - "language_loss": 0.93602765, - "learning_rate": 2.516540782741694e-06, - "loss": 0.95771086, - "num_input_tokens_seen": 154906775, - "step": 7224, - "time_per_iteration": 2.667450189590454 - }, - { - "auxiliary_loss_clip": 0.01092783, - "auxiliary_loss_mlp": 0.01039248, - "balance_loss_clip": 1.04234195, - "balance_loss_mlp": 1.02426362, - "epoch": 0.43439050052607847, - "flos": 26833961520000.0, - "grad_norm": 1.4167248746748424, - "language_loss": 0.61521256, - "learning_rate": 2.5161645246637056e-06, - "loss": 0.63653284, - "num_input_tokens_seen": 154926990, - "step": 7225, - "time_per_iteration": 4.334634304046631 - }, - { - "auxiliary_loss_clip": 0.01107023, - "auxiliary_loss_mlp": 0.00773069, - "balance_loss_clip": 1.04763186, - "balance_loss_mlp": 1.00081611, - "epoch": 0.43445062377874644, - "flos": 21397660807680.0, - "grad_norm": 1.859930915167877, - "language_loss": 0.77928364, - "learning_rate": 2.5157882470127054e-06, - "loss": 0.79808456, - "num_input_tokens_seen": 154946210, - "step": 7226, - "time_per_iteration": 5.937607765197754 - }, - { - "auxiliary_loss_clip": 0.01118617, - "auxiliary_loss_mlp": 0.0103402, - "balance_loss_clip": 1.047508, - "balance_loss_mlp": 1.02045417, - "epoch": 0.4345107470314144, - "flos": 19901945548800.0, - "grad_norm": 1.6822192052663985, - "language_loss": 0.84638822, - "learning_rate": 2.515411949802964e-06, - "loss": 0.86791462, - "num_input_tokens_seen": 154964995, - "step": 7227, - "time_per_iteration": 2.6521942615509033 - }, - { - "auxiliary_loss_clip": 0.01117348, - "auxiliary_loss_mlp": 0.0103842, - "balance_loss_clip": 1.04574108, - "balance_loss_mlp": 1.02328634, - "epoch": 0.43457087028408237, - "flos": 26432623883520.0, - "grad_norm": 1.9493500401331498, - "language_loss": 0.76725572, - "learning_rate": 2.5150356330487498e-06, - "loss": 0.78881335, - "num_input_tokens_seen": 154984775, - "step": 7228, - "time_per_iteration": 2.6870598793029785 - }, - { - "auxiliary_loss_clip": 0.01089608, - "auxiliary_loss_mlp": 0.01040618, - "balance_loss_clip": 1.04927957, - "balance_loss_mlp": 1.02599132, - "epoch": 0.43463099353675033, - "flos": 31868816855040.0, - "grad_norm": 1.513481048537933, - "language_loss": 0.80442667, - "learning_rate": 2.5146592967643324e-06, - "loss": 0.82572889, - "num_input_tokens_seen": 155008125, - "step": 7229, - "time_per_iteration": 2.9437830448150635 - }, - { - "auxiliary_loss_clip": 0.01121336, - "auxiliary_loss_mlp": 0.01045576, - "balance_loss_clip": 1.04673219, - "balance_loss_mlp": 1.03047252, - "epoch": 0.4346911167894183, - "flos": 24571266128640.0, - "grad_norm": 2.5474712737755016, - "language_loss": 0.81467843, - "learning_rate": 2.5142829409639834e-06, - "loss": 0.83634758, - "num_input_tokens_seen": 155027885, - "step": 7230, - "time_per_iteration": 4.6465747356414795 - }, - { - "auxiliary_loss_clip": 0.0111898, - "auxiliary_loss_mlp": 0.01049467, - "balance_loss_clip": 1.04806113, - "balance_loss_mlp": 1.03399396, - "epoch": 0.43475124004208626, - "flos": 17090678672640.0, - "grad_norm": 2.126712012780947, - "language_loss": 0.76608211, - "learning_rate": 2.513906565661973e-06, - "loss": 0.78776658, - "num_input_tokens_seen": 155043375, - "step": 7231, - "time_per_iteration": 2.668262243270874 - }, - { - "auxiliary_loss_clip": 0.01085236, - "auxiliary_loss_mlp": 0.010365, - "balance_loss_clip": 1.04462624, - "balance_loss_mlp": 1.02319062, - "epoch": 0.4348113632947542, - "flos": 26104615862400.0, - "grad_norm": 1.4622957052763208, - "language_loss": 0.6875934, - "learning_rate": 2.513530170872575e-06, - "loss": 0.70881081, - "num_input_tokens_seen": 155062930, - "step": 7232, - "time_per_iteration": 2.7392327785491943 - }, - { - "auxiliary_loss_clip": 0.01098662, - "auxiliary_loss_mlp": 0.01036589, - "balance_loss_clip": 1.04562938, - "balance_loss_mlp": 1.02119923, - "epoch": 0.4348714865474222, - "flos": 34200496316160.0, - "grad_norm": 1.6380302947056737, - "language_loss": 0.72123957, - "learning_rate": 2.5131537566100605e-06, - "loss": 0.74259216, - "num_input_tokens_seen": 155084980, - "step": 7233, - "time_per_iteration": 2.8322300910949707 - }, - { - "auxiliary_loss_clip": 0.01073793, - "auxiliary_loss_mlp": 0.01045709, - "balance_loss_clip": 1.04429805, - "balance_loss_mlp": 1.02930558, - "epoch": 0.43493160980009016, - "flos": 31537468869120.0, - "grad_norm": 1.5095585359817736, - "language_loss": 0.74440682, - "learning_rate": 2.5127773228887053e-06, - "loss": 0.76560181, - "num_input_tokens_seen": 155107260, - "step": 7234, - "time_per_iteration": 2.9071762561798096 - }, - { - "auxiliary_loss_clip": 0.011103, - "auxiliary_loss_mlp": 0.01043772, - "balance_loss_clip": 1.04619622, - "balance_loss_mlp": 1.02835774, - "epoch": 0.4349917330527582, - "flos": 24061334699520.0, - "grad_norm": 2.005736270415063, - "language_loss": 0.59333825, - "learning_rate": 2.512400869722782e-06, - "loss": 0.61487895, - "num_input_tokens_seen": 155126720, - "step": 7235, - "time_per_iteration": 2.6738569736480713 - }, - { - "auxiliary_loss_clip": 0.01064764, - "auxiliary_loss_mlp": 0.01055431, - "balance_loss_clip": 1.03919065, - "balance_loss_mlp": 1.03892064, - "epoch": 0.43505185630542614, - "flos": 30519329863680.0, - "grad_norm": 1.6349929491691664, - "language_loss": 0.77779961, - "learning_rate": 2.512024397126566e-06, - "loss": 0.79900157, - "num_input_tokens_seen": 155148640, - "step": 7236, - "time_per_iteration": 2.8045287132263184 - }, - { - "auxiliary_loss_clip": 0.01129354, - "auxiliary_loss_mlp": 0.01036773, - "balance_loss_clip": 1.04843307, - "balance_loss_mlp": 1.02221155, - "epoch": 0.4351119795580941, - "flos": 15735158196480.0, - "grad_norm": 1.6962419767837338, - "language_loss": 0.81330889, - "learning_rate": 2.5116479051143345e-06, - "loss": 0.83497024, - "num_input_tokens_seen": 155165870, - "step": 7237, - "time_per_iteration": 2.648671865463257 - }, - { - "auxiliary_loss_clip": 0.01115513, - "auxiliary_loss_mlp": 0.0103661, - "balance_loss_clip": 1.04350662, - "balance_loss_mlp": 1.02228153, - "epoch": 0.4351721028107621, - "flos": 18731760272640.0, - "grad_norm": 3.1516026268664485, - "language_loss": 0.62781835, - "learning_rate": 2.5112713937003623e-06, - "loss": 0.64933956, - "num_input_tokens_seen": 155185315, - "step": 7238, - "time_per_iteration": 2.708812713623047 - }, - { - "auxiliary_loss_clip": 0.01093861, - "auxiliary_loss_mlp": 0.00771839, - "balance_loss_clip": 1.04551601, - "balance_loss_mlp": 1.00081944, - "epoch": 0.43523222606343004, - "flos": 25226887121280.0, - "grad_norm": 1.9011673436513334, - "language_loss": 0.85935599, - "learning_rate": 2.510894862898928e-06, - "loss": 0.87801301, - "num_input_tokens_seen": 155205790, - "step": 7239, - "time_per_iteration": 2.7664706707000732 - }, - { - "auxiliary_loss_clip": 0.01108836, - "auxiliary_loss_mlp": 0.01032451, - "balance_loss_clip": 1.04520702, - "balance_loss_mlp": 1.01814556, - "epoch": 0.435292349316098, - "flos": 22709190101760.0, - "grad_norm": 1.536559176560054, - "language_loss": 0.7257551, - "learning_rate": 2.510518312724309e-06, - "loss": 0.747168, - "num_input_tokens_seen": 155226475, - "step": 7240, - "time_per_iteration": 2.7275354862213135 - }, - { - "auxiliary_loss_clip": 0.01096929, - "auxiliary_loss_mlp": 0.01033031, - "balance_loss_clip": 1.04623103, - "balance_loss_mlp": 1.01821971, - "epoch": 0.43535247256876597, - "flos": 25775889569280.0, - "grad_norm": 2.0741794573690613, - "language_loss": 0.8174212, - "learning_rate": 2.5101417431907842e-06, - "loss": 0.83872074, - "num_input_tokens_seen": 155247110, - "step": 7241, - "time_per_iteration": 2.7412314414978027 - }, - { - "auxiliary_loss_clip": 0.01104486, - "auxiliary_loss_mlp": 0.00773075, - "balance_loss_clip": 1.04755354, - "balance_loss_mlp": 1.000664, - "epoch": 0.43541259582143393, - "flos": 17528142412800.0, - "grad_norm": 2.5029472103375627, - "language_loss": 0.7954601, - "learning_rate": 2.5097651543126345e-06, - "loss": 0.81423575, - "num_input_tokens_seen": 155261335, - "step": 7242, - "time_per_iteration": 2.7832155227661133 - }, - { - "auxiliary_loss_clip": 0.01105652, - "auxiliary_loss_mlp": 0.01038715, - "balance_loss_clip": 1.04170573, - "balance_loss_mlp": 1.0224551, - "epoch": 0.4354727190741019, - "flos": 15195205975680.0, - "grad_norm": 5.632863009629144, - "language_loss": 0.68174016, - "learning_rate": 2.509388546104138e-06, - "loss": 0.70318383, - "num_input_tokens_seen": 155278510, - "step": 7243, - "time_per_iteration": 2.731621742248535 - }, - { - "auxiliary_loss_clip": 0.01070337, - "auxiliary_loss_mlp": 0.01035269, - "balance_loss_clip": 1.04518962, - "balance_loss_mlp": 1.02096963, - "epoch": 0.43553284232676986, - "flos": 16649264436480.0, - "grad_norm": 1.737599591064028, - "language_loss": 0.81023276, - "learning_rate": 2.5090119185795766e-06, - "loss": 0.83128881, - "num_input_tokens_seen": 155296450, - "step": 7244, - "time_per_iteration": 2.869999885559082 - }, - { - "auxiliary_loss_clip": 0.0107405, - "auxiliary_loss_mlp": 0.01033039, - "balance_loss_clip": 1.04502463, - "balance_loss_mlp": 1.01974106, - "epoch": 0.43559296557943783, - "flos": 23400865370880.0, - "grad_norm": 1.7613354011100055, - "language_loss": 0.73543227, - "learning_rate": 2.508635271753234e-06, - "loss": 0.75650311, - "num_input_tokens_seen": 155316080, - "step": 7245, - "time_per_iteration": 2.8238213062286377 - }, - { - "auxiliary_loss_clip": 0.01073655, - "auxiliary_loss_mlp": 0.01040109, - "balance_loss_clip": 1.042413, - "balance_loss_mlp": 1.02626252, - "epoch": 0.4356530888321058, - "flos": 22419067950720.0, - "grad_norm": 1.8556670419976653, - "language_loss": 0.76651436, - "learning_rate": 2.508258605639389e-06, - "loss": 0.78765202, - "num_input_tokens_seen": 155336765, - "step": 7246, - "time_per_iteration": 2.74566912651062 - }, - { - "auxiliary_loss_clip": 0.01117733, - "auxiliary_loss_mlp": 0.01046964, - "balance_loss_clip": 1.04482377, - "balance_loss_mlp": 1.03185987, - "epoch": 0.43571321208477376, - "flos": 21616141282560.0, - "grad_norm": 1.8292531725431629, - "language_loss": 0.85409153, - "learning_rate": 2.5078819202523275e-06, - "loss": 0.8757385, - "num_input_tokens_seen": 155356440, - "step": 7247, - "time_per_iteration": 2.6183457374572754 - }, - { - "auxiliary_loss_clip": 0.01130523, - "auxiliary_loss_mlp": 0.01039193, - "balance_loss_clip": 1.047526, - "balance_loss_mlp": 1.02565122, - "epoch": 0.4357733353374418, - "flos": 23987358639360.0, - "grad_norm": 1.611147300467871, - "language_loss": 0.72544634, - "learning_rate": 2.507505215606333e-06, - "loss": 0.74714351, - "num_input_tokens_seen": 155377070, - "step": 7248, - "time_per_iteration": 2.614370822906494 - }, - { - "auxiliary_loss_clip": 0.01120332, - "auxiliary_loss_mlp": 0.01038636, - "balance_loss_clip": 1.0502224, - "balance_loss_mlp": 1.0246768, - "epoch": 0.43583345859010975, - "flos": 25264737077760.0, - "grad_norm": 1.6765876969892934, - "language_loss": 0.87089729, - "learning_rate": 2.5071284917156893e-06, - "loss": 0.89248699, - "num_input_tokens_seen": 155398415, - "step": 7249, - "time_per_iteration": 2.6826605796813965 - }, - { - "auxiliary_loss_clip": 0.01113045, - "auxiliary_loss_mlp": 0.01045156, - "balance_loss_clip": 1.04740214, - "balance_loss_mlp": 1.03150034, - "epoch": 0.4358935818427777, - "flos": 23696302734720.0, - "grad_norm": 2.0541786270405495, - "language_loss": 0.81998801, - "learning_rate": 2.506751748594683e-06, - "loss": 0.84157008, - "num_input_tokens_seen": 155415625, - "step": 7250, - "time_per_iteration": 2.6470022201538086 - }, - { - "auxiliary_loss_clip": 0.01124271, - "auxiliary_loss_mlp": 0.01035047, - "balance_loss_clip": 1.05197597, - "balance_loss_mlp": 1.02089727, - "epoch": 0.4359537050954457, - "flos": 29532827761920.0, - "grad_norm": 1.9267289360456135, - "language_loss": 0.84933323, - "learning_rate": 2.5063749862575988e-06, - "loss": 0.87092638, - "num_input_tokens_seen": 155435505, - "step": 7251, - "time_per_iteration": 2.665776014328003 - }, - { - "auxiliary_loss_clip": 0.01108984, - "auxiliary_loss_mlp": 0.01043132, - "balance_loss_clip": 1.04255629, - "balance_loss_mlp": 1.02783751, - "epoch": 0.43601382834811364, - "flos": 22711273090560.0, - "grad_norm": 2.7582881981862335, - "language_loss": 0.69538188, - "learning_rate": 2.5059982047187245e-06, - "loss": 0.71690303, - "num_input_tokens_seen": 155455425, - "step": 7252, - "time_per_iteration": 2.644498825073242 - }, - { - "auxiliary_loss_clip": 0.01102038, - "auxiliary_loss_mlp": 0.01039697, - "balance_loss_clip": 1.04452658, - "balance_loss_mlp": 1.02410412, - "epoch": 0.4360739516007816, - "flos": 19098731571840.0, - "grad_norm": 2.1859211403409717, - "language_loss": 0.83621645, - "learning_rate": 2.505621403992348e-06, - "loss": 0.85763383, - "num_input_tokens_seen": 155474250, - "step": 7253, - "time_per_iteration": 2.662623882293701 - }, - { - "auxiliary_loss_clip": 0.01119158, - "auxiliary_loss_mlp": 0.01041761, - "balance_loss_clip": 1.04809666, - "balance_loss_mlp": 1.0271399, - "epoch": 0.43613407485344957, - "flos": 23404420817280.0, - "grad_norm": 1.5459938146205512, - "language_loss": 0.70561367, - "learning_rate": 2.505244584092757e-06, - "loss": 0.7272228, - "num_input_tokens_seen": 155494685, - "step": 7254, - "time_per_iteration": 2.677427053451538 - }, - { - "auxiliary_loss_clip": 0.01106538, - "auxiliary_loss_mlp": 0.01041179, - "balance_loss_clip": 1.04567051, - "balance_loss_mlp": 1.02734506, - "epoch": 0.43619419810611754, - "flos": 22637799820800.0, - "grad_norm": 1.8056505398017555, - "language_loss": 0.812729, - "learning_rate": 2.5048677450342406e-06, - "loss": 0.83420616, - "num_input_tokens_seen": 155513040, - "step": 7255, - "time_per_iteration": 2.7150163650512695 - }, - { - "auxiliary_loss_clip": 0.01132135, - "auxiliary_loss_mlp": 0.01040806, - "balance_loss_clip": 1.04807031, - "balance_loss_mlp": 1.02626252, - "epoch": 0.4362543213587855, - "flos": 20047958334720.0, - "grad_norm": 1.9676871720710198, - "language_loss": 0.7780782, - "learning_rate": 2.504490886831089e-06, - "loss": 0.79980761, - "num_input_tokens_seen": 155530100, - "step": 7256, - "time_per_iteration": 2.551403522491455 - }, - { - "auxiliary_loss_clip": 0.0112974, - "auxiliary_loss_mlp": 0.01041375, - "balance_loss_clip": 1.04864502, - "balance_loss_mlp": 1.02721334, - "epoch": 0.43631444461145347, - "flos": 21361319222400.0, - "grad_norm": 1.9475980639851616, - "language_loss": 0.76180404, - "learning_rate": 2.5041140094975922e-06, - "loss": 0.78351521, - "num_input_tokens_seen": 155549375, - "step": 7257, - "time_per_iteration": 2.6217384338378906 - }, - { - "auxiliary_loss_clip": 0.01120044, - "auxiliary_loss_mlp": 0.01042182, - "balance_loss_clip": 1.04656029, - "balance_loss_mlp": 1.02711391, - "epoch": 0.43637456786412143, - "flos": 22418529246720.0, - "grad_norm": 1.6554456872207661, - "language_loss": 0.73254454, - "learning_rate": 2.5037371130480417e-06, - "loss": 0.75416678, - "num_input_tokens_seen": 155569395, - "step": 7258, - "time_per_iteration": 2.7399442195892334 - }, - { - "auxiliary_loss_clip": 0.01107425, - "auxiliary_loss_mlp": 0.01034778, - "balance_loss_clip": 1.0456903, - "balance_loss_mlp": 1.02084827, - "epoch": 0.4364346911167894, - "flos": 28548839612160.0, - "grad_norm": 2.059273423297749, - "language_loss": 0.76950562, - "learning_rate": 2.5033601974967297e-06, - "loss": 0.79092765, - "num_input_tokens_seen": 155589090, - "step": 7259, - "time_per_iteration": 2.814030647277832 - }, - { - "auxiliary_loss_clip": 0.01025258, - "auxiliary_loss_mlp": 0.01002872, - "balance_loss_clip": 1.02231717, - "balance_loss_mlp": 1.0011797, - "epoch": 0.43649481436945736, - "flos": 62659345380480.0, - "grad_norm": 0.7406116287283647, - "language_loss": 0.56990582, - "learning_rate": 2.5029832628579483e-06, - "loss": 0.59018713, - "num_input_tokens_seen": 155648660, - "step": 7260, - "time_per_iteration": 3.184105396270752 - }, - { - "auxiliary_loss_clip": 0.01114574, - "auxiliary_loss_mlp": 0.01046133, - "balance_loss_clip": 1.04780877, - "balance_loss_mlp": 1.03077888, - "epoch": 0.4365549376221254, - "flos": 30592120775040.0, - "grad_norm": 2.4789338774629024, - "language_loss": 0.71279275, - "learning_rate": 2.5026063091459907e-06, - "loss": 0.73439986, - "num_input_tokens_seen": 155669945, - "step": 7261, - "time_per_iteration": 2.781569242477417 - }, - { - "auxiliary_loss_clip": 0.01084597, - "auxiliary_loss_mlp": 0.01054365, - "balance_loss_clip": 1.04558206, - "balance_loss_mlp": 1.0377475, - "epoch": 0.43661506087479335, - "flos": 17165875795200.0, - "grad_norm": 1.8767730803011844, - "language_loss": 0.69520628, - "learning_rate": 2.5022293363751522e-06, - "loss": 0.71659589, - "num_input_tokens_seen": 155688555, - "step": 7262, - "time_per_iteration": 2.73209810256958 - }, - { - "auxiliary_loss_clip": 0.0106364, - "auxiliary_loss_mlp": 0.01034084, - "balance_loss_clip": 1.04300487, - "balance_loss_mlp": 1.02154875, - "epoch": 0.4366751841274613, - "flos": 22047499710720.0, - "grad_norm": 1.5954483681391127, - "language_loss": 0.79909682, - "learning_rate": 2.501852344559726e-06, - "loss": 0.82007402, - "num_input_tokens_seen": 155705370, - "step": 7263, - "time_per_iteration": 2.7780513763427734 - }, - { - "auxiliary_loss_clip": 0.01093795, - "auxiliary_loss_mlp": 0.01046831, - "balance_loss_clip": 1.0481534, - "balance_loss_mlp": 1.03220403, - "epoch": 0.4367353073801293, - "flos": 15997306631040.0, - "grad_norm": 1.6219151151282696, - "language_loss": 0.7545082, - "learning_rate": 2.50147533371401e-06, - "loss": 0.77591443, - "num_input_tokens_seen": 155721890, - "step": 7264, - "time_per_iteration": 4.158029079437256 - }, - { - "auxiliary_loss_clip": 0.01079604, - "auxiliary_loss_mlp": 0.01037561, - "balance_loss_clip": 1.04681587, - "balance_loss_mlp": 1.02243328, - "epoch": 0.43679543063279724, - "flos": 38217535868160.0, - "grad_norm": 2.5655359697781854, - "language_loss": 0.61799812, - "learning_rate": 2.501098303852298e-06, - "loss": 0.63916975, - "num_input_tokens_seen": 155743970, - "step": 7265, - "time_per_iteration": 4.454209804534912 - }, - { - "auxiliary_loss_clip": 0.01105521, - "auxiliary_loss_mlp": 0.01031338, - "balance_loss_clip": 1.04521823, - "balance_loss_mlp": 1.01762891, - "epoch": 0.4368555538854652, - "flos": 15193230727680.0, - "grad_norm": 2.0447032285328004, - "language_loss": 0.72610664, - "learning_rate": 2.5007212549888884e-06, - "loss": 0.74747527, - "num_input_tokens_seen": 155761830, - "step": 7266, - "time_per_iteration": 4.213090181350708 - }, - { - "auxiliary_loss_clip": 0.0110385, - "auxiliary_loss_mlp": 0.01037912, - "balance_loss_clip": 1.04488015, - "balance_loss_mlp": 1.02356541, - "epoch": 0.4369156771381332, - "flos": 23069086421760.0, - "grad_norm": 1.8602157597317315, - "language_loss": 0.82307518, - "learning_rate": 2.5003441871380794e-06, - "loss": 0.84449285, - "num_input_tokens_seen": 155779610, - "step": 7267, - "time_per_iteration": 2.6675074100494385 - }, - { - "auxiliary_loss_clip": 0.01126927, - "auxiliary_loss_mlp": 0.01028504, - "balance_loss_clip": 1.04546976, - "balance_loss_mlp": 1.01499796, - "epoch": 0.43697580039080114, - "flos": 23441085624960.0, - "grad_norm": 2.021840044875845, - "language_loss": 0.74740797, - "learning_rate": 2.4999671003141674e-06, - "loss": 0.76896226, - "num_input_tokens_seen": 155798765, - "step": 7268, - "time_per_iteration": 2.6228766441345215 - }, - { - "auxiliary_loss_clip": 0.01135364, - "auxiliary_loss_mlp": 0.01041324, - "balance_loss_clip": 1.04851401, - "balance_loss_mlp": 1.02567148, - "epoch": 0.4370359236434691, - "flos": 18514680428160.0, - "grad_norm": 2.5093195722714365, - "language_loss": 0.80133688, - "learning_rate": 2.499589994531454e-06, - "loss": 0.82310379, - "num_input_tokens_seen": 155817750, - "step": 7269, - "time_per_iteration": 4.289510726928711 - }, - { - "auxiliary_loss_clip": 0.01110775, - "auxiliary_loss_mlp": 0.01036898, - "balance_loss_clip": 1.04814553, - "balance_loss_mlp": 1.02253354, - "epoch": 0.43709604689613707, - "flos": 23222497409280.0, - "grad_norm": 1.7772509501505356, - "language_loss": 0.74977714, - "learning_rate": 2.499212869804237e-06, - "loss": 0.77125382, - "num_input_tokens_seen": 155836490, - "step": 7270, - "time_per_iteration": 2.7519397735595703 - }, - { - "auxiliary_loss_clip": 0.01068873, - "auxiliary_loss_mlp": 0.01045139, - "balance_loss_clip": 1.04005837, - "balance_loss_mlp": 1.02886677, - "epoch": 0.43715617014880503, - "flos": 23803711378560.0, - "grad_norm": 1.9652522706029574, - "language_loss": 0.79716229, - "learning_rate": 2.4988357261468182e-06, - "loss": 0.81830239, - "num_input_tokens_seen": 155856225, - "step": 7271, - "time_per_iteration": 2.8002872467041016 - }, - { - "auxiliary_loss_clip": 0.01036128, - "auxiliary_loss_mlp": 0.01021454, - "balance_loss_clip": 1.01824927, - "balance_loss_mlp": 1.01974964, - "epoch": 0.437216293401473, - "flos": 61941204766080.0, - "grad_norm": 0.7022630698763936, - "language_loss": 0.54855651, - "learning_rate": 2.4984585635734993e-06, - "loss": 0.56913233, - "num_input_tokens_seen": 155916770, - "step": 7272, - "time_per_iteration": 3.1893959045410156 - }, - { - "auxiliary_loss_clip": 0.0113475, - "auxiliary_loss_mlp": 0.01041916, - "balance_loss_clip": 1.0497241, - "balance_loss_mlp": 1.02704489, - "epoch": 0.43727641665414096, - "flos": 21982250655360.0, - "grad_norm": 1.6582852351426143, - "language_loss": 0.69981074, - "learning_rate": 2.498081382098581e-06, - "loss": 0.72157741, - "num_input_tokens_seen": 155936490, - "step": 7273, - "time_per_iteration": 2.622006893157959 - }, - { - "auxiliary_loss_clip": 0.01109468, - "auxiliary_loss_mlp": 0.01050566, - "balance_loss_clip": 1.04725552, - "balance_loss_mlp": 1.03434145, - "epoch": 0.437336539906809, - "flos": 39530860842240.0, - "grad_norm": 7.356047522605187, - "language_loss": 0.75699592, - "learning_rate": 2.497704181736367e-06, - "loss": 0.77859622, - "num_input_tokens_seen": 155957595, - "step": 7274, - "time_per_iteration": 2.850834846496582 - }, - { - "auxiliary_loss_clip": 0.0111429, - "auxiliary_loss_mlp": 0.0102741, - "balance_loss_clip": 1.04778564, - "balance_loss_mlp": 1.01473844, - "epoch": 0.43739666315947695, - "flos": 17457147181440.0, - "grad_norm": 1.6567651402589496, - "language_loss": 0.80280751, - "learning_rate": 2.49732696250116e-06, - "loss": 0.82422453, - "num_input_tokens_seen": 155975710, - "step": 7275, - "time_per_iteration": 2.638493776321411 - }, - { - "auxiliary_loss_clip": 0.01107442, - "auxiliary_loss_mlp": 0.01039938, - "balance_loss_clip": 1.04763556, - "balance_loss_mlp": 1.02628231, - "epoch": 0.4374567864121449, - "flos": 16358747235840.0, - "grad_norm": 1.960961081760492, - "language_loss": 0.81285107, - "learning_rate": 2.496949724407266e-06, - "loss": 0.83432496, - "num_input_tokens_seen": 155993090, - "step": 7276, - "time_per_iteration": 2.665069341659546 - }, - { - "auxiliary_loss_clip": 0.01119385, - "auxiliary_loss_mlp": 0.01033929, - "balance_loss_clip": 1.05310118, - "balance_loss_mlp": 1.01923609, - "epoch": 0.4375169096648129, - "flos": 30587523834240.0, - "grad_norm": 1.9041346547019917, - "language_loss": 0.7327143, - "learning_rate": 2.496572467468988e-06, - "loss": 0.75424743, - "num_input_tokens_seen": 156013685, - "step": 7277, - "time_per_iteration": 2.7329320907592773 - }, - { - "auxiliary_loss_clip": 0.01109724, - "auxiliary_loss_mlp": 0.0077177, - "balance_loss_clip": 1.04805493, - "balance_loss_mlp": 1.00070667, - "epoch": 0.43757703291748085, - "flos": 30555599621760.0, - "grad_norm": 1.7992627956176412, - "language_loss": 0.73366892, - "learning_rate": 2.4961951917006317e-06, - "loss": 0.7524839, - "num_input_tokens_seen": 156034300, - "step": 7278, - "time_per_iteration": 2.7531094551086426 - }, - { - "auxiliary_loss_clip": 0.01094743, - "auxiliary_loss_mlp": 0.0103965, - "balance_loss_clip": 1.0471983, - "balance_loss_mlp": 1.02677512, - "epoch": 0.4376371561701488, - "flos": 21397373498880.0, - "grad_norm": 1.4932293615412522, - "language_loss": 0.66024888, - "learning_rate": 2.4958178971165046e-06, - "loss": 0.68159282, - "num_input_tokens_seen": 156053805, - "step": 7279, - "time_per_iteration": 2.671842336654663 - }, - { - "auxiliary_loss_clip": 0.01139939, - "auxiliary_loss_mlp": 0.01037875, - "balance_loss_clip": 1.05298817, - "balance_loss_mlp": 1.02337885, - "epoch": 0.4376972794228168, - "flos": 23404384903680.0, - "grad_norm": 1.7693107777348598, - "language_loss": 0.81793606, - "learning_rate": 2.4954405837309126e-06, - "loss": 0.83971423, - "num_input_tokens_seen": 156073295, - "step": 7280, - "time_per_iteration": 2.588303565979004 - }, - { - "auxiliary_loss_clip": 0.01106326, - "auxiliary_loss_mlp": 0.01031835, - "balance_loss_clip": 1.04587424, - "balance_loss_mlp": 1.01867414, - "epoch": 0.43775740267548474, - "flos": 22892945103360.0, - "grad_norm": 1.5627499875085749, - "language_loss": 0.77005875, - "learning_rate": 2.4950632515581653e-06, - "loss": 0.79144037, - "num_input_tokens_seen": 156094540, - "step": 7281, - "time_per_iteration": 2.6939706802368164 - }, - { - "auxiliary_loss_clip": 0.011079, - "auxiliary_loss_mlp": 0.01037268, - "balance_loss_clip": 1.04824066, - "balance_loss_mlp": 1.02360058, - "epoch": 0.4378175259281527, - "flos": 23294390480640.0, - "grad_norm": 1.8010941727109018, - "language_loss": 0.75983417, - "learning_rate": 2.494685900612569e-06, - "loss": 0.78128588, - "num_input_tokens_seen": 156114070, - "step": 7282, - "time_per_iteration": 2.6834237575531006 - }, - { - "auxiliary_loss_clip": 0.01092611, - "auxiliary_loss_mlp": 0.01040673, - "balance_loss_clip": 1.04627228, - "balance_loss_mlp": 1.02654076, - "epoch": 0.43787764918082067, - "flos": 23876897339520.0, - "grad_norm": 2.1500126437968925, - "language_loss": 0.85044593, - "learning_rate": 2.4943085309084333e-06, - "loss": 0.87177879, - "num_input_tokens_seen": 156132130, - "step": 7283, - "time_per_iteration": 2.7042722702026367 - }, - { - "auxiliary_loss_clip": 0.01111303, - "auxiliary_loss_mlp": 0.01037633, - "balance_loss_clip": 1.04814124, - "balance_loss_mlp": 1.02266598, - "epoch": 0.43793777243348864, - "flos": 23988148738560.0, - "grad_norm": 14.144168664775597, - "language_loss": 0.80311596, - "learning_rate": 2.49393114246007e-06, - "loss": 0.82460535, - "num_input_tokens_seen": 156150820, - "step": 7284, - "time_per_iteration": 2.676689863204956 - }, - { - "auxiliary_loss_clip": 0.01123026, - "auxiliary_loss_mlp": 0.01038411, - "balance_loss_clip": 1.04910016, - "balance_loss_mlp": 1.02514315, - "epoch": 0.4379978956861566, - "flos": 18624064320000.0, - "grad_norm": 2.0075840095153925, - "language_loss": 0.80086255, - "learning_rate": 2.493553735281787e-06, - "loss": 0.82247692, - "num_input_tokens_seen": 156170125, - "step": 7285, - "time_per_iteration": 2.6446423530578613 - }, - { - "auxiliary_loss_clip": 0.01121831, - "auxiliary_loss_mlp": 0.01030999, - "balance_loss_clip": 1.04847312, - "balance_loss_mlp": 1.0175761, - "epoch": 0.43805801893882457, - "flos": 21981388728960.0, - "grad_norm": 2.1352627983894545, - "language_loss": 0.7498579, - "learning_rate": 2.493176309387897e-06, - "loss": 0.77138615, - "num_input_tokens_seen": 156187320, - "step": 7286, - "time_per_iteration": 2.6779184341430664 - }, - { - "auxiliary_loss_clip": 0.01095439, - "auxiliary_loss_mlp": 0.01032312, - "balance_loss_clip": 1.04372525, - "balance_loss_mlp": 1.0179832, - "epoch": 0.43811814219149253, - "flos": 26393337383040.0, - "grad_norm": 1.5473009908217328, - "language_loss": 0.73641115, - "learning_rate": 2.492798864792712e-06, - "loss": 0.75768864, - "num_input_tokens_seen": 156207455, - "step": 7287, - "time_per_iteration": 2.867501735687256 - }, - { - "auxiliary_loss_clip": 0.0111224, - "auxiliary_loss_mlp": 0.01045008, - "balance_loss_clip": 1.05047917, - "balance_loss_mlp": 1.03040457, - "epoch": 0.43817826544416055, - "flos": 17493309198720.0, - "grad_norm": 1.6804566494971647, - "language_loss": 0.8243767, - "learning_rate": 2.492421401510545e-06, - "loss": 0.84594917, - "num_input_tokens_seen": 156226560, - "step": 7288, - "time_per_iteration": 2.677922010421753 - }, - { - "auxiliary_loss_clip": 0.01094679, - "auxiliary_loss_mlp": 0.01031914, - "balance_loss_clip": 1.04326773, - "balance_loss_mlp": 1.01793718, - "epoch": 0.4382383886968285, - "flos": 21581020759680.0, - "grad_norm": 1.441403002582157, - "language_loss": 0.84301102, - "learning_rate": 2.4920439195557093e-06, - "loss": 0.86427689, - "num_input_tokens_seen": 156246740, - "step": 7289, - "time_per_iteration": 2.8586435317993164 - }, - { - "auxiliary_loss_clip": 0.0109844, - "auxiliary_loss_mlp": 0.01052991, - "balance_loss_clip": 1.04162121, - "balance_loss_mlp": 1.03685021, - "epoch": 0.4382985119494965, - "flos": 27923742201600.0, - "grad_norm": 1.6202567248687665, - "language_loss": 0.78218126, - "learning_rate": 2.4916664189425183e-06, - "loss": 0.80369556, - "num_input_tokens_seen": 156266440, - "step": 7290, - "time_per_iteration": 2.7211575508117676 - }, - { - "auxiliary_loss_clip": 0.01132305, - "auxiliary_loss_mlp": 0.01039679, - "balance_loss_clip": 1.05053866, - "balance_loss_mlp": 1.02617884, - "epoch": 0.43835863520216445, - "flos": 24936836797440.0, - "grad_norm": 1.8734686520238957, - "language_loss": 0.78314757, - "learning_rate": 2.491288899685288e-06, - "loss": 0.80486739, - "num_input_tokens_seen": 156286900, - "step": 7291, - "time_per_iteration": 2.629904270172119 - }, - { - "auxiliary_loss_clip": 0.0109159, - "auxiliary_loss_mlp": 0.01033172, - "balance_loss_clip": 1.04265332, - "balance_loss_mlp": 1.0194335, - "epoch": 0.4384187584548324, - "flos": 33510293504640.0, - "grad_norm": 1.5839432646062752, - "language_loss": 0.6487931, - "learning_rate": 2.4909113617983325e-06, - "loss": 0.67004073, - "num_input_tokens_seen": 156307690, - "step": 7292, - "time_per_iteration": 2.7952499389648438 - }, - { - "auxiliary_loss_clip": 0.01112801, - "auxiliary_loss_mlp": 0.01036982, - "balance_loss_clip": 1.04319155, - "balance_loss_mlp": 1.0226171, - "epoch": 0.4384788817075004, - "flos": 23951052967680.0, - "grad_norm": 1.6336411060838572, - "language_loss": 0.74232095, - "learning_rate": 2.49053380529597e-06, - "loss": 0.7638188, - "num_input_tokens_seen": 156326620, - "step": 7293, - "time_per_iteration": 2.636462688446045 - }, - { - "auxiliary_loss_clip": 0.01098755, - "auxiliary_loss_mlp": 0.01037795, - "balance_loss_clip": 1.0494585, - "balance_loss_mlp": 1.02318609, - "epoch": 0.43853900496016834, - "flos": 19098516090240.0, - "grad_norm": 4.136423906080754, - "language_loss": 0.78758669, - "learning_rate": 2.490156230192516e-06, - "loss": 0.80895221, - "num_input_tokens_seen": 156345495, - "step": 7294, - "time_per_iteration": 2.670069456100464 - }, - { - "auxiliary_loss_clip": 0.01089917, - "auxiliary_loss_mlp": 0.01038854, - "balance_loss_clip": 1.04422832, - "balance_loss_mlp": 1.02485299, - "epoch": 0.4385991282128363, - "flos": 13225362168960.0, - "grad_norm": 1.7954692393859477, - "language_loss": 0.7296086, - "learning_rate": 2.4897786365022883e-06, - "loss": 0.75089628, - "num_input_tokens_seen": 156363155, - "step": 7295, - "time_per_iteration": 2.7159199714660645 - }, - { - "auxiliary_loss_clip": 0.01090098, - "auxiliary_loss_mlp": 0.01044926, - "balance_loss_clip": 1.04397202, - "balance_loss_mlp": 1.02860653, - "epoch": 0.4386592514655043, - "flos": 14319883445760.0, - "grad_norm": 1.6136170201094728, - "language_loss": 0.75463378, - "learning_rate": 2.4894010242396063e-06, - "loss": 0.77598405, - "num_input_tokens_seen": 156380940, - "step": 7296, - "time_per_iteration": 2.7475438117980957 - }, - { - "auxiliary_loss_clip": 0.01118725, - "auxiliary_loss_mlp": 0.01032483, - "balance_loss_clip": 1.04859519, - "balance_loss_mlp": 1.0183568, - "epoch": 0.43871937471817224, - "flos": 22784423137920.0, - "grad_norm": 1.7142829102326689, - "language_loss": 0.69474953, - "learning_rate": 2.4890233934187873e-06, - "loss": 0.71626163, - "num_input_tokens_seen": 156400415, - "step": 7297, - "time_per_iteration": 2.6689095497131348 - }, - { - "auxiliary_loss_clip": 0.01111936, - "auxiliary_loss_mlp": 0.01033453, - "balance_loss_clip": 1.04589987, - "balance_loss_mlp": 1.02004242, - "epoch": 0.4387794979708402, - "flos": 28072304853120.0, - "grad_norm": 2.137486700340973, - "language_loss": 0.70327055, - "learning_rate": 2.4886457440541535e-06, - "loss": 0.72472441, - "num_input_tokens_seen": 156421120, - "step": 7298, - "time_per_iteration": 2.7896294593811035 - }, - { - "auxiliary_loss_clip": 0.01117974, - "auxiliary_loss_mlp": 0.0102859, - "balance_loss_clip": 1.0481534, - "balance_loss_mlp": 1.01508379, - "epoch": 0.43883962122350817, - "flos": 26249551240320.0, - "grad_norm": 1.5518132007083414, - "language_loss": 0.72407347, - "learning_rate": 2.4882680761600238e-06, - "loss": 0.74553907, - "num_input_tokens_seen": 156441535, - "step": 7299, - "time_per_iteration": 2.724134922027588 - }, - { - "auxiliary_loss_clip": 0.01100992, - "auxiliary_loss_mlp": 0.00773554, - "balance_loss_clip": 1.04556322, - "balance_loss_mlp": 1.00063753, - "epoch": 0.43889974447617613, - "flos": 25883765089920.0, - "grad_norm": 1.9116194577137513, - "language_loss": 0.7702527, - "learning_rate": 2.487890389750719e-06, - "loss": 0.78899813, - "num_input_tokens_seen": 156462015, - "step": 7300, - "time_per_iteration": 2.754582166671753 - }, - { - "auxiliary_loss_clip": 0.01105938, - "auxiliary_loss_mlp": 0.01033126, - "balance_loss_clip": 1.04505253, - "balance_loss_mlp": 1.01922047, - "epoch": 0.43895986772884416, - "flos": 25046615738880.0, - "grad_norm": 1.6899733258560021, - "language_loss": 0.70417237, - "learning_rate": 2.4875126848405626e-06, - "loss": 0.72556305, - "num_input_tokens_seen": 156482165, - "step": 7301, - "time_per_iteration": 2.8213343620300293 - }, - { - "auxiliary_loss_clip": 0.01082543, - "auxiliary_loss_mlp": 0.01042943, - "balance_loss_clip": 1.04282618, - "balance_loss_mlp": 1.0270884, - "epoch": 0.4390199909815121, - "flos": 25994585525760.0, - "grad_norm": 1.824867215084726, - "language_loss": 0.70808041, - "learning_rate": 2.4871349614438757e-06, - "loss": 0.72933531, - "num_input_tokens_seen": 156503170, - "step": 7302, - "time_per_iteration": 2.7875969409942627 - }, - { - "auxiliary_loss_clip": 0.01107602, - "auxiliary_loss_mlp": 0.01039104, - "balance_loss_clip": 1.04878247, - "balance_loss_mlp": 1.02599669, - "epoch": 0.4390801142341801, - "flos": 29022249888000.0, - "grad_norm": 1.5936626078522842, - "language_loss": 0.82381457, - "learning_rate": 2.486757219574983e-06, - "loss": 0.8452816, - "num_input_tokens_seen": 156523005, - "step": 7303, - "time_per_iteration": 2.838871717453003 - }, - { - "auxiliary_loss_clip": 0.01116821, - "auxiliary_loss_mlp": 0.01046972, - "balance_loss_clip": 1.04648411, - "balance_loss_mlp": 1.03164792, - "epoch": 0.43914023748684805, - "flos": 33438544087680.0, - "grad_norm": 10.027739157490931, - "language_loss": 0.69036293, - "learning_rate": 2.4863794592482067e-06, - "loss": 0.71200085, - "num_input_tokens_seen": 156544440, - "step": 7304, - "time_per_iteration": 5.9847636222839355 - }, - { - "auxiliary_loss_clip": 0.01105223, - "auxiliary_loss_mlp": 0.00770446, - "balance_loss_clip": 1.04475939, - "balance_loss_mlp": 1.0005337, - "epoch": 0.439200360739516, - "flos": 34531844302080.0, - "grad_norm": 1.5264108649470638, - "language_loss": 0.78100759, - "learning_rate": 2.486001680477873e-06, - "loss": 0.79976428, - "num_input_tokens_seen": 156565410, - "step": 7305, - "time_per_iteration": 4.283693313598633 - }, - { - "auxiliary_loss_clip": 0.01102752, - "auxiliary_loss_mlp": 0.01034686, - "balance_loss_clip": 1.04440284, - "balance_loss_mlp": 1.02097106, - "epoch": 0.439260483992184, - "flos": 21907843632000.0, - "grad_norm": 1.7445713343884877, - "language_loss": 0.68756545, - "learning_rate": 2.485623883278308e-06, - "loss": 0.70893979, - "num_input_tokens_seen": 156584210, - "step": 7306, - "time_per_iteration": 2.7069246768951416 - }, - { - "auxiliary_loss_clip": 0.01089881, - "auxiliary_loss_mlp": 0.01031704, - "balance_loss_clip": 1.0450325, - "balance_loss_mlp": 1.01757789, - "epoch": 0.43932060724485195, - "flos": 20996430912000.0, - "grad_norm": 2.2471251539247428, - "language_loss": 0.62507868, - "learning_rate": 2.4852460676638344e-06, - "loss": 0.64629447, - "num_input_tokens_seen": 156602730, - "step": 7307, - "time_per_iteration": 2.719836950302124 - }, - { - "auxiliary_loss_clip": 0.01130769, - "auxiliary_loss_mlp": 0.01032376, - "balance_loss_clip": 1.04645061, - "balance_loss_mlp": 1.0188818, - "epoch": 0.4393807304975199, - "flos": 17747053850880.0, - "grad_norm": 1.9621539490577573, - "language_loss": 0.71752089, - "learning_rate": 2.4848682336487828e-06, - "loss": 0.73915237, - "num_input_tokens_seen": 156619405, - "step": 7308, - "time_per_iteration": 4.218705892562866 - }, - { - "auxiliary_loss_clip": 0.0110959, - "auxiliary_loss_mlp": 0.01034032, - "balance_loss_clip": 1.0438807, - "balance_loss_mlp": 1.020859, - "epoch": 0.4394408537501879, - "flos": 22528523669760.0, - "grad_norm": 1.855171270613647, - "language_loss": 0.76671213, - "learning_rate": 2.4844903812474787e-06, - "loss": 0.78814828, - "num_input_tokens_seen": 156638165, - "step": 7309, - "time_per_iteration": 2.726790428161621 - }, - { - "auxiliary_loss_clip": 0.01111334, - "auxiliary_loss_mlp": 0.01031706, - "balance_loss_clip": 1.04383993, - "balance_loss_mlp": 1.01888466, - "epoch": 0.43950097700285584, - "flos": 23440654661760.0, - "grad_norm": 1.9900388133775502, - "language_loss": 0.7067014, - "learning_rate": 2.484112510474251e-06, - "loss": 0.72813171, - "num_input_tokens_seen": 156658845, - "step": 7310, - "time_per_iteration": 2.644737958908081 - }, - { - "auxiliary_loss_clip": 0.01099363, - "auxiliary_loss_mlp": 0.00771301, - "balance_loss_clip": 1.04282653, - "balance_loss_mlp": 1.00065351, - "epoch": 0.4395611002555238, - "flos": 23180696956800.0, - "grad_norm": 2.0308560550957813, - "language_loss": 0.76245713, - "learning_rate": 2.483734621343429e-06, - "loss": 0.78116381, - "num_input_tokens_seen": 156677275, - "step": 7311, - "time_per_iteration": 2.676393985748291 - }, - { - "auxiliary_loss_clip": 0.01118807, - "auxiliary_loss_mlp": 0.01036972, - "balance_loss_clip": 1.04605961, - "balance_loss_mlp": 1.02365649, - "epoch": 0.43962122350819177, - "flos": 22127365601280.0, - "grad_norm": 1.941188934607737, - "language_loss": 0.81554043, - "learning_rate": 2.483356713869341e-06, - "loss": 0.83709824, - "num_input_tokens_seen": 156695815, - "step": 7312, - "time_per_iteration": 2.734691858291626 - }, - { - "auxiliary_loss_clip": 0.01099053, - "auxiliary_loss_mlp": 0.01030855, - "balance_loss_clip": 1.04661798, - "balance_loss_mlp": 1.01802182, - "epoch": 0.43968134676085974, - "flos": 17420554200960.0, - "grad_norm": 4.309677618927981, - "language_loss": 0.85387003, - "learning_rate": 2.482978788066318e-06, - "loss": 0.8751691, - "num_input_tokens_seen": 156714385, - "step": 7313, - "time_per_iteration": 2.7130918502807617 - }, - { - "auxiliary_loss_clip": 0.01101603, - "auxiliary_loss_mlp": 0.01034502, - "balance_loss_clip": 1.04015613, - "balance_loss_mlp": 1.02131104, - "epoch": 0.43974147001352776, - "flos": 18952646958720.0, - "grad_norm": 1.7624997398560822, - "language_loss": 0.67982185, - "learning_rate": 2.4826008439486904e-06, - "loss": 0.70118284, - "num_input_tokens_seen": 156732615, - "step": 7314, - "time_per_iteration": 2.660019636154175 - }, - { - "auxiliary_loss_clip": 0.01107647, - "auxiliary_loss_mlp": 0.01029957, - "balance_loss_clip": 1.04436517, - "balance_loss_mlp": 1.01645088, - "epoch": 0.4398015932661957, - "flos": 18953508885120.0, - "grad_norm": 1.864599678602129, - "language_loss": 0.76799178, - "learning_rate": 2.4822228815307915e-06, - "loss": 0.78936785, - "num_input_tokens_seen": 156750920, - "step": 7315, - "time_per_iteration": 2.6958022117614746 - }, - { - "auxiliary_loss_clip": 0.01103713, - "auxiliary_loss_mlp": 0.01033338, - "balance_loss_clip": 1.04664755, - "balance_loss_mlp": 1.02002192, - "epoch": 0.4398617165188637, - "flos": 24199913370240.0, - "grad_norm": 2.581770130909348, - "language_loss": 0.74439812, - "learning_rate": 2.4818449008269523e-06, - "loss": 0.76576865, - "num_input_tokens_seen": 156768520, - "step": 7316, - "time_per_iteration": 2.7142746448516846 - }, - { - "auxiliary_loss_clip": 0.01091829, - "auxiliary_loss_mlp": 0.01038409, - "balance_loss_clip": 1.04720306, - "balance_loss_mlp": 1.02546883, - "epoch": 0.43992183977153165, - "flos": 22236677665920.0, - "grad_norm": 2.381148700310756, - "language_loss": 0.64676511, - "learning_rate": 2.481466901851506e-06, - "loss": 0.66806751, - "num_input_tokens_seen": 156788700, - "step": 7317, - "time_per_iteration": 2.6647984981536865 - }, - { - "auxiliary_loss_clip": 0.01100358, - "auxiliary_loss_mlp": 0.01036318, - "balance_loss_clip": 1.04315925, - "balance_loss_mlp": 1.02252579, - "epoch": 0.4399819630241996, - "flos": 18697465762560.0, - "grad_norm": 2.00656387252293, - "language_loss": 0.79769003, - "learning_rate": 2.4810888846187865e-06, - "loss": 0.81905675, - "num_input_tokens_seen": 156806470, - "step": 7318, - "time_per_iteration": 2.6569128036499023 - }, - { - "auxiliary_loss_clip": 0.01085209, - "auxiliary_loss_mlp": 0.0104302, - "balance_loss_clip": 1.03973842, - "balance_loss_mlp": 1.02808332, - "epoch": 0.4400420862768676, - "flos": 23879375377920.0, - "grad_norm": 1.4911827600564649, - "language_loss": 0.79173744, - "learning_rate": 2.4807108491431283e-06, - "loss": 0.81301975, - "num_input_tokens_seen": 156825895, - "step": 7319, - "time_per_iteration": 2.7476212978363037 - }, - { - "auxiliary_loss_clip": 0.01110516, - "auxiliary_loss_mlp": 0.01041368, - "balance_loss_clip": 1.0416882, - "balance_loss_mlp": 1.02647328, - "epoch": 0.44010220952953555, - "flos": 28037615293440.0, - "grad_norm": 1.9147413156076512, - "language_loss": 0.80129063, - "learning_rate": 2.4803327954388667e-06, - "loss": 0.82280946, - "num_input_tokens_seen": 156845990, - "step": 7320, - "time_per_iteration": 2.716813802719116 - }, - { - "auxiliary_loss_clip": 0.01088202, - "auxiliary_loss_mlp": 0.01041527, - "balance_loss_clip": 1.04271483, - "balance_loss_mlp": 1.02871788, - "epoch": 0.4401623327822035, - "flos": 23768985905280.0, - "grad_norm": 3.0980421986856777, - "language_loss": 0.69580001, - "learning_rate": 2.4799547235203376e-06, - "loss": 0.71709728, - "num_input_tokens_seen": 156866685, - "step": 7321, - "time_per_iteration": 2.753053903579712 - }, - { - "auxiliary_loss_clip": 0.01016924, - "auxiliary_loss_mlp": 0.01013574, - "balance_loss_clip": 1.02610326, - "balance_loss_mlp": 1.01153517, - "epoch": 0.4402224560348715, - "flos": 70774583264640.0, - "grad_norm": 0.8888992176827548, - "language_loss": 0.56922823, - "learning_rate": 2.4795766334018763e-06, - "loss": 0.58953327, - "num_input_tokens_seen": 156923450, - "step": 7322, - "time_per_iteration": 3.3513524532318115 - }, - { - "auxiliary_loss_clip": 0.01073209, - "auxiliary_loss_mlp": 0.01039777, - "balance_loss_clip": 1.03671217, - "balance_loss_mlp": 1.02677715, - "epoch": 0.44028257928753944, - "flos": 22891795868160.0, - "grad_norm": 1.5589182914821764, - "language_loss": 0.76272774, - "learning_rate": 2.479198525097822e-06, - "loss": 0.78385758, - "num_input_tokens_seen": 156944795, - "step": 7323, - "time_per_iteration": 2.7524306774139404 - }, - { - "auxiliary_loss_clip": 0.01119465, - "auxiliary_loss_mlp": 0.01043388, - "balance_loss_clip": 1.04591155, - "balance_loss_mlp": 1.0296607, - "epoch": 0.4403427025402074, - "flos": 17895760156800.0, - "grad_norm": 1.5124862196762965, - "language_loss": 0.80590653, - "learning_rate": 2.478820398622511e-06, - "loss": 0.82753503, - "num_input_tokens_seen": 156962755, - "step": 7324, - "time_per_iteration": 2.6558468341827393 - }, - { - "auxiliary_loss_clip": 0.01025531, - "auxiliary_loss_mlp": 0.0100492, - "balance_loss_clip": 1.02356136, - "balance_loss_mlp": 1.00322747, - "epoch": 0.4404028257928754, - "flos": 69562525708800.0, - "grad_norm": 0.6843753140185513, - "language_loss": 0.54592586, - "learning_rate": 2.478442253990283e-06, - "loss": 0.5662303, - "num_input_tokens_seen": 157028095, - "step": 7325, - "time_per_iteration": 3.228588819503784 - }, - { - "auxiliary_loss_clip": 0.01128033, - "auxiliary_loss_mlp": 0.01028317, - "balance_loss_clip": 1.04957604, - "balance_loss_mlp": 1.0163784, - "epoch": 0.44046294904554334, - "flos": 20923675914240.0, - "grad_norm": 1.4618535572581854, - "language_loss": 0.70052326, - "learning_rate": 2.4780640912154766e-06, - "loss": 0.72208667, - "num_input_tokens_seen": 157048365, - "step": 7326, - "time_per_iteration": 2.643843650817871 - }, - { - "auxiliary_loss_clip": 0.01081906, - "auxiliary_loss_mlp": 0.010325, - "balance_loss_clip": 1.03812075, - "balance_loss_mlp": 1.01949978, - "epoch": 0.44052307229821136, - "flos": 23623475909760.0, - "grad_norm": 1.533904509031544, - "language_loss": 0.76754719, - "learning_rate": 2.477685910312432e-06, - "loss": 0.78869128, - "num_input_tokens_seen": 157069130, - "step": 7327, - "time_per_iteration": 2.7409613132476807 - }, - { - "auxiliary_loss_clip": 0.01097799, - "auxiliary_loss_mlp": 0.01038346, - "balance_loss_clip": 1.04025364, - "balance_loss_mlp": 1.0256505, - "epoch": 0.4405831955508793, - "flos": 17597665186560.0, - "grad_norm": 1.9457575580966853, - "language_loss": 0.8413341, - "learning_rate": 2.4773077112954897e-06, - "loss": 0.86269557, - "num_input_tokens_seen": 157084940, - "step": 7328, - "time_per_iteration": 2.6578822135925293 - }, - { - "auxiliary_loss_clip": 0.01102477, - "auxiliary_loss_mlp": 0.01028668, - "balance_loss_clip": 1.04432774, - "balance_loss_mlp": 1.01576972, - "epoch": 0.4406433188035473, - "flos": 21463376739840.0, - "grad_norm": 2.377465022226765, - "language_loss": 0.77753079, - "learning_rate": 2.4769294941789908e-06, - "loss": 0.79884225, - "num_input_tokens_seen": 157102770, - "step": 7329, - "time_per_iteration": 2.6732001304626465 - }, - { - "auxiliary_loss_clip": 0.01114069, - "auxiliary_loss_mlp": 0.01039308, - "balance_loss_clip": 1.04399741, - "balance_loss_mlp": 1.02568269, - "epoch": 0.44070344205621526, - "flos": 22673566788480.0, - "grad_norm": 1.63533295854216, - "language_loss": 0.73525596, - "learning_rate": 2.476551258977278e-06, - "loss": 0.75678968, - "num_input_tokens_seen": 157122035, - "step": 7330, - "time_per_iteration": 2.6258528232574463 - }, - { - "auxiliary_loss_clip": 0.01104463, - "auxiliary_loss_mlp": 0.01039279, - "balance_loss_clip": 1.04494476, - "balance_loss_mlp": 1.02678585, - "epoch": 0.4407635653088832, - "flos": 23441193365760.0, - "grad_norm": 1.852759340776506, - "language_loss": 0.74862218, - "learning_rate": 2.4761730057046936e-06, - "loss": 0.77005959, - "num_input_tokens_seen": 157142800, - "step": 7331, - "time_per_iteration": 2.767972469329834 - }, - { - "auxiliary_loss_clip": 0.01075234, - "auxiliary_loss_mlp": 0.01034744, - "balance_loss_clip": 1.04043937, - "balance_loss_mlp": 1.02114189, - "epoch": 0.4408236885615512, - "flos": 24021294013440.0, - "grad_norm": 1.4106194210898035, - "language_loss": 0.76326358, - "learning_rate": 2.475794734375581e-06, - "loss": 0.78436339, - "num_input_tokens_seen": 157163295, - "step": 7332, - "time_per_iteration": 2.7810683250427246 - }, - { - "auxiliary_loss_clip": 0.01099425, - "auxiliary_loss_mlp": 0.01041411, - "balance_loss_clip": 1.04447377, - "balance_loss_mlp": 1.02958584, - "epoch": 0.44088381181421915, - "flos": 12676826597760.0, - "grad_norm": 1.919719554260373, - "language_loss": 0.73795688, - "learning_rate": 2.475416445004285e-06, - "loss": 0.75936526, - "num_input_tokens_seen": 157180890, - "step": 7333, - "time_per_iteration": 2.661736488342285 - }, - { - "auxiliary_loss_clip": 0.01086658, - "auxiliary_loss_mlp": 0.01034222, - "balance_loss_clip": 1.04458117, - "balance_loss_mlp": 1.02134728, - "epoch": 0.4409439350668871, - "flos": 24569865498240.0, - "grad_norm": 1.5776913121160454, - "language_loss": 0.79113179, - "learning_rate": 2.4750381376051493e-06, - "loss": 0.81234062, - "num_input_tokens_seen": 157200580, - "step": 7334, - "time_per_iteration": 2.8023018836975098 - }, - { - "auxiliary_loss_clip": 0.01102091, - "auxiliary_loss_mlp": 0.01039411, - "balance_loss_clip": 1.04475522, - "balance_loss_mlp": 1.02343714, - "epoch": 0.4410040583195551, - "flos": 22668574798080.0, - "grad_norm": 2.426268589885391, - "language_loss": 0.75184131, - "learning_rate": 2.47465981219252e-06, - "loss": 0.77325642, - "num_input_tokens_seen": 157218345, - "step": 7335, - "time_per_iteration": 2.7240371704101562 - }, - { - "auxiliary_loss_clip": 0.01101432, - "auxiliary_loss_mlp": 0.0103515, - "balance_loss_clip": 1.04350579, - "balance_loss_mlp": 1.02189362, - "epoch": 0.44106418157222305, - "flos": 10852528700160.0, - "grad_norm": 1.9825426915131346, - "language_loss": 0.72498572, - "learning_rate": 2.4742814687807423e-06, - "loss": 0.74635154, - "num_input_tokens_seen": 157234395, - "step": 7336, - "time_per_iteration": 2.6489880084991455 - }, - { - "auxiliary_loss_clip": 0.01118861, - "auxiliary_loss_mlp": 0.01040583, - "balance_loss_clip": 1.04398608, - "balance_loss_mlp": 1.02684367, - "epoch": 0.441124304824891, - "flos": 21726710323200.0, - "grad_norm": 2.2630715311051617, - "language_loss": 0.62847346, - "learning_rate": 2.473903107384165e-06, - "loss": 0.65006793, - "num_input_tokens_seen": 157254805, - "step": 7337, - "time_per_iteration": 2.632335901260376 - }, - { - "auxiliary_loss_clip": 0.01029242, - "auxiliary_loss_mlp": 0.00753616, - "balance_loss_clip": 1.0181427, - "balance_loss_mlp": 1.00070596, - "epoch": 0.441184428077559, - "flos": 63220486625280.0, - "grad_norm": 0.7364595311582042, - "language_loss": 0.52639711, - "learning_rate": 2.473524728017134e-06, - "loss": 0.54422569, - "num_input_tokens_seen": 157317870, - "step": 7338, - "time_per_iteration": 3.253746509552002 - }, - { - "auxiliary_loss_clip": 0.01106453, - "auxiliary_loss_mlp": 0.01046288, - "balance_loss_clip": 1.04105973, - "balance_loss_mlp": 1.03120804, - "epoch": 0.44124455133022694, - "flos": 21177959270400.0, - "grad_norm": 2.22639682548465, - "language_loss": 0.70776093, - "learning_rate": 2.473146330693997e-06, - "loss": 0.7292884, - "num_input_tokens_seen": 157336505, - "step": 7339, - "time_per_iteration": 2.655733823776245 - }, - { - "auxiliary_loss_clip": 0.01053755, - "auxiliary_loss_mlp": 0.01042988, - "balance_loss_clip": 1.03682137, - "balance_loss_mlp": 1.02918971, - "epoch": 0.4413046745828949, - "flos": 17457865453440.0, - "grad_norm": 1.5022359473102205, - "language_loss": 0.70075929, - "learning_rate": 2.472767915429105e-06, - "loss": 0.72172678, - "num_input_tokens_seen": 157354995, - "step": 7340, - "time_per_iteration": 2.767920970916748 - }, - { - "auxiliary_loss_clip": 0.01030747, - "auxiliary_loss_mlp": 0.01003789, - "balance_loss_clip": 1.02245617, - "balance_loss_mlp": 1.00190568, - "epoch": 0.4413647978355629, - "flos": 61586153804160.0, - "grad_norm": 0.8827965218567749, - "language_loss": 0.63983381, - "learning_rate": 2.4723894822368054e-06, - "loss": 0.66017926, - "num_input_tokens_seen": 157404260, - "step": 7341, - "time_per_iteration": 3.049508810043335 - }, - { - "auxiliary_loss_clip": 0.01091178, - "auxiliary_loss_mlp": 0.01040152, - "balance_loss_clip": 1.0418849, - "balance_loss_mlp": 1.02682424, - "epoch": 0.4414249210882309, - "flos": 27527001505920.0, - "grad_norm": 2.055823294856648, - "language_loss": 0.73636287, - "learning_rate": 2.47201103113145e-06, - "loss": 0.75767612, - "num_input_tokens_seen": 157423045, - "step": 7342, - "time_per_iteration": 2.795201063156128 - }, - { - "auxiliary_loss_clip": 0.01125069, - "auxiliary_loss_mlp": 0.01041127, - "balance_loss_clip": 1.04345822, - "balance_loss_mlp": 1.02709007, - "epoch": 0.44148504434089886, - "flos": 23513984277120.0, - "grad_norm": 2.2044048255358515, - "language_loss": 0.79979384, - "learning_rate": 2.4716325621273886e-06, - "loss": 0.82145584, - "num_input_tokens_seen": 157441815, - "step": 7343, - "time_per_iteration": 5.804108142852783 - }, - { - "auxiliary_loss_clip": 0.010937, - "auxiliary_loss_mlp": 0.01034348, - "balance_loss_clip": 1.04503846, - "balance_loss_mlp": 1.02072287, - "epoch": 0.4415451675935668, - "flos": 21580589796480.0, - "grad_norm": 2.707350721832692, - "language_loss": 0.76721787, - "learning_rate": 2.4712540752389725e-06, - "loss": 0.78849834, - "num_input_tokens_seen": 157460470, - "step": 7344, - "time_per_iteration": 2.7370471954345703 - }, - { - "auxiliary_loss_clip": 0.01038191, - "auxiliary_loss_mlp": 0.01020913, - "balance_loss_clip": 1.0274384, - "balance_loss_mlp": 1.01902914, - "epoch": 0.4416052908462348, - "flos": 59006368126080.0, - "grad_norm": 0.7980536604903562, - "language_loss": 0.63813043, - "learning_rate": 2.470875570480556e-06, - "loss": 0.65872145, - "num_input_tokens_seen": 157512655, - "step": 7345, - "time_per_iteration": 4.502060890197754 - }, - { - "auxiliary_loss_clip": 0.01130065, - "auxiliary_loss_mlp": 0.01040621, - "balance_loss_clip": 1.04656529, - "balance_loss_mlp": 1.02670372, - "epoch": 0.44166541409890275, - "flos": 26357642242560.0, - "grad_norm": 1.8234046338758734, - "language_loss": 0.86094856, - "learning_rate": 2.470497047866489e-06, - "loss": 0.88265538, - "num_input_tokens_seen": 157533700, - "step": 7346, - "time_per_iteration": 2.697648763656616 - }, - { - "auxiliary_loss_clip": 0.01119294, - "auxiliary_loss_mlp": 0.0104301, - "balance_loss_clip": 1.04583025, - "balance_loss_mlp": 1.02862179, - "epoch": 0.4417255373515707, - "flos": 20192678231040.0, - "grad_norm": 1.7966519054380148, - "language_loss": 0.80474353, - "learning_rate": 2.470118507411128e-06, - "loss": 0.8263666, - "num_input_tokens_seen": 157551105, - "step": 7347, - "time_per_iteration": 4.3498101234436035 - }, - { - "auxiliary_loss_clip": 0.01107859, - "auxiliary_loss_mlp": 0.01035246, - "balance_loss_clip": 1.04878783, - "balance_loss_mlp": 1.02088118, - "epoch": 0.4417856606042387, - "flos": 17887895078400.0, - "grad_norm": 1.7585337264872751, - "language_loss": 0.83156574, - "learning_rate": 2.4697399491288263e-06, - "loss": 0.85299683, - "num_input_tokens_seen": 157568285, - "step": 7348, - "time_per_iteration": 2.6866180896759033 - }, - { - "auxiliary_loss_clip": 0.01119234, - "auxiliary_loss_mlp": 0.01035311, - "balance_loss_clip": 1.04732084, - "balance_loss_mlp": 1.02139926, - "epoch": 0.44184578385690665, - "flos": 27964034282880.0, - "grad_norm": 2.0657656881846505, - "language_loss": 0.70507312, - "learning_rate": 2.469361373033938e-06, - "loss": 0.72661853, - "num_input_tokens_seen": 157590405, - "step": 7349, - "time_per_iteration": 2.7241854667663574 - }, - { - "auxiliary_loss_clip": 0.0109864, - "auxiliary_loss_mlp": 0.01033665, - "balance_loss_clip": 1.04184258, - "balance_loss_mlp": 1.01935983, - "epoch": 0.4419059071095746, - "flos": 23367899664000.0, - "grad_norm": 1.9069897602324009, - "language_loss": 0.74060279, - "learning_rate": 2.468982779140819e-06, - "loss": 0.76192582, - "num_input_tokens_seen": 157607420, - "step": 7350, - "time_per_iteration": 2.724295139312744 - }, - { - "auxiliary_loss_clip": 0.01129716, - "auxiliary_loss_mlp": 0.01036435, - "balance_loss_clip": 1.04692149, - "balance_loss_mlp": 1.02279782, - "epoch": 0.4419660303622426, - "flos": 15012169246080.0, - "grad_norm": 4.28906993354027, - "language_loss": 0.81133771, - "learning_rate": 2.468604167463827e-06, - "loss": 0.83299923, - "num_input_tokens_seen": 157624990, - "step": 7351, - "time_per_iteration": 2.6151175498962402 - }, - { - "auxiliary_loss_clip": 0.01077442, - "auxiliary_loss_mlp": 0.00770493, - "balance_loss_clip": 1.03664398, - "balance_loss_mlp": 1.00027013, - "epoch": 0.44202615361491054, - "flos": 25371750672000.0, - "grad_norm": 1.4842739809833707, - "language_loss": 0.72872806, - "learning_rate": 2.4682255380173176e-06, - "loss": 0.7472074, - "num_input_tokens_seen": 157645300, - "step": 7352, - "time_per_iteration": 2.822618007659912 - }, - { - "auxiliary_loss_clip": 0.01105652, - "auxiliary_loss_mlp": 0.01030051, - "balance_loss_clip": 1.05031562, - "balance_loss_mlp": 1.01625896, - "epoch": 0.4420862768675785, - "flos": 24681116897280.0, - "grad_norm": 2.2734813659209316, - "language_loss": 0.87014645, - "learning_rate": 2.467846890815649e-06, - "loss": 0.89150345, - "num_input_tokens_seen": 157664060, - "step": 7353, - "time_per_iteration": 2.8141496181488037 - }, - { - "auxiliary_loss_clip": 0.01131466, - "auxiliary_loss_mlp": 0.01036857, - "balance_loss_clip": 1.04851007, - "balance_loss_mlp": 1.02385104, - "epoch": 0.44214640012024653, - "flos": 19528437974400.0, - "grad_norm": 2.0005767830632464, - "language_loss": 0.75907683, - "learning_rate": 2.4674682258731795e-06, - "loss": 0.78076005, - "num_input_tokens_seen": 157680905, - "step": 7354, - "time_per_iteration": 2.6416475772857666 - }, - { - "auxiliary_loss_clip": 0.01087376, - "auxiliary_loss_mlp": 0.01035112, - "balance_loss_clip": 1.04345286, - "balance_loss_mlp": 1.02218962, - "epoch": 0.4422065233729145, - "flos": 47557434003840.0, - "grad_norm": 1.702490286843937, - "language_loss": 0.64954734, - "learning_rate": 2.467089543204268e-06, - "loss": 0.67077219, - "num_input_tokens_seen": 157701980, - "step": 7355, - "time_per_iteration": 2.9349570274353027 - }, - { - "auxiliary_loss_clip": 0.01133882, - "auxiliary_loss_mlp": 0.01035511, - "balance_loss_clip": 1.04775596, - "balance_loss_mlp": 1.02121234, - "epoch": 0.44226664662558246, - "flos": 19281050029440.0, - "grad_norm": 1.8300716428477437, - "language_loss": 0.78527248, - "learning_rate": 2.466710842823274e-06, - "loss": 0.80696642, - "num_input_tokens_seen": 157720555, - "step": 7356, - "time_per_iteration": 2.5932910442352295 - }, - { - "auxiliary_loss_clip": 0.01109756, - "auxiliary_loss_mlp": 0.00771729, - "balance_loss_clip": 1.04629183, - "balance_loss_mlp": 1.0004859, - "epoch": 0.4423267698782504, - "flos": 17821820010240.0, - "grad_norm": 1.6708598029973696, - "language_loss": 0.77472621, - "learning_rate": 2.4663321247445577e-06, - "loss": 0.79354107, - "num_input_tokens_seen": 157739160, - "step": 7357, - "time_per_iteration": 2.7050111293792725 - }, - { - "auxiliary_loss_clip": 0.01102733, - "auxiliary_loss_mlp": 0.01037231, - "balance_loss_clip": 1.04357672, - "balance_loss_mlp": 1.02280128, - "epoch": 0.4423868931309184, - "flos": 29204424691200.0, - "grad_norm": 1.492131344457668, - "language_loss": 0.73277801, - "learning_rate": 2.465953388982481e-06, - "loss": 0.75417769, - "num_input_tokens_seen": 157760020, - "step": 7358, - "time_per_iteration": 2.7339792251586914 - }, - { - "auxiliary_loss_clip": 0.01108517, - "auxiliary_loss_mlp": 0.01035507, - "balance_loss_clip": 1.04953265, - "balance_loss_mlp": 1.02198911, - "epoch": 0.44244701638358636, - "flos": 29713135057920.0, - "grad_norm": 1.890703165597896, - "language_loss": 0.75731266, - "learning_rate": 2.465574635551405e-06, - "loss": 0.77875292, - "num_input_tokens_seen": 157780435, - "step": 7359, - "time_per_iteration": 2.7597005367279053 - }, - { - "auxiliary_loss_clip": 0.01106411, - "auxiliary_loss_mlp": 0.01037766, - "balance_loss_clip": 1.04658461, - "balance_loss_mlp": 1.02315068, - "epoch": 0.4425071396362543, - "flos": 22930040874240.0, - "grad_norm": 1.6679218305876244, - "language_loss": 0.69988406, - "learning_rate": 2.4651958644656923e-06, - "loss": 0.72132587, - "num_input_tokens_seen": 157799420, - "step": 7360, - "time_per_iteration": 2.7118403911590576 - }, - { - "auxiliary_loss_clip": 0.01104133, - "auxiliary_loss_mlp": 0.01032941, - "balance_loss_clip": 1.04686546, - "balance_loss_mlp": 1.01859379, - "epoch": 0.4425672628889223, - "flos": 19792346175360.0, - "grad_norm": 3.404305353939149, - "language_loss": 0.69860107, - "learning_rate": 2.4648170757397053e-06, - "loss": 0.71997184, - "num_input_tokens_seen": 157817025, - "step": 7361, - "time_per_iteration": 2.672388792037964 - }, - { - "auxiliary_loss_clip": 0.01105237, - "auxiliary_loss_mlp": 0.01040581, - "balance_loss_clip": 1.04377937, - "balance_loss_mlp": 1.02539372, - "epoch": 0.44262738614159025, - "flos": 13662215377920.0, - "grad_norm": 2.0698565080434888, - "language_loss": 0.82494795, - "learning_rate": 2.464438269387809e-06, - "loss": 0.84640616, - "num_input_tokens_seen": 157834345, - "step": 7362, - "time_per_iteration": 2.6258609294891357 - }, - { - "auxiliary_loss_clip": 0.01102915, - "auxiliary_loss_mlp": 0.01040381, - "balance_loss_clip": 1.04801464, - "balance_loss_mlp": 1.02494311, - "epoch": 0.4426875093942582, - "flos": 14210212245120.0, - "grad_norm": 1.7089384580193987, - "language_loss": 0.74628377, - "learning_rate": 2.464059445424366e-06, - "loss": 0.76771677, - "num_input_tokens_seen": 157852290, - "step": 7363, - "time_per_iteration": 2.7868857383728027 - }, - { - "auxiliary_loss_clip": 0.01008645, - "auxiliary_loss_mlp": 0.01003596, - "balance_loss_clip": 1.02228582, - "balance_loss_mlp": 1.0016526, - "epoch": 0.4427476326469262, - "flos": 70117525728000.0, - "grad_norm": 0.6804595751696751, - "language_loss": 0.55677116, - "learning_rate": 2.463680603863743e-06, - "loss": 0.57689351, - "num_input_tokens_seen": 157923060, - "step": 7364, - "time_per_iteration": 3.3737823963165283 - }, - { - "auxiliary_loss_clip": 0.01109131, - "auxiliary_loss_mlp": 0.01040851, - "balance_loss_clip": 1.04670477, - "balance_loss_mlp": 1.02778566, - "epoch": 0.44280775589959415, - "flos": 25445080287360.0, - "grad_norm": 1.640155581598939, - "language_loss": 0.74618137, - "learning_rate": 2.463301744720305e-06, - "loss": 0.76768118, - "num_input_tokens_seen": 157944110, - "step": 7365, - "time_per_iteration": 2.789905071258545 - }, - { - "auxiliary_loss_clip": 0.01099825, - "auxiliary_loss_mlp": 0.01043396, - "balance_loss_clip": 1.04348397, - "balance_loss_mlp": 1.0287931, - "epoch": 0.4428678791522621, - "flos": 22857214049280.0, - "grad_norm": 1.5674103047703387, - "language_loss": 0.74297303, - "learning_rate": 2.4629228680084184e-06, - "loss": 0.76440525, - "num_input_tokens_seen": 157964295, - "step": 7366, - "time_per_iteration": 2.700286626815796 - }, - { - "auxiliary_loss_clip": 0.01108412, - "auxiliary_loss_mlp": 0.0103649, - "balance_loss_clip": 1.04708481, - "balance_loss_mlp": 1.02240598, - "epoch": 0.44292800240493013, - "flos": 25812446636160.0, - "grad_norm": 3.271133633367276, - "language_loss": 0.73245466, - "learning_rate": 2.46254397374245e-06, - "loss": 0.75390375, - "num_input_tokens_seen": 157983970, - "step": 7367, - "time_per_iteration": 2.6946957111358643 - }, - { - "auxiliary_loss_clip": 0.01130142, - "auxiliary_loss_mlp": 0.01040167, - "balance_loss_clip": 1.04803169, - "balance_loss_mlp": 1.02645779, - "epoch": 0.4429881256575981, - "flos": 32416885549440.0, - "grad_norm": 1.566124307945558, - "language_loss": 0.73996794, - "learning_rate": 2.4621650619367677e-06, - "loss": 0.76167101, - "num_input_tokens_seen": 158006515, - "step": 7368, - "time_per_iteration": 2.7544407844543457 - }, - { - "auxiliary_loss_clip": 0.01100906, - "auxiliary_loss_mlp": 0.01031006, - "balance_loss_clip": 1.04302347, - "balance_loss_mlp": 1.01735687, - "epoch": 0.44304824891026606, - "flos": 22163707186560.0, - "grad_norm": 2.0120848529023334, - "language_loss": 0.7961669, - "learning_rate": 2.4617861326057403e-06, - "loss": 0.81748605, - "num_input_tokens_seen": 158025565, - "step": 7369, - "time_per_iteration": 2.697190046310425 - }, - { - "auxiliary_loss_clip": 0.010901, - "auxiliary_loss_mlp": 0.01035588, - "balance_loss_clip": 1.04244113, - "balance_loss_mlp": 1.02251637, - "epoch": 0.443108372162934, - "flos": 25338569483520.0, - "grad_norm": 1.9393131166495303, - "language_loss": 0.72057104, - "learning_rate": 2.461407185763737e-06, - "loss": 0.74182796, - "num_input_tokens_seen": 158045620, - "step": 7370, - "time_per_iteration": 2.7959940433502197 - }, - { - "auxiliary_loss_clip": 0.01129082, - "auxiliary_loss_mlp": 0.0103749, - "balance_loss_clip": 1.04668999, - "balance_loss_mlp": 1.02349448, - "epoch": 0.443168495415602, - "flos": 23330947547520.0, - "grad_norm": 1.8535232870502223, - "language_loss": 0.70380038, - "learning_rate": 2.461028221425126e-06, - "loss": 0.72546607, - "num_input_tokens_seen": 158063505, - "step": 7371, - "time_per_iteration": 2.677718162536621 - }, - { - "auxiliary_loss_clip": 0.01119855, - "auxiliary_loss_mlp": 0.01031238, - "balance_loss_clip": 1.0492835, - "balance_loss_mlp": 1.01867962, - "epoch": 0.44322861866826996, - "flos": 21871502046720.0, - "grad_norm": 2.0883513439310577, - "language_loss": 0.68410224, - "learning_rate": 2.4606492396042786e-06, - "loss": 0.70561314, - "num_input_tokens_seen": 158080335, - "step": 7372, - "time_per_iteration": 2.6676101684570312 - }, - { - "auxiliary_loss_clip": 0.01096245, - "auxiliary_loss_mlp": 0.0103489, - "balance_loss_clip": 1.04236257, - "balance_loss_mlp": 1.0203104, - "epoch": 0.4432887419209379, - "flos": 20084407660800.0, - "grad_norm": 1.830573306058503, - "language_loss": 0.83560812, - "learning_rate": 2.4602702403155664e-06, - "loss": 0.85691947, - "num_input_tokens_seen": 158098955, - "step": 7373, - "time_per_iteration": 2.706554651260376 - }, - { - "auxiliary_loss_clip": 0.0103821, - "auxiliary_loss_mlp": 0.0100315, - "balance_loss_clip": 1.01858282, - "balance_loss_mlp": 1.00125432, - "epoch": 0.4433488651736059, - "flos": 70035540935040.0, - "grad_norm": 0.769882260063621, - "language_loss": 0.55201387, - "learning_rate": 2.4598912235733604e-06, - "loss": 0.57242751, - "num_input_tokens_seen": 158164110, - "step": 7374, - "time_per_iteration": 3.2373340129852295 - }, - { - "auxiliary_loss_clip": 0.01078736, - "auxiliary_loss_mlp": 0.01042384, - "balance_loss_clip": 1.04519641, - "balance_loss_mlp": 1.02773309, - "epoch": 0.44340898842627385, - "flos": 16282472705280.0, - "grad_norm": 2.3490774090653592, - "language_loss": 0.8289665, - "learning_rate": 2.4595121893920327e-06, - "loss": 0.85017765, - "num_input_tokens_seen": 158179850, - "step": 7375, - "time_per_iteration": 2.7468464374542236 - }, - { - "auxiliary_loss_clip": 0.01129641, - "auxiliary_loss_mlp": 0.01034202, - "balance_loss_clip": 1.04680073, - "balance_loss_mlp": 1.02032566, - "epoch": 0.4434691116789418, - "flos": 16611989097600.0, - "grad_norm": 1.9296092769688273, - "language_loss": 0.84076023, - "learning_rate": 2.4591331377859578e-06, - "loss": 0.86239868, - "num_input_tokens_seen": 158196590, - "step": 7376, - "time_per_iteration": 2.5597686767578125 - }, - { - "auxiliary_loss_clip": 0.01105366, - "auxiliary_loss_mlp": 0.01036479, - "balance_loss_clip": 1.04541779, - "balance_loss_mlp": 1.02299011, - "epoch": 0.4435292349316098, - "flos": 19063251912960.0, - "grad_norm": 1.7983383352892115, - "language_loss": 0.77172405, - "learning_rate": 2.4587540687695077e-06, - "loss": 0.79314244, - "num_input_tokens_seen": 158216355, - "step": 7377, - "time_per_iteration": 2.7065727710723877 - }, - { - "auxiliary_loss_clip": 0.01111732, - "auxiliary_loss_mlp": 0.01032577, - "balance_loss_clip": 1.04586828, - "balance_loss_mlp": 1.01916027, - "epoch": 0.44358935818427775, - "flos": 21251324799360.0, - "grad_norm": 2.2025516465061568, - "language_loss": 0.76422131, - "learning_rate": 2.458374982357057e-06, - "loss": 0.78566432, - "num_input_tokens_seen": 158235825, - "step": 7378, - "time_per_iteration": 2.6680550575256348 - }, - { - "auxiliary_loss_clip": 0.01104625, - "auxiliary_loss_mlp": 0.01055785, - "balance_loss_clip": 1.04471672, - "balance_loss_mlp": 1.0404191, - "epoch": 0.4436494814369457, - "flos": 12495298239360.0, - "grad_norm": 1.9484405267541265, - "language_loss": 0.69165838, - "learning_rate": 2.457995878562982e-06, - "loss": 0.7132625, - "num_input_tokens_seen": 158254230, - "step": 7379, - "time_per_iteration": 2.6700775623321533 - }, - { - "auxiliary_loss_clip": 0.01063579, - "auxiliary_loss_mlp": 0.01045674, - "balance_loss_clip": 1.03913927, - "balance_loss_mlp": 1.0297358, - "epoch": 0.44370960468961373, - "flos": 23659853408640.0, - "grad_norm": 2.073474855716146, - "language_loss": 0.7288872, - "learning_rate": 2.457616757401656e-06, - "loss": 0.74997967, - "num_input_tokens_seen": 158273400, - "step": 7380, - "time_per_iteration": 2.8017635345458984 - }, - { - "auxiliary_loss_clip": 0.01110205, - "auxiliary_loss_mlp": 0.01035159, - "balance_loss_clip": 1.04831696, - "balance_loss_mlp": 1.02124155, - "epoch": 0.4437697279422817, - "flos": 32416849635840.0, - "grad_norm": 1.6338701103198854, - "language_loss": 0.64961064, - "learning_rate": 2.457237618887458e-06, - "loss": 0.67106432, - "num_input_tokens_seen": 158296840, - "step": 7381, - "time_per_iteration": 2.791595458984375 - }, - { - "auxiliary_loss_clip": 0.01120176, - "auxiliary_loss_mlp": 0.0104083, - "balance_loss_clip": 1.04781485, - "balance_loss_mlp": 1.02696049, - "epoch": 0.44382985119494966, - "flos": 18112875914880.0, - "grad_norm": 5.151492667638541, - "language_loss": 0.80450714, - "learning_rate": 2.456858463034763e-06, - "loss": 0.82611728, - "num_input_tokens_seen": 158314935, - "step": 7382, - "time_per_iteration": 4.177164316177368 - }, - { - "auxiliary_loss_clip": 0.0112542, - "auxiliary_loss_mlp": 0.01039884, - "balance_loss_clip": 1.05130458, - "balance_loss_mlp": 1.02599657, - "epoch": 0.44388997444761763, - "flos": 30774151923840.0, - "grad_norm": 1.842434773727105, - "language_loss": 0.65955621, - "learning_rate": 2.456479289857949e-06, - "loss": 0.68120921, - "num_input_tokens_seen": 158334620, - "step": 7383, - "time_per_iteration": 4.142000436782837 - }, - { - "auxiliary_loss_clip": 0.01104406, - "auxiliary_loss_mlp": 0.01036969, - "balance_loss_clip": 1.04357898, - "balance_loss_mlp": 1.02228832, - "epoch": 0.4439500977002856, - "flos": 20339157893760.0, - "grad_norm": 2.431816949897044, - "language_loss": 0.76046586, - "learning_rate": 2.4561000993713953e-06, - "loss": 0.78187954, - "num_input_tokens_seen": 158350550, - "step": 7384, - "time_per_iteration": 4.309042453765869 - }, - { - "auxiliary_loss_clip": 0.01132692, - "auxiliary_loss_mlp": 0.01040021, - "balance_loss_clip": 1.04878867, - "balance_loss_mlp": 1.02595425, - "epoch": 0.44401022095295356, - "flos": 20371225760640.0, - "grad_norm": 1.6001418974541146, - "language_loss": 0.81145859, - "learning_rate": 2.4557208915894796e-06, - "loss": 0.83318579, - "num_input_tokens_seen": 158369555, - "step": 7385, - "time_per_iteration": 2.6569409370422363 - }, - { - "auxiliary_loss_clip": 0.01085589, - "auxiliary_loss_mlp": 0.01035837, - "balance_loss_clip": 1.04551208, - "balance_loss_mlp": 1.02062619, - "epoch": 0.4440703442056215, - "flos": 20230635928320.0, - "grad_norm": 1.8953258070837995, - "language_loss": 0.81531972, - "learning_rate": 2.455341666526582e-06, - "loss": 0.8365339, - "num_input_tokens_seen": 158388045, - "step": 7386, - "time_per_iteration": 2.757857084274292 - }, - { - "auxiliary_loss_clip": 0.01092623, - "auxiliary_loss_mlp": 0.01033697, - "balance_loss_clip": 1.04583073, - "balance_loss_mlp": 1.01829553, - "epoch": 0.4441304674582895, - "flos": 39494698824960.0, - "grad_norm": 2.1898431457791827, - "language_loss": 0.70026255, - "learning_rate": 2.4549624241970832e-06, - "loss": 0.72152579, - "num_input_tokens_seen": 158410115, - "step": 7387, - "time_per_iteration": 4.4056620597839355 - }, - { - "auxiliary_loss_clip": 0.01064296, - "auxiliary_loss_mlp": 0.01040123, - "balance_loss_clip": 1.04571772, - "balance_loss_mlp": 1.02586579, - "epoch": 0.44419059071095746, - "flos": 14829671220480.0, - "grad_norm": 1.9497255625781733, - "language_loss": 0.71838999, - "learning_rate": 2.4545831646153628e-06, - "loss": 0.73943412, - "num_input_tokens_seen": 158427765, - "step": 7388, - "time_per_iteration": 2.7504312992095947 - }, - { - "auxiliary_loss_clip": 0.01120562, - "auxiliary_loss_mlp": 0.01036757, - "balance_loss_clip": 1.04769969, - "balance_loss_mlp": 1.02277958, - "epoch": 0.4442507139636254, - "flos": 22637835734400.0, - "grad_norm": 1.8353800507100826, - "language_loss": 0.6930418, - "learning_rate": 2.4542038877958044e-06, - "loss": 0.71461499, - "num_input_tokens_seen": 158446375, - "step": 7389, - "time_per_iteration": 2.620847702026367 - }, - { - "auxiliary_loss_clip": 0.01119935, - "auxiliary_loss_mlp": 0.01035558, - "balance_loss_clip": 1.04713047, - "balance_loss_mlp": 1.02149689, - "epoch": 0.4443108372162934, - "flos": 38290721829120.0, - "grad_norm": 1.8033342781314554, - "language_loss": 0.75145507, - "learning_rate": 2.453824593752788e-06, - "loss": 0.77301002, - "num_input_tokens_seen": 158467260, - "step": 7390, - "time_per_iteration": 2.794739246368408 - }, - { - "auxiliary_loss_clip": 0.01112569, - "auxiliary_loss_mlp": 0.0104339, - "balance_loss_clip": 1.04474115, - "balance_loss_mlp": 1.0285244, - "epoch": 0.44437096046896135, - "flos": 17748993185280.0, - "grad_norm": 2.757944013002859, - "language_loss": 0.8139115, - "learning_rate": 2.4534452825006988e-06, - "loss": 0.83547109, - "num_input_tokens_seen": 158486720, - "step": 7391, - "time_per_iteration": 2.62081241607666 - }, - { - "auxiliary_loss_clip": 0.01100157, - "auxiliary_loss_mlp": 0.01039531, - "balance_loss_clip": 1.04489446, - "balance_loss_mlp": 1.02436733, - "epoch": 0.4444310837216293, - "flos": 13732348682880.0, - "grad_norm": 1.7057692393428199, - "language_loss": 0.73885345, - "learning_rate": 2.4530659540539185e-06, - "loss": 0.76025033, - "num_input_tokens_seen": 158502530, - "step": 7392, - "time_per_iteration": 2.619123935699463 - }, - { - "auxiliary_loss_clip": 0.01116796, - "auxiliary_loss_mlp": 0.01032995, - "balance_loss_clip": 1.04451931, - "balance_loss_mlp": 1.01976895, - "epoch": 0.44449120697429734, - "flos": 25010238240000.0, - "grad_norm": 1.6244243517648933, - "language_loss": 0.79316819, - "learning_rate": 2.4526866084268313e-06, - "loss": 0.81466603, - "num_input_tokens_seen": 158522715, - "step": 7393, - "time_per_iteration": 2.761636257171631 - }, - { - "auxiliary_loss_clip": 0.01123845, - "auxiliary_loss_mlp": 0.01034263, - "balance_loss_clip": 1.04784608, - "balance_loss_mlp": 1.02036357, - "epoch": 0.4445513302269653, - "flos": 32671707609600.0, - "grad_norm": 1.7936817608261026, - "language_loss": 0.80767369, - "learning_rate": 2.4523072456338226e-06, - "loss": 0.82925481, - "num_input_tokens_seen": 158543615, - "step": 7394, - "time_per_iteration": 2.731896162033081 - }, - { - "auxiliary_loss_clip": 0.01101431, - "auxiliary_loss_mlp": 0.010406, - "balance_loss_clip": 1.04235363, - "balance_loss_mlp": 1.02805972, - "epoch": 0.44461145347963327, - "flos": 11655814504320.0, - "grad_norm": 2.5483522979722886, - "language_loss": 0.79701138, - "learning_rate": 2.4519278656892785e-06, - "loss": 0.81843174, - "num_input_tokens_seen": 158560330, - "step": 7395, - "time_per_iteration": 2.6799733638763428 - }, - { - "auxiliary_loss_clip": 0.0110231, - "auxiliary_loss_mlp": 0.01040027, - "balance_loss_clip": 1.04210639, - "balance_loss_mlp": 1.02630031, - "epoch": 0.44467157673230123, - "flos": 20886759711360.0, - "grad_norm": 1.725775342310971, - "language_loss": 0.68280721, - "learning_rate": 2.451548468607584e-06, - "loss": 0.70423067, - "num_input_tokens_seen": 158579735, - "step": 7396, - "time_per_iteration": 2.7539262771606445 - }, - { - "auxiliary_loss_clip": 0.01115853, - "auxiliary_loss_mlp": 0.00771942, - "balance_loss_clip": 1.04396296, - "balance_loss_mlp": 1.00035286, - "epoch": 0.4447316999849692, - "flos": 18546137763840.0, - "grad_norm": 1.749232481773879, - "language_loss": 0.80780083, - "learning_rate": 2.451169054403126e-06, - "loss": 0.82667875, - "num_input_tokens_seen": 158597075, - "step": 7397, - "time_per_iteration": 2.6620333194732666 - }, - { - "auxiliary_loss_clip": 0.01119828, - "auxiliary_loss_mlp": 0.01038203, - "balance_loss_clip": 1.04740441, - "balance_loss_mlp": 1.02525663, - "epoch": 0.44479182323763716, - "flos": 23769057732480.0, - "grad_norm": 1.6626939297991263, - "language_loss": 0.67383635, - "learning_rate": 2.450789623090293e-06, - "loss": 0.69541669, - "num_input_tokens_seen": 158616650, - "step": 7398, - "time_per_iteration": 2.671193838119507 - }, - { - "auxiliary_loss_clip": 0.01097104, - "auxiliary_loss_mlp": 0.01040281, - "balance_loss_clip": 1.04477727, - "balance_loss_mlp": 1.0271976, - "epoch": 0.44485194649030513, - "flos": 16543831040640.0, - "grad_norm": 1.7055478439146432, - "language_loss": 0.69250667, - "learning_rate": 2.450410174683472e-06, - "loss": 0.71388054, - "num_input_tokens_seen": 158634515, - "step": 7399, - "time_per_iteration": 2.6823384761810303 - }, - { - "auxiliary_loss_clip": 0.01097596, - "auxiliary_loss_mlp": 0.01035766, - "balance_loss_clip": 1.04475021, - "balance_loss_mlp": 1.0225575, - "epoch": 0.4449120697429731, - "flos": 22600955445120.0, - "grad_norm": 1.8287170900617375, - "language_loss": 0.72332168, - "learning_rate": 2.4500307091970514e-06, - "loss": 0.74465525, - "num_input_tokens_seen": 158653760, - "step": 7400, - "time_per_iteration": 2.7227253913879395 - }, - { - "auxiliary_loss_clip": 0.01076093, - "auxiliary_loss_mlp": 0.00770024, - "balance_loss_clip": 1.04184151, - "balance_loss_mlp": 1.00039887, - "epoch": 0.44497219299564106, - "flos": 20004864992640.0, - "grad_norm": 1.6814996958378423, - "language_loss": 0.85252142, - "learning_rate": 2.449651226645422e-06, - "loss": 0.87098259, - "num_input_tokens_seen": 158672190, - "step": 7401, - "time_per_iteration": 2.757293701171875 - }, - { - "auxiliary_loss_clip": 0.01102171, - "auxiliary_loss_mlp": 0.0103733, - "balance_loss_clip": 1.04564703, - "balance_loss_mlp": 1.02497375, - "epoch": 0.445032316248309, - "flos": 25594253470080.0, - "grad_norm": 1.6805452055908299, - "language_loss": 0.83201802, - "learning_rate": 2.449271727042973e-06, - "loss": 0.85341299, - "num_input_tokens_seen": 158694115, - "step": 7402, - "time_per_iteration": 2.7132928371429443 - }, - { - "auxiliary_loss_clip": 0.01107267, - "auxiliary_loss_mlp": 0.01032822, - "balance_loss_clip": 1.0461576, - "balance_loss_mlp": 1.0188688, - "epoch": 0.445092439500977, - "flos": 21250426959360.0, - "grad_norm": 1.9019306445781163, - "language_loss": 0.7714172, - "learning_rate": 2.4488922104040947e-06, - "loss": 0.79281807, - "num_input_tokens_seen": 158711000, - "step": 7403, - "time_per_iteration": 2.6282217502593994 - }, - { - "auxiliary_loss_clip": 0.01023728, - "auxiliary_loss_mlp": 0.01005808, - "balance_loss_clip": 1.0202831, - "balance_loss_mlp": 1.00413918, - "epoch": 0.44515256275364495, - "flos": 57764900309760.0, - "grad_norm": 0.7456605721636542, - "language_loss": 0.59988129, - "learning_rate": 2.4485126767431793e-06, - "loss": 0.62017667, - "num_input_tokens_seen": 158769675, - "step": 7404, - "time_per_iteration": 3.173560619354248 - }, - { - "auxiliary_loss_clip": 0.01105136, - "auxiliary_loss_mlp": 0.01044638, - "balance_loss_clip": 1.04419279, - "balance_loss_mlp": 1.02934957, - "epoch": 0.4452126860063129, - "flos": 15596004908160.0, - "grad_norm": 1.6768296122026118, - "language_loss": 0.82246673, - "learning_rate": 2.4481331260746177e-06, - "loss": 0.8439644, - "num_input_tokens_seen": 158788215, - "step": 7405, - "time_per_iteration": 2.6669278144836426 - }, - { - "auxiliary_loss_clip": 0.01104648, - "auxiliary_loss_mlp": 0.01029929, - "balance_loss_clip": 1.04628932, - "balance_loss_mlp": 1.01669657, - "epoch": 0.4452728092589809, - "flos": 21617398258560.0, - "grad_norm": 4.56209401129754, - "language_loss": 0.75126898, - "learning_rate": 2.4477535584128036e-06, - "loss": 0.77261472, - "num_input_tokens_seen": 158809090, - "step": 7406, - "time_per_iteration": 2.6722404956817627 - }, - { - "auxiliary_loss_clip": 0.01091029, - "auxiliary_loss_mlp": 0.01030298, - "balance_loss_clip": 1.0434047, - "balance_loss_mlp": 1.01746488, - "epoch": 0.4453329325116489, - "flos": 29497491757440.0, - "grad_norm": 1.6633570284980403, - "language_loss": 0.6572476, - "learning_rate": 2.447373973772129e-06, - "loss": 0.67846084, - "num_input_tokens_seen": 158828320, - "step": 7407, - "time_per_iteration": 2.819289207458496 - }, - { - "auxiliary_loss_clip": 0.01102137, - "auxiliary_loss_mlp": 0.01031486, - "balance_loss_clip": 1.04499328, - "balance_loss_mlp": 1.0179081, - "epoch": 0.44539305576431687, - "flos": 21361139654400.0, - "grad_norm": 1.6186505097592758, - "language_loss": 0.67861688, - "learning_rate": 2.4469943721669887e-06, - "loss": 0.69995308, - "num_input_tokens_seen": 158847040, - "step": 7408, - "time_per_iteration": 2.6846649646759033 - }, - { - "auxiliary_loss_clip": 0.01128678, - "auxiliary_loss_mlp": 0.01035504, - "balance_loss_clip": 1.04559541, - "balance_loss_mlp": 1.02121043, - "epoch": 0.44545317901698483, - "flos": 41427626428800.0, - "grad_norm": 1.4740715510068387, - "language_loss": 0.72127414, - "learning_rate": 2.4466147536117776e-06, - "loss": 0.74291599, - "num_input_tokens_seen": 158870490, - "step": 7409, - "time_per_iteration": 2.7701869010925293 - }, - { - "auxiliary_loss_clip": 0.01107577, - "auxiliary_loss_mlp": 0.010375, - "balance_loss_clip": 1.04669523, - "balance_loss_mlp": 1.02308798, - "epoch": 0.4455133022696528, - "flos": 22055005653120.0, - "grad_norm": 1.9118661854704846, - "language_loss": 0.65146017, - "learning_rate": 2.4462351181208895e-06, - "loss": 0.67291093, - "num_input_tokens_seen": 158889920, - "step": 7410, - "time_per_iteration": 2.780905246734619 - }, - { - "auxiliary_loss_clip": 0.01104956, - "auxiliary_loss_mlp": 0.01038448, - "balance_loss_clip": 1.04414868, - "balance_loss_mlp": 1.02369618, - "epoch": 0.44557342552232077, - "flos": 23476960333440.0, - "grad_norm": 2.076728084707015, - "language_loss": 0.73772335, - "learning_rate": 2.4458554657087217e-06, - "loss": 0.75915742, - "num_input_tokens_seen": 158909580, - "step": 7411, - "time_per_iteration": 2.745547294616699 - }, - { - "auxiliary_loss_clip": 0.01061885, - "auxiliary_loss_mlp": 0.01033363, - "balance_loss_clip": 1.04457641, - "balance_loss_mlp": 1.01967764, - "epoch": 0.44563354877498873, - "flos": 19134678107520.0, - "grad_norm": 1.7330985507109689, - "language_loss": 0.79373199, - "learning_rate": 2.4454757963896695e-06, - "loss": 0.81468445, - "num_input_tokens_seen": 158924600, - "step": 7412, - "time_per_iteration": 2.76361346244812 - }, - { - "auxiliary_loss_clip": 0.01108589, - "auxiliary_loss_mlp": 0.01037974, - "balance_loss_clip": 1.04357016, - "balance_loss_mlp": 1.02453899, - "epoch": 0.4456936720276567, - "flos": 13621420506240.0, - "grad_norm": 1.9356381581130233, - "language_loss": 0.80161285, - "learning_rate": 2.4450961101781304e-06, - "loss": 0.82307845, - "num_input_tokens_seen": 158939345, - "step": 7413, - "time_per_iteration": 2.619915008544922 - }, - { - "auxiliary_loss_clip": 0.01113419, - "auxiliary_loss_mlp": 0.0103316, - "balance_loss_clip": 1.0433104, - "balance_loss_mlp": 1.01962328, - "epoch": 0.44575379528032466, - "flos": 14713715139840.0, - "grad_norm": 1.9889124982728665, - "language_loss": 0.76648301, - "learning_rate": 2.4447164070885026e-06, - "loss": 0.78794879, - "num_input_tokens_seen": 158955855, - "step": 7414, - "time_per_iteration": 2.5959794521331787 - }, - { - "auxiliary_loss_clip": 0.01096052, - "auxiliary_loss_mlp": 0.01040946, - "balance_loss_clip": 1.0415467, - "balance_loss_mlp": 1.02701616, - "epoch": 0.4458139185329926, - "flos": 24170682677760.0, - "grad_norm": 1.6599120729875612, - "language_loss": 0.83765483, - "learning_rate": 2.4443366871351837e-06, - "loss": 0.85902476, - "num_input_tokens_seen": 158976315, - "step": 7415, - "time_per_iteration": 2.785512685775757 - }, - { - "auxiliary_loss_clip": 0.01124247, - "auxiliary_loss_mlp": 0.01043831, - "balance_loss_clip": 1.04321933, - "balance_loss_mlp": 1.03093266, - "epoch": 0.4458740417856606, - "flos": 21762225895680.0, - "grad_norm": 2.1888037109264933, - "language_loss": 0.84245199, - "learning_rate": 2.4439569503325732e-06, - "loss": 0.86413276, - "num_input_tokens_seen": 158996725, - "step": 7416, - "time_per_iteration": 2.60307240486145 - }, - { - "auxiliary_loss_clip": 0.01095417, - "auxiliary_loss_mlp": 0.01034003, - "balance_loss_clip": 1.04398692, - "balance_loss_mlp": 1.01991272, - "epoch": 0.44593416503832856, - "flos": 21068790860160.0, - "grad_norm": 1.494230693182331, - "language_loss": 0.81091261, - "learning_rate": 2.4435771966950706e-06, - "loss": 0.83220685, - "num_input_tokens_seen": 159017255, - "step": 7417, - "time_per_iteration": 2.7423362731933594 - }, - { - "auxiliary_loss_clip": 0.01105133, - "auxiliary_loss_mlp": 0.01040379, - "balance_loss_clip": 1.04227042, - "balance_loss_mlp": 1.02650881, - "epoch": 0.4459942882909965, - "flos": 22600488568320.0, - "grad_norm": 2.47121292521638, - "language_loss": 0.81035185, - "learning_rate": 2.443197426237077e-06, - "loss": 0.8318069, - "num_input_tokens_seen": 159035010, - "step": 7418, - "time_per_iteration": 2.67476487159729 - }, - { - "auxiliary_loss_clip": 0.01120234, - "auxiliary_loss_mlp": 0.007712, - "balance_loss_clip": 1.04618478, - "balance_loss_mlp": 1.00049162, - "epoch": 0.4460544115436645, - "flos": 26505486622080.0, - "grad_norm": 2.084312717643635, - "language_loss": 0.77342117, - "learning_rate": 2.442817638972991e-06, - "loss": 0.79233551, - "num_input_tokens_seen": 159055345, - "step": 7419, - "time_per_iteration": 2.760847806930542 - }, - { - "auxiliary_loss_clip": 0.0108993, - "auxiliary_loss_mlp": 0.0103388, - "balance_loss_clip": 1.03954124, - "balance_loss_mlp": 1.02063632, - "epoch": 0.4461145347963325, - "flos": 17604021893760.0, - "grad_norm": 1.824664612180611, - "language_loss": 0.72570968, - "learning_rate": 2.4424378349172176e-06, - "loss": 0.74694777, - "num_input_tokens_seen": 159074225, - "step": 7420, - "time_per_iteration": 2.6990244388580322 - }, - { - "auxiliary_loss_clip": 0.01104512, - "auxiliary_loss_mlp": 0.01032052, - "balance_loss_clip": 1.0432508, - "balance_loss_mlp": 1.01793802, - "epoch": 0.44617465804900047, - "flos": 27268193036160.0, - "grad_norm": 1.5590654083825235, - "language_loss": 0.75280499, - "learning_rate": 2.442058014084156e-06, - "loss": 0.77417064, - "num_input_tokens_seen": 159095415, - "step": 7421, - "time_per_iteration": 2.751757860183716 - }, - { - "auxiliary_loss_clip": 0.01059239, - "auxiliary_loss_mlp": 0.01037453, - "balance_loss_clip": 1.03808808, - "balance_loss_mlp": 1.02374959, - "epoch": 0.44623478130166844, - "flos": 17786412178560.0, - "grad_norm": 1.7359325284030627, - "language_loss": 0.75753498, - "learning_rate": 2.44167817648821e-06, - "loss": 0.77850193, - "num_input_tokens_seen": 159114615, - "step": 7422, - "time_per_iteration": 4.3189520835876465 - }, - { - "auxiliary_loss_clip": 0.01125756, - "auxiliary_loss_mlp": 0.01033879, - "balance_loss_clip": 1.04443765, - "balance_loss_mlp": 1.02083755, - "epoch": 0.4462949045543364, - "flos": 23003011353600.0, - "grad_norm": 1.436007196155178, - "language_loss": 0.65393054, - "learning_rate": 2.441298322143784e-06, - "loss": 0.67552686, - "num_input_tokens_seen": 159134370, - "step": 7423, - "time_per_iteration": 4.272382020950317 - }, - { - "auxiliary_loss_clip": 0.01096555, - "auxiliary_loss_mlp": 0.01034422, - "balance_loss_clip": 1.04093194, - "balance_loss_mlp": 1.02195287, - "epoch": 0.44635502780700437, - "flos": 17820096157440.0, - "grad_norm": 1.6490570846190094, - "language_loss": 0.79002917, - "learning_rate": 2.4409184510652807e-06, - "loss": 0.8113389, - "num_input_tokens_seen": 159152540, - "step": 7424, - "time_per_iteration": 2.6641786098480225 - }, - { - "auxiliary_loss_clip": 0.01109138, - "auxiliary_loss_mlp": 0.01031872, - "balance_loss_clip": 1.04272473, - "balance_loss_mlp": 1.01960564, - "epoch": 0.44641515105967233, - "flos": 26688020561280.0, - "grad_norm": 1.5476168372337398, - "language_loss": 0.80515361, - "learning_rate": 2.4405385632671063e-06, - "loss": 0.82656378, - "num_input_tokens_seen": 159173425, - "step": 7425, - "time_per_iteration": 2.677921772003174 - }, - { - "auxiliary_loss_clip": 0.01111593, - "auxiliary_loss_mlp": 0.01030626, - "balance_loss_clip": 1.04249597, - "balance_loss_mlp": 1.01805556, - "epoch": 0.4464752743123403, - "flos": 18913324544640.0, - "grad_norm": 1.7505920916906397, - "language_loss": 0.77314126, - "learning_rate": 2.4401586587636655e-06, - "loss": 0.79456341, - "num_input_tokens_seen": 159191210, - "step": 7426, - "time_per_iteration": 4.264745712280273 - }, - { - "auxiliary_loss_clip": 0.01098153, - "auxiliary_loss_mlp": 0.00770786, - "balance_loss_clip": 1.04180968, - "balance_loss_mlp": 1.00042045, - "epoch": 0.44653539756500826, - "flos": 29570318582400.0, - "grad_norm": 2.512425150903693, - "language_loss": 0.64678168, - "learning_rate": 2.4397787375693634e-06, - "loss": 0.66547108, - "num_input_tokens_seen": 159211755, - "step": 7427, - "time_per_iteration": 2.746807336807251 - }, - { - "auxiliary_loss_clip": 0.01114285, - "auxiliary_loss_mlp": 0.01032662, - "balance_loss_clip": 1.04756093, - "balance_loss_mlp": 1.01968026, - "epoch": 0.44659552081767623, - "flos": 21468979261440.0, - "grad_norm": 1.6794687580888963, - "language_loss": 0.7564522, - "learning_rate": 2.439398799698608e-06, - "loss": 0.77792168, - "num_input_tokens_seen": 159230315, - "step": 7428, - "time_per_iteration": 2.675830364227295 - }, - { - "auxiliary_loss_clip": 0.01089417, - "auxiliary_loss_mlp": 0.0103803, - "balance_loss_clip": 1.03992331, - "balance_loss_mlp": 1.0244813, - "epoch": 0.4466556440703442, - "flos": 17931886260480.0, - "grad_norm": 2.160723316992149, - "language_loss": 0.77906388, - "learning_rate": 2.439018845165806e-06, - "loss": 0.80033839, - "num_input_tokens_seen": 159249810, - "step": 7429, - "time_per_iteration": 2.6864819526672363 - }, - { - "auxiliary_loss_clip": 0.01117759, - "auxiliary_loss_mlp": 0.01036133, - "balance_loss_clip": 1.04573584, - "balance_loss_mlp": 1.02222157, - "epoch": 0.44671576732301216, - "flos": 21107430915840.0, - "grad_norm": 1.6783165407459442, - "language_loss": 0.91421354, - "learning_rate": 2.438638873985366e-06, - "loss": 0.93575251, - "num_input_tokens_seen": 159271715, - "step": 7430, - "time_per_iteration": 2.6472880840301514 - }, - { - "auxiliary_loss_clip": 0.01105427, - "auxiliary_loss_mlp": 0.00772764, - "balance_loss_clip": 1.04418826, - "balance_loss_mlp": 1.000386, - "epoch": 0.4467758905756801, - "flos": 23508920459520.0, - "grad_norm": 1.918378394995702, - "language_loss": 0.79452366, - "learning_rate": 2.4382588861716954e-06, - "loss": 0.8133055, - "num_input_tokens_seen": 159290690, - "step": 7431, - "time_per_iteration": 2.7096598148345947 - }, - { - "auxiliary_loss_clip": 0.01108777, - "auxiliary_loss_mlp": 0.01036954, - "balance_loss_clip": 1.04568875, - "balance_loss_mlp": 1.02245188, - "epoch": 0.4468360138283481, - "flos": 18734022829440.0, - "grad_norm": 1.6794320575098944, - "language_loss": 0.79817986, - "learning_rate": 2.437878881739204e-06, - "loss": 0.81963724, - "num_input_tokens_seen": 159309400, - "step": 7432, - "time_per_iteration": 2.676522970199585 - }, - { - "auxiliary_loss_clip": 0.01094927, - "auxiliary_loss_mlp": 0.01040483, - "balance_loss_clip": 1.04654121, - "balance_loss_mlp": 1.02803755, - "epoch": 0.4468961370810161, - "flos": 23477139901440.0, - "grad_norm": 1.8261946877850768, - "language_loss": 0.76878047, - "learning_rate": 2.437498860702301e-06, - "loss": 0.79013455, - "num_input_tokens_seen": 159327425, - "step": 7433, - "time_per_iteration": 2.6820082664489746 - }, - { - "auxiliary_loss_clip": 0.01106089, - "auxiliary_loss_mlp": 0.01034932, - "balance_loss_clip": 1.04236984, - "balance_loss_mlp": 1.02372587, - "epoch": 0.4469562603336841, - "flos": 30075042539520.0, - "grad_norm": 1.6244691365264956, - "language_loss": 0.77377415, - "learning_rate": 2.437118823075398e-06, - "loss": 0.79518431, - "num_input_tokens_seen": 159345805, - "step": 7434, - "time_per_iteration": 2.7471024990081787 - }, - { - "auxiliary_loss_clip": 0.01118898, - "auxiliary_loss_mlp": 0.01031979, - "balance_loss_clip": 1.04707336, - "balance_loss_mlp": 1.01909828, - "epoch": 0.44701638358635204, - "flos": 22456415116800.0, - "grad_norm": 1.6740796261727897, - "language_loss": 0.64705265, - "learning_rate": 2.436738768872905e-06, - "loss": 0.6685614, - "num_input_tokens_seen": 159364595, - "step": 7435, - "time_per_iteration": 2.649425983428955 - }, - { - "auxiliary_loss_clip": 0.01112389, - "auxiliary_loss_mlp": 0.01029389, - "balance_loss_clip": 1.04875195, - "balance_loss_mlp": 1.01587653, - "epoch": 0.44707650683902, - "flos": 24057851080320.0, - "grad_norm": 1.6005542791240868, - "language_loss": 0.83477545, - "learning_rate": 2.4363586981092346e-06, - "loss": 0.85619318, - "num_input_tokens_seen": 159385265, - "step": 7436, - "time_per_iteration": 2.6727020740509033 - }, - { - "auxiliary_loss_clip": 0.01073439, - "auxiliary_loss_mlp": 0.01045352, - "balance_loss_clip": 1.0402267, - "balance_loss_mlp": 1.02884197, - "epoch": 0.44713663009168797, - "flos": 23766938830080.0, - "grad_norm": 2.1717582772549995, - "language_loss": 0.79815632, - "learning_rate": 2.435978610798798e-06, - "loss": 0.81934428, - "num_input_tokens_seen": 159405080, - "step": 7437, - "time_per_iteration": 2.7589898109436035 - }, - { - "auxiliary_loss_clip": 0.01079969, - "auxiliary_loss_mlp": 0.01037183, - "balance_loss_clip": 1.0433023, - "balance_loss_mlp": 1.02375364, - "epoch": 0.44719675334435594, - "flos": 24499265316480.0, - "grad_norm": 1.7231807337022225, - "language_loss": 0.71860999, - "learning_rate": 2.435598506956009e-06, - "loss": 0.7397815, - "num_input_tokens_seen": 159424595, - "step": 7438, - "time_per_iteration": 2.794978380203247 - }, - { - "auxiliary_loss_clip": 0.01084835, - "auxiliary_loss_mlp": 0.01035733, - "balance_loss_clip": 1.04564655, - "balance_loss_mlp": 1.02180314, - "epoch": 0.4472568765970239, - "flos": 29781759991680.0, - "grad_norm": 1.556366888574876, - "language_loss": 0.67619812, - "learning_rate": 2.4352183865952808e-06, - "loss": 0.69740379, - "num_input_tokens_seen": 159443865, - "step": 7439, - "time_per_iteration": 2.9251644611358643 - }, - { - "auxiliary_loss_clip": 0.01102346, - "auxiliary_loss_mlp": 0.01039634, - "balance_loss_clip": 1.0403614, - "balance_loss_mlp": 1.02436376, - "epoch": 0.44731699984969187, - "flos": 24643123286400.0, - "grad_norm": 1.714649831944237, - "language_loss": 0.73915118, - "learning_rate": 2.4348382497310285e-06, - "loss": 0.760571, - "num_input_tokens_seen": 159464525, - "step": 7440, - "time_per_iteration": 2.773106813430786 - }, - { - "auxiliary_loss_clip": 0.01072825, - "auxiliary_loss_mlp": 0.01042282, - "balance_loss_clip": 1.03706956, - "balance_loss_mlp": 1.02789354, - "epoch": 0.44737712310235983, - "flos": 29455691304960.0, - "grad_norm": 1.740924989183362, - "language_loss": 0.74161476, - "learning_rate": 2.4344580963776655e-06, - "loss": 0.76276582, - "num_input_tokens_seen": 159486385, - "step": 7441, - "time_per_iteration": 2.9042701721191406 - }, - { - "auxiliary_loss_clip": 0.01096694, - "auxiliary_loss_mlp": 0.01036467, - "balance_loss_clip": 1.04596698, - "balance_loss_mlp": 1.0220542, - "epoch": 0.4474372463550278, - "flos": 24896832024960.0, - "grad_norm": 1.9641422641471569, - "language_loss": 0.75060695, - "learning_rate": 2.4340779265496082e-06, - "loss": 0.77193856, - "num_input_tokens_seen": 159503880, - "step": 7442, - "time_per_iteration": 2.776219129562378 - }, - { - "auxiliary_loss_clip": 0.01131095, - "auxiliary_loss_mlp": 0.01033925, - "balance_loss_clip": 1.04641354, - "balance_loss_mlp": 1.01900017, - "epoch": 0.44749736960769576, - "flos": 33181603125120.0, - "grad_norm": 1.741320347682455, - "language_loss": 0.74572098, - "learning_rate": 2.433697740261273e-06, - "loss": 0.76737112, - "num_input_tokens_seen": 159522980, - "step": 7443, - "time_per_iteration": 2.783189058303833 - }, - { - "auxiliary_loss_clip": 0.01099877, - "auxiliary_loss_mlp": 0.0103204, - "balance_loss_clip": 1.03843653, - "balance_loss_mlp": 1.01699591, - "epoch": 0.4475574928603637, - "flos": 21071807602560.0, - "grad_norm": 1.581803518054495, - "language_loss": 0.77928406, - "learning_rate": 2.4333175375270748e-06, - "loss": 0.80060327, - "num_input_tokens_seen": 159543340, - "step": 7444, - "time_per_iteration": 2.750493049621582 - }, - { - "auxiliary_loss_clip": 0.01108777, - "auxiliary_loss_mlp": 0.01033259, - "balance_loss_clip": 1.04501557, - "balance_loss_mlp": 1.01988959, - "epoch": 0.4476176161130317, - "flos": 21862523646720.0, - "grad_norm": 2.5006881318170917, - "language_loss": 0.85238421, - "learning_rate": 2.4329373183614333e-06, - "loss": 0.87380457, - "num_input_tokens_seen": 159558210, - "step": 7445, - "time_per_iteration": 2.6802477836608887 - }, - { - "auxiliary_loss_clip": 0.01087309, - "auxiliary_loss_mlp": 0.0104165, - "balance_loss_clip": 1.04073787, - "balance_loss_mlp": 1.02471042, - "epoch": 0.4476777393656997, - "flos": 22528667324160.0, - "grad_norm": 3.110631371373827, - "language_loss": 0.63355798, - "learning_rate": 2.432557082778765e-06, - "loss": 0.65484762, - "num_input_tokens_seen": 159577920, - "step": 7446, - "time_per_iteration": 2.746697187423706 - }, - { - "auxiliary_loss_clip": 0.01039011, - "auxiliary_loss_mlp": 0.01002627, - "balance_loss_clip": 1.02036047, - "balance_loss_mlp": 1.00081527, - "epoch": 0.4477378626183677, - "flos": 49017133877760.0, - "grad_norm": 0.738380684617154, - "language_loss": 0.50261772, - "learning_rate": 2.4321768307934884e-06, - "loss": 0.5230341, - "num_input_tokens_seen": 159632295, - "step": 7447, - "time_per_iteration": 3.0176138877868652 - }, - { - "auxiliary_loss_clip": 0.01047805, - "auxiliary_loss_mlp": 0.0099926, - "balance_loss_clip": 1.0195471, - "balance_loss_mlp": 0.9976145, - "epoch": 0.44779798587103564, - "flos": 56542179392640.0, - "grad_norm": 0.7822716011451579, - "language_loss": 0.59427667, - "learning_rate": 2.4317965624200235e-06, - "loss": 0.61474735, - "num_input_tokens_seen": 159698435, - "step": 7448, - "time_per_iteration": 3.1922085285186768 - }, - { - "auxiliary_loss_clip": 0.01093955, - "auxiliary_loss_mlp": 0.01032649, - "balance_loss_clip": 1.04417181, - "balance_loss_mlp": 1.01983976, - "epoch": 0.4478581091237036, - "flos": 46498536040320.0, - "grad_norm": 1.6983811072489297, - "language_loss": 0.58952618, - "learning_rate": 2.431416277672789e-06, - "loss": 0.61079222, - "num_input_tokens_seen": 159722150, - "step": 7449, - "time_per_iteration": 2.9170258045196533 - }, - { - "auxiliary_loss_clip": 0.01096033, - "auxiliary_loss_mlp": 0.01031648, - "balance_loss_clip": 1.04244077, - "balance_loss_mlp": 1.01851141, - "epoch": 0.4479182323763716, - "flos": 20814363849600.0, - "grad_norm": 2.0305308033418497, - "language_loss": 0.8022064, - "learning_rate": 2.4310359765662065e-06, - "loss": 0.82348317, - "num_input_tokens_seen": 159740550, - "step": 7450, - "time_per_iteration": 2.640101671218872 - }, - { - "auxiliary_loss_clip": 0.01128944, - "auxiliary_loss_mlp": 0.0103919, - "balance_loss_clip": 1.04747844, - "balance_loss_mlp": 1.02609515, - "epoch": 0.44797835562903954, - "flos": 14245979212800.0, - "grad_norm": 2.0706353062233878, - "language_loss": 0.79404807, - "learning_rate": 2.430655659114697e-06, - "loss": 0.81572944, - "num_input_tokens_seen": 159758245, - "step": 7451, - "time_per_iteration": 2.6094324588775635 - }, - { - "auxiliary_loss_clip": 0.01008441, - "auxiliary_loss_mlp": 0.01004662, - "balance_loss_clip": 1.02162147, - "balance_loss_mlp": 1.00313568, - "epoch": 0.4480384788817075, - "flos": 63534560169600.0, - "grad_norm": 0.8263901394620045, - "language_loss": 0.62780499, - "learning_rate": 2.430275325332681e-06, - "loss": 0.64793605, - "num_input_tokens_seen": 159826790, - "step": 7452, - "time_per_iteration": 3.3816721439361572 - }, - { - "auxiliary_loss_clip": 0.01128154, - "auxiliary_loss_mlp": 0.01034079, - "balance_loss_clip": 1.04587567, - "balance_loss_mlp": 1.01958907, - "epoch": 0.44809860213437547, - "flos": 21652626522240.0, - "grad_norm": 1.717773614702603, - "language_loss": 0.62656605, - "learning_rate": 2.429894975234582e-06, - "loss": 0.64818835, - "num_input_tokens_seen": 159845805, - "step": 7453, - "time_per_iteration": 2.6495423316955566 - }, - { - "auxiliary_loss_clip": 0.0102644, - "auxiliary_loss_mlp": 0.01007957, - "balance_loss_clip": 1.01617622, - "balance_loss_mlp": 1.00627661, - "epoch": 0.44815872538704343, - "flos": 69190634246400.0, - "grad_norm": 0.7452851567935764, - "language_loss": 0.57032764, - "learning_rate": 2.4295146088348224e-06, - "loss": 0.59067166, - "num_input_tokens_seen": 159898860, - "step": 7454, - "time_per_iteration": 3.0483179092407227 - }, - { - "auxiliary_loss_clip": 0.0110232, - "auxiliary_loss_mlp": 0.0104097, - "balance_loss_clip": 1.04301405, - "balance_loss_mlp": 1.02651, - "epoch": 0.4482188486397114, - "flos": 12598289510400.0, - "grad_norm": 2.1814246614415795, - "language_loss": 0.75516129, - "learning_rate": 2.4291342261478255e-06, - "loss": 0.77659416, - "num_input_tokens_seen": 159911555, - "step": 7455, - "time_per_iteration": 2.639425039291382 - }, - { - "auxiliary_loss_clip": 0.01103634, - "auxiliary_loss_mlp": 0.0103636, - "balance_loss_clip": 1.0440948, - "balance_loss_mlp": 1.02343822, - "epoch": 0.44827897189237936, - "flos": 34058182631040.0, - "grad_norm": 1.8295063999245702, - "language_loss": 0.75630772, - "learning_rate": 2.428753827188016e-06, - "loss": 0.7777077, - "num_input_tokens_seen": 159931470, - "step": 7456, - "time_per_iteration": 2.809356451034546 - }, - { - "auxiliary_loss_clip": 0.01130195, - "auxiliary_loss_mlp": 0.01036439, - "balance_loss_clip": 1.05033028, - "balance_loss_mlp": 1.02355289, - "epoch": 0.44833909514504733, - "flos": 25147416280320.0, - "grad_norm": 60.5899352460765, - "language_loss": 0.76306677, - "learning_rate": 2.428373411969818e-06, - "loss": 0.78473306, - "num_input_tokens_seen": 159946115, - "step": 7457, - "time_per_iteration": 2.632532835006714 - }, - { - "auxiliary_loss_clip": 0.01111792, - "auxiliary_loss_mlp": 0.01031449, - "balance_loss_clip": 1.04215193, - "balance_loss_mlp": 1.01695263, - "epoch": 0.4483992183977153, - "flos": 16179984224640.0, - "grad_norm": 2.8627685619088203, - "language_loss": 0.68479908, - "learning_rate": 2.4279929805076576e-06, - "loss": 0.70623147, - "num_input_tokens_seen": 159963915, - "step": 7458, - "time_per_iteration": 2.6376359462738037 - }, - { - "auxiliary_loss_clip": 0.01091284, - "auxiliary_loss_mlp": 0.01033162, - "balance_loss_clip": 1.04267764, - "balance_loss_mlp": 1.018332, - "epoch": 0.44845934165038326, - "flos": 17746048270080.0, - "grad_norm": 1.5800915665139277, - "language_loss": 0.71851492, - "learning_rate": 2.427612532815961e-06, - "loss": 0.73975933, - "num_input_tokens_seen": 159982140, - "step": 7459, - "time_per_iteration": 2.713164806365967 - }, - { - "auxiliary_loss_clip": 0.01108578, - "auxiliary_loss_mlp": 0.01036526, - "balance_loss_clip": 1.04210949, - "balance_loss_mlp": 1.02282834, - "epoch": 0.4485194649030513, - "flos": 21835914647040.0, - "grad_norm": 1.672173614468041, - "language_loss": 0.70216429, - "learning_rate": 2.427232068909154e-06, - "loss": 0.72361535, - "num_input_tokens_seen": 160002280, - "step": 7460, - "time_per_iteration": 2.6243271827697754 - }, - { - "auxiliary_loss_clip": 0.01129261, - "auxiliary_loss_mlp": 0.01038736, - "balance_loss_clip": 1.04698896, - "balance_loss_mlp": 1.02463329, - "epoch": 0.44857958815571924, - "flos": 20084515401600.0, - "grad_norm": 1.9532472719910148, - "language_loss": 0.77566743, - "learning_rate": 2.4268515888016635e-06, - "loss": 0.79734743, - "num_input_tokens_seen": 160020260, - "step": 7461, - "time_per_iteration": 4.114460468292236 - }, - { - "auxiliary_loss_clip": 0.01128704, - "auxiliary_loss_mlp": 0.01034261, - "balance_loss_clip": 1.0455538, - "balance_loss_mlp": 1.02091575, - "epoch": 0.4486397114083872, - "flos": 27053519402880.0, - "grad_norm": 1.943200777150693, - "language_loss": 0.67738903, - "learning_rate": 2.4264710925079184e-06, - "loss": 0.69901872, - "num_input_tokens_seen": 160040240, - "step": 7462, - "time_per_iteration": 5.671550035476685 - }, - { - "auxiliary_loss_clip": 0.01046056, - "auxiliary_loss_mlp": 0.01002183, - "balance_loss_clip": 1.0179913, - "balance_loss_mlp": 1.0006094, - "epoch": 0.4486998346610552, - "flos": 67321195931520.0, - "grad_norm": 0.7528637907126196, - "language_loss": 0.5449208, - "learning_rate": 2.4260905800423462e-06, - "loss": 0.5654031, - "num_input_tokens_seen": 160093865, - "step": 7463, - "time_per_iteration": 3.132819890975952 - }, - { - "auxiliary_loss_clip": 0.01117188, - "auxiliary_loss_mlp": 0.01031184, - "balance_loss_clip": 1.04449058, - "balance_loss_mlp": 1.01758814, - "epoch": 0.44875995791372314, - "flos": 27636816360960.0, - "grad_norm": 2.3886431821168954, - "language_loss": 0.7580359, - "learning_rate": 2.4257100514193775e-06, - "loss": 0.77951968, - "num_input_tokens_seen": 160113590, - "step": 7464, - "time_per_iteration": 2.7005674839019775 - }, - { - "auxiliary_loss_clip": 0.01116572, - "auxiliary_loss_mlp": 0.01037604, - "balance_loss_clip": 1.04709184, - "balance_loss_mlp": 1.02484834, - "epoch": 0.4488200811663911, - "flos": 13005947940480.0, - "grad_norm": 1.7787597626645963, - "language_loss": 0.74147099, - "learning_rate": 2.425329506653441e-06, - "loss": 0.76301277, - "num_input_tokens_seen": 160131795, - "step": 7465, - "time_per_iteration": 4.423643112182617 - }, - { - "auxiliary_loss_clip": 0.01110783, - "auxiliary_loss_mlp": 0.01040781, - "balance_loss_clip": 1.04708648, - "balance_loss_mlp": 1.02503395, - "epoch": 0.44888020441905907, - "flos": 27489977562240.0, - "grad_norm": 2.0439366025173347, - "language_loss": 0.7991035, - "learning_rate": 2.424948945758966e-06, - "loss": 0.82061917, - "num_input_tokens_seen": 160150635, - "step": 7466, - "time_per_iteration": 2.7003092765808105 - }, - { - "auxiliary_loss_clip": 0.01110719, - "auxiliary_loss_mlp": 0.01035258, - "balance_loss_clip": 1.04898739, - "balance_loss_mlp": 1.02141774, - "epoch": 0.44894032767172704, - "flos": 18259678800000.0, - "grad_norm": 2.4307522297147357, - "language_loss": 0.81000906, - "learning_rate": 2.4245683687503844e-06, - "loss": 0.83146888, - "num_input_tokens_seen": 160168615, - "step": 7467, - "time_per_iteration": 2.6656453609466553 - }, - { - "auxiliary_loss_clip": 0.01074952, - "auxiliary_loss_mlp": 0.01032302, - "balance_loss_clip": 1.04580259, - "balance_loss_mlp": 1.01924217, - "epoch": 0.449000450924395, - "flos": 21579835610880.0, - "grad_norm": 2.1126461235100726, - "language_loss": 0.74707794, - "learning_rate": 2.424187775642129e-06, - "loss": 0.76815045, - "num_input_tokens_seen": 160187295, - "step": 7468, - "time_per_iteration": 2.7112534046173096 - }, - { - "auxiliary_loss_clip": 0.01097239, - "auxiliary_loss_mlp": 0.01031291, - "balance_loss_clip": 1.04224133, - "balance_loss_mlp": 1.01881611, - "epoch": 0.44906057417706297, - "flos": 17967904623360.0, - "grad_norm": 1.845085412210932, - "language_loss": 0.71481991, - "learning_rate": 2.4238071664486297e-06, - "loss": 0.7361052, - "num_input_tokens_seen": 160205115, - "step": 7469, - "time_per_iteration": 2.680678606033325 - }, - { - "auxiliary_loss_clip": 0.01115577, - "auxiliary_loss_mlp": 0.01040939, - "balance_loss_clip": 1.04739857, - "balance_loss_mlp": 1.02700388, - "epoch": 0.44912069742973093, - "flos": 20047347803520.0, - "grad_norm": 1.9353970520381958, - "language_loss": 0.71990728, - "learning_rate": 2.4234265411843203e-06, - "loss": 0.74147248, - "num_input_tokens_seen": 160222580, - "step": 7470, - "time_per_iteration": 2.6266865730285645 - }, - { - "auxiliary_loss_clip": 0.01085169, - "auxiliary_loss_mlp": 0.01037894, - "balance_loss_clip": 1.04166925, - "balance_loss_mlp": 1.02263546, - "epoch": 0.4491808206823989, - "flos": 21033526682880.0, - "grad_norm": 1.7352200929350259, - "language_loss": 0.76839507, - "learning_rate": 2.423045899863634e-06, - "loss": 0.78962576, - "num_input_tokens_seen": 160241520, - "step": 7471, - "time_per_iteration": 2.692333698272705 - }, - { - "auxiliary_loss_clip": 0.0112922, - "auxiliary_loss_mlp": 0.010358, - "balance_loss_clip": 1.04736388, - "balance_loss_mlp": 1.02259803, - "epoch": 0.44924094393506686, - "flos": 22967136645120.0, - "grad_norm": 1.6949435247941296, - "language_loss": 0.70284784, - "learning_rate": 2.4226652425010048e-06, - "loss": 0.72449803, - "num_input_tokens_seen": 160261815, - "step": 7472, - "time_per_iteration": 2.714059829711914 - }, - { - "auxiliary_loss_clip": 0.01033495, - "auxiliary_loss_mlp": 0.01004013, - "balance_loss_clip": 1.01477528, - "balance_loss_mlp": 1.00226104, - "epoch": 0.4493010671877349, - "flos": 59233467864960.0, - "grad_norm": 0.7390973196636706, - "language_loss": 0.6168009, - "learning_rate": 2.4222845691108676e-06, - "loss": 0.63717604, - "num_input_tokens_seen": 160317070, - "step": 7473, - "time_per_iteration": 3.1489851474761963 - }, - { - "auxiliary_loss_clip": 0.01131224, - "auxiliary_loss_mlp": 0.00771593, - "balance_loss_clip": 1.04812014, - "balance_loss_mlp": 1.0004611, - "epoch": 0.44936119044040285, - "flos": 18004892653440.0, - "grad_norm": 2.3114379148666817, - "language_loss": 0.78279471, - "learning_rate": 2.421903879707657e-06, - "loss": 0.80182284, - "num_input_tokens_seen": 160334980, - "step": 7474, - "time_per_iteration": 2.5561118125915527 - }, - { - "auxiliary_loss_clip": 0.01074804, - "auxiliary_loss_mlp": 0.01040047, - "balance_loss_clip": 1.03983307, - "balance_loss_mlp": 1.0254494, - "epoch": 0.4494213136930708, - "flos": 21251827589760.0, - "grad_norm": 1.6204554836894525, - "language_loss": 0.72024751, - "learning_rate": 2.4215231743058086e-06, - "loss": 0.74139607, - "num_input_tokens_seen": 160354500, - "step": 7475, - "time_per_iteration": 2.7745461463928223 - }, - { - "auxiliary_loss_clip": 0.01080301, - "auxiliary_loss_mlp": 0.01041054, - "balance_loss_clip": 1.04167461, - "balance_loss_mlp": 1.02563405, - "epoch": 0.4494814369457388, - "flos": 27418695022080.0, - "grad_norm": 2.241823557245511, - "language_loss": 0.76592773, - "learning_rate": 2.4211424529197594e-06, - "loss": 0.78714132, - "num_input_tokens_seen": 160373650, - "step": 7476, - "time_per_iteration": 2.7856860160827637 - }, - { - "auxiliary_loss_clip": 0.01122132, - "auxiliary_loss_mlp": 0.00773102, - "balance_loss_clip": 1.04493368, - "balance_loss_mlp": 1.00047529, - "epoch": 0.44954156019840674, - "flos": 22854053652480.0, - "grad_norm": 4.385259299883037, - "language_loss": 0.72134888, - "learning_rate": 2.4207617155639464e-06, - "loss": 0.74030131, - "num_input_tokens_seen": 160393430, - "step": 7477, - "time_per_iteration": 2.641645669937134 - }, - { - "auxiliary_loss_clip": 0.01103781, - "auxiliary_loss_mlp": 0.01047956, - "balance_loss_clip": 1.04083133, - "balance_loss_mlp": 1.03148091, - "epoch": 0.4496016834510747, - "flos": 17201570935680.0, - "grad_norm": 2.795464855062127, - "language_loss": 0.67799896, - "learning_rate": 2.4203809622528062e-06, - "loss": 0.69951636, - "num_input_tokens_seen": 160410545, - "step": 7478, - "time_per_iteration": 2.6307947635650635 - }, - { - "auxiliary_loss_clip": 0.01102543, - "auxiliary_loss_mlp": 0.01038923, - "balance_loss_clip": 1.04405093, - "balance_loss_mlp": 1.02537441, - "epoch": 0.4496618067037427, - "flos": 18916628595840.0, - "grad_norm": 1.8532543047361745, - "language_loss": 0.89243561, - "learning_rate": 2.420000193000779e-06, - "loss": 0.91385025, - "num_input_tokens_seen": 160428105, - "step": 7479, - "time_per_iteration": 2.733828544616699 - }, - { - "auxiliary_loss_clip": 0.01068922, - "auxiliary_loss_mlp": 0.01043272, - "balance_loss_clip": 1.04273605, - "balance_loss_mlp": 1.02804279, - "epoch": 0.44972192995641064, - "flos": 21031659175680.0, - "grad_norm": 2.916606412127397, - "language_loss": 0.75539804, - "learning_rate": 2.419619407822302e-06, - "loss": 0.77652001, - "num_input_tokens_seen": 160448815, - "step": 7480, - "time_per_iteration": 2.8518130779266357 - }, - { - "auxiliary_loss_clip": 0.01095249, - "auxiliary_loss_mlp": 0.01035055, - "balance_loss_clip": 1.04253781, - "balance_loss_mlp": 1.02012968, - "epoch": 0.4497820532090786, - "flos": 20777088510720.0, - "grad_norm": 1.9829776726262367, - "language_loss": 0.79885375, - "learning_rate": 2.419238606731815e-06, - "loss": 0.82015675, - "num_input_tokens_seen": 160465940, - "step": 7481, - "time_per_iteration": 2.7299835681915283 - }, - { - "auxiliary_loss_clip": 0.01102494, - "auxiliary_loss_mlp": 0.01039566, - "balance_loss_clip": 1.04328001, - "balance_loss_mlp": 1.02454567, - "epoch": 0.44984217646174657, - "flos": 33802606385280.0, - "grad_norm": 1.6381608125682177, - "language_loss": 0.68340528, - "learning_rate": 2.418857789743758e-06, - "loss": 0.70482588, - "num_input_tokens_seen": 160486710, - "step": 7482, - "time_per_iteration": 2.8123154640197754 - }, - { - "auxiliary_loss_clip": 0.01122196, - "auxiliary_loss_mlp": 0.01040775, - "balance_loss_clip": 1.04835725, - "balance_loss_mlp": 1.02638626, - "epoch": 0.44990229971441453, - "flos": 15518365660800.0, - "grad_norm": 2.0379383366397232, - "language_loss": 0.84707004, - "learning_rate": 2.418476956872571e-06, - "loss": 0.86869979, - "num_input_tokens_seen": 160503405, - "step": 7483, - "time_per_iteration": 2.718548536300659 - }, - { - "auxiliary_loss_clip": 0.01099077, - "auxiliary_loss_mlp": 0.01046214, - "balance_loss_clip": 1.04296637, - "balance_loss_mlp": 1.03027594, - "epoch": 0.4499624229670825, - "flos": 29861913191040.0, - "grad_norm": 1.8017494037756971, - "language_loss": 0.80644262, - "learning_rate": 2.4180961081326967e-06, - "loss": 0.82789552, - "num_input_tokens_seen": 160525080, - "step": 7484, - "time_per_iteration": 2.8435990810394287 - }, - { - "auxiliary_loss_clip": 0.01075163, - "auxiliary_loss_mlp": 0.01037509, - "balance_loss_clip": 1.03809166, - "balance_loss_mlp": 1.02145171, - "epoch": 0.45002254621975046, - "flos": 18513674847360.0, - "grad_norm": 2.526248303429359, - "language_loss": 0.75311351, - "learning_rate": 2.4177152435385754e-06, - "loss": 0.77424026, - "num_input_tokens_seen": 160540895, - "step": 7485, - "time_per_iteration": 2.7453646659851074 - }, - { - "auxiliary_loss_clip": 0.01027401, - "auxiliary_loss_mlp": 0.0100295, - "balance_loss_clip": 1.01817155, - "balance_loss_mlp": 1.00125754, - "epoch": 0.4500826694724185, - "flos": 70420394229120.0, - "grad_norm": 0.7859680562883086, - "language_loss": 0.58644986, - "learning_rate": 2.4173343631046504e-06, - "loss": 0.60675335, - "num_input_tokens_seen": 160598270, - "step": 7486, - "time_per_iteration": 3.2535924911499023 - }, - { - "auxiliary_loss_clip": 0.0111614, - "auxiliary_loss_mlp": 0.01045183, - "balance_loss_clip": 1.04657292, - "balance_loss_mlp": 1.02917325, - "epoch": 0.45014279272508645, - "flos": 15778897983360.0, - "grad_norm": 2.484631064514228, - "language_loss": 0.83677804, - "learning_rate": 2.4169534668453654e-06, - "loss": 0.85839128, - "num_input_tokens_seen": 160614720, - "step": 7487, - "time_per_iteration": 2.7236413955688477 - }, - { - "auxiliary_loss_clip": 0.01128709, - "auxiliary_loss_mlp": 0.01039106, - "balance_loss_clip": 1.04632056, - "balance_loss_mlp": 1.02443182, - "epoch": 0.4502029159777544, - "flos": 21799573061760.0, - "grad_norm": 1.5508029399024128, - "language_loss": 0.77568138, - "learning_rate": 2.4165725547751622e-06, - "loss": 0.79735959, - "num_input_tokens_seen": 160635170, - "step": 7488, - "time_per_iteration": 2.6660585403442383 - }, - { - "auxiliary_loss_clip": 0.0112874, - "auxiliary_loss_mlp": 0.01045145, - "balance_loss_clip": 1.04882014, - "balance_loss_mlp": 1.02954042, - "epoch": 0.4502630392304224, - "flos": 28767966531840.0, - "grad_norm": 1.97851616048007, - "language_loss": 0.72073781, - "learning_rate": 2.4161916269084858e-06, - "loss": 0.74247664, - "num_input_tokens_seen": 160654490, - "step": 7489, - "time_per_iteration": 2.7274820804595947 - }, - { - "auxiliary_loss_clip": 0.01109274, - "auxiliary_loss_mlp": 0.01039798, - "balance_loss_clip": 1.04584038, - "balance_loss_mlp": 1.02314413, - "epoch": 0.45032316248309034, - "flos": 15844182952320.0, - "grad_norm": 2.9737823054207926, - "language_loss": 0.6968661, - "learning_rate": 2.4158106832597817e-06, - "loss": 0.71835679, - "num_input_tokens_seen": 160669400, - "step": 7490, - "time_per_iteration": 2.650700569152832 - }, - { - "auxiliary_loss_clip": 0.01026171, - "auxiliary_loss_mlp": 0.01004705, - "balance_loss_clip": 1.0231657, - "balance_loss_mlp": 1.00323248, - "epoch": 0.4503832857357583, - "flos": 57853600945920.0, - "grad_norm": 0.7292674820176653, - "language_loss": 0.56675166, - "learning_rate": 2.415429723843495e-06, - "loss": 0.58706039, - "num_input_tokens_seen": 160733820, - "step": 7491, - "time_per_iteration": 3.1893656253814697 - }, - { - "auxiliary_loss_clip": 0.01116518, - "auxiliary_loss_mlp": 0.01037403, - "balance_loss_clip": 1.04746497, - "balance_loss_mlp": 1.02327061, - "epoch": 0.4504434089884263, - "flos": 23878082488320.0, - "grad_norm": 1.6154687272881363, - "language_loss": 0.7939685, - "learning_rate": 2.4150487486740713e-06, - "loss": 0.81550771, - "num_input_tokens_seen": 160753175, - "step": 7492, - "time_per_iteration": 2.7314138412475586 - }, - { - "auxiliary_loss_clip": 0.010986, - "auxiliary_loss_mlp": 0.00775969, - "balance_loss_clip": 1.04425228, - "balance_loss_mlp": 1.000494, - "epoch": 0.45050353224109424, - "flos": 17785083375360.0, - "grad_norm": 2.875303360797025, - "language_loss": 0.92825645, - "learning_rate": 2.4146677577659573e-06, - "loss": 0.94700211, - "num_input_tokens_seen": 160768310, - "step": 7493, - "time_per_iteration": 2.7123935222625732 - }, - { - "auxiliary_loss_clip": 0.01039208, - "auxiliary_loss_mlp": 0.01001589, - "balance_loss_clip": 1.02041435, - "balance_loss_mlp": 0.99994355, - "epoch": 0.4505636554937622, - "flos": 65063420703360.0, - "grad_norm": 0.8110713299155351, - "language_loss": 0.62929082, - "learning_rate": 2.4142867511336e-06, - "loss": 0.64969873, - "num_input_tokens_seen": 160827370, - "step": 7494, - "time_per_iteration": 3.289635181427002 - }, - { - "auxiliary_loss_clip": 0.01129658, - "auxiliary_loss_mlp": 0.01035034, - "balance_loss_clip": 1.04754305, - "balance_loss_mlp": 1.02150989, - "epoch": 0.45062377874643017, - "flos": 22200084685440.0, - "grad_norm": 1.7474777674384385, - "language_loss": 0.82263976, - "learning_rate": 2.4139057287914484e-06, - "loss": 0.84428668, - "num_input_tokens_seen": 160849140, - "step": 7495, - "time_per_iteration": 2.659642219543457 - }, - { - "auxiliary_loss_clip": 0.01115544, - "auxiliary_loss_mlp": 0.01041634, - "balance_loss_clip": 1.04483461, - "balance_loss_mlp": 1.02449155, - "epoch": 0.45068390199909814, - "flos": 37670293186560.0, - "grad_norm": 1.8332713503860085, - "language_loss": 0.86039978, - "learning_rate": 2.41352469075395e-06, - "loss": 0.8819716, - "num_input_tokens_seen": 160871280, - "step": 7496, - "time_per_iteration": 2.798741579055786 - }, - { - "auxiliary_loss_clip": 0.01134499, - "auxiliary_loss_mlp": 0.01035754, - "balance_loss_clip": 1.04969478, - "balance_loss_mlp": 1.02054274, - "epoch": 0.4507440252517661, - "flos": 22302501338880.0, - "grad_norm": 2.0558646291387066, - "language_loss": 0.76101983, - "learning_rate": 2.4131436370355534e-06, - "loss": 0.78272235, - "num_input_tokens_seen": 160888625, - "step": 7497, - "time_per_iteration": 2.6553680896759033 - }, - { - "auxiliary_loss_clip": 0.01098074, - "auxiliary_loss_mlp": 0.01037956, - "balance_loss_clip": 1.04377723, - "balance_loss_mlp": 1.02352023, - "epoch": 0.45080414850443407, - "flos": 13188374138880.0, - "grad_norm": 2.277785969464064, - "language_loss": 0.75305939, - "learning_rate": 2.4127625676507088e-06, - "loss": 0.77441967, - "num_input_tokens_seen": 160907040, - "step": 7498, - "time_per_iteration": 2.6950063705444336 - }, - { - "auxiliary_loss_clip": 0.01133264, - "auxiliary_loss_mlp": 0.01044893, - "balance_loss_clip": 1.04848719, - "balance_loss_mlp": 1.02897298, - "epoch": 0.4508642717571021, - "flos": 21944939402880.0, - "grad_norm": 3.3346599205762826, - "language_loss": 0.70080638, - "learning_rate": 2.4123814826138663e-06, - "loss": 0.72258794, - "num_input_tokens_seen": 160927115, - "step": 7499, - "time_per_iteration": 2.6134774684906006 - }, - { - "auxiliary_loss_clip": 0.01084574, - "auxiliary_loss_mlp": 0.0103806, - "balance_loss_clip": 1.04212165, - "balance_loss_mlp": 1.02309906, - "epoch": 0.45092439500977005, - "flos": 23367468700800.0, - "grad_norm": 1.9346658302408082, - "language_loss": 0.77361268, - "learning_rate": 2.412000381939477e-06, - "loss": 0.79483902, - "num_input_tokens_seen": 160944405, - "step": 7500, - "time_per_iteration": 4.306777000427246 - }, - { - "auxiliary_loss_clip": 0.01084228, - "auxiliary_loss_mlp": 0.01034656, - "balance_loss_clip": 1.04249573, - "balance_loss_mlp": 1.02007651, - "epoch": 0.450984518262438, - "flos": 20772958446720.0, - "grad_norm": 1.9176241989159464, - "language_loss": 0.63056326, - "learning_rate": 2.411619265641992e-06, - "loss": 0.65175211, - "num_input_tokens_seen": 160961345, - "step": 7501, - "time_per_iteration": 5.803133487701416 - }, - { - "auxiliary_loss_clip": 0.01135547, - "auxiliary_loss_mlp": 0.01040046, - "balance_loss_clip": 1.04915273, - "balance_loss_mlp": 1.02445376, - "epoch": 0.451044641515106, - "flos": 17707372300800.0, - "grad_norm": 1.9532762899000093, - "language_loss": 0.84446234, - "learning_rate": 2.411238133735863e-06, - "loss": 0.86621827, - "num_input_tokens_seen": 160977330, - "step": 7502, - "time_per_iteration": 2.604753017425537 - }, - { - "auxiliary_loss_clip": 0.01105383, - "auxiliary_loss_mlp": 0.01036548, - "balance_loss_clip": 1.04670203, - "balance_loss_mlp": 1.02238584, - "epoch": 0.45110476476777395, - "flos": 20594698225920.0, - "grad_norm": 1.3813112457968315, - "language_loss": 0.79642487, - "learning_rate": 2.4108569862355418e-06, - "loss": 0.81784415, - "num_input_tokens_seen": 160997280, - "step": 7503, - "time_per_iteration": 2.666677236557007 - }, - { - "auxiliary_loss_clip": 0.01104325, - "auxiliary_loss_mlp": 0.01036781, - "balance_loss_clip": 1.04764807, - "balance_loss_mlp": 1.02240419, - "epoch": 0.4511648880204419, - "flos": 16034043265920.0, - "grad_norm": 2.051596804130354, - "language_loss": 0.81191939, - "learning_rate": 2.410475823155484e-06, - "loss": 0.83333045, - "num_input_tokens_seen": 161014235, - "step": 7504, - "time_per_iteration": 4.276456117630005 - }, - { - "auxiliary_loss_clip": 0.01087433, - "auxiliary_loss_mlp": 0.01038305, - "balance_loss_clip": 1.04069161, - "balance_loss_mlp": 1.02469158, - "epoch": 0.4512250112731099, - "flos": 23978811202560.0, - "grad_norm": 1.5834485358881918, - "language_loss": 0.63315797, - "learning_rate": 2.4100946445101405e-06, - "loss": 0.65441537, - "num_input_tokens_seen": 161032360, - "step": 7505, - "time_per_iteration": 2.947556734085083 - }, - { - "auxiliary_loss_clip": 0.01014942, - "auxiliary_loss_mlp": 0.01003244, - "balance_loss_clip": 1.02198029, - "balance_loss_mlp": 1.00188541, - "epoch": 0.45128513452577784, - "flos": 71462308037760.0, - "grad_norm": 0.8317919198459461, - "language_loss": 0.58857071, - "learning_rate": 2.409713450313968e-06, - "loss": 0.60875255, - "num_input_tokens_seen": 161091360, - "step": 7506, - "time_per_iteration": 3.395158052444458 - }, - { - "auxiliary_loss_clip": 0.01075605, - "auxiliary_loss_mlp": 0.01037451, - "balance_loss_clip": 1.04096067, - "balance_loss_mlp": 1.02287173, - "epoch": 0.4513452577784458, - "flos": 22090844448000.0, - "grad_norm": 1.7149339287343461, - "language_loss": 0.79334831, - "learning_rate": 2.40933224058142e-06, - "loss": 0.81447887, - "num_input_tokens_seen": 161110825, - "step": 7507, - "time_per_iteration": 2.8281381130218506 - }, - { - "auxiliary_loss_clip": 0.01091142, - "auxiliary_loss_mlp": 0.01036706, - "balance_loss_clip": 1.0425905, - "balance_loss_mlp": 1.02066064, - "epoch": 0.4514053810311138, - "flos": 24276403382400.0, - "grad_norm": 1.5823194059388275, - "language_loss": 0.73703611, - "learning_rate": 2.4089510153269526e-06, - "loss": 0.75831455, - "num_input_tokens_seen": 161130685, - "step": 7508, - "time_per_iteration": 2.75742506980896 - }, - { - "auxiliary_loss_clip": 0.01118642, - "auxiliary_loss_mlp": 0.0103619, - "balance_loss_clip": 1.04927611, - "balance_loss_mlp": 1.02279091, - "epoch": 0.45146550428378174, - "flos": 17886781756800.0, - "grad_norm": 2.075832981658432, - "language_loss": 0.79118419, - "learning_rate": 2.4085697745650217e-06, - "loss": 0.81273252, - "num_input_tokens_seen": 161147555, - "step": 7509, - "time_per_iteration": 2.6641790866851807 - }, - { - "auxiliary_loss_clip": 0.01130929, - "auxiliary_loss_mlp": 0.01034567, - "balance_loss_clip": 1.05022097, - "balance_loss_mlp": 1.02104306, - "epoch": 0.4515256275364497, - "flos": 24243437675520.0, - "grad_norm": 1.9616298828862797, - "language_loss": 0.73389792, - "learning_rate": 2.4081885183100837e-06, - "loss": 0.75555289, - "num_input_tokens_seen": 161166255, - "step": 7510, - "time_per_iteration": 2.754516839981079 - }, - { - "auxiliary_loss_clip": 0.01129503, - "auxiliary_loss_mlp": 0.01032701, - "balance_loss_clip": 1.04575419, - "balance_loss_mlp": 1.01789534, - "epoch": 0.45158575078911767, - "flos": 20631039811200.0, - "grad_norm": 1.8899584921112549, - "language_loss": 0.77046561, - "learning_rate": 2.4078072465765964e-06, - "loss": 0.79208767, - "num_input_tokens_seen": 161184720, - "step": 7511, - "time_per_iteration": 2.633896589279175 - }, - { - "auxiliary_loss_clip": 0.01119455, - "auxiliary_loss_mlp": 0.01033368, - "balance_loss_clip": 1.04665303, - "balance_loss_mlp": 1.01832986, - "epoch": 0.45164587404178563, - "flos": 23327751237120.0, - "grad_norm": 1.8239087865443961, - "language_loss": 0.78791374, - "learning_rate": 2.4074259593790174e-06, - "loss": 0.80944192, - "num_input_tokens_seen": 161204360, - "step": 7512, - "time_per_iteration": 2.701643466949463 - }, - { - "auxiliary_loss_clip": 0.01094327, - "auxiliary_loss_mlp": 0.01039327, - "balance_loss_clip": 1.04103267, - "balance_loss_mlp": 1.02404392, - "epoch": 0.45170599729445365, - "flos": 23805973935360.0, - "grad_norm": 2.0955290596831713, - "language_loss": 0.87512183, - "learning_rate": 2.4070446567318053e-06, - "loss": 0.89645839, - "num_input_tokens_seen": 161223575, - "step": 7513, - "time_per_iteration": 2.716236114501953 - }, - { - "auxiliary_loss_clip": 0.01110578, - "auxiliary_loss_mlp": 0.0103311, - "balance_loss_clip": 1.0445292, - "balance_loss_mlp": 1.02031827, - "epoch": 0.4517661205471216, - "flos": 23512942782720.0, - "grad_norm": 2.109318524386585, - "language_loss": 0.6707387, - "learning_rate": 2.406663338649419e-06, - "loss": 0.69217563, - "num_input_tokens_seen": 161243805, - "step": 7514, - "time_per_iteration": 2.665377140045166 - }, - { - "auxiliary_loss_clip": 0.01113013, - "auxiliary_loss_mlp": 0.0103579, - "balance_loss_clip": 1.04554498, - "balance_loss_mlp": 1.01995873, - "epoch": 0.4518262437997896, - "flos": 23513948363520.0, - "grad_norm": 2.2260653694398242, - "language_loss": 0.69152886, - "learning_rate": 2.406282005146318e-06, - "loss": 0.71301687, - "num_input_tokens_seen": 161261450, - "step": 7515, - "time_per_iteration": 2.6233787536621094 - }, - { - "auxiliary_loss_clip": 0.01114597, - "auxiliary_loss_mlp": 0.01038013, - "balance_loss_clip": 1.04228842, - "balance_loss_mlp": 1.02269435, - "epoch": 0.45188636705245755, - "flos": 14568061489920.0, - "grad_norm": 6.104635540487547, - "language_loss": 0.82568568, - "learning_rate": 2.405900656236963e-06, - "loss": 0.84721178, - "num_input_tokens_seen": 161276965, - "step": 7516, - "time_per_iteration": 2.7125158309936523 - }, - { - "auxiliary_loss_clip": 0.0112394, - "auxiliary_loss_mlp": 0.0103396, - "balance_loss_clip": 1.04487455, - "balance_loss_mlp": 1.02003694, - "epoch": 0.4519464903051255, - "flos": 19901550499200.0, - "grad_norm": 1.657947130481532, - "language_loss": 0.65597039, - "learning_rate": 2.4055192919358137e-06, - "loss": 0.67754936, - "num_input_tokens_seen": 161295375, - "step": 7517, - "time_per_iteration": 2.6732585430145264 - }, - { - "auxiliary_loss_clip": 0.01091101, - "auxiliary_loss_mlp": 0.01032878, - "balance_loss_clip": 1.04268789, - "balance_loss_mlp": 1.02015853, - "epoch": 0.4520066135577935, - "flos": 18844376388480.0, - "grad_norm": 2.0502430920821904, - "language_loss": 0.63127112, - "learning_rate": 2.405137912257333e-06, - "loss": 0.65251088, - "num_input_tokens_seen": 161313010, - "step": 7518, - "time_per_iteration": 2.6873538494110107 - }, - { - "auxiliary_loss_clip": 0.01116444, - "auxiliary_loss_mlp": 0.01033811, - "balance_loss_clip": 1.0465678, - "balance_loss_mlp": 1.02015519, - "epoch": 0.45206673681046144, - "flos": 48214419713280.0, - "grad_norm": 1.68859992173611, - "language_loss": 0.59658802, - "learning_rate": 2.404756517215982e-06, - "loss": 0.61809057, - "num_input_tokens_seen": 161336690, - "step": 7519, - "time_per_iteration": 2.8561198711395264 - }, - { - "auxiliary_loss_clip": 0.01116298, - "auxiliary_loss_mlp": 0.01038351, - "balance_loss_clip": 1.0457139, - "balance_loss_mlp": 1.02468395, - "epoch": 0.4521268600631294, - "flos": 23842171866240.0, - "grad_norm": 1.5141513880128057, - "language_loss": 0.72439361, - "learning_rate": 2.404375106826223e-06, - "loss": 0.74594009, - "num_input_tokens_seen": 161357845, - "step": 7520, - "time_per_iteration": 2.709179162979126 - }, - { - "auxiliary_loss_clip": 0.0110396, - "auxiliary_loss_mlp": 0.01036085, - "balance_loss_clip": 1.04404962, - "balance_loss_mlp": 1.02297747, - "epoch": 0.4521869833157974, - "flos": 18843622202880.0, - "grad_norm": 2.131399149186965, - "language_loss": 0.75379634, - "learning_rate": 2.4039936811025194e-06, - "loss": 0.77519679, - "num_input_tokens_seen": 161375160, - "step": 7521, - "time_per_iteration": 2.78236722946167 - }, - { - "auxiliary_loss_clip": 0.01109339, - "auxiliary_loss_mlp": 0.01039668, - "balance_loss_clip": 1.04502964, - "balance_loss_mlp": 1.02507663, - "epoch": 0.45224710656846534, - "flos": 19788072456960.0, - "grad_norm": 2.2802922264962247, - "language_loss": 0.68217206, - "learning_rate": 2.4036122400593343e-06, - "loss": 0.70366216, - "num_input_tokens_seen": 161393690, - "step": 7522, - "time_per_iteration": 2.698141574859619 - }, - { - "auxiliary_loss_clip": 0.01111702, - "auxiliary_loss_mlp": 0.01036701, - "balance_loss_clip": 1.04239058, - "balance_loss_mlp": 1.02306962, - "epoch": 0.4523072298211333, - "flos": 28256131681920.0, - "grad_norm": 1.6149288487041198, - "language_loss": 0.6114409, - "learning_rate": 2.403230783711134e-06, - "loss": 0.63292497, - "num_input_tokens_seen": 161415015, - "step": 7523, - "time_per_iteration": 2.765838623046875 - }, - { - "auxiliary_loss_clip": 0.01122412, - "auxiliary_loss_mlp": 0.01039402, - "balance_loss_clip": 1.04672575, - "balance_loss_mlp": 1.02425027, - "epoch": 0.45236735307380127, - "flos": 11181039511680.0, - "grad_norm": 2.0249866031396837, - "language_loss": 0.78044772, - "learning_rate": 2.4028493120723813e-06, - "loss": 0.80206585, - "num_input_tokens_seen": 161432940, - "step": 7524, - "time_per_iteration": 2.6178715229034424 - }, - { - "auxiliary_loss_clip": 0.01083067, - "auxiliary_loss_mlp": 0.0103962, - "balance_loss_clip": 1.04386139, - "balance_loss_mlp": 1.02560115, - "epoch": 0.45242747632646924, - "flos": 22601386408320.0, - "grad_norm": 2.4629173570449447, - "language_loss": 0.63756073, - "learning_rate": 2.4024678251575417e-06, - "loss": 0.65878761, - "num_input_tokens_seen": 161452215, - "step": 7525, - "time_per_iteration": 2.767791509628296 - }, - { - "auxiliary_loss_clip": 0.01116902, - "auxiliary_loss_mlp": 0.01037108, - "balance_loss_clip": 1.04607654, - "balance_loss_mlp": 1.02390599, - "epoch": 0.45248759957913726, - "flos": 18256267008000.0, - "grad_norm": 1.8561008840058875, - "language_loss": 0.78973663, - "learning_rate": 2.402086322981083e-06, - "loss": 0.81127673, - "num_input_tokens_seen": 161469520, - "step": 7526, - "time_per_iteration": 2.6315999031066895 - }, - { - "auxiliary_loss_clip": 0.01098614, - "auxiliary_loss_mlp": 0.01030271, - "balance_loss_clip": 1.04242575, - "balance_loss_mlp": 1.01696694, - "epoch": 0.4525477228318052, - "flos": 22450094323200.0, - "grad_norm": 1.8159616365895555, - "language_loss": 0.80961096, - "learning_rate": 2.40170480555747e-06, - "loss": 0.83089983, - "num_input_tokens_seen": 161487335, - "step": 7527, - "time_per_iteration": 2.6868715286254883 - }, - { - "auxiliary_loss_clip": 0.01092415, - "auxiliary_loss_mlp": 0.01031467, - "balance_loss_clip": 1.04517341, - "balance_loss_mlp": 1.01763892, - "epoch": 0.4526078460844732, - "flos": 29644869260160.0, - "grad_norm": 11.448753069744305, - "language_loss": 0.6562798, - "learning_rate": 2.4013232729011706e-06, - "loss": 0.67751861, - "num_input_tokens_seen": 161510095, - "step": 7528, - "time_per_iteration": 2.816391944885254 - }, - { - "auxiliary_loss_clip": 0.01100127, - "auxiliary_loss_mlp": 0.01033759, - "balance_loss_clip": 1.04077947, - "balance_loss_mlp": 1.02030635, - "epoch": 0.45266796933714115, - "flos": 23039747988480.0, - "grad_norm": 1.584867366654962, - "language_loss": 0.75341809, - "learning_rate": 2.4009417250266525e-06, - "loss": 0.77475703, - "num_input_tokens_seen": 161528725, - "step": 7529, - "time_per_iteration": 2.688854694366455 - }, - { - "auxiliary_loss_clip": 0.01127981, - "auxiliary_loss_mlp": 0.0103405, - "balance_loss_clip": 1.04677176, - "balance_loss_mlp": 1.02092457, - "epoch": 0.4527280925898091, - "flos": 14428405411200.0, - "grad_norm": 2.148118662824089, - "language_loss": 0.73154545, - "learning_rate": 2.400560161948384e-06, - "loss": 0.75316578, - "num_input_tokens_seen": 161547195, - "step": 7530, - "time_per_iteration": 2.626149892807007 - }, - { - "auxiliary_loss_clip": 0.01097205, - "auxiliary_loss_mlp": 0.01036532, - "balance_loss_clip": 1.04691768, - "balance_loss_mlp": 1.0233357, - "epoch": 0.4527882158424771, - "flos": 22925515760640.0, - "grad_norm": 1.600682021317837, - "language_loss": 0.75962186, - "learning_rate": 2.400178583680834e-06, - "loss": 0.78095925, - "num_input_tokens_seen": 161565565, - "step": 7531, - "time_per_iteration": 2.7901298999786377 - }, - { - "auxiliary_loss_clip": 0.01122835, - "auxiliary_loss_mlp": 0.01036019, - "balance_loss_clip": 1.04418015, - "balance_loss_mlp": 1.02203524, - "epoch": 0.45284833909514505, - "flos": 25555326105600.0, - "grad_norm": 1.5467116056600763, - "language_loss": 0.66987002, - "learning_rate": 2.3997969902384717e-06, - "loss": 0.69145852, - "num_input_tokens_seen": 161586630, - "step": 7532, - "time_per_iteration": 2.693523645401001 - }, - { - "auxiliary_loss_clip": 0.01115241, - "auxiliary_loss_mlp": 0.0104024, - "balance_loss_clip": 1.04580188, - "balance_loss_mlp": 1.02715659, - "epoch": 0.452908462347813, - "flos": 18150007599360.0, - "grad_norm": 3.168484665922808, - "language_loss": 0.78721988, - "learning_rate": 2.399415381635768e-06, - "loss": 0.80877471, - "num_input_tokens_seen": 161603815, - "step": 7533, - "time_per_iteration": 2.6418774127960205 - }, - { - "auxiliary_loss_clip": 0.01101942, - "auxiliary_loss_mlp": 0.01039812, - "balance_loss_clip": 1.04315686, - "balance_loss_mlp": 1.0244813, - "epoch": 0.452968585600481, - "flos": 19062749122560.0, - "grad_norm": 2.220433880382594, - "language_loss": 0.83064616, - "learning_rate": 2.3990337578871927e-06, - "loss": 0.85206366, - "num_input_tokens_seen": 161622900, - "step": 7534, - "time_per_iteration": 2.751016855239868 - }, - { - "auxiliary_loss_clip": 0.01102917, - "auxiliary_loss_mlp": 0.0103851, - "balance_loss_clip": 1.04744101, - "balance_loss_mlp": 1.02389479, - "epoch": 0.45302870885314894, - "flos": 22051737515520.0, - "grad_norm": 1.8531826529396993, - "language_loss": 0.76665461, - "learning_rate": 2.3986521190072176e-06, - "loss": 0.78806889, - "num_input_tokens_seen": 161641700, - "step": 7535, - "time_per_iteration": 2.6611855030059814 - }, - { - "auxiliary_loss_clip": 0.01083875, - "auxiliary_loss_mlp": 0.01036335, - "balance_loss_clip": 1.04374576, - "balance_loss_mlp": 1.02368724, - "epoch": 0.4530888321058169, - "flos": 20376217751040.0, - "grad_norm": 1.5302063461742579, - "language_loss": 0.80437911, - "learning_rate": 2.3982704650103138e-06, - "loss": 0.82558113, - "num_input_tokens_seen": 161661955, - "step": 7536, - "time_per_iteration": 2.7666051387786865 - }, - { - "auxiliary_loss_clip": 0.01097222, - "auxiliary_loss_mlp": 0.01036263, - "balance_loss_clip": 1.04180908, - "balance_loss_mlp": 1.02248287, - "epoch": 0.4531489553584849, - "flos": 14830425406080.0, - "grad_norm": 2.016168707097938, - "language_loss": 0.76173598, - "learning_rate": 2.3978887959109544e-06, - "loss": 0.78307086, - "num_input_tokens_seen": 161679245, - "step": 7537, - "time_per_iteration": 2.690034866333008 - }, - { - "auxiliary_loss_clip": 0.01118629, - "auxiliary_loss_mlp": 0.01035481, - "balance_loss_clip": 1.04544806, - "balance_loss_mlp": 1.0222249, - "epoch": 0.45320907861115284, - "flos": 21944975316480.0, - "grad_norm": 1.9502516921913984, - "language_loss": 0.75985712, - "learning_rate": 2.3975071117236118e-06, - "loss": 0.78139818, - "num_input_tokens_seen": 161698795, - "step": 7538, - "time_per_iteration": 2.692582130432129 - }, - { - "auxiliary_loss_clip": 0.01037446, - "auxiliary_loss_mlp": 0.01009452, - "balance_loss_clip": 1.01847482, - "balance_loss_mlp": 1.00774765, - "epoch": 0.45326920186382086, - "flos": 66251455038720.0, - "grad_norm": 0.7823640203744525, - "language_loss": 0.62291718, - "learning_rate": 2.3971254124627593e-06, - "loss": 0.64338624, - "num_input_tokens_seen": 161761980, - "step": 7539, - "time_per_iteration": 6.417045593261719 - }, - { - "auxiliary_loss_clip": 0.01129753, - "auxiliary_loss_mlp": 0.01046019, - "balance_loss_clip": 1.04852843, - "balance_loss_mlp": 1.03270316, - "epoch": 0.4533293251164888, - "flos": 14684233052160.0, - "grad_norm": 1.7334435648675772, - "language_loss": 0.65637821, - "learning_rate": 2.396743698142872e-06, - "loss": 0.67813587, - "num_input_tokens_seen": 161779455, - "step": 7540, - "time_per_iteration": 2.7546002864837646 - }, - { - "auxiliary_loss_clip": 0.01106819, - "auxiliary_loss_mlp": 0.01043222, - "balance_loss_clip": 1.0439229, - "balance_loss_mlp": 1.02768898, - "epoch": 0.4533894483691568, - "flos": 22601206840320.0, - "grad_norm": 2.0843332238803587, - "language_loss": 0.84594655, - "learning_rate": 2.396361968778424e-06, - "loss": 0.86744702, - "num_input_tokens_seen": 161798980, - "step": 7541, - "time_per_iteration": 4.3779473304748535 - }, - { - "auxiliary_loss_clip": 0.01103981, - "auxiliary_loss_mlp": 0.01038274, - "balance_loss_clip": 1.04346132, - "balance_loss_mlp": 1.02451134, - "epoch": 0.45344957162182475, - "flos": 34751617666560.0, - "grad_norm": 1.786741767322354, - "language_loss": 0.76398253, - "learning_rate": 2.395980224383889e-06, - "loss": 0.78540504, - "num_input_tokens_seen": 161819745, - "step": 7542, - "time_per_iteration": 2.8061442375183105 - }, - { - "auxiliary_loss_clip": 0.01100521, - "auxiliary_loss_mlp": 0.01030908, - "balance_loss_clip": 1.04320002, - "balance_loss_mlp": 1.01665092, - "epoch": 0.4535096948744927, - "flos": 23550218121600.0, - "grad_norm": 4.384838077420028, - "language_loss": 0.80294377, - "learning_rate": 2.395598464973746e-06, - "loss": 0.82425809, - "num_input_tokens_seen": 161838575, - "step": 7543, - "time_per_iteration": 4.4142186641693115 - }, - { - "auxiliary_loss_clip": 0.01116855, - "auxiliary_loss_mlp": 0.00771625, - "balance_loss_clip": 1.04452896, - "balance_loss_mlp": 1.00043499, - "epoch": 0.4535698181271607, - "flos": 25557552748800.0, - "grad_norm": 1.7946145717938884, - "language_loss": 0.75708425, - "learning_rate": 2.395216690562469e-06, - "loss": 0.77596909, - "num_input_tokens_seen": 161858590, - "step": 7544, - "time_per_iteration": 2.706681966781616 - }, - { - "auxiliary_loss_clip": 0.01097765, - "auxiliary_loss_mlp": 0.01037632, - "balance_loss_clip": 1.04519629, - "balance_loss_mlp": 1.02378595, - "epoch": 0.45362994137982865, - "flos": 24864117713280.0, - "grad_norm": 1.7108154873098056, - "language_loss": 0.75483274, - "learning_rate": 2.3948349011645355e-06, - "loss": 0.7761867, - "num_input_tokens_seen": 161878390, - "step": 7545, - "time_per_iteration": 2.741312026977539 - }, - { - "auxiliary_loss_clip": 0.01106771, - "auxiliary_loss_mlp": 0.0103517, - "balance_loss_clip": 1.04418731, - "balance_loss_mlp": 1.02098417, - "epoch": 0.4536900646324966, - "flos": 30806794408320.0, - "grad_norm": 2.2011621045210057, - "language_loss": 0.72520149, - "learning_rate": 2.394453096794423e-06, - "loss": 0.74662089, - "num_input_tokens_seen": 161898610, - "step": 7546, - "time_per_iteration": 2.7891902923583984 - }, - { - "auxiliary_loss_clip": 0.01108307, - "auxiliary_loss_mlp": 0.01035115, - "balance_loss_clip": 1.04388261, - "balance_loss_mlp": 1.02008224, - "epoch": 0.4537501878851646, - "flos": 23404313076480.0, - "grad_norm": 1.593135285125141, - "language_loss": 0.75609434, - "learning_rate": 2.394071277466609e-06, - "loss": 0.77752858, - "num_input_tokens_seen": 161918210, - "step": 7547, - "time_per_iteration": 2.7260210514068604 - }, - { - "auxiliary_loss_clip": 0.01120791, - "auxiliary_loss_mlp": 0.01033715, - "balance_loss_clip": 1.04588616, - "balance_loss_mlp": 1.01945722, - "epoch": 0.45381031113783254, - "flos": 18149289327360.0, - "grad_norm": 2.150959748604014, - "language_loss": 0.70081824, - "learning_rate": 2.393689443195573e-06, - "loss": 0.72236335, - "num_input_tokens_seen": 161936950, - "step": 7548, - "time_per_iteration": 2.652388095855713 - }, - { - "auxiliary_loss_clip": 0.01129285, - "auxiliary_loss_mlp": 0.01039378, - "balance_loss_clip": 1.04662538, - "balance_loss_mlp": 1.0256331, - "epoch": 0.4538704343905005, - "flos": 25336666062720.0, - "grad_norm": 2.8840782688813293, - "language_loss": 0.73135072, - "learning_rate": 2.393307593995794e-06, - "loss": 0.75303733, - "num_input_tokens_seen": 161955550, - "step": 7549, - "time_per_iteration": 2.8452274799346924 - }, - { - "auxiliary_loss_clip": 0.01091023, - "auxiliary_loss_mlp": 0.01028579, - "balance_loss_clip": 1.040573, - "balance_loss_mlp": 1.01576996, - "epoch": 0.4539305576431685, - "flos": 28731445378560.0, - "grad_norm": 1.9190169905093657, - "language_loss": 0.65320408, - "learning_rate": 2.392925729881751e-06, - "loss": 0.67440009, - "num_input_tokens_seen": 161976760, - "step": 7550, - "time_per_iteration": 2.783653497695923 - }, - { - "auxiliary_loss_clip": 0.01113741, - "auxiliary_loss_mlp": 0.01035092, - "balance_loss_clip": 1.05046797, - "balance_loss_mlp": 1.02172232, - "epoch": 0.45399068089583644, - "flos": 22492397566080.0, - "grad_norm": 1.6128261499338563, - "language_loss": 0.69028163, - "learning_rate": 2.3925438508679263e-06, - "loss": 0.71176994, - "num_input_tokens_seen": 161996120, - "step": 7551, - "time_per_iteration": 2.6571664810180664 - }, - { - "auxiliary_loss_clip": 0.01115638, - "auxiliary_loss_mlp": 0.010339, - "balance_loss_clip": 1.04326105, - "balance_loss_mlp": 1.01979804, - "epoch": 0.45405080414850446, - "flos": 12893403651840.0, - "grad_norm": 1.789312830614556, - "language_loss": 0.79496789, - "learning_rate": 2.392161956968798e-06, - "loss": 0.81646329, - "num_input_tokens_seen": 162011125, - "step": 7552, - "time_per_iteration": 2.6482155323028564 - }, - { - "auxiliary_loss_clip": 0.01042694, - "auxiliary_loss_mlp": 0.0100358, - "balance_loss_clip": 1.02483499, - "balance_loss_mlp": 1.00200677, - "epoch": 0.4541109274011724, - "flos": 59766919724160.0, - "grad_norm": 0.8270469682211425, - "language_loss": 0.57826698, - "learning_rate": 2.39178004819885e-06, - "loss": 0.59872973, - "num_input_tokens_seen": 162068705, - "step": 7553, - "time_per_iteration": 3.1456856727600098 - }, - { - "auxiliary_loss_clip": 0.01064062, - "auxiliary_loss_mlp": 0.01034097, - "balance_loss_clip": 1.04350471, - "balance_loss_mlp": 1.02177691, - "epoch": 0.4541710506538404, - "flos": 28511743841280.0, - "grad_norm": 1.3658485385977341, - "language_loss": 0.76709622, - "learning_rate": 2.3913981245725626e-06, - "loss": 0.78807783, - "num_input_tokens_seen": 162089655, - "step": 7554, - "time_per_iteration": 2.8080356121063232 - }, - { - "auxiliary_loss_clip": 0.01108851, - "auxiliary_loss_mlp": 0.01035523, - "balance_loss_clip": 1.0467329, - "balance_loss_mlp": 1.02056265, - "epoch": 0.45423117390650836, - "flos": 17675591742720.0, - "grad_norm": 3.0408177613289014, - "language_loss": 0.7764836, - "learning_rate": 2.3910161861044194e-06, - "loss": 0.79792738, - "num_input_tokens_seen": 162108465, - "step": 7555, - "time_per_iteration": 2.6776504516601562 - }, - { - "auxiliary_loss_clip": 0.01059757, - "auxiliary_loss_mlp": 0.01032208, - "balance_loss_clip": 1.04157853, - "balance_loss_mlp": 1.01914918, - "epoch": 0.4542912971591763, - "flos": 28072556248320.0, - "grad_norm": 1.7035673731774164, - "language_loss": 0.72646725, - "learning_rate": 2.390634232808903e-06, - "loss": 0.74738687, - "num_input_tokens_seen": 162129910, - "step": 7556, - "time_per_iteration": 2.851022720336914 - }, - { - "auxiliary_loss_clip": 0.01133495, - "auxiliary_loss_mlp": 0.01038462, - "balance_loss_clip": 1.04808855, - "balance_loss_mlp": 1.02491426, - "epoch": 0.4543514204118443, - "flos": 22671771108480.0, - "grad_norm": 2.040538066845486, - "language_loss": 0.6298486, - "learning_rate": 2.3902522647004982e-06, - "loss": 0.65156817, - "num_input_tokens_seen": 162148840, - "step": 7557, - "time_per_iteration": 2.7630646228790283 - }, - { - "auxiliary_loss_clip": 0.01029784, - "auxiliary_loss_mlp": 0.0100461, - "balance_loss_clip": 1.02091062, - "balance_loss_mlp": 1.00302434, - "epoch": 0.45441154366451225, - "flos": 58216549921920.0, - "grad_norm": 0.683633086089208, - "language_loss": 0.57569897, - "learning_rate": 2.3898702817936875e-06, - "loss": 0.59604287, - "num_input_tokens_seen": 162208500, - "step": 7558, - "time_per_iteration": 3.1137866973876953 - }, - { - "auxiliary_loss_clip": 0.01120146, - "auxiliary_loss_mlp": 0.0104176, - "balance_loss_clip": 1.04774594, - "balance_loss_mlp": 1.02645946, - "epoch": 0.4544716669171802, - "flos": 16764286763520.0, - "grad_norm": 4.36821938683546, - "language_loss": 0.56214309, - "learning_rate": 2.3894882841029573e-06, - "loss": 0.58376217, - "num_input_tokens_seen": 162224650, - "step": 7559, - "time_per_iteration": 2.6453661918640137 - }, - { - "auxiliary_loss_clip": 0.01114034, - "auxiliary_loss_mlp": 0.00771404, - "balance_loss_clip": 1.04701853, - "balance_loss_mlp": 1.00053644, - "epoch": 0.4545317901698482, - "flos": 15925233991680.0, - "grad_norm": 3.62707185125481, - "language_loss": 0.72154331, - "learning_rate": 2.389106271642792e-06, - "loss": 0.74039769, - "num_input_tokens_seen": 162242930, - "step": 7560, - "time_per_iteration": 2.734957456588745 - }, - { - "auxiliary_loss_clip": 0.01047807, - "auxiliary_loss_mlp": 0.01042508, - "balance_loss_clip": 1.03757131, - "balance_loss_mlp": 1.02745199, - "epoch": 0.45459191342251615, - "flos": 17639752947840.0, - "grad_norm": 2.1379103724447517, - "language_loss": 0.69509232, - "learning_rate": 2.3887242444276775e-06, - "loss": 0.71599543, - "num_input_tokens_seen": 162261455, - "step": 7561, - "time_per_iteration": 2.8633503913879395 - }, - { - "auxiliary_loss_clip": 0.01103836, - "auxiliary_loss_mlp": 0.01038069, - "balance_loss_clip": 1.04502749, - "balance_loss_mlp": 1.02508128, - "epoch": 0.4546520366751841, - "flos": 16176608346240.0, - "grad_norm": 1.7850356135584633, - "language_loss": 0.85308814, - "learning_rate": 2.3883422024721015e-06, - "loss": 0.87450719, - "num_input_tokens_seen": 162279725, - "step": 7562, - "time_per_iteration": 2.6936264038085938 - }, - { - "auxiliary_loss_clip": 0.01113259, - "auxiliary_loss_mlp": 0.01038297, - "balance_loss_clip": 1.04309893, - "balance_loss_mlp": 1.0244745, - "epoch": 0.4547121599278521, - "flos": 19751443562880.0, - "grad_norm": 1.7930294917475702, - "language_loss": 0.89894032, - "learning_rate": 2.38796014579055e-06, - "loss": 0.92045587, - "num_input_tokens_seen": 162297865, - "step": 7563, - "time_per_iteration": 2.6632707118988037 - }, - { - "auxiliary_loss_clip": 0.01128772, - "auxiliary_loss_mlp": 0.00772113, - "balance_loss_clip": 1.04633093, - "balance_loss_mlp": 1.00060475, - "epoch": 0.45477228318052004, - "flos": 19937461121280.0, - "grad_norm": 1.7120070486519374, - "language_loss": 0.71349525, - "learning_rate": 2.3875780743975097e-06, - "loss": 0.73250407, - "num_input_tokens_seen": 162316010, - "step": 7564, - "time_per_iteration": 2.6610071659088135 - }, - { - "auxiliary_loss_clip": 0.01118776, - "auxiliary_loss_mlp": 0.01037586, - "balance_loss_clip": 1.04351079, - "balance_loss_mlp": 1.02376413, - "epoch": 0.454832406433188, - "flos": 21288312829440.0, - "grad_norm": 2.3273072225052998, - "language_loss": 0.67977536, - "learning_rate": 2.3871959883074713e-06, - "loss": 0.70133895, - "num_input_tokens_seen": 162336115, - "step": 7565, - "time_per_iteration": 2.645447015762329 - }, - { - "auxiliary_loss_clip": 0.01084701, - "auxiliary_loss_mlp": 0.01033288, - "balance_loss_clip": 1.04171932, - "balance_loss_mlp": 1.02002633, - "epoch": 0.45489252968585603, - "flos": 24498726612480.0, - "grad_norm": 1.877770036567151, - "language_loss": 0.80176723, - "learning_rate": 2.386813887534922e-06, - "loss": 0.82294714, - "num_input_tokens_seen": 162355705, - "step": 7566, - "time_per_iteration": 2.7949163913726807 - }, - { - "auxiliary_loss_clip": 0.01090452, - "auxiliary_loss_mlp": 0.01035417, - "balance_loss_clip": 1.04210711, - "balance_loss_mlp": 1.01981235, - "epoch": 0.454952652938524, - "flos": 17092474352640.0, - "grad_norm": 1.6100724605132029, - "language_loss": 0.73702621, - "learning_rate": 2.3864317720943508e-06, - "loss": 0.75828493, - "num_input_tokens_seen": 162374055, - "step": 7567, - "time_per_iteration": 2.8082687854766846 - }, - { - "auxiliary_loss_clip": 0.01093893, - "auxiliary_loss_mlp": 0.01039284, - "balance_loss_clip": 1.04401243, - "balance_loss_mlp": 1.02519345, - "epoch": 0.45501277619119196, - "flos": 27630387826560.0, - "grad_norm": 1.3909583171669249, - "language_loss": 0.81125635, - "learning_rate": 2.386049642000249e-06, - "loss": 0.83258814, - "num_input_tokens_seen": 162393560, - "step": 7568, - "time_per_iteration": 2.7837767601013184 - }, - { - "auxiliary_loss_clip": 0.01126615, - "auxiliary_loss_mlp": 0.01047153, - "balance_loss_clip": 1.04950857, - "balance_loss_mlp": 1.03145313, - "epoch": 0.4550728994438599, - "flos": 19974664632960.0, - "grad_norm": 2.2201304610210175, - "language_loss": 0.79881442, - "learning_rate": 2.3856674972671055e-06, - "loss": 0.82055211, - "num_input_tokens_seen": 162413170, - "step": 7569, - "time_per_iteration": 2.6318490505218506 - }, - { - "auxiliary_loss_clip": 0.01121847, - "auxiliary_loss_mlp": 0.01038069, - "balance_loss_clip": 1.04655576, - "balance_loss_mlp": 1.02286983, - "epoch": 0.4551330226965279, - "flos": 26066873646720.0, - "grad_norm": 1.3612588382742794, - "language_loss": 0.75316679, - "learning_rate": 2.385285337909412e-06, - "loss": 0.77476597, - "num_input_tokens_seen": 162434080, - "step": 7570, - "time_per_iteration": 2.6693389415740967 - }, - { - "auxiliary_loss_clip": 0.0110874, - "auxiliary_loss_mlp": 0.01042662, - "balance_loss_clip": 1.0496285, - "balance_loss_mlp": 1.02787971, - "epoch": 0.45519314594919585, - "flos": 32781091501440.0, - "grad_norm": 1.7331933441120846, - "language_loss": 0.74851429, - "learning_rate": 2.3849031639416596e-06, - "loss": 0.77002835, - "num_input_tokens_seen": 162455445, - "step": 7571, - "time_per_iteration": 2.8367550373077393 - }, - { - "auxiliary_loss_clip": 0.01118243, - "auxiliary_loss_mlp": 0.01037051, - "balance_loss_clip": 1.04903221, - "balance_loss_mlp": 1.02305007, - "epoch": 0.4552532692018638, - "flos": 19172671718400.0, - "grad_norm": 1.8103885190184377, - "language_loss": 0.81033444, - "learning_rate": 2.3845209753783414e-06, - "loss": 0.83188736, - "num_input_tokens_seen": 162474940, - "step": 7572, - "time_per_iteration": 2.654205322265625 - }, - { - "auxiliary_loss_clip": 0.01114723, - "auxiliary_loss_mlp": 0.01041135, - "balance_loss_clip": 1.04709005, - "balance_loss_mlp": 1.02511287, - "epoch": 0.4553133924545318, - "flos": 26027156183040.0, - "grad_norm": 1.7361541689984175, - "language_loss": 0.7262516, - "learning_rate": 2.3841387722339486e-06, - "loss": 0.74781018, - "num_input_tokens_seen": 162493340, - "step": 7573, - "time_per_iteration": 2.7468600273132324 - }, - { - "auxiliary_loss_clip": 0.01124507, - "auxiliary_loss_mlp": 0.01039816, - "balance_loss_clip": 1.04916418, - "balance_loss_mlp": 1.02327013, - "epoch": 0.45537351570719975, - "flos": 30661535808000.0, - "grad_norm": 1.869301925708578, - "language_loss": 0.74335551, - "learning_rate": 2.3837565545229748e-06, - "loss": 0.76499879, - "num_input_tokens_seen": 162514360, - "step": 7574, - "time_per_iteration": 2.7575597763061523 - }, - { - "auxiliary_loss_clip": 0.01121884, - "auxiliary_loss_mlp": 0.01036714, - "balance_loss_clip": 1.04758859, - "balance_loss_mlp": 1.02184868, - "epoch": 0.4554336389598677, - "flos": 24353396184960.0, - "grad_norm": 1.5603127476263212, - "language_loss": 0.7161333, - "learning_rate": 2.383374322259915e-06, - "loss": 0.7377193, - "num_input_tokens_seen": 162535240, - "step": 7575, - "time_per_iteration": 2.6638269424438477 - }, - { - "auxiliary_loss_clip": 0.01106959, - "auxiliary_loss_mlp": 0.01035456, - "balance_loss_clip": 1.04536855, - "balance_loss_mlp": 1.02120471, - "epoch": 0.4554937622125357, - "flos": 20557925677440.0, - "grad_norm": 1.872589408642276, - "language_loss": 0.73370463, - "learning_rate": 2.3829920754592617e-06, - "loss": 0.7551288, - "num_input_tokens_seen": 162553880, - "step": 7576, - "time_per_iteration": 2.686311721801758 - }, - { - "auxiliary_loss_clip": 0.01129005, - "auxiliary_loss_mlp": 0.01036522, - "balance_loss_clip": 1.04784572, - "balance_loss_mlp": 1.02179956, - "epoch": 0.45555388546520365, - "flos": 22820764723200.0, - "grad_norm": 1.7873556557153987, - "language_loss": 0.66664052, - "learning_rate": 2.382609814135511e-06, - "loss": 0.68829584, - "num_input_tokens_seen": 162574485, - "step": 7577, - "time_per_iteration": 2.6766581535339355 - }, - { - "auxiliary_loss_clip": 0.01103092, - "auxiliary_loss_mlp": 0.01046596, - "balance_loss_clip": 1.04435253, - "balance_loss_mlp": 1.0300076, - "epoch": 0.4556140087178716, - "flos": 21725992051200.0, - "grad_norm": 1.9298557564452474, - "language_loss": 0.74309111, - "learning_rate": 2.382227538303157e-06, - "loss": 0.76458794, - "num_input_tokens_seen": 162595130, - "step": 7578, - "time_per_iteration": 4.310480356216431 - }, - { - "auxiliary_loss_clip": 0.01079377, - "auxiliary_loss_mlp": 0.00774819, - "balance_loss_clip": 1.04437256, - "balance_loss_mlp": 1.00061071, - "epoch": 0.45567413197053963, - "flos": 25994513698560.0, - "grad_norm": 1.7583976894464832, - "language_loss": 0.69843179, - "learning_rate": 2.381845247976697e-06, - "loss": 0.71697378, - "num_input_tokens_seen": 162615720, - "step": 7579, - "time_per_iteration": 4.325899362564087 - }, - { - "auxiliary_loss_clip": 0.01116252, - "auxiliary_loss_mlp": 0.01033231, - "balance_loss_clip": 1.0446142, - "balance_loss_mlp": 1.0195992, - "epoch": 0.4557342552232076, - "flos": 21537604195200.0, - "grad_norm": 1.7639178263730233, - "language_loss": 0.78628397, - "learning_rate": 2.381462943170627e-06, - "loss": 0.80777884, - "num_input_tokens_seen": 162635825, - "step": 7580, - "time_per_iteration": 2.6391446590423584 - }, - { - "auxiliary_loss_clip": 0.0113405, - "auxiliary_loss_mlp": 0.01031474, - "balance_loss_clip": 1.05214024, - "balance_loss_mlp": 1.01697779, - "epoch": 0.45579437847587556, - "flos": 40001972647680.0, - "grad_norm": 1.99718885063772, - "language_loss": 0.68943548, - "learning_rate": 2.381080623899444e-06, - "loss": 0.71109068, - "num_input_tokens_seen": 162659130, - "step": 7581, - "time_per_iteration": 4.234206914901733 - }, - { - "auxiliary_loss_clip": 0.01111938, - "auxiliary_loss_mlp": 0.01032821, - "balance_loss_clip": 1.04282808, - "balance_loss_mlp": 1.01836669, - "epoch": 0.4558545017285435, - "flos": 31138501530240.0, - "grad_norm": 1.6647606381596314, - "language_loss": 0.73356318, - "learning_rate": 2.3806982901776455e-06, - "loss": 0.75501084, - "num_input_tokens_seen": 162681665, - "step": 7582, - "time_per_iteration": 4.333024978637695 - }, - { - "auxiliary_loss_clip": 0.0113626, - "auxiliary_loss_mlp": 0.01043946, - "balance_loss_clip": 1.05043411, - "balance_loss_mlp": 1.02829337, - "epoch": 0.4559146249812115, - "flos": 21725776569600.0, - "grad_norm": 1.9011112097623832, - "language_loss": 0.72327513, - "learning_rate": 2.380315942019729e-06, - "loss": 0.74507719, - "num_input_tokens_seen": 162702040, - "step": 7583, - "time_per_iteration": 2.633423089981079 - }, - { - "auxiliary_loss_clip": 0.01122524, - "auxiliary_loss_mlp": 0.01037395, - "balance_loss_clip": 1.05119634, - "balance_loss_mlp": 1.02291131, - "epoch": 0.45597474823387946, - "flos": 23805973935360.0, - "grad_norm": 1.6028864846132196, - "language_loss": 0.72692537, - "learning_rate": 2.379933579440195e-06, - "loss": 0.74852461, - "num_input_tokens_seen": 162722375, - "step": 7584, - "time_per_iteration": 2.6895499229431152 - }, - { - "auxiliary_loss_clip": 0.01089384, - "auxiliary_loss_mlp": 0.01040718, - "balance_loss_clip": 1.04311633, - "balance_loss_mlp": 1.02606773, - "epoch": 0.4560348714865474, - "flos": 31905661230720.0, - "grad_norm": 1.833639423310481, - "language_loss": 0.68204761, - "learning_rate": 2.379551202453541e-06, - "loss": 0.70334864, - "num_input_tokens_seen": 162746095, - "step": 7585, - "time_per_iteration": 2.7882261276245117 - }, - { - "auxiliary_loss_clip": 0.01132515, - "auxiliary_loss_mlp": 0.01030518, - "balance_loss_clip": 1.05002046, - "balance_loss_mlp": 1.01725006, - "epoch": 0.4560949947392154, - "flos": 22048828513920.0, - "grad_norm": 1.65915998971852, - "language_loss": 0.7634117, - "learning_rate": 2.379168811074267e-06, - "loss": 0.78504205, - "num_input_tokens_seen": 162766330, - "step": 7586, - "time_per_iteration": 2.636626720428467 - }, - { - "auxiliary_loss_clip": 0.01109504, - "auxiliary_loss_mlp": 0.01029829, - "balance_loss_clip": 1.04642403, - "balance_loss_mlp": 1.01651323, - "epoch": 0.45615511799188335, - "flos": 24571804832640.0, - "grad_norm": 44.63874812648689, - "language_loss": 0.78151405, - "learning_rate": 2.3787864053168747e-06, - "loss": 0.80290735, - "num_input_tokens_seen": 162784755, - "step": 7587, - "time_per_iteration": 2.7801096439361572 - }, - { - "auxiliary_loss_clip": 0.01105539, - "auxiliary_loss_mlp": 0.01044536, - "balance_loss_clip": 1.04288149, - "balance_loss_mlp": 1.02933669, - "epoch": 0.4562152412445513, - "flos": 18330709944960.0, - "grad_norm": 2.252015566278715, - "language_loss": 0.6950196, - "learning_rate": 2.378403985195863e-06, - "loss": 0.71652043, - "num_input_tokens_seen": 162803850, - "step": 7588, - "time_per_iteration": 2.7108840942382812 - }, - { - "auxiliary_loss_clip": 0.01118383, - "auxiliary_loss_mlp": 0.01036327, - "balance_loss_clip": 1.05038464, - "balance_loss_mlp": 1.02234375, - "epoch": 0.4562753644972193, - "flos": 13516525814400.0, - "grad_norm": 1.6983482750091652, - "language_loss": 0.79372728, - "learning_rate": 2.378021550725735e-06, - "loss": 0.81527448, - "num_input_tokens_seen": 162820775, - "step": 7589, - "time_per_iteration": 2.6967854499816895 - }, - { - "auxiliary_loss_clip": 0.01121003, - "auxiliary_loss_mlp": 0.01035976, - "balance_loss_clip": 1.04755974, - "balance_loss_mlp": 1.02120006, - "epoch": 0.45633548774988725, - "flos": 29639697701760.0, - "grad_norm": 2.457585749278853, - "language_loss": 0.62875861, - "learning_rate": 2.377639101920992e-06, - "loss": 0.6503284, - "num_input_tokens_seen": 162839695, - "step": 7590, - "time_per_iteration": 2.6659393310546875 - }, - { - "auxiliary_loss_clip": 0.01101858, - "auxiliary_loss_mlp": 0.01045493, - "balance_loss_clip": 1.04248881, - "balance_loss_mlp": 1.03150392, - "epoch": 0.4563956110025552, - "flos": 22233409528320.0, - "grad_norm": 1.8064400322650376, - "language_loss": 0.73125023, - "learning_rate": 2.377256638796135e-06, - "loss": 0.75272369, - "num_input_tokens_seen": 162856095, - "step": 7591, - "time_per_iteration": 2.7296926975250244 - }, - { - "auxiliary_loss_clip": 0.01113505, - "auxiliary_loss_mlp": 0.01043243, - "balance_loss_clip": 1.04979515, - "balance_loss_mlp": 1.02757883, - "epoch": 0.45645573425522323, - "flos": 17092043389440.0, - "grad_norm": 2.6622201495184923, - "language_loss": 0.76661623, - "learning_rate": 2.3768741613656695e-06, - "loss": 0.78818369, - "num_input_tokens_seen": 162874070, - "step": 7592, - "time_per_iteration": 2.855787992477417 - }, - { - "auxiliary_loss_clip": 0.01104851, - "auxiliary_loss_mlp": 0.01042123, - "balance_loss_clip": 1.04489005, - "balance_loss_mlp": 1.026191, - "epoch": 0.4565158575078912, - "flos": 20332334309760.0, - "grad_norm": 2.112667667080726, - "language_loss": 0.6938538, - "learning_rate": 2.376491669644098e-06, - "loss": 0.71532357, - "num_input_tokens_seen": 162891000, - "step": 7593, - "time_per_iteration": 2.7688679695129395 - }, - { - "auxiliary_loss_clip": 0.01110049, - "auxiliary_loss_mlp": 0.01034633, - "balance_loss_clip": 1.04238796, - "balance_loss_mlp": 1.02174079, - "epoch": 0.45657598076055916, - "flos": 23983013093760.0, - "grad_norm": 2.174557271524546, - "language_loss": 0.83913857, - "learning_rate": 2.3761091636459248e-06, - "loss": 0.86058539, - "num_input_tokens_seen": 162910120, - "step": 7594, - "time_per_iteration": 2.807098865509033 - }, - { - "auxiliary_loss_clip": 0.01036589, - "auxiliary_loss_mlp": 0.00753626, - "balance_loss_clip": 1.01769352, - "balance_loss_mlp": 1.00077426, - "epoch": 0.45663610401322713, - "flos": 69364297526400.0, - "grad_norm": 0.7884707903863047, - "language_loss": 0.52737939, - "learning_rate": 2.375726643385654e-06, - "loss": 0.54528153, - "num_input_tokens_seen": 162963720, - "step": 7595, - "time_per_iteration": 3.2812860012054443 - }, - { - "auxiliary_loss_clip": 0.01096992, - "auxiliary_loss_mlp": 0.01034204, - "balance_loss_clip": 1.04297972, - "balance_loss_mlp": 1.01864684, - "epoch": 0.4566962272658951, - "flos": 15149095891200.0, - "grad_norm": 2.562717754165903, - "language_loss": 0.87188721, - "learning_rate": 2.3753441088777915e-06, - "loss": 0.89319921, - "num_input_tokens_seen": 162975760, - "step": 7596, - "time_per_iteration": 2.683833122253418 - }, - { - "auxiliary_loss_clip": 0.01126007, - "auxiliary_loss_mlp": 0.01046188, - "balance_loss_clip": 1.05094647, - "balance_loss_mlp": 1.03226399, - "epoch": 0.45675635051856306, - "flos": 18697465762560.0, - "grad_norm": 8.947162495751469, - "language_loss": 0.77418292, - "learning_rate": 2.374961560136843e-06, - "loss": 0.79590482, - "num_input_tokens_seen": 162994865, - "step": 7597, - "time_per_iteration": 2.686328887939453 - }, - { - "auxiliary_loss_clip": 0.01117589, - "auxiliary_loss_mlp": 0.01038291, - "balance_loss_clip": 1.04493558, - "balance_loss_mlp": 1.02389073, - "epoch": 0.456816473771231, - "flos": 19098300608640.0, - "grad_norm": 1.6036220935275767, - "language_loss": 0.78581583, - "learning_rate": 2.374578997177314e-06, - "loss": 0.80737466, - "num_input_tokens_seen": 163014730, - "step": 7598, - "time_per_iteration": 2.6856606006622314 - }, - { - "auxiliary_loss_clip": 0.01128723, - "auxiliary_loss_mlp": 0.01034286, - "balance_loss_clip": 1.04699326, - "balance_loss_mlp": 1.02080941, - "epoch": 0.456876597023899, - "flos": 28950069507840.0, - "grad_norm": 3.021485745265107, - "language_loss": 0.71589166, - "learning_rate": 2.374196420013712e-06, - "loss": 0.73752177, - "num_input_tokens_seen": 163033405, - "step": 7599, - "time_per_iteration": 2.672055244445801 - }, - { - "auxiliary_loss_clip": 0.0109465, - "auxiliary_loss_mlp": 0.01038748, - "balance_loss_clip": 1.04185176, - "balance_loss_mlp": 1.02445507, - "epoch": 0.45693672027656695, - "flos": 23289470317440.0, - "grad_norm": 2.0431074720876046, - "language_loss": 0.70262265, - "learning_rate": 2.373813828660544e-06, - "loss": 0.72395658, - "num_input_tokens_seen": 163051400, - "step": 7600, - "time_per_iteration": 2.8163371086120605 - }, - { - "auxiliary_loss_clip": 0.01066248, - "auxiliary_loss_mlp": 0.01041467, - "balance_loss_clip": 1.04143667, - "balance_loss_mlp": 1.02802658, - "epoch": 0.4569968435292349, - "flos": 20558212986240.0, - "grad_norm": 6.700465706217943, - "language_loss": 0.79066253, - "learning_rate": 2.373431223132319e-06, - "loss": 0.81173962, - "num_input_tokens_seen": 163069250, - "step": 7601, - "time_per_iteration": 2.8098480701446533 - }, - { - "auxiliary_loss_clip": 0.01100447, - "auxiliary_loss_mlp": 0.01041284, - "balance_loss_clip": 1.04293573, - "balance_loss_mlp": 1.02730095, - "epoch": 0.4570569667819029, - "flos": 41282619223680.0, - "grad_norm": 6.824528646616988, - "language_loss": 0.71565419, - "learning_rate": 2.3730486034435448e-06, - "loss": 0.73707151, - "num_input_tokens_seen": 163091755, - "step": 7602, - "time_per_iteration": 2.8971548080444336 - }, - { - "auxiliary_loss_clip": 0.01115269, - "auxiliary_loss_mlp": 0.01034582, - "balance_loss_clip": 1.04276979, - "balance_loss_mlp": 1.01859641, - "epoch": 0.45711709003457085, - "flos": 26031573555840.0, - "grad_norm": 1.8661067599139867, - "language_loss": 0.73023772, - "learning_rate": 2.372665969608729e-06, - "loss": 0.75173628, - "num_input_tokens_seen": 163111600, - "step": 7603, - "time_per_iteration": 2.709261417388916 - }, - { - "auxiliary_loss_clip": 0.01120961, - "auxiliary_loss_mlp": 0.01043179, - "balance_loss_clip": 1.04799032, - "balance_loss_mlp": 1.02714539, - "epoch": 0.4571772132872388, - "flos": 22158068751360.0, - "grad_norm": 1.901129043888336, - "language_loss": 0.83068597, - "learning_rate": 2.372283321642383e-06, - "loss": 0.85232735, - "num_input_tokens_seen": 163127350, - "step": 7604, - "time_per_iteration": 2.713744640350342 - }, - { - "auxiliary_loss_clip": 0.01113838, - "auxiliary_loss_mlp": 0.01045941, - "balance_loss_clip": 1.05216503, - "balance_loss_mlp": 1.02981162, - "epoch": 0.45723733653990684, - "flos": 23878872587520.0, - "grad_norm": 2.0592585158299133, - "language_loss": 0.85998154, - "learning_rate": 2.371900659559016e-06, - "loss": 0.88157928, - "num_input_tokens_seen": 163145855, - "step": 7605, - "time_per_iteration": 2.6666319370269775 - }, - { - "auxiliary_loss_clip": 0.010831, - "auxiliary_loss_mlp": 0.01041844, - "balance_loss_clip": 1.04206753, - "balance_loss_mlp": 1.02670407, - "epoch": 0.4572974597925748, - "flos": 16871803148160.0, - "grad_norm": 1.8551011968860212, - "language_loss": 0.73551464, - "learning_rate": 2.371517983373138e-06, - "loss": 0.75676405, - "num_input_tokens_seen": 163163830, - "step": 7606, - "time_per_iteration": 2.8618602752685547 - }, - { - "auxiliary_loss_clip": 0.01100268, - "auxiliary_loss_mlp": 0.01043762, - "balance_loss_clip": 1.0450927, - "balance_loss_mlp": 1.02790761, - "epoch": 0.45735758304524277, - "flos": 13771491528960.0, - "grad_norm": 1.9296458941386103, - "language_loss": 0.80260599, - "learning_rate": 2.371135293099262e-06, - "loss": 0.82404631, - "num_input_tokens_seen": 163180700, - "step": 7607, - "time_per_iteration": 2.717987537384033 - }, - { - "auxiliary_loss_clip": 0.01097097, - "auxiliary_loss_mlp": 0.01046228, - "balance_loss_clip": 1.05015063, - "balance_loss_mlp": 1.03169668, - "epoch": 0.45741770629791073, - "flos": 21100750986240.0, - "grad_norm": 1.7686881404445909, - "language_loss": 0.81263912, - "learning_rate": 2.3707525887518982e-06, - "loss": 0.83407241, - "num_input_tokens_seen": 163199450, - "step": 7608, - "time_per_iteration": 2.7047500610351562 - }, - { - "auxiliary_loss_clip": 0.01110681, - "auxiliary_loss_mlp": 0.01043615, - "balance_loss_clip": 1.04563498, - "balance_loss_mlp": 1.02828515, - "epoch": 0.4574778295505787, - "flos": 23112898035840.0, - "grad_norm": 3.284613619336592, - "language_loss": 0.68429869, - "learning_rate": 2.370369870345559e-06, - "loss": 0.70584166, - "num_input_tokens_seen": 163217875, - "step": 7609, - "time_per_iteration": 2.7123308181762695 - }, - { - "auxiliary_loss_clip": 0.01105383, - "auxiliary_loss_mlp": 0.01045291, - "balance_loss_clip": 1.04979467, - "balance_loss_mlp": 1.03011012, - "epoch": 0.45753795280324666, - "flos": 24352929308160.0, - "grad_norm": 1.7858891409698046, - "language_loss": 0.80873275, - "learning_rate": 2.369987137894757e-06, - "loss": 0.83023953, - "num_input_tokens_seen": 163237430, - "step": 7610, - "time_per_iteration": 2.707108497619629 - }, - { - "auxiliary_loss_clip": 0.01122367, - "auxiliary_loss_mlp": 0.01042029, - "balance_loss_clip": 1.04675138, - "balance_loss_mlp": 1.02698421, - "epoch": 0.4575980760559146, - "flos": 16653789550080.0, - "grad_norm": 2.2133206913732746, - "language_loss": 0.82100248, - "learning_rate": 2.3696043914140057e-06, - "loss": 0.84264642, - "num_input_tokens_seen": 163253905, - "step": 7611, - "time_per_iteration": 2.6911368370056152 - }, - { - "auxiliary_loss_clip": 0.01127544, - "auxiliary_loss_mlp": 0.01034771, - "balance_loss_clip": 1.05061793, - "balance_loss_mlp": 1.01889205, - "epoch": 0.4576581993085826, - "flos": 35911423912320.0, - "grad_norm": 2.6253593942917677, - "language_loss": 0.73971558, - "learning_rate": 2.369221630917819e-06, - "loss": 0.76133871, - "num_input_tokens_seen": 163274285, - "step": 7612, - "time_per_iteration": 2.8162691593170166 - }, - { - "auxiliary_loss_clip": 0.01103651, - "auxiliary_loss_mlp": 0.01042157, - "balance_loss_clip": 1.04241323, - "balance_loss_mlp": 1.02680302, - "epoch": 0.45771832256125056, - "flos": 20080421251200.0, - "grad_norm": 1.6042487302929564, - "language_loss": 0.84652913, - "learning_rate": 2.368838856420711e-06, - "loss": 0.86798728, - "num_input_tokens_seen": 163293150, - "step": 7613, - "time_per_iteration": 2.66471266746521 - }, - { - "auxiliary_loss_clip": 0.01096161, - "auxiliary_loss_mlp": 0.01038746, - "balance_loss_clip": 1.04437852, - "balance_loss_mlp": 1.02373135, - "epoch": 0.4577784458139185, - "flos": 10744329957120.0, - "grad_norm": 2.314421678604919, - "language_loss": 0.75271547, - "learning_rate": 2.3684560679371965e-06, - "loss": 0.77406454, - "num_input_tokens_seen": 163310065, - "step": 7614, - "time_per_iteration": 2.740011215209961 - }, - { - "auxiliary_loss_clip": 0.01132592, - "auxiliary_loss_mlp": 0.01037968, - "balance_loss_clip": 1.05067575, - "balance_loss_mlp": 1.02378809, - "epoch": 0.4578385690665865, - "flos": 21907269014400.0, - "grad_norm": 1.5980870069512307, - "language_loss": 0.75026065, - "learning_rate": 2.368073265481791e-06, - "loss": 0.77196622, - "num_input_tokens_seen": 163329415, - "step": 7615, - "time_per_iteration": 2.694354772567749 - }, - { - "auxiliary_loss_clip": 0.01037366, - "auxiliary_loss_mlp": 0.01005104, - "balance_loss_clip": 1.02879357, - "balance_loss_mlp": 1.00286281, - "epoch": 0.45789869231925445, - "flos": 64758286667520.0, - "grad_norm": 0.785268606967784, - "language_loss": 0.57671446, - "learning_rate": 2.3676904490690105e-06, - "loss": 0.59713912, - "num_input_tokens_seen": 163385875, - "step": 7616, - "time_per_iteration": 3.2036197185516357 - }, - { - "auxiliary_loss_clip": 0.010986, - "auxiliary_loss_mlp": 0.00772301, - "balance_loss_clip": 1.04307699, - "balance_loss_mlp": 1.00081253, - "epoch": 0.4579588155719224, - "flos": 16144001775360.0, - "grad_norm": 1.6020549029918738, - "language_loss": 0.70836008, - "learning_rate": 2.3673076187133704e-06, - "loss": 0.72706908, - "num_input_tokens_seen": 163405170, - "step": 7617, - "time_per_iteration": 2.7075886726379395 - }, - { - "auxiliary_loss_clip": 0.01137127, - "auxiliary_loss_mlp": 0.01037359, - "balance_loss_clip": 1.05343175, - "balance_loss_mlp": 1.02264261, - "epoch": 0.45801893882459044, - "flos": 21395541905280.0, - "grad_norm": 1.8894449061399028, - "language_loss": 0.76292491, - "learning_rate": 2.36692477442939e-06, - "loss": 0.78466976, - "num_input_tokens_seen": 163423155, - "step": 7618, - "time_per_iteration": 5.8249146938323975 - }, - { - "auxiliary_loss_clip": 0.01101544, - "auxiliary_loss_mlp": 0.01045871, - "balance_loss_clip": 1.05301738, - "balance_loss_mlp": 1.03189957, - "epoch": 0.4580790620772584, - "flos": 19536554448000.0, - "grad_norm": 1.7481433677396025, - "language_loss": 0.77097881, - "learning_rate": 2.366541916231585e-06, - "loss": 0.79245299, - "num_input_tokens_seen": 163442450, - "step": 7619, - "time_per_iteration": 2.766615629196167 - }, - { - "auxiliary_loss_clip": 0.01134342, - "auxiliary_loss_mlp": 0.01040375, - "balance_loss_clip": 1.05348432, - "balance_loss_mlp": 1.02757239, - "epoch": 0.45813918532992637, - "flos": 16581070465920.0, - "grad_norm": 1.8920903156272437, - "language_loss": 0.72002041, - "learning_rate": 2.366159044134473e-06, - "loss": 0.74176759, - "num_input_tokens_seen": 163459810, - "step": 7620, - "time_per_iteration": 4.087975025177002 - }, - { - "auxiliary_loss_clip": 0.01109227, - "auxiliary_loss_mlp": 0.01032686, - "balance_loss_clip": 1.04942107, - "balance_loss_mlp": 1.01892948, - "epoch": 0.45819930858259433, - "flos": 42230301701760.0, - "grad_norm": 1.5465249381842834, - "language_loss": 0.77770388, - "learning_rate": 2.3657761581525748e-06, - "loss": 0.79912305, - "num_input_tokens_seen": 163482970, - "step": 7621, - "time_per_iteration": 2.9124109745025635 - }, - { - "auxiliary_loss_clip": 0.01044673, - "auxiliary_loss_mlp": 0.01001257, - "balance_loss_clip": 1.02584982, - "balance_loss_mlp": 0.99903959, - "epoch": 0.4582594318352623, - "flos": 63714795638400.0, - "grad_norm": 0.7823065471017115, - "language_loss": 0.64958, - "learning_rate": 2.3653932583004063e-06, - "loss": 0.6700393, - "num_input_tokens_seen": 163545330, - "step": 7622, - "time_per_iteration": 4.778898477554321 - }, - { - "auxiliary_loss_clip": 0.01120212, - "auxiliary_loss_mlp": 0.01034924, - "balance_loss_clip": 1.05105555, - "balance_loss_mlp": 1.02016604, - "epoch": 0.45831955508793026, - "flos": 26869979882880.0, - "grad_norm": 3.654827974152138, - "language_loss": 0.79468191, - "learning_rate": 2.3650103445924903e-06, - "loss": 0.81623328, - "num_input_tokens_seen": 163564620, - "step": 7623, - "time_per_iteration": 2.7033259868621826 - }, - { - "auxiliary_loss_clip": 0.01078844, - "auxiliary_loss_mlp": 0.01041828, - "balance_loss_clip": 1.04181957, - "balance_loss_mlp": 1.02728403, - "epoch": 0.45837967834059823, - "flos": 18733951002240.0, - "grad_norm": 1.8933831090876323, - "language_loss": 0.70283759, - "learning_rate": 2.3646274170433452e-06, - "loss": 0.72404432, - "num_input_tokens_seen": 163581010, - "step": 7624, - "time_per_iteration": 2.8526861667633057 - }, - { - "auxiliary_loss_clip": 0.01100025, - "auxiliary_loss_mlp": 0.01040188, - "balance_loss_clip": 1.04250479, - "balance_loss_mlp": 1.02558446, - "epoch": 0.4584398015932662, - "flos": 21178102924800.0, - "grad_norm": 2.2295023596293273, - "language_loss": 0.73171687, - "learning_rate": 2.364244475667491e-06, - "loss": 0.75311905, - "num_input_tokens_seen": 163599955, - "step": 7625, - "time_per_iteration": 2.77284574508667 - }, - { - "auxiliary_loss_clip": 0.01120178, - "auxiliary_loss_mlp": 0.01036964, - "balance_loss_clip": 1.05209434, - "balance_loss_mlp": 1.02369022, - "epoch": 0.45849992484593416, - "flos": 19790047704960.0, - "grad_norm": 2.499945379712242, - "language_loss": 0.77924562, - "learning_rate": 2.363861520479451e-06, - "loss": 0.80081707, - "num_input_tokens_seen": 163618545, - "step": 7626, - "time_per_iteration": 2.813945770263672 - }, - { - "auxiliary_loss_clip": 0.01137615, - "auxiliary_loss_mlp": 0.01040207, - "balance_loss_clip": 1.05263078, - "balance_loss_mlp": 1.02645612, - "epoch": 0.4585600480986021, - "flos": 18223265387520.0, - "grad_norm": 1.5689934094814115, - "language_loss": 0.84652817, - "learning_rate": 2.3634785514937445e-06, - "loss": 0.8683064, - "num_input_tokens_seen": 163636055, - "step": 7627, - "time_per_iteration": 2.659053087234497 - }, - { - "auxiliary_loss_clip": 0.01138145, - "auxiliary_loss_mlp": 0.01040233, - "balance_loss_clip": 1.05155802, - "balance_loss_mlp": 1.02531946, - "epoch": 0.4586201713512701, - "flos": 29022213974400.0, - "grad_norm": 1.5125222475387885, - "language_loss": 0.6911087, - "learning_rate": 2.3630955687248953e-06, - "loss": 0.71289253, - "num_input_tokens_seen": 163657485, - "step": 7628, - "time_per_iteration": 2.693678617477417 - }, - { - "auxiliary_loss_clip": 0.01118783, - "auxiliary_loss_mlp": 0.01034859, - "balance_loss_clip": 1.04731619, - "balance_loss_mlp": 1.02110827, - "epoch": 0.45868029460393805, - "flos": 23404600385280.0, - "grad_norm": 1.4972122231294245, - "language_loss": 0.78672099, - "learning_rate": 2.3627125721874265e-06, - "loss": 0.80825746, - "num_input_tokens_seen": 163676030, - "step": 7629, - "time_per_iteration": 2.6437535285949707 - }, - { - "auxiliary_loss_clip": 0.01113389, - "auxiliary_loss_mlp": 0.01045555, - "balance_loss_clip": 1.04590559, - "balance_loss_mlp": 1.03034973, - "epoch": 0.458740417856606, - "flos": 18221972497920.0, - "grad_norm": 2.2059444062956985, - "language_loss": 0.79377991, - "learning_rate": 2.3623295618958595e-06, - "loss": 0.81536937, - "num_input_tokens_seen": 163694490, - "step": 7630, - "time_per_iteration": 2.7565791606903076 - }, - { - "auxiliary_loss_clip": 0.01111942, - "auxiliary_loss_mlp": 0.01039415, - "balance_loss_clip": 1.04838312, - "balance_loss_mlp": 1.02481222, - "epoch": 0.458800541109274, - "flos": 34568760504960.0, - "grad_norm": 2.1212994157581293, - "language_loss": 0.72087741, - "learning_rate": 2.3619465378647198e-06, - "loss": 0.74239099, - "num_input_tokens_seen": 163717035, - "step": 7631, - "time_per_iteration": 2.7880306243896484 - }, - { - "auxiliary_loss_clip": 0.01094955, - "auxiliary_loss_mlp": 0.01048432, - "balance_loss_clip": 1.04605651, - "balance_loss_mlp": 1.03280342, - "epoch": 0.458860664361942, - "flos": 17712112896000.0, - "grad_norm": 2.4606182879569145, - "language_loss": 0.71433818, - "learning_rate": 2.361563500108531e-06, - "loss": 0.73577201, - "num_input_tokens_seen": 163734525, - "step": 7632, - "time_per_iteration": 2.7352800369262695 - }, - { - "auxiliary_loss_clip": 0.01081835, - "auxiliary_loss_mlp": 0.00774034, - "balance_loss_clip": 1.04268694, - "balance_loss_mlp": 1.00058782, - "epoch": 0.45892078761460997, - "flos": 18441889516800.0, - "grad_norm": 2.5758659525876824, - "language_loss": 0.68867576, - "learning_rate": 2.3611804486418178e-06, - "loss": 0.7072345, - "num_input_tokens_seen": 163752860, - "step": 7633, - "time_per_iteration": 2.848534107208252 - }, - { - "auxiliary_loss_clip": 0.01122955, - "auxiliary_loss_mlp": 0.01043952, - "balance_loss_clip": 1.05012798, - "balance_loss_mlp": 1.02942061, - "epoch": 0.45898091086727794, - "flos": 22672956257280.0, - "grad_norm": 1.690968390723207, - "language_loss": 0.80858737, - "learning_rate": 2.3607973834791062e-06, - "loss": 0.83025646, - "num_input_tokens_seen": 163772495, - "step": 7634, - "time_per_iteration": 2.6536448001861572 - }, - { - "auxiliary_loss_clip": 0.01122911, - "auxiliary_loss_mlp": 0.00773021, - "balance_loss_clip": 1.04987049, - "balance_loss_mlp": 1.00053596, - "epoch": 0.4590410341199459, - "flos": 21652949744640.0, - "grad_norm": 1.6933583063541449, - "language_loss": 0.81255853, - "learning_rate": 2.3604143046349216e-06, - "loss": 0.83151788, - "num_input_tokens_seen": 163791475, - "step": 7635, - "time_per_iteration": 2.6140496730804443 - }, - { - "auxiliary_loss_clip": 0.01110725, - "auxiliary_loss_mlp": 0.01043522, - "balance_loss_clip": 1.04990745, - "balance_loss_mlp": 1.02941322, - "epoch": 0.45910115737261387, - "flos": 36535372087680.0, - "grad_norm": 1.4938285014309638, - "language_loss": 0.64786839, - "learning_rate": 2.3600312121237905e-06, - "loss": 0.66941082, - "num_input_tokens_seen": 163812995, - "step": 7636, - "time_per_iteration": 2.9211695194244385 - }, - { - "auxiliary_loss_clip": 0.01117391, - "auxiliary_loss_mlp": 0.01034493, - "balance_loss_clip": 1.05096126, - "balance_loss_mlp": 1.0207361, - "epoch": 0.45916128062528183, - "flos": 24419866302720.0, - "grad_norm": 1.5704675488980822, - "language_loss": 0.8052876, - "learning_rate": 2.3596481059602395e-06, - "loss": 0.82680643, - "num_input_tokens_seen": 163833945, - "step": 7637, - "time_per_iteration": 2.703902244567871 - }, - { - "auxiliary_loss_clip": 0.0110221, - "auxiliary_loss_mlp": 0.0104296, - "balance_loss_clip": 1.04369295, - "balance_loss_mlp": 1.02650893, - "epoch": 0.4592214038779498, - "flos": 23221958705280.0, - "grad_norm": 1.340585421251073, - "language_loss": 0.75339955, - "learning_rate": 2.3592649861587965e-06, - "loss": 0.7748512, - "num_input_tokens_seen": 163853885, - "step": 7638, - "time_per_iteration": 2.8683316707611084 - }, - { - "auxiliary_loss_clip": 0.01118666, - "auxiliary_loss_mlp": 0.01037335, - "balance_loss_clip": 1.04785442, - "balance_loss_mlp": 1.02312553, - "epoch": 0.45928152713061776, - "flos": 19172133014400.0, - "grad_norm": 1.8020175509044534, - "language_loss": 0.74017608, - "learning_rate": 2.358881852733989e-06, - "loss": 0.76173615, - "num_input_tokens_seen": 163871855, - "step": 7639, - "time_per_iteration": 2.6385724544525146 - }, - { - "auxiliary_loss_clip": 0.01134704, - "auxiliary_loss_mlp": 0.01038079, - "balance_loss_clip": 1.05116391, - "balance_loss_mlp": 1.02403021, - "epoch": 0.4593416503832857, - "flos": 22414686491520.0, - "grad_norm": 1.704541952239469, - "language_loss": 0.68183744, - "learning_rate": 2.358498705700346e-06, - "loss": 0.7035653, - "num_input_tokens_seen": 163891450, - "step": 7640, - "time_per_iteration": 2.6786441802978516 - }, - { - "auxiliary_loss_clip": 0.01104644, - "auxiliary_loss_mlp": 0.01040873, - "balance_loss_clip": 1.04305553, - "balance_loss_mlp": 1.02640736, - "epoch": 0.4594017736359537, - "flos": 18880215183360.0, - "grad_norm": 1.6440653073556697, - "language_loss": 0.75610799, - "learning_rate": 2.3581155450723958e-06, - "loss": 0.77756315, - "num_input_tokens_seen": 163909345, - "step": 7641, - "time_per_iteration": 2.6967337131500244 - }, - { - "auxiliary_loss_clip": 0.01107468, - "auxiliary_loss_mlp": 0.0103519, - "balance_loss_clip": 1.04473758, - "balance_loss_mlp": 1.01987791, - "epoch": 0.45946189688862166, - "flos": 20518567349760.0, - "grad_norm": 1.7366807351650166, - "language_loss": 0.7477932, - "learning_rate": 2.357732370864668e-06, - "loss": 0.76921976, - "num_input_tokens_seen": 163926940, - "step": 7642, - "time_per_iteration": 2.7593836784362793 - }, - { - "auxiliary_loss_clip": 0.01033439, - "auxiliary_loss_mlp": 0.01015123, - "balance_loss_clip": 1.02063584, - "balance_loss_mlp": 1.01360917, - "epoch": 0.4595220201412896, - "flos": 61405990162560.0, - "grad_norm": 0.8870453562304583, - "language_loss": 0.58169055, - "learning_rate": 2.357349183091694e-06, - "loss": 0.60217613, - "num_input_tokens_seen": 163977785, - "step": 7643, - "time_per_iteration": 3.008721351623535 - }, - { - "auxiliary_loss_clip": 0.01126407, - "auxiliary_loss_mlp": 0.01039184, - "balance_loss_clip": 1.04902744, - "balance_loss_mlp": 1.02468801, - "epoch": 0.4595821433939576, - "flos": 23330947547520.0, - "grad_norm": 1.6727361984558426, - "language_loss": 0.92977291, - "learning_rate": 2.3569659817680016e-06, - "loss": 0.95142883, - "num_input_tokens_seen": 163996630, - "step": 7644, - "time_per_iteration": 2.6844348907470703 - }, - { - "auxiliary_loss_clip": 0.01118806, - "auxiliary_loss_mlp": 0.0103695, - "balance_loss_clip": 1.04879534, - "balance_loss_mlp": 1.02278805, - "epoch": 0.4596422666466256, - "flos": 14282356711680.0, - "grad_norm": 2.49930104784668, - "language_loss": 0.82485175, - "learning_rate": 2.3565827669081243e-06, - "loss": 0.84640932, - "num_input_tokens_seen": 164013190, - "step": 7645, - "time_per_iteration": 2.649367332458496 - }, - { - "auxiliary_loss_clip": 0.01010103, - "auxiliary_loss_mlp": 0.00999811, - "balance_loss_clip": 1.01816797, - "balance_loss_mlp": 0.99795145, - "epoch": 0.4597023898992936, - "flos": 65727337737600.0, - "grad_norm": 0.7581805782249401, - "language_loss": 0.59857589, - "learning_rate": 2.356199538526593e-06, - "loss": 0.61867499, - "num_input_tokens_seen": 164074030, - "step": 7646, - "time_per_iteration": 3.211512327194214 - }, - { - "auxiliary_loss_clip": 0.01116258, - "auxiliary_loss_mlp": 0.01035245, - "balance_loss_clip": 1.04631102, - "balance_loss_mlp": 1.02006984, - "epoch": 0.45976251315196154, - "flos": 26907075653760.0, - "grad_norm": 1.794903772385352, - "language_loss": 0.72503293, - "learning_rate": 2.355816296637939e-06, - "loss": 0.74654794, - "num_input_tokens_seen": 164095515, - "step": 7647, - "time_per_iteration": 2.792795419692993 - }, - { - "auxiliary_loss_clip": 0.01096575, - "auxiliary_loss_mlp": 0.01041791, - "balance_loss_clip": 1.04206514, - "balance_loss_mlp": 1.02684855, - "epoch": 0.4598226364046295, - "flos": 26618066824320.0, - "grad_norm": 1.7350588372730733, - "language_loss": 0.66805142, - "learning_rate": 2.3554330412566957e-06, - "loss": 0.68943512, - "num_input_tokens_seen": 164117270, - "step": 7648, - "time_per_iteration": 2.798882484436035 - }, - { - "auxiliary_loss_clip": 0.01120443, - "auxiliary_loss_mlp": 0.01037713, - "balance_loss_clip": 1.04601169, - "balance_loss_mlp": 1.0234313, - "epoch": 0.45988275965729747, - "flos": 24387762522240.0, - "grad_norm": 1.4487791655991338, - "language_loss": 0.78854847, - "learning_rate": 2.3550497723973953e-06, - "loss": 0.81013, - "num_input_tokens_seen": 164137850, - "step": 7649, - "time_per_iteration": 2.710026979446411 - }, - { - "auxiliary_loss_clip": 0.01071387, - "auxiliary_loss_mlp": 0.01039161, - "balance_loss_clip": 1.0469979, - "balance_loss_mlp": 1.02459955, - "epoch": 0.45994288290996543, - "flos": 24535822383360.0, - "grad_norm": 1.68877556398497, - "language_loss": 0.69140404, - "learning_rate": 2.3546664900745726e-06, - "loss": 0.71250951, - "num_input_tokens_seen": 164157960, - "step": 7650, - "time_per_iteration": 2.862882375717163 - }, - { - "auxiliary_loss_clip": 0.01128714, - "auxiliary_loss_mlp": 0.01042169, - "balance_loss_clip": 1.05184257, - "balance_loss_mlp": 1.02592099, - "epoch": 0.4600030061626334, - "flos": 14830245838080.0, - "grad_norm": 2.8986833449878686, - "language_loss": 0.844868, - "learning_rate": 2.354283194302761e-06, - "loss": 0.86657685, - "num_input_tokens_seen": 164174590, - "step": 7651, - "time_per_iteration": 2.624094247817993 - }, - { - "auxiliary_loss_clip": 0.01108337, - "auxiliary_loss_mlp": 0.00771732, - "balance_loss_clip": 1.04726708, - "balance_loss_mlp": 1.00045896, - "epoch": 0.46006312941530136, - "flos": 18113845582080.0, - "grad_norm": 1.8740934460638858, - "language_loss": 0.75375748, - "learning_rate": 2.3538998850964948e-06, - "loss": 0.77255821, - "num_input_tokens_seen": 164192935, - "step": 7652, - "time_per_iteration": 2.7064099311828613 - }, - { - "auxiliary_loss_clip": 0.01083449, - "auxiliary_loss_mlp": 0.01033562, - "balance_loss_clip": 1.04353166, - "balance_loss_mlp": 1.019364, - "epoch": 0.46012325266796933, - "flos": 21976468565760.0, - "grad_norm": 1.6780448716001595, - "language_loss": 0.75990206, - "learning_rate": 2.3535165624703097e-06, - "loss": 0.78107214, - "num_input_tokens_seen": 164213160, - "step": 7653, - "time_per_iteration": 2.840228319168091 - }, - { - "auxiliary_loss_clip": 0.01090017, - "auxiliary_loss_mlp": 0.01037352, - "balance_loss_clip": 1.04773235, - "balance_loss_mlp": 1.02063906, - "epoch": 0.4601833759206373, - "flos": 15268068714240.0, - "grad_norm": 4.060223218919271, - "language_loss": 0.65658432, - "learning_rate": 2.353133226438741e-06, - "loss": 0.67785805, - "num_input_tokens_seen": 164229330, - "step": 7654, - "time_per_iteration": 2.8097331523895264 - }, - { - "auxiliary_loss_clip": 0.0110323, - "auxiliary_loss_mlp": 0.01038674, - "balance_loss_clip": 1.04187179, - "balance_loss_mlp": 1.02436912, - "epoch": 0.46024349917330526, - "flos": 27088999061760.0, - "grad_norm": 1.8761760458574834, - "language_loss": 0.79274917, - "learning_rate": 2.3527498770163248e-06, - "loss": 0.81416821, - "num_input_tokens_seen": 164248240, - "step": 7655, - "time_per_iteration": 2.758086681365967 - }, - { - "auxiliary_loss_clip": 0.01090903, - "auxiliary_loss_mlp": 0.01032546, - "balance_loss_clip": 1.0439781, - "balance_loss_mlp": 1.01801491, - "epoch": 0.4603036224259732, - "flos": 24462923731200.0, - "grad_norm": 1.6240518023721515, - "language_loss": 0.68172526, - "learning_rate": 2.3523665142175985e-06, - "loss": 0.70295978, - "num_input_tokens_seen": 164268020, - "step": 7656, - "time_per_iteration": 2.740079402923584 - }, - { - "auxiliary_loss_clip": 0.01107571, - "auxiliary_loss_mlp": 0.01034222, - "balance_loss_clip": 1.04353023, - "balance_loss_mlp": 1.02023935, - "epoch": 0.4603637456786412, - "flos": 28109292883200.0, - "grad_norm": 2.01428243239582, - "language_loss": 0.80944681, - "learning_rate": 2.351983138057098e-06, - "loss": 0.83086479, - "num_input_tokens_seen": 164287305, - "step": 7657, - "time_per_iteration": 5.946510314941406 - }, - { - "auxiliary_loss_clip": 0.01130018, - "auxiliary_loss_mlp": 0.00771647, - "balance_loss_clip": 1.04671657, - "balance_loss_mlp": 1.00056028, - "epoch": 0.4604238689313092, - "flos": 24348942898560.0, - "grad_norm": 2.997035997447325, - "language_loss": 0.70678955, - "learning_rate": 2.3515997485493623e-06, - "loss": 0.72580624, - "num_input_tokens_seen": 164306835, - "step": 7658, - "time_per_iteration": 2.710728883743286 - }, - { - "auxiliary_loss_clip": 0.01037878, - "auxiliary_loss_mlp": 0.01003053, - "balance_loss_clip": 1.01928806, - "balance_loss_mlp": 1.00126505, - "epoch": 0.4604839921839772, - "flos": 53606229431040.0, - "grad_norm": 0.9879963677197028, - "language_loss": 0.62104321, - "learning_rate": 2.351216345708928e-06, - "loss": 0.64145255, - "num_input_tokens_seen": 164367095, - "step": 7659, - "time_per_iteration": 4.733903646469116 - }, - { - "auxiliary_loss_clip": 0.01079557, - "auxiliary_loss_mlp": 0.01042331, - "balance_loss_clip": 1.04242504, - "balance_loss_mlp": 1.02548122, - "epoch": 0.46054411543664514, - "flos": 31248424126080.0, - "grad_norm": 1.6833434349921483, - "language_loss": 0.68750244, - "learning_rate": 2.350832929550336e-06, - "loss": 0.70872128, - "num_input_tokens_seen": 164388895, - "step": 7660, - "time_per_iteration": 2.8501877784729004 - }, - { - "auxiliary_loss_clip": 0.01115644, - "auxiliary_loss_mlp": 0.01039595, - "balance_loss_clip": 1.04312992, - "balance_loss_mlp": 1.02450275, - "epoch": 0.4606042386893131, - "flos": 24092863862400.0, - "grad_norm": 4.508470627980692, - "language_loss": 0.77059424, - "learning_rate": 2.3504495000881227e-06, - "loss": 0.79214668, - "num_input_tokens_seen": 164409080, - "step": 7661, - "time_per_iteration": 4.375652313232422 - }, - { - "auxiliary_loss_clip": 0.01111668, - "auxiliary_loss_mlp": 0.01045702, - "balance_loss_clip": 1.04530478, - "balance_loss_mlp": 1.02989531, - "epoch": 0.46066436194198107, - "flos": 26578457101440.0, - "grad_norm": 1.8557827945777399, - "language_loss": 0.75165689, - "learning_rate": 2.3500660573368305e-06, - "loss": 0.77323061, - "num_input_tokens_seen": 164427585, - "step": 7662, - "time_per_iteration": 2.654381513595581 - }, - { - "auxiliary_loss_clip": 0.01104085, - "auxiliary_loss_mlp": 0.01041771, - "balance_loss_clip": 1.0422461, - "balance_loss_mlp": 1.02585697, - "epoch": 0.46072448519464904, - "flos": 17775602184960.0, - "grad_norm": 3.5055114571256922, - "language_loss": 0.79886508, - "learning_rate": 2.349682601310998e-06, - "loss": 0.82032371, - "num_input_tokens_seen": 164438455, - "step": 7663, - "time_per_iteration": 2.6240744590759277 - }, - { - "auxiliary_loss_clip": 0.0111588, - "auxiliary_loss_mlp": 0.01034844, - "balance_loss_clip": 1.04562616, - "balance_loss_mlp": 1.02098536, - "epoch": 0.460784608447317, - "flos": 15086109392640.0, - "grad_norm": 2.0015713101361565, - "language_loss": 0.73791528, - "learning_rate": 2.3492991320251653e-06, - "loss": 0.75942254, - "num_input_tokens_seen": 164456830, - "step": 7664, - "time_per_iteration": 2.673335075378418 - }, - { - "auxiliary_loss_clip": 0.01096445, - "auxiliary_loss_mlp": 0.01036863, - "balance_loss_clip": 1.04571927, - "balance_loss_mlp": 1.02313614, - "epoch": 0.46084473169998497, - "flos": 18588261438720.0, - "grad_norm": 1.5274295482700302, - "language_loss": 0.7257731, - "learning_rate": 2.3489156494938753e-06, - "loss": 0.74710619, - "num_input_tokens_seen": 164475375, - "step": 7665, - "time_per_iteration": 2.7057924270629883 - }, - { - "auxiliary_loss_clip": 0.01104187, - "auxiliary_loss_mlp": 0.01034968, - "balance_loss_clip": 1.04968786, - "balance_loss_mlp": 1.02148521, - "epoch": 0.46090485495265293, - "flos": 19494789909120.0, - "grad_norm": 1.7665019302136358, - "language_loss": 0.78369665, - "learning_rate": 2.348532153731669e-06, - "loss": 0.80508822, - "num_input_tokens_seen": 164492040, - "step": 7666, - "time_per_iteration": 2.6954169273376465 - }, - { - "auxiliary_loss_clip": 0.0108371, - "auxiliary_loss_mlp": 0.01035058, - "balance_loss_clip": 1.04061627, - "balance_loss_mlp": 1.01935792, - "epoch": 0.4609649782053209, - "flos": 33364927163520.0, - "grad_norm": 1.7291426769142197, - "language_loss": 0.74374932, - "learning_rate": 2.348148644753088e-06, - "loss": 0.76493704, - "num_input_tokens_seen": 164513665, - "step": 7667, - "time_per_iteration": 2.781087636947632 - }, - { - "auxiliary_loss_clip": 0.01083108, - "auxiliary_loss_mlp": 0.01038011, - "balance_loss_clip": 1.04470205, - "balance_loss_mlp": 1.02440965, - "epoch": 0.46102510145798886, - "flos": 23769165473280.0, - "grad_norm": 1.4213815945133983, - "language_loss": 0.75993818, - "learning_rate": 2.347765122572676e-06, - "loss": 0.78114939, - "num_input_tokens_seen": 164533890, - "step": 7668, - "time_per_iteration": 2.8653104305267334 - }, - { - "auxiliary_loss_clip": 0.010726, - "auxiliary_loss_mlp": 0.01033857, - "balance_loss_clip": 1.04025698, - "balance_loss_mlp": 1.02047563, - "epoch": 0.4610852247106568, - "flos": 23294821443840.0, - "grad_norm": 1.7696248586775516, - "language_loss": 0.78228277, - "learning_rate": 2.347381587204975e-06, - "loss": 0.80334735, - "num_input_tokens_seen": 164553815, - "step": 7669, - "time_per_iteration": 2.783662796020508 - }, - { - "auxiliary_loss_clip": 0.01110483, - "auxiliary_loss_mlp": 0.01038047, - "balance_loss_clip": 1.04095972, - "balance_loss_mlp": 1.02259183, - "epoch": 0.4611453479633248, - "flos": 25447450584960.0, - "grad_norm": 1.7322551840105593, - "language_loss": 0.82352221, - "learning_rate": 2.34699803866453e-06, - "loss": 0.84500754, - "num_input_tokens_seen": 164573125, - "step": 7670, - "time_per_iteration": 2.6722826957702637 - }, - { - "auxiliary_loss_clip": 0.01118191, - "auxiliary_loss_mlp": 0.01034929, - "balance_loss_clip": 1.04624724, - "balance_loss_mlp": 1.02086234, - "epoch": 0.4612054712159928, - "flos": 21139606523520.0, - "grad_norm": 1.6399167633004121, - "language_loss": 0.63361788, - "learning_rate": 2.3466144769658845e-06, - "loss": 0.6551491, - "num_input_tokens_seen": 164592575, - "step": 7671, - "time_per_iteration": 2.6507785320281982 - }, - { - "auxiliary_loss_clip": 0.01038838, - "auxiliary_loss_mlp": 0.01005964, - "balance_loss_clip": 1.02976012, - "balance_loss_mlp": 1.0044564, - "epoch": 0.4612655944686608, - "flos": 69959266404480.0, - "grad_norm": 0.6926647500019024, - "language_loss": 0.55842638, - "learning_rate": 2.346230902123583e-06, - "loss": 0.57887447, - "num_input_tokens_seen": 164659795, - "step": 7672, - "time_per_iteration": 3.330268144607544 - }, - { - "auxiliary_loss_clip": 0.01119098, - "auxiliary_loss_mlp": 0.01040288, - "balance_loss_clip": 1.04617, - "balance_loss_mlp": 1.02645397, - "epoch": 0.46132571772132874, - "flos": 16837149502080.0, - "grad_norm": 1.8809200572873195, - "language_loss": 0.70954943, - "learning_rate": 2.3458473141521715e-06, - "loss": 0.7311433, - "num_input_tokens_seen": 164678735, - "step": 7673, - "time_per_iteration": 2.65659499168396 - }, - { - "auxiliary_loss_clip": 0.01103001, - "auxiliary_loss_mlp": 0.01033294, - "balance_loss_clip": 1.04363799, - "balance_loss_mlp": 1.01938248, - "epoch": 0.4613858409739967, - "flos": 35808935431680.0, - "grad_norm": 1.9110713796675685, - "language_loss": 0.70837104, - "learning_rate": 2.345463713066195e-06, - "loss": 0.72973394, - "num_input_tokens_seen": 164700885, - "step": 7674, - "time_per_iteration": 2.8332366943359375 - }, - { - "auxiliary_loss_clip": 0.01103023, - "auxiliary_loss_mlp": 0.0104104, - "balance_loss_clip": 1.04143381, - "balance_loss_mlp": 1.02709818, - "epoch": 0.4614459642266647, - "flos": 35266756567680.0, - "grad_norm": 1.6933433527162, - "language_loss": 0.65489, - "learning_rate": 2.3450800988801996e-06, - "loss": 0.67633063, - "num_input_tokens_seen": 164726960, - "step": 7675, - "time_per_iteration": 2.8454952239990234 - }, - { - "auxiliary_loss_clip": 0.01047065, - "auxiliary_loss_mlp": 0.01003099, - "balance_loss_clip": 1.02009785, - "balance_loss_mlp": 1.00131118, - "epoch": 0.46150608747933264, - "flos": 66704610044160.0, - "grad_norm": 0.8598142136337862, - "language_loss": 0.58659744, - "learning_rate": 2.3446964716087327e-06, - "loss": 0.60709906, - "num_input_tokens_seen": 164788525, - "step": 7676, - "time_per_iteration": 3.1523091793060303 - }, - { - "auxiliary_loss_clip": 0.0101473, - "auxiliary_loss_mlp": 0.01002448, - "balance_loss_clip": 1.01614749, - "balance_loss_mlp": 1.00077868, - "epoch": 0.4615662107320006, - "flos": 55830177025920.0, - "grad_norm": 0.7931279707742926, - "language_loss": 0.62803817, - "learning_rate": 2.344312831266341e-06, - "loss": 0.64820993, - "num_input_tokens_seen": 164843525, - "step": 7677, - "time_per_iteration": 3.1055288314819336 - }, - { - "auxiliary_loss_clip": 0.01103004, - "auxiliary_loss_mlp": 0.01036602, - "balance_loss_clip": 1.04363084, - "balance_loss_mlp": 1.02309012, - "epoch": 0.46162633398466857, - "flos": 15483245137920.0, - "grad_norm": 2.4819209870900636, - "language_loss": 0.76371491, - "learning_rate": 2.3439291778675718e-06, - "loss": 0.78511101, - "num_input_tokens_seen": 164859895, - "step": 7678, - "time_per_iteration": 2.6796817779541016 - }, - { - "auxiliary_loss_clip": 0.01131922, - "auxiliary_loss_mlp": 0.01035943, - "balance_loss_clip": 1.04888463, - "balance_loss_mlp": 1.02157795, - "epoch": 0.46168645723733653, - "flos": 20011437181440.0, - "grad_norm": 2.4568506909255974, - "language_loss": 0.66881382, - "learning_rate": 2.343545511426974e-06, - "loss": 0.69049251, - "num_input_tokens_seen": 164878030, - "step": 7679, - "time_per_iteration": 2.669527053833008 - }, - { - "auxiliary_loss_clip": 0.01095986, - "auxiliary_loss_mlp": 0.01037988, - "balance_loss_clip": 1.04533219, - "balance_loss_mlp": 1.02469063, - "epoch": 0.4617465804900045, - "flos": 20298542590080.0, - "grad_norm": 2.335341416202827, - "language_loss": 0.70432782, - "learning_rate": 2.3431618319590963e-06, - "loss": 0.7256676, - "num_input_tokens_seen": 164895710, - "step": 7680, - "time_per_iteration": 2.7286808490753174 - }, - { - "auxiliary_loss_clip": 0.01137583, - "auxiliary_loss_mlp": 0.01043671, - "balance_loss_clip": 1.05160725, - "balance_loss_mlp": 1.02904963, - "epoch": 0.46180670374267246, - "flos": 22346312952960.0, - "grad_norm": 1.9037139750308347, - "language_loss": 0.63464803, - "learning_rate": 2.342778139478487e-06, - "loss": 0.65646052, - "num_input_tokens_seen": 164913365, - "step": 7681, - "time_per_iteration": 2.6214568614959717 - }, - { - "auxiliary_loss_clip": 0.01116453, - "auxiliary_loss_mlp": 0.01029466, - "balance_loss_clip": 1.04633749, - "balance_loss_mlp": 1.01636481, - "epoch": 0.46186682699534043, - "flos": 19895696582400.0, - "grad_norm": 1.5164971745129476, - "language_loss": 0.67357612, - "learning_rate": 2.342394433999697e-06, - "loss": 0.69503522, - "num_input_tokens_seen": 164931620, - "step": 7682, - "time_per_iteration": 2.647353410720825 - }, - { - "auxiliary_loss_clip": 0.01088835, - "auxiliary_loss_mlp": 0.01041013, - "balance_loss_clip": 1.04340196, - "balance_loss_mlp": 1.02619505, - "epoch": 0.4619269502480084, - "flos": 31503569408640.0, - "grad_norm": 2.227871519060849, - "language_loss": 0.73820949, - "learning_rate": 2.342010715537275e-06, - "loss": 0.75950789, - "num_input_tokens_seen": 164950905, - "step": 7683, - "time_per_iteration": 2.7580692768096924 - }, - { - "auxiliary_loss_clip": 0.01128951, - "auxiliary_loss_mlp": 0.01039533, - "balance_loss_clip": 1.04759753, - "balance_loss_mlp": 1.02627087, - "epoch": 0.46198707350067636, - "flos": 25009484054400.0, - "grad_norm": 1.7711337337418462, - "language_loss": 0.76479292, - "learning_rate": 2.3416269841057726e-06, - "loss": 0.7864778, - "num_input_tokens_seen": 164970950, - "step": 7684, - "time_per_iteration": 2.6827478408813477 - }, - { - "auxiliary_loss_clip": 0.01136661, - "auxiliary_loss_mlp": 0.01044253, - "balance_loss_clip": 1.0495609, - "balance_loss_mlp": 1.02969098, - "epoch": 0.4620471967533444, - "flos": 18292357198080.0, - "grad_norm": 1.8114594945271643, - "language_loss": 0.79657519, - "learning_rate": 2.3412432397197412e-06, - "loss": 0.81838435, - "num_input_tokens_seen": 164989855, - "step": 7685, - "time_per_iteration": 2.6539084911346436 - }, - { - "auxiliary_loss_clip": 0.01085193, - "auxiliary_loss_mlp": 0.01046975, - "balance_loss_clip": 1.04328656, - "balance_loss_mlp": 1.03158486, - "epoch": 0.46210732000601235, - "flos": 33985104410880.0, - "grad_norm": 2.276305365525513, - "language_loss": 0.66791403, - "learning_rate": 2.340859482393731e-06, - "loss": 0.68923569, - "num_input_tokens_seen": 165012290, - "step": 7686, - "time_per_iteration": 2.8229949474334717 - }, - { - "auxiliary_loss_clip": 0.01106797, - "auxiliary_loss_mlp": 0.00772257, - "balance_loss_clip": 1.04507184, - "balance_loss_mlp": 1.00066257, - "epoch": 0.4621674432586803, - "flos": 25009412227200.0, - "grad_norm": 2.1846142929829693, - "language_loss": 0.73938292, - "learning_rate": 2.340475712142296e-06, - "loss": 0.75817347, - "num_input_tokens_seen": 165030810, - "step": 7687, - "time_per_iteration": 2.8577284812927246 - }, - { - "auxiliary_loss_clip": 0.01066455, - "auxiliary_loss_mlp": 0.01038717, - "balance_loss_clip": 1.0470593, - "balance_loss_mlp": 1.02399492, - "epoch": 0.4622275665113483, - "flos": 22014031213440.0, - "grad_norm": 2.1409043019128253, - "language_loss": 0.74955392, - "learning_rate": 2.3400919289799873e-06, - "loss": 0.77060568, - "num_input_tokens_seen": 165050205, - "step": 7688, - "time_per_iteration": 2.8981478214263916 - }, - { - "auxiliary_loss_clip": 0.01076735, - "auxiliary_loss_mlp": 0.00771909, - "balance_loss_clip": 1.03838563, - "balance_loss_mlp": 1.0005393, - "epoch": 0.46228768976401624, - "flos": 24058820747520.0, - "grad_norm": 1.6416992765701228, - "language_loss": 0.78753114, - "learning_rate": 2.3397081329213585e-06, - "loss": 0.80601752, - "num_input_tokens_seen": 165069370, - "step": 7689, - "time_per_iteration": 2.8450090885162354 - }, - { - "auxiliary_loss_clip": 0.01117226, - "auxiliary_loss_mlp": 0.01039789, - "balance_loss_clip": 1.04319644, - "balance_loss_mlp": 1.02512646, - "epoch": 0.4623478130166842, - "flos": 26651391667200.0, - "grad_norm": 2.047300589730092, - "language_loss": 0.56996405, - "learning_rate": 2.339324323980964e-06, - "loss": 0.5915342, - "num_input_tokens_seen": 165089610, - "step": 7690, - "time_per_iteration": 2.6919097900390625 - }, - { - "auxiliary_loss_clip": 0.0111777, - "auxiliary_loss_mlp": 0.01042754, - "balance_loss_clip": 1.04474783, - "balance_loss_mlp": 1.02853799, - "epoch": 0.46240793626935217, - "flos": 20558428467840.0, - "grad_norm": 2.950419828824325, - "language_loss": 0.82586032, - "learning_rate": 2.3389405021733562e-06, - "loss": 0.84746557, - "num_input_tokens_seen": 165109050, - "step": 7691, - "time_per_iteration": 2.695331573486328 - }, - { - "auxiliary_loss_clip": 0.01108828, - "auxiliary_loss_mlp": 0.01034489, - "balance_loss_clip": 1.04660177, - "balance_loss_mlp": 1.02088761, - "epoch": 0.46246805952202014, - "flos": 22456055980800.0, - "grad_norm": 1.4872733065963748, - "language_loss": 0.75199407, - "learning_rate": 2.338556667513091e-06, - "loss": 0.77342725, - "num_input_tokens_seen": 165130130, - "step": 7692, - "time_per_iteration": 2.6822991371154785 - }, - { - "auxiliary_loss_clip": 0.01097579, - "auxiliary_loss_mlp": 0.01044516, - "balance_loss_clip": 1.04742086, - "balance_loss_mlp": 1.0297097, - "epoch": 0.4625281827746881, - "flos": 35041308854400.0, - "grad_norm": 1.6276482481397991, - "language_loss": 0.74345845, - "learning_rate": 2.338172820014723e-06, - "loss": 0.76487935, - "num_input_tokens_seen": 165152685, - "step": 7693, - "time_per_iteration": 2.8581414222717285 - }, - { - "auxiliary_loss_clip": 0.01087933, - "auxiliary_loss_mlp": 0.01056162, - "balance_loss_clip": 1.04530871, - "balance_loss_mlp": 1.04086781, - "epoch": 0.46258830602735607, - "flos": 21068647205760.0, - "grad_norm": 2.088066659615079, - "language_loss": 0.85329688, - "learning_rate": 2.337788959692808e-06, - "loss": 0.8747378, - "num_input_tokens_seen": 165173315, - "step": 7694, - "time_per_iteration": 2.730196237564087 - }, - { - "auxiliary_loss_clip": 0.01111115, - "auxiliary_loss_mlp": 0.01042848, - "balance_loss_clip": 1.04707479, - "balance_loss_mlp": 1.02936506, - "epoch": 0.46264842928002403, - "flos": 26177227205760.0, - "grad_norm": 2.853578946778756, - "language_loss": 0.79611814, - "learning_rate": 2.337405086561902e-06, - "loss": 0.81765783, - "num_input_tokens_seen": 165192395, - "step": 7695, - "time_per_iteration": 2.7454562187194824 - }, - { - "auxiliary_loss_clip": 0.01114811, - "auxiliary_loss_mlp": 0.01037414, - "balance_loss_clip": 1.04553604, - "balance_loss_mlp": 1.02390218, - "epoch": 0.462708552532692, - "flos": 16764214936320.0, - "grad_norm": 1.803891217274167, - "language_loss": 0.72445035, - "learning_rate": 2.3370212006365606e-06, - "loss": 0.74597263, - "num_input_tokens_seen": 165211355, - "step": 7696, - "time_per_iteration": 4.214217901229858 - }, - { - "auxiliary_loss_clip": 0.01110882, - "auxiliary_loss_mlp": 0.01046867, - "balance_loss_clip": 1.04748213, - "balance_loss_mlp": 1.03221607, - "epoch": 0.46276867578535996, - "flos": 15560453422080.0, - "grad_norm": 1.5710514609338178, - "language_loss": 0.69939005, - "learning_rate": 2.3366373019313423e-06, - "loss": 0.72096753, - "num_input_tokens_seen": 165229380, - "step": 7697, - "time_per_iteration": 4.213683843612671 - }, - { - "auxiliary_loss_clip": 0.01133171, - "auxiliary_loss_mlp": 0.01036334, - "balance_loss_clip": 1.05145979, - "balance_loss_mlp": 1.02264249, - "epoch": 0.462828799038028, - "flos": 22415404763520.0, - "grad_norm": 1.9243080556164578, - "language_loss": 0.84559363, - "learning_rate": 2.3362533904608025e-06, - "loss": 0.86728865, - "num_input_tokens_seen": 165247200, - "step": 7698, - "time_per_iteration": 2.6434006690979004 - }, - { - "auxiliary_loss_clip": 0.01130166, - "auxiliary_loss_mlp": 0.01037324, - "balance_loss_clip": 1.04838073, - "balance_loss_mlp": 1.02357352, - "epoch": 0.46288892229069595, - "flos": 21069580959360.0, - "grad_norm": 8.31912219741259, - "language_loss": 0.71345413, - "learning_rate": 2.335869466239502e-06, - "loss": 0.73512906, - "num_input_tokens_seen": 165265825, - "step": 7699, - "time_per_iteration": 4.157729387283325 - }, - { - "auxiliary_loss_clip": 0.01073609, - "auxiliary_loss_mlp": 0.01040377, - "balance_loss_clip": 1.04345739, - "balance_loss_mlp": 1.02550519, - "epoch": 0.4629490455433639, - "flos": 23185688947200.0, - "grad_norm": 1.732328117704307, - "language_loss": 0.71911675, - "learning_rate": 2.335485529281996e-06, - "loss": 0.74025667, - "num_input_tokens_seen": 165284380, - "step": 7700, - "time_per_iteration": 2.8432295322418213 - }, - { - "auxiliary_loss_clip": 0.01128125, - "auxiliary_loss_mlp": 0.00771852, - "balance_loss_clip": 1.04640698, - "balance_loss_mlp": 1.00047588, - "epoch": 0.4630091687960319, - "flos": 18835541642880.0, - "grad_norm": 2.4184025660528863, - "language_loss": 0.73149109, - "learning_rate": 2.3351015796028467e-06, - "loss": 0.7504909, - "num_input_tokens_seen": 165300320, - "step": 7701, - "time_per_iteration": 4.2371203899383545 - }, - { - "auxiliary_loss_clip": 0.01087014, - "auxiliary_loss_mlp": 0.01044166, - "balance_loss_clip": 1.04401398, - "balance_loss_mlp": 1.02921128, - "epoch": 0.46306929204869984, - "flos": 38907020407680.0, - "grad_norm": 2.4372676297457216, - "language_loss": 0.65005761, - "learning_rate": 2.3347176172166114e-06, - "loss": 0.67136943, - "num_input_tokens_seen": 165318130, - "step": 7702, - "time_per_iteration": 2.875633716583252 - }, - { - "auxiliary_loss_clip": 0.01103467, - "auxiliary_loss_mlp": 0.01032726, - "balance_loss_clip": 1.04441071, - "balance_loss_mlp": 1.01875424, - "epoch": 0.4631294153013678, - "flos": 19644178573440.0, - "grad_norm": 1.9024039666922008, - "language_loss": 0.73310453, - "learning_rate": 2.33433364213785e-06, - "loss": 0.75446641, - "num_input_tokens_seen": 165336225, - "step": 7703, - "time_per_iteration": 2.7307324409484863 - }, - { - "auxiliary_loss_clip": 0.01109216, - "auxiliary_loss_mlp": 0.0103683, - "balance_loss_clip": 1.04673266, - "balance_loss_mlp": 1.02145839, - "epoch": 0.4631895385540358, - "flos": 24608254158720.0, - "grad_norm": 1.9428423147374236, - "language_loss": 0.68751299, - "learning_rate": 2.3339496543811243e-06, - "loss": 0.70897353, - "num_input_tokens_seen": 165355005, - "step": 7704, - "time_per_iteration": 2.7113852500915527 - }, - { - "auxiliary_loss_clip": 0.01120314, - "auxiliary_loss_mlp": 0.01033991, - "balance_loss_clip": 1.04720986, - "balance_loss_mlp": 1.01935196, - "epoch": 0.46324966180670374, - "flos": 26320115508480.0, - "grad_norm": 2.3420396256779443, - "language_loss": 0.81331742, - "learning_rate": 2.3335656539609934e-06, - "loss": 0.83486044, - "num_input_tokens_seen": 165374910, - "step": 7705, - "time_per_iteration": 2.804708480834961 - }, - { - "auxiliary_loss_clip": 0.01119161, - "auxiliary_loss_mlp": 0.01035806, - "balance_loss_clip": 1.04762256, - "balance_loss_mlp": 1.02172124, - "epoch": 0.4633097850593717, - "flos": 19240506552960.0, - "grad_norm": 1.6909152504462979, - "language_loss": 0.77714217, - "learning_rate": 2.3331816408920196e-06, - "loss": 0.79869187, - "num_input_tokens_seen": 165392590, - "step": 7706, - "time_per_iteration": 2.67990779876709 - }, - { - "auxiliary_loss_clip": 0.01102016, - "auxiliary_loss_mlp": 0.01033802, - "balance_loss_clip": 1.04767776, - "balance_loss_mlp": 1.02023578, - "epoch": 0.46336990831203967, - "flos": 22783166161920.0, - "grad_norm": 2.039386256395222, - "language_loss": 0.699494, - "learning_rate": 2.3327976151887654e-06, - "loss": 0.7208522, - "num_input_tokens_seen": 165411195, - "step": 7707, - "time_per_iteration": 2.7109720706939697 - }, - { - "auxiliary_loss_clip": 0.01111011, - "auxiliary_loss_mlp": 0.01038647, - "balance_loss_clip": 1.04469609, - "balance_loss_mlp": 1.02306628, - "epoch": 0.46343003156470763, - "flos": 38210604543360.0, - "grad_norm": 1.931472234163978, - "language_loss": 0.61287057, - "learning_rate": 2.332413576865791e-06, - "loss": 0.63436711, - "num_input_tokens_seen": 165430150, - "step": 7708, - "time_per_iteration": 2.8489346504211426 - }, - { - "auxiliary_loss_clip": 0.01089075, - "auxiliary_loss_mlp": 0.01033464, - "balance_loss_clip": 1.04273093, - "balance_loss_mlp": 1.01930773, - "epoch": 0.4634901548173756, - "flos": 31938555110400.0, - "grad_norm": 2.4081522593734332, - "language_loss": 0.77443427, - "learning_rate": 2.3320295259376614e-06, - "loss": 0.79565972, - "num_input_tokens_seen": 165450595, - "step": 7709, - "time_per_iteration": 2.720604419708252 - }, - { - "auxiliary_loss_clip": 0.01134634, - "auxiliary_loss_mlp": 0.0103959, - "balance_loss_clip": 1.04938257, - "balance_loss_mlp": 1.02433753, - "epoch": 0.46355027807004356, - "flos": 20082540153600.0, - "grad_norm": 1.78810829524809, - "language_loss": 0.77216917, - "learning_rate": 2.3316454624189385e-06, - "loss": 0.79391134, - "num_input_tokens_seen": 165469515, - "step": 7710, - "time_per_iteration": 2.5303022861480713 - }, - { - "auxiliary_loss_clip": 0.01122514, - "auxiliary_loss_mlp": 0.01037619, - "balance_loss_clip": 1.04637122, - "balance_loss_mlp": 1.02172804, - "epoch": 0.4636104013227116, - "flos": 24061370613120.0, - "grad_norm": 2.2400017320201187, - "language_loss": 0.73509276, - "learning_rate": 2.3312613863241865e-06, - "loss": 0.75669408, - "num_input_tokens_seen": 165488125, - "step": 7711, - "time_per_iteration": 2.5654797554016113 - }, - { - "auxiliary_loss_clip": 0.0110546, - "auxiliary_loss_mlp": 0.01046309, - "balance_loss_clip": 1.04776788, - "balance_loss_mlp": 1.03109789, - "epoch": 0.46367052457537955, - "flos": 23914639555200.0, - "grad_norm": 1.4625168937424313, - "language_loss": 0.71734262, - "learning_rate": 2.33087729766797e-06, - "loss": 0.73886031, - "num_input_tokens_seen": 165509225, - "step": 7712, - "time_per_iteration": 2.6021108627319336 - }, - { - "auxiliary_loss_clip": 0.01109448, - "auxiliary_loss_mlp": 0.01039959, - "balance_loss_clip": 1.04681897, - "balance_loss_mlp": 1.02359128, - "epoch": 0.4637306478280475, - "flos": 26396533693440.0, - "grad_norm": 10.680731903132253, - "language_loss": 0.73100054, - "learning_rate": 2.3304931964648524e-06, - "loss": 0.75249463, - "num_input_tokens_seen": 165529945, - "step": 7713, - "time_per_iteration": 2.7074029445648193 - }, - { - "auxiliary_loss_clip": 0.01098034, - "auxiliary_loss_mlp": 0.01037925, - "balance_loss_clip": 1.0441041, - "balance_loss_mlp": 1.02191556, - "epoch": 0.4637907710807155, - "flos": 21980706370560.0, - "grad_norm": 1.6982870192648571, - "language_loss": 0.5889293, - "learning_rate": 2.3301090827294e-06, - "loss": 0.61028892, - "num_input_tokens_seen": 165550690, - "step": 7714, - "time_per_iteration": 2.710048198699951 - }, - { - "auxiliary_loss_clip": 0.01120282, - "auxiliary_loss_mlp": 0.01034073, - "balance_loss_clip": 1.04763293, - "balance_loss_mlp": 1.01950562, - "epoch": 0.46385089433338345, - "flos": 12422291846400.0, - "grad_norm": 1.91274815186046, - "language_loss": 0.70204347, - "learning_rate": 2.3297249564761784e-06, - "loss": 0.72358704, - "num_input_tokens_seen": 165567775, - "step": 7715, - "time_per_iteration": 2.6403465270996094 - }, - { - "auxiliary_loss_clip": 0.01138235, - "auxiliary_loss_mlp": 0.01041941, - "balance_loss_clip": 1.04938495, - "balance_loss_mlp": 1.02725387, - "epoch": 0.4639110175860514, - "flos": 23915752876800.0, - "grad_norm": 2.6000471859571777, - "language_loss": 0.68646967, - "learning_rate": 2.3293408177197527e-06, - "loss": 0.7082715, - "num_input_tokens_seen": 165587010, - "step": 7716, - "time_per_iteration": 2.6233439445495605 - }, - { - "auxiliary_loss_clip": 0.01132713, - "auxiliary_loss_mlp": 0.01031179, - "balance_loss_clip": 1.0472188, - "balance_loss_mlp": 1.01599193, - "epoch": 0.4639711408387194, - "flos": 25300396304640.0, - "grad_norm": 1.7614766285874086, - "language_loss": 0.809901, - "learning_rate": 2.328956666474691e-06, - "loss": 0.83153987, - "num_input_tokens_seen": 165607850, - "step": 7717, - "time_per_iteration": 2.6267318725585938 - }, - { - "auxiliary_loss_clip": 0.01131786, - "auxiliary_loss_mlp": 0.01036738, - "balance_loss_clip": 1.0477078, - "balance_loss_mlp": 1.02206373, - "epoch": 0.46403126409138734, - "flos": 21211822817280.0, - "grad_norm": 1.7513215449973674, - "language_loss": 0.73192513, - "learning_rate": 2.3285725027555593e-06, - "loss": 0.75361037, - "num_input_tokens_seen": 165627175, - "step": 7718, - "time_per_iteration": 2.5936009883880615 - }, - { - "auxiliary_loss_clip": 0.01129362, - "auxiliary_loss_mlp": 0.00772229, - "balance_loss_clip": 1.04671347, - "balance_loss_mlp": 1.00063276, - "epoch": 0.4640913873440553, - "flos": 35845564325760.0, - "grad_norm": 1.6991265809872926, - "language_loss": 0.70156294, - "learning_rate": 2.3281883265769254e-06, - "loss": 0.72057891, - "num_input_tokens_seen": 165648340, - "step": 7719, - "time_per_iteration": 2.7047362327575684 - }, - { - "auxiliary_loss_clip": 0.01112084, - "auxiliary_loss_mlp": 0.01036441, - "balance_loss_clip": 1.05082273, - "balance_loss_mlp": 1.02101541, - "epoch": 0.46415151059672327, - "flos": 19166207270400.0, - "grad_norm": 2.142564905802957, - "language_loss": 0.86823177, - "learning_rate": 2.327804137953357e-06, - "loss": 0.88971704, - "num_input_tokens_seen": 165667195, - "step": 7720, - "time_per_iteration": 2.7309963703155518 - }, - { - "auxiliary_loss_clip": 0.01032352, - "auxiliary_loss_mlp": 0.01008212, - "balance_loss_clip": 1.02414155, - "balance_loss_mlp": 1.00647151, - "epoch": 0.46421163384939124, - "flos": 58912750304640.0, - "grad_norm": 0.7188509278747012, - "language_loss": 0.55039424, - "learning_rate": 2.3274199368994226e-06, - "loss": 0.57079989, - "num_input_tokens_seen": 165726760, - "step": 7721, - "time_per_iteration": 3.236877679824829 - }, - { - "auxiliary_loss_clip": 0.01107525, - "auxiliary_loss_mlp": 0.01036882, - "balance_loss_clip": 1.04643178, - "balance_loss_mlp": 1.02240443, - "epoch": 0.4642717571020592, - "flos": 20157342226560.0, - "grad_norm": 2.140310045449241, - "language_loss": 0.79792923, - "learning_rate": 2.3270357234296918e-06, - "loss": 0.81937331, - "num_input_tokens_seen": 165745005, - "step": 7722, - "time_per_iteration": 2.660754919052124 - }, - { - "auxiliary_loss_clip": 0.01135285, - "auxiliary_loss_mlp": 0.01039973, - "balance_loss_clip": 1.04771972, - "balance_loss_mlp": 1.02478552, - "epoch": 0.46433188035472717, - "flos": 25046184775680.0, - "grad_norm": 1.8420199747356898, - "language_loss": 0.77947485, - "learning_rate": 2.3266514975587332e-06, - "loss": 0.80122739, - "num_input_tokens_seen": 165765750, - "step": 7723, - "time_per_iteration": 2.650667667388916 - }, - { - "auxiliary_loss_clip": 0.010296, - "auxiliary_loss_mlp": 0.01034411, - "balance_loss_clip": 1.03560913, - "balance_loss_mlp": 1.01945066, - "epoch": 0.4643920036073952, - "flos": 28075644817920.0, - "grad_norm": 1.6775959652720056, - "language_loss": 0.68506896, - "learning_rate": 2.326267259301118e-06, - "loss": 0.7057091, - "num_input_tokens_seen": 165787515, - "step": 7724, - "time_per_iteration": 3.0586209297180176 - }, - { - "auxiliary_loss_clip": 0.01115779, - "auxiliary_loss_mlp": 0.01034262, - "balance_loss_clip": 1.04832113, - "balance_loss_mlp": 1.0193367, - "epoch": 0.46445212686006315, - "flos": 18369350000640.0, - "grad_norm": 3.606583728635542, - "language_loss": 0.67163348, - "learning_rate": 2.325883008671415e-06, - "loss": 0.69313383, - "num_input_tokens_seen": 165806675, - "step": 7725, - "time_per_iteration": 2.9137332439422607 - }, - { - "auxiliary_loss_clip": 0.01113984, - "auxiliary_loss_mlp": 0.01038381, - "balance_loss_clip": 1.04604602, - "balance_loss_mlp": 1.02554178, - "epoch": 0.4645122501127311, - "flos": 31721618920320.0, - "grad_norm": 1.751091551827286, - "language_loss": 0.65037453, - "learning_rate": 2.3254987456841955e-06, - "loss": 0.67189825, - "num_input_tokens_seen": 165829835, - "step": 7726, - "time_per_iteration": 2.7184534072875977 - }, - { - "auxiliary_loss_clip": 0.0110497, - "auxiliary_loss_mlp": 0.00772968, - "balance_loss_clip": 1.04436016, - "balance_loss_mlp": 1.00061822, - "epoch": 0.4645723733653991, - "flos": 23768806337280.0, - "grad_norm": 1.6559858063545494, - "language_loss": 0.74796247, - "learning_rate": 2.3251144703540307e-06, - "loss": 0.76674187, - "num_input_tokens_seen": 165849380, - "step": 7727, - "time_per_iteration": 2.7193634510040283 - }, - { - "auxiliary_loss_clip": 0.01107461, - "auxiliary_loss_mlp": 0.0104049, - "balance_loss_clip": 1.0458529, - "balance_loss_mlp": 1.02506471, - "epoch": 0.46463249661806705, - "flos": 33145512935040.0, - "grad_norm": 2.1928121253358293, - "language_loss": 0.78549933, - "learning_rate": 2.3247301826954936e-06, - "loss": 0.80697882, - "num_input_tokens_seen": 165868620, - "step": 7728, - "time_per_iteration": 2.744900703430176 - }, - { - "auxiliary_loss_clip": 0.01092904, - "auxiliary_loss_mlp": 0.01038861, - "balance_loss_clip": 1.0414784, - "balance_loss_mlp": 1.02373958, - "epoch": 0.464692619870735, - "flos": 18296020385280.0, - "grad_norm": 2.0549050897499135, - "language_loss": 0.75892472, - "learning_rate": 2.324345882723155e-06, - "loss": 0.78024244, - "num_input_tokens_seen": 165885915, - "step": 7729, - "time_per_iteration": 2.7145724296569824 - }, - { - "auxiliary_loss_clip": 0.01108829, - "auxiliary_loss_mlp": 0.01047351, - "balance_loss_clip": 1.0485568, - "balance_loss_mlp": 1.03153229, - "epoch": 0.464752743123403, - "flos": 22638051216000.0, - "grad_norm": 1.8824527818993837, - "language_loss": 0.79760742, - "learning_rate": 2.323961570451588e-06, - "loss": 0.81916922, - "num_input_tokens_seen": 165905465, - "step": 7730, - "time_per_iteration": 2.7782390117645264 - }, - { - "auxiliary_loss_clip": 0.01130146, - "auxiliary_loss_mlp": 0.01037223, - "balance_loss_clip": 1.04756629, - "balance_loss_mlp": 1.02265573, - "epoch": 0.46481286637607094, - "flos": 20412128373120.0, - "grad_norm": 1.6262082138117517, - "language_loss": 0.77182668, - "learning_rate": 2.3235772458953655e-06, - "loss": 0.79350036, - "num_input_tokens_seen": 165924640, - "step": 7731, - "time_per_iteration": 2.617314577102661 - }, - { - "auxiliary_loss_clip": 0.01090917, - "auxiliary_loss_mlp": 0.01035098, - "balance_loss_clip": 1.04506755, - "balance_loss_mlp": 1.02119207, - "epoch": 0.4648729896287389, - "flos": 34275406129920.0, - "grad_norm": 1.6446435516271722, - "language_loss": 0.65999961, - "learning_rate": 2.323192909069061e-06, - "loss": 0.68125969, - "num_input_tokens_seen": 165945765, - "step": 7732, - "time_per_iteration": 2.806825876235962 - }, - { - "auxiliary_loss_clip": 0.01109545, - "auxiliary_loss_mlp": 0.0104247, - "balance_loss_clip": 1.04427695, - "balance_loss_mlp": 1.02551866, - "epoch": 0.4649331128814069, - "flos": 21321781326720.0, - "grad_norm": 2.341941786180864, - "language_loss": 0.72770941, - "learning_rate": 2.32280855998725e-06, - "loss": 0.74922955, - "num_input_tokens_seen": 165964025, - "step": 7733, - "time_per_iteration": 2.6884191036224365 - }, - { - "auxiliary_loss_clip": 0.01046209, - "auxiliary_loss_mlp": 0.01002418, - "balance_loss_clip": 1.01885557, - "balance_loss_mlp": 1.00089204, - "epoch": 0.46499323613407484, - "flos": 58308515717760.0, - "grad_norm": 1.2786299900123337, - "language_loss": 0.51944834, - "learning_rate": 2.3224241986645057e-06, - "loss": 0.53993464, - "num_input_tokens_seen": 166021950, - "step": 7734, - "time_per_iteration": 3.0932440757751465 - }, - { - "auxiliary_loss_clip": 0.01111419, - "auxiliary_loss_mlp": 0.01034362, - "balance_loss_clip": 1.05044913, - "balance_loss_mlp": 1.01990235, - "epoch": 0.4650533593867428, - "flos": 10889660384640.0, - "grad_norm": 2.1631100357564788, - "language_loss": 0.75439203, - "learning_rate": 2.3220398251154035e-06, - "loss": 0.77584982, - "num_input_tokens_seen": 166039675, - "step": 7735, - "time_per_iteration": 4.546087265014648 - }, - { - "auxiliary_loss_clip": 0.01087553, - "auxiliary_loss_mlp": 0.01045865, - "balance_loss_clip": 1.04543328, - "balance_loss_mlp": 1.0305233, - "epoch": 0.46511348263941077, - "flos": 19974592805760.0, - "grad_norm": 2.3653554564968435, - "language_loss": 0.69901764, - "learning_rate": 2.321655439354519e-06, - "loss": 0.72035182, - "num_input_tokens_seen": 166057745, - "step": 7736, - "time_per_iteration": 4.302860498428345 - }, - { - "auxiliary_loss_clip": 0.01128458, - "auxiliary_loss_mlp": 0.01036991, - "balance_loss_clip": 1.0473057, - "balance_loss_mlp": 1.0228653, - "epoch": 0.46517360589207873, - "flos": 19678401256320.0, - "grad_norm": 1.6411657567334208, - "language_loss": 0.71995008, - "learning_rate": 2.321271041396427e-06, - "loss": 0.74160457, - "num_input_tokens_seen": 166076440, - "step": 7737, - "time_per_iteration": 2.566603183746338 - }, - { - "auxiliary_loss_clip": 0.01111802, - "auxiliary_loss_mlp": 0.01040407, - "balance_loss_clip": 1.05224276, - "balance_loss_mlp": 1.02456391, - "epoch": 0.46523372914474675, - "flos": 16872665074560.0, - "grad_norm": 2.50928704064022, - "language_loss": 0.83606738, - "learning_rate": 2.3208866312557065e-06, - "loss": 0.85758948, - "num_input_tokens_seen": 166092520, - "step": 7738, - "time_per_iteration": 2.602149486541748 - }, - { - "auxiliary_loss_clip": 0.0103645, - "auxiliary_loss_mlp": 0.01000487, - "balance_loss_clip": 1.01920033, - "balance_loss_mlp": 0.99899715, - "epoch": 0.4652938523974147, - "flos": 53439138339840.0, - "grad_norm": 0.7761784242108043, - "language_loss": 0.57855058, - "learning_rate": 2.320502208946932e-06, - "loss": 0.59891999, - "num_input_tokens_seen": 166156285, - "step": 7739, - "time_per_iteration": 4.744653940200806 - }, - { - "auxiliary_loss_clip": 0.01111735, - "auxiliary_loss_mlp": 0.0104196, - "balance_loss_clip": 1.04867125, - "balance_loss_mlp": 1.02728581, - "epoch": 0.4653539756500827, - "flos": 15231296165760.0, - "grad_norm": 1.7825482177936647, - "language_loss": 0.85391408, - "learning_rate": 2.3201177744846815e-06, - "loss": 0.87545103, - "num_input_tokens_seen": 166173455, - "step": 7740, - "time_per_iteration": 4.26358962059021 - }, - { - "auxiliary_loss_clip": 0.01103788, - "auxiliary_loss_mlp": 0.01043392, - "balance_loss_clip": 1.04354095, - "balance_loss_mlp": 1.02769184, - "epoch": 0.46541409890275065, - "flos": 23732249270400.0, - "grad_norm": 1.728452967927443, - "language_loss": 0.75540549, - "learning_rate": 2.3197333278835327e-06, - "loss": 0.77687728, - "num_input_tokens_seen": 166194370, - "step": 7741, - "time_per_iteration": 2.7189860343933105 - }, - { - "auxiliary_loss_clip": 0.01102378, - "auxiliary_loss_mlp": 0.0103993, - "balance_loss_clip": 1.04642224, - "balance_loss_mlp": 1.02583992, - "epoch": 0.4654742221554186, - "flos": 20847329556480.0, - "grad_norm": 1.6912495786690362, - "language_loss": 0.80807334, - "learning_rate": 2.319348869158064e-06, - "loss": 0.82949644, - "num_input_tokens_seen": 166213195, - "step": 7742, - "time_per_iteration": 2.7285542488098145 - }, - { - "auxiliary_loss_clip": 0.01109172, - "auxiliary_loss_mlp": 0.01044204, - "balance_loss_clip": 1.04378545, - "balance_loss_mlp": 1.02846837, - "epoch": 0.4655343454080866, - "flos": 20704836303360.0, - "grad_norm": 2.554211916953899, - "language_loss": 0.7287879, - "learning_rate": 2.3189643983228555e-06, - "loss": 0.75032163, - "num_input_tokens_seen": 166231350, - "step": 7743, - "time_per_iteration": 2.8064794540405273 - }, - { - "auxiliary_loss_clip": 0.01097309, - "auxiliary_loss_mlp": 0.01035628, - "balance_loss_clip": 1.044186, - "balance_loss_mlp": 1.01989281, - "epoch": 0.46559446866075455, - "flos": 18989850470400.0, - "grad_norm": 1.9272268848768948, - "language_loss": 0.71113133, - "learning_rate": 2.318579915392483e-06, - "loss": 0.73246074, - "num_input_tokens_seen": 166250530, - "step": 7744, - "time_per_iteration": 2.7021846771240234 - }, - { - "auxiliary_loss_clip": 0.01081647, - "auxiliary_loss_mlp": 0.01033676, - "balance_loss_clip": 1.04821372, - "balance_loss_mlp": 1.01952028, - "epoch": 0.4656545919134225, - "flos": 34496364643200.0, - "grad_norm": 1.5788774332625253, - "language_loss": 0.84865856, - "learning_rate": 2.31819542038153e-06, - "loss": 0.86981177, - "num_input_tokens_seen": 166272545, - "step": 7745, - "time_per_iteration": 2.8962950706481934 - }, - { - "auxiliary_loss_clip": 0.01118243, - "auxiliary_loss_mlp": 0.01044667, - "balance_loss_clip": 1.04609525, - "balance_loss_mlp": 1.02958083, - "epoch": 0.4657147151660905, - "flos": 24310554238080.0, - "grad_norm": 1.3325532903447972, - "language_loss": 0.72868127, - "learning_rate": 2.317810913304574e-06, - "loss": 0.75031042, - "num_input_tokens_seen": 166292135, - "step": 7746, - "time_per_iteration": 2.654744863510132 - }, - { - "auxiliary_loss_clip": 0.01115957, - "auxiliary_loss_mlp": 0.01039896, - "balance_loss_clip": 1.04620576, - "balance_loss_mlp": 1.02557254, - "epoch": 0.46577483841875844, - "flos": 58795139220480.0, - "grad_norm": 2.5149225133479667, - "language_loss": 0.69942105, - "learning_rate": 2.3174263941761963e-06, - "loss": 0.72097951, - "num_input_tokens_seen": 166316710, - "step": 7747, - "time_per_iteration": 2.946551561355591 - }, - { - "auxiliary_loss_clip": 0.01087715, - "auxiliary_loss_mlp": 0.01043482, - "balance_loss_clip": 1.04082656, - "balance_loss_mlp": 1.0269475, - "epoch": 0.4658349616714264, - "flos": 31321969223040.0, - "grad_norm": 1.543824419854341, - "language_loss": 0.67369974, - "learning_rate": 2.317041863010978e-06, - "loss": 0.69501168, - "num_input_tokens_seen": 166338535, - "step": 7748, - "time_per_iteration": 2.7577450275421143 - }, - { - "auxiliary_loss_clip": 0.01095867, - "auxiliary_loss_mlp": 0.01040613, - "balance_loss_clip": 1.04655099, - "balance_loss_mlp": 1.0242455, - "epoch": 0.46589508492409437, - "flos": 14860338456960.0, - "grad_norm": 2.2493825617355805, - "language_loss": 0.6400212, - "learning_rate": 2.3166573198235007e-06, - "loss": 0.66138601, - "num_input_tokens_seen": 166355540, - "step": 7749, - "time_per_iteration": 2.6768271923065186 - }, - { - "auxiliary_loss_clip": 0.01124878, - "auxiliary_loss_mlp": 0.01035356, - "balance_loss_clip": 1.04833543, - "balance_loss_mlp": 1.01912558, - "epoch": 0.46595520817676234, - "flos": 12895989431040.0, - "grad_norm": 2.0851109379556414, - "language_loss": 0.74756414, - "learning_rate": 2.3162727646283456e-06, - "loss": 0.76916647, - "num_input_tokens_seen": 166372635, - "step": 7750, - "time_per_iteration": 2.6180553436279297 - }, - { - "auxiliary_loss_clip": 0.01112353, - "auxiliary_loss_mlp": 0.01032354, - "balance_loss_clip": 1.04888475, - "balance_loss_mlp": 1.01699984, - "epoch": 0.46601533142943036, - "flos": 32854169721600.0, - "grad_norm": 2.1197385056246, - "language_loss": 0.74433059, - "learning_rate": 2.3158881974400963e-06, - "loss": 0.76577765, - "num_input_tokens_seen": 166393175, - "step": 7751, - "time_per_iteration": 2.7448816299438477 - }, - { - "auxiliary_loss_clip": 0.01105983, - "auxiliary_loss_mlp": 0.01039216, - "balance_loss_clip": 1.049245, - "balance_loss_mlp": 1.02301598, - "epoch": 0.4660754546820983, - "flos": 19967517826560.0, - "grad_norm": 2.5234072122891176, - "language_loss": 0.73595881, - "learning_rate": 2.3155036182733345e-06, - "loss": 0.75741076, - "num_input_tokens_seen": 166408630, - "step": 7752, - "time_per_iteration": 2.6944475173950195 - }, - { - "auxiliary_loss_clip": 0.01108633, - "auxiliary_loss_mlp": 0.01040109, - "balance_loss_clip": 1.04941273, - "balance_loss_mlp": 1.02493417, - "epoch": 0.4661355779347663, - "flos": 26688164215680.0, - "grad_norm": 2.044776600528041, - "language_loss": 0.69086194, - "learning_rate": 2.315119027142644e-06, - "loss": 0.7123493, - "num_input_tokens_seen": 166428170, - "step": 7753, - "time_per_iteration": 2.736854076385498 - }, - { - "auxiliary_loss_clip": 0.01099142, - "auxiliary_loss_mlp": 0.01040064, - "balance_loss_clip": 1.04148221, - "balance_loss_mlp": 1.02494824, - "epoch": 0.46619570118743425, - "flos": 20959442881920.0, - "grad_norm": 2.155464287948458, - "language_loss": 0.72724748, - "learning_rate": 2.3147344240626076e-06, - "loss": 0.74863952, - "num_input_tokens_seen": 166446705, - "step": 7754, - "time_per_iteration": 2.6782143115997314 - }, - { - "auxiliary_loss_clip": 0.01113403, - "auxiliary_loss_mlp": 0.0103567, - "balance_loss_clip": 1.04633951, - "balance_loss_mlp": 1.01993394, - "epoch": 0.4662558244401022, - "flos": 24426079355520.0, - "grad_norm": 1.424199388432646, - "language_loss": 0.78797996, - "learning_rate": 2.3143498090478114e-06, - "loss": 0.80947065, - "num_input_tokens_seen": 166466750, - "step": 7755, - "time_per_iteration": 2.8091399669647217 - }, - { - "auxiliary_loss_clip": 0.01115387, - "auxiliary_loss_mlp": 0.01030352, - "balance_loss_clip": 1.04450297, - "balance_loss_mlp": 1.01545656, - "epoch": 0.4663159476927702, - "flos": 20595452411520.0, - "grad_norm": 1.631642654170447, - "language_loss": 0.72453964, - "learning_rate": 2.3139651821128382e-06, - "loss": 0.74599707, - "num_input_tokens_seen": 166485400, - "step": 7756, - "time_per_iteration": 2.7136480808258057 - }, - { - "auxiliary_loss_clip": 0.01117973, - "auxiliary_loss_mlp": 0.01036177, - "balance_loss_clip": 1.04585207, - "balance_loss_mlp": 1.02137136, - "epoch": 0.46637607094543815, - "flos": 25661872823040.0, - "grad_norm": 2.024488409117557, - "language_loss": 0.78578007, - "learning_rate": 2.313580543272274e-06, - "loss": 0.80732161, - "num_input_tokens_seen": 166505730, - "step": 7757, - "time_per_iteration": 2.6828832626342773 - }, - { - "auxiliary_loss_clip": 0.01090573, - "auxiliary_loss_mlp": 0.01031697, - "balance_loss_clip": 1.04173446, - "balance_loss_mlp": 1.01717782, - "epoch": 0.4664361941981061, - "flos": 24273853516800.0, - "grad_norm": 2.116616009232987, - "language_loss": 0.6656999, - "learning_rate": 2.313195892540705e-06, - "loss": 0.68692255, - "num_input_tokens_seen": 166523770, - "step": 7758, - "time_per_iteration": 2.7238266468048096 - }, - { - "auxiliary_loss_clip": 0.01098442, - "auxiliary_loss_mlp": 0.01044236, - "balance_loss_clip": 1.04272914, - "balance_loss_mlp": 1.02916837, - "epoch": 0.4664963174507741, - "flos": 18405871153920.0, - "grad_norm": 1.6471741103867168, - "language_loss": 0.74542332, - "learning_rate": 2.3128112299327147e-06, - "loss": 0.76685011, - "num_input_tokens_seen": 166542935, - "step": 7759, - "time_per_iteration": 2.648406744003296 - }, - { - "auxiliary_loss_clip": 0.01110559, - "auxiliary_loss_mlp": 0.01047546, - "balance_loss_clip": 1.04692769, - "balance_loss_mlp": 1.0325253, - "epoch": 0.46655644070344204, - "flos": 22455122227200.0, - "grad_norm": 1.575011375316493, - "language_loss": 0.77734709, - "learning_rate": 2.312426555462893e-06, - "loss": 0.79892808, - "num_input_tokens_seen": 166563935, - "step": 7760, - "time_per_iteration": 2.715393543243408 - }, - { - "auxiliary_loss_clip": 0.01104604, - "auxiliary_loss_mlp": 0.01034603, - "balance_loss_clip": 1.04476929, - "balance_loss_mlp": 1.01968408, - "epoch": 0.46661656395611, - "flos": 13808407731840.0, - "grad_norm": 1.8509707336449404, - "language_loss": 0.74408627, - "learning_rate": 2.3120418691458237e-06, - "loss": 0.76547837, - "num_input_tokens_seen": 166582175, - "step": 7761, - "time_per_iteration": 2.679760217666626 - }, - { - "auxiliary_loss_clip": 0.01118037, - "auxiliary_loss_mlp": 0.01038779, - "balance_loss_clip": 1.04605913, - "balance_loss_mlp": 1.02199411, - "epoch": 0.466676687208778, - "flos": 21652159645440.0, - "grad_norm": 1.9428650174374826, - "language_loss": 0.78880894, - "learning_rate": 2.3116571709960956e-06, - "loss": 0.81037712, - "num_input_tokens_seen": 166601870, - "step": 7762, - "time_per_iteration": 2.6236844062805176 - }, - { - "auxiliary_loss_clip": 0.01032755, - "auxiliary_loss_mlp": 0.01004567, - "balance_loss_clip": 1.01497078, - "balance_loss_mlp": 1.00300527, - "epoch": 0.46673681046144594, - "flos": 68534259068160.0, - "grad_norm": 0.7915263755311791, - "language_loss": 0.59707403, - "learning_rate": 2.311272461028297e-06, - "loss": 0.61744726, - "num_input_tokens_seen": 166668960, - "step": 7763, - "time_per_iteration": 3.2309603691101074 - }, - { - "auxiliary_loss_clip": 0.01092007, - "auxiliary_loss_mlp": 0.01038011, - "balance_loss_clip": 1.04239237, - "balance_loss_mlp": 1.02181077, - "epoch": 0.46679693371411396, - "flos": 15814449469440.0, - "grad_norm": 2.1149132662524766, - "language_loss": 0.78707278, - "learning_rate": 2.3108877392570146e-06, - "loss": 0.80837297, - "num_input_tokens_seen": 166686110, - "step": 7764, - "time_per_iteration": 2.667523145675659 - }, - { - "auxiliary_loss_clip": 0.01102497, - "auxiliary_loss_mlp": 0.01038126, - "balance_loss_clip": 1.05066562, - "balance_loss_mlp": 1.02470863, - "epoch": 0.4668570569667819, - "flos": 18514572687360.0, - "grad_norm": 1.9076684434806583, - "language_loss": 0.72103167, - "learning_rate": 2.310503005696839e-06, - "loss": 0.74243796, - "num_input_tokens_seen": 166703930, - "step": 7765, - "time_per_iteration": 2.695037364959717 - }, - { - "auxiliary_loss_clip": 0.0108654, - "auxiliary_loss_mlp": 0.01041419, - "balance_loss_clip": 1.04354358, - "balance_loss_mlp": 1.02578509, - "epoch": 0.4669171802194499, - "flos": 19206643006080.0, - "grad_norm": 3.5524770939500763, - "language_loss": 0.77958077, - "learning_rate": 2.3101182603623576e-06, - "loss": 0.80086035, - "num_input_tokens_seen": 166719940, - "step": 7766, - "time_per_iteration": 2.7083003520965576 - }, - { - "auxiliary_loss_clip": 0.01111478, - "auxiliary_loss_mlp": 0.01041119, - "balance_loss_clip": 1.0413723, - "balance_loss_mlp": 1.02596176, - "epoch": 0.46697730347211786, - "flos": 12276135406080.0, - "grad_norm": 2.008926604773062, - "language_loss": 0.64852947, - "learning_rate": 2.3097335032681607e-06, - "loss": 0.67005551, - "num_input_tokens_seen": 166738285, - "step": 7767, - "time_per_iteration": 2.6344571113586426 - }, - { - "auxiliary_loss_clip": 0.01120029, - "auxiliary_loss_mlp": 0.0104422, - "balance_loss_clip": 1.04623926, - "balance_loss_mlp": 1.02955675, - "epoch": 0.4670374267247858, - "flos": 23586739274880.0, - "grad_norm": 1.9514245068590486, - "language_loss": 0.74225283, - "learning_rate": 2.3093487344288393e-06, - "loss": 0.76389533, - "num_input_tokens_seen": 166758170, - "step": 7768, - "time_per_iteration": 2.7037155628204346 - }, - { - "auxiliary_loss_clip": 0.01101883, - "auxiliary_loss_mlp": 0.01035933, - "balance_loss_clip": 1.04606605, - "balance_loss_mlp": 1.02081776, - "epoch": 0.4670975499774538, - "flos": 15991093578240.0, - "grad_norm": 1.8795722363955685, - "language_loss": 0.70699239, - "learning_rate": 2.308963953858982e-06, - "loss": 0.72837055, - "num_input_tokens_seen": 166775750, - "step": 7769, - "time_per_iteration": 2.6716794967651367 - }, - { - "auxiliary_loss_clip": 0.0112823, - "auxiliary_loss_mlp": 0.01035755, - "balance_loss_clip": 1.04401624, - "balance_loss_mlp": 1.02156949, - "epoch": 0.46715767323012175, - "flos": 15377596260480.0, - "grad_norm": 2.0624542877059158, - "language_loss": 0.81268704, - "learning_rate": 2.3085791615731803e-06, - "loss": 0.83432686, - "num_input_tokens_seen": 166791720, - "step": 7770, - "time_per_iteration": 2.5958662033081055 - }, - { - "auxiliary_loss_clip": 0.01043437, - "auxiliary_loss_mlp": 0.01001838, - "balance_loss_clip": 1.01635242, - "balance_loss_mlp": 1.00027645, - "epoch": 0.4672177964827897, - "flos": 60252217401600.0, - "grad_norm": 0.7961749107066677, - "language_loss": 0.5562135, - "learning_rate": 2.3081943575860265e-06, - "loss": 0.57666636, - "num_input_tokens_seen": 166856360, - "step": 7771, - "time_per_iteration": 3.1569736003875732 - }, - { - "auxiliary_loss_clip": 0.01114939, - "auxiliary_loss_mlp": 0.00771824, - "balance_loss_clip": 1.04351723, - "balance_loss_mlp": 1.00060511, - "epoch": 0.4672779197354577, - "flos": 27636134002560.0, - "grad_norm": 1.896331384644372, - "language_loss": 0.65528286, - "learning_rate": 2.3078095419121117e-06, - "loss": 0.67415047, - "num_input_tokens_seen": 166875925, - "step": 7772, - "time_per_iteration": 2.7263035774230957 - }, - { - "auxiliary_loss_clip": 0.01113556, - "auxiliary_loss_mlp": 0.01034989, - "balance_loss_clip": 1.04692101, - "balance_loss_mlp": 1.02061212, - "epoch": 0.46733804298812565, - "flos": 31394257344000.0, - "grad_norm": 2.0574903106475513, - "language_loss": 0.63557553, - "learning_rate": 2.3074247145660283e-06, - "loss": 0.65706098, - "num_input_tokens_seen": 166896520, - "step": 7773, - "time_per_iteration": 2.691378593444824 - }, - { - "auxiliary_loss_clip": 0.01112174, - "auxiliary_loss_mlp": 0.01040289, - "balance_loss_clip": 1.04673469, - "balance_loss_mlp": 1.02454185, - "epoch": 0.4673981662407936, - "flos": 19500607912320.0, - "grad_norm": 1.9630472969764714, - "language_loss": 0.80073929, - "learning_rate": 2.3070398755623685e-06, - "loss": 0.8222639, - "num_input_tokens_seen": 166915370, - "step": 7774, - "time_per_iteration": 2.661416530609131 - }, - { - "auxiliary_loss_clip": 0.01096265, - "auxiliary_loss_mlp": 0.01033594, - "balance_loss_clip": 1.04382384, - "balance_loss_mlp": 1.01813269, - "epoch": 0.4674582894934616, - "flos": 20521835487360.0, - "grad_norm": 1.5987951306887498, - "language_loss": 0.77369159, - "learning_rate": 2.306655024915726e-06, - "loss": 0.79499024, - "num_input_tokens_seen": 166934875, - "step": 7775, - "time_per_iteration": 4.281586647033691 - }, - { - "auxiliary_loss_clip": 0.01096609, - "auxiliary_loss_mlp": 0.01036176, - "balance_loss_clip": 1.04498506, - "balance_loss_mlp": 1.02137041, - "epoch": 0.46751841274612954, - "flos": 22090952188800.0, - "grad_norm": 1.8524613051021832, - "language_loss": 0.69526893, - "learning_rate": 2.306270162640694e-06, - "loss": 0.71659672, - "num_input_tokens_seen": 166954285, - "step": 7776, - "time_per_iteration": 4.289973497390747 - }, - { - "auxiliary_loss_clip": 0.0112105, - "auxiliary_loss_mlp": 0.0103614, - "balance_loss_clip": 1.04810274, - "balance_loss_mlp": 1.02246058, - "epoch": 0.46757853599879756, - "flos": 26980082046720.0, - "grad_norm": 1.5322212077638444, - "language_loss": 0.73980904, - "learning_rate": 2.3058852887518678e-06, - "loss": 0.76138097, - "num_input_tokens_seen": 166975975, - "step": 7777, - "time_per_iteration": 2.7370285987854004 - }, - { - "auxiliary_loss_clip": 0.01118243, - "auxiliary_loss_mlp": 0.01036883, - "balance_loss_clip": 1.045416, - "balance_loss_mlp": 1.02208281, - "epoch": 0.4676386592514655, - "flos": 24134053783680.0, - "grad_norm": 2.891298768731385, - "language_loss": 0.69314432, - "learning_rate": 2.3055004032638394e-06, - "loss": 0.71469557, - "num_input_tokens_seen": 166996140, - "step": 7778, - "time_per_iteration": 4.159350633621216 - }, - { - "auxiliary_loss_clip": 0.01119786, - "auxiliary_loss_mlp": 0.01041292, - "balance_loss_clip": 1.04801941, - "balance_loss_mlp": 1.02624786, - "epoch": 0.4676987825041335, - "flos": 25483720343040.0, - "grad_norm": 2.158752703527913, - "language_loss": 0.73216277, - "learning_rate": 2.305115506191206e-06, - "loss": 0.75377357, - "num_input_tokens_seen": 167016105, - "step": 7779, - "time_per_iteration": 2.6880576610565186 - }, - { - "auxiliary_loss_clip": 0.0108513, - "auxiliary_loss_mlp": 0.01043402, - "balance_loss_clip": 1.04270327, - "balance_loss_mlp": 1.02963924, - "epoch": 0.46775890575680146, - "flos": 21945298538880.0, - "grad_norm": 1.532169986090066, - "language_loss": 0.72447348, - "learning_rate": 2.304730597548562e-06, - "loss": 0.74575877, - "num_input_tokens_seen": 167036185, - "step": 7780, - "time_per_iteration": 4.378252267837524 - }, - { - "auxiliary_loss_clip": 0.01098995, - "auxiliary_loss_mlp": 0.01052099, - "balance_loss_clip": 1.03960943, - "balance_loss_mlp": 1.03428912, - "epoch": 0.4678190290094694, - "flos": 25228395492480.0, - "grad_norm": 1.8072634784489867, - "language_loss": 0.74489224, - "learning_rate": 2.3043456773505023e-06, - "loss": 0.7664032, - "num_input_tokens_seen": 167054515, - "step": 7781, - "time_per_iteration": 2.684298038482666 - }, - { - "auxiliary_loss_clip": 0.01121556, - "auxiliary_loss_mlp": 0.01040282, - "balance_loss_clip": 1.04655743, - "balance_loss_mlp": 1.02464151, - "epoch": 0.4678791522621374, - "flos": 32268358811520.0, - "grad_norm": 3.3303395339611486, - "language_loss": 0.62934184, - "learning_rate": 2.3039607456116252e-06, - "loss": 0.65096015, - "num_input_tokens_seen": 167077245, - "step": 7782, - "time_per_iteration": 2.801643133163452 - }, - { - "auxiliary_loss_clip": 0.01112208, - "auxiliary_loss_mlp": 0.01044015, - "balance_loss_clip": 1.04610753, - "balance_loss_mlp": 1.02925098, - "epoch": 0.46793927551480535, - "flos": 27046480337280.0, - "grad_norm": 2.527604831052906, - "language_loss": 0.63679516, - "learning_rate": 2.3035758023465254e-06, - "loss": 0.65835738, - "num_input_tokens_seen": 167097235, - "step": 7783, - "time_per_iteration": 2.779493570327759 - }, - { - "auxiliary_loss_clip": 0.01126101, - "auxiliary_loss_mlp": 0.01040434, - "balance_loss_clip": 1.04948771, - "balance_loss_mlp": 1.02393532, - "epoch": 0.4679993987674733, - "flos": 17457398576640.0, - "grad_norm": 2.4796959185267884, - "language_loss": 0.67925286, - "learning_rate": 2.303190847569801e-06, - "loss": 0.70091814, - "num_input_tokens_seen": 167113155, - "step": 7784, - "time_per_iteration": 2.640165090560913 - }, - { - "auxiliary_loss_clip": 0.01100267, - "auxiliary_loss_mlp": 0.01033313, - "balance_loss_clip": 1.04564571, - "balance_loss_mlp": 1.0193001, - "epoch": 0.4680595220201413, - "flos": 17165121609600.0, - "grad_norm": 2.0879148282250304, - "language_loss": 0.84605902, - "learning_rate": 2.3028058812960497e-06, - "loss": 0.8673948, - "num_input_tokens_seen": 167131765, - "step": 7785, - "time_per_iteration": 2.6447336673736572 - }, - { - "auxiliary_loss_clip": 0.01095846, - "auxiliary_loss_mlp": 0.01038359, - "balance_loss_clip": 1.0473485, - "balance_loss_mlp": 1.02278996, - "epoch": 0.46811964527280925, - "flos": 11327591001600.0, - "grad_norm": 1.936392485305852, - "language_loss": 0.77363992, - "learning_rate": 2.3024209035398678e-06, - "loss": 0.79498196, - "num_input_tokens_seen": 167149030, - "step": 7786, - "time_per_iteration": 2.7023332118988037 - }, - { - "auxiliary_loss_clip": 0.01116619, - "auxiliary_loss_mlp": 0.01034917, - "balance_loss_clip": 1.04685593, - "balance_loss_mlp": 1.02089214, - "epoch": 0.4681797685254772, - "flos": 24278809593600.0, - "grad_norm": 2.0886119764466686, - "language_loss": 0.74195051, - "learning_rate": 2.302035914315856e-06, - "loss": 0.76346588, - "num_input_tokens_seen": 167167375, - "step": 7787, - "time_per_iteration": 2.704002618789673 - }, - { - "auxiliary_loss_clip": 0.0110227, - "auxiliary_loss_mlp": 0.01041247, - "balance_loss_clip": 1.04562151, - "balance_loss_mlp": 1.02654815, - "epoch": 0.4682398917781452, - "flos": 31650372293760.0, - "grad_norm": 1.9198703232455803, - "language_loss": 0.65471619, - "learning_rate": 2.3016509136386116e-06, - "loss": 0.67615134, - "num_input_tokens_seen": 167188065, - "step": 7788, - "time_per_iteration": 2.767409324645996 - }, - { - "auxiliary_loss_clip": 0.01117478, - "auxiliary_loss_mlp": 0.01034939, - "balance_loss_clip": 1.0463376, - "balance_loss_mlp": 1.02198708, - "epoch": 0.46830001503081314, - "flos": 28110765340800.0, - "grad_norm": 1.576175997941932, - "language_loss": 0.63680893, - "learning_rate": 2.3012659015227343e-06, - "loss": 0.65833306, - "num_input_tokens_seen": 167209675, - "step": 7789, - "time_per_iteration": 2.686382532119751 - }, - { - "auxiliary_loss_clip": 0.01034678, - "auxiliary_loss_mlp": 0.01000229, - "balance_loss_clip": 1.01769471, - "balance_loss_mlp": 0.99867934, - "epoch": 0.4683601382834811, - "flos": 57881718316800.0, - "grad_norm": 0.6946835696901172, - "language_loss": 0.61856973, - "learning_rate": 2.300880877982825e-06, - "loss": 0.63891876, - "num_input_tokens_seen": 167273940, - "step": 7790, - "time_per_iteration": 3.2082865238189697 - }, - { - "auxiliary_loss_clip": 0.01088531, - "auxiliary_loss_mlp": 0.01040894, - "balance_loss_clip": 1.04553008, - "balance_loss_mlp": 1.02514648, - "epoch": 0.46842026153614913, - "flos": 21871933009920.0, - "grad_norm": 1.7348641955250894, - "language_loss": 0.79120016, - "learning_rate": 2.3004958430334808e-06, - "loss": 0.81249446, - "num_input_tokens_seen": 167292730, - "step": 7791, - "time_per_iteration": 2.7868592739105225 - }, - { - "auxiliary_loss_clip": 0.0112267, - "auxiliary_loss_mlp": 0.01038559, - "balance_loss_clip": 1.05027902, - "balance_loss_mlp": 1.0236336, - "epoch": 0.4684803847888171, - "flos": 24900818434560.0, - "grad_norm": 1.5319083860586857, - "language_loss": 0.7509321, - "learning_rate": 2.3001107966893052e-06, - "loss": 0.77254432, - "num_input_tokens_seen": 167313460, - "step": 7792, - "time_per_iteration": 2.6591553688049316 - }, - { - "auxiliary_loss_clip": 0.01093652, - "auxiliary_loss_mlp": 0.01040808, - "balance_loss_clip": 1.03941143, - "balance_loss_mlp": 1.02582359, - "epoch": 0.46854050804148506, - "flos": 26251670142720.0, - "grad_norm": 1.6679874379457267, - "language_loss": 0.68283308, - "learning_rate": 2.299725738964898e-06, - "loss": 0.70417762, - "num_input_tokens_seen": 167335385, - "step": 7793, - "time_per_iteration": 2.714614152908325 - }, - { - "auxiliary_loss_clip": 0.01120793, - "auxiliary_loss_mlp": 0.00770869, - "balance_loss_clip": 1.05047464, - "balance_loss_mlp": 1.00063658, - "epoch": 0.468600631294153, - "flos": 21579799697280.0, - "grad_norm": 1.5900503410544595, - "language_loss": 0.74045742, - "learning_rate": 2.2993406698748607e-06, - "loss": 0.75937402, - "num_input_tokens_seen": 167353625, - "step": 7794, - "time_per_iteration": 2.631113052368164 - }, - { - "auxiliary_loss_clip": 0.01101487, - "auxiliary_loss_mlp": 0.01040191, - "balance_loss_clip": 1.04786825, - "balance_loss_mlp": 1.02505112, - "epoch": 0.468660754546821, - "flos": 25885632597120.0, - "grad_norm": 1.7758607044197945, - "language_loss": 0.63441491, - "learning_rate": 2.2989555894337953e-06, - "loss": 0.65583163, - "num_input_tokens_seen": 167374565, - "step": 7795, - "time_per_iteration": 2.755208969116211 - }, - { - "auxiliary_loss_clip": 0.01090992, - "auxiliary_loss_mlp": 0.01033775, - "balance_loss_clip": 1.04455793, - "balance_loss_mlp": 1.01939869, - "epoch": 0.46872087779948896, - "flos": 35475001666560.0, - "grad_norm": 1.5780628651808217, - "language_loss": 0.6815629, - "learning_rate": 2.298570497656304e-06, - "loss": 0.70281053, - "num_input_tokens_seen": 167395010, - "step": 7796, - "time_per_iteration": 2.8338258266448975 - }, - { - "auxiliary_loss_clip": 0.01132709, - "auxiliary_loss_mlp": 0.00772271, - "balance_loss_clip": 1.05046582, - "balance_loss_mlp": 1.00074291, - "epoch": 0.4687810010521569, - "flos": 26396425952640.0, - "grad_norm": 3.1208322005509705, - "language_loss": 0.7061345, - "learning_rate": 2.2981853945569894e-06, - "loss": 0.72518432, - "num_input_tokens_seen": 167415285, - "step": 7797, - "time_per_iteration": 2.7184929847717285 - }, - { - "auxiliary_loss_clip": 0.01108205, - "auxiliary_loss_mlp": 0.01035831, - "balance_loss_clip": 1.04716921, - "balance_loss_mlp": 1.01992226, - "epoch": 0.4688411243048249, - "flos": 19972761212160.0, - "grad_norm": 2.050220537358762, - "language_loss": 0.67158788, - "learning_rate": 2.297800280150454e-06, - "loss": 0.69302827, - "num_input_tokens_seen": 167432405, - "step": 7798, - "time_per_iteration": 2.707491159439087 - }, - { - "auxiliary_loss_clip": 0.01033434, - "auxiliary_loss_mlp": 0.00999628, - "balance_loss_clip": 1.01507461, - "balance_loss_mlp": 0.99782771, - "epoch": 0.46890124755749285, - "flos": 63977015900160.0, - "grad_norm": 0.9512995219109956, - "language_loss": 0.64611268, - "learning_rate": 2.2974151544513033e-06, - "loss": 0.66644335, - "num_input_tokens_seen": 167499365, - "step": 7799, - "time_per_iteration": 3.3521087169647217 - }, - { - "auxiliary_loss_clip": 0.01103151, - "auxiliary_loss_mlp": 0.01029152, - "balance_loss_clip": 1.0488441, - "balance_loss_mlp": 1.01467967, - "epoch": 0.4689613708101608, - "flos": 23768985905280.0, - "grad_norm": 1.342329921678728, - "language_loss": 0.72313237, - "learning_rate": 2.2970300174741395e-06, - "loss": 0.74445534, - "num_input_tokens_seen": 167520390, - "step": 7800, - "time_per_iteration": 2.7983593940734863 - }, - { - "auxiliary_loss_clip": 0.01128952, - "auxiliary_loss_mlp": 0.01035275, - "balance_loss_clip": 1.04984462, - "balance_loss_mlp": 1.0224781, - "epoch": 0.4690214940628288, - "flos": 24788705109120.0, - "grad_norm": 1.7150056694833848, - "language_loss": 0.7285912, - "learning_rate": 2.296644869233568e-06, - "loss": 0.75023353, - "num_input_tokens_seen": 167539865, - "step": 7801, - "time_per_iteration": 2.635540008544922 - }, - { - "auxiliary_loss_clip": 0.01097741, - "auxiliary_loss_mlp": 0.010419, - "balance_loss_clip": 1.04270506, - "balance_loss_mlp": 1.02579427, - "epoch": 0.46908161731549675, - "flos": 18077324428800.0, - "grad_norm": 1.930712957606368, - "language_loss": 0.62748474, - "learning_rate": 2.2962597097441936e-06, - "loss": 0.64888108, - "num_input_tokens_seen": 167558190, - "step": 7802, - "time_per_iteration": 2.8309857845306396 - }, - { - "auxiliary_loss_clip": 0.01131707, - "auxiliary_loss_mlp": 0.01041126, - "balance_loss_clip": 1.04824543, - "balance_loss_mlp": 1.02705908, - "epoch": 0.4691417405681647, - "flos": 25703350053120.0, - "grad_norm": 2.0983906256852647, - "language_loss": 0.73465741, - "learning_rate": 2.2958745390206206e-06, - "loss": 0.75638568, - "num_input_tokens_seen": 167577685, - "step": 7803, - "time_per_iteration": 2.639453172683716 - }, - { - "auxiliary_loss_clip": 0.01105851, - "auxiliary_loss_mlp": 0.00771349, - "balance_loss_clip": 1.04883635, - "balance_loss_mlp": 1.00065053, - "epoch": 0.46920186382083273, - "flos": 17457039440640.0, - "grad_norm": 2.3177200047102486, - "language_loss": 0.77396876, - "learning_rate": 2.2954893570774558e-06, - "loss": 0.7927407, - "num_input_tokens_seen": 167596390, - "step": 7804, - "time_per_iteration": 2.6661806106567383 - }, - { - "auxiliary_loss_clip": 0.01105528, - "auxiliary_loss_mlp": 0.01031688, - "balance_loss_clip": 1.04877174, - "balance_loss_mlp": 1.01763344, - "epoch": 0.4692619870735007, - "flos": 20339445202560.0, - "grad_norm": 2.089417814933236, - "language_loss": 0.77330643, - "learning_rate": 2.295104163929305e-06, - "loss": 0.79467863, - "num_input_tokens_seen": 167614980, - "step": 7805, - "time_per_iteration": 2.6670541763305664 - }, - { - "auxiliary_loss_clip": 0.01140382, - "auxiliary_loss_mlp": 0.01050591, - "balance_loss_clip": 1.05195141, - "balance_loss_mlp": 1.03487957, - "epoch": 0.46932211032616866, - "flos": 29496558003840.0, - "grad_norm": 1.6834011453476339, - "language_loss": 0.82446682, - "learning_rate": 2.2947189595907742e-06, - "loss": 0.84637654, - "num_input_tokens_seen": 167635895, - "step": 7806, - "time_per_iteration": 2.641126871109009 - }, - { - "auxiliary_loss_clip": 0.01109262, - "auxiliary_loss_mlp": 0.01041295, - "balance_loss_clip": 1.04739761, - "balance_loss_mlp": 1.02634656, - "epoch": 0.4693822335788366, - "flos": 36211242735360.0, - "grad_norm": 1.815437092056069, - "language_loss": 0.77320337, - "learning_rate": 2.294333744076472e-06, - "loss": 0.79470897, - "num_input_tokens_seen": 167657440, - "step": 7807, - "time_per_iteration": 2.768772840499878 - }, - { - "auxiliary_loss_clip": 0.0110914, - "auxiliary_loss_mlp": 0.01038695, - "balance_loss_clip": 1.05083752, - "balance_loss_mlp": 1.02354348, - "epoch": 0.4694423568315046, - "flos": 20338978325760.0, - "grad_norm": 2.201580678066969, - "language_loss": 0.51815701, - "learning_rate": 2.2939485174010035e-06, - "loss": 0.53963536, - "num_input_tokens_seen": 167675025, - "step": 7808, - "time_per_iteration": 2.6565470695495605 - }, - { - "auxiliary_loss_clip": 0.01003405, - "auxiliary_loss_mlp": 0.01005455, - "balance_loss_clip": 1.0168457, - "balance_loss_mlp": 1.00391757, - "epoch": 0.46950248008417256, - "flos": 64326353621760.0, - "grad_norm": 0.78732179125356, - "language_loss": 0.57700193, - "learning_rate": 2.293563279578978e-06, - "loss": 0.59709048, - "num_input_tokens_seen": 167729635, - "step": 7809, - "time_per_iteration": 3.1529645919799805 - }, - { - "auxiliary_loss_clip": 0.01087624, - "auxiliary_loss_mlp": 0.01039585, - "balance_loss_clip": 1.04826307, - "balance_loss_mlp": 1.02535129, - "epoch": 0.4695626033368405, - "flos": 19200106730880.0, - "grad_norm": 2.4452536224375403, - "language_loss": 0.7153672, - "learning_rate": 2.2931780306250045e-06, - "loss": 0.73663932, - "num_input_tokens_seen": 167745135, - "step": 7810, - "time_per_iteration": 2.730975389480591 - }, - { - "auxiliary_loss_clip": 0.01122205, - "auxiliary_loss_mlp": 0.01041582, - "balance_loss_clip": 1.04927683, - "balance_loss_mlp": 1.02719331, - "epoch": 0.4696227265895085, - "flos": 23002436736000.0, - "grad_norm": 3.7864250348919284, - "language_loss": 0.81469715, - "learning_rate": 2.29279277055369e-06, - "loss": 0.83633506, - "num_input_tokens_seen": 167763875, - "step": 7811, - "time_per_iteration": 2.689089059829712 - }, - { - "auxiliary_loss_clip": 0.01117579, - "auxiliary_loss_mlp": 0.01038248, - "balance_loss_clip": 1.04989529, - "balance_loss_mlp": 1.02302504, - "epoch": 0.46968284984217645, - "flos": 21870855601920.0, - "grad_norm": 1.6520361935296233, - "language_loss": 0.8041414, - "learning_rate": 2.292407499379644e-06, - "loss": 0.82569969, - "num_input_tokens_seen": 167784895, - "step": 7812, - "time_per_iteration": 2.6615161895751953 - }, - { - "auxiliary_loss_clip": 0.01075193, - "auxiliary_loss_mlp": 0.01036276, - "balance_loss_clip": 1.04313707, - "balance_loss_mlp": 1.02170289, - "epoch": 0.4697429730948444, - "flos": 19974987855360.0, - "grad_norm": 1.6393784799199496, - "language_loss": 0.74155343, - "learning_rate": 2.292022217117477e-06, - "loss": 0.76266813, - "num_input_tokens_seen": 167803185, - "step": 7813, - "time_per_iteration": 2.7426726818084717 - }, - { - "auxiliary_loss_clip": 0.01102658, - "auxiliary_loss_mlp": 0.01036665, - "balance_loss_clip": 1.04594994, - "balance_loss_mlp": 1.02108407, - "epoch": 0.4698030963475124, - "flos": 15156206784000.0, - "grad_norm": 2.3178266219619994, - "language_loss": 0.84324849, - "learning_rate": 2.291636923781798e-06, - "loss": 0.86464167, - "num_input_tokens_seen": 167816550, - "step": 7814, - "time_per_iteration": 2.6519999504089355 - }, - { - "auxiliary_loss_clip": 0.01105673, - "auxiliary_loss_mlp": 0.01036813, - "balance_loss_clip": 1.04427862, - "balance_loss_mlp": 1.02291358, - "epoch": 0.46986321960018035, - "flos": 15151178880000.0, - "grad_norm": 1.8698068393605216, - "language_loss": 0.81723464, - "learning_rate": 2.291251619387217e-06, - "loss": 0.83865952, - "num_input_tokens_seen": 167831845, - "step": 7815, - "time_per_iteration": 5.720506906509399 - }, - { - "auxiliary_loss_clip": 0.01088353, - "auxiliary_loss_mlp": 0.01038971, - "balance_loss_clip": 1.04897821, - "balance_loss_mlp": 1.023808, - "epoch": 0.4699233428528483, - "flos": 23108911626240.0, - "grad_norm": 2.071255754681328, - "language_loss": 0.77463031, - "learning_rate": 2.2908663039483468e-06, - "loss": 0.79590356, - "num_input_tokens_seen": 167850360, - "step": 7816, - "time_per_iteration": 2.738074541091919 - }, - { - "auxiliary_loss_clip": 0.01044982, - "auxiliary_loss_mlp": 0.01001103, - "balance_loss_clip": 1.01830792, - "balance_loss_mlp": 0.99944633, - "epoch": 0.46998346610551633, - "flos": 68105558246400.0, - "grad_norm": 0.838650178196428, - "language_loss": 0.58987319, - "learning_rate": 2.290480977479796e-06, - "loss": 0.6103341, - "num_input_tokens_seen": 167908660, - "step": 7817, - "time_per_iteration": 3.1292662620544434 - }, - { - "auxiliary_loss_clip": 0.01107632, - "auxiliary_loss_mlp": 0.01034089, - "balance_loss_clip": 1.04874861, - "balance_loss_mlp": 1.02005172, - "epoch": 0.4700435893581843, - "flos": 24129456842880.0, - "grad_norm": 1.7123630681211415, - "language_loss": 0.79417968, - "learning_rate": 2.2900956399961775e-06, - "loss": 0.81559694, - "num_input_tokens_seen": 167927905, - "step": 7818, - "time_per_iteration": 5.943104028701782 - }, - { - "auxiliary_loss_clip": 0.0113212, - "auxiliary_loss_mlp": 0.01037162, - "balance_loss_clip": 1.04868269, - "balance_loss_mlp": 1.02325034, - "epoch": 0.47010371261085226, - "flos": 20150518642560.0, - "grad_norm": 1.6838154241149696, - "language_loss": 0.83469647, - "learning_rate": 2.289710291512104e-06, - "loss": 0.85638928, - "num_input_tokens_seen": 167945995, - "step": 7819, - "time_per_iteration": 2.6600770950317383 - }, - { - "auxiliary_loss_clip": 0.01101069, - "auxiliary_loss_mlp": 0.0103721, - "balance_loss_clip": 1.04507041, - "balance_loss_mlp": 1.02214193, - "epoch": 0.47016383586352023, - "flos": 15122199582720.0, - "grad_norm": 2.5448578806987974, - "language_loss": 0.7640624, - "learning_rate": 2.289324932042186e-06, - "loss": 0.78544521, - "num_input_tokens_seen": 167963380, - "step": 7820, - "time_per_iteration": 2.720524549484253 - }, - { - "auxiliary_loss_clip": 0.01114996, - "auxiliary_loss_mlp": 0.01040886, - "balance_loss_clip": 1.05066848, - "balance_loss_mlp": 1.02641368, - "epoch": 0.4702239591161882, - "flos": 13552975140480.0, - "grad_norm": 1.835793139157851, - "language_loss": 0.74591041, - "learning_rate": 2.288939561601039e-06, - "loss": 0.76746929, - "num_input_tokens_seen": 167981740, - "step": 7821, - "time_per_iteration": 2.6208953857421875 - }, - { - "auxiliary_loss_clip": 0.0112785, - "auxiliary_loss_mlp": 0.01044502, - "balance_loss_clip": 1.04762793, - "balance_loss_mlp": 1.03104329, - "epoch": 0.47028408236885616, - "flos": 24276511123200.0, - "grad_norm": 1.8086110799443134, - "language_loss": 0.89176404, - "learning_rate": 2.2885541802032746e-06, - "loss": 0.91348755, - "num_input_tokens_seen": 167999380, - "step": 7822, - "time_per_iteration": 2.641425371170044 - }, - { - "auxiliary_loss_clip": 0.01113329, - "auxiliary_loss_mlp": 0.01033656, - "balance_loss_clip": 1.04665482, - "balance_loss_mlp": 1.01981544, - "epoch": 0.4703442056215241, - "flos": 22856926740480.0, - "grad_norm": 1.7930134528553263, - "language_loss": 0.79694283, - "learning_rate": 2.2881687878635055e-06, - "loss": 0.81841266, - "num_input_tokens_seen": 168018395, - "step": 7823, - "time_per_iteration": 2.632756233215332 - }, - { - "auxiliary_loss_clip": 0.01025068, - "auxiliary_loss_mlp": 0.01003424, - "balance_loss_clip": 1.02190793, - "balance_loss_mlp": 1.00163603, - "epoch": 0.4704043288741921, - "flos": 69240227950080.0, - "grad_norm": 0.8086269167579946, - "language_loss": 0.56642514, - "learning_rate": 2.2877833845963487e-06, - "loss": 0.5867101, - "num_input_tokens_seen": 168084080, - "step": 7824, - "time_per_iteration": 3.3140807151794434 - }, - { - "auxiliary_loss_clip": 0.01104679, - "auxiliary_loss_mlp": 0.01042887, - "balance_loss_clip": 1.04395127, - "balance_loss_mlp": 1.02718711, - "epoch": 0.47046445212686006, - "flos": 18041090584320.0, - "grad_norm": 1.8843796036347318, - "language_loss": 0.81223321, - "learning_rate": 2.2873979704164157e-06, - "loss": 0.83370888, - "num_input_tokens_seen": 168101555, - "step": 7825, - "time_per_iteration": 2.700547695159912 - }, - { - "auxiliary_loss_clip": 0.01111276, - "auxiliary_loss_mlp": 0.01036611, - "balance_loss_clip": 1.0480845, - "balance_loss_mlp": 1.02218676, - "epoch": 0.470524575379528, - "flos": 23951448017280.0, - "grad_norm": 1.7729512383292405, - "language_loss": 0.66719514, - "learning_rate": 2.287012545338324e-06, - "loss": 0.68867397, - "num_input_tokens_seen": 168121530, - "step": 7826, - "time_per_iteration": 2.6998069286346436 - }, - { - "auxiliary_loss_clip": 0.01105784, - "auxiliary_loss_mlp": 0.01039915, - "balance_loss_clip": 1.04433072, - "balance_loss_mlp": 1.02479887, - "epoch": 0.470584698632196, - "flos": 18113558273280.0, - "grad_norm": 1.8432989970829954, - "language_loss": 0.84173524, - "learning_rate": 2.2866271093766877e-06, - "loss": 0.86319232, - "num_input_tokens_seen": 168140335, - "step": 7827, - "time_per_iteration": 2.692657709121704 - }, - { - "auxiliary_loss_clip": 0.01024445, - "auxiliary_loss_mlp": 0.01004787, - "balance_loss_clip": 1.01622581, - "balance_loss_mlp": 1.00303495, - "epoch": 0.47064482188486395, - "flos": 57251916224640.0, - "grad_norm": 0.8086690003326286, - "language_loss": 0.5568617, - "learning_rate": 2.286241662546122e-06, - "loss": 0.57715398, - "num_input_tokens_seen": 168200535, - "step": 7828, - "time_per_iteration": 3.184593439102173 - }, - { - "auxiliary_loss_clip": 0.01128245, - "auxiliary_loss_mlp": 0.01033804, - "balance_loss_clip": 1.04770434, - "balance_loss_mlp": 1.02036309, - "epoch": 0.4707049451375319, - "flos": 17895077798400.0, - "grad_norm": 2.799236307786822, - "language_loss": 0.80882025, - "learning_rate": 2.285856204861245e-06, - "loss": 0.8304407, - "num_input_tokens_seen": 168219610, - "step": 7829, - "time_per_iteration": 2.5789284706115723 - }, - { - "auxiliary_loss_clip": 0.01128236, - "auxiliary_loss_mlp": 0.01036042, - "balance_loss_clip": 1.04866183, - "balance_loss_mlp": 1.02311337, - "epoch": 0.47076506839019994, - "flos": 25232669210880.0, - "grad_norm": 1.589084017915349, - "language_loss": 0.76252091, - "learning_rate": 2.2854707363366703e-06, - "loss": 0.78416359, - "num_input_tokens_seen": 168242505, - "step": 7830, - "time_per_iteration": 2.6604039669036865 - }, - { - "auxiliary_loss_clip": 0.01094201, - "auxiliary_loss_mlp": 0.01033866, - "balance_loss_clip": 1.04519463, - "balance_loss_mlp": 1.01907206, - "epoch": 0.4708251916428679, - "flos": 13479681438720.0, - "grad_norm": 1.9041514810278948, - "language_loss": 0.7839942, - "learning_rate": 2.2850852569870177e-06, - "loss": 0.8052749, - "num_input_tokens_seen": 168260220, - "step": 7831, - "time_per_iteration": 2.7709531784057617 - }, - { - "auxiliary_loss_clip": 0.01084793, - "auxiliary_loss_mlp": 0.01045555, - "balance_loss_clip": 1.03967106, - "balance_loss_mlp": 1.0289377, - "epoch": 0.47088531489553587, - "flos": 30147833450880.0, - "grad_norm": 3.4524245779244045, - "language_loss": 0.75518548, - "learning_rate": 2.2846997668269033e-06, - "loss": 0.7764889, - "num_input_tokens_seen": 168277360, - "step": 7832, - "time_per_iteration": 2.9078352451324463 - }, - { - "auxiliary_loss_clip": 0.01100887, - "auxiliary_loss_mlp": 0.01027155, - "balance_loss_clip": 1.04597783, - "balance_loss_mlp": 1.01476312, - "epoch": 0.47094543814820383, - "flos": 21798280172160.0, - "grad_norm": 1.3033633023675582, - "language_loss": 0.74446917, - "learning_rate": 2.2843142658709454e-06, - "loss": 0.76574957, - "num_input_tokens_seen": 168296605, - "step": 7833, - "time_per_iteration": 2.7040505409240723 - }, - { - "auxiliary_loss_clip": 0.01115931, - "auxiliary_loss_mlp": 0.01039232, - "balance_loss_clip": 1.04605532, - "balance_loss_mlp": 1.02489686, - "epoch": 0.4710055614008718, - "flos": 23003011353600.0, - "grad_norm": 1.6784231271486025, - "language_loss": 0.75652939, - "learning_rate": 2.283928754133762e-06, - "loss": 0.778081, - "num_input_tokens_seen": 168316205, - "step": 7834, - "time_per_iteration": 2.651439666748047 - }, - { - "auxiliary_loss_clip": 0.01080958, - "auxiliary_loss_mlp": 0.0104352, - "balance_loss_clip": 1.04571462, - "balance_loss_mlp": 1.02942359, - "epoch": 0.47106568465353976, - "flos": 42741346452480.0, - "grad_norm": 1.5705960877616694, - "language_loss": 0.66198736, - "learning_rate": 2.283543231629972e-06, - "loss": 0.68323219, - "num_input_tokens_seen": 168338935, - "step": 7835, - "time_per_iteration": 2.8833723068237305 - }, - { - "auxiliary_loss_clip": 0.01030822, - "auxiliary_loss_mlp": 0.0075266, - "balance_loss_clip": 1.01354921, - "balance_loss_mlp": 1.00055587, - "epoch": 0.4711258079062077, - "flos": 68554008570240.0, - "grad_norm": 0.8682696962056556, - "language_loss": 0.62114525, - "learning_rate": 2.283157698374194e-06, - "loss": 0.63898003, - "num_input_tokens_seen": 168392800, - "step": 7836, - "time_per_iteration": 3.271106243133545 - }, - { - "auxiliary_loss_clip": 0.01089899, - "auxiliary_loss_mlp": 0.00772396, - "balance_loss_clip": 1.04188919, - "balance_loss_mlp": 1.00066912, - "epoch": 0.4711859311588757, - "flos": 25446588658560.0, - "grad_norm": 2.9726849992756623, - "language_loss": 0.69634271, - "learning_rate": 2.2827721543810475e-06, - "loss": 0.71496564, - "num_input_tokens_seen": 168412940, - "step": 7837, - "time_per_iteration": 2.7227394580841064 - }, - { - "auxiliary_loss_clip": 0.01114908, - "auxiliary_loss_mlp": 0.01040024, - "balance_loss_clip": 1.04658818, - "balance_loss_mlp": 1.02449143, - "epoch": 0.47124605441154366, - "flos": 21981891519360.0, - "grad_norm": 1.834184212780789, - "language_loss": 0.66073495, - "learning_rate": 2.282386599665153e-06, - "loss": 0.68228424, - "num_input_tokens_seen": 168431995, - "step": 7838, - "time_per_iteration": 2.63415265083313 - }, - { - "auxiliary_loss_clip": 0.01101595, - "auxiliary_loss_mlp": 0.01040478, - "balance_loss_clip": 1.04245853, - "balance_loss_mlp": 1.02488542, - "epoch": 0.4713061776642116, - "flos": 25412689198080.0, - "grad_norm": 1.6613879226075605, - "language_loss": 0.77071315, - "learning_rate": 2.2820010342411304e-06, - "loss": 0.79213387, - "num_input_tokens_seen": 168454585, - "step": 7839, - "time_per_iteration": 2.702371835708618 - }, - { - "auxiliary_loss_clip": 0.01089161, - "auxiliary_loss_mlp": 0.01035056, - "balance_loss_clip": 1.04446244, - "balance_loss_mlp": 1.0215137, - "epoch": 0.4713663009168796, - "flos": 26542259170560.0, - "grad_norm": 2.064347613929302, - "language_loss": 0.72607076, - "learning_rate": 2.2816154581235993e-06, - "loss": 0.74731302, - "num_input_tokens_seen": 168471265, - "step": 7840, - "time_per_iteration": 2.7578155994415283 - }, - { - "auxiliary_loss_clip": 0.01098285, - "auxiliary_loss_mlp": 0.01033804, - "balance_loss_clip": 1.04248786, - "balance_loss_mlp": 1.01975548, - "epoch": 0.47142642416954755, - "flos": 23623583650560.0, - "grad_norm": 1.634270857219127, - "language_loss": 0.75153434, - "learning_rate": 2.2812298713271833e-06, - "loss": 0.77285522, - "num_input_tokens_seen": 168491360, - "step": 7841, - "time_per_iteration": 2.7571516036987305 - }, - { - "auxiliary_loss_clip": 0.01097356, - "auxiliary_loss_mlp": 0.01036522, - "balance_loss_clip": 1.04522789, - "balance_loss_mlp": 1.02271175, - "epoch": 0.4714865474222155, - "flos": 22310150935680.0, - "grad_norm": 1.514171980299406, - "language_loss": 0.70372689, - "learning_rate": 2.280844273866501e-06, - "loss": 0.72506565, - "num_input_tokens_seen": 168511335, - "step": 7842, - "time_per_iteration": 2.6693220138549805 - }, - { - "auxiliary_loss_clip": 0.01122506, - "auxiliary_loss_mlp": 0.01036861, - "balance_loss_clip": 1.05041289, - "balance_loss_mlp": 1.02272844, - "epoch": 0.4715466706748835, - "flos": 17822430541440.0, - "grad_norm": 2.3877412842319243, - "language_loss": 0.78754079, - "learning_rate": 2.280458665756177e-06, - "loss": 0.80913448, - "num_input_tokens_seen": 168529920, - "step": 7843, - "time_per_iteration": 2.584821939468384 - }, - { - "auxiliary_loss_clip": 0.01112783, - "auxiliary_loss_mlp": 0.01033598, - "balance_loss_clip": 1.04609227, - "balance_loss_mlp": 1.02013922, - "epoch": 0.4716067939275515, - "flos": 23659530186240.0, - "grad_norm": 1.5083750473310347, - "language_loss": 0.73945224, - "learning_rate": 2.280073047010832e-06, - "loss": 0.76091611, - "num_input_tokens_seen": 168550595, - "step": 7844, - "time_per_iteration": 2.6947662830352783 - }, - { - "auxiliary_loss_clip": 0.01103523, - "auxiliary_loss_mlp": 0.01045426, - "balance_loss_clip": 1.04754925, - "balance_loss_mlp": 1.03077483, - "epoch": 0.47166691718021947, - "flos": 17930162407680.0, - "grad_norm": 1.6596812780951513, - "language_loss": 0.7849918, - "learning_rate": 2.279687417645088e-06, - "loss": 0.8064813, - "num_input_tokens_seen": 168569765, - "step": 7845, - "time_per_iteration": 2.64786434173584 - }, - { - "auxiliary_loss_clip": 0.01116093, - "auxiliary_loss_mlp": 0.01035695, - "balance_loss_clip": 1.04657555, - "balance_loss_mlp": 1.02204597, - "epoch": 0.47172704043288743, - "flos": 26614583205120.0, - "grad_norm": 1.4795134607526772, - "language_loss": 0.73325998, - "learning_rate": 2.2793017776735703e-06, - "loss": 0.75477785, - "num_input_tokens_seen": 168591525, - "step": 7846, - "time_per_iteration": 2.6890015602111816 - }, - { - "auxiliary_loss_clip": 0.01112295, - "auxiliary_loss_mlp": 0.01033387, - "balance_loss_clip": 1.04567862, - "balance_loss_mlp": 1.02053618, - "epoch": 0.4717871636855554, - "flos": 27922700707200.0, - "grad_norm": 1.365245213481775, - "language_loss": 0.74306214, - "learning_rate": 2.2789161271109e-06, - "loss": 0.76451898, - "num_input_tokens_seen": 168611235, - "step": 7847, - "time_per_iteration": 2.664600133895874 - }, - { - "auxiliary_loss_clip": 0.01076671, - "auxiliary_loss_mlp": 0.01036211, - "balance_loss_clip": 1.04269147, - "balance_loss_mlp": 1.02244806, - "epoch": 0.47184728693822336, - "flos": 14502237816960.0, - "grad_norm": 1.614512390946798, - "language_loss": 0.80744767, - "learning_rate": 2.278530465971703e-06, - "loss": 0.82857651, - "num_input_tokens_seen": 168628710, - "step": 7848, - "time_per_iteration": 2.7662644386291504 - }, - { - "auxiliary_loss_clip": 0.01118674, - "auxiliary_loss_mlp": 0.01035868, - "balance_loss_clip": 1.04767179, - "balance_loss_mlp": 1.02170014, - "epoch": 0.47190741019089133, - "flos": 17856545483520.0, - "grad_norm": 3.381301580597114, - "language_loss": 0.70282733, - "learning_rate": 2.2781447942706032e-06, - "loss": 0.72437274, - "num_input_tokens_seen": 168645645, - "step": 7849, - "time_per_iteration": 2.628324031829834 - }, - { - "auxiliary_loss_clip": 0.01102555, - "auxiliary_loss_mlp": 0.01043039, - "balance_loss_clip": 1.04688513, - "balance_loss_mlp": 1.02679062, - "epoch": 0.4719675334435593, - "flos": 17895472848000.0, - "grad_norm": 2.2108635677358968, - "language_loss": 0.6920523, - "learning_rate": 2.277759112022224e-06, - "loss": 0.71350825, - "num_input_tokens_seen": 168664165, - "step": 7850, - "time_per_iteration": 2.678515672683716 - }, - { - "auxiliary_loss_clip": 0.01071934, - "auxiliary_loss_mlp": 0.0103323, - "balance_loss_clip": 1.04294968, - "balance_loss_mlp": 1.0192523, - "epoch": 0.47202765669622726, - "flos": 20704369426560.0, - "grad_norm": 1.8559154127156776, - "language_loss": 0.75022864, - "learning_rate": 2.2773734192411916e-06, - "loss": 0.77128029, - "num_input_tokens_seen": 168681940, - "step": 7851, - "time_per_iteration": 2.7907421588897705 - }, - { - "auxiliary_loss_clip": 0.01058717, - "auxiliary_loss_mlp": 0.0104416, - "balance_loss_clip": 1.03438354, - "balance_loss_mlp": 1.02636182, - "epoch": 0.4720877799488952, - "flos": 16360255607040.0, - "grad_norm": 1.8954666463572496, - "language_loss": 0.76087546, - "learning_rate": 2.276987715942132e-06, - "loss": 0.78190422, - "num_input_tokens_seen": 168698830, - "step": 7852, - "time_per_iteration": 2.751862049102783 - }, - { - "auxiliary_loss_clip": 0.01090696, - "auxiliary_loss_mlp": 0.01031466, - "balance_loss_clip": 1.0440855, - "balance_loss_mlp": 1.01667845, - "epoch": 0.4721479032015632, - "flos": 20668171495680.0, - "grad_norm": 1.6687991208994266, - "language_loss": 0.69092613, - "learning_rate": 2.2766020021396696e-06, - "loss": 0.71214771, - "num_input_tokens_seen": 168718305, - "step": 7853, - "time_per_iteration": 2.8860716819763184 - }, - { - "auxiliary_loss_clip": 0.01023698, - "auxiliary_loss_mlp": 0.01005171, - "balance_loss_clip": 1.03293765, - "balance_loss_mlp": 1.00360918, - "epoch": 0.47220802645423116, - "flos": 67750438435200.0, - "grad_norm": 0.7060966439190681, - "language_loss": 0.50175303, - "learning_rate": 2.276216277848432e-06, - "loss": 0.52204174, - "num_input_tokens_seen": 168782365, - "step": 7854, - "time_per_iteration": 4.915671110153198 - }, - { - "auxiliary_loss_clip": 0.0112187, - "auxiliary_loss_mlp": 0.01035341, - "balance_loss_clip": 1.04927993, - "balance_loss_mlp": 1.02046967, - "epoch": 0.4722681497068991, - "flos": 20921449271040.0, - "grad_norm": 1.8544471627611243, - "language_loss": 0.63919318, - "learning_rate": 2.2758305430830455e-06, - "loss": 0.66076523, - "num_input_tokens_seen": 168800485, - "step": 7855, - "time_per_iteration": 4.303591728210449 - }, - { - "auxiliary_loss_clip": 0.01115964, - "auxiliary_loss_mlp": 0.01039633, - "balance_loss_clip": 1.04526174, - "balance_loss_mlp": 1.02463675, - "epoch": 0.4723282729595671, - "flos": 28293083798400.0, - "grad_norm": 6.403691145457763, - "language_loss": 0.75835574, - "learning_rate": 2.2754447978581376e-06, - "loss": 0.77991176, - "num_input_tokens_seen": 168818965, - "step": 7856, - "time_per_iteration": 2.669156074523926 - }, - { - "auxiliary_loss_clip": 0.01102045, - "auxiliary_loss_mlp": 0.01036544, - "balance_loss_clip": 1.04435217, - "balance_loss_mlp": 1.02334714, - "epoch": 0.4723883962122351, - "flos": 27125053338240.0, - "grad_norm": 1.8316073665627561, - "language_loss": 0.7513321, - "learning_rate": 2.2750590421883347e-06, - "loss": 0.77271795, - "num_input_tokens_seen": 168840355, - "step": 7857, - "time_per_iteration": 5.926163673400879 - }, - { - "auxiliary_loss_clip": 0.0110506, - "auxiliary_loss_mlp": 0.01044055, - "balance_loss_clip": 1.04619288, - "balance_loss_mlp": 1.03164554, - "epoch": 0.47244851946490307, - "flos": 31537253387520.0, - "grad_norm": 1.4352718890089464, - "language_loss": 0.64871937, - "learning_rate": 2.2746732760882655e-06, - "loss": 0.67021048, - "num_input_tokens_seen": 168861765, - "step": 7858, - "time_per_iteration": 2.7516961097717285 - }, - { - "auxiliary_loss_clip": 0.01115653, - "auxiliary_loss_mlp": 0.00772171, - "balance_loss_clip": 1.04487467, - "balance_loss_mlp": 1.00070405, - "epoch": 0.47250864271757104, - "flos": 20886544229760.0, - "grad_norm": 4.333924209566871, - "language_loss": 0.70584702, - "learning_rate": 2.2742874995725575e-06, - "loss": 0.72472525, - "num_input_tokens_seen": 168881310, - "step": 7859, - "time_per_iteration": 2.63272762298584 - }, - { - "auxiliary_loss_clip": 0.01132339, - "auxiliary_loss_mlp": 0.01038437, - "balance_loss_clip": 1.0472064, - "balance_loss_mlp": 1.02420318, - "epoch": 0.472568765970239, - "flos": 20522086882560.0, - "grad_norm": 1.7578939418215658, - "language_loss": 0.62056947, - "learning_rate": 2.2739017126558413e-06, - "loss": 0.64227724, - "num_input_tokens_seen": 168899470, - "step": 7860, - "time_per_iteration": 2.579881429672241 - }, - { - "auxiliary_loss_clip": 0.01104772, - "auxiliary_loss_mlp": 0.01042498, - "balance_loss_clip": 1.04455113, - "balance_loss_mlp": 1.02835417, - "epoch": 0.47262888922290697, - "flos": 35805200417280.0, - "grad_norm": 2.5847882369160584, - "language_loss": 0.71352196, - "learning_rate": 2.2735159153527445e-06, - "loss": 0.73499465, - "num_input_tokens_seen": 168921495, - "step": 7861, - "time_per_iteration": 2.7616021633148193 - }, - { - "auxiliary_loss_clip": 0.01100093, - "auxiliary_loss_mlp": 0.01035425, - "balance_loss_clip": 1.04298115, - "balance_loss_mlp": 1.02136993, - "epoch": 0.47268901247557493, - "flos": 20667740532480.0, - "grad_norm": 1.877615917676971, - "language_loss": 0.85056359, - "learning_rate": 2.273130107677896e-06, - "loss": 0.87191874, - "num_input_tokens_seen": 168940515, - "step": 7862, - "time_per_iteration": 2.730851173400879 - }, - { - "auxiliary_loss_clip": 0.01126067, - "auxiliary_loss_mlp": 0.01032341, - "balance_loss_clip": 1.04310465, - "balance_loss_mlp": 1.01836395, - "epoch": 0.4727491357282429, - "flos": 19573291082880.0, - "grad_norm": 1.8403668162610285, - "language_loss": 0.84233111, - "learning_rate": 2.272744289645927e-06, - "loss": 0.86391521, - "num_input_tokens_seen": 168958340, - "step": 7863, - "time_per_iteration": 2.7247161865234375 - }, - { - "auxiliary_loss_clip": 0.01104075, - "auxiliary_loss_mlp": 0.01041818, - "balance_loss_clip": 1.04576826, - "balance_loss_mlp": 1.02810335, - "epoch": 0.47280925898091086, - "flos": 18217231902720.0, - "grad_norm": 2.0137135318025843, - "language_loss": 0.66243893, - "learning_rate": 2.272358461271467e-06, - "loss": 0.68389785, - "num_input_tokens_seen": 168974850, - "step": 7864, - "time_per_iteration": 2.7027535438537598 - }, - { - "auxiliary_loss_clip": 0.01126031, - "auxiliary_loss_mlp": 0.01038902, - "balance_loss_clip": 1.04373837, - "balance_loss_mlp": 1.02402425, - "epoch": 0.4728693822335788, - "flos": 17821820010240.0, - "grad_norm": 1.9458421333469222, - "language_loss": 0.64846861, - "learning_rate": 2.271972622569147e-06, - "loss": 0.67011791, - "num_input_tokens_seen": 168992860, - "step": 7865, - "time_per_iteration": 2.599947214126587 - }, - { - "auxiliary_loss_clip": 0.01095039, - "auxiliary_loss_mlp": 0.00771615, - "balance_loss_clip": 1.04065597, - "balance_loss_mlp": 1.00069022, - "epoch": 0.4729295054862468, - "flos": 20595057361920.0, - "grad_norm": 1.8988594463693396, - "language_loss": 0.73979223, - "learning_rate": 2.2715867735535976e-06, - "loss": 0.75845885, - "num_input_tokens_seen": 169010325, - "step": 7866, - "time_per_iteration": 2.6904079914093018 - }, - { - "auxiliary_loss_clip": 0.01127633, - "auxiliary_loss_mlp": 0.01036812, - "balance_loss_clip": 1.0444746, - "balance_loss_mlp": 1.02215528, - "epoch": 0.47298962873891476, - "flos": 23368079232000.0, - "grad_norm": 1.7138995799513466, - "language_loss": 0.82882631, - "learning_rate": 2.271200914239451e-06, - "loss": 0.85047078, - "num_input_tokens_seen": 169029840, - "step": 7867, - "time_per_iteration": 2.66166353225708 - }, - { - "auxiliary_loss_clip": 0.01113116, - "auxiliary_loss_mlp": 0.01035066, - "balance_loss_clip": 1.04474282, - "balance_loss_mlp": 1.02197099, - "epoch": 0.4730497519915827, - "flos": 22052240305920.0, - "grad_norm": 1.59304374398017, - "language_loss": 0.79711115, - "learning_rate": 2.2708150446413385e-06, - "loss": 0.81859303, - "num_input_tokens_seen": 169049975, - "step": 7868, - "time_per_iteration": 2.639418363571167 - }, - { - "auxiliary_loss_clip": 0.01048577, - "auxiliary_loss_mlp": 0.01036292, - "balance_loss_clip": 1.03682256, - "balance_loss_mlp": 1.02049041, - "epoch": 0.4731098752442507, - "flos": 21069724613760.0, - "grad_norm": 2.2697646545371772, - "language_loss": 0.74715841, - "learning_rate": 2.2704291647738915e-06, - "loss": 0.7680071, - "num_input_tokens_seen": 169069540, - "step": 7869, - "time_per_iteration": 2.822831153869629 - }, - { - "auxiliary_loss_clip": 0.01108509, - "auxiliary_loss_mlp": 0.01048779, - "balance_loss_clip": 1.04608214, - "balance_loss_mlp": 1.03300154, - "epoch": 0.4731699984969187, - "flos": 22528775064960.0, - "grad_norm": 2.141854382789547, - "language_loss": 0.73684996, - "learning_rate": 2.2700432746517443e-06, - "loss": 0.75842285, - "num_input_tokens_seen": 169089940, - "step": 7870, - "time_per_iteration": 2.7175748348236084 - }, - { - "auxiliary_loss_clip": 0.01133545, - "auxiliary_loss_mlp": 0.01041593, - "balance_loss_clip": 1.04755211, - "balance_loss_mlp": 1.02635777, - "epoch": 0.4732301217495867, - "flos": 24898124914560.0, - "grad_norm": 2.253339307670162, - "language_loss": 0.81085944, - "learning_rate": 2.2696573742895292e-06, - "loss": 0.83261085, - "num_input_tokens_seen": 169109650, - "step": 7871, - "time_per_iteration": 2.6193602085113525 - }, - { - "auxiliary_loss_clip": 0.01113818, - "auxiliary_loss_mlp": 0.01036061, - "balance_loss_clip": 1.04329586, - "balance_loss_mlp": 1.02133834, - "epoch": 0.47329024500225464, - "flos": 22784423137920.0, - "grad_norm": 1.5762073479047713, - "language_loss": 0.75922841, - "learning_rate": 2.269271463701879e-06, - "loss": 0.78072715, - "num_input_tokens_seen": 169128990, - "step": 7872, - "time_per_iteration": 2.6391725540161133 - }, - { - "auxiliary_loss_clip": 0.01091788, - "auxiliary_loss_mlp": 0.01038432, - "balance_loss_clip": 1.04121172, - "balance_loss_mlp": 1.02376986, - "epoch": 0.4733503682549226, - "flos": 38695902220800.0, - "grad_norm": 3.094756801604535, - "language_loss": 0.67562377, - "learning_rate": 2.268885542903428e-06, - "loss": 0.696926, - "num_input_tokens_seen": 169154645, - "step": 7873, - "time_per_iteration": 2.8466758728027344 - }, - { - "auxiliary_loss_clip": 0.01117181, - "auxiliary_loss_mlp": 0.01036678, - "balance_loss_clip": 1.04567063, - "balance_loss_mlp": 1.02267087, - "epoch": 0.47341049150759057, - "flos": 22966849336320.0, - "grad_norm": 1.6392218744116203, - "language_loss": 0.72839928, - "learning_rate": 2.26849961190881e-06, - "loss": 0.74993783, - "num_input_tokens_seen": 169174995, - "step": 7874, - "time_per_iteration": 2.721020221710205 - }, - { - "auxiliary_loss_clip": 0.01113028, - "auxiliary_loss_mlp": 0.01038664, - "balance_loss_clip": 1.04846478, - "balance_loss_mlp": 1.02471697, - "epoch": 0.47347061476025853, - "flos": 14538471661440.0, - "grad_norm": 3.032092549096925, - "language_loss": 0.65002596, - "learning_rate": 2.26811367073266e-06, - "loss": 0.67154288, - "num_input_tokens_seen": 169191815, - "step": 7875, - "time_per_iteration": 2.6652960777282715 - }, - { - "auxiliary_loss_clip": 0.01083743, - "auxiliary_loss_mlp": 0.01035273, - "balance_loss_clip": 1.04805076, - "balance_loss_mlp": 1.02059197, - "epoch": 0.4735307380129265, - "flos": 30263250827520.0, - "grad_norm": 2.768907187204124, - "language_loss": 0.8101728, - "learning_rate": 2.2677277193896125e-06, - "loss": 0.83136296, - "num_input_tokens_seen": 169210430, - "step": 7876, - "time_per_iteration": 2.7860774993896484 - }, - { - "auxiliary_loss_clip": 0.01096604, - "auxiliary_loss_mlp": 0.01049403, - "balance_loss_clip": 1.04034781, - "balance_loss_mlp": 1.03362572, - "epoch": 0.47359086126559446, - "flos": 19391044452480.0, - "grad_norm": 1.718915834241656, - "language_loss": 0.79123086, - "learning_rate": 2.267341757894304e-06, - "loss": 0.81269091, - "num_input_tokens_seen": 169229295, - "step": 7877, - "time_per_iteration": 2.6741349697113037 - }, - { - "auxiliary_loss_clip": 0.01119367, - "auxiliary_loss_mlp": 0.00771148, - "balance_loss_clip": 1.04634619, - "balance_loss_mlp": 1.00065994, - "epoch": 0.47365098451826243, - "flos": 21939408708480.0, - "grad_norm": 1.9321122257733154, - "language_loss": 0.7070595, - "learning_rate": 2.2669557862613685e-06, - "loss": 0.72596461, - "num_input_tokens_seen": 169247855, - "step": 7878, - "time_per_iteration": 2.65336012840271 - }, - { - "auxiliary_loss_clip": 0.01091201, - "auxiliary_loss_mlp": 0.01041141, - "balance_loss_clip": 1.04987168, - "balance_loss_mlp": 1.02767622, - "epoch": 0.4737111077709304, - "flos": 25845053207040.0, - "grad_norm": 1.650502341043129, - "language_loss": 0.75037253, - "learning_rate": 2.2665698045054425e-06, - "loss": 0.77169597, - "num_input_tokens_seen": 169268860, - "step": 7879, - "time_per_iteration": 2.731395721435547 - }, - { - "auxiliary_loss_clip": 0.01030587, - "auxiliary_loss_mlp": 0.01009103, - "balance_loss_clip": 1.02360272, - "balance_loss_mlp": 1.00741053, - "epoch": 0.47377123102359836, - "flos": 67760886314880.0, - "grad_norm": 0.7327852929375173, - "language_loss": 0.61306548, - "learning_rate": 2.266183812641164e-06, - "loss": 0.63346243, - "num_input_tokens_seen": 169331855, - "step": 7880, - "time_per_iteration": 3.224714756011963 - }, - { - "auxiliary_loss_clip": 0.0110857, - "auxiliary_loss_mlp": 0.01041962, - "balance_loss_clip": 1.04677773, - "balance_loss_mlp": 1.02690625, - "epoch": 0.4738313542762663, - "flos": 24315977191680.0, - "grad_norm": 1.5081125335533625, - "language_loss": 0.68397921, - "learning_rate": 2.2657978106831675e-06, - "loss": 0.70548451, - "num_input_tokens_seen": 169352175, - "step": 7881, - "time_per_iteration": 2.7536203861236572 - }, - { - "auxiliary_loss_clip": 0.01068036, - "auxiliary_loss_mlp": 0.01031577, - "balance_loss_clip": 1.04936802, - "balance_loss_mlp": 1.01798737, - "epoch": 0.4738914775289343, - "flos": 20705339093760.0, - "grad_norm": 1.7877053000392102, - "language_loss": 0.77066004, - "learning_rate": 2.265411798646092e-06, - "loss": 0.7916562, - "num_input_tokens_seen": 169371215, - "step": 7882, - "time_per_iteration": 2.873434543609619 - }, - { - "auxiliary_loss_clip": 0.01116489, - "auxiliary_loss_mlp": 0.01035892, - "balance_loss_clip": 1.04511285, - "balance_loss_mlp": 1.02132463, - "epoch": 0.4739516007816023, - "flos": 25446337263360.0, - "grad_norm": 2.3087904075212204, - "language_loss": 0.76111883, - "learning_rate": 2.2650257765445747e-06, - "loss": 0.78264266, - "num_input_tokens_seen": 169391745, - "step": 7883, - "time_per_iteration": 2.7326574325561523 - }, - { - "auxiliary_loss_clip": 0.01107432, - "auxiliary_loss_mlp": 0.01031652, - "balance_loss_clip": 1.04656231, - "balance_loss_mlp": 1.01863456, - "epoch": 0.4740117240342703, - "flos": 19974341410560.0, - "grad_norm": 1.7217647008431887, - "language_loss": 0.72281808, - "learning_rate": 2.2646397443932525e-06, - "loss": 0.74420893, - "num_input_tokens_seen": 169409845, - "step": 7884, - "time_per_iteration": 2.660172462463379 - }, - { - "auxiliary_loss_clip": 0.01123059, - "auxiliary_loss_mlp": 0.01037646, - "balance_loss_clip": 1.04745269, - "balance_loss_mlp": 1.02225614, - "epoch": 0.47407184728693824, - "flos": 15661146222720.0, - "grad_norm": 2.1356892731193557, - "language_loss": 0.82255256, - "learning_rate": 2.2642537022067655e-06, - "loss": 0.8441596, - "num_input_tokens_seen": 169426085, - "step": 7885, - "time_per_iteration": 2.6816513538360596 - }, - { - "auxiliary_loss_clip": 0.01093494, - "auxiliary_loss_mlp": 0.01050029, - "balance_loss_clip": 1.0418942, - "balance_loss_mlp": 1.0338043, - "epoch": 0.4741319705396062, - "flos": 18588800142720.0, - "grad_norm": 1.6528542083339792, - "language_loss": 0.73020607, - "learning_rate": 2.263867649999751e-06, - "loss": 0.75164127, - "num_input_tokens_seen": 169444705, - "step": 7886, - "time_per_iteration": 2.6734073162078857 - }, - { - "auxiliary_loss_clip": 0.01110604, - "auxiliary_loss_mlp": 0.01038225, - "balance_loss_clip": 1.04582644, - "balance_loss_mlp": 1.02251315, - "epoch": 0.47419209379227417, - "flos": 13261093223040.0, - "grad_norm": 2.0346146652784327, - "language_loss": 0.74043691, - "learning_rate": 2.263481587786849e-06, - "loss": 0.76192516, - "num_input_tokens_seen": 169460850, - "step": 7887, - "time_per_iteration": 2.6761467456817627 - }, - { - "auxiliary_loss_clip": 0.01118145, - "auxiliary_loss_mlp": 0.01031795, - "balance_loss_clip": 1.0474298, - "balance_loss_mlp": 1.01849771, - "epoch": 0.47425221704494214, - "flos": 20044043752320.0, - "grad_norm": 1.7788052130685665, - "language_loss": 0.77452385, - "learning_rate": 2.2630955155826993e-06, - "loss": 0.79602331, - "num_input_tokens_seen": 169478890, - "step": 7888, - "time_per_iteration": 2.6402924060821533 - }, - { - "auxiliary_loss_clip": 0.01118769, - "auxiliary_loss_mlp": 0.01034654, - "balance_loss_clip": 1.0469296, - "balance_loss_mlp": 1.02044427, - "epoch": 0.4743123402976101, - "flos": 27271892136960.0, - "grad_norm": 4.211713497556063, - "language_loss": 0.72521853, - "learning_rate": 2.2627094334019406e-06, - "loss": 0.7467528, - "num_input_tokens_seen": 169499690, - "step": 7889, - "time_per_iteration": 2.693746566772461 - }, - { - "auxiliary_loss_clip": 0.0104991, - "auxiliary_loss_mlp": 0.01005818, - "balance_loss_clip": 1.02273417, - "balance_loss_mlp": 1.00418472, - "epoch": 0.47437246355027807, - "flos": 55393970261760.0, - "grad_norm": 0.7194077429508707, - "language_loss": 0.5605737, - "learning_rate": 2.262323341259214e-06, - "loss": 0.58113098, - "num_input_tokens_seen": 169560475, - "step": 7890, - "time_per_iteration": 3.180250883102417 - }, - { - "auxiliary_loss_clip": 0.01120493, - "auxiliary_loss_mlp": 0.01032412, - "balance_loss_clip": 1.04944348, - "balance_loss_mlp": 1.01705146, - "epoch": 0.47443258680294603, - "flos": 23878477537920.0, - "grad_norm": 1.9527728253341778, - "language_loss": 0.65866226, - "learning_rate": 2.2619372391691605e-06, - "loss": 0.68019128, - "num_input_tokens_seen": 169580110, - "step": 7891, - "time_per_iteration": 2.6768221855163574 - }, - { - "auxiliary_loss_clip": 0.01135111, - "auxiliary_loss_mlp": 0.01039265, - "balance_loss_clip": 1.04865634, - "balance_loss_mlp": 1.02342188, - "epoch": 0.474492710055614, - "flos": 21977761455360.0, - "grad_norm": 2.2722368949670493, - "language_loss": 0.7100271, - "learning_rate": 2.26155112714642e-06, - "loss": 0.73177087, - "num_input_tokens_seen": 169597510, - "step": 7892, - "time_per_iteration": 2.5857720375061035 - }, - { - "auxiliary_loss_clip": 0.01021432, - "auxiliary_loss_mlp": 0.01001129, - "balance_loss_clip": 1.01879561, - "balance_loss_mlp": 0.99938869, - "epoch": 0.47455283330828196, - "flos": 62557180122240.0, - "grad_norm": 0.8083016633053688, - "language_loss": 0.5854069, - "learning_rate": 2.2611650052056355e-06, - "loss": 0.60563254, - "num_input_tokens_seen": 169660010, - "step": 7893, - "time_per_iteration": 3.298412799835205 - }, - { - "auxiliary_loss_clip": 0.01119918, - "auxiliary_loss_mlp": 0.01040659, - "balance_loss_clip": 1.04893851, - "balance_loss_mlp": 1.02661026, - "epoch": 0.47461295656094993, - "flos": 12093637380480.0, - "grad_norm": 2.1787400532077608, - "language_loss": 0.77515149, - "learning_rate": 2.2607788733614463e-06, - "loss": 0.79675728, - "num_input_tokens_seen": 169678485, - "step": 7894, - "time_per_iteration": 4.300025463104248 - }, - { - "auxiliary_loss_clip": 0.01119579, - "auxiliary_loss_mlp": 0.01038145, - "balance_loss_clip": 1.04634869, - "balance_loss_mlp": 1.02365553, - "epoch": 0.4746730798136179, - "flos": 20884568981760.0, - "grad_norm": 1.6992264056336024, - "language_loss": 0.75134289, - "learning_rate": 2.260392731628497e-06, - "loss": 0.77292013, - "num_input_tokens_seen": 169697335, - "step": 7895, - "time_per_iteration": 4.2042882442474365 - }, - { - "auxiliary_loss_clip": 0.01115221, - "auxiliary_loss_mlp": 0.01035192, - "balance_loss_clip": 1.04379582, - "balance_loss_mlp": 1.02000451, - "epoch": 0.4747332030662859, - "flos": 19974808287360.0, - "grad_norm": 2.3363867956596462, - "language_loss": 0.83016753, - "learning_rate": 2.260006580021429e-06, - "loss": 0.85167164, - "num_input_tokens_seen": 169715395, - "step": 7896, - "time_per_iteration": 2.6993515491485596 - }, - { - "auxiliary_loss_clip": 0.01115945, - "auxiliary_loss_mlp": 0.01033612, - "balance_loss_clip": 1.04578996, - "balance_loss_mlp": 1.01843619, - "epoch": 0.4747933263189539, - "flos": 16034186920320.0, - "grad_norm": 2.109517003677199, - "language_loss": 0.7557857, - "learning_rate": 2.259620418554886e-06, - "loss": 0.77728134, - "num_input_tokens_seen": 169733755, - "step": 7897, - "time_per_iteration": 4.253166198730469 - }, - { - "auxiliary_loss_clip": 0.01108787, - "auxiliary_loss_mlp": 0.01040894, - "balance_loss_clip": 1.04561198, - "balance_loss_mlp": 1.02645135, - "epoch": 0.47485344957162184, - "flos": 13955102876160.0, - "grad_norm": 2.267424442093673, - "language_loss": 0.63623869, - "learning_rate": 2.25923424724351e-06, - "loss": 0.65773547, - "num_input_tokens_seen": 169751390, - "step": 7898, - "time_per_iteration": 2.672621011734009 - }, - { - "auxiliary_loss_clip": 0.01091849, - "auxiliary_loss_mlp": 0.01057132, - "balance_loss_clip": 1.04254556, - "balance_loss_mlp": 1.03949475, - "epoch": 0.4749135728242898, - "flos": 20449080489600.0, - "grad_norm": 3.549969153580447, - "language_loss": 0.70200998, - "learning_rate": 2.258848066101946e-06, - "loss": 0.72349977, - "num_input_tokens_seen": 169769500, - "step": 7899, - "time_per_iteration": 2.6986401081085205 - }, - { - "auxiliary_loss_clip": 0.01119057, - "auxiliary_loss_mlp": 0.01040719, - "balance_loss_clip": 1.04576528, - "balance_loss_mlp": 1.02590108, - "epoch": 0.4749736960769578, - "flos": 28949961767040.0, - "grad_norm": 1.9384177803560112, - "language_loss": 0.68627715, - "learning_rate": 2.258461875144837e-06, - "loss": 0.70787489, - "num_input_tokens_seen": 169789215, - "step": 7900, - "time_per_iteration": 2.695420265197754 - }, - { - "auxiliary_loss_clip": 0.01088615, - "auxiliary_loss_mlp": 0.01048142, - "balance_loss_clip": 1.04223442, - "balance_loss_mlp": 1.0335629, - "epoch": 0.47503381932962574, - "flos": 31938770592000.0, - "grad_norm": 2.214181272016126, - "language_loss": 0.70571202, - "learning_rate": 2.2580756743868273e-06, - "loss": 0.72707957, - "num_input_tokens_seen": 169808825, - "step": 7901, - "time_per_iteration": 2.7880799770355225 - }, - { - "auxiliary_loss_clip": 0.01101024, - "auxiliary_loss_mlp": 0.01063852, - "balance_loss_clip": 1.04344749, - "balance_loss_mlp": 1.04805636, - "epoch": 0.4750939425822937, - "flos": 22127257860480.0, - "grad_norm": 1.723548754677231, - "language_loss": 0.73669708, - "learning_rate": 2.2576894638425636e-06, - "loss": 0.75834584, - "num_input_tokens_seen": 169827590, - "step": 7902, - "time_per_iteration": 2.67350172996521 - }, - { - "auxiliary_loss_clip": 0.01087876, - "auxiliary_loss_mlp": 0.01040789, - "balance_loss_clip": 1.04317856, - "balance_loss_mlp": 1.02710962, - "epoch": 0.47515406583496167, - "flos": 20850094903680.0, - "grad_norm": 1.7450056007143964, - "language_loss": 0.68050694, - "learning_rate": 2.257303243526688e-06, - "loss": 0.70179355, - "num_input_tokens_seen": 169844925, - "step": 7903, - "time_per_iteration": 2.7626256942749023 - }, - { - "auxiliary_loss_clip": 0.01104723, - "auxiliary_loss_mlp": 0.01035743, - "balance_loss_clip": 1.043818, - "balance_loss_mlp": 1.02206981, - "epoch": 0.47521418908762963, - "flos": 17524802448000.0, - "grad_norm": 1.9051075920789844, - "language_loss": 0.72356462, - "learning_rate": 2.256917013453848e-06, - "loss": 0.74496931, - "num_input_tokens_seen": 169862705, - "step": 7904, - "time_per_iteration": 2.6790597438812256 - }, - { - "auxiliary_loss_clip": 0.01065198, - "auxiliary_loss_mlp": 0.01045369, - "balance_loss_clip": 1.03584373, - "balance_loss_mlp": 1.02957416, - "epoch": 0.4752743123402976, - "flos": 20559434048640.0, - "grad_norm": 1.6154437659751681, - "language_loss": 0.86472631, - "learning_rate": 2.25653077363869e-06, - "loss": 0.88583207, - "num_input_tokens_seen": 169880155, - "step": 7905, - "time_per_iteration": 2.733799457550049 - }, - { - "auxiliary_loss_clip": 0.0110676, - "auxiliary_loss_mlp": 0.01037063, - "balance_loss_clip": 1.04021764, - "balance_loss_mlp": 1.02423561, - "epoch": 0.47533443559296557, - "flos": 26360623071360.0, - "grad_norm": 1.7729713006372103, - "language_loss": 0.82212102, - "learning_rate": 2.2561445240958583e-06, - "loss": 0.84355921, - "num_input_tokens_seen": 169901525, - "step": 7906, - "time_per_iteration": 2.6994829177856445 - }, - { - "auxiliary_loss_clip": 0.01029489, - "auxiliary_loss_mlp": 0.01023044, - "balance_loss_clip": 1.03056157, - "balance_loss_mlp": 1.02150619, - "epoch": 0.47539455884563353, - "flos": 65949660967680.0, - "grad_norm": 0.6767545541611142, - "language_loss": 0.58947372, - "learning_rate": 2.255758264840002e-06, - "loss": 0.60999906, - "num_input_tokens_seen": 169970345, - "step": 7907, - "time_per_iteration": 3.409289836883545 - }, - { - "auxiliary_loss_clip": 0.01112328, - "auxiliary_loss_mlp": 0.0103978, - "balance_loss_clip": 1.04298031, - "balance_loss_mlp": 1.02575445, - "epoch": 0.4754546820983015, - "flos": 17238128002560.0, - "grad_norm": 2.5037076646878664, - "language_loss": 0.81147426, - "learning_rate": 2.255371995885765e-06, - "loss": 0.83299541, - "num_input_tokens_seen": 169986440, - "step": 7908, - "time_per_iteration": 2.6126997470855713 - }, - { - "auxiliary_loss_clip": 0.01120375, - "auxiliary_loss_mlp": 0.01045183, - "balance_loss_clip": 1.04887652, - "balance_loss_mlp": 1.03041351, - "epoch": 0.47551480535096946, - "flos": 19825886499840.0, - "grad_norm": 1.7145689882234993, - "language_loss": 0.73805857, - "learning_rate": 2.254985717247797e-06, - "loss": 0.75971419, - "num_input_tokens_seen": 170005705, - "step": 7909, - "time_per_iteration": 2.7153172492980957 - }, - { - "auxiliary_loss_clip": 0.01098915, - "auxiliary_loss_mlp": 0.0103739, - "balance_loss_clip": 1.04232681, - "balance_loss_mlp": 1.02348399, - "epoch": 0.4755749286036375, - "flos": 22163958581760.0, - "grad_norm": 1.5099683944930966, - "language_loss": 0.75533628, - "learning_rate": 2.2545994289407457e-06, - "loss": 0.77669942, - "num_input_tokens_seen": 170023415, - "step": 7910, - "time_per_iteration": 2.7330431938171387 - }, - { - "auxiliary_loss_clip": 0.01113687, - "auxiliary_loss_mlp": 0.01030183, - "balance_loss_clip": 1.04379678, - "balance_loss_mlp": 1.01749897, - "epoch": 0.47563505185630545, - "flos": 21648280976640.0, - "grad_norm": 1.931062443356086, - "language_loss": 0.79401493, - "learning_rate": 2.2542131309792577e-06, - "loss": 0.81545365, - "num_input_tokens_seen": 170042395, - "step": 7911, - "time_per_iteration": 2.6149117946624756 - }, - { - "auxiliary_loss_clip": 0.01098041, - "auxiliary_loss_mlp": 0.00773063, - "balance_loss_clip": 1.04096794, - "balance_loss_mlp": 1.00061882, - "epoch": 0.4756951751089734, - "flos": 20628777254400.0, - "grad_norm": 2.2768804327487113, - "language_loss": 0.75414324, - "learning_rate": 2.253826823377983e-06, - "loss": 0.77285427, - "num_input_tokens_seen": 170061610, - "step": 7912, - "time_per_iteration": 2.680414915084839 - }, - { - "auxiliary_loss_clip": 0.01123715, - "auxiliary_loss_mlp": 0.01037472, - "balance_loss_clip": 1.04319668, - "balance_loss_mlp": 1.02353013, - "epoch": 0.4757552983616414, - "flos": 25848788221440.0, - "grad_norm": 1.4371041113730632, - "language_loss": 0.74065906, - "learning_rate": 2.253440506151569e-06, - "loss": 0.76227093, - "num_input_tokens_seen": 170083505, - "step": 7913, - "time_per_iteration": 2.6565608978271484 - }, - { - "auxiliary_loss_clip": 0.0110748, - "auxiliary_loss_mlp": 0.01031808, - "balance_loss_clip": 1.04591024, - "balance_loss_mlp": 1.01694882, - "epoch": 0.47581542161430934, - "flos": 18223013992320.0, - "grad_norm": 2.17158702079863, - "language_loss": 0.72123522, - "learning_rate": 2.253054179314666e-06, - "loss": 0.7426281, - "num_input_tokens_seen": 170100690, - "step": 7914, - "time_per_iteration": 2.6789934635162354 - }, - { - "auxiliary_loss_clip": 0.01103912, - "auxiliary_loss_mlp": 0.01042984, - "balance_loss_clip": 1.04652143, - "balance_loss_mlp": 1.02944756, - "epoch": 0.4758755448669773, - "flos": 21579763783680.0, - "grad_norm": 2.3786315570139345, - "language_loss": 0.64855683, - "learning_rate": 2.2526678428819227e-06, - "loss": 0.67002577, - "num_input_tokens_seen": 170119240, - "step": 7915, - "time_per_iteration": 2.65608549118042 - }, - { - "auxiliary_loss_clip": 0.01123163, - "auxiliary_loss_mlp": 0.01041838, - "balance_loss_clip": 1.04508734, - "balance_loss_mlp": 1.02774107, - "epoch": 0.47593566811964527, - "flos": 15231152511360.0, - "grad_norm": 1.7019759484121837, - "language_loss": 0.76935744, - "learning_rate": 2.2522814968679896e-06, - "loss": 0.79100746, - "num_input_tokens_seen": 170136450, - "step": 7916, - "time_per_iteration": 2.585491418838501 - }, - { - "auxiliary_loss_clip": 0.01125392, - "auxiliary_loss_mlp": 0.01036553, - "balance_loss_clip": 1.04389, - "balance_loss_mlp": 1.02302265, - "epoch": 0.47599579137231324, - "flos": 21543242630400.0, - "grad_norm": 2.0866631919048175, - "language_loss": 0.63895321, - "learning_rate": 2.2518951412875173e-06, - "loss": 0.66057259, - "num_input_tokens_seen": 170155295, - "step": 7917, - "time_per_iteration": 2.5544540882110596 - }, - { - "auxiliary_loss_clip": 0.01017258, - "auxiliary_loss_mlp": 0.01002335, - "balance_loss_clip": 1.01986837, - "balance_loss_mlp": 1.00074983, - "epoch": 0.4760559146249812, - "flos": 64554602595840.0, - "grad_norm": 0.8370962757635343, - "language_loss": 0.65689212, - "learning_rate": 2.2515087761551557e-06, - "loss": 0.67708808, - "num_input_tokens_seen": 170222325, - "step": 7918, - "time_per_iteration": 3.4263010025024414 - }, - { - "auxiliary_loss_clip": 0.01114985, - "auxiliary_loss_mlp": 0.00771917, - "balance_loss_clip": 1.04313397, - "balance_loss_mlp": 1.00057673, - "epoch": 0.47611603787764917, - "flos": 22233876405120.0, - "grad_norm": 2.4555452771674067, - "language_loss": 0.68450713, - "learning_rate": 2.2511224014855563e-06, - "loss": 0.70337617, - "num_input_tokens_seen": 170241625, - "step": 7919, - "time_per_iteration": 2.7581801414489746 - }, - { - "auxiliary_loss_clip": 0.01105197, - "auxiliary_loss_mlp": 0.01042973, - "balance_loss_clip": 1.04329574, - "balance_loss_mlp": 1.02922797, - "epoch": 0.47617616113031713, - "flos": 22780005765120.0, - "grad_norm": 1.6063666097186406, - "language_loss": 0.75389183, - "learning_rate": 2.2507360172933694e-06, - "loss": 0.77537358, - "num_input_tokens_seen": 170262470, - "step": 7920, - "time_per_iteration": 2.7888362407684326 - }, - { - "auxiliary_loss_clip": 0.01109747, - "auxiliary_loss_mlp": 0.01034602, - "balance_loss_clip": 1.04727352, - "balance_loss_mlp": 1.01956415, - "epoch": 0.4762362843829851, - "flos": 24133802388480.0, - "grad_norm": 1.5207523519625543, - "language_loss": 0.7761817, - "learning_rate": 2.2503496235932487e-06, - "loss": 0.79762518, - "num_input_tokens_seen": 170283460, - "step": 7921, - "time_per_iteration": 2.7462785243988037 - }, - { - "auxiliary_loss_clip": 0.01108901, - "auxiliary_loss_mlp": 0.01043608, - "balance_loss_clip": 1.0445503, - "balance_loss_mlp": 1.02778864, - "epoch": 0.47629640763565306, - "flos": 22452069571200.0, - "grad_norm": 3.2907516590332024, - "language_loss": 0.78146785, - "learning_rate": 2.249963220399845e-06, - "loss": 0.80299294, - "num_input_tokens_seen": 170304225, - "step": 7922, - "time_per_iteration": 2.6893417835235596 - }, - { - "auxiliary_loss_clip": 0.01094796, - "auxiliary_loss_mlp": 0.01043063, - "balance_loss_clip": 1.04391539, - "balance_loss_mlp": 1.02719617, - "epoch": 0.4763565308883211, - "flos": 11181398647680.0, - "grad_norm": 1.6628631162014398, - "language_loss": 0.7275365, - "learning_rate": 2.2495768077278104e-06, - "loss": 0.74891508, - "num_input_tokens_seen": 170322110, - "step": 7923, - "time_per_iteration": 2.732468605041504 - }, - { - "auxiliary_loss_clip": 0.01102187, - "auxiliary_loss_mlp": 0.01039061, - "balance_loss_clip": 1.04838657, - "balance_loss_mlp": 1.02511382, - "epoch": 0.47641665414098905, - "flos": 22382151747840.0, - "grad_norm": 1.679365493038583, - "language_loss": 0.82141626, - "learning_rate": 2.2491903855917992e-06, - "loss": 0.84282875, - "num_input_tokens_seen": 170340700, - "step": 7924, - "time_per_iteration": 2.7680320739746094 - }, - { - "auxiliary_loss_clip": 0.01126329, - "auxiliary_loss_mlp": 0.01038575, - "balance_loss_clip": 1.0495019, - "balance_loss_mlp": 1.02264822, - "epoch": 0.476476777393657, - "flos": 25046148862080.0, - "grad_norm": 2.2679110024074705, - "language_loss": 0.80316466, - "learning_rate": 2.2488039540064626e-06, - "loss": 0.82481372, - "num_input_tokens_seen": 170359780, - "step": 7925, - "time_per_iteration": 2.649615526199341 - }, - { - "auxiliary_loss_clip": 0.01101728, - "auxiliary_loss_mlp": 0.01041222, - "balance_loss_clip": 1.04264617, - "balance_loss_mlp": 1.02741158, - "epoch": 0.476536900646325, - "flos": 27269916888960.0, - "grad_norm": 1.5530829773494035, - "language_loss": 0.72051573, - "learning_rate": 2.2484175129864558e-06, - "loss": 0.74194521, - "num_input_tokens_seen": 170381260, - "step": 7926, - "time_per_iteration": 2.7393877506256104 - }, - { - "auxiliary_loss_clip": 0.0111858, - "auxiliary_loss_mlp": 0.01035544, - "balance_loss_clip": 1.04556048, - "balance_loss_mlp": 1.02015448, - "epoch": 0.47659702389899294, - "flos": 25301401885440.0, - "grad_norm": 1.973296217359943, - "language_loss": 0.68039131, - "learning_rate": 2.248031062546432e-06, - "loss": 0.70193255, - "num_input_tokens_seen": 170400595, - "step": 7927, - "time_per_iteration": 2.7364554405212402 - }, - { - "auxiliary_loss_clip": 0.01088729, - "auxiliary_loss_mlp": 0.01031301, - "balance_loss_clip": 1.04246449, - "balance_loss_mlp": 1.01772344, - "epoch": 0.4766571471516609, - "flos": 25992861672960.0, - "grad_norm": 1.624613635266834, - "language_loss": 0.67674315, - "learning_rate": 2.247644602701045e-06, - "loss": 0.69794345, - "num_input_tokens_seen": 170421110, - "step": 7928, - "time_per_iteration": 2.7200751304626465 - }, - { - "auxiliary_loss_clip": 0.01128959, - "auxiliary_loss_mlp": 0.0103446, - "balance_loss_clip": 1.04645658, - "balance_loss_mlp": 1.01979089, - "epoch": 0.4767172704043289, - "flos": 16032211672320.0, - "grad_norm": 2.0796504226810497, - "language_loss": 0.78678215, - "learning_rate": 2.2472581334649496e-06, - "loss": 0.80841631, - "num_input_tokens_seen": 170436700, - "step": 7929, - "time_per_iteration": 2.6817221641540527 - }, - { - "auxiliary_loss_clip": 0.01102478, - "auxiliary_loss_mlp": 0.01039975, - "balance_loss_clip": 1.04257607, - "balance_loss_mlp": 1.0262301, - "epoch": 0.47677739365699684, - "flos": 39235351651200.0, - "grad_norm": 1.8131309373477071, - "language_loss": 0.6663419, - "learning_rate": 2.2468716548528016e-06, - "loss": 0.68776643, - "num_input_tokens_seen": 170459555, - "step": 7930, - "time_per_iteration": 2.856072187423706 - }, - { - "auxiliary_loss_clip": 0.0111358, - "auxiliary_loss_mlp": 0.01036755, - "balance_loss_clip": 1.04616833, - "balance_loss_mlp": 1.02318919, - "epoch": 0.4768375169096648, - "flos": 24717781704960.0, - "grad_norm": 7.611219304969564, - "language_loss": 0.7973817, - "learning_rate": 2.2464851668792555e-06, - "loss": 0.81888509, - "num_input_tokens_seen": 170479175, - "step": 7931, - "time_per_iteration": 2.646108865737915 - }, - { - "auxiliary_loss_clip": 0.01100642, - "auxiliary_loss_mlp": 0.01036826, - "balance_loss_clip": 1.04248762, - "balance_loss_mlp": 1.02181768, - "epoch": 0.47689764016233277, - "flos": 22528667324160.0, - "grad_norm": 1.747640555146421, - "language_loss": 0.76035368, - "learning_rate": 2.2460986695589678e-06, - "loss": 0.78172839, - "num_input_tokens_seen": 170498450, - "step": 7932, - "time_per_iteration": 2.6632022857666016 - }, - { - "auxiliary_loss_clip": 0.01103619, - "auxiliary_loss_mlp": 0.00770594, - "balance_loss_clip": 1.04416108, - "balance_loss_mlp": 1.00076032, - "epoch": 0.47695776341500074, - "flos": 15120619384320.0, - "grad_norm": 1.7743205398157191, - "language_loss": 0.79733002, - "learning_rate": 2.245712162906593e-06, - "loss": 0.81607223, - "num_input_tokens_seen": 170516255, - "step": 7933, - "time_per_iteration": 4.2387471199035645 - }, - { - "auxiliary_loss_clip": 0.01123015, - "auxiliary_loss_mlp": 0.01041506, - "balance_loss_clip": 1.04555225, - "balance_loss_mlp": 1.02532899, - "epoch": 0.4770178866676687, - "flos": 14678917839360.0, - "grad_norm": 1.9828909232489866, - "language_loss": 0.73883361, - "learning_rate": 2.2453256469367888e-06, - "loss": 0.76047885, - "num_input_tokens_seen": 170532705, - "step": 7934, - "time_per_iteration": 4.074187517166138 - }, - { - "auxiliary_loss_clip": 0.01116756, - "auxiliary_loss_mlp": 0.01034977, - "balance_loss_clip": 1.04362082, - "balance_loss_mlp": 1.02075577, - "epoch": 0.47707800992033667, - "flos": 22565583527040.0, - "grad_norm": 1.8305920873714958, - "language_loss": 0.80197936, - "learning_rate": 2.244939121664211e-06, - "loss": 0.8234967, - "num_input_tokens_seen": 170551925, - "step": 7935, - "time_per_iteration": 2.650474786758423 - }, - { - "auxiliary_loss_clip": 0.01101181, - "auxiliary_loss_mlp": 0.01043502, - "balance_loss_clip": 1.04532123, - "balance_loss_mlp": 1.02818346, - "epoch": 0.4771381331730047, - "flos": 30918225375360.0, - "grad_norm": 5.908138115579588, - "language_loss": 0.71829689, - "learning_rate": 2.2445525871035177e-06, - "loss": 0.73974371, - "num_input_tokens_seen": 170572320, - "step": 7936, - "time_per_iteration": 4.428630113601685 - }, - { - "auxiliary_loss_clip": 0.01130752, - "auxiliary_loss_mlp": 0.01039041, - "balance_loss_clip": 1.04646921, - "balance_loss_mlp": 1.02419913, - "epoch": 0.47719825642567265, - "flos": 25738901539200.0, - "grad_norm": 2.4038439056994156, - "language_loss": 0.675704, - "learning_rate": 2.2441660432693656e-06, - "loss": 0.69740188, - "num_input_tokens_seen": 170589470, - "step": 7937, - "time_per_iteration": 4.458148241043091 - }, - { - "auxiliary_loss_clip": 0.01034806, - "auxiliary_loss_mlp": 0.00999407, - "balance_loss_clip": 1.01822138, - "balance_loss_mlp": 0.99804842, - "epoch": 0.4772583796783406, - "flos": 66355128668160.0, - "grad_norm": 0.7105047811157361, - "language_loss": 0.56384945, - "learning_rate": 2.2437794901764128e-06, - "loss": 0.58419156, - "num_input_tokens_seen": 170662265, - "step": 7938, - "time_per_iteration": 3.3967578411102295 - }, - { - "auxiliary_loss_clip": 0.01099667, - "auxiliary_loss_mlp": 0.0104562, - "balance_loss_clip": 1.04193783, - "balance_loss_mlp": 1.02908564, - "epoch": 0.4773185029310086, - "flos": 22051091070720.0, - "grad_norm": 3.053079154163393, - "language_loss": 0.88725203, - "learning_rate": 2.243392927839317e-06, - "loss": 0.90870488, - "num_input_tokens_seen": 170679680, - "step": 7939, - "time_per_iteration": 2.7099897861480713 - }, - { - "auxiliary_loss_clip": 0.01115778, - "auxiliary_loss_mlp": 0.01037609, - "balance_loss_clip": 1.04160845, - "balance_loss_mlp": 1.02400148, - "epoch": 0.47737862618367655, - "flos": 16727801523840.0, - "grad_norm": 1.7393189284877646, - "language_loss": 0.77381486, - "learning_rate": 2.2430063562727367e-06, - "loss": 0.79534876, - "num_input_tokens_seen": 170697340, - "step": 7940, - "time_per_iteration": 2.5913469791412354 - }, - { - "auxiliary_loss_clip": 0.01104457, - "auxiliary_loss_mlp": 0.01036057, - "balance_loss_clip": 1.04589248, - "balance_loss_mlp": 1.02288485, - "epoch": 0.4774387494363445, - "flos": 19609453100160.0, - "grad_norm": 1.5893003235088359, - "language_loss": 0.8474015, - "learning_rate": 2.2426197754913322e-06, - "loss": 0.8688066, - "num_input_tokens_seen": 170714905, - "step": 7941, - "time_per_iteration": 2.605090856552124 - }, - { - "auxiliary_loss_clip": 0.0110803, - "auxiliary_loss_mlp": 0.01041538, - "balance_loss_clip": 1.04433787, - "balance_loss_mlp": 1.02682161, - "epoch": 0.4774988726890125, - "flos": 16653969118080.0, - "grad_norm": 2.1303607813841237, - "language_loss": 0.75943714, - "learning_rate": 2.24223318550976e-06, - "loss": 0.78093278, - "num_input_tokens_seen": 170731810, - "step": 7942, - "time_per_iteration": 2.612901449203491 - }, - { - "auxiliary_loss_clip": 0.01115811, - "auxiliary_loss_mlp": 0.01038801, - "balance_loss_clip": 1.04779172, - "balance_loss_mlp": 1.02491331, - "epoch": 0.47755899594168044, - "flos": 20485565729280.0, - "grad_norm": 1.7564628488897216, - "language_loss": 0.6467554, - "learning_rate": 2.241846586342682e-06, - "loss": 0.66830152, - "num_input_tokens_seen": 170750270, - "step": 7943, - "time_per_iteration": 2.6675846576690674 - }, - { - "auxiliary_loss_clip": 0.01088131, - "auxiliary_loss_mlp": 0.01040732, - "balance_loss_clip": 1.04014313, - "balance_loss_mlp": 1.02544951, - "epoch": 0.4776191191943484, - "flos": 21652806090240.0, - "grad_norm": 3.30514620611564, - "language_loss": 0.73474699, - "learning_rate": 2.2414599780047577e-06, - "loss": 0.75603563, - "num_input_tokens_seen": 170769015, - "step": 7944, - "time_per_iteration": 2.6938626766204834 - }, - { - "auxiliary_loss_clip": 0.01116316, - "auxiliary_loss_mlp": 0.01035661, - "balance_loss_clip": 1.04835653, - "balance_loss_mlp": 1.01982975, - "epoch": 0.4776792424470164, - "flos": 18770220760320.0, - "grad_norm": 2.01255819211095, - "language_loss": 0.67873627, - "learning_rate": 2.2410733605106456e-06, - "loss": 0.70025599, - "num_input_tokens_seen": 170785725, - "step": 7945, - "time_per_iteration": 2.5940043926239014 - }, - { - "auxiliary_loss_clip": 0.0108787, - "auxiliary_loss_mlp": 0.00774963, - "balance_loss_clip": 1.03865957, - "balance_loss_mlp": 1.00055337, - "epoch": 0.47773936569968434, - "flos": 29715828577920.0, - "grad_norm": 1.9730762461064726, - "language_loss": 0.75473535, - "learning_rate": 2.240686733875009e-06, - "loss": 0.77336371, - "num_input_tokens_seen": 170804600, - "step": 7946, - "time_per_iteration": 2.762983560562134 - }, - { - "auxiliary_loss_clip": 0.01105207, - "auxiliary_loss_mlp": 0.01042769, - "balance_loss_clip": 1.04477096, - "balance_loss_mlp": 1.0274632, - "epoch": 0.4777994889523523, - "flos": 24791542283520.0, - "grad_norm": 2.190560640838335, - "language_loss": 0.79071236, - "learning_rate": 2.240300098112506e-06, - "loss": 0.81219208, - "num_input_tokens_seen": 170824230, - "step": 7947, - "time_per_iteration": 2.692763328552246 - }, - { - "auxiliary_loss_clip": 0.010955, - "auxiliary_loss_mlp": 0.01037272, - "balance_loss_clip": 1.0440042, - "balance_loss_mlp": 1.02317524, - "epoch": 0.47785961220502027, - "flos": 17858161595520.0, - "grad_norm": 2.294285239078615, - "language_loss": 0.7329706, - "learning_rate": 2.2399134532377998e-06, - "loss": 0.75429833, - "num_input_tokens_seen": 170843365, - "step": 7948, - "time_per_iteration": 2.6743998527526855 - }, - { - "auxiliary_loss_clip": 0.01106692, - "auxiliary_loss_mlp": 0.01038667, - "balance_loss_clip": 1.04329944, - "balance_loss_mlp": 1.0235039, - "epoch": 0.4779197354576883, - "flos": 20266546550400.0, - "grad_norm": 1.7991446580624026, - "language_loss": 0.78139675, - "learning_rate": 2.2395267992655514e-06, - "loss": 0.80285037, - "num_input_tokens_seen": 170863515, - "step": 7949, - "time_per_iteration": 2.694549560546875 - }, - { - "auxiliary_loss_clip": 0.01096582, - "auxiliary_loss_mlp": 0.0104007, - "balance_loss_clip": 1.04018211, - "balance_loss_mlp": 1.0263133, - "epoch": 0.47797985871035625, - "flos": 17056599644160.0, - "grad_norm": 2.242348781817659, - "language_loss": 0.74315739, - "learning_rate": 2.2391401362104227e-06, - "loss": 0.76452386, - "num_input_tokens_seen": 170881245, - "step": 7950, - "time_per_iteration": 2.718254327774048 - }, - { - "auxiliary_loss_clip": 0.01095843, - "auxiliary_loss_mlp": 0.01046859, - "balance_loss_clip": 1.04172587, - "balance_loss_mlp": 1.03179109, - "epoch": 0.4780399819630242, - "flos": 31358418549120.0, - "grad_norm": 1.9896022122587003, - "language_loss": 0.74343586, - "learning_rate": 2.2387534640870756e-06, - "loss": 0.7648629, - "num_input_tokens_seen": 170901285, - "step": 7951, - "time_per_iteration": 2.7827391624450684 - }, - { - "auxiliary_loss_clip": 0.01094802, - "auxiliary_loss_mlp": 0.01036097, - "balance_loss_clip": 1.04424548, - "balance_loss_mlp": 1.02120781, - "epoch": 0.4781001052156922, - "flos": 24899597372160.0, - "grad_norm": 2.198904574593956, - "language_loss": 0.80032581, - "learning_rate": 2.238366782910174e-06, - "loss": 0.82163477, - "num_input_tokens_seen": 170919740, - "step": 7952, - "time_per_iteration": 2.812988519668579 - }, - { - "auxiliary_loss_clip": 0.01107213, - "auxiliary_loss_mlp": 0.01044893, - "balance_loss_clip": 1.04275584, - "balance_loss_mlp": 1.03007555, - "epoch": 0.47816022846836015, - "flos": 18697717157760.0, - "grad_norm": 1.8177204893019177, - "language_loss": 0.7794894, - "learning_rate": 2.23798009269438e-06, - "loss": 0.80101049, - "num_input_tokens_seen": 170938510, - "step": 7953, - "time_per_iteration": 2.6617591381073 - }, - { - "auxiliary_loss_clip": 0.01120456, - "auxiliary_loss_mlp": 0.01038164, - "balance_loss_clip": 1.04588997, - "balance_loss_mlp": 1.0237813, - "epoch": 0.4782203517210281, - "flos": 11977573559040.0, - "grad_norm": 2.347215604083738, - "language_loss": 0.84714645, - "learning_rate": 2.2375933934543566e-06, - "loss": 0.86873269, - "num_input_tokens_seen": 170951170, - "step": 7954, - "time_per_iteration": 2.6208479404449463 - }, - { - "auxiliary_loss_clip": 0.01097068, - "auxiliary_loss_mlp": 0.01038972, - "balance_loss_clip": 1.0426054, - "balance_loss_mlp": 1.0248698, - "epoch": 0.4782804749736961, - "flos": 20813501923200.0, - "grad_norm": 1.4277916214046864, - "language_loss": 0.70472121, - "learning_rate": 2.237206685204768e-06, - "loss": 0.72608161, - "num_input_tokens_seen": 170970990, - "step": 7955, - "time_per_iteration": 2.821913719177246 - }, - { - "auxiliary_loss_clip": 0.0110203, - "auxiliary_loss_mlp": 0.01041668, - "balance_loss_clip": 1.04433143, - "balance_loss_mlp": 1.0281322, - "epoch": 0.47834059822636404, - "flos": 23840304359040.0, - "grad_norm": 1.5047634516845327, - "language_loss": 0.82269239, - "learning_rate": 2.2368199679602787e-06, - "loss": 0.84412932, - "num_input_tokens_seen": 170991215, - "step": 7956, - "time_per_iteration": 2.683924913406372 - }, - { - "auxiliary_loss_clip": 0.01105668, - "auxiliary_loss_mlp": 0.01036371, - "balance_loss_clip": 1.04529083, - "balance_loss_mlp": 1.02021837, - "epoch": 0.478400721479032, - "flos": 22633777497600.0, - "grad_norm": 2.448858103633137, - "language_loss": 0.84977531, - "learning_rate": 2.2364332417355516e-06, - "loss": 0.87119567, - "num_input_tokens_seen": 171007325, - "step": 7957, - "time_per_iteration": 2.6371145248413086 - }, - { - "auxiliary_loss_clip": 0.01118227, - "auxiliary_loss_mlp": 0.01040317, - "balance_loss_clip": 1.04562736, - "balance_loss_mlp": 1.02653635, - "epoch": 0.4784608447317, - "flos": 19354954262400.0, - "grad_norm": 1.5628888100251457, - "language_loss": 0.79777038, - "learning_rate": 2.2360465065452527e-06, - "loss": 0.81935579, - "num_input_tokens_seen": 171025650, - "step": 7958, - "time_per_iteration": 2.639721632003784 - }, - { - "auxiliary_loss_clip": 0.01085054, - "auxiliary_loss_mlp": 0.0077548, - "balance_loss_clip": 1.03763032, - "balance_loss_mlp": 1.00064015, - "epoch": 0.47852096798436794, - "flos": 24021114445440.0, - "grad_norm": 1.8018566992199279, - "language_loss": 0.82972836, - "learning_rate": 2.235659762404047e-06, - "loss": 0.84833372, - "num_input_tokens_seen": 171045045, - "step": 7959, - "time_per_iteration": 2.733668565750122 - }, - { - "auxiliary_loss_clip": 0.01090487, - "auxiliary_loss_mlp": 0.01036767, - "balance_loss_clip": 1.04364586, - "balance_loss_mlp": 1.02436292, - "epoch": 0.4785810912370359, - "flos": 25666433850240.0, - "grad_norm": 2.7562627438628504, - "language_loss": 0.73275614, - "learning_rate": 2.235273009326599e-06, - "loss": 0.75402862, - "num_input_tokens_seen": 171062910, - "step": 7960, - "time_per_iteration": 2.6994166374206543 - }, - { - "auxiliary_loss_clip": 0.0109086, - "auxiliary_loss_mlp": 0.0103472, - "balance_loss_clip": 1.04504585, - "balance_loss_mlp": 1.02170801, - "epoch": 0.47864121448970387, - "flos": 21432134885760.0, - "grad_norm": 1.6649690841938434, - "language_loss": 0.76878142, - "learning_rate": 2.2348862473275745e-06, - "loss": 0.79003716, - "num_input_tokens_seen": 171080875, - "step": 7961, - "time_per_iteration": 2.7051572799682617 - }, - { - "auxiliary_loss_clip": 0.01087757, - "auxiliary_loss_mlp": 0.0103463, - "balance_loss_clip": 1.04447055, - "balance_loss_mlp": 1.02050352, - "epoch": 0.47870133774237184, - "flos": 16143894034560.0, - "grad_norm": 7.35679067145723, - "language_loss": 0.7769649, - "learning_rate": 2.2344994764216405e-06, - "loss": 0.79818881, - "num_input_tokens_seen": 171099190, - "step": 7962, - "time_per_iteration": 2.7466347217559814 - }, - { - "auxiliary_loss_clip": 0.0110573, - "auxiliary_loss_mlp": 0.01042926, - "balance_loss_clip": 1.04702401, - "balance_loss_mlp": 1.02871001, - "epoch": 0.47876146099503986, - "flos": 26906788344960.0, - "grad_norm": 1.6387698321198922, - "language_loss": 0.64764994, - "learning_rate": 2.2341126966234635e-06, - "loss": 0.66913652, - "num_input_tokens_seen": 171119060, - "step": 7963, - "time_per_iteration": 2.77663516998291 - }, - { - "auxiliary_loss_clip": 0.01117113, - "auxiliary_loss_mlp": 0.01035904, - "balance_loss_clip": 1.04389668, - "balance_loss_mlp": 1.02196217, - "epoch": 0.4788215842477078, - "flos": 45332085778560.0, - "grad_norm": 1.655648847764305, - "language_loss": 0.77503848, - "learning_rate": 2.2337259079477083e-06, - "loss": 0.79656863, - "num_input_tokens_seen": 171141900, - "step": 7964, - "time_per_iteration": 2.9196712970733643 - }, - { - "auxiliary_loss_clip": 0.01120902, - "auxiliary_loss_mlp": 0.01036482, - "balance_loss_clip": 1.04660964, - "balance_loss_mlp": 1.02042508, - "epoch": 0.4788817075003758, - "flos": 22237180456320.0, - "grad_norm": 2.8996801764774505, - "language_loss": 0.76540697, - "learning_rate": 2.233339110409044e-06, - "loss": 0.78698087, - "num_input_tokens_seen": 171161045, - "step": 7965, - "time_per_iteration": 2.6720781326293945 - }, - { - "auxiliary_loss_clip": 0.0106828, - "auxiliary_loss_mlp": 0.0105005, - "balance_loss_clip": 1.03929722, - "balance_loss_mlp": 1.03433788, - "epoch": 0.47894183075304375, - "flos": 16471183783680.0, - "grad_norm": 1.712219755604538, - "language_loss": 0.74560332, - "learning_rate": 2.232952304022137e-06, - "loss": 0.76678663, - "num_input_tokens_seen": 171179675, - "step": 7966, - "time_per_iteration": 2.7669286727905273 - }, - { - "auxiliary_loss_clip": 0.01101486, - "auxiliary_loss_mlp": 0.0103808, - "balance_loss_clip": 1.0444622, - "balance_loss_mlp": 1.02388787, - "epoch": 0.4790019540057117, - "flos": 24282688262400.0, - "grad_norm": 2.605899190258409, - "language_loss": 0.73308432, - "learning_rate": 2.232565488801655e-06, - "loss": 0.75448, - "num_input_tokens_seen": 171201175, - "step": 7967, - "time_per_iteration": 2.7271900177001953 - }, - { - "auxiliary_loss_clip": 0.01102984, - "auxiliary_loss_mlp": 0.01032784, - "balance_loss_clip": 1.04409146, - "balance_loss_mlp": 1.01838326, - "epoch": 0.4790620772583797, - "flos": 25666469763840.0, - "grad_norm": 2.103515425969552, - "language_loss": 0.79279423, - "learning_rate": 2.232178664762267e-06, - "loss": 0.81415194, - "num_input_tokens_seen": 171221750, - "step": 7968, - "time_per_iteration": 2.707740545272827 - }, - { - "auxiliary_loss_clip": 0.0102077, - "auxiliary_loss_mlp": 0.01020427, - "balance_loss_clip": 1.02207994, - "balance_loss_mlp": 1.01903248, - "epoch": 0.47912220051104765, - "flos": 69428077102080.0, - "grad_norm": 0.7660555925772923, - "language_loss": 0.62198806, - "learning_rate": 2.2317918319186408e-06, - "loss": 0.64240003, - "num_input_tokens_seen": 171292235, - "step": 7969, - "time_per_iteration": 3.3662569522857666 - }, - { - "auxiliary_loss_clip": 0.01087594, - "auxiliary_loss_mlp": 0.01029477, - "balance_loss_clip": 1.04418397, - "balance_loss_mlp": 1.01662636, - "epoch": 0.4791823237637156, - "flos": 24168922911360.0, - "grad_norm": 1.7596129166374368, - "language_loss": 0.77306086, - "learning_rate": 2.2314049902854446e-06, - "loss": 0.79423159, - "num_input_tokens_seen": 171312215, - "step": 7970, - "time_per_iteration": 2.69364857673645 - }, - { - "auxiliary_loss_clip": 0.01116664, - "auxiliary_loss_mlp": 0.01038161, - "balance_loss_clip": 1.04511642, - "balance_loss_mlp": 1.0235939, - "epoch": 0.4792424470163836, - "flos": 24751465683840.0, - "grad_norm": 1.5706742055007812, - "language_loss": 0.70431626, - "learning_rate": 2.231018139877349e-06, - "loss": 0.72586453, - "num_input_tokens_seen": 171332975, - "step": 7971, - "time_per_iteration": 2.690791130065918 - }, - { - "auxiliary_loss_clip": 0.01072275, - "auxiliary_loss_mlp": 0.01033862, - "balance_loss_clip": 1.03982508, - "balance_loss_mlp": 1.01899683, - "epoch": 0.47930257026905154, - "flos": 23257905240960.0, - "grad_norm": 1.30993945009872, - "language_loss": 0.79995155, - "learning_rate": 2.230631280709021e-06, - "loss": 0.82101291, - "num_input_tokens_seen": 171353880, - "step": 7972, - "time_per_iteration": 2.829455852508545 - }, - { - "auxiliary_loss_clip": 0.0111891, - "auxiliary_loss_mlp": 0.01028077, - "balance_loss_clip": 1.0466361, - "balance_loss_mlp": 1.01299727, - "epoch": 0.4793626935217195, - "flos": 14064091718400.0, - "grad_norm": 2.2411370214837807, - "language_loss": 0.69401908, - "learning_rate": 2.2302444127951327e-06, - "loss": 0.71548891, - "num_input_tokens_seen": 171370930, - "step": 7973, - "time_per_iteration": 4.2368669509887695 - }, - { - "auxiliary_loss_clip": 0.01120125, - "auxiliary_loss_mlp": 0.01039183, - "balance_loss_clip": 1.05002046, - "balance_loss_mlp": 1.02575445, - "epoch": 0.4794228167743875, - "flos": 21798854789760.0, - "grad_norm": 1.967830357691446, - "language_loss": 0.78792048, - "learning_rate": 2.2298575361503523e-06, - "loss": 0.80951357, - "num_input_tokens_seen": 171387575, - "step": 7974, - "time_per_iteration": 2.666619300842285 - }, - { - "auxiliary_loss_clip": 0.01029245, - "auxiliary_loss_mlp": 0.01003452, - "balance_loss_clip": 1.02188838, - "balance_loss_mlp": 1.00173593, - "epoch": 0.47948294002705544, - "flos": 66968805553920.0, - "grad_norm": 0.7538441683533625, - "language_loss": 0.54051983, - "learning_rate": 2.2294706507893517e-06, - "loss": 0.56084681, - "num_input_tokens_seen": 171449980, - "step": 7975, - "time_per_iteration": 4.964555501937866 - }, - { - "auxiliary_loss_clip": 0.01114672, - "auxiliary_loss_mlp": 0.01039108, - "balance_loss_clip": 1.04530835, - "balance_loss_mlp": 1.02287221, - "epoch": 0.47954306327972346, - "flos": 12422471414400.0, - "grad_norm": 2.0524308160251707, - "language_loss": 0.89917016, - "learning_rate": 2.2290837567268008e-06, - "loss": 0.92070794, - "num_input_tokens_seen": 171465290, - "step": 7976, - "time_per_iteration": 4.202557802200317 - }, - { - "auxiliary_loss_clip": 0.01135185, - "auxiliary_loss_mlp": 0.01039598, - "balance_loss_clip": 1.05056477, - "balance_loss_mlp": 1.02431524, - "epoch": 0.4796031865323914, - "flos": 18361951799040.0, - "grad_norm": 2.222330138734667, - "language_loss": 0.73720783, - "learning_rate": 2.2286968539773713e-06, - "loss": 0.75895566, - "num_input_tokens_seen": 171481130, - "step": 7977, - "time_per_iteration": 2.653036117553711 - }, - { - "auxiliary_loss_clip": 0.01112997, - "auxiliary_loss_mlp": 0.00772063, - "balance_loss_clip": 1.0468123, - "balance_loss_mlp": 1.00047266, - "epoch": 0.4796633097850594, - "flos": 21835088634240.0, - "grad_norm": 1.5823767711410588, - "language_loss": 0.78372079, - "learning_rate": 2.228309942555734e-06, - "loss": 0.80257142, - "num_input_tokens_seen": 171501140, - "step": 7978, - "time_per_iteration": 2.7036852836608887 - }, - { - "auxiliary_loss_clip": 0.01106382, - "auxiliary_loss_mlp": 0.01039525, - "balance_loss_clip": 1.04519784, - "balance_loss_mlp": 1.02526784, - "epoch": 0.47972343303772735, - "flos": 23437350610560.0, - "grad_norm": 2.6635738232298944, - "language_loss": 0.89488423, - "learning_rate": 2.22792302247656e-06, - "loss": 0.91634321, - "num_input_tokens_seen": 171519835, - "step": 7979, - "time_per_iteration": 2.653221845626831 - }, - { - "auxiliary_loss_clip": 0.01122392, - "auxiliary_loss_mlp": 0.01040662, - "balance_loss_clip": 1.04798067, - "balance_loss_mlp": 1.02475905, - "epoch": 0.4797835562903953, - "flos": 24899776940160.0, - "grad_norm": 1.5901773617653536, - "language_loss": 0.76710582, - "learning_rate": 2.227536093754523e-06, - "loss": 0.78873634, - "num_input_tokens_seen": 171540980, - "step": 7980, - "time_per_iteration": 2.6700520515441895 - }, - { - "auxiliary_loss_clip": 0.01103639, - "auxiliary_loss_mlp": 0.01039114, - "balance_loss_clip": 1.04525447, - "balance_loss_mlp": 1.02261567, - "epoch": 0.4798436795430633, - "flos": 35042996793600.0, - "grad_norm": 1.9068781398056245, - "language_loss": 0.7128244, - "learning_rate": 2.227149156404295e-06, - "loss": 0.73425198, - "num_input_tokens_seen": 171563600, - "step": 7981, - "time_per_iteration": 2.817458391189575 - }, - { - "auxiliary_loss_clip": 0.01130721, - "auxiliary_loss_mlp": 0.01034361, - "balance_loss_clip": 1.05059981, - "balance_loss_mlp": 1.02040792, - "epoch": 0.47990380279573125, - "flos": 20590209025920.0, - "grad_norm": 2.189836625005686, - "language_loss": 0.70604527, - "learning_rate": 2.2267622104405473e-06, - "loss": 0.72769606, - "num_input_tokens_seen": 171580700, - "step": 7982, - "time_per_iteration": 2.639772891998291 - }, - { - "auxiliary_loss_clip": 0.01101365, - "auxiliary_loss_mlp": 0.01031884, - "balance_loss_clip": 1.04456162, - "balance_loss_mlp": 1.01928937, - "epoch": 0.4799639260483992, - "flos": 26359402008960.0, - "grad_norm": 6.366705109750511, - "language_loss": 0.71019757, - "learning_rate": 2.2263752558779544e-06, - "loss": 0.73153007, - "num_input_tokens_seen": 171602035, - "step": 7983, - "time_per_iteration": 2.7794454097747803 - }, - { - "auxiliary_loss_clip": 0.01038182, - "auxiliary_loss_mlp": 0.00752618, - "balance_loss_clip": 1.0209136, - "balance_loss_mlp": 1.00015247, - "epoch": 0.4800240493010672, - "flos": 70979021521920.0, - "grad_norm": 0.8025064053403466, - "language_loss": 0.59461898, - "learning_rate": 2.2259882927311883e-06, - "loss": 0.61252695, - "num_input_tokens_seen": 171659215, - "step": 7984, - "time_per_iteration": 3.1715712547302246 - }, - { - "auxiliary_loss_clip": 0.01068728, - "auxiliary_loss_mlp": 0.01050145, - "balance_loss_clip": 1.03732944, - "balance_loss_mlp": 1.03350329, - "epoch": 0.48008417255373514, - "flos": 17086656349440.0, - "grad_norm": 1.9659657952718743, - "language_loss": 0.66784835, - "learning_rate": 2.2256013210149247e-06, - "loss": 0.68903708, - "num_input_tokens_seen": 171675710, - "step": 7985, - "time_per_iteration": 2.8482425212860107 - }, - { - "auxiliary_loss_clip": 0.01105712, - "auxiliary_loss_mlp": 0.010384, - "balance_loss_clip": 1.04168916, - "balance_loss_mlp": 1.02367198, - "epoch": 0.4801442958064031, - "flos": 15413435055360.0, - "grad_norm": 1.731655766205416, - "language_loss": 0.69907761, - "learning_rate": 2.225214340743835e-06, - "loss": 0.72051871, - "num_input_tokens_seen": 171692510, - "step": 7986, - "time_per_iteration": 2.78254771232605 - }, - { - "auxiliary_loss_clip": 0.01094439, - "auxiliary_loss_mlp": 0.0104069, - "balance_loss_clip": 1.04537976, - "balance_loss_mlp": 1.02534223, - "epoch": 0.4802044190590711, - "flos": 11473747441920.0, - "grad_norm": 2.3008677930118844, - "language_loss": 0.78930938, - "learning_rate": 2.2248273519325956e-06, - "loss": 0.81066066, - "num_input_tokens_seen": 171710235, - "step": 7987, - "time_per_iteration": 2.8055880069732666 - }, - { - "auxiliary_loss_clip": 0.01076423, - "auxiliary_loss_mlp": 0.01042206, - "balance_loss_clip": 1.04216504, - "balance_loss_mlp": 1.02793634, - "epoch": 0.48026454231173904, - "flos": 20951003185920.0, - "grad_norm": 2.0041399034857537, - "language_loss": 0.75381374, - "learning_rate": 2.2244403545958812e-06, - "loss": 0.77499998, - "num_input_tokens_seen": 171726715, - "step": 7988, - "time_per_iteration": 2.7931642532348633 - }, - { - "auxiliary_loss_clip": 0.01099185, - "auxiliary_loss_mlp": 0.01033353, - "balance_loss_clip": 1.04829884, - "balance_loss_mlp": 1.01920891, - "epoch": 0.48032466556440706, - "flos": 20448110822400.0, - "grad_norm": 2.2052350267481984, - "language_loss": 0.79056877, - "learning_rate": 2.224053348748365e-06, - "loss": 0.81189418, - "num_input_tokens_seen": 171743605, - "step": 7989, - "time_per_iteration": 2.7195966243743896 - }, - { - "auxiliary_loss_clip": 0.01109361, - "auxiliary_loss_mlp": 0.01046506, - "balance_loss_clip": 1.04376316, - "balance_loss_mlp": 1.03094292, - "epoch": 0.480384788817075, - "flos": 37120823861760.0, - "grad_norm": 1.9525154549019321, - "language_loss": 0.73684812, - "learning_rate": 2.223666334404724e-06, - "loss": 0.75840676, - "num_input_tokens_seen": 171765445, - "step": 7990, - "time_per_iteration": 2.8826913833618164 - }, - { - "auxiliary_loss_clip": 0.01039921, - "auxiliary_loss_mlp": 0.00752733, - "balance_loss_clip": 1.02231336, - "balance_loss_mlp": 1.00023639, - "epoch": 0.480444912069743, - "flos": 69552577641600.0, - "grad_norm": 0.7651324576674445, - "language_loss": 0.59016085, - "learning_rate": 2.223279311579633e-06, - "loss": 0.60808742, - "num_input_tokens_seen": 171830115, - "step": 7991, - "time_per_iteration": 3.325892448425293 - }, - { - "auxiliary_loss_clip": 0.01119355, - "auxiliary_loss_mlp": 0.00772289, - "balance_loss_clip": 1.04751837, - "balance_loss_mlp": 1.00058734, - "epoch": 0.48050503532241096, - "flos": 29822231640960.0, - "grad_norm": 2.03548436048953, - "language_loss": 0.67551184, - "learning_rate": 2.222892280287768e-06, - "loss": 0.69442832, - "num_input_tokens_seen": 171849135, - "step": 7992, - "time_per_iteration": 2.7717204093933105 - }, - { - "auxiliary_loss_clip": 0.01102719, - "auxiliary_loss_mlp": 0.01037971, - "balance_loss_clip": 1.04047358, - "balance_loss_mlp": 1.02267683, - "epoch": 0.4805651585750789, - "flos": 23948539015680.0, - "grad_norm": 1.7261557593206558, - "language_loss": 0.76166683, - "learning_rate": 2.2225052405438056e-06, - "loss": 0.78307372, - "num_input_tokens_seen": 171868880, - "step": 7993, - "time_per_iteration": 2.739190101623535 - }, - { - "auxiliary_loss_clip": 0.01080291, - "auxiliary_loss_mlp": 0.01038498, - "balance_loss_clip": 1.04301596, - "balance_loss_mlp": 1.02469933, - "epoch": 0.4806252818277469, - "flos": 25665428269440.0, - "grad_norm": 1.8324818551458955, - "language_loss": 0.79029763, - "learning_rate": 2.222118192362422e-06, - "loss": 0.81148541, - "num_input_tokens_seen": 171889455, - "step": 7994, - "time_per_iteration": 2.775120973587036 - }, - { - "auxiliary_loss_clip": 0.01107812, - "auxiliary_loss_mlp": 0.0103252, - "balance_loss_clip": 1.04342794, - "balance_loss_mlp": 1.01851845, - "epoch": 0.48068540508041485, - "flos": 13151996640000.0, - "grad_norm": 2.168964016546684, - "language_loss": 0.79452056, - "learning_rate": 2.2217311357582946e-06, - "loss": 0.81592381, - "num_input_tokens_seen": 171906070, - "step": 7995, - "time_per_iteration": 2.684086561203003 - }, - { - "auxiliary_loss_clip": 0.01071477, - "auxiliary_loss_mlp": 0.01034963, - "balance_loss_clip": 1.04075575, - "balance_loss_mlp": 1.02081871, - "epoch": 0.4807455283330828, - "flos": 21176738208000.0, - "grad_norm": 1.4272883159105954, - "language_loss": 0.82732481, - "learning_rate": 2.2213440707461e-06, - "loss": 0.84838915, - "num_input_tokens_seen": 171926515, - "step": 7996, - "time_per_iteration": 2.801893711090088 - }, - { - "auxiliary_loss_clip": 0.0105538, - "auxiliary_loss_mlp": 0.01038724, - "balance_loss_clip": 1.03635919, - "balance_loss_mlp": 1.02432358, - "epoch": 0.4808056515857508, - "flos": 12275991751680.0, - "grad_norm": 1.7665973767451764, - "language_loss": 0.81008822, - "learning_rate": 2.220956997340516e-06, - "loss": 0.8310293, - "num_input_tokens_seen": 171943845, - "step": 7997, - "time_per_iteration": 2.7309181690216064 - }, - { - "auxiliary_loss_clip": 0.01079437, - "auxiliary_loss_mlp": 0.0103905, - "balance_loss_clip": 1.04144287, - "balance_loss_mlp": 1.0246973, - "epoch": 0.48086577483841875, - "flos": 24826052275200.0, - "grad_norm": 4.4511101438837555, - "language_loss": 0.7285195, - "learning_rate": 2.220569915556221e-06, - "loss": 0.74970436, - "num_input_tokens_seen": 171964970, - "step": 7998, - "time_per_iteration": 2.793765068054199 - }, - { - "auxiliary_loss_clip": 0.01129175, - "auxiliary_loss_mlp": 0.01042213, - "balance_loss_clip": 1.04769647, - "balance_loss_mlp": 1.02756786, - "epoch": 0.4809258980910867, - "flos": 24465365856000.0, - "grad_norm": 1.6928626075088686, - "language_loss": 0.71266204, - "learning_rate": 2.220182825407892e-06, - "loss": 0.73437595, - "num_input_tokens_seen": 171986340, - "step": 7999, - "time_per_iteration": 2.698373556137085 - }, - { - "auxiliary_loss_clip": 0.01120573, - "auxiliary_loss_mlp": 0.01049678, - "balance_loss_clip": 1.04650939, - "balance_loss_mlp": 1.035707, - "epoch": 0.4809860213437547, - "flos": 21215952881280.0, - "grad_norm": 3.5525090623309525, - "language_loss": 0.71445537, - "learning_rate": 2.2197957269102083e-06, - "loss": 0.73615789, - "num_input_tokens_seen": 172007300, - "step": 8000, - "time_per_iteration": 2.677906036376953 - }, - { - "auxiliary_loss_clip": 0.01120936, - "auxiliary_loss_mlp": 0.01045001, - "balance_loss_clip": 1.04962945, - "balance_loss_mlp": 1.03024244, - "epoch": 0.48104614459642264, - "flos": 37632084094080.0, - "grad_norm": 1.397364252260559, - "language_loss": 0.75031364, - "learning_rate": 2.2194086200778485e-06, - "loss": 0.77197301, - "num_input_tokens_seen": 172029585, - "step": 8001, - "time_per_iteration": 2.8079638481140137 - }, - { - "auxiliary_loss_clip": 0.01120097, - "auxiliary_loss_mlp": 0.01045878, - "balance_loss_clip": 1.04740191, - "balance_loss_mlp": 1.03150105, - "epoch": 0.48110626784909066, - "flos": 18406122549120.0, - "grad_norm": 1.760961408245497, - "language_loss": 0.8157444, - "learning_rate": 2.219021504925493e-06, - "loss": 0.83740413, - "num_input_tokens_seen": 172047495, - "step": 8002, - "time_per_iteration": 2.6615140438079834 - }, - { - "auxiliary_loss_clip": 0.01127724, - "auxiliary_loss_mlp": 0.01043569, - "balance_loss_clip": 1.05275476, - "balance_loss_mlp": 1.02780938, - "epoch": 0.48116639110175863, - "flos": 28439814856320.0, - "grad_norm": 1.7356718355873448, - "language_loss": 0.71858382, - "learning_rate": 2.218634381467819e-06, - "loss": 0.74029678, - "num_input_tokens_seen": 172067625, - "step": 8003, - "time_per_iteration": 2.7304186820983887 - }, - { - "auxiliary_loss_clip": 0.01114781, - "auxiliary_loss_mlp": 0.01040333, - "balance_loss_clip": 1.04751146, - "balance_loss_mlp": 1.02654088, - "epoch": 0.4812265143544266, - "flos": 21725237865600.0, - "grad_norm": 1.7533221004579713, - "language_loss": 0.82598346, - "learning_rate": 2.218247249719507e-06, - "loss": 0.84753454, - "num_input_tokens_seen": 172087885, - "step": 8004, - "time_per_iteration": 2.718576192855835 - }, - { - "auxiliary_loss_clip": 0.01110853, - "auxiliary_loss_mlp": 0.01042863, - "balance_loss_clip": 1.04705787, - "balance_loss_mlp": 1.02601874, - "epoch": 0.48128663760709456, - "flos": 13224679810560.0, - "grad_norm": 2.3721289724239587, - "language_loss": 0.77786469, - "learning_rate": 2.217860109695239e-06, - "loss": 0.79940188, - "num_input_tokens_seen": 172105815, - "step": 8005, - "time_per_iteration": 2.7602009773254395 - }, - { - "auxiliary_loss_clip": 0.01116298, - "auxiliary_loss_mlp": 0.01040763, - "balance_loss_clip": 1.04861951, - "balance_loss_mlp": 1.02662444, - "epoch": 0.4813467608597625, - "flos": 24243437675520.0, - "grad_norm": 1.8330364183017236, - "language_loss": 0.70666707, - "learning_rate": 2.217472961409692e-06, - "loss": 0.72823763, - "num_input_tokens_seen": 172126125, - "step": 8006, - "time_per_iteration": 2.7916948795318604 - }, - { - "auxiliary_loss_clip": 0.01101733, - "auxiliary_loss_mlp": 0.01039864, - "balance_loss_clip": 1.04409337, - "balance_loss_mlp": 1.02521324, - "epoch": 0.4814068841124305, - "flos": 27480424544640.0, - "grad_norm": 1.7951056960252978, - "language_loss": 0.70724428, - "learning_rate": 2.2170858048775495e-06, - "loss": 0.72866029, - "num_input_tokens_seen": 172141945, - "step": 8007, - "time_per_iteration": 2.7661349773406982 - }, - { - "auxiliary_loss_clip": 0.01130133, - "auxiliary_loss_mlp": 0.0103276, - "balance_loss_clip": 1.0476191, - "balance_loss_mlp": 1.01881254, - "epoch": 0.48146700736509845, - "flos": 19572896033280.0, - "grad_norm": 11.665968104344772, - "language_loss": 0.71553946, - "learning_rate": 2.2166986401134914e-06, - "loss": 0.73716843, - "num_input_tokens_seen": 172161095, - "step": 8008, - "time_per_iteration": 2.7019124031066895 - }, - { - "auxiliary_loss_clip": 0.01096611, - "auxiliary_loss_mlp": 0.01050794, - "balance_loss_clip": 1.04696894, - "balance_loss_mlp": 1.03467739, - "epoch": 0.4815271306177664, - "flos": 20627771673600.0, - "grad_norm": 2.289909942865545, - "language_loss": 0.60779428, - "learning_rate": 2.216311467132199e-06, - "loss": 0.62926841, - "num_input_tokens_seen": 172178750, - "step": 8009, - "time_per_iteration": 2.713092088699341 - }, - { - "auxiliary_loss_clip": 0.01022233, - "auxiliary_loss_mlp": 0.01005627, - "balance_loss_clip": 1.02350807, - "balance_loss_mlp": 1.00431013, - "epoch": 0.4815872538704344, - "flos": 67691076232320.0, - "grad_norm": 0.8584252589427176, - "language_loss": 0.61326265, - "learning_rate": 2.2159242859483547e-06, - "loss": 0.63354123, - "num_input_tokens_seen": 172240235, - "step": 8010, - "time_per_iteration": 3.2182729244232178 - }, - { - "auxiliary_loss_clip": 0.01123367, - "auxiliary_loss_mlp": 0.01044563, - "balance_loss_clip": 1.0506475, - "balance_loss_mlp": 1.02956653, - "epoch": 0.48164737712310235, - "flos": 22820764723200.0, - "grad_norm": 1.7901877328371896, - "language_loss": 0.73432398, - "learning_rate": 2.215537096576639e-06, - "loss": 0.75600326, - "num_input_tokens_seen": 172259875, - "step": 8011, - "time_per_iteration": 2.671487331390381 - }, - { - "auxiliary_loss_clip": 0.01103596, - "auxiliary_loss_mlp": 0.01035148, - "balance_loss_clip": 1.04422355, - "balance_loss_mlp": 1.02199948, - "epoch": 0.4817075003757703, - "flos": 23733865382400.0, - "grad_norm": 1.7774743588215727, - "language_loss": 0.79526579, - "learning_rate": 2.2151498990317354e-06, - "loss": 0.81665325, - "num_input_tokens_seen": 172280150, - "step": 8012, - "time_per_iteration": 5.769195079803467 - }, - { - "auxiliary_loss_clip": 0.01092738, - "auxiliary_loss_mlp": 0.01042222, - "balance_loss_clip": 1.04738641, - "balance_loss_mlp": 1.02718425, - "epoch": 0.4817676236284383, - "flos": 28182909807360.0, - "grad_norm": 1.8494845342416013, - "language_loss": 0.73714077, - "learning_rate": 2.214762693328326e-06, - "loss": 0.75849032, - "num_input_tokens_seen": 172300810, - "step": 8013, - "time_per_iteration": 2.77451491355896 - }, - { - "auxiliary_loss_clip": 0.01105203, - "auxiliary_loss_mlp": 0.0103627, - "balance_loss_clip": 1.05056131, - "balance_loss_mlp": 1.02266848, - "epoch": 0.48182774688110624, - "flos": 17091756080640.0, - "grad_norm": 2.3240899529345858, - "language_loss": 0.90755451, - "learning_rate": 2.214375479481094e-06, - "loss": 0.92896926, - "num_input_tokens_seen": 172317930, - "step": 8014, - "time_per_iteration": 4.2677695751190186 - }, - { - "auxiliary_loss_clip": 0.0113526, - "auxiliary_loss_mlp": 0.01039779, - "balance_loss_clip": 1.04945207, - "balance_loss_mlp": 1.02497888, - "epoch": 0.4818878701337742, - "flos": 12567873669120.0, - "grad_norm": 3.070306284191698, - "language_loss": 0.7404421, - "learning_rate": 2.213988257504722e-06, - "loss": 0.76219249, - "num_input_tokens_seen": 172336340, - "step": 8015, - "time_per_iteration": 4.188862085342407 - }, - { - "auxiliary_loss_clip": 0.01113922, - "auxiliary_loss_mlp": 0.01040149, - "balance_loss_clip": 1.04792023, - "balance_loss_mlp": 1.02514613, - "epoch": 0.48194799338644223, - "flos": 24608505553920.0, - "grad_norm": 2.1594847398910164, - "language_loss": 0.80143541, - "learning_rate": 2.213601027413894e-06, - "loss": 0.82297611, - "num_input_tokens_seen": 172354315, - "step": 8016, - "time_per_iteration": 2.745352268218994 - }, - { - "auxiliary_loss_clip": 0.01115904, - "auxiliary_loss_mlp": 0.010317, - "balance_loss_clip": 1.04995775, - "balance_loss_mlp": 1.01803231, - "epoch": 0.4820081166391102, - "flos": 21105204272640.0, - "grad_norm": 1.9897571760317019, - "language_loss": 0.77120233, - "learning_rate": 2.2132137892232933e-06, - "loss": 0.79267836, - "num_input_tokens_seen": 172372695, - "step": 8017, - "time_per_iteration": 2.7234907150268555 - }, - { - "auxiliary_loss_clip": 0.01117431, - "auxiliary_loss_mlp": 0.01033072, - "balance_loss_clip": 1.05067015, - "balance_loss_mlp": 1.01848102, - "epoch": 0.48206823989177816, - "flos": 25264593423360.0, - "grad_norm": 2.391907623354337, - "language_loss": 0.80211884, - "learning_rate": 2.2128265429476043e-06, - "loss": 0.8236239, - "num_input_tokens_seen": 172390905, - "step": 8018, - "time_per_iteration": 2.805011749267578 - }, - { - "auxiliary_loss_clip": 0.01113573, - "auxiliary_loss_mlp": 0.01031362, - "balance_loss_clip": 1.05918038, - "balance_loss_mlp": 1.01767111, - "epoch": 0.4821283631444461, - "flos": 24645062620800.0, - "grad_norm": 1.818966225047076, - "language_loss": 0.75859058, - "learning_rate": 2.2124392886015124e-06, - "loss": 0.78003991, - "num_input_tokens_seen": 172412295, - "step": 8019, - "time_per_iteration": 2.767993688583374 - }, - { - "auxiliary_loss_clip": 0.01092977, - "auxiliary_loss_mlp": 0.01036734, - "balance_loss_clip": 1.04580545, - "balance_loss_mlp": 1.02204108, - "epoch": 0.4821884863971141, - "flos": 23952094462080.0, - "grad_norm": 1.8745546244507358, - "language_loss": 0.7907865, - "learning_rate": 2.212052026199701e-06, - "loss": 0.8120836, - "num_input_tokens_seen": 172432625, - "step": 8020, - "time_per_iteration": 2.708779811859131 - }, - { - "auxiliary_loss_clip": 0.01127117, - "auxiliary_loss_mlp": 0.01036574, - "balance_loss_clip": 1.04847205, - "balance_loss_mlp": 1.02219176, - "epoch": 0.48224860964978206, - "flos": 17160668323200.0, - "grad_norm": 2.712415162483374, - "language_loss": 0.69893312, - "learning_rate": 2.211664755756855e-06, - "loss": 0.72057003, - "num_input_tokens_seen": 172450010, - "step": 8021, - "time_per_iteration": 2.6083900928497314 - }, - { - "auxiliary_loss_clip": 0.01102125, - "auxiliary_loss_mlp": 0.01031516, - "balance_loss_clip": 1.04406881, - "balance_loss_mlp": 1.01672244, - "epoch": 0.48230873290245, - "flos": 23075838178560.0, - "grad_norm": 1.7410194963021717, - "language_loss": 0.62778926, - "learning_rate": 2.2112774772876603e-06, - "loss": 0.6491257, - "num_input_tokens_seen": 172469080, - "step": 8022, - "time_per_iteration": 2.677368640899658 - }, - { - "auxiliary_loss_clip": 0.01108316, - "auxiliary_loss_mlp": 0.00770954, - "balance_loss_clip": 1.04996586, - "balance_loss_mlp": 1.00044918, - "epoch": 0.482368856155118, - "flos": 19353517718400.0, - "grad_norm": 2.505400955117215, - "language_loss": 0.66446078, - "learning_rate": 2.2108901908068028e-06, - "loss": 0.68325341, - "num_input_tokens_seen": 172484850, - "step": 8023, - "time_per_iteration": 2.6412739753723145 - }, - { - "auxiliary_loss_clip": 0.01054811, - "auxiliary_loss_mlp": 0.0104073, - "balance_loss_clip": 1.03875041, - "balance_loss_mlp": 1.02531052, - "epoch": 0.48242897940778595, - "flos": 20078984707200.0, - "grad_norm": 1.7010312143912936, - "language_loss": 0.76777267, - "learning_rate": 2.2105028963289683e-06, - "loss": 0.78872806, - "num_input_tokens_seen": 172503525, - "step": 8024, - "time_per_iteration": 2.858891010284424 - }, - { - "auxiliary_loss_clip": 0.01109606, - "auxiliary_loss_mlp": 0.01039089, - "balance_loss_clip": 1.04908574, - "balance_loss_mlp": 1.02432442, - "epoch": 0.4824891026604539, - "flos": 23403989854080.0, - "grad_norm": 1.4778625856906076, - "language_loss": 0.75417542, - "learning_rate": 2.2101155938688423e-06, - "loss": 0.77566242, - "num_input_tokens_seen": 172524360, - "step": 8025, - "time_per_iteration": 2.6743719577789307 - }, - { - "auxiliary_loss_clip": 0.01129031, - "auxiliary_loss_mlp": 0.01034028, - "balance_loss_clip": 1.04835987, - "balance_loss_mlp": 1.01994324, - "epoch": 0.4825492259131219, - "flos": 20368675895040.0, - "grad_norm": 1.785974704334164, - "language_loss": 0.71310222, - "learning_rate": 2.209728283441112e-06, - "loss": 0.73473275, - "num_input_tokens_seen": 172541480, - "step": 8026, - "time_per_iteration": 2.5739991664886475 - }, - { - "auxiliary_loss_clip": 0.01115668, - "auxiliary_loss_mlp": 0.01045724, - "balance_loss_clip": 1.04429471, - "balance_loss_mlp": 1.02949929, - "epoch": 0.48260934916578985, - "flos": 14319021519360.0, - "grad_norm": 2.0186797289800182, - "language_loss": 0.74956793, - "learning_rate": 2.209340965060465e-06, - "loss": 0.77118182, - "num_input_tokens_seen": 172559005, - "step": 8027, - "time_per_iteration": 2.7139828205108643 - }, - { - "auxiliary_loss_clip": 0.01105318, - "auxiliary_loss_mlp": 0.01037258, - "balance_loss_clip": 1.04597318, - "balance_loss_mlp": 1.02348971, - "epoch": 0.4826694724184578, - "flos": 22121152548480.0, - "grad_norm": 1.6779938031508344, - "language_loss": 0.67332339, - "learning_rate": 2.2089536387415868e-06, - "loss": 0.69474924, - "num_input_tokens_seen": 172578435, - "step": 8028, - "time_per_iteration": 2.809757709503174 - }, - { - "auxiliary_loss_clip": 0.01105459, - "auxiliary_loss_mlp": 0.01039975, - "balance_loss_clip": 1.04472148, - "balance_loss_mlp": 1.02583039, - "epoch": 0.48272959567112583, - "flos": 16181169373440.0, - "grad_norm": 1.5400710398474027, - "language_loss": 0.72719157, - "learning_rate": 2.2085663044991655e-06, - "loss": 0.7486459, - "num_input_tokens_seen": 172596095, - "step": 8029, - "time_per_iteration": 2.692643165588379 - }, - { - "auxiliary_loss_clip": 0.01103521, - "auxiliary_loss_mlp": 0.01033131, - "balance_loss_clip": 1.04666233, - "balance_loss_mlp": 1.01880252, - "epoch": 0.4827897189237938, - "flos": 23180445561600.0, - "grad_norm": 1.8484439777749806, - "language_loss": 0.84841061, - "learning_rate": 2.2081789623478896e-06, - "loss": 0.86977708, - "num_input_tokens_seen": 172615255, - "step": 8030, - "time_per_iteration": 2.6717677116394043 - }, - { - "auxiliary_loss_clip": 0.01094989, - "auxiliary_loss_mlp": 0.01034714, - "balance_loss_clip": 1.04217124, - "balance_loss_mlp": 1.02120733, - "epoch": 0.48284984217646176, - "flos": 21652626522240.0, - "grad_norm": 2.0183604756392715, - "language_loss": 0.74026352, - "learning_rate": 2.2077916123024466e-06, - "loss": 0.76156056, - "num_input_tokens_seen": 172633185, - "step": 8031, - "time_per_iteration": 2.640707015991211 - }, - { - "auxiliary_loss_clip": 0.01099826, - "auxiliary_loss_mlp": 0.0104306, - "balance_loss_clip": 1.04307055, - "balance_loss_mlp": 1.02747965, - "epoch": 0.48290996542912973, - "flos": 31467443304960.0, - "grad_norm": 1.5998759210668847, - "language_loss": 0.71785772, - "learning_rate": 2.2074042543775245e-06, - "loss": 0.7392866, - "num_input_tokens_seen": 172654280, - "step": 8032, - "time_per_iteration": 2.803567886352539 - }, - { - "auxiliary_loss_clip": 0.0110819, - "auxiliary_loss_mlp": 0.01037978, - "balance_loss_clip": 1.04093766, - "balance_loss_mlp": 1.02310669, - "epoch": 0.4829700886817977, - "flos": 24461954064000.0, - "grad_norm": 1.7179702458807065, - "language_loss": 0.73965132, - "learning_rate": 2.2070168885878126e-06, - "loss": 0.76111305, - "num_input_tokens_seen": 172675545, - "step": 8033, - "time_per_iteration": 2.7292799949645996 - }, - { - "auxiliary_loss_clip": 0.01073662, - "auxiliary_loss_mlp": 0.01036715, - "balance_loss_clip": 1.04669857, - "balance_loss_mlp": 1.0225054, - "epoch": 0.48303021193446566, - "flos": 25702164904320.0, - "grad_norm": 1.7431687715385025, - "language_loss": 0.83544624, - "learning_rate": 2.2066295149479996e-06, - "loss": 0.85655004, - "num_input_tokens_seen": 172696455, - "step": 8034, - "time_per_iteration": 2.807359218597412 - }, - { - "auxiliary_loss_clip": 0.01095417, - "auxiliary_loss_mlp": 0.01031736, - "balance_loss_clip": 1.04668856, - "balance_loss_mlp": 1.01843822, - "epoch": 0.4830903351871336, - "flos": 20085233673600.0, - "grad_norm": 1.6936524854207098, - "language_loss": 0.79185474, - "learning_rate": 2.2062421334727744e-06, - "loss": 0.81312621, - "num_input_tokens_seen": 172716720, - "step": 8035, - "time_per_iteration": 2.7641072273254395 - }, - { - "auxiliary_loss_clip": 0.01102103, - "auxiliary_loss_mlp": 0.00772882, - "balance_loss_clip": 1.04296494, - "balance_loss_mlp": 1.00034285, - "epoch": 0.4831504584398016, - "flos": 39452216014080.0, - "grad_norm": 1.8720500560152205, - "language_loss": 0.69804895, - "learning_rate": 2.2058547441768267e-06, - "loss": 0.71679878, - "num_input_tokens_seen": 172737435, - "step": 8036, - "time_per_iteration": 2.8137052059173584 - }, - { - "auxiliary_loss_clip": 0.01112606, - "auxiliary_loss_mlp": 0.01031964, - "balance_loss_clip": 1.04274416, - "balance_loss_mlp": 1.01839805, - "epoch": 0.48321058169246955, - "flos": 20006588845440.0, - "grad_norm": 1.9208219105474362, - "language_loss": 0.72910142, - "learning_rate": 2.205467347074847e-06, - "loss": 0.75054711, - "num_input_tokens_seen": 172755700, - "step": 8037, - "time_per_iteration": 2.635277271270752 - }, - { - "auxiliary_loss_clip": 0.01078506, - "auxiliary_loss_mlp": 0.0104898, - "balance_loss_clip": 1.04335546, - "balance_loss_mlp": 1.03224301, - "epoch": 0.4832707049451375, - "flos": 20741465197440.0, - "grad_norm": 3.147603880487906, - "language_loss": 0.68890101, - "learning_rate": 2.205079942181525e-06, - "loss": 0.71017587, - "num_input_tokens_seen": 172775185, - "step": 8038, - "time_per_iteration": 2.782864570617676 - }, - { - "auxiliary_loss_clip": 0.01090364, - "auxiliary_loss_mlp": 0.01038251, - "balance_loss_clip": 1.04244566, - "balance_loss_mlp": 1.02438653, - "epoch": 0.4833308281978055, - "flos": 33145584762240.0, - "grad_norm": 1.8173480840244864, - "language_loss": 0.79258525, - "learning_rate": 2.20469252951155e-06, - "loss": 0.81387138, - "num_input_tokens_seen": 172796990, - "step": 8039, - "time_per_iteration": 2.7726707458496094 - }, - { - "auxiliary_loss_clip": 0.01115294, - "auxiliary_loss_mlp": 0.01034301, - "balance_loss_clip": 1.04610348, - "balance_loss_mlp": 1.02035379, - "epoch": 0.48339095145047345, - "flos": 19099234362240.0, - "grad_norm": 1.6327731998252513, - "language_loss": 0.77608567, - "learning_rate": 2.2043051090796143e-06, - "loss": 0.79758161, - "num_input_tokens_seen": 172814915, - "step": 8040, - "time_per_iteration": 2.634373903274536 - }, - { - "auxiliary_loss_clip": 0.01117481, - "auxiliary_loss_mlp": 0.01034936, - "balance_loss_clip": 1.04517746, - "balance_loss_mlp": 1.02007651, - "epoch": 0.4834510747031414, - "flos": 34459448440320.0, - "grad_norm": 1.603418513383397, - "language_loss": 0.75737631, - "learning_rate": 2.203917680900409e-06, - "loss": 0.7789005, - "num_input_tokens_seen": 172837060, - "step": 8041, - "time_per_iteration": 2.7551445960998535 - }, - { - "auxiliary_loss_clip": 0.01089791, - "auxiliary_loss_mlp": 0.01038452, - "balance_loss_clip": 1.04363966, - "balance_loss_mlp": 1.02388501, - "epoch": 0.48351119795580944, - "flos": 27380845065600.0, - "grad_norm": 1.7873938615261085, - "language_loss": 0.6681267, - "learning_rate": 2.203530244988624e-06, - "loss": 0.6894092, - "num_input_tokens_seen": 172856545, - "step": 8042, - "time_per_iteration": 2.7318594455718994 - }, - { - "auxiliary_loss_clip": 0.01029662, - "auxiliary_loss_mlp": 0.0100431, - "balance_loss_clip": 1.0224936, - "balance_loss_mlp": 1.00289762, - "epoch": 0.4835713212084774, - "flos": 67143941291520.0, - "grad_norm": 0.6894070214322334, - "language_loss": 0.5854131, - "learning_rate": 2.2031428013589517e-06, - "loss": 0.60575283, - "num_input_tokens_seen": 172923055, - "step": 8043, - "time_per_iteration": 3.2759408950805664 - }, - { - "auxiliary_loss_clip": 0.01104355, - "auxiliary_loss_mlp": 0.01041979, - "balance_loss_clip": 1.04400086, - "balance_loss_mlp": 1.02605903, - "epoch": 0.48363144446114537, - "flos": 17967473660160.0, - "grad_norm": 1.92903629391714, - "language_loss": 0.71673858, - "learning_rate": 2.2027553500260847e-06, - "loss": 0.73820192, - "num_input_tokens_seen": 172940700, - "step": 8044, - "time_per_iteration": 2.6627197265625 - }, - { - "auxiliary_loss_clip": 0.01073602, - "auxiliary_loss_mlp": 0.01033421, - "balance_loss_clip": 1.04103553, - "balance_loss_mlp": 1.01863277, - "epoch": 0.48369156771381333, - "flos": 20593513077120.0, - "grad_norm": 1.3783700874379357, - "language_loss": 0.75982356, - "learning_rate": 2.202367891004714e-06, - "loss": 0.7808938, - "num_input_tokens_seen": 172961125, - "step": 8045, - "time_per_iteration": 2.7301156520843506 - }, - { - "auxiliary_loss_clip": 0.01083343, - "auxiliary_loss_mlp": 0.01040882, - "balance_loss_clip": 1.04626942, - "balance_loss_mlp": 1.02615929, - "epoch": 0.4837516909664813, - "flos": 22675075159680.0, - "grad_norm": 1.8085917066759625, - "language_loss": 0.70038342, - "learning_rate": 2.201980424309533e-06, - "loss": 0.72162569, - "num_input_tokens_seen": 172980405, - "step": 8046, - "time_per_iteration": 2.853160858154297 - }, - { - "auxiliary_loss_clip": 0.01127438, - "auxiliary_loss_mlp": 0.0103679, - "balance_loss_clip": 1.04603601, - "balance_loss_mlp": 1.02220488, - "epoch": 0.48381181421914926, - "flos": 25518625384320.0, - "grad_norm": 2.1605387354357193, - "language_loss": 0.82558095, - "learning_rate": 2.2015929499552337e-06, - "loss": 0.84722322, - "num_input_tokens_seen": 172999105, - "step": 8047, - "time_per_iteration": 2.711172103881836 - }, - { - "auxiliary_loss_clip": 0.01095021, - "auxiliary_loss_mlp": 0.01034535, - "balance_loss_clip": 1.04198444, - "balance_loss_mlp": 1.02066541, - "epoch": 0.4838719374718172, - "flos": 24207491139840.0, - "grad_norm": 1.6956601095110444, - "language_loss": 0.80573416, - "learning_rate": 2.2012054679565092e-06, - "loss": 0.82702971, - "num_input_tokens_seen": 173019935, - "step": 8048, - "time_per_iteration": 2.714733839035034 - }, - { - "auxiliary_loss_clip": 0.01119221, - "auxiliary_loss_mlp": 0.01039156, - "balance_loss_clip": 1.04571271, - "balance_loss_mlp": 1.02458251, - "epoch": 0.4839320607244852, - "flos": 26724577628160.0, - "grad_norm": 1.6136989522042802, - "language_loss": 0.81565118, - "learning_rate": 2.200817978328054e-06, - "loss": 0.83723497, - "num_input_tokens_seen": 173039700, - "step": 8049, - "time_per_iteration": 2.740396738052368 - }, - { - "auxiliary_loss_clip": 0.0110148, - "auxiliary_loss_mlp": 0.01032329, - "balance_loss_clip": 1.04652369, - "balance_loss_mlp": 1.01979959, - "epoch": 0.48399218397715316, - "flos": 20448900921600.0, - "grad_norm": 1.738899019363266, - "language_loss": 0.72696805, - "learning_rate": 2.2004304810845602e-06, - "loss": 0.74830616, - "num_input_tokens_seen": 173059170, - "step": 8050, - "time_per_iteration": 2.671696424484253 - }, - { - "auxiliary_loss_clip": 0.01036049, - "auxiliary_loss_mlp": 0.00752282, - "balance_loss_clip": 1.01914835, - "balance_loss_mlp": 1.00025868, - "epoch": 0.4840523072298211, - "flos": 67180570185600.0, - "grad_norm": 0.6909377773009905, - "language_loss": 0.562814, - "learning_rate": 2.200042976240723e-06, - "loss": 0.5806973, - "num_input_tokens_seen": 173119000, - "step": 8051, - "time_per_iteration": 6.922944784164429 - }, - { - "auxiliary_loss_clip": 0.01088902, - "auxiliary_loss_mlp": 0.01035544, - "balance_loss_clip": 1.04290557, - "balance_loss_mlp": 1.0208869, - "epoch": 0.4841124304824891, - "flos": 22411490181120.0, - "grad_norm": 1.8410570377760342, - "language_loss": 0.75224304, - "learning_rate": 2.199655463811236e-06, - "loss": 0.77348751, - "num_input_tokens_seen": 173137570, - "step": 8052, - "time_per_iteration": 2.7672088146209717 - }, - { - "auxiliary_loss_clip": 0.01115072, - "auxiliary_loss_mlp": 0.01037343, - "balance_loss_clip": 1.04730511, - "balance_loss_mlp": 1.02388382, - "epoch": 0.48417255373515705, - "flos": 13843959217920.0, - "grad_norm": 2.7757616025011296, - "language_loss": 0.6599009, - "learning_rate": 2.1992679438107936e-06, - "loss": 0.68142503, - "num_input_tokens_seen": 173154355, - "step": 8053, - "time_per_iteration": 2.7092020511627197 - }, - { - "auxiliary_loss_clip": 0.01118659, - "auxiliary_loss_mlp": 0.0103362, - "balance_loss_clip": 1.04970407, - "balance_loss_mlp": 1.02048898, - "epoch": 0.484232676987825, - "flos": 31649689935360.0, - "grad_norm": 1.9021914395644282, - "language_loss": 0.69075954, - "learning_rate": 2.198880416254091e-06, - "loss": 0.7122823, - "num_input_tokens_seen": 173174845, - "step": 8054, - "time_per_iteration": 5.934173583984375 - }, - { - "auxiliary_loss_clip": 0.01055753, - "auxiliary_loss_mlp": 0.01032099, - "balance_loss_clip": 1.03702974, - "balance_loss_mlp": 1.01789522, - "epoch": 0.48429280024049304, - "flos": 24095377814400.0, - "grad_norm": 1.7332498206286664, - "language_loss": 0.69624376, - "learning_rate": 2.1984928811558233e-06, - "loss": 0.71712232, - "num_input_tokens_seen": 173195025, - "step": 8055, - "time_per_iteration": 2.811734676361084 - }, - { - "auxiliary_loss_clip": 0.01121016, - "auxiliary_loss_mlp": 0.01038771, - "balance_loss_clip": 1.04966474, - "balance_loss_mlp": 1.02396512, - "epoch": 0.484352923493161, - "flos": 17530081747200.0, - "grad_norm": 2.8015304711701154, - "language_loss": 0.63522434, - "learning_rate": 2.198105338530685e-06, - "loss": 0.6568222, - "num_input_tokens_seen": 173213065, - "step": 8056, - "time_per_iteration": 2.6111772060394287 - }, - { - "auxiliary_loss_clip": 0.01115568, - "auxiliary_loss_mlp": 0.01036144, - "balance_loss_clip": 1.04465592, - "balance_loss_mlp": 1.0212791, - "epoch": 0.48441304674582897, - "flos": 29166862043520.0, - "grad_norm": 2.044514393553715, - "language_loss": 0.67968506, - "learning_rate": 2.1977177883933726e-06, - "loss": 0.70120221, - "num_input_tokens_seen": 173234545, - "step": 8057, - "time_per_iteration": 2.678311824798584 - }, - { - "auxiliary_loss_clip": 0.01089017, - "auxiliary_loss_mlp": 0.01041569, - "balance_loss_clip": 1.04114962, - "balance_loss_mlp": 1.02560723, - "epoch": 0.48447316999849693, - "flos": 15886701676800.0, - "grad_norm": 1.6304795591829788, - "language_loss": 0.8145591, - "learning_rate": 2.1973302307585827e-06, - "loss": 0.83586496, - "num_input_tokens_seen": 173252175, - "step": 8058, - "time_per_iteration": 2.676553964614868 - }, - { - "auxiliary_loss_clip": 0.0111574, - "auxiliary_loss_mlp": 0.01037327, - "balance_loss_clip": 1.04488969, - "balance_loss_mlp": 1.02229452, - "epoch": 0.4845332932511649, - "flos": 24381405815040.0, - "grad_norm": 1.66967797618368, - "language_loss": 0.79851902, - "learning_rate": 2.1969426656410097e-06, - "loss": 0.82004976, - "num_input_tokens_seen": 173268790, - "step": 8059, - "time_per_iteration": 2.672071933746338 - }, - { - "auxiliary_loss_clip": 0.01134552, - "auxiliary_loss_mlp": 0.010436, - "balance_loss_clip": 1.04998326, - "balance_loss_mlp": 1.02804327, - "epoch": 0.48459341650383286, - "flos": 37116478316160.0, - "grad_norm": 1.8700605031219397, - "language_loss": 0.6685822, - "learning_rate": 2.196555093055352e-06, - "loss": 0.69036371, - "num_input_tokens_seen": 173288030, - "step": 8060, - "time_per_iteration": 2.7481517791748047 - }, - { - "auxiliary_loss_clip": 0.01115717, - "auxiliary_loss_mlp": 0.01047797, - "balance_loss_clip": 1.04782832, - "balance_loss_mlp": 1.03283644, - "epoch": 0.48465353975650083, - "flos": 22966777509120.0, - "grad_norm": 1.918934253409618, - "language_loss": 0.67403054, - "learning_rate": 2.1961675130163046e-06, - "loss": 0.69566566, - "num_input_tokens_seen": 173305965, - "step": 8061, - "time_per_iteration": 2.6991710662841797 - }, - { - "auxiliary_loss_clip": 0.01112971, - "auxiliary_loss_mlp": 0.01047446, - "balance_loss_clip": 1.0495888, - "balance_loss_mlp": 1.03176975, - "epoch": 0.4847136630091688, - "flos": 17707695523200.0, - "grad_norm": 2.027913918653662, - "language_loss": 0.82387316, - "learning_rate": 2.1957799255385653e-06, - "loss": 0.84547728, - "num_input_tokens_seen": 173321985, - "step": 8062, - "time_per_iteration": 2.6427886486053467 - }, - { - "auxiliary_loss_clip": 0.01062707, - "auxiliary_loss_mlp": 0.0103913, - "balance_loss_clip": 1.04044425, - "balance_loss_mlp": 1.02433586, - "epoch": 0.48477378626183676, - "flos": 22018269018240.0, - "grad_norm": 1.5908761940571217, - "language_loss": 0.74599862, - "learning_rate": 2.1953923306368325e-06, - "loss": 0.76701701, - "num_input_tokens_seen": 173341315, - "step": 8063, - "time_per_iteration": 2.767857313156128 - }, - { - "auxiliary_loss_clip": 0.01103538, - "auxiliary_loss_mlp": 0.01036681, - "balance_loss_clip": 1.04380846, - "balance_loss_mlp": 1.02177346, - "epoch": 0.4848339095145047, - "flos": 27962956874880.0, - "grad_norm": 1.679199539296889, - "language_loss": 0.7897141, - "learning_rate": 2.1950047283258023e-06, - "loss": 0.81111628, - "num_input_tokens_seen": 173361055, - "step": 8064, - "time_per_iteration": 2.702838182449341 - }, - { - "auxiliary_loss_clip": 0.01127143, - "auxiliary_loss_mlp": 0.0077039, - "balance_loss_clip": 1.04982877, - "balance_loss_mlp": 1.00042999, - "epoch": 0.4848940327671727, - "flos": 21688752625920.0, - "grad_norm": 1.758395032785765, - "language_loss": 0.78960353, - "learning_rate": 2.194617118620173e-06, - "loss": 0.80857891, - "num_input_tokens_seen": 173379255, - "step": 8065, - "time_per_iteration": 2.6464266777038574 - }, - { - "auxiliary_loss_clip": 0.01109206, - "auxiliary_loss_mlp": 0.00771166, - "balance_loss_clip": 1.04239869, - "balance_loss_mlp": 1.00034332, - "epoch": 0.48495415601984065, - "flos": 20631578515200.0, - "grad_norm": 1.717828669503626, - "language_loss": 0.76373905, - "learning_rate": 2.194229501534644e-06, - "loss": 0.78254277, - "num_input_tokens_seen": 173398370, - "step": 8066, - "time_per_iteration": 2.622279405593872 - }, - { - "auxiliary_loss_clip": 0.01129705, - "auxiliary_loss_mlp": 0.01032468, - "balance_loss_clip": 1.05031133, - "balance_loss_mlp": 1.0188905, - "epoch": 0.4850142792725086, - "flos": 25628152930560.0, - "grad_norm": 1.606995638926956, - "language_loss": 0.7245208, - "learning_rate": 2.193841877083912e-06, - "loss": 0.74614257, - "num_input_tokens_seen": 173419595, - "step": 8067, - "time_per_iteration": 2.6863858699798584 - }, - { - "auxiliary_loss_clip": 0.01062315, - "auxiliary_loss_mlp": 0.01036403, - "balance_loss_clip": 1.04658556, - "balance_loss_mlp": 1.02155542, - "epoch": 0.4850744025251766, - "flos": 13771958405760.0, - "grad_norm": 2.9723717970034826, - "language_loss": 0.79098403, - "learning_rate": 2.1934542452826767e-06, - "loss": 0.81197119, - "num_input_tokens_seen": 173435390, - "step": 8068, - "time_per_iteration": 2.736361503601074 - }, - { - "auxiliary_loss_clip": 0.01096742, - "auxiliary_loss_mlp": 0.01035763, - "balance_loss_clip": 1.04122019, - "balance_loss_mlp": 1.02254295, - "epoch": 0.4851345257778446, - "flos": 20261339078400.0, - "grad_norm": 1.4037595191012704, - "language_loss": 0.84329617, - "learning_rate": 2.193066606145638e-06, - "loss": 0.86462128, - "num_input_tokens_seen": 173454095, - "step": 8069, - "time_per_iteration": 2.6671814918518066 - }, - { - "auxiliary_loss_clip": 0.01091404, - "auxiliary_loss_mlp": 0.01033062, - "balance_loss_clip": 1.04400659, - "balance_loss_mlp": 1.01972818, - "epoch": 0.48519464903051257, - "flos": 27089681420160.0, - "grad_norm": 1.7638547734342187, - "language_loss": 0.78171504, - "learning_rate": 2.192678959687493e-06, - "loss": 0.80295968, - "num_input_tokens_seen": 173475300, - "step": 8070, - "time_per_iteration": 2.7715907096862793 - }, - { - "auxiliary_loss_clip": 0.01066151, - "auxiliary_loss_mlp": 0.0103257, - "balance_loss_clip": 1.04079247, - "balance_loss_mlp": 1.01808023, - "epoch": 0.48525477228318054, - "flos": 17127235739520.0, - "grad_norm": 1.9176398781406192, - "language_loss": 0.78054178, - "learning_rate": 2.192291305922943e-06, - "loss": 0.80152905, - "num_input_tokens_seen": 173492005, - "step": 8071, - "time_per_iteration": 2.7427566051483154 - }, - { - "auxiliary_loss_clip": 0.01063848, - "auxiliary_loss_mlp": 0.0103312, - "balance_loss_clip": 1.04013515, - "balance_loss_mlp": 1.01852274, - "epoch": 0.4853148955358485, - "flos": 28180324028160.0, - "grad_norm": 1.9286974806008035, - "language_loss": 0.72312587, - "learning_rate": 2.1919036448666873e-06, - "loss": 0.7440955, - "num_input_tokens_seen": 173511995, - "step": 8072, - "time_per_iteration": 2.8457834720611572 - }, - { - "auxiliary_loss_clip": 0.01077736, - "auxiliary_loss_mlp": 0.01038365, - "balance_loss_clip": 1.04195118, - "balance_loss_mlp": 1.02361333, - "epoch": 0.48537501878851647, - "flos": 17493309198720.0, - "grad_norm": 2.206546835183074, - "language_loss": 0.87933266, - "learning_rate": 2.1915159765334262e-06, - "loss": 0.90049368, - "num_input_tokens_seen": 173530215, - "step": 8073, - "time_per_iteration": 2.7190656661987305 - }, - { - "auxiliary_loss_clip": 0.01081944, - "auxiliary_loss_mlp": 0.01041597, - "balance_loss_clip": 1.03932655, - "balance_loss_mlp": 1.02555168, - "epoch": 0.48543514204118443, - "flos": 28584857975040.0, - "grad_norm": 1.6453725477912577, - "language_loss": 0.60954368, - "learning_rate": 2.19112830093786e-06, - "loss": 0.63077909, - "num_input_tokens_seen": 173550920, - "step": 8074, - "time_per_iteration": 2.757408857345581 - }, - { - "auxiliary_loss_clip": 0.01088022, - "auxiliary_loss_mlp": 0.00773092, - "balance_loss_clip": 1.0409627, - "balance_loss_mlp": 1.00044906, - "epoch": 0.4854952652938524, - "flos": 20959981585920.0, - "grad_norm": 1.6130644581425704, - "language_loss": 0.735416, - "learning_rate": 2.19074061809469e-06, - "loss": 0.75402713, - "num_input_tokens_seen": 173569065, - "step": 8075, - "time_per_iteration": 2.8191847801208496 - }, - { - "auxiliary_loss_clip": 0.01121809, - "auxiliary_loss_mlp": 0.01039314, - "balance_loss_clip": 1.04537582, - "balance_loss_mlp": 1.02567613, - "epoch": 0.48555538854652036, - "flos": 66529543155840.0, - "grad_norm": 2.2867687714704665, - "language_loss": 0.81751764, - "learning_rate": 2.1903529280186163e-06, - "loss": 0.83912885, - "num_input_tokens_seen": 173596085, - "step": 8076, - "time_per_iteration": 3.0270113945007324 - }, - { - "auxiliary_loss_clip": 0.01107841, - "auxiliary_loss_mlp": 0.01038327, - "balance_loss_clip": 1.04600549, - "balance_loss_mlp": 1.02161372, - "epoch": 0.4856155117991883, - "flos": 15924982596480.0, - "grad_norm": 2.702312951735234, - "language_loss": 0.86105502, - "learning_rate": 2.1899652307243407e-06, - "loss": 0.88251674, - "num_input_tokens_seen": 173613900, - "step": 8077, - "time_per_iteration": 2.6272876262664795 - }, - { - "auxiliary_loss_clip": 0.01006449, - "auxiliary_loss_mlp": 0.0100721, - "balance_loss_clip": 1.01856184, - "balance_loss_mlp": 1.00564885, - "epoch": 0.4856756350518563, - "flos": 71047395060480.0, - "grad_norm": 0.8998346956373826, - "language_loss": 0.58465588, - "learning_rate": 2.189577526226564e-06, - "loss": 0.60479248, - "num_input_tokens_seen": 173671305, - "step": 8078, - "time_per_iteration": 3.254561424255371 - }, - { - "auxiliary_loss_clip": 0.01132159, - "auxiliary_loss_mlp": 0.01033911, - "balance_loss_clip": 1.04961872, - "balance_loss_mlp": 1.01946878, - "epoch": 0.48573575830452426, - "flos": 29825679346560.0, - "grad_norm": 1.7198368274974891, - "language_loss": 0.72365242, - "learning_rate": 2.1891898145399884e-06, - "loss": 0.74531311, - "num_input_tokens_seen": 173692070, - "step": 8079, - "time_per_iteration": 2.6532506942749023 - }, - { - "auxiliary_loss_clip": 0.01088509, - "auxiliary_loss_mlp": 0.0103276, - "balance_loss_clip": 1.04440176, - "balance_loss_mlp": 1.01868141, - "epoch": 0.4857958815571922, - "flos": 17639501552640.0, - "grad_norm": 2.749999314487442, - "language_loss": 0.79557705, - "learning_rate": 2.1888020956793172e-06, - "loss": 0.81678975, - "num_input_tokens_seen": 173709785, - "step": 8080, - "time_per_iteration": 2.6242940425872803 - }, - { - "auxiliary_loss_clip": 0.01097632, - "auxiliary_loss_mlp": 0.01033589, - "balance_loss_clip": 1.04023981, - "balance_loss_mlp": 1.01881862, - "epoch": 0.4858560048098602, - "flos": 21105491581440.0, - "grad_norm": 1.9603729393952303, - "language_loss": 0.84016395, - "learning_rate": 2.188414369659251e-06, - "loss": 0.86147618, - "num_input_tokens_seen": 173728770, - "step": 8081, - "time_per_iteration": 2.6701998710632324 - }, - { - "auxiliary_loss_clip": 0.01110096, - "auxiliary_loss_mlp": 0.01036956, - "balance_loss_clip": 1.04121375, - "balance_loss_mlp": 1.02081513, - "epoch": 0.4859161280625282, - "flos": 22090844448000.0, - "grad_norm": 1.4026106187948555, - "language_loss": 0.83353597, - "learning_rate": 2.1880266364944924e-06, - "loss": 0.85500646, - "num_input_tokens_seen": 173747355, - "step": 8082, - "time_per_iteration": 2.6535134315490723 - }, - { - "auxiliary_loss_clip": 0.01102933, - "auxiliary_loss_mlp": 0.01034217, - "balance_loss_clip": 1.04525304, - "balance_loss_mlp": 1.02117527, - "epoch": 0.4859762513151962, - "flos": 17493452853120.0, - "grad_norm": 1.9462739217424578, - "language_loss": 0.87314546, - "learning_rate": 2.187638896199746e-06, - "loss": 0.89451694, - "num_input_tokens_seen": 173764825, - "step": 8083, - "time_per_iteration": 2.6324520111083984 - }, - { - "auxiliary_loss_clip": 0.01080799, - "auxiliary_loss_mlp": 0.01047109, - "balance_loss_clip": 1.04719186, - "balance_loss_mlp": 1.03410375, - "epoch": 0.48603637456786414, - "flos": 18004246208640.0, - "grad_norm": 1.6025248177358018, - "language_loss": 0.80759108, - "learning_rate": 2.1872511487897126e-06, - "loss": 0.82887018, - "num_input_tokens_seen": 173783215, - "step": 8084, - "time_per_iteration": 2.679032325744629 - }, - { - "auxiliary_loss_clip": 0.01114846, - "auxiliary_loss_mlp": 0.01035804, - "balance_loss_clip": 1.04544878, - "balance_loss_mlp": 1.02149308, - "epoch": 0.4860964978205321, - "flos": 22492038430080.0, - "grad_norm": 1.9539653340908196, - "language_loss": 0.68145066, - "learning_rate": 2.186863394279098e-06, - "loss": 0.70295715, - "num_input_tokens_seen": 173801905, - "step": 8085, - "time_per_iteration": 2.6305296421051025 - }, - { - "auxiliary_loss_clip": 0.01113875, - "auxiliary_loss_mlp": 0.01040894, - "balance_loss_clip": 1.04487717, - "balance_loss_mlp": 1.02714896, - "epoch": 0.48615662107320007, - "flos": 23372532518400.0, - "grad_norm": 1.3763064439222144, - "language_loss": 0.77494752, - "learning_rate": 2.1864756326826046e-06, - "loss": 0.79649526, - "num_input_tokens_seen": 173824690, - "step": 8086, - "time_per_iteration": 2.6941890716552734 - }, - { - "auxiliary_loss_clip": 0.01125139, - "auxiliary_loss_mlp": 0.01028743, - "balance_loss_clip": 1.04536629, - "balance_loss_mlp": 1.01461661, - "epoch": 0.48621674432586803, - "flos": 34418833136640.0, - "grad_norm": 2.3947564981199347, - "language_loss": 0.7014342, - "learning_rate": 2.1860878640149355e-06, - "loss": 0.72297299, - "num_input_tokens_seen": 173844450, - "step": 8087, - "time_per_iteration": 2.7329354286193848 - }, - { - "auxiliary_loss_clip": 0.01119086, - "auxiliary_loss_mlp": 0.01040298, - "balance_loss_clip": 1.04627323, - "balance_loss_mlp": 1.0251466, - "epoch": 0.486276867578536, - "flos": 33107555237760.0, - "grad_norm": 1.710106545545042, - "language_loss": 0.72521967, - "learning_rate": 2.1857000882907974e-06, - "loss": 0.74681354, - "num_input_tokens_seen": 173864975, - "step": 8088, - "time_per_iteration": 2.747058391571045 - }, - { - "auxiliary_loss_clip": 0.01103115, - "auxiliary_loss_mlp": 0.01037287, - "balance_loss_clip": 1.04365635, - "balance_loss_mlp": 1.02306569, - "epoch": 0.48633699083120396, - "flos": 21470703114240.0, - "grad_norm": 1.7297894528285667, - "language_loss": 0.7543239, - "learning_rate": 2.185312305524892e-06, - "loss": 0.77572793, - "num_input_tokens_seen": 173883805, - "step": 8089, - "time_per_iteration": 2.6639740467071533 - }, - { - "auxiliary_loss_clip": 0.01092992, - "auxiliary_loss_mlp": 0.01031661, - "balance_loss_clip": 1.04379344, - "balance_loss_mlp": 1.01733255, - "epoch": 0.48639711408387193, - "flos": 20084335833600.0, - "grad_norm": 1.6351614757671693, - "language_loss": 0.84245062, - "learning_rate": 2.184924515731926e-06, - "loss": 0.86369717, - "num_input_tokens_seen": 173903520, - "step": 8090, - "time_per_iteration": 4.404139757156372 - }, - { - "auxiliary_loss_clip": 0.01122239, - "auxiliary_loss_mlp": 0.01033955, - "balance_loss_clip": 1.04544723, - "balance_loss_mlp": 1.0203594, - "epoch": 0.4864572373365399, - "flos": 20778884190720.0, - "grad_norm": 1.7197214823091769, - "language_loss": 0.76290631, - "learning_rate": 2.1845367189266045e-06, - "loss": 0.78446829, - "num_input_tokens_seen": 173924255, - "step": 8091, - "time_per_iteration": 2.7133665084838867 - }, - { - "auxiliary_loss_clip": 0.01115621, - "auxiliary_loss_mlp": 0.01029044, - "balance_loss_clip": 1.04440069, - "balance_loss_mlp": 1.01553202, - "epoch": 0.48651736058920786, - "flos": 26025360503040.0, - "grad_norm": 1.4953838782762103, - "language_loss": 0.80510461, - "learning_rate": 2.184148915123631e-06, - "loss": 0.82655126, - "num_input_tokens_seen": 173943285, - "step": 8092, - "time_per_iteration": 2.682349920272827 - }, - { - "auxiliary_loss_clip": 0.0110052, - "auxiliary_loss_mlp": 0.00775072, - "balance_loss_clip": 1.04398346, - "balance_loss_mlp": 1.00031447, - "epoch": 0.4865774838418758, - "flos": 20485601642880.0, - "grad_norm": 1.434156215667662, - "language_loss": 0.71867287, - "learning_rate": 2.1837611043377126e-06, - "loss": 0.73742878, - "num_input_tokens_seen": 173962205, - "step": 8093, - "time_per_iteration": 5.686015367507935 - }, - { - "auxiliary_loss_clip": 0.01123791, - "auxiliary_loss_mlp": 0.01034202, - "balance_loss_clip": 1.04521751, - "balance_loss_mlp": 1.02074885, - "epoch": 0.4866376070945438, - "flos": 23547704169600.0, - "grad_norm": 1.581585117496142, - "language_loss": 0.67704266, - "learning_rate": 2.1833732865835545e-06, - "loss": 0.69862258, - "num_input_tokens_seen": 173980945, - "step": 8094, - "time_per_iteration": 2.5890355110168457 - }, - { - "auxiliary_loss_clip": 0.01109259, - "auxiliary_loss_mlp": 0.01038119, - "balance_loss_clip": 1.04752278, - "balance_loss_mlp": 1.02342701, - "epoch": 0.4866977303472118, - "flos": 16690598012160.0, - "grad_norm": 2.317379685093866, - "language_loss": 0.66784161, - "learning_rate": 2.1829854618758636e-06, - "loss": 0.68931544, - "num_input_tokens_seen": 173998860, - "step": 8095, - "time_per_iteration": 2.640468120574951 - }, - { - "auxiliary_loss_clip": 0.01110152, - "auxiliary_loss_mlp": 0.0103636, - "balance_loss_clip": 1.04456031, - "balance_loss_mlp": 1.02123296, - "epoch": 0.4867578535998798, - "flos": 17896011552000.0, - "grad_norm": 2.1481069791390346, - "language_loss": 0.78540075, - "learning_rate": 2.182597630229345e-06, - "loss": 0.80686581, - "num_input_tokens_seen": 174016665, - "step": 8096, - "time_per_iteration": 2.585015058517456 - }, - { - "auxiliary_loss_clip": 0.01092726, - "auxiliary_loss_mlp": 0.01036143, - "balance_loss_clip": 1.03732872, - "balance_loss_mlp": 1.02165902, - "epoch": 0.48681797685254774, - "flos": 22637799820800.0, - "grad_norm": 1.880706326191671, - "language_loss": 0.67753577, - "learning_rate": 2.1822097916587067e-06, - "loss": 0.69882447, - "num_input_tokens_seen": 174034800, - "step": 8097, - "time_per_iteration": 2.6526336669921875 - }, - { - "auxiliary_loss_clip": 0.01097124, - "auxiliary_loss_mlp": 0.01039294, - "balance_loss_clip": 1.04311764, - "balance_loss_mlp": 1.02491093, - "epoch": 0.4868781001052157, - "flos": 20886077352960.0, - "grad_norm": 1.6144910396326548, - "language_loss": 0.71414316, - "learning_rate": 2.1818219461786543e-06, - "loss": 0.73550731, - "num_input_tokens_seen": 174054445, - "step": 8098, - "time_per_iteration": 2.6669986248016357 - }, - { - "auxiliary_loss_clip": 0.01119656, - "auxiliary_loss_mlp": 0.01037345, - "balance_loss_clip": 1.04642081, - "balance_loss_mlp": 1.02226543, - "epoch": 0.48693822335788367, - "flos": 41974940937600.0, - "grad_norm": 2.9804894060925458, - "language_loss": 0.66267806, - "learning_rate": 2.1814340938038956e-06, - "loss": 0.68424809, - "num_input_tokens_seen": 174077890, - "step": 8099, - "time_per_iteration": 2.7542026042938232 - }, - { - "auxiliary_loss_clip": 0.01070284, - "auxiliary_loss_mlp": 0.01040695, - "balance_loss_clip": 1.0372566, - "balance_loss_mlp": 1.02712917, - "epoch": 0.48699834661055164, - "flos": 24243294021120.0, - "grad_norm": 1.700994432394141, - "language_loss": 0.66787708, - "learning_rate": 2.181046234549138e-06, - "loss": 0.6889869, - "num_input_tokens_seen": 174097460, - "step": 8100, - "time_per_iteration": 2.7499735355377197 - }, - { - "auxiliary_loss_clip": 0.01087635, - "auxiliary_loss_mlp": 0.01033762, - "balance_loss_clip": 1.04155445, - "balance_loss_mlp": 1.02084517, - "epoch": 0.4870584698632196, - "flos": 25923877603200.0, - "grad_norm": 1.427277688843355, - "language_loss": 0.76812327, - "learning_rate": 2.180658368429088e-06, - "loss": 0.78933728, - "num_input_tokens_seen": 174120775, - "step": 8101, - "time_per_iteration": 2.7710418701171875 - }, - { - "auxiliary_loss_clip": 0.010432, - "auxiliary_loss_mlp": 0.00999689, - "balance_loss_clip": 1.01742899, - "balance_loss_mlp": 0.99847281, - "epoch": 0.48711859311588757, - "flos": 70211933648640.0, - "grad_norm": 0.6877166097191185, - "language_loss": 0.52341712, - "learning_rate": 2.1802704954584565e-06, - "loss": 0.54384601, - "num_input_tokens_seen": 174189135, - "step": 8102, - "time_per_iteration": 3.3232975006103516 - }, - { - "auxiliary_loss_clip": 0.0109639, - "auxiliary_loss_mlp": 0.0103608, - "balance_loss_clip": 1.04584694, - "balance_loss_mlp": 1.02250218, - "epoch": 0.48717871636855553, - "flos": 12342964659840.0, - "grad_norm": 2.1242457938350885, - "language_loss": 0.7405737, - "learning_rate": 2.1798826156519484e-06, - "loss": 0.7618984, - "num_input_tokens_seen": 174203250, - "step": 8103, - "time_per_iteration": 2.6988277435302734 - }, - { - "auxiliary_loss_clip": 0.01116672, - "auxiliary_loss_mlp": 0.01043644, - "balance_loss_clip": 1.04631233, - "balance_loss_mlp": 1.0288384, - "epoch": 0.4872388396212235, - "flos": 23477139901440.0, - "grad_norm": 1.6106517558680102, - "language_loss": 0.63064033, - "learning_rate": 2.1794947290242737e-06, - "loss": 0.65224349, - "num_input_tokens_seen": 174224145, - "step": 8104, - "time_per_iteration": 2.629725456237793 - }, - { - "auxiliary_loss_clip": 0.01125564, - "auxiliary_loss_mlp": 0.01032477, - "balance_loss_clip": 1.04695344, - "balance_loss_mlp": 1.01885152, - "epoch": 0.48729896287389146, - "flos": 31427582186880.0, - "grad_norm": 2.7588286364308217, - "language_loss": 0.69136071, - "learning_rate": 2.1791068355901413e-06, - "loss": 0.71294117, - "num_input_tokens_seen": 174244435, - "step": 8105, - "time_per_iteration": 2.6670045852661133 - }, - { - "auxiliary_loss_clip": 0.01084626, - "auxiliary_loss_mlp": 0.01030665, - "balance_loss_clip": 1.04264283, - "balance_loss_mlp": 1.01766491, - "epoch": 0.4873590861265594, - "flos": 19057936700160.0, - "grad_norm": 2.072109036230495, - "language_loss": 0.73534381, - "learning_rate": 2.178718935364259e-06, - "loss": 0.75649679, - "num_input_tokens_seen": 174262710, - "step": 8106, - "time_per_iteration": 2.679194927215576 - }, - { - "auxiliary_loss_clip": 0.01107932, - "auxiliary_loss_mlp": 0.00772241, - "balance_loss_clip": 1.04675412, - "balance_loss_mlp": 1.00038791, - "epoch": 0.4874192093792274, - "flos": 24348296453760.0, - "grad_norm": 2.6438945384360157, - "language_loss": 0.76877642, - "learning_rate": 2.1783310283613373e-06, - "loss": 0.78757817, - "num_input_tokens_seen": 174281545, - "step": 8107, - "time_per_iteration": 2.6732285022735596 - }, - { - "auxiliary_loss_clip": 0.01071333, - "auxiliary_loss_mlp": 0.01032073, - "balance_loss_clip": 1.04327512, - "balance_loss_mlp": 1.01932359, - "epoch": 0.4874793326318954, - "flos": 23112610727040.0, - "grad_norm": 3.5135482389125583, - "language_loss": 0.75034302, - "learning_rate": 2.1779431145960853e-06, - "loss": 0.77137709, - "num_input_tokens_seen": 174300290, - "step": 8108, - "time_per_iteration": 2.8071932792663574 - }, - { - "auxiliary_loss_clip": 0.01111368, - "auxiliary_loss_mlp": 0.01030979, - "balance_loss_clip": 1.04524517, - "balance_loss_mlp": 1.01917136, - "epoch": 0.4875394558845634, - "flos": 19026156142080.0, - "grad_norm": 1.7033835018380465, - "language_loss": 0.73611033, - "learning_rate": 2.177555194083212e-06, - "loss": 0.75753379, - "num_input_tokens_seen": 174318490, - "step": 8109, - "time_per_iteration": 2.642854928970337 - }, - { - "auxiliary_loss_clip": 0.01108586, - "auxiliary_loss_mlp": 0.01031639, - "balance_loss_clip": 1.04274952, - "balance_loss_mlp": 1.01813245, - "epoch": 0.48759957913723134, - "flos": 21433607343360.0, - "grad_norm": 1.8383730211114537, - "language_loss": 0.78698927, - "learning_rate": 2.177167266837428e-06, - "loss": 0.80839157, - "num_input_tokens_seen": 174335505, - "step": 8110, - "time_per_iteration": 2.6471641063690186 - }, - { - "auxiliary_loss_clip": 0.01114056, - "auxiliary_loss_mlp": 0.01041552, - "balance_loss_clip": 1.04712057, - "balance_loss_mlp": 1.02802181, - "epoch": 0.4876597023898993, - "flos": 17748669962880.0, - "grad_norm": 1.8514316559502986, - "language_loss": 0.72086185, - "learning_rate": 2.176779332873444e-06, - "loss": 0.74241793, - "num_input_tokens_seen": 174353990, - "step": 8111, - "time_per_iteration": 2.6277401447296143 - }, - { - "auxiliary_loss_clip": 0.01113402, - "auxiliary_loss_mlp": 0.01036579, - "balance_loss_clip": 1.04676926, - "balance_loss_mlp": 1.02329946, - "epoch": 0.4877198256425673, - "flos": 17019647527680.0, - "grad_norm": 1.5795214961704311, - "language_loss": 0.76318377, - "learning_rate": 2.17639139220597e-06, - "loss": 0.78468353, - "num_input_tokens_seen": 174373425, - "step": 8112, - "time_per_iteration": 2.598010301589966 - }, - { - "auxiliary_loss_clip": 0.01117365, - "auxiliary_loss_mlp": 0.01038377, - "balance_loss_clip": 1.04562628, - "balance_loss_mlp": 1.02425683, - "epoch": 0.48777994889523524, - "flos": 22384091082240.0, - "grad_norm": 1.710789031048389, - "language_loss": 0.75035822, - "learning_rate": 2.1760034448497166e-06, - "loss": 0.77191567, - "num_input_tokens_seen": 174393070, - "step": 8113, - "time_per_iteration": 2.6348531246185303 - }, - { - "auxiliary_loss_clip": 0.01028141, - "auxiliary_loss_mlp": 0.0075288, - "balance_loss_clip": 1.02038229, - "balance_loss_mlp": 1.0004046, - "epoch": 0.4878400721479032, - "flos": 61241772159360.0, - "grad_norm": 0.77879843500845, - "language_loss": 0.4887349, - "learning_rate": 2.1756154908193943e-06, - "loss": 0.50654507, - "num_input_tokens_seen": 174446880, - "step": 8114, - "time_per_iteration": 3.1273062229156494 - }, - { - "auxiliary_loss_clip": 0.0109717, - "auxiliary_loss_mlp": 0.01040496, - "balance_loss_clip": 1.04649258, - "balance_loss_mlp": 1.02591658, - "epoch": 0.48790019540057117, - "flos": 24536612482560.0, - "grad_norm": 1.616579350296871, - "language_loss": 0.76760268, - "learning_rate": 2.1752275301297155e-06, - "loss": 0.78897941, - "num_input_tokens_seen": 174468485, - "step": 8115, - "time_per_iteration": 2.759444236755371 - }, - { - "auxiliary_loss_clip": 0.01107443, - "auxiliary_loss_mlp": 0.01033169, - "balance_loss_clip": 1.0478245, - "balance_loss_mlp": 1.01930535, - "epoch": 0.48796031865323913, - "flos": 21833939399040.0, - "grad_norm": 2.031601085778298, - "language_loss": 0.71910083, - "learning_rate": 2.1748395627953915e-06, - "loss": 0.74050689, - "num_input_tokens_seen": 174486360, - "step": 8116, - "time_per_iteration": 2.7063751220703125 - }, - { - "auxiliary_loss_clip": 0.01088547, - "auxiliary_loss_mlp": 0.01035995, - "balance_loss_clip": 1.04164481, - "balance_loss_mlp": 1.02276874, - "epoch": 0.4880204419059071, - "flos": 18588907883520.0, - "grad_norm": 3.4734402196051, - "language_loss": 0.63002747, - "learning_rate": 2.1744515888311335e-06, - "loss": 0.65127283, - "num_input_tokens_seen": 174505075, - "step": 8117, - "time_per_iteration": 2.713792562484741 - }, - { - "auxiliary_loss_clip": 0.01093551, - "auxiliary_loss_mlp": 0.01042447, - "balance_loss_clip": 1.04097366, - "balance_loss_mlp": 1.02740264, - "epoch": 0.48808056515857506, - "flos": 19172168928000.0, - "grad_norm": 1.6679530296862457, - "language_loss": 0.79487926, - "learning_rate": 2.1740636082516533e-06, - "loss": 0.81623924, - "num_input_tokens_seen": 174523385, - "step": 8118, - "time_per_iteration": 2.6479125022888184 - }, - { - "auxiliary_loss_clip": 0.01102071, - "auxiliary_loss_mlp": 0.01036823, - "balance_loss_clip": 1.04363036, - "balance_loss_mlp": 1.02295303, - "epoch": 0.48814068841124303, - "flos": 20120497850880.0, - "grad_norm": 1.8682176240686432, - "language_loss": 0.6328088, - "learning_rate": 2.1736756210716645e-06, - "loss": 0.65419775, - "num_input_tokens_seen": 174542200, - "step": 8119, - "time_per_iteration": 2.6599643230438232 - }, - { - "auxiliary_loss_clip": 0.01061047, - "auxiliary_loss_mlp": 0.00770426, - "balance_loss_clip": 1.04209542, - "balance_loss_mlp": 1.00037444, - "epoch": 0.488200811663911, - "flos": 22965592360320.0, - "grad_norm": 1.676805190577927, - "language_loss": 0.72166741, - "learning_rate": 2.173287627305878e-06, - "loss": 0.73998219, - "num_input_tokens_seen": 174563620, - "step": 8120, - "time_per_iteration": 2.795185089111328 - }, - { - "auxiliary_loss_clip": 0.01118613, - "auxiliary_loss_mlp": 0.01031295, - "balance_loss_clip": 1.0469954, - "balance_loss_mlp": 1.01728177, - "epoch": 0.48826093491657896, - "flos": 33910697387520.0, - "grad_norm": 2.388334225725702, - "language_loss": 0.63951784, - "learning_rate": 2.1728996269690075e-06, - "loss": 0.66101694, - "num_input_tokens_seen": 174586465, - "step": 8121, - "time_per_iteration": 2.7527153491973877 - }, - { - "auxiliary_loss_clip": 0.01112786, - "auxiliary_loss_mlp": 0.01036976, - "balance_loss_clip": 1.04261351, - "balance_loss_mlp": 1.02283835, - "epoch": 0.488321058169247, - "flos": 23070307484160.0, - "grad_norm": 1.985568603421553, - "language_loss": 0.82805705, - "learning_rate": 2.1725116200757664e-06, - "loss": 0.84955472, - "num_input_tokens_seen": 174604035, - "step": 8122, - "time_per_iteration": 2.668754816055298 - }, - { - "auxiliary_loss_clip": 0.0111403, - "auxiliary_loss_mlp": 0.01043394, - "balance_loss_clip": 1.04526711, - "balance_loss_mlp": 1.02749181, - "epoch": 0.48838118142191494, - "flos": 19317714837120.0, - "grad_norm": 1.7149683973709622, - "language_loss": 0.85272485, - "learning_rate": 2.172123606640866e-06, - "loss": 0.87429905, - "num_input_tokens_seen": 174621715, - "step": 8123, - "time_per_iteration": 2.6014883518218994 - }, - { - "auxiliary_loss_clip": 0.01090574, - "auxiliary_loss_mlp": 0.01031767, - "balance_loss_clip": 1.04448855, - "balance_loss_mlp": 1.0185523, - "epoch": 0.4884413046745829, - "flos": 25410678036480.0, - "grad_norm": 1.3909354864913257, - "language_loss": 0.85614896, - "learning_rate": 2.1717355866790227e-06, - "loss": 0.87737238, - "num_input_tokens_seen": 174643835, - "step": 8124, - "time_per_iteration": 2.754786968231201 - }, - { - "auxiliary_loss_clip": 0.01103222, - "auxiliary_loss_mlp": 0.01031579, - "balance_loss_clip": 1.04439664, - "balance_loss_mlp": 1.0179534, - "epoch": 0.4885014279272509, - "flos": 20991546662400.0, - "grad_norm": 1.926010658269172, - "language_loss": 0.79547518, - "learning_rate": 2.171347560204948e-06, - "loss": 0.81682324, - "num_input_tokens_seen": 174660955, - "step": 8125, - "time_per_iteration": 2.667335271835327 - }, - { - "auxiliary_loss_clip": 0.01078395, - "auxiliary_loss_mlp": 0.01040727, - "balance_loss_clip": 1.04347515, - "balance_loss_mlp": 1.0263145, - "epoch": 0.48856155117991884, - "flos": 13771599269760.0, - "grad_norm": 2.02778788313487, - "language_loss": 0.72584462, - "learning_rate": 2.170959527233356e-06, - "loss": 0.74703586, - "num_input_tokens_seen": 174678270, - "step": 8126, - "time_per_iteration": 2.7370314598083496 - }, - { - "auxiliary_loss_clip": 0.0111111, - "auxiliary_loss_mlp": 0.01038149, - "balance_loss_clip": 1.0410614, - "balance_loss_mlp": 1.02405286, - "epoch": 0.4886216744325868, - "flos": 32087764206720.0, - "grad_norm": 1.7703486674415694, - "language_loss": 0.68917644, - "learning_rate": 2.1705714877789633e-06, - "loss": 0.71066898, - "num_input_tokens_seen": 174698360, - "step": 8127, - "time_per_iteration": 2.811074733734131 - }, - { - "auxiliary_loss_clip": 0.01125381, - "auxiliary_loss_mlp": 0.01033584, - "balance_loss_clip": 1.04334533, - "balance_loss_mlp": 1.01993454, - "epoch": 0.48868179768525477, - "flos": 19610063631360.0, - "grad_norm": 1.5960676368468543, - "language_loss": 0.76178646, - "learning_rate": 2.170183441856481e-06, - "loss": 0.78337616, - "num_input_tokens_seen": 174716755, - "step": 8128, - "time_per_iteration": 2.5751638412475586 - }, - { - "auxiliary_loss_clip": 0.01126548, - "auxiliary_loss_mlp": 0.01031229, - "balance_loss_clip": 1.04598355, - "balance_loss_mlp": 1.01818776, - "epoch": 0.48874192093792274, - "flos": 21286912199040.0, - "grad_norm": 1.5334009671548041, - "language_loss": 0.7574327, - "learning_rate": 2.1697953894806265e-06, - "loss": 0.77901042, - "num_input_tokens_seen": 174735560, - "step": 8129, - "time_per_iteration": 4.080120325088501 - }, - { - "auxiliary_loss_clip": 0.01113338, - "auxiliary_loss_mlp": 0.01031411, - "balance_loss_clip": 1.04372275, - "balance_loss_mlp": 1.0174098, - "epoch": 0.4888020441905907, - "flos": 14173439696640.0, - "grad_norm": 2.756799094025314, - "language_loss": 0.64951944, - "learning_rate": 2.169407330666114e-06, - "loss": 0.67096692, - "num_input_tokens_seen": 174752730, - "step": 8130, - "time_per_iteration": 4.153359413146973 - }, - { - "auxiliary_loss_clip": 0.01087218, - "auxiliary_loss_mlp": 0.01036252, - "balance_loss_clip": 1.0399828, - "balance_loss_mlp": 1.02282333, - "epoch": 0.48886216744325867, - "flos": 24097891766400.0, - "grad_norm": 1.9114203912665453, - "language_loss": 0.72505724, - "learning_rate": 2.169019265427658e-06, - "loss": 0.746292, - "num_input_tokens_seen": 174772520, - "step": 8131, - "time_per_iteration": 2.751070499420166 - }, - { - "auxiliary_loss_clip": 0.0111646, - "auxiliary_loss_mlp": 0.01041385, - "balance_loss_clip": 1.04625905, - "balance_loss_mlp": 1.0270561, - "epoch": 0.48892229069592663, - "flos": 38431419402240.0, - "grad_norm": 1.3981624070335212, - "language_loss": 0.69684219, - "learning_rate": 2.1686311937799745e-06, - "loss": 0.71842068, - "num_input_tokens_seen": 174796540, - "step": 8132, - "time_per_iteration": 4.478942632675171 - }, - { - "auxiliary_loss_clip": 0.01109765, - "auxiliary_loss_mlp": 0.01030128, - "balance_loss_clip": 1.04673529, - "balance_loss_mlp": 1.01630616, - "epoch": 0.4889824139485946, - "flos": 23843321101440.0, - "grad_norm": 1.328560083390073, - "language_loss": 0.69882882, - "learning_rate": 2.1682431157377797e-06, - "loss": 0.72022772, - "num_input_tokens_seen": 174817840, - "step": 8133, - "time_per_iteration": 4.2415807247161865 - }, - { - "auxiliary_loss_clip": 0.01062397, - "auxiliary_loss_mlp": 0.01042948, - "balance_loss_clip": 1.03593254, - "balance_loss_mlp": 1.02922726, - "epoch": 0.48904253720126256, - "flos": 24425827960320.0, - "grad_norm": 1.919712430573748, - "language_loss": 0.70950568, - "learning_rate": 2.1678550313157883e-06, - "loss": 0.73055917, - "num_input_tokens_seen": 174837885, - "step": 8134, - "time_per_iteration": 2.772383689880371 - }, - { - "auxiliary_loss_clip": 0.01084139, - "auxiliary_loss_mlp": 0.01035376, - "balance_loss_clip": 1.04342508, - "balance_loss_mlp": 1.02082086, - "epoch": 0.4891026604539306, - "flos": 24170682677760.0, - "grad_norm": 1.9244253075686233, - "language_loss": 0.80356431, - "learning_rate": 2.167466940528718e-06, - "loss": 0.82475942, - "num_input_tokens_seen": 174855240, - "step": 8135, - "time_per_iteration": 2.7362964153289795 - }, - { - "auxiliary_loss_clip": 0.01124035, - "auxiliary_loss_mlp": 0.01035694, - "balance_loss_clip": 1.04567957, - "balance_loss_mlp": 1.0232842, - "epoch": 0.48916278370659855, - "flos": 21470954509440.0, - "grad_norm": 1.8037329109010316, - "language_loss": 0.74794912, - "learning_rate": 2.1670788433912843e-06, - "loss": 0.76954633, - "num_input_tokens_seen": 174875145, - "step": 8136, - "time_per_iteration": 2.766477346420288 - }, - { - "auxiliary_loss_clip": 0.01097387, - "auxiliary_loss_mlp": 0.01043558, - "balance_loss_clip": 1.04352307, - "balance_loss_mlp": 1.02971756, - "epoch": 0.4892229069592665, - "flos": 22309755886080.0, - "grad_norm": 1.6588593954338173, - "language_loss": 0.73403543, - "learning_rate": 2.166690739918204e-06, - "loss": 0.75544488, - "num_input_tokens_seen": 174894770, - "step": 8137, - "time_per_iteration": 2.720778703689575 - }, - { - "auxiliary_loss_clip": 0.01051073, - "auxiliary_loss_mlp": 0.01031061, - "balance_loss_clip": 1.03699243, - "balance_loss_mlp": 1.01726234, - "epoch": 0.4892830302119345, - "flos": 12786856934400.0, - "grad_norm": 2.090077124931452, - "language_loss": 0.75336611, - "learning_rate": 2.1663026301241944e-06, - "loss": 0.77418739, - "num_input_tokens_seen": 174912780, - "step": 8138, - "time_per_iteration": 2.7975735664367676 - }, - { - "auxiliary_loss_clip": 0.01091927, - "auxiliary_loss_mlp": 0.01038351, - "balance_loss_clip": 1.04700375, - "balance_loss_mlp": 1.02536893, - "epoch": 0.48934315346460244, - "flos": 20813896972800.0, - "grad_norm": 1.6152276292204855, - "language_loss": 0.74018902, - "learning_rate": 2.165914514023972e-06, - "loss": 0.76149184, - "num_input_tokens_seen": 174931250, - "step": 8139, - "time_per_iteration": 2.7135186195373535 - }, - { - "auxiliary_loss_clip": 0.01115319, - "auxiliary_loss_mlp": 0.0103739, - "balance_loss_clip": 1.04502773, - "balance_loss_mlp": 1.02416921, - "epoch": 0.4894032767172704, - "flos": 19755537713280.0, - "grad_norm": 1.878714628680016, - "language_loss": 0.62168998, - "learning_rate": 2.165526391632255e-06, - "loss": 0.64321709, - "num_input_tokens_seen": 174951105, - "step": 8140, - "time_per_iteration": 2.6594550609588623 - }, - { - "auxiliary_loss_clip": 0.0109215, - "auxiliary_loss_mlp": 0.01040102, - "balance_loss_clip": 1.04310822, - "balance_loss_mlp": 1.02509928, - "epoch": 0.4894633999699384, - "flos": 17818982835840.0, - "grad_norm": 1.7004882369900214, - "language_loss": 0.82400143, - "learning_rate": 2.1651382629637608e-06, - "loss": 0.84532392, - "num_input_tokens_seen": 174969120, - "step": 8141, - "time_per_iteration": 2.648696184158325 - }, - { - "auxiliary_loss_clip": 0.01095522, - "auxiliary_loss_mlp": 0.01034005, - "balance_loss_clip": 1.04897892, - "balance_loss_mlp": 1.01975965, - "epoch": 0.48952352322260634, - "flos": 25523222325120.0, - "grad_norm": 1.6750975318537598, - "language_loss": 0.72031653, - "learning_rate": 2.1647501280332066e-06, - "loss": 0.74161184, - "num_input_tokens_seen": 174991295, - "step": 8142, - "time_per_iteration": 2.770524740219116 - }, - { - "auxiliary_loss_clip": 0.01124129, - "auxiliary_loss_mlp": 0.01033852, - "balance_loss_clip": 1.04588366, - "balance_loss_mlp": 1.02094769, - "epoch": 0.4895836464752743, - "flos": 29055502903680.0, - "grad_norm": 8.902000760681485, - "language_loss": 0.66877794, - "learning_rate": 2.1643619868553105e-06, - "loss": 0.6903578, - "num_input_tokens_seen": 175012830, - "step": 8143, - "time_per_iteration": 2.717714786529541 - }, - { - "auxiliary_loss_clip": 0.01116098, - "auxiliary_loss_mlp": 0.00770078, - "balance_loss_clip": 1.04774415, - "balance_loss_mlp": 1.00015235, - "epoch": 0.48964376972794227, - "flos": 33546958312320.0, - "grad_norm": 1.880195910988658, - "language_loss": 0.75596797, - "learning_rate": 2.163973839444793e-06, - "loss": 0.77482975, - "num_input_tokens_seen": 175035695, - "step": 8144, - "time_per_iteration": 2.801825761795044 - }, - { - "auxiliary_loss_clip": 0.01099436, - "auxiliary_loss_mlp": 0.01031587, - "balance_loss_clip": 1.04169714, - "balance_loss_mlp": 1.01753187, - "epoch": 0.48970389298061023, - "flos": 22054035985920.0, - "grad_norm": 1.9123659180679726, - "language_loss": 0.75693774, - "learning_rate": 2.1635856858163695e-06, - "loss": 0.77824795, - "num_input_tokens_seen": 175056425, - "step": 8145, - "time_per_iteration": 2.781550168991089 - }, - { - "auxiliary_loss_clip": 0.01108869, - "auxiliary_loss_mlp": 0.0077212, - "balance_loss_clip": 1.04549527, - "balance_loss_mlp": 1.00018287, - "epoch": 0.4897640162332782, - "flos": 20084299920000.0, - "grad_norm": 1.6675270752681912, - "language_loss": 0.80437362, - "learning_rate": 2.163197525984761e-06, - "loss": 0.82318354, - "num_input_tokens_seen": 175074800, - "step": 8146, - "time_per_iteration": 2.699277400970459 - }, - { - "auxiliary_loss_clip": 0.01109996, - "auxiliary_loss_mlp": 0.01033581, - "balance_loss_clip": 1.04312873, - "balance_loss_mlp": 1.02007508, - "epoch": 0.48982413948594616, - "flos": 23806225330560.0, - "grad_norm": 2.022171046548427, - "language_loss": 0.74193209, - "learning_rate": 2.162809359964687e-06, - "loss": 0.76336789, - "num_input_tokens_seen": 175094500, - "step": 8147, - "time_per_iteration": 2.732973337173462 - }, - { - "auxiliary_loss_clip": 0.01095071, - "auxiliary_loss_mlp": 0.01032519, - "balance_loss_clip": 1.0448947, - "balance_loss_mlp": 1.0193938, - "epoch": 0.4898842627386142, - "flos": 17639645207040.0, - "grad_norm": 2.1017800501084882, - "language_loss": 0.8286857, - "learning_rate": 2.162421187770864e-06, - "loss": 0.84996164, - "num_input_tokens_seen": 175112920, - "step": 8148, - "time_per_iteration": 2.662179708480835 - }, - { - "auxiliary_loss_clip": 0.01091374, - "auxiliary_loss_mlp": 0.01033444, - "balance_loss_clip": 1.04345882, - "balance_loss_mlp": 1.0213387, - "epoch": 0.48994438599128215, - "flos": 16617914841600.0, - "grad_norm": 1.9007753197415815, - "language_loss": 0.74256468, - "learning_rate": 2.162033009418015e-06, - "loss": 0.76381284, - "num_input_tokens_seen": 175129910, - "step": 8149, - "time_per_iteration": 2.7373321056365967 - }, - { - "auxiliary_loss_clip": 0.01130985, - "auxiliary_loss_mlp": 0.01037014, - "balance_loss_clip": 1.04766726, - "balance_loss_mlp": 1.02247095, - "epoch": 0.4900045092439501, - "flos": 26614834600320.0, - "grad_norm": 1.7000980888808985, - "language_loss": 0.76319683, - "learning_rate": 2.1616448249208567e-06, - "loss": 0.78487676, - "num_input_tokens_seen": 175148705, - "step": 8150, - "time_per_iteration": 2.653003692626953 - }, - { - "auxiliary_loss_clip": 0.01103787, - "auxiliary_loss_mlp": 0.01035673, - "balance_loss_clip": 1.04736936, - "balance_loss_mlp": 1.02152276, - "epoch": 0.4900646324966181, - "flos": 19902125116800.0, - "grad_norm": 2.127966402053614, - "language_loss": 0.72754669, - "learning_rate": 2.1612566342941106e-06, - "loss": 0.7489413, - "num_input_tokens_seen": 175167425, - "step": 8151, - "time_per_iteration": 2.7142715454101562 - }, - { - "auxiliary_loss_clip": 0.01018676, - "auxiliary_loss_mlp": 0.01008139, - "balance_loss_clip": 1.02870607, - "balance_loss_mlp": 1.00680435, - "epoch": 0.49012475574928605, - "flos": 59189620337280.0, - "grad_norm": 0.8300028938034224, - "language_loss": 0.54350889, - "learning_rate": 2.1608684375524977e-06, - "loss": 0.56377703, - "num_input_tokens_seen": 175227985, - "step": 8152, - "time_per_iteration": 3.218646764755249 - }, - { - "auxiliary_loss_clip": 0.01066533, - "auxiliary_loss_mlp": 0.01034489, - "balance_loss_clip": 1.04041779, - "balance_loss_mlp": 1.02058959, - "epoch": 0.490184879001954, - "flos": 45259797657600.0, - "grad_norm": 1.9767488244056508, - "language_loss": 0.61212152, - "learning_rate": 2.1604802347107364e-06, - "loss": 0.6331318, - "num_input_tokens_seen": 175251895, - "step": 8153, - "time_per_iteration": 3.043501615524292 - }, - { - "auxiliary_loss_clip": 0.01091315, - "auxiliary_loss_mlp": 0.01034977, - "balance_loss_clip": 1.04408598, - "balance_loss_mlp": 1.02139306, - "epoch": 0.490245002254622, - "flos": 28002135634560.0, - "grad_norm": 1.494326859026801, - "language_loss": 0.767699, - "learning_rate": 2.160092025783549e-06, - "loss": 0.78896195, - "num_input_tokens_seen": 175272770, - "step": 8154, - "time_per_iteration": 2.783686399459839 - }, - { - "auxiliary_loss_clip": 0.01032948, - "auxiliary_loss_mlp": 0.01009488, - "balance_loss_clip": 1.02573824, - "balance_loss_mlp": 1.00805795, - "epoch": 0.49030512550728994, - "flos": 58951318533120.0, - "grad_norm": 0.9569310885457037, - "language_loss": 0.6699397, - "learning_rate": 2.1597038107856564e-06, - "loss": 0.69036406, - "num_input_tokens_seen": 175336320, - "step": 8155, - "time_per_iteration": 3.2836861610412598 - }, - { - "auxiliary_loss_clip": 0.01128627, - "auxiliary_loss_mlp": 0.01033153, - "balance_loss_clip": 1.04858041, - "balance_loss_mlp": 1.01990271, - "epoch": 0.4903652487599579, - "flos": 19791843384960.0, - "grad_norm": 1.7952288566158678, - "language_loss": 0.76406527, - "learning_rate": 2.1593155897317784e-06, - "loss": 0.78568316, - "num_input_tokens_seen": 175353540, - "step": 8156, - "time_per_iteration": 2.77978515625 - }, - { - "auxiliary_loss_clip": 0.01115952, - "auxiliary_loss_mlp": 0.01033945, - "balance_loss_clip": 1.04693031, - "balance_loss_mlp": 1.02066517, - "epoch": 0.49042537201262587, - "flos": 21762082241280.0, - "grad_norm": 2.671892010748055, - "language_loss": 0.83756495, - "learning_rate": 2.1589273626366377e-06, - "loss": 0.85906386, - "num_input_tokens_seen": 175370445, - "step": 8157, - "time_per_iteration": 2.6860296726226807 - }, - { - "auxiliary_loss_clip": 0.01116981, - "auxiliary_loss_mlp": 0.0103483, - "balance_loss_clip": 1.04626417, - "balance_loss_mlp": 1.02103734, - "epoch": 0.49048549526529384, - "flos": 18953042008320.0, - "grad_norm": 1.6916175452091182, - "language_loss": 0.79447746, - "learning_rate": 2.158539129514956e-06, - "loss": 0.81599557, - "num_input_tokens_seen": 175389020, - "step": 8158, - "time_per_iteration": 2.723398208618164 - }, - { - "auxiliary_loss_clip": 0.01130092, - "auxiliary_loss_mlp": 0.01036013, - "balance_loss_clip": 1.0493114, - "balance_loss_mlp": 1.02237535, - "epoch": 0.4905456185179618, - "flos": 26906393295360.0, - "grad_norm": 1.5924994780725177, - "language_loss": 0.69469124, - "learning_rate": 2.158150890381454e-06, - "loss": 0.71635228, - "num_input_tokens_seen": 175409545, - "step": 8159, - "time_per_iteration": 2.685887575149536 - }, - { - "auxiliary_loss_clip": 0.01109209, - "auxiliary_loss_mlp": 0.01041597, - "balance_loss_clip": 1.04416955, - "balance_loss_mlp": 1.02719688, - "epoch": 0.49060574177062977, - "flos": 20412343854720.0, - "grad_norm": 1.8488353997421354, - "language_loss": 0.73372805, - "learning_rate": 2.157762645250854e-06, - "loss": 0.75523615, - "num_input_tokens_seen": 175429335, - "step": 8160, - "time_per_iteration": 2.7002642154693604 - }, - { - "auxiliary_loss_clip": 0.01111433, - "auxiliary_loss_mlp": 0.01040851, - "balance_loss_clip": 1.04374194, - "balance_loss_mlp": 1.02655184, - "epoch": 0.4906658650232978, - "flos": 17493704248320.0, - "grad_norm": 4.058452856445761, - "language_loss": 0.71791285, - "learning_rate": 2.1573743941378796e-06, - "loss": 0.73943567, - "num_input_tokens_seen": 175446955, - "step": 8161, - "time_per_iteration": 2.641211748123169 - }, - { - "auxiliary_loss_clip": 0.01077408, - "auxiliary_loss_mlp": 0.01036857, - "balance_loss_clip": 1.04114866, - "balance_loss_mlp": 1.02337408, - "epoch": 0.49072598827596575, - "flos": 26614439550720.0, - "grad_norm": 1.5881872934975843, - "language_loss": 0.68676394, - "learning_rate": 2.1569861370572517e-06, - "loss": 0.7079066, - "num_input_tokens_seen": 175468195, - "step": 8162, - "time_per_iteration": 2.7768666744232178 - }, - { - "auxiliary_loss_clip": 0.01114289, - "auxiliary_loss_mlp": 0.01037181, - "balance_loss_clip": 1.04699993, - "balance_loss_mlp": 1.02219641, - "epoch": 0.4907861115286337, - "flos": 20412595249920.0, - "grad_norm": 1.6090900616469643, - "language_loss": 0.63697332, - "learning_rate": 2.1565978740236944e-06, - "loss": 0.65848798, - "num_input_tokens_seen": 175487455, - "step": 8163, - "time_per_iteration": 2.658141851425171 - }, - { - "auxiliary_loss_clip": 0.01086004, - "auxiliary_loss_mlp": 0.01032891, - "balance_loss_clip": 1.03996313, - "balance_loss_mlp": 1.01987886, - "epoch": 0.4908462347813017, - "flos": 14064271286400.0, - "grad_norm": 2.5242130171230954, - "language_loss": 0.77383208, - "learning_rate": 2.1562096050519293e-06, - "loss": 0.79502106, - "num_input_tokens_seen": 175504450, - "step": 8164, - "time_per_iteration": 2.6626484394073486 - }, - { - "auxiliary_loss_clip": 0.01110027, - "auxiliary_loss_mlp": 0.01037706, - "balance_loss_clip": 1.04298282, - "balance_loss_mlp": 1.0221138, - "epoch": 0.49090635803396965, - "flos": 18735100237440.0, - "grad_norm": 1.6753117148295888, - "language_loss": 0.76749474, - "learning_rate": 2.1558213301566806e-06, - "loss": 0.78897208, - "num_input_tokens_seen": 175523600, - "step": 8165, - "time_per_iteration": 2.5757079124450684 - }, - { - "auxiliary_loss_clip": 0.0110394, - "auxiliary_loss_mlp": 0.01035745, - "balance_loss_clip": 1.04666007, - "balance_loss_mlp": 1.02205336, - "epoch": 0.4909664812866376, - "flos": 20558500295040.0, - "grad_norm": 1.5531816235742995, - "language_loss": 0.77461708, - "learning_rate": 2.1554330493526716e-06, - "loss": 0.79601395, - "num_input_tokens_seen": 175542720, - "step": 8166, - "time_per_iteration": 2.7169244289398193 - }, - { - "auxiliary_loss_clip": 0.01040608, - "auxiliary_loss_mlp": 0.00998968, - "balance_loss_clip": 1.02393854, - "balance_loss_mlp": 0.99768084, - "epoch": 0.4910266045393056, - "flos": 54684017948160.0, - "grad_norm": 0.7914566078875801, - "language_loss": 0.54175258, - "learning_rate": 2.1550447626546253e-06, - "loss": 0.56214833, - "num_input_tokens_seen": 175598640, - "step": 8167, - "time_per_iteration": 3.192706823348999 - }, - { - "auxiliary_loss_clip": 0.01081549, - "auxiliary_loss_mlp": 0.01036447, - "balance_loss_clip": 1.04554164, - "balance_loss_mlp": 1.02288687, - "epoch": 0.49108672779197354, - "flos": 16246454342400.0, - "grad_norm": 1.702915470367474, - "language_loss": 0.85894108, - "learning_rate": 2.1546564700772665e-06, - "loss": 0.88012105, - "num_input_tokens_seen": 175615675, - "step": 8168, - "time_per_iteration": 2.7353274822235107 - }, - { - "auxiliary_loss_clip": 0.01107152, - "auxiliary_loss_mlp": 0.01045094, - "balance_loss_clip": 1.04374826, - "balance_loss_mlp": 1.030586, - "epoch": 0.4911468510446415, - "flos": 19825419623040.0, - "grad_norm": 1.7298624053450853, - "language_loss": 0.73407066, - "learning_rate": 2.1542681716353193e-06, - "loss": 0.75559318, - "num_input_tokens_seen": 175632255, - "step": 8169, - "time_per_iteration": 5.773583173751831 - }, - { - "auxiliary_loss_clip": 0.01112799, - "auxiliary_loss_mlp": 0.01029653, - "balance_loss_clip": 1.04443777, - "balance_loss_mlp": 1.01692092, - "epoch": 0.4912069742973095, - "flos": 21212684743680.0, - "grad_norm": 1.4410309608870682, - "language_loss": 0.77824241, - "learning_rate": 2.1538798673435068e-06, - "loss": 0.79966694, - "num_input_tokens_seen": 175651625, - "step": 8170, - "time_per_iteration": 2.6583240032196045 - }, - { - "auxiliary_loss_clip": 0.01096689, - "auxiliary_loss_mlp": 0.010389, - "balance_loss_clip": 1.04164565, - "balance_loss_mlp": 1.02643037, - "epoch": 0.49126709754997744, - "flos": 19537129065600.0, - "grad_norm": 2.2423824181328853, - "language_loss": 0.76314211, - "learning_rate": 2.1534915572165545e-06, - "loss": 0.78449798, - "num_input_tokens_seen": 175669265, - "step": 8171, - "time_per_iteration": 4.3524169921875 - }, - { - "auxiliary_loss_clip": 0.01104096, - "auxiliary_loss_mlp": 0.01036347, - "balance_loss_clip": 1.04284763, - "balance_loss_mlp": 1.02299559, - "epoch": 0.4913272208026454, - "flos": 12239686080000.0, - "grad_norm": 1.898078833449508, - "language_loss": 0.82055932, - "learning_rate": 2.1531032412691875e-06, - "loss": 0.84196377, - "num_input_tokens_seen": 175686065, - "step": 8172, - "time_per_iteration": 4.201699495315552 - }, - { - "auxiliary_loss_clip": 0.0104227, - "auxiliary_loss_mlp": 0.01009809, - "balance_loss_clip": 1.02604604, - "balance_loss_mlp": 1.00842655, - "epoch": 0.49138734405531337, - "flos": 65465871661440.0, - "grad_norm": 0.6872688544677212, - "language_loss": 0.53258997, - "learning_rate": 2.1527149195161295e-06, - "loss": 0.55311078, - "num_input_tokens_seen": 175748595, - "step": 8173, - "time_per_iteration": 3.1827917098999023 - }, - { - "auxiliary_loss_clip": 0.0111451, - "auxiliary_loss_mlp": 0.00771219, - "balance_loss_clip": 1.04312336, - "balance_loss_mlp": 1.00013208, - "epoch": 0.4914474673079814, - "flos": 18439052342400.0, - "grad_norm": 2.1937948702767054, - "language_loss": 0.63081181, - "learning_rate": 2.152326591972107e-06, - "loss": 0.64966911, - "num_input_tokens_seen": 175766770, - "step": 8174, - "time_per_iteration": 2.591662883758545 - }, - { - "auxiliary_loss_clip": 0.01086287, - "auxiliary_loss_mlp": 0.01044728, - "balance_loss_clip": 1.04296112, - "balance_loss_mlp": 1.02985096, - "epoch": 0.49150759056064935, - "flos": 21685053525120.0, - "grad_norm": 1.9252900771693722, - "language_loss": 0.69252932, - "learning_rate": 2.1519382586518445e-06, - "loss": 0.71383941, - "num_input_tokens_seen": 175783605, - "step": 8175, - "time_per_iteration": 2.7286670207977295 - }, - { - "auxiliary_loss_clip": 0.01112428, - "auxiliary_loss_mlp": 0.01032945, - "balance_loss_clip": 1.0438236, - "balance_loss_mlp": 1.02018952, - "epoch": 0.4915677138133173, - "flos": 22382439056640.0, - "grad_norm": 1.7316891792167346, - "language_loss": 0.74424642, - "learning_rate": 2.151549919570068e-06, - "loss": 0.76570022, - "num_input_tokens_seen": 175801390, - "step": 8176, - "time_per_iteration": 2.623328685760498 - }, - { - "auxiliary_loss_clip": 0.01117272, - "auxiliary_loss_mlp": 0.0104375, - "balance_loss_clip": 1.04691124, - "balance_loss_mlp": 1.03022528, - "epoch": 0.4916278370659853, - "flos": 18402890325120.0, - "grad_norm": 1.776030453931397, - "language_loss": 0.70309961, - "learning_rate": 2.1511615747415036e-06, - "loss": 0.72470981, - "num_input_tokens_seen": 175819830, - "step": 8177, - "time_per_iteration": 2.642073154449463 - }, - { - "auxiliary_loss_clip": 0.01031811, - "auxiliary_loss_mlp": 0.00752155, - "balance_loss_clip": 1.02581143, - "balance_loss_mlp": 0.99997473, - "epoch": 0.49168796031865325, - "flos": 66609124715520.0, - "grad_norm": 0.6890109431226723, - "language_loss": 0.46192822, - "learning_rate": 2.150773224180877e-06, - "loss": 0.47976786, - "num_input_tokens_seen": 175881765, - "step": 8178, - "time_per_iteration": 3.195594072341919 - }, - { - "auxiliary_loss_clip": 0.0112992, - "auxiliary_loss_mlp": 0.01036689, - "balance_loss_clip": 1.04735565, - "balance_loss_mlp": 1.02215147, - "epoch": 0.4917480835713212, - "flos": 20959335141120.0, - "grad_norm": 1.748461689040465, - "language_loss": 0.65961659, - "learning_rate": 2.1503848679029147e-06, - "loss": 0.6812827, - "num_input_tokens_seen": 175901795, - "step": 8179, - "time_per_iteration": 2.675170421600342 - }, - { - "auxiliary_loss_clip": 0.01036062, - "auxiliary_loss_mlp": 0.01047888, - "balance_loss_clip": 1.03444839, - "balance_loss_mlp": 1.031497, - "epoch": 0.4918082068239892, - "flos": 15772900412160.0, - "grad_norm": 2.3413868243180493, - "language_loss": 0.70163, - "learning_rate": 2.149996505922343e-06, - "loss": 0.72246957, - "num_input_tokens_seen": 175917770, - "step": 8180, - "time_per_iteration": 2.9436681270599365 - }, - { - "auxiliary_loss_clip": 0.01099418, - "auxiliary_loss_mlp": 0.01037201, - "balance_loss_clip": 1.04268646, - "balance_loss_mlp": 1.02306247, - "epoch": 0.49186833007665715, - "flos": 24604806453120.0, - "grad_norm": 1.915055420772654, - "language_loss": 0.84369922, - "learning_rate": 2.1496081382538895e-06, - "loss": 0.86506534, - "num_input_tokens_seen": 175937000, - "step": 8181, - "time_per_iteration": 2.8556039333343506 - }, - { - "auxiliary_loss_clip": 0.01125975, - "auxiliary_loss_mlp": 0.010356, - "balance_loss_clip": 1.04886341, - "balance_loss_mlp": 1.0226841, - "epoch": 0.4919284533293251, - "flos": 22090557139200.0, - "grad_norm": 2.841846979456106, - "language_loss": 0.72482812, - "learning_rate": 2.1492197649122793e-06, - "loss": 0.74644387, - "num_input_tokens_seen": 175955170, - "step": 8182, - "time_per_iteration": 2.5908985137939453 - }, - { - "auxiliary_loss_clip": 0.01088743, - "auxiliary_loss_mlp": 0.01035989, - "balance_loss_clip": 1.04323542, - "balance_loss_mlp": 1.0227685, - "epoch": 0.4919885765819931, - "flos": 23368043318400.0, - "grad_norm": 2.038591418033226, - "language_loss": 0.72608387, - "learning_rate": 2.1488313859122412e-06, - "loss": 0.74733126, - "num_input_tokens_seen": 175973725, - "step": 8183, - "time_per_iteration": 2.7704007625579834 - }, - { - "auxiliary_loss_clip": 0.0106529, - "auxiliary_loss_mlp": 0.01035357, - "balance_loss_clip": 1.03853834, - "balance_loss_mlp": 1.0204556, - "epoch": 0.49204869983466104, - "flos": 21360493209600.0, - "grad_norm": 3.5725391309360406, - "language_loss": 0.77354276, - "learning_rate": 2.1484430012685015e-06, - "loss": 0.79454923, - "num_input_tokens_seen": 175993885, - "step": 8184, - "time_per_iteration": 2.8195126056671143 - }, - { - "auxiliary_loss_clip": 0.01094147, - "auxiliary_loss_mlp": 0.01040773, - "balance_loss_clip": 1.04233742, - "balance_loss_mlp": 1.02739143, - "epoch": 0.492108823087329, - "flos": 21142695093120.0, - "grad_norm": 1.8939343643350832, - "language_loss": 0.70917577, - "learning_rate": 2.148054610995789e-06, - "loss": 0.73052496, - "num_input_tokens_seen": 176014210, - "step": 8185, - "time_per_iteration": 2.678464412689209 - }, - { - "auxiliary_loss_clip": 0.01108334, - "auxiliary_loss_mlp": 0.01037918, - "balance_loss_clip": 1.0468477, - "balance_loss_mlp": 1.02306461, - "epoch": 0.49216894633999697, - "flos": 25116605389440.0, - "grad_norm": 1.7900274786799464, - "language_loss": 0.75134045, - "learning_rate": 2.147666215108831e-06, - "loss": 0.77280295, - "num_input_tokens_seen": 176033890, - "step": 8186, - "time_per_iteration": 2.754204273223877 - }, - { - "auxiliary_loss_clip": 0.01116557, - "auxiliary_loss_mlp": 0.01034757, - "balance_loss_clip": 1.04770708, - "balance_loss_mlp": 1.02050531, - "epoch": 0.49222906959266494, - "flos": 22637943475200.0, - "grad_norm": 2.9803647414716945, - "language_loss": 0.67526996, - "learning_rate": 2.1472778136223545e-06, - "loss": 0.69678307, - "num_input_tokens_seen": 176052720, - "step": 8187, - "time_per_iteration": 2.6845459938049316 - }, - { - "auxiliary_loss_clip": 0.0108036, - "auxiliary_loss_mlp": 0.01036841, - "balance_loss_clip": 1.04077077, - "balance_loss_mlp": 1.02301288, - "epoch": 0.49228919284533296, - "flos": 20410548174720.0, - "grad_norm": 1.410632675975, - "language_loss": 0.67109811, - "learning_rate": 2.1468894065510894e-06, - "loss": 0.6922701, - "num_input_tokens_seen": 176072545, - "step": 8188, - "time_per_iteration": 2.8322603702545166 - }, - { - "auxiliary_loss_clip": 0.01119978, - "auxiliary_loss_mlp": 0.01034509, - "balance_loss_clip": 1.04967701, - "balance_loss_mlp": 1.02131248, - "epoch": 0.4923493160980009, - "flos": 27122359818240.0, - "grad_norm": 1.8145698664310643, - "language_loss": 0.74643195, - "learning_rate": 2.1465009939097623e-06, - "loss": 0.76797676, - "num_input_tokens_seen": 176091490, - "step": 8189, - "time_per_iteration": 2.700728178024292 - }, - { - "auxiliary_loss_clip": 0.01102804, - "auxiliary_loss_mlp": 0.01027439, - "balance_loss_clip": 1.04349804, - "balance_loss_mlp": 1.0138967, - "epoch": 0.4924094393506689, - "flos": 35736683224320.0, - "grad_norm": 1.5012400452063497, - "language_loss": 0.63989937, - "learning_rate": 2.146112575713104e-06, - "loss": 0.66120183, - "num_input_tokens_seen": 176113200, - "step": 8190, - "time_per_iteration": 2.781034231185913 - }, - { - "auxiliary_loss_clip": 0.01127618, - "auxiliary_loss_mlp": 0.0103068, - "balance_loss_clip": 1.04802811, - "balance_loss_mlp": 1.01666641, - "epoch": 0.49246956260333685, - "flos": 20412487509120.0, - "grad_norm": 2.59854956867769, - "language_loss": 0.71723747, - "learning_rate": 2.1457241519758413e-06, - "loss": 0.73882031, - "num_input_tokens_seen": 176132485, - "step": 8191, - "time_per_iteration": 2.6378936767578125 - }, - { - "auxiliary_loss_clip": 0.01125365, - "auxiliary_loss_mlp": 0.00771087, - "balance_loss_clip": 1.04543817, - "balance_loss_mlp": 1.00005293, - "epoch": 0.4925296858560048, - "flos": 38976938231040.0, - "grad_norm": 1.5444009886503365, - "language_loss": 0.71964842, - "learning_rate": 2.1453357227127043e-06, - "loss": 0.73861289, - "num_input_tokens_seen": 176155755, - "step": 8192, - "time_per_iteration": 2.748840570449829 - }, - { - "auxiliary_loss_clip": 0.01029185, - "auxiliary_loss_mlp": 0.01001084, - "balance_loss_clip": 1.02257538, - "balance_loss_mlp": 0.9996711, - "epoch": 0.4925898091086728, - "flos": 64278917712000.0, - "grad_norm": 0.718294486843201, - "language_loss": 0.52137887, - "learning_rate": 2.1449472879384224e-06, - "loss": 0.54168153, - "num_input_tokens_seen": 176216295, - "step": 8193, - "time_per_iteration": 3.264312267303467 - }, - { - "auxiliary_loss_clip": 0.01125829, - "auxiliary_loss_mlp": 0.01041308, - "balance_loss_clip": 1.04740691, - "balance_loss_mlp": 1.02760482, - "epoch": 0.49264993236134075, - "flos": 23036372110080.0, - "grad_norm": 1.4111181716707888, - "language_loss": 0.76839447, - "learning_rate": 2.1445588476677246e-06, - "loss": 0.79006582, - "num_input_tokens_seen": 176235925, - "step": 8194, - "time_per_iteration": 2.7086539268493652 - }, - { - "auxiliary_loss_clip": 0.01098073, - "auxiliary_loss_mlp": 0.0103376, - "balance_loss_clip": 1.04026222, - "balance_loss_mlp": 1.02031338, - "epoch": 0.4927100556140087, - "flos": 24718212668160.0, - "grad_norm": 1.9420104205554047, - "language_loss": 0.70233512, - "learning_rate": 2.144170401915341e-06, - "loss": 0.72365344, - "num_input_tokens_seen": 176253865, - "step": 8195, - "time_per_iteration": 2.6881814002990723 - }, - { - "auxiliary_loss_clip": 0.01087059, - "auxiliary_loss_mlp": 0.01033387, - "balance_loss_clip": 1.04724264, - "balance_loss_mlp": 1.02013052, - "epoch": 0.4927701788666767, - "flos": 23505544581120.0, - "grad_norm": 2.097647655801467, - "language_loss": 0.81090224, - "learning_rate": 2.143781950696001e-06, - "loss": 0.83210671, - "num_input_tokens_seen": 176271525, - "step": 8196, - "time_per_iteration": 2.7997779846191406 - }, - { - "auxiliary_loss_clip": 0.01092387, - "auxiliary_loss_mlp": 0.01036049, - "balance_loss_clip": 1.04048955, - "balance_loss_mlp": 1.0212965, - "epoch": 0.49283030211934464, - "flos": 22928891639040.0, - "grad_norm": 1.9754651417860998, - "language_loss": 0.70963365, - "learning_rate": 2.1433934940244356e-06, - "loss": 0.73091799, - "num_input_tokens_seen": 176290810, - "step": 8197, - "time_per_iteration": 2.687640428543091 - }, - { - "auxiliary_loss_clip": 0.01113685, - "auxiliary_loss_mlp": 0.01037816, - "balance_loss_clip": 1.04734302, - "balance_loss_mlp": 1.0245595, - "epoch": 0.4928904253720126, - "flos": 16873024210560.0, - "grad_norm": 2.0854468186505133, - "language_loss": 0.84519106, - "learning_rate": 2.143005031915374e-06, - "loss": 0.86670601, - "num_input_tokens_seen": 176309165, - "step": 8198, - "time_per_iteration": 2.660125255584717 - }, - { - "auxiliary_loss_clip": 0.01120431, - "auxiliary_loss_mlp": 0.01037402, - "balance_loss_clip": 1.04784405, - "balance_loss_mlp": 1.02326965, - "epoch": 0.4929505486246806, - "flos": 14866551509760.0, - "grad_norm": 1.8081780640264744, - "language_loss": 0.76137328, - "learning_rate": 2.1426165643835467e-06, - "loss": 0.78295165, - "num_input_tokens_seen": 176324960, - "step": 8199, - "time_per_iteration": 2.6528286933898926 - }, - { - "auxiliary_loss_clip": 0.0110111, - "auxiliary_loss_mlp": 0.0103715, - "balance_loss_clip": 1.0420711, - "balance_loss_mlp": 1.02215934, - "epoch": 0.49301067187734854, - "flos": 23842351434240.0, - "grad_norm": 1.5743655623972015, - "language_loss": 0.60060918, - "learning_rate": 2.1422280914436864e-06, - "loss": 0.62199175, - "num_input_tokens_seen": 176346195, - "step": 8200, - "time_per_iteration": 2.725208044052124 - }, - { - "auxiliary_loss_clip": 0.01112367, - "auxiliary_loss_mlp": 0.01042559, - "balance_loss_clip": 1.04529691, - "balance_loss_mlp": 1.0288918, - "epoch": 0.49307079513001656, - "flos": 22491284244480.0, - "grad_norm": 1.489817328340962, - "language_loss": 0.79219347, - "learning_rate": 2.1418396131105213e-06, - "loss": 0.81374276, - "num_input_tokens_seen": 176366735, - "step": 8201, - "time_per_iteration": 2.6749329566955566 - }, - { - "auxiliary_loss_clip": 0.0112059, - "auxiliary_loss_mlp": 0.010363, - "balance_loss_clip": 1.04529119, - "balance_loss_mlp": 1.02063608, - "epoch": 0.4931309183826845, - "flos": 15924587546880.0, - "grad_norm": 2.8764138588073527, - "language_loss": 0.67214566, - "learning_rate": 2.141451129398785e-06, - "loss": 0.69371456, - "num_input_tokens_seen": 176384475, - "step": 8202, - "time_per_iteration": 2.6964852809906006 - }, - { - "auxiliary_loss_clip": 0.01101254, - "auxiliary_loss_mlp": 0.01032575, - "balance_loss_clip": 1.04416037, - "balance_loss_mlp": 1.01929486, - "epoch": 0.4931910416353525, - "flos": 27309059735040.0, - "grad_norm": 2.180124290012348, - "language_loss": 0.75387114, - "learning_rate": 2.1410626403232076e-06, - "loss": 0.77520943, - "num_input_tokens_seen": 176402645, - "step": 8203, - "time_per_iteration": 2.725586175918579 - }, - { - "auxiliary_loss_clip": 0.01070891, - "auxiliary_loss_mlp": 0.01037718, - "balance_loss_clip": 1.04055309, - "balance_loss_mlp": 1.02355599, - "epoch": 0.49325116488802045, - "flos": 20806139635200.0, - "grad_norm": 2.514240753505036, - "language_loss": 0.8037259, - "learning_rate": 2.1406741458985197e-06, - "loss": 0.82481205, - "num_input_tokens_seen": 176416715, - "step": 8204, - "time_per_iteration": 2.6802115440368652 - }, - { - "auxiliary_loss_clip": 0.01112932, - "auxiliary_loss_mlp": 0.01040079, - "balance_loss_clip": 1.04543495, - "balance_loss_mlp": 1.02662015, - "epoch": 0.4933112881406884, - "flos": 19865963099520.0, - "grad_norm": 1.919360097124168, - "language_loss": 0.65891969, - "learning_rate": 2.140285646139455e-06, - "loss": 0.68044984, - "num_input_tokens_seen": 176435755, - "step": 8205, - "time_per_iteration": 2.6556243896484375 - }, - { - "auxiliary_loss_clip": 0.01131728, - "auxiliary_loss_mlp": 0.01037034, - "balance_loss_clip": 1.04643822, - "balance_loss_mlp": 1.02157259, - "epoch": 0.4933714113933564, - "flos": 21827977741440.0, - "grad_norm": 2.0939603582763207, - "language_loss": 0.66682738, - "learning_rate": 2.139897141060744e-06, - "loss": 0.68851495, - "num_input_tokens_seen": 176453915, - "step": 8206, - "time_per_iteration": 2.6004998683929443 - }, - { - "auxiliary_loss_clip": 0.01078434, - "auxiliary_loss_mlp": 0.01042651, - "balance_loss_clip": 1.04006064, - "balance_loss_mlp": 1.02803612, - "epoch": 0.49343153464602435, - "flos": 27890130049920.0, - "grad_norm": 1.7303473596412533, - "language_loss": 0.76393557, - "learning_rate": 2.1395086306771196e-06, - "loss": 0.78514642, - "num_input_tokens_seen": 176475175, - "step": 8207, - "time_per_iteration": 2.7545268535614014 - }, - { - "auxiliary_loss_clip": 0.01104435, - "auxiliary_loss_mlp": 0.01037384, - "balance_loss_clip": 1.04703426, - "balance_loss_mlp": 1.02245331, - "epoch": 0.4934916578986923, - "flos": 24681080983680.0, - "grad_norm": 2.36511926609042, - "language_loss": 0.60212123, - "learning_rate": 2.1391201150033147e-06, - "loss": 0.62353945, - "num_input_tokens_seen": 176494250, - "step": 8208, - "time_per_iteration": 4.556094408035278 - }, - { - "auxiliary_loss_clip": 0.01108642, - "auxiliary_loss_mlp": 0.0103495, - "balance_loss_clip": 1.04619265, - "balance_loss_mlp": 1.01990545, - "epoch": 0.4935517811513603, - "flos": 23405139089280.0, - "grad_norm": 1.7507431161374047, - "language_loss": 0.78938925, - "learning_rate": 2.1387315940540598e-06, - "loss": 0.81082511, - "num_input_tokens_seen": 176513325, - "step": 8209, - "time_per_iteration": 4.171698093414307 - }, - { - "auxiliary_loss_clip": 0.01094204, - "auxiliary_loss_mlp": 0.00774879, - "balance_loss_clip": 1.03905034, - "balance_loss_mlp": 1.00007224, - "epoch": 0.49361190440402825, - "flos": 21944508439680.0, - "grad_norm": 2.001694580419455, - "language_loss": 0.79098332, - "learning_rate": 2.138343067844089e-06, - "loss": 0.80967414, - "num_input_tokens_seen": 176532915, - "step": 8210, - "time_per_iteration": 4.38470196723938 - }, - { - "auxiliary_loss_clip": 0.01113566, - "auxiliary_loss_mlp": 0.01039269, - "balance_loss_clip": 1.04458427, - "balance_loss_mlp": 1.02467823, - "epoch": 0.4936720276566962, - "flos": 25115671635840.0, - "grad_norm": 1.6707024820379262, - "language_loss": 0.81313854, - "learning_rate": 2.1379545363881363e-06, - "loss": 0.83466691, - "num_input_tokens_seen": 176552775, - "step": 8211, - "time_per_iteration": 4.290592193603516 - }, - { - "auxiliary_loss_clip": 0.01082515, - "auxiliary_loss_mlp": 0.01050398, - "balance_loss_clip": 1.04066169, - "balance_loss_mlp": 1.03376865, - "epoch": 0.4937321509093642, - "flos": 26358935132160.0, - "grad_norm": 2.2904212815365477, - "language_loss": 0.9144789, - "learning_rate": 2.137565999700933e-06, - "loss": 0.93580806, - "num_input_tokens_seen": 176572185, - "step": 8212, - "time_per_iteration": 2.77516508102417 - }, - { - "auxiliary_loss_clip": 0.010785, - "auxiliary_loss_mlp": 0.01041938, - "balance_loss_clip": 1.03849816, - "balance_loss_mlp": 1.02666783, - "epoch": 0.49379227416203214, - "flos": 22961390469120.0, - "grad_norm": 2.314209741920176, - "language_loss": 0.65430582, - "learning_rate": 2.1371774577972138e-06, - "loss": 0.67551017, - "num_input_tokens_seen": 176591490, - "step": 8213, - "time_per_iteration": 2.844672203063965 - }, - { - "auxiliary_loss_clip": 0.01074353, - "auxiliary_loss_mlp": 0.00772712, - "balance_loss_clip": 1.03954375, - "balance_loss_mlp": 1.00013876, - "epoch": 0.49385239741470016, - "flos": 32489101843200.0, - "grad_norm": 1.8844803433311228, - "language_loss": 0.7592994, - "learning_rate": 2.136788910691711e-06, - "loss": 0.77777004, - "num_input_tokens_seen": 176612715, - "step": 8214, - "time_per_iteration": 2.828538179397583 - }, - { - "auxiliary_loss_clip": 0.01131168, - "auxiliary_loss_mlp": 0.01038594, - "balance_loss_clip": 1.0492506, - "balance_loss_mlp": 1.02410388, - "epoch": 0.4939125206673681, - "flos": 22492864442880.0, - "grad_norm": 2.152096163807918, - "language_loss": 0.84490359, - "learning_rate": 2.1364003583991594e-06, - "loss": 0.86660123, - "num_input_tokens_seen": 176631950, - "step": 8215, - "time_per_iteration": 2.6413228511810303 - }, - { - "auxiliary_loss_clip": 0.01108159, - "auxiliary_loss_mlp": 0.01033701, - "balance_loss_clip": 1.04206347, - "balance_loss_mlp": 1.02092147, - "epoch": 0.4939726439200361, - "flos": 31176351486720.0, - "grad_norm": 1.5888417840027016, - "language_loss": 0.83245987, - "learning_rate": 2.136011800934292e-06, - "loss": 0.8538785, - "num_input_tokens_seen": 176653060, - "step": 8216, - "time_per_iteration": 2.67913818359375 - }, - { - "auxiliary_loss_clip": 0.01097989, - "auxiliary_loss_mlp": 0.01034559, - "balance_loss_clip": 1.04419255, - "balance_loss_mlp": 1.02112412, - "epoch": 0.49403276717270406, - "flos": 22674213233280.0, - "grad_norm": 2.8019860461659087, - "language_loss": 0.74546432, - "learning_rate": 2.1356232383118442e-06, - "loss": 0.76678985, - "num_input_tokens_seen": 176673895, - "step": 8217, - "time_per_iteration": 2.686866283416748 - }, - { - "auxiliary_loss_clip": 0.0112431, - "auxiliary_loss_mlp": 0.00771315, - "balance_loss_clip": 1.04717755, - "balance_loss_mlp": 1.00011575, - "epoch": 0.494092890425372, - "flos": 20741070147840.0, - "grad_norm": 1.5679905275329922, - "language_loss": 0.78933907, - "learning_rate": 2.1352346705465494e-06, - "loss": 0.80829537, - "num_input_tokens_seen": 176692550, - "step": 8218, - "time_per_iteration": 2.6126081943511963 - }, - { - "auxiliary_loss_clip": 0.01073156, - "auxiliary_loss_mlp": 0.00770777, - "balance_loss_clip": 1.03962803, - "balance_loss_mlp": 1.000103, - "epoch": 0.49415301367804, - "flos": 18369026778240.0, - "grad_norm": 2.059466953332075, - "language_loss": 0.77003837, - "learning_rate": 2.134846097653142e-06, - "loss": 0.78847766, - "num_input_tokens_seen": 176709335, - "step": 8219, - "time_per_iteration": 2.705432176589966 - }, - { - "auxiliary_loss_clip": 0.01103123, - "auxiliary_loss_mlp": 0.01034129, - "balance_loss_clip": 1.04458046, - "balance_loss_mlp": 1.02009845, - "epoch": 0.49421313693070795, - "flos": 17530620451200.0, - "grad_norm": 1.9177646932354293, - "language_loss": 0.62838733, - "learning_rate": 2.134457519646357e-06, - "loss": 0.64975989, - "num_input_tokens_seen": 176727715, - "step": 8220, - "time_per_iteration": 2.615745782852173 - }, - { - "auxiliary_loss_clip": 0.01124834, - "auxiliary_loss_mlp": 0.01032605, - "balance_loss_clip": 1.04509032, - "balance_loss_mlp": 1.01844347, - "epoch": 0.4942732601833759, - "flos": 20812173120000.0, - "grad_norm": 1.9687050610151906, - "language_loss": 0.72233951, - "learning_rate": 2.1340689365409296e-06, - "loss": 0.74391389, - "num_input_tokens_seen": 176747530, - "step": 8221, - "time_per_iteration": 2.6178054809570312 - }, - { - "auxiliary_loss_clip": 0.01085939, - "auxiliary_loss_mlp": 0.01035447, - "balance_loss_clip": 1.04544675, - "balance_loss_mlp": 1.02218497, - "epoch": 0.4943333834360439, - "flos": 15048941794560.0, - "grad_norm": 1.861092907129918, - "language_loss": 0.792252, - "learning_rate": 2.133680348351595e-06, - "loss": 0.81346589, - "num_input_tokens_seen": 176765260, - "step": 8222, - "time_per_iteration": 2.679504632949829 - }, - { - "auxiliary_loss_clip": 0.01115599, - "auxiliary_loss_mlp": 0.01036496, - "balance_loss_clip": 1.04686999, - "balance_loss_mlp": 1.022048, - "epoch": 0.49439350668871185, - "flos": 16070420764800.0, - "grad_norm": 2.9899447612273784, - "language_loss": 0.72679973, - "learning_rate": 2.133291755093088e-06, - "loss": 0.7483207, - "num_input_tokens_seen": 176781770, - "step": 8223, - "time_per_iteration": 2.581552028656006 - }, - { - "auxiliary_loss_clip": 0.01116938, - "auxiliary_loss_mlp": 0.01040425, - "balance_loss_clip": 1.04635167, - "balance_loss_mlp": 1.0257324, - "epoch": 0.4944536299413798, - "flos": 20880079781760.0, - "grad_norm": 2.0609443486265784, - "language_loss": 0.75248039, - "learning_rate": 2.132903156780144e-06, - "loss": 0.77405405, - "num_input_tokens_seen": 176800655, - "step": 8224, - "time_per_iteration": 2.6427581310272217 - }, - { - "auxiliary_loss_clip": 0.0110423, - "auxiliary_loss_mlp": 0.01033189, - "balance_loss_clip": 1.04815972, - "balance_loss_mlp": 1.01925385, - "epoch": 0.4945137531940478, - "flos": 26608908856320.0, - "grad_norm": 2.070444808683487, - "language_loss": 0.6428299, - "learning_rate": 2.1325145534274997e-06, - "loss": 0.66420412, - "num_input_tokens_seen": 176820610, - "step": 8225, - "time_per_iteration": 2.685084104537964 - }, - { - "auxiliary_loss_clip": 0.01105728, - "auxiliary_loss_mlp": 0.01034446, - "balance_loss_clip": 1.04689407, - "balance_loss_mlp": 1.02097511, - "epoch": 0.49457387644671574, - "flos": 23988148738560.0, - "grad_norm": 2.0654038990553834, - "language_loss": 0.76539797, - "learning_rate": 2.1321259450498893e-06, - "loss": 0.78679967, - "num_input_tokens_seen": 176840520, - "step": 8226, - "time_per_iteration": 2.776888132095337 - }, - { - "auxiliary_loss_clip": 0.01130995, - "auxiliary_loss_mlp": 0.01043657, - "balance_loss_clip": 1.04843736, - "balance_loss_mlp": 1.02849376, - "epoch": 0.49463399969938376, - "flos": 26976598427520.0, - "grad_norm": 1.7138853183765776, - "language_loss": 0.71274078, - "learning_rate": 2.131737331662051e-06, - "loss": 0.7344873, - "num_input_tokens_seen": 176860265, - "step": 8227, - "time_per_iteration": 2.6920416355133057 - }, - { - "auxiliary_loss_clip": 0.01109805, - "auxiliary_loss_mlp": 0.01042947, - "balance_loss_clip": 1.04749131, - "balance_loss_mlp": 1.02879047, - "epoch": 0.49469412295205173, - "flos": 29681534067840.0, - "grad_norm": 1.5610614491128025, - "language_loss": 0.7156117, - "learning_rate": 2.131348713278718e-06, - "loss": 0.73713928, - "num_input_tokens_seen": 176882910, - "step": 8228, - "time_per_iteration": 2.7586421966552734 - }, - { - "auxiliary_loss_clip": 0.01126513, - "auxiliary_loss_mlp": 0.01030651, - "balance_loss_clip": 1.04834974, - "balance_loss_mlp": 1.01664948, - "epoch": 0.4947542462047197, - "flos": 24131791226880.0, - "grad_norm": 1.7062154527873281, - "language_loss": 0.83690989, - "learning_rate": 2.1309600899146304e-06, - "loss": 0.85848153, - "num_input_tokens_seen": 176903030, - "step": 8229, - "time_per_iteration": 2.643385887145996 - }, - { - "auxiliary_loss_clip": 0.01117283, - "auxiliary_loss_mlp": 0.01035461, - "balance_loss_clip": 1.04470325, - "balance_loss_mlp": 1.0201304, - "epoch": 0.49481436945738766, - "flos": 20045049333120.0, - "grad_norm": 1.8291146066570236, - "language_loss": 0.74686736, - "learning_rate": 2.1305714615845227e-06, - "loss": 0.76839477, - "num_input_tokens_seen": 176919025, - "step": 8230, - "time_per_iteration": 2.6726033687591553 - }, - { - "auxiliary_loss_clip": 0.01112312, - "auxiliary_loss_mlp": 0.0103259, - "balance_loss_clip": 1.04797947, - "balance_loss_mlp": 1.01941717, - "epoch": 0.4948744927100556, - "flos": 15669550005120.0, - "grad_norm": 1.946821067065893, - "language_loss": 0.79830235, - "learning_rate": 2.1301828283031314e-06, - "loss": 0.81975138, - "num_input_tokens_seen": 176937945, - "step": 8231, - "time_per_iteration": 2.627202272415161 - }, - { - "auxiliary_loss_clip": 0.01038701, - "auxiliary_loss_mlp": 0.01000467, - "balance_loss_clip": 1.02304196, - "balance_loss_mlp": 0.99924535, - "epoch": 0.4949346159627236, - "flos": 68872071502080.0, - "grad_norm": 0.7441317598934056, - "language_loss": 0.60252988, - "learning_rate": 2.1297941900851944e-06, - "loss": 0.62292159, - "num_input_tokens_seen": 177004575, - "step": 8232, - "time_per_iteration": 3.299022912979126 - }, - { - "auxiliary_loss_clip": 0.01103975, - "auxiliary_loss_mlp": 0.01036844, - "balance_loss_clip": 1.04270494, - "balance_loss_mlp": 1.0220201, - "epoch": 0.49499473921539155, - "flos": 24790285307520.0, - "grad_norm": 1.6243536723265515, - "language_loss": 0.69376481, - "learning_rate": 2.1294055469454496e-06, - "loss": 0.71517295, - "num_input_tokens_seen": 177024155, - "step": 8233, - "time_per_iteration": 2.7124898433685303 - }, - { - "auxiliary_loss_clip": 0.01069129, - "auxiliary_loss_mlp": 0.01041459, - "balance_loss_clip": 1.03902805, - "balance_loss_mlp": 1.02584291, - "epoch": 0.4950548624680595, - "flos": 32707905540480.0, - "grad_norm": 1.998308286461765, - "language_loss": 0.66344726, - "learning_rate": 2.129016898898633e-06, - "loss": 0.68455309, - "num_input_tokens_seen": 177046185, - "step": 8234, - "time_per_iteration": 2.7932980060577393 - }, - { - "auxiliary_loss_clip": 0.01031932, - "auxiliary_loss_mlp": 0.01001723, - "balance_loss_clip": 1.02630067, - "balance_loss_mlp": 1.00048304, - "epoch": 0.4951149857207275, - "flos": 50082173066880.0, - "grad_norm": 0.7974470380945157, - "language_loss": 0.58048564, - "learning_rate": 2.128628245959482e-06, - "loss": 0.60082221, - "num_input_tokens_seen": 177099025, - "step": 8235, - "time_per_iteration": 3.095088481903076 - }, - { - "auxiliary_loss_clip": 0.01096356, - "auxiliary_loss_mlp": 0.01043085, - "balance_loss_clip": 1.0431416, - "balance_loss_mlp": 1.02861345, - "epoch": 0.49517510897339545, - "flos": 22236785406720.0, - "grad_norm": 1.5745194755893521, - "language_loss": 0.77200663, - "learning_rate": 2.1282395881427355e-06, - "loss": 0.793401, - "num_input_tokens_seen": 177118365, - "step": 8236, - "time_per_iteration": 2.7678022384643555 - }, - { - "auxiliary_loss_clip": 0.01081616, - "auxiliary_loss_mlp": 0.01037859, - "balance_loss_clip": 1.0420413, - "balance_loss_mlp": 1.02397156, - "epoch": 0.4952352322260634, - "flos": 25374120969600.0, - "grad_norm": 1.6979000405196067, - "language_loss": 0.73080051, - "learning_rate": 2.1278509254631315e-06, - "loss": 0.75199521, - "num_input_tokens_seen": 177136415, - "step": 8237, - "time_per_iteration": 2.764728307723999 - }, - { - "auxiliary_loss_clip": 0.01124754, - "auxiliary_loss_mlp": 0.01035631, - "balance_loss_clip": 1.04693317, - "balance_loss_mlp": 1.02215445, - "epoch": 0.4952953554787314, - "flos": 24608721035520.0, - "grad_norm": 1.914497446494958, - "language_loss": 0.75439888, - "learning_rate": 2.127462257935406e-06, - "loss": 0.77600276, - "num_input_tokens_seen": 177155690, - "step": 8238, - "time_per_iteration": 2.66549015045166 - }, - { - "auxiliary_loss_clip": 0.01084433, - "auxiliary_loss_mlp": 0.0104692, - "balance_loss_clip": 1.04372036, - "balance_loss_mlp": 1.03062415, - "epoch": 0.49535547873139935, - "flos": 17311278049920.0, - "grad_norm": 2.2478036902932508, - "language_loss": 0.73706102, - "learning_rate": 2.1270735855743008e-06, - "loss": 0.75837457, - "num_input_tokens_seen": 177173350, - "step": 8239, - "time_per_iteration": 2.703118324279785 - }, - { - "auxiliary_loss_clip": 0.0104307, - "auxiliary_loss_mlp": 0.01038928, - "balance_loss_clip": 1.04188919, - "balance_loss_mlp": 1.0223105, - "epoch": 0.4954156019840673, - "flos": 20740315962240.0, - "grad_norm": 2.5033228354450667, - "language_loss": 0.7926327, - "learning_rate": 2.126684908394552e-06, - "loss": 0.8134526, - "num_input_tokens_seen": 177191115, - "step": 8240, - "time_per_iteration": 2.9256656169891357 - }, - { - "auxiliary_loss_clip": 0.01116686, - "auxiliary_loss_mlp": 0.01040866, - "balance_loss_clip": 1.04832554, - "balance_loss_mlp": 1.0278666, - "epoch": 0.49547572523673533, - "flos": 12820684567680.0, - "grad_norm": 2.1558656465787367, - "language_loss": 0.8547368, - "learning_rate": 2.126296226410898e-06, - "loss": 0.87631238, - "num_input_tokens_seen": 177206155, - "step": 8241, - "time_per_iteration": 2.9096901416778564 - }, - { - "auxiliary_loss_clip": 0.01067537, - "auxiliary_loss_mlp": 0.01039414, - "balance_loss_clip": 1.04159331, - "balance_loss_mlp": 1.02591348, - "epoch": 0.4955358484894033, - "flos": 15597046402560.0, - "grad_norm": 1.820610909823573, - "language_loss": 0.77092397, - "learning_rate": 2.1259075396380794e-06, - "loss": 0.7919935, - "num_input_tokens_seen": 177224815, - "step": 8242, - "time_per_iteration": 2.6902410984039307 - }, - { - "auxiliary_loss_clip": 0.01104403, - "auxiliary_loss_mlp": 0.00771127, - "balance_loss_clip": 1.04569447, - "balance_loss_mlp": 1.00017774, - "epoch": 0.49559597174207126, - "flos": 26464368528000.0, - "grad_norm": 1.9730293387874334, - "language_loss": 0.67737073, - "learning_rate": 2.125518848090833e-06, - "loss": 0.69612604, - "num_input_tokens_seen": 177244490, - "step": 8243, - "time_per_iteration": 2.6972243785858154 - }, - { - "auxiliary_loss_clip": 0.01112124, - "auxiliary_loss_mlp": 0.01034088, - "balance_loss_clip": 1.04816341, - "balance_loss_mlp": 1.02076697, - "epoch": 0.4956560949947392, - "flos": 23148234040320.0, - "grad_norm": 2.2375947263106526, - "language_loss": 0.67908239, - "learning_rate": 2.125130151783901e-06, - "loss": 0.70054448, - "num_input_tokens_seen": 177264340, - "step": 8244, - "time_per_iteration": 2.762528419494629 - }, - { - "auxiliary_loss_clip": 0.01097015, - "auxiliary_loss_mlp": 0.01040284, - "balance_loss_clip": 1.04337358, - "balance_loss_mlp": 1.02460194, - "epoch": 0.4957162182474072, - "flos": 20773461237120.0, - "grad_norm": 1.8772229473228363, - "language_loss": 0.74776495, - "learning_rate": 2.12474145073202e-06, - "loss": 0.76913798, - "num_input_tokens_seen": 177283055, - "step": 8245, - "time_per_iteration": 2.7792561054229736 - }, - { - "auxiliary_loss_clip": 0.01115174, - "auxiliary_loss_mlp": 0.01036156, - "balance_loss_clip": 1.04705966, - "balance_loss_mlp": 1.02214909, - "epoch": 0.49577634150007516, - "flos": 18734202397440.0, - "grad_norm": 1.8990901453025917, - "language_loss": 0.8153336, - "learning_rate": 2.1243527449499306e-06, - "loss": 0.83684695, - "num_input_tokens_seen": 177301140, - "step": 8246, - "time_per_iteration": 2.5740935802459717 - }, - { - "auxiliary_loss_clip": 0.01090358, - "auxiliary_loss_mlp": 0.0104326, - "balance_loss_clip": 1.04562306, - "balance_loss_mlp": 1.02767944, - "epoch": 0.4958364647527431, - "flos": 25554176870400.0, - "grad_norm": 1.8707658617569873, - "language_loss": 0.83808625, - "learning_rate": 2.1239640344523733e-06, - "loss": 0.85942245, - "num_input_tokens_seen": 177323095, - "step": 8247, - "time_per_iteration": 4.410465955734253 - }, - { - "auxiliary_loss_clip": 0.01102086, - "auxiliary_loss_mlp": 0.01030712, - "balance_loss_clip": 1.05016184, - "balance_loss_mlp": 1.01716995, - "epoch": 0.4958965880054111, - "flos": 24425325169920.0, - "grad_norm": 1.9625896451991354, - "language_loss": 0.83650881, - "learning_rate": 2.123575319254087e-06, - "loss": 0.85783684, - "num_input_tokens_seen": 177339845, - "step": 8248, - "time_per_iteration": 4.395894289016724 - }, - { - "auxiliary_loss_clip": 0.01118567, - "auxiliary_loss_mlp": 0.01032735, - "balance_loss_clip": 1.04729056, - "balance_loss_mlp": 1.01836419, - "epoch": 0.49595671125807905, - "flos": 25083460114560.0, - "grad_norm": 1.8247689581014963, - "language_loss": 0.73558569, - "learning_rate": 2.123186599369812e-06, - "loss": 0.75709867, - "num_input_tokens_seen": 177359980, - "step": 8249, - "time_per_iteration": 4.36426305770874 - }, - { - "auxiliary_loss_clip": 0.01110094, - "auxiliary_loss_mlp": 0.01046161, - "balance_loss_clip": 1.04773486, - "balance_loss_mlp": 1.03169477, - "epoch": 0.496016834510747, - "flos": 16435883692800.0, - "grad_norm": 1.900690676640245, - "language_loss": 0.75902295, - "learning_rate": 2.122797874814289e-06, - "loss": 0.78058553, - "num_input_tokens_seen": 177378580, - "step": 8250, - "time_per_iteration": 4.203567266464233 - }, - { - "auxiliary_loss_clip": 0.011299, - "auxiliary_loss_mlp": 0.01042712, - "balance_loss_clip": 1.04861271, - "balance_loss_mlp": 1.02788305, - "epoch": 0.496076957763415, - "flos": 23437925228160.0, - "grad_norm": 1.7086851316152774, - "language_loss": 0.69983917, - "learning_rate": 2.1224091456022585e-06, - "loss": 0.72156531, - "num_input_tokens_seen": 177398790, - "step": 8251, - "time_per_iteration": 2.6825788021087646 - }, - { - "auxiliary_loss_clip": 0.01092939, - "auxiliary_loss_mlp": 0.00771421, - "balance_loss_clip": 1.04950809, - "balance_loss_mlp": 1.00016773, - "epoch": 0.49613708101608295, - "flos": 16909509450240.0, - "grad_norm": 1.9257049963935782, - "language_loss": 0.80088174, - "learning_rate": 2.122020411748461e-06, - "loss": 0.81952536, - "num_input_tokens_seen": 177416515, - "step": 8252, - "time_per_iteration": 2.7017300128936768 - }, - { - "auxiliary_loss_clip": 0.01130139, - "auxiliary_loss_mlp": 0.01033677, - "balance_loss_clip": 1.04937637, - "balance_loss_mlp": 1.01769102, - "epoch": 0.4961972042687509, - "flos": 16618094409600.0, - "grad_norm": 1.7413302103337327, - "language_loss": 0.81005448, - "learning_rate": 2.1216316732676363e-06, - "loss": 0.83169258, - "num_input_tokens_seen": 177434425, - "step": 8253, - "time_per_iteration": 2.5844311714172363 - }, - { - "auxiliary_loss_clip": 0.01092121, - "auxiliary_loss_mlp": 0.01031077, - "balance_loss_clip": 1.04245412, - "balance_loss_mlp": 1.01743925, - "epoch": 0.49625732752141893, - "flos": 28956749437440.0, - "grad_norm": 1.4814612406319185, - "language_loss": 0.67246485, - "learning_rate": 2.1212429301745275e-06, - "loss": 0.69369686, - "num_input_tokens_seen": 177459675, - "step": 8254, - "time_per_iteration": 2.815851926803589 - }, - { - "auxiliary_loss_clip": 0.01091336, - "auxiliary_loss_mlp": 0.01052712, - "balance_loss_clip": 1.04560924, - "balance_loss_mlp": 1.03665471, - "epoch": 0.4963174507740869, - "flos": 23112359331840.0, - "grad_norm": 1.7981030707772934, - "language_loss": 0.74278247, - "learning_rate": 2.1208541824838743e-06, - "loss": 0.76422298, - "num_input_tokens_seen": 177478895, - "step": 8255, - "time_per_iteration": 2.7599687576293945 - }, - { - "auxiliary_loss_clip": 0.01098276, - "auxiliary_loss_mlp": 0.01036505, - "balance_loss_clip": 1.04286051, - "balance_loss_mlp": 1.02203858, - "epoch": 0.49637757402675486, - "flos": 13917863450880.0, - "grad_norm": 1.736601635944992, - "language_loss": 0.81702995, - "learning_rate": 2.1204654302104183e-06, - "loss": 0.83837777, - "num_input_tokens_seen": 177494920, - "step": 8256, - "time_per_iteration": 2.640913724899292 - }, - { - "auxiliary_loss_clip": 0.01096211, - "auxiliary_loss_mlp": 0.01033961, - "balance_loss_clip": 1.04346132, - "balance_loss_mlp": 1.02055597, - "epoch": 0.49643769727942283, - "flos": 22309001700480.0, - "grad_norm": 1.6034861047711904, - "language_loss": 0.81197649, - "learning_rate": 2.120076673368901e-06, - "loss": 0.83327824, - "num_input_tokens_seen": 177515455, - "step": 8257, - "time_per_iteration": 2.724745512008667 - }, - { - "auxiliary_loss_clip": 0.01133163, - "auxiliary_loss_mlp": 0.01039711, - "balance_loss_clip": 1.04763043, - "balance_loss_mlp": 1.02435732, - "epoch": 0.4964978205320908, - "flos": 19500248776320.0, - "grad_norm": 1.9280789180083706, - "language_loss": 0.66280329, - "learning_rate": 2.1196879119740647e-06, - "loss": 0.68453205, - "num_input_tokens_seen": 177534040, - "step": 8258, - "time_per_iteration": 2.570275068283081 - }, - { - "auxiliary_loss_clip": 0.01110241, - "auxiliary_loss_mlp": 0.01032185, - "balance_loss_clip": 1.04361916, - "balance_loss_mlp": 1.01942396, - "epoch": 0.49655794378475876, - "flos": 23436524597760.0, - "grad_norm": 1.42579071834104, - "language_loss": 0.77627164, - "learning_rate": 2.1192991460406502e-06, - "loss": 0.79769588, - "num_input_tokens_seen": 177554510, - "step": 8259, - "time_per_iteration": 2.676722288131714 - }, - { - "auxiliary_loss_clip": 0.01097253, - "auxiliary_loss_mlp": 0.01038217, - "balance_loss_clip": 1.04436278, - "balance_loss_mlp": 1.02406085, - "epoch": 0.4966180670374267, - "flos": 26831124345600.0, - "grad_norm": 1.5162865829701626, - "language_loss": 0.78461975, - "learning_rate": 2.1189103755834e-06, - "loss": 0.80597448, - "num_input_tokens_seen": 177575780, - "step": 8260, - "time_per_iteration": 2.7226130962371826 - }, - { - "auxiliary_loss_clip": 0.01100503, - "auxiliary_loss_mlp": 0.01035808, - "balance_loss_clip": 1.04154015, - "balance_loss_mlp": 1.02135992, - "epoch": 0.4966781902900947, - "flos": 22009326531840.0, - "grad_norm": 3.0057343325073456, - "language_loss": 0.76335442, - "learning_rate": 2.1185216006170573e-06, - "loss": 0.78471756, - "num_input_tokens_seen": 177588965, - "step": 8261, - "time_per_iteration": 2.6477174758911133 - }, - { - "auxiliary_loss_clip": 0.01071745, - "auxiliary_loss_mlp": 0.0103251, - "balance_loss_clip": 1.03892851, - "balance_loss_mlp": 1.01939654, - "epoch": 0.49673831354276266, - "flos": 26213353309440.0, - "grad_norm": 1.835251427236856, - "language_loss": 0.89503151, - "learning_rate": 2.1181328211563627e-06, - "loss": 0.9160741, - "num_input_tokens_seen": 177608425, - "step": 8262, - "time_per_iteration": 2.757200241088867 - }, - { - "auxiliary_loss_clip": 0.01068117, - "auxiliary_loss_mlp": 0.01035611, - "balance_loss_clip": 1.04000998, - "balance_loss_mlp": 1.0223608, - "epoch": 0.4967984367954306, - "flos": 23182277155200.0, - "grad_norm": 1.5869779774184047, - "language_loss": 0.73859417, - "learning_rate": 2.11774403721606e-06, - "loss": 0.7596314, - "num_input_tokens_seen": 177628240, - "step": 8263, - "time_per_iteration": 2.799468994140625 - }, - { - "auxiliary_loss_clip": 0.0108327, - "auxiliary_loss_mlp": 0.01039108, - "balance_loss_clip": 1.0480659, - "balance_loss_mlp": 1.02325881, - "epoch": 0.4968585600480986, - "flos": 19281445079040.0, - "grad_norm": 3.1164108836460036, - "language_loss": 0.70163679, - "learning_rate": 2.1173552488108923e-06, - "loss": 0.72286057, - "num_input_tokens_seen": 177645920, - "step": 8264, - "time_per_iteration": 2.720449447631836 - }, - { - "auxiliary_loss_clip": 0.01098192, - "auxiliary_loss_mlp": 0.01032461, - "balance_loss_clip": 1.04328251, - "balance_loss_mlp": 1.01837087, - "epoch": 0.49691868330076655, - "flos": 22528703237760.0, - "grad_norm": 1.6446636391121152, - "language_loss": 0.65104395, - "learning_rate": 2.1169664559556007e-06, - "loss": 0.67235053, - "num_input_tokens_seen": 177667185, - "step": 8265, - "time_per_iteration": 2.683858633041382 - }, - { - "auxiliary_loss_clip": 0.01028918, - "auxiliary_loss_mlp": 0.01002907, - "balance_loss_clip": 1.0220778, - "balance_loss_mlp": 1.00148249, - "epoch": 0.4969788065534345, - "flos": 66577128675840.0, - "grad_norm": 0.930084427968553, - "language_loss": 0.53491867, - "learning_rate": 2.1165776586649304e-06, - "loss": 0.55523694, - "num_input_tokens_seen": 177733020, - "step": 8266, - "time_per_iteration": 3.2566375732421875 - }, - { - "auxiliary_loss_clip": 0.01113371, - "auxiliary_loss_mlp": 0.01032636, - "balance_loss_clip": 1.04611242, - "balance_loss_mlp": 1.01834857, - "epoch": 0.49703892980610254, - "flos": 24059503105920.0, - "grad_norm": 1.764439361537035, - "language_loss": 0.79587245, - "learning_rate": 2.1161888569536223e-06, - "loss": 0.81733251, - "num_input_tokens_seen": 177753370, - "step": 8267, - "time_per_iteration": 2.6278576850891113 - }, - { - "auxiliary_loss_clip": 0.01102001, - "auxiliary_loss_mlp": 0.01039107, - "balance_loss_clip": 1.04590034, - "balance_loss_mlp": 1.02316856, - "epoch": 0.4970990530587705, - "flos": 29126174912640.0, - "grad_norm": 2.2169439003129385, - "language_loss": 0.74835396, - "learning_rate": 2.1158000508364223e-06, - "loss": 0.76976496, - "num_input_tokens_seen": 177771530, - "step": 8268, - "time_per_iteration": 2.734259843826294 - }, - { - "auxiliary_loss_clip": 0.011141, - "auxiliary_loss_mlp": 0.00771431, - "balance_loss_clip": 1.04348183, - "balance_loss_mlp": 1.00014162, - "epoch": 0.49715917631143847, - "flos": 46026167258880.0, - "grad_norm": 4.0839840126254225, - "language_loss": 0.68041855, - "learning_rate": 2.115411240328073e-06, - "loss": 0.69927382, - "num_input_tokens_seen": 177796355, - "step": 8269, - "time_per_iteration": 2.90146541595459 - }, - { - "auxiliary_loss_clip": 0.01097171, - "auxiliary_loss_mlp": 0.01041712, - "balance_loss_clip": 1.04262531, - "balance_loss_mlp": 1.02837276, - "epoch": 0.49721929956410643, - "flos": 20191277600640.0, - "grad_norm": 2.5681883642378436, - "language_loss": 0.85533005, - "learning_rate": 2.1150224254433167e-06, - "loss": 0.87671888, - "num_input_tokens_seen": 177814300, - "step": 8270, - "time_per_iteration": 2.8005404472351074 - }, - { - "auxiliary_loss_clip": 0.01081529, - "auxiliary_loss_mlp": 0.00771255, - "balance_loss_clip": 1.04315615, - "balance_loss_mlp": 1.00016665, - "epoch": 0.4972794228167744, - "flos": 21653560275840.0, - "grad_norm": 1.8215552302583695, - "language_loss": 0.70831466, - "learning_rate": 2.114633606196899e-06, - "loss": 0.72684252, - "num_input_tokens_seen": 177833615, - "step": 8271, - "time_per_iteration": 2.91554594039917 - }, - { - "auxiliary_loss_clip": 0.01112057, - "auxiliary_loss_mlp": 0.01035877, - "balance_loss_clip": 1.04666567, - "balance_loss_mlp": 1.02128029, - "epoch": 0.49733954606944236, - "flos": 24279743347200.0, - "grad_norm": 1.5312065445139798, - "language_loss": 0.78403968, - "learning_rate": 2.1142447826035635e-06, - "loss": 0.80551904, - "num_input_tokens_seen": 177855315, - "step": 8272, - "time_per_iteration": 2.6702592372894287 - }, - { - "auxiliary_loss_clip": 0.01090488, - "auxiliary_loss_mlp": 0.01040546, - "balance_loss_clip": 1.0464623, - "balance_loss_mlp": 1.02679515, - "epoch": 0.4973996693221103, - "flos": 37852575730560.0, - "grad_norm": 2.547664660385474, - "language_loss": 0.6682387, - "learning_rate": 2.1138559546780544e-06, - "loss": 0.68954909, - "num_input_tokens_seen": 177875590, - "step": 8273, - "time_per_iteration": 2.8257791996002197 - }, - { - "auxiliary_loss_clip": 0.01089829, - "auxiliary_loss_mlp": 0.01037205, - "balance_loss_clip": 1.04431605, - "balance_loss_mlp": 1.02347827, - "epoch": 0.4974597925747783, - "flos": 21361426963200.0, - "grad_norm": 1.5692617693087136, - "language_loss": 0.78097814, - "learning_rate": 2.1134671224351163e-06, - "loss": 0.80224848, - "num_input_tokens_seen": 177894175, - "step": 8274, - "time_per_iteration": 2.6786539554595947 - }, - { - "auxiliary_loss_clip": 0.01087892, - "auxiliary_loss_mlp": 0.01037968, - "balance_loss_clip": 1.04171109, - "balance_loss_mlp": 1.02315021, - "epoch": 0.49751991582744626, - "flos": 30738133560960.0, - "grad_norm": 1.7539763145915706, - "language_loss": 0.75727397, - "learning_rate": 2.113078285889493e-06, - "loss": 0.77853251, - "num_input_tokens_seen": 177913920, - "step": 8275, - "time_per_iteration": 2.7289958000183105 - }, - { - "auxiliary_loss_clip": 0.01117048, - "auxiliary_loss_mlp": 0.01038819, - "balance_loss_clip": 1.04600728, - "balance_loss_mlp": 1.02240443, - "epoch": 0.4975800390801142, - "flos": 14100541044480.0, - "grad_norm": 2.0869085379368717, - "language_loss": 0.84277642, - "learning_rate": 2.1126894450559303e-06, - "loss": 0.86433506, - "num_input_tokens_seen": 177930425, - "step": 8276, - "time_per_iteration": 2.612114667892456 - }, - { - "auxiliary_loss_clip": 0.01122283, - "auxiliary_loss_mlp": 0.00770821, - "balance_loss_clip": 1.04578209, - "balance_loss_mlp": 1.00012255, - "epoch": 0.4976401623327822, - "flos": 24207275658240.0, - "grad_norm": 2.0722406374843283, - "language_loss": 0.70213616, - "learning_rate": 2.112300599949172e-06, - "loss": 0.72106719, - "num_input_tokens_seen": 177949885, - "step": 8277, - "time_per_iteration": 2.627364158630371 - }, - { - "auxiliary_loss_clip": 0.01109969, - "auxiliary_loss_mlp": 0.01038763, - "balance_loss_clip": 1.04542017, - "balance_loss_mlp": 1.02430928, - "epoch": 0.49770028558545015, - "flos": 21136769349120.0, - "grad_norm": 1.855614041136712, - "language_loss": 0.82644826, - "learning_rate": 2.111911750583964e-06, - "loss": 0.84793556, - "num_input_tokens_seen": 177965720, - "step": 8278, - "time_per_iteration": 2.653998613357544 - }, - { - "auxiliary_loss_clip": 0.01117237, - "auxiliary_loss_mlp": 0.01041122, - "balance_loss_clip": 1.04625261, - "balance_loss_mlp": 1.02723408, - "epoch": 0.4977604088381181, - "flos": 16763927627520.0, - "grad_norm": 2.0212653893375276, - "language_loss": 0.67471039, - "learning_rate": 2.111522896975052e-06, - "loss": 0.69629395, - "num_input_tokens_seen": 177983190, - "step": 8279, - "time_per_iteration": 2.607090473175049 - }, - { - "auxiliary_loss_clip": 0.01115839, - "auxiliary_loss_mlp": 0.01041996, - "balance_loss_clip": 1.04406691, - "balance_loss_mlp": 1.02692842, - "epoch": 0.49782053209078614, - "flos": 15703521292800.0, - "grad_norm": 2.1427811758671527, - "language_loss": 0.70507026, - "learning_rate": 2.1111340391371794e-06, - "loss": 0.72664863, - "num_input_tokens_seen": 178000155, - "step": 8280, - "time_per_iteration": 2.636384963989258 - }, - { - "auxiliary_loss_clip": 0.01090186, - "auxiliary_loss_mlp": 0.01035589, - "balance_loss_clip": 1.04237318, - "balance_loss_mlp": 1.02177858, - "epoch": 0.4978806553434541, - "flos": 24753692327040.0, - "grad_norm": 2.860421271049928, - "language_loss": 0.64889467, - "learning_rate": 2.1107451770850936e-06, - "loss": 0.67015243, - "num_input_tokens_seen": 178021060, - "step": 8281, - "time_per_iteration": 2.6961820125579834 - }, - { - "auxiliary_loss_clip": 0.0111999, - "auxiliary_loss_mlp": 0.01036047, - "balance_loss_clip": 1.0478642, - "balance_loss_mlp": 1.02102113, - "epoch": 0.49794077859612207, - "flos": 13115726881920.0, - "grad_norm": 2.7426965878502845, - "language_loss": 0.73226738, - "learning_rate": 2.1103563108335387e-06, - "loss": 0.75382769, - "num_input_tokens_seen": 178038180, - "step": 8282, - "time_per_iteration": 2.7749152183532715 - }, - { - "auxiliary_loss_clip": 0.01095648, - "auxiliary_loss_mlp": 0.01033955, - "balance_loss_clip": 1.04499686, - "balance_loss_mlp": 1.02106822, - "epoch": 0.49800090184879003, - "flos": 27525133998720.0, - "grad_norm": 1.749404235674241, - "language_loss": 0.73327482, - "learning_rate": 2.109967440397263e-06, - "loss": 0.75457078, - "num_input_tokens_seen": 178057565, - "step": 8283, - "time_per_iteration": 2.7039520740509033 - }, - { - "auxiliary_loss_clip": 0.01068275, - "auxiliary_loss_mlp": 0.01054525, - "balance_loss_clip": 1.0405463, - "balance_loss_mlp": 1.03883147, - "epoch": 0.498061025101458, - "flos": 19792489829760.0, - "grad_norm": 2.5573951668279102, - "language_loss": 0.7842927, - "learning_rate": 2.1095785657910095e-06, - "loss": 0.80552071, - "num_input_tokens_seen": 178076965, - "step": 8284, - "time_per_iteration": 2.7534518241882324 - }, - { - "auxiliary_loss_clip": 0.01103825, - "auxiliary_loss_mlp": 0.0104233, - "balance_loss_clip": 1.045488, - "balance_loss_mlp": 1.02733326, - "epoch": 0.49812114835412596, - "flos": 29893909230720.0, - "grad_norm": 1.7317298938274186, - "language_loss": 0.73607123, - "learning_rate": 2.109189687029526e-06, - "loss": 0.75753278, - "num_input_tokens_seen": 178095105, - "step": 8285, - "time_per_iteration": 2.696913719177246 - }, - { - "auxiliary_loss_clip": 0.01114659, - "auxiliary_loss_mlp": 0.01033722, - "balance_loss_clip": 1.0496074, - "balance_loss_mlp": 1.01902318, - "epoch": 0.49818127160679393, - "flos": 23147048891520.0, - "grad_norm": 1.6428187648074233, - "language_loss": 0.74194658, - "learning_rate": 2.1088008041275598e-06, - "loss": 0.76343036, - "num_input_tokens_seen": 178114505, - "step": 8286, - "time_per_iteration": 4.164494752883911 - }, - { - "auxiliary_loss_clip": 0.01106668, - "auxiliary_loss_mlp": 0.0104423, - "balance_loss_clip": 1.04752493, - "balance_loss_mlp": 1.02986491, - "epoch": 0.4982413948594619, - "flos": 21652806090240.0, - "grad_norm": 1.7990587687461415, - "language_loss": 0.85529351, - "learning_rate": 2.1084119170998545e-06, - "loss": 0.87680244, - "num_input_tokens_seen": 178131595, - "step": 8287, - "time_per_iteration": 4.236407279968262 - }, - { - "auxiliary_loss_clip": 0.01076576, - "auxiliary_loss_mlp": 0.01032511, - "balance_loss_clip": 1.04194725, - "balance_loss_mlp": 1.01822948, - "epoch": 0.49830151811212986, - "flos": 32486982940800.0, - "grad_norm": 1.6860437652999367, - "language_loss": 0.72530627, - "learning_rate": 2.108023025961159e-06, - "loss": 0.74639714, - "num_input_tokens_seen": 178152055, - "step": 8288, - "time_per_iteration": 4.404609680175781 - }, - { - "auxiliary_loss_clip": 0.01106449, - "auxiliary_loss_mlp": 0.01040352, - "balance_loss_clip": 1.04326916, - "balance_loss_mlp": 1.02459288, - "epoch": 0.4983616413647978, - "flos": 18142358002560.0, - "grad_norm": 3.334734045415943, - "language_loss": 0.79885554, - "learning_rate": 2.10763413072622e-06, - "loss": 0.82032353, - "num_input_tokens_seen": 178168150, - "step": 8289, - "time_per_iteration": 2.6629836559295654 - }, - { - "auxiliary_loss_clip": 0.01114454, - "auxiliary_loss_mlp": 0.0103885, - "balance_loss_clip": 1.0446074, - "balance_loss_mlp": 1.02460992, - "epoch": 0.4984217646174658, - "flos": 19718836992000.0, - "grad_norm": 2.0640091139098256, - "language_loss": 0.72874933, - "learning_rate": 2.107245231409784e-06, - "loss": 0.75028241, - "num_input_tokens_seen": 178186150, - "step": 8290, - "time_per_iteration": 4.18574333190918 - }, - { - "auxiliary_loss_clip": 0.0112064, - "auxiliary_loss_mlp": 0.01040925, - "balance_loss_clip": 1.04972208, - "balance_loss_mlp": 1.02428377, - "epoch": 0.49848188787013376, - "flos": 24936549488640.0, - "grad_norm": 1.4927804425375188, - "language_loss": 0.8397218, - "learning_rate": 2.106856328026598e-06, - "loss": 0.86133754, - "num_input_tokens_seen": 178207665, - "step": 8291, - "time_per_iteration": 2.716386556625366 - }, - { - "auxiliary_loss_clip": 0.01103944, - "auxiliary_loss_mlp": 0.01046379, - "balance_loss_clip": 1.04420066, - "balance_loss_mlp": 1.02930808, - "epoch": 0.4985420111228017, - "flos": 22382439056640.0, - "grad_norm": 1.6316694600084898, - "language_loss": 0.67022264, - "learning_rate": 2.106467420591409e-06, - "loss": 0.69172579, - "num_input_tokens_seen": 178226325, - "step": 8292, - "time_per_iteration": 2.7027721405029297 - }, - { - "auxiliary_loss_clip": 0.01127175, - "auxiliary_loss_mlp": 0.01039323, - "balance_loss_clip": 1.04806566, - "balance_loss_mlp": 1.02625203, - "epoch": 0.4986021343754697, - "flos": 16216469464320.0, - "grad_norm": 1.6633361946509924, - "language_loss": 0.66995132, - "learning_rate": 2.106078509118965e-06, - "loss": 0.6916163, - "num_input_tokens_seen": 178244960, - "step": 8293, - "time_per_iteration": 2.5719261169433594 - }, - { - "auxiliary_loss_clip": 0.01111406, - "auxiliary_loss_mlp": 0.01029657, - "balance_loss_clip": 1.04379749, - "balance_loss_mlp": 1.01533389, - "epoch": 0.4986622576281377, - "flos": 23403594804480.0, - "grad_norm": 1.8610494318021187, - "language_loss": 0.82020485, - "learning_rate": 2.1056895936240133e-06, - "loss": 0.84161556, - "num_input_tokens_seen": 178265400, - "step": 8294, - "time_per_iteration": 2.6504080295562744 - }, - { - "auxiliary_loss_clip": 0.01116097, - "auxiliary_loss_mlp": 0.01031185, - "balance_loss_clip": 1.04479063, - "balance_loss_mlp": 1.01604557, - "epoch": 0.49872238088080567, - "flos": 19974556892160.0, - "grad_norm": 2.2309244250260183, - "language_loss": 0.72901344, - "learning_rate": 2.1053006741213016e-06, - "loss": 0.75048614, - "num_input_tokens_seen": 178284535, - "step": 8295, - "time_per_iteration": 2.6195027828216553 - }, - { - "auxiliary_loss_clip": 0.01059073, - "auxiliary_loss_mlp": 0.01038092, - "balance_loss_clip": 1.03994107, - "balance_loss_mlp": 1.02466345, - "epoch": 0.49878250413347364, - "flos": 22893016930560.0, - "grad_norm": 1.8092757241660187, - "language_loss": 0.67607826, - "learning_rate": 2.1049117506255775e-06, - "loss": 0.69704998, - "num_input_tokens_seen": 178302425, - "step": 8296, - "time_per_iteration": 2.755263090133667 - }, - { - "auxiliary_loss_clip": 0.01104221, - "auxiliary_loss_mlp": 0.01042078, - "balance_loss_clip": 1.04649234, - "balance_loss_mlp": 1.02715254, - "epoch": 0.4988426273861416, - "flos": 32598449821440.0, - "grad_norm": 2.862724254512052, - "language_loss": 0.64573205, - "learning_rate": 2.1045228231515895e-06, - "loss": 0.66719502, - "num_input_tokens_seen": 178323065, - "step": 8297, - "time_per_iteration": 2.77134108543396 - }, - { - "auxiliary_loss_clip": 0.01068772, - "auxiliary_loss_mlp": 0.01035076, - "balance_loss_clip": 1.04186463, - "balance_loss_mlp": 1.02241552, - "epoch": 0.49890275063880957, - "flos": 20923604087040.0, - "grad_norm": 1.6802177929429785, - "language_loss": 0.70005518, - "learning_rate": 2.1041338917140857e-06, - "loss": 0.72109365, - "num_input_tokens_seen": 178343985, - "step": 8298, - "time_per_iteration": 2.7644965648651123 - }, - { - "auxiliary_loss_clip": 0.01123634, - "auxiliary_loss_mlp": 0.01037158, - "balance_loss_clip": 1.04611015, - "balance_loss_mlp": 1.02383053, - "epoch": 0.49896287389147753, - "flos": 18624459369600.0, - "grad_norm": 2.15895128631453, - "language_loss": 0.85060012, - "learning_rate": 2.103744956327814e-06, - "loss": 0.87220806, - "num_input_tokens_seen": 178362345, - "step": 8299, - "time_per_iteration": 2.6582682132720947 - }, - { - "auxiliary_loss_clip": 0.0109908, - "auxiliary_loss_mlp": 0.01042644, - "balance_loss_clip": 1.04576635, - "balance_loss_mlp": 1.02676535, - "epoch": 0.4990229971441455, - "flos": 24826555065600.0, - "grad_norm": 3.5746156367417177, - "language_loss": 0.69598472, - "learning_rate": 2.1033560170075234e-06, - "loss": 0.71740198, - "num_input_tokens_seen": 178383190, - "step": 8300, - "time_per_iteration": 2.725041151046753 - }, - { - "auxiliary_loss_clip": 0.01026277, - "auxiliary_loss_mlp": 0.01006258, - "balance_loss_clip": 1.02488732, - "balance_loss_mlp": 1.00483894, - "epoch": 0.49908312039681346, - "flos": 71384525136000.0, - "grad_norm": 0.7557607717879434, - "language_loss": 0.51092541, - "learning_rate": 2.1029670737679623e-06, - "loss": 0.53125077, - "num_input_tokens_seen": 178444250, - "step": 8301, - "time_per_iteration": 3.2866220474243164 - }, - { - "auxiliary_loss_clip": 0.01096877, - "auxiliary_loss_mlp": 0.01045659, - "balance_loss_clip": 1.04223108, - "balance_loss_mlp": 1.03140736, - "epoch": 0.4991432436494814, - "flos": 19828651847040.0, - "grad_norm": 1.7177443948136444, - "language_loss": 0.84648693, - "learning_rate": 2.102578126623879e-06, - "loss": 0.86791229, - "num_input_tokens_seen": 178463250, - "step": 8302, - "time_per_iteration": 2.66215181350708 - }, - { - "auxiliary_loss_clip": 0.01112659, - "auxiliary_loss_mlp": 0.01034193, - "balance_loss_clip": 1.04628754, - "balance_loss_mlp": 1.02111602, - "epoch": 0.4992033669021494, - "flos": 15121912273920.0, - "grad_norm": 5.640508686379792, - "language_loss": 0.68928391, - "learning_rate": 2.102189175590024e-06, - "loss": 0.71075243, - "num_input_tokens_seen": 178481340, - "step": 8303, - "time_per_iteration": 2.6031181812286377 - }, - { - "auxiliary_loss_clip": 0.01126853, - "auxiliary_loss_mlp": 0.01035164, - "balance_loss_clip": 1.04641497, - "balance_loss_mlp": 1.02095485, - "epoch": 0.49926349015481736, - "flos": 31207952476800.0, - "grad_norm": 1.6560759996443648, - "language_loss": 0.72727203, - "learning_rate": 2.101800220681144e-06, - "loss": 0.74889231, - "num_input_tokens_seen": 178501545, - "step": 8304, - "time_per_iteration": 2.706022262573242 - }, - { - "auxiliary_loss_clip": 0.01116141, - "auxiliary_loss_mlp": 0.01037357, - "balance_loss_clip": 1.0475409, - "balance_loss_mlp": 1.02420211, - "epoch": 0.4993236134074853, - "flos": 24900207903360.0, - "grad_norm": 2.1644384364092684, - "language_loss": 0.81342846, - "learning_rate": 2.10141126191199e-06, - "loss": 0.83496344, - "num_input_tokens_seen": 178519700, - "step": 8305, - "time_per_iteration": 2.6671528816223145 - }, - { - "auxiliary_loss_clip": 0.01024768, - "auxiliary_loss_mlp": 0.01003944, - "balance_loss_clip": 1.02671385, - "balance_loss_mlp": 1.00258529, - "epoch": 0.4993837366601533, - "flos": 70420573797120.0, - "grad_norm": 0.7597400638433706, - "language_loss": 0.56867081, - "learning_rate": 2.1010222992973107e-06, - "loss": 0.58895797, - "num_input_tokens_seen": 178576740, - "step": 8306, - "time_per_iteration": 3.322448492050171 - }, - { - "auxiliary_loss_clip": 0.01127996, - "auxiliary_loss_mlp": 0.01039143, - "balance_loss_clip": 1.04948568, - "balance_loss_mlp": 1.02432525, - "epoch": 0.4994438599128213, - "flos": 15961216440960.0, - "grad_norm": 2.2302114161499236, - "language_loss": 0.82741839, - "learning_rate": 2.1006333328518556e-06, - "loss": 0.84908974, - "num_input_tokens_seen": 178594745, - "step": 8307, - "time_per_iteration": 2.583996295928955 - }, - { - "auxiliary_loss_clip": 0.01126994, - "auxiliary_loss_mlp": 0.0103608, - "balance_loss_clip": 1.04805601, - "balance_loss_mlp": 1.02157855, - "epoch": 0.4995039831654893, - "flos": 27928303228800.0, - "grad_norm": 1.7094622949229625, - "language_loss": 0.60939324, - "learning_rate": 2.1002443625903748e-06, - "loss": 0.63102394, - "num_input_tokens_seen": 178614110, - "step": 8308, - "time_per_iteration": 2.6170315742492676 - }, - { - "auxiliary_loss_clip": 0.01120806, - "auxiliary_loss_mlp": 0.01031842, - "balance_loss_clip": 1.04421234, - "balance_loss_mlp": 1.01890182, - "epoch": 0.49956410641815724, - "flos": 24204797619840.0, - "grad_norm": 1.8375312667766532, - "language_loss": 0.74889386, - "learning_rate": 2.0998553885276168e-06, - "loss": 0.77042031, - "num_input_tokens_seen": 178634170, - "step": 8309, - "time_per_iteration": 2.6147258281707764 - }, - { - "auxiliary_loss_clip": 0.01102514, - "auxiliary_loss_mlp": 0.0103405, - "balance_loss_clip": 1.04401636, - "balance_loss_mlp": 1.02106261, - "epoch": 0.4996242296708252, - "flos": 16180127879040.0, - "grad_norm": 3.148005555228763, - "language_loss": 0.79502416, - "learning_rate": 2.0994664106783335e-06, - "loss": 0.8163898, - "num_input_tokens_seen": 178651775, - "step": 8310, - "time_per_iteration": 2.6420629024505615 - }, - { - "auxiliary_loss_clip": 0.01111922, - "auxiliary_loss_mlp": 0.01040825, - "balance_loss_clip": 1.04564738, - "balance_loss_mlp": 1.02757514, - "epoch": 0.49968435292349317, - "flos": 16873527000960.0, - "grad_norm": 1.4976626914983278, - "language_loss": 0.70989597, - "learning_rate": 2.0990774290572735e-06, - "loss": 0.73142344, - "num_input_tokens_seen": 178669720, - "step": 8311, - "time_per_iteration": 2.5778110027313232 - }, - { - "auxiliary_loss_clip": 0.01098554, - "auxiliary_loss_mlp": 0.01036482, - "balance_loss_clip": 1.04628289, - "balance_loss_mlp": 1.02355957, - "epoch": 0.49974447617616113, - "flos": 14939521989120.0, - "grad_norm": 2.0443790290482498, - "language_loss": 0.77375191, - "learning_rate": 2.098688443679187e-06, - "loss": 0.79510236, - "num_input_tokens_seen": 178686765, - "step": 8312, - "time_per_iteration": 2.6517751216888428 - }, - { - "auxiliary_loss_clip": 0.01095231, - "auxiliary_loss_mlp": 0.01035354, - "balance_loss_clip": 1.04635751, - "balance_loss_mlp": 1.02135265, - "epoch": 0.4998045994288291, - "flos": 26651535321600.0, - "grad_norm": 1.7937215644313522, - "language_loss": 0.84479403, - "learning_rate": 2.0982994545588256e-06, - "loss": 0.86609983, - "num_input_tokens_seen": 178705845, - "step": 8313, - "time_per_iteration": 2.7882683277130127 - }, - { - "auxiliary_loss_clip": 0.01098533, - "auxiliary_loss_mlp": 0.01032666, - "balance_loss_clip": 1.04393864, - "balance_loss_mlp": 1.01856351, - "epoch": 0.49986472268149706, - "flos": 20953768533120.0, - "grad_norm": 1.8469644022391951, - "language_loss": 0.80625784, - "learning_rate": 2.097910461710939e-06, - "loss": 0.82756978, - "num_input_tokens_seen": 178723410, - "step": 8314, - "time_per_iteration": 2.6792070865631104 - }, - { - "auxiliary_loss_clip": 0.01093189, - "auxiliary_loss_mlp": 0.00772869, - "balance_loss_clip": 1.04282761, - "balance_loss_mlp": 1.00018048, - "epoch": 0.49992484593416503, - "flos": 22783884433920.0, - "grad_norm": 1.9116629548957604, - "language_loss": 0.79824436, - "learning_rate": 2.0975214651502773e-06, - "loss": 0.8169049, - "num_input_tokens_seen": 178743560, - "step": 8315, - "time_per_iteration": 2.885185718536377 - }, - { - "auxiliary_loss_clip": 0.01126333, - "auxiliary_loss_mlp": 0.01033363, - "balance_loss_clip": 1.04775071, - "balance_loss_mlp": 1.02025628, - "epoch": 0.499984969186833, - "flos": 46786970252160.0, - "grad_norm": 1.6207947092177402, - "language_loss": 0.74976832, - "learning_rate": 2.0971324648915926e-06, - "loss": 0.77136528, - "num_input_tokens_seen": 178767225, - "step": 8316, - "time_per_iteration": 2.865182399749756 - }, - { - "auxiliary_loss_clip": 0.01104962, - "auxiliary_loss_mlp": 0.0103454, - "balance_loss_clip": 1.04472423, - "balance_loss_mlp": 1.02195168, - "epoch": 0.500045092439501, - "flos": 25556978131200.0, - "grad_norm": 1.839667572981257, - "language_loss": 0.81122506, - "learning_rate": 2.0967434609496343e-06, - "loss": 0.83262014, - "num_input_tokens_seen": 178786810, - "step": 8317, - "time_per_iteration": 2.781627893447876 - }, - { - "auxiliary_loss_clip": 0.011005, - "auxiliary_loss_mlp": 0.01038819, - "balance_loss_clip": 1.04331255, - "balance_loss_mlp": 1.02368522, - "epoch": 0.5001052156921689, - "flos": 20704764476160.0, - "grad_norm": 1.6607654789374993, - "language_loss": 0.83369392, - "learning_rate": 2.0963544533391548e-06, - "loss": 0.8550871, - "num_input_tokens_seen": 178805660, - "step": 8318, - "time_per_iteration": 2.790937662124634 - }, - { - "auxiliary_loss_clip": 0.01114137, - "auxiliary_loss_mlp": 0.01032915, - "balance_loss_clip": 1.04552984, - "balance_loss_mlp": 1.01974225, - "epoch": 0.500165338944837, - "flos": 21251109317760.0, - "grad_norm": 1.7594247797212967, - "language_loss": 0.81800634, - "learning_rate": 2.0959654420749045e-06, - "loss": 0.83947688, - "num_input_tokens_seen": 178824780, - "step": 8319, - "time_per_iteration": 2.6710760593414307 - }, - { - "auxiliary_loss_clip": 0.01080263, - "auxiliary_loss_mlp": 0.01030013, - "balance_loss_clip": 1.03828013, - "balance_loss_mlp": 1.01689363, - "epoch": 0.5002254621975049, - "flos": 27854398995840.0, - "grad_norm": 1.5279258864896563, - "language_loss": 0.71943277, - "learning_rate": 2.095576427171635e-06, - "loss": 0.7405355, - "num_input_tokens_seen": 178845640, - "step": 8320, - "time_per_iteration": 2.7864880561828613 - }, - { - "auxiliary_loss_clip": 0.01093478, - "auxiliary_loss_mlp": 0.01044698, - "balance_loss_clip": 1.04542255, - "balance_loss_mlp": 1.02964222, - "epoch": 0.5002855854501729, - "flos": 15551941898880.0, - "grad_norm": 2.783304711521318, - "language_loss": 0.76481223, - "learning_rate": 2.0951874086440978e-06, - "loss": 0.78619403, - "num_input_tokens_seen": 178862290, - "step": 8321, - "time_per_iteration": 2.7580785751342773 - }, - { - "auxiliary_loss_clip": 0.01115908, - "auxiliary_loss_mlp": 0.00771212, - "balance_loss_clip": 1.04681301, - "balance_loss_mlp": 1.00017464, - "epoch": 0.5003457087028408, - "flos": 16107408794880.0, - "grad_norm": 6.807525102727238, - "language_loss": 0.82965297, - "learning_rate": 2.0947983865070455e-06, - "loss": 0.84852415, - "num_input_tokens_seen": 178879805, - "step": 8322, - "time_per_iteration": 2.6580779552459717 - }, - { - "auxiliary_loss_clip": 0.01117442, - "auxiliary_loss_mlp": 0.0103527, - "balance_loss_clip": 1.0458411, - "balance_loss_mlp": 1.02163804, - "epoch": 0.5004058319555088, - "flos": 22710518904960.0, - "grad_norm": 2.2579769372834257, - "language_loss": 0.73329234, - "learning_rate": 2.094409360775228e-06, - "loss": 0.75481945, - "num_input_tokens_seen": 178896985, - "step": 8323, - "time_per_iteration": 2.6743083000183105 - }, - { - "auxiliary_loss_clip": 0.01086486, - "auxiliary_loss_mlp": 0.01036398, - "balance_loss_clip": 1.04470778, - "balance_loss_mlp": 1.02264738, - "epoch": 0.5004659552081767, - "flos": 30117956313600.0, - "grad_norm": 1.846103580376976, - "language_loss": 0.69483137, - "learning_rate": 2.0940203314633977e-06, - "loss": 0.71606022, - "num_input_tokens_seen": 178920605, - "step": 8324, - "time_per_iteration": 2.783973217010498 - }, - { - "auxiliary_loss_clip": 0.01106501, - "auxiliary_loss_mlp": 0.00771259, - "balance_loss_clip": 1.0422833, - "balance_loss_mlp": 1.0000751, - "epoch": 0.5005260784608447, - "flos": 18624710764800.0, - "grad_norm": 3.4520936591258224, - "language_loss": 0.72325313, - "learning_rate": 2.0936312985863077e-06, - "loss": 0.74203074, - "num_input_tokens_seen": 178937760, - "step": 8325, - "time_per_iteration": 4.274277448654175 - }, - { - "auxiliary_loss_clip": 0.01089915, - "auxiliary_loss_mlp": 0.01041836, - "balance_loss_clip": 1.04158878, - "balance_loss_mlp": 1.02669656, - "epoch": 0.5005862017135126, - "flos": 24859987649280.0, - "grad_norm": 1.7422514730064806, - "language_loss": 0.73518062, - "learning_rate": 2.093242262158709e-06, - "loss": 0.7564981, - "num_input_tokens_seen": 178957985, - "step": 8326, - "time_per_iteration": 4.3523108959198 - }, - { - "auxiliary_loss_clip": 0.01094661, - "auxiliary_loss_mlp": 0.01032547, - "balance_loss_clip": 1.04201293, - "balance_loss_mlp": 1.01984525, - "epoch": 0.5006463249661807, - "flos": 18734381965440.0, - "grad_norm": 1.5476902232241379, - "language_loss": 0.78111005, - "learning_rate": 2.0928532221953544e-06, - "loss": 0.80238211, - "num_input_tokens_seen": 178977070, - "step": 8327, - "time_per_iteration": 4.4682557582855225 - }, - { - "auxiliary_loss_clip": 0.01128169, - "auxiliary_loss_mlp": 0.01040162, - "balance_loss_clip": 1.04810429, - "balance_loss_mlp": 1.02641153, - "epoch": 0.5007064482188487, - "flos": 13042145871360.0, - "grad_norm": 2.1714411479157296, - "language_loss": 0.88089001, - "learning_rate": 2.092464178710997e-06, - "loss": 0.90257335, - "num_input_tokens_seen": 178994175, - "step": 8328, - "time_per_iteration": 2.5710413455963135 - }, - { - "auxiliary_loss_clip": 0.01091641, - "auxiliary_loss_mlp": 0.01034728, - "balance_loss_clip": 1.04136801, - "balance_loss_mlp": 1.02050591, - "epoch": 0.5007665714715166, - "flos": 21288671965440.0, - "grad_norm": 2.863428491996577, - "language_loss": 0.73827946, - "learning_rate": 2.092075131720388e-06, - "loss": 0.75954318, - "num_input_tokens_seen": 179013710, - "step": 8329, - "time_per_iteration": 2.7770020961761475 - }, - { - "auxiliary_loss_clip": 0.01124061, - "auxiliary_loss_mlp": 0.0103094, - "balance_loss_clip": 1.04667771, - "balance_loss_mlp": 1.01824427, - "epoch": 0.5008266947241846, - "flos": 29754576374400.0, - "grad_norm": 1.6131098934363575, - "language_loss": 0.79715234, - "learning_rate": 2.091686081238281e-06, - "loss": 0.81870234, - "num_input_tokens_seen": 179035255, - "step": 8330, - "time_per_iteration": 4.167505979537964 - }, - { - "auxiliary_loss_clip": 0.01021039, - "auxiliary_loss_mlp": 0.00752271, - "balance_loss_clip": 1.02094173, - "balance_loss_mlp": 0.9997682, - "epoch": 0.5008868179768525, - "flos": 63557829204480.0, - "grad_norm": 0.7263095406539528, - "language_loss": 0.5601325, - "learning_rate": 2.0912970272794282e-06, - "loss": 0.5778656, - "num_input_tokens_seen": 179090915, - "step": 8331, - "time_per_iteration": 3.008077621459961 - }, - { - "auxiliary_loss_clip": 0.01112181, - "auxiliary_loss_mlp": 0.01035155, - "balance_loss_clip": 1.04617071, - "balance_loss_mlp": 1.02216136, - "epoch": 0.5009469412295205, - "flos": 27375637593600.0, - "grad_norm": 2.025315423078993, - "language_loss": 0.65264666, - "learning_rate": 2.0909079698585833e-06, - "loss": 0.67412001, - "num_input_tokens_seen": 179109160, - "step": 8332, - "time_per_iteration": 2.6730518341064453 - }, - { - "auxiliary_loss_clip": 0.01120357, - "auxiliary_loss_mlp": 0.01033936, - "balance_loss_clip": 1.04410577, - "balance_loss_mlp": 1.02124023, - "epoch": 0.5010070644821885, - "flos": 27378833904000.0, - "grad_norm": 1.5954618594032755, - "language_loss": 0.75023079, - "learning_rate": 2.0905189089904993e-06, - "loss": 0.7717737, - "num_input_tokens_seen": 179130610, - "step": 8333, - "time_per_iteration": 2.685154914855957 - }, - { - "auxiliary_loss_clip": 0.01125291, - "auxiliary_loss_mlp": 0.01035107, - "balance_loss_clip": 1.04558921, - "balance_loss_mlp": 1.02145159, - "epoch": 0.5010671877348565, - "flos": 20662748542080.0, - "grad_norm": 1.9338828530124208, - "language_loss": 0.80424768, - "learning_rate": 2.090129844689929e-06, - "loss": 0.82585168, - "num_input_tokens_seen": 179147860, - "step": 8334, - "time_per_iteration": 2.627230405807495 - }, - { - "auxiliary_loss_clip": 0.01037349, - "auxiliary_loss_mlp": 0.01004574, - "balance_loss_clip": 1.02146554, - "balance_loss_mlp": 1.00316703, - "epoch": 0.5011273109875244, - "flos": 59128645000320.0, - "grad_norm": 0.8902108893007158, - "language_loss": 0.62708843, - "learning_rate": 2.089740776971626e-06, - "loss": 0.64750767, - "num_input_tokens_seen": 179210490, - "step": 8335, - "time_per_iteration": 3.2171308994293213 - }, - { - "auxiliary_loss_clip": 0.01110054, - "auxiliary_loss_mlp": 0.01029223, - "balance_loss_clip": 1.04289985, - "balance_loss_mlp": 1.01612818, - "epoch": 0.5011874342401924, - "flos": 25336342840320.0, - "grad_norm": 1.3859166459285381, - "language_loss": 0.79553854, - "learning_rate": 2.0893517058503435e-06, - "loss": 0.81693137, - "num_input_tokens_seen": 179231360, - "step": 8336, - "time_per_iteration": 2.6930394172668457 - }, - { - "auxiliary_loss_clip": 0.01082861, - "auxiliary_loss_mlp": 0.01032761, - "balance_loss_clip": 1.03948808, - "balance_loss_mlp": 1.01899827, - "epoch": 0.5012475574928603, - "flos": 20229953569920.0, - "grad_norm": 2.2337029404169457, - "language_loss": 0.80255198, - "learning_rate": 2.088962631340836e-06, - "loss": 0.82370824, - "num_input_tokens_seen": 179250625, - "step": 8337, - "time_per_iteration": 2.725379467010498 - }, - { - "auxiliary_loss_clip": 0.01129165, - "auxiliary_loss_mlp": 0.01038167, - "balance_loss_clip": 1.04644942, - "balance_loss_mlp": 1.0239507, - "epoch": 0.5013076807455283, - "flos": 22710123855360.0, - "grad_norm": 2.0126131839523835, - "language_loss": 0.79470736, - "learning_rate": 2.0885735534578555e-06, - "loss": 0.81638074, - "num_input_tokens_seen": 179267360, - "step": 8338, - "time_per_iteration": 2.6641087532043457 - }, - { - "auxiliary_loss_clip": 0.01100565, - "auxiliary_loss_mlp": 0.01029861, - "balance_loss_clip": 1.04381251, - "balance_loss_mlp": 1.01617527, - "epoch": 0.5013678039981962, - "flos": 24245161528320.0, - "grad_norm": 1.6605427604759349, - "language_loss": 0.85052264, - "learning_rate": 2.0881844722161583e-06, - "loss": 0.87182683, - "num_input_tokens_seen": 179289810, - "step": 8339, - "time_per_iteration": 2.7899603843688965 - }, - { - "auxiliary_loss_clip": 0.0111167, - "auxiliary_loss_mlp": 0.01037127, - "balance_loss_clip": 1.04381561, - "balance_loss_mlp": 1.02343023, - "epoch": 0.5014279272508643, - "flos": 26176688501760.0, - "grad_norm": 1.4822129376950433, - "language_loss": 0.70713747, - "learning_rate": 2.0877953876304962e-06, - "loss": 0.72862542, - "num_input_tokens_seen": 179310620, - "step": 8340, - "time_per_iteration": 2.773681402206421 - }, - { - "auxiliary_loss_clip": 0.01088541, - "auxiliary_loss_mlp": 0.01043525, - "balance_loss_clip": 1.04147744, - "balance_loss_mlp": 1.02764666, - "epoch": 0.5014880505035323, - "flos": 21430446946560.0, - "grad_norm": 1.9911594693512178, - "language_loss": 0.78301972, - "learning_rate": 2.0874062997156245e-06, - "loss": 0.80434036, - "num_input_tokens_seen": 179329005, - "step": 8341, - "time_per_iteration": 2.7607786655426025 - }, - { - "auxiliary_loss_clip": 0.01096808, - "auxiliary_loss_mlp": 0.01038511, - "balance_loss_clip": 1.04584622, - "balance_loss_mlp": 1.02391934, - "epoch": 0.5015481737562002, - "flos": 15770745596160.0, - "grad_norm": 4.243666050944008, - "language_loss": 0.89054161, - "learning_rate": 2.0870172084862975e-06, - "loss": 0.9118948, - "num_input_tokens_seen": 179343785, - "step": 8342, - "time_per_iteration": 2.7108232975006104 - }, - { - "auxiliary_loss_clip": 0.01103427, - "auxiliary_loss_mlp": 0.01036162, - "balance_loss_clip": 1.04467797, - "balance_loss_mlp": 1.02273893, - "epoch": 0.5016082970088682, - "flos": 26830801123200.0, - "grad_norm": 1.768885433843204, - "language_loss": 0.76325786, - "learning_rate": 2.0866281139572682e-06, - "loss": 0.78465378, - "num_input_tokens_seen": 179364070, - "step": 8343, - "time_per_iteration": 2.6551196575164795 - }, - { - "auxiliary_loss_clip": 0.01113632, - "auxiliary_loss_mlp": 0.01028707, - "balance_loss_clip": 1.04612589, - "balance_loss_mlp": 1.01574898, - "epoch": 0.5016684202615361, - "flos": 21470595373440.0, - "grad_norm": 1.8502078003194165, - "language_loss": 0.6725269, - "learning_rate": 2.086239016143293e-06, - "loss": 0.6939503, - "num_input_tokens_seen": 179384225, - "step": 8344, - "time_per_iteration": 2.634850263595581 - }, - { - "auxiliary_loss_clip": 0.01104392, - "auxiliary_loss_mlp": 0.0103805, - "balance_loss_clip": 1.04439509, - "balance_loss_mlp": 1.025056, - "epoch": 0.5017285435142042, - "flos": 26246821806720.0, - "grad_norm": 2.403480744645997, - "language_loss": 0.75519335, - "learning_rate": 2.0858499150591258e-06, - "loss": 0.77661783, - "num_input_tokens_seen": 179402595, - "step": 8345, - "time_per_iteration": 2.7551872730255127 - }, - { - "auxiliary_loss_clip": 0.01111042, - "auxiliary_loss_mlp": 0.01031467, - "balance_loss_clip": 1.04757214, - "balance_loss_mlp": 1.01661348, - "epoch": 0.5017886667668721, - "flos": 20777555387520.0, - "grad_norm": 2.18282722391055, - "language_loss": 0.78664625, - "learning_rate": 2.0854608107195203e-06, - "loss": 0.80807132, - "num_input_tokens_seen": 179419635, - "step": 8346, - "time_per_iteration": 2.661569833755493 - }, - { - "auxiliary_loss_clip": 0.01102528, - "auxiliary_loss_mlp": 0.00770029, - "balance_loss_clip": 1.04322028, - "balance_loss_mlp": 1.00006032, - "epoch": 0.5018487900195401, - "flos": 20156408472960.0, - "grad_norm": 1.5952257408001917, - "language_loss": 0.69384575, - "learning_rate": 2.0850717031392333e-06, - "loss": 0.71257138, - "num_input_tokens_seen": 179438770, - "step": 8347, - "time_per_iteration": 2.7273542881011963 - }, - { - "auxiliary_loss_clip": 0.0108784, - "auxiliary_loss_mlp": 0.01037703, - "balance_loss_clip": 1.04173744, - "balance_loss_mlp": 1.02352858, - "epoch": 0.501908913272208, - "flos": 18150689957760.0, - "grad_norm": 1.852088117198485, - "language_loss": 0.70635176, - "learning_rate": 2.0846825923330174e-06, - "loss": 0.72760713, - "num_input_tokens_seen": 179457475, - "step": 8348, - "time_per_iteration": 2.7395875453948975 - }, - { - "auxiliary_loss_clip": 0.01110808, - "auxiliary_loss_mlp": 0.01035347, - "balance_loss_clip": 1.04538929, - "balance_loss_mlp": 1.02306843, - "epoch": 0.501969036524876, - "flos": 23112287504640.0, - "grad_norm": 1.775170825025465, - "language_loss": 0.74760187, - "learning_rate": 2.0842934783156303e-06, - "loss": 0.76906341, - "num_input_tokens_seen": 179478140, - "step": 8349, - "time_per_iteration": 2.6996099948883057 - }, - { - "auxiliary_loss_clip": 0.01112401, - "auxiliary_loss_mlp": 0.01034238, - "balance_loss_clip": 1.0427202, - "balance_loss_mlp": 1.01971805, - "epoch": 0.5020291597775439, - "flos": 11363214314880.0, - "grad_norm": 2.078287176668375, - "language_loss": 0.63625813, - "learning_rate": 2.0839043611018266e-06, - "loss": 0.6577245, - "num_input_tokens_seen": 179494325, - "step": 8350, - "time_per_iteration": 2.6264822483062744 - }, - { - "auxiliary_loss_clip": 0.01015981, - "auxiliary_loss_mlp": 0.01015388, - "balance_loss_clip": 1.01908755, - "balance_loss_mlp": 1.01377916, - "epoch": 0.5020892830302119, - "flos": 64011094928640.0, - "grad_norm": 0.7752505604108973, - "language_loss": 0.59761232, - "learning_rate": 2.0835152407063597e-06, - "loss": 0.617926, - "num_input_tokens_seen": 179553545, - "step": 8351, - "time_per_iteration": 3.4168505668640137 - }, - { - "auxiliary_loss_clip": 0.01100468, - "auxiliary_loss_mlp": 0.0103649, - "balance_loss_clip": 1.04387021, - "balance_loss_mlp": 1.02232814, - "epoch": 0.5021494062828799, - "flos": 23732859801600.0, - "grad_norm": 1.746970205481512, - "language_loss": 0.74981982, - "learning_rate": 2.0831261171439873e-06, - "loss": 0.77118939, - "num_input_tokens_seen": 179573645, - "step": 8352, - "time_per_iteration": 2.7219762802124023 - }, - { - "auxiliary_loss_clip": 0.01097371, - "auxiliary_loss_mlp": 0.0103593, - "balance_loss_clip": 1.04458284, - "balance_loss_mlp": 1.02211952, - "epoch": 0.5022095295355479, - "flos": 21576747041280.0, - "grad_norm": 1.6929263676943664, - "language_loss": 0.71971965, - "learning_rate": 2.082736990429464e-06, - "loss": 0.74105263, - "num_input_tokens_seen": 179591435, - "step": 8353, - "time_per_iteration": 2.6912848949432373 - }, - { - "auxiliary_loss_clip": 0.01123337, - "auxiliary_loss_mlp": 0.01037374, - "balance_loss_clip": 1.05196476, - "balance_loss_mlp": 1.02265787, - "epoch": 0.5022696527882159, - "flos": 21397229844480.0, - "grad_norm": 1.8297806631316527, - "language_loss": 0.74025398, - "learning_rate": 2.0823478605775455e-06, - "loss": 0.76186109, - "num_input_tokens_seen": 179609955, - "step": 8354, - "time_per_iteration": 2.7325775623321533 - }, - { - "auxiliary_loss_clip": 0.0110051, - "auxiliary_loss_mlp": 0.01042571, - "balance_loss_clip": 1.04367399, - "balance_loss_mlp": 1.02817094, - "epoch": 0.5023297760408838, - "flos": 27160712565120.0, - "grad_norm": 1.8324523966840642, - "language_loss": 0.72395205, - "learning_rate": 2.0819587276029884e-06, - "loss": 0.74538279, - "num_input_tokens_seen": 179630875, - "step": 8355, - "time_per_iteration": 2.717954158782959 - }, - { - "auxiliary_loss_clip": 0.01117118, - "auxiliary_loss_mlp": 0.01041207, - "balance_loss_clip": 1.0459739, - "balance_loss_mlp": 1.02644253, - "epoch": 0.5023898992935518, - "flos": 26213820186240.0, - "grad_norm": 1.6992540953340016, - "language_loss": 0.81400853, - "learning_rate": 2.081569591520548e-06, - "loss": 0.83559179, - "num_input_tokens_seen": 179649835, - "step": 8356, - "time_per_iteration": 2.7149479389190674 - }, - { - "auxiliary_loss_clip": 0.01117006, - "auxiliary_loss_mlp": 0.01044256, - "balance_loss_clip": 1.04384911, - "balance_loss_mlp": 1.02906859, - "epoch": 0.5024500225462197, - "flos": 13440323111040.0, - "grad_norm": 2.281950898223197, - "language_loss": 0.76235557, - "learning_rate": 2.0811804523449803e-06, - "loss": 0.78396809, - "num_input_tokens_seen": 179667605, - "step": 8357, - "time_per_iteration": 2.6641504764556885 - }, - { - "auxiliary_loss_clip": 0.01115092, - "auxiliary_loss_mlp": 0.01038737, - "balance_loss_clip": 1.04538774, - "balance_loss_mlp": 1.02369308, - "epoch": 0.5025101457988878, - "flos": 21579584215680.0, - "grad_norm": 1.606830870939079, - "language_loss": 0.766074, - "learning_rate": 2.0807913100910417e-06, - "loss": 0.78761232, - "num_input_tokens_seen": 179686910, - "step": 8358, - "time_per_iteration": 2.715304136276245 - }, - { - "auxiliary_loss_clip": 0.01101769, - "auxiliary_loss_mlp": 0.0103829, - "balance_loss_clip": 1.04243326, - "balance_loss_mlp": 1.02330494, - "epoch": 0.5025702690515557, - "flos": 24645134448000.0, - "grad_norm": 2.4091387510851354, - "language_loss": 0.72286153, - "learning_rate": 2.0804021647734887e-06, - "loss": 0.7442621, - "num_input_tokens_seen": 179706395, - "step": 8359, - "time_per_iteration": 2.7783002853393555 - }, - { - "auxiliary_loss_clip": 0.01097913, - "auxiliary_loss_mlp": 0.01045718, - "balance_loss_clip": 1.04463625, - "balance_loss_mlp": 1.03208613, - "epoch": 0.5026303923042237, - "flos": 22090162089600.0, - "grad_norm": 1.9040983502257391, - "language_loss": 0.76839483, - "learning_rate": 2.080013016407077e-06, - "loss": 0.7898311, - "num_input_tokens_seen": 179725735, - "step": 8360, - "time_per_iteration": 2.6632778644561768 - }, - { - "auxiliary_loss_clip": 0.01085631, - "auxiliary_loss_mlp": 0.01038787, - "balance_loss_clip": 1.04737091, - "balance_loss_mlp": 1.02541208, - "epoch": 0.5026905155568916, - "flos": 23697200574720.0, - "grad_norm": 1.9221287440607566, - "language_loss": 0.7667141, - "learning_rate": 2.0796238650065645e-06, - "loss": 0.78795838, - "num_input_tokens_seen": 179746150, - "step": 8361, - "time_per_iteration": 2.7411348819732666 - }, - { - "auxiliary_loss_clip": 0.01096697, - "auxiliary_loss_mlp": 0.01034867, - "balance_loss_clip": 1.04426289, - "balance_loss_mlp": 1.01988244, - "epoch": 0.5027506388095596, - "flos": 25812410722560.0, - "grad_norm": 1.5686217043676736, - "language_loss": 0.85069525, - "learning_rate": 2.0792347105867065e-06, - "loss": 0.87201089, - "num_input_tokens_seen": 179767550, - "step": 8362, - "time_per_iteration": 2.827319622039795 - }, - { - "auxiliary_loss_clip": 0.01102707, - "auxiliary_loss_mlp": 0.01033879, - "balance_loss_clip": 1.0435946, - "balance_loss_mlp": 1.02022946, - "epoch": 0.5028107620622275, - "flos": 27526606456320.0, - "grad_norm": 1.54737690881779, - "language_loss": 0.78134143, - "learning_rate": 2.0788455531622605e-06, - "loss": 0.80270725, - "num_input_tokens_seen": 179790075, - "step": 8363, - "time_per_iteration": 2.76174259185791 - }, - { - "auxiliary_loss_clip": 0.01111576, - "auxiliary_loss_mlp": 0.01035086, - "balance_loss_clip": 1.04562223, - "balance_loss_mlp": 1.02060819, - "epoch": 0.5028708853148955, - "flos": 24534278098560.0, - "grad_norm": 3.229087026174198, - "language_loss": 0.75995886, - "learning_rate": 2.0784563927479838e-06, - "loss": 0.78142548, - "num_input_tokens_seen": 179806515, - "step": 8364, - "time_per_iteration": 4.35154914855957 - }, - { - "auxiliary_loss_clip": 0.01124922, - "auxiliary_loss_mlp": 0.01030963, - "balance_loss_clip": 1.04685044, - "balance_loss_mlp": 1.01810658, - "epoch": 0.5029310085675635, - "flos": 20813609664000.0, - "grad_norm": 1.5241312757107228, - "language_loss": 0.69465041, - "learning_rate": 2.0780672293586317e-06, - "loss": 0.71620929, - "num_input_tokens_seen": 179826450, - "step": 8365, - "time_per_iteration": 2.619415283203125 - }, - { - "auxiliary_loss_clip": 0.01103666, - "auxiliary_loss_mlp": 0.01034829, - "balance_loss_clip": 1.04435158, - "balance_loss_mlp": 1.0207144, - "epoch": 0.5029911318202315, - "flos": 22342470197760.0, - "grad_norm": 1.4884180792885182, - "language_loss": 0.73293805, - "learning_rate": 2.0776780630089635e-06, - "loss": 0.75432301, - "num_input_tokens_seen": 179846770, - "step": 8366, - "time_per_iteration": 4.228264331817627 - }, - { - "auxiliary_loss_clip": 0.01113401, - "auxiliary_loss_mlp": 0.01032302, - "balance_loss_clip": 1.04693627, - "balance_loss_mlp": 1.0189749, - "epoch": 0.5030512550728995, - "flos": 24352713826560.0, - "grad_norm": 1.4343945223262573, - "language_loss": 0.7806654, - "learning_rate": 2.077288893713735e-06, - "loss": 0.80212247, - "num_input_tokens_seen": 179866585, - "step": 8367, - "time_per_iteration": 4.1336071491241455 - }, - { - "auxiliary_loss_clip": 0.01113589, - "auxiliary_loss_mlp": 0.01031042, - "balance_loss_clip": 1.0443697, - "balance_loss_mlp": 1.01778555, - "epoch": 0.5031113783255674, - "flos": 18259930195200.0, - "grad_norm": 1.686368676940742, - "language_loss": 0.69880998, - "learning_rate": 2.0768997214877035e-06, - "loss": 0.72025627, - "num_input_tokens_seen": 179885575, - "step": 8368, - "time_per_iteration": 2.5836374759674072 - }, - { - "auxiliary_loss_clip": 0.01036914, - "auxiliary_loss_mlp": 0.01003217, - "balance_loss_clip": 1.0201298, - "balance_loss_mlp": 1.00156045, - "epoch": 0.5031715015782354, - "flos": 57253173200640.0, - "grad_norm": 0.8467965026864039, - "language_loss": 0.63315928, - "learning_rate": 2.0765105463456274e-06, - "loss": 0.65356052, - "num_input_tokens_seen": 179939650, - "step": 8369, - "time_per_iteration": 4.438805103302002 - }, - { - "auxiliary_loss_clip": 0.011076, - "auxiliary_loss_mlp": 0.01034663, - "balance_loss_clip": 1.04427028, - "balance_loss_mlp": 1.0215379, - "epoch": 0.5032316248309033, - "flos": 27527360641920.0, - "grad_norm": 2.0752589468807043, - "language_loss": 0.60782373, - "learning_rate": 2.076121368302263e-06, - "loss": 0.62924629, - "num_input_tokens_seen": 179961765, - "step": 8370, - "time_per_iteration": 2.65816330909729 - }, - { - "auxiliary_loss_clip": 0.01076531, - "auxiliary_loss_mlp": 0.01043773, - "balance_loss_clip": 1.04144311, - "balance_loss_mlp": 1.02868104, - "epoch": 0.5032917480835714, - "flos": 34495825939200.0, - "grad_norm": 1.8954281033433134, - "language_loss": 0.68462563, - "learning_rate": 2.0757321873723695e-06, - "loss": 0.70582867, - "num_input_tokens_seen": 179983015, - "step": 8371, - "time_per_iteration": 2.8479132652282715 - }, - { - "auxiliary_loss_clip": 0.01097422, - "auxiliary_loss_mlp": 0.01034396, - "balance_loss_clip": 1.04120922, - "balance_loss_mlp": 1.019364, - "epoch": 0.5033518713362393, - "flos": 33656773167360.0, - "grad_norm": 1.6611598690743674, - "language_loss": 0.67656618, - "learning_rate": 2.0753430035707042e-06, - "loss": 0.69788438, - "num_input_tokens_seen": 180003210, - "step": 8372, - "time_per_iteration": 2.767489194869995 - }, - { - "auxiliary_loss_clip": 0.01085092, - "auxiliary_loss_mlp": 0.01043333, - "balance_loss_clip": 1.04139996, - "balance_loss_mlp": 1.02714443, - "epoch": 0.5034119945889073, - "flos": 28185495586560.0, - "grad_norm": 1.9018001021824607, - "language_loss": 0.66726547, - "learning_rate": 2.0749538169120235e-06, - "loss": 0.68854976, - "num_input_tokens_seen": 180025530, - "step": 8373, - "time_per_iteration": 2.7779579162597656 - }, - { - "auxiliary_loss_clip": 0.0109703, - "auxiliary_loss_mlp": 0.01035617, - "balance_loss_clip": 1.04184651, - "balance_loss_mlp": 1.02208042, - "epoch": 0.5034721178415752, - "flos": 21358697529600.0, - "grad_norm": 1.7065424378128664, - "language_loss": 0.74679291, - "learning_rate": 2.0745646274110872e-06, - "loss": 0.76811939, - "num_input_tokens_seen": 180043180, - "step": 8374, - "time_per_iteration": 2.673182487487793 - }, - { - "auxiliary_loss_clip": 0.01100104, - "auxiliary_loss_mlp": 0.01040932, - "balance_loss_clip": 1.04264212, - "balance_loss_mlp": 1.02604842, - "epoch": 0.5035322410942432, - "flos": 22674823764480.0, - "grad_norm": 1.5424981365737231, - "language_loss": 0.68154198, - "learning_rate": 2.0741754350826525e-06, - "loss": 0.70295238, - "num_input_tokens_seen": 180062905, - "step": 8375, - "time_per_iteration": 2.6842665672302246 - }, - { - "auxiliary_loss_clip": 0.01077033, - "auxiliary_loss_mlp": 0.01034566, - "balance_loss_clip": 1.04517126, - "balance_loss_mlp": 1.0195334, - "epoch": 0.5035923643469111, - "flos": 19828723674240.0, - "grad_norm": 3.5828954699990656, - "language_loss": 0.79316169, - "learning_rate": 2.0737862399414777e-06, - "loss": 0.81427765, - "num_input_tokens_seen": 180082000, - "step": 8376, - "time_per_iteration": 2.7780654430389404 - }, - { - "auxiliary_loss_clip": 0.01117369, - "auxiliary_loss_mlp": 0.00771622, - "balance_loss_clip": 1.04441619, - "balance_loss_mlp": 1.00016475, - "epoch": 0.5036524875995791, - "flos": 30514625182080.0, - "grad_norm": 2.6140774214814693, - "language_loss": 0.59478593, - "learning_rate": 2.0733970420023213e-06, - "loss": 0.61367583, - "num_input_tokens_seen": 180101340, - "step": 8377, - "time_per_iteration": 2.8071539402008057 - }, - { - "auxiliary_loss_clip": 0.01101437, - "auxiliary_loss_mlp": 0.01036815, - "balance_loss_clip": 1.04309344, - "balance_loss_mlp": 1.02237928, - "epoch": 0.5037126108522471, - "flos": 14720574637440.0, - "grad_norm": 2.0235166884987663, - "language_loss": 0.76598781, - "learning_rate": 2.0730078412799425e-06, - "loss": 0.78737032, - "num_input_tokens_seen": 180119160, - "step": 8378, - "time_per_iteration": 2.7332303524017334 - }, - { - "auxiliary_loss_clip": 0.01086538, - "auxiliary_loss_mlp": 0.01035008, - "balance_loss_clip": 1.04592919, - "balance_loss_mlp": 1.02190685, - "epoch": 0.5037727341049151, - "flos": 25297702784640.0, - "grad_norm": 1.7029006786118923, - "language_loss": 0.75000858, - "learning_rate": 2.0726186377890985e-06, - "loss": 0.77122402, - "num_input_tokens_seen": 180138730, - "step": 8379, - "time_per_iteration": 2.8803420066833496 - }, - { - "auxiliary_loss_clip": 0.0111301, - "auxiliary_loss_mlp": 0.01035016, - "balance_loss_clip": 1.04890418, - "balance_loss_mlp": 1.02151, - "epoch": 0.5038328573575831, - "flos": 28541764632960.0, - "grad_norm": 2.071075437448324, - "language_loss": 0.67026305, - "learning_rate": 2.072229431544548e-06, - "loss": 0.69174337, - "num_input_tokens_seen": 180158810, - "step": 8380, - "time_per_iteration": 2.7347092628479004 - }, - { - "auxiliary_loss_clip": 0.01070606, - "auxiliary_loss_mlp": 0.01037412, - "balance_loss_clip": 1.04154301, - "balance_loss_mlp": 1.02420914, - "epoch": 0.503892980610251, - "flos": 31649869503360.0, - "grad_norm": 1.7540511910669407, - "language_loss": 0.63245583, - "learning_rate": 2.071840222561051e-06, - "loss": 0.65353596, - "num_input_tokens_seen": 180179700, - "step": 8381, - "time_per_iteration": 2.836247444152832 - }, - { - "auxiliary_loss_clip": 0.01101604, - "auxiliary_loss_mlp": 0.01039312, - "balance_loss_clip": 1.04428375, - "balance_loss_mlp": 1.02624631, - "epoch": 0.503953103862919, - "flos": 27089358197760.0, - "grad_norm": 1.4852984664170332, - "language_loss": 0.67586917, - "learning_rate": 2.071451010853365e-06, - "loss": 0.69727832, - "num_input_tokens_seen": 180199890, - "step": 8382, - "time_per_iteration": 2.776895523071289 - }, - { - "auxiliary_loss_clip": 0.01115945, - "auxiliary_loss_mlp": 0.010349, - "balance_loss_clip": 1.04923749, - "balance_loss_mlp": 1.02039194, - "epoch": 0.5040132271155869, - "flos": 15632957024640.0, - "grad_norm": 2.370012953933875, - "language_loss": 0.62379169, - "learning_rate": 2.0710617964362506e-06, - "loss": 0.64530009, - "num_input_tokens_seen": 180217840, - "step": 8383, - "time_per_iteration": 2.7200045585632324 - }, - { - "auxiliary_loss_clip": 0.0108883, - "auxiliary_loss_mlp": 0.01037077, - "balance_loss_clip": 1.04611087, - "balance_loss_mlp": 1.02349341, - "epoch": 0.504073350368255, - "flos": 13590106824960.0, - "grad_norm": 1.70449565256652, - "language_loss": 0.66918409, - "learning_rate": 2.070672579324465e-06, - "loss": 0.69044316, - "num_input_tokens_seen": 180236465, - "step": 8384, - "time_per_iteration": 2.7442476749420166 - }, - { - "auxiliary_loss_clip": 0.01108405, - "auxiliary_loss_mlp": 0.01040675, - "balance_loss_clip": 1.04502487, - "balance_loss_mlp": 1.02765775, - "epoch": 0.5041334736209229, - "flos": 29058160510080.0, - "grad_norm": 3.2853523964565072, - "language_loss": 0.7103979, - "learning_rate": 2.0702833595327674e-06, - "loss": 0.73188871, - "num_input_tokens_seen": 180258025, - "step": 8385, - "time_per_iteration": 2.7480194568634033 - }, - { - "auxiliary_loss_clip": 0.01110668, - "auxiliary_loss_mlp": 0.01029456, - "balance_loss_clip": 1.0450182, - "balance_loss_mlp": 1.01644468, - "epoch": 0.5041935968735909, - "flos": 24608361899520.0, - "grad_norm": 1.9814049774657359, - "language_loss": 0.83344412, - "learning_rate": 2.069894137075919e-06, - "loss": 0.8548454, - "num_input_tokens_seen": 180277825, - "step": 8386, - "time_per_iteration": 2.703789234161377 - }, - { - "auxiliary_loss_clip": 0.01108831, - "auxiliary_loss_mlp": 0.01037004, - "balance_loss_clip": 1.04437232, - "balance_loss_mlp": 1.02313972, - "epoch": 0.5042537201262588, - "flos": 26286934320000.0, - "grad_norm": 1.592773103928685, - "language_loss": 0.66832674, - "learning_rate": 2.0695049119686766e-06, - "loss": 0.68978512, - "num_input_tokens_seen": 180300465, - "step": 8387, - "time_per_iteration": 2.8348472118377686 - }, - { - "auxiliary_loss_clip": 0.0106703, - "auxiliary_loss_mlp": 0.01033704, - "balance_loss_clip": 1.03972006, - "balance_loss_mlp": 1.02091861, - "epoch": 0.5043138433789268, - "flos": 22017371178240.0, - "grad_norm": 1.386335560273684, - "language_loss": 0.80273068, - "learning_rate": 2.0691156842258016e-06, - "loss": 0.82373804, - "num_input_tokens_seen": 180321050, - "step": 8388, - "time_per_iteration": 2.8797311782836914 - }, - { - "auxiliary_loss_clip": 0.01112016, - "auxiliary_loss_mlp": 0.01032606, - "balance_loss_clip": 1.04459918, - "balance_loss_mlp": 1.01927233, - "epoch": 0.5043739666315947, - "flos": 28767104605440.0, - "grad_norm": 2.1659708262729436, - "language_loss": 0.69815123, - "learning_rate": 2.0687264538620537e-06, - "loss": 0.7195974, - "num_input_tokens_seen": 180338870, - "step": 8389, - "time_per_iteration": 2.7739861011505127 - }, - { - "auxiliary_loss_clip": 0.01090981, - "auxiliary_loss_mlp": 0.01040643, - "balance_loss_clip": 1.04124045, - "balance_loss_mlp": 1.02756596, - "epoch": 0.5044340898842627, - "flos": 27599253713280.0, - "grad_norm": 1.6276843858059296, - "language_loss": 0.6986587, - "learning_rate": 2.068337220892191e-06, - "loss": 0.71997494, - "num_input_tokens_seen": 180361285, - "step": 8390, - "time_per_iteration": 2.844275712966919 - }, - { - "auxiliary_loss_clip": 0.01033792, - "auxiliary_loss_mlp": 0.01003101, - "balance_loss_clip": 1.02656126, - "balance_loss_mlp": 1.00192666, - "epoch": 0.5044942131369307, - "flos": 67458050749440.0, - "grad_norm": 0.9139771068710668, - "language_loss": 0.52933067, - "learning_rate": 2.067947985330974e-06, - "loss": 0.54969966, - "num_input_tokens_seen": 180415170, - "step": 8391, - "time_per_iteration": 3.054262638092041 - }, - { - "auxiliary_loss_clip": 0.01015619, - "auxiliary_loss_mlp": 0.01001074, - "balance_loss_clip": 1.02201819, - "balance_loss_mlp": 0.99963111, - "epoch": 0.5045543363895987, - "flos": 58630849390080.0, - "grad_norm": 0.853635093218063, - "language_loss": 0.60675329, - "learning_rate": 2.0675587471931628e-06, - "loss": 0.62692022, - "num_input_tokens_seen": 180468060, - "step": 8392, - "time_per_iteration": 3.0727028846740723 - }, - { - "auxiliary_loss_clip": 0.01085218, - "auxiliary_loss_mlp": 0.01036141, - "balance_loss_clip": 1.04148042, - "balance_loss_mlp": 1.02351034, - "epoch": 0.5046144596422667, - "flos": 22526620248960.0, - "grad_norm": 2.343143032045354, - "language_loss": 0.84343797, - "learning_rate": 2.067169506493517e-06, - "loss": 0.86465156, - "num_input_tokens_seen": 180486610, - "step": 8393, - "time_per_iteration": 2.7260749340057373 - }, - { - "auxiliary_loss_clip": 0.01087949, - "auxiliary_loss_mlp": 0.01033749, - "balance_loss_clip": 1.04098725, - "balance_loss_mlp": 1.02107096, - "epoch": 0.5046745828949346, - "flos": 27454246508160.0, - "grad_norm": 1.8418334138160795, - "language_loss": 0.50936127, - "learning_rate": 2.0667802632467974e-06, - "loss": 0.53057826, - "num_input_tokens_seen": 180508135, - "step": 8394, - "time_per_iteration": 2.827000617980957 - }, - { - "auxiliary_loss_clip": 0.01121524, - "auxiliary_loss_mlp": 0.0103809, - "balance_loss_clip": 1.04323471, - "balance_loss_mlp": 1.02311766, - "epoch": 0.5047347061476026, - "flos": 17274541415040.0, - "grad_norm": 1.5679941994223312, - "language_loss": 0.75414777, - "learning_rate": 2.0663910174677627e-06, - "loss": 0.7757439, - "num_input_tokens_seen": 180527000, - "step": 8395, - "time_per_iteration": 2.6535708904266357 - }, - { - "auxiliary_loss_clip": 0.01106012, - "auxiliary_loss_mlp": 0.01041618, - "balance_loss_clip": 1.04312563, - "balance_loss_mlp": 1.02860057, - "epoch": 0.5047948294002705, - "flos": 16649515831680.0, - "grad_norm": 2.0910564250698562, - "language_loss": 0.68781769, - "learning_rate": 2.0660017691711737e-06, - "loss": 0.70929396, - "num_input_tokens_seen": 180544715, - "step": 8396, - "time_per_iteration": 2.700747013092041 - }, - { - "auxiliary_loss_clip": 0.01111788, - "auxiliary_loss_mlp": 0.0103291, - "balance_loss_clip": 1.04604292, - "balance_loss_mlp": 1.02059579, - "epoch": 0.5048549526529386, - "flos": 26865706164480.0, - "grad_norm": 3.479269791703844, - "language_loss": 0.78899479, - "learning_rate": 2.065612518371792e-06, - "loss": 0.81044173, - "num_input_tokens_seen": 180565365, - "step": 8397, - "time_per_iteration": 2.716320514678955 - }, - { - "auxiliary_loss_clip": 0.01078686, - "auxiliary_loss_mlp": 0.01033767, - "balance_loss_clip": 1.04137075, - "balance_loss_mlp": 1.02079701, - "epoch": 0.5049150759056065, - "flos": 21833939399040.0, - "grad_norm": 3.435063442023246, - "language_loss": 0.66291559, - "learning_rate": 2.065223265084376e-06, - "loss": 0.68404007, - "num_input_tokens_seen": 180586670, - "step": 8398, - "time_per_iteration": 2.773245334625244 - }, - { - "auxiliary_loss_clip": 0.01113858, - "auxiliary_loss_mlp": 0.00770983, - "balance_loss_clip": 1.04783058, - "balance_loss_mlp": 1.00018215, - "epoch": 0.5049751991582745, - "flos": 21685807710720.0, - "grad_norm": 1.5640615321007765, - "language_loss": 0.720043, - "learning_rate": 2.064834009323688e-06, - "loss": 0.73889136, - "num_input_tokens_seen": 180605085, - "step": 8399, - "time_per_iteration": 2.697688341140747 - }, - { - "auxiliary_loss_clip": 0.01091578, - "auxiliary_loss_mlp": 0.01053063, - "balance_loss_clip": 1.04215539, - "balance_loss_mlp": 1.03741038, - "epoch": 0.5050353224109424, - "flos": 21359379888000.0, - "grad_norm": 3.5795224523825695, - "language_loss": 0.81615806, - "learning_rate": 2.0644447511044878e-06, - "loss": 0.8376044, - "num_input_tokens_seen": 180624370, - "step": 8400, - "time_per_iteration": 2.7172608375549316 - }, - { - "auxiliary_loss_clip": 0.01084985, - "auxiliary_loss_mlp": 0.01039311, - "balance_loss_clip": 1.04359269, - "balance_loss_mlp": 1.02413547, - "epoch": 0.5050954456636104, - "flos": 22820082364800.0, - "grad_norm": 1.9975954417395212, - "language_loss": 0.78901821, - "learning_rate": 2.0640554904415362e-06, - "loss": 0.81026119, - "num_input_tokens_seen": 180642450, - "step": 8401, - "time_per_iteration": 2.790361166000366 - }, - { - "auxiliary_loss_clip": 0.01125612, - "auxiliary_loss_mlp": 0.00770602, - "balance_loss_clip": 1.04576373, - "balance_loss_mlp": 1.00024748, - "epoch": 0.5051555689162783, - "flos": 30448226891520.0, - "grad_norm": 1.6142524162989784, - "language_loss": 0.70102769, - "learning_rate": 2.063666227349593e-06, - "loss": 0.7199899, - "num_input_tokens_seen": 180665250, - "step": 8402, - "time_per_iteration": 2.6950721740722656 - }, - { - "auxiliary_loss_clip": 0.01112822, - "auxiliary_loss_mlp": 0.00771289, - "balance_loss_clip": 1.04341567, - "balance_loss_mlp": 1.00022268, - "epoch": 0.5052156921689464, - "flos": 21287953693440.0, - "grad_norm": 2.3922403816883433, - "language_loss": 0.69298434, - "learning_rate": 2.063276961843422e-06, - "loss": 0.71182549, - "num_input_tokens_seen": 180687425, - "step": 8403, - "time_per_iteration": 4.257136344909668 - }, - { - "auxiliary_loss_clip": 0.01109967, - "auxiliary_loss_mlp": 0.01043124, - "balance_loss_clip": 1.04455948, - "balance_loss_mlp": 1.03021932, - "epoch": 0.5052758154216143, - "flos": 25081305298560.0, - "grad_norm": 1.6578366313908228, - "language_loss": 0.85693455, - "learning_rate": 2.062887693937781e-06, - "loss": 0.87846541, - "num_input_tokens_seen": 180708725, - "step": 8404, - "time_per_iteration": 2.725935459136963 - }, - { - "auxiliary_loss_clip": 0.01087696, - "auxiliary_loss_mlp": 0.00769912, - "balance_loss_clip": 1.04370379, - "balance_loss_mlp": 1.00018847, - "epoch": 0.5053359386742823, - "flos": 20885502735360.0, - "grad_norm": 1.5507323053673605, - "language_loss": 0.75329977, - "learning_rate": 2.0624984236474322e-06, - "loss": 0.77187586, - "num_input_tokens_seen": 180727990, - "step": 8405, - "time_per_iteration": 4.237490653991699 - }, - { - "auxiliary_loss_clip": 0.01124188, - "auxiliary_loss_mlp": 0.01031903, - "balance_loss_clip": 1.04560125, - "balance_loss_mlp": 1.01756775, - "epoch": 0.5053960619269503, - "flos": 37743335493120.0, - "grad_norm": 1.5851552924914987, - "language_loss": 0.73046809, - "learning_rate": 2.0621091509871378e-06, - "loss": 0.75202894, - "num_input_tokens_seen": 180749765, - "step": 8406, - "time_per_iteration": 4.387450218200684 - }, - { - "auxiliary_loss_clip": 0.0108276, - "auxiliary_loss_mlp": 0.01031932, - "balance_loss_clip": 1.04293895, - "balance_loss_mlp": 1.01945066, - "epoch": 0.5054561851796182, - "flos": 23513840622720.0, - "grad_norm": 1.8244341787972256, - "language_loss": 0.76631331, - "learning_rate": 2.0617198759716568e-06, - "loss": 0.78746021, - "num_input_tokens_seen": 180769580, - "step": 8407, - "time_per_iteration": 2.765031099319458 - }, - { - "auxiliary_loss_clip": 0.01085678, - "auxiliary_loss_mlp": 0.01030739, - "balance_loss_clip": 1.04038286, - "balance_loss_mlp": 1.01838887, - "epoch": 0.5055163084322862, - "flos": 30410233280640.0, - "grad_norm": 1.769865286909125, - "language_loss": 0.63482308, - "learning_rate": 2.0613305986157535e-06, - "loss": 0.65598726, - "num_input_tokens_seen": 180790295, - "step": 8408, - "time_per_iteration": 2.7497997283935547 - }, - { - "auxiliary_loss_clip": 0.01094613, - "auxiliary_loss_mlp": 0.01046938, - "balance_loss_clip": 1.04494774, - "balance_loss_mlp": 1.03097582, - "epoch": 0.5055764316849541, - "flos": 20259651139200.0, - "grad_norm": 1.9259425074827412, - "language_loss": 0.63427341, - "learning_rate": 2.0609413189341865e-06, - "loss": 0.655689, - "num_input_tokens_seen": 180807875, - "step": 8409, - "time_per_iteration": 4.083381652832031 - }, - { - "auxiliary_loss_clip": 0.01099903, - "auxiliary_loss_mlp": 0.01029856, - "balance_loss_clip": 1.04535913, - "balance_loss_mlp": 1.01790488, - "epoch": 0.5056365549376222, - "flos": 26070895969920.0, - "grad_norm": 2.0381050127162528, - "language_loss": 0.71175253, - "learning_rate": 2.0605520369417193e-06, - "loss": 0.73305017, - "num_input_tokens_seen": 180831300, - "step": 8410, - "time_per_iteration": 2.7279632091522217 - }, - { - "auxiliary_loss_clip": 0.01097675, - "auxiliary_loss_mlp": 0.0104194, - "balance_loss_clip": 1.04237318, - "balance_loss_mlp": 1.02787888, - "epoch": 0.5056966781902901, - "flos": 19279074781440.0, - "grad_norm": 1.4485544779958848, - "language_loss": 0.79037184, - "learning_rate": 2.060162752653113e-06, - "loss": 0.81176794, - "num_input_tokens_seen": 180849055, - "step": 8411, - "time_per_iteration": 2.6837332248687744 - }, - { - "auxiliary_loss_clip": 0.01125313, - "auxiliary_loss_mlp": 0.01039106, - "balance_loss_clip": 1.04655755, - "balance_loss_mlp": 1.02372837, - "epoch": 0.5057568014429581, - "flos": 21323325611520.0, - "grad_norm": 1.8986612146492552, - "language_loss": 0.81808418, - "learning_rate": 2.0597734660831285e-06, - "loss": 0.83972836, - "num_input_tokens_seen": 180867395, - "step": 8412, - "time_per_iteration": 2.615809679031372 - }, - { - "auxiliary_loss_clip": 0.01103779, - "auxiliary_loss_mlp": 0.01041145, - "balance_loss_clip": 1.04390502, - "balance_loss_mlp": 1.02739954, - "epoch": 0.505816924695626, - "flos": 17493596507520.0, - "grad_norm": 1.9029826105260268, - "language_loss": 0.80660832, - "learning_rate": 2.0593841772465283e-06, - "loss": 0.82805753, - "num_input_tokens_seen": 180886670, - "step": 8413, - "time_per_iteration": 2.7692911624908447 - }, - { - "auxiliary_loss_clip": 0.0109162, - "auxiliary_loss_mlp": 0.00771431, - "balance_loss_clip": 1.04406643, - "balance_loss_mlp": 1.00020945, - "epoch": 0.505877047948294, - "flos": 21142084561920.0, - "grad_norm": 1.9410580169313951, - "language_loss": 0.80582374, - "learning_rate": 2.0589948861580737e-06, - "loss": 0.82445419, - "num_input_tokens_seen": 180904645, - "step": 8414, - "time_per_iteration": 2.6970348358154297 - }, - { - "auxiliary_loss_clip": 0.01107406, - "auxiliary_loss_mlp": 0.01030257, - "balance_loss_clip": 1.03923571, - "balance_loss_mlp": 1.0169946, - "epoch": 0.5059371712009619, - "flos": 36350036887680.0, - "grad_norm": 2.0609800291463225, - "language_loss": 0.62233627, - "learning_rate": 2.058605592832528e-06, - "loss": 0.64371288, - "num_input_tokens_seen": 180922340, - "step": 8415, - "time_per_iteration": 2.7197422981262207 - }, - { - "auxiliary_loss_clip": 0.01087332, - "auxiliary_loss_mlp": 0.01032316, - "balance_loss_clip": 1.04092574, - "balance_loss_mlp": 1.01899433, - "epoch": 0.50599729445363, - "flos": 22673387220480.0, - "grad_norm": 1.6231002317718672, - "language_loss": 0.81935573, - "learning_rate": 2.0582162972846515e-06, - "loss": 0.84055215, - "num_input_tokens_seen": 180941350, - "step": 8416, - "time_per_iteration": 2.782719612121582 - }, - { - "auxiliary_loss_clip": 0.01091272, - "auxiliary_loss_mlp": 0.0103737, - "balance_loss_clip": 1.04698849, - "balance_loss_mlp": 1.02498984, - "epoch": 0.5060574177062979, - "flos": 22747866071040.0, - "grad_norm": 1.5803053727793945, - "language_loss": 0.78981423, - "learning_rate": 2.0578269995292078e-06, - "loss": 0.81110072, - "num_input_tokens_seen": 180960720, - "step": 8417, - "time_per_iteration": 2.7089340686798096 - }, - { - "auxiliary_loss_clip": 0.01070059, - "auxiliary_loss_mlp": 0.01039058, - "balance_loss_clip": 1.0394783, - "balance_loss_mlp": 1.02599227, - "epoch": 0.5061175409589659, - "flos": 21653201139840.0, - "grad_norm": 1.8562945560748794, - "language_loss": 0.62433213, - "learning_rate": 2.0574376995809588e-06, - "loss": 0.64542329, - "num_input_tokens_seen": 180979725, - "step": 8418, - "time_per_iteration": 2.719282388687134 - }, - { - "auxiliary_loss_clip": 0.0109094, - "auxiliary_loss_mlp": 0.01035325, - "balance_loss_clip": 1.04258347, - "balance_loss_mlp": 1.02194929, - "epoch": 0.5061776642116339, - "flos": 21616249023360.0, - "grad_norm": 2.2787836153634724, - "language_loss": 0.77394211, - "learning_rate": 2.0570483974546653e-06, - "loss": 0.79520482, - "num_input_tokens_seen": 180998980, - "step": 8419, - "time_per_iteration": 2.741727113723755 - }, - { - "auxiliary_loss_clip": 0.01062039, - "auxiliary_loss_mlp": 0.0103574, - "balance_loss_clip": 1.04027188, - "balance_loss_mlp": 1.02160168, - "epoch": 0.5062377874643018, - "flos": 24426294837120.0, - "grad_norm": 1.7570247471688223, - "language_loss": 0.77180004, - "learning_rate": 2.0566590931650917e-06, - "loss": 0.79277784, - "num_input_tokens_seen": 181019165, - "step": 8420, - "time_per_iteration": 2.8240675926208496 - }, - { - "auxiliary_loss_clip": 0.01123562, - "auxiliary_loss_mlp": 0.01036164, - "balance_loss_clip": 1.04462767, - "balance_loss_mlp": 1.02188277, - "epoch": 0.5062979107169698, - "flos": 22524429519360.0, - "grad_norm": 1.730716034871051, - "language_loss": 0.77317429, - "learning_rate": 2.056269786726999e-06, - "loss": 0.79477155, - "num_input_tokens_seen": 181037110, - "step": 8421, - "time_per_iteration": 2.6797008514404297 - }, - { - "auxiliary_loss_clip": 0.01106529, - "auxiliary_loss_mlp": 0.01032526, - "balance_loss_clip": 1.04212284, - "balance_loss_mlp": 1.01860261, - "epoch": 0.5063580339696377, - "flos": 24571984400640.0, - "grad_norm": 1.4584078249019805, - "language_loss": 0.66635919, - "learning_rate": 2.0558804781551512e-06, - "loss": 0.68774974, - "num_input_tokens_seen": 181057775, - "step": 8422, - "time_per_iteration": 2.80218505859375 - }, - { - "auxiliary_loss_clip": 0.01123775, - "auxiliary_loss_mlp": 0.01032917, - "balance_loss_clip": 1.04679537, - "balance_loss_mlp": 1.01939869, - "epoch": 0.5064181572223058, - "flos": 22596143022720.0, - "grad_norm": 1.7069001340883154, - "language_loss": 0.818717, - "learning_rate": 2.05549116746431e-06, - "loss": 0.84028399, - "num_input_tokens_seen": 181078260, - "step": 8423, - "time_per_iteration": 2.6722168922424316 - }, - { - "auxiliary_loss_clip": 0.01124994, - "auxiliary_loss_mlp": 0.00771759, - "balance_loss_clip": 1.04458904, - "balance_loss_mlp": 1.00021005, - "epoch": 0.5064782804749737, - "flos": 25994944661760.0, - "grad_norm": 1.7762047243227106, - "language_loss": 0.74689841, - "learning_rate": 2.055101854669237e-06, - "loss": 0.76586592, - "num_input_tokens_seen": 181098755, - "step": 8424, - "time_per_iteration": 2.657538652420044 - }, - { - "auxiliary_loss_clip": 0.01121266, - "auxiliary_loss_mlp": 0.01037955, - "balance_loss_clip": 1.04494393, - "balance_loss_mlp": 1.02427602, - "epoch": 0.5065384037276417, - "flos": 28553041503360.0, - "grad_norm": 1.7147939268792267, - "language_loss": 0.71541035, - "learning_rate": 2.0547125397846975e-06, - "loss": 0.73700261, - "num_input_tokens_seen": 181121570, - "step": 8425, - "time_per_iteration": 2.6696951389312744 - }, - { - "auxiliary_loss_clip": 0.0108314, - "auxiliary_loss_mlp": 0.01043142, - "balance_loss_clip": 1.04042649, - "balance_loss_mlp": 1.02828813, - "epoch": 0.5065985269803096, - "flos": 22966023323520.0, - "grad_norm": 1.7834107132976578, - "language_loss": 0.7868796, - "learning_rate": 2.0543232228254524e-06, - "loss": 0.80814242, - "num_input_tokens_seen": 181140240, - "step": 8426, - "time_per_iteration": 2.702861785888672 - }, - { - "auxiliary_loss_clip": 0.01116039, - "auxiliary_loss_mlp": 0.01039376, - "balance_loss_clip": 1.0481956, - "balance_loss_mlp": 1.0255599, - "epoch": 0.5066586502329776, - "flos": 21608563512960.0, - "grad_norm": 2.9338643206598713, - "language_loss": 0.7762264, - "learning_rate": 2.053933903806265e-06, - "loss": 0.79778051, - "num_input_tokens_seen": 181158630, - "step": 8427, - "time_per_iteration": 2.5964066982269287 - }, - { - "auxiliary_loss_clip": 0.0112123, - "auxiliary_loss_mlp": 0.01028788, - "balance_loss_clip": 1.04505837, - "balance_loss_mlp": 1.014763, - "epoch": 0.5067187734856455, - "flos": 20339912079360.0, - "grad_norm": 2.519773325925209, - "language_loss": 0.71591479, - "learning_rate": 2.0535445827418997e-06, - "loss": 0.73741496, - "num_input_tokens_seen": 181176405, - "step": 8428, - "time_per_iteration": 2.5878183841705322 - }, - { - "auxiliary_loss_clip": 0.01105053, - "auxiliary_loss_mlp": 0.00769921, - "balance_loss_clip": 1.041857, - "balance_loss_mlp": 1.00016701, - "epoch": 0.5067788967383136, - "flos": 28841080665600.0, - "grad_norm": 1.637474951892814, - "language_loss": 0.83266222, - "learning_rate": 2.0531552596471168e-06, - "loss": 0.85141206, - "num_input_tokens_seen": 181197595, - "step": 8429, - "time_per_iteration": 2.6528842449188232 - }, - { - "auxiliary_loss_clip": 0.01094205, - "auxiliary_loss_mlp": 0.0103555, - "balance_loss_clip": 1.04527116, - "balance_loss_mlp": 1.02068472, - "epoch": 0.5068390199909815, - "flos": 32450174478720.0, - "grad_norm": 1.986559953193462, - "language_loss": 0.73507559, - "learning_rate": 2.052765934536682e-06, - "loss": 0.75637317, - "num_input_tokens_seen": 181218560, - "step": 8430, - "time_per_iteration": 2.8031511306762695 - }, - { - "auxiliary_loss_clip": 0.01057925, - "auxiliary_loss_mlp": 0.01041942, - "balance_loss_clip": 1.03520572, - "balance_loss_mlp": 1.02702332, - "epoch": 0.5068991432436495, - "flos": 23146582014720.0, - "grad_norm": 2.0458094547910766, - "language_loss": 0.77132332, - "learning_rate": 2.0523766074253575e-06, - "loss": 0.79232198, - "num_input_tokens_seen": 181237095, - "step": 8431, - "time_per_iteration": 2.7593939304351807 - }, - { - "auxiliary_loss_clip": 0.01108688, - "auxiliary_loss_mlp": 0.01035857, - "balance_loss_clip": 1.04256523, - "balance_loss_mlp": 1.02171338, - "epoch": 0.5069592664963174, - "flos": 19936096404480.0, - "grad_norm": 1.5904348009832192, - "language_loss": 0.72110546, - "learning_rate": 2.0519872783279074e-06, - "loss": 0.74255085, - "num_input_tokens_seen": 181255940, - "step": 8432, - "time_per_iteration": 2.6104278564453125 - }, - { - "auxiliary_loss_clip": 0.0100252, - "auxiliary_loss_mlp": 0.01010781, - "balance_loss_clip": 1.01845694, - "balance_loss_mlp": 1.00870693, - "epoch": 0.5070193897489854, - "flos": 65793771941760.0, - "grad_norm": 0.7570764213883562, - "language_loss": 0.63648349, - "learning_rate": 2.0515979472590945e-06, - "loss": 0.65661651, - "num_input_tokens_seen": 181316945, - "step": 8433, - "time_per_iteration": 3.395040273666382 - }, - { - "auxiliary_loss_clip": 0.01089015, - "auxiliary_loss_mlp": 0.01040915, - "balance_loss_clip": 1.04288781, - "balance_loss_mlp": 1.02685428, - "epoch": 0.5070795130016534, - "flos": 17275331514240.0, - "grad_norm": 2.2603713431070194, - "language_loss": 0.78218484, - "learning_rate": 2.051208614233681e-06, - "loss": 0.80348414, - "num_input_tokens_seen": 181335555, - "step": 8434, - "time_per_iteration": 2.705864667892456 - }, - { - "auxiliary_loss_clip": 0.01099616, - "auxiliary_loss_mlp": 0.01035206, - "balance_loss_clip": 1.04088449, - "balance_loss_mlp": 1.02169967, - "epoch": 0.5071396362543213, - "flos": 21069940095360.0, - "grad_norm": 1.6177485307205706, - "language_loss": 0.70698971, - "learning_rate": 2.0508192792664326e-06, - "loss": 0.72833788, - "num_input_tokens_seen": 181354580, - "step": 8435, - "time_per_iteration": 2.699631929397583 - }, - { - "auxiliary_loss_clip": 0.01115814, - "auxiliary_loss_mlp": 0.01036717, - "balance_loss_clip": 1.04539943, - "balance_loss_mlp": 1.02220905, - "epoch": 0.5071997595069894, - "flos": 23144822248320.0, - "grad_norm": 1.8141877812584497, - "language_loss": 0.72254074, - "learning_rate": 2.050429942372112e-06, - "loss": 0.74406612, - "num_input_tokens_seen": 181374320, - "step": 8436, - "time_per_iteration": 2.6646859645843506 - }, - { - "auxiliary_loss_clip": 0.01124514, - "auxiliary_loss_mlp": 0.01034184, - "balance_loss_clip": 1.04597569, - "balance_loss_mlp": 1.01978946, - "epoch": 0.5072598827596573, - "flos": 22747183712640.0, - "grad_norm": 1.5423854267163515, - "language_loss": 0.83801168, - "learning_rate": 2.050040603565483e-06, - "loss": 0.85959864, - "num_input_tokens_seen": 181392190, - "step": 8437, - "time_per_iteration": 2.6614348888397217 - }, - { - "auxiliary_loss_clip": 0.01110359, - "auxiliary_loss_mlp": 0.01028112, - "balance_loss_clip": 1.04387856, - "balance_loss_mlp": 1.01448607, - "epoch": 0.5073200060123253, - "flos": 22566301799040.0, - "grad_norm": 2.7232019997829924, - "language_loss": 0.80638587, - "learning_rate": 2.049651262861309e-06, - "loss": 0.82777059, - "num_input_tokens_seen": 181413890, - "step": 8438, - "time_per_iteration": 2.6778056621551514 - }, - { - "auxiliary_loss_clip": 0.01081177, - "auxiliary_loss_mlp": 0.01037532, - "balance_loss_clip": 1.04218078, - "balance_loss_mlp": 1.02235103, - "epoch": 0.5073801292649932, - "flos": 25806341324160.0, - "grad_norm": 1.4751942737164592, - "language_loss": 0.7943362, - "learning_rate": 2.0492619202743543e-06, - "loss": 0.81552327, - "num_input_tokens_seen": 181433240, - "step": 8439, - "time_per_iteration": 2.694603443145752 - }, - { - "auxiliary_loss_clip": 0.01088705, - "auxiliary_loss_mlp": 0.0077357, - "balance_loss_clip": 1.04178834, - "balance_loss_mlp": 1.00020123, - "epoch": 0.5074402525176612, - "flos": 25373941401600.0, - "grad_norm": 1.5360675692672114, - "language_loss": 0.71413541, - "learning_rate": 2.048872575819383e-06, - "loss": 0.7327581, - "num_input_tokens_seen": 181453535, - "step": 8440, - "time_per_iteration": 2.68709397315979 - }, - { - "auxiliary_loss_clip": 0.01096271, - "auxiliary_loss_mlp": 0.01036596, - "balance_loss_clip": 1.04103327, - "balance_loss_mlp": 1.0227561, - "epoch": 0.5075003757703291, - "flos": 26064431521920.0, - "grad_norm": 1.6763306182018036, - "language_loss": 0.7087847, - "learning_rate": 2.048483229511158e-06, - "loss": 0.73011339, - "num_input_tokens_seen": 181474195, - "step": 8441, - "time_per_iteration": 2.728649377822876 - }, - { - "auxiliary_loss_clip": 0.01113949, - "auxiliary_loss_mlp": 0.00771406, - "balance_loss_clip": 1.04312265, - "balance_loss_mlp": 1.00021851, - "epoch": 0.5075604990229972, - "flos": 21835447770240.0, - "grad_norm": 1.794299641086803, - "language_loss": 0.63846874, - "learning_rate": 2.0480938813644445e-06, - "loss": 0.65732235, - "num_input_tokens_seen": 181494000, - "step": 8442, - "time_per_iteration": 4.1495561599731445 - }, - { - "auxiliary_loss_clip": 0.01065064, - "auxiliary_loss_mlp": 0.01028245, - "balance_loss_clip": 1.03900802, - "balance_loss_mlp": 1.01582956, - "epoch": 0.5076206222756651, - "flos": 31978703537280.0, - "grad_norm": 1.7729718848020288, - "language_loss": 0.7149542, - "learning_rate": 2.047704531394006e-06, - "loss": 0.73588729, - "num_input_tokens_seen": 181515955, - "step": 8443, - "time_per_iteration": 2.84781551361084 - }, - { - "auxiliary_loss_clip": 0.01033895, - "auxiliary_loss_mlp": 0.01036606, - "balance_loss_clip": 1.03034997, - "balance_loss_mlp": 1.02093554, - "epoch": 0.5076807455283331, - "flos": 36904031326080.0, - "grad_norm": 1.237062481884337, - "language_loss": 0.62134659, - "learning_rate": 2.047315179614607e-06, - "loss": 0.64205158, - "num_input_tokens_seen": 181540225, - "step": 8444, - "time_per_iteration": 3.2103631496429443 - }, - { - "auxiliary_loss_clip": 0.01086312, - "auxiliary_loss_mlp": 0.01030312, - "balance_loss_clip": 1.04043984, - "balance_loss_mlp": 1.0172112, - "epoch": 0.507740868781001, - "flos": 29862415981440.0, - "grad_norm": 1.7245082556223335, - "language_loss": 0.64173615, - "learning_rate": 2.046925826041012e-06, - "loss": 0.66290236, - "num_input_tokens_seen": 181560125, - "step": 8445, - "time_per_iteration": 4.46838903427124 - }, - { - "auxiliary_loss_clip": 0.01013224, - "auxiliary_loss_mlp": 0.01008254, - "balance_loss_clip": 1.02398801, - "balance_loss_mlp": 1.00686538, - "epoch": 0.507800992033669, - "flos": 61918974247680.0, - "grad_norm": 0.8265855466772786, - "language_loss": 0.61854541, - "learning_rate": 2.0465364706879845e-06, - "loss": 0.63876021, - "num_input_tokens_seen": 181618830, - "step": 8446, - "time_per_iteration": 3.267681121826172 - }, - { - "auxiliary_loss_clip": 0.01080886, - "auxiliary_loss_mlp": 0.01028563, - "balance_loss_clip": 1.0391748, - "balance_loss_mlp": 1.0157063, - "epoch": 0.507861115286337, - "flos": 20700490757760.0, - "grad_norm": 1.574417237275669, - "language_loss": 0.8065623, - "learning_rate": 2.04614711357029e-06, - "loss": 0.82765681, - "num_input_tokens_seen": 181637120, - "step": 8447, - "time_per_iteration": 2.761584758758545 - }, - { - "auxiliary_loss_clip": 0.01111406, - "auxiliary_loss_mlp": 0.01031653, - "balance_loss_clip": 1.04490948, - "balance_loss_mlp": 1.01859963, - "epoch": 0.507921238539005, - "flos": 30847050576000.0, - "grad_norm": 1.8510365938740598, - "language_loss": 0.70990604, - "learning_rate": 2.0457577547026916e-06, - "loss": 0.73133665, - "num_input_tokens_seen": 181659965, - "step": 8448, - "time_per_iteration": 4.335421085357666 - }, - { - "auxiliary_loss_clip": 0.01121931, - "auxiliary_loss_mlp": 0.00769587, - "balance_loss_clip": 1.04565167, - "balance_loss_mlp": 1.00020599, - "epoch": 0.507981361791673, - "flos": 35700197984640.0, - "grad_norm": 3.0099403095172557, - "language_loss": 0.71958399, - "learning_rate": 2.045368394099955e-06, - "loss": 0.73849922, - "num_input_tokens_seen": 181685290, - "step": 8449, - "time_per_iteration": 2.7780673503875732 - }, - { - "auxiliary_loss_clip": 0.01094628, - "auxiliary_loss_mlp": 0.0103001, - "balance_loss_clip": 1.04017317, - "balance_loss_mlp": 1.01767778, - "epoch": 0.5080414850443409, - "flos": 27161466750720.0, - "grad_norm": 1.5810099588314865, - "language_loss": 0.73045403, - "learning_rate": 2.044979031776844e-06, - "loss": 0.7517004, - "num_input_tokens_seen": 181706080, - "step": 8450, - "time_per_iteration": 2.744396448135376 - }, - { - "auxiliary_loss_clip": 0.01123333, - "auxiliary_loss_mlp": 0.01027947, - "balance_loss_clip": 1.04468369, - "balance_loss_mlp": 1.01485837, - "epoch": 0.5081016082970089, - "flos": 27085192220160.0, - "grad_norm": 1.7103931675901212, - "language_loss": 0.77190459, - "learning_rate": 2.0445896677481234e-06, - "loss": 0.79341733, - "num_input_tokens_seen": 181724805, - "step": 8451, - "time_per_iteration": 2.683182716369629 - }, - { - "auxiliary_loss_clip": 0.01122238, - "auxiliary_loss_mlp": 0.01037138, - "balance_loss_clip": 1.04372776, - "balance_loss_mlp": 1.02413273, - "epoch": 0.5081617315496768, - "flos": 22856531690880.0, - "grad_norm": 1.9627256153454082, - "language_loss": 0.85055304, - "learning_rate": 2.044200302028559e-06, - "loss": 0.87214684, - "num_input_tokens_seen": 181743725, - "step": 8452, - "time_per_iteration": 2.684624671936035 - }, - { - "auxiliary_loss_clip": 0.01126785, - "auxiliary_loss_mlp": 0.01034895, - "balance_loss_clip": 1.04584098, - "balance_loss_mlp": 1.02078056, - "epoch": 0.5082218548023448, - "flos": 16281898087680.0, - "grad_norm": 4.065129026902181, - "language_loss": 0.77099299, - "learning_rate": 2.0438109346329143e-06, - "loss": 0.79260981, - "num_input_tokens_seen": 181757720, - "step": 8453, - "time_per_iteration": 2.572178602218628 - }, - { - "auxiliary_loss_clip": 0.01084848, - "auxiliary_loss_mlp": 0.01032198, - "balance_loss_clip": 1.04113591, - "balance_loss_mlp": 1.02010989, - "epoch": 0.5082819780550127, - "flos": 24460768915200.0, - "grad_norm": 1.6244227223176155, - "language_loss": 0.76530403, - "learning_rate": 2.0434215655759544e-06, - "loss": 0.78647447, - "num_input_tokens_seen": 181778545, - "step": 8454, - "time_per_iteration": 2.8153836727142334 - }, - { - "auxiliary_loss_clip": 0.01097667, - "auxiliary_loss_mlp": 0.01036941, - "balance_loss_clip": 1.03992426, - "balance_loss_mlp": 1.02275562, - "epoch": 0.5083421013076808, - "flos": 23403271582080.0, - "grad_norm": 1.5351507829324025, - "language_loss": 0.89199609, - "learning_rate": 2.0430321948724446e-06, - "loss": 0.91334224, - "num_input_tokens_seen": 181799495, - "step": 8455, - "time_per_iteration": 2.7793357372283936 - }, - { - "auxiliary_loss_clip": 0.01106838, - "auxiliary_loss_mlp": 0.00772606, - "balance_loss_clip": 1.04346323, - "balance_loss_mlp": 1.00026703, - "epoch": 0.5084022245603487, - "flos": 23872695448320.0, - "grad_norm": 1.6166334009695327, - "language_loss": 0.62119138, - "learning_rate": 2.042642822537149e-06, - "loss": 0.63998592, - "num_input_tokens_seen": 181818400, - "step": 8456, - "time_per_iteration": 2.7200372219085693 - }, - { - "auxiliary_loss_clip": 0.01034029, - "auxiliary_loss_mlp": 0.01006279, - "balance_loss_clip": 1.01840019, - "balance_loss_mlp": 1.00490177, - "epoch": 0.5084623478130167, - "flos": 62873336655360.0, - "grad_norm": 0.8116383799523507, - "language_loss": 0.6243, - "learning_rate": 2.0422534485848343e-06, - "loss": 0.64470303, - "num_input_tokens_seen": 181875975, - "step": 8457, - "time_per_iteration": 3.087890625 - }, - { - "auxiliary_loss_clip": 0.01113045, - "auxiliary_loss_mlp": 0.01032551, - "balance_loss_clip": 1.0439477, - "balance_loss_mlp": 1.01853776, - "epoch": 0.5085224710656846, - "flos": 22346133384960.0, - "grad_norm": 1.6206653077395385, - "language_loss": 0.67609936, - "learning_rate": 2.0418640730302644e-06, - "loss": 0.6975553, - "num_input_tokens_seen": 181896450, - "step": 8458, - "time_per_iteration": 2.6950957775115967 - }, - { - "auxiliary_loss_clip": 0.011096, - "auxiliary_loss_mlp": 0.01034441, - "balance_loss_clip": 1.04140186, - "balance_loss_mlp": 1.01998079, - "epoch": 0.5085825943183526, - "flos": 26066263115520.0, - "grad_norm": 1.6983738136244226, - "language_loss": 0.77766174, - "learning_rate": 2.0414746958882043e-06, - "loss": 0.79910213, - "num_input_tokens_seen": 181916770, - "step": 8459, - "time_per_iteration": 2.699784278869629 - }, - { - "auxiliary_loss_clip": 0.01127851, - "auxiliary_loss_mlp": 0.01035156, - "balance_loss_clip": 1.04686987, - "balance_loss_mlp": 1.02099431, - "epoch": 0.5086427175710206, - "flos": 17420733768960.0, - "grad_norm": 10.198892862393663, - "language_loss": 0.8050856, - "learning_rate": 2.0410853171734196e-06, - "loss": 0.82671559, - "num_input_tokens_seen": 181932710, - "step": 8460, - "time_per_iteration": 2.632998466491699 - }, - { - "auxiliary_loss_clip": 0.01101605, - "auxiliary_loss_mlp": 0.01038577, - "balance_loss_clip": 1.04293346, - "balance_loss_mlp": 1.0255115, - "epoch": 0.5087028408236886, - "flos": 20631758083200.0, - "grad_norm": 1.5613520556763807, - "language_loss": 0.68347144, - "learning_rate": 2.0406959369006754e-06, - "loss": 0.70487332, - "num_input_tokens_seen": 181950665, - "step": 8461, - "time_per_iteration": 2.7463462352752686 - }, - { - "auxiliary_loss_clip": 0.01118492, - "auxiliary_loss_mlp": 0.01030227, - "balance_loss_clip": 1.04215729, - "balance_loss_mlp": 1.01677442, - "epoch": 0.5087629640763566, - "flos": 25593822506880.0, - "grad_norm": 1.9214201788253797, - "language_loss": 0.76016432, - "learning_rate": 2.0403065550847375e-06, - "loss": 0.7816515, - "num_input_tokens_seen": 181971270, - "step": 8462, - "time_per_iteration": 2.780043363571167 - }, - { - "auxiliary_loss_clip": 0.01081215, - "auxiliary_loss_mlp": 0.01037857, - "balance_loss_clip": 1.0401057, - "balance_loss_mlp": 1.02322388, - "epoch": 0.5088230873290245, - "flos": 13261631927040.0, - "grad_norm": 2.117801536001897, - "language_loss": 0.81441897, - "learning_rate": 2.0399171717403706e-06, - "loss": 0.83560967, - "num_input_tokens_seen": 181988410, - "step": 8463, - "time_per_iteration": 2.7101564407348633 - }, - { - "auxiliary_loss_clip": 0.0110518, - "auxiliary_loss_mlp": 0.01035062, - "balance_loss_clip": 1.04148602, - "balance_loss_mlp": 1.02201426, - "epoch": 0.5088832105816925, - "flos": 20043469134720.0, - "grad_norm": 2.6576302734312733, - "language_loss": 0.76305163, - "learning_rate": 2.039527786882341e-06, - "loss": 0.78445399, - "num_input_tokens_seen": 182006530, - "step": 8464, - "time_per_iteration": 2.6081295013427734 - }, - { - "auxiliary_loss_clip": 0.01034964, - "auxiliary_loss_mlp": 0.0100043, - "balance_loss_clip": 1.01882601, - "balance_loss_mlp": 0.99929708, - "epoch": 0.5089433338343604, - "flos": 67422179018880.0, - "grad_norm": 0.6843560168430419, - "language_loss": 0.59347767, - "learning_rate": 2.0391384005254133e-06, - "loss": 0.61383158, - "num_input_tokens_seen": 182074240, - "step": 8465, - "time_per_iteration": 3.308885097503662 - }, - { - "auxiliary_loss_clip": 0.0111949, - "auxiliary_loss_mlp": 0.01033343, - "balance_loss_clip": 1.04262543, - "balance_loss_mlp": 1.0203197, - "epoch": 0.5090034570870284, - "flos": 22710339336960.0, - "grad_norm": 2.5778248190048787, - "language_loss": 0.80206662, - "learning_rate": 2.038749012684354e-06, - "loss": 0.82359493, - "num_input_tokens_seen": 182093360, - "step": 8466, - "time_per_iteration": 2.6912481784820557 - }, - { - "auxiliary_loss_clip": 0.01107512, - "auxiliary_loss_mlp": 0.0102939, - "balance_loss_clip": 1.03987598, - "balance_loss_mlp": 1.01634204, - "epoch": 0.5090635803396963, - "flos": 20445812352000.0, - "grad_norm": 1.5056043379234754, - "language_loss": 0.78307688, - "learning_rate": 2.0383596233739286e-06, - "loss": 0.80444586, - "num_input_tokens_seen": 182110170, - "step": 8467, - "time_per_iteration": 2.61828875541687 - }, - { - "auxiliary_loss_clip": 0.01119026, - "auxiliary_loss_mlp": 0.01034745, - "balance_loss_clip": 1.04424381, - "balance_loss_mlp": 1.02226961, - "epoch": 0.5091237035923644, - "flos": 23768878164480.0, - "grad_norm": 1.9340722959801353, - "language_loss": 0.74676347, - "learning_rate": 2.0379702326089013e-06, - "loss": 0.76830113, - "num_input_tokens_seen": 182129570, - "step": 8468, - "time_per_iteration": 2.6233344078063965 - }, - { - "auxiliary_loss_clip": 0.01119943, - "auxiliary_loss_mlp": 0.01029058, - "balance_loss_clip": 1.04366863, - "balance_loss_mlp": 1.01651728, - "epoch": 0.5091838268450323, - "flos": 18327908684160.0, - "grad_norm": 1.884666390581893, - "language_loss": 0.77613342, - "learning_rate": 2.03758084040404e-06, - "loss": 0.7976234, - "num_input_tokens_seen": 182147565, - "step": 8469, - "time_per_iteration": 2.579117774963379 - }, - { - "auxiliary_loss_clip": 0.01107532, - "auxiliary_loss_mlp": 0.01038411, - "balance_loss_clip": 1.04521155, - "balance_loss_mlp": 1.02425504, - "epoch": 0.5092439500977003, - "flos": 29057621806080.0, - "grad_norm": 1.5718905230515574, - "language_loss": 0.69481277, - "learning_rate": 2.037191446774109e-06, - "loss": 0.71627223, - "num_input_tokens_seen": 182169695, - "step": 8470, - "time_per_iteration": 2.6437594890594482 - }, - { - "auxiliary_loss_clip": 0.01096004, - "auxiliary_loss_mlp": 0.01045395, - "balance_loss_clip": 1.04067326, - "balance_loss_mlp": 1.02993393, - "epoch": 0.5093040733503682, - "flos": 13553908894080.0, - "grad_norm": 2.534594931806725, - "language_loss": 0.73583853, - "learning_rate": 2.0368020517338745e-06, - "loss": 0.75725245, - "num_input_tokens_seen": 182186385, - "step": 8471, - "time_per_iteration": 2.6213905811309814 - }, - { - "auxiliary_loss_clip": 0.01043282, - "auxiliary_loss_mlp": 0.00999685, - "balance_loss_clip": 1.01733398, - "balance_loss_mlp": 0.99825424, - "epoch": 0.5093641966030362, - "flos": 68906617407360.0, - "grad_norm": 0.7545989611287492, - "language_loss": 0.58065605, - "learning_rate": 2.036412655298103e-06, - "loss": 0.60108572, - "num_input_tokens_seen": 182247095, - "step": 8472, - "time_per_iteration": 3.1640241146087646 - }, - { - "auxiliary_loss_clip": 0.01069354, - "auxiliary_loss_mlp": 0.01036283, - "balance_loss_clip": 1.03772914, - "balance_loss_mlp": 1.0235815, - "epoch": 0.5094243198557042, - "flos": 21580948932480.0, - "grad_norm": 2.4665832849090994, - "language_loss": 0.68956393, - "learning_rate": 2.03602325748156e-06, - "loss": 0.71062028, - "num_input_tokens_seen": 182266380, - "step": 8473, - "time_per_iteration": 2.806593179702759 - }, - { - "auxiliary_loss_clip": 0.01097364, - "auxiliary_loss_mlp": 0.01035609, - "balance_loss_clip": 1.04190159, - "balance_loss_mlp": 1.02250814, - "epoch": 0.5094844431083722, - "flos": 28840721529600.0, - "grad_norm": 1.8851162187904098, - "language_loss": 0.85464561, - "learning_rate": 2.0356338582990105e-06, - "loss": 0.87597537, - "num_input_tokens_seen": 182284685, - "step": 8474, - "time_per_iteration": 2.7467737197875977 - }, - { - "auxiliary_loss_clip": 0.01097916, - "auxiliary_loss_mlp": 0.01035284, - "balance_loss_clip": 1.04213905, - "balance_loss_mlp": 1.02201009, - "epoch": 0.5095445663610402, - "flos": 14976114969600.0, - "grad_norm": 2.1580860587409867, - "language_loss": 0.65563238, - "learning_rate": 2.035244457765222e-06, - "loss": 0.6769644, - "num_input_tokens_seen": 182301810, - "step": 8475, - "time_per_iteration": 2.653343439102173 - }, - { - "auxiliary_loss_clip": 0.01101978, - "auxiliary_loss_mlp": 0.01044707, - "balance_loss_clip": 1.04155195, - "balance_loss_mlp": 1.03043771, - "epoch": 0.5096046896137081, - "flos": 20777088510720.0, - "grad_norm": 2.3692417745384886, - "language_loss": 0.82122153, - "learning_rate": 2.0348550558949605e-06, - "loss": 0.84268838, - "num_input_tokens_seen": 182320285, - "step": 8476, - "time_per_iteration": 2.735163927078247 - }, - { - "auxiliary_loss_clip": 0.01069648, - "auxiliary_loss_mlp": 0.01043833, - "balance_loss_clip": 1.03814852, - "balance_loss_mlp": 1.02698851, - "epoch": 0.5096648128663761, - "flos": 23185078416000.0, - "grad_norm": 5.724576330634238, - "language_loss": 0.80651575, - "learning_rate": 2.0344656527029917e-06, - "loss": 0.82765061, - "num_input_tokens_seen": 182339465, - "step": 8477, - "time_per_iteration": 2.8972108364105225 - }, - { - "auxiliary_loss_clip": 0.01096525, - "auxiliary_loss_mlp": 0.01028419, - "balance_loss_clip": 1.04044962, - "balance_loss_mlp": 1.01321959, - "epoch": 0.509724936119044, - "flos": 22309432663680.0, - "grad_norm": 1.8365176357872317, - "language_loss": 0.6178633, - "learning_rate": 2.034076248204082e-06, - "loss": 0.63911271, - "num_input_tokens_seen": 182358375, - "step": 8478, - "time_per_iteration": 2.77237606048584 - }, - { - "auxiliary_loss_clip": 0.01105596, - "auxiliary_loss_mlp": 0.01039662, - "balance_loss_clip": 1.04185414, - "balance_loss_mlp": 1.02667403, - "epoch": 0.509785059371712, - "flos": 26287077974400.0, - "grad_norm": 1.8436515105252975, - "language_loss": 0.66209054, - "learning_rate": 2.0336868424129968e-06, - "loss": 0.68354309, - "num_input_tokens_seen": 182377935, - "step": 8479, - "time_per_iteration": 2.667865514755249 - }, - { - "auxiliary_loss_clip": 0.01108822, - "auxiliary_loss_mlp": 0.01036542, - "balance_loss_clip": 1.0434258, - "balance_loss_mlp": 1.02382231, - "epoch": 0.50984518262438, - "flos": 22964586779520.0, - "grad_norm": 1.5755275700627138, - "language_loss": 0.69447386, - "learning_rate": 2.0332974353445037e-06, - "loss": 0.71592748, - "num_input_tokens_seen": 182396440, - "step": 8480, - "time_per_iteration": 2.630505323410034 - }, - { - "auxiliary_loss_clip": 0.01124122, - "auxiliary_loss_mlp": 0.0103478, - "balance_loss_clip": 1.04386926, - "balance_loss_mlp": 1.02133346, - "epoch": 0.509905305877048, - "flos": 26213389223040.0, - "grad_norm": 1.7899171043779052, - "language_loss": 0.79267204, - "learning_rate": 2.0329080270133688e-06, - "loss": 0.81426102, - "num_input_tokens_seen": 182415890, - "step": 8481, - "time_per_iteration": 2.6193926334381104 - }, - { - "auxiliary_loss_clip": 0.01104496, - "auxiliary_loss_mlp": 0.01034587, - "balance_loss_clip": 1.04157507, - "balance_loss_mlp": 1.02124786, - "epoch": 0.5099654291297159, - "flos": 20340055733760.0, - "grad_norm": 1.468990392476105, - "language_loss": 0.83301556, - "learning_rate": 2.0325186174343578e-06, - "loss": 0.85440642, - "num_input_tokens_seen": 182434235, - "step": 8482, - "time_per_iteration": 4.149403095245361 - }, - { - "auxiliary_loss_clip": 0.01113898, - "auxiliary_loss_mlp": 0.00771464, - "balance_loss_clip": 1.04287457, - "balance_loss_mlp": 1.00025356, - "epoch": 0.5100255523823839, - "flos": 29054820545280.0, - "grad_norm": 1.9010351115161617, - "language_loss": 0.85379988, - "learning_rate": 2.032129206622238e-06, - "loss": 0.87265354, - "num_input_tokens_seen": 182454360, - "step": 8483, - "time_per_iteration": 2.7000234127044678 - }, - { - "auxiliary_loss_clip": 0.01109801, - "auxiliary_loss_mlp": 0.01033991, - "balance_loss_clip": 1.04242575, - "balance_loss_mlp": 1.0214082, - "epoch": 0.5100856756350518, - "flos": 22455912326400.0, - "grad_norm": 2.079288328100567, - "language_loss": 0.82931423, - "learning_rate": 2.031739794591775e-06, - "loss": 0.85075212, - "num_input_tokens_seen": 182471940, - "step": 8484, - "time_per_iteration": 4.3401288986206055 - }, - { - "auxiliary_loss_clip": 0.01095037, - "auxiliary_loss_mlp": 0.01033642, - "balance_loss_clip": 1.0400697, - "balance_loss_mlp": 1.0194087, - "epoch": 0.5101457988877198, - "flos": 19171055606400.0, - "grad_norm": 2.530206097433835, - "language_loss": 0.81594586, - "learning_rate": 2.031350381357736e-06, - "loss": 0.83723271, - "num_input_tokens_seen": 182490685, - "step": 8485, - "time_per_iteration": 2.6573400497436523 - }, - { - "auxiliary_loss_clip": 0.01092909, - "auxiliary_loss_mlp": 0.01038281, - "balance_loss_clip": 1.03726983, - "balance_loss_mlp": 1.02494788, - "epoch": 0.5102059221403878, - "flos": 14866371941760.0, - "grad_norm": 1.9374375358888782, - "language_loss": 0.74155819, - "learning_rate": 2.0309609669348874e-06, - "loss": 0.76287007, - "num_input_tokens_seen": 182508325, - "step": 8486, - "time_per_iteration": 2.676863670349121 - }, - { - "auxiliary_loss_clip": 0.01078995, - "auxiliary_loss_mlp": 0.01037671, - "balance_loss_clip": 1.03769588, - "balance_loss_mlp": 1.0228231, - "epoch": 0.5102660453930558, - "flos": 22961103160320.0, - "grad_norm": 1.4946123985675848, - "language_loss": 0.70439661, - "learning_rate": 2.0305715513379953e-06, - "loss": 0.72556329, - "num_input_tokens_seen": 182527020, - "step": 8487, - "time_per_iteration": 2.740612030029297 - }, - { - "auxiliary_loss_clip": 0.01099488, - "auxiliary_loss_mlp": 0.01039832, - "balance_loss_clip": 1.04223216, - "balance_loss_mlp": 1.02521729, - "epoch": 0.5103261686457238, - "flos": 23149311448320.0, - "grad_norm": 2.286550245787084, - "language_loss": 0.73022705, - "learning_rate": 2.030182134581827e-06, - "loss": 0.75162029, - "num_input_tokens_seen": 182543505, - "step": 8488, - "time_per_iteration": 4.345505714416504 - }, - { - "auxiliary_loss_clip": 0.01081446, - "auxiliary_loss_mlp": 0.00771801, - "balance_loss_clip": 1.04138601, - "balance_loss_mlp": 1.00030088, - "epoch": 0.5103862918983917, - "flos": 14319237000960.0, - "grad_norm": 1.7726796746163496, - "language_loss": 0.69465196, - "learning_rate": 2.0297927166811503e-06, - "loss": 0.71318448, - "num_input_tokens_seen": 182562250, - "step": 8489, - "time_per_iteration": 2.7057676315307617 - }, - { - "auxiliary_loss_clip": 0.01096056, - "auxiliary_loss_mlp": 0.01035357, - "balance_loss_clip": 1.04011536, - "balance_loss_mlp": 1.02176082, - "epoch": 0.5104464151510597, - "flos": 25848536826240.0, - "grad_norm": 2.097372581248088, - "language_loss": 0.73219633, - "learning_rate": 2.0294032976507297e-06, - "loss": 0.75351048, - "num_input_tokens_seen": 182581910, - "step": 8490, - "time_per_iteration": 2.7062344551086426 - }, - { - "auxiliary_loss_clip": 0.01093699, - "auxiliary_loss_mlp": 0.01030609, - "balance_loss_clip": 1.04015577, - "balance_loss_mlp": 1.01796126, - "epoch": 0.5105065384037276, - "flos": 21652913831040.0, - "grad_norm": 1.454492701867694, - "language_loss": 0.80228478, - "learning_rate": 2.0290138775053337e-06, - "loss": 0.82352787, - "num_input_tokens_seen": 182601350, - "step": 8491, - "time_per_iteration": 2.670520782470703 - }, - { - "auxiliary_loss_clip": 0.01108835, - "auxiliary_loss_mlp": 0.01031094, - "balance_loss_clip": 1.04258561, - "balance_loss_mlp": 1.01813614, - "epoch": 0.5105666616563956, - "flos": 22491571553280.0, - "grad_norm": 1.8545470770344947, - "language_loss": 0.78970987, - "learning_rate": 2.028624456259728e-06, - "loss": 0.81110907, - "num_input_tokens_seen": 182619660, - "step": 8492, - "time_per_iteration": 2.681852102279663 - }, - { - "auxiliary_loss_clip": 0.01088193, - "auxiliary_loss_mlp": 0.01045644, - "balance_loss_clip": 1.04025435, - "balance_loss_mlp": 1.03187561, - "epoch": 0.5106267849090635, - "flos": 22455768672000.0, - "grad_norm": 1.9312934890574833, - "language_loss": 0.77364743, - "learning_rate": 2.0282350339286804e-06, - "loss": 0.79498577, - "num_input_tokens_seen": 182639815, - "step": 8493, - "time_per_iteration": 2.71234393119812 - }, - { - "auxiliary_loss_clip": 0.01079322, - "auxiliary_loss_mlp": 0.01035175, - "balance_loss_clip": 1.04074192, - "balance_loss_mlp": 1.02040458, - "epoch": 0.5106869081617316, - "flos": 23547093638400.0, - "grad_norm": 1.7772442138937719, - "language_loss": 0.84122825, - "learning_rate": 2.0278456105269574e-06, - "loss": 0.86237323, - "num_input_tokens_seen": 182659655, - "step": 8494, - "time_per_iteration": 2.737844944000244 - }, - { - "auxiliary_loss_clip": 0.0112627, - "auxiliary_loss_mlp": 0.01037758, - "balance_loss_clip": 1.04641843, - "balance_loss_mlp": 1.02502632, - "epoch": 0.5107470314143995, - "flos": 26792987080320.0, - "grad_norm": 1.9716326999087717, - "language_loss": 0.78846836, - "learning_rate": 2.027456186069326e-06, - "loss": 0.81010866, - "num_input_tokens_seen": 182677075, - "step": 8495, - "time_per_iteration": 2.5992324352264404 - }, - { - "auxiliary_loss_clip": 0.01088486, - "auxiliary_loss_mlp": 0.0103671, - "balance_loss_clip": 1.04210663, - "balance_loss_mlp": 1.02254176, - "epoch": 0.5108071546670675, - "flos": 25739691638400.0, - "grad_norm": 1.7860993635097173, - "language_loss": 0.78245926, - "learning_rate": 2.0270667605705535e-06, - "loss": 0.80371118, - "num_input_tokens_seen": 182699625, - "step": 8496, - "time_per_iteration": 2.764511823654175 - }, - { - "auxiliary_loss_clip": 0.01107232, - "auxiliary_loss_mlp": 0.01031296, - "balance_loss_clip": 1.04186177, - "balance_loss_mlp": 1.01885021, - "epoch": 0.5108672779197354, - "flos": 18697537589760.0, - "grad_norm": 2.583960220786706, - "language_loss": 0.78615016, - "learning_rate": 2.0266773340454066e-06, - "loss": 0.80753547, - "num_input_tokens_seen": 182717020, - "step": 8497, - "time_per_iteration": 2.614715337753296 - }, - { - "auxiliary_loss_clip": 0.01119749, - "auxiliary_loss_mlp": 0.01032774, - "balance_loss_clip": 1.04238069, - "balance_loss_mlp": 1.01958323, - "epoch": 0.5109274011724034, - "flos": 26688164215680.0, - "grad_norm": 1.8043712312754003, - "language_loss": 0.81731009, - "learning_rate": 2.0262879065086525e-06, - "loss": 0.83883524, - "num_input_tokens_seen": 182736955, - "step": 8498, - "time_per_iteration": 2.670713186264038 - }, - { - "auxiliary_loss_clip": 0.01086895, - "auxiliary_loss_mlp": 0.00771568, - "balance_loss_clip": 1.03893542, - "balance_loss_mlp": 1.00021791, - "epoch": 0.5109875244250714, - "flos": 22784028088320.0, - "grad_norm": 1.9502410959783398, - "language_loss": 0.70963287, - "learning_rate": 2.0258984779750584e-06, - "loss": 0.72821754, - "num_input_tokens_seen": 182757620, - "step": 8499, - "time_per_iteration": 2.6890597343444824 - }, - { - "auxiliary_loss_clip": 0.01063023, - "auxiliary_loss_mlp": 0.01039504, - "balance_loss_clip": 1.03797197, - "balance_loss_mlp": 1.0247463, - "epoch": 0.5110476476777394, - "flos": 35588515622400.0, - "grad_norm": 1.532594294583486, - "language_loss": 0.72400367, - "learning_rate": 2.0255090484593914e-06, - "loss": 0.74502897, - "num_input_tokens_seen": 182780195, - "step": 8500, - "time_per_iteration": 2.8889389038085938 - }, - { - "auxiliary_loss_clip": 0.01113898, - "auxiliary_loss_mlp": 0.01039834, - "balance_loss_clip": 1.04150367, - "balance_loss_mlp": 1.0244801, - "epoch": 0.5111077709304074, - "flos": 19280798634240.0, - "grad_norm": 2.6334939898019867, - "language_loss": 0.62424856, - "learning_rate": 2.0251196179764183e-06, - "loss": 0.64578593, - "num_input_tokens_seen": 182795765, - "step": 8501, - "time_per_iteration": 2.564922571182251 - }, - { - "auxiliary_loss_clip": 0.01120814, - "auxiliary_loss_mlp": 0.01040593, - "balance_loss_clip": 1.04017985, - "balance_loss_mlp": 1.0265801, - "epoch": 0.5111678941830753, - "flos": 20668207409280.0, - "grad_norm": 2.184561184824311, - "language_loss": 0.87622821, - "learning_rate": 2.024730186540907e-06, - "loss": 0.89784235, - "num_input_tokens_seen": 182813120, - "step": 8502, - "time_per_iteration": 2.6287243366241455 - }, - { - "auxiliary_loss_clip": 0.01106628, - "auxiliary_loss_mlp": 0.01038615, - "balance_loss_clip": 1.04065216, - "balance_loss_mlp": 1.02592492, - "epoch": 0.5112280174357433, - "flos": 26287903987200.0, - "grad_norm": 1.480449524900748, - "language_loss": 0.82794261, - "learning_rate": 2.0243407541676253e-06, - "loss": 0.84939504, - "num_input_tokens_seen": 182835745, - "step": 8503, - "time_per_iteration": 2.682711124420166 - }, - { - "auxiliary_loss_clip": 0.01025632, - "auxiliary_loss_mlp": 0.01004613, - "balance_loss_clip": 1.0205853, - "balance_loss_mlp": 1.00336099, - "epoch": 0.5112881406884112, - "flos": 59474247707520.0, - "grad_norm": 0.8583626669635097, - "language_loss": 0.63898063, - "learning_rate": 2.023951320871339e-06, - "loss": 0.65928316, - "num_input_tokens_seen": 182892540, - "step": 8504, - "time_per_iteration": 3.216397523880005 - }, - { - "auxiliary_loss_clip": 0.01091882, - "auxiliary_loss_mlp": 0.00771622, - "balance_loss_clip": 1.04488444, - "balance_loss_mlp": 1.00014472, - "epoch": 0.5113482639410792, - "flos": 26468857728000.0, - "grad_norm": 1.826391287063558, - "language_loss": 0.84206301, - "learning_rate": 2.023561886666816e-06, - "loss": 0.86069804, - "num_input_tokens_seen": 182911515, - "step": 8505, - "time_per_iteration": 2.8032052516937256 - }, - { - "auxiliary_loss_clip": 0.0110904, - "auxiliary_loss_mlp": 0.01030264, - "balance_loss_clip": 1.04468179, - "balance_loss_mlp": 1.01698971, - "epoch": 0.5114083871937471, - "flos": 29895848565120.0, - "grad_norm": 1.983310033112748, - "language_loss": 0.75608075, - "learning_rate": 2.0231724515688246e-06, - "loss": 0.77747381, - "num_input_tokens_seen": 182930860, - "step": 8506, - "time_per_iteration": 2.699448347091675 - }, - { - "auxiliary_loss_clip": 0.01122646, - "auxiliary_loss_mlp": 0.01034693, - "balance_loss_clip": 1.04428148, - "balance_loss_mlp": 1.01986337, - "epoch": 0.5114685104464152, - "flos": 24314576561280.0, - "grad_norm": 1.918965700593569, - "language_loss": 0.58023655, - "learning_rate": 2.022783015592131e-06, - "loss": 0.60180998, - "num_input_tokens_seen": 182949960, - "step": 8507, - "time_per_iteration": 2.5828280448913574 - }, - { - "auxiliary_loss_clip": 0.01114406, - "auxiliary_loss_mlp": 0.01042669, - "balance_loss_clip": 1.04659033, - "balance_loss_mlp": 1.02820277, - "epoch": 0.5115286336990831, - "flos": 17019288391680.0, - "grad_norm": 1.7197846358145388, - "language_loss": 0.85691231, - "learning_rate": 2.022393578751503e-06, - "loss": 0.87848306, - "num_input_tokens_seen": 182968085, - "step": 8508, - "time_per_iteration": 2.691185235977173 - }, - { - "auxiliary_loss_clip": 0.01090388, - "auxiliary_loss_mlp": 0.00770619, - "balance_loss_clip": 1.04480338, - "balance_loss_mlp": 1.00018072, - "epoch": 0.5115887569517511, - "flos": 23659386531840.0, - "grad_norm": 1.8624731533798382, - "language_loss": 0.72326827, - "learning_rate": 2.022004141061709e-06, - "loss": 0.74187839, - "num_input_tokens_seen": 182987275, - "step": 8509, - "time_per_iteration": 2.7239418029785156 - }, - { - "auxiliary_loss_clip": 0.01120525, - "auxiliary_loss_mlp": 0.00770526, - "balance_loss_clip": 1.04470599, - "balance_loss_mlp": 1.00009036, - "epoch": 0.511648880204419, - "flos": 16107193313280.0, - "grad_norm": 2.5868792605641477, - "language_loss": 0.76204944, - "learning_rate": 2.0216147025375153e-06, - "loss": 0.78095996, - "num_input_tokens_seen": 183004700, - "step": 8510, - "time_per_iteration": 2.6135294437408447 - }, - { - "auxiliary_loss_clip": 0.0112199, - "auxiliary_loss_mlp": 0.01035525, - "balance_loss_clip": 1.04560411, - "balance_loss_mlp": 1.022668, - "epoch": 0.511709003457087, - "flos": 32634970974720.0, - "grad_norm": 4.709097064233808, - "language_loss": 0.70997655, - "learning_rate": 2.0212252631936907e-06, - "loss": 0.73155165, - "num_input_tokens_seen": 183025830, - "step": 8511, - "time_per_iteration": 2.7760493755340576 - }, - { - "auxiliary_loss_clip": 0.01095679, - "auxiliary_loss_mlp": 0.01029146, - "balance_loss_clip": 1.04216874, - "balance_loss_mlp": 1.01593149, - "epoch": 0.511769126709755, - "flos": 21762082241280.0, - "grad_norm": 2.953853433531297, - "language_loss": 0.66357356, - "learning_rate": 2.020835823045001e-06, - "loss": 0.68482178, - "num_input_tokens_seen": 183045140, - "step": 8512, - "time_per_iteration": 2.723987340927124 - }, - { - "auxiliary_loss_clip": 0.01060265, - "auxiliary_loss_mlp": 0.01037084, - "balance_loss_clip": 1.0384953, - "balance_loss_mlp": 1.02158666, - "epoch": 0.511829249962423, - "flos": 23915357827200.0, - "grad_norm": 1.7575723482240548, - "language_loss": 0.67203867, - "learning_rate": 2.0204463821062146e-06, - "loss": 0.69301212, - "num_input_tokens_seen": 183063935, - "step": 8513, - "time_per_iteration": 2.759958505630493 - }, - { - "auxiliary_loss_clip": 0.01083159, - "auxiliary_loss_mlp": 0.01036169, - "balance_loss_clip": 1.04507256, - "balance_loss_mlp": 1.02201903, - "epoch": 0.511889373215091, - "flos": 23727005884800.0, - "grad_norm": 2.3341144576485116, - "language_loss": 0.68508673, - "learning_rate": 2.0200569403921e-06, - "loss": 0.70627999, - "num_input_tokens_seen": 183084135, - "step": 8514, - "time_per_iteration": 2.7791545391082764 - }, - { - "auxiliary_loss_clip": 0.01119085, - "auxiliary_loss_mlp": 0.0102933, - "balance_loss_clip": 1.04411948, - "balance_loss_mlp": 1.01689076, - "epoch": 0.5119494964677589, - "flos": 28111519526400.0, - "grad_norm": 1.6536407135597841, - "language_loss": 0.66139281, - "learning_rate": 2.019667497917424e-06, - "loss": 0.68287694, - "num_input_tokens_seen": 183104570, - "step": 8515, - "time_per_iteration": 2.6567435264587402 - }, - { - "auxiliary_loss_clip": 0.01109629, - "auxiliary_loss_mlp": 0.01035907, - "balance_loss_clip": 1.04417586, - "balance_loss_mlp": 1.02317524, - "epoch": 0.5120096197204269, - "flos": 24973214296320.0, - "grad_norm": 1.939516836327544, - "language_loss": 0.7526269, - "learning_rate": 2.019278054696955e-06, - "loss": 0.77408224, - "num_input_tokens_seen": 183123850, - "step": 8516, - "time_per_iteration": 2.7218270301818848 - }, - { - "auxiliary_loss_clip": 0.01093123, - "auxiliary_loss_mlp": 0.01039766, - "balance_loss_clip": 1.04275799, - "balance_loss_mlp": 1.02562129, - "epoch": 0.5120697429730948, - "flos": 17968012364160.0, - "grad_norm": 2.066446678045309, - "language_loss": 0.78090644, - "learning_rate": 2.0188886107454595e-06, - "loss": 0.80223525, - "num_input_tokens_seen": 183141725, - "step": 8517, - "time_per_iteration": 2.6922826766967773 - }, - { - "auxiliary_loss_clip": 0.01114661, - "auxiliary_loss_mlp": 0.01034987, - "balance_loss_clip": 1.0449543, - "balance_loss_mlp": 1.02086043, - "epoch": 0.5121298662257628, - "flos": 23292343405440.0, - "grad_norm": 1.7160803061965533, - "language_loss": 0.74111056, - "learning_rate": 2.0184991660777063e-06, - "loss": 0.76260698, - "num_input_tokens_seen": 183161300, - "step": 8518, - "time_per_iteration": 2.6781773567199707 - }, - { - "auxiliary_loss_clip": 0.01107849, - "auxiliary_loss_mlp": 0.0104112, - "balance_loss_clip": 1.04497719, - "balance_loss_mlp": 1.02699947, - "epoch": 0.5121899894784308, - "flos": 17311062568320.0, - "grad_norm": 1.7790366802945887, - "language_loss": 0.78405094, - "learning_rate": 2.0181097207084625e-06, - "loss": 0.80554068, - "num_input_tokens_seen": 183180495, - "step": 8519, - "time_per_iteration": 2.634488582611084 - }, - { - "auxiliary_loss_clip": 0.01126735, - "auxiliary_loss_mlp": 0.01036152, - "balance_loss_clip": 1.04811025, - "balance_loss_mlp": 1.02241898, - "epoch": 0.5122501127310988, - "flos": 24930085040640.0, - "grad_norm": 1.8142627745056843, - "language_loss": 0.79518384, - "learning_rate": 2.017720274652497e-06, - "loss": 0.81681275, - "num_input_tokens_seen": 183200330, - "step": 8520, - "time_per_iteration": 2.6977620124816895 - }, - { - "auxiliary_loss_clip": 0.01104965, - "auxiliary_loss_mlp": 0.01041606, - "balance_loss_clip": 1.0438292, - "balance_loss_mlp": 1.02683616, - "epoch": 0.5123102359837667, - "flos": 18442859184000.0, - "grad_norm": 2.180675544150299, - "language_loss": 0.81294155, - "learning_rate": 2.0173308279245765e-06, - "loss": 0.83440727, - "num_input_tokens_seen": 183218230, - "step": 8521, - "time_per_iteration": 4.264198303222656 - }, - { - "auxiliary_loss_clip": 0.0111372, - "auxiliary_loss_mlp": 0.01032737, - "balance_loss_clip": 1.04381251, - "balance_loss_mlp": 1.01808071, - "epoch": 0.5123703592364347, - "flos": 26684860164480.0, - "grad_norm": 1.8350455385455566, - "language_loss": 0.68333864, - "learning_rate": 2.0169413805394692e-06, - "loss": 0.70480323, - "num_input_tokens_seen": 183236735, - "step": 8522, - "time_per_iteration": 2.755563735961914 - }, - { - "auxiliary_loss_clip": 0.0109986, - "auxiliary_loss_mlp": 0.01043615, - "balance_loss_clip": 1.04744244, - "balance_loss_mlp": 1.02636552, - "epoch": 0.5124304824891026, - "flos": 28803948981120.0, - "grad_norm": 1.6735611690288588, - "language_loss": 0.61849087, - "learning_rate": 2.0165519325119433e-06, - "loss": 0.6399256, - "num_input_tokens_seen": 183257550, - "step": 8523, - "time_per_iteration": 2.752614974975586 - }, - { - "auxiliary_loss_clip": 0.01088964, - "auxiliary_loss_mlp": 0.01041136, - "balance_loss_clip": 1.04488027, - "balance_loss_mlp": 1.02776718, - "epoch": 0.5124906057417706, - "flos": 21761830846080.0, - "grad_norm": 2.1631882282248966, - "language_loss": 0.7807008, - "learning_rate": 2.0161624838567656e-06, - "loss": 0.80200177, - "num_input_tokens_seen": 183275515, - "step": 8524, - "time_per_iteration": 5.938940763473511 - }, - { - "auxiliary_loss_clip": 0.0110059, - "auxiliary_loss_mlp": 0.01035868, - "balance_loss_clip": 1.04444933, - "balance_loss_mlp": 1.02287436, - "epoch": 0.5125507289944387, - "flos": 18880538405760.0, - "grad_norm": 2.5285806743725834, - "language_loss": 0.7489953, - "learning_rate": 2.015773034588706e-06, - "loss": 0.77035987, - "num_input_tokens_seen": 183293880, - "step": 8525, - "time_per_iteration": 2.6603550910949707 - }, - { - "auxiliary_loss_clip": 0.01100341, - "auxiliary_loss_mlp": 0.01045872, - "balance_loss_clip": 1.04424882, - "balance_loss_mlp": 1.02996945, - "epoch": 0.5126108522471066, - "flos": 35627838036480.0, - "grad_norm": 1.6545403659553666, - "language_loss": 0.74193799, - "learning_rate": 2.015383584722531e-06, - "loss": 0.76340014, - "num_input_tokens_seen": 183315860, - "step": 8526, - "time_per_iteration": 2.7631187438964844 - }, - { - "auxiliary_loss_clip": 0.01117967, - "auxiliary_loss_mlp": 0.010412, - "balance_loss_clip": 1.04805541, - "balance_loss_mlp": 1.02755094, - "epoch": 0.5126709754997746, - "flos": 20190918464640.0, - "grad_norm": 1.7970307477050764, - "language_loss": 0.65624464, - "learning_rate": 2.0149941342730088e-06, - "loss": 0.6778363, - "num_input_tokens_seen": 183335480, - "step": 8527, - "time_per_iteration": 4.185753107070923 - }, - { - "auxiliary_loss_clip": 0.01099112, - "auxiliary_loss_mlp": 0.01038782, - "balance_loss_clip": 1.04767573, - "balance_loss_mlp": 1.02663493, - "epoch": 0.5127310987524425, - "flos": 18588548747520.0, - "grad_norm": 1.4652981434759074, - "language_loss": 0.74246556, - "learning_rate": 2.014604683254908e-06, - "loss": 0.76384449, - "num_input_tokens_seen": 183354395, - "step": 8528, - "time_per_iteration": 2.647268056869507 - }, - { - "auxiliary_loss_clip": 0.01110552, - "auxiliary_loss_mlp": 0.01034843, - "balance_loss_clip": 1.04382324, - "balance_loss_mlp": 1.02143764, - "epoch": 0.5127912220051105, - "flos": 22454691264000.0, - "grad_norm": 1.6345499952693072, - "language_loss": 0.82838154, - "learning_rate": 2.014215231682995e-06, - "loss": 0.84983552, - "num_input_tokens_seen": 183372980, - "step": 8529, - "time_per_iteration": 2.6546859741210938 - }, - { - "auxiliary_loss_clip": 0.0107231, - "auxiliary_loss_mlp": 0.01034968, - "balance_loss_clip": 1.04131067, - "balance_loss_mlp": 1.02149725, - "epoch": 0.5128513452577784, - "flos": 19093703667840.0, - "grad_norm": 2.6019709601767866, - "language_loss": 0.73687661, - "learning_rate": 2.01382577957204e-06, - "loss": 0.75794935, - "num_input_tokens_seen": 183390160, - "step": 8530, - "time_per_iteration": 2.754840612411499 - }, - { - "auxiliary_loss_clip": 0.01018433, - "auxiliary_loss_mlp": 0.01003338, - "balance_loss_clip": 1.02142978, - "balance_loss_mlp": 1.00163293, - "epoch": 0.5129114685104464, - "flos": 67892285243520.0, - "grad_norm": 0.7482622882096543, - "language_loss": 0.60775113, - "learning_rate": 2.0134363269368095e-06, - "loss": 0.62796879, - "num_input_tokens_seen": 183455280, - "step": 8531, - "time_per_iteration": 3.331425666809082 - }, - { - "auxiliary_loss_clip": 0.01096599, - "auxiliary_loss_mlp": 0.01039227, - "balance_loss_clip": 1.04599643, - "balance_loss_mlp": 1.02387309, - "epoch": 0.5129715917631144, - "flos": 20449152316800.0, - "grad_norm": 1.6723134032232012, - "language_loss": 0.76866412, - "learning_rate": 2.0130468737920725e-06, - "loss": 0.79002237, - "num_input_tokens_seen": 183473955, - "step": 8532, - "time_per_iteration": 2.8071939945220947 - }, - { - "auxiliary_loss_clip": 0.0110043, - "auxiliary_loss_mlp": 0.01036596, - "balance_loss_clip": 1.0434345, - "balance_loss_mlp": 1.02273178, - "epoch": 0.5130317150157824, - "flos": 35116146840960.0, - "grad_norm": 4.28948987854823, - "language_loss": 0.67031407, - "learning_rate": 2.012657420152597e-06, - "loss": 0.69168431, - "num_input_tokens_seen": 183497195, - "step": 8533, - "time_per_iteration": 2.7799179553985596 - }, - { - "auxiliary_loss_clip": 0.01094678, - "auxiliary_loss_mlp": 0.01039401, - "balance_loss_clip": 1.04602468, - "balance_loss_mlp": 1.02452362, - "epoch": 0.5130918382684503, - "flos": 19791627903360.0, - "grad_norm": 1.9915175591272611, - "language_loss": 0.8200537, - "learning_rate": 2.01226796603315e-06, - "loss": 0.84139454, - "num_input_tokens_seen": 183513675, - "step": 8534, - "time_per_iteration": 2.6692066192626953 - }, - { - "auxiliary_loss_clip": 0.01111793, - "auxiliary_loss_mlp": 0.01038613, - "balance_loss_clip": 1.04316652, - "balance_loss_mlp": 1.02398574, - "epoch": 0.5131519615211183, - "flos": 26323096337280.0, - "grad_norm": 1.4683279633381257, - "language_loss": 0.63850307, - "learning_rate": 2.0118785114485017e-06, - "loss": 0.66000712, - "num_input_tokens_seen": 183535165, - "step": 8535, - "time_per_iteration": 2.6881463527679443 - }, - { - "auxiliary_loss_clip": 0.01118055, - "auxiliary_loss_mlp": 0.01031488, - "balance_loss_clip": 1.04930139, - "balance_loss_mlp": 1.01707554, - "epoch": 0.5132120847737862, - "flos": 19171917532800.0, - "grad_norm": 1.558826189326605, - "language_loss": 0.69832361, - "learning_rate": 2.011489056413418e-06, - "loss": 0.71981907, - "num_input_tokens_seen": 183553780, - "step": 8536, - "time_per_iteration": 2.7181568145751953 - }, - { - "auxiliary_loss_clip": 0.01116762, - "auxiliary_loss_mlp": 0.01038725, - "balance_loss_clip": 1.04751253, - "balance_loss_mlp": 1.02378178, - "epoch": 0.5132722080264542, - "flos": 20230420446720.0, - "grad_norm": 1.9464397996960447, - "language_loss": 0.70725036, - "learning_rate": 2.011099600942669e-06, - "loss": 0.72880518, - "num_input_tokens_seen": 183572285, - "step": 8537, - "time_per_iteration": 2.6996657848358154 - }, - { - "auxiliary_loss_clip": 0.01080908, - "auxiliary_loss_mlp": 0.01034474, - "balance_loss_clip": 1.04291606, - "balance_loss_mlp": 1.02007353, - "epoch": 0.5133323312791223, - "flos": 16469459930880.0, - "grad_norm": 1.8282608051087097, - "language_loss": 0.8028723, - "learning_rate": 2.0107101450510214e-06, - "loss": 0.82402611, - "num_input_tokens_seen": 183589330, - "step": 8538, - "time_per_iteration": 2.752685308456421 - }, - { - "auxiliary_loss_clip": 0.01113197, - "auxiliary_loss_mlp": 0.01031357, - "balance_loss_clip": 1.0443325, - "balance_loss_mlp": 1.01739144, - "epoch": 0.5133924545317902, - "flos": 26068094709120.0, - "grad_norm": 2.0083592119837403, - "language_loss": 0.78388107, - "learning_rate": 2.0103206887532437e-06, - "loss": 0.80532658, - "num_input_tokens_seen": 183609205, - "step": 8539, - "time_per_iteration": 2.6856329441070557 - }, - { - "auxiliary_loss_clip": 0.0109867, - "auxiliary_loss_mlp": 0.01033877, - "balance_loss_clip": 1.04138374, - "balance_loss_mlp": 1.01994729, - "epoch": 0.5134525777844582, - "flos": 29131023248640.0, - "grad_norm": 1.7382927125385157, - "language_loss": 0.76111883, - "learning_rate": 2.009931232064105e-06, - "loss": 0.78244424, - "num_input_tokens_seen": 183629985, - "step": 8540, - "time_per_iteration": 2.780198574066162 - }, - { - "auxiliary_loss_clip": 0.01074682, - "auxiliary_loss_mlp": 0.01038818, - "balance_loss_clip": 1.04355264, - "balance_loss_mlp": 1.02344, - "epoch": 0.5135127010371261, - "flos": 17454776883840.0, - "grad_norm": 1.7132610384814069, - "language_loss": 0.746566, - "learning_rate": 2.0095417749983724e-06, - "loss": 0.76770097, - "num_input_tokens_seen": 183648220, - "step": 8541, - "time_per_iteration": 2.6982674598693848 - }, - { - "auxiliary_loss_clip": 0.01060333, - "auxiliary_loss_mlp": 0.01039276, - "balance_loss_clip": 1.0412941, - "balance_loss_mlp": 1.02475083, - "epoch": 0.5135728242897941, - "flos": 21944975316480.0, - "grad_norm": 1.5289233613121331, - "language_loss": 0.70432508, - "learning_rate": 2.0091523175708162e-06, - "loss": 0.72532117, - "num_input_tokens_seen": 183668230, - "step": 8542, - "time_per_iteration": 2.783440113067627 - }, - { - "auxiliary_loss_clip": 0.01102439, - "auxiliary_loss_mlp": 0.01029643, - "balance_loss_clip": 1.04426861, - "balance_loss_mlp": 1.01601708, - "epoch": 0.513632947542462, - "flos": 22674859678080.0, - "grad_norm": 1.886898343071389, - "language_loss": 0.79691696, - "learning_rate": 2.0087628597962023e-06, - "loss": 0.81823772, - "num_input_tokens_seen": 183687800, - "step": 8543, - "time_per_iteration": 2.906564950942993 - }, - { - "auxiliary_loss_clip": 0.01101285, - "auxiliary_loss_mlp": 0.01044679, - "balance_loss_clip": 1.04514194, - "balance_loss_mlp": 1.03012979, - "epoch": 0.51369307079513, - "flos": 29457163762560.0, - "grad_norm": 1.7217499667212701, - "language_loss": 0.67941636, - "learning_rate": 2.008373401689299e-06, - "loss": 0.700876, - "num_input_tokens_seen": 183709025, - "step": 8544, - "time_per_iteration": 2.815377950668335 - }, - { - "auxiliary_loss_clip": 0.01086355, - "auxiliary_loss_mlp": 0.01049073, - "balance_loss_clip": 1.03878117, - "balance_loss_mlp": 1.03430903, - "epoch": 0.513753194047798, - "flos": 18989347680000.0, - "grad_norm": 2.2112374430559214, - "language_loss": 0.72265953, - "learning_rate": 2.0079839432648765e-06, - "loss": 0.74401385, - "num_input_tokens_seen": 183725740, - "step": 8545, - "time_per_iteration": 2.7677536010742188 - }, - { - "auxiliary_loss_clip": 0.01115821, - "auxiliary_loss_mlp": 0.01045255, - "balance_loss_clip": 1.04458177, - "balance_loss_mlp": 1.03013897, - "epoch": 0.513813317300466, - "flos": 17821855923840.0, - "grad_norm": 2.431720560794894, - "language_loss": 0.82277304, - "learning_rate": 2.0075944845377016e-06, - "loss": 0.84438378, - "num_input_tokens_seen": 183743995, - "step": 8546, - "time_per_iteration": 2.6764519214630127 - }, - { - "auxiliary_loss_clip": 0.01110159, - "auxiliary_loss_mlp": 0.01037047, - "balance_loss_clip": 1.0421015, - "balance_loss_mlp": 1.02272379, - "epoch": 0.5138734405531339, - "flos": 24061191045120.0, - "grad_norm": 1.829642419824105, - "language_loss": 0.73038638, - "learning_rate": 2.007205025522544e-06, - "loss": 0.75185841, - "num_input_tokens_seen": 183764150, - "step": 8547, - "time_per_iteration": 2.664536714553833 - }, - { - "auxiliary_loss_clip": 0.01112692, - "auxiliary_loss_mlp": 0.01048016, - "balance_loss_clip": 1.04215682, - "balance_loss_mlp": 1.03369892, - "epoch": 0.5139335638058019, - "flos": 26097253574400.0, - "grad_norm": 1.6776951969003835, - "language_loss": 0.73548347, - "learning_rate": 2.0068155662341702e-06, - "loss": 0.75709057, - "num_input_tokens_seen": 183783280, - "step": 8548, - "time_per_iteration": 2.6639697551727295 - }, - { - "auxiliary_loss_clip": 0.01086334, - "auxiliary_loss_mlp": 0.01037281, - "balance_loss_clip": 1.03931546, - "balance_loss_mlp": 1.02296984, - "epoch": 0.5139936870584698, - "flos": 18917095472640.0, - "grad_norm": 1.6001321585074282, - "language_loss": 0.82261604, - "learning_rate": 2.0064261066873495e-06, - "loss": 0.84385222, - "num_input_tokens_seen": 183800725, - "step": 8549, - "time_per_iteration": 2.748581886291504 - }, - { - "auxiliary_loss_clip": 0.01115178, - "auxiliary_loss_mlp": 0.01033379, - "balance_loss_clip": 1.04665935, - "balance_loss_mlp": 1.0205524, - "epoch": 0.5140538103111378, - "flos": 16144001775360.0, - "grad_norm": 1.9742432137522015, - "language_loss": 0.71977437, - "learning_rate": 2.0060366468968504e-06, - "loss": 0.74125993, - "num_input_tokens_seen": 183818735, - "step": 8550, - "time_per_iteration": 2.651068687438965 - }, - { - "auxiliary_loss_clip": 0.01112958, - "auxiliary_loss_mlp": 0.01041915, - "balance_loss_clip": 1.04612732, - "balance_loss_mlp": 1.02725196, - "epoch": 0.5141139335638057, - "flos": 22420145358720.0, - "grad_norm": 1.8069208573649895, - "language_loss": 0.75043917, - "learning_rate": 2.0056471868774408e-06, - "loss": 0.77198792, - "num_input_tokens_seen": 183840015, - "step": 8551, - "time_per_iteration": 2.7058589458465576 - }, - { - "auxiliary_loss_clip": 0.01093993, - "auxiliary_loss_mlp": 0.01037756, - "balance_loss_clip": 1.0459106, - "balance_loss_mlp": 1.0240587, - "epoch": 0.5141740568164738, - "flos": 27089645506560.0, - "grad_norm": 1.6630090206247619, - "language_loss": 0.69182396, - "learning_rate": 2.0052577266438897e-06, - "loss": 0.71314144, - "num_input_tokens_seen": 183860145, - "step": 8552, - "time_per_iteration": 2.7040834426879883 - }, - { - "auxiliary_loss_clip": 0.01114038, - "auxiliary_loss_mlp": 0.01039378, - "balance_loss_clip": 1.04381299, - "balance_loss_mlp": 1.02445841, - "epoch": 0.5142341800691418, - "flos": 24973250209920.0, - "grad_norm": 2.1567314432200364, - "language_loss": 0.753088, - "learning_rate": 2.004868266210965e-06, - "loss": 0.7746222, - "num_input_tokens_seen": 183880540, - "step": 8553, - "time_per_iteration": 2.6321310997009277 - }, - { - "auxiliary_loss_clip": 0.01125852, - "auxiliary_loss_mlp": 0.0104126, - "balance_loss_clip": 1.04767513, - "balance_loss_mlp": 1.02800989, - "epoch": 0.5142943033218097, - "flos": 20704513080960.0, - "grad_norm": 1.7807872167537822, - "language_loss": 0.67740041, - "learning_rate": 2.004478805593435e-06, - "loss": 0.69907153, - "num_input_tokens_seen": 183900895, - "step": 8554, - "time_per_iteration": 2.5353291034698486 - }, - { - "auxiliary_loss_clip": 0.01118225, - "auxiliary_loss_mlp": 0.01040414, - "balance_loss_clip": 1.04483485, - "balance_loss_mlp": 1.02390337, - "epoch": 0.5143544265744777, - "flos": 22925479847040.0, - "grad_norm": 1.822401657137422, - "language_loss": 0.73321033, - "learning_rate": 2.004089344806068e-06, - "loss": 0.75479674, - "num_input_tokens_seen": 183920335, - "step": 8555, - "time_per_iteration": 2.8193295001983643 - }, - { - "auxiliary_loss_clip": 0.01089525, - "auxiliary_loss_mlp": 0.01039524, - "balance_loss_clip": 1.04645813, - "balance_loss_mlp": 1.02570128, - "epoch": 0.5144145498271456, - "flos": 15921391236480.0, - "grad_norm": 2.4707318139003327, - "language_loss": 0.74175709, - "learning_rate": 2.003699883863633e-06, - "loss": 0.76304758, - "num_input_tokens_seen": 183936220, - "step": 8556, - "time_per_iteration": 2.721573829650879 - }, - { - "auxiliary_loss_clip": 0.0109284, - "auxiliary_loss_mlp": 0.01036355, - "balance_loss_clip": 1.04400861, - "balance_loss_mlp": 1.02320015, - "epoch": 0.5144746730798136, - "flos": 19681238430720.0, - "grad_norm": 1.790105253554859, - "language_loss": 0.85782719, - "learning_rate": 2.003310422780898e-06, - "loss": 0.87911922, - "num_input_tokens_seen": 183953250, - "step": 8557, - "time_per_iteration": 2.70686674118042 - }, - { - "auxiliary_loss_clip": 0.01106764, - "auxiliary_loss_mlp": 0.01043673, - "balance_loss_clip": 1.04357624, - "balance_loss_mlp": 1.0292908, - "epoch": 0.5145347963324816, - "flos": 23914711382400.0, - "grad_norm": 1.6124493392185149, - "language_loss": 0.88770819, - "learning_rate": 2.0029209615726307e-06, - "loss": 0.90921259, - "num_input_tokens_seen": 183973865, - "step": 8558, - "time_per_iteration": 2.7256360054016113 - }, - { - "auxiliary_loss_clip": 0.01123218, - "auxiliary_loss_mlp": 0.00770892, - "balance_loss_clip": 1.04631722, - "balance_loss_mlp": 1.00014222, - "epoch": 0.5145949195851496, - "flos": 18260002022400.0, - "grad_norm": 2.0888380287595196, - "language_loss": 0.65300936, - "learning_rate": 2.002531500253602e-06, - "loss": 0.67195046, - "num_input_tokens_seen": 183992555, - "step": 8559, - "time_per_iteration": 2.64591646194458 - }, - { - "auxiliary_loss_clip": 0.01108519, - "auxiliary_loss_mlp": 0.00771269, - "balance_loss_clip": 1.04542136, - "balance_loss_mlp": 1.00025797, - "epoch": 0.5146550428378175, - "flos": 26213425136640.0, - "grad_norm": 1.9572467781311524, - "language_loss": 0.63094109, - "learning_rate": 2.002142038838577e-06, - "loss": 0.64973897, - "num_input_tokens_seen": 184010825, - "step": 8560, - "time_per_iteration": 4.225303888320923 - }, - { - "auxiliary_loss_clip": 0.0112394, - "auxiliary_loss_mlp": 0.01031949, - "balance_loss_clip": 1.04584384, - "balance_loss_mlp": 1.01820433, - "epoch": 0.5147151660904855, - "flos": 22674177319680.0, - "grad_norm": 1.85112269234195, - "language_loss": 0.70142567, - "learning_rate": 2.0017525773423265e-06, - "loss": 0.72298455, - "num_input_tokens_seen": 184030155, - "step": 8561, - "time_per_iteration": 2.6462759971618652 - }, - { - "auxiliary_loss_clip": 0.01099376, - "auxiliary_loss_mlp": 0.01032154, - "balance_loss_clip": 1.04134226, - "balance_loss_mlp": 1.01888585, - "epoch": 0.5147752893431534, - "flos": 24972388283520.0, - "grad_norm": 1.6885707870282478, - "language_loss": 0.66502726, - "learning_rate": 2.0013631157796177e-06, - "loss": 0.6863426, - "num_input_tokens_seen": 184051440, - "step": 8562, - "time_per_iteration": 2.6790151596069336 - }, - { - "auxiliary_loss_clip": 0.01118509, - "auxiliary_loss_mlp": 0.01035134, - "balance_loss_clip": 1.04731929, - "balance_loss_mlp": 1.02153838, - "epoch": 0.5148354125958214, - "flos": 22744669760640.0, - "grad_norm": 1.6641105551237323, - "language_loss": 0.77625287, - "learning_rate": 2.0009736541652188e-06, - "loss": 0.79778934, - "num_input_tokens_seen": 184070205, - "step": 8563, - "time_per_iteration": 5.86843466758728 - }, - { - "auxiliary_loss_clip": 0.01117165, - "auxiliary_loss_mlp": 0.01035106, - "balance_loss_clip": 1.04520798, - "balance_loss_mlp": 1.01931095, - "epoch": 0.5148955358484893, - "flos": 23068763199360.0, - "grad_norm": 1.8668644890701778, - "language_loss": 0.82346904, - "learning_rate": 2.0005841925139e-06, - "loss": 0.84499174, - "num_input_tokens_seen": 184087345, - "step": 8564, - "time_per_iteration": 2.6531171798706055 - }, - { - "auxiliary_loss_clip": 0.01105481, - "auxiliary_loss_mlp": 0.01035772, - "balance_loss_clip": 1.04333782, - "balance_loss_mlp": 1.02130592, - "epoch": 0.5149556591011574, - "flos": 20340127560960.0, - "grad_norm": 1.6929228826937828, - "language_loss": 0.73255026, - "learning_rate": 2.0001947308404283e-06, - "loss": 0.75396281, - "num_input_tokens_seen": 184107110, - "step": 8565, - "time_per_iteration": 2.8100740909576416 - }, - { - "auxiliary_loss_clip": 0.0111614, - "auxiliary_loss_mlp": 0.01036767, - "balance_loss_clip": 1.04448807, - "balance_loss_mlp": 1.02056694, - "epoch": 0.5150157823538254, - "flos": 22638230784000.0, - "grad_norm": 2.0356075529568596, - "language_loss": 0.68441874, - "learning_rate": 1.9998052691595715e-06, - "loss": 0.70594788, - "num_input_tokens_seen": 184127105, - "step": 8566, - "time_per_iteration": 4.174206972122192 - }, - { - "auxiliary_loss_clip": 0.01126685, - "auxiliary_loss_mlp": 0.00772285, - "balance_loss_clip": 1.04328656, - "balance_loss_mlp": 1.00031221, - "epoch": 0.5150759056064933, - "flos": 26067627832320.0, - "grad_norm": 1.624621701105177, - "language_loss": 0.78153682, - "learning_rate": 1.9994158074861005e-06, - "loss": 0.80052656, - "num_input_tokens_seen": 184148060, - "step": 8567, - "time_per_iteration": 2.6405906677246094 - }, - { - "auxiliary_loss_clip": 0.01115866, - "auxiliary_loss_mlp": 0.01034427, - "balance_loss_clip": 1.0444839, - "balance_loss_mlp": 1.01929939, - "epoch": 0.5151360288591613, - "flos": 25952641418880.0, - "grad_norm": 2.181301277452511, - "language_loss": 0.79243255, - "learning_rate": 1.9990263458347806e-06, - "loss": 0.81393552, - "num_input_tokens_seen": 184166175, - "step": 8568, - "time_per_iteration": 2.6806869506835938 - }, - { - "auxiliary_loss_clip": 0.01100678, - "auxiliary_loss_mlp": 0.01033449, - "balance_loss_clip": 1.04264474, - "balance_loss_mlp": 1.02017546, - "epoch": 0.5151961521118292, - "flos": 18507246312960.0, - "grad_norm": 2.356580017264164, - "language_loss": 0.9131906, - "learning_rate": 1.9986368842203825e-06, - "loss": 0.93453181, - "num_input_tokens_seen": 184182600, - "step": 8569, - "time_per_iteration": 2.6493630409240723 - }, - { - "auxiliary_loss_clip": 0.01128863, - "auxiliary_loss_mlp": 0.01034527, - "balance_loss_clip": 1.04688525, - "balance_loss_mlp": 1.0198164, - "epoch": 0.5152562753644973, - "flos": 22233696837120.0, - "grad_norm": 2.0115285980006967, - "language_loss": 0.76725376, - "learning_rate": 1.998247422657674e-06, - "loss": 0.78888762, - "num_input_tokens_seen": 184202020, - "step": 8570, - "time_per_iteration": 2.6327102184295654 - }, - { - "auxiliary_loss_clip": 0.01115897, - "auxiliary_loss_mlp": 0.01044719, - "balance_loss_clip": 1.04504037, - "balance_loss_mlp": 1.02880454, - "epoch": 0.5153163986171652, - "flos": 38436555047040.0, - "grad_norm": 1.735564613465363, - "language_loss": 0.73986542, - "learning_rate": 1.9978579611614227e-06, - "loss": 0.76147163, - "num_input_tokens_seen": 184224850, - "step": 8571, - "time_per_iteration": 2.879904270172119 - }, - { - "auxiliary_loss_clip": 0.01031454, - "auxiliary_loss_mlp": 0.01001432, - "balance_loss_clip": 1.02375364, - "balance_loss_mlp": 1.00009048, - "epoch": 0.5153765218698332, - "flos": 66384503015040.0, - "grad_norm": 0.7786581254678329, - "language_loss": 0.52855021, - "learning_rate": 1.9974684997463984e-06, - "loss": 0.54887909, - "num_input_tokens_seen": 184288520, - "step": 8572, - "time_per_iteration": 3.2987639904022217 - }, - { - "auxiliary_loss_clip": 0.01112833, - "auxiliary_loss_mlp": 0.01038733, - "balance_loss_clip": 1.04641247, - "balance_loss_mlp": 1.02542353, - "epoch": 0.5154366451225011, - "flos": 24024669891840.0, - "grad_norm": 1.82770535610101, - "language_loss": 0.76185274, - "learning_rate": 1.9970790384273687e-06, - "loss": 0.78336841, - "num_input_tokens_seen": 184308565, - "step": 8573, - "time_per_iteration": 2.6767003536224365 - }, - { - "auxiliary_loss_clip": 0.01111651, - "auxiliary_loss_mlp": 0.01028763, - "balance_loss_clip": 1.04382682, - "balance_loss_mlp": 1.01498199, - "epoch": 0.5154967683751691, - "flos": 23468843859840.0, - "grad_norm": 2.7144169534848976, - "language_loss": 0.77198601, - "learning_rate": 1.996689577219102e-06, - "loss": 0.7933901, - "num_input_tokens_seen": 184326795, - "step": 8574, - "time_per_iteration": 2.6607704162597656 - }, - { - "auxiliary_loss_clip": 0.01099994, - "auxiliary_loss_mlp": 0.01033635, - "balance_loss_clip": 1.04476404, - "balance_loss_mlp": 1.02018237, - "epoch": 0.515556891627837, - "flos": 23805650712960.0, - "grad_norm": 3.244613949266341, - "language_loss": 0.8558231, - "learning_rate": 1.996300116136367e-06, - "loss": 0.87715936, - "num_input_tokens_seen": 184345990, - "step": 8575, - "time_per_iteration": 2.6699635982513428 - }, - { - "auxiliary_loss_clip": 0.01113561, - "auxiliary_loss_mlp": 0.0103516, - "balance_loss_clip": 1.04307377, - "balance_loss_mlp": 1.02077138, - "epoch": 0.515617014880505, - "flos": 19828544106240.0, - "grad_norm": 1.6301780240264319, - "language_loss": 0.76920515, - "learning_rate": 1.995910655193932e-06, - "loss": 0.79069233, - "num_input_tokens_seen": 184366300, - "step": 8576, - "time_per_iteration": 2.7603139877319336 - }, - { - "auxiliary_loss_clip": 0.01078348, - "auxiliary_loss_mlp": 0.00773356, - "balance_loss_clip": 1.04196084, - "balance_loss_mlp": 1.00032973, - "epoch": 0.515677138133173, - "flos": 14245907385600.0, - "grad_norm": 2.480047069773859, - "language_loss": 0.76414418, - "learning_rate": 1.9955211944065654e-06, - "loss": 0.78266126, - "num_input_tokens_seen": 184383030, - "step": 8577, - "time_per_iteration": 2.694549083709717 - }, - { - "auxiliary_loss_clip": 0.01099471, - "auxiliary_loss_mlp": 0.01044811, - "balance_loss_clip": 1.04260516, - "balance_loss_mlp": 1.0279547, - "epoch": 0.515737261385841, - "flos": 28289707920000.0, - "grad_norm": 1.7162174586848327, - "language_loss": 0.80910254, - "learning_rate": 1.9951317337890353e-06, - "loss": 0.83054537, - "num_input_tokens_seen": 184403410, - "step": 8578, - "time_per_iteration": 2.740527391433716 - }, - { - "auxiliary_loss_clip": 0.01121615, - "auxiliary_loss_mlp": 0.01032969, - "balance_loss_clip": 1.04364657, - "balance_loss_mlp": 1.01914644, - "epoch": 0.515797384638509, - "flos": 27891925729920.0, - "grad_norm": 1.8526777225789184, - "language_loss": 0.75880611, - "learning_rate": 1.9947422733561105e-06, - "loss": 0.780352, - "num_input_tokens_seen": 184423830, - "step": 8579, - "time_per_iteration": 2.6643004417419434 - }, - { - "auxiliary_loss_clip": 0.01087857, - "auxiliary_loss_mlp": 0.01032352, - "balance_loss_clip": 1.04332745, - "balance_loss_mlp": 1.01849377, - "epoch": 0.5158575078911769, - "flos": 23040071210880.0, - "grad_norm": 3.647152473791378, - "language_loss": 0.7862978, - "learning_rate": 1.994352813122559e-06, - "loss": 0.80749989, - "num_input_tokens_seen": 184445050, - "step": 8580, - "time_per_iteration": 2.74796986579895 - }, - { - "auxiliary_loss_clip": 0.01086006, - "auxiliary_loss_mlp": 0.0104917, - "balance_loss_clip": 1.04050803, - "balance_loss_mlp": 1.03265989, - "epoch": 0.5159176311438449, - "flos": 12641346938880.0, - "grad_norm": 2.0718752995567966, - "language_loss": 0.73151392, - "learning_rate": 1.99396335310315e-06, - "loss": 0.75286567, - "num_input_tokens_seen": 184460775, - "step": 8581, - "time_per_iteration": 2.6738648414611816 - }, - { - "auxiliary_loss_clip": 0.01114558, - "auxiliary_loss_mlp": 0.01033417, - "balance_loss_clip": 1.0463438, - "balance_loss_mlp": 1.01976788, - "epoch": 0.5159777543965128, - "flos": 15558154951680.0, - "grad_norm": 2.080206363710033, - "language_loss": 0.74150515, - "learning_rate": 1.9935738933126508e-06, - "loss": 0.76298487, - "num_input_tokens_seen": 184477365, - "step": 8582, - "time_per_iteration": 2.649186134338379 - }, - { - "auxiliary_loss_clip": 0.01085634, - "auxiliary_loss_mlp": 0.0103519, - "balance_loss_clip": 1.04351485, - "balance_loss_mlp": 1.02202952, - "epoch": 0.5160378776491809, - "flos": 23221671396480.0, - "grad_norm": 4.912834420865202, - "language_loss": 0.65803373, - "learning_rate": 1.99318443376583e-06, - "loss": 0.67924196, - "num_input_tokens_seen": 184497045, - "step": 8583, - "time_per_iteration": 2.7025017738342285 - }, - { - "auxiliary_loss_clip": 0.0111508, - "auxiliary_loss_mlp": 0.01037055, - "balance_loss_clip": 1.04503357, - "balance_loss_mlp": 1.02199888, - "epoch": 0.5160980009018488, - "flos": 21944616180480.0, - "grad_norm": 1.4135833939266678, - "language_loss": 0.76130998, - "learning_rate": 1.9927949744774568e-06, - "loss": 0.78283131, - "num_input_tokens_seen": 184517675, - "step": 8584, - "time_per_iteration": 2.662471294403076 - }, - { - "auxiliary_loss_clip": 0.01093144, - "auxiliary_loss_mlp": 0.01043062, - "balance_loss_clip": 1.0425117, - "balance_loss_mlp": 1.02877474, - "epoch": 0.5161581241545168, - "flos": 22784064001920.0, - "grad_norm": 2.700643227023907, - "language_loss": 0.79112214, - "learning_rate": 1.9924055154622983e-06, - "loss": 0.81248415, - "num_input_tokens_seen": 184537745, - "step": 8585, - "time_per_iteration": 2.727789878845215 - }, - { - "auxiliary_loss_clip": 0.01105983, - "auxiliary_loss_mlp": 0.01033747, - "balance_loss_clip": 1.0444293, - "balance_loss_mlp": 1.02064013, - "epoch": 0.5162182474071847, - "flos": 19675384513920.0, - "grad_norm": 2.398879690546405, - "language_loss": 0.81236124, - "learning_rate": 1.9920160567351238e-06, - "loss": 0.83375853, - "num_input_tokens_seen": 184553630, - "step": 8586, - "time_per_iteration": 2.6371195316314697 - }, - { - "auxiliary_loss_clip": 0.01106215, - "auxiliary_loss_mlp": 0.0103541, - "balance_loss_clip": 1.04690671, - "balance_loss_mlp": 1.02083015, - "epoch": 0.5162783706598527, - "flos": 20046198568320.0, - "grad_norm": 1.819724898525227, - "language_loss": 0.71372288, - "learning_rate": 1.991626598310701e-06, - "loss": 0.73513913, - "num_input_tokens_seen": 184573530, - "step": 8587, - "time_per_iteration": 2.7760136127471924 - }, - { - "auxiliary_loss_clip": 0.01038098, - "auxiliary_loss_mlp": 0.01008101, - "balance_loss_clip": 1.02063632, - "balance_loss_mlp": 1.00669408, - "epoch": 0.5163384939125206, - "flos": 69959553713280.0, - "grad_norm": 0.7288340121404665, - "language_loss": 0.57740283, - "learning_rate": 1.9912371402037984e-06, - "loss": 0.59786481, - "num_input_tokens_seen": 184637875, - "step": 8588, - "time_per_iteration": 3.183241844177246 - }, - { - "auxiliary_loss_clip": 0.01101129, - "auxiliary_loss_mlp": 0.01040283, - "balance_loss_clip": 1.04456651, - "balance_loss_mlp": 1.02572727, - "epoch": 0.5163986171651886, - "flos": 17417034668160.0, - "grad_norm": 1.7775907605960104, - "language_loss": 0.75007761, - "learning_rate": 1.990847682429185e-06, - "loss": 0.77149177, - "num_input_tokens_seen": 184656125, - "step": 8589, - "time_per_iteration": 2.8228790760040283 - }, - { - "auxiliary_loss_clip": 0.01117201, - "auxiliary_loss_mlp": 0.01029876, - "balance_loss_clip": 1.04574263, - "balance_loss_mlp": 1.01678646, - "epoch": 0.5164587404178566, - "flos": 21322679166720.0, - "grad_norm": 1.76753328713407, - "language_loss": 0.67530292, - "learning_rate": 1.990458225001627e-06, - "loss": 0.69677365, - "num_input_tokens_seen": 184675920, - "step": 8590, - "time_per_iteration": 2.6443076133728027 - }, - { - "auxiliary_loss_clip": 0.0104106, - "auxiliary_loss_mlp": 0.01004207, - "balance_loss_clip": 1.02416718, - "balance_loss_mlp": 1.00274086, - "epoch": 0.5165188636705246, - "flos": 68057149691520.0, - "grad_norm": 1.576071766619913, - "language_loss": 0.55832803, - "learning_rate": 1.990068767935895e-06, - "loss": 0.57878071, - "num_input_tokens_seen": 184730520, - "step": 8591, - "time_per_iteration": 3.062364101409912 - }, - { - "auxiliary_loss_clip": 0.01096175, - "auxiliary_loss_mlp": 0.0102813, - "balance_loss_clip": 1.04139185, - "balance_loss_mlp": 1.01549983, - "epoch": 0.5165789869231926, - "flos": 19385657412480.0, - "grad_norm": 1.5710435869577224, - "language_loss": 0.81707442, - "learning_rate": 1.9896793112467566e-06, - "loss": 0.83831745, - "num_input_tokens_seen": 184748340, - "step": 8592, - "time_per_iteration": 2.6631641387939453 - }, - { - "auxiliary_loss_clip": 0.01108366, - "auxiliary_loss_mlp": 0.01031712, - "balance_loss_clip": 1.04346967, - "balance_loss_mlp": 1.01837873, - "epoch": 0.5166391101758605, - "flos": 20960197067520.0, - "grad_norm": 2.447309188835127, - "language_loss": 0.83472121, - "learning_rate": 1.989289854948979e-06, - "loss": 0.85612202, - "num_input_tokens_seen": 184766615, - "step": 8593, - "time_per_iteration": 2.6486148834228516 - }, - { - "auxiliary_loss_clip": 0.01097046, - "auxiliary_loss_mlp": 0.01044386, - "balance_loss_clip": 1.04197097, - "balance_loss_mlp": 1.02946699, - "epoch": 0.5166992334285285, - "flos": 29462407148160.0, - "grad_norm": 2.3092045349550374, - "language_loss": 0.69423366, - "learning_rate": 1.9889003990573314e-06, - "loss": 0.71564794, - "num_input_tokens_seen": 184788075, - "step": 8594, - "time_per_iteration": 2.7182230949401855 - }, - { - "auxiliary_loss_clip": 0.01082123, - "auxiliary_loss_mlp": 0.01030642, - "balance_loss_clip": 1.04193354, - "balance_loss_mlp": 1.01663446, - "epoch": 0.5167593566811964, - "flos": 20304360593280.0, - "grad_norm": 1.4197237581629922, - "language_loss": 0.77434355, - "learning_rate": 1.988510943586582e-06, - "loss": 0.79547119, - "num_input_tokens_seen": 184808710, - "step": 8595, - "time_per_iteration": 2.7374019622802734 - }, - { - "auxiliary_loss_clip": 0.01123588, - "auxiliary_loss_mlp": 0.01039202, - "balance_loss_clip": 1.0457046, - "balance_loss_mlp": 1.02551079, - "epoch": 0.5168194799338645, - "flos": 14611370313600.0, - "grad_norm": 1.5026096017220443, - "language_loss": 0.650635, - "learning_rate": 1.9881214885514986e-06, - "loss": 0.67226291, - "num_input_tokens_seen": 184826475, - "step": 8596, - "time_per_iteration": 2.581263542175293 - }, - { - "auxiliary_loss_clip": 0.01083842, - "auxiliary_loss_mlp": 0.01032453, - "balance_loss_clip": 1.0427258, - "balance_loss_mlp": 1.01740873, - "epoch": 0.5168796031865324, - "flos": 25007257411200.0, - "grad_norm": 1.5566562133380693, - "language_loss": 0.75481033, - "learning_rate": 1.9877320339668492e-06, - "loss": 0.77597326, - "num_input_tokens_seen": 184845245, - "step": 8597, - "time_per_iteration": 2.741926670074463 - }, - { - "auxiliary_loss_clip": 0.01124007, - "auxiliary_loss_mlp": 0.01026784, - "balance_loss_clip": 1.04456997, - "balance_loss_mlp": 1.01349235, - "epoch": 0.5169397264392004, - "flos": 26939969533440.0, - "grad_norm": 1.5821649734534613, - "language_loss": 0.81177652, - "learning_rate": 1.987342579847403e-06, - "loss": 0.83328438, - "num_input_tokens_seen": 184866605, - "step": 8598, - "time_per_iteration": 2.690035343170166 - }, - { - "auxiliary_loss_clip": 0.01071801, - "auxiliary_loss_mlp": 0.01046328, - "balance_loss_clip": 1.03745472, - "balance_loss_mlp": 1.03122449, - "epoch": 0.5169998496918683, - "flos": 25407804948480.0, - "grad_norm": 1.4930779887062733, - "language_loss": 0.75179017, - "learning_rate": 1.9869531262079273e-06, - "loss": 0.77297151, - "num_input_tokens_seen": 184886945, - "step": 8599, - "time_per_iteration": 2.8392081260681152 - }, - { - "auxiliary_loss_clip": 0.01105064, - "auxiliary_loss_mlp": 0.01033083, - "balance_loss_clip": 1.04534984, - "balance_loss_mlp": 1.02013683, - "epoch": 0.5170599729445363, - "flos": 24680793674880.0, - "grad_norm": 2.7626803107212825, - "language_loss": 0.72095126, - "learning_rate": 1.9865636730631904e-06, - "loss": 0.7423327, - "num_input_tokens_seen": 184905590, - "step": 8600, - "time_per_iteration": 4.393568515777588 - }, - { - "auxiliary_loss_clip": 0.01085277, - "auxiliary_loss_mlp": 0.01034751, - "balance_loss_clip": 1.03932548, - "balance_loss_mlp": 1.02074337, - "epoch": 0.5171200961972042, - "flos": 20994455664000.0, - "grad_norm": 1.381905387614244, - "language_loss": 0.73886168, - "learning_rate": 1.9861742204279602e-06, - "loss": 0.76006198, - "num_input_tokens_seen": 184925555, - "step": 8601, - "time_per_iteration": 2.7736306190490723 - }, - { - "auxiliary_loss_clip": 0.01114158, - "auxiliary_loss_mlp": 0.01040835, - "balance_loss_clip": 1.04510868, - "balance_loss_mlp": 1.02620816, - "epoch": 0.5171802194498722, - "flos": 22745639427840.0, - "grad_norm": 2.1013626788591817, - "language_loss": 0.83703583, - "learning_rate": 1.9857847683170045e-06, - "loss": 0.85858572, - "num_input_tokens_seen": 184944490, - "step": 8602, - "time_per_iteration": 4.306191444396973 - }, - { - "auxiliary_loss_clip": 0.01124659, - "auxiliary_loss_mlp": 0.01033871, - "balance_loss_clip": 1.04496753, - "balance_loss_mlp": 1.01937509, - "epoch": 0.5172403427025402, - "flos": 28176732668160.0, - "grad_norm": 1.7451034136925476, - "language_loss": 0.74647379, - "learning_rate": 1.9853953167450926e-06, - "loss": 0.76805902, - "num_input_tokens_seen": 184963190, - "step": 8603, - "time_per_iteration": 2.73425030708313 - }, - { - "auxiliary_loss_clip": 0.01101467, - "auxiliary_loss_mlp": 0.01037433, - "balance_loss_clip": 1.04518127, - "balance_loss_mlp": 1.02431369, - "epoch": 0.5173004659552082, - "flos": 20337829090560.0, - "grad_norm": 2.1792209860390503, - "language_loss": 0.72349811, - "learning_rate": 1.9850058657269915e-06, - "loss": 0.74488711, - "num_input_tokens_seen": 184981220, - "step": 8604, - "time_per_iteration": 2.740248441696167 - }, - { - "auxiliary_loss_clip": 0.01107237, - "auxiliary_loss_mlp": 0.01042176, - "balance_loss_clip": 1.04422593, - "balance_loss_mlp": 1.02716208, - "epoch": 0.5173605892078762, - "flos": 19063323740160.0, - "grad_norm": 1.7719196350127329, - "language_loss": 0.85052991, - "learning_rate": 1.984616415277469e-06, - "loss": 0.87202406, - "num_input_tokens_seen": 184998810, - "step": 8605, - "time_per_iteration": 4.264687538146973 - }, - { - "auxiliary_loss_clip": 0.01107777, - "auxiliary_loss_mlp": 0.01027945, - "balance_loss_clip": 1.04396403, - "balance_loss_mlp": 1.01552308, - "epoch": 0.5174207124605441, - "flos": 27995168396160.0, - "grad_norm": 1.6794634480750013, - "language_loss": 0.64467752, - "learning_rate": 1.984226965411294e-06, - "loss": 0.6660347, - "num_input_tokens_seen": 185021185, - "step": 8606, - "time_per_iteration": 2.7390646934509277 - }, - { - "auxiliary_loss_clip": 0.01096289, - "auxiliary_loss_mlp": 0.01031967, - "balance_loss_clip": 1.04330635, - "balance_loss_mlp": 1.01885414, - "epoch": 0.5174808357132121, - "flos": 19496657416320.0, - "grad_norm": 1.503605725156866, - "language_loss": 0.77918422, - "learning_rate": 1.983837516143234e-06, - "loss": 0.80046678, - "num_input_tokens_seen": 185038465, - "step": 8607, - "time_per_iteration": 2.718864917755127 - }, - { - "auxiliary_loss_clip": 0.01114878, - "auxiliary_loss_mlp": 0.01036994, - "balance_loss_clip": 1.04531431, - "balance_loss_mlp": 1.0226177, - "epoch": 0.51754095896588, - "flos": 22784171742720.0, - "grad_norm": 2.7158797821524585, - "language_loss": 0.72334993, - "learning_rate": 1.983448067488057e-06, - "loss": 0.74486864, - "num_input_tokens_seen": 185057340, - "step": 8608, - "time_per_iteration": 2.767817258834839 - }, - { - "auxiliary_loss_clip": 0.01119837, - "auxiliary_loss_mlp": 0.01034295, - "balance_loss_clip": 1.04469681, - "balance_loss_mlp": 1.01979923, - "epoch": 0.5176010822185481, - "flos": 22669257156480.0, - "grad_norm": 1.8609844806921267, - "language_loss": 0.8623482, - "learning_rate": 1.983058619460531e-06, - "loss": 0.88388956, - "num_input_tokens_seen": 185074935, - "step": 8609, - "time_per_iteration": 2.8063855171203613 - }, - { - "auxiliary_loss_clip": 0.01111694, - "auxiliary_loss_mlp": 0.01037765, - "balance_loss_clip": 1.04306316, - "balance_loss_mlp": 1.02484906, - "epoch": 0.517661205471216, - "flos": 23951196622080.0, - "grad_norm": 2.050130502752804, - "language_loss": 0.73473549, - "learning_rate": 1.9826691720754237e-06, - "loss": 0.75623012, - "num_input_tokens_seen": 185095050, - "step": 8610, - "time_per_iteration": 2.740083694458008 - }, - { - "auxiliary_loss_clip": 0.01129954, - "auxiliary_loss_mlp": 0.01038598, - "balance_loss_clip": 1.04616904, - "balance_loss_mlp": 1.02353036, - "epoch": 0.517721328723884, - "flos": 15596076735360.0, - "grad_norm": 2.3590336184711926, - "language_loss": 0.67205131, - "learning_rate": 1.9822797253475034e-06, - "loss": 0.69373685, - "num_input_tokens_seen": 185112275, - "step": 8611, - "time_per_iteration": 2.648165464401245 - }, - { - "auxiliary_loss_clip": 0.01122336, - "auxiliary_loss_mlp": 0.01039403, - "balance_loss_clip": 1.0434556, - "balance_loss_mlp": 1.02535403, - "epoch": 0.5177814519765519, - "flos": 20960197067520.0, - "grad_norm": 2.3905761842565485, - "language_loss": 0.77420157, - "learning_rate": 1.9818902792915373e-06, - "loss": 0.79581904, - "num_input_tokens_seen": 185132165, - "step": 8612, - "time_per_iteration": 2.663339376449585 - }, - { - "auxiliary_loss_clip": 0.01114318, - "auxiliary_loss_mlp": 0.01040798, - "balance_loss_clip": 1.04297137, - "balance_loss_mlp": 1.02688015, - "epoch": 0.5178415752292199, - "flos": 17967832796160.0, - "grad_norm": 2.1474229546439174, - "language_loss": 0.8168264, - "learning_rate": 1.981500833922294e-06, - "loss": 0.83837759, - "num_input_tokens_seen": 185151025, - "step": 8613, - "time_per_iteration": 2.6589057445526123 - }, - { - "auxiliary_loss_clip": 0.01128171, - "auxiliary_loss_mlp": 0.01042961, - "balance_loss_clip": 1.04804301, - "balance_loss_mlp": 1.02832222, - "epoch": 0.5179016984818878, - "flos": 17821496787840.0, - "grad_norm": 2.274335348251239, - "language_loss": 0.66216785, - "learning_rate": 1.981111389254541e-06, - "loss": 0.6838792, - "num_input_tokens_seen": 185168455, - "step": 8614, - "time_per_iteration": 2.692133903503418 - }, - { - "auxiliary_loss_clip": 0.01100612, - "auxiliary_loss_mlp": 0.01034486, - "balance_loss_clip": 1.04462051, - "balance_loss_mlp": 1.01982355, - "epoch": 0.5179618217345558, - "flos": 17820455293440.0, - "grad_norm": 2.0015033819610055, - "language_loss": 0.8693983, - "learning_rate": 1.9807219453030453e-06, - "loss": 0.89074928, - "num_input_tokens_seen": 185184415, - "step": 8615, - "time_per_iteration": 2.690483808517456 - }, - { - "auxiliary_loss_clip": 0.01113112, - "auxiliary_loss_mlp": 0.01044655, - "balance_loss_clip": 1.04499412, - "balance_loss_mlp": 1.03147638, - "epoch": 0.5180219449872238, - "flos": 22522131048960.0, - "grad_norm": 1.8105595259457619, - "language_loss": 0.8084923, - "learning_rate": 1.9803325020825763e-06, - "loss": 0.83007002, - "num_input_tokens_seen": 185202910, - "step": 8616, - "time_per_iteration": 2.6410508155822754 - }, - { - "auxiliary_loss_clip": 0.01120148, - "auxiliary_loss_mlp": 0.00772211, - "balance_loss_clip": 1.04987717, - "balance_loss_mlp": 1.00035763, - "epoch": 0.5180820682398918, - "flos": 23915465568000.0, - "grad_norm": 2.1203191332986675, - "language_loss": 0.75104189, - "learning_rate": 1.9799430596079e-06, - "loss": 0.76996547, - "num_input_tokens_seen": 185223085, - "step": 8617, - "time_per_iteration": 2.6979870796203613 - }, - { - "auxiliary_loss_clip": 0.01126304, - "auxiliary_loss_mlp": 0.01042481, - "balance_loss_clip": 1.04557788, - "balance_loss_mlp": 1.02717435, - "epoch": 0.5181421914925598, - "flos": 16979930064000.0, - "grad_norm": 1.6549706674723104, - "language_loss": 0.70240247, - "learning_rate": 1.979553617893785e-06, - "loss": 0.72409028, - "num_input_tokens_seen": 185241295, - "step": 8618, - "time_per_iteration": 2.6166911125183105 - }, - { - "auxiliary_loss_clip": 0.01038523, - "auxiliary_loss_mlp": 0.01004843, - "balance_loss_clip": 1.02117562, - "balance_loss_mlp": 1.00342429, - "epoch": 0.5182023147452277, - "flos": 66059870872320.0, - "grad_norm": 0.9503620431523022, - "language_loss": 0.67223799, - "learning_rate": 1.979164176954999e-06, - "loss": 0.69267166, - "num_input_tokens_seen": 185298295, - "step": 8619, - "time_per_iteration": 3.186922550201416 - }, - { - "auxiliary_loss_clip": 0.01079843, - "auxiliary_loss_mlp": 0.01035858, - "balance_loss_clip": 1.04400134, - "balance_loss_mlp": 1.02230954, - "epoch": 0.5182624379978957, - "flos": 18187749815040.0, - "grad_norm": 1.8983764009380637, - "language_loss": 0.79863739, - "learning_rate": 1.97877473680631e-06, - "loss": 0.8197943, - "num_input_tokens_seen": 185317000, - "step": 8620, - "time_per_iteration": 2.8446528911590576 - }, - { - "auxiliary_loss_clip": 0.01060893, - "auxiliary_loss_mlp": 0.00772403, - "balance_loss_clip": 1.04089034, - "balance_loss_mlp": 1.00029039, - "epoch": 0.5183225612505636, - "flos": 14026708638720.0, - "grad_norm": 2.0819192927399586, - "language_loss": 0.82402205, - "learning_rate": 1.9783852974624846e-06, - "loss": 0.84235501, - "num_input_tokens_seen": 185331185, - "step": 8621, - "time_per_iteration": 2.753957509994507 - }, - { - "auxiliary_loss_clip": 0.01097265, - "auxiliary_loss_mlp": 0.010405, - "balance_loss_clip": 1.03958249, - "balance_loss_mlp": 1.02750611, - "epoch": 0.5183826845032317, - "flos": 23659781581440.0, - "grad_norm": 2.428940739700658, - "language_loss": 0.65491748, - "learning_rate": 1.9779958589382905e-06, - "loss": 0.67629516, - "num_input_tokens_seen": 185348955, - "step": 8622, - "time_per_iteration": 2.7421741485595703 - }, - { - "auxiliary_loss_clip": 0.01106105, - "auxiliary_loss_mlp": 0.01044986, - "balance_loss_clip": 1.04371572, - "balance_loss_mlp": 1.03016257, - "epoch": 0.5184428077558996, - "flos": 15888605097600.0, - "grad_norm": 2.083884784089921, - "language_loss": 0.60552382, - "learning_rate": 1.977606421248497e-06, - "loss": 0.62703472, - "num_input_tokens_seen": 185367330, - "step": 8623, - "time_per_iteration": 2.690345048904419 - }, - { - "auxiliary_loss_clip": 0.0112578, - "auxiliary_loss_mlp": 0.01032047, - "balance_loss_clip": 1.04534173, - "balance_loss_mlp": 1.01890421, - "epoch": 0.5185029310085676, - "flos": 21030833162880.0, - "grad_norm": 1.609281256747452, - "language_loss": 0.76150465, - "learning_rate": 1.9772169844078685e-06, - "loss": 0.78308284, - "num_input_tokens_seen": 185385060, - "step": 8624, - "time_per_iteration": 2.613788366317749 - }, - { - "auxiliary_loss_clip": 0.0107795, - "auxiliary_loss_mlp": 0.01043066, - "balance_loss_clip": 1.03900456, - "balance_loss_mlp": 1.02859426, - "epoch": 0.5185630542612355, - "flos": 26542690133760.0, - "grad_norm": 2.373822325498003, - "language_loss": 0.70952767, - "learning_rate": 1.9768275484311756e-06, - "loss": 0.73073781, - "num_input_tokens_seen": 185403745, - "step": 8625, - "time_per_iteration": 2.7548205852508545 - }, - { - "auxiliary_loss_clip": 0.01100948, - "auxiliary_loss_mlp": 0.0103515, - "balance_loss_clip": 1.04119349, - "balance_loss_mlp": 1.02260327, - "epoch": 0.5186231775139035, - "flos": 20668422890880.0, - "grad_norm": 1.9009704883002407, - "language_loss": 0.67718256, - "learning_rate": 1.976438113333184e-06, - "loss": 0.69854349, - "num_input_tokens_seen": 185422620, - "step": 8626, - "time_per_iteration": 2.731328248977661 - }, - { - "auxiliary_loss_clip": 0.0111085, - "auxiliary_loss_mlp": 0.01033689, - "balance_loss_clip": 1.04271841, - "balance_loss_mlp": 1.02022982, - "epoch": 0.5186833007665714, - "flos": 20885502735360.0, - "grad_norm": 1.960489278080422, - "language_loss": 0.70780122, - "learning_rate": 1.9760486791286612e-06, - "loss": 0.72924662, - "num_input_tokens_seen": 185439380, - "step": 8627, - "time_per_iteration": 2.6464414596557617 - }, - { - "auxiliary_loss_clip": 0.011279, - "auxiliary_loss_mlp": 0.00772067, - "balance_loss_clip": 1.04576206, - "balance_loss_mlp": 1.00029826, - "epoch": 0.5187434240192395, - "flos": 20886903365760.0, - "grad_norm": 2.0333805073835007, - "language_loss": 0.7303592, - "learning_rate": 1.9756592458323753e-06, - "loss": 0.74935889, - "num_input_tokens_seen": 185458830, - "step": 8628, - "time_per_iteration": 2.7327346801757812 - }, - { - "auxiliary_loss_clip": 0.01102356, - "auxiliary_loss_mlp": 0.01031961, - "balance_loss_clip": 1.04561651, - "balance_loss_mlp": 1.01927686, - "epoch": 0.5188035472719074, - "flos": 19859929614720.0, - "grad_norm": 1.6190117042724865, - "language_loss": 0.77354944, - "learning_rate": 1.9752698134590927e-06, - "loss": 0.79489267, - "num_input_tokens_seen": 185477270, - "step": 8629, - "time_per_iteration": 2.77992582321167 - }, - { - "auxiliary_loss_clip": 0.01115143, - "auxiliary_loss_mlp": 0.01034186, - "balance_loss_clip": 1.04428935, - "balance_loss_mlp": 1.01932621, - "epoch": 0.5188636705245754, - "flos": 21138313633920.0, - "grad_norm": 2.228815370750346, - "language_loss": 0.75078702, - "learning_rate": 1.9748803820235815e-06, - "loss": 0.77228034, - "num_input_tokens_seen": 185495795, - "step": 8630, - "time_per_iteration": 2.6749987602233887 - }, - { - "auxiliary_loss_clip": 0.01112188, - "auxiliary_loss_mlp": 0.01038971, - "balance_loss_clip": 1.04358792, - "balance_loss_mlp": 1.02446306, - "epoch": 0.5189237937772434, - "flos": 22419786222720.0, - "grad_norm": 2.002083188679526, - "language_loss": 0.80665708, - "learning_rate": 1.9744909515406093e-06, - "loss": 0.82816863, - "num_input_tokens_seen": 185514885, - "step": 8631, - "time_per_iteration": 2.7432682514190674 - }, - { - "auxiliary_loss_clip": 0.01114617, - "auxiliary_loss_mlp": 0.01034953, - "balance_loss_clip": 1.04478788, - "balance_loss_mlp": 1.02031374, - "epoch": 0.5189839170299113, - "flos": 25446696399360.0, - "grad_norm": 1.4933919289773454, - "language_loss": 0.74756616, - "learning_rate": 1.974101522024942e-06, - "loss": 0.76906186, - "num_input_tokens_seen": 185537155, - "step": 8632, - "time_per_iteration": 2.726018190383911 - }, - { - "auxiliary_loss_clip": 0.01093075, - "auxiliary_loss_mlp": 0.01033441, - "balance_loss_clip": 1.04612803, - "balance_loss_mlp": 1.01946926, - "epoch": 0.5190440402825793, - "flos": 18587722734720.0, - "grad_norm": 1.8814471450767234, - "language_loss": 0.78911304, - "learning_rate": 1.9737120934913477e-06, - "loss": 0.81037819, - "num_input_tokens_seen": 185555520, - "step": 8633, - "time_per_iteration": 2.715510606765747 - }, - { - "auxiliary_loss_clip": 0.0111596, - "auxiliary_loss_mlp": 0.01028973, - "balance_loss_clip": 1.04581857, - "balance_loss_mlp": 1.01619983, - "epoch": 0.5191041635352472, - "flos": 21908633731200.0, - "grad_norm": 5.606824878452593, - "language_loss": 0.80551088, - "learning_rate": 1.9733226659545936e-06, - "loss": 0.82696015, - "num_input_tokens_seen": 185573855, - "step": 8634, - "time_per_iteration": 2.6477181911468506 - }, - { - "auxiliary_loss_clip": 0.01122619, - "auxiliary_loss_mlp": 0.0103901, - "balance_loss_clip": 1.04603028, - "balance_loss_mlp": 1.02571273, - "epoch": 0.5191642867879153, - "flos": 27527971173120.0, - "grad_norm": 1.5734156514364543, - "language_loss": 0.69467652, - "learning_rate": 1.9729332394294467e-06, - "loss": 0.71629286, - "num_input_tokens_seen": 185595145, - "step": 8635, - "time_per_iteration": 2.713585615158081 - }, - { - "auxiliary_loss_clip": 0.01102259, - "auxiliary_loss_mlp": 0.01035772, - "balance_loss_clip": 1.0431217, - "balance_loss_mlp": 1.02210498, - "epoch": 0.5192244100405832, - "flos": 15705999331200.0, - "grad_norm": 1.6343728145872918, - "language_loss": 0.77876496, - "learning_rate": 1.9725438139306742e-06, - "loss": 0.80014527, - "num_input_tokens_seen": 185613320, - "step": 8636, - "time_per_iteration": 2.6876139640808105 - }, - { - "auxiliary_loss_clip": 0.01127572, - "auxiliary_loss_mlp": 0.01032982, - "balance_loss_clip": 1.04695189, - "balance_loss_mlp": 1.01938009, - "epoch": 0.5192845332932512, - "flos": 12057080313600.0, - "grad_norm": 2.1121159964360596, - "language_loss": 0.71433318, - "learning_rate": 1.9721543894730425e-06, - "loss": 0.73593867, - "num_input_tokens_seen": 185630730, - "step": 8637, - "time_per_iteration": 2.6093368530273438 - }, - { - "auxiliary_loss_clip": 0.01088299, - "auxiliary_loss_mlp": 0.01033237, - "balance_loss_clip": 1.04357982, - "balance_loss_mlp": 1.01999319, - "epoch": 0.5193446565459191, - "flos": 18953185662720.0, - "grad_norm": 2.05486546466365, - "language_loss": 0.76026344, - "learning_rate": 1.9717649660713194e-06, - "loss": 0.78147888, - "num_input_tokens_seen": 185648515, - "step": 8638, - "time_per_iteration": 2.680696725845337 - }, - { - "auxiliary_loss_clip": 0.0109108, - "auxiliary_loss_mlp": 0.01028733, - "balance_loss_clip": 1.04291189, - "balance_loss_mlp": 1.01578116, - "epoch": 0.5194047797985871, - "flos": 20374960775040.0, - "grad_norm": 13.373516582231533, - "language_loss": 0.74382144, - "learning_rate": 1.971375543740272e-06, - "loss": 0.7650196, - "num_input_tokens_seen": 185665220, - "step": 8639, - "time_per_iteration": 4.318557500839233 - }, - { - "auxiliary_loss_clip": 0.01123361, - "auxiliary_loss_mlp": 0.01032334, - "balance_loss_clip": 1.04529893, - "balance_loss_mlp": 1.01838636, - "epoch": 0.519464903051255, - "flos": 24353001135360.0, - "grad_norm": 1.5657899745454023, - "language_loss": 0.77311909, - "learning_rate": 1.9709861224946665e-06, - "loss": 0.79467607, - "num_input_tokens_seen": 185683750, - "step": 8640, - "time_per_iteration": 2.5864639282226562 - }, - { - "auxiliary_loss_clip": 0.01082849, - "auxiliary_loss_mlp": 0.0103216, - "balance_loss_clip": 1.04260516, - "balance_loss_mlp": 1.01930904, - "epoch": 0.519525026303923, - "flos": 14061829161600.0, - "grad_norm": 2.0170540453425714, - "language_loss": 0.66183293, - "learning_rate": 1.97059670234927e-06, - "loss": 0.68298292, - "num_input_tokens_seen": 185700625, - "step": 8641, - "time_per_iteration": 2.692979574203491 - }, - { - "auxiliary_loss_clip": 0.01123177, - "auxiliary_loss_mlp": 0.01034193, - "balance_loss_clip": 1.04594493, - "balance_loss_mlp": 1.02172363, - "epoch": 0.519585149556591, - "flos": 28835873193600.0, - "grad_norm": 1.7554954360005686, - "language_loss": 0.76535702, - "learning_rate": 1.97020728331885e-06, - "loss": 0.78693068, - "num_input_tokens_seen": 185721155, - "step": 8642, - "time_per_iteration": 5.96128249168396 - }, - { - "auxiliary_loss_clip": 0.0112288, - "auxiliary_loss_mlp": 0.01031224, - "balance_loss_clip": 1.04584873, - "balance_loss_mlp": 1.01806307, - "epoch": 0.519645272809259, - "flos": 25373007648000.0, - "grad_norm": 2.255175934024536, - "language_loss": 0.83165199, - "learning_rate": 1.9698178654181726e-06, - "loss": 0.85319304, - "num_input_tokens_seen": 185740990, - "step": 8643, - "time_per_iteration": 2.81384539604187 - }, - { - "auxiliary_loss_clip": 0.01126122, - "auxiliary_loss_mlp": 0.01041822, - "balance_loss_clip": 1.04520261, - "balance_loss_mlp": 1.02785623, - "epoch": 0.519705396061927, - "flos": 25372863993600.0, - "grad_norm": 2.2020503225508645, - "language_loss": 0.7044059, - "learning_rate": 1.969428448662004e-06, - "loss": 0.72608531, - "num_input_tokens_seen": 185762235, - "step": 8644, - "time_per_iteration": 2.7107033729553223 - }, - { - "auxiliary_loss_clip": 0.01111108, - "auxiliary_loss_mlp": 0.00770711, - "balance_loss_clip": 1.04354811, - "balance_loss_mlp": 1.00015676, - "epoch": 0.5197655193145949, - "flos": 28476228268800.0, - "grad_norm": 1.5309653957313616, - "language_loss": 0.80272603, - "learning_rate": 1.9690390330651133e-06, - "loss": 0.82154423, - "num_input_tokens_seen": 185783415, - "step": 8645, - "time_per_iteration": 4.246826171875 - }, - { - "auxiliary_loss_clip": 0.01122573, - "auxiliary_loss_mlp": 0.01033869, - "balance_loss_clip": 1.04362488, - "balance_loss_mlp": 1.02058911, - "epoch": 0.5198256425672629, - "flos": 20009138711040.0, - "grad_norm": 1.7778396167930446, - "language_loss": 0.7800498, - "learning_rate": 1.968649618642264e-06, - "loss": 0.80161417, - "num_input_tokens_seen": 185801345, - "step": 8646, - "time_per_iteration": 2.630892276763916 - }, - { - "auxiliary_loss_clip": 0.01117401, - "auxiliary_loss_mlp": 0.01035003, - "balance_loss_clip": 1.04832959, - "balance_loss_mlp": 1.02218235, - "epoch": 0.5198857658199308, - "flos": 19828867328640.0, - "grad_norm": 1.6794769864367036, - "language_loss": 0.65647638, - "learning_rate": 1.9682602054082252e-06, - "loss": 0.67800039, - "num_input_tokens_seen": 185820815, - "step": 8647, - "time_per_iteration": 2.6543033123016357 - }, - { - "auxiliary_loss_clip": 0.01127292, - "auxiliary_loss_mlp": 0.01036653, - "balance_loss_clip": 1.04618931, - "balance_loss_mlp": 1.02208591, - "epoch": 0.5199458890725989, - "flos": 24461918150400.0, - "grad_norm": 1.7193073170603235, - "language_loss": 0.71425897, - "learning_rate": 1.967870793377763e-06, - "loss": 0.73589844, - "num_input_tokens_seen": 185841450, - "step": 8648, - "time_per_iteration": 2.6632113456726074 - }, - { - "auxiliary_loss_clip": 0.0110717, - "auxiliary_loss_mlp": 0.01035133, - "balance_loss_clip": 1.0474503, - "balance_loss_mlp": 1.02016664, - "epoch": 0.5200060123252668, - "flos": 23404779953280.0, - "grad_norm": 2.0932120653926853, - "language_loss": 0.64383608, - "learning_rate": 1.967481382565642e-06, - "loss": 0.66525912, - "num_input_tokens_seen": 185859935, - "step": 8649, - "time_per_iteration": 2.708676815032959 - }, - { - "auxiliary_loss_clip": 0.01101881, - "auxiliary_loss_mlp": 0.01035641, - "balance_loss_clip": 1.04480278, - "balance_loss_mlp": 1.02039409, - "epoch": 0.5200661355779348, - "flos": 17201355454080.0, - "grad_norm": 2.0779038173518978, - "language_loss": 0.70331943, - "learning_rate": 1.9670919729866315e-06, - "loss": 0.72469461, - "num_input_tokens_seen": 185876795, - "step": 8650, - "time_per_iteration": 2.650996446609497 - }, - { - "auxiliary_loss_clip": 0.01123307, - "auxiliary_loss_mlp": 0.01030812, - "balance_loss_clip": 1.04483724, - "balance_loss_mlp": 1.01754415, - "epoch": 0.5201262588306027, - "flos": 18515075477760.0, - "grad_norm": 1.793577075652819, - "language_loss": 0.77560079, - "learning_rate": 1.966702564655496e-06, - "loss": 0.79714197, - "num_input_tokens_seen": 185895570, - "step": 8651, - "time_per_iteration": 2.6181790828704834 - }, - { - "auxiliary_loss_clip": 0.01068752, - "auxiliary_loss_mlp": 0.01040289, - "balance_loss_clip": 1.04241145, - "balance_loss_mlp": 1.02557862, - "epoch": 0.5201863820832707, - "flos": 18619395552000.0, - "grad_norm": 1.579276828195563, - "language_loss": 0.78716815, - "learning_rate": 1.966313157587003e-06, - "loss": 0.80825853, - "num_input_tokens_seen": 185913700, - "step": 8652, - "time_per_iteration": 2.81169056892395 - }, - { - "auxiliary_loss_clip": 0.01087589, - "auxiliary_loss_mlp": 0.0103997, - "balance_loss_clip": 1.04238617, - "balance_loss_mlp": 1.02496183, - "epoch": 0.5202465053359386, - "flos": 22857142222080.0, - "grad_norm": 2.456126746607985, - "language_loss": 0.70069832, - "learning_rate": 1.9659237517959187e-06, - "loss": 0.7219739, - "num_input_tokens_seen": 185932460, - "step": 8653, - "time_per_iteration": 2.8110082149505615 - }, - { - "auxiliary_loss_clip": 0.01094035, - "auxiliary_loss_mlp": 0.01042704, - "balance_loss_clip": 1.04702687, - "balance_loss_mlp": 1.02864337, - "epoch": 0.5203066285886067, - "flos": 21981532383360.0, - "grad_norm": 1.546190224311193, - "language_loss": 0.78555804, - "learning_rate": 1.965534347297008e-06, - "loss": 0.80692542, - "num_input_tokens_seen": 185952030, - "step": 8654, - "time_per_iteration": 2.8240180015563965 - }, - { - "auxiliary_loss_clip": 0.01115002, - "auxiliary_loss_mlp": 0.01046231, - "balance_loss_clip": 1.04417038, - "balance_loss_mlp": 1.03130579, - "epoch": 0.5203667518412746, - "flos": 20233329448320.0, - "grad_norm": 1.7757606906195533, - "language_loss": 0.84137118, - "learning_rate": 1.9651449441050393e-06, - "loss": 0.86298347, - "num_input_tokens_seen": 185973130, - "step": 8655, - "time_per_iteration": 2.767338752746582 - }, - { - "auxiliary_loss_clip": 0.01113773, - "auxiliary_loss_mlp": 0.01038813, - "balance_loss_clip": 1.04705739, - "balance_loss_mlp": 1.02643943, - "epoch": 0.5204268750939426, - "flos": 15705460627200.0, - "grad_norm": 2.3853440972465227, - "language_loss": 0.66374946, - "learning_rate": 1.9647555422347777e-06, - "loss": 0.68527532, - "num_input_tokens_seen": 185990200, - "step": 8656, - "time_per_iteration": 2.6653099060058594 - }, - { - "auxiliary_loss_clip": 0.01083984, - "auxiliary_loss_mlp": 0.01043204, - "balance_loss_clip": 1.04517853, - "balance_loss_mlp": 1.02981043, - "epoch": 0.5204869983466105, - "flos": 27449469999360.0, - "grad_norm": 1.9804929730339849, - "language_loss": 0.73262924, - "learning_rate": 1.9643661417009893e-06, - "loss": 0.75390112, - "num_input_tokens_seen": 186009880, - "step": 8657, - "time_per_iteration": 2.8447728157043457 - }, - { - "auxiliary_loss_clip": 0.01091042, - "auxiliary_loss_mlp": 0.01039275, - "balance_loss_clip": 1.0432241, - "balance_loss_mlp": 1.02489877, - "epoch": 0.5205471215992785, - "flos": 20595452411520.0, - "grad_norm": 1.769785544944644, - "language_loss": 0.71705246, - "learning_rate": 1.9639767425184408e-06, - "loss": 0.73835564, - "num_input_tokens_seen": 186026680, - "step": 8658, - "time_per_iteration": 2.8423781394958496 - }, - { - "auxiliary_loss_clip": 0.01123437, - "auxiliary_loss_mlp": 0.01039751, - "balance_loss_clip": 1.04425454, - "balance_loss_mlp": 1.02607751, - "epoch": 0.5206072448519465, - "flos": 22127904305280.0, - "grad_norm": 1.7936056694778655, - "language_loss": 0.83181685, - "learning_rate": 1.963587344701897e-06, - "loss": 0.85344875, - "num_input_tokens_seen": 186046920, - "step": 8659, - "time_per_iteration": 2.662799596786499 - }, - { - "auxiliary_loss_clip": 0.01103478, - "auxiliary_loss_mlp": 0.01045743, - "balance_loss_clip": 1.043998, - "balance_loss_mlp": 1.02959061, - "epoch": 0.5206673681046144, - "flos": 18330422636160.0, - "grad_norm": 1.9906097398392346, - "language_loss": 0.75777173, - "learning_rate": 1.9631979482661253e-06, - "loss": 0.77926397, - "num_input_tokens_seen": 186062090, - "step": 8660, - "time_per_iteration": 2.6635682582855225 - }, - { - "auxiliary_loss_clip": 0.01123245, - "auxiliary_loss_mlp": 0.01039579, - "balance_loss_clip": 1.04523396, - "balance_loss_mlp": 1.02638865, - "epoch": 0.5207274913572825, - "flos": 20230240878720.0, - "grad_norm": 1.836365427627734, - "language_loss": 0.77897781, - "learning_rate": 1.9628085532258906e-06, - "loss": 0.80060601, - "num_input_tokens_seen": 186081135, - "step": 8661, - "time_per_iteration": 2.6036980152130127 - }, - { - "auxiliary_loss_clip": 0.01101785, - "auxiliary_loss_mlp": 0.0103675, - "balance_loss_clip": 1.04206395, - "balance_loss_mlp": 1.02354193, - "epoch": 0.5207876146099504, - "flos": 22127042378880.0, - "grad_norm": 1.6821546298299666, - "language_loss": 0.70456815, - "learning_rate": 1.9624191595959603e-06, - "loss": 0.72595346, - "num_input_tokens_seen": 186099700, - "step": 8662, - "time_per_iteration": 2.6941347122192383 - }, - { - "auxiliary_loss_clip": 0.01108537, - "auxiliary_loss_mlp": 0.01034478, - "balance_loss_clip": 1.04286838, - "balance_loss_mlp": 1.01910543, - "epoch": 0.5208477378626184, - "flos": 23878908501120.0, - "grad_norm": 1.571076572398917, - "language_loss": 0.69488823, - "learning_rate": 1.962029767391098e-06, - "loss": 0.71631837, - "num_input_tokens_seen": 186119740, - "step": 8663, - "time_per_iteration": 2.648148536682129 - }, - { - "auxiliary_loss_clip": 0.01096912, - "auxiliary_loss_mlp": 0.00772823, - "balance_loss_clip": 1.04340351, - "balance_loss_mlp": 1.00029683, - "epoch": 0.5209078611152863, - "flos": 20961525870720.0, - "grad_norm": 1.508064062466455, - "language_loss": 0.77011776, - "learning_rate": 1.961640376626072e-06, - "loss": 0.78881508, - "num_input_tokens_seen": 186140645, - "step": 8664, - "time_per_iteration": 2.713656187057495 - }, - { - "auxiliary_loss_clip": 0.01099911, - "auxiliary_loss_mlp": 0.01035751, - "balance_loss_clip": 1.04555953, - "balance_loss_mlp": 1.02207136, - "epoch": 0.5209679843679543, - "flos": 20667740532480.0, - "grad_norm": 2.174055653698437, - "language_loss": 0.76443201, - "learning_rate": 1.961250987315646e-06, - "loss": 0.78578866, - "num_input_tokens_seen": 186160130, - "step": 8665, - "time_per_iteration": 2.6254820823669434 - }, - { - "auxiliary_loss_clip": 0.0111827, - "auxiliary_loss_mlp": 0.0103845, - "balance_loss_clip": 1.04986227, - "balance_loss_mlp": 1.02577186, - "epoch": 0.5210281076206222, - "flos": 20227295963520.0, - "grad_norm": 1.6491776532454103, - "language_loss": 0.72156572, - "learning_rate": 1.960861599474586e-06, - "loss": 0.74313289, - "num_input_tokens_seen": 186179485, - "step": 8666, - "time_per_iteration": 2.680417060852051 - }, - { - "auxiliary_loss_clip": 0.01108853, - "auxiliary_loss_mlp": 0.01038135, - "balance_loss_clip": 1.04408336, - "balance_loss_mlp": 1.02222097, - "epoch": 0.5210882308732903, - "flos": 16069989801600.0, - "grad_norm": 2.5838170040517583, - "language_loss": 0.68477565, - "learning_rate": 1.9604722131176592e-06, - "loss": 0.70624554, - "num_input_tokens_seen": 186197140, - "step": 8667, - "time_per_iteration": 2.665583372116089 - }, - { - "auxiliary_loss_clip": 0.01089337, - "auxiliary_loss_mlp": 0.01039011, - "balance_loss_clip": 1.05282402, - "balance_loss_mlp": 1.02584982, - "epoch": 0.5211483541259582, - "flos": 24825298089600.0, - "grad_norm": 1.3808961063616443, - "language_loss": 0.81199509, - "learning_rate": 1.960082828259629e-06, - "loss": 0.83327854, - "num_input_tokens_seen": 186216800, - "step": 8668, - "time_per_iteration": 2.802410125732422 - }, - { - "auxiliary_loss_clip": 0.01105597, - "auxiliary_loss_mlp": 0.01031995, - "balance_loss_clip": 1.04507339, - "balance_loss_mlp": 1.01803613, - "epoch": 0.5212084773786262, - "flos": 20370651143040.0, - "grad_norm": 2.086648647266329, - "language_loss": 0.63722765, - "learning_rate": 1.9596934449152623e-06, - "loss": 0.65860361, - "num_input_tokens_seen": 186235320, - "step": 8669, - "time_per_iteration": 2.681579113006592 - }, - { - "auxiliary_loss_clip": 0.01102666, - "auxiliary_loss_mlp": 0.00771955, - "balance_loss_clip": 1.04595864, - "balance_loss_mlp": 1.00027704, - "epoch": 0.5212686006312941, - "flos": 23145468693120.0, - "grad_norm": 1.5766402887224458, - "language_loss": 0.66502392, - "learning_rate": 1.959304063099325e-06, - "loss": 0.68377018, - "num_input_tokens_seen": 186254460, - "step": 8670, - "time_per_iteration": 2.7425742149353027 - }, - { - "auxiliary_loss_clip": 0.01085453, - "auxiliary_loss_mlp": 0.01033651, - "balance_loss_clip": 1.04303861, - "balance_loss_mlp": 1.02063334, - "epoch": 0.5213287238839621, - "flos": 27774030314880.0, - "grad_norm": 2.122031398938641, - "language_loss": 0.76534224, - "learning_rate": 1.9589146828265806e-06, - "loss": 0.78653324, - "num_input_tokens_seen": 186269465, - "step": 8671, - "time_per_iteration": 2.7530081272125244 - }, - { - "auxiliary_loss_clip": 0.01096106, - "auxiliary_loss_mlp": 0.01041463, - "balance_loss_clip": 1.04865241, - "balance_loss_mlp": 1.02665734, - "epoch": 0.5213888471366301, - "flos": 19937676602880.0, - "grad_norm": 2.569347871916013, - "language_loss": 0.78284293, - "learning_rate": 1.958525304111796e-06, - "loss": 0.80421865, - "num_input_tokens_seen": 186288660, - "step": 8672, - "time_per_iteration": 2.7782974243164062 - }, - { - "auxiliary_loss_clip": 0.01085385, - "auxiliary_loss_mlp": 0.01032995, - "balance_loss_clip": 1.04014993, - "balance_loss_mlp": 1.02035856, - "epoch": 0.521448970389298, - "flos": 16982731324800.0, - "grad_norm": 1.8835859039826313, - "language_loss": 0.72004962, - "learning_rate": 1.958135926969736e-06, - "loss": 0.74123341, - "num_input_tokens_seen": 186305760, - "step": 8673, - "time_per_iteration": 2.7094011306762695 - }, - { - "auxiliary_loss_clip": 0.01108751, - "auxiliary_loss_mlp": 0.01034613, - "balance_loss_clip": 1.04249072, - "balance_loss_mlp": 1.02049243, - "epoch": 0.5215090936419661, - "flos": 18989706816000.0, - "grad_norm": 1.4914552209414809, - "language_loss": 0.74901187, - "learning_rate": 1.957746551415166e-06, - "loss": 0.77044559, - "num_input_tokens_seen": 186324135, - "step": 8674, - "time_per_iteration": 2.6582236289978027 - }, - { - "auxiliary_loss_clip": 0.01097767, - "auxiliary_loss_mlp": 0.0103511, - "balance_loss_clip": 1.0421474, - "balance_loss_mlp": 1.02030408, - "epoch": 0.521569216894634, - "flos": 16143427157760.0, - "grad_norm": 2.0310628766426615, - "language_loss": 0.86121237, - "learning_rate": 1.9573571774628506e-06, - "loss": 0.88254112, - "num_input_tokens_seen": 186340205, - "step": 8675, - "time_per_iteration": 2.659674882888794 - }, - { - "auxiliary_loss_clip": 0.01022959, - "auxiliary_loss_mlp": 0.01006796, - "balance_loss_clip": 1.01756668, - "balance_loss_mlp": 1.00524664, - "epoch": 0.521629340147302, - "flos": 57579493282560.0, - "grad_norm": 0.8681331347139113, - "language_loss": 0.63129932, - "learning_rate": 1.9569678051275556e-06, - "loss": 0.65159684, - "num_input_tokens_seen": 186396940, - "step": 8676, - "time_per_iteration": 3.205299139022827 - }, - { - "auxiliary_loss_clip": 0.01111064, - "auxiliary_loss_mlp": 0.01030098, - "balance_loss_clip": 1.04485834, - "balance_loss_mlp": 1.0172416, - "epoch": 0.5216894633999699, - "flos": 26796901662720.0, - "grad_norm": 1.5700830686566873, - "language_loss": 0.68696839, - "learning_rate": 1.956578434424046e-06, - "loss": 0.70837998, - "num_input_tokens_seen": 186418680, - "step": 8677, - "time_per_iteration": 2.7582013607025146 - }, - { - "auxiliary_loss_clip": 0.0111011, - "auxiliary_loss_mlp": 0.01032255, - "balance_loss_clip": 1.04261422, - "balance_loss_mlp": 1.01857519, - "epoch": 0.5217495866526379, - "flos": 26358719650560.0, - "grad_norm": 1.8246312930355708, - "language_loss": 0.65474886, - "learning_rate": 1.956189065367086e-06, - "loss": 0.67617249, - "num_input_tokens_seen": 186438265, - "step": 8678, - "time_per_iteration": 4.216279029846191 - }, - { - "auxiliary_loss_clip": 0.01101119, - "auxiliary_loss_mlp": 0.01036814, - "balance_loss_clip": 1.03927827, - "balance_loss_mlp": 1.02188301, - "epoch": 0.5218097099053058, - "flos": 23584009841280.0, - "grad_norm": 2.0476762683914287, - "language_loss": 0.67981493, - "learning_rate": 1.9557996979714414e-06, - "loss": 0.70119429, - "num_input_tokens_seen": 186456870, - "step": 8679, - "time_per_iteration": 2.7411186695098877 - }, - { - "auxiliary_loss_clip": 0.01125585, - "auxiliary_loss_mlp": 0.01038661, - "balance_loss_clip": 1.04630351, - "balance_loss_mlp": 1.02463043, - "epoch": 0.5218698331579739, - "flos": 18077396256000.0, - "grad_norm": 1.6988813784316565, - "language_loss": 0.66861475, - "learning_rate": 1.9554103322518764e-06, - "loss": 0.69025725, - "num_input_tokens_seen": 186476425, - "step": 8680, - "time_per_iteration": 2.656953811645508 - }, - { - "auxiliary_loss_clip": 0.0112586, - "auxiliary_loss_mlp": 0.01039387, - "balance_loss_clip": 1.04645705, - "balance_loss_mlp": 1.02533197, - "epoch": 0.5219299564106418, - "flos": 19281121856640.0, - "grad_norm": 2.024829019659845, - "language_loss": 0.83280826, - "learning_rate": 1.955020968223156e-06, - "loss": 0.85446072, - "num_input_tokens_seen": 186492555, - "step": 8681, - "time_per_iteration": 4.351206541061401 - }, - { - "auxiliary_loss_clip": 0.01098299, - "auxiliary_loss_mlp": 0.01033401, - "balance_loss_clip": 1.0424881, - "balance_loss_mlp": 1.02001929, - "epoch": 0.5219900796633098, - "flos": 26651355753600.0, - "grad_norm": 2.0563808347758563, - "language_loss": 0.77594543, - "learning_rate": 1.9546316059000454e-06, - "loss": 0.79726237, - "num_input_tokens_seen": 186513190, - "step": 8682, - "time_per_iteration": 2.836205005645752 - }, - { - "auxiliary_loss_clip": 0.01084257, - "auxiliary_loss_mlp": 0.01048472, - "balance_loss_clip": 1.03948176, - "balance_loss_mlp": 1.03558517, - "epoch": 0.5220502029159777, - "flos": 34312717382400.0, - "grad_norm": 1.4694894100116993, - "language_loss": 0.68905342, - "learning_rate": 1.9542422452973082e-06, - "loss": 0.71038067, - "num_input_tokens_seen": 186534830, - "step": 8683, - "time_per_iteration": 2.8703176975250244 - }, - { - "auxiliary_loss_clip": 0.01091474, - "auxiliary_loss_mlp": 0.01042368, - "balance_loss_clip": 1.04399586, - "balance_loss_mlp": 1.02824771, - "epoch": 0.5221103261686457, - "flos": 22156488552960.0, - "grad_norm": 1.7170989726331638, - "language_loss": 0.76116288, - "learning_rate": 1.9538528864297104e-06, - "loss": 0.78250128, - "num_input_tokens_seen": 186554390, - "step": 8684, - "time_per_iteration": 2.8443922996520996 - }, - { - "auxiliary_loss_clip": 0.0110091, - "auxiliary_loss_mlp": 0.00771126, - "balance_loss_clip": 1.0387888, - "balance_loss_mlp": 1.00024819, - "epoch": 0.5221704494213137, - "flos": 19208402772480.0, - "grad_norm": 1.8259321745961588, - "language_loss": 0.75595027, - "learning_rate": 1.9534635293120153e-06, - "loss": 0.7746706, - "num_input_tokens_seen": 186572360, - "step": 8685, - "time_per_iteration": 4.343646049499512 - }, - { - "auxiliary_loss_clip": 0.01101598, - "auxiliary_loss_mlp": 0.01041734, - "balance_loss_clip": 1.04539514, - "balance_loss_mlp": 1.02856123, - "epoch": 0.5222305726739817, - "flos": 19354056422400.0, - "grad_norm": 1.8098495762940472, - "language_loss": 0.80820441, - "learning_rate": 1.9530741739589876e-06, - "loss": 0.82963777, - "num_input_tokens_seen": 186590655, - "step": 8686, - "time_per_iteration": 2.9524481296539307 - }, - { - "auxiliary_loss_clip": 0.01102372, - "auxiliary_loss_mlp": 0.01034624, - "balance_loss_clip": 1.04477715, - "balance_loss_mlp": 1.02207708, - "epoch": 0.5222906959266497, - "flos": 27814789272960.0, - "grad_norm": 1.5584733304526452, - "language_loss": 0.69955659, - "learning_rate": 1.9526848203853927e-06, - "loss": 0.72092646, - "num_input_tokens_seen": 186610345, - "step": 8687, - "time_per_iteration": 2.8442130088806152 - }, - { - "auxiliary_loss_clip": 0.01119347, - "auxiliary_loss_mlp": 0.01033746, - "balance_loss_clip": 1.04286504, - "balance_loss_mlp": 1.02110982, - "epoch": 0.5223508191793176, - "flos": 12712988615040.0, - "grad_norm": 2.218511460216324, - "language_loss": 0.83229095, - "learning_rate": 1.9522954686059936e-06, - "loss": 0.85382187, - "num_input_tokens_seen": 186624360, - "step": 8688, - "time_per_iteration": 2.6338348388671875 - }, - { - "auxiliary_loss_clip": 0.01111374, - "auxiliary_loss_mlp": 0.00771369, - "balance_loss_clip": 1.04469848, - "balance_loss_mlp": 1.00028682, - "epoch": 0.5224109424319856, - "flos": 15632238752640.0, - "grad_norm": 2.3403806989505744, - "language_loss": 0.73484588, - "learning_rate": 1.9519061186355558e-06, - "loss": 0.75367332, - "num_input_tokens_seen": 186638680, - "step": 8689, - "time_per_iteration": 2.7219626903533936 - }, - { - "auxiliary_loss_clip": 0.01098413, - "auxiliary_loss_mlp": 0.01039301, - "balance_loss_clip": 1.04080057, - "balance_loss_mlp": 1.02569962, - "epoch": 0.5224710656846535, - "flos": 15742233175680.0, - "grad_norm": 1.8348188856486891, - "language_loss": 0.83713108, - "learning_rate": 1.9515167704888417e-06, - "loss": 0.85850823, - "num_input_tokens_seen": 186655840, - "step": 8690, - "time_per_iteration": 2.7358436584472656 - }, - { - "auxiliary_loss_clip": 0.01088108, - "auxiliary_loss_mlp": 0.01042101, - "balance_loss_clip": 1.04381537, - "balance_loss_mlp": 1.0276053, - "epoch": 0.5225311889373215, - "flos": 26030998938240.0, - "grad_norm": 2.015928049267595, - "language_loss": 0.79080188, - "learning_rate": 1.9511274241806173e-06, - "loss": 0.81210393, - "num_input_tokens_seen": 186674150, - "step": 8691, - "time_per_iteration": 2.813861131668091 - }, - { - "auxiliary_loss_clip": 0.01120671, - "auxiliary_loss_mlp": 0.01040201, - "balance_loss_clip": 1.04700625, - "balance_loss_mlp": 1.02552676, - "epoch": 0.5225913121899894, - "flos": 18369278173440.0, - "grad_norm": 2.3023499072102194, - "language_loss": 0.76491982, - "learning_rate": 1.950738079725646e-06, - "loss": 0.78652847, - "num_input_tokens_seen": 186690675, - "step": 8692, - "time_per_iteration": 2.73480224609375 - }, - { - "auxiliary_loss_clip": 0.01108877, - "auxiliary_loss_mlp": 0.01039055, - "balance_loss_clip": 1.04479527, - "balance_loss_mlp": 1.02631116, - "epoch": 0.5226514354426575, - "flos": 29273516501760.0, - "grad_norm": 1.6247734368015925, - "language_loss": 0.72325015, - "learning_rate": 1.950348737138691e-06, - "loss": 0.7447294, - "num_input_tokens_seen": 186710380, - "step": 8693, - "time_per_iteration": 2.782871723175049 - }, - { - "auxiliary_loss_clip": 0.01126187, - "auxiliary_loss_mlp": 0.01042643, - "balance_loss_clip": 1.04384446, - "balance_loss_mlp": 1.02753901, - "epoch": 0.5227115586953254, - "flos": 22853299466880.0, - "grad_norm": 7.53216872329228, - "language_loss": 0.8220976, - "learning_rate": 1.949959396434517e-06, - "loss": 0.84378588, - "num_input_tokens_seen": 186729135, - "step": 8694, - "time_per_iteration": 2.6748385429382324 - }, - { - "auxiliary_loss_clip": 0.01013741, - "auxiliary_loss_mlp": 0.01003883, - "balance_loss_clip": 1.02031374, - "balance_loss_mlp": 1.00224972, - "epoch": 0.5227716819479934, - "flos": 57474419022720.0, - "grad_norm": 0.775564151874101, - "language_loss": 0.55647832, - "learning_rate": 1.949570057627888e-06, - "loss": 0.57665455, - "num_input_tokens_seen": 186791115, - "step": 8695, - "time_per_iteration": 3.345134973526001 - }, - { - "auxiliary_loss_clip": 0.01061261, - "auxiliary_loss_mlp": 0.01041707, - "balance_loss_clip": 1.04356098, - "balance_loss_mlp": 1.0283134, - "epoch": 0.5228318052006613, - "flos": 13808264077440.0, - "grad_norm": 1.8671615474987673, - "language_loss": 0.732638, - "learning_rate": 1.9491807207335672e-06, - "loss": 0.75366765, - "num_input_tokens_seen": 186808660, - "step": 8696, - "time_per_iteration": 2.782350540161133 - }, - { - "auxiliary_loss_clip": 0.01099328, - "auxiliary_loss_mlp": 0.01039177, - "balance_loss_clip": 1.0429219, - "balance_loss_mlp": 1.02538478, - "epoch": 0.5228919284533293, - "flos": 15596184476160.0, - "grad_norm": 1.7190001113055795, - "language_loss": 0.71068561, - "learning_rate": 1.948791385766319e-06, - "loss": 0.73207062, - "num_input_tokens_seen": 186825900, - "step": 8697, - "time_per_iteration": 2.781651735305786 - }, - { - "auxiliary_loss_clip": 0.01092255, - "auxiliary_loss_mlp": 0.01037704, - "balance_loss_clip": 1.04413819, - "balance_loss_mlp": 1.02498996, - "epoch": 0.5229520517059973, - "flos": 22491499726080.0, - "grad_norm": 1.9475868659159346, - "language_loss": 0.80332339, - "learning_rate": 1.948402052740906e-06, - "loss": 0.82462299, - "num_input_tokens_seen": 186843735, - "step": 8698, - "time_per_iteration": 2.7078070640563965 - }, - { - "auxiliary_loss_clip": 0.01110911, - "auxiliary_loss_mlp": 0.01038923, - "balance_loss_clip": 1.04286766, - "balance_loss_mlp": 1.02576292, - "epoch": 0.5230121749586653, - "flos": 22090880361600.0, - "grad_norm": 1.6510046342053804, - "language_loss": 0.74265802, - "learning_rate": 1.948012721672093e-06, - "loss": 0.7641564, - "num_input_tokens_seen": 186862440, - "step": 8699, - "time_per_iteration": 2.667205333709717 - }, - { - "auxiliary_loss_clip": 0.01113513, - "auxiliary_loss_mlp": 0.00773315, - "balance_loss_clip": 1.04171407, - "balance_loss_mlp": 1.00029182, - "epoch": 0.5230722982113333, - "flos": 22127150119680.0, - "grad_norm": 1.8535119798273105, - "language_loss": 0.73102427, - "learning_rate": 1.947623392574642e-06, - "loss": 0.74989247, - "num_input_tokens_seen": 186880940, - "step": 8700, - "time_per_iteration": 2.7250688076019287 - }, - { - "auxiliary_loss_clip": 0.01100202, - "auxiliary_loss_mlp": 0.01039746, - "balance_loss_clip": 1.04480553, - "balance_loss_mlp": 1.02510738, - "epoch": 0.5231324214640012, - "flos": 25009268572800.0, - "grad_norm": 1.8378710861613805, - "language_loss": 0.67156309, - "learning_rate": 1.947234065463318e-06, - "loss": 0.69296253, - "num_input_tokens_seen": 186900785, - "step": 8701, - "time_per_iteration": 2.830300807952881 - }, - { - "auxiliary_loss_clip": 0.0110603, - "auxiliary_loss_mlp": 0.00771586, - "balance_loss_clip": 1.04569697, - "balance_loss_mlp": 1.0002594, - "epoch": 0.5231925447166692, - "flos": 25740517651200.0, - "grad_norm": 1.7245960424067608, - "language_loss": 0.66710031, - "learning_rate": 1.9468447403528826e-06, - "loss": 0.68587643, - "num_input_tokens_seen": 186920895, - "step": 8702, - "time_per_iteration": 2.725583791732788 - }, - { - "auxiliary_loss_clip": 0.01100659, - "auxiliary_loss_mlp": 0.01039254, - "balance_loss_clip": 1.04362679, - "balance_loss_mlp": 1.02464485, - "epoch": 0.5232526679693371, - "flos": 21433930565760.0, - "grad_norm": 1.7906940342438376, - "language_loss": 0.76647937, - "learning_rate": 1.946455417258101e-06, - "loss": 0.78787845, - "num_input_tokens_seen": 186940605, - "step": 8703, - "time_per_iteration": 2.7585973739624023 - }, - { - "auxiliary_loss_clip": 0.01117607, - "auxiliary_loss_mlp": 0.01043637, - "balance_loss_clip": 1.04529738, - "balance_loss_mlp": 1.02807403, - "epoch": 0.5233127912220051, - "flos": 35298393471360.0, - "grad_norm": 2.3077994186551036, - "language_loss": 0.76945215, - "learning_rate": 1.9460660961937348e-06, - "loss": 0.79106462, - "num_input_tokens_seen": 186960820, - "step": 8704, - "time_per_iteration": 2.8613169193267822 - }, - { - "auxiliary_loss_clip": 0.01102832, - "auxiliary_loss_mlp": 0.0104096, - "balance_loss_clip": 1.04692268, - "balance_loss_mlp": 1.02798438, - "epoch": 0.523372914474673, - "flos": 17051320344960.0, - "grad_norm": 1.8023730932949449, - "language_loss": 0.78725791, - "learning_rate": 1.9456767771745474e-06, - "loss": 0.80869591, - "num_input_tokens_seen": 186976240, - "step": 8705, - "time_per_iteration": 2.741025924682617 - }, - { - "auxiliary_loss_clip": 0.01106252, - "auxiliary_loss_mlp": 0.01037077, - "balance_loss_clip": 1.04467177, - "balance_loss_mlp": 1.02273059, - "epoch": 0.5234330377273411, - "flos": 18406302117120.0, - "grad_norm": 2.80572723928073, - "language_loss": 0.69824338, - "learning_rate": 1.9452874602153027e-06, - "loss": 0.71967667, - "num_input_tokens_seen": 186992855, - "step": 8706, - "time_per_iteration": 2.6872975826263428 - }, - { - "auxiliary_loss_clip": 0.01035877, - "auxiliary_loss_mlp": 0.01013693, - "balance_loss_clip": 1.01881003, - "balance_loss_mlp": 1.01213157, - "epoch": 0.523493160980009, - "flos": 65850296970240.0, - "grad_norm": 0.6808139995313122, - "language_loss": 0.52465838, - "learning_rate": 1.9448981453307623e-06, - "loss": 0.54515409, - "num_input_tokens_seen": 187051205, - "step": 8707, - "time_per_iteration": 3.2341713905334473 - }, - { - "auxiliary_loss_clip": 0.01098509, - "auxiliary_loss_mlp": 0.0103739, - "balance_loss_clip": 1.04139447, - "balance_loss_mlp": 1.02380002, - "epoch": 0.523553284232677, - "flos": 21872076664320.0, - "grad_norm": 1.6877057679435725, - "language_loss": 0.74618769, - "learning_rate": 1.9445088325356904e-06, - "loss": 0.76754665, - "num_input_tokens_seen": 187070540, - "step": 8708, - "time_per_iteration": 2.8342666625976562 - }, - { - "auxiliary_loss_clip": 0.0109528, - "auxiliary_loss_mlp": 0.01031158, - "balance_loss_clip": 1.04457259, - "balance_loss_mlp": 1.01772881, - "epoch": 0.5236134074853449, - "flos": 20848191482880.0, - "grad_norm": 1.566541485414049, - "language_loss": 0.7730183, - "learning_rate": 1.944119521844849e-06, - "loss": 0.79428267, - "num_input_tokens_seen": 187089975, - "step": 8709, - "time_per_iteration": 2.708807945251465 - }, - { - "auxiliary_loss_clip": 0.01074175, - "auxiliary_loss_mlp": 0.0103878, - "balance_loss_clip": 1.03733826, - "balance_loss_mlp": 1.02211428, - "epoch": 0.5236735307380129, - "flos": 25520421064320.0, - "grad_norm": 2.041376547108184, - "language_loss": 0.83508044, - "learning_rate": 1.9437302132730003e-06, - "loss": 0.85620999, - "num_input_tokens_seen": 187108775, - "step": 8710, - "time_per_iteration": 2.7781410217285156 - }, - { - "auxiliary_loss_clip": 0.01093974, - "auxiliary_loss_mlp": 0.01031634, - "balance_loss_clip": 1.04229414, - "balance_loss_mlp": 1.01794267, - "epoch": 0.523733653990681, - "flos": 23583112001280.0, - "grad_norm": 2.2254848949827983, - "language_loss": 0.69715381, - "learning_rate": 1.943340906834908e-06, - "loss": 0.7184099, - "num_input_tokens_seen": 187128830, - "step": 8711, - "time_per_iteration": 2.7991995811462402 - }, - { - "auxiliary_loss_clip": 0.01114283, - "auxiliary_loss_mlp": 0.01039219, - "balance_loss_clip": 1.04482269, - "balance_loss_mlp": 1.02475893, - "epoch": 0.5237937772433489, - "flos": 21106245767040.0, - "grad_norm": 2.0479693285364764, - "language_loss": 0.8319692, - "learning_rate": 1.9429516025453345e-06, - "loss": 0.85350424, - "num_input_tokens_seen": 187149570, - "step": 8712, - "time_per_iteration": 2.6913018226623535 - }, - { - "auxiliary_loss_clip": 0.01126488, - "auxiliary_loss_mlp": 0.01042299, - "balance_loss_clip": 1.04477775, - "balance_loss_mlp": 1.02704, - "epoch": 0.5238539004960169, - "flos": 19172887200000.0, - "grad_norm": 2.12392132979159, - "language_loss": 0.69795638, - "learning_rate": 1.9425623004190415e-06, - "loss": 0.71964419, - "num_input_tokens_seen": 187170575, - "step": 8713, - "time_per_iteration": 2.6037533283233643 - }, - { - "auxiliary_loss_clip": 0.01087813, - "auxiliary_loss_mlp": 0.01040708, - "balance_loss_clip": 1.03908944, - "balance_loss_mlp": 1.02369666, - "epoch": 0.5239140237486848, - "flos": 17888218300800.0, - "grad_norm": 2.8914750795344233, - "language_loss": 0.76703346, - "learning_rate": 1.9421730004707925e-06, - "loss": 0.78831869, - "num_input_tokens_seen": 187187190, - "step": 8714, - "time_per_iteration": 2.717984676361084 - }, - { - "auxiliary_loss_clip": 0.01086969, - "auxiliary_loss_mlp": 0.01044306, - "balance_loss_clip": 1.0413481, - "balance_loss_mlp": 1.02729511, - "epoch": 0.5239741470013528, - "flos": 17930413802880.0, - "grad_norm": 1.9287276707329408, - "language_loss": 0.7608462, - "learning_rate": 1.9417837027153483e-06, - "loss": 0.78215897, - "num_input_tokens_seen": 187204350, - "step": 8715, - "time_per_iteration": 2.6999671459198 - }, - { - "auxiliary_loss_clip": 0.01099192, - "auxiliary_loss_mlp": 0.01035578, - "balance_loss_clip": 1.0417552, - "balance_loss_mlp": 1.02110636, - "epoch": 0.5240342702540207, - "flos": 30993386584320.0, - "grad_norm": 2.1294970054785622, - "language_loss": 0.71165496, - "learning_rate": 1.9413944071674723e-06, - "loss": 0.73300266, - "num_input_tokens_seen": 187225605, - "step": 8716, - "time_per_iteration": 2.744347333908081 - }, - { - "auxiliary_loss_clip": 0.01121973, - "auxiliary_loss_mlp": 0.0103854, - "balance_loss_clip": 1.04380643, - "balance_loss_mlp": 1.02563596, - "epoch": 0.5240943935066887, - "flos": 25005066681600.0, - "grad_norm": 3.2118480553546087, - "language_loss": 0.87086689, - "learning_rate": 1.941005113841926e-06, - "loss": 0.89247203, - "num_input_tokens_seen": 187241335, - "step": 8717, - "time_per_iteration": 4.158156394958496 - }, - { - "auxiliary_loss_clip": 0.01109045, - "auxiliary_loss_mlp": 0.01035747, - "balance_loss_clip": 1.0454371, - "balance_loss_mlp": 1.02164412, - "epoch": 0.5241545167593566, - "flos": 23659099223040.0, - "grad_norm": 1.880090780763199, - "language_loss": 0.61121464, - "learning_rate": 1.9406158227534723e-06, - "loss": 0.63266253, - "num_input_tokens_seen": 187259925, - "step": 8718, - "time_per_iteration": 2.671760320663452 - }, - { - "auxiliary_loss_clip": 0.01094217, - "auxiliary_loss_mlp": 0.010389, - "balance_loss_clip": 1.04272294, - "balance_loss_mlp": 1.02387953, - "epoch": 0.5242146400120247, - "flos": 23400398494080.0, - "grad_norm": 1.8098933087704439, - "language_loss": 0.72060192, - "learning_rate": 1.940226533916872e-06, - "loss": 0.74193311, - "num_input_tokens_seen": 187279035, - "step": 8719, - "time_per_iteration": 2.815864324569702 - }, - { - "auxiliary_loss_clip": 0.01109147, - "auxiliary_loss_mlp": 0.01029429, - "balance_loss_clip": 1.04305363, - "balance_loss_mlp": 1.01676893, - "epoch": 0.5242747632646926, - "flos": 17749065012480.0, - "grad_norm": 1.9600898858885738, - "language_loss": 0.73258477, - "learning_rate": 1.9398372473468877e-06, - "loss": 0.7539705, - "num_input_tokens_seen": 187297555, - "step": 8720, - "time_per_iteration": 4.34027624130249 - }, - { - "auxiliary_loss_clip": 0.01110975, - "auxiliary_loss_mlp": 0.01037749, - "balance_loss_clip": 1.042588, - "balance_loss_mlp": 1.02323568, - "epoch": 0.5243348865173606, - "flos": 32597731549440.0, - "grad_norm": 1.7136870064395262, - "language_loss": 0.7059021, - "learning_rate": 1.939447963058281e-06, - "loss": 0.72738934, - "num_input_tokens_seen": 187320265, - "step": 8721, - "time_per_iteration": 4.457958698272705 - }, - { - "auxiliary_loss_clip": 0.01064422, - "auxiliary_loss_mlp": 0.0103891, - "balance_loss_clip": 1.03628516, - "balance_loss_mlp": 1.02399719, - "epoch": 0.5243950097700285, - "flos": 25484115392640.0, - "grad_norm": 1.8741175153878353, - "language_loss": 0.86506796, - "learning_rate": 1.939058681065813e-06, - "loss": 0.88610125, - "num_input_tokens_seen": 187338045, - "step": 8722, - "time_per_iteration": 2.851713180541992 - }, - { - "auxiliary_loss_clip": 0.01122948, - "auxiliary_loss_mlp": 0.01033308, - "balance_loss_clip": 1.0449574, - "balance_loss_mlp": 1.01830578, - "epoch": 0.5244551330226965, - "flos": 15268391936640.0, - "grad_norm": 1.8614764349338224, - "language_loss": 0.79853708, - "learning_rate": 1.938669401384247e-06, - "loss": 0.82009959, - "num_input_tokens_seen": 187356040, - "step": 8723, - "time_per_iteration": 2.567403554916382 - }, - { - "auxiliary_loss_clip": 0.01111191, - "auxiliary_loss_mlp": 0.0104214, - "balance_loss_clip": 1.04611158, - "balance_loss_mlp": 1.02747166, - "epoch": 0.5245152562753645, - "flos": 22237108629120.0, - "grad_norm": 2.070314434964904, - "language_loss": 0.75515735, - "learning_rate": 1.9382801240283426e-06, - "loss": 0.77669066, - "num_input_tokens_seen": 187374185, - "step": 8724, - "time_per_iteration": 4.372815847396851 - }, - { - "auxiliary_loss_clip": 0.01128433, - "auxiliary_loss_mlp": 0.01038668, - "balance_loss_clip": 1.04391563, - "balance_loss_mlp": 1.02228856, - "epoch": 0.5245753795280325, - "flos": 29426460612480.0, - "grad_norm": 1.7393951886603523, - "language_loss": 0.70450562, - "learning_rate": 1.9378908490128625e-06, - "loss": 0.72617668, - "num_input_tokens_seen": 187396640, - "step": 8725, - "time_per_iteration": 2.691462278366089 - }, - { - "auxiliary_loss_clip": 0.01014562, - "auxiliary_loss_mlp": 0.0100467, - "balance_loss_clip": 1.01748943, - "balance_loss_mlp": 1.0025723, - "epoch": 0.5246355027807005, - "flos": 58834392785280.0, - "grad_norm": 0.751972672191828, - "language_loss": 0.55635381, - "learning_rate": 1.937501576352568e-06, - "loss": 0.57654613, - "num_input_tokens_seen": 187455945, - "step": 8726, - "time_per_iteration": 3.2482144832611084 - }, - { - "auxiliary_loss_clip": 0.01023582, - "auxiliary_loss_mlp": 0.01000951, - "balance_loss_clip": 1.02279115, - "balance_loss_mlp": 0.9995268, - "epoch": 0.5246956260333684, - "flos": 64526592965760.0, - "grad_norm": 0.7878423938979384, - "language_loss": 0.58313322, - "learning_rate": 1.937112306062219e-06, - "loss": 0.60337853, - "num_input_tokens_seen": 187519975, - "step": 8727, - "time_per_iteration": 3.2606794834136963 - }, - { - "auxiliary_loss_clip": 0.01114413, - "auxiliary_loss_mlp": 0.01036047, - "balance_loss_clip": 1.0418663, - "balance_loss_mlp": 1.02111006, - "epoch": 0.5247557492860364, - "flos": 24533631653760.0, - "grad_norm": 1.3167349097133665, - "language_loss": 0.70678449, - "learning_rate": 1.9367230381565786e-06, - "loss": 0.72828913, - "num_input_tokens_seen": 187541775, - "step": 8728, - "time_per_iteration": 2.6979823112487793 - }, - { - "auxiliary_loss_clip": 0.01110188, - "auxiliary_loss_mlp": 0.01029551, - "balance_loss_clip": 1.04107904, - "balance_loss_mlp": 1.01636648, - "epoch": 0.5248158725387043, - "flos": 18806131382400.0, - "grad_norm": 1.4052080718589413, - "language_loss": 0.69816244, - "learning_rate": 1.9363337726504062e-06, - "loss": 0.71955991, - "num_input_tokens_seen": 187560425, - "step": 8729, - "time_per_iteration": 2.6898272037506104 - }, - { - "auxiliary_loss_clip": 0.01084395, - "auxiliary_loss_mlp": 0.01034673, - "balance_loss_clip": 1.04138565, - "balance_loss_mlp": 1.02001655, - "epoch": 0.5248759957913723, - "flos": 20955851521920.0, - "grad_norm": 1.9953537122640765, - "language_loss": 0.83565557, - "learning_rate": 1.935944509558464e-06, - "loss": 0.85684621, - "num_input_tokens_seen": 187579930, - "step": 8730, - "time_per_iteration": 2.719953775405884 - }, - { - "auxiliary_loss_clip": 0.01087481, - "auxiliary_loss_mlp": 0.01037052, - "balance_loss_clip": 1.04011822, - "balance_loss_mlp": 1.02177548, - "epoch": 0.5249361190440403, - "flos": 18660980522880.0, - "grad_norm": 2.0205964009231816, - "language_loss": 0.79403269, - "learning_rate": 1.9355552488955125e-06, - "loss": 0.81527805, - "num_input_tokens_seen": 187595365, - "step": 8731, - "time_per_iteration": 2.741563081741333 - }, - { - "auxiliary_loss_clip": 0.01105082, - "auxiliary_loss_mlp": 0.01030893, - "balance_loss_clip": 1.03996611, - "balance_loss_mlp": 1.0172075, - "epoch": 0.5249962422967083, - "flos": 24863327614080.0, - "grad_norm": 1.917738069421625, - "language_loss": 0.83558822, - "learning_rate": 1.935165990676312e-06, - "loss": 0.85694802, - "num_input_tokens_seen": 187614715, - "step": 8732, - "time_per_iteration": 2.672537326812744 - }, - { - "auxiliary_loss_clip": 0.01109755, - "auxiliary_loss_mlp": 0.01037546, - "balance_loss_clip": 1.04267287, - "balance_loss_mlp": 1.0239923, - "epoch": 0.5250563655493762, - "flos": 15262681674240.0, - "grad_norm": 1.7357983281517446, - "language_loss": 0.77602309, - "learning_rate": 1.9347767349156237e-06, - "loss": 0.79749608, - "num_input_tokens_seen": 187630745, - "step": 8733, - "time_per_iteration": 2.651329278945923 - }, - { - "auxiliary_loss_clip": 0.01126312, - "auxiliary_loss_mlp": 0.01036227, - "balance_loss_clip": 1.04450274, - "balance_loss_mlp": 1.02157617, - "epoch": 0.5251164888020442, - "flos": 18625177641600.0, - "grad_norm": 1.892740616554097, - "language_loss": 0.8202911, - "learning_rate": 1.934387481628208e-06, - "loss": 0.84191644, - "num_input_tokens_seen": 187648200, - "step": 8734, - "time_per_iteration": 2.608727216720581 - }, - { - "auxiliary_loss_clip": 0.01091339, - "auxiliary_loss_mlp": 0.01028225, - "balance_loss_clip": 1.04116642, - "balance_loss_mlp": 1.01467109, - "epoch": 0.5251766120547121, - "flos": 29710764760320.0, - "grad_norm": 1.3668287037138613, - "language_loss": 0.76932037, - "learning_rate": 1.933998230828826e-06, - "loss": 0.79051596, - "num_input_tokens_seen": 187669205, - "step": 8735, - "time_per_iteration": 2.703274965286255 - }, - { - "auxiliary_loss_clip": 0.01112983, - "auxiliary_loss_mlp": 0.01038692, - "balance_loss_clip": 1.04413259, - "balance_loss_mlp": 1.02544188, - "epoch": 0.5252367353073801, - "flos": 23440295525760.0, - "grad_norm": 1.7627870360178364, - "language_loss": 0.80808437, - "learning_rate": 1.9336089825322376e-06, - "loss": 0.82960117, - "num_input_tokens_seen": 187690890, - "step": 8736, - "time_per_iteration": 2.6869864463806152 - }, - { - "auxiliary_loss_clip": 0.01124902, - "auxiliary_loss_mlp": 0.0103679, - "balance_loss_clip": 1.04460597, - "balance_loss_mlp": 1.02199018, - "epoch": 0.5252968585600482, - "flos": 30810708990720.0, - "grad_norm": 2.2019442049314626, - "language_loss": 0.69824821, - "learning_rate": 1.9332197367532033e-06, - "loss": 0.71986508, - "num_input_tokens_seen": 187713045, - "step": 8737, - "time_per_iteration": 2.694178342819214 - }, - { - "auxiliary_loss_clip": 0.01101601, - "auxiliary_loss_mlp": 0.01038957, - "balance_loss_clip": 1.04274702, - "balance_loss_mlp": 1.02473521, - "epoch": 0.5253569818127161, - "flos": 20628274464000.0, - "grad_norm": 1.4444028137471083, - "language_loss": 0.77386785, - "learning_rate": 1.9328304935064833e-06, - "loss": 0.79527342, - "num_input_tokens_seen": 187733640, - "step": 8738, - "time_per_iteration": 2.7655301094055176 - }, - { - "auxiliary_loss_clip": 0.01012696, - "auxiliary_loss_mlp": 0.00752303, - "balance_loss_clip": 1.01498532, - "balance_loss_mlp": 0.99995118, - "epoch": 0.5254171050653841, - "flos": 63428695810560.0, - "grad_norm": 0.7418872270660203, - "language_loss": 0.54437888, - "learning_rate": 1.932441252806837e-06, - "loss": 0.56202877, - "num_input_tokens_seen": 187792930, - "step": 8739, - "time_per_iteration": 3.183931350708008 - }, - { - "auxiliary_loss_clip": 0.01093164, - "auxiliary_loss_mlp": 0.01039099, - "balance_loss_clip": 1.03987527, - "balance_loss_mlp": 1.02572989, - "epoch": 0.525477228318052, - "flos": 34670782108800.0, - "grad_norm": 1.6115423077763054, - "language_loss": 0.84719479, - "learning_rate": 1.9320520146690263e-06, - "loss": 0.8685174, - "num_input_tokens_seen": 187812495, - "step": 8740, - "time_per_iteration": 2.8701846599578857 - }, - { - "auxiliary_loss_clip": 0.01106251, - "auxiliary_loss_mlp": 0.00771888, - "balance_loss_clip": 1.03936994, - "balance_loss_mlp": 1.00030541, - "epoch": 0.52553735157072, - "flos": 17930844766080.0, - "grad_norm": 2.112576285349714, - "language_loss": 0.69466913, - "learning_rate": 1.9316627791078093e-06, - "loss": 0.71345055, - "num_input_tokens_seen": 187829685, - "step": 8741, - "time_per_iteration": 2.721233606338501 - }, - { - "auxiliary_loss_clip": 0.01101687, - "auxiliary_loss_mlp": 0.0103584, - "balance_loss_clip": 1.04140949, - "balance_loss_mlp": 1.02171421, - "epoch": 0.5255974748233879, - "flos": 9940864584960.0, - "grad_norm": 1.8031333880336204, - "language_loss": 0.66328311, - "learning_rate": 1.931273546137947e-06, - "loss": 0.68465841, - "num_input_tokens_seen": 187846495, - "step": 8742, - "time_per_iteration": 2.695504903793335 - }, - { - "auxiliary_loss_clip": 0.01086092, - "auxiliary_loss_mlp": 0.01042238, - "balance_loss_clip": 1.03882444, - "balance_loss_mlp": 1.02666903, - "epoch": 0.5256575980760559, - "flos": 16868427269760.0, - "grad_norm": 1.9909144400242709, - "language_loss": 0.63219392, - "learning_rate": 1.9308843157741983e-06, - "loss": 0.65347725, - "num_input_tokens_seen": 187862010, - "step": 8743, - "time_per_iteration": 2.712376832962036 - }, - { - "auxiliary_loss_clip": 0.0102969, - "auxiliary_loss_mlp": 0.01008337, - "balance_loss_clip": 1.01230693, - "balance_loss_mlp": 1.00641751, - "epoch": 0.5257177213287239, - "flos": 62386210362240.0, - "grad_norm": 0.7739828883360421, - "language_loss": 0.5410347, - "learning_rate": 1.930495088031323e-06, - "loss": 0.56141496, - "num_input_tokens_seen": 187922730, - "step": 8744, - "time_per_iteration": 3.281756639480591 - }, - { - "auxiliary_loss_clip": 0.01106094, - "auxiliary_loss_mlp": 0.01037818, - "balance_loss_clip": 1.04534447, - "balance_loss_mlp": 1.02202296, - "epoch": 0.5257778445813919, - "flos": 20776908942720.0, - "grad_norm": 2.5030900138953274, - "language_loss": 0.75859022, - "learning_rate": 1.9301058629240814e-06, - "loss": 0.7800293, - "num_input_tokens_seen": 187940160, - "step": 8745, - "time_per_iteration": 2.642817258834839 - }, - { - "auxiliary_loss_clip": 0.01110515, - "auxiliary_loss_mlp": 0.0104281, - "balance_loss_clip": 1.04153466, - "balance_loss_mlp": 1.02948213, - "epoch": 0.5258379678340598, - "flos": 17018606033280.0, - "grad_norm": 1.7823830080970366, - "language_loss": 0.8089028, - "learning_rate": 1.9297166404672324e-06, - "loss": 0.83043599, - "num_input_tokens_seen": 187958625, - "step": 8746, - "time_per_iteration": 2.5678205490112305 - }, - { - "auxiliary_loss_clip": 0.01108698, - "auxiliary_loss_mlp": 0.01036575, - "balance_loss_clip": 1.04006267, - "balance_loss_mlp": 1.02191806, - "epoch": 0.5258980910867278, - "flos": 21068754946560.0, - "grad_norm": 2.1394959039376475, - "language_loss": 0.75231433, - "learning_rate": 1.9293274206755353e-06, - "loss": 0.77376711, - "num_input_tokens_seen": 187977575, - "step": 8747, - "time_per_iteration": -0.009610652923583984 - }, - { - "auxiliary_loss_clip": 0.0105854, - "auxiliary_loss_mlp": 0.01033909, - "balance_loss_clip": 1.03949201, - "balance_loss_mlp": 1.01987767, - "epoch": 0.5259582143393957, - "flos": 18004461690240.0, - "grad_norm": 2.0175880820051058, - "language_loss": 0.82632613, - "learning_rate": 1.9289382035637505e-06, - "loss": 0.84725058, - "num_input_tokens_seen": 187996650, - "step": 8748, - "time_per_iteration": 2.7604665756225586 - }, - { - "auxiliary_loss_clip": 0.01099486, - "auxiliary_loss_mlp": 0.01033119, - "balance_loss_clip": 1.03856742, - "balance_loss_mlp": 1.01846862, - "epoch": 0.5260183375920637, - "flos": 22783848520320.0, - "grad_norm": 2.328081087853481, - "language_loss": 0.80873966, - "learning_rate": 1.9285489891466345e-06, - "loss": 0.83006573, - "num_input_tokens_seen": 188013510, - "step": 8749, - "time_per_iteration": 2.6853184700012207 - }, - { - "auxiliary_loss_clip": 0.01109749, - "auxiliary_loss_mlp": 0.01040189, - "balance_loss_clip": 1.04381132, - "balance_loss_mlp": 1.02556193, - "epoch": 0.5260784608447318, - "flos": 27052406081280.0, - "grad_norm": 1.7699462129252088, - "language_loss": 0.72291499, - "learning_rate": 1.9281597774389487e-06, - "loss": 0.74441439, - "num_input_tokens_seen": 188032085, - "step": 8750, - "time_per_iteration": 2.6771364212036133 - }, - { - "auxiliary_loss_clip": 0.01098374, - "auxiliary_loss_mlp": 0.01037371, - "balance_loss_clip": 1.03887165, - "balance_loss_mlp": 1.02362585, - "epoch": 0.5261385840973997, - "flos": 20662820369280.0, - "grad_norm": 1.3348346616556535, - "language_loss": 0.76186317, - "learning_rate": 1.9277705684554517e-06, - "loss": 0.78322065, - "num_input_tokens_seen": 188050590, - "step": 8751, - "time_per_iteration": 2.7016804218292236 - }, - { - "auxiliary_loss_clip": 0.01119796, - "auxiliary_loss_mlp": 0.01039709, - "balance_loss_clip": 1.04339051, - "balance_loss_mlp": 1.02622056, - "epoch": 0.5261987073500677, - "flos": 23622649896960.0, - "grad_norm": 1.7424279065253616, - "language_loss": 0.75831163, - "learning_rate": 1.927381362210902e-06, - "loss": 0.77990663, - "num_input_tokens_seen": 188071620, - "step": 8752, - "time_per_iteration": 2.7128703594207764 - }, - { - "auxiliary_loss_clip": 0.01112565, - "auxiliary_loss_mlp": 0.01033514, - "balance_loss_clip": 1.04177046, - "balance_loss_mlp": 1.01780224, - "epoch": 0.5262588306027356, - "flos": 27636241743360.0, - "grad_norm": 2.1757268908288707, - "language_loss": 0.67754769, - "learning_rate": 1.926992158720058e-06, - "loss": 0.69900852, - "num_input_tokens_seen": 188091740, - "step": 8753, - "time_per_iteration": 2.678269147872925 - }, - { - "auxiliary_loss_clip": 0.01111599, - "auxiliary_loss_mlp": 0.01034275, - "balance_loss_clip": 1.04266751, - "balance_loss_mlp": 1.02072084, - "epoch": 0.5263189538554036, - "flos": 21759711943680.0, - "grad_norm": 1.6342208992061138, - "language_loss": 0.84114075, - "learning_rate": 1.9266029579976785e-06, - "loss": 0.86259949, - "num_input_tokens_seen": 188111165, - "step": 8754, - "time_per_iteration": 2.6858248710632324 - }, - { - "auxiliary_loss_clip": 0.01109767, - "auxiliary_loss_mlp": 0.01035863, - "balance_loss_clip": 1.04159164, - "balance_loss_mlp": 1.02159333, - "epoch": 0.5263790771080715, - "flos": 14276359140480.0, - "grad_norm": 2.0064086672514323, - "language_loss": 0.87360156, - "learning_rate": 1.926213760058522e-06, - "loss": 0.89505792, - "num_input_tokens_seen": 188127825, - "step": 8755, - "time_per_iteration": 2.5783674716949463 - }, - { - "auxiliary_loss_clip": 0.01007681, - "auxiliary_loss_mlp": 0.01000927, - "balance_loss_clip": 1.01328659, - "balance_loss_mlp": 0.99918669, - "epoch": 0.5264392003607395, - "flos": 65806413528960.0, - "grad_norm": 0.7404552494369754, - "language_loss": 0.5880959, - "learning_rate": 1.9258245649173477e-06, - "loss": 0.60818201, - "num_input_tokens_seen": 188194050, - "step": 8756, - "time_per_iteration": 3.308302402496338 - }, - { - "auxiliary_loss_clip": 0.01094156, - "auxiliary_loss_mlp": 0.01036712, - "balance_loss_clip": 1.0415833, - "balance_loss_mlp": 1.02182269, - "epoch": 0.5264993236134075, - "flos": 21032413361280.0, - "grad_norm": 1.6572717697992079, - "language_loss": 0.70703959, - "learning_rate": 1.925435372588913e-06, - "loss": 0.72834826, - "num_input_tokens_seen": 188212565, - "step": 8757, - "time_per_iteration": 4.195650100708008 - }, - { - "auxiliary_loss_clip": 0.0110952, - "auxiliary_loss_mlp": 0.01040036, - "balance_loss_clip": 1.04061294, - "balance_loss_mlp": 1.02590346, - "epoch": 0.5265594468660755, - "flos": 16618202150400.0, - "grad_norm": 2.0494500796269577, - "language_loss": 0.88039553, - "learning_rate": 1.9250461830879768e-06, - "loss": 0.90189111, - "num_input_tokens_seen": 188229505, - "step": 8758, - "time_per_iteration": 2.63089656829834 - }, - { - "auxiliary_loss_clip": 0.01061465, - "auxiliary_loss_mlp": 0.01037569, - "balance_loss_clip": 1.03887105, - "balance_loss_mlp": 1.02301979, - "epoch": 0.5266195701187434, - "flos": 24134125610880.0, - "grad_norm": 1.4473751902891179, - "language_loss": 0.75895298, - "learning_rate": 1.9246569964292965e-06, - "loss": 0.77994329, - "num_input_tokens_seen": 188250395, - "step": 8759, - "time_per_iteration": 4.702188968658447 - }, - { - "auxiliary_loss_clip": 0.01098136, - "auxiliary_loss_mlp": 0.01030934, - "balance_loss_clip": 1.04185557, - "balance_loss_mlp": 1.0181073, - "epoch": 0.5266796933714114, - "flos": 15844111125120.0, - "grad_norm": 1.7900777891811301, - "language_loss": 0.71485013, - "learning_rate": 1.9242678126276307e-06, - "loss": 0.73614085, - "num_input_tokens_seen": 188266785, - "step": 8760, - "time_per_iteration": 4.256975412368774 - }, - { - "auxiliary_loss_clip": 0.01098696, - "auxiliary_loss_mlp": 0.01040967, - "balance_loss_clip": 1.04177952, - "balance_loss_mlp": 1.02593493, - "epoch": 0.5267398166240793, - "flos": 20951434149120.0, - "grad_norm": 2.6157951761776697, - "language_loss": 0.75801802, - "learning_rate": 1.923878631697736e-06, - "loss": 0.77941465, - "num_input_tokens_seen": 188282525, - "step": 8761, - "time_per_iteration": 2.685028553009033 - }, - { - "auxiliary_loss_clip": 0.01104735, - "auxiliary_loss_mlp": 0.00771727, - "balance_loss_clip": 1.03871739, - "balance_loss_mlp": 1.00023258, - "epoch": 0.5267999398767473, - "flos": 20996394998400.0, - "grad_norm": 1.8739254444127986, - "language_loss": 0.70466101, - "learning_rate": 1.923489453654373e-06, - "loss": 0.72342563, - "num_input_tokens_seen": 188301395, - "step": 8762, - "time_per_iteration": 2.727120876312256 - }, - { - "auxiliary_loss_clip": 0.01014324, - "auxiliary_loss_mlp": 0.00999661, - "balance_loss_clip": 1.00980198, - "balance_loss_mlp": 0.99816543, - "epoch": 0.5268600631294152, - "flos": 66849401767680.0, - "grad_norm": 0.9282030794038212, - "language_loss": 0.65443593, - "learning_rate": 1.9231002785122963e-06, - "loss": 0.67457575, - "num_input_tokens_seen": 188357665, - "step": 8763, - "time_per_iteration": 3.109525203704834 - }, - { - "auxiliary_loss_clip": 0.01109455, - "auxiliary_loss_mlp": 0.01030406, - "balance_loss_clip": 1.04166603, - "balance_loss_mlp": 1.01676226, - "epoch": 0.5269201863820833, - "flos": 17165552572800.0, - "grad_norm": 1.6243900815433006, - "language_loss": 0.71050072, - "learning_rate": 1.922711106286265e-06, - "loss": 0.73189938, - "num_input_tokens_seen": 188376935, - "step": 8764, - "time_per_iteration": 4.168430328369141 - }, - { - "auxiliary_loss_clip": 0.01080487, - "auxiliary_loss_mlp": 0.01033821, - "balance_loss_clip": 1.03809977, - "balance_loss_mlp": 1.01832938, - "epoch": 0.5269803096347513, - "flos": 20522589672960.0, - "grad_norm": 1.5962933914095123, - "language_loss": 0.74318087, - "learning_rate": 1.9223219369910368e-06, - "loss": 0.76432389, - "num_input_tokens_seen": 188394995, - "step": 8765, - "time_per_iteration": 2.7441658973693848 - }, - { - "auxiliary_loss_clip": 0.01098499, - "auxiliary_loss_mlp": 0.01037098, - "balance_loss_clip": 1.03631091, - "balance_loss_mlp": 1.02200055, - "epoch": 0.5270404328874192, - "flos": 27230989524480.0, - "grad_norm": 1.60818818085183, - "language_loss": 0.85403508, - "learning_rate": 1.9219327706413677e-06, - "loss": 0.87539107, - "num_input_tokens_seen": 188415475, - "step": 8766, - "time_per_iteration": 2.7902116775512695 - }, - { - "auxiliary_loss_clip": 0.0112556, - "auxiliary_loss_mlp": 0.01039583, - "balance_loss_clip": 1.0449605, - "balance_loss_mlp": 1.02492046, - "epoch": 0.5271005561400872, - "flos": 23110491824640.0, - "grad_norm": 1.780636206979604, - "language_loss": 0.79070592, - "learning_rate": 1.921543607252017e-06, - "loss": 0.81235737, - "num_input_tokens_seen": 188435665, - "step": 8767, - "time_per_iteration": 2.6986846923828125 - }, - { - "auxiliary_loss_clip": 0.01114967, - "auxiliary_loss_mlp": 0.01039357, - "balance_loss_clip": 1.04406393, - "balance_loss_mlp": 1.02407432, - "epoch": 0.5271606793927551, - "flos": 22564793427840.0, - "grad_norm": 1.6576657234027676, - "language_loss": 0.73513746, - "learning_rate": 1.9211544468377394e-06, - "loss": 0.75668073, - "num_input_tokens_seen": 188455405, - "step": 8768, - "time_per_iteration": 2.695497989654541 - }, - { - "auxiliary_loss_clip": 0.01092606, - "auxiliary_loss_mlp": 0.01048135, - "balance_loss_clip": 1.03795791, - "balance_loss_mlp": 1.03445613, - "epoch": 0.5272208026454231, - "flos": 18764259102720.0, - "grad_norm": 1.9012673693956994, - "language_loss": 0.7428031, - "learning_rate": 1.9207652894132933e-06, - "loss": 0.76421046, - "num_input_tokens_seen": 188472940, - "step": 8769, - "time_per_iteration": 2.7763235569000244 - }, - { - "auxiliary_loss_clip": 0.01082308, - "auxiliary_loss_mlp": 0.0104049, - "balance_loss_clip": 1.03746688, - "balance_loss_mlp": 1.02675128, - "epoch": 0.5272809258980911, - "flos": 20412164286720.0, - "grad_norm": 1.8328085669464766, - "language_loss": 0.7360974, - "learning_rate": 1.920376134993436e-06, - "loss": 0.75732535, - "num_input_tokens_seen": 188493035, - "step": 8770, - "time_per_iteration": 2.7274930477142334 - }, - { - "auxiliary_loss_clip": 0.011224, - "auxiliary_loss_mlp": 0.01035685, - "balance_loss_clip": 1.04366255, - "balance_loss_mlp": 1.02199364, - "epoch": 0.5273410491507591, - "flos": 28256742213120.0, - "grad_norm": 1.7661010025178618, - "language_loss": 0.68258119, - "learning_rate": 1.9199869835929224e-06, - "loss": 0.704162, - "num_input_tokens_seen": 188513860, - "step": 8771, - "time_per_iteration": 2.6751418113708496 - }, - { - "auxiliary_loss_clip": 0.01109367, - "auxiliary_loss_mlp": 0.01038799, - "balance_loss_clip": 1.0429647, - "balance_loss_mlp": 1.02500653, - "epoch": 0.527401172403427, - "flos": 22455158140800.0, - "grad_norm": 1.9220412670697933, - "language_loss": 0.76438117, - "learning_rate": 1.9195978352265115e-06, - "loss": 0.78586286, - "num_input_tokens_seen": 188533345, - "step": 8772, - "time_per_iteration": 2.7865138053894043 - }, - { - "auxiliary_loss_clip": 0.01107055, - "auxiliary_loss_mlp": 0.01047604, - "balance_loss_clip": 1.04159784, - "balance_loss_mlp": 1.03290582, - "epoch": 0.527461295656095, - "flos": 21031084558080.0, - "grad_norm": 2.1683746410962472, - "language_loss": 0.65569091, - "learning_rate": 1.9192086899089585e-06, - "loss": 0.67723751, - "num_input_tokens_seen": 188551550, - "step": 8773, - "time_per_iteration": 2.648556709289551 - }, - { - "auxiliary_loss_clip": 0.01089634, - "auxiliary_loss_mlp": 0.01040938, - "balance_loss_clip": 1.04127073, - "balance_loss_mlp": 1.02838576, - "epoch": 0.5275214189087629, - "flos": 26322018929280.0, - "grad_norm": 1.7479537399696432, - "language_loss": 0.85893595, - "learning_rate": 1.91881954765502e-06, - "loss": 0.88024169, - "num_input_tokens_seen": 188571615, - "step": 8774, - "time_per_iteration": 2.8036038875579834 - }, - { - "auxiliary_loss_clip": 0.01088366, - "auxiliary_loss_mlp": 0.01035546, - "balance_loss_clip": 1.03889024, - "balance_loss_mlp": 1.02204525, - "epoch": 0.5275815421614309, - "flos": 20047024581120.0, - "grad_norm": 1.657417688760408, - "language_loss": 0.80199802, - "learning_rate": 1.9184304084794523e-06, - "loss": 0.82323706, - "num_input_tokens_seen": 188591965, - "step": 8775, - "time_per_iteration": 2.7011687755584717 - }, - { - "auxiliary_loss_clip": 0.01096581, - "auxiliary_loss_mlp": 0.01042615, - "balance_loss_clip": 1.03883219, - "balance_loss_mlp": 1.02843523, - "epoch": 0.5276416654140988, - "flos": 21432206712960.0, - "grad_norm": 1.7666023716485497, - "language_loss": 0.83578467, - "learning_rate": 1.918041272397012e-06, - "loss": 0.85717654, - "num_input_tokens_seen": 188610675, - "step": 8776, - "time_per_iteration": 2.6593801975250244 - }, - { - "auxiliary_loss_clip": 0.01093105, - "auxiliary_loss_mlp": 0.01036597, - "balance_loss_clip": 1.04135871, - "balance_loss_mlp": 1.0225482, - "epoch": 0.5277017886667669, - "flos": 17165085696000.0, - "grad_norm": 1.7073238735749807, - "language_loss": 0.67856812, - "learning_rate": 1.9176521394224547e-06, - "loss": 0.6998651, - "num_input_tokens_seen": 188628235, - "step": 8777, - "time_per_iteration": 2.684119462966919 - }, - { - "auxiliary_loss_clip": 0.01098291, - "auxiliary_loss_mlp": 0.01042578, - "balance_loss_clip": 1.0435065, - "balance_loss_mlp": 1.02887487, - "epoch": 0.5277619119194349, - "flos": 20448146736000.0, - "grad_norm": 1.6906001817136074, - "language_loss": 0.8258512, - "learning_rate": 1.9172630095705358e-06, - "loss": 0.84725994, - "num_input_tokens_seen": 188648925, - "step": 8778, - "time_per_iteration": 2.682415723800659 - }, - { - "auxiliary_loss_clip": 0.01111904, - "auxiliary_loss_mlp": 0.01042858, - "balance_loss_clip": 1.04339361, - "balance_loss_mlp": 1.02807617, - "epoch": 0.5278220351721028, - "flos": 24061083304320.0, - "grad_norm": 2.7851808389493913, - "language_loss": 0.79809994, - "learning_rate": 1.916873882856013e-06, - "loss": 0.81964755, - "num_input_tokens_seen": 188668125, - "step": 8779, - "time_per_iteration": 2.6585779190063477 - }, - { - "auxiliary_loss_clip": 0.01105817, - "auxiliary_loss_mlp": 0.01036255, - "balance_loss_clip": 1.04011083, - "balance_loss_mlp": 1.02326131, - "epoch": 0.5278821584247708, - "flos": 24642907804800.0, - "grad_norm": 2.3801118784221487, - "language_loss": 0.76782715, - "learning_rate": 1.9164847592936406e-06, - "loss": 0.78924787, - "num_input_tokens_seen": 188684410, - "step": 8780, - "time_per_iteration": 2.64528489112854 - }, - { - "auxiliary_loss_clip": 0.01092369, - "auxiliary_loss_mlp": 0.01031311, - "balance_loss_clip": 1.04324102, - "balance_loss_mlp": 1.01723862, - "epoch": 0.5279422816774387, - "flos": 35408244240000.0, - "grad_norm": 1.6460087018057796, - "language_loss": 0.7001918, - "learning_rate": 1.916095638898174e-06, - "loss": 0.72142857, - "num_input_tokens_seen": 188706130, - "step": 8781, - "time_per_iteration": 2.8247299194335938 - }, - { - "auxiliary_loss_clip": 0.01107498, - "auxiliary_loss_mlp": 0.01040285, - "balance_loss_clip": 1.04195011, - "balance_loss_mlp": 1.02773809, - "epoch": 0.5280024049301068, - "flos": 22967028904320.0, - "grad_norm": 1.5355974889681627, - "language_loss": 0.72236538, - "learning_rate": 1.9157065216843696e-06, - "loss": 0.7438432, - "num_input_tokens_seen": 188725030, - "step": 8782, - "time_per_iteration": 2.6150832176208496 - }, - { - "auxiliary_loss_clip": 0.01090709, - "auxiliary_loss_mlp": 0.01033496, - "balance_loss_clip": 1.03973758, - "balance_loss_mlp": 1.0204308, - "epoch": 0.5280625281827747, - "flos": 21507619317120.0, - "grad_norm": 1.8366229943518229, - "language_loss": 0.68489599, - "learning_rate": 1.915317407666982e-06, - "loss": 0.70613807, - "num_input_tokens_seen": 188744325, - "step": 8783, - "time_per_iteration": 2.7228338718414307 - }, - { - "auxiliary_loss_clip": 0.01120029, - "auxiliary_loss_mlp": 0.01042206, - "balance_loss_clip": 1.04475784, - "balance_loss_mlp": 1.02599382, - "epoch": 0.5281226514354427, - "flos": 31208167958400.0, - "grad_norm": 1.8621065563663965, - "language_loss": 0.69557488, - "learning_rate": 1.9149282968607674e-06, - "loss": 0.71719718, - "num_input_tokens_seen": 188765100, - "step": 8784, - "time_per_iteration": 2.756030797958374 - }, - { - "auxiliary_loss_clip": 0.01124818, - "auxiliary_loss_mlp": 0.01034258, - "balance_loss_clip": 1.04128921, - "balance_loss_mlp": 1.01935077, - "epoch": 0.5281827746881106, - "flos": 25077821679360.0, - "grad_norm": 3.8002246773271238, - "language_loss": 0.7503646, - "learning_rate": 1.91453918928048e-06, - "loss": 0.77195537, - "num_input_tokens_seen": 188783995, - "step": 8785, - "time_per_iteration": 2.6486949920654297 - }, - { - "auxiliary_loss_clip": 0.01110957, - "auxiliary_loss_mlp": 0.01035187, - "balance_loss_clip": 1.04315662, - "balance_loss_mlp": 1.02070904, - "epoch": 0.5282428979407786, - "flos": 20631255292800.0, - "grad_norm": 1.5855662273934061, - "language_loss": 0.83260286, - "learning_rate": 1.9141500849408745e-06, - "loss": 0.85406423, - "num_input_tokens_seen": 188803120, - "step": 8786, - "time_per_iteration": 2.6352970600128174 - }, - { - "auxiliary_loss_clip": 0.01083443, - "auxiliary_loss_mlp": 0.01025911, - "balance_loss_clip": 1.04014111, - "balance_loss_mlp": 1.0136745, - "epoch": 0.5283030211934465, - "flos": 22419391173120.0, - "grad_norm": 2.305341017618089, - "language_loss": 0.82486933, - "learning_rate": 1.9137609838567076e-06, - "loss": 0.84596282, - "num_input_tokens_seen": 188820960, - "step": 8787, - "time_per_iteration": 2.712639570236206 - }, - { - "auxiliary_loss_clip": 0.01066097, - "auxiliary_loss_mlp": 0.01026546, - "balance_loss_clip": 1.03866088, - "balance_loss_mlp": 1.01387453, - "epoch": 0.5283631444461145, - "flos": 23615467176960.0, - "grad_norm": 1.663088771358256, - "language_loss": 0.83609009, - "learning_rate": 1.9133718860427316e-06, - "loss": 0.85701656, - "num_input_tokens_seen": 188837165, - "step": 8788, - "time_per_iteration": 2.7158761024475098 - }, - { - "auxiliary_loss_clip": 0.01087908, - "auxiliary_loss_mlp": 0.01041692, - "balance_loss_clip": 1.04602289, - "balance_loss_mlp": 1.02696919, - "epoch": 0.5284232676987825, - "flos": 32671994918400.0, - "grad_norm": 1.8980499308542007, - "language_loss": 0.75046682, - "learning_rate": 1.9129827915137027e-06, - "loss": 0.77176291, - "num_input_tokens_seen": 188858555, - "step": 8789, - "time_per_iteration": 2.806339979171753 - }, - { - "auxiliary_loss_clip": 0.01113755, - "auxiliary_loss_mlp": 0.01037056, - "balance_loss_clip": 1.04411733, - "balance_loss_mlp": 1.02322817, - "epoch": 0.5284833909514505, - "flos": 26760919213440.0, - "grad_norm": 1.5263217177178625, - "language_loss": 0.69562709, - "learning_rate": 1.9125937002843754e-06, - "loss": 0.71713525, - "num_input_tokens_seen": 188879050, - "step": 8790, - "time_per_iteration": 2.701814651489258 - }, - { - "auxiliary_loss_clip": 0.01117978, - "auxiliary_loss_mlp": 0.01029171, - "balance_loss_clip": 1.04194212, - "balance_loss_mlp": 1.01685631, - "epoch": 0.5285435142041185, - "flos": 22090700793600.0, - "grad_norm": 1.472851859989372, - "language_loss": 0.79096156, - "learning_rate": 1.9122046123695036e-06, - "loss": 0.812433, - "num_input_tokens_seen": 188898885, - "step": 8791, - "time_per_iteration": 2.609342575073242 - }, - { - "auxiliary_loss_clip": 0.01063984, - "auxiliary_loss_mlp": 0.01029869, - "balance_loss_clip": 1.04006243, - "balance_loss_mlp": 1.01632702, - "epoch": 0.5286036374567864, - "flos": 20375463565440.0, - "grad_norm": 2.747278304747908, - "language_loss": 0.66302419, - "learning_rate": 1.9118155277838423e-06, - "loss": 0.6839627, - "num_input_tokens_seen": 188917225, - "step": 8792, - "time_per_iteration": 2.713622570037842 - }, - { - "auxiliary_loss_clip": 0.01090251, - "auxiliary_loss_mlp": 0.01040633, - "balance_loss_clip": 1.03743482, - "balance_loss_mlp": 1.02670956, - "epoch": 0.5286637607094544, - "flos": 24352175122560.0, - "grad_norm": 1.9116255636929125, - "language_loss": 0.79727674, - "learning_rate": 1.9114264465421443e-06, - "loss": 0.81858563, - "num_input_tokens_seen": 188936120, - "step": 8793, - "time_per_iteration": 2.6645493507385254 - }, - { - "auxiliary_loss_clip": 0.01121499, - "auxiliary_loss_mlp": 0.01045468, - "balance_loss_clip": 1.04323554, - "balance_loss_mlp": 1.03118658, - "epoch": 0.5287238839621223, - "flos": 17271165536640.0, - "grad_norm": 2.655732529836172, - "language_loss": 0.84749115, - "learning_rate": 1.9110373686591645e-06, - "loss": 0.86916077, - "num_input_tokens_seen": 188953405, - "step": 8794, - "time_per_iteration": 2.8306803703308105 - }, - { - "auxiliary_loss_clip": 0.01097868, - "auxiliary_loss_mlp": 0.0103516, - "balance_loss_clip": 1.03908813, - "balance_loss_mlp": 1.02062225, - "epoch": 0.5287840072147904, - "flos": 17566890209280.0, - "grad_norm": 2.1997369626435894, - "language_loss": 0.676875, - "learning_rate": 1.9106482941496564e-06, - "loss": 0.69820529, - "num_input_tokens_seen": 188971150, - "step": 8795, - "time_per_iteration": 2.703134059906006 - }, - { - "auxiliary_loss_clip": 0.01098455, - "auxiliary_loss_mlp": 0.010334, - "balance_loss_clip": 1.04339266, - "balance_loss_mlp": 1.01989961, - "epoch": 0.5288441304674583, - "flos": 18552099421440.0, - "grad_norm": 2.036052201037856, - "language_loss": 0.80291003, - "learning_rate": 1.910259223028374e-06, - "loss": 0.82422858, - "num_input_tokens_seen": 188989550, - "step": 8796, - "time_per_iteration": 2.6733570098876953 - }, - { - "auxiliary_loss_clip": 0.01079591, - "auxiliary_loss_mlp": 0.01043571, - "balance_loss_clip": 1.03867388, - "balance_loss_mlp": 1.02758455, - "epoch": 0.5289042537201263, - "flos": 20814507504000.0, - "grad_norm": 1.5572831824692925, - "language_loss": 0.69010925, - "learning_rate": 1.909870155310071e-06, - "loss": 0.71134079, - "num_input_tokens_seen": 189008795, - "step": 8797, - "time_per_iteration": 4.254164934158325 - }, - { - "auxiliary_loss_clip": 0.01101135, - "auxiliary_loss_mlp": 0.01036632, - "balance_loss_clip": 1.04237545, - "balance_loss_mlp": 1.02374518, - "epoch": 0.5289643769727942, - "flos": 15735265937280.0, - "grad_norm": 1.6872492204324914, - "language_loss": 0.82684171, - "learning_rate": 1.9094810910095005e-06, - "loss": 0.84821934, - "num_input_tokens_seen": 189025540, - "step": 8798, - "time_per_iteration": 2.7167000770568848 - }, - { - "auxiliary_loss_clip": 0.01096424, - "auxiliary_loss_mlp": 0.00774405, - "balance_loss_clip": 1.03896332, - "balance_loss_mlp": 1.00029516, - "epoch": 0.5290245002254622, - "flos": 19537308633600.0, - "grad_norm": 1.9585595365508919, - "language_loss": 0.70825863, - "learning_rate": 1.9090920301414166e-06, - "loss": 0.72696698, - "num_input_tokens_seen": 189044885, - "step": 8799, - "time_per_iteration": 4.350652694702148 - }, - { - "auxiliary_loss_clip": 0.01111399, - "auxiliary_loss_mlp": 0.01038005, - "balance_loss_clip": 1.04659581, - "balance_loss_mlp": 1.02507114, - "epoch": 0.5290846234781301, - "flos": 15815131827840.0, - "grad_norm": 2.2031970702340704, - "language_loss": 0.69286144, - "learning_rate": 1.9087029727205716e-06, - "loss": 0.71435547, - "num_input_tokens_seen": 189061280, - "step": 8800, - "time_per_iteration": 4.109759569168091 - }, - { - "auxiliary_loss_clip": 0.01017957, - "auxiliary_loss_mlp": 0.01037827, - "balance_loss_clip": 1.01865292, - "balance_loss_mlp": 1.03631306, - "epoch": 0.5291447467307981, - "flos": 70057624821120.0, - "grad_norm": 0.9935539305247675, - "language_loss": 0.56959099, - "learning_rate": 1.9083139187617193e-06, - "loss": 0.59014881, - "num_input_tokens_seen": 189114775, - "step": 8801, - "time_per_iteration": 3.1419920921325684 - }, - { - "auxiliary_loss_clip": 0.01110756, - "auxiliary_loss_mlp": 0.01036206, - "balance_loss_clip": 1.04886377, - "balance_loss_mlp": 1.02271795, - "epoch": 0.529204869983466, - "flos": 28364186770560.0, - "grad_norm": 1.5688016044474997, - "language_loss": 0.6425091, - "learning_rate": 1.9079248682796123e-06, - "loss": 0.6639787, - "num_input_tokens_seen": 189134700, - "step": 8802, - "time_per_iteration": 2.7467000484466553 - }, - { - "auxiliary_loss_clip": 0.01101463, - "auxiliary_loss_mlp": 0.01031341, - "balance_loss_clip": 1.04380429, - "balance_loss_mlp": 1.01772761, - "epoch": 0.5292649932361341, - "flos": 33758830684800.0, - "grad_norm": 3.351871019760029, - "language_loss": 0.69098222, - "learning_rate": 1.907535821289003e-06, - "loss": 0.71231019, - "num_input_tokens_seen": 189155365, - "step": 8803, - "time_per_iteration": 4.278867721557617 - }, - { - "auxiliary_loss_clip": 0.01106005, - "auxiliary_loss_mlp": 0.00770288, - "balance_loss_clip": 1.04076648, - "balance_loss_mlp": 1.00028872, - "epoch": 0.5293251164888021, - "flos": 20447679859200.0, - "grad_norm": 1.7989646267917587, - "language_loss": 0.76156348, - "learning_rate": 1.9071467778046458e-06, - "loss": 0.78032649, - "num_input_tokens_seen": 189173885, - "step": 8804, - "time_per_iteration": 2.683661699295044 - }, - { - "auxiliary_loss_clip": 0.01032487, - "auxiliary_loss_mlp": 0.01019664, - "balance_loss_clip": 1.01553822, - "balance_loss_mlp": 1.01836514, - "epoch": 0.52938523974147, - "flos": 66545312204160.0, - "grad_norm": 0.7526453486337231, - "language_loss": 0.5290755, - "learning_rate": 1.906757737841291e-06, - "loss": 0.54959702, - "num_input_tokens_seen": 189236515, - "step": 8805, - "time_per_iteration": 3.243603467941284 - }, - { - "auxiliary_loss_clip": 0.0103203, - "auxiliary_loss_mlp": 0.01016047, - "balance_loss_clip": 1.01495409, - "balance_loss_mlp": 1.01418769, - "epoch": 0.529445362994138, - "flos": 67151734542720.0, - "grad_norm": 0.7522317031499139, - "language_loss": 0.6378004, - "learning_rate": 1.906368701413693e-06, - "loss": 0.65828121, - "num_input_tokens_seen": 189300500, - "step": 8806, - "time_per_iteration": 3.185899257659912 - }, - { - "auxiliary_loss_clip": 0.01112977, - "auxiliary_loss_mlp": 0.01034283, - "balance_loss_clip": 1.04236031, - "balance_loss_mlp": 1.02053213, - "epoch": 0.5295054862468059, - "flos": 17749316407680.0, - "grad_norm": 1.5696878511475738, - "language_loss": 0.72756052, - "learning_rate": 1.9059796685366026e-06, - "loss": 0.74903309, - "num_input_tokens_seen": 189319745, - "step": 8807, - "time_per_iteration": 2.652667284011841 - }, - { - "auxiliary_loss_clip": 0.01079975, - "auxiliary_loss_mlp": 0.01030639, - "balance_loss_clip": 1.04053009, - "balance_loss_mlp": 1.01760888, - "epoch": 0.529565609499474, - "flos": 11397401084160.0, - "grad_norm": 2.191041401806776, - "language_loss": 0.69626606, - "learning_rate": 1.9055906392247723e-06, - "loss": 0.71737224, - "num_input_tokens_seen": 189334550, - "step": 8808, - "time_per_iteration": 2.6991183757781982 - }, - { - "auxiliary_loss_clip": 0.01109251, - "auxiliary_loss_mlp": 0.01032489, - "balance_loss_clip": 1.041991, - "balance_loss_mlp": 1.01962066, - "epoch": 0.5296257327521419, - "flos": 17196363463680.0, - "grad_norm": 1.8261828078243632, - "language_loss": 0.8653447, - "learning_rate": 1.9052016134929554e-06, - "loss": 0.88676214, - "num_input_tokens_seen": 189351735, - "step": 8809, - "time_per_iteration": 2.5995731353759766 - }, - { - "auxiliary_loss_clip": 0.0111469, - "auxiliary_loss_mlp": 0.01041403, - "balance_loss_clip": 1.04281509, - "balance_loss_mlp": 1.02607894, - "epoch": 0.5296858560048099, - "flos": 39964086777600.0, - "grad_norm": 1.9222242916722383, - "language_loss": 0.64388674, - "learning_rate": 1.9048125913559016e-06, - "loss": 0.66544765, - "num_input_tokens_seen": 189373105, - "step": 8810, - "time_per_iteration": 2.776230573654175 - }, - { - "auxiliary_loss_clip": 0.01119011, - "auxiliary_loss_mlp": 0.01038636, - "balance_loss_clip": 1.04296374, - "balance_loss_mlp": 1.02509344, - "epoch": 0.5297459792574778, - "flos": 20961418129920.0, - "grad_norm": 1.8063937788931883, - "language_loss": 0.68213391, - "learning_rate": 1.9044235728283646e-06, - "loss": 0.70371044, - "num_input_tokens_seen": 189394615, - "step": 8811, - "time_per_iteration": 2.684617757797241 - }, - { - "auxiliary_loss_clip": 0.01007367, - "auxiliary_loss_mlp": 0.0100546, - "balance_loss_clip": 1.01854634, - "balance_loss_mlp": 1.00402915, - "epoch": 0.5298061025101458, - "flos": 66523620389760.0, - "grad_norm": 0.689972629111167, - "language_loss": 0.53345251, - "learning_rate": 1.9040345579250953e-06, - "loss": 0.55358076, - "num_input_tokens_seen": 189459750, - "step": 8812, - "time_per_iteration": 3.3905134201049805 - }, - { - "auxiliary_loss_clip": 0.01023218, - "auxiliary_loss_mlp": 0.01004548, - "balance_loss_clip": 1.01716316, - "balance_loss_mlp": 1.00321257, - "epoch": 0.5298662257628137, - "flos": 67662994775040.0, - "grad_norm": 0.7359658604916758, - "language_loss": 0.56288284, - "learning_rate": 1.9036455466608453e-06, - "loss": 0.58316052, - "num_input_tokens_seen": 189527540, - "step": 8813, - "time_per_iteration": 3.2840702533721924 - }, - { - "auxiliary_loss_clip": 0.01064136, - "auxiliary_loss_mlp": 0.01033208, - "balance_loss_clip": 1.0387466, - "balance_loss_mlp": 1.01986289, - "epoch": 0.5299263490154817, - "flos": 19646405216640.0, - "grad_norm": 1.8723589062576662, - "language_loss": 0.81484783, - "learning_rate": 1.9032565390503657e-06, - "loss": 0.83582127, - "num_input_tokens_seen": 189546900, - "step": 8814, - "time_per_iteration": 2.7889370918273926 - }, - { - "auxiliary_loss_clip": 0.01129463, - "auxiliary_loss_mlp": 0.01035706, - "balance_loss_clip": 1.04835963, - "balance_loss_mlp": 1.02225351, - "epoch": 0.5299864722681497, - "flos": 22055005653120.0, - "grad_norm": 1.8736963674991467, - "language_loss": 0.85159796, - "learning_rate": 1.9028675351084076e-06, - "loss": 0.87324965, - "num_input_tokens_seen": 189566490, - "step": 8815, - "time_per_iteration": 2.588376998901367 - }, - { - "auxiliary_loss_clip": 0.01119356, - "auxiliary_loss_mlp": 0.01030819, - "balance_loss_clip": 1.04443836, - "balance_loss_mlp": 1.01802766, - "epoch": 0.5300465955208177, - "flos": 21763698353280.0, - "grad_norm": 2.360835312755498, - "language_loss": 0.66173548, - "learning_rate": 1.9024785348497225e-06, - "loss": 0.6832372, - "num_input_tokens_seen": 189585580, - "step": 8816, - "time_per_iteration": 2.6367204189300537 - }, - { - "auxiliary_loss_clip": 0.01098885, - "auxiliary_loss_mlp": 0.01037316, - "balance_loss_clip": 1.04165578, - "balance_loss_mlp": 1.02370238, - "epoch": 0.5301067187734857, - "flos": 42996491735040.0, - "grad_norm": 1.8428826452353317, - "language_loss": 0.72204578, - "learning_rate": 1.9020895382890611e-06, - "loss": 0.74340779, - "num_input_tokens_seen": 189608485, - "step": 8817, - "time_per_iteration": 2.8511815071105957 - }, - { - "auxiliary_loss_clip": 0.01093351, - "auxiliary_loss_mlp": 0.0103512, - "balance_loss_clip": 1.03981018, - "balance_loss_mlp": 1.01959896, - "epoch": 0.5301668420261536, - "flos": 20554298403840.0, - "grad_norm": 1.7783802077805728, - "language_loss": 0.65400332, - "learning_rate": 1.9017005454411743e-06, - "loss": 0.67528808, - "num_input_tokens_seen": 189627815, - "step": 8818, - "time_per_iteration": 2.757228374481201 - }, - { - "auxiliary_loss_clip": 0.01075022, - "auxiliary_loss_mlp": 0.01033272, - "balance_loss_clip": 1.04101062, - "balance_loss_mlp": 1.01816273, - "epoch": 0.5302269652788216, - "flos": 17486665182720.0, - "grad_norm": 1.8529738404346974, - "language_loss": 0.75020683, - "learning_rate": 1.9013115563208126e-06, - "loss": 0.77128971, - "num_input_tokens_seen": 189644850, - "step": 8819, - "time_per_iteration": 2.7458016872406006 - }, - { - "auxiliary_loss_clip": 0.01088004, - "auxiliary_loss_mlp": 0.01047287, - "balance_loss_clip": 1.04190588, - "balance_loss_mlp": 1.03143191, - "epoch": 0.5302870885314895, - "flos": 14574202715520.0, - "grad_norm": 2.236781046268797, - "language_loss": 0.81955135, - "learning_rate": 1.9009225709427267e-06, - "loss": 0.84090424, - "num_input_tokens_seen": 189660945, - "step": 8820, - "time_per_iteration": 2.7917025089263916 - }, - { - "auxiliary_loss_clip": 0.01101102, - "auxiliary_loss_mlp": 0.01034433, - "balance_loss_clip": 1.04137421, - "balance_loss_mlp": 1.02192223, - "epoch": 0.5303472117841576, - "flos": 23438032968960.0, - "grad_norm": 1.5105877277652986, - "language_loss": 0.72733676, - "learning_rate": 1.9005335893216667e-06, - "loss": 0.74869215, - "num_input_tokens_seen": 189680425, - "step": 8821, - "time_per_iteration": 2.664912462234497 - }, - { - "auxiliary_loss_clip": 0.01092575, - "auxiliary_loss_mlp": 0.01032249, - "balance_loss_clip": 1.04237318, - "balance_loss_mlp": 1.01958346, - "epoch": 0.5304073350368255, - "flos": 22709010533760.0, - "grad_norm": 1.4432589414019072, - "language_loss": 0.74112785, - "learning_rate": 1.9001446114723824e-06, - "loss": 0.76237607, - "num_input_tokens_seen": 189700375, - "step": 8822, - "time_per_iteration": 2.7494471073150635 - }, - { - "auxiliary_loss_clip": 0.01087967, - "auxiliary_loss_mlp": 0.01034985, - "balance_loss_clip": 1.03945005, - "balance_loss_mlp": 1.02029884, - "epoch": 0.5304674582894935, - "flos": 27928554624000.0, - "grad_norm": 1.6561028390766985, - "language_loss": 0.67739707, - "learning_rate": 1.8997556374096257e-06, - "loss": 0.69862658, - "num_input_tokens_seen": 189721225, - "step": 8823, - "time_per_iteration": 2.8298280239105225 - }, - { - "auxiliary_loss_clip": 0.01127487, - "auxiliary_loss_mlp": 0.01042695, - "balance_loss_clip": 1.0455004, - "balance_loss_mlp": 1.02722192, - "epoch": 0.5305275815421614, - "flos": 21250642440960.0, - "grad_norm": 1.7679489191905855, - "language_loss": 0.69459474, - "learning_rate": 1.8993666671481444e-06, - "loss": 0.71629655, - "num_input_tokens_seen": 189740170, - "step": 8824, - "time_per_iteration": 2.7093706130981445 - }, - { - "auxiliary_loss_clip": 0.01098459, - "auxiliary_loss_mlp": 0.00770579, - "balance_loss_clip": 1.04351103, - "balance_loss_mlp": 1.00028551, - "epoch": 0.5305877047948294, - "flos": 17603088140160.0, - "grad_norm": 2.079936946962719, - "language_loss": 0.7578221, - "learning_rate": 1.898977700702689e-06, - "loss": 0.77651244, - "num_input_tokens_seen": 189757890, - "step": 8825, - "time_per_iteration": 2.7240397930145264 - }, - { - "auxiliary_loss_clip": 0.01042177, - "auxiliary_loss_mlp": 0.01041743, - "balance_loss_clip": 1.03510904, - "balance_loss_mlp": 1.02771175, - "epoch": 0.5306478280474973, - "flos": 15195493284480.0, - "grad_norm": 1.902170532497994, - "language_loss": 0.85671568, - "learning_rate": 1.8985887380880103e-06, - "loss": 0.87755489, - "num_input_tokens_seen": 189775390, - "step": 8826, - "time_per_iteration": 2.786893367767334 - }, - { - "auxiliary_loss_clip": 0.0112111, - "auxiliary_loss_mlp": 0.0103337, - "balance_loss_clip": 1.04376101, - "balance_loss_mlp": 1.01967907, - "epoch": 0.5307079513001653, - "flos": 15341218761600.0, - "grad_norm": 1.3295158202050776, - "language_loss": 0.64655942, - "learning_rate": 1.8981997793188558e-06, - "loss": 0.66810423, - "num_input_tokens_seen": 189793975, - "step": 8827, - "time_per_iteration": 2.650259017944336 - }, - { - "auxiliary_loss_clip": 0.01100521, - "auxiliary_loss_mlp": 0.01041689, - "balance_loss_clip": 1.04230511, - "balance_loss_mlp": 1.02720535, - "epoch": 0.5307680745528333, - "flos": 43544452688640.0, - "grad_norm": 1.5763280036459053, - "language_loss": 0.60055244, - "learning_rate": 1.8978108244099762e-06, - "loss": 0.62197453, - "num_input_tokens_seen": 189817870, - "step": 8828, - "time_per_iteration": 2.9273712635040283 - }, - { - "auxiliary_loss_clip": 0.01115165, - "auxiliary_loss_mlp": 0.01032955, - "balance_loss_clip": 1.04400516, - "balance_loss_mlp": 1.01779199, - "epoch": 0.5308281978055013, - "flos": 20048928001920.0, - "grad_norm": 1.6623375431972864, - "language_loss": 0.81171465, - "learning_rate": 1.8974218733761208e-06, - "loss": 0.83319587, - "num_input_tokens_seen": 189837905, - "step": 8829, - "time_per_iteration": 2.6640090942382812 - }, - { - "auxiliary_loss_clip": 0.01104846, - "auxiliary_loss_mlp": 0.01035043, - "balance_loss_clip": 1.043993, - "balance_loss_mlp": 1.02136946, - "epoch": 0.5308883210581693, - "flos": 20703938463360.0, - "grad_norm": 1.3895948203919835, - "language_loss": 0.78245443, - "learning_rate": 1.8970329262320375e-06, - "loss": 0.80385327, - "num_input_tokens_seen": 189856970, - "step": 8830, - "time_per_iteration": 2.736316680908203 - }, - { - "auxiliary_loss_clip": 0.01111385, - "auxiliary_loss_mlp": 0.01033264, - "balance_loss_clip": 1.04335451, - "balance_loss_mlp": 1.02036524, - "epoch": 0.5309484443108372, - "flos": 14355506759040.0, - "grad_norm": 2.4391763831493165, - "language_loss": 0.8031435, - "learning_rate": 1.8966439829924768e-06, - "loss": 0.82458997, - "num_input_tokens_seen": 189872830, - "step": 8831, - "time_per_iteration": 2.6151957511901855 - }, - { - "auxiliary_loss_clip": 0.01108777, - "auxiliary_loss_mlp": 0.01032917, - "balance_loss_clip": 1.0430057, - "balance_loss_mlp": 1.01951742, - "epoch": 0.5310085675635052, - "flos": 20010503427840.0, - "grad_norm": 4.592110703983282, - "language_loss": 0.73025942, - "learning_rate": 1.896255043672186e-06, - "loss": 0.75167632, - "num_input_tokens_seen": 189891635, - "step": 8832, - "time_per_iteration": 2.6464226245880127 - }, - { - "auxiliary_loss_clip": 0.01089691, - "auxiliary_loss_mlp": 0.01036866, - "balance_loss_clip": 1.04126275, - "balance_loss_mlp": 1.02198887, - "epoch": 0.5310686908161731, - "flos": 22127293774080.0, - "grad_norm": 2.4188792138763513, - "language_loss": 0.75694382, - "learning_rate": 1.8958661082859143e-06, - "loss": 0.77820939, - "num_input_tokens_seen": 189909050, - "step": 8833, - "time_per_iteration": 2.757716178894043 - }, - { - "auxiliary_loss_clip": 0.01087272, - "auxiliary_loss_mlp": 0.01036493, - "balance_loss_clip": 1.03743505, - "balance_loss_mlp": 1.02260494, - "epoch": 0.5311288140688412, - "flos": 24717889445760.0, - "grad_norm": 1.6684529348681687, - "language_loss": 0.73618537, - "learning_rate": 1.8954771768484103e-06, - "loss": 0.75742298, - "num_input_tokens_seen": 189927405, - "step": 8834, - "time_per_iteration": 2.7447376251220703 - }, - { - "auxiliary_loss_clip": 0.01127832, - "auxiliary_loss_mlp": 0.01042563, - "balance_loss_clip": 1.04435921, - "balance_loss_mlp": 1.02734029, - "epoch": 0.5311889373215091, - "flos": 24097712198400.0, - "grad_norm": 1.9940250251862053, - "language_loss": 0.77417272, - "learning_rate": 1.8950882493744226e-06, - "loss": 0.79587668, - "num_input_tokens_seen": 189947740, - "step": 8835, - "time_per_iteration": 2.654860734939575 - }, - { - "auxiliary_loss_clip": 0.01097251, - "auxiliary_loss_mlp": 0.01046402, - "balance_loss_clip": 1.04259109, - "balance_loss_mlp": 1.03138208, - "epoch": 0.5312490605741771, - "flos": 22017012042240.0, - "grad_norm": 2.4706637723930505, - "language_loss": 0.72355223, - "learning_rate": 1.8946993258786985e-06, - "loss": 0.7449888, - "num_input_tokens_seen": 189966495, - "step": 8836, - "time_per_iteration": 2.694772243499756 - }, - { - "auxiliary_loss_clip": 0.01104585, - "auxiliary_loss_mlp": 0.01040344, - "balance_loss_clip": 1.04374099, - "balance_loss_mlp": 1.02537167, - "epoch": 0.531309183826845, - "flos": 19390541662080.0, - "grad_norm": 1.705704926785557, - "language_loss": 0.81026083, - "learning_rate": 1.894310406375987e-06, - "loss": 0.8317101, - "num_input_tokens_seen": 189985325, - "step": 8837, - "time_per_iteration": 4.218893527984619 - }, - { - "auxiliary_loss_clip": 0.01107393, - "auxiliary_loss_mlp": 0.01036209, - "balance_loss_clip": 1.04489708, - "balance_loss_mlp": 1.02216005, - "epoch": 0.531369307079513, - "flos": 20190056538240.0, - "grad_norm": 1.8031911656804687, - "language_loss": 0.8618502, - "learning_rate": 1.893921490881035e-06, - "loss": 0.88328624, - "num_input_tokens_seen": 190003290, - "step": 8838, - "time_per_iteration": 4.327972888946533 - }, - { - "auxiliary_loss_clip": 0.01097617, - "auxiliary_loss_mlp": 0.01036447, - "balance_loss_clip": 1.04136765, - "balance_loss_mlp": 1.02366185, - "epoch": 0.5314294303321809, - "flos": 18880143356160.0, - "grad_norm": 1.7768925166398193, - "language_loss": 0.72961235, - "learning_rate": 1.8935325794085906e-06, - "loss": 0.75095296, - "num_input_tokens_seen": 190023260, - "step": 8839, - "time_per_iteration": 4.2734081745147705 - }, - { - "auxiliary_loss_clip": 0.0110159, - "auxiliary_loss_mlp": 0.01042278, - "balance_loss_clip": 1.04086304, - "balance_loss_mlp": 1.02885473, - "epoch": 0.531489553584849, - "flos": 23040035297280.0, - "grad_norm": 1.7238696185302183, - "language_loss": 0.76902539, - "learning_rate": 1.8931436719734023e-06, - "loss": 0.79046404, - "num_input_tokens_seen": 190042035, - "step": 8840, - "time_per_iteration": 2.708387613296509 - }, - { - "auxiliary_loss_clip": 0.01085488, - "auxiliary_loss_mlp": 0.01033907, - "balance_loss_clip": 1.04072022, - "balance_loss_mlp": 1.01934612, - "epoch": 0.5315496768375169, - "flos": 19790478668160.0, - "grad_norm": 2.0047240823259385, - "language_loss": 0.77301592, - "learning_rate": 1.892754768590216e-06, - "loss": 0.7942099, - "num_input_tokens_seen": 190057545, - "step": 8841, - "time_per_iteration": 2.6982758045196533 - }, - { - "auxiliary_loss_clip": 0.0102526, - "auxiliary_loss_mlp": 0.01022764, - "balance_loss_clip": 1.01826656, - "balance_loss_mlp": 1.02119017, - "epoch": 0.5316098000901849, - "flos": 71023228185600.0, - "grad_norm": 0.6981779601463162, - "language_loss": 0.56741858, - "learning_rate": 1.8923658692737793e-06, - "loss": 0.58789885, - "num_input_tokens_seen": 190123800, - "step": 8842, - "time_per_iteration": 4.895024299621582 - }, - { - "auxiliary_loss_clip": 0.01102673, - "auxiliary_loss_mlp": 0.01041259, - "balance_loss_clip": 1.04331183, - "balance_loss_mlp": 1.02621484, - "epoch": 0.5316699233428529, - "flos": 16435560470400.0, - "grad_norm": 1.8735975877067965, - "language_loss": 0.73998511, - "learning_rate": 1.8919769740388407e-06, - "loss": 0.76142448, - "num_input_tokens_seen": 190141625, - "step": 8843, - "time_per_iteration": 2.66169810295105 - }, - { - "auxiliary_loss_clip": 0.01023627, - "auxiliary_loss_mlp": 0.0100589, - "balance_loss_clip": 1.01690733, - "balance_loss_mlp": 1.00456095, - "epoch": 0.5317300465955208, - "flos": 67420814302080.0, - "grad_norm": 0.8814346849515853, - "language_loss": 0.61057651, - "learning_rate": 1.891588082900145e-06, - "loss": 0.63087165, - "num_input_tokens_seen": 190198110, - "step": 8844, - "time_per_iteration": 3.297545909881592 - }, - { - "auxiliary_loss_clip": 0.01032752, - "auxiliary_loss_mlp": 0.01005725, - "balance_loss_clip": 1.01528263, - "balance_loss_mlp": 1.00425863, - "epoch": 0.5317901698481888, - "flos": 59508075340800.0, - "grad_norm": 0.8422745451421196, - "language_loss": 0.62147105, - "learning_rate": 1.8911991958724411e-06, - "loss": 0.64185584, - "num_input_tokens_seen": 190259950, - "step": 8845, - "time_per_iteration": 3.1747312545776367 - }, - { - "auxiliary_loss_clip": 0.01088974, - "auxiliary_loss_mlp": 0.01040872, - "balance_loss_clip": 1.04063165, - "balance_loss_mlp": 1.02521944, - "epoch": 0.5318502931008567, - "flos": 19129219240320.0, - "grad_norm": 1.8386701394288745, - "language_loss": 0.74980247, - "learning_rate": 1.890810312970474e-06, - "loss": 0.77110094, - "num_input_tokens_seen": 190278265, - "step": 8846, - "time_per_iteration": 2.734652519226074 - }, - { - "auxiliary_loss_clip": 0.01111858, - "auxiliary_loss_mlp": 0.01034985, - "balance_loss_clip": 1.04369533, - "balance_loss_mlp": 1.0226109, - "epoch": 0.5319104163535248, - "flos": 24681045070080.0, - "grad_norm": 1.562458752543025, - "language_loss": 0.75478411, - "learning_rate": 1.8904214342089903e-06, - "loss": 0.77625251, - "num_input_tokens_seen": 190298400, - "step": 8847, - "time_per_iteration": 2.7175981998443604 - }, - { - "auxiliary_loss_clip": 0.0110005, - "auxiliary_loss_mlp": 0.01032122, - "balance_loss_clip": 1.04175198, - "balance_loss_mlp": 1.0193609, - "epoch": 0.5319705396061927, - "flos": 19385513758080.0, - "grad_norm": 1.5938668259379032, - "language_loss": 0.87875456, - "learning_rate": 1.8900325596027378e-06, - "loss": 0.90007627, - "num_input_tokens_seen": 190316235, - "step": 8848, - "time_per_iteration": 2.777731418609619 - }, - { - "auxiliary_loss_clip": 0.01084561, - "auxiliary_loss_mlp": 0.01041363, - "balance_loss_clip": 1.04119325, - "balance_loss_mlp": 1.02549624, - "epoch": 0.5320306628588607, - "flos": 18259319664000.0, - "grad_norm": 2.1051582434291833, - "language_loss": 0.74326992, - "learning_rate": 1.8896436891664609e-06, - "loss": 0.76452917, - "num_input_tokens_seen": 190335060, - "step": 8849, - "time_per_iteration": 2.7248313426971436 - }, - { - "auxiliary_loss_clip": 0.01107496, - "auxiliary_loss_mlp": 0.01030316, - "balance_loss_clip": 1.03895473, - "balance_loss_mlp": 1.0154624, - "epoch": 0.5320907861115286, - "flos": 23732321097600.0, - "grad_norm": 1.8915242874982603, - "language_loss": 0.79657137, - "learning_rate": 1.8892548229149066e-06, - "loss": 0.81794947, - "num_input_tokens_seen": 190353265, - "step": 8850, - "time_per_iteration": 2.7357401847839355 - }, - { - "auxiliary_loss_clip": 0.01121659, - "auxiliary_loss_mlp": 0.01031858, - "balance_loss_clip": 1.04192996, - "balance_loss_mlp": 1.01804209, - "epoch": 0.5321509093641966, - "flos": 34495251321600.0, - "grad_norm": 1.633301633467878, - "language_loss": 0.55076206, - "learning_rate": 1.888865960862821e-06, - "loss": 0.57229722, - "num_input_tokens_seen": 190376575, - "step": 8851, - "time_per_iteration": 2.730081081390381 - }, - { - "auxiliary_loss_clip": 0.01110617, - "auxiliary_loss_mlp": 0.01036207, - "balance_loss_clip": 1.04243159, - "balance_loss_mlp": 1.0228914, - "epoch": 0.5322110326168645, - "flos": 20010934391040.0, - "grad_norm": 1.5393101812132837, - "language_loss": 0.68206942, - "learning_rate": 1.8884771030249484e-06, - "loss": 0.70353764, - "num_input_tokens_seen": 190395185, - "step": 8852, - "time_per_iteration": 2.685267925262451 - }, - { - "auxiliary_loss_clip": 0.01020981, - "auxiliary_loss_mlp": 0.00752764, - "balance_loss_clip": 1.01425028, - "balance_loss_mlp": 0.99977398, - "epoch": 0.5322711558695326, - "flos": 64631164435200.0, - "grad_norm": 0.7921902417648442, - "language_loss": 0.62794167, - "learning_rate": 1.8880882494160357e-06, - "loss": 0.64567912, - "num_input_tokens_seen": 190452595, - "step": 8853, - "time_per_iteration": 3.154197931289673 - }, - { - "auxiliary_loss_clip": 0.01113411, - "auxiliary_loss_mlp": 0.01027799, - "balance_loss_clip": 1.04064846, - "balance_loss_mlp": 1.01379788, - "epoch": 0.5323312791222005, - "flos": 14939342421120.0, - "grad_norm": 2.437651920606879, - "language_loss": 0.79789698, - "learning_rate": 1.8876994000508278e-06, - "loss": 0.81930912, - "num_input_tokens_seen": 190469140, - "step": 8854, - "time_per_iteration": 2.6569535732269287 - }, - { - "auxiliary_loss_clip": 0.01092841, - "auxiliary_loss_mlp": 0.0102808, - "balance_loss_clip": 1.0418992, - "balance_loss_mlp": 1.01586115, - "epoch": 0.5323914023748685, - "flos": 23440834229760.0, - "grad_norm": 1.7182223658194644, - "language_loss": 0.73290253, - "learning_rate": 1.8873105549440698e-06, - "loss": 0.75411177, - "num_input_tokens_seen": 190489015, - "step": 8855, - "time_per_iteration": 2.6984002590179443 - }, - { - "auxiliary_loss_clip": 0.01095667, - "auxiliary_loss_mlp": 0.0077104, - "balance_loss_clip": 1.03969502, - "balance_loss_mlp": 1.00030267, - "epoch": 0.5324515256275365, - "flos": 26286180134400.0, - "grad_norm": 1.9960339019119333, - "language_loss": 0.6505388, - "learning_rate": 1.886921714110507e-06, - "loss": 0.66920584, - "num_input_tokens_seen": 190508065, - "step": 8856, - "time_per_iteration": 2.7057278156280518 - }, - { - "auxiliary_loss_clip": 0.01100444, - "auxiliary_loss_mlp": 0.0103908, - "balance_loss_clip": 1.04079795, - "balance_loss_mlp": 1.02341616, - "epoch": 0.5325116488802044, - "flos": 26870913636480.0, - "grad_norm": 2.078757662178109, - "language_loss": 0.77651089, - "learning_rate": 1.8865328775648842e-06, - "loss": 0.79790616, - "num_input_tokens_seen": 190527045, - "step": 8857, - "time_per_iteration": 2.764199733734131 - }, - { - "auxiliary_loss_clip": 0.01092407, - "auxiliary_loss_mlp": 0.01034608, - "balance_loss_clip": 1.04279578, - "balance_loss_mlp": 1.02039194, - "epoch": 0.5325717721328724, - "flos": 25884734757120.0, - "grad_norm": 2.3746118235231592, - "language_loss": 0.70823711, - "learning_rate": 1.8861440453219456e-06, - "loss": 0.72950727, - "num_input_tokens_seen": 190544075, - "step": 8858, - "time_per_iteration": 2.735534191131592 - }, - { - "auxiliary_loss_clip": 0.01108427, - "auxiliary_loss_mlp": 0.01040186, - "balance_loss_clip": 1.0411067, - "balance_loss_mlp": 1.02518916, - "epoch": 0.5326318953855403, - "flos": 21799321666560.0, - "grad_norm": 1.83105211007431, - "language_loss": 0.69232476, - "learning_rate": 1.8857552173964367e-06, - "loss": 0.71381092, - "num_input_tokens_seen": 190566030, - "step": 8859, - "time_per_iteration": 2.773764133453369 - }, - { - "auxiliary_loss_clip": 0.01109944, - "auxiliary_loss_mlp": 0.01028838, - "balance_loss_clip": 1.04517436, - "balance_loss_mlp": 1.01671481, - "epoch": 0.5326920186382084, - "flos": 20922921728640.0, - "grad_norm": 1.8423028887831514, - "language_loss": 0.69617528, - "learning_rate": 1.8853663938031013e-06, - "loss": 0.71756315, - "num_input_tokens_seen": 190585605, - "step": 8860, - "time_per_iteration": 2.689471483230591 - }, - { - "auxiliary_loss_clip": 0.01102885, - "auxiliary_loss_mlp": 0.01035827, - "balance_loss_clip": 1.0451107, - "balance_loss_mlp": 1.02258921, - "epoch": 0.5327521418908763, - "flos": 21433427775360.0, - "grad_norm": 2.3281195979693297, - "language_loss": 0.78340018, - "learning_rate": 1.884977574556683e-06, - "loss": 0.80478734, - "num_input_tokens_seen": 190604625, - "step": 8861, - "time_per_iteration": 2.66679048538208 - }, - { - "auxiliary_loss_clip": 0.01077125, - "auxiliary_loss_mlp": 0.01040454, - "balance_loss_clip": 1.03987145, - "balance_loss_mlp": 1.02606571, - "epoch": 0.5328122651435443, - "flos": 21760250647680.0, - "grad_norm": 1.7664447291359346, - "language_loss": 0.85554659, - "learning_rate": 1.8845887596719279e-06, - "loss": 0.87672234, - "num_input_tokens_seen": 190625060, - "step": 8862, - "time_per_iteration": 2.7928006649017334 - }, - { - "auxiliary_loss_clip": 0.0109879, - "auxiliary_loss_mlp": 0.01039782, - "balance_loss_clip": 1.03952289, - "balance_loss_mlp": 1.0237242, - "epoch": 0.5328723883962122, - "flos": 18296487262080.0, - "grad_norm": 2.2696975914187116, - "language_loss": 0.62147439, - "learning_rate": 1.8841999491635778e-06, - "loss": 0.64286011, - "num_input_tokens_seen": 190643150, - "step": 8863, - "time_per_iteration": 2.685253381729126 - }, - { - "auxiliary_loss_clip": 0.01098767, - "auxiliary_loss_mlp": 0.01040661, - "balance_loss_clip": 1.04511809, - "balance_loss_mlp": 1.02661765, - "epoch": 0.5329325116488802, - "flos": 25374911068800.0, - "grad_norm": 1.8529881391436633, - "language_loss": 0.73310483, - "learning_rate": 1.883811143046377e-06, - "loss": 0.75449914, - "num_input_tokens_seen": 190662725, - "step": 8864, - "time_per_iteration": 2.703639030456543 - }, - { - "auxiliary_loss_clip": 0.01120661, - "auxiliary_loss_mlp": 0.01035736, - "balance_loss_clip": 1.04301071, - "balance_loss_mlp": 1.02275968, - "epoch": 0.5329926349015481, - "flos": 25592098654080.0, - "grad_norm": 1.6333657309737846, - "language_loss": 0.64201105, - "learning_rate": 1.8834223413350702e-06, - "loss": 0.66357499, - "num_input_tokens_seen": 190683680, - "step": 8865, - "time_per_iteration": 2.691087245941162 - }, - { - "auxiliary_loss_clip": 0.01113033, - "auxiliary_loss_mlp": 0.01029706, - "balance_loss_clip": 1.0424211, - "balance_loss_mlp": 1.01641965, - "epoch": 0.5330527581542162, - "flos": 22889605138560.0, - "grad_norm": 3.0767575494694985, - "language_loss": 0.78091645, - "learning_rate": 1.8830335440443989e-06, - "loss": 0.80234385, - "num_input_tokens_seen": 190703350, - "step": 8866, - "time_per_iteration": 2.674612283706665 - }, - { - "auxiliary_loss_clip": 0.01108068, - "auxiliary_loss_mlp": 0.0103023, - "balance_loss_clip": 1.04092908, - "balance_loss_mlp": 1.01696241, - "epoch": 0.5331128814068841, - "flos": 16026752805120.0, - "grad_norm": 1.842224927457961, - "language_loss": 0.73840493, - "learning_rate": 1.882644751189108e-06, - "loss": 0.75978798, - "num_input_tokens_seen": 190721170, - "step": 8867, - "time_per_iteration": 2.6963648796081543 - }, - { - "auxiliary_loss_clip": 0.01098718, - "auxiliary_loss_mlp": 0.01039247, - "balance_loss_clip": 1.04040504, - "balance_loss_mlp": 1.02402985, - "epoch": 0.5331730046595521, - "flos": 39344699629440.0, - "grad_norm": 1.5703549780422514, - "language_loss": 0.71881396, - "learning_rate": 1.88225596278394e-06, - "loss": 0.74019361, - "num_input_tokens_seen": 190743795, - "step": 8868, - "time_per_iteration": 2.830118417739868 - }, - { - "auxiliary_loss_clip": 0.01090763, - "auxiliary_loss_mlp": 0.01034625, - "balance_loss_clip": 1.04197335, - "balance_loss_mlp": 1.0212791, - "epoch": 0.5332331279122201, - "flos": 24024382583040.0, - "grad_norm": 5.550281122060094, - "language_loss": 0.78397369, - "learning_rate": 1.881867178843637e-06, - "loss": 0.80522758, - "num_input_tokens_seen": 190761560, - "step": 8869, - "time_per_iteration": 2.738565444946289 - }, - { - "auxiliary_loss_clip": 0.01114623, - "auxiliary_loss_mlp": 0.01037147, - "balance_loss_clip": 1.0432862, - "balance_loss_mlp": 1.02336633, - "epoch": 0.533293251164888, - "flos": 17129318728320.0, - "grad_norm": 1.7588400416982446, - "language_loss": 0.75840724, - "learning_rate": 1.8814783993829434e-06, - "loss": 0.77992487, - "num_input_tokens_seen": 190778875, - "step": 8870, - "time_per_iteration": 2.598963499069214 - }, - { - "auxiliary_loss_clip": 0.01100618, - "auxiliary_loss_mlp": 0.01038316, - "balance_loss_clip": 1.04231286, - "balance_loss_mlp": 1.02373052, - "epoch": 0.533353374417556, - "flos": 22126360020480.0, - "grad_norm": 5.617051153369423, - "language_loss": 0.75663799, - "learning_rate": 1.8810896244165997e-06, - "loss": 0.7780273, - "num_input_tokens_seen": 190799830, - "step": 8871, - "time_per_iteration": 2.7459628582000732 - }, - { - "auxiliary_loss_clip": 0.01099152, - "auxiliary_loss_mlp": 0.0103356, - "balance_loss_clip": 1.04201055, - "balance_loss_mlp": 1.0202924, - "epoch": 0.533413497670224, - "flos": 15011091838080.0, - "grad_norm": 1.8041252581471448, - "language_loss": 0.7247498, - "learning_rate": 1.8807008539593498e-06, - "loss": 0.74607694, - "num_input_tokens_seen": 190817155, - "step": 8872, - "time_per_iteration": 2.6604373455047607 - }, - { - "auxiliary_loss_clip": 0.01100126, - "auxiliary_loss_mlp": 0.0104147, - "balance_loss_clip": 1.04733372, - "balance_loss_mlp": 1.02694392, - "epoch": 0.533473620922892, - "flos": 19609955890560.0, - "grad_norm": 1.7875555414889834, - "language_loss": 0.65306997, - "learning_rate": 1.880312088025936e-06, - "loss": 0.67448598, - "num_input_tokens_seen": 190835240, - "step": 8873, - "time_per_iteration": 2.6587424278259277 - }, - { - "auxiliary_loss_clip": 0.01098214, - "auxiliary_loss_mlp": 0.0104372, - "balance_loss_clip": 1.04254389, - "balance_loss_mlp": 1.03035116, - "epoch": 0.5335337441755599, - "flos": 14282644020480.0, - "grad_norm": 2.157272820213575, - "language_loss": 0.80225539, - "learning_rate": 1.879923326631099e-06, - "loss": 0.82367474, - "num_input_tokens_seen": 190851620, - "step": 8874, - "time_per_iteration": 2.723454475402832 - }, - { - "auxiliary_loss_clip": 0.01112328, - "auxiliary_loss_mlp": 0.01030163, - "balance_loss_clip": 1.04300058, - "balance_loss_mlp": 1.01653171, - "epoch": 0.5335938674282279, - "flos": 20814830726400.0, - "grad_norm": 1.8315602861194333, - "language_loss": 0.69789159, - "learning_rate": 1.879534569789582e-06, - "loss": 0.71931654, - "num_input_tokens_seen": 190870545, - "step": 8875, - "time_per_iteration": 2.6051578521728516 - }, - { - "auxiliary_loss_clip": 0.01045431, - "auxiliary_loss_mlp": 0.01001312, - "balance_loss_clip": 1.01922286, - "balance_loss_mlp": 0.99979252, - "epoch": 0.5336539906808958, - "flos": 71396448451200.0, - "grad_norm": 0.7211200965927701, - "language_loss": 0.59631079, - "learning_rate": 1.879145817516126e-06, - "loss": 0.61677825, - "num_input_tokens_seen": 190931995, - "step": 8876, - "time_per_iteration": 3.3114185333251953 - }, - { - "auxiliary_loss_clip": 0.01113481, - "auxiliary_loss_mlp": 0.01040016, - "balance_loss_clip": 1.04467189, - "balance_loss_mlp": 1.02705741, - "epoch": 0.5337141139335638, - "flos": 20152996680960.0, - "grad_norm": 1.6786856291224888, - "language_loss": 0.74847406, - "learning_rate": 1.8787570698254727e-06, - "loss": 0.77000904, - "num_input_tokens_seen": 190949890, - "step": 8877, - "time_per_iteration": 4.474783182144165 - }, - { - "auxiliary_loss_clip": 0.01030394, - "auxiliary_loss_mlp": 0.01002162, - "balance_loss_clip": 1.01585436, - "balance_loss_mlp": 1.00046921, - "epoch": 0.5337742371862317, - "flos": 67728387484800.0, - "grad_norm": 0.7582021069840851, - "language_loss": 0.57155037, - "learning_rate": 1.8783683267323629e-06, - "loss": 0.59187591, - "num_input_tokens_seen": 191008480, - "step": 8878, - "time_per_iteration": 4.623803615570068 - }, - { - "auxiliary_loss_clip": 0.0112711, - "auxiliary_loss_mlp": 0.0103613, - "balance_loss_clip": 1.04414368, - "balance_loss_mlp": 1.02169418, - "epoch": 0.5338343604388998, - "flos": 25008909436800.0, - "grad_norm": 1.4672061419232192, - "language_loss": 0.72301328, - "learning_rate": 1.8779795882515395e-06, - "loss": 0.74464571, - "num_input_tokens_seen": 191028995, - "step": 8879, - "time_per_iteration": 2.646631956100464 - }, - { - "auxiliary_loss_clip": 0.01126385, - "auxiliary_loss_mlp": 0.01039416, - "balance_loss_clip": 1.04535294, - "balance_loss_mlp": 1.02487254, - "epoch": 0.5338944836915677, - "flos": 17601256546560.0, - "grad_norm": 2.878615745391383, - "language_loss": 0.83403212, - "learning_rate": 1.8775908543977416e-06, - "loss": 0.85569012, - "num_input_tokens_seen": 191045285, - "step": 8880, - "time_per_iteration": 2.578953504562378 - }, - { - "auxiliary_loss_clip": 0.01053817, - "auxiliary_loss_mlp": 0.01036139, - "balance_loss_clip": 1.03627348, - "balance_loss_mlp": 1.02279377, - "epoch": 0.5339546069442357, - "flos": 21724124544000.0, - "grad_norm": 1.3711441541735603, - "language_loss": 0.79637486, - "learning_rate": 1.8772021251857107e-06, - "loss": 0.81727445, - "num_input_tokens_seen": 191066105, - "step": 8881, - "time_per_iteration": 4.335238695144653 - }, - { - "auxiliary_loss_clip": 0.0102058, - "auxiliary_loss_mlp": 0.00999984, - "balance_loss_clip": 1.01616335, - "balance_loss_mlp": 0.99846381, - "epoch": 0.5340147301969036, - "flos": 69723583315200.0, - "grad_norm": 0.7924040124288975, - "language_loss": 0.59248376, - "learning_rate": 1.8768134006301882e-06, - "loss": 0.61268938, - "num_input_tokens_seen": 191126315, - "step": 8882, - "time_per_iteration": 3.1252357959747314 - }, - { - "auxiliary_loss_clip": 0.01025577, - "auxiliary_loss_mlp": 0.01019116, - "balance_loss_clip": 1.01780772, - "balance_loss_mlp": 1.01768577, - "epoch": 0.5340748534495716, - "flos": 63880701580800.0, - "grad_norm": 0.8651438881324313, - "language_loss": 0.63574433, - "learning_rate": 1.876424680745913e-06, - "loss": 0.65619123, - "num_input_tokens_seen": 191174240, - "step": 8883, - "time_per_iteration": 3.0245001316070557 - }, - { - "auxiliary_loss_clip": 0.01079245, - "auxiliary_loss_mlp": 0.01040102, - "balance_loss_clip": 1.03873086, - "balance_loss_mlp": 1.02523685, - "epoch": 0.5341349767022396, - "flos": 28694313694080.0, - "grad_norm": 2.1049960022330385, - "language_loss": 0.8200773, - "learning_rate": 1.8760359655476272e-06, - "loss": 0.8412708, - "num_input_tokens_seen": 191193335, - "step": 8884, - "time_per_iteration": 2.8096158504486084 - }, - { - "auxiliary_loss_clip": 0.01088886, - "auxiliary_loss_mlp": 0.01042992, - "balance_loss_clip": 1.0403688, - "balance_loss_mlp": 1.02865684, - "epoch": 0.5341950999549075, - "flos": 16289691338880.0, - "grad_norm": 1.6281705583461854, - "language_loss": 0.72372848, - "learning_rate": 1.8756472550500695e-06, - "loss": 0.74504721, - "num_input_tokens_seen": 191210900, - "step": 8885, - "time_per_iteration": 2.6555016040802 - }, - { - "auxiliary_loss_clip": 0.01103878, - "auxiliary_loss_mlp": 0.01037971, - "balance_loss_clip": 1.04014146, - "balance_loss_mlp": 1.02301598, - "epoch": 0.5342552232075756, - "flos": 14355650413440.0, - "grad_norm": 2.9046192596208846, - "language_loss": 0.79004246, - "learning_rate": 1.87525854926798e-06, - "loss": 0.81146097, - "num_input_tokens_seen": 191226730, - "step": 8886, - "time_per_iteration": 2.6476478576660156 - }, - { - "auxiliary_loss_clip": 0.01083524, - "auxiliary_loss_mlp": 0.00772223, - "balance_loss_clip": 1.04013681, - "balance_loss_mlp": 1.00027037, - "epoch": 0.5343153464602435, - "flos": 30297976300800.0, - "grad_norm": 1.5332505330022492, - "language_loss": 0.750615, - "learning_rate": 1.8748698482160996e-06, - "loss": 0.76917243, - "num_input_tokens_seen": 191250435, - "step": 8887, - "time_per_iteration": 2.7690041065216064 - }, - { - "auxiliary_loss_clip": 0.01095123, - "auxiliary_loss_mlp": 0.01034453, - "balance_loss_clip": 1.03800249, - "balance_loss_mlp": 1.02050543, - "epoch": 0.5343754697129115, - "flos": 15596292216960.0, - "grad_norm": 2.322348043408552, - "language_loss": 0.68717337, - "learning_rate": 1.8744811519091663e-06, - "loss": 0.70846909, - "num_input_tokens_seen": 191268315, - "step": 8888, - "time_per_iteration": 2.631999969482422 - }, - { - "auxiliary_loss_clip": 0.01118819, - "auxiliary_loss_mlp": 0.01041785, - "balance_loss_clip": 1.04266095, - "balance_loss_mlp": 1.02738404, - "epoch": 0.5344355929655794, - "flos": 16909617191040.0, - "grad_norm": 2.080624189448151, - "language_loss": 0.77346873, - "learning_rate": 1.8740924603619208e-06, - "loss": 0.79507482, - "num_input_tokens_seen": 191287000, - "step": 8889, - "time_per_iteration": 2.621675729751587 - }, - { - "auxiliary_loss_clip": 0.01122598, - "auxiliary_loss_mlp": 0.01042684, - "balance_loss_clip": 1.04449213, - "balance_loss_mlp": 1.02922511, - "epoch": 0.5344957162182474, - "flos": 16798186224000.0, - "grad_norm": 2.052201989860069, - "language_loss": 0.69323713, - "learning_rate": 1.873703773589102e-06, - "loss": 0.71489, - "num_input_tokens_seen": 191304565, - "step": 8890, - "time_per_iteration": 2.6052801609039307 - }, - { - "auxiliary_loss_clip": 0.01128191, - "auxiliary_loss_mlp": 0.01052498, - "balance_loss_clip": 1.04494905, - "balance_loss_mlp": 1.0359515, - "epoch": 0.5345558394709153, - "flos": 12705590413440.0, - "grad_norm": 2.21737658698942, - "language_loss": 0.77022809, - "learning_rate": 1.8733150916054483e-06, - "loss": 0.79203498, - "num_input_tokens_seen": 191318300, - "step": 8891, - "time_per_iteration": 2.533200263977051 - }, - { - "auxiliary_loss_clip": 0.01103794, - "auxiliary_loss_mlp": 0.01042349, - "balance_loss_clip": 1.04030669, - "balance_loss_mlp": 1.02807951, - "epoch": 0.5346159627235834, - "flos": 22455050400000.0, - "grad_norm": 2.8109589169570857, - "language_loss": 0.74259919, - "learning_rate": 1.872926414425699e-06, - "loss": 0.76406056, - "num_input_tokens_seen": 191337925, - "step": 8892, - "time_per_iteration": 2.674466609954834 - }, - { - "auxiliary_loss_clip": 0.01107598, - "auxiliary_loss_mlp": 0.01038096, - "balance_loss_clip": 1.04592252, - "balance_loss_mlp": 1.02414215, - "epoch": 0.5346760859762513, - "flos": 22415763899520.0, - "grad_norm": 1.9745937936433648, - "language_loss": 0.87865257, - "learning_rate": 1.8725377420645932e-06, - "loss": 0.90010953, - "num_input_tokens_seen": 191357120, - "step": 8893, - "time_per_iteration": 2.7012922763824463 - }, - { - "auxiliary_loss_clip": 0.0111971, - "auxiliary_loss_mlp": 0.01036459, - "balance_loss_clip": 1.04291701, - "balance_loss_mlp": 1.02377474, - "epoch": 0.5347362092289193, - "flos": 22816131868800.0, - "grad_norm": 1.9421223728327293, - "language_loss": 0.72379559, - "learning_rate": 1.872149074536869e-06, - "loss": 0.74535728, - "num_input_tokens_seen": 191375395, - "step": 8894, - "time_per_iteration": 2.590670108795166 - }, - { - "auxiliary_loss_clip": 0.01111441, - "auxiliary_loss_mlp": 0.01031552, - "balance_loss_clip": 1.04253268, - "balance_loss_mlp": 1.01799238, - "epoch": 0.5347963324815872, - "flos": 23219480666880.0, - "grad_norm": 1.965554622310178, - "language_loss": 0.74611443, - "learning_rate": 1.8717604118572648e-06, - "loss": 0.76754439, - "num_input_tokens_seen": 191395595, - "step": 8895, - "time_per_iteration": 2.6462347507476807 - }, - { - "auxiliary_loss_clip": 0.01089565, - "auxiliary_loss_mlp": 0.01036535, - "balance_loss_clip": 1.04067063, - "balance_loss_mlp": 1.02246881, - "epoch": 0.5348564557342552, - "flos": 22601350494720.0, - "grad_norm": 1.8148089507657776, - "language_loss": 0.76860476, - "learning_rate": 1.8713717540405178e-06, - "loss": 0.78986579, - "num_input_tokens_seen": 191413730, - "step": 8896, - "time_per_iteration": 2.6798579692840576 - }, - { - "auxiliary_loss_clip": 0.01093639, - "auxiliary_loss_mlp": 0.01027964, - "balance_loss_clip": 1.04279101, - "balance_loss_mlp": 1.01502943, - "epoch": 0.5349165789869232, - "flos": 18002378701440.0, - "grad_norm": 1.8518658883520687, - "language_loss": 0.78188956, - "learning_rate": 1.8709831011013676e-06, - "loss": 0.80310559, - "num_input_tokens_seen": 191432400, - "step": 8897, - "time_per_iteration": 2.6509950160980225 - }, - { - "auxiliary_loss_clip": 0.01113143, - "auxiliary_loss_mlp": 0.01032016, - "balance_loss_clip": 1.04366183, - "balance_loss_mlp": 1.01799703, - "epoch": 0.5349767022395912, - "flos": 17159770483200.0, - "grad_norm": 1.7403204910626056, - "language_loss": 0.75393677, - "learning_rate": 1.8705944530545509e-06, - "loss": 0.7753883, - "num_input_tokens_seen": 191448855, - "step": 8898, - "time_per_iteration": 2.682753086090088 - }, - { - "auxiliary_loss_clip": 0.01037971, - "auxiliary_loss_mlp": 0.01005108, - "balance_loss_clip": 1.0205543, - "balance_loss_mlp": 1.00373161, - "epoch": 0.5350368254922592, - "flos": 70992058158720.0, - "grad_norm": 0.9010106507685076, - "language_loss": 0.57955837, - "learning_rate": 1.8702058099148052e-06, - "loss": 0.59998918, - "num_input_tokens_seen": 191519690, - "step": 8899, - "time_per_iteration": 3.3475701808929443 - }, - { - "auxiliary_loss_clip": 0.01101715, - "auxiliary_loss_mlp": 0.0103468, - "balance_loss_clip": 1.04445124, - "balance_loss_mlp": 1.02107263, - "epoch": 0.5350969487449271, - "flos": 27417833095680.0, - "grad_norm": 2.547752496503206, - "language_loss": 0.69974548, - "learning_rate": 1.869817171696868e-06, - "loss": 0.72110939, - "num_input_tokens_seen": 191539380, - "step": 8900, - "time_per_iteration": 2.7260618209838867 - }, - { - "auxiliary_loss_clip": 0.01099442, - "auxiliary_loss_mlp": 0.01035798, - "balance_loss_clip": 1.03943968, - "balance_loss_mlp": 1.02212465, - "epoch": 0.5351570719975951, - "flos": 19316134638720.0, - "grad_norm": 1.7903210344042488, - "language_loss": 0.71756148, - "learning_rate": 1.8694285384154777e-06, - "loss": 0.73891389, - "num_input_tokens_seen": 191557400, - "step": 8901, - "time_per_iteration": 2.661510467529297 - }, - { - "auxiliary_loss_clip": 0.01087314, - "auxiliary_loss_mlp": 0.01036972, - "balance_loss_clip": 1.03631806, - "balance_loss_mlp": 1.02237511, - "epoch": 0.535217195250263, - "flos": 19828580019840.0, - "grad_norm": 1.7989041746924002, - "language_loss": 0.77021015, - "learning_rate": 1.8690399100853699e-06, - "loss": 0.791453, - "num_input_tokens_seen": 191575860, - "step": 8902, - "time_per_iteration": 2.69665789604187 - }, - { - "auxiliary_loss_clip": 0.01087231, - "auxiliary_loss_mlp": 0.01041891, - "balance_loss_clip": 1.04053831, - "balance_loss_mlp": 1.0283792, - "epoch": 0.535277318502931, - "flos": 22127868391680.0, - "grad_norm": 1.509633063766185, - "language_loss": 0.70147592, - "learning_rate": 1.868651286721281e-06, - "loss": 0.72276717, - "num_input_tokens_seen": 191595775, - "step": 8903, - "time_per_iteration": 2.676028251647949 - }, - { - "auxiliary_loss_clip": 0.0111537, - "auxiliary_loss_mlp": 0.00772296, - "balance_loss_clip": 1.04395127, - "balance_loss_mlp": 1.00028765, - "epoch": 0.5353374417555989, - "flos": 25045897466880.0, - "grad_norm": 1.6001480056643833, - "language_loss": 0.72911739, - "learning_rate": 1.86826266833795e-06, - "loss": 0.74799401, - "num_input_tokens_seen": 191617785, - "step": 8904, - "time_per_iteration": 2.7466139793395996 - }, - { - "auxiliary_loss_clip": 0.01099985, - "auxiliary_loss_mlp": 0.01041546, - "balance_loss_clip": 1.04453778, - "balance_loss_mlp": 1.02705002, - "epoch": 0.535397565008267, - "flos": 19388710068480.0, - "grad_norm": 1.8242307652956307, - "language_loss": 0.73365581, - "learning_rate": 1.8678740549501103e-06, - "loss": 0.7550711, - "num_input_tokens_seen": 191636900, - "step": 8905, - "time_per_iteration": 2.772406578063965 - }, - { - "auxiliary_loss_clip": 0.01105525, - "auxiliary_loss_mlp": 0.0103776, - "balance_loss_clip": 1.04141188, - "balance_loss_mlp": 1.02607787, - "epoch": 0.5354576882609349, - "flos": 21471205904640.0, - "grad_norm": 1.6628200467542797, - "language_loss": 0.83795619, - "learning_rate": 1.8674854465725005e-06, - "loss": 0.85938901, - "num_input_tokens_seen": 191656720, - "step": 8906, - "time_per_iteration": 2.7151100635528564 - }, - { - "auxiliary_loss_clip": 0.01115256, - "auxiliary_loss_mlp": 0.00771962, - "balance_loss_clip": 1.04406035, - "balance_loss_mlp": 1.00027847, - "epoch": 0.5355178115136029, - "flos": 20777519473920.0, - "grad_norm": 1.884591574516044, - "language_loss": 0.74096596, - "learning_rate": 1.8670968432198563e-06, - "loss": 0.75983804, - "num_input_tokens_seen": 191674445, - "step": 8907, - "time_per_iteration": 2.6978471279144287 - }, - { - "auxiliary_loss_clip": 0.01106969, - "auxiliary_loss_mlp": 0.01040191, - "balance_loss_clip": 1.04144001, - "balance_loss_mlp": 1.02508759, - "epoch": 0.5355779347662708, - "flos": 23514020190720.0, - "grad_norm": 2.160786888323469, - "language_loss": 0.76593792, - "learning_rate": 1.866708244906912e-06, - "loss": 0.7874096, - "num_input_tokens_seen": 191695000, - "step": 8908, - "time_per_iteration": 2.6536221504211426 - }, - { - "auxiliary_loss_clip": 0.01097449, - "auxiliary_loss_mlp": 0.00772377, - "balance_loss_clip": 1.04248428, - "balance_loss_mlp": 1.00030112, - "epoch": 0.5356380580189388, - "flos": 20303211358080.0, - "grad_norm": 3.03117864576072, - "language_loss": 0.740637, - "learning_rate": 1.8663196516484055e-06, - "loss": 0.75933528, - "num_input_tokens_seen": 191713295, - "step": 8909, - "time_per_iteration": 2.665473461151123 - }, - { - "auxiliary_loss_clip": 0.01082798, - "auxiliary_loss_mlp": 0.01042054, - "balance_loss_clip": 1.0436362, - "balance_loss_mlp": 1.02891159, - "epoch": 0.5356981812716068, - "flos": 21361642444800.0, - "grad_norm": 2.1999922776778233, - "language_loss": 0.84319562, - "learning_rate": 1.8659310634590702e-06, - "loss": 0.86444414, - "num_input_tokens_seen": 191732725, - "step": 8910, - "time_per_iteration": 2.715521812438965 - }, - { - "auxiliary_loss_clip": 0.01102329, - "auxiliary_loss_mlp": 0.0103318, - "balance_loss_clip": 1.04114723, - "balance_loss_mlp": 1.01928067, - "epoch": 0.5357583045242748, - "flos": 23111246010240.0, - "grad_norm": 1.6725390900013062, - "language_loss": 0.81822705, - "learning_rate": 1.8655424803536427e-06, - "loss": 0.8395822, - "num_input_tokens_seen": 191753765, - "step": 8911, - "time_per_iteration": 2.715254068374634 - }, - { - "auxiliary_loss_clip": 0.0108401, - "auxiliary_loss_mlp": 0.01044047, - "balance_loss_clip": 1.04012454, - "balance_loss_mlp": 1.03019536, - "epoch": 0.5358184277769428, - "flos": 21141761339520.0, - "grad_norm": 5.639232337071921, - "language_loss": 0.69078076, - "learning_rate": 1.8651539023468585e-06, - "loss": 0.71206129, - "num_input_tokens_seen": 191773560, - "step": 8912, - "time_per_iteration": 2.6743216514587402 - }, - { - "auxiliary_loss_clip": 0.01098459, - "auxiliary_loss_mlp": 0.01036297, - "balance_loss_clip": 1.04129279, - "balance_loss_mlp": 1.02273059, - "epoch": 0.5358785510296107, - "flos": 16282400878080.0, - "grad_norm": 2.041064157993178, - "language_loss": 0.71507263, - "learning_rate": 1.8647653294534509e-06, - "loss": 0.73642015, - "num_input_tokens_seen": 191791255, - "step": 8913, - "time_per_iteration": 2.6959731578826904 - }, - { - "auxiliary_loss_clip": 0.01092724, - "auxiliary_loss_mlp": 0.01039004, - "balance_loss_clip": 1.04161441, - "balance_loss_mlp": 1.02512836, - "epoch": 0.5359386742822787, - "flos": 16976877408000.0, - "grad_norm": 1.9206134465038889, - "language_loss": 0.72290546, - "learning_rate": 1.864376761688156e-06, - "loss": 0.74422276, - "num_input_tokens_seen": 191809325, - "step": 8914, - "time_per_iteration": 2.678020477294922 - }, - { - "auxiliary_loss_clip": 0.01104699, - "auxiliary_loss_mlp": 0.01039806, - "balance_loss_clip": 1.04611683, - "balance_loss_mlp": 1.02468383, - "epoch": 0.5359987975349466, - "flos": 20812927305600.0, - "grad_norm": 1.8719693529557881, - "language_loss": 0.70668626, - "learning_rate": 1.8639881990657079e-06, - "loss": 0.72813135, - "num_input_tokens_seen": 191829795, - "step": 8915, - "time_per_iteration": 2.653940200805664 - }, - { - "auxiliary_loss_clip": 0.01094002, - "auxiliary_loss_mlp": 0.01045487, - "balance_loss_clip": 1.04047489, - "balance_loss_mlp": 1.03118742, - "epoch": 0.5360589207876146, - "flos": 22199941031040.0, - "grad_norm": 1.5982896811499068, - "language_loss": 0.74664176, - "learning_rate": 1.8635996416008408e-06, - "loss": 0.76803666, - "num_input_tokens_seen": 191850840, - "step": 8916, - "time_per_iteration": 4.3477959632873535 - }, - { - "auxiliary_loss_clip": 0.01081313, - "auxiliary_loss_mlp": 0.00772126, - "balance_loss_clip": 1.04081666, - "balance_loss_mlp": 1.00021815, - "epoch": 0.5361190440402825, - "flos": 31394365084800.0, - "grad_norm": 1.8553858112595492, - "language_loss": 0.72677946, - "learning_rate": 1.863211089308289e-06, - "loss": 0.74531382, - "num_input_tokens_seen": 191869520, - "step": 8917, - "time_per_iteration": 2.808074712753296 - }, - { - "auxiliary_loss_clip": 0.01102423, - "auxiliary_loss_mlp": 0.01041518, - "balance_loss_clip": 1.0441047, - "balance_loss_mlp": 1.02715325, - "epoch": 0.5361791672929506, - "flos": 16069882060800.0, - "grad_norm": 1.960367430660897, - "language_loss": 0.71014392, - "learning_rate": 1.8628225422027865e-06, - "loss": 0.73158336, - "num_input_tokens_seen": 191887240, - "step": 8918, - "time_per_iteration": 4.185984134674072 - }, - { - "auxiliary_loss_clip": 0.01106012, - "auxiliary_loss_mlp": 0.01036881, - "balance_loss_clip": 1.0469594, - "balance_loss_mlp": 1.02306461, - "epoch": 0.5362392905456185, - "flos": 20740926493440.0, - "grad_norm": 1.4605213362212828, - "language_loss": 0.74976659, - "learning_rate": 1.862434000299067e-06, - "loss": 0.77119553, - "num_input_tokens_seen": 191905690, - "step": 8919, - "time_per_iteration": 2.694120407104492 - }, - { - "auxiliary_loss_clip": 0.01093376, - "auxiliary_loss_mlp": 0.01035953, - "balance_loss_clip": 1.04010797, - "balance_loss_mlp": 1.02207744, - "epoch": 0.5362994137982865, - "flos": 17340077779200.0, - "grad_norm": 1.9976483392210334, - "language_loss": 0.71690488, - "learning_rate": 1.862045463611864e-06, - "loss": 0.73819816, - "num_input_tokens_seen": 191920725, - "step": 8920, - "time_per_iteration": 2.6273410320281982 - }, - { - "auxiliary_loss_clip": 0.01105087, - "auxiliary_loss_mlp": 0.01040608, - "balance_loss_clip": 1.03961456, - "balance_loss_mlp": 1.02532554, - "epoch": 0.5363595370509544, - "flos": 42813957795840.0, - "grad_norm": 1.3877970230156793, - "language_loss": 0.68828928, - "learning_rate": 1.8616569321559105e-06, - "loss": 0.70974618, - "num_input_tokens_seen": 191944645, - "step": 8921, - "time_per_iteration": 4.31537938117981 - }, - { - "auxiliary_loss_clip": 0.01114121, - "auxiliary_loss_mlp": 0.01036194, - "balance_loss_clip": 1.04631782, - "balance_loss_mlp": 1.0227288, - "epoch": 0.5364196603036224, - "flos": 19171953446400.0, - "grad_norm": 1.8336561717381605, - "language_loss": 0.81926084, - "learning_rate": 1.86126840594594e-06, - "loss": 0.84076393, - "num_input_tokens_seen": 191962265, - "step": 8922, - "time_per_iteration": 2.6045267581939697 - }, - { - "auxiliary_loss_clip": 0.01117037, - "auxiliary_loss_mlp": 0.01031036, - "balance_loss_clip": 1.04637003, - "balance_loss_mlp": 1.01782727, - "epoch": 0.5364797835562904, - "flos": 17931060247680.0, - "grad_norm": 2.029402038210475, - "language_loss": 0.76969302, - "learning_rate": 1.860879884996686e-06, - "loss": 0.79117376, - "num_input_tokens_seen": 191978850, - "step": 8923, - "time_per_iteration": 2.627131223678589 - }, - { - "auxiliary_loss_clip": 0.01097305, - "auxiliary_loss_mlp": 0.01035531, - "balance_loss_clip": 1.04099584, - "balance_loss_mlp": 1.02144074, - "epoch": 0.5365399068089584, - "flos": 30228058477440.0, - "grad_norm": 1.4696173336709724, - "language_loss": 0.70680726, - "learning_rate": 1.8604913693228804e-06, - "loss": 0.72813559, - "num_input_tokens_seen": 192002000, - "step": 8924, - "time_per_iteration": 2.7947139739990234 - }, - { - "auxiliary_loss_clip": 0.01093943, - "auxiliary_loss_mlp": 0.01040337, - "balance_loss_clip": 1.0430336, - "balance_loss_mlp": 1.02501917, - "epoch": 0.5366000300616264, - "flos": 24891696380160.0, - "grad_norm": 2.0937693746484456, - "language_loss": 0.87335229, - "learning_rate": 1.8601028589392558e-06, - "loss": 0.8946951, - "num_input_tokens_seen": 192019100, - "step": 8925, - "time_per_iteration": 2.768362045288086 - }, - { - "auxiliary_loss_clip": 0.01123484, - "auxiliary_loss_mlp": 0.01031699, - "balance_loss_clip": 1.04188776, - "balance_loss_mlp": 1.01764417, - "epoch": 0.5366601533142943, - "flos": 29826649013760.0, - "grad_norm": 1.5047259348419413, - "language_loss": 0.77962756, - "learning_rate": 1.8597143538605455e-06, - "loss": 0.80117941, - "num_input_tokens_seen": 192041660, - "step": 8926, - "time_per_iteration": 2.715451955795288 - }, - { - "auxiliary_loss_clip": 0.01087054, - "auxiliary_loss_mlp": 0.01032082, - "balance_loss_clip": 1.04502523, - "balance_loss_mlp": 1.01944578, - "epoch": 0.5367202765669623, - "flos": 27199352620800.0, - "grad_norm": 1.5425961750104156, - "language_loss": 0.66906953, - "learning_rate": 1.85932585410148e-06, - "loss": 0.69026089, - "num_input_tokens_seen": 192063540, - "step": 8927, - "time_per_iteration": 2.7890443801879883 - }, - { - "auxiliary_loss_clip": 0.0111207, - "auxiliary_loss_mlp": 0.0103082, - "balance_loss_clip": 1.04044211, - "balance_loss_mlp": 1.01719475, - "epoch": 0.5367803998196302, - "flos": 20229953569920.0, - "grad_norm": 1.7627850836145547, - "language_loss": 0.73644257, - "learning_rate": 1.8589373596767929e-06, - "loss": 0.75787145, - "num_input_tokens_seen": 192081760, - "step": 8928, - "time_per_iteration": 2.6679322719573975 - }, - { - "auxiliary_loss_clip": 0.01097621, - "auxiliary_loss_mlp": 0.01033753, - "balance_loss_clip": 1.03983617, - "balance_loss_mlp": 1.02038312, - "epoch": 0.5368405230722982, - "flos": 32154629374080.0, - "grad_norm": 1.8947277080350169, - "language_loss": 0.63138568, - "learning_rate": 1.8585488706012154e-06, - "loss": 0.65269947, - "num_input_tokens_seen": 192101620, - "step": 8929, - "time_per_iteration": 2.77915620803833 - }, - { - "auxiliary_loss_clip": 0.01112721, - "auxiliary_loss_mlp": 0.01035038, - "balance_loss_clip": 1.04284871, - "balance_loss_mlp": 1.02102494, - "epoch": 0.5369006463249661, - "flos": 26247935128320.0, - "grad_norm": 1.6504217106645076, - "language_loss": 0.65814567, - "learning_rate": 1.8581603868894781e-06, - "loss": 0.67962325, - "num_input_tokens_seen": 192121805, - "step": 8930, - "time_per_iteration": 2.671699285507202 - }, - { - "auxiliary_loss_clip": 0.01070837, - "auxiliary_loss_mlp": 0.01029378, - "balance_loss_clip": 1.03888655, - "balance_loss_mlp": 1.01519203, - "epoch": 0.5369607695776342, - "flos": 26211306234240.0, - "grad_norm": 1.4657060850123025, - "language_loss": 0.67106915, - "learning_rate": 1.8577719085563136e-06, - "loss": 0.69207126, - "num_input_tokens_seen": 192141765, - "step": 8931, - "time_per_iteration": 2.791450023651123 - }, - { - "auxiliary_loss_clip": 0.0107183, - "auxiliary_loss_mlp": 0.01035308, - "balance_loss_clip": 1.03937209, - "balance_loss_mlp": 1.02028155, - "epoch": 0.5370208928303021, - "flos": 25009017177600.0, - "grad_norm": 1.6675319791175172, - "language_loss": 0.76147091, - "learning_rate": 1.8573834356164525e-06, - "loss": 0.78254229, - "num_input_tokens_seen": 192161560, - "step": 8932, - "time_per_iteration": 2.817074775695801 - }, - { - "auxiliary_loss_clip": 0.0108812, - "auxiliary_loss_mlp": 0.01035166, - "balance_loss_clip": 1.04271507, - "balance_loss_mlp": 1.02086663, - "epoch": 0.5370810160829701, - "flos": 31792147274880.0, - "grad_norm": 1.7321457922490968, - "language_loss": 0.66103363, - "learning_rate": 1.8569949680846261e-06, - "loss": 0.68226647, - "num_input_tokens_seen": 192180190, - "step": 8933, - "time_per_iteration": 2.7999963760375977 - }, - { - "auxiliary_loss_clip": 0.01106374, - "auxiliary_loss_mlp": 0.0077107, - "balance_loss_clip": 1.04321599, - "balance_loss_mlp": 1.00030327, - "epoch": 0.537141139335638, - "flos": 23842602829440.0, - "grad_norm": 1.7096623259043264, - "language_loss": 0.83137345, - "learning_rate": 1.856606505975565e-06, - "loss": 0.8501479, - "num_input_tokens_seen": 192198855, - "step": 8934, - "time_per_iteration": 2.77140474319458 - }, - { - "auxiliary_loss_clip": 0.01083657, - "auxiliary_loss_mlp": 0.01038537, - "balance_loss_clip": 1.03906775, - "balance_loss_mlp": 1.02371967, - "epoch": 0.537201262588306, - "flos": 18508826511360.0, - "grad_norm": 1.9684207217946548, - "language_loss": 0.79907835, - "learning_rate": 1.856218049303999e-06, - "loss": 0.82030034, - "num_input_tokens_seen": 192216555, - "step": 8935, - "time_per_iteration": 2.714343547821045 - }, - { - "auxiliary_loss_clip": 0.01111571, - "auxiliary_loss_mlp": 0.01041616, - "balance_loss_clip": 1.04217649, - "balance_loss_mlp": 1.02750206, - "epoch": 0.537261385840974, - "flos": 25662950231040.0, - "grad_norm": 2.937428754588345, - "language_loss": 0.84070867, - "learning_rate": 1.855829598084659e-06, - "loss": 0.86224055, - "num_input_tokens_seen": 192236910, - "step": 8936, - "time_per_iteration": 2.6816179752349854 - }, - { - "auxiliary_loss_clip": 0.01092497, - "auxiliary_loss_mlp": 0.01030736, - "balance_loss_clip": 1.04575956, - "balance_loss_mlp": 1.018255, - "epoch": 0.537321509093642, - "flos": 40735017406080.0, - "grad_norm": 1.2320449417851727, - "language_loss": 0.72774732, - "learning_rate": 1.8554411523322754e-06, - "loss": 0.74897963, - "num_input_tokens_seen": 192260790, - "step": 8937, - "time_per_iteration": 2.9294662475585938 - }, - { - "auxiliary_loss_clip": 0.01097303, - "auxiliary_loss_mlp": 0.0103947, - "balance_loss_clip": 1.03866911, - "balance_loss_mlp": 1.02411556, - "epoch": 0.53738163234631, - "flos": 17238487138560.0, - "grad_norm": 2.4958463124017825, - "language_loss": 0.82070464, - "learning_rate": 1.8550527120615778e-06, - "loss": 0.84207237, - "num_input_tokens_seen": 192277230, - "step": 8938, - "time_per_iteration": 2.7016329765319824 - }, - { - "auxiliary_loss_clip": 0.01128942, - "auxiliary_loss_mlp": 0.01037787, - "balance_loss_clip": 1.04445028, - "balance_loss_mlp": 1.02425027, - "epoch": 0.5374417555989779, - "flos": 12821977457280.0, - "grad_norm": 2.39037719214814, - "language_loss": 0.80410939, - "learning_rate": 1.8546642772872957e-06, - "loss": 0.8257767, - "num_input_tokens_seen": 192292840, - "step": 8939, - "time_per_iteration": 2.588257312774658 - }, - { - "auxiliary_loss_clip": 0.01012372, - "auxiliary_loss_mlp": 0.01007323, - "balance_loss_clip": 1.01498079, - "balance_loss_mlp": 1.00561845, - "epoch": 0.5375018788516459, - "flos": 67256018703360.0, - "grad_norm": 0.706070728219951, - "language_loss": 0.52408826, - "learning_rate": 1.8542758480241589e-06, - "loss": 0.5442853, - "num_input_tokens_seen": 192358240, - "step": 8940, - "time_per_iteration": 3.276360273361206 - }, - { - "auxiliary_loss_clip": 0.01083174, - "auxiliary_loss_mlp": 0.01033229, - "balance_loss_clip": 1.04148936, - "balance_loss_mlp": 1.01995516, - "epoch": 0.5375620021043138, - "flos": 18114168804480.0, - "grad_norm": 2.0987581231461725, - "language_loss": 0.71804386, - "learning_rate": 1.8538874242868965e-06, - "loss": 0.73920786, - "num_input_tokens_seen": 192377370, - "step": 8941, - "time_per_iteration": 2.732537269592285 - }, - { - "auxiliary_loss_clip": 0.01092897, - "auxiliary_loss_mlp": 0.01030607, - "balance_loss_clip": 1.03881931, - "balance_loss_mlp": 1.01767242, - "epoch": 0.5376221253569818, - "flos": 23149383275520.0, - "grad_norm": 1.733585832372728, - "language_loss": 0.79825974, - "learning_rate": 1.853499006090237e-06, - "loss": 0.81949472, - "num_input_tokens_seen": 192396450, - "step": 8942, - "time_per_iteration": 2.723686695098877 - }, - { - "auxiliary_loss_clip": 0.01126783, - "auxiliary_loss_mlp": 0.01038334, - "balance_loss_clip": 1.04432559, - "balance_loss_mlp": 1.02416599, - "epoch": 0.5376822486096497, - "flos": 29972302663680.0, - "grad_norm": 1.8527940596038397, - "language_loss": 0.70161736, - "learning_rate": 1.853110593448911e-06, - "loss": 0.72326851, - "num_input_tokens_seen": 192417390, - "step": 8943, - "time_per_iteration": 2.683830499649048 - }, - { - "auxiliary_loss_clip": 0.01030181, - "auxiliary_loss_mlp": 0.01002794, - "balance_loss_clip": 1.01417148, - "balance_loss_mlp": 1.00145841, - "epoch": 0.5377423718623178, - "flos": 54168950874240.0, - "grad_norm": 0.8559023322108498, - "language_loss": 0.5964179, - "learning_rate": 1.852722186377645e-06, - "loss": 0.61674768, - "num_input_tokens_seen": 192478060, - "step": 8944, - "time_per_iteration": 3.195451498031616 - }, - { - "auxiliary_loss_clip": 0.01075816, - "auxiliary_loss_mlp": 0.01037224, - "balance_loss_clip": 1.04020023, - "balance_loss_mlp": 1.02198291, - "epoch": 0.5378024951149857, - "flos": 23257079228160.0, - "grad_norm": 2.0363151234070567, - "language_loss": 0.77896553, - "learning_rate": 1.852333784891169e-06, - "loss": 0.80009592, - "num_input_tokens_seen": 192495985, - "step": 8945, - "time_per_iteration": 2.7992632389068604 - }, - { - "auxiliary_loss_clip": 0.01114593, - "auxiliary_loss_mlp": 0.01035525, - "balance_loss_clip": 1.04309297, - "balance_loss_mlp": 1.02173805, - "epoch": 0.5378626183676537, - "flos": 24024095274240.0, - "grad_norm": 1.6722587357114949, - "language_loss": 0.68561995, - "learning_rate": 1.8519453890042112e-06, - "loss": 0.70712113, - "num_input_tokens_seen": 192515445, - "step": 8946, - "time_per_iteration": 2.6522717475891113 - }, - { - "auxiliary_loss_clip": 0.01078154, - "auxiliary_loss_mlp": 0.0104253, - "balance_loss_clip": 1.04271758, - "balance_loss_mlp": 1.02895761, - "epoch": 0.5379227416203216, - "flos": 27161789973120.0, - "grad_norm": 1.8248631368800923, - "language_loss": 0.76991701, - "learning_rate": 1.851556998731498e-06, - "loss": 0.79112387, - "num_input_tokens_seen": 192536530, - "step": 8947, - "time_per_iteration": 2.796123743057251 - }, - { - "auxiliary_loss_clip": 0.0111442, - "auxiliary_loss_mlp": 0.01032597, - "balance_loss_clip": 1.04487777, - "balance_loss_mlp": 1.01940608, - "epoch": 0.5379828648729896, - "flos": 24681619687680.0, - "grad_norm": 1.55307874766799, - "language_loss": 0.60198331, - "learning_rate": 1.8511686140877592e-06, - "loss": 0.6234535, - "num_input_tokens_seen": 192556075, - "step": 8948, - "time_per_iteration": 2.7054309844970703 - }, - { - "auxiliary_loss_clip": 0.01082153, - "auxiliary_loss_mlp": 0.01037517, - "balance_loss_clip": 1.03970575, - "balance_loss_mlp": 1.02415979, - "epoch": 0.5380429881256577, - "flos": 22523280284160.0, - "grad_norm": 1.6281037537893495, - "language_loss": 0.79697102, - "learning_rate": 1.8507802350877205e-06, - "loss": 0.81816769, - "num_input_tokens_seen": 192575535, - "step": 8949, - "time_per_iteration": 2.8140738010406494 - }, - { - "auxiliary_loss_clip": 0.01078335, - "auxiliary_loss_mlp": 0.01042356, - "balance_loss_clip": 1.03704572, - "balance_loss_mlp": 1.02679944, - "epoch": 0.5381031113783256, - "flos": 26979543342720.0, - "grad_norm": 2.0888170828860444, - "language_loss": 0.77963328, - "learning_rate": 1.850391861746111e-06, - "loss": 0.80084026, - "num_input_tokens_seen": 192594490, - "step": 8950, - "time_per_iteration": 2.7498505115509033 - }, - { - "auxiliary_loss_clip": 0.01110071, - "auxiliary_loss_mlp": 0.01029664, - "balance_loss_clip": 1.05072141, - "balance_loss_mlp": 1.01671791, - "epoch": 0.5381632346309936, - "flos": 24754087376640.0, - "grad_norm": 1.5816580812213883, - "language_loss": 0.72668755, - "learning_rate": 1.8500034940776573e-06, - "loss": 0.7480849, - "num_input_tokens_seen": 192615650, - "step": 8951, - "time_per_iteration": 2.7927658557891846 - }, - { - "auxiliary_loss_clip": 0.01122901, - "auxiliary_loss_mlp": 0.00772698, - "balance_loss_clip": 1.04232633, - "balance_loss_mlp": 1.00031877, - "epoch": 0.5382233578836615, - "flos": 15560058372480.0, - "grad_norm": 1.7038907930473366, - "language_loss": 0.74791837, - "learning_rate": 1.849615132097085e-06, - "loss": 0.76687431, - "num_input_tokens_seen": 192633840, - "step": 8952, - "time_per_iteration": 2.663555860519409 - }, - { - "auxiliary_loss_clip": 0.01103413, - "auxiliary_loss_mlp": 0.01034816, - "balance_loss_clip": 1.04635072, - "balance_loss_mlp": 1.02090442, - "epoch": 0.5382834811363295, - "flos": 25084501608960.0, - "grad_norm": 1.486507819644587, - "language_loss": 0.79733002, - "learning_rate": 1.8492267758191228e-06, - "loss": 0.81871235, - "num_input_tokens_seen": 192655890, - "step": 8953, - "time_per_iteration": 2.7213597297668457 - }, - { - "auxiliary_loss_clip": 0.01092412, - "auxiliary_loss_mlp": 0.01036672, - "balance_loss_clip": 1.04632258, - "balance_loss_mlp": 1.02147865, - "epoch": 0.5383436043889974, - "flos": 13297901685120.0, - "grad_norm": 1.8841614793520622, - "language_loss": 0.80665779, - "learning_rate": 1.8488384252584964e-06, - "loss": 0.82794857, - "num_input_tokens_seen": 192673025, - "step": 8954, - "time_per_iteration": 2.7119338512420654 - }, - { - "auxiliary_loss_clip": 0.01124989, - "auxiliary_loss_mlp": 0.0103348, - "balance_loss_clip": 1.04552889, - "balance_loss_mlp": 1.0192287, - "epoch": 0.5384037276416654, - "flos": 23039388852480.0, - "grad_norm": 2.080642260770838, - "language_loss": 0.76782274, - "learning_rate": 1.8484500804299318e-06, - "loss": 0.78940743, - "num_input_tokens_seen": 192692190, - "step": 8955, - "time_per_iteration": 4.170248746871948 - }, - { - "auxiliary_loss_clip": 0.01100368, - "auxiliary_loss_mlp": 0.01043375, - "balance_loss_clip": 1.04422796, - "balance_loss_mlp": 1.02911186, - "epoch": 0.5384638508943334, - "flos": 20631147552000.0, - "grad_norm": 1.64526518725267, - "language_loss": 0.78446829, - "learning_rate": 1.8480617413481557e-06, - "loss": 0.8059057, - "num_input_tokens_seen": 192710380, - "step": 8956, - "time_per_iteration": 4.346608638763428 - }, - { - "auxiliary_loss_clip": 0.01014882, - "auxiliary_loss_mlp": 0.01009567, - "balance_loss_clip": 1.01641572, - "balance_loss_mlp": 1.00802886, - "epoch": 0.5385239741470014, - "flos": 66737683491840.0, - "grad_norm": 0.8632221777835867, - "language_loss": 0.63366526, - "learning_rate": 1.8476734080278932e-06, - "loss": 0.6539098, - "num_input_tokens_seen": 192768995, - "step": 8957, - "time_per_iteration": 4.689607381820679 - }, - { - "auxiliary_loss_clip": 0.01003314, - "auxiliary_loss_mlp": 0.00999601, - "balance_loss_clip": 1.01686144, - "balance_loss_mlp": 0.99808067, - "epoch": 0.5385840973996693, - "flos": 64716058229760.0, - "grad_norm": 0.7163688318545376, - "language_loss": 0.5155347, - "learning_rate": 1.8472850804838705e-06, - "loss": 0.53556383, - "num_input_tokens_seen": 192825585, - "step": 8958, - "time_per_iteration": 3.263490676879883 - }, - { - "auxiliary_loss_clip": 0.01118278, - "auxiliary_loss_mlp": 0.01034558, - "balance_loss_clip": 1.04870462, - "balance_loss_mlp": 1.01945472, - "epoch": 0.5386442206523373, - "flos": 26141783460480.0, - "grad_norm": 1.5599827476789179, - "language_loss": 0.77335596, - "learning_rate": 1.8468967587308128e-06, - "loss": 0.79488432, - "num_input_tokens_seen": 192847335, - "step": 8959, - "time_per_iteration": 2.6936423778533936 - }, - { - "auxiliary_loss_clip": 0.01078149, - "auxiliary_loss_mlp": 0.01035897, - "balance_loss_clip": 1.04148221, - "balance_loss_mlp": 1.02258778, - "epoch": 0.5387043439050052, - "flos": 18251849635200.0, - "grad_norm": 2.554990268603387, - "language_loss": 0.84077597, - "learning_rate": 1.8465084427834455e-06, - "loss": 0.86191648, - "num_input_tokens_seen": 192862205, - "step": 8960, - "time_per_iteration": 4.281194686889648 - }, - { - "auxiliary_loss_clip": 0.01114712, - "auxiliary_loss_mlp": 0.01032916, - "balance_loss_clip": 1.0460726, - "balance_loss_mlp": 1.01955807, - "epoch": 0.5387644671576732, - "flos": 29788296266880.0, - "grad_norm": 1.4386251393877574, - "language_loss": 0.78275657, - "learning_rate": 1.8461201326564933e-06, - "loss": 0.80423284, - "num_input_tokens_seen": 192883695, - "step": 8961, - "time_per_iteration": 2.7518913745880127 - }, - { - "auxiliary_loss_clip": 0.01089107, - "auxiliary_loss_mlp": 0.01035524, - "balance_loss_clip": 1.041345, - "balance_loss_mlp": 1.02189803, - "epoch": 0.5388245904103413, - "flos": 22374466237440.0, - "grad_norm": 11.100507002897315, - "language_loss": 0.84070158, - "learning_rate": 1.845731828364681e-06, - "loss": 0.86194789, - "num_input_tokens_seen": 192900190, - "step": 8962, - "time_per_iteration": 2.745964288711548 - }, - { - "auxiliary_loss_clip": 0.01020426, - "auxiliary_loss_mlp": 0.01002497, - "balance_loss_clip": 1.01872444, - "balance_loss_mlp": 1.00114429, - "epoch": 0.5388847136630092, - "flos": 69807794751360.0, - "grad_norm": 0.7287303599556714, - "language_loss": 0.5418579, - "learning_rate": 1.8453435299227333e-06, - "loss": 0.56208712, - "num_input_tokens_seen": 192958675, - "step": 8963, - "time_per_iteration": 3.0952982902526855 - }, - { - "auxiliary_loss_clip": 0.01022568, - "auxiliary_loss_mlp": 0.01009564, - "balance_loss_clip": 1.01615238, - "balance_loss_mlp": 1.00817513, - "epoch": 0.5389448369156772, - "flos": 69822303845760.0, - "grad_norm": 1.4175775222807738, - "language_loss": 0.63305563, - "learning_rate": 1.8449552373453744e-06, - "loss": 0.65337688, - "num_input_tokens_seen": 193033135, - "step": 8964, - "time_per_iteration": 3.2670536041259766 - }, - { - "auxiliary_loss_clip": 0.01065573, - "auxiliary_loss_mlp": 0.01035006, - "balance_loss_clip": 1.04052043, - "balance_loss_mlp": 1.02049828, - "epoch": 0.5390049601683451, - "flos": 31722444933120.0, - "grad_norm": 1.4839412969014603, - "language_loss": 0.69941193, - "learning_rate": 1.8445669506473287e-06, - "loss": 0.72041768, - "num_input_tokens_seen": 193055570, - "step": 8965, - "time_per_iteration": 2.8793537616729736 - }, - { - "auxiliary_loss_clip": 0.01097921, - "auxiliary_loss_mlp": 0.00772841, - "balance_loss_clip": 1.04318738, - "balance_loss_mlp": 1.00031877, - "epoch": 0.5390650834210131, - "flos": 18113486446080.0, - "grad_norm": 3.9331383698311297, - "language_loss": 0.82359982, - "learning_rate": 1.8441786698433192e-06, - "loss": 0.84230745, - "num_input_tokens_seen": 193073120, - "step": 8966, - "time_per_iteration": 2.7008259296417236 - }, - { - "auxiliary_loss_clip": 0.0112489, - "auxiliary_loss_mlp": 0.01032097, - "balance_loss_clip": 1.04688132, - "balance_loss_mlp": 1.01831603, - "epoch": 0.539125206673681, - "flos": 17416711445760.0, - "grad_norm": 1.8273360824105822, - "language_loss": 0.72234643, - "learning_rate": 1.8437903949480706e-06, - "loss": 0.74391627, - "num_input_tokens_seen": 193090105, - "step": 8967, - "time_per_iteration": 2.536813974380493 - }, - { - "auxiliary_loss_clip": 0.01101272, - "auxiliary_loss_mlp": 0.01034325, - "balance_loss_clip": 1.04193211, - "balance_loss_mlp": 1.02177858, - "epoch": 0.539185329926349, - "flos": 22198935450240.0, - "grad_norm": 2.8461637045489394, - "language_loss": 0.81760883, - "learning_rate": 1.8434021259763065e-06, - "loss": 0.83896482, - "num_input_tokens_seen": 193109325, - "step": 8968, - "time_per_iteration": 2.6812336444854736 - }, - { - "auxiliary_loss_clip": 0.01095464, - "auxiliary_loss_mlp": 0.01039812, - "balance_loss_clip": 1.04489422, - "balance_loss_mlp": 1.0244931, - "epoch": 0.539245453179017, - "flos": 21434397442560.0, - "grad_norm": 1.479768408322399, - "language_loss": 0.74093103, - "learning_rate": 1.8430138629427484e-06, - "loss": 0.76228386, - "num_input_tokens_seen": 193130595, - "step": 8969, - "time_per_iteration": 2.775066614151001 - }, - { - "auxiliary_loss_clip": 0.01089398, - "auxiliary_loss_mlp": 0.00772297, - "balance_loss_clip": 1.03885353, - "balance_loss_mlp": 1.00019646, - "epoch": 0.539305576431685, - "flos": 20735000749440.0, - "grad_norm": 1.789523366494458, - "language_loss": 0.82301641, - "learning_rate": 1.8426256058621205e-06, - "loss": 0.84163332, - "num_input_tokens_seen": 193148930, - "step": 8970, - "time_per_iteration": 2.709660053253174 - }, - { - "auxiliary_loss_clip": 0.0109962, - "auxiliary_loss_mlp": 0.01036868, - "balance_loss_clip": 1.04434752, - "balance_loss_mlp": 1.02398705, - "epoch": 0.5393656996843529, - "flos": 30920452018560.0, - "grad_norm": 1.3749735874734272, - "language_loss": 0.75481087, - "learning_rate": 1.842237354749146e-06, - "loss": 0.77617574, - "num_input_tokens_seen": 193170140, - "step": 8971, - "time_per_iteration": 2.759859800338745 - }, - { - "auxiliary_loss_clip": 0.01031428, - "auxiliary_loss_mlp": 0.01020808, - "balance_loss_clip": 1.01404476, - "balance_loss_mlp": 1.01906729, - "epoch": 0.5394258229370209, - "flos": 50317781351040.0, - "grad_norm": 0.8852076637627846, - "language_loss": 0.60268009, - "learning_rate": 1.8418491096185465e-06, - "loss": 0.62320244, - "num_input_tokens_seen": 193227235, - "step": 8972, - "time_per_iteration": 3.1906497478485107 - }, - { - "auxiliary_loss_clip": 0.01113524, - "auxiliary_loss_mlp": 0.01042903, - "balance_loss_clip": 1.0430851, - "balance_loss_mlp": 1.02806175, - "epoch": 0.5394859461896888, - "flos": 25411935012480.0, - "grad_norm": 1.3798913966673876, - "language_loss": 0.78418267, - "learning_rate": 1.841460870485045e-06, - "loss": 0.80574697, - "num_input_tokens_seen": 193248435, - "step": 8973, - "time_per_iteration": 2.67616868019104 - }, - { - "auxiliary_loss_clip": 0.01119952, - "auxiliary_loss_mlp": 0.01038926, - "balance_loss_clip": 1.04402721, - "balance_loss_mlp": 1.0234288, - "epoch": 0.5395460694423568, - "flos": 25478476957440.0, - "grad_norm": 1.97267381364002, - "language_loss": 0.73745018, - "learning_rate": 1.8410726373633623e-06, - "loss": 0.75903904, - "num_input_tokens_seen": 193267490, - "step": 8974, - "time_per_iteration": 2.6896610260009766 - }, - { - "auxiliary_loss_clip": 0.01038786, - "auxiliary_loss_mlp": 0.01002204, - "balance_loss_clip": 1.01252413, - "balance_loss_mlp": 1.00089288, - "epoch": 0.5396061926950249, - "flos": 53249493507840.0, - "grad_norm": 0.7368178577125409, - "language_loss": 0.51070768, - "learning_rate": 1.8406844102682215e-06, - "loss": 0.53111756, - "num_input_tokens_seen": 193326050, - "step": 8975, - "time_per_iteration": 3.1316938400268555 - }, - { - "auxiliary_loss_clip": 0.01110433, - "auxiliary_loss_mlp": 0.01042663, - "balance_loss_clip": 1.04242885, - "balance_loss_mlp": 1.02821445, - "epoch": 0.5396663159476928, - "flos": 26725080418560.0, - "grad_norm": 2.630341512403146, - "language_loss": 0.72291577, - "learning_rate": 1.840296189214344e-06, - "loss": 0.74444675, - "num_input_tokens_seen": 193348785, - "step": 8976, - "time_per_iteration": 2.722482681274414 - }, - { - "auxiliary_loss_clip": 0.01107068, - "auxiliary_loss_mlp": 0.00771891, - "balance_loss_clip": 1.0392096, - "balance_loss_mlp": 1.00027895, - "epoch": 0.5397264392003608, - "flos": 23253380127360.0, - "grad_norm": 1.6269165395400453, - "language_loss": 0.69827849, - "learning_rate": 1.8399079742164509e-06, - "loss": 0.71706808, - "num_input_tokens_seen": 193367080, - "step": 8977, - "time_per_iteration": 2.661503553390503 - }, - { - "auxiliary_loss_clip": 0.0105269, - "auxiliary_loss_mlp": 0.01038261, - "balance_loss_clip": 1.03996563, - "balance_loss_mlp": 1.02390814, - "epoch": 0.5397865624530287, - "flos": 18294188791680.0, - "grad_norm": 1.662156020825611, - "language_loss": 0.7259683, - "learning_rate": 1.8395197652892636e-06, - "loss": 0.74687779, - "num_input_tokens_seen": 193383715, - "step": 8978, - "time_per_iteration": 2.7381365299224854 - }, - { - "auxiliary_loss_clip": 0.01087228, - "auxiliary_loss_mlp": 0.010397, - "balance_loss_clip": 1.04297757, - "balance_loss_mlp": 1.02373815, - "epoch": 0.5398466857056967, - "flos": 15297514888320.0, - "grad_norm": 1.853626793115837, - "language_loss": 0.74536407, - "learning_rate": 1.8391315624475028e-06, - "loss": 0.76663339, - "num_input_tokens_seen": 193400560, - "step": 8979, - "time_per_iteration": 2.694063425064087 - }, - { - "auxiliary_loss_clip": 0.01072362, - "auxiliary_loss_mlp": 0.01049968, - "balance_loss_clip": 1.04104912, - "balance_loss_mlp": 1.03438091, - "epoch": 0.5399068089583646, - "flos": 17821748183040.0, - "grad_norm": 1.8942057962212562, - "language_loss": 0.76699525, - "learning_rate": 1.8387433657058892e-06, - "loss": 0.78821856, - "num_input_tokens_seen": 193418680, - "step": 8980, - "time_per_iteration": 2.820065498352051 - }, - { - "auxiliary_loss_clip": 0.01123296, - "auxiliary_loss_mlp": 0.01035485, - "balance_loss_clip": 1.04266453, - "balance_loss_mlp": 1.02159715, - "epoch": 0.5399669322110326, - "flos": 27381635164800.0, - "grad_norm": 1.799033275645953, - "language_loss": 0.82047689, - "learning_rate": 1.8383551750791431e-06, - "loss": 0.84206468, - "num_input_tokens_seen": 193439310, - "step": 8981, - "time_per_iteration": 2.6362786293029785 - }, - { - "auxiliary_loss_clip": 0.01114328, - "auxiliary_loss_mlp": 0.01033767, - "balance_loss_clip": 1.0414052, - "balance_loss_mlp": 1.01837707, - "epoch": 0.5400270554637006, - "flos": 20449116403200.0, - "grad_norm": 1.8414706821019682, - "language_loss": 0.66744691, - "learning_rate": 1.8379669905819857e-06, - "loss": 0.68892789, - "num_input_tokens_seen": 193458115, - "step": 8982, - "time_per_iteration": 2.621446371078491 - }, - { - "auxiliary_loss_clip": 0.01087174, - "auxiliary_loss_mlp": 0.00771772, - "balance_loss_clip": 1.04236412, - "balance_loss_mlp": 1.00037217, - "epoch": 0.5400871787163686, - "flos": 21689578638720.0, - "grad_norm": 1.585959219226275, - "language_loss": 0.82838899, - "learning_rate": 1.8375788122291358e-06, - "loss": 0.84697849, - "num_input_tokens_seen": 193477365, - "step": 8983, - "time_per_iteration": 2.725118637084961 - }, - { - "auxiliary_loss_clip": 0.0107373, - "auxiliary_loss_mlp": 0.01037262, - "balance_loss_clip": 1.03868723, - "balance_loss_mlp": 1.0226711, - "epoch": 0.5401473019690365, - "flos": 19204739585280.0, - "grad_norm": 1.7940455633993566, - "language_loss": 0.71052921, - "learning_rate": 1.8371906400353138e-06, - "loss": 0.73163915, - "num_input_tokens_seen": 193495595, - "step": 8984, - "time_per_iteration": 2.7552812099456787 - }, - { - "auxiliary_loss_clip": 0.01129583, - "auxiliary_loss_mlp": 0.01039978, - "balance_loss_clip": 1.04673409, - "balance_loss_mlp": 1.02464724, - "epoch": 0.5402074252217045, - "flos": 20627376624000.0, - "grad_norm": 1.7153215255445333, - "language_loss": 0.80088288, - "learning_rate": 1.8368024740152386e-06, - "loss": 0.82257855, - "num_input_tokens_seen": 193514035, - "step": 8985, - "time_per_iteration": 2.6251611709594727 - }, - { - "auxiliary_loss_clip": 0.01076326, - "auxiliary_loss_mlp": 0.01030482, - "balance_loss_clip": 1.03776312, - "balance_loss_mlp": 1.01603341, - "epoch": 0.5402675484743724, - "flos": 24973465691520.0, - "grad_norm": 1.6597478268739005, - "language_loss": 0.79092562, - "learning_rate": 1.83641431418363e-06, - "loss": 0.81199366, - "num_input_tokens_seen": 193535445, - "step": 8986, - "time_per_iteration": 2.7512738704681396 - }, - { - "auxiliary_loss_clip": 0.01105948, - "auxiliary_loss_mlp": 0.01041249, - "balance_loss_clip": 1.0403738, - "balance_loss_mlp": 1.02647913, - "epoch": 0.5403276717270404, - "flos": 19459022941440.0, - "grad_norm": 1.5813568652048575, - "language_loss": 0.77027225, - "learning_rate": 1.8360261605552075e-06, - "loss": 0.79174423, - "num_input_tokens_seen": 193554780, - "step": 8987, - "time_per_iteration": 2.678215265274048 - }, - { - "auxiliary_loss_clip": 0.01094562, - "auxiliary_loss_mlp": 0.01035835, - "balance_loss_clip": 1.04025865, - "balance_loss_mlp": 1.021613, - "epoch": 0.5403877949797083, - "flos": 18442140912000.0, - "grad_norm": 3.169719409567684, - "language_loss": 0.71186262, - "learning_rate": 1.8356380131446887e-06, - "loss": 0.73316658, - "num_input_tokens_seen": 193573580, - "step": 8988, - "time_per_iteration": 2.779327869415283 - }, - { - "auxiliary_loss_clip": 0.01073421, - "auxiliary_loss_mlp": 0.01040131, - "balance_loss_clip": 1.03765535, - "balance_loss_mlp": 1.02508116, - "epoch": 0.5404479182323764, - "flos": 28292868316800.0, - "grad_norm": 2.25930737507901, - "language_loss": 0.67611122, - "learning_rate": 1.8352498719667934e-06, - "loss": 0.69724679, - "num_input_tokens_seen": 193590490, - "step": 8989, - "time_per_iteration": 2.7891674041748047 - }, - { - "auxiliary_loss_clip": 0.01111206, - "auxiliary_loss_mlp": 0.01041114, - "balance_loss_clip": 1.04164839, - "balance_loss_mlp": 1.02667785, - "epoch": 0.5405080414850444, - "flos": 23367325046400.0, - "grad_norm": 1.5585472280182338, - "language_loss": 0.77394271, - "learning_rate": 1.8348617370362399e-06, - "loss": 0.79546589, - "num_input_tokens_seen": 193609900, - "step": 8990, - "time_per_iteration": 2.6976635456085205 - }, - { - "auxiliary_loss_clip": 0.01106061, - "auxiliary_loss_mlp": 0.01026872, - "balance_loss_clip": 1.03980994, - "balance_loss_mlp": 1.01427758, - "epoch": 0.5405681647377123, - "flos": 21106425335040.0, - "grad_norm": 1.9802166321118257, - "language_loss": 0.69258702, - "learning_rate": 1.834473608367745e-06, - "loss": 0.71391636, - "num_input_tokens_seen": 193629775, - "step": 8991, - "time_per_iteration": 2.6734046936035156 - }, - { - "auxiliary_loss_clip": 0.01061373, - "auxiliary_loss_mlp": 0.01034138, - "balance_loss_clip": 1.03470838, - "balance_loss_mlp": 1.01864719, - "epoch": 0.5406282879903803, - "flos": 20449188230400.0, - "grad_norm": 1.8615919781627641, - "language_loss": 0.75722122, - "learning_rate": 1.8340854859760277e-06, - "loss": 0.77817637, - "num_input_tokens_seen": 193648070, - "step": 8992, - "time_per_iteration": 2.7986576557159424 - }, - { - "auxiliary_loss_clip": 0.01094937, - "auxiliary_loss_mlp": 0.01042345, - "balance_loss_clip": 1.03807545, - "balance_loss_mlp": 1.02672255, - "epoch": 0.5406884112430482, - "flos": 14209493973120.0, - "grad_norm": 2.5485108966117704, - "language_loss": 0.76453286, - "learning_rate": 1.8336973698758056e-06, - "loss": 0.78590572, - "num_input_tokens_seen": 193665060, - "step": 8993, - "time_per_iteration": 2.7208335399627686 - }, - { - "auxiliary_loss_clip": 0.01106981, - "auxiliary_loss_mlp": 0.01031441, - "balance_loss_clip": 1.03966081, - "balance_loss_mlp": 1.01783895, - "epoch": 0.5407485344957162, - "flos": 23875568536320.0, - "grad_norm": 1.7082267966393392, - "language_loss": 0.70645487, - "learning_rate": 1.8333092600817959e-06, - "loss": 0.72783911, - "num_input_tokens_seen": 193683620, - "step": 8994, - "time_per_iteration": 2.724794626235962 - }, - { - "auxiliary_loss_clip": 0.01107598, - "auxiliary_loss_mlp": 0.01031331, - "balance_loss_clip": 1.03957391, - "balance_loss_mlp": 1.01583362, - "epoch": 0.5408086577483842, - "flos": 23148485435520.0, - "grad_norm": 3.058822592256831, - "language_loss": 0.75407541, - "learning_rate": 1.8329211566087157e-06, - "loss": 0.77546465, - "num_input_tokens_seen": 193702990, - "step": 8995, - "time_per_iteration": 5.971833229064941 - }, - { - "auxiliary_loss_clip": 0.0110732, - "auxiliary_loss_mlp": 0.01036119, - "balance_loss_clip": 1.04115582, - "balance_loss_mlp": 1.02335215, - "epoch": 0.5408687810010522, - "flos": 18771046773120.0, - "grad_norm": 1.7630879917097735, - "language_loss": 0.73701608, - "learning_rate": 1.832533059471282e-06, - "loss": 0.75845045, - "num_input_tokens_seen": 193721785, - "step": 8996, - "time_per_iteration": 4.209546327590942 - }, - { - "auxiliary_loss_clip": 0.0107249, - "auxiliary_loss_mlp": 0.0103344, - "balance_loss_clip": 1.03679025, - "balance_loss_mlp": 1.02018428, - "epoch": 0.5409289042537201, - "flos": 13881557779200.0, - "grad_norm": 2.7958611639566557, - "language_loss": 0.73200142, - "learning_rate": 1.8321449686842115e-06, - "loss": 0.75306082, - "num_input_tokens_seen": 193740315, - "step": 8997, - "time_per_iteration": 2.6815428733825684 - }, - { - "auxiliary_loss_clip": 0.0112099, - "auxiliary_loss_mlp": 0.01036623, - "balance_loss_clip": 1.04214144, - "balance_loss_mlp": 1.02241874, - "epoch": 0.5409890275063881, - "flos": 14465357527680.0, - "grad_norm": 2.1382567541010706, - "language_loss": 0.71990108, - "learning_rate": 1.8317568842622207e-06, - "loss": 0.74147719, - "num_input_tokens_seen": 193757580, - "step": 8998, - "time_per_iteration": 2.516322374343872 - }, - { - "auxiliary_loss_clip": 0.01084198, - "auxiliary_loss_mlp": 0.01038336, - "balance_loss_clip": 1.03824925, - "balance_loss_mlp": 1.02481771, - "epoch": 0.541049150759056, - "flos": 48977449349760.0, - "grad_norm": 1.4737906597538892, - "language_loss": 0.7077291, - "learning_rate": 1.8313688062200256e-06, - "loss": 0.72895443, - "num_input_tokens_seen": 193780965, - "step": 8999, - "time_per_iteration": 4.582181215286255 - }, - { - "auxiliary_loss_clip": 0.01092675, - "auxiliary_loss_mlp": 0.01037736, - "balance_loss_clip": 1.04016924, - "balance_loss_mlp": 1.02372253, - "epoch": 0.541109274011724, - "flos": 18147601388160.0, - "grad_norm": 2.7892757576067972, - "language_loss": 0.80210066, - "learning_rate": 1.8309807345723422e-06, - "loss": 0.82340479, - "num_input_tokens_seen": 193797855, - "step": 9000, - "time_per_iteration": 2.6335151195526123 - }, - { - "auxiliary_loss_clip": 0.01069713, - "auxiliary_loss_mlp": 0.01033155, - "balance_loss_clip": 1.03577805, - "balance_loss_mlp": 1.01837265, - "epoch": 0.541169397264392, - "flos": 20522553759360.0, - "grad_norm": 1.6231589706551275, - "language_loss": 0.73037231, - "learning_rate": 1.8305926693338863e-06, - "loss": 0.75140095, - "num_input_tokens_seen": 193817375, - "step": 9001, - "time_per_iteration": 2.854574680328369 - }, - { - "auxiliary_loss_clip": 0.01088976, - "auxiliary_loss_mlp": 0.01037285, - "balance_loss_clip": 1.03875196, - "balance_loss_mlp": 1.0225749, - "epoch": 0.54122952051706, - "flos": 20044043752320.0, - "grad_norm": 2.3946252475459704, - "language_loss": 0.85775471, - "learning_rate": 1.8302046105193734e-06, - "loss": 0.87901723, - "num_input_tokens_seen": 193832205, - "step": 9002, - "time_per_iteration": 2.83799409866333 - }, - { - "auxiliary_loss_clip": 0.01071827, - "auxiliary_loss_mlp": 0.01036651, - "balance_loss_clip": 1.03876507, - "balance_loss_mlp": 1.0244441, - "epoch": 0.541289643769728, - "flos": 19062246332160.0, - "grad_norm": 1.9022782971983632, - "language_loss": 0.78010678, - "learning_rate": 1.8298165581435183e-06, - "loss": 0.80119157, - "num_input_tokens_seen": 193849830, - "step": 9003, - "time_per_iteration": 2.8771512508392334 - }, - { - "auxiliary_loss_clip": 0.01105804, - "auxiliary_loss_mlp": 0.01031034, - "balance_loss_clip": 1.03998888, - "balance_loss_mlp": 1.01659191, - "epoch": 0.5413497670223959, - "flos": 22382295402240.0, - "grad_norm": 2.4815464780266905, - "language_loss": 0.69489288, - "learning_rate": 1.8294285122210372e-06, - "loss": 0.71626127, - "num_input_tokens_seen": 193869945, - "step": 9004, - "time_per_iteration": 2.7296600341796875 - }, - { - "auxiliary_loss_clip": 0.01027886, - "auxiliary_loss_mlp": 0.01000864, - "balance_loss_clip": 1.01221299, - "balance_loss_mlp": 0.99943334, - "epoch": 0.5414098902750639, - "flos": 70031734093440.0, - "grad_norm": 0.9691738453098017, - "language_loss": 0.59067202, - "learning_rate": 1.8290404727666434e-06, - "loss": 0.61095953, - "num_input_tokens_seen": 193930860, - "step": 9005, - "time_per_iteration": 3.2482104301452637 - }, - { - "auxiliary_loss_clip": 0.011229, - "auxiliary_loss_mlp": 0.00771475, - "balance_loss_clip": 1.04402518, - "balance_loss_mlp": 1.00026715, - "epoch": 0.5414700135277318, - "flos": 21798962530560.0, - "grad_norm": 3.1081571461352357, - "language_loss": 0.78251934, - "learning_rate": 1.8286524397950517e-06, - "loss": 0.80146307, - "num_input_tokens_seen": 193949075, - "step": 9006, - "time_per_iteration": 2.646697521209717 - }, - { - "auxiliary_loss_clip": 0.01099607, - "auxiliary_loss_mlp": 0.01035785, - "balance_loss_clip": 1.04162097, - "balance_loss_mlp": 1.02380466, - "epoch": 0.5415301367803999, - "flos": 16907929251840.0, - "grad_norm": 2.04905315291525, - "language_loss": 0.82968152, - "learning_rate": 1.8282644133209777e-06, - "loss": 0.85103542, - "num_input_tokens_seen": 193967630, - "step": 9007, - "time_per_iteration": 2.6906566619873047 - }, - { - "auxiliary_loss_clip": 0.01105367, - "auxiliary_loss_mlp": 0.01035166, - "balance_loss_clip": 1.04186976, - "balance_loss_mlp": 1.02084875, - "epoch": 0.5415902600330678, - "flos": 25704176065920.0, - "grad_norm": 2.002533361325265, - "language_loss": 0.67188275, - "learning_rate": 1.8278763933591334e-06, - "loss": 0.69328809, - "num_input_tokens_seen": 193988730, - "step": 9008, - "time_per_iteration": 2.6538190841674805 - }, - { - "auxiliary_loss_clip": 0.01126211, - "auxiliary_loss_mlp": 0.01033213, - "balance_loss_clip": 1.04396832, - "balance_loss_mlp": 1.01836514, - "epoch": 0.5416503832857358, - "flos": 19208151377280.0, - "grad_norm": 1.9615897276879948, - "language_loss": 0.73713046, - "learning_rate": 1.827488379924234e-06, - "loss": 0.75872469, - "num_input_tokens_seen": 194005160, - "step": 9009, - "time_per_iteration": 2.5716910362243652 - }, - { - "auxiliary_loss_clip": 0.01072637, - "auxiliary_loss_mlp": 0.01036076, - "balance_loss_clip": 1.04184818, - "balance_loss_mlp": 1.02171135, - "epoch": 0.5417105065384037, - "flos": 12713706887040.0, - "grad_norm": 2.1963503735452417, - "language_loss": 0.87984347, - "learning_rate": 1.8271003730309923e-06, - "loss": 0.90093064, - "num_input_tokens_seen": 194021700, - "step": 9010, - "time_per_iteration": 2.725271701812744 - }, - { - "auxiliary_loss_clip": 0.01120446, - "auxiliary_loss_mlp": 0.01037388, - "balance_loss_clip": 1.04260874, - "balance_loss_mlp": 1.02448332, - "epoch": 0.5417706297910717, - "flos": 30335933998080.0, - "grad_norm": 1.8667479755469423, - "language_loss": 0.65187848, - "learning_rate": 1.826712372694122e-06, - "loss": 0.67345679, - "num_input_tokens_seen": 194042620, - "step": 9011, - "time_per_iteration": 2.6546692848205566 - }, - { - "auxiliary_loss_clip": 0.01111756, - "auxiliary_loss_mlp": 0.01036661, - "balance_loss_clip": 1.04458547, - "balance_loss_mlp": 1.02324426, - "epoch": 0.5418307530437396, - "flos": 29020992912000.0, - "grad_norm": 2.8570982701345797, - "language_loss": 0.79252279, - "learning_rate": 1.8263243789283362e-06, - "loss": 0.81400692, - "num_input_tokens_seen": 194061800, - "step": 9012, - "time_per_iteration": 2.6907572746276855 - }, - { - "auxiliary_loss_clip": 0.01119813, - "auxiliary_loss_mlp": 0.01033195, - "balance_loss_clip": 1.04184949, - "balance_loss_mlp": 1.01965845, - "epoch": 0.5418908762964076, - "flos": 16873455173760.0, - "grad_norm": 2.191987247231765, - "language_loss": 0.74450612, - "learning_rate": 1.8259363917483466e-06, - "loss": 0.76603615, - "num_input_tokens_seen": 194079890, - "step": 9013, - "time_per_iteration": 2.6294262409210205 - }, - { - "auxiliary_loss_clip": 0.01085863, - "auxiliary_loss_mlp": 0.01030984, - "balance_loss_clip": 1.04200959, - "balance_loss_mlp": 1.01776361, - "epoch": 0.5419509995490756, - "flos": 18949702043520.0, - "grad_norm": 2.094538198423721, - "language_loss": 0.72306025, - "learning_rate": 1.8255484111688667e-06, - "loss": 0.74422872, - "num_input_tokens_seen": 194097625, - "step": 9014, - "time_per_iteration": 2.653125524520874 - }, - { - "auxiliary_loss_clip": 0.01099897, - "auxiliary_loss_mlp": 0.01032361, - "balance_loss_clip": 1.04301429, - "balance_loss_mlp": 1.01888418, - "epoch": 0.5420111228017436, - "flos": 18077719478400.0, - "grad_norm": 1.5497382301526352, - "language_loss": 0.807073, - "learning_rate": 1.8251604372046085e-06, - "loss": 0.82839555, - "num_input_tokens_seen": 194116055, - "step": 9015, - "time_per_iteration": 2.6197831630706787 - }, - { - "auxiliary_loss_clip": 0.01117394, - "auxiliary_loss_mlp": 0.01039918, - "balance_loss_clip": 1.04648256, - "balance_loss_mlp": 1.02635145, - "epoch": 0.5420712460544116, - "flos": 19061779455360.0, - "grad_norm": 2.4637362060141053, - "language_loss": 0.81252277, - "learning_rate": 1.8247724698702843e-06, - "loss": 0.83409584, - "num_input_tokens_seen": 194130365, - "step": 9016, - "time_per_iteration": 2.617722988128662 - }, - { - "auxiliary_loss_clip": 0.01121755, - "auxiliary_loss_mlp": 0.01030314, - "balance_loss_clip": 1.04375124, - "balance_loss_mlp": 1.01745152, - "epoch": 0.5421313693070795, - "flos": 18187103370240.0, - "grad_norm": 1.6999373176246328, - "language_loss": 0.81182349, - "learning_rate": 1.8243845091806053e-06, - "loss": 0.83334422, - "num_input_tokens_seen": 194148975, - "step": 9017, - "time_per_iteration": 2.629488706588745 - }, - { - "auxiliary_loss_clip": 0.01119384, - "auxiliary_loss_mlp": 0.01035787, - "balance_loss_clip": 1.04308951, - "balance_loss_mlp": 1.02270925, - "epoch": 0.5421914925597475, - "flos": 13005947940480.0, - "grad_norm": 1.767329743248484, - "language_loss": 0.77847707, - "learning_rate": 1.8239965551502837e-06, - "loss": 0.80002874, - "num_input_tokens_seen": 194167185, - "step": 9018, - "time_per_iteration": 2.595520257949829 - }, - { - "auxiliary_loss_clip": 0.01121333, - "auxiliary_loss_mlp": 0.010389, - "balance_loss_clip": 1.04014397, - "balance_loss_mlp": 1.02462447, - "epoch": 0.5422516158124154, - "flos": 46758457831680.0, - "grad_norm": 1.6302803515957, - "language_loss": 0.66417134, - "learning_rate": 1.8236086077940303e-06, - "loss": 0.68577361, - "num_input_tokens_seen": 194192840, - "step": 9019, - "time_per_iteration": 2.8572912216186523 - }, - { - "auxiliary_loss_clip": 0.01101197, - "auxiliary_loss_mlp": 0.01036576, - "balance_loss_clip": 1.03910589, - "balance_loss_mlp": 1.02315295, - "epoch": 0.5423117390650835, - "flos": 31758642864000.0, - "grad_norm": 1.5350920710342792, - "language_loss": 0.69515598, - "learning_rate": 1.8232206671265555e-06, - "loss": 0.71653378, - "num_input_tokens_seen": 194213150, - "step": 9020, - "time_per_iteration": 2.710081100463867 - }, - { - "auxiliary_loss_clip": 0.01082322, - "auxiliary_loss_mlp": 0.01037191, - "balance_loss_clip": 1.03962088, - "balance_loss_mlp": 1.02462053, - "epoch": 0.5423718623177514, - "flos": 27201974313600.0, - "grad_norm": 1.5706670053852172, - "language_loss": 0.80494618, - "learning_rate": 1.8228327331625717e-06, - "loss": 0.82614136, - "num_input_tokens_seen": 194234665, - "step": 9021, - "time_per_iteration": 2.760133743286133 - }, - { - "auxiliary_loss_clip": 0.01069543, - "auxiliary_loss_mlp": 0.01037659, - "balance_loss_clip": 1.03820395, - "balance_loss_mlp": 1.02405667, - "epoch": 0.5424319855704194, - "flos": 23546447193600.0, - "grad_norm": 2.2946341773433496, - "language_loss": 0.78887641, - "learning_rate": 1.822444805916788e-06, - "loss": 0.80994844, - "num_input_tokens_seen": 194253790, - "step": 9022, - "time_per_iteration": 2.8245437145233154 - }, - { - "auxiliary_loss_clip": 0.01085662, - "auxiliary_loss_mlp": 0.00771451, - "balance_loss_clip": 1.03742123, - "balance_loss_mlp": 1.00025558, - "epoch": 0.5424921088230873, - "flos": 26615624699520.0, - "grad_norm": 1.6811700220554942, - "language_loss": 0.8234387, - "learning_rate": 1.822056885403915e-06, - "loss": 0.84200984, - "num_input_tokens_seen": 194274950, - "step": 9023, - "time_per_iteration": 2.722637891769409 - }, - { - "auxiliary_loss_clip": 0.01105066, - "auxiliary_loss_mlp": 0.01031053, - "balance_loss_clip": 1.04266286, - "balance_loss_mlp": 1.01798785, - "epoch": 0.5425522320757553, - "flos": 23586811102080.0, - "grad_norm": 1.7285453701222258, - "language_loss": 0.71582222, - "learning_rate": 1.8216689716386627e-06, - "loss": 0.73718333, - "num_input_tokens_seen": 194296155, - "step": 9024, - "time_per_iteration": 2.6643166542053223 - }, - { - "auxiliary_loss_clip": 0.01109023, - "auxiliary_loss_mlp": 0.01034832, - "balance_loss_clip": 1.03978658, - "balance_loss_mlp": 1.02231479, - "epoch": 0.5426123553284232, - "flos": 30592264429440.0, - "grad_norm": 1.7605396052132907, - "language_loss": 0.65074313, - "learning_rate": 1.8212810646357405e-06, - "loss": 0.67218173, - "num_input_tokens_seen": 194318025, - "step": 9025, - "time_per_iteration": 2.6963577270507812 - }, - { - "auxiliary_loss_clip": 0.0109579, - "auxiliary_loss_mlp": 0.00769932, - "balance_loss_clip": 1.04664063, - "balance_loss_mlp": 1.00038803, - "epoch": 0.5426724785810912, - "flos": 12495118671360.0, - "grad_norm": 2.055737651503127, - "language_loss": 0.73914909, - "learning_rate": 1.8208931644098591e-06, - "loss": 0.7578063, - "num_input_tokens_seen": 194336150, - "step": 9026, - "time_per_iteration": 2.6317172050476074 - }, - { - "auxiliary_loss_clip": 0.01095155, - "auxiliary_loss_mlp": 0.01040442, - "balance_loss_clip": 1.03804421, - "balance_loss_mlp": 1.02545154, - "epoch": 0.5427326018337592, - "flos": 26064611089920.0, - "grad_norm": 2.1949475938623224, - "language_loss": 0.7840718, - "learning_rate": 1.8205052709757265e-06, - "loss": 0.80542773, - "num_input_tokens_seen": 194355980, - "step": 9027, - "time_per_iteration": 2.6076927185058594 - }, - { - "auxiliary_loss_clip": 0.01004652, - "auxiliary_loss_mlp": 0.01011362, - "balance_loss_clip": 1.00918782, - "balance_loss_mlp": 1.00950241, - "epoch": 0.5427927250864272, - "flos": 65984745576960.0, - "grad_norm": 0.759944437260396, - "language_loss": 0.56566465, - "learning_rate": 1.8201173843480515e-06, - "loss": 0.58582479, - "num_input_tokens_seen": 194422660, - "step": 9028, - "time_per_iteration": 3.173718214035034 - }, - { - "auxiliary_loss_clip": 0.01078653, - "auxiliary_loss_mlp": 0.01029607, - "balance_loss_clip": 1.0437665, - "balance_loss_mlp": 1.01519442, - "epoch": 0.5428528483390952, - "flos": 19975382904960.0, - "grad_norm": 2.1789279213341857, - "language_loss": 0.7763471, - "learning_rate": 1.8197295045415442e-06, - "loss": 0.79742968, - "num_input_tokens_seen": 194438545, - "step": 9029, - "time_per_iteration": 2.6010968685150146 - }, - { - "auxiliary_loss_clip": 0.01080602, - "auxiliary_loss_mlp": 0.01029952, - "balance_loss_clip": 1.0426538, - "balance_loss_mlp": 1.01611137, - "epoch": 0.5429129715917631, - "flos": 21832323287040.0, - "grad_norm": 1.5227150839007966, - "language_loss": 0.8289423, - "learning_rate": 1.8193416315709112e-06, - "loss": 0.85004783, - "num_input_tokens_seen": 194458060, - "step": 9030, - "time_per_iteration": 2.673872232437134 - }, - { - "auxiliary_loss_clip": 0.01119103, - "auxiliary_loss_mlp": 0.0103115, - "balance_loss_clip": 1.04308653, - "balance_loss_mlp": 1.01801896, - "epoch": 0.5429730948444311, - "flos": 27782685492480.0, - "grad_norm": 1.5242093045096456, - "language_loss": 0.74554878, - "learning_rate": 1.8189537654508623e-06, - "loss": 0.76705134, - "num_input_tokens_seen": 194477405, - "step": 9031, - "time_per_iteration": 2.6361796855926514 - }, - { - "auxiliary_loss_clip": 0.01099875, - "auxiliary_loss_mlp": 0.01039492, - "balance_loss_clip": 1.03957534, - "balance_loss_mlp": 1.02664721, - "epoch": 0.543033218097099, - "flos": 26760452336640.0, - "grad_norm": 1.8557133497087115, - "language_loss": 0.85526693, - "learning_rate": 1.8185659061961045e-06, - "loss": 0.87666059, - "num_input_tokens_seen": 194497085, - "step": 9032, - "time_per_iteration": 2.633051872253418 - }, - { - "auxiliary_loss_clip": 0.01101785, - "auxiliary_loss_mlp": 0.01037126, - "balance_loss_clip": 1.04154074, - "balance_loss_mlp": 1.02405477, - "epoch": 0.5430933413497671, - "flos": 22675254727680.0, - "grad_norm": 1.789713495487195, - "language_loss": 0.74318242, - "learning_rate": 1.8181780538213457e-06, - "loss": 0.76457155, - "num_input_tokens_seen": 194516785, - "step": 9033, - "time_per_iteration": 2.654573917388916 - }, - { - "auxiliary_loss_clip": 0.01080113, - "auxiliary_loss_mlp": 0.01040958, - "balance_loss_clip": 1.03826129, - "balance_loss_mlp": 1.0267365, - "epoch": 0.543153464602435, - "flos": 24607499973120.0, - "grad_norm": 1.5302152204895145, - "language_loss": 0.75507742, - "learning_rate": 1.8177902083412935e-06, - "loss": 0.77628815, - "num_input_tokens_seen": 194536475, - "step": 9034, - "time_per_iteration": 6.07684326171875 - }, - { - "auxiliary_loss_clip": 0.01080457, - "auxiliary_loss_mlp": 0.01035889, - "balance_loss_clip": 1.04235947, - "balance_loss_mlp": 1.02360463, - "epoch": 0.543213587855103, - "flos": 19025725178880.0, - "grad_norm": 1.697596865274133, - "language_loss": 0.84559906, - "learning_rate": 1.817402369770655e-06, - "loss": 0.86676252, - "num_input_tokens_seen": 194554495, - "step": 9035, - "time_per_iteration": 4.246930122375488 - }, - { - "auxiliary_loss_clip": 0.01010369, - "auxiliary_loss_mlp": 0.01004655, - "balance_loss_clip": 1.01446867, - "balance_loss_mlp": 1.00328398, - "epoch": 0.5432737111077709, - "flos": 65686435125120.0, - "grad_norm": 0.7105133860132232, - "language_loss": 0.55900681, - "learning_rate": 1.8170145381241364e-06, - "loss": 0.57915699, - "num_input_tokens_seen": 194617620, - "step": 9036, - "time_per_iteration": 3.214927911758423 - }, - { - "auxiliary_loss_clip": 0.0106374, - "auxiliary_loss_mlp": 0.01035958, - "balance_loss_clip": 1.04064369, - "balance_loss_mlp": 1.02285123, - "epoch": 0.5433338343604389, - "flos": 22091670460800.0, - "grad_norm": 1.4967561616212492, - "language_loss": 0.75198317, - "learning_rate": 1.8166267134164451e-06, - "loss": 0.77298009, - "num_input_tokens_seen": 194637690, - "step": 9037, - "time_per_iteration": 2.815127372741699 - }, - { - "auxiliary_loss_clip": 0.01089499, - "auxiliary_loss_mlp": 0.01036314, - "balance_loss_clip": 1.039361, - "balance_loss_mlp": 1.02274799, - "epoch": 0.5433939576131068, - "flos": 34672649616000.0, - "grad_norm": 1.6562121389813547, - "language_loss": 0.66519392, - "learning_rate": 1.8162388956622875e-06, - "loss": 0.68645203, - "num_input_tokens_seen": 194659520, - "step": 9038, - "time_per_iteration": 2.788142681121826 - }, - { - "auxiliary_loss_clip": 0.01105433, - "auxiliary_loss_mlp": 0.01036988, - "balance_loss_clip": 1.03904057, - "balance_loss_mlp": 1.02456677, - "epoch": 0.5434540808657748, - "flos": 20303355012480.0, - "grad_norm": 1.9500381910938636, - "language_loss": 0.7809025, - "learning_rate": 1.8158510848763692e-06, - "loss": 0.80232668, - "num_input_tokens_seen": 194677645, - "step": 9039, - "time_per_iteration": 4.200030326843262 - }, - { - "auxiliary_loss_clip": 0.01076379, - "auxiliary_loss_mlp": 0.01038367, - "balance_loss_clip": 1.03707099, - "balance_loss_mlp": 1.02523017, - "epoch": 0.5435142041184428, - "flos": 23112790295040.0, - "grad_norm": 1.9066978344822971, - "language_loss": 0.76675421, - "learning_rate": 1.8154632810733962e-06, - "loss": 0.7879017, - "num_input_tokens_seen": 194697400, - "step": 9040, - "time_per_iteration": 2.752359628677368 - }, - { - "auxiliary_loss_clip": 0.01021921, - "auxiliary_loss_mlp": 0.0101024, - "balance_loss_clip": 1.01599014, - "balance_loss_mlp": 1.00891709, - "epoch": 0.5435743273711108, - "flos": 64012746954240.0, - "grad_norm": 0.6657326543890927, - "language_loss": 0.52456856, - "learning_rate": 1.815075484268074e-06, - "loss": 0.54489017, - "num_input_tokens_seen": 194761205, - "step": 9041, - "time_per_iteration": 3.19743275642395 - }, - { - "auxiliary_loss_clip": 0.01092893, - "auxiliary_loss_mlp": 0.01043232, - "balance_loss_clip": 1.04014623, - "balance_loss_mlp": 1.0300709, - "epoch": 0.5436344506237788, - "flos": 25118903859840.0, - "grad_norm": 1.6935261425615555, - "language_loss": 0.76397556, - "learning_rate": 1.8146876944751078e-06, - "loss": 0.78533685, - "num_input_tokens_seen": 194782445, - "step": 9042, - "time_per_iteration": 2.7176172733306885 - }, - { - "auxiliary_loss_clip": 0.01082719, - "auxiliary_loss_mlp": 0.01030979, - "balance_loss_clip": 1.04040313, - "balance_loss_mlp": 1.01886773, - "epoch": 0.5436945738764467, - "flos": 19572967860480.0, - "grad_norm": 1.7014237411229687, - "language_loss": 0.67346215, - "learning_rate": 1.8142999117092033e-06, - "loss": 0.69459915, - "num_input_tokens_seen": 194800325, - "step": 9043, - "time_per_iteration": 2.7166213989257812 - }, - { - "auxiliary_loss_clip": 0.0107861, - "auxiliary_loss_mlp": 0.01032764, - "balance_loss_clip": 1.03779316, - "balance_loss_mlp": 1.01971054, - "epoch": 0.5437546971291147, - "flos": 21142515525120.0, - "grad_norm": 1.5921714365650326, - "language_loss": 0.84577447, - "learning_rate": 1.8139121359850644e-06, - "loss": 0.86688828, - "num_input_tokens_seen": 194818675, - "step": 9044, - "time_per_iteration": 2.758593797683716 - }, - { - "auxiliary_loss_clip": 0.01123207, - "auxiliary_loss_mlp": 0.01031023, - "balance_loss_clip": 1.04196227, - "balance_loss_mlp": 1.01723039, - "epoch": 0.5438148203817826, - "flos": 25118688378240.0, - "grad_norm": 1.5431059852471993, - "language_loss": 0.62074721, - "learning_rate": 1.8135243673173956e-06, - "loss": 0.64228952, - "num_input_tokens_seen": 194836595, - "step": 9045, - "time_per_iteration": 2.6207923889160156 - }, - { - "auxiliary_loss_clip": 0.0112166, - "auxiliary_loss_mlp": 0.01035257, - "balance_loss_clip": 1.04318917, - "balance_loss_mlp": 1.02179205, - "epoch": 0.5438749436344507, - "flos": 23002939526400.0, - "grad_norm": 1.4293832885602564, - "language_loss": 0.70140386, - "learning_rate": 1.8131366057209023e-06, - "loss": 0.72297299, - "num_input_tokens_seen": 194857520, - "step": 9046, - "time_per_iteration": 2.6262285709381104 - }, - { - "auxiliary_loss_clip": 0.01117279, - "auxiliary_loss_mlp": 0.01029233, - "balance_loss_clip": 1.04171467, - "balance_loss_mlp": 1.01709127, - "epoch": 0.5439350668871186, - "flos": 15487016065920.0, - "grad_norm": 1.95554521575616, - "language_loss": 0.7724129, - "learning_rate": 1.8127488512102868e-06, - "loss": 0.79387808, - "num_input_tokens_seen": 194876020, - "step": 9047, - "time_per_iteration": 2.592041492462158 - }, - { - "auxiliary_loss_clip": 0.01094716, - "auxiliary_loss_mlp": 0.01047772, - "balance_loss_clip": 1.04039311, - "balance_loss_mlp": 1.03321636, - "epoch": 0.5439951901397866, - "flos": 17238415311360.0, - "grad_norm": 1.5854248061222735, - "language_loss": 0.7262761, - "learning_rate": 1.8123611038002547e-06, - "loss": 0.74770093, - "num_input_tokens_seen": 194894650, - "step": 9048, - "time_per_iteration": 2.667393684387207 - }, - { - "auxiliary_loss_clip": 0.01069346, - "auxiliary_loss_mlp": 0.01045305, - "balance_loss_clip": 1.03664947, - "balance_loss_mlp": 1.0298202, - "epoch": 0.5440553133924545, - "flos": 18661016436480.0, - "grad_norm": 1.9805900660696516, - "language_loss": 0.93650311, - "learning_rate": 1.8119733635055076e-06, - "loss": 0.95764971, - "num_input_tokens_seen": 194911935, - "step": 9049, - "time_per_iteration": 2.7119088172912598 - }, - { - "auxiliary_loss_clip": 0.0110651, - "auxiliary_loss_mlp": 0.01032835, - "balance_loss_clip": 1.03992295, - "balance_loss_mlp": 1.02054429, - "epoch": 0.5441154366451225, - "flos": 27122934435840.0, - "grad_norm": 1.7800719649484351, - "language_loss": 0.73936987, - "learning_rate": 1.8115856303407492e-06, - "loss": 0.76076329, - "num_input_tokens_seen": 194931620, - "step": 9050, - "time_per_iteration": 2.631661891937256 - }, - { - "auxiliary_loss_clip": 0.01111441, - "auxiliary_loss_mlp": 0.01030882, - "balance_loss_clip": 1.0437777, - "balance_loss_mlp": 1.01755428, - "epoch": 0.5441755598977904, - "flos": 25993867253760.0, - "grad_norm": 1.737903905046117, - "language_loss": 0.66990525, - "learning_rate": 1.8111979043206832e-06, - "loss": 0.69132841, - "num_input_tokens_seen": 194952560, - "step": 9051, - "time_per_iteration": 2.648484230041504 - }, - { - "auxiliary_loss_clip": 0.01080337, - "auxiliary_loss_mlp": 0.01033354, - "balance_loss_clip": 1.03722811, - "balance_loss_mlp": 1.02039015, - "epoch": 0.5442356831504584, - "flos": 32380041173760.0, - "grad_norm": 2.245844605247971, - "language_loss": 0.67334735, - "learning_rate": 1.810810185460011e-06, - "loss": 0.69448429, - "num_input_tokens_seen": 194973915, - "step": 9052, - "time_per_iteration": 2.778211832046509 - }, - { - "auxiliary_loss_clip": 0.01121064, - "auxiliary_loss_mlp": 0.01033478, - "balance_loss_clip": 1.04266417, - "balance_loss_mlp": 1.02010286, - "epoch": 0.5442958064031264, - "flos": 24164290056960.0, - "grad_norm": 1.8200748140762566, - "language_loss": 0.92835879, - "learning_rate": 1.810422473773436e-06, - "loss": 0.9499042, - "num_input_tokens_seen": 194990170, - "step": 9053, - "time_per_iteration": 2.6110095977783203 - }, - { - "auxiliary_loss_clip": 0.01093907, - "auxiliary_loss_mlp": 0.01034949, - "balance_loss_clip": 1.04024363, - "balance_loss_mlp": 1.02203834, - "epoch": 0.5443559296557944, - "flos": 18764690065920.0, - "grad_norm": 2.3950140888374687, - "language_loss": 0.83948398, - "learning_rate": 1.8100347692756595e-06, - "loss": 0.86077261, - "num_input_tokens_seen": 195006395, - "step": 9054, - "time_per_iteration": 2.6261367797851562 - }, - { - "auxiliary_loss_clip": 0.01090647, - "auxiliary_loss_mlp": 0.01034581, - "balance_loss_clip": 1.03965771, - "balance_loss_mlp": 1.02094352, - "epoch": 0.5444160529084624, - "flos": 22632556435200.0, - "grad_norm": 2.6065175825707327, - "language_loss": 0.68213475, - "learning_rate": 1.8096470719813836e-06, - "loss": 0.70338708, - "num_input_tokens_seen": 195025080, - "step": 9055, - "time_per_iteration": 2.623518705368042 - }, - { - "auxiliary_loss_clip": 0.01000083, - "auxiliary_loss_mlp": 0.00999074, - "balance_loss_clip": 1.01110244, - "balance_loss_mlp": 0.99770337, - "epoch": 0.5444761761611303, - "flos": 69671909600640.0, - "grad_norm": 0.7426728731430834, - "language_loss": 0.57650024, - "learning_rate": 1.80925938190531e-06, - "loss": 0.59649181, - "num_input_tokens_seen": 195085725, - "step": 9056, - "time_per_iteration": 3.2228453159332275 - }, - { - "auxiliary_loss_clip": 0.01087409, - "auxiliary_loss_mlp": 0.01036027, - "balance_loss_clip": 1.04208684, - "balance_loss_mlp": 1.02234185, - "epoch": 0.5445362994137983, - "flos": 14278442129280.0, - "grad_norm": 1.75653480561415, - "language_loss": 0.69749284, - "learning_rate": 1.8088716990621395e-06, - "loss": 0.71872711, - "num_input_tokens_seen": 195102585, - "step": 9057, - "time_per_iteration": 2.7110843658447266 - }, - { - "auxiliary_loss_clip": 0.01106044, - "auxiliary_loss_mlp": 0.01038419, - "balance_loss_clip": 1.04014075, - "balance_loss_mlp": 1.02472818, - "epoch": 0.5445964226664662, - "flos": 28986195611520.0, - "grad_norm": 2.0738816921888366, - "language_loss": 0.75373238, - "learning_rate": 1.8084840234665738e-06, - "loss": 0.775177, - "num_input_tokens_seen": 195120055, - "step": 9058, - "time_per_iteration": 2.7001023292541504 - }, - { - "auxiliary_loss_clip": 0.01003793, - "auxiliary_loss_mlp": 0.01003874, - "balance_loss_clip": 1.01181531, - "balance_loss_mlp": 1.00230026, - "epoch": 0.5446565459191343, - "flos": 68620230270720.0, - "grad_norm": 0.7925901763726337, - "language_loss": 0.6261481, - "learning_rate": 1.808096355133312e-06, - "loss": 0.6462248, - "num_input_tokens_seen": 195181045, - "step": 9059, - "time_per_iteration": 3.355748414993286 - }, - { - "auxiliary_loss_clip": 0.01107073, - "auxiliary_loss_mlp": 0.0103287, - "balance_loss_clip": 1.0414511, - "balance_loss_mlp": 1.01922059, - "epoch": 0.5447166691718022, - "flos": 16216469464320.0, - "grad_norm": 1.790354282478879, - "language_loss": 0.79365647, - "learning_rate": 1.8077086940770572e-06, - "loss": 0.81505585, - "num_input_tokens_seen": 195198840, - "step": 9060, - "time_per_iteration": 2.6523141860961914 - }, - { - "auxiliary_loss_clip": 0.01111799, - "auxiliary_loss_mlp": 0.01033132, - "balance_loss_clip": 1.04219317, - "balance_loss_mlp": 1.01976824, - "epoch": 0.5447767924244702, - "flos": 25849039616640.0, - "grad_norm": 1.7487339019361072, - "language_loss": 0.8006283, - "learning_rate": 1.8073210403125072e-06, - "loss": 0.82207763, - "num_input_tokens_seen": 195218720, - "step": 9061, - "time_per_iteration": 2.660477876663208 - }, - { - "auxiliary_loss_clip": 0.01107514, - "auxiliary_loss_mlp": 0.01028575, - "balance_loss_clip": 1.04152489, - "balance_loss_mlp": 1.01595628, - "epoch": 0.5448369156771381, - "flos": 19677718897920.0, - "grad_norm": 1.667542325640746, - "language_loss": 0.8699556, - "learning_rate": 1.8069333938543627e-06, - "loss": 0.89131653, - "num_input_tokens_seen": 195235770, - "step": 9062, - "time_per_iteration": 2.6527698040008545 - }, - { - "auxiliary_loss_clip": 0.0109274, - "auxiliary_loss_mlp": 0.01037371, - "balance_loss_clip": 1.03916395, - "balance_loss_mlp": 1.02188551, - "epoch": 0.5448970389298061, - "flos": 19281804215040.0, - "grad_norm": 1.6766222611874342, - "language_loss": 0.82069784, - "learning_rate": 1.8065457547173233e-06, - "loss": 0.84199893, - "num_input_tokens_seen": 195254870, - "step": 9063, - "time_per_iteration": 2.651977062225342 - }, - { - "auxiliary_loss_clip": 0.01118028, - "auxiliary_loss_mlp": 0.01032916, - "balance_loss_clip": 1.0406127, - "balance_loss_mlp": 1.01958823, - "epoch": 0.544957162182474, - "flos": 20991690316800.0, - "grad_norm": 1.769153488212037, - "language_loss": 0.63484013, - "learning_rate": 1.8061581229160878e-06, - "loss": 0.65634954, - "num_input_tokens_seen": 195273390, - "step": 9064, - "time_per_iteration": 2.595914602279663 - }, - { - "auxiliary_loss_clip": 0.0112242, - "auxiliary_loss_mlp": 0.01037085, - "balance_loss_clip": 1.04264021, - "balance_loss_mlp": 1.02337003, - "epoch": 0.545017285435142, - "flos": 25374587846400.0, - "grad_norm": 1.6143269954810184, - "language_loss": 0.79795569, - "learning_rate": 1.8057704984653566e-06, - "loss": 0.81955075, - "num_input_tokens_seen": 195295635, - "step": 9065, - "time_per_iteration": 2.647632360458374 - }, - { - "auxiliary_loss_clip": 0.01082455, - "auxiliary_loss_mlp": 0.01032837, - "balance_loss_clip": 1.04022825, - "balance_loss_mlp": 1.0211482, - "epoch": 0.54507740868781, - "flos": 19134749934720.0, - "grad_norm": 2.1024584454927626, - "language_loss": 0.77589709, - "learning_rate": 1.805382881379827e-06, - "loss": 0.79705, - "num_input_tokens_seen": 195312545, - "step": 9066, - "time_per_iteration": 2.750904083251953 - }, - { - "auxiliary_loss_clip": 0.01106868, - "auxiliary_loss_mlp": 0.0103139, - "balance_loss_clip": 1.04005289, - "balance_loss_mlp": 1.01794958, - "epoch": 0.545137531940478, - "flos": 26249802635520.0, - "grad_norm": 2.0527073359497665, - "language_loss": 0.75859725, - "learning_rate": 1.8049952716741975e-06, - "loss": 0.77997983, - "num_input_tokens_seen": 195332955, - "step": 9067, - "time_per_iteration": 2.68332839012146 - }, - { - "auxiliary_loss_clip": 0.0108798, - "auxiliary_loss_mlp": 0.01038091, - "balance_loss_clip": 1.04256892, - "balance_loss_mlp": 1.02183652, - "epoch": 0.545197655193146, - "flos": 37555629995520.0, - "grad_norm": 6.876378009840058, - "language_loss": 0.63596183, - "learning_rate": 1.8046076693631682e-06, - "loss": 0.65722257, - "num_input_tokens_seen": 195355930, - "step": 9068, - "time_per_iteration": 2.893052816390991 - }, - { - "auxiliary_loss_clip": 0.01080095, - "auxiliary_loss_mlp": 0.01041608, - "balance_loss_clip": 1.0446372, - "balance_loss_mlp": 1.02935874, - "epoch": 0.5452577784458139, - "flos": 26031250333440.0, - "grad_norm": 1.5002235169223528, - "language_loss": 0.7186054, - "learning_rate": 1.8042200744614343e-06, - "loss": 0.73982239, - "num_input_tokens_seen": 195376445, - "step": 9069, - "time_per_iteration": 2.7437844276428223 - }, - { - "auxiliary_loss_clip": 0.01118098, - "auxiliary_loss_mlp": 0.01028881, - "balance_loss_clip": 1.04397726, - "balance_loss_mlp": 1.0169543, - "epoch": 0.5453179016984819, - "flos": 17639034675840.0, - "grad_norm": 1.9248359915141238, - "language_loss": 0.73836279, - "learning_rate": 1.8038324869836957e-06, - "loss": 0.75983256, - "num_input_tokens_seen": 195393725, - "step": 9070, - "time_per_iteration": 2.629026174545288 - }, - { - "auxiliary_loss_clip": 0.01104842, - "auxiliary_loss_mlp": 0.01038375, - "balance_loss_clip": 1.0405302, - "balance_loss_mlp": 1.02508879, - "epoch": 0.5453780249511498, - "flos": 23216679406080.0, - "grad_norm": 2.895965777257026, - "language_loss": 0.60386193, - "learning_rate": 1.8034449069446489e-06, - "loss": 0.62529415, - "num_input_tokens_seen": 195411380, - "step": 9071, - "time_per_iteration": 2.787898540496826 - }, - { - "auxiliary_loss_clip": 0.0103628, - "auxiliary_loss_mlp": 0.01019994, - "balance_loss_clip": 1.01031959, - "balance_loss_mlp": 1.01858091, - "epoch": 0.5454381482038179, - "flos": 68696504801280.0, - "grad_norm": 0.701915733274622, - "language_loss": 0.57096583, - "learning_rate": 1.80305733435899e-06, - "loss": 0.59152853, - "num_input_tokens_seen": 195482015, - "step": 9072, - "time_per_iteration": 3.3096070289611816 - }, - { - "auxiliary_loss_clip": 0.01088718, - "auxiliary_loss_mlp": 0.0104092, - "balance_loss_clip": 1.03829658, - "balance_loss_mlp": 1.02696621, - "epoch": 0.5454982714564858, - "flos": 13260626346240.0, - "grad_norm": 1.6985686628313852, - "language_loss": 0.6941787, - "learning_rate": 1.8026697692414174e-06, - "loss": 0.71547508, - "num_input_tokens_seen": 195500440, - "step": 9073, - "time_per_iteration": 5.942334413528442 - }, - { - "auxiliary_loss_clip": 0.01094077, - "auxiliary_loss_mlp": 0.01042156, - "balance_loss_clip": 1.03799677, - "balance_loss_mlp": 1.02981734, - "epoch": 0.5455583947091538, - "flos": 21835878733440.0, - "grad_norm": 1.7477774368009211, - "language_loss": 0.7124452, - "learning_rate": 1.802282211606627e-06, - "loss": 0.73380756, - "num_input_tokens_seen": 195520860, - "step": 9074, - "time_per_iteration": 2.6760778427124023 - }, - { - "auxiliary_loss_clip": 0.0110625, - "auxiliary_loss_mlp": 0.01038683, - "balance_loss_clip": 1.04050887, - "balance_loss_mlp": 1.02611828, - "epoch": 0.5456185179618217, - "flos": 17817438551040.0, - "grad_norm": 1.854490114521215, - "language_loss": 0.68543398, - "learning_rate": 1.8018946614693148e-06, - "loss": 0.70688331, - "num_input_tokens_seen": 195538615, - "step": 9075, - "time_per_iteration": 4.19740891456604 - }, - { - "auxiliary_loss_clip": 0.01109026, - "auxiliary_loss_mlp": 0.01034737, - "balance_loss_clip": 1.04411292, - "balance_loss_mlp": 1.02303696, - "epoch": 0.5456786412144897, - "flos": 21069401391360.0, - "grad_norm": 1.8542702472429493, - "language_loss": 0.80530715, - "learning_rate": 1.8015071188441768e-06, - "loss": 0.82674479, - "num_input_tokens_seen": 195557460, - "step": 9076, - "time_per_iteration": 2.6821329593658447 - }, - { - "auxiliary_loss_clip": 0.01109363, - "auxiliary_loss_mlp": 0.01032135, - "balance_loss_clip": 1.04109383, - "balance_loss_mlp": 1.01970196, - "epoch": 0.5457387644671576, - "flos": 23294965098240.0, - "grad_norm": 1.6176910715306643, - "language_loss": 0.80137533, - "learning_rate": 1.8011195837459089e-06, - "loss": 0.82279032, - "num_input_tokens_seen": 195577985, - "step": 9077, - "time_per_iteration": 2.6378607749938965 - }, - { - "auxiliary_loss_clip": 0.01103737, - "auxiliary_loss_mlp": 0.01035436, - "balance_loss_clip": 1.04032636, - "balance_loss_mlp": 1.02293682, - "epoch": 0.5457988877198257, - "flos": 21617039122560.0, - "grad_norm": 2.2183628478116346, - "language_loss": 0.67997038, - "learning_rate": 1.8007320561892064e-06, - "loss": 0.70136213, - "num_input_tokens_seen": 195597620, - "step": 9078, - "time_per_iteration": 4.261017560958862 - }, - { - "auxiliary_loss_clip": 0.01114465, - "auxiliary_loss_mlp": 0.01039359, - "balance_loss_clip": 1.04379976, - "balance_loss_mlp": 1.02579284, - "epoch": 0.5458590109724936, - "flos": 23762485543680.0, - "grad_norm": 1.8448340723101526, - "language_loss": 0.80507636, - "learning_rate": 1.800344536188764e-06, - "loss": 0.82661462, - "num_input_tokens_seen": 195615910, - "step": 9079, - "time_per_iteration": 2.6384685039520264 - }, - { - "auxiliary_loss_clip": 0.01124513, - "auxiliary_loss_mlp": 0.01034947, - "balance_loss_clip": 1.04221058, - "balance_loss_mlp": 1.02032018, - "epoch": 0.5459191342251616, - "flos": 24424283675520.0, - "grad_norm": 1.6928746227882223, - "language_loss": 0.75848919, - "learning_rate": 1.799957023759277e-06, - "loss": 0.78008378, - "num_input_tokens_seen": 195635620, - "step": 9080, - "time_per_iteration": 2.6506381034851074 - }, - { - "auxiliary_loss_clip": 0.01080273, - "auxiliary_loss_mlp": 0.01037485, - "balance_loss_clip": 1.03795743, - "balance_loss_mlp": 1.0230484, - "epoch": 0.5459792574778296, - "flos": 23623009032960.0, - "grad_norm": 2.0769433103494737, - "language_loss": 0.83164978, - "learning_rate": 1.7995695189154392e-06, - "loss": 0.85282731, - "num_input_tokens_seen": 195652495, - "step": 9081, - "time_per_iteration": 2.705381393432617 - }, - { - "auxiliary_loss_clip": 0.0112596, - "auxiliary_loss_mlp": 0.0103236, - "balance_loss_clip": 1.04470921, - "balance_loss_mlp": 1.01884151, - "epoch": 0.5460393807304975, - "flos": 19135540033920.0, - "grad_norm": 1.688461125774873, - "language_loss": 0.70063365, - "learning_rate": 1.7991820216719461e-06, - "loss": 0.72221684, - "num_input_tokens_seen": 195671965, - "step": 9082, - "time_per_iteration": 2.6176023483276367 - }, - { - "auxiliary_loss_clip": 0.01115168, - "auxiliary_loss_mlp": 0.01030163, - "balance_loss_clip": 1.03972983, - "balance_loss_mlp": 1.01709151, - "epoch": 0.5460995039831655, - "flos": 35918534805120.0, - "grad_norm": 1.559424348169526, - "language_loss": 0.66653717, - "learning_rate": 1.7987945320434906e-06, - "loss": 0.68799043, - "num_input_tokens_seen": 195694725, - "step": 9083, - "time_per_iteration": 2.710636854171753 - }, - { - "auxiliary_loss_clip": 0.01091037, - "auxiliary_loss_mlp": 0.01033037, - "balance_loss_clip": 1.03879106, - "balance_loss_mlp": 1.01998401, - "epoch": 0.5461596272358334, - "flos": 26759231274240.0, - "grad_norm": 1.7271294710436846, - "language_loss": 0.78584135, - "learning_rate": 1.798407050044766e-06, - "loss": 0.80708218, - "num_input_tokens_seen": 195714090, - "step": 9084, - "time_per_iteration": 2.6876227855682373 - }, - { - "auxiliary_loss_clip": 0.01111571, - "auxiliary_loss_mlp": 0.01037411, - "balance_loss_clip": 1.042117, - "balance_loss_mlp": 1.02412558, - "epoch": 0.5462197504885015, - "flos": 20886580143360.0, - "grad_norm": 2.0534049917888852, - "language_loss": 0.75331509, - "learning_rate": 1.7980195756904675e-06, - "loss": 0.77480489, - "num_input_tokens_seen": 195733585, - "step": 9085, - "time_per_iteration": 2.710315704345703 - }, - { - "auxiliary_loss_clip": 0.01098293, - "auxiliary_loss_mlp": 0.01035765, - "balance_loss_clip": 1.0397166, - "balance_loss_mlp": 1.02216959, - "epoch": 0.5462798737411694, - "flos": 25804976607360.0, - "grad_norm": 2.0038443585531174, - "language_loss": 0.75082123, - "learning_rate": 1.7976321089952857e-06, - "loss": 0.7721619, - "num_input_tokens_seen": 195752820, - "step": 9086, - "time_per_iteration": 2.7101428508758545 - }, - { - "auxiliary_loss_clip": 0.01102837, - "auxiliary_loss_mlp": 0.01035671, - "balance_loss_clip": 1.03951812, - "balance_loss_mlp": 1.02227759, - "epoch": 0.5463399969938374, - "flos": 25775027642880.0, - "grad_norm": 1.6829711206227542, - "language_loss": 0.77097058, - "learning_rate": 1.7972446499739155e-06, - "loss": 0.79235566, - "num_input_tokens_seen": 195773740, - "step": 9087, - "time_per_iteration": 2.6439003944396973 - }, - { - "auxiliary_loss_clip": 0.01114018, - "auxiliary_loss_mlp": 0.01042361, - "balance_loss_clip": 1.04376245, - "balance_loss_mlp": 1.02707863, - "epoch": 0.5464001202465053, - "flos": 18843298980480.0, - "grad_norm": 1.9617582228039958, - "language_loss": 0.77464199, - "learning_rate": 1.7968571986410484e-06, - "loss": 0.79620576, - "num_input_tokens_seen": 195792125, - "step": 9088, - "time_per_iteration": 2.62850022315979 - }, - { - "auxiliary_loss_clip": 0.00993547, - "auxiliary_loss_mlp": 0.00999929, - "balance_loss_clip": 1.02517176, - "balance_loss_mlp": 0.99852258, - "epoch": 0.5464602434991733, - "flos": 69049541623680.0, - "grad_norm": 0.7268281858475805, - "language_loss": 0.57717931, - "learning_rate": 1.7964697550113758e-06, - "loss": 0.59711409, - "num_input_tokens_seen": 195854935, - "step": 9089, - "time_per_iteration": 3.532050371170044 - }, - { - "auxiliary_loss_clip": 0.01085451, - "auxiliary_loss_mlp": 0.01038489, - "balance_loss_clip": 1.03805399, - "balance_loss_mlp": 1.02422571, - "epoch": 0.5465203667518412, - "flos": 27560039040000.0, - "grad_norm": 1.7593878993297172, - "language_loss": 0.76682436, - "learning_rate": 1.7960823190995918e-06, - "loss": 0.78806376, - "num_input_tokens_seen": 195874715, - "step": 9090, - "time_per_iteration": 3.0779287815093994 - }, - { - "auxiliary_loss_clip": 0.01106384, - "auxiliary_loss_mlp": 0.01039408, - "balance_loss_clip": 1.03928399, - "balance_loss_mlp": 1.0233984, - "epoch": 0.5465804900045093, - "flos": 21210206705280.0, - "grad_norm": 1.8843979676244431, - "language_loss": 0.74037111, - "learning_rate": 1.7956948909203855e-06, - "loss": 0.76182902, - "num_input_tokens_seen": 195892610, - "step": 9091, - "time_per_iteration": 2.6843886375427246 - }, - { - "auxiliary_loss_clip": 0.01103772, - "auxiliary_loss_mlp": 0.01037785, - "balance_loss_clip": 1.04514658, - "balance_loss_mlp": 1.02397454, - "epoch": 0.5466406132571772, - "flos": 22488949860480.0, - "grad_norm": 1.8168674877061988, - "language_loss": 0.78466463, - "learning_rate": 1.7953074704884498e-06, - "loss": 0.80608022, - "num_input_tokens_seen": 195911085, - "step": 9092, - "time_per_iteration": 2.6951024532318115 - }, - { - "auxiliary_loss_clip": 0.01125215, - "auxiliary_loss_mlp": 0.01034303, - "balance_loss_clip": 1.04363537, - "balance_loss_mlp": 1.01997435, - "epoch": 0.5467007365098452, - "flos": 17675843137920.0, - "grad_norm": 2.188123152779193, - "language_loss": 0.74691254, - "learning_rate": 1.794920057818476e-06, - "loss": 0.76850772, - "num_input_tokens_seen": 195929845, - "step": 9093, - "time_per_iteration": 2.596165657043457 - }, - { - "auxiliary_loss_clip": 0.01112494, - "auxiliary_loss_mlp": 0.01040653, - "balance_loss_clip": 1.04044032, - "balance_loss_mlp": 1.02444029, - "epoch": 0.5467608597625132, - "flos": 15698852524800.0, - "grad_norm": 2.4498750676664414, - "language_loss": 0.6874221, - "learning_rate": 1.7945326529251533e-06, - "loss": 0.70895356, - "num_input_tokens_seen": 195946350, - "step": 9094, - "time_per_iteration": 2.617203712463379 - }, - { - "auxiliary_loss_clip": 0.01100239, - "auxiliary_loss_mlp": 0.0103544, - "balance_loss_clip": 1.04255402, - "balance_loss_mlp": 1.02238083, - "epoch": 0.5468209830151811, - "flos": 24312816794880.0, - "grad_norm": 3.189829826251606, - "language_loss": 0.67888498, - "learning_rate": 1.7941452558231731e-06, - "loss": 0.70024174, - "num_input_tokens_seen": 195959840, - "step": 9095, - "time_per_iteration": 2.709214687347412 - }, - { - "auxiliary_loss_clip": 0.01085979, - "auxiliary_loss_mlp": 0.01036228, - "balance_loss_clip": 1.0412364, - "balance_loss_mlp": 1.0228703, - "epoch": 0.5468811062678491, - "flos": 29166323339520.0, - "grad_norm": 1.772487886139895, - "language_loss": 0.66687673, - "learning_rate": 1.7937578665272256e-06, - "loss": 0.68809879, - "num_input_tokens_seen": 195981125, - "step": 9096, - "time_per_iteration": 2.768289804458618 - }, - { - "auxiliary_loss_clip": 0.01013718, - "auxiliary_loss_mlp": 0.01003083, - "balance_loss_clip": 1.01639581, - "balance_loss_mlp": 1.00179529, - "epoch": 0.546941229520517, - "flos": 67867037982720.0, - "grad_norm": 0.7380745619271847, - "language_loss": 0.57528484, - "learning_rate": 1.7933704850520007e-06, - "loss": 0.59545285, - "num_input_tokens_seen": 196038880, - "step": 9097, - "time_per_iteration": 3.353034496307373 - }, - { - "auxiliary_loss_clip": 0.01023908, - "auxiliary_loss_mlp": 0.00999165, - "balance_loss_clip": 1.01245689, - "balance_loss_mlp": 0.99754351, - "epoch": 0.5470013527731851, - "flos": 58270306625280.0, - "grad_norm": 0.9199423088856966, - "language_loss": 0.64710629, - "learning_rate": 1.7929831114121868e-06, - "loss": 0.66733694, - "num_input_tokens_seen": 196099215, - "step": 9098, - "time_per_iteration": 3.1356828212738037 - }, - { - "auxiliary_loss_clip": 0.01114825, - "auxiliary_loss_mlp": 0.01037808, - "balance_loss_clip": 1.04415989, - "balance_loss_mlp": 1.02378869, - "epoch": 0.547061476025853, - "flos": 22965915582720.0, - "grad_norm": 2.132166365058938, - "language_loss": 0.73123235, - "learning_rate": 1.7925957456224753e-06, - "loss": 0.75275862, - "num_input_tokens_seen": 196120370, - "step": 9099, - "time_per_iteration": 2.662252426147461 - }, - { - "auxiliary_loss_clip": 0.01097751, - "auxiliary_loss_mlp": 0.01035706, - "balance_loss_clip": 1.04278708, - "balance_loss_mlp": 1.02327275, - "epoch": 0.547121599278521, - "flos": 29968244426880.0, - "grad_norm": 1.880355780986747, - "language_loss": 0.72515011, - "learning_rate": 1.7922083876975537e-06, - "loss": 0.74648476, - "num_input_tokens_seen": 196139075, - "step": 9100, - "time_per_iteration": 2.859636068344116 - }, - { - "auxiliary_loss_clip": 0.01106059, - "auxiliary_loss_mlp": 0.00770753, - "balance_loss_clip": 1.04162157, - "balance_loss_mlp": 1.00017691, - "epoch": 0.5471817225311889, - "flos": 36535443914880.0, - "grad_norm": 1.8314110929237357, - "language_loss": 0.68211091, - "learning_rate": 1.7918210376521102e-06, - "loss": 0.70087898, - "num_input_tokens_seen": 196159990, - "step": 9101, - "time_per_iteration": 2.747811794281006 - }, - { - "auxiliary_loss_clip": 0.01123228, - "auxiliary_loss_mlp": 0.01034884, - "balance_loss_clip": 1.04393971, - "balance_loss_mlp": 1.02121687, - "epoch": 0.5472418457838569, - "flos": 25775243124480.0, - "grad_norm": 1.907951209204745, - "language_loss": 0.77796781, - "learning_rate": 1.7914336955008343e-06, - "loss": 0.79954892, - "num_input_tokens_seen": 196180570, - "step": 9102, - "time_per_iteration": 2.6425788402557373 - }, - { - "auxiliary_loss_clip": 0.01087581, - "auxiliary_loss_mlp": 0.01039397, - "balance_loss_clip": 1.04114008, - "balance_loss_mlp": 1.02447212, - "epoch": 0.5473019690365248, - "flos": 27887687925120.0, - "grad_norm": 1.553646996990172, - "language_loss": 0.72080058, - "learning_rate": 1.791046361258413e-06, - "loss": 0.74207032, - "num_input_tokens_seen": 196200300, - "step": 9103, - "time_per_iteration": 2.7307486534118652 - }, - { - "auxiliary_loss_clip": 0.01088884, - "auxiliary_loss_mlp": 0.01031551, - "balance_loss_clip": 1.0425241, - "balance_loss_mlp": 1.01806211, - "epoch": 0.5473620922891929, - "flos": 57631490219520.0, - "grad_norm": 1.4283303897304696, - "language_loss": 0.65195155, - "learning_rate": 1.7906590349395356e-06, - "loss": 0.67315584, - "num_input_tokens_seen": 196228525, - "step": 9104, - "time_per_iteration": 3.0792930126190186 - }, - { - "auxiliary_loss_clip": 0.01109949, - "auxiliary_loss_mlp": 0.0103298, - "balance_loss_clip": 1.04480743, - "balance_loss_mlp": 1.01883578, - "epoch": 0.5474222155418608, - "flos": 19354056422400.0, - "grad_norm": 1.90483998435302, - "language_loss": 0.82428771, - "learning_rate": 1.790271716558888e-06, - "loss": 0.84571701, - "num_input_tokens_seen": 196247690, - "step": 9105, - "time_per_iteration": 3.3235061168670654 - }, - { - "auxiliary_loss_clip": 0.01119165, - "auxiliary_loss_mlp": 0.01030088, - "balance_loss_clip": 1.04210079, - "balance_loss_mlp": 1.01735604, - "epoch": 0.5474823387945288, - "flos": 25120448144640.0, - "grad_norm": 1.6592382133296117, - "language_loss": 0.80052161, - "learning_rate": 1.7898844061311575e-06, - "loss": 0.82201409, - "num_input_tokens_seen": 196268555, - "step": 9106, - "time_per_iteration": 2.7082676887512207 - }, - { - "auxiliary_loss_clip": 0.01115376, - "auxiliary_loss_mlp": 0.01036861, - "balance_loss_clip": 1.04689944, - "balance_loss_mlp": 1.02419519, - "epoch": 0.5475424620471967, - "flos": 18004174381440.0, - "grad_norm": 1.7933883779040884, - "language_loss": 0.69402343, - "learning_rate": 1.7894971036710322e-06, - "loss": 0.71554577, - "num_input_tokens_seen": 196285585, - "step": 9107, - "time_per_iteration": 2.626214027404785 - }, - { - "auxiliary_loss_clip": 0.01115289, - "auxiliary_loss_mlp": 0.01035057, - "balance_loss_clip": 1.04319263, - "balance_loss_mlp": 1.02166939, - "epoch": 0.5476025852998647, - "flos": 22309324922880.0, - "grad_norm": 2.6929722220667824, - "language_loss": 0.63537276, - "learning_rate": 1.789109809193197e-06, - "loss": 0.65687621, - "num_input_tokens_seen": 196305085, - "step": 9108, - "time_per_iteration": 2.6056766510009766 - }, - { - "auxiliary_loss_clip": 0.01122102, - "auxiliary_loss_mlp": 0.01029913, - "balance_loss_clip": 1.0446291, - "balance_loss_mlp": 1.01750922, - "epoch": 0.5476627085525327, - "flos": 20120497850880.0, - "grad_norm": 1.7311986454715018, - "language_loss": 0.75234431, - "learning_rate": 1.7887225227123396e-06, - "loss": 0.77386445, - "num_input_tokens_seen": 196323945, - "step": 9109, - "time_per_iteration": 2.562833786010742 - }, - { - "auxiliary_loss_clip": 0.01093609, - "auxiliary_loss_mlp": 0.01035669, - "balance_loss_clip": 1.04307365, - "balance_loss_mlp": 1.02143562, - "epoch": 0.5477228318052006, - "flos": 17712579772800.0, - "grad_norm": 1.7887859684809904, - "language_loss": 0.77939326, - "learning_rate": 1.7883352442431457e-06, - "loss": 0.800686, - "num_input_tokens_seen": 196342200, - "step": 9110, - "time_per_iteration": 2.62839674949646 - }, - { - "auxiliary_loss_clip": 0.01106302, - "auxiliary_loss_mlp": 0.01032426, - "balance_loss_clip": 1.04262304, - "balance_loss_mlp": 1.01997423, - "epoch": 0.5477829550578687, - "flos": 25848895962240.0, - "grad_norm": 1.525983194059855, - "language_loss": 0.71175343, - "learning_rate": 1.7879479738002993e-06, - "loss": 0.73314071, - "num_input_tokens_seen": 196362940, - "step": 9111, - "time_per_iteration": 2.664486885070801 - }, - { - "auxiliary_loss_clip": 0.01111586, - "auxiliary_loss_mlp": 0.01044961, - "balance_loss_clip": 1.0436976, - "balance_loss_mlp": 1.0317409, - "epoch": 0.5478430783105366, - "flos": 23039676161280.0, - "grad_norm": 1.5197619181850293, - "language_loss": 0.71096945, - "learning_rate": 1.7875607113984876e-06, - "loss": 0.73253489, - "num_input_tokens_seen": 196383070, - "step": 9112, - "time_per_iteration": 2.7334086894989014 - }, - { - "auxiliary_loss_clip": 0.01067523, - "auxiliary_loss_mlp": 0.01034936, - "balance_loss_clip": 1.03873658, - "balance_loss_mlp": 1.02179968, - "epoch": 0.5479032015632046, - "flos": 16071210864000.0, - "grad_norm": 2.172543516099556, - "language_loss": 0.87877554, - "learning_rate": 1.7871734570523953e-06, - "loss": 0.89980012, - "num_input_tokens_seen": 196398485, - "step": 9113, - "time_per_iteration": 5.9666571617126465 - }, - { - "auxiliary_loss_clip": 0.01070074, - "auxiliary_loss_mlp": 0.01032166, - "balance_loss_clip": 1.04229951, - "balance_loss_mlp": 1.01853991, - "epoch": 0.5479633248158725, - "flos": 24278701852800.0, - "grad_norm": 1.4694487805740626, - "language_loss": 0.73041236, - "learning_rate": 1.7867862107767067e-06, - "loss": 0.7514348, - "num_input_tokens_seen": 196417725, - "step": 9114, - "time_per_iteration": 4.333765745162964 - }, - { - "auxiliary_loss_clip": 0.01093195, - "auxiliary_loss_mlp": 0.00770887, - "balance_loss_clip": 1.03821266, - "balance_loss_mlp": 1.00027823, - "epoch": 0.5480234480685405, - "flos": 26358216860160.0, - "grad_norm": 1.6145561495164014, - "language_loss": 0.72155976, - "learning_rate": 1.7863989725861066e-06, - "loss": 0.74020058, - "num_input_tokens_seen": 196437840, - "step": 9115, - "time_per_iteration": 2.6793766021728516 - }, - { - "auxiliary_loss_clip": 0.01084634, - "auxiliary_loss_mlp": 0.00774539, - "balance_loss_clip": 1.03983831, - "balance_loss_mlp": 1.00038791, - "epoch": 0.5480835713212084, - "flos": 22055077480320.0, - "grad_norm": 1.7266092862770852, - "language_loss": 0.72229278, - "learning_rate": 1.7860117424952781e-06, - "loss": 0.74088448, - "num_input_tokens_seen": 196457300, - "step": 9116, - "time_per_iteration": 2.738142490386963 - }, - { - "auxiliary_loss_clip": 0.01095127, - "auxiliary_loss_mlp": 0.01039685, - "balance_loss_clip": 1.04102373, - "balance_loss_mlp": 1.0259639, - "epoch": 0.5481436945738765, - "flos": 25301042749440.0, - "grad_norm": 4.413930764564679, - "language_loss": 0.76158273, - "learning_rate": 1.7856245205189063e-06, - "loss": 0.78293079, - "num_input_tokens_seen": 196476720, - "step": 9117, - "time_per_iteration": 2.693359613418579 - }, - { - "auxiliary_loss_clip": 0.01070482, - "auxiliary_loss_mlp": 0.01035701, - "balance_loss_clip": 1.03514457, - "balance_loss_mlp": 1.02292752, - "epoch": 0.5482038178265444, - "flos": 33580857772800.0, - "grad_norm": 1.575829874902699, - "language_loss": 0.62537289, - "learning_rate": 1.785237306671674e-06, - "loss": 0.64643478, - "num_input_tokens_seen": 196496765, - "step": 9118, - "time_per_iteration": 4.42430305480957 - }, - { - "auxiliary_loss_clip": 0.01124628, - "auxiliary_loss_mlp": 0.01036624, - "balance_loss_clip": 1.04479444, - "balance_loss_mlp": 1.02259278, - "epoch": 0.5482639410792124, - "flos": 19026192055680.0, - "grad_norm": 2.694246810130355, - "language_loss": 0.79018009, - "learning_rate": 1.7848501009682646e-06, - "loss": 0.81179261, - "num_input_tokens_seen": 196516220, - "step": 9119, - "time_per_iteration": 2.606593608856201 - }, - { - "auxiliary_loss_clip": 0.01092726, - "auxiliary_loss_mlp": 0.00769453, - "balance_loss_clip": 1.04150975, - "balance_loss_mlp": 1.00022948, - "epoch": 0.5483240643318803, - "flos": 25410318900480.0, - "grad_norm": 1.8682271604905119, - "language_loss": 0.82534289, - "learning_rate": 1.7844629034233604e-06, - "loss": 0.8439647, - "num_input_tokens_seen": 196533860, - "step": 9120, - "time_per_iteration": 2.694546699523926 - }, - { - "auxiliary_loss_clip": 0.01089359, - "auxiliary_loss_mlp": 0.01039031, - "balance_loss_clip": 1.04395008, - "balance_loss_mlp": 1.02531016, - "epoch": 0.5483841875845483, - "flos": 21466896272640.0, - "grad_norm": 1.8000226938726367, - "language_loss": 0.80031526, - "learning_rate": 1.7840757140516455e-06, - "loss": 0.82159919, - "num_input_tokens_seen": 196551305, - "step": 9121, - "time_per_iteration": 2.7422945499420166 - }, - { - "auxiliary_loss_clip": 0.01076146, - "auxiliary_loss_mlp": 0.01038978, - "balance_loss_clip": 1.03803313, - "balance_loss_mlp": 1.02408934, - "epoch": 0.5484443108372163, - "flos": 24747263792640.0, - "grad_norm": 1.9827939120507885, - "language_loss": 0.60996848, - "learning_rate": 1.7836885328678008e-06, - "loss": 0.63111973, - "num_input_tokens_seen": 196569420, - "step": 9122, - "time_per_iteration": 2.782677412033081 - }, - { - "auxiliary_loss_clip": 0.01106377, - "auxiliary_loss_mlp": 0.01038556, - "balance_loss_clip": 1.04853153, - "balance_loss_mlp": 1.0268079, - "epoch": 0.5485044340898843, - "flos": 25375377945600.0, - "grad_norm": 1.5587852273808862, - "language_loss": 0.71594763, - "learning_rate": 1.7833013598865084e-06, - "loss": 0.73739696, - "num_input_tokens_seen": 196590610, - "step": 9123, - "time_per_iteration": 2.756350517272949 - }, - { - "auxiliary_loss_clip": 0.01121133, - "auxiliary_loss_mlp": 0.01033494, - "balance_loss_clip": 1.04210067, - "balance_loss_mlp": 1.0208813, - "epoch": 0.5485645573425523, - "flos": 12641167370880.0, - "grad_norm": 2.3735658261361845, - "language_loss": 0.83559448, - "learning_rate": 1.7829141951224505e-06, - "loss": 0.85714072, - "num_input_tokens_seen": 196606495, - "step": 9124, - "time_per_iteration": 2.61197829246521 - }, - { - "auxiliary_loss_clip": 0.01094486, - "auxiliary_loss_mlp": 0.01033029, - "balance_loss_clip": 1.04321349, - "balance_loss_mlp": 1.01992834, - "epoch": 0.5486246805952202, - "flos": 28329425383680.0, - "grad_norm": 1.5486509111854319, - "language_loss": 0.80518043, - "learning_rate": 1.7825270385903075e-06, - "loss": 0.82645559, - "num_input_tokens_seen": 196626365, - "step": 9125, - "time_per_iteration": 2.773972749710083 - }, - { - "auxiliary_loss_clip": 0.01111849, - "auxiliary_loss_mlp": 0.01032276, - "balance_loss_clip": 1.04336679, - "balance_loss_mlp": 1.01903141, - "epoch": 0.5486848038478882, - "flos": 16800017817600.0, - "grad_norm": 4.333134351852335, - "language_loss": 0.74312758, - "learning_rate": 1.7821398903047617e-06, - "loss": 0.76456887, - "num_input_tokens_seen": 196644465, - "step": 9126, - "time_per_iteration": 2.654529333114624 - }, - { - "auxiliary_loss_clip": 0.01107646, - "auxiliary_loss_mlp": 0.01037249, - "balance_loss_clip": 1.03968537, - "balance_loss_mlp": 1.02193701, - "epoch": 0.5487449271005561, - "flos": 17236224581760.0, - "grad_norm": 2.710645426319007, - "language_loss": 0.66802239, - "learning_rate": 1.7817527502804928e-06, - "loss": 0.6894713, - "num_input_tokens_seen": 196659160, - "step": 9127, - "time_per_iteration": 2.615807294845581 - }, - { - "auxiliary_loss_clip": 0.01078683, - "auxiliary_loss_mlp": 0.01039383, - "balance_loss_clip": 1.03928149, - "balance_loss_mlp": 1.0249052, - "epoch": 0.5488050503532241, - "flos": 17340867878400.0, - "grad_norm": 2.0894273864631225, - "language_loss": 0.82909453, - "learning_rate": 1.781365618532181e-06, - "loss": 0.85027516, - "num_input_tokens_seen": 196677410, - "step": 9128, - "time_per_iteration": 2.681060791015625 - }, - { - "auxiliary_loss_clip": 0.01074302, - "auxiliary_loss_mlp": 0.01037438, - "balance_loss_clip": 1.03565645, - "balance_loss_mlp": 1.02254319, - "epoch": 0.548865173605892, - "flos": 17239169496960.0, - "grad_norm": 1.9025486027385248, - "language_loss": 0.74247289, - "learning_rate": 1.7809784950745078e-06, - "loss": 0.76359022, - "num_input_tokens_seen": 196696765, - "step": 9129, - "time_per_iteration": 2.681459426879883 - }, - { - "auxiliary_loss_clip": 0.01077104, - "auxiliary_loss_mlp": 0.01037347, - "balance_loss_clip": 1.03771412, - "balance_loss_mlp": 1.02210581, - "epoch": 0.5489252968585601, - "flos": 17456716218240.0, - "grad_norm": 3.0707794644461854, - "language_loss": 0.63489515, - "learning_rate": 1.7805913799221511e-06, - "loss": 0.65603966, - "num_input_tokens_seen": 196714895, - "step": 9130, - "time_per_iteration": 2.743734359741211 - }, - { - "auxiliary_loss_clip": 0.01124543, - "auxiliary_loss_mlp": 0.00771634, - "balance_loss_clip": 1.04329586, - "balance_loss_mlp": 1.00023222, - "epoch": 0.548985420111228, - "flos": 26323383646080.0, - "grad_norm": 1.7961020275949398, - "language_loss": 0.62998879, - "learning_rate": 1.7802042730897915e-06, - "loss": 0.64895058, - "num_input_tokens_seen": 196735510, - "step": 9131, - "time_per_iteration": 2.7136600017547607 - }, - { - "auxiliary_loss_clip": 0.01109321, - "auxiliary_loss_mlp": 0.01039388, - "balance_loss_clip": 1.04004657, - "balance_loss_mlp": 1.02416492, - "epoch": 0.549045543363896, - "flos": 18693730748160.0, - "grad_norm": 1.6718560353245449, - "language_loss": 0.7504952, - "learning_rate": 1.7798171745921084e-06, - "loss": 0.77198231, - "num_input_tokens_seen": 196752855, - "step": 9132, - "time_per_iteration": 2.686460494995117 - }, - { - "auxiliary_loss_clip": 0.01107553, - "auxiliary_loss_mlp": 0.01033276, - "balance_loss_clip": 1.03815818, - "balance_loss_mlp": 1.02046108, - "epoch": 0.5491056666165639, - "flos": 24717386655360.0, - "grad_norm": 1.5443073078358045, - "language_loss": 0.81107825, - "learning_rate": 1.7794300844437795e-06, - "loss": 0.83248657, - "num_input_tokens_seen": 196772230, - "step": 9133, - "time_per_iteration": 2.607304811477661 - }, - { - "auxiliary_loss_clip": 0.0109676, - "auxiliary_loss_mlp": 0.00770878, - "balance_loss_clip": 1.04211152, - "balance_loss_mlp": 1.00023055, - "epoch": 0.5491657898692319, - "flos": 21576926609280.0, - "grad_norm": 2.2143971437865275, - "language_loss": 0.69978988, - "learning_rate": 1.7790430026594841e-06, - "loss": 0.71846628, - "num_input_tokens_seen": 196790405, - "step": 9134, - "time_per_iteration": 2.655400037765503 - }, - { - "auxiliary_loss_clip": 0.01085592, - "auxiliary_loss_mlp": 0.0104003, - "balance_loss_clip": 1.03952289, - "balance_loss_mlp": 1.0263567, - "epoch": 0.5492259131219, - "flos": 50476432746240.0, - "grad_norm": 2.156005038881863, - "language_loss": 0.61240542, - "learning_rate": 1.7786559292539004e-06, - "loss": 0.63366163, - "num_input_tokens_seen": 196813785, - "step": 9135, - "time_per_iteration": 2.911567449569702 - }, - { - "auxiliary_loss_clip": 0.01112825, - "auxiliary_loss_mlp": 0.01036574, - "balance_loss_clip": 1.042696, - "balance_loss_mlp": 1.02169049, - "epoch": 0.5492860363745679, - "flos": 25119262995840.0, - "grad_norm": 1.746391133416305, - "language_loss": 0.72368252, - "learning_rate": 1.7782688642417058e-06, - "loss": 0.74517649, - "num_input_tokens_seen": 196834390, - "step": 9136, - "time_per_iteration": 2.6732101440429688 - }, - { - "auxiliary_loss_clip": 0.01060281, - "auxiliary_loss_mlp": 0.0104408, - "balance_loss_clip": 1.03961897, - "balance_loss_mlp": 1.02839267, - "epoch": 0.5493461596272359, - "flos": 22633777497600.0, - "grad_norm": 2.424259272269788, - "language_loss": 0.68256485, - "learning_rate": 1.7778818076375781e-06, - "loss": 0.70360851, - "num_input_tokens_seen": 196853290, - "step": 9137, - "time_per_iteration": 2.7947540283203125 - }, - { - "auxiliary_loss_clip": 0.01030828, - "auxiliary_loss_mlp": 0.01011299, - "balance_loss_clip": 1.01489806, - "balance_loss_mlp": 1.00992203, - "epoch": 0.5494062828799038, - "flos": 66151800754560.0, - "grad_norm": 0.7420439748923869, - "language_loss": 0.65270352, - "learning_rate": 1.7774947594561947e-06, - "loss": 0.67312479, - "num_input_tokens_seen": 196913120, - "step": 9138, - "time_per_iteration": 3.2256250381469727 - }, - { - "auxiliary_loss_clip": 0.0111256, - "auxiliary_loss_mlp": 0.01032689, - "balance_loss_clip": 1.04488194, - "balance_loss_mlp": 1.01902211, - "epoch": 0.5494664061325718, - "flos": 21105958458240.0, - "grad_norm": 1.8659950166851553, - "language_loss": 0.75243253, - "learning_rate": 1.7771077197122321e-06, - "loss": 0.77388501, - "num_input_tokens_seen": 196931530, - "step": 9139, - "time_per_iteration": 2.7239251136779785 - }, - { - "auxiliary_loss_clip": 0.01110681, - "auxiliary_loss_mlp": 0.01033207, - "balance_loss_clip": 1.04175556, - "balance_loss_mlp": 1.01932561, - "epoch": 0.5495265293852397, - "flos": 14392566616320.0, - "grad_norm": 1.6260992267363037, - "language_loss": 0.70765269, - "learning_rate": 1.7767206884203672e-06, - "loss": 0.72909158, - "num_input_tokens_seen": 196949430, - "step": 9140, - "time_per_iteration": 2.647174119949341 - }, - { - "auxiliary_loss_clip": 0.01090583, - "auxiliary_loss_mlp": 0.01036785, - "balance_loss_clip": 1.03731537, - "balance_loss_mlp": 1.02207434, - "epoch": 0.5495866526379077, - "flos": 25549148966400.0, - "grad_norm": 1.8985191424105816, - "language_loss": 0.7687242, - "learning_rate": 1.7763336655952762e-06, - "loss": 0.78999794, - "num_input_tokens_seen": 196968265, - "step": 9141, - "time_per_iteration": 2.65411639213562 - }, - { - "auxiliary_loss_clip": 0.01084812, - "auxiliary_loss_mlp": 0.01036963, - "balance_loss_clip": 1.0427072, - "balance_loss_mlp": 1.02342081, - "epoch": 0.5496467758905756, - "flos": 21317256213120.0, - "grad_norm": 2.1277262842794697, - "language_loss": 0.7463578, - "learning_rate": 1.7759466512516346e-06, - "loss": 0.7675755, - "num_input_tokens_seen": 196984930, - "step": 9142, - "time_per_iteration": 2.7200329303741455 - }, - { - "auxiliary_loss_clip": 0.01098795, - "auxiliary_loss_mlp": 0.01036884, - "balance_loss_clip": 1.04416585, - "balance_loss_mlp": 1.02186954, - "epoch": 0.5497068991432437, - "flos": 22233086305920.0, - "grad_norm": 5.155975597587774, - "language_loss": 0.7661894, - "learning_rate": 1.7755596454041192e-06, - "loss": 0.78754616, - "num_input_tokens_seen": 197002320, - "step": 9143, - "time_per_iteration": 2.6951520442962646 - }, - { - "auxiliary_loss_clip": 0.01091779, - "auxiliary_loss_mlp": 0.01037521, - "balance_loss_clip": 1.03912258, - "balance_loss_mlp": 1.02332926, - "epoch": 0.5497670223959116, - "flos": 18479093028480.0, - "grad_norm": 2.8186227807908466, - "language_loss": 0.79572552, - "learning_rate": 1.7751726480674044e-06, - "loss": 0.81701857, - "num_input_tokens_seen": 197020825, - "step": 9144, - "time_per_iteration": 2.661098003387451 - }, - { - "auxiliary_loss_clip": 0.01112844, - "auxiliary_loss_mlp": 0.01034684, - "balance_loss_clip": 1.04339552, - "balance_loss_mlp": 1.02086163, - "epoch": 0.5498271456485796, - "flos": 29205107049600.0, - "grad_norm": 1.6865855857111283, - "language_loss": 0.70998669, - "learning_rate": 1.7747856592561645e-06, - "loss": 0.731462, - "num_input_tokens_seen": 197040450, - "step": 9145, - "time_per_iteration": 2.6857175827026367 - }, - { - "auxiliary_loss_clip": 0.01109884, - "auxiliary_loss_mlp": 0.01033378, - "balance_loss_clip": 1.04158354, - "balance_loss_mlp": 1.02063489, - "epoch": 0.5498872689012475, - "flos": 34824372664320.0, - "grad_norm": 1.7292068512536125, - "language_loss": 0.70875257, - "learning_rate": 1.774398678985076e-06, - "loss": 0.73018515, - "num_input_tokens_seen": 197063930, - "step": 9146, - "time_per_iteration": 2.7719805240631104 - }, - { - "auxiliary_loss_clip": 0.01096176, - "auxiliary_loss_mlp": 0.01029792, - "balance_loss_clip": 1.04054928, - "balance_loss_mlp": 1.01708448, - "epoch": 0.5499473921539155, - "flos": 25921938268800.0, - "grad_norm": 1.7336366982972622, - "language_loss": 0.63770372, - "learning_rate": 1.7740117072688113e-06, - "loss": 0.65896338, - "num_input_tokens_seen": 197082660, - "step": 9147, - "time_per_iteration": 2.6603379249572754 - }, - { - "auxiliary_loss_clip": 0.01125139, - "auxiliary_loss_mlp": 0.01033, - "balance_loss_clip": 1.04582083, - "balance_loss_mlp": 1.01920164, - "epoch": 0.5500075154065835, - "flos": 22273701609600.0, - "grad_norm": 2.1607061922348088, - "language_loss": 0.81009579, - "learning_rate": 1.7736247441220458e-06, - "loss": 0.8316772, - "num_input_tokens_seen": 197100675, - "step": 9148, - "time_per_iteration": 2.620183229446411 - }, - { - "auxiliary_loss_clip": 0.01101315, - "auxiliary_loss_mlp": 0.01039357, - "balance_loss_clip": 1.04367983, - "balance_loss_mlp": 1.02550507, - "epoch": 0.5500676386592515, - "flos": 28037507552640.0, - "grad_norm": 1.7340881050910257, - "language_loss": 0.79154336, - "learning_rate": 1.773237789559453e-06, - "loss": 0.81295007, - "num_input_tokens_seen": 197121320, - "step": 9149, - "time_per_iteration": 2.734495162963867 - }, - { - "auxiliary_loss_clip": 0.01082615, - "auxiliary_loss_mlp": 0.0102795, - "balance_loss_clip": 1.0412097, - "balance_loss_mlp": 1.01476002, - "epoch": 0.5501277619119195, - "flos": 23914819123200.0, - "grad_norm": 4.0693062888880185, - "language_loss": 0.72006851, - "learning_rate": 1.7728508435957052e-06, - "loss": 0.74117416, - "num_input_tokens_seen": 197138965, - "step": 9150, - "time_per_iteration": 2.66481876373291 - }, - { - "auxiliary_loss_clip": 0.01099742, - "auxiliary_loss_mlp": 0.01033518, - "balance_loss_clip": 1.03804266, - "balance_loss_mlp": 1.0189085, - "epoch": 0.5501878851645874, - "flos": 20923783655040.0, - "grad_norm": 3.1249847499070014, - "language_loss": 0.75043446, - "learning_rate": 1.772463906245477e-06, - "loss": 0.77176708, - "num_input_tokens_seen": 197156460, - "step": 9151, - "time_per_iteration": 2.704946517944336 - }, - { - "auxiliary_loss_clip": 0.0109205, - "auxiliary_loss_mlp": 0.01033656, - "balance_loss_clip": 1.03899741, - "balance_loss_mlp": 1.01981556, - "epoch": 0.5502480084172554, - "flos": 20665298407680.0, - "grad_norm": 2.3903222148465035, - "language_loss": 0.76302028, - "learning_rate": 1.7720769775234394e-06, - "loss": 0.78427732, - "num_input_tokens_seen": 197175140, - "step": 9152, - "time_per_iteration": 5.871058464050293 - }, - { - "auxiliary_loss_clip": 0.01098821, - "auxiliary_loss_mlp": 0.01033709, - "balance_loss_clip": 1.04291546, - "balance_loss_mlp": 1.02058983, - "epoch": 0.5503081316699233, - "flos": 26432552056320.0, - "grad_norm": 1.865148989318078, - "language_loss": 0.82033801, - "learning_rate": 1.7716900574442662e-06, - "loss": 0.84166336, - "num_input_tokens_seen": 197194345, - "step": 9153, - "time_per_iteration": 2.741382598876953 - }, - { - "auxiliary_loss_clip": 0.01110131, - "auxiliary_loss_mlp": 0.01029037, - "balance_loss_clip": 1.04423809, - "balance_loss_mlp": 1.01572764, - "epoch": 0.5503682549225913, - "flos": 30629144718720.0, - "grad_norm": 1.7497509025726563, - "language_loss": 0.74392802, - "learning_rate": 1.7713031460226294e-06, - "loss": 0.76531971, - "num_input_tokens_seen": 197215535, - "step": 9154, - "time_per_iteration": 4.345115900039673 - }, - { - "auxiliary_loss_clip": 0.01104154, - "auxiliary_loss_mlp": 0.01039546, - "balance_loss_clip": 1.04041803, - "balance_loss_mlp": 1.02451348, - "epoch": 0.5504283781752592, - "flos": 22565439872640.0, - "grad_norm": 1.5994441828682415, - "language_loss": 0.73138744, - "learning_rate": 1.770916243273199e-06, - "loss": 0.75282443, - "num_input_tokens_seen": 197234945, - "step": 9155, - "time_per_iteration": 2.6851611137390137 - }, - { - "auxiliary_loss_clip": 0.01021957, - "auxiliary_loss_mlp": 0.01001594, - "balance_loss_clip": 1.01543474, - "balance_loss_mlp": 1.00016963, - "epoch": 0.5504885014279273, - "flos": 67901009270400.0, - "grad_norm": 0.7575867212346565, - "language_loss": 0.55399221, - "learning_rate": 1.7705293492106483e-06, - "loss": 0.57422775, - "num_input_tokens_seen": 197302285, - "step": 9156, - "time_per_iteration": 3.300373077392578 - }, - { - "auxiliary_loss_clip": 0.0110824, - "auxiliary_loss_mlp": 0.01037205, - "balance_loss_clip": 1.03954601, - "balance_loss_mlp": 1.02354383, - "epoch": 0.5505486246805952, - "flos": 22450058409600.0, - "grad_norm": 1.7338338818713679, - "language_loss": 0.82676858, - "learning_rate": 1.7701424638496475e-06, - "loss": 0.84822297, - "num_input_tokens_seen": 197321575, - "step": 9157, - "time_per_iteration": 4.260001182556152 - }, - { - "auxiliary_loss_clip": 0.01128779, - "auxiliary_loss_mlp": 0.01036261, - "balance_loss_clip": 1.04512608, - "balance_loss_mlp": 1.02101421, - "epoch": 0.5506087479332632, - "flos": 26906896085760.0, - "grad_norm": 2.1665568405651916, - "language_loss": 0.7574966, - "learning_rate": 1.7697555872048677e-06, - "loss": 0.77914703, - "num_input_tokens_seen": 197340255, - "step": 9158, - "time_per_iteration": 2.634035587310791 - }, - { - "auxiliary_loss_clip": 0.01079995, - "auxiliary_loss_mlp": 0.01032346, - "balance_loss_clip": 1.04036868, - "balance_loss_mlp": 1.01919723, - "epoch": 0.5506688711859311, - "flos": 22930256355840.0, - "grad_norm": 1.7765349720842452, - "language_loss": 0.7011236, - "learning_rate": 1.769368719290979e-06, - "loss": 0.72224694, - "num_input_tokens_seen": 197360360, - "step": 9159, - "time_per_iteration": 2.765982151031494 - }, - { - "auxiliary_loss_clip": 0.01074937, - "auxiliary_loss_mlp": 0.00772606, - "balance_loss_clip": 1.03859997, - "balance_loss_mlp": 1.00024915, - "epoch": 0.5507289944385991, - "flos": 29606408772480.0, - "grad_norm": 1.5184177470515237, - "language_loss": 0.6844312, - "learning_rate": 1.7689818601226516e-06, - "loss": 0.70290661, - "num_input_tokens_seen": 197381905, - "step": 9160, - "time_per_iteration": 2.7715611457824707 - }, - { - "auxiliary_loss_clip": 0.01121201, - "auxiliary_loss_mlp": 0.01036642, - "balance_loss_clip": 1.04361653, - "balance_loss_mlp": 1.02297473, - "epoch": 0.5507891176912671, - "flos": 15334431091200.0, - "grad_norm": 2.346039254378587, - "language_loss": 0.71789527, - "learning_rate": 1.7685950097145552e-06, - "loss": 0.7394737, - "num_input_tokens_seen": 197398555, - "step": 9161, - "time_per_iteration": 2.641042470932007 - }, - { - "auxiliary_loss_clip": 0.01112875, - "auxiliary_loss_mlp": 0.01042589, - "balance_loss_clip": 1.04357731, - "balance_loss_mlp": 1.02879643, - "epoch": 0.5508492409439351, - "flos": 26578313447040.0, - "grad_norm": 1.6233913779896265, - "language_loss": 0.69443804, - "learning_rate": 1.768208168081359e-06, - "loss": 0.71599269, - "num_input_tokens_seen": 197419630, - "step": 9162, - "time_per_iteration": 2.693645715713501 - }, - { - "auxiliary_loss_clip": 0.01122811, - "auxiliary_loss_mlp": 0.01038789, - "balance_loss_clip": 1.04462349, - "balance_loss_mlp": 1.02506185, - "epoch": 0.5509093641966031, - "flos": 25443428261760.0, - "grad_norm": 1.863003505887403, - "language_loss": 0.85338551, - "learning_rate": 1.767821335237733e-06, - "loss": 0.87500155, - "num_input_tokens_seen": 197438480, - "step": 9163, - "time_per_iteration": 2.6538877487182617 - }, - { - "auxiliary_loss_clip": 0.01088872, - "auxiliary_loss_mlp": 0.01032282, - "balance_loss_clip": 1.04132617, - "balance_loss_mlp": 1.01908576, - "epoch": 0.550969487449271, - "flos": 18698543170560.0, - "grad_norm": 1.8611061255519936, - "language_loss": 0.80892253, - "learning_rate": 1.7674345111983441e-06, - "loss": 0.83013415, - "num_input_tokens_seen": 197456755, - "step": 9164, - "time_per_iteration": 2.813016891479492 - }, - { - "auxiliary_loss_clip": 0.0110727, - "auxiliary_loss_mlp": 0.01033027, - "balance_loss_clip": 1.04617882, - "balance_loss_mlp": 1.01856649, - "epoch": 0.551029610701939, - "flos": 22708723224960.0, - "grad_norm": 1.8149479270660511, - "language_loss": 0.73350954, - "learning_rate": 1.767047695977863e-06, - "loss": 0.75491256, - "num_input_tokens_seen": 197475530, - "step": 9165, - "time_per_iteration": 2.6487855911254883 - }, - { - "auxiliary_loss_clip": 0.01103747, - "auxiliary_loss_mlp": 0.01041326, - "balance_loss_clip": 1.04083133, - "balance_loss_mlp": 1.02677011, - "epoch": 0.5510897339546069, - "flos": 12420496166400.0, - "grad_norm": 1.9553906281347788, - "language_loss": 0.78998721, - "learning_rate": 1.7666608895909563e-06, - "loss": 0.8114379, - "num_input_tokens_seen": 197490835, - "step": 9166, - "time_per_iteration": 2.578125 - }, - { - "auxiliary_loss_clip": 0.01089384, - "auxiliary_loss_mlp": 0.01032199, - "balance_loss_clip": 1.03881669, - "balance_loss_mlp": 1.01822138, - "epoch": 0.5511498572072749, - "flos": 18770579896320.0, - "grad_norm": 2.156469581369372, - "language_loss": 0.76529676, - "learning_rate": 1.7662740920522913e-06, - "loss": 0.78651255, - "num_input_tokens_seen": 197508770, - "step": 9167, - "time_per_iteration": 2.7045888900756836 - }, - { - "auxiliary_loss_clip": 0.01112145, - "auxiliary_loss_mlp": 0.01032029, - "balance_loss_clip": 1.04281187, - "balance_loss_mlp": 1.01811707, - "epoch": 0.5512099804599428, - "flos": 19573326996480.0, - "grad_norm": 2.0156954118398227, - "language_loss": 0.79765004, - "learning_rate": 1.7658873033765374e-06, - "loss": 0.81909174, - "num_input_tokens_seen": 197527340, - "step": 9168, - "time_per_iteration": 2.669908046722412 - }, - { - "auxiliary_loss_clip": 0.0111534, - "auxiliary_loss_mlp": 0.0104239, - "balance_loss_clip": 1.04542589, - "balance_loss_mlp": 1.02830565, - "epoch": 0.5512701037126109, - "flos": 26245600744320.0, - "grad_norm": 1.6113858397633185, - "language_loss": 0.69293267, - "learning_rate": 1.7655005235783591e-06, - "loss": 0.71450996, - "num_input_tokens_seen": 197547280, - "step": 9169, - "time_per_iteration": 2.70609450340271 - }, - { - "auxiliary_loss_clip": 0.01106964, - "auxiliary_loss_mlp": 0.01029287, - "balance_loss_clip": 1.04113257, - "balance_loss_mlp": 1.01710367, - "epoch": 0.5513302269652788, - "flos": 21945406279680.0, - "grad_norm": 1.9890616519308366, - "language_loss": 0.85510826, - "learning_rate": 1.7651137526724251e-06, - "loss": 0.87647074, - "num_input_tokens_seen": 197565045, - "step": 9170, - "time_per_iteration": 2.670785427093506 - }, - { - "auxiliary_loss_clip": 0.01022762, - "auxiliary_loss_mlp": 0.01003909, - "balance_loss_clip": 1.02287233, - "balance_loss_mlp": 1.00240731, - "epoch": 0.5513903502179468, - "flos": 68235948616320.0, - "grad_norm": 0.7781167580815929, - "language_loss": 0.59840322, - "learning_rate": 1.7647269906734017e-06, - "loss": 0.61866993, - "num_input_tokens_seen": 197625005, - "step": 9171, - "time_per_iteration": 3.2524025440216064 - }, - { - "auxiliary_loss_clip": 0.01085077, - "auxiliary_loss_mlp": 0.01041997, - "balance_loss_clip": 1.03855562, - "balance_loss_mlp": 1.02763844, - "epoch": 0.5514504734706147, - "flos": 18734238311040.0, - "grad_norm": 1.556060427891405, - "language_loss": 0.70670319, - "learning_rate": 1.7643402375959533e-06, - "loss": 0.72797394, - "num_input_tokens_seen": 197645050, - "step": 9172, - "time_per_iteration": 2.708811044692993 - }, - { - "auxiliary_loss_clip": 0.01120195, - "auxiliary_loss_mlp": 0.01038202, - "balance_loss_clip": 1.04229403, - "balance_loss_mlp": 1.02470756, - "epoch": 0.5515105967232827, - "flos": 22270972176000.0, - "grad_norm": 1.7490660409709138, - "language_loss": 0.75727642, - "learning_rate": 1.7639534934547474e-06, - "loss": 0.77886033, - "num_input_tokens_seen": 197663910, - "step": 9173, - "time_per_iteration": 2.6022469997406006 - }, - { - "auxiliary_loss_clip": 0.01083041, - "auxiliary_loss_mlp": 0.01033938, - "balance_loss_clip": 1.04071558, - "balance_loss_mlp": 1.02043712, - "epoch": 0.5515707199759508, - "flos": 22557682535040.0, - "grad_norm": 1.9060639151270278, - "language_loss": 0.75156957, - "learning_rate": 1.7635667582644484e-06, - "loss": 0.77273941, - "num_input_tokens_seen": 197681580, - "step": 9174, - "time_per_iteration": 2.758668899536133 - }, - { - "auxiliary_loss_clip": 0.01102936, - "auxiliary_loss_mlp": 0.01034738, - "balance_loss_clip": 1.0414834, - "balance_loss_mlp": 1.02056456, - "epoch": 0.5516308432286187, - "flos": 28291072636800.0, - "grad_norm": 2.209520073538634, - "language_loss": 0.72830188, - "learning_rate": 1.7631800320397217e-06, - "loss": 0.74967873, - "num_input_tokens_seen": 197702095, - "step": 9175, - "time_per_iteration": 2.6674885749816895 - }, - { - "auxiliary_loss_clip": 0.01112767, - "auxiliary_loss_mlp": 0.010363, - "balance_loss_clip": 1.04439914, - "balance_loss_mlp": 1.02324057, - "epoch": 0.5516909664812867, - "flos": 18764474584320.0, - "grad_norm": 1.7828415192194789, - "language_loss": 0.69321132, - "learning_rate": 1.7627933147952318e-06, - "loss": 0.71470201, - "num_input_tokens_seen": 197720720, - "step": 9176, - "time_per_iteration": 2.721855878829956 - }, - { - "auxiliary_loss_clip": 0.01112205, - "auxiliary_loss_mlp": 0.01032754, - "balance_loss_clip": 1.04404604, - "balance_loss_mlp": 1.02004051, - "epoch": 0.5517510897339546, - "flos": 27740346336000.0, - "grad_norm": 1.6320384621008008, - "language_loss": 0.70890021, - "learning_rate": 1.7624066065456435e-06, - "loss": 0.73034984, - "num_input_tokens_seen": 197741820, - "step": 9177, - "time_per_iteration": 2.6951122283935547 - }, - { - "auxiliary_loss_clip": 0.01111799, - "auxiliary_loss_mlp": 0.01031495, - "balance_loss_clip": 1.0442878, - "balance_loss_mlp": 1.01811981, - "epoch": 0.5518112129866226, - "flos": 18404470523520.0, - "grad_norm": 1.5626252071778102, - "language_loss": 0.80647016, - "learning_rate": 1.7620199073056204e-06, - "loss": 0.82790309, - "num_input_tokens_seen": 197759160, - "step": 9178, - "time_per_iteration": 2.6048829555511475 - }, - { - "auxiliary_loss_clip": 0.01063405, - "auxiliary_loss_mlp": 0.01046955, - "balance_loss_clip": 1.04167509, - "balance_loss_mlp": 1.03129053, - "epoch": 0.5518713362392905, - "flos": 25082670015360.0, - "grad_norm": 2.211793529411812, - "language_loss": 0.7505163, - "learning_rate": 1.761633217089826e-06, - "loss": 0.77161986, - "num_input_tokens_seen": 197779760, - "step": 9179, - "time_per_iteration": 2.808234453201294 - }, - { - "auxiliary_loss_clip": 0.01114825, - "auxiliary_loss_mlp": 0.0104374, - "balance_loss_clip": 1.04556203, - "balance_loss_mlp": 1.02984655, - "epoch": 0.5519314594919585, - "flos": 36538999361280.0, - "grad_norm": 1.9934221112233521, - "language_loss": 0.7009306, - "learning_rate": 1.761246535912924e-06, - "loss": 0.7225163, - "num_input_tokens_seen": 197801545, - "step": 9180, - "time_per_iteration": 2.788222551345825 - }, - { - "auxiliary_loss_clip": 0.01106377, - "auxiliary_loss_mlp": 0.01041353, - "balance_loss_clip": 1.0398531, - "balance_loss_mlp": 1.02672613, - "epoch": 0.5519915827446265, - "flos": 20448613612800.0, - "grad_norm": 1.9005454733047327, - "language_loss": 0.67093515, - "learning_rate": 1.7608598637895776e-06, - "loss": 0.69241244, - "num_input_tokens_seen": 197820760, - "step": 9181, - "time_per_iteration": 2.7013533115386963 - }, - { - "auxiliary_loss_clip": 0.01126813, - "auxiliary_loss_mlp": 0.0103403, - "balance_loss_clip": 1.0449146, - "balance_loss_mlp": 1.02041602, - "epoch": 0.5520517059972945, - "flos": 23768052151680.0, - "grad_norm": 2.0355295280850347, - "language_loss": 0.79382825, - "learning_rate": 1.7604732007344486e-06, - "loss": 0.8154366, - "num_input_tokens_seen": 197840195, - "step": 9182, - "time_per_iteration": 2.6580309867858887 - }, - { - "auxiliary_loss_clip": 0.0108505, - "auxiliary_loss_mlp": 0.01029722, - "balance_loss_clip": 1.0405935, - "balance_loss_mlp": 1.01576233, - "epoch": 0.5521118292499624, - "flos": 22196457411840.0, - "grad_norm": 2.3123904881057524, - "language_loss": 0.83006704, - "learning_rate": 1.7600865467622003e-06, - "loss": 0.85121477, - "num_input_tokens_seen": 197859475, - "step": 9183, - "time_per_iteration": 2.744466543197632 - }, - { - "auxiliary_loss_clip": 0.01100335, - "auxiliary_loss_mlp": 0.01028792, - "balance_loss_clip": 1.0419153, - "balance_loss_mlp": 1.01544046, - "epoch": 0.5521719525026304, - "flos": 23583291569280.0, - "grad_norm": 1.2881660479793424, - "language_loss": 0.67605364, - "learning_rate": 1.7596999018874936e-06, - "loss": 0.6973449, - "num_input_tokens_seen": 197879395, - "step": 9184, - "time_per_iteration": 2.6846580505371094 - }, - { - "auxiliary_loss_clip": 0.01110729, - "auxiliary_loss_mlp": 0.01028759, - "balance_loss_clip": 1.04261684, - "balance_loss_mlp": 1.01442409, - "epoch": 0.5522320757552983, - "flos": 26137617482880.0, - "grad_norm": 1.486667996359971, - "language_loss": 0.76359147, - "learning_rate": 1.7593132661249917e-06, - "loss": 0.78498632, - "num_input_tokens_seen": 197900815, - "step": 9185, - "time_per_iteration": 2.6278598308563232 - }, - { - "auxiliary_loss_clip": 0.01084681, - "auxiliary_loss_mlp": 0.01041899, - "balance_loss_clip": 1.04073203, - "balance_loss_mlp": 1.02742732, - "epoch": 0.5522921990079663, - "flos": 24676160820480.0, - "grad_norm": 1.6270174778631188, - "language_loss": 0.74294305, - "learning_rate": 1.7589266394893536e-06, - "loss": 0.76420891, - "num_input_tokens_seen": 197918985, - "step": 9186, - "time_per_iteration": 2.7178421020507812 - }, - { - "auxiliary_loss_clip": 0.01094897, - "auxiliary_loss_mlp": 0.0103984, - "balance_loss_clip": 1.04445529, - "balance_loss_mlp": 1.02626204, - "epoch": 0.5523523222606344, - "flos": 22748153379840.0, - "grad_norm": 2.1270117067296725, - "language_loss": 0.66701925, - "learning_rate": 1.7585400219952421e-06, - "loss": 0.68836665, - "num_input_tokens_seen": 197937725, - "step": 9187, - "time_per_iteration": 2.7278029918670654 - }, - { - "auxiliary_loss_clip": 0.01101824, - "auxiliary_loss_mlp": 0.01034427, - "balance_loss_clip": 1.04459238, - "balance_loss_mlp": 1.02054477, - "epoch": 0.5524124455133023, - "flos": 19755825022080.0, - "grad_norm": 1.575939713951601, - "language_loss": 0.7774123, - "learning_rate": 1.758153413657318e-06, - "loss": 0.79877484, - "num_input_tokens_seen": 197955635, - "step": 9188, - "time_per_iteration": 2.753506660461426 - }, - { - "auxiliary_loss_clip": 0.01095705, - "auxiliary_loss_mlp": 0.01031864, - "balance_loss_clip": 1.04053175, - "balance_loss_mlp": 1.01806509, - "epoch": 0.5524725687659703, - "flos": 23294821443840.0, - "grad_norm": 1.82344252580878, - "language_loss": 0.81139189, - "learning_rate": 1.7577668144902394e-06, - "loss": 0.83266759, - "num_input_tokens_seen": 197974490, - "step": 9189, - "time_per_iteration": 2.7089128494262695 - }, - { - "auxiliary_loss_clip": 0.01104025, - "auxiliary_loss_mlp": 0.00770543, - "balance_loss_clip": 1.04259682, - "balance_loss_mlp": 1.00024211, - "epoch": 0.5525326920186382, - "flos": 24862178378880.0, - "grad_norm": 1.4850448399521246, - "language_loss": 0.76478475, - "learning_rate": 1.7573802245086684e-06, - "loss": 0.78353041, - "num_input_tokens_seen": 197995735, - "step": 9190, - "time_per_iteration": 2.611971855163574 - }, - { - "auxiliary_loss_clip": 0.01125599, - "auxiliary_loss_mlp": 0.01041501, - "balance_loss_clip": 1.04273391, - "balance_loss_mlp": 1.02648067, - "epoch": 0.5525928152713062, - "flos": 13735580906880.0, - "grad_norm": 2.4141637541410508, - "language_loss": 0.78987861, - "learning_rate": 1.7569936437272627e-06, - "loss": 0.81154966, - "num_input_tokens_seen": 198009685, - "step": 9191, - "time_per_iteration": 2.545794725418091 - }, - { - "auxiliary_loss_clip": 0.01050104, - "auxiliary_loss_mlp": 0.01035439, - "balance_loss_clip": 1.03439641, - "balance_loss_mlp": 1.02133703, - "epoch": 0.5526529385239741, - "flos": 13071592045440.0, - "grad_norm": 2.484462687188894, - "language_loss": 0.68966973, - "learning_rate": 1.7566070721606829e-06, - "loss": 0.71052521, - "num_input_tokens_seen": 198026845, - "step": 9192, - "time_per_iteration": 6.08718204498291 - }, - { - "auxiliary_loss_clip": 0.01110861, - "auxiliary_loss_mlp": 0.01035933, - "balance_loss_clip": 1.04424548, - "balance_loss_mlp": 1.02356553, - "epoch": 0.5527130617766421, - "flos": 23148377694720.0, - "grad_norm": 1.4810056841060688, - "language_loss": 0.77680272, - "learning_rate": 1.756220509823588e-06, - "loss": 0.7982707, - "num_input_tokens_seen": 198045275, - "step": 9193, - "time_per_iteration": 4.1960039138793945 - }, - { - "auxiliary_loss_clip": 0.01083568, - "auxiliary_loss_mlp": 0.01034731, - "balance_loss_clip": 1.03722787, - "balance_loss_mlp": 1.02139795, - "epoch": 0.55277318502931, - "flos": 21285547482240.0, - "grad_norm": 1.4323494490195217, - "language_loss": 0.78473246, - "learning_rate": 1.7558339567306344e-06, - "loss": 0.80591547, - "num_input_tokens_seen": 198065760, - "step": 9194, - "time_per_iteration": 2.730219841003418 - }, - { - "auxiliary_loss_clip": 0.01089289, - "auxiliary_loss_mlp": 0.01036551, - "balance_loss_clip": 1.04286909, - "balance_loss_mlp": 1.02309823, - "epoch": 0.5528333082819781, - "flos": 38324549462400.0, - "grad_norm": 2.5114324389353224, - "language_loss": 0.69563878, - "learning_rate": 1.7554474128964825e-06, - "loss": 0.71689719, - "num_input_tokens_seen": 198087595, - "step": 9195, - "time_per_iteration": 2.898447275161743 - }, - { - "auxiliary_loss_clip": 0.01107137, - "auxiliary_loss_mlp": 0.01036404, - "balance_loss_clip": 1.04293728, - "balance_loss_mlp": 1.02215791, - "epoch": 0.552893431534646, - "flos": 13553621585280.0, - "grad_norm": 1.952206040801574, - "language_loss": 0.74276292, - "learning_rate": 1.7550608783357887e-06, - "loss": 0.76419842, - "num_input_tokens_seen": 198104620, - "step": 9196, - "time_per_iteration": 2.775261878967285 - }, - { - "auxiliary_loss_clip": 0.01105394, - "auxiliary_loss_mlp": 0.01038639, - "balance_loss_clip": 1.04212689, - "balance_loss_mlp": 1.02461457, - "epoch": 0.552953554787314, - "flos": 21939408708480.0, - "grad_norm": 2.1600616911977384, - "language_loss": 0.76948142, - "learning_rate": 1.7546743530632115e-06, - "loss": 0.79092181, - "num_input_tokens_seen": 198123565, - "step": 9197, - "time_per_iteration": 4.16440224647522 - }, - { - "auxiliary_loss_clip": 0.01097995, - "auxiliary_loss_mlp": 0.01032629, - "balance_loss_clip": 1.03984201, - "balance_loss_mlp": 1.01995707, - "epoch": 0.5530136780399819, - "flos": 43658002558080.0, - "grad_norm": 1.6850679441105894, - "language_loss": 0.76054031, - "learning_rate": 1.754287837093407e-06, - "loss": 0.78184652, - "num_input_tokens_seen": 198148270, - "step": 9198, - "time_per_iteration": 2.950439453125 - }, - { - "auxiliary_loss_clip": 0.01119177, - "auxiliary_loss_mlp": 0.01029802, - "balance_loss_clip": 1.04138994, - "balance_loss_mlp": 1.01700497, - "epoch": 0.5530738012926499, - "flos": 25045502417280.0, - "grad_norm": 1.499755291272354, - "language_loss": 0.79495585, - "learning_rate": 1.7539013304410327e-06, - "loss": 0.81644565, - "num_input_tokens_seen": 198168810, - "step": 9199, - "time_per_iteration": 2.619361162185669 - }, - { - "auxiliary_loss_clip": 0.01078304, - "auxiliary_loss_mlp": 0.01039784, - "balance_loss_clip": 1.03867352, - "balance_loss_mlp": 1.02552032, - "epoch": 0.553133924545318, - "flos": 16472081623680.0, - "grad_norm": 1.9832278976810611, - "language_loss": 0.63797927, - "learning_rate": 1.7535148331207443e-06, - "loss": 0.65916014, - "num_input_tokens_seen": 198186200, - "step": 9200, - "time_per_iteration": 2.6335854530334473 - }, - { - "auxiliary_loss_clip": 0.01102034, - "auxiliary_loss_mlp": 0.01033407, - "balance_loss_clip": 1.04273176, - "balance_loss_mlp": 1.01869619, - "epoch": 0.5531940477979859, - "flos": 24606207083520.0, - "grad_norm": 1.4982382349332672, - "language_loss": 0.66065866, - "learning_rate": 1.7531283451471978e-06, - "loss": 0.68201303, - "num_input_tokens_seen": 198207050, - "step": 9201, - "time_per_iteration": 2.7522671222686768 - }, - { - "auxiliary_loss_clip": 0.01108187, - "auxiliary_loss_mlp": 0.01034798, - "balance_loss_clip": 1.04183888, - "balance_loss_mlp": 1.02056432, - "epoch": 0.5532541710506539, - "flos": 22159577122560.0, - "grad_norm": 1.9333851468305103, - "language_loss": 0.61028016, - "learning_rate": 1.7527418665350502e-06, - "loss": 0.63171005, - "num_input_tokens_seen": 198224565, - "step": 9202, - "time_per_iteration": 2.6281580924987793 - }, - { - "auxiliary_loss_clip": 0.0110847, - "auxiliary_loss_mlp": 0.00770781, - "balance_loss_clip": 1.0422498, - "balance_loss_mlp": 1.00029778, - "epoch": 0.5533142943033218, - "flos": 21397265758080.0, - "grad_norm": 1.7184873612817428, - "language_loss": 0.64222115, - "learning_rate": 1.7523553972989548e-06, - "loss": 0.66101366, - "num_input_tokens_seen": 198244790, - "step": 9203, - "time_per_iteration": 2.6509506702423096 - }, - { - "auxiliary_loss_clip": 0.01108951, - "auxiliary_loss_mlp": 0.0103371, - "balance_loss_clip": 1.04175293, - "balance_loss_mlp": 1.02028739, - "epoch": 0.5533744175559898, - "flos": 23550541344000.0, - "grad_norm": 1.4819756399271273, - "language_loss": 0.63615203, - "learning_rate": 1.7519689374535683e-06, - "loss": 0.65757859, - "num_input_tokens_seen": 198264375, - "step": 9204, - "time_per_iteration": 2.7008473873138428 - }, - { - "auxiliary_loss_clip": 0.01106611, - "auxiliary_loss_mlp": 0.01030715, - "balance_loss_clip": 1.04070532, - "balance_loss_mlp": 1.0184958, - "epoch": 0.5534345408086577, - "flos": 24061514267520.0, - "grad_norm": 1.5985992235632864, - "language_loss": 0.77158082, - "learning_rate": 1.7515824870135445e-06, - "loss": 0.79295409, - "num_input_tokens_seen": 198283895, - "step": 9205, - "time_per_iteration": 2.6544225215911865 - }, - { - "auxiliary_loss_clip": 0.01059768, - "auxiliary_loss_mlp": 0.01039078, - "balance_loss_clip": 1.03511405, - "balance_loss_mlp": 1.02576268, - "epoch": 0.5534946640613257, - "flos": 33771831408000.0, - "grad_norm": 1.4391383519913163, - "language_loss": 0.72826385, - "learning_rate": 1.751196045993537e-06, - "loss": 0.74925232, - "num_input_tokens_seen": 198310035, - "step": 9206, - "time_per_iteration": 2.832268476486206 - }, - { - "auxiliary_loss_clip": 0.01073531, - "auxiliary_loss_mlp": 0.01034139, - "balance_loss_clip": 1.03840923, - "balance_loss_mlp": 1.0208354, - "epoch": 0.5535547873139937, - "flos": 15159223526400.0, - "grad_norm": 2.230271879861814, - "language_loss": 0.75639313, - "learning_rate": 1.7508096144082012e-06, - "loss": 0.77746987, - "num_input_tokens_seen": 198327810, - "step": 9207, - "time_per_iteration": 2.7088775634765625 - }, - { - "auxiliary_loss_clip": 0.01088202, - "auxiliary_loss_mlp": 0.01033804, - "balance_loss_clip": 1.0419991, - "balance_loss_mlp": 1.02010703, - "epoch": 0.5536149105666617, - "flos": 16980863817600.0, - "grad_norm": 71.24671792095333, - "language_loss": 0.61898887, - "learning_rate": 1.750423192272189e-06, - "loss": 0.6402089, - "num_input_tokens_seen": 198343150, - "step": 9208, - "time_per_iteration": 2.749739646911621 - }, - { - "auxiliary_loss_clip": 0.01123136, - "auxiliary_loss_mlp": 0.0103585, - "balance_loss_clip": 1.04367232, - "balance_loss_mlp": 1.02285004, - "epoch": 0.5536750338193296, - "flos": 18149935772160.0, - "grad_norm": 2.006267106077657, - "language_loss": 0.64258868, - "learning_rate": 1.7500367796001547e-06, - "loss": 0.66417855, - "num_input_tokens_seen": 198360925, - "step": 9209, - "time_per_iteration": 2.6854724884033203 - }, - { - "auxiliary_loss_clip": 0.01084442, - "auxiliary_loss_mlp": 0.0104196, - "balance_loss_clip": 1.03969955, - "balance_loss_mlp": 1.02729774, - "epoch": 0.5537351570719976, - "flos": 22747794243840.0, - "grad_norm": 1.8841222831412607, - "language_loss": 0.82470959, - "learning_rate": 1.7496503764067513e-06, - "loss": 0.84597361, - "num_input_tokens_seen": 198379265, - "step": 9210, - "time_per_iteration": 2.746532917022705 - }, - { - "auxiliary_loss_clip": 0.01098481, - "auxiliary_loss_mlp": 0.01029278, - "balance_loss_clip": 1.04068804, - "balance_loss_mlp": 1.016523, - "epoch": 0.5537952803246655, - "flos": 26356026130560.0, - "grad_norm": 1.6369703268884894, - "language_loss": 0.72731483, - "learning_rate": 1.74926398270663e-06, - "loss": 0.74859238, - "num_input_tokens_seen": 198399490, - "step": 9211, - "time_per_iteration": 2.767152786254883 - }, - { - "auxiliary_loss_clip": 0.01089972, - "auxiliary_loss_mlp": 0.01037477, - "balance_loss_clip": 1.03941226, - "balance_loss_mlp": 1.02259946, - "epoch": 0.5538554035773335, - "flos": 18037427397120.0, - "grad_norm": 1.965979716525238, - "language_loss": 0.6684767, - "learning_rate": 1.7488775985144437e-06, - "loss": 0.68975115, - "num_input_tokens_seen": 198419110, - "step": 9212, - "time_per_iteration": 2.6946139335632324 - }, - { - "auxiliary_loss_clip": 0.01092654, - "auxiliary_loss_mlp": 0.01030144, - "balance_loss_clip": 1.04305434, - "balance_loss_mlp": 1.01557696, - "epoch": 0.5539155268300014, - "flos": 31686247002240.0, - "grad_norm": 1.403594998374367, - "language_loss": 0.51636183, - "learning_rate": 1.7484912238448443e-06, - "loss": 0.53758979, - "num_input_tokens_seen": 198441360, - "step": 9213, - "time_per_iteration": 2.7821476459503174 - }, - { - "auxiliary_loss_clip": 0.01092111, - "auxiliary_loss_mlp": 0.01030874, - "balance_loss_clip": 1.04350758, - "balance_loss_mlp": 1.01752245, - "epoch": 0.5539756500826695, - "flos": 15193769431680.0, - "grad_norm": 3.6308307245288214, - "language_loss": 0.86044586, - "learning_rate": 1.7481048587124827e-06, - "loss": 0.88167566, - "num_input_tokens_seen": 198459835, - "step": 9214, - "time_per_iteration": 2.7264554500579834 - }, - { - "auxiliary_loss_clip": 0.01110148, - "auxiliary_loss_mlp": 0.01032811, - "balance_loss_clip": 1.04324055, - "balance_loss_mlp": 1.02003813, - "epoch": 0.5540357733353375, - "flos": 26353117128960.0, - "grad_norm": 2.235553679927881, - "language_loss": 0.70002753, - "learning_rate": 1.7477185031320108e-06, - "loss": 0.72145712, - "num_input_tokens_seen": 198478955, - "step": 9215, - "time_per_iteration": 2.684901714324951 - }, - { - "auxiliary_loss_clip": 0.01093255, - "auxiliary_loss_mlp": 0.0103064, - "balance_loss_clip": 1.03972387, - "balance_loss_mlp": 1.01641822, - "epoch": 0.5540958965880054, - "flos": 21323684747520.0, - "grad_norm": 1.5213166138329088, - "language_loss": 0.73443544, - "learning_rate": 1.7473321571180773e-06, - "loss": 0.75567436, - "num_input_tokens_seen": 198499030, - "step": 9216, - "time_per_iteration": 2.6930174827575684 - }, - { - "auxiliary_loss_clip": 0.01095704, - "auxiliary_loss_mlp": 0.0103884, - "balance_loss_clip": 1.04206526, - "balance_loss_mlp": 1.02541757, - "epoch": 0.5541560198406734, - "flos": 25666828899840.0, - "grad_norm": 1.8909182551573178, - "language_loss": 0.71728694, - "learning_rate": 1.7469458206853345e-06, - "loss": 0.73863238, - "num_input_tokens_seen": 198520265, - "step": 9217, - "time_per_iteration": 2.705566644668579 - }, - { - "auxiliary_loss_clip": 0.01102416, - "auxiliary_loss_mlp": 0.01027899, - "balance_loss_clip": 1.04219627, - "balance_loss_mlp": 1.01496446, - "epoch": 0.5542161430933413, - "flos": 21939624190080.0, - "grad_norm": 1.8150794810366015, - "language_loss": 0.78261054, - "learning_rate": 1.7465594938484315e-06, - "loss": 0.80391365, - "num_input_tokens_seen": 198539645, - "step": 9218, - "time_per_iteration": 2.6569690704345703 - }, - { - "auxiliary_loss_clip": 0.01077956, - "auxiliary_loss_mlp": 0.01037085, - "balance_loss_clip": 1.03790164, - "balance_loss_mlp": 1.02161169, - "epoch": 0.5542762663460093, - "flos": 19571459489280.0, - "grad_norm": 1.6224660724744044, - "language_loss": 0.72173905, - "learning_rate": 1.7461731766220176e-06, - "loss": 0.74288952, - "num_input_tokens_seen": 198558710, - "step": 9219, - "time_per_iteration": 2.685511350631714 - }, - { - "auxiliary_loss_clip": 0.01108862, - "auxiliary_loss_mlp": 0.01039965, - "balance_loss_clip": 1.04482341, - "balance_loss_mlp": 1.0262028, - "epoch": 0.5543363895986773, - "flos": 19499063627520.0, - "grad_norm": 1.5105706382424104, - "language_loss": 0.71297967, - "learning_rate": 1.7457868690207426e-06, - "loss": 0.73446798, - "num_input_tokens_seen": 198577050, - "step": 9220, - "time_per_iteration": 2.6306073665618896 - }, - { - "auxiliary_loss_clip": 0.01120811, - "auxiliary_loss_mlp": 0.01026848, - "balance_loss_clip": 1.04381871, - "balance_loss_mlp": 1.01429546, - "epoch": 0.5543965128513453, - "flos": 22635609091200.0, - "grad_norm": 1.6307293256223026, - "language_loss": 0.79449409, - "learning_rate": 1.7454005710592547e-06, - "loss": 0.81597066, - "num_input_tokens_seen": 198595290, - "step": 9221, - "time_per_iteration": 2.664358139038086 - }, - { - "auxiliary_loss_clip": 0.01090389, - "auxiliary_loss_mlp": 0.01034475, - "balance_loss_clip": 1.04653525, - "balance_loss_mlp": 1.02108812, - "epoch": 0.5544566361040132, - "flos": 25989952671360.0, - "grad_norm": 1.9685503329730023, - "language_loss": 0.83722961, - "learning_rate": 1.7450142827522027e-06, - "loss": 0.85847831, - "num_input_tokens_seen": 198614110, - "step": 9222, - "time_per_iteration": 2.770050048828125 - }, - { - "auxiliary_loss_clip": 0.01100221, - "auxiliary_loss_mlp": 0.00771629, - "balance_loss_clip": 1.04789209, - "balance_loss_mlp": 1.00036037, - "epoch": 0.5545167593566812, - "flos": 28257568225920.0, - "grad_norm": 1.9185335813275248, - "language_loss": 0.75431746, - "learning_rate": 1.7446280041142344e-06, - "loss": 0.773036, - "num_input_tokens_seen": 198633880, - "step": 9223, - "time_per_iteration": 2.794182062149048 - }, - { - "auxiliary_loss_clip": 0.01091289, - "auxiliary_loss_mlp": 0.0103417, - "balance_loss_clip": 1.04017019, - "balance_loss_mlp": 1.0201149, - "epoch": 0.5545768826093491, - "flos": 28476551491200.0, - "grad_norm": 1.614917501509061, - "language_loss": 0.82090491, - "learning_rate": 1.7442417351599986e-06, - "loss": 0.84215945, - "num_input_tokens_seen": 198653505, - "step": 9224, - "time_per_iteration": 2.7137935161590576 - }, - { - "auxiliary_loss_clip": 0.01108448, - "auxiliary_loss_mlp": 0.01043106, - "balance_loss_clip": 1.04417324, - "balance_loss_mlp": 1.02924204, - "epoch": 0.5546370058620171, - "flos": 18478051534080.0, - "grad_norm": 1.7607532408743478, - "language_loss": 0.57043874, - "learning_rate": 1.743855475904141e-06, - "loss": 0.59195429, - "num_input_tokens_seen": 198671890, - "step": 9225, - "time_per_iteration": 2.616447687149048 - }, - { - "auxiliary_loss_clip": 0.01112997, - "auxiliary_loss_mlp": 0.01038411, - "balance_loss_clip": 1.04317498, - "balance_loss_mlp": 1.02444005, - "epoch": 0.554697129114685, - "flos": 22930507751040.0, - "grad_norm": 1.6222452828903178, - "language_loss": 0.67458808, - "learning_rate": 1.7434692263613098e-06, - "loss": 0.69610214, - "num_input_tokens_seen": 198691995, - "step": 9226, - "time_per_iteration": 2.663339138031006 - }, - { - "auxiliary_loss_clip": 0.0108551, - "auxiliary_loss_mlp": 0.0103467, - "balance_loss_clip": 1.03901601, - "balance_loss_mlp": 1.02121162, - "epoch": 0.5547572523673531, - "flos": 21797166850560.0, - "grad_norm": 1.6061917148762987, - "language_loss": 0.74387592, - "learning_rate": 1.7430829865461518e-06, - "loss": 0.76507771, - "num_input_tokens_seen": 198712440, - "step": 9227, - "time_per_iteration": 2.762258529663086 - }, - { - "auxiliary_loss_clip": 0.01087938, - "auxiliary_loss_mlp": 0.01034743, - "balance_loss_clip": 1.04223549, - "balance_loss_mlp": 1.02071249, - "epoch": 0.5548173756200211, - "flos": 22342829333760.0, - "grad_norm": 1.8589261758591291, - "language_loss": 0.73263627, - "learning_rate": 1.7426967564733118e-06, - "loss": 0.7538631, - "num_input_tokens_seen": 198731515, - "step": 9228, - "time_per_iteration": 2.762092113494873 - }, - { - "auxiliary_loss_clip": 0.01122414, - "auxiliary_loss_mlp": 0.01031991, - "balance_loss_clip": 1.04351175, - "balance_loss_mlp": 1.01886559, - "epoch": 0.554877498872689, - "flos": 17858736213120.0, - "grad_norm": 1.672332446894358, - "language_loss": 0.75519872, - "learning_rate": 1.7423105361574373e-06, - "loss": 0.77674282, - "num_input_tokens_seen": 198749750, - "step": 9229, - "time_per_iteration": 2.6003267765045166 - }, - { - "auxiliary_loss_clip": 0.01110807, - "auxiliary_loss_mlp": 0.00772253, - "balance_loss_clip": 1.0439682, - "balance_loss_mlp": 1.00026536, - "epoch": 0.554937622125357, - "flos": 17238343484160.0, - "grad_norm": 1.7587828151966396, - "language_loss": 0.68663722, - "learning_rate": 1.741924325613172e-06, - "loss": 0.70546782, - "num_input_tokens_seen": 198768320, - "step": 9230, - "time_per_iteration": 2.6502435207366943 - }, - { - "auxiliary_loss_clip": 0.01078746, - "auxiliary_loss_mlp": 0.01039366, - "balance_loss_clip": 1.04407859, - "balance_loss_mlp": 1.02506709, - "epoch": 0.5549977453780249, - "flos": 25368087484800.0, - "grad_norm": 2.162588573655947, - "language_loss": 0.6800701, - "learning_rate": 1.741538124855163e-06, - "loss": 0.70125121, - "num_input_tokens_seen": 198787230, - "step": 9231, - "time_per_iteration": 4.46450400352478 - }, - { - "auxiliary_loss_clip": 0.01125233, - "auxiliary_loss_mlp": 0.01040313, - "balance_loss_clip": 1.04339528, - "balance_loss_mlp": 1.02537608, - "epoch": 0.555057868630693, - "flos": 25079114568960.0, - "grad_norm": 1.7058695185820383, - "language_loss": 0.78623915, - "learning_rate": 1.7411519338980548e-06, - "loss": 0.80789459, - "num_input_tokens_seen": 198806720, - "step": 9232, - "time_per_iteration": 4.17819356918335 - }, - { - "auxiliary_loss_clip": 0.01077674, - "auxiliary_loss_mlp": 0.01038155, - "balance_loss_clip": 1.03794336, - "balance_loss_mlp": 1.02523899, - "epoch": 0.5551179918833609, - "flos": 26104220812800.0, - "grad_norm": 1.530027860156435, - "language_loss": 0.82512534, - "learning_rate": 1.7407657527564898e-06, - "loss": 0.84628367, - "num_input_tokens_seen": 198826235, - "step": 9233, - "time_per_iteration": 2.7746078968048096 - }, - { - "auxiliary_loss_clip": 0.01108881, - "auxiliary_loss_mlp": 0.01040385, - "balance_loss_clip": 1.04062366, - "balance_loss_mlp": 1.02632475, - "epoch": 0.5551781151360289, - "flos": 19384759572480.0, - "grad_norm": 8.113354085779601, - "language_loss": 0.74638891, - "learning_rate": 1.7403795814451142e-06, - "loss": 0.76788163, - "num_input_tokens_seen": 198842655, - "step": 9234, - "time_per_iteration": 2.6174590587615967 - }, - { - "auxiliary_loss_clip": 0.01094953, - "auxiliary_loss_mlp": 0.01029345, - "balance_loss_clip": 1.03896558, - "balance_loss_mlp": 1.01647031, - "epoch": 0.5552382383886968, - "flos": 21725956137600.0, - "grad_norm": 4.639125305136136, - "language_loss": 0.64988184, - "learning_rate": 1.7399934199785706e-06, - "loss": 0.67112482, - "num_input_tokens_seen": 198861210, - "step": 9235, - "time_per_iteration": 2.6820857524871826 - }, - { - "auxiliary_loss_clip": 0.0106692, - "auxiliary_loss_mlp": 0.01042767, - "balance_loss_clip": 1.03562975, - "balance_loss_mlp": 1.02793705, - "epoch": 0.5552983616413648, - "flos": 14356189117440.0, - "grad_norm": 1.66240052317675, - "language_loss": 0.67842531, - "learning_rate": 1.7396072683715029e-06, - "loss": 0.69952214, - "num_input_tokens_seen": 198880045, - "step": 9236, - "time_per_iteration": 4.265462160110474 - }, - { - "auxiliary_loss_clip": 0.01116825, - "auxiliary_loss_mlp": 0.01028362, - "balance_loss_clip": 1.04261172, - "balance_loss_mlp": 1.01549888, - "epoch": 0.5553584848940327, - "flos": 25478548784640.0, - "grad_norm": 1.8489707966449562, - "language_loss": 0.86189765, - "learning_rate": 1.7392211266385536e-06, - "loss": 0.88334954, - "num_input_tokens_seen": 198900210, - "step": 9237, - "time_per_iteration": 2.662736654281616 - }, - { - "auxiliary_loss_clip": 0.01108193, - "auxiliary_loss_mlp": 0.01037757, - "balance_loss_clip": 1.04178131, - "balance_loss_mlp": 1.02388716, - "epoch": 0.5554186081467007, - "flos": 22163850840960.0, - "grad_norm": 2.008755703666539, - "language_loss": 0.73663169, - "learning_rate": 1.7388349947943652e-06, - "loss": 0.75809121, - "num_input_tokens_seen": 198919055, - "step": 9238, - "time_per_iteration": 2.6842122077941895 - }, - { - "auxiliary_loss_clip": 0.01105716, - "auxiliary_loss_mlp": 0.01031608, - "balance_loss_clip": 1.0387727, - "balance_loss_mlp": 1.01777411, - "epoch": 0.5554787313993687, - "flos": 49746656125440.0, - "grad_norm": 1.8187915692087442, - "language_loss": 0.78551757, - "learning_rate": 1.73844887285358e-06, - "loss": 0.80689085, - "num_input_tokens_seen": 198943505, - "step": 9239, - "time_per_iteration": 2.887911558151245 - }, - { - "auxiliary_loss_clip": 0.01106485, - "auxiliary_loss_mlp": 0.01030728, - "balance_loss_clip": 1.04819751, - "balance_loss_mlp": 1.01699483, - "epoch": 0.5555388546520367, - "flos": 22127365601280.0, - "grad_norm": 1.7617963791060023, - "language_loss": 0.8016845, - "learning_rate": 1.7380627608308393e-06, - "loss": 0.82305664, - "num_input_tokens_seen": 198963590, - "step": 9240, - "time_per_iteration": 2.759277582168579 - }, - { - "auxiliary_loss_clip": 0.0109666, - "auxiliary_loss_mlp": 0.01034491, - "balance_loss_clip": 1.04089236, - "balance_loss_mlp": 1.02099013, - "epoch": 0.5555989779047047, - "flos": 24682122478080.0, - "grad_norm": 2.168471057936508, - "language_loss": 0.65255535, - "learning_rate": 1.737676658740786e-06, - "loss": 0.67386687, - "num_input_tokens_seen": 198982680, - "step": 9241, - "time_per_iteration": 2.7321317195892334 - }, - { - "auxiliary_loss_clip": 0.01110689, - "auxiliary_loss_mlp": 0.0077113, - "balance_loss_clip": 1.04320502, - "balance_loss_mlp": 1.00029731, - "epoch": 0.5556591011573726, - "flos": 16106510954880.0, - "grad_norm": 1.885035131778914, - "language_loss": 0.72406638, - "learning_rate": 1.7372905665980594e-06, - "loss": 0.74288458, - "num_input_tokens_seen": 199000185, - "step": 9242, - "time_per_iteration": 2.6891591548919678 - }, - { - "auxiliary_loss_clip": 0.01106836, - "auxiliary_loss_mlp": 0.01034566, - "balance_loss_clip": 1.04584861, - "balance_loss_mlp": 1.02024293, - "epoch": 0.5557192244100406, - "flos": 12933695733120.0, - "grad_norm": 1.6675932055368092, - "language_loss": 0.64065903, - "learning_rate": 1.7369044844173012e-06, - "loss": 0.66207308, - "num_input_tokens_seen": 199018380, - "step": 9243, - "time_per_iteration": 3.1710290908813477 - }, - { - "auxiliary_loss_clip": 0.01094198, - "auxiliary_loss_mlp": 0.00771105, - "balance_loss_clip": 1.04436445, - "balance_loss_mlp": 1.00027966, - "epoch": 0.5557793476627085, - "flos": 23111712887040.0, - "grad_norm": 2.6865994829235333, - "language_loss": 0.75548631, - "learning_rate": 1.7365184122131509e-06, - "loss": 0.77413929, - "num_input_tokens_seen": 199037115, - "step": 9244, - "time_per_iteration": 2.686121940612793 - }, - { - "auxiliary_loss_clip": 0.01091692, - "auxiliary_loss_mlp": 0.01036173, - "balance_loss_clip": 1.03900838, - "balance_loss_mlp": 1.02352512, - "epoch": 0.5558394709153766, - "flos": 21428040735360.0, - "grad_norm": 2.0505810415857506, - "language_loss": 0.75051856, - "learning_rate": 1.7361323500002486e-06, - "loss": 0.77179724, - "num_input_tokens_seen": 199053375, - "step": 9245, - "time_per_iteration": 2.6561057567596436 - }, - { - "auxiliary_loss_clip": 0.01099057, - "auxiliary_loss_mlp": 0.01034743, - "balance_loss_clip": 1.04262114, - "balance_loss_mlp": 1.02087283, - "epoch": 0.5558995941680445, - "flos": 25078324469760.0, - "grad_norm": 2.0581034442408055, - "language_loss": 0.79967058, - "learning_rate": 1.7357462977932348e-06, - "loss": 0.82100856, - "num_input_tokens_seen": 199070930, - "step": 9246, - "time_per_iteration": 2.6968653202056885 - }, - { - "auxiliary_loss_clip": 0.01120892, - "auxiliary_loss_mlp": 0.01037931, - "balance_loss_clip": 1.0435034, - "balance_loss_mlp": 1.0241977, - "epoch": 0.5559597174207125, - "flos": 20011149872640.0, - "grad_norm": 1.8340386723611697, - "language_loss": 0.73825908, - "learning_rate": 1.7353602556067471e-06, - "loss": 0.75984728, - "num_input_tokens_seen": 199088675, - "step": 9247, - "time_per_iteration": 2.5861082077026367 - }, - { - "auxiliary_loss_clip": 0.01091731, - "auxiliary_loss_mlp": 0.01035279, - "balance_loss_clip": 1.04089963, - "balance_loss_mlp": 1.0214448, - "epoch": 0.5560198406733804, - "flos": 16835677044480.0, - "grad_norm": 2.6765383510534324, - "language_loss": 0.74975288, - "learning_rate": 1.7349742234554254e-06, - "loss": 0.77102304, - "num_input_tokens_seen": 199103075, - "step": 9248, - "time_per_iteration": 2.634092092514038 - }, - { - "auxiliary_loss_clip": 0.00999886, - "auxiliary_loss_mlp": 0.01011469, - "balance_loss_clip": 1.01177704, - "balance_loss_mlp": 1.00989556, - "epoch": 0.5560799639260484, - "flos": 70697051758080.0, - "grad_norm": 0.8462101410465201, - "language_loss": 0.59490269, - "learning_rate": 1.7345882013539081e-06, - "loss": 0.61501622, - "num_input_tokens_seen": 199160325, - "step": 9249, - "time_per_iteration": 3.389267683029175 - }, - { - "auxiliary_loss_clip": 0.01118078, - "auxiliary_loss_mlp": 0.01029785, - "balance_loss_clip": 1.04007614, - "balance_loss_mlp": 1.01592088, - "epoch": 0.5561400871787163, - "flos": 23148593176320.0, - "grad_norm": 2.8767161081984427, - "language_loss": 0.79950154, - "learning_rate": 1.734202189316832e-06, - "loss": 0.82098025, - "num_input_tokens_seen": 199179760, - "step": 9250, - "time_per_iteration": 2.578690528869629 - }, - { - "auxiliary_loss_clip": 0.01098469, - "auxiliary_loss_mlp": 0.01034798, - "balance_loss_clip": 1.04169929, - "balance_loss_mlp": 1.02075529, - "epoch": 0.5562002104313843, - "flos": 17566423332480.0, - "grad_norm": 3.104352444179477, - "language_loss": 0.68685251, - "learning_rate": 1.733816187358836e-06, - "loss": 0.7081852, - "num_input_tokens_seen": 199196695, - "step": 9251, - "time_per_iteration": 2.7810349464416504 - }, - { - "auxiliary_loss_clip": 0.01109089, - "auxiliary_loss_mlp": 0.01033405, - "balance_loss_clip": 1.04200792, - "balance_loss_mlp": 1.02018476, - "epoch": 0.5562603336840523, - "flos": 25045430590080.0, - "grad_norm": 1.5038625186154766, - "language_loss": 0.75750792, - "learning_rate": 1.7334301954945569e-06, - "loss": 0.77893281, - "num_input_tokens_seen": 199217845, - "step": 9252, - "time_per_iteration": 2.663238286972046 - }, - { - "auxiliary_loss_clip": 0.01107916, - "auxiliary_loss_mlp": 0.01039535, - "balance_loss_clip": 1.04108679, - "balance_loss_mlp": 1.02441943, - "epoch": 0.5563204569367203, - "flos": 29059022436480.0, - "grad_norm": 1.5228616100256118, - "language_loss": 0.72854966, - "learning_rate": 1.7330442137386313e-06, - "loss": 0.7500242, - "num_input_tokens_seen": 199239250, - "step": 9253, - "time_per_iteration": 2.6020450592041016 - }, - { - "auxiliary_loss_clip": 0.01093689, - "auxiliary_loss_mlp": 0.01032948, - "balance_loss_clip": 1.04451489, - "balance_loss_mlp": 1.02043748, - "epoch": 0.5563805801893883, - "flos": 22090449398400.0, - "grad_norm": 1.6703038143704756, - "language_loss": 0.83143723, - "learning_rate": 1.7326582421056965e-06, - "loss": 0.85270357, - "num_input_tokens_seen": 199258320, - "step": 9254, - "time_per_iteration": 2.701199531555176 - }, - { - "auxiliary_loss_clip": 0.01012318, - "auxiliary_loss_mlp": 0.01004464, - "balance_loss_clip": 1.01460981, - "balance_loss_mlp": 1.0030154, - "epoch": 0.5564407034420562, - "flos": 58636128689280.0, - "grad_norm": 0.8693463823650434, - "language_loss": 0.64875168, - "learning_rate": 1.732272280610387e-06, - "loss": 0.6689195, - "num_input_tokens_seen": 199314840, - "step": 9255, - "time_per_iteration": 3.1222445964813232 - }, - { - "auxiliary_loss_clip": 0.01111592, - "auxiliary_loss_mlp": 0.01033344, - "balance_loss_clip": 1.04527521, - "balance_loss_mlp": 1.02035666, - "epoch": 0.5565008266947242, - "flos": 23112323418240.0, - "grad_norm": 2.147539486852423, - "language_loss": 0.69487607, - "learning_rate": 1.7318863292673399e-06, - "loss": 0.7163254, - "num_input_tokens_seen": 199335405, - "step": 9256, - "time_per_iteration": 2.642542600631714 - }, - { - "auxiliary_loss_clip": 0.01085774, - "auxiliary_loss_mlp": 0.01031728, - "balance_loss_clip": 1.04269767, - "balance_loss_mlp": 1.01939559, - "epoch": 0.5565609499473921, - "flos": 21578399066880.0, - "grad_norm": 1.6171582584602333, - "language_loss": 0.75981283, - "learning_rate": 1.73150038809119e-06, - "loss": 0.78098786, - "num_input_tokens_seen": 199354345, - "step": 9257, - "time_per_iteration": 2.712520122528076 - }, - { - "auxiliary_loss_clip": 0.01074562, - "auxiliary_loss_mlp": 0.01036038, - "balance_loss_clip": 1.04019046, - "balance_loss_mlp": 1.0233897, - "epoch": 0.5566210732000602, - "flos": 18369637309440.0, - "grad_norm": 3.6499733263034746, - "language_loss": 0.60697454, - "learning_rate": 1.7311144570965724e-06, - "loss": 0.62808049, - "num_input_tokens_seen": 199372250, - "step": 9258, - "time_per_iteration": 2.751559257507324 - }, - { - "auxiliary_loss_clip": 0.01084702, - "auxiliary_loss_mlp": 0.01035032, - "balance_loss_clip": 1.03922486, - "balance_loss_mlp": 1.02042937, - "epoch": 0.5566811964527281, - "flos": 25703350053120.0, - "grad_norm": 1.5966024354647115, - "language_loss": 0.79111505, - "learning_rate": 1.7307285362981215e-06, - "loss": 0.81231236, - "num_input_tokens_seen": 199392815, - "step": 9259, - "time_per_iteration": 2.7664895057678223 - }, - { - "auxiliary_loss_clip": 0.01088989, - "auxiliary_loss_mlp": 0.0103733, - "balance_loss_clip": 1.04242945, - "balance_loss_mlp": 1.02328086, - "epoch": 0.5567413197053961, - "flos": 26943991856640.0, - "grad_norm": 1.7833081696281723, - "language_loss": 0.81253225, - "learning_rate": 1.7303426257104712e-06, - "loss": 0.83379543, - "num_input_tokens_seen": 199412375, - "step": 9260, - "time_per_iteration": 2.79059100151062 - }, - { - "auxiliary_loss_clip": 0.01120889, - "auxiliary_loss_mlp": 0.01039805, - "balance_loss_clip": 1.04265976, - "balance_loss_mlp": 1.02585721, - "epoch": 0.556801442958064, - "flos": 20850597694080.0, - "grad_norm": 1.513133023380305, - "language_loss": 0.69277883, - "learning_rate": 1.729956725348256e-06, - "loss": 0.71438575, - "num_input_tokens_seen": 199431490, - "step": 9261, - "time_per_iteration": 2.5942957401275635 - }, - { - "auxiliary_loss_clip": 0.01009344, - "auxiliary_loss_mlp": 0.01005985, - "balance_loss_clip": 1.01376081, - "balance_loss_mlp": 1.00455499, - "epoch": 0.556861566210732, - "flos": 70498213044480.0, - "grad_norm": 0.7654306967564637, - "language_loss": 0.61116695, - "learning_rate": 1.729570835226108e-06, - "loss": 0.63132024, - "num_input_tokens_seen": 199495855, - "step": 9262, - "time_per_iteration": 3.2477405071258545 - }, - { - "auxiliary_loss_clip": 0.01109024, - "auxiliary_loss_mlp": 0.0103923, - "balance_loss_clip": 1.03991163, - "balance_loss_mlp": 1.02594411, - "epoch": 0.5569216894633999, - "flos": 25337276593920.0, - "grad_norm": 1.6344264149627976, - "language_loss": 0.64423072, - "learning_rate": 1.7291849553586622e-06, - "loss": 0.66571325, - "num_input_tokens_seen": 199515870, - "step": 9263, - "time_per_iteration": 2.658576488494873 - }, - { - "auxiliary_loss_clip": 0.01095378, - "auxiliary_loss_mlp": 0.010346, - "balance_loss_clip": 1.03873014, - "balance_loss_mlp": 1.02134418, - "epoch": 0.556981812716068, - "flos": 22638733574400.0, - "grad_norm": 1.867976542015905, - "language_loss": 0.73368537, - "learning_rate": 1.7287990857605497e-06, - "loss": 0.75498509, - "num_input_tokens_seen": 199535745, - "step": 9264, - "time_per_iteration": 2.7003254890441895 - }, - { - "auxiliary_loss_clip": 0.01095238, - "auxiliary_loss_mlp": 0.01029532, - "balance_loss_clip": 1.04636014, - "balance_loss_mlp": 1.01672268, - "epoch": 0.5570419359687359, - "flos": 11035852738560.0, - "grad_norm": 2.2771016341265526, - "language_loss": 0.76178783, - "learning_rate": 1.7284132264464022e-06, - "loss": 0.78303552, - "num_input_tokens_seen": 199554035, - "step": 9265, - "time_per_iteration": 2.7386014461517334 - }, - { - "auxiliary_loss_clip": 0.01090389, - "auxiliary_loss_mlp": 0.01034586, - "balance_loss_clip": 1.04179025, - "balance_loss_mlp": 1.02249825, - "epoch": 0.5571020592214039, - "flos": 22823135020800.0, - "grad_norm": 1.339030652191656, - "language_loss": 0.70789158, - "learning_rate": 1.7280273774308536e-06, - "loss": 0.72914135, - "num_input_tokens_seen": 199576120, - "step": 9266, - "time_per_iteration": 2.741800546646118 - }, - { - "auxiliary_loss_clip": 0.01094155, - "auxiliary_loss_mlp": 0.0103873, - "balance_loss_clip": 1.03911209, - "balance_loss_mlp": 1.0255034, - "epoch": 0.5571621824740719, - "flos": 22927778317440.0, - "grad_norm": 2.0031056980063506, - "language_loss": 0.68157613, - "learning_rate": 1.727641538728533e-06, - "loss": 0.70290494, - "num_input_tokens_seen": 199593780, - "step": 9267, - "time_per_iteration": 2.7874062061309814 - }, - { - "auxiliary_loss_clip": 0.01104037, - "auxiliary_loss_mlp": 0.01038856, - "balance_loss_clip": 1.03991306, - "balance_loss_mlp": 1.02653575, - "epoch": 0.5572223057267398, - "flos": 22966705681920.0, - "grad_norm": 1.918660534651482, - "language_loss": 0.74570519, - "learning_rate": 1.7272557103540736e-06, - "loss": 0.76713407, - "num_input_tokens_seen": 199613220, - "step": 9268, - "time_per_iteration": 2.7008538246154785 - }, - { - "auxiliary_loss_clip": 0.01103292, - "auxiliary_loss_mlp": 0.00770403, - "balance_loss_clip": 1.04299617, - "balance_loss_mlp": 1.00017905, - "epoch": 0.5572824289794078, - "flos": 20960053413120.0, - "grad_norm": 1.8745085493520866, - "language_loss": 0.75087655, - "learning_rate": 1.726869892322104e-06, - "loss": 0.76961344, - "num_input_tokens_seen": 199632085, - "step": 9269, - "time_per_iteration": 2.653756856918335 - }, - { - "auxiliary_loss_clip": 0.01081519, - "auxiliary_loss_mlp": 0.01046232, - "balance_loss_clip": 1.03722787, - "balance_loss_mlp": 1.03201032, - "epoch": 0.5573425522320757, - "flos": 25042413847680.0, - "grad_norm": 1.688879717720704, - "language_loss": 0.82588089, - "learning_rate": 1.726484084647256e-06, - "loss": 0.84715831, - "num_input_tokens_seen": 199649295, - "step": 9270, - "time_per_iteration": 4.278396844863892 - }, - { - "auxiliary_loss_clip": 0.01079257, - "auxiliary_loss_mlp": 0.01039234, - "balance_loss_clip": 1.04120445, - "balance_loss_mlp": 1.02594197, - "epoch": 0.5574026754847438, - "flos": 23659637927040.0, - "grad_norm": 2.0078243728297167, - "language_loss": 0.79825968, - "learning_rate": 1.7260982873441591e-06, - "loss": 0.81944454, - "num_input_tokens_seen": 199668870, - "step": 9271, - "time_per_iteration": 6.1330788135528564 - }, - { - "auxiliary_loss_clip": 0.01099668, - "auxiliary_loss_mlp": 0.01031508, - "balance_loss_clip": 1.04303491, - "balance_loss_mlp": 1.01848447, - "epoch": 0.5574627987374117, - "flos": 24782240661120.0, - "grad_norm": 2.2903855544483394, - "language_loss": 0.90515852, - "learning_rate": 1.725712500427442e-06, - "loss": 0.92647034, - "num_input_tokens_seen": 199684870, - "step": 9272, - "time_per_iteration": 2.6802456378936768 - }, - { - "auxiliary_loss_clip": 0.01086004, - "auxiliary_loss_mlp": 0.01032973, - "balance_loss_clip": 1.04199028, - "balance_loss_mlp": 1.02049148, - "epoch": 0.5575229219900797, - "flos": 21834944979840.0, - "grad_norm": 2.009692341926254, - "language_loss": 0.83817393, - "learning_rate": 1.7253267239117347e-06, - "loss": 0.85936373, - "num_input_tokens_seen": 199701975, - "step": 9273, - "time_per_iteration": 2.714702606201172 - }, - { - "auxiliary_loss_clip": 0.01111871, - "auxiliary_loss_mlp": 0.01043586, - "balance_loss_clip": 1.0435437, - "balance_loss_mlp": 1.0286727, - "epoch": 0.5575830452427476, - "flos": 27815148408960.0, - "grad_norm": 2.029727061879287, - "language_loss": 0.74000418, - "learning_rate": 1.7249409578116655e-06, - "loss": 0.76155877, - "num_input_tokens_seen": 199721865, - "step": 9274, - "time_per_iteration": 2.6897573471069336 - }, - { - "auxiliary_loss_clip": 0.01102598, - "auxiliary_loss_mlp": 0.01036471, - "balance_loss_clip": 1.04597545, - "balance_loss_mlp": 1.02202296, - "epoch": 0.5576431684954156, - "flos": 17812805696640.0, - "grad_norm": 2.7929550344218885, - "language_loss": 0.7749905, - "learning_rate": 1.7245552021418629e-06, - "loss": 0.79638124, - "num_input_tokens_seen": 199736455, - "step": 9275, - "time_per_iteration": 2.6423583030700684 - }, - { - "auxiliary_loss_clip": 0.01093646, - "auxiliary_loss_mlp": 0.01035097, - "balance_loss_clip": 1.04310751, - "balance_loss_mlp": 1.02178109, - "epoch": 0.5577032917480835, - "flos": 15486872411520.0, - "grad_norm": 1.5365384810156146, - "language_loss": 0.75059974, - "learning_rate": 1.7241694569169546e-06, - "loss": 0.77188718, - "num_input_tokens_seen": 199753125, - "step": 9276, - "time_per_iteration": 4.227986812591553 - }, - { - "auxiliary_loss_clip": 0.01098066, - "auxiliary_loss_mlp": 0.01035646, - "balance_loss_clip": 1.04026711, - "balance_loss_mlp": 1.02219296, - "epoch": 0.5577634150007516, - "flos": 21579763783680.0, - "grad_norm": 1.8156811956405543, - "language_loss": 0.75730252, - "learning_rate": 1.7237837221515678e-06, - "loss": 0.77863955, - "num_input_tokens_seen": 199771365, - "step": 9277, - "time_per_iteration": 2.651348114013672 - }, - { - "auxiliary_loss_clip": 0.01117192, - "auxiliary_loss_mlp": 0.01033742, - "balance_loss_clip": 1.04269838, - "balance_loss_mlp": 1.02087963, - "epoch": 0.5578235382534195, - "flos": 21139750177920.0, - "grad_norm": 1.871466977383403, - "language_loss": 0.71828836, - "learning_rate": 1.7233979978603304e-06, - "loss": 0.73979771, - "num_input_tokens_seen": 199790035, - "step": 9278, - "time_per_iteration": 2.657386302947998 - }, - { - "auxiliary_loss_clip": 0.0108587, - "auxiliary_loss_mlp": 0.01037056, - "balance_loss_clip": 1.04430723, - "balance_loss_mlp": 1.02232218, - "epoch": 0.5578836615060875, - "flos": 26505199313280.0, - "grad_norm": 1.586228481919935, - "language_loss": 0.75729156, - "learning_rate": 1.723012284057868e-06, - "loss": 0.77852082, - "num_input_tokens_seen": 199811125, - "step": 9279, - "time_per_iteration": 2.751840353012085 - }, - { - "auxiliary_loss_clip": 0.01093934, - "auxiliary_loss_mlp": 0.01037128, - "balance_loss_clip": 1.03794658, - "balance_loss_mlp": 1.02376509, - "epoch": 0.5579437847587555, - "flos": 20153786780160.0, - "grad_norm": 1.6097529730476008, - "language_loss": 0.67559254, - "learning_rate": 1.7226265807588082e-06, - "loss": 0.69690311, - "num_input_tokens_seen": 199829915, - "step": 9280, - "time_per_iteration": 2.6563684940338135 - }, - { - "auxiliary_loss_clip": 0.01106752, - "auxiliary_loss_mlp": 0.01041709, - "balance_loss_clip": 1.0392946, - "balance_loss_mlp": 1.02810693, - "epoch": 0.5580039080114234, - "flos": 26102281478400.0, - "grad_norm": 1.6056594505621422, - "language_loss": 0.73215401, - "learning_rate": 1.7222408879777763e-06, - "loss": 0.75363857, - "num_input_tokens_seen": 199850670, - "step": 9281, - "time_per_iteration": 2.6871986389160156 - }, - { - "auxiliary_loss_clip": 0.01086628, - "auxiliary_loss_mlp": 0.00770991, - "balance_loss_clip": 1.04039741, - "balance_loss_mlp": 1.0002861, - "epoch": 0.5580640312640914, - "flos": 13771671096960.0, - "grad_norm": 3.0582981113882317, - "language_loss": 0.75378543, - "learning_rate": 1.7218552057293974e-06, - "loss": 0.77236158, - "num_input_tokens_seen": 199867645, - "step": 9282, - "time_per_iteration": 2.680744171142578 - }, - { - "auxiliary_loss_clip": 0.01055422, - "auxiliary_loss_mlp": 0.01036854, - "balance_loss_clip": 1.03532624, - "balance_loss_mlp": 1.02328229, - "epoch": 0.5581241545167593, - "flos": 17675986792320.0, - "grad_norm": 2.212590462669887, - "language_loss": 0.6592958, - "learning_rate": 1.721469534028297e-06, - "loss": 0.68021852, - "num_input_tokens_seen": 199886320, - "step": 9283, - "time_per_iteration": 2.7523255348205566 - }, - { - "auxiliary_loss_clip": 0.01087506, - "auxiliary_loss_mlp": 0.01030166, - "balance_loss_clip": 1.04440904, - "balance_loss_mlp": 1.01841235, - "epoch": 0.5581842777694274, - "flos": 19569161018880.0, - "grad_norm": 1.7248818916670352, - "language_loss": 0.82969356, - "learning_rate": 1.7210838728890994e-06, - "loss": 0.85087025, - "num_input_tokens_seen": 199904895, - "step": 9284, - "time_per_iteration": 2.6912968158721924 - }, - { - "auxiliary_loss_clip": 0.01097795, - "auxiliary_loss_mlp": 0.0103561, - "balance_loss_clip": 1.04244661, - "balance_loss_mlp": 1.02261066, - "epoch": 0.5582444010220953, - "flos": 20595165102720.0, - "grad_norm": 2.3068151709488736, - "language_loss": 0.85949606, - "learning_rate": 1.7206982223264304e-06, - "loss": 0.88083011, - "num_input_tokens_seen": 199921090, - "step": 9285, - "time_per_iteration": 2.6835310459136963 - }, - { - "auxiliary_loss_clip": 0.01095995, - "auxiliary_loss_mlp": 0.01037997, - "balance_loss_clip": 1.0437417, - "balance_loss_mlp": 1.02543855, - "epoch": 0.5583045242747633, - "flos": 19135504120320.0, - "grad_norm": 2.6758058324476024, - "language_loss": 0.73497176, - "learning_rate": 1.720312582354912e-06, - "loss": 0.75631171, - "num_input_tokens_seen": 199939925, - "step": 9286, - "time_per_iteration": 2.7510128021240234 - }, - { - "auxiliary_loss_clip": 0.01119969, - "auxiliary_loss_mlp": 0.01032279, - "balance_loss_clip": 1.04193521, - "balance_loss_mlp": 1.01924896, - "epoch": 0.5583646475274312, - "flos": 27454569730560.0, - "grad_norm": 2.5542622351497104, - "language_loss": 0.7366401, - "learning_rate": 1.7199269529891684e-06, - "loss": 0.7581625, - "num_input_tokens_seen": 199960015, - "step": 9287, - "time_per_iteration": 2.7764368057250977 - }, - { - "auxiliary_loss_clip": 0.01087822, - "auxiliary_loss_mlp": 0.01038543, - "balance_loss_clip": 1.04215682, - "balance_loss_mlp": 1.0240171, - "epoch": 0.5584247707800992, - "flos": 23653784010240.0, - "grad_norm": 1.5995445525462566, - "language_loss": 0.75250727, - "learning_rate": 1.7195413342438233e-06, - "loss": 0.77377093, - "num_input_tokens_seen": 199980505, - "step": 9288, - "time_per_iteration": 2.711667060852051 - }, - { - "auxiliary_loss_clip": 0.01101347, - "auxiliary_loss_mlp": 0.01045442, - "balance_loss_clip": 1.04461765, - "balance_loss_mlp": 1.03062999, - "epoch": 0.5584848940327671, - "flos": 13698880185600.0, - "grad_norm": 2.3847574468541075, - "language_loss": 0.77486145, - "learning_rate": 1.7191557261334984e-06, - "loss": 0.79632932, - "num_input_tokens_seen": 199999020, - "step": 9289, - "time_per_iteration": 2.726365566253662 - }, - { - "auxiliary_loss_clip": 0.01092544, - "auxiliary_loss_mlp": 0.01034807, - "balance_loss_clip": 1.04270971, - "balance_loss_mlp": 1.02084172, - "epoch": 0.5585450172854352, - "flos": 27016208150400.0, - "grad_norm": 1.8546991944448898, - "language_loss": 0.61392409, - "learning_rate": 1.718770128672817e-06, - "loss": 0.63519758, - "num_input_tokens_seen": 200019020, - "step": 9290, - "time_per_iteration": 2.7546441555023193 - }, - { - "auxiliary_loss_clip": 0.01071377, - "auxiliary_loss_mlp": 0.01032544, - "balance_loss_clip": 1.03871763, - "balance_loss_mlp": 1.01945531, - "epoch": 0.5586051405381031, - "flos": 23185653033600.0, - "grad_norm": 2.64639974160875, - "language_loss": 0.68249333, - "learning_rate": 1.7183845418764e-06, - "loss": 0.70353258, - "num_input_tokens_seen": 200038110, - "step": 9291, - "time_per_iteration": 3.030916452407837 - }, - { - "auxiliary_loss_clip": 0.01091279, - "auxiliary_loss_mlp": 0.01045913, - "balance_loss_clip": 1.04114079, - "balance_loss_mlp": 1.03218007, - "epoch": 0.5586652637907711, - "flos": 20775544225920.0, - "grad_norm": 1.7635760067758424, - "language_loss": 0.84269536, - "learning_rate": 1.7179989657588698e-06, - "loss": 0.86406732, - "num_input_tokens_seen": 200056210, - "step": 9292, - "time_per_iteration": 2.6990363597869873 - }, - { - "auxiliary_loss_clip": 0.01090195, - "auxiliary_loss_mlp": 0.01046206, - "balance_loss_clip": 1.03904271, - "balance_loss_mlp": 1.03265166, - "epoch": 0.5587253870434391, - "flos": 28219897837440.0, - "grad_norm": 2.3637237833932687, - "language_loss": 0.73976684, - "learning_rate": 1.7176134003348476e-06, - "loss": 0.76113087, - "num_input_tokens_seen": 200075620, - "step": 9293, - "time_per_iteration": 2.7066195011138916 - }, - { - "auxiliary_loss_clip": 0.0108672, - "auxiliary_loss_mlp": 0.01044291, - "balance_loss_clip": 1.04188502, - "balance_loss_mlp": 1.03185785, - "epoch": 0.558785510296107, - "flos": 26615732440320.0, - "grad_norm": 1.7291294273759894, - "language_loss": 0.72083485, - "learning_rate": 1.7172278456189523e-06, - "loss": 0.74214494, - "num_input_tokens_seen": 200095945, - "step": 9294, - "time_per_iteration": 2.7188310623168945 - }, - { - "auxiliary_loss_clip": 0.01098814, - "auxiliary_loss_mlp": 0.00770939, - "balance_loss_clip": 1.04345989, - "balance_loss_mlp": 1.0002197, - "epoch": 0.558845633548775, - "flos": 20156767608960.0, - "grad_norm": 2.0034844848738995, - "language_loss": 0.68573147, - "learning_rate": 1.716842301625806e-06, - "loss": 0.70442897, - "num_input_tokens_seen": 200114185, - "step": 9295, - "time_per_iteration": 2.645157814025879 - }, - { - "auxiliary_loss_clip": 0.01120796, - "auxiliary_loss_mlp": 0.01037699, - "balance_loss_clip": 1.04437232, - "balance_loss_mlp": 1.02404976, - "epoch": 0.5589057568014429, - "flos": 24350774492160.0, - "grad_norm": 1.451861251832641, - "language_loss": 0.81153715, - "learning_rate": 1.7164567683700281e-06, - "loss": 0.83312207, - "num_input_tokens_seen": 200135030, - "step": 9296, - "time_per_iteration": 2.638831853866577 - }, - { - "auxiliary_loss_clip": 0.01109007, - "auxiliary_loss_mlp": 0.01036287, - "balance_loss_clip": 1.0433023, - "balance_loss_mlp": 1.02302504, - "epoch": 0.558965880054111, - "flos": 21105168359040.0, - "grad_norm": 2.39482931377815, - "language_loss": 0.65407717, - "learning_rate": 1.7160712458662379e-06, - "loss": 0.67553014, - "num_input_tokens_seen": 200154290, - "step": 9297, - "time_per_iteration": 2.6714565753936768 - }, - { - "auxiliary_loss_clip": 0.01088452, - "auxiliary_loss_mlp": 0.01039165, - "balance_loss_clip": 1.04224098, - "balance_loss_mlp": 1.024997, - "epoch": 0.5590260033067789, - "flos": 18436071513600.0, - "grad_norm": 1.768502931317098, - "language_loss": 0.75242859, - "learning_rate": 1.7156857341290544e-06, - "loss": 0.77370477, - "num_input_tokens_seen": 200171555, - "step": 9298, - "time_per_iteration": 2.7061312198638916 - }, - { - "auxiliary_loss_clip": 0.01019627, - "auxiliary_loss_mlp": 0.01016507, - "balance_loss_clip": 1.01274395, - "balance_loss_mlp": 1.01488543, - "epoch": 0.5590861265594469, - "flos": 70577432490240.0, - "grad_norm": 0.6867151105979278, - "language_loss": 0.52393436, - "learning_rate": 1.7153002331730967e-06, - "loss": 0.54429573, - "num_input_tokens_seen": 200237010, - "step": 9299, - "time_per_iteration": 3.2783946990966797 - }, - { - "auxiliary_loss_clip": 0.01104521, - "auxiliary_loss_mlp": 0.01037017, - "balance_loss_clip": 1.04119837, - "balance_loss_mlp": 1.02390957, - "epoch": 0.5591462498121148, - "flos": 30664408896000.0, - "grad_norm": 1.9265460961114051, - "language_loss": 0.69143355, - "learning_rate": 1.7149147430129824e-06, - "loss": 0.7128489, - "num_input_tokens_seen": 200260820, - "step": 9300, - "time_per_iteration": 2.716351270675659 - }, - { - "auxiliary_loss_clip": 0.01065458, - "auxiliary_loss_mlp": 0.01057284, - "balance_loss_clip": 1.03432143, - "balance_loss_mlp": 1.04067802, - "epoch": 0.5592063730647828, - "flos": 18150438562560.0, - "grad_norm": 2.0948179426753164, - "language_loss": 0.81994128, - "learning_rate": 1.7145292636633293e-06, - "loss": 0.84116876, - "num_input_tokens_seen": 200278035, - "step": 9301, - "time_per_iteration": 2.6983389854431152 - }, - { - "auxiliary_loss_clip": 0.01117535, - "auxiliary_loss_mlp": 0.01032183, - "balance_loss_clip": 1.04067254, - "balance_loss_mlp": 1.0186348, - "epoch": 0.5592664963174507, - "flos": 24060400945920.0, - "grad_norm": 3.1722185850775553, - "language_loss": 0.68140459, - "learning_rate": 1.714143795138756e-06, - "loss": 0.70290172, - "num_input_tokens_seen": 200297255, - "step": 9302, - "time_per_iteration": 2.5997016429901123 - }, - { - "auxiliary_loss_clip": 0.01088292, - "auxiliary_loss_mlp": 0.01028765, - "balance_loss_clip": 1.04123783, - "balance_loss_mlp": 1.01426911, - "epoch": 0.5593266195701188, - "flos": 19827897661440.0, - "grad_norm": 1.7171276141981482, - "language_loss": 0.70894414, - "learning_rate": 1.713758337453878e-06, - "loss": 0.7301147, - "num_input_tokens_seen": 200317505, - "step": 9303, - "time_per_iteration": 2.720726728439331 - }, - { - "auxiliary_loss_clip": 0.01045978, - "auxiliary_loss_mlp": 0.01043666, - "balance_loss_clip": 1.03466618, - "balance_loss_mlp": 1.02934885, - "epoch": 0.5593867428227867, - "flos": 25300755440640.0, - "grad_norm": 3.8871936508431606, - "language_loss": 0.72614998, - "learning_rate": 1.7133728906233124e-06, - "loss": 0.74704641, - "num_input_tokens_seen": 200338350, - "step": 9304, - "time_per_iteration": 2.7727861404418945 - }, - { - "auxiliary_loss_clip": 0.01107464, - "auxiliary_loss_mlp": 0.01030237, - "balance_loss_clip": 1.04120493, - "balance_loss_mlp": 1.0174104, - "epoch": 0.5594468660754547, - "flos": 12933013374720.0, - "grad_norm": 2.306388303475261, - "language_loss": 0.77981883, - "learning_rate": 1.7129874546616763e-06, - "loss": 0.80119586, - "num_input_tokens_seen": 200353965, - "step": 9305, - "time_per_iteration": 2.5945067405700684 - }, - { - "auxiliary_loss_clip": 0.01069392, - "auxiliary_loss_mlp": 0.01030263, - "balance_loss_clip": 1.04184294, - "balance_loss_mlp": 1.01778793, - "epoch": 0.5595069893281227, - "flos": 19062713208960.0, - "grad_norm": 1.7491845938042618, - "language_loss": 0.69805098, - "learning_rate": 1.7126020295835836e-06, - "loss": 0.71904755, - "num_input_tokens_seen": 200373595, - "step": 9306, - "time_per_iteration": 2.8083784580230713 - }, - { - "auxiliary_loss_clip": 0.01018297, - "auxiliary_loss_mlp": 0.01002442, - "balance_loss_clip": 1.015836, - "balance_loss_mlp": 1.00099397, - "epoch": 0.5595671125807906, - "flos": 70273375862400.0, - "grad_norm": 0.9194279331367995, - "language_loss": 0.60304606, - "learning_rate": 1.7122166154036518e-06, - "loss": 0.62325346, - "num_input_tokens_seen": 200429155, - "step": 9307, - "time_per_iteration": 3.301408052444458 - }, - { - "auxiliary_loss_clip": 0.01104522, - "auxiliary_loss_mlp": 0.01035417, - "balance_loss_clip": 1.0423522, - "balance_loss_mlp": 1.02234626, - "epoch": 0.5596272358334586, - "flos": 20665513889280.0, - "grad_norm": 1.8556565203900444, - "language_loss": 0.73943615, - "learning_rate": 1.7118312121364943e-06, - "loss": 0.76083553, - "num_input_tokens_seen": 200448290, - "step": 9308, - "time_per_iteration": 2.6449387073516846 - }, - { - "auxiliary_loss_clip": 0.01051886, - "auxiliary_loss_mlp": 0.01038908, - "balance_loss_clip": 1.03424501, - "balance_loss_mlp": 1.02397084, - "epoch": 0.5596873590861265, - "flos": 25041013217280.0, - "grad_norm": 2.1877402567653808, - "language_loss": 0.69691569, - "learning_rate": 1.7114458197967257e-06, - "loss": 0.71782362, - "num_input_tokens_seen": 200466555, - "step": 9309, - "time_per_iteration": 4.464626312255859 - }, - { - "auxiliary_loss_clip": 0.01093684, - "auxiliary_loss_mlp": 0.01037862, - "balance_loss_clip": 1.04161119, - "balance_loss_mlp": 1.02288949, - "epoch": 0.5597474823387946, - "flos": 25958387594880.0, - "grad_norm": 1.9102617963629012, - "language_loss": 0.75523353, - "learning_rate": 1.7110604383989613e-06, - "loss": 0.77654898, - "num_input_tokens_seen": 200485980, - "step": 9310, - "time_per_iteration": 4.4445412158966064 - }, - { - "auxiliary_loss_clip": 0.01112006, - "auxiliary_loss_mlp": 0.01037268, - "balance_loss_clip": 1.04378152, - "balance_loss_mlp": 1.02286768, - "epoch": 0.5598076055914625, - "flos": 26177442687360.0, - "grad_norm": 2.0703892527912813, - "language_loss": 0.69657761, - "learning_rate": 1.7106750679578133e-06, - "loss": 0.71807039, - "num_input_tokens_seen": 200504555, - "step": 9311, - "time_per_iteration": 4.303341865539551 - }, - { - "auxiliary_loss_clip": 0.01105172, - "auxiliary_loss_mlp": 0.01034066, - "balance_loss_clip": 1.04042637, - "balance_loss_mlp": 1.02103674, - "epoch": 0.5598677288441305, - "flos": 11655778590720.0, - "grad_norm": 1.8932120118757645, - "language_loss": 0.71856189, - "learning_rate": 1.7102897084878962e-06, - "loss": 0.73995423, - "num_input_tokens_seen": 200522700, - "step": 9312, - "time_per_iteration": 2.610438823699951 - }, - { - "auxiliary_loss_clip": 0.01080705, - "auxiliary_loss_mlp": 0.01033643, - "balance_loss_clip": 1.04290187, - "balance_loss_mlp": 1.02023816, - "epoch": 0.5599278520967984, - "flos": 22966597941120.0, - "grad_norm": 2.1557841469459746, - "language_loss": 0.89152771, - "learning_rate": 1.709904360003822e-06, - "loss": 0.91267115, - "num_input_tokens_seen": 200541910, - "step": 9313, - "time_per_iteration": 2.6854610443115234 - }, - { - "auxiliary_loss_clip": 0.01081962, - "auxiliary_loss_mlp": 0.01044977, - "balance_loss_clip": 1.0415206, - "balance_loss_mlp": 1.03109467, - "epoch": 0.5599879753494664, - "flos": 21215557831680.0, - "grad_norm": 1.521477055933408, - "language_loss": 0.77815449, - "learning_rate": 1.709519022520204e-06, - "loss": 0.79942387, - "num_input_tokens_seen": 200562600, - "step": 9314, - "time_per_iteration": 4.262527942657471 - }, - { - "auxiliary_loss_clip": 0.01082652, - "auxiliary_loss_mlp": 0.01031612, - "balance_loss_clip": 1.0416466, - "balance_loss_mlp": 1.01851654, - "epoch": 0.5600480986021343, - "flos": 31903219105920.0, - "grad_norm": 1.6753660628338782, - "language_loss": 0.70509619, - "learning_rate": 1.7091336960516537e-06, - "loss": 0.72623885, - "num_input_tokens_seen": 200584795, - "step": 9315, - "time_per_iteration": 2.7611892223358154 - }, - { - "auxiliary_loss_clip": 0.0110321, - "auxiliary_loss_mlp": 0.01041043, - "balance_loss_clip": 1.04375148, - "balance_loss_mlp": 1.02726793, - "epoch": 0.5601082218548024, - "flos": 28476048700800.0, - "grad_norm": 1.7587170023253702, - "language_loss": 0.66601861, - "learning_rate": 1.7087483806127824e-06, - "loss": 0.68746114, - "num_input_tokens_seen": 200606945, - "step": 9316, - "time_per_iteration": 2.675050973892212 - }, - { - "auxiliary_loss_clip": 0.0108131, - "auxiliary_loss_mlp": 0.01037022, - "balance_loss_clip": 1.037871, - "balance_loss_mlp": 1.0214529, - "epoch": 0.5601683451074703, - "flos": 24097173494400.0, - "grad_norm": 2.414777902457845, - "language_loss": 0.87209964, - "learning_rate": 1.7083630762182022e-06, - "loss": 0.89328289, - "num_input_tokens_seen": 200626340, - "step": 9317, - "time_per_iteration": 2.7405858039855957 - }, - { - "auxiliary_loss_clip": 0.01115616, - "auxiliary_loss_mlp": 0.01038233, - "balance_loss_clip": 1.04544759, - "balance_loss_mlp": 1.02290869, - "epoch": 0.5602284683601383, - "flos": 26356205698560.0, - "grad_norm": 1.8555836482261492, - "language_loss": 0.76961493, - "learning_rate": 1.7079777828825233e-06, - "loss": 0.79115343, - "num_input_tokens_seen": 200644520, - "step": 9318, - "time_per_iteration": 2.683375597000122 - }, - { - "auxiliary_loss_clip": 0.0110569, - "auxiliary_loss_mlp": 0.01040718, - "balance_loss_clip": 1.04080641, - "balance_loss_mlp": 1.02822459, - "epoch": 0.5602885916128063, - "flos": 24496392228480.0, - "grad_norm": 1.6342768124534643, - "language_loss": 0.76235765, - "learning_rate": 1.7075925006203558e-06, - "loss": 0.7838217, - "num_input_tokens_seen": 200664845, - "step": 9319, - "time_per_iteration": 2.6256465911865234 - }, - { - "auxiliary_loss_clip": 0.01107325, - "auxiliary_loss_mlp": 0.01036412, - "balance_loss_clip": 1.04242063, - "balance_loss_mlp": 1.02393723, - "epoch": 0.5603487148654742, - "flos": 27345006270720.0, - "grad_norm": 1.4761895802927258, - "language_loss": 0.85648036, - "learning_rate": 1.7072072294463101e-06, - "loss": 0.87791771, - "num_input_tokens_seen": 200686535, - "step": 9320, - "time_per_iteration": 2.7295455932617188 - }, - { - "auxiliary_loss_clip": 0.0103543, - "auxiliary_loss_mlp": 0.01003142, - "balance_loss_clip": 1.01980209, - "balance_loss_mlp": 1.00181246, - "epoch": 0.5604088381181422, - "flos": 54087756180480.0, - "grad_norm": 0.7528149861495326, - "language_loss": 0.52530909, - "learning_rate": 1.706821969374996e-06, - "loss": 0.54569471, - "num_input_tokens_seen": 200736965, - "step": 9321, - "time_per_iteration": 3.0199856758117676 - }, - { - "auxiliary_loss_clip": 0.01097468, - "auxiliary_loss_mlp": 0.01035636, - "balance_loss_clip": 1.04187417, - "balance_loss_mlp": 1.02274311, - "epoch": 0.5604689613708101, - "flos": 22236390357120.0, - "grad_norm": 1.366292846882571, - "language_loss": 0.74232858, - "learning_rate": 1.7064367204210216e-06, - "loss": 0.7636596, - "num_input_tokens_seen": 200757420, - "step": 9322, - "time_per_iteration": 2.7239301204681396 - }, - { - "auxiliary_loss_clip": 0.01120105, - "auxiliary_loss_mlp": 0.01033893, - "balance_loss_clip": 1.04226124, - "balance_loss_mlp": 1.01925397, - "epoch": 0.5605290846234782, - "flos": 35297782940160.0, - "grad_norm": 1.6268223998146492, - "language_loss": 0.74119061, - "learning_rate": 1.7060514825989963e-06, - "loss": 0.7627306, - "num_input_tokens_seen": 200779520, - "step": 9323, - "time_per_iteration": 2.7277660369873047 - }, - { - "auxiliary_loss_clip": 0.01097354, - "auxiliary_loss_mlp": 0.01034103, - "balance_loss_clip": 1.04408789, - "balance_loss_mlp": 1.01961303, - "epoch": 0.5605892078761461, - "flos": 20263314326400.0, - "grad_norm": 2.353968750169446, - "language_loss": 0.61679977, - "learning_rate": 1.7056662559235286e-06, - "loss": 0.63811433, - "num_input_tokens_seen": 200799485, - "step": 9324, - "time_per_iteration": 2.681330442428589 - }, - { - "auxiliary_loss_clip": 0.01068442, - "auxiliary_loss_mlp": 0.0103778, - "balance_loss_clip": 1.03685164, - "balance_loss_mlp": 1.02353454, - "epoch": 0.5606493311288141, - "flos": 17308333134720.0, - "grad_norm": 1.7599111661375368, - "language_loss": 0.87798876, - "learning_rate": 1.705281040409226e-06, - "loss": 0.89905095, - "num_input_tokens_seen": 200817540, - "step": 9325, - "time_per_iteration": 2.73244571685791 - }, - { - "auxiliary_loss_clip": 0.01098073, - "auxiliary_loss_mlp": 0.01034138, - "balance_loss_clip": 1.04064608, - "balance_loss_mlp": 1.01970756, - "epoch": 0.560709454381482, - "flos": 21652985658240.0, - "grad_norm": 1.5582793995716135, - "language_loss": 0.7359941, - "learning_rate": 1.7048958360706952e-06, - "loss": 0.75731623, - "num_input_tokens_seen": 200838380, - "step": 9326, - "time_per_iteration": 2.685098886489868 - }, - { - "auxiliary_loss_clip": 0.01099795, - "auxiliary_loss_mlp": 0.01027968, - "balance_loss_clip": 1.04008412, - "balance_loss_mlp": 1.01316798, - "epoch": 0.56076957763415, - "flos": 20303355012480.0, - "grad_norm": 1.8644433543241015, - "language_loss": 0.78216934, - "learning_rate": 1.7045106429225447e-06, - "loss": 0.80344701, - "num_input_tokens_seen": 200855640, - "step": 9327, - "time_per_iteration": 2.7206430435180664 - }, - { - "auxiliary_loss_clip": 0.01106989, - "auxiliary_loss_mlp": 0.01034784, - "balance_loss_clip": 1.04609513, - "balance_loss_mlp": 1.02029371, - "epoch": 0.5608297008868179, - "flos": 25045897466880.0, - "grad_norm": 1.6309153070460434, - "language_loss": 0.78084052, - "learning_rate": 1.7041254609793795e-06, - "loss": 0.80225813, - "num_input_tokens_seen": 200876585, - "step": 9328, - "time_per_iteration": 2.6724750995635986 - }, - { - "auxiliary_loss_clip": 0.01119639, - "auxiliary_loss_mlp": 0.01031594, - "balance_loss_clip": 1.04266322, - "balance_loss_mlp": 1.01832008, - "epoch": 0.560889824139486, - "flos": 19866825025920.0, - "grad_norm": 1.4710158195252034, - "language_loss": 0.73393631, - "learning_rate": 1.7037402902558066e-06, - "loss": 0.75544858, - "num_input_tokens_seen": 200898175, - "step": 9329, - "time_per_iteration": 2.610711097717285 - }, - { - "auxiliary_loss_clip": 0.01100007, - "auxiliary_loss_mlp": 0.00773419, - "balance_loss_clip": 1.04148126, - "balance_loss_mlp": 1.00026274, - "epoch": 0.5609499473921539, - "flos": 22929394429440.0, - "grad_norm": 1.5539142345159989, - "language_loss": 0.83609939, - "learning_rate": 1.7033551307664324e-06, - "loss": 0.85483366, - "num_input_tokens_seen": 200917515, - "step": 9330, - "time_per_iteration": 2.7287333011627197 - }, - { - "auxiliary_loss_clip": 0.01042257, - "auxiliary_loss_mlp": 0.01001028, - "balance_loss_clip": 1.01692343, - "balance_loss_mlp": 0.99974674, - "epoch": 0.5610100706448219, - "flos": 53035825455360.0, - "grad_norm": 0.7095685041475404, - "language_loss": 0.57797414, - "learning_rate": 1.7029699825258603e-06, - "loss": 0.59840697, - "num_input_tokens_seen": 200978615, - "step": 9331, - "time_per_iteration": 3.197101354598999 - }, - { - "auxiliary_loss_clip": 0.01082146, - "auxiliary_loss_mlp": 0.01038466, - "balance_loss_clip": 1.0445832, - "balance_loss_mlp": 1.02405381, - "epoch": 0.5610701938974898, - "flos": 21834944979840.0, - "grad_norm": 1.957386899067858, - "language_loss": 0.82066166, - "learning_rate": 1.7025848455486971e-06, - "loss": 0.8418678, - "num_input_tokens_seen": 200997745, - "step": 9332, - "time_per_iteration": 2.706125497817993 - }, - { - "auxiliary_loss_clip": 0.01106958, - "auxiliary_loss_mlp": 0.01043073, - "balance_loss_clip": 1.04060066, - "balance_loss_mlp": 1.02800488, - "epoch": 0.5611303171501578, - "flos": 17457183095040.0, - "grad_norm": 1.7807099110593088, - "language_loss": 0.81912845, - "learning_rate": 1.7021997198495454e-06, - "loss": 0.8406288, - "num_input_tokens_seen": 201016370, - "step": 9333, - "time_per_iteration": 2.6288132667541504 - }, - { - "auxiliary_loss_clip": 0.01119893, - "auxiliary_loss_mlp": 0.01030061, - "balance_loss_clip": 1.04119062, - "balance_loss_mlp": 1.01676321, - "epoch": 0.5611904404028258, - "flos": 22637799820800.0, - "grad_norm": 1.6112092331225492, - "language_loss": 0.72989404, - "learning_rate": 1.7018146054430108e-06, - "loss": 0.75139362, - "num_input_tokens_seen": 201034310, - "step": 9334, - "time_per_iteration": 2.6088995933532715 - }, - { - "auxiliary_loss_clip": 0.01098453, - "auxiliary_loss_mlp": 0.01040678, - "balance_loss_clip": 1.0453335, - "balance_loss_mlp": 1.02690315, - "epoch": 0.5612505636554938, - "flos": 14316327999360.0, - "grad_norm": 2.5253764454191416, - "language_loss": 0.71248639, - "learning_rate": 1.7014295023436961e-06, - "loss": 0.73387766, - "num_input_tokens_seen": 201052030, - "step": 9335, - "time_per_iteration": 2.633389949798584 - }, - { - "auxiliary_loss_clip": 0.0109857, - "auxiliary_loss_mlp": 0.01034463, - "balance_loss_clip": 1.03983665, - "balance_loss_mlp": 1.02066469, - "epoch": 0.5613106869081618, - "flos": 16508279554560.0, - "grad_norm": 1.8386426637696407, - "language_loss": 0.77176088, - "learning_rate": 1.701044410566205e-06, - "loss": 0.79309118, - "num_input_tokens_seen": 201068445, - "step": 9336, - "time_per_iteration": 2.681753158569336 - }, - { - "auxiliary_loss_clip": 0.01108773, - "auxiliary_loss_mlp": 0.01033965, - "balance_loss_clip": 1.0423466, - "balance_loss_mlp": 1.02086353, - "epoch": 0.5613708101608297, - "flos": 24058569352320.0, - "grad_norm": 2.6196694346701817, - "language_loss": 0.64508319, - "learning_rate": 1.7006593301251393e-06, - "loss": 0.66651058, - "num_input_tokens_seen": 201082140, - "step": 9337, - "time_per_iteration": 2.629194498062134 - }, - { - "auxiliary_loss_clip": 0.01025154, - "auxiliary_loss_mlp": 0.01003147, - "balance_loss_clip": 1.01963842, - "balance_loss_mlp": 1.00190687, - "epoch": 0.5614309334134977, - "flos": 64905735997440.0, - "grad_norm": 0.8917713489246797, - "language_loss": 0.62551695, - "learning_rate": 1.700274261035102e-06, - "loss": 0.64579999, - "num_input_tokens_seen": 201137245, - "step": 9338, - "time_per_iteration": 3.1740610599517822 - }, - { - "auxiliary_loss_clip": 0.01091363, - "auxiliary_loss_mlp": 0.01035931, - "balance_loss_clip": 1.04291368, - "balance_loss_mlp": 1.02275264, - "epoch": 0.5614910566661656, - "flos": 32919849740160.0, - "grad_norm": 1.9155240319962232, - "language_loss": 0.65588379, - "learning_rate": 1.6998892033106946e-06, - "loss": 0.67715669, - "num_input_tokens_seen": 201157270, - "step": 9339, - "time_per_iteration": 2.795539617538452 - }, - { - "auxiliary_loss_clip": 0.0110324, - "auxiliary_loss_mlp": 0.01043787, - "balance_loss_clip": 1.04000616, - "balance_loss_mlp": 1.0283432, - "epoch": 0.5615511799188336, - "flos": 18588871969920.0, - "grad_norm": 1.9415000376687095, - "language_loss": 0.69498181, - "learning_rate": 1.6995041569665184e-06, - "loss": 0.716452, - "num_input_tokens_seen": 201174530, - "step": 9340, - "time_per_iteration": 2.6073222160339355 - }, - { - "auxiliary_loss_clip": 0.01076412, - "auxiliary_loss_mlp": 0.0103814, - "balance_loss_clip": 1.04082394, - "balance_loss_mlp": 1.02536726, - "epoch": 0.5616113031715015, - "flos": 22820010537600.0, - "grad_norm": 1.461608284307224, - "language_loss": 0.77235413, - "learning_rate": 1.6991191220171756e-06, - "loss": 0.79349971, - "num_input_tokens_seen": 201194905, - "step": 9341, - "time_per_iteration": 2.712812662124634 - }, - { - "auxiliary_loss_clip": 0.01069621, - "auxiliary_loss_mlp": 0.01037705, - "balance_loss_clip": 1.03758025, - "balance_loss_mlp": 1.0230068, - "epoch": 0.5616714264241696, - "flos": 22345702421760.0, - "grad_norm": 1.556156421929591, - "language_loss": 0.79645002, - "learning_rate": 1.6987340984772653e-06, - "loss": 0.81752324, - "num_input_tokens_seen": 201213715, - "step": 9342, - "time_per_iteration": 2.774918556213379 - }, - { - "auxiliary_loss_clip": 0.01091015, - "auxiliary_loss_mlp": 0.01035282, - "balance_loss_clip": 1.03911448, - "balance_loss_mlp": 1.02109551, - "epoch": 0.5617315496768375, - "flos": 18807783408000.0, - "grad_norm": 2.3711889370259907, - "language_loss": 0.76042008, - "learning_rate": 1.6983490863613882e-06, - "loss": 0.78168309, - "num_input_tokens_seen": 201231415, - "step": 9343, - "time_per_iteration": 2.7124969959259033 - }, - { - "auxiliary_loss_clip": 0.01080837, - "auxiliary_loss_mlp": 0.01044577, - "balance_loss_clip": 1.04475522, - "balance_loss_mlp": 1.03011727, - "epoch": 0.5617916729295055, - "flos": 18369314087040.0, - "grad_norm": 2.196794276196035, - "language_loss": 0.69644189, - "learning_rate": 1.6979640856841442e-06, - "loss": 0.71769607, - "num_input_tokens_seen": 201249625, - "step": 9344, - "time_per_iteration": 2.7265472412109375 - }, - { - "auxiliary_loss_clip": 0.01121229, - "auxiliary_loss_mlp": 0.01038625, - "balance_loss_clip": 1.04347157, - "balance_loss_mlp": 1.02447486, - "epoch": 0.5618517961821734, - "flos": 28179964892160.0, - "grad_norm": 3.2350770637683106, - "language_loss": 0.6636014, - "learning_rate": 1.6975790964601318e-06, - "loss": 0.68519998, - "num_input_tokens_seen": 201271205, - "step": 9345, - "time_per_iteration": 2.686527729034424 - }, - { - "auxiliary_loss_clip": 0.01098571, - "auxiliary_loss_mlp": 0.01032052, - "balance_loss_clip": 1.04279995, - "balance_loss_mlp": 1.0190227, - "epoch": 0.5619119194348414, - "flos": 15486872411520.0, - "grad_norm": 1.9772946469645978, - "language_loss": 0.87311339, - "learning_rate": 1.6971941187039512e-06, - "loss": 0.89441955, - "num_input_tokens_seen": 201287700, - "step": 9346, - "time_per_iteration": 2.6551971435546875 - }, - { - "auxiliary_loss_clip": 0.0109764, - "auxiliary_loss_mlp": 0.01036969, - "balance_loss_clip": 1.04373372, - "balance_loss_mlp": 1.02243173, - "epoch": 0.5619720426875094, - "flos": 29128652951040.0, - "grad_norm": 2.320939151148892, - "language_loss": 0.59135818, - "learning_rate": 1.6968091524301993e-06, - "loss": 0.61270428, - "num_input_tokens_seen": 201307530, - "step": 9347, - "time_per_iteration": 2.701704263687134 - }, - { - "auxiliary_loss_clip": 0.01113798, - "auxiliary_loss_mlp": 0.01039809, - "balance_loss_clip": 1.0449301, - "balance_loss_mlp": 1.02461553, - "epoch": 0.5620321659401774, - "flos": 18003743418240.0, - "grad_norm": 3.390094180858037, - "language_loss": 0.69345069, - "learning_rate": 1.6964241976534745e-06, - "loss": 0.7149868, - "num_input_tokens_seen": 201326210, - "step": 9348, - "time_per_iteration": 2.6152281761169434 - }, - { - "auxiliary_loss_clip": 0.01072866, - "auxiliary_loss_mlp": 0.01035332, - "balance_loss_clip": 1.03694952, - "balance_loss_mlp": 1.02000761, - "epoch": 0.5620922891928454, - "flos": 20594518657920.0, - "grad_norm": 12.292181580280033, - "language_loss": 0.79008943, - "learning_rate": 1.6960392543883754e-06, - "loss": 0.81117141, - "num_input_tokens_seen": 201346120, - "step": 9349, - "time_per_iteration": 5.937277793884277 - }, - { - "auxiliary_loss_clip": 0.01068645, - "auxiliary_loss_mlp": 0.0103743, - "balance_loss_clip": 1.04074883, - "balance_loss_mlp": 1.02314854, - "epoch": 0.5621524124455133, - "flos": 26287006147200.0, - "grad_norm": 2.217082199318971, - "language_loss": 0.67245173, - "learning_rate": 1.6956543226494975e-06, - "loss": 0.6935125, - "num_input_tokens_seen": 201365700, - "step": 9350, - "time_per_iteration": 4.385211229324341 - }, - { - "auxiliary_loss_clip": 0.01069908, - "auxiliary_loss_mlp": 0.01039418, - "balance_loss_clip": 1.03964508, - "balance_loss_mlp": 1.02451682, - "epoch": 0.5622125356981813, - "flos": 12750299867520.0, - "grad_norm": 2.668539433171336, - "language_loss": 0.78305924, - "learning_rate": 1.6952694024514381e-06, - "loss": 0.80415249, - "num_input_tokens_seen": 201382795, - "step": 9351, - "time_per_iteration": 2.6691691875457764 - }, - { - "auxiliary_loss_clip": 0.01099605, - "auxiliary_loss_mlp": 0.00772893, - "balance_loss_clip": 1.03920138, - "balance_loss_mlp": 1.00020838, - "epoch": 0.5622726589508492, - "flos": 23805327490560.0, - "grad_norm": 1.4861648044093183, - "language_loss": 0.59128547, - "learning_rate": 1.6948844938087945e-06, - "loss": 0.61001039, - "num_input_tokens_seen": 201402780, - "step": 9352, - "time_per_iteration": 2.753941297531128 - }, - { - "auxiliary_loss_clip": 0.01105703, - "auxiliary_loss_mlp": 0.0103746, - "balance_loss_clip": 1.0406158, - "balance_loss_mlp": 1.02476466, - "epoch": 0.5623327822035172, - "flos": 24718212668160.0, - "grad_norm": 1.334754568183942, - "language_loss": 0.71630079, - "learning_rate": 1.6944995967361604e-06, - "loss": 0.73773241, - "num_input_tokens_seen": 201424140, - "step": 9353, - "time_per_iteration": 4.249570369720459 - }, - { - "auxiliary_loss_clip": 0.01098184, - "auxiliary_loss_mlp": 0.01032581, - "balance_loss_clip": 1.04213238, - "balance_loss_mlp": 1.01918769, - "epoch": 0.5623929054561851, - "flos": 14019274523520.0, - "grad_norm": 2.376274628807619, - "language_loss": 0.7593621, - "learning_rate": 1.6941147112481327e-06, - "loss": 0.78066975, - "num_input_tokens_seen": 201439645, - "step": 9354, - "time_per_iteration": 2.689899206161499 - }, - { - "auxiliary_loss_clip": 0.01089457, - "auxiliary_loss_mlp": 0.01035605, - "balance_loss_clip": 1.04167855, - "balance_loss_mlp": 1.02183056, - "epoch": 0.5624530287088532, - "flos": 20704405340160.0, - "grad_norm": 1.8223711210662343, - "language_loss": 0.72909653, - "learning_rate": 1.6937298373593056e-06, - "loss": 0.75034714, - "num_input_tokens_seen": 201459970, - "step": 9355, - "time_per_iteration": 2.755100965499878 - }, - { - "auxiliary_loss_clip": 0.01104288, - "auxiliary_loss_mlp": 0.01032084, - "balance_loss_clip": 1.04146492, - "balance_loss_mlp": 1.01845825, - "epoch": 0.5625131519615211, - "flos": 21470918595840.0, - "grad_norm": 1.4719507883232867, - "language_loss": 0.7346037, - "learning_rate": 1.693344975084274e-06, - "loss": 0.75596744, - "num_input_tokens_seen": 201480055, - "step": 9356, - "time_per_iteration": 2.641638994216919 - }, - { - "auxiliary_loss_clip": 0.01119375, - "auxiliary_loss_mlp": 0.0103593, - "balance_loss_clip": 1.04301476, - "balance_loss_mlp": 1.02204823, - "epoch": 0.5625732752141891, - "flos": 18698004466560.0, - "grad_norm": 2.3002614331876687, - "language_loss": 0.83191347, - "learning_rate": 1.6929601244376318e-06, - "loss": 0.85346651, - "num_input_tokens_seen": 201497645, - "step": 9357, - "time_per_iteration": 2.6374433040618896 - }, - { - "auxiliary_loss_clip": 0.01108702, - "auxiliary_loss_mlp": 0.01033262, - "balance_loss_clip": 1.04158151, - "balance_loss_mlp": 1.02019668, - "epoch": 0.562633398466857, - "flos": 16216900427520.0, - "grad_norm": 2.42238754199954, - "language_loss": 0.72483993, - "learning_rate": 1.6925752854339722e-06, - "loss": 0.74625957, - "num_input_tokens_seen": 201515455, - "step": 9358, - "time_per_iteration": 2.6288702487945557 - }, - { - "auxiliary_loss_clip": 0.01118085, - "auxiliary_loss_mlp": 0.01042212, - "balance_loss_clip": 1.04183221, - "balance_loss_mlp": 1.02859807, - "epoch": 0.562693521719525, - "flos": 22491930689280.0, - "grad_norm": 2.2438292834488838, - "language_loss": 0.7763263, - "learning_rate": 1.6921904580878885e-06, - "loss": 0.79792929, - "num_input_tokens_seen": 201534500, - "step": 9359, - "time_per_iteration": 2.6272196769714355 - }, - { - "auxiliary_loss_clip": 0.0109706, - "auxiliary_loss_mlp": 0.01033721, - "balance_loss_clip": 1.04087317, - "balance_loss_mlp": 1.0212934, - "epoch": 0.562753644972193, - "flos": 25331171281920.0, - "grad_norm": 1.8703344042445116, - "language_loss": 0.70466304, - "learning_rate": 1.6918056424139736e-06, - "loss": 0.72597086, - "num_input_tokens_seen": 201553280, - "step": 9360, - "time_per_iteration": 2.6694719791412354 - }, - { - "auxiliary_loss_clip": 0.00993761, - "auxiliary_loss_mlp": 0.00999248, - "balance_loss_clip": 1.01494741, - "balance_loss_mlp": 0.99799061, - "epoch": 0.562813768224861, - "flos": 67392622126080.0, - "grad_norm": 0.7735600550199924, - "language_loss": 0.5555625, - "learning_rate": 1.6914208384268197e-06, - "loss": 0.57549262, - "num_input_tokens_seen": 201610030, - "step": 9361, - "time_per_iteration": 3.2061593532562256 - }, - { - "auxiliary_loss_clip": 0.01093709, - "auxiliary_loss_mlp": 0.01035172, - "balance_loss_clip": 1.04106104, - "balance_loss_mlp": 1.02236927, - "epoch": 0.562873891477529, - "flos": 23331163029120.0, - "grad_norm": 1.4272041180912485, - "language_loss": 0.8169086, - "learning_rate": 1.691036046141018e-06, - "loss": 0.83819747, - "num_input_tokens_seen": 201628370, - "step": 9362, - "time_per_iteration": 2.648585319519043 - }, - { - "auxiliary_loss_clip": 0.01084349, - "auxiliary_loss_mlp": 0.00771085, - "balance_loss_clip": 1.03982627, - "balance_loss_mlp": 1.00021708, - "epoch": 0.5629340147301969, - "flos": 38472824805120.0, - "grad_norm": 1.5810217639510977, - "language_loss": 0.7460767, - "learning_rate": 1.6906512655711614e-06, - "loss": 0.76463103, - "num_input_tokens_seen": 201649790, - "step": 9363, - "time_per_iteration": 2.8376948833465576 - }, - { - "auxiliary_loss_clip": 0.01114455, - "auxiliary_loss_mlp": 0.01034672, - "balance_loss_clip": 1.04345608, - "balance_loss_mlp": 1.02068281, - "epoch": 0.5629941379828649, - "flos": 29242023252480.0, - "grad_norm": 1.625625465741998, - "language_loss": 0.82640725, - "learning_rate": 1.690266496731839e-06, - "loss": 0.84789848, - "num_input_tokens_seen": 201669175, - "step": 9364, - "time_per_iteration": 2.6790480613708496 - }, - { - "auxiliary_loss_clip": 0.0107898, - "auxiliary_loss_mlp": 0.0103866, - "balance_loss_clip": 1.03860497, - "balance_loss_mlp": 1.02573752, - "epoch": 0.5630542612355328, - "flos": 19420885676160.0, - "grad_norm": 2.0942443962927513, - "language_loss": 0.65238589, - "learning_rate": 1.689881739637642e-06, - "loss": 0.67356229, - "num_input_tokens_seen": 201687000, - "step": 9365, - "time_per_iteration": 2.6504223346710205 - }, - { - "auxiliary_loss_clip": 0.01099908, - "auxiliary_loss_mlp": 0.01040371, - "balance_loss_clip": 1.0423665, - "balance_loss_mlp": 1.0259583, - "epoch": 0.5631143844882008, - "flos": 22266303408000.0, - "grad_norm": 5.761173374312871, - "language_loss": 0.8185727, - "learning_rate": 1.6894969943031611e-06, - "loss": 0.83997548, - "num_input_tokens_seen": 201703335, - "step": 9366, - "time_per_iteration": 2.6865267753601074 - }, - { - "auxiliary_loss_clip": 0.01118809, - "auxiliary_loss_mlp": 0.01033751, - "balance_loss_clip": 1.04305601, - "balance_loss_mlp": 1.02106667, - "epoch": 0.5631745077408687, - "flos": 22965305051520.0, - "grad_norm": 1.4687745386206819, - "language_loss": 0.73388821, - "learning_rate": 1.6891122607429845e-06, - "loss": 0.75541377, - "num_input_tokens_seen": 201723495, - "step": 9367, - "time_per_iteration": 2.6309821605682373 - }, - { - "auxiliary_loss_clip": 0.01020057, - "auxiliary_loss_mlp": 0.01004541, - "balance_loss_clip": 1.01475585, - "balance_loss_mlp": 1.0032177, - "epoch": 0.5632346309935368, - "flos": 65080515576960.0, - "grad_norm": 0.6203732228424765, - "language_loss": 0.53471267, - "learning_rate": 1.6887275389717028e-06, - "loss": 0.5549587, - "num_input_tokens_seen": 201792615, - "step": 9368, - "time_per_iteration": 3.285132884979248 - }, - { - "auxiliary_loss_clip": 0.01119712, - "auxiliary_loss_mlp": 0.01038636, - "balance_loss_clip": 1.04367208, - "balance_loss_mlp": 1.02514756, - "epoch": 0.5632947542462047, - "flos": 23002903612800.0, - "grad_norm": 1.6032046035258145, - "language_loss": 0.69323123, - "learning_rate": 1.6883428290039046e-06, - "loss": 0.71481466, - "num_input_tokens_seen": 201812520, - "step": 9369, - "time_per_iteration": 2.5828912258148193 - }, - { - "auxiliary_loss_clip": 0.01081861, - "auxiliary_loss_mlp": 0.01036769, - "balance_loss_clip": 1.03560948, - "balance_loss_mlp": 1.02258897, - "epoch": 0.5633548774988727, - "flos": 30482593228800.0, - "grad_norm": 1.8644770946275213, - "language_loss": 0.75840139, - "learning_rate": 1.6879581308541763e-06, - "loss": 0.77958775, - "num_input_tokens_seen": 201834185, - "step": 9370, - "time_per_iteration": 2.7649481296539307 - }, - { - "auxiliary_loss_clip": 0.01095504, - "auxiliary_loss_mlp": 0.01038896, - "balance_loss_clip": 1.04126322, - "balance_loss_mlp": 1.02440023, - "epoch": 0.5634150007515406, - "flos": 18515039564160.0, - "grad_norm": 2.2895815027179864, - "language_loss": 0.755108, - "learning_rate": 1.687573444537108e-06, - "loss": 0.776452, - "num_input_tokens_seen": 201851305, - "step": 9371, - "time_per_iteration": 2.591031312942505 - }, - { - "auxiliary_loss_clip": 0.01106226, - "auxiliary_loss_mlp": 0.01040784, - "balance_loss_clip": 1.04110384, - "balance_loss_mlp": 1.02787304, - "epoch": 0.5634751240042086, - "flos": 19244672530560.0, - "grad_norm": 1.7615457998604214, - "language_loss": 0.76489764, - "learning_rate": 1.687188770067285e-06, - "loss": 0.78636777, - "num_input_tokens_seen": 201870350, - "step": 9372, - "time_per_iteration": 2.619053840637207 - }, - { - "auxiliary_loss_clip": 0.01090528, - "auxiliary_loss_mlp": 0.01030605, - "balance_loss_clip": 1.03906, - "balance_loss_mlp": 1.01705718, - "epoch": 0.5635352472568766, - "flos": 12020630987520.0, - "grad_norm": 2.266062441891877, - "language_loss": 0.71336401, - "learning_rate": 1.6868041074592956e-06, - "loss": 0.73457533, - "num_input_tokens_seen": 201886800, - "step": 9373, - "time_per_iteration": 2.624600887298584 - }, - { - "auxiliary_loss_clip": 0.01090554, - "auxiliary_loss_mlp": 0.01031384, - "balance_loss_clip": 1.04418933, - "balance_loss_mlp": 1.0168401, - "epoch": 0.5635953705095446, - "flos": 21871645701120.0, - "grad_norm": 2.1043627154333797, - "language_loss": 0.82543874, - "learning_rate": 1.6864194567277264e-06, - "loss": 0.84665811, - "num_input_tokens_seen": 201904730, - "step": 9374, - "time_per_iteration": 2.644887924194336 - }, - { - "auxiliary_loss_clip": 0.01104117, - "auxiliary_loss_mlp": 0.01030499, - "balance_loss_clip": 1.03739262, - "balance_loss_mlp": 1.01734459, - "epoch": 0.5636554937622126, - "flos": 27126166659840.0, - "grad_norm": 1.7268514389800265, - "language_loss": 0.66357785, - "learning_rate": 1.6860348178871618e-06, - "loss": 0.68492401, - "num_input_tokens_seen": 201924850, - "step": 9375, - "time_per_iteration": 2.65166974067688 - }, - { - "auxiliary_loss_clip": 0.01084894, - "auxiliary_loss_mlp": 0.00770652, - "balance_loss_clip": 1.04238153, - "balance_loss_mlp": 1.00019169, - "epoch": 0.5637156170148805, - "flos": 12926405272320.0, - "grad_norm": 2.3049359861127696, - "language_loss": 0.81049269, - "learning_rate": 1.6856501909521889e-06, - "loss": 0.82904816, - "num_input_tokens_seen": 201939500, - "step": 9376, - "time_per_iteration": 2.766364336013794 - }, - { - "auxiliary_loss_clip": 0.01101359, - "auxiliary_loss_mlp": 0.01034779, - "balance_loss_clip": 1.04133999, - "balance_loss_mlp": 1.02115881, - "epoch": 0.5637757402675485, - "flos": 45551033130240.0, - "grad_norm": 1.6449694311006493, - "language_loss": 0.6926713, - "learning_rate": 1.6852655759373925e-06, - "loss": 0.71403265, - "num_input_tokens_seen": 201963000, - "step": 9377, - "time_per_iteration": 2.870060443878174 - }, - { - "auxiliary_loss_clip": 0.01074381, - "auxiliary_loss_mlp": 0.01032969, - "balance_loss_clip": 1.03875685, - "balance_loss_mlp": 1.01979017, - "epoch": 0.5638358635202164, - "flos": 20886041439360.0, - "grad_norm": 1.3919625147372467, - "language_loss": 0.74771237, - "learning_rate": 1.6848809728573565e-06, - "loss": 0.76878589, - "num_input_tokens_seen": 201983145, - "step": 9378, - "time_per_iteration": 2.749613046646118 - }, - { - "auxiliary_loss_clip": 0.01122728, - "auxiliary_loss_mlp": 0.01035934, - "balance_loss_clip": 1.04050553, - "balance_loss_mlp": 1.02154493, - "epoch": 0.5638959867728844, - "flos": 18806562345600.0, - "grad_norm": 2.63873718495401, - "language_loss": 0.81853002, - "learning_rate": 1.6844963817266656e-06, - "loss": 0.84011662, - "num_input_tokens_seen": 202000335, - "step": 9379, - "time_per_iteration": 2.625277280807495 - }, - { - "auxiliary_loss_clip": 0.01093031, - "auxiliary_loss_mlp": 0.01036774, - "balance_loss_clip": 1.03674948, - "balance_loss_mlp": 1.02336287, - "epoch": 0.5639561100255523, - "flos": 27490336698240.0, - "grad_norm": 2.218934810530396, - "language_loss": 0.7167027, - "learning_rate": 1.6841118025599042e-06, - "loss": 0.73800081, - "num_input_tokens_seen": 202018275, - "step": 9380, - "time_per_iteration": 2.715791940689087 - }, - { - "auxiliary_loss_clip": 0.01086194, - "auxiliary_loss_mlp": 0.01039984, - "balance_loss_clip": 1.0455358, - "balance_loss_mlp": 1.02485633, - "epoch": 0.5640162332782204, - "flos": 18076570243200.0, - "grad_norm": 2.0069687855649234, - "language_loss": 0.74178547, - "learning_rate": 1.6837272353716542e-06, - "loss": 0.76304728, - "num_input_tokens_seen": 202034330, - "step": 9381, - "time_per_iteration": 2.8091652393341064 - }, - { - "auxiliary_loss_clip": 0.01068257, - "auxiliary_loss_mlp": 0.01041209, - "balance_loss_clip": 1.03590226, - "balance_loss_mlp": 1.02741027, - "epoch": 0.5640763565308883, - "flos": 20884856290560.0, - "grad_norm": 2.008212488841835, - "language_loss": 0.72358, - "learning_rate": 1.683342680176499e-06, - "loss": 0.74467456, - "num_input_tokens_seen": 202053100, - "step": 9382, - "time_per_iteration": 2.750049114227295 - }, - { - "auxiliary_loss_clip": 0.0103983, - "auxiliary_loss_mlp": 0.01012073, - "balance_loss_clip": 1.01468074, - "balance_loss_mlp": 1.01088643, - "epoch": 0.5641364797835563, - "flos": 64447912224000.0, - "grad_norm": 0.7132903418918451, - "language_loss": 0.54439944, - "learning_rate": 1.682958136989022e-06, - "loss": 0.56491846, - "num_input_tokens_seen": 202120125, - "step": 9383, - "time_per_iteration": 3.308600425720215 - }, - { - "auxiliary_loss_clip": 0.01106116, - "auxiliary_loss_mlp": 0.01030643, - "balance_loss_clip": 1.04080617, - "balance_loss_mlp": 1.01664162, - "epoch": 0.5641966030362242, - "flos": 18660944609280.0, - "grad_norm": 1.7587549687902173, - "language_loss": 0.71036148, - "learning_rate": 1.6825736058238033e-06, - "loss": 0.73172909, - "num_input_tokens_seen": 202138030, - "step": 9384, - "time_per_iteration": 2.705378532409668 - }, - { - "auxiliary_loss_clip": 0.01098378, - "auxiliary_loss_mlp": 0.01035226, - "balance_loss_clip": 1.04193604, - "balance_loss_mlp": 1.02113533, - "epoch": 0.5642567262888922, - "flos": 22492325738880.0, - "grad_norm": 2.5060474723218724, - "language_loss": 0.75891483, - "learning_rate": 1.6821890866954263e-06, - "loss": 0.78025091, - "num_input_tokens_seen": 202155580, - "step": 9385, - "time_per_iteration": 2.648486375808716 - }, - { - "auxiliary_loss_clip": 0.01102679, - "auxiliary_loss_mlp": 0.01034721, - "balance_loss_clip": 1.03705001, - "balance_loss_mlp": 1.02121449, - "epoch": 0.5643168495415603, - "flos": 13003972692480.0, - "grad_norm": 1.9370694733196534, - "language_loss": 0.82360542, - "learning_rate": 1.6818045796184703e-06, - "loss": 0.84497941, - "num_input_tokens_seen": 202170365, - "step": 9386, - "time_per_iteration": 2.6014211177825928 - }, - { - "auxiliary_loss_clip": 0.01108433, - "auxiliary_loss_mlp": 0.01035233, - "balance_loss_clip": 1.04246962, - "balance_loss_mlp": 1.02117205, - "epoch": 0.5643769727942282, - "flos": 18588297352320.0, - "grad_norm": 2.256739627854675, - "language_loss": 0.69928676, - "learning_rate": 1.681420084607516e-06, - "loss": 0.72072339, - "num_input_tokens_seen": 202189095, - "step": 9387, - "time_per_iteration": 2.6225178241729736 - }, - { - "auxiliary_loss_clip": 0.01110032, - "auxiliary_loss_mlp": 0.01036058, - "balance_loss_clip": 1.04169261, - "balance_loss_mlp": 1.02292085, - "epoch": 0.5644370960468962, - "flos": 33806269572480.0, - "grad_norm": 1.4294069994917775, - "language_loss": 0.74616826, - "learning_rate": 1.6810356016771452e-06, - "loss": 0.76762915, - "num_input_tokens_seen": 202213500, - "step": 9388, - "time_per_iteration": 4.3489909172058105 - }, - { - "auxiliary_loss_clip": 0.01103005, - "auxiliary_loss_mlp": 0.01033351, - "balance_loss_clip": 1.04041004, - "balance_loss_mlp": 1.02143562, - "epoch": 0.5644972192995641, - "flos": 21214911386880.0, - "grad_norm": 1.5515532198665989, - "language_loss": 0.81965339, - "learning_rate": 1.6806511308419353e-06, - "loss": 0.84101695, - "num_input_tokens_seen": 202231920, - "step": 9389, - "time_per_iteration": 5.713036060333252 - }, - { - "auxiliary_loss_clip": 0.01083726, - "auxiliary_loss_mlp": 0.01035772, - "balance_loss_clip": 1.03770804, - "balance_loss_mlp": 1.02090037, - "epoch": 0.5645573425522321, - "flos": 18587722734720.0, - "grad_norm": 2.017294292301613, - "language_loss": 0.63844502, - "learning_rate": 1.680266672116467e-06, - "loss": 0.65964001, - "num_input_tokens_seen": 202247600, - "step": 9390, - "time_per_iteration": 2.718738079071045 - }, - { - "auxiliary_loss_clip": 0.01096947, - "auxiliary_loss_mlp": 0.01030588, - "balance_loss_clip": 1.04229331, - "balance_loss_mlp": 1.01875103, - "epoch": 0.5646174658049, - "flos": 18113809668480.0, - "grad_norm": 1.8385345725956297, - "language_loss": 0.92190915, - "learning_rate": 1.6798822255153192e-06, - "loss": 0.94318449, - "num_input_tokens_seen": 202265350, - "step": 9391, - "time_per_iteration": 2.6871705055236816 - }, - { - "auxiliary_loss_clip": 0.01118295, - "auxiliary_loss_mlp": 0.01037212, - "balance_loss_clip": 1.04650784, - "balance_loss_mlp": 1.02288282, - "epoch": 0.564677589057568, - "flos": 28329964087680.0, - "grad_norm": 2.30014312113224, - "language_loss": 0.60238105, - "learning_rate": 1.6794977910530684e-06, - "loss": 0.62393618, - "num_input_tokens_seen": 202284285, - "step": 9392, - "time_per_iteration": 2.6965878009796143 - }, - { - "auxiliary_loss_clip": 0.01068376, - "auxiliary_loss_mlp": 0.01027584, - "balance_loss_clip": 1.03531122, - "balance_loss_mlp": 1.01367223, - "epoch": 0.564737712310236, - "flos": 22163743100160.0, - "grad_norm": 2.2381091213593924, - "language_loss": 0.81505215, - "learning_rate": 1.6791133687442937e-06, - "loss": 0.83601177, - "num_input_tokens_seen": 202303450, - "step": 9393, - "time_per_iteration": 4.253687620162964 - }, - { - "auxiliary_loss_clip": 0.01095131, - "auxiliary_loss_mlp": 0.01031195, - "balance_loss_clip": 1.03995085, - "balance_loss_mlp": 1.01804614, - "epoch": 0.564797835562904, - "flos": 20959011918720.0, - "grad_norm": 1.6857006339700658, - "language_loss": 0.87381589, - "learning_rate": 1.6787289586035725e-06, - "loss": 0.89507914, - "num_input_tokens_seen": 202322315, - "step": 9394, - "time_per_iteration": 2.6733334064483643 - }, - { - "auxiliary_loss_clip": 0.0110875, - "auxiliary_loss_mlp": 0.01033757, - "balance_loss_clip": 1.04296374, - "balance_loss_mlp": 1.02065587, - "epoch": 0.5648579588155719, - "flos": 17420302805760.0, - "grad_norm": 1.9505278392416294, - "language_loss": 0.84685338, - "learning_rate": 1.6783445606454814e-06, - "loss": 0.86827838, - "num_input_tokens_seen": 202339905, - "step": 9395, - "time_per_iteration": 2.6754062175750732 - }, - { - "auxiliary_loss_clip": 0.0102964, - "auxiliary_loss_mlp": 0.01000117, - "balance_loss_clip": 1.01416993, - "balance_loss_mlp": 0.99888915, - "epoch": 0.5649180820682399, - "flos": 69929568835200.0, - "grad_norm": 0.7966393150311729, - "language_loss": 0.58260763, - "learning_rate": 1.677960174884597e-06, - "loss": 0.60290521, - "num_input_tokens_seen": 202397320, - "step": 9396, - "time_per_iteration": 3.176486015319824 - }, - { - "auxiliary_loss_clip": 0.01099184, - "auxiliary_loss_mlp": 0.01030849, - "balance_loss_clip": 1.04099381, - "balance_loss_mlp": 1.01762295, - "epoch": 0.5649782053209078, - "flos": 24973070641920.0, - "grad_norm": 1.8659420980935195, - "language_loss": 0.70408708, - "learning_rate": 1.6775758013354943e-06, - "loss": 0.72538739, - "num_input_tokens_seen": 202416865, - "step": 9397, - "time_per_iteration": 2.76436710357666 - }, - { - "auxiliary_loss_clip": 0.01087737, - "auxiliary_loss_mlp": 0.01036875, - "balance_loss_clip": 1.0412184, - "balance_loss_mlp": 1.02305877, - "epoch": 0.5650383285735758, - "flos": 21726602582400.0, - "grad_norm": 1.7242630837852022, - "language_loss": 0.66510224, - "learning_rate": 1.67719144001275e-06, - "loss": 0.68634838, - "num_input_tokens_seen": 202436210, - "step": 9398, - "time_per_iteration": 2.8452060222625732 - }, - { - "auxiliary_loss_clip": 0.0102199, - "auxiliary_loss_mlp": 0.01002651, - "balance_loss_clip": 1.01533413, - "balance_loss_mlp": 1.00157201, - "epoch": 0.5650984518262439, - "flos": 65904484636800.0, - "grad_norm": 0.7636877487193632, - "language_loss": 0.58165693, - "learning_rate": 1.6768070909309386e-06, - "loss": 0.60190332, - "num_input_tokens_seen": 202492925, - "step": 9399, - "time_per_iteration": 3.1523597240448 - }, - { - "auxiliary_loss_clip": 0.01076045, - "auxiliary_loss_mlp": 0.01036845, - "balance_loss_clip": 1.03608418, - "balance_loss_mlp": 1.02109778, - "epoch": 0.5651585750789118, - "flos": 21032592929280.0, - "grad_norm": 2.707299355352823, - "language_loss": 0.7311101, - "learning_rate": 1.6764227541046347e-06, - "loss": 0.75223899, - "num_input_tokens_seen": 202511905, - "step": 9400, - "time_per_iteration": 2.778313636779785 - }, - { - "auxiliary_loss_clip": 0.01093566, - "auxiliary_loss_mlp": 0.01038541, - "balance_loss_clip": 1.04261565, - "balance_loss_mlp": 1.02349663, - "epoch": 0.5652186983315798, - "flos": 18551919853440.0, - "grad_norm": 1.7896331589473868, - "language_loss": 0.6111843, - "learning_rate": 1.676038429548412e-06, - "loss": 0.63250542, - "num_input_tokens_seen": 202529815, - "step": 9401, - "time_per_iteration": 2.7110683917999268 - }, - { - "auxiliary_loss_clip": 0.01077473, - "auxiliary_loss_mlp": 0.01030698, - "balance_loss_clip": 1.03607464, - "balance_loss_mlp": 1.01735282, - "epoch": 0.5652788215842477, - "flos": 18478662065280.0, - "grad_norm": 3.6521869515488405, - "language_loss": 0.81323993, - "learning_rate": 1.6756541172768453e-06, - "loss": 0.83432162, - "num_input_tokens_seen": 202547710, - "step": 9402, - "time_per_iteration": 2.8134961128234863 - }, - { - "auxiliary_loss_clip": 0.0106172, - "auxiliary_loss_mlp": 0.01043189, - "balance_loss_clip": 1.03186333, - "balance_loss_mlp": 1.02785897, - "epoch": 0.5653389448369157, - "flos": 30044052080640.0, - "grad_norm": 1.434807389128129, - "language_loss": 0.77711642, - "learning_rate": 1.6752698173045068e-06, - "loss": 0.79816544, - "num_input_tokens_seen": 202568835, - "step": 9403, - "time_per_iteration": 2.9176833629608154 - }, - { - "auxiliary_loss_clip": 0.01064861, - "auxiliary_loss_mlp": 0.01036876, - "balance_loss_clip": 1.03543758, - "balance_loss_mlp": 1.02137828, - "epoch": 0.5653990680895836, - "flos": 16727550128640.0, - "grad_norm": 1.6891349615397695, - "language_loss": 0.69381618, - "learning_rate": 1.6748855296459685e-06, - "loss": 0.71483362, - "num_input_tokens_seen": 202587385, - "step": 9404, - "time_per_iteration": 2.8122291564941406 - }, - { - "auxiliary_loss_clip": 0.01081972, - "auxiliary_loss_mlp": 0.01035091, - "balance_loss_clip": 1.03926969, - "balance_loss_mlp": 1.02245533, - "epoch": 0.5654591913422516, - "flos": 14538256179840.0, - "grad_norm": 1.8707097320787585, - "language_loss": 0.66802347, - "learning_rate": 1.6745012543158045e-06, - "loss": 0.68919408, - "num_input_tokens_seen": 202604815, - "step": 9405, - "time_per_iteration": 2.6256675720214844 - }, - { - "auxiliary_loss_clip": 0.01087827, - "auxiliary_loss_mlp": 0.01038368, - "balance_loss_clip": 1.03976154, - "balance_loss_mlp": 1.02543378, - "epoch": 0.5655193145949196, - "flos": 26209905603840.0, - "grad_norm": 1.7731068900459501, - "language_loss": 0.74520212, - "learning_rate": 1.6741169913285852e-06, - "loss": 0.76646411, - "num_input_tokens_seen": 202623775, - "step": 9406, - "time_per_iteration": 2.7220685482025146 - }, - { - "auxiliary_loss_clip": 0.01061139, - "auxiliary_loss_mlp": 0.01043351, - "balance_loss_clip": 1.03829598, - "balance_loss_mlp": 1.02655435, - "epoch": 0.5655794378475876, - "flos": 25046579825280.0, - "grad_norm": 1.7152353741974506, - "language_loss": 0.7952764, - "learning_rate": 1.673732740698882e-06, - "loss": 0.81632137, - "num_input_tokens_seen": 202643375, - "step": 9407, - "time_per_iteration": 2.785325765609741 - }, - { - "auxiliary_loss_clip": 0.01077703, - "auxiliary_loss_mlp": 0.01039246, - "balance_loss_clip": 1.03728688, - "balance_loss_mlp": 1.02510178, - "epoch": 0.5656395611002555, - "flos": 31032852652800.0, - "grad_norm": 1.3619251628826352, - "language_loss": 0.71023029, - "learning_rate": 1.6733485024412666e-06, - "loss": 0.73139971, - "num_input_tokens_seen": 202668400, - "step": 9408, - "time_per_iteration": 2.8171489238739014 - }, - { - "auxiliary_loss_clip": 0.01061658, - "auxiliary_loss_mlp": 0.01035867, - "balance_loss_clip": 1.03865576, - "balance_loss_mlp": 1.02198541, - "epoch": 0.5656996843529235, - "flos": 20229522606720.0, - "grad_norm": 1.9952093590252573, - "language_loss": 0.81203496, - "learning_rate": 1.672964276570308e-06, - "loss": 0.8330102, - "num_input_tokens_seen": 202685125, - "step": 9409, - "time_per_iteration": 2.770899772644043 - }, - { - "auxiliary_loss_clip": 0.01076156, - "auxiliary_loss_mlp": 0.01030595, - "balance_loss_clip": 1.03786421, - "balance_loss_mlp": 1.01730919, - "epoch": 0.5657598076055914, - "flos": 20996251344000.0, - "grad_norm": 1.8859201816541107, - "language_loss": 0.78039193, - "learning_rate": 1.6725800631005776e-06, - "loss": 0.80145949, - "num_input_tokens_seen": 202703830, - "step": 9410, - "time_per_iteration": 2.6944680213928223 - }, - { - "auxiliary_loss_clip": 0.01121778, - "auxiliary_loss_mlp": 0.01042462, - "balance_loss_clip": 1.04339719, - "balance_loss_mlp": 1.02865767, - "epoch": 0.5658199308582594, - "flos": 11545999649280.0, - "grad_norm": 2.199230863577756, - "language_loss": 0.83460367, - "learning_rate": 1.6721958620466432e-06, - "loss": 0.85624611, - "num_input_tokens_seen": 202719835, - "step": 9411, - "time_per_iteration": 2.576122760772705 - }, - { - "auxiliary_loss_clip": 0.01112938, - "auxiliary_loss_mlp": 0.01033542, - "balance_loss_clip": 1.04195237, - "balance_loss_mlp": 1.01830769, - "epoch": 0.5658800541109275, - "flos": 14172146807040.0, - "grad_norm": 3.221148840875553, - "language_loss": 0.67855954, - "learning_rate": 1.6718116734230749e-06, - "loss": 0.70002437, - "num_input_tokens_seen": 202736795, - "step": 9412, - "time_per_iteration": 2.6416120529174805 - }, - { - "auxiliary_loss_clip": 0.01104164, - "auxiliary_loss_mlp": 0.01032428, - "balance_loss_clip": 1.04040003, - "balance_loss_mlp": 1.02026224, - "epoch": 0.5659401773635954, - "flos": 27305073325440.0, - "grad_norm": 1.6585263288332466, - "language_loss": 0.58582389, - "learning_rate": 1.6714274972444413e-06, - "loss": 0.60718977, - "num_input_tokens_seen": 202756900, - "step": 9413, - "time_per_iteration": 2.678048610687256 - }, - { - "auxiliary_loss_clip": 0.01039217, - "auxiliary_loss_mlp": 0.01044241, - "balance_loss_clip": 1.03433728, - "balance_loss_mlp": 1.02943516, - "epoch": 0.5660003006162634, - "flos": 16728196573440.0, - "grad_norm": 1.5449777270978375, - "language_loss": 0.69369984, - "learning_rate": 1.6710433335253092e-06, - "loss": 0.71453446, - "num_input_tokens_seen": 202775145, - "step": 9414, - "time_per_iteration": 2.7721176147460938 - }, - { - "auxiliary_loss_clip": 0.01048825, - "auxiliary_loss_mlp": 0.01033596, - "balance_loss_clip": 1.04257154, - "balance_loss_mlp": 1.02139449, - "epoch": 0.5660604238689313, - "flos": 21653452535040.0, - "grad_norm": 1.812121190686056, - "language_loss": 0.78028589, - "learning_rate": 1.670659182280247e-06, - "loss": 0.80111009, - "num_input_tokens_seen": 202794505, - "step": 9415, - "time_per_iteration": 3.0027029514312744 - }, - { - "auxiliary_loss_clip": 0.01020707, - "auxiliary_loss_mlp": 0.01005189, - "balance_loss_clip": 1.01482093, - "balance_loss_mlp": 1.00411057, - "epoch": 0.5661205471215993, - "flos": 68824022083200.0, - "grad_norm": 0.6894107195855314, - "language_loss": 0.4917945, - "learning_rate": 1.670275043523822e-06, - "loss": 0.51205349, - "num_input_tokens_seen": 202858580, - "step": 9416, - "time_per_iteration": 3.564145565032959 - }, - { - "auxiliary_loss_clip": 0.01107627, - "auxiliary_loss_mlp": 0.00770936, - "balance_loss_clip": 1.04195189, - "balance_loss_mlp": 1.00020862, - "epoch": 0.5661806703742672, - "flos": 28621774177920.0, - "grad_norm": 1.657672708695628, - "language_loss": 0.62541103, - "learning_rate": 1.6698909172706e-06, - "loss": 0.64419663, - "num_input_tokens_seen": 202878565, - "step": 9417, - "time_per_iteration": 2.6624128818511963 - }, - { - "auxiliary_loss_clip": 0.01098355, - "auxiliary_loss_mlp": 0.01033838, - "balance_loss_clip": 1.03992152, - "balance_loss_mlp": 1.02003968, - "epoch": 0.5662407936269352, - "flos": 21397948116480.0, - "grad_norm": 1.9219049023075434, - "language_loss": 0.68760461, - "learning_rate": 1.6695068035351479e-06, - "loss": 0.7089265, - "num_input_tokens_seen": 202897350, - "step": 9418, - "time_per_iteration": 2.686701774597168 - }, - { - "auxiliary_loss_clip": 0.0110608, - "auxiliary_loss_mlp": 0.01034957, - "balance_loss_clip": 1.03848708, - "balance_loss_mlp": 1.01997232, - "epoch": 0.5663009168796032, - "flos": 25660005315840.0, - "grad_norm": 1.8426385136450754, - "language_loss": 0.65225303, - "learning_rate": 1.6691227023320304e-06, - "loss": 0.67366338, - "num_input_tokens_seen": 202916745, - "step": 9419, - "time_per_iteration": 2.7483572959899902 - }, - { - "auxiliary_loss_clip": 0.00978175, - "auxiliary_loss_mlp": 0.01018666, - "balance_loss_clip": 1.01932096, - "balance_loss_mlp": 1.01722336, - "epoch": 0.5663610401322712, - "flos": 67930458422400.0, - "grad_norm": 0.7448874820638522, - "language_loss": 0.59677726, - "learning_rate": 1.6687386136758135e-06, - "loss": 0.61674571, - "num_input_tokens_seen": 202982375, - "step": 9420, - "time_per_iteration": 3.422990083694458 - }, - { - "auxiliary_loss_clip": 0.01098663, - "auxiliary_loss_mlp": 0.00770427, - "balance_loss_clip": 1.0412631, - "balance_loss_mlp": 1.00017929, - "epoch": 0.5664211633849391, - "flos": 24609367480320.0, - "grad_norm": 1.5681535851968893, - "language_loss": 0.74130625, - "learning_rate": 1.6683545375810618e-06, - "loss": 0.75999713, - "num_input_tokens_seen": 203002430, - "step": 9421, - "time_per_iteration": 2.8006680011749268 - }, - { - "auxiliary_loss_clip": 0.0108426, - "auxiliary_loss_mlp": 0.01035979, - "balance_loss_clip": 1.03777134, - "balance_loss_mlp": 1.02212119, - "epoch": 0.5664812866376071, - "flos": 11648811352320.0, - "grad_norm": 2.1577016458252567, - "language_loss": 0.72988069, - "learning_rate": 1.6679704740623389e-06, - "loss": 0.75108308, - "num_input_tokens_seen": 203019425, - "step": 9422, - "time_per_iteration": 2.6400234699249268 - }, - { - "auxiliary_loss_clip": 0.01105093, - "auxiliary_loss_mlp": 0.01037861, - "balance_loss_clip": 1.04141676, - "balance_loss_mlp": 1.02530825, - "epoch": 0.566541409890275, - "flos": 24643985212800.0, - "grad_norm": 1.7654112494568213, - "language_loss": 0.81893075, - "learning_rate": 1.6675864231342085e-06, - "loss": 0.84036028, - "num_input_tokens_seen": 203039035, - "step": 9423, - "time_per_iteration": 2.673105239868164 - }, - { - "auxiliary_loss_clip": 0.01090689, - "auxiliary_loss_mlp": 0.01037493, - "balance_loss_clip": 1.03944159, - "balance_loss_mlp": 1.02356339, - "epoch": 0.566601533142943, - "flos": 22270577126400.0, - "grad_norm": 1.4934148877619189, - "language_loss": 0.8075555, - "learning_rate": 1.6672023848112353e-06, - "loss": 0.82883728, - "num_input_tokens_seen": 203059320, - "step": 9424, - "time_per_iteration": 2.6597039699554443 - }, - { - "auxiliary_loss_clip": 0.01124321, - "auxiliary_loss_mlp": 0.00771519, - "balance_loss_clip": 1.04382432, - "balance_loss_mlp": 1.00018978, - "epoch": 0.5666616563956111, - "flos": 29971656218880.0, - "grad_norm": 2.0092362269175297, - "language_loss": 0.78882873, - "learning_rate": 1.6668183591079805e-06, - "loss": 0.80778712, - "num_input_tokens_seen": 203078490, - "step": 9425, - "time_per_iteration": 2.6688153743743896 - }, - { - "auxiliary_loss_clip": 0.01090837, - "auxiliary_loss_mlp": 0.01034858, - "balance_loss_clip": 1.0417583, - "balance_loss_mlp": 1.02170324, - "epoch": 0.566721779648279, - "flos": 17781456101760.0, - "grad_norm": 1.976091068193849, - "language_loss": 0.5920769, - "learning_rate": 1.6664343460390064e-06, - "loss": 0.61333382, - "num_input_tokens_seen": 203096065, - "step": 9426, - "time_per_iteration": 2.6646664142608643 - }, - { - "auxiliary_loss_clip": 0.01110034, - "auxiliary_loss_mlp": 0.01032331, - "balance_loss_clip": 1.04102027, - "balance_loss_mlp": 1.01922381, - "epoch": 0.566781902900947, - "flos": 21033490769280.0, - "grad_norm": 2.110311025280775, - "language_loss": 0.81678975, - "learning_rate": 1.6660503456188764e-06, - "loss": 0.83821344, - "num_input_tokens_seen": 203115270, - "step": 9427, - "time_per_iteration": 5.8222620487213135 - }, - { - "auxiliary_loss_clip": 0.01117064, - "auxiliary_loss_mlp": 0.01038278, - "balance_loss_clip": 1.04323864, - "balance_loss_mlp": 1.02506411, - "epoch": 0.5668420261536149, - "flos": 23148593176320.0, - "grad_norm": 1.814267468057716, - "language_loss": 0.86105633, - "learning_rate": 1.6656663578621498e-06, - "loss": 0.88260972, - "num_input_tokens_seen": 203134290, - "step": 9428, - "time_per_iteration": 4.0940985679626465 - }, - { - "auxiliary_loss_clip": 0.01102233, - "auxiliary_loss_mlp": 0.01034099, - "balance_loss_clip": 1.04397511, - "balance_loss_mlp": 1.01996648, - "epoch": 0.5669021494062829, - "flos": 22601601889920.0, - "grad_norm": 2.604927880391597, - "language_loss": 0.73541754, - "learning_rate": 1.6652823827833886e-06, - "loss": 0.75678086, - "num_input_tokens_seen": 203152935, - "step": 9429, - "time_per_iteration": 2.711982011795044 - }, - { - "auxiliary_loss_clip": 0.01100688, - "auxiliary_loss_mlp": 0.00772268, - "balance_loss_clip": 1.04164147, - "balance_loss_mlp": 1.00020123, - "epoch": 0.5669622726589508, - "flos": 17381231786880.0, - "grad_norm": 3.499205688936759, - "language_loss": 0.75380534, - "learning_rate": 1.6648984203971538e-06, - "loss": 0.77253491, - "num_input_tokens_seen": 203170110, - "step": 9430, - "time_per_iteration": 2.775536060333252 - }, - { - "auxiliary_loss_clip": 0.0111876, - "auxiliary_loss_mlp": 0.01036284, - "balance_loss_clip": 1.04125142, - "balance_loss_mlp": 1.02263451, - "epoch": 0.5670223959116188, - "flos": 18763253521920.0, - "grad_norm": 1.7932678929965582, - "language_loss": 0.72862244, - "learning_rate": 1.6645144707180032e-06, - "loss": 0.75017291, - "num_input_tokens_seen": 203188825, - "step": 9431, - "time_per_iteration": 2.7299160957336426 - }, - { - "auxiliary_loss_clip": 0.01068382, - "auxiliary_loss_mlp": 0.01037407, - "balance_loss_clip": 1.03856969, - "balance_loss_mlp": 1.02459264, - "epoch": 0.5670825191642868, - "flos": 13553334276480.0, - "grad_norm": 1.899230938499918, - "language_loss": 0.73544705, - "learning_rate": 1.6641305337604984e-06, - "loss": 0.75650489, - "num_input_tokens_seen": 203206860, - "step": 9432, - "time_per_iteration": 2.68713641166687 - }, - { - "auxiliary_loss_clip": 0.01066627, - "auxiliary_loss_mlp": 0.01032044, - "balance_loss_clip": 1.03716183, - "balance_loss_mlp": 1.01875782, - "epoch": 0.5671426424169548, - "flos": 22054035985920.0, - "grad_norm": 1.4657818599236931, - "language_loss": 0.78099382, - "learning_rate": 1.663746609539197e-06, - "loss": 0.80198044, - "num_input_tokens_seen": 203225625, - "step": 9433, - "time_per_iteration": 4.3982954025268555 - }, - { - "auxiliary_loss_clip": 0.01123451, - "auxiliary_loss_mlp": 0.01038623, - "balance_loss_clip": 1.04226542, - "balance_loss_mlp": 1.02239299, - "epoch": 0.5672027656696227, - "flos": 21323972056320.0, - "grad_norm": 1.9415050552486373, - "language_loss": 0.6311425, - "learning_rate": 1.6633626980686582e-06, - "loss": 0.65276325, - "num_input_tokens_seen": 203242920, - "step": 9434, - "time_per_iteration": 2.6829726696014404 - }, - { - "auxiliary_loss_clip": 0.01106985, - "auxiliary_loss_mlp": 0.01029655, - "balance_loss_clip": 1.04066229, - "balance_loss_mlp": 1.01654196, - "epoch": 0.5672628889222907, - "flos": 23514056104320.0, - "grad_norm": 2.0456781967901025, - "language_loss": 0.66337132, - "learning_rate": 1.6629787993634399e-06, - "loss": 0.68473774, - "num_input_tokens_seen": 203261995, - "step": 9435, - "time_per_iteration": 2.7055511474609375 - }, - { - "auxiliary_loss_clip": 0.01092568, - "auxiliary_loss_mlp": 0.00770808, - "balance_loss_clip": 1.03747869, - "balance_loss_mlp": 1.00008333, - "epoch": 0.5673230121749586, - "flos": 27121928855040.0, - "grad_norm": 1.9714061310868114, - "language_loss": 0.71574509, - "learning_rate": 1.6625949134380984e-06, - "loss": 0.73437893, - "num_input_tokens_seen": 203280670, - "step": 9436, - "time_per_iteration": 2.7314302921295166 - }, - { - "auxiliary_loss_clip": 0.01119804, - "auxiliary_loss_mlp": 0.01034867, - "balance_loss_clip": 1.041466, - "balance_loss_mlp": 1.02099752, - "epoch": 0.5673831354276266, - "flos": 31141985149440.0, - "grad_norm": 1.474374193730658, - "language_loss": 0.7411499, - "learning_rate": 1.6622110403071921e-06, - "loss": 0.76269662, - "num_input_tokens_seen": 203304800, - "step": 9437, - "time_per_iteration": 2.6829545497894287 - }, - { - "auxiliary_loss_clip": 0.01115825, - "auxiliary_loss_mlp": 0.01036618, - "balance_loss_clip": 1.04766893, - "balance_loss_mlp": 1.02231264, - "epoch": 0.5674432586802945, - "flos": 27673193859840.0, - "grad_norm": 2.0226289672132096, - "language_loss": 0.6118415, - "learning_rate": 1.661827179985277e-06, - "loss": 0.63336593, - "num_input_tokens_seen": 203324060, - "step": 9438, - "time_per_iteration": 2.6840946674346924 - }, - { - "auxiliary_loss_clip": 0.01097885, - "auxiliary_loss_mlp": 0.01032312, - "balance_loss_clip": 1.03924835, - "balance_loss_mlp": 1.0185318, - "epoch": 0.5675033819329626, - "flos": 26615157822720.0, - "grad_norm": 1.5530482991602657, - "language_loss": 0.75020033, - "learning_rate": 1.661443332486909e-06, - "loss": 0.77150226, - "num_input_tokens_seen": 203344360, - "step": 9439, - "time_per_iteration": 2.6898789405822754 - }, - { - "auxiliary_loss_clip": 0.01092055, - "auxiliary_loss_mlp": 0.01036149, - "balance_loss_clip": 1.04008341, - "balance_loss_mlp": 1.02168322, - "epoch": 0.5675635051856306, - "flos": 19098372435840.0, - "grad_norm": 1.924986803502997, - "language_loss": 0.83848387, - "learning_rate": 1.6610594978266438e-06, - "loss": 0.85976589, - "num_input_tokens_seen": 203362115, - "step": 9440, - "time_per_iteration": 2.7438228130340576 - }, - { - "auxiliary_loss_clip": 0.01087383, - "auxiliary_loss_mlp": 0.01036961, - "balance_loss_clip": 1.0389899, - "balance_loss_mlp": 1.02264404, - "epoch": 0.5676236284382985, - "flos": 17566315591680.0, - "grad_norm": 3.3538120018942843, - "language_loss": 0.75190175, - "learning_rate": 1.6606756760190365e-06, - "loss": 0.7731452, - "num_input_tokens_seen": 203380550, - "step": 9441, - "time_per_iteration": 2.6487948894500732 - }, - { - "auxiliary_loss_clip": 0.01066366, - "auxiliary_loss_mlp": 0.01037451, - "balance_loss_clip": 1.03523147, - "balance_loss_mlp": 1.02376556, - "epoch": 0.5676837516909665, - "flos": 15954069634560.0, - "grad_norm": 1.8078445069287523, - "language_loss": 0.83109975, - "learning_rate": 1.6602918670786413e-06, - "loss": 0.85213792, - "num_input_tokens_seen": 203396590, - "step": 9442, - "time_per_iteration": 2.692474842071533 - }, - { - "auxiliary_loss_clip": 0.01083606, - "auxiliary_loss_mlp": 0.0103585, - "balance_loss_clip": 1.04210138, - "balance_loss_mlp": 1.02311242, - "epoch": 0.5677438749436344, - "flos": 18295912644480.0, - "grad_norm": 2.0214699890453414, - "language_loss": 0.74567246, - "learning_rate": 1.6599080710200126e-06, - "loss": 0.76686704, - "num_input_tokens_seen": 203414280, - "step": 9443, - "time_per_iteration": 2.742173433303833 - }, - { - "auxiliary_loss_clip": 0.01093942, - "auxiliary_loss_mlp": 0.01036542, - "balance_loss_clip": 1.04245853, - "balance_loss_mlp": 1.02310669, - "epoch": 0.5678039981963025, - "flos": 17931311642880.0, - "grad_norm": 2.2236359492875817, - "language_loss": 0.77068752, - "learning_rate": 1.6595242878577046e-06, - "loss": 0.79199237, - "num_input_tokens_seen": 203433280, - "step": 9444, - "time_per_iteration": 2.65165376663208 - }, - { - "auxiliary_loss_clip": 0.01083168, - "auxiliary_loss_mlp": 0.01042977, - "balance_loss_clip": 1.04132152, - "balance_loss_mlp": 1.02910697, - "epoch": 0.5678641214489704, - "flos": 19316350120320.0, - "grad_norm": 1.9769562357276376, - "language_loss": 0.80988097, - "learning_rate": 1.6591405176062687e-06, - "loss": 0.83114243, - "num_input_tokens_seen": 203449935, - "step": 9445, - "time_per_iteration": 2.692103147506714 - }, - { - "auxiliary_loss_clip": 0.01115981, - "auxiliary_loss_mlp": 0.01030041, - "balance_loss_clip": 1.03910589, - "balance_loss_mlp": 1.01635599, - "epoch": 0.5679242447016384, - "flos": 27751084502400.0, - "grad_norm": 1.8145653139656197, - "language_loss": 0.71126974, - "learning_rate": 1.658756760280259e-06, - "loss": 0.73272997, - "num_input_tokens_seen": 203473025, - "step": 9446, - "time_per_iteration": 2.6656479835510254 - }, - { - "auxiliary_loss_clip": 0.01084809, - "auxiliary_loss_mlp": 0.01029841, - "balance_loss_clip": 1.03896558, - "balance_loss_mlp": 1.01640046, - "epoch": 0.5679843679543063, - "flos": 23769093646080.0, - "grad_norm": 1.9173533022587075, - "language_loss": 0.73434311, - "learning_rate": 1.6583730158942276e-06, - "loss": 0.75548959, - "num_input_tokens_seen": 203492895, - "step": 9447, - "time_per_iteration": 2.7948012351989746 - }, - { - "auxiliary_loss_clip": 0.01099661, - "auxiliary_loss_mlp": 0.01034648, - "balance_loss_clip": 1.04186499, - "balance_loss_mlp": 1.02139819, - "epoch": 0.5680444912069743, - "flos": 25591883172480.0, - "grad_norm": 3.5475375147623294, - "language_loss": 0.7504915, - "learning_rate": 1.657989284462725e-06, - "loss": 0.77183461, - "num_input_tokens_seen": 203513710, - "step": 9448, - "time_per_iteration": 2.700333595275879 - }, - { - "auxiliary_loss_clip": 0.01079167, - "auxiliary_loss_mlp": 0.01049109, - "balance_loss_clip": 1.04264426, - "balance_loss_mlp": 1.0336951, - "epoch": 0.5681046144596422, - "flos": 23695799944320.0, - "grad_norm": 2.3399913967333865, - "language_loss": 0.76352537, - "learning_rate": 1.6576055660003038e-06, - "loss": 0.78480804, - "num_input_tokens_seen": 203531630, - "step": 9449, - "time_per_iteration": 2.7736854553222656 - }, - { - "auxiliary_loss_clip": 0.01096359, - "auxiliary_loss_mlp": 0.01042326, - "balance_loss_clip": 1.04059768, - "balance_loss_mlp": 1.02729404, - "epoch": 0.5681647377123102, - "flos": 28000770917760.0, - "grad_norm": 1.7507923980752478, - "language_loss": 0.74660265, - "learning_rate": 1.6572218605215128e-06, - "loss": 0.76798952, - "num_input_tokens_seen": 203551885, - "step": 9450, - "time_per_iteration": 2.749420642852783 - }, - { - "auxiliary_loss_clip": 0.01102012, - "auxiliary_loss_mlp": 0.01039617, - "balance_loss_clip": 1.04193068, - "balance_loss_mlp": 1.02674794, - "epoch": 0.5682248609649782, - "flos": 22747758330240.0, - "grad_norm": 2.689223250754005, - "language_loss": 0.66906244, - "learning_rate": 1.6568381680409038e-06, - "loss": 0.69047868, - "num_input_tokens_seen": 203572250, - "step": 9451, - "time_per_iteration": 2.753199338912964 - }, - { - "auxiliary_loss_clip": 0.01096067, - "auxiliary_loss_mlp": 0.01038718, - "balance_loss_clip": 1.03942561, - "balance_loss_mlp": 1.02265501, - "epoch": 0.5682849842176462, - "flos": 21288600138240.0, - "grad_norm": 3.0838986562683557, - "language_loss": 0.71882987, - "learning_rate": 1.656454488573026e-06, - "loss": 0.74017769, - "num_input_tokens_seen": 203590605, - "step": 9452, - "time_per_iteration": 2.6950924396514893 - }, - { - "auxiliary_loss_clip": 0.01076417, - "auxiliary_loss_mlp": 0.01030065, - "balance_loss_clip": 1.03938448, - "balance_loss_mlp": 1.01734543, - "epoch": 0.5683451074703142, - "flos": 21141689512320.0, - "grad_norm": 1.8642874843773423, - "language_loss": 0.70013601, - "learning_rate": 1.656070822132428e-06, - "loss": 0.72120082, - "num_input_tokens_seen": 203610080, - "step": 9453, - "time_per_iteration": 2.7006165981292725 - }, - { - "auxiliary_loss_clip": 0.01076829, - "auxiliary_loss_mlp": 0.00769854, - "balance_loss_clip": 1.04066825, - "balance_loss_mlp": 1.00014949, - "epoch": 0.5684052307229821, - "flos": 22344481359360.0, - "grad_norm": 2.037972918051024, - "language_loss": 0.70139372, - "learning_rate": 1.6556871687336592e-06, - "loss": 0.71986055, - "num_input_tokens_seen": 203630060, - "step": 9454, - "time_per_iteration": 2.759376287460327 - }, - { - "auxiliary_loss_clip": 0.01095428, - "auxiliary_loss_mlp": 0.01031911, - "balance_loss_clip": 1.03987896, - "balance_loss_mlp": 1.01938248, - "epoch": 0.5684653539756501, - "flos": 21798639308160.0, - "grad_norm": 1.989743078970872, - "language_loss": 0.6078186, - "learning_rate": 1.6553035283912671e-06, - "loss": 0.62909198, - "num_input_tokens_seen": 203649065, - "step": 9455, - "time_per_iteration": 2.678152322769165 - }, - { - "auxiliary_loss_clip": 0.01082741, - "auxiliary_loss_mlp": 0.0103652, - "balance_loss_clip": 1.0447154, - "balance_loss_mlp": 1.02253652, - "epoch": 0.568525477228318, - "flos": 22999635475200.0, - "grad_norm": 4.296474832454859, - "language_loss": 0.73108375, - "learning_rate": 1.6549199011198e-06, - "loss": 0.75227636, - "num_input_tokens_seen": 203667545, - "step": 9456, - "time_per_iteration": 2.7307004928588867 - }, - { - "auxiliary_loss_clip": 0.01099598, - "auxiliary_loss_mlp": 0.01031688, - "balance_loss_clip": 1.04188192, - "balance_loss_mlp": 1.01902199, - "epoch": 0.568585600480986, - "flos": 21392489249280.0, - "grad_norm": 1.662795047431792, - "language_loss": 0.77013254, - "learning_rate": 1.6545362869338048e-06, - "loss": 0.79144537, - "num_input_tokens_seen": 203686025, - "step": 9457, - "time_per_iteration": 2.665708303451538 - }, - { - "auxiliary_loss_clip": 0.01111194, - "auxiliary_loss_mlp": 0.01036842, - "balance_loss_clip": 1.0429163, - "balance_loss_mlp": 1.02280521, - "epoch": 0.568645723733654, - "flos": 30007351359360.0, - "grad_norm": 2.0672888051412817, - "language_loss": 0.66191971, - "learning_rate": 1.6541526858478285e-06, - "loss": 0.68340003, - "num_input_tokens_seen": 203705540, - "step": 9458, - "time_per_iteration": 2.780771017074585 - }, - { - "auxiliary_loss_clip": 0.01110997, - "auxiliary_loss_mlp": 0.01031454, - "balance_loss_clip": 1.04201722, - "balance_loss_mlp": 1.01742291, - "epoch": 0.568705846986322, - "flos": 20412667077120.0, - "grad_norm": 2.504426538314312, - "language_loss": 0.68920743, - "learning_rate": 1.6537690978764167e-06, - "loss": 0.71063197, - "num_input_tokens_seen": 203723670, - "step": 9459, - "time_per_iteration": 2.637176513671875 - }, - { - "auxiliary_loss_clip": 0.01095236, - "auxiliary_loss_mlp": 0.01032887, - "balance_loss_clip": 1.0442152, - "balance_loss_mlp": 1.01929152, - "epoch": 0.5687659702389899, - "flos": 17456752131840.0, - "grad_norm": 2.127788828908428, - "language_loss": 0.76758575, - "learning_rate": 1.6533855230341155e-06, - "loss": 0.788867, - "num_input_tokens_seen": 203739705, - "step": 9460, - "time_per_iteration": 2.7338075637817383 - }, - { - "auxiliary_loss_clip": 0.01066336, - "auxiliary_loss_mlp": 0.0103936, - "balance_loss_clip": 1.04204893, - "balance_loss_mlp": 1.02563262, - "epoch": 0.5688260934916579, - "flos": 25406081095680.0, - "grad_norm": 1.8378075196350074, - "language_loss": 0.71994978, - "learning_rate": 1.65300196133547e-06, - "loss": 0.74100673, - "num_input_tokens_seen": 203759000, - "step": 9461, - "time_per_iteration": 2.9295692443847656 - }, - { - "auxiliary_loss_clip": 0.01110974, - "auxiliary_loss_mlp": 0.01036974, - "balance_loss_clip": 1.04267561, - "balance_loss_mlp": 1.02314544, - "epoch": 0.5688862167443258, - "flos": 21608024808960.0, - "grad_norm": 2.3363777583338794, - "language_loss": 0.73092425, - "learning_rate": 1.6526184127950249e-06, - "loss": 0.75240374, - "num_input_tokens_seen": 203774295, - "step": 9462, - "time_per_iteration": 2.639132022857666 - }, - { - "auxiliary_loss_clip": 0.01105415, - "auxiliary_loss_mlp": 0.01026496, - "balance_loss_clip": 1.03986573, - "balance_loss_mlp": 1.01507592, - "epoch": 0.5689463399969938, - "flos": 22418996123520.0, - "grad_norm": 1.9966058203681178, - "language_loss": 0.72878397, - "learning_rate": 1.6522348774273246e-06, - "loss": 0.75010306, - "num_input_tokens_seen": 203792710, - "step": 9463, - "time_per_iteration": 2.687623977661133 - }, - { - "auxiliary_loss_clip": 0.01108157, - "auxiliary_loss_mlp": 0.01032686, - "balance_loss_clip": 1.04214895, - "balance_loss_mlp": 1.02012718, - "epoch": 0.5690064632496618, - "flos": 18296810484480.0, - "grad_norm": 2.136514167684146, - "language_loss": 0.73800778, - "learning_rate": 1.6518513552469123e-06, - "loss": 0.75941622, - "num_input_tokens_seen": 203811645, - "step": 9464, - "time_per_iteration": 2.6446449756622314 - }, - { - "auxiliary_loss_clip": 0.01110623, - "auxiliary_loss_mlp": 0.0077176, - "balance_loss_clip": 1.04163098, - "balance_loss_mlp": 1.00012827, - "epoch": 0.5690665865023298, - "flos": 21579260993280.0, - "grad_norm": 2.0135063282733108, - "language_loss": 0.84068149, - "learning_rate": 1.6514678462683312e-06, - "loss": 0.85950536, - "num_input_tokens_seen": 203830040, - "step": 9465, - "time_per_iteration": 2.6243364810943604 - }, - { - "auxiliary_loss_clip": 0.01092541, - "auxiliary_loss_mlp": 0.01032276, - "balance_loss_clip": 1.03678536, - "balance_loss_mlp": 1.0195086, - "epoch": 0.5691267097549978, - "flos": 24421446501120.0, - "grad_norm": 1.6434295280058835, - "language_loss": 0.72125626, - "learning_rate": 1.651084350506125e-06, - "loss": 0.74250448, - "num_input_tokens_seen": 203851245, - "step": 9466, - "time_per_iteration": 5.837533712387085 - }, - { - "auxiliary_loss_clip": 0.01016007, - "auxiliary_loss_mlp": 0.01001581, - "balance_loss_clip": 1.01873374, - "balance_loss_mlp": 1.00037718, - "epoch": 0.5691868330076657, - "flos": 61657906199040.0, - "grad_norm": 0.7155703714304625, - "language_loss": 0.55334294, - "learning_rate": 1.6507008679748343e-06, - "loss": 0.57351875, - "num_input_tokens_seen": 203916400, - "step": 9467, - "time_per_iteration": 4.8396992683410645 - }, - { - "auxiliary_loss_clip": 0.01107605, - "auxiliary_loss_mlp": 0.0103869, - "balance_loss_clip": 1.04263473, - "balance_loss_mlp": 1.02364564, - "epoch": 0.5692469562603337, - "flos": 21325193118720.0, - "grad_norm": 16.186384536861027, - "language_loss": 0.6343258, - "learning_rate": 1.6503173986890023e-06, - "loss": 0.65578872, - "num_input_tokens_seen": 203935870, - "step": 9468, - "time_per_iteration": 2.6212332248687744 - }, - { - "auxiliary_loss_clip": 0.01066902, - "auxiliary_loss_mlp": 0.01038069, - "balance_loss_clip": 1.03614831, - "balance_loss_mlp": 1.02334094, - "epoch": 0.5693070795130016, - "flos": 23367899664000.0, - "grad_norm": 2.927691999708818, - "language_loss": 0.78902012, - "learning_rate": 1.64993394266317e-06, - "loss": 0.81006986, - "num_input_tokens_seen": 203954950, - "step": 9469, - "time_per_iteration": 2.745016098022461 - }, - { - "auxiliary_loss_clip": 0.01085393, - "auxiliary_loss_mlp": 0.01053274, - "balance_loss_clip": 1.04159784, - "balance_loss_mlp": 1.03830147, - "epoch": 0.5693672027656697, - "flos": 18697250280960.0, - "grad_norm": 2.217720738619104, - "language_loss": 0.69655335, - "learning_rate": 1.6495504999118769e-06, - "loss": 0.71793997, - "num_input_tokens_seen": 203972715, - "step": 9470, - "time_per_iteration": 2.6895534992218018 - }, - { - "auxiliary_loss_clip": 0.01097198, - "auxiliary_loss_mlp": 0.01036868, - "balance_loss_clip": 1.04529762, - "balance_loss_mlp": 1.02352285, - "epoch": 0.5694273260183376, - "flos": 20449188230400.0, - "grad_norm": 1.6026966116267123, - "language_loss": 0.74473977, - "learning_rate": 1.6491670704496644e-06, - "loss": 0.76608038, - "num_input_tokens_seen": 203990775, - "step": 9471, - "time_per_iteration": 2.6734213829040527 - }, - { - "auxiliary_loss_clip": 0.01077759, - "auxiliary_loss_mlp": 0.01040388, - "balance_loss_clip": 1.0421195, - "balance_loss_mlp": 1.02579701, - "epoch": 0.5694874492710056, - "flos": 17603195880960.0, - "grad_norm": 1.75714793559233, - "language_loss": 0.57588744, - "learning_rate": 1.6487836542910716e-06, - "loss": 0.59706891, - "num_input_tokens_seen": 204008845, - "step": 9472, - "time_per_iteration": 4.335491180419922 - }, - { - "auxiliary_loss_clip": 0.01082559, - "auxiliary_loss_mlp": 0.01032344, - "balance_loss_clip": 1.03902221, - "balance_loss_mlp": 1.01946378, - "epoch": 0.5695475725236735, - "flos": 13370836250880.0, - "grad_norm": 1.9281443896441626, - "language_loss": 0.73845899, - "learning_rate": 1.648400251450638e-06, - "loss": 0.75960797, - "num_input_tokens_seen": 204023755, - "step": 9473, - "time_per_iteration": 2.706148147583008 - }, - { - "auxiliary_loss_clip": 0.01017729, - "auxiliary_loss_mlp": 0.01007582, - "balance_loss_clip": 1.02078795, - "balance_loss_mlp": 1.00631857, - "epoch": 0.5696076957763415, - "flos": 68174398661760.0, - "grad_norm": 0.6469732305814715, - "language_loss": 0.57547617, - "learning_rate": 1.6480168619429023e-06, - "loss": 0.59572935, - "num_input_tokens_seen": 204091255, - "step": 9474, - "time_per_iteration": 3.2811825275421143 - }, - { - "auxiliary_loss_clip": 0.01106855, - "auxiliary_loss_mlp": 0.01038889, - "balance_loss_clip": 1.04254341, - "balance_loss_mlp": 1.02532923, - "epoch": 0.5696678190290094, - "flos": 33838301525760.0, - "grad_norm": 2.207374996280549, - "language_loss": 0.53488398, - "learning_rate": 1.6476334857824017e-06, - "loss": 0.55634141, - "num_input_tokens_seen": 204113285, - "step": 9475, - "time_per_iteration": 2.701791524887085 - }, - { - "auxiliary_loss_clip": 0.01122912, - "auxiliary_loss_mlp": 0.01039618, - "balance_loss_clip": 1.04524517, - "balance_loss_mlp": 1.0262965, - "epoch": 0.5697279422816774, - "flos": 26356600748160.0, - "grad_norm": 1.6070261580589493, - "language_loss": 0.79622197, - "learning_rate": 1.647250122983675e-06, - "loss": 0.81784725, - "num_input_tokens_seen": 204133045, - "step": 9476, - "time_per_iteration": 2.695966958999634 - }, - { - "auxiliary_loss_clip": 0.01101607, - "auxiliary_loss_mlp": 0.01038712, - "balance_loss_clip": 1.04603529, - "balance_loss_mlp": 1.0258019, - "epoch": 0.5697880655343454, - "flos": 22930507751040.0, - "grad_norm": 1.9576279407758228, - "language_loss": 0.66811013, - "learning_rate": 1.6468667735612592e-06, - "loss": 0.68951333, - "num_input_tokens_seen": 204152590, - "step": 9477, - "time_per_iteration": 2.6981940269470215 - }, - { - "auxiliary_loss_clip": 0.0108821, - "auxiliary_loss_mlp": 0.01037709, - "balance_loss_clip": 1.04286826, - "balance_loss_mlp": 1.02403569, - "epoch": 0.5698481887870134, - "flos": 26761314263040.0, - "grad_norm": 1.587062911340377, - "language_loss": 0.70738614, - "learning_rate": 1.6464834375296906e-06, - "loss": 0.72864532, - "num_input_tokens_seen": 204171815, - "step": 9478, - "time_per_iteration": 2.779813766479492 - }, - { - "auxiliary_loss_clip": 0.01084042, - "auxiliary_loss_mlp": 0.01031832, - "balance_loss_clip": 1.03916287, - "balance_loss_mlp": 1.0200479, - "epoch": 0.5699083120396814, - "flos": 15742269089280.0, - "grad_norm": 4.484039953055517, - "language_loss": 0.6938777, - "learning_rate": 1.6461001149035055e-06, - "loss": 0.71503651, - "num_input_tokens_seen": 204188535, - "step": 9479, - "time_per_iteration": 2.712655782699585 - }, - { - "auxiliary_loss_clip": 0.01078443, - "auxiliary_loss_mlp": 0.01033369, - "balance_loss_clip": 1.04121661, - "balance_loss_mlp": 1.02166843, - "epoch": 0.5699684352923493, - "flos": 19537272720000.0, - "grad_norm": 2.2062311419155205, - "language_loss": 0.71329868, - "learning_rate": 1.6457168056972392e-06, - "loss": 0.73441678, - "num_input_tokens_seen": 204208365, - "step": 9480, - "time_per_iteration": 2.727628469467163 - }, - { - "auxiliary_loss_clip": 0.01089043, - "auxiliary_loss_mlp": 0.00769268, - "balance_loss_clip": 1.04188204, - "balance_loss_mlp": 1.00015211, - "epoch": 0.5700285585450173, - "flos": 16253349753600.0, - "grad_norm": 2.49302312393396, - "language_loss": 0.7201618, - "learning_rate": 1.6453335099254276e-06, - "loss": 0.73874491, - "num_input_tokens_seen": 204226560, - "step": 9481, - "time_per_iteration": 2.6870779991149902 - }, - { - "auxiliary_loss_clip": 0.01111632, - "auxiliary_loss_mlp": 0.01037308, - "balance_loss_clip": 1.04494166, - "balance_loss_mlp": 1.02441525, - "epoch": 0.5700886817976852, - "flos": 19864993432320.0, - "grad_norm": 2.3265371075794046, - "language_loss": 0.78086042, - "learning_rate": 1.6449502276026041e-06, - "loss": 0.80234993, - "num_input_tokens_seen": 204245410, - "step": 9482, - "time_per_iteration": 2.648545742034912 - }, - { - "auxiliary_loss_clip": 0.01099058, - "auxiliary_loss_mlp": 0.01031061, - "balance_loss_clip": 1.04446602, - "balance_loss_mlp": 1.01918221, - "epoch": 0.5701488050503533, - "flos": 23841704989440.0, - "grad_norm": 1.4982420423731841, - "language_loss": 0.77999502, - "learning_rate": 1.6445669587433043e-06, - "loss": 0.80129617, - "num_input_tokens_seen": 204264840, - "step": 9483, - "time_per_iteration": 2.716085910797119 - }, - { - "auxiliary_loss_clip": 0.01098634, - "auxiliary_loss_mlp": 0.01043773, - "balance_loss_clip": 1.04435062, - "balance_loss_mlp": 1.03189337, - "epoch": 0.5702089283030212, - "flos": 23659673840640.0, - "grad_norm": 1.773078274148673, - "language_loss": 0.81291378, - "learning_rate": 1.6441837033620612e-06, - "loss": 0.83433783, - "num_input_tokens_seen": 204284335, - "step": 9484, - "time_per_iteration": 2.7283802032470703 - }, - { - "auxiliary_loss_clip": 0.01120809, - "auxiliary_loss_mlp": 0.0077026, - "balance_loss_clip": 1.04378128, - "balance_loss_mlp": 1.00009394, - "epoch": 0.5702690515556892, - "flos": 27891171544320.0, - "grad_norm": 294.9687469035841, - "language_loss": 0.60670495, - "learning_rate": 1.6438004614734073e-06, - "loss": 0.6256156, - "num_input_tokens_seen": 204302590, - "step": 9485, - "time_per_iteration": 2.7182137966156006 - }, - { - "auxiliary_loss_clip": 0.01107456, - "auxiliary_loss_mlp": 0.01033766, - "balance_loss_clip": 1.04291701, - "balance_loss_mlp": 1.02048063, - "epoch": 0.5703291748083571, - "flos": 24023951619840.0, - "grad_norm": 2.0199937842049676, - "language_loss": 0.65740418, - "learning_rate": 1.6434172330918757e-06, - "loss": 0.67881644, - "num_input_tokens_seen": 204323055, - "step": 9486, - "time_per_iteration": 2.7076590061187744 - }, - { - "auxiliary_loss_clip": 0.01026531, - "auxiliary_loss_mlp": 0.01001416, - "balance_loss_clip": 1.02014589, - "balance_loss_mlp": 1.00029588, - "epoch": 0.5703892980610251, - "flos": 57023382919680.0, - "grad_norm": 0.6682653451732087, - "language_loss": 0.47990364, - "learning_rate": 1.6430340182319978e-06, - "loss": 0.50018317, - "num_input_tokens_seen": 204386160, - "step": 9487, - "time_per_iteration": 3.3227086067199707 - }, - { - "auxiliary_loss_clip": 0.0108502, - "auxiliary_loss_mlp": 0.00770885, - "balance_loss_clip": 1.04171848, - "balance_loss_mlp": 1.00012314, - "epoch": 0.570449421313693, - "flos": 24351025887360.0, - "grad_norm": 1.5998860502141972, - "language_loss": 0.85676056, - "learning_rate": 1.6426508169083067e-06, - "loss": 0.87531954, - "num_input_tokens_seen": 204406315, - "step": 9488, - "time_per_iteration": 2.7443041801452637 - }, - { - "auxiliary_loss_clip": 0.01084932, - "auxiliary_loss_mlp": 0.01036169, - "balance_loss_clip": 1.04087424, - "balance_loss_mlp": 1.02245951, - "epoch": 0.570509544566361, - "flos": 24828566227200.0, - "grad_norm": 1.4382001019160457, - "language_loss": 0.78847331, - "learning_rate": 1.6422676291353314e-06, - "loss": 0.80968434, - "num_input_tokens_seen": 204427645, - "step": 9489, - "time_per_iteration": 2.7456719875335693 - }, - { - "auxiliary_loss_clip": 0.01099206, - "auxiliary_loss_mlp": 0.01028445, - "balance_loss_clip": 1.04345155, - "balance_loss_mlp": 1.01655364, - "epoch": 0.570569667819029, - "flos": 21397301671680.0, - "grad_norm": 1.7750907148912565, - "language_loss": 0.70044166, - "learning_rate": 1.641884454927604e-06, - "loss": 0.72171819, - "num_input_tokens_seen": 204445910, - "step": 9490, - "time_per_iteration": 2.646172046661377 - }, - { - "auxiliary_loss_clip": 0.01085076, - "auxiliary_loss_mlp": 0.0103304, - "balance_loss_clip": 1.04102945, - "balance_loss_mlp": 1.02055264, - "epoch": 0.570629791071697, - "flos": 23216751233280.0, - "grad_norm": 1.5662629922292932, - "language_loss": 0.76374, - "learning_rate": 1.6415012942996548e-06, - "loss": 0.78492117, - "num_input_tokens_seen": 204464680, - "step": 9491, - "time_per_iteration": 2.686228036880493 - }, - { - "auxiliary_loss_clip": 0.01010704, - "auxiliary_loss_mlp": 0.0075136, - "balance_loss_clip": 1.01657176, - "balance_loss_mlp": 0.99964297, - "epoch": 0.570689914324365, - "flos": 65284666525440.0, - "grad_norm": 0.7940313966382696, - "language_loss": 0.57365447, - "learning_rate": 1.641118147266011e-06, - "loss": 0.5912751, - "num_input_tokens_seen": 204525580, - "step": 9492, - "time_per_iteration": 3.275951623916626 - }, - { - "auxiliary_loss_clip": 0.01091927, - "auxiliary_loss_mlp": 0.00770164, - "balance_loss_clip": 1.0425539, - "balance_loss_mlp": 1.00009966, - "epoch": 0.5707500375770329, - "flos": 21141904993920.0, - "grad_norm": 1.811585397599456, - "language_loss": 0.71563506, - "learning_rate": 1.6407350138412035e-06, - "loss": 0.73425597, - "num_input_tokens_seen": 204541320, - "step": 9493, - "time_per_iteration": 2.6741974353790283 - }, - { - "auxiliary_loss_clip": 0.0112282, - "auxiliary_loss_mlp": 0.01032391, - "balance_loss_clip": 1.0450213, - "balance_loss_mlp": 1.01957655, - "epoch": 0.5708101608297009, - "flos": 20812747737600.0, - "grad_norm": 1.647557383472974, - "language_loss": 0.7782768, - "learning_rate": 1.6403518940397606e-06, - "loss": 0.79982895, - "num_input_tokens_seen": 204560275, - "step": 9494, - "time_per_iteration": 2.6302967071533203 - }, - { - "auxiliary_loss_clip": 0.01124725, - "auxiliary_loss_mlp": 0.01031331, - "balance_loss_clip": 1.04463601, - "balance_loss_mlp": 1.01786041, - "epoch": 0.5708702840823688, - "flos": 25812338895360.0, - "grad_norm": 2.0991801198395166, - "language_loss": 0.80634642, - "learning_rate": 1.6399687878762096e-06, - "loss": 0.82790697, - "num_input_tokens_seen": 204579430, - "step": 9495, - "time_per_iteration": 2.628124237060547 - }, - { - "auxiliary_loss_clip": 0.01077213, - "auxiliary_loss_mlp": 0.01041189, - "balance_loss_clip": 1.03985035, - "balance_loss_mlp": 1.02567959, - "epoch": 0.5709304073350369, - "flos": 23651916503040.0, - "grad_norm": 2.1559343585674067, - "language_loss": 0.66669941, - "learning_rate": 1.6395856953650784e-06, - "loss": 0.68788344, - "num_input_tokens_seen": 204597710, - "step": 9496, - "time_per_iteration": 2.7877724170684814 - }, - { - "auxiliary_loss_clip": 0.01125369, - "auxiliary_loss_mlp": 0.01038193, - "balance_loss_clip": 1.04586279, - "balance_loss_mlp": 1.02479351, - "epoch": 0.5709905305877048, - "flos": 16107552449280.0, - "grad_norm": 2.6392695697640387, - "language_loss": 0.69406897, - "learning_rate": 1.6392026165208938e-06, - "loss": 0.71570456, - "num_input_tokens_seen": 204616140, - "step": 9497, - "time_per_iteration": 2.5715434551239014 - }, - { - "auxiliary_loss_clip": 0.01107343, - "auxiliary_loss_mlp": 0.00770833, - "balance_loss_clip": 1.04470205, - "balance_loss_mlp": 1.00010204, - "epoch": 0.5710506538403728, - "flos": 24750819239040.0, - "grad_norm": 2.381002532737965, - "language_loss": 0.81296104, - "learning_rate": 1.638819551358182e-06, - "loss": 0.83174282, - "num_input_tokens_seen": 204636470, - "step": 9498, - "time_per_iteration": 2.7146875858306885 - }, - { - "auxiliary_loss_clip": 0.01122241, - "auxiliary_loss_mlp": 0.01039082, - "balance_loss_clip": 1.04371977, - "balance_loss_mlp": 1.02453244, - "epoch": 0.5711107770930407, - "flos": 21982250655360.0, - "grad_norm": 1.8640767096069095, - "language_loss": 0.66366005, - "learning_rate": 1.638436499891469e-06, - "loss": 0.68527335, - "num_input_tokens_seen": 204656640, - "step": 9499, - "time_per_iteration": 2.59460711479187 - }, - { - "auxiliary_loss_clip": 0.01090983, - "auxiliary_loss_mlp": 0.01034376, - "balance_loss_clip": 1.04218864, - "balance_loss_mlp": 1.02126861, - "epoch": 0.5711709003457087, - "flos": 19574009354880.0, - "grad_norm": 1.5439081268362653, - "language_loss": 0.71755552, - "learning_rate": 1.6380534621352805e-06, - "loss": 0.73880911, - "num_input_tokens_seen": 204675475, - "step": 9500, - "time_per_iteration": 2.6723949909210205 - }, - { - "auxiliary_loss_clip": 0.01092856, - "auxiliary_loss_mlp": 0.01032614, - "balance_loss_clip": 1.04149878, - "balance_loss_mlp": 1.01973963, - "epoch": 0.5712310235983766, - "flos": 24242683489920.0, - "grad_norm": 1.9336466751975971, - "language_loss": 0.76224887, - "learning_rate": 1.6376704381041407e-06, - "loss": 0.78350353, - "num_input_tokens_seen": 204695385, - "step": 9501, - "time_per_iteration": 2.7653119564056396 - }, - { - "auxiliary_loss_clip": 0.01101056, - "auxiliary_loss_mlp": 0.01035695, - "balance_loss_clip": 1.04289281, - "balance_loss_mlp": 1.02269506, - "epoch": 0.5712911468510447, - "flos": 20996143603200.0, - "grad_norm": 1.6146609274124086, - "language_loss": 0.75141633, - "learning_rate": 1.6372874278125742e-06, - "loss": 0.77278382, - "num_input_tokens_seen": 204714730, - "step": 9502, - "time_per_iteration": 2.6820828914642334 - }, - { - "auxiliary_loss_clip": 0.01088314, - "auxiliary_loss_mlp": 0.01027948, - "balance_loss_clip": 1.04387522, - "balance_loss_mlp": 1.01492405, - "epoch": 0.5713512701037126, - "flos": 18916987731840.0, - "grad_norm": 1.5621825440350152, - "language_loss": 0.82271576, - "learning_rate": 1.636904431275105e-06, - "loss": 0.84387839, - "num_input_tokens_seen": 204735025, - "step": 9503, - "time_per_iteration": 2.663109302520752 - }, - { - "auxiliary_loss_clip": 0.01085944, - "auxiliary_loss_mlp": 0.01033945, - "balance_loss_clip": 1.04204583, - "balance_loss_mlp": 1.02192843, - "epoch": 0.5714113933563806, - "flos": 17413443308160.0, - "grad_norm": 2.684901451113001, - "language_loss": 0.86263931, - "learning_rate": 1.6365214485062553e-06, - "loss": 0.88383818, - "num_input_tokens_seen": 204751365, - "step": 9504, - "time_per_iteration": 2.763122320175171 - }, - { - "auxiliary_loss_clip": 0.01075538, - "auxiliary_loss_mlp": 0.01028568, - "balance_loss_clip": 1.04011607, - "balance_loss_mlp": 1.01565766, - "epoch": 0.5714715166090486, - "flos": 20193360589440.0, - "grad_norm": 1.7486163539852246, - "language_loss": 0.75459665, - "learning_rate": 1.6361384795205496e-06, - "loss": 0.77563769, - "num_input_tokens_seen": 204768980, - "step": 9505, - "time_per_iteration": 4.519685506820679 - }, - { - "auxiliary_loss_clip": 0.0111822, - "auxiliary_loss_mlp": 0.0103209, - "balance_loss_clip": 1.04235733, - "balance_loss_mlp": 1.02002621, - "epoch": 0.5715316398617165, - "flos": 18551668458240.0, - "grad_norm": 1.4826686830874622, - "language_loss": 0.81888402, - "learning_rate": 1.635755524332509e-06, - "loss": 0.84038711, - "num_input_tokens_seen": 204788110, - "step": 9506, - "time_per_iteration": 5.6948935985565186 - }, - { - "auxiliary_loss_clip": 0.01080083, - "auxiliary_loss_mlp": 0.00770857, - "balance_loss_clip": 1.03905082, - "balance_loss_mlp": 1.00010204, - "epoch": 0.5715917631143845, - "flos": 18478195188480.0, - "grad_norm": 1.7330193393772828, - "language_loss": 0.77595812, - "learning_rate": 1.6353725829566552e-06, - "loss": 0.79446745, - "num_input_tokens_seen": 204807240, - "step": 9507, - "time_per_iteration": 2.7299420833587646 - }, - { - "auxiliary_loss_clip": 0.01098783, - "auxiliary_loss_mlp": 0.01037694, - "balance_loss_clip": 1.04040074, - "balance_loss_mlp": 1.02350807, - "epoch": 0.5716518863670524, - "flos": 24020037037440.0, - "grad_norm": 1.9478835056583133, - "language_loss": 0.6852861, - "learning_rate": 1.63498965540751e-06, - "loss": 0.70665085, - "num_input_tokens_seen": 204826415, - "step": 9508, - "time_per_iteration": 2.7023262977600098 - }, - { - "auxiliary_loss_clip": 0.01121987, - "auxiliary_loss_mlp": 0.01031189, - "balance_loss_clip": 1.04333735, - "balance_loss_mlp": 1.01777184, - "epoch": 0.5717120096197205, - "flos": 17819485626240.0, - "grad_norm": 2.087333212498838, - "language_loss": 0.80104595, - "learning_rate": 1.634606741699593e-06, - "loss": 0.82257771, - "num_input_tokens_seen": 204844305, - "step": 9509, - "time_per_iteration": 2.6331591606140137 - }, - { - "auxiliary_loss_clip": 0.01104906, - "auxiliary_loss_mlp": 0.01033683, - "balance_loss_clip": 1.04276729, - "balance_loss_mlp": 1.02071953, - "epoch": 0.5717721328723884, - "flos": 21866043179520.0, - "grad_norm": 1.9468766397229225, - "language_loss": 0.71857727, - "learning_rate": 1.6342238418474255e-06, - "loss": 0.73996317, - "num_input_tokens_seen": 204861765, - "step": 9510, - "time_per_iteration": 2.6763837337493896 - }, - { - "auxiliary_loss_clip": 0.01096671, - "auxiliary_loss_mlp": 0.01031456, - "balance_loss_clip": 1.04109251, - "balance_loss_mlp": 1.01920152, - "epoch": 0.5718322561250564, - "flos": 28437624126720.0, - "grad_norm": 1.5755083758344817, - "language_loss": 0.69395983, - "learning_rate": 1.6338409558655264e-06, - "loss": 0.71524119, - "num_input_tokens_seen": 204882505, - "step": 9511, - "time_per_iteration": 4.320638418197632 - }, - { - "auxiliary_loss_clip": 0.01097735, - "auxiliary_loss_mlp": 0.01035503, - "balance_loss_clip": 1.04172611, - "balance_loss_mlp": 1.02338552, - "epoch": 0.5718923793777243, - "flos": 13551825905280.0, - "grad_norm": 2.0067389560068047, - "language_loss": 0.6147874, - "learning_rate": 1.6334580837684152e-06, - "loss": 0.63611984, - "num_input_tokens_seen": 204899830, - "step": 9512, - "time_per_iteration": 2.759669065475464 - }, - { - "auxiliary_loss_clip": 0.01095927, - "auxiliary_loss_mlp": 0.01029716, - "balance_loss_clip": 1.04188657, - "balance_loss_mlp": 1.01700234, - "epoch": 0.5719525026303923, - "flos": 17822035491840.0, - "grad_norm": 2.401258082797128, - "language_loss": 0.76018667, - "learning_rate": 1.6330752255706104e-06, - "loss": 0.78144312, - "num_input_tokens_seen": 204918100, - "step": 9513, - "time_per_iteration": 2.7117698192596436 - }, - { - "auxiliary_loss_clip": 0.01030995, - "auxiliary_loss_mlp": 0.00999994, - "balance_loss_clip": 1.01519012, - "balance_loss_mlp": 0.99881381, - "epoch": 0.5720126258830602, - "flos": 61298042814720.0, - "grad_norm": 0.8987559536316853, - "language_loss": 0.66807652, - "learning_rate": 1.6326923812866288e-06, - "loss": 0.68838638, - "num_input_tokens_seen": 204972925, - "step": 9514, - "time_per_iteration": 3.1701343059539795 - }, - { - "auxiliary_loss_clip": 0.01114643, - "auxiliary_loss_mlp": 0.01042868, - "balance_loss_clip": 1.0446943, - "balance_loss_mlp": 1.02930832, - "epoch": 0.5720727491357283, - "flos": 23988040997760.0, - "grad_norm": 2.0869347470902704, - "language_loss": 0.81355566, - "learning_rate": 1.63230955093099e-06, - "loss": 0.83513075, - "num_input_tokens_seen": 204990910, - "step": 9515, - "time_per_iteration": 2.668982744216919 - }, - { - "auxiliary_loss_clip": 0.01098965, - "auxiliary_loss_mlp": 0.01032673, - "balance_loss_clip": 1.04036427, - "balance_loss_mlp": 1.01993597, - "epoch": 0.5721328723883962, - "flos": 23405426398080.0, - "grad_norm": 3.1746972716468664, - "language_loss": 0.85928082, - "learning_rate": 1.6319267345182092e-06, - "loss": 0.88059723, - "num_input_tokens_seen": 205010500, - "step": 9516, - "time_per_iteration": 2.6741178035736084 - }, - { - "auxiliary_loss_clip": 0.01083742, - "auxiliary_loss_mlp": 0.01031013, - "balance_loss_clip": 1.04019785, - "balance_loss_mlp": 1.01784658, - "epoch": 0.5721929956410642, - "flos": 18804910320000.0, - "grad_norm": 1.8608727945257042, - "language_loss": 0.87884629, - "learning_rate": 1.6315439320628038e-06, - "loss": 0.8999939, - "num_input_tokens_seen": 205028560, - "step": 9517, - "time_per_iteration": 2.699981451034546 - }, - { - "auxiliary_loss_clip": 0.01066403, - "auxiliary_loss_mlp": 0.01031636, - "balance_loss_clip": 1.03665698, - "balance_loss_mlp": 1.01866579, - "epoch": 0.5722531188937322, - "flos": 27196659100800.0, - "grad_norm": 1.632945668541975, - "language_loss": 0.85146403, - "learning_rate": 1.6311611435792893e-06, - "loss": 0.87244439, - "num_input_tokens_seen": 205048650, - "step": 9518, - "time_per_iteration": 2.8667659759521484 - }, - { - "auxiliary_loss_clip": 0.01104733, - "auxiliary_loss_mlp": 0.01033736, - "balance_loss_clip": 1.04255366, - "balance_loss_mlp": 1.02131414, - "epoch": 0.5723132421464001, - "flos": 15195672852480.0, - "grad_norm": 1.838676422571758, - "language_loss": 0.7901606, - "learning_rate": 1.6307783690821812e-06, - "loss": 0.81154531, - "num_input_tokens_seen": 205066480, - "step": 9519, - "time_per_iteration": 2.593822479248047 - }, - { - "auxiliary_loss_clip": 0.01117664, - "auxiliary_loss_mlp": 0.01029991, - "balance_loss_clip": 1.04276991, - "balance_loss_mlp": 1.01755762, - "epoch": 0.5723733653990681, - "flos": 27599433281280.0, - "grad_norm": 1.4978137038182386, - "language_loss": 0.83191645, - "learning_rate": 1.6303956085859944e-06, - "loss": 0.85339302, - "num_input_tokens_seen": 205087475, - "step": 9520, - "time_per_iteration": 2.664851427078247 - }, - { - "auxiliary_loss_clip": 0.01098568, - "auxiliary_loss_mlp": 0.01044625, - "balance_loss_clip": 1.04248536, - "balance_loss_mlp": 1.03115487, - "epoch": 0.572433488651736, - "flos": 18222870337920.0, - "grad_norm": 2.1952309591015267, - "language_loss": 0.72542965, - "learning_rate": 1.630012862105243e-06, - "loss": 0.74686158, - "num_input_tokens_seen": 205106495, - "step": 9521, - "time_per_iteration": 2.7253611087799072 - }, - { - "auxiliary_loss_clip": 0.011175, - "auxiliary_loss_mlp": 0.00769564, - "balance_loss_clip": 1.04164016, - "balance_loss_mlp": 1.00010264, - "epoch": 0.5724936119044041, - "flos": 31249106484480.0, - "grad_norm": 2.153094973040902, - "language_loss": 0.78315163, - "learning_rate": 1.6296301296544415e-06, - "loss": 0.80202222, - "num_input_tokens_seen": 205128285, - "step": 9522, - "time_per_iteration": 2.6890037059783936 - }, - { - "auxiliary_loss_clip": 0.01088616, - "auxiliary_loss_mlp": 0.01034098, - "balance_loss_clip": 1.04117084, - "balance_loss_mlp": 1.02251649, - "epoch": 0.572553735157072, - "flos": 19202189719680.0, - "grad_norm": 1.511112661891623, - "language_loss": 0.71476662, - "learning_rate": 1.629247411248102e-06, - "loss": 0.73599374, - "num_input_tokens_seen": 205146595, - "step": 9523, - "time_per_iteration": 2.6567182540893555 - }, - { - "auxiliary_loss_clip": 0.01092274, - "auxiliary_loss_mlp": 0.01033734, - "balance_loss_clip": 1.03921247, - "balance_loss_mlp": 1.02187228, - "epoch": 0.57261385840974, - "flos": 21214911386880.0, - "grad_norm": 2.2130630300856207, - "language_loss": 0.70017171, - "learning_rate": 1.628864706900738e-06, - "loss": 0.72143173, - "num_input_tokens_seen": 205164295, - "step": 9524, - "time_per_iteration": 2.700518846511841 - }, - { - "auxiliary_loss_clip": 0.01107505, - "auxiliary_loss_mlp": 0.01031795, - "balance_loss_clip": 1.04225564, - "balance_loss_mlp": 1.01971316, - "epoch": 0.5726739816624079, - "flos": 33984529793280.0, - "grad_norm": 1.461112152817653, - "language_loss": 0.65126455, - "learning_rate": 1.6284820166268615e-06, - "loss": 0.67265761, - "num_input_tokens_seen": 205185380, - "step": 9525, - "time_per_iteration": 2.7389535903930664 - }, - { - "auxiliary_loss_clip": 0.01091158, - "auxiliary_loss_mlp": 0.01035018, - "balance_loss_clip": 1.03928351, - "balance_loss_mlp": 1.023139, - "epoch": 0.5727341049150759, - "flos": 24275972419200.0, - "grad_norm": 1.930578654391071, - "language_loss": 0.72484279, - "learning_rate": 1.628099340440984e-06, - "loss": 0.7461046, - "num_input_tokens_seen": 205204895, - "step": 9526, - "time_per_iteration": 2.702472448348999 - }, - { - "auxiliary_loss_clip": 0.01103623, - "auxiliary_loss_mlp": 0.01038123, - "balance_loss_clip": 1.03998101, - "balance_loss_mlp": 1.02604759, - "epoch": 0.5727942281677438, - "flos": 28400564269440.0, - "grad_norm": 2.0565235980515206, - "language_loss": 0.8007257, - "learning_rate": 1.6277166783576176e-06, - "loss": 0.8221432, - "num_input_tokens_seen": 205223440, - "step": 9527, - "time_per_iteration": 2.7238149642944336 - }, - { - "auxiliary_loss_clip": 0.01101882, - "auxiliary_loss_mlp": 0.01036542, - "balance_loss_clip": 1.03860235, - "balance_loss_mlp": 1.02360809, - "epoch": 0.5728543514204119, - "flos": 19536769929600.0, - "grad_norm": 1.770832454252008, - "language_loss": 0.72136271, - "learning_rate": 1.6273340303912713e-06, - "loss": 0.74274695, - "num_input_tokens_seen": 205242800, - "step": 9528, - "time_per_iteration": 2.593954086303711 - }, - { - "auxiliary_loss_clip": 0.01117957, - "auxiliary_loss_mlp": 0.01036459, - "balance_loss_clip": 1.04303622, - "balance_loss_mlp": 1.02363753, - "epoch": 0.5729144746730798, - "flos": 21506757390720.0, - "grad_norm": 2.0200513223103846, - "language_loss": 0.86137569, - "learning_rate": 1.6269513965564557e-06, - "loss": 0.88291985, - "num_input_tokens_seen": 205259465, - "step": 9529, - "time_per_iteration": 2.6399447917938232 - }, - { - "auxiliary_loss_clip": 0.01022279, - "auxiliary_loss_mlp": 0.00999796, - "balance_loss_clip": 1.01659954, - "balance_loss_mlp": 0.99862826, - "epoch": 0.5729745979257478, - "flos": 58681628242560.0, - "grad_norm": 0.7634342678167043, - "language_loss": 0.56170225, - "learning_rate": 1.6265687768676813e-06, - "loss": 0.58192301, - "num_input_tokens_seen": 205314100, - "step": 9530, - "time_per_iteration": 3.081955671310425 - }, - { - "auxiliary_loss_clip": 0.01096881, - "auxiliary_loss_mlp": 0.01030649, - "balance_loss_clip": 1.04126835, - "balance_loss_mlp": 1.01860929, - "epoch": 0.5730347211784158, - "flos": 18552099421440.0, - "grad_norm": 1.8014631294656338, - "language_loss": 0.66785836, - "learning_rate": 1.6261861713394553e-06, - "loss": 0.6891337, - "num_input_tokens_seen": 205333420, - "step": 9531, - "time_per_iteration": 2.650801658630371 - }, - { - "auxiliary_loss_clip": 0.01102348, - "auxiliary_loss_mlp": 0.01042246, - "balance_loss_clip": 1.03970659, - "balance_loss_mlp": 1.02834046, - "epoch": 0.5730948444310837, - "flos": 38031482396160.0, - "grad_norm": 2.1479743871986314, - "language_loss": 0.75923574, - "learning_rate": 1.6258035799862876e-06, - "loss": 0.78068173, - "num_input_tokens_seen": 205350995, - "step": 9532, - "time_per_iteration": 2.7268972396850586 - }, - { - "auxiliary_loss_clip": 0.01117449, - "auxiliary_loss_mlp": 0.01031067, - "balance_loss_clip": 1.0426352, - "balance_loss_mlp": 1.01828206, - "epoch": 0.5731549676837517, - "flos": 25227066689280.0, - "grad_norm": 1.3324145118640112, - "language_loss": 0.78908527, - "learning_rate": 1.625421002822686e-06, - "loss": 0.81057048, - "num_input_tokens_seen": 205372675, - "step": 9533, - "time_per_iteration": 2.6636223793029785 - }, - { - "auxiliary_loss_clip": 0.01105019, - "auxiliary_loss_mlp": 0.01029773, - "balance_loss_clip": 1.04237115, - "balance_loss_mlp": 1.01806676, - "epoch": 0.5732150909364196, - "flos": 23368222886400.0, - "grad_norm": 1.7921135162563215, - "language_loss": 0.85584033, - "learning_rate": 1.6250384398631574e-06, - "loss": 0.87718827, - "num_input_tokens_seen": 205392590, - "step": 9534, - "time_per_iteration": 2.6173202991485596 - }, - { - "auxiliary_loss_clip": 0.01098044, - "auxiliary_loss_mlp": 0.01038668, - "balance_loss_clip": 1.0421629, - "balance_loss_mlp": 1.02537584, - "epoch": 0.5732752141890877, - "flos": 23079357711360.0, - "grad_norm": 1.8285457434330181, - "language_loss": 0.7536543, - "learning_rate": 1.6246558911222085e-06, - "loss": 0.77502143, - "num_input_tokens_seen": 205414885, - "step": 9535, - "time_per_iteration": 2.6797807216644287 - }, - { - "auxiliary_loss_clip": 0.0110163, - "auxiliary_loss_mlp": 0.01032829, - "balance_loss_clip": 1.04250264, - "balance_loss_mlp": 1.01984715, - "epoch": 0.5733353374417556, - "flos": 24352282863360.0, - "grad_norm": 1.4660219442049842, - "language_loss": 0.71041429, - "learning_rate": 1.624273356614346e-06, - "loss": 0.73175883, - "num_input_tokens_seen": 205434440, - "step": 9536, - "time_per_iteration": 2.6927666664123535 - }, - { - "auxiliary_loss_clip": 0.0107587, - "auxiliary_loss_mlp": 0.01034692, - "balance_loss_clip": 1.03728056, - "balance_loss_mlp": 1.02034533, - "epoch": 0.5733954606944236, - "flos": 27198849830400.0, - "grad_norm": 1.9779932456354445, - "language_loss": 0.69794559, - "learning_rate": 1.6238908363540755e-06, - "loss": 0.71905118, - "num_input_tokens_seen": 205454225, - "step": 9537, - "time_per_iteration": 2.758420944213867 - }, - { - "auxiliary_loss_clip": 0.01119262, - "auxiliary_loss_mlp": 0.01036385, - "balance_loss_clip": 1.04359508, - "balance_loss_mlp": 1.02364206, - "epoch": 0.5734555839470915, - "flos": 28765129357440.0, - "grad_norm": 1.8277858348507134, - "language_loss": 0.62517941, - "learning_rate": 1.623508330355902e-06, - "loss": 0.64673591, - "num_input_tokens_seen": 205474750, - "step": 9538, - "time_per_iteration": 2.6978628635406494 - }, - { - "auxiliary_loss_clip": 0.01105121, - "auxiliary_loss_mlp": 0.0103457, - "balance_loss_clip": 1.04219174, - "balance_loss_mlp": 1.02135563, - "epoch": 0.5735157071997595, - "flos": 22966813422720.0, - "grad_norm": 1.6582870130678489, - "language_loss": 0.83564949, - "learning_rate": 1.6231258386343306e-06, - "loss": 0.85704643, - "num_input_tokens_seen": 205495495, - "step": 9539, - "time_per_iteration": 2.7695393562316895 - }, - { - "auxiliary_loss_clip": 0.01086088, - "auxiliary_loss_mlp": 0.01038955, - "balance_loss_clip": 1.04798675, - "balance_loss_mlp": 1.02566326, - "epoch": 0.5735758304524274, - "flos": 18989455420800.0, - "grad_norm": 2.207302017109072, - "language_loss": 0.73048598, - "learning_rate": 1.6227433612038647e-06, - "loss": 0.75173634, - "num_input_tokens_seen": 205510070, - "step": 9540, - "time_per_iteration": 2.760653018951416 - }, - { - "auxiliary_loss_clip": 0.01101303, - "auxiliary_loss_mlp": 0.00769854, - "balance_loss_clip": 1.03920221, - "balance_loss_mlp": 1.00004601, - "epoch": 0.5736359537050955, - "flos": 28397942576640.0, - "grad_norm": 2.4125489920069074, - "language_loss": 0.79765099, - "learning_rate": 1.6223608980790089e-06, - "loss": 0.81636256, - "num_input_tokens_seen": 205530190, - "step": 9541, - "time_per_iteration": 2.789978504180908 - }, - { - "auxiliary_loss_clip": 0.01096764, - "auxiliary_loss_mlp": 0.01033683, - "balance_loss_clip": 1.040447, - "balance_loss_mlp": 1.02054572, - "epoch": 0.5736960769577634, - "flos": 15627210848640.0, - "grad_norm": 2.579963788523863, - "language_loss": 0.6497947, - "learning_rate": 1.6219784492742654e-06, - "loss": 0.67109919, - "num_input_tokens_seen": 205547380, - "step": 9542, - "time_per_iteration": 2.684465169906616 - }, - { - "auxiliary_loss_clip": 0.01094703, - "auxiliary_loss_mlp": 0.01032355, - "balance_loss_clip": 1.03985989, - "balance_loss_mlp": 1.01992106, - "epoch": 0.5737562002104314, - "flos": 18003994813440.0, - "grad_norm": 2.1591412151518625, - "language_loss": 0.82844281, - "learning_rate": 1.6215960148041365e-06, - "loss": 0.84971344, - "num_input_tokens_seen": 205566540, - "step": 9543, - "time_per_iteration": 2.724700450897217 - }, - { - "auxiliary_loss_clip": 0.01078135, - "auxiliary_loss_mlp": 0.01034179, - "balance_loss_clip": 1.03842759, - "balance_loss_mlp": 1.01990938, - "epoch": 0.5738163234630994, - "flos": 20698192287360.0, - "grad_norm": 2.0892075264702616, - "language_loss": 0.73500836, - "learning_rate": 1.6212135946831257e-06, - "loss": 0.75613153, - "num_input_tokens_seen": 205584200, - "step": 9544, - "time_per_iteration": 2.7072341442108154 - }, - { - "auxiliary_loss_clip": 0.01063343, - "auxiliary_loss_mlp": 0.01034841, - "balance_loss_clip": 1.03527069, - "balance_loss_mlp": 1.02173972, - "epoch": 0.5738764467157673, - "flos": 23149311448320.0, - "grad_norm": 1.791719724630014, - "language_loss": 0.76021409, - "learning_rate": 1.620831188925733e-06, - "loss": 0.78119594, - "num_input_tokens_seen": 205604675, - "step": 9545, - "time_per_iteration": 4.402756690979004 - }, - { - "auxiliary_loss_clip": 0.0109842, - "auxiliary_loss_mlp": 0.0103679, - "balance_loss_clip": 1.04495752, - "balance_loss_mlp": 1.02345061, - "epoch": 0.5739365699684353, - "flos": 29492930730240.0, - "grad_norm": 1.94712066693327, - "language_loss": 0.56656086, - "learning_rate": 1.620448797546459e-06, - "loss": 0.58791304, - "num_input_tokens_seen": 205624680, - "step": 9546, - "time_per_iteration": 6.025787115097046 - }, - { - "auxiliary_loss_clip": 0.01091236, - "auxiliary_loss_mlp": 0.01033391, - "balance_loss_clip": 1.03923881, - "balance_loss_mlp": 1.02023625, - "epoch": 0.5739966932211032, - "flos": 14027247342720.0, - "grad_norm": 2.369322585416499, - "language_loss": 0.7595309, - "learning_rate": 1.6200664205598055e-06, - "loss": 0.78077716, - "num_input_tokens_seen": 205641950, - "step": 9547, - "time_per_iteration": 2.71240496635437 - }, - { - "auxiliary_loss_clip": 0.01104111, - "auxiliary_loss_mlp": 0.01030548, - "balance_loss_clip": 1.03877449, - "balance_loss_mlp": 1.01709485, - "epoch": 0.5740568164737713, - "flos": 19062030850560.0, - "grad_norm": 5.307379698295213, - "language_loss": 0.74525601, - "learning_rate": 1.6196840579802704e-06, - "loss": 0.76660264, - "num_input_tokens_seen": 205660130, - "step": 9548, - "time_per_iteration": 2.651829957962036 - }, - { - "auxiliary_loss_clip": 0.01085909, - "auxiliary_loss_mlp": 0.0103587, - "balance_loss_clip": 1.03760338, - "balance_loss_mlp": 1.02268577, - "epoch": 0.5741169397264392, - "flos": 22127832478080.0, - "grad_norm": 4.02154100378115, - "language_loss": 0.69476151, - "learning_rate": 1.619301709822355e-06, - "loss": 0.71597928, - "num_input_tokens_seen": 205678895, - "step": 9549, - "time_per_iteration": 2.7304623126983643 - }, - { - "auxiliary_loss_clip": 0.01068231, - "auxiliary_loss_mlp": 0.01031011, - "balance_loss_clip": 1.04319942, - "balance_loss_mlp": 1.01907182, - "epoch": 0.5741770629791072, - "flos": 24936836797440.0, - "grad_norm": 1.4366767261825364, - "language_loss": 0.79742229, - "learning_rate": 1.6189193761005564e-06, - "loss": 0.81841469, - "num_input_tokens_seen": 205698450, - "step": 9550, - "time_per_iteration": 2.759152889251709 - }, - { - "auxiliary_loss_clip": 0.01091678, - "auxiliary_loss_mlp": 0.01036065, - "balance_loss_clip": 1.04081261, - "balance_loss_mlp": 1.0213902, - "epoch": 0.5742371862317751, - "flos": 18801462614400.0, - "grad_norm": 1.889418417446442, - "language_loss": 0.67791235, - "learning_rate": 1.6185370568293727e-06, - "loss": 0.69918978, - "num_input_tokens_seen": 205714870, - "step": 9551, - "time_per_iteration": 4.226199150085449 - }, - { - "auxiliary_loss_clip": 0.01082087, - "auxiliary_loss_mlp": 0.0103572, - "balance_loss_clip": 1.04173434, - "balance_loss_mlp": 1.02287543, - "epoch": 0.5742973094844431, - "flos": 24460661174400.0, - "grad_norm": 2.3194402923297157, - "language_loss": 0.7223655, - "learning_rate": 1.6181547520233031e-06, - "loss": 0.74354362, - "num_input_tokens_seen": 205736045, - "step": 9552, - "time_per_iteration": 2.736600160598755 - }, - { - "auxiliary_loss_clip": 0.01103832, - "auxiliary_loss_mlp": 0.01033342, - "balance_loss_clip": 1.04454732, - "balance_loss_mlp": 1.02040219, - "epoch": 0.574357432737111, - "flos": 21652770176640.0, - "grad_norm": 2.128940953023755, - "language_loss": 0.79823256, - "learning_rate": 1.617772461696843e-06, - "loss": 0.81960428, - "num_input_tokens_seen": 205754445, - "step": 9553, - "time_per_iteration": 2.6895127296447754 - }, - { - "auxiliary_loss_clip": 0.01111471, - "auxiliary_loss_mlp": 0.01032858, - "balance_loss_clip": 1.04313147, - "balance_loss_mlp": 1.02050185, - "epoch": 0.5744175559897791, - "flos": 16544728880640.0, - "grad_norm": 1.880148698667659, - "language_loss": 0.8353495, - "learning_rate": 1.6173901858644895e-06, - "loss": 0.85679281, - "num_input_tokens_seen": 205770595, - "step": 9554, - "time_per_iteration": 2.615577220916748 - }, - { - "auxiliary_loss_clip": 0.01115074, - "auxiliary_loss_mlp": 0.0077091, - "balance_loss_clip": 1.04545319, - "balance_loss_mlp": 1.0001241, - "epoch": 0.574477679242447, - "flos": 24207598880640.0, - "grad_norm": 1.4793540146055872, - "language_loss": 0.71076667, - "learning_rate": 1.6170079245407385e-06, - "loss": 0.72962654, - "num_input_tokens_seen": 205791935, - "step": 9555, - "time_per_iteration": 2.7411417961120605 - }, - { - "auxiliary_loss_clip": 0.01093974, - "auxiliary_loss_mlp": 0.0103121, - "balance_loss_clip": 1.04077876, - "balance_loss_mlp": 1.01763785, - "epoch": 0.574537802495115, - "flos": 14903000835840.0, - "grad_norm": 2.2805548015379755, - "language_loss": 0.72663784, - "learning_rate": 1.6166256777400853e-06, - "loss": 0.7478897, - "num_input_tokens_seen": 205807260, - "step": 9556, - "time_per_iteration": 2.6720690727233887 - }, - { - "auxiliary_loss_clip": 0.01111378, - "auxiliary_loss_mlp": 0.01033136, - "balance_loss_clip": 1.04576373, - "balance_loss_mlp": 1.02015448, - "epoch": 0.5745979257477829, - "flos": 24934969290240.0, - "grad_norm": 1.744837604754053, - "language_loss": 0.74087226, - "learning_rate": 1.6162434454770248e-06, - "loss": 0.76231742, - "num_input_tokens_seen": 205826885, - "step": 9557, - "time_per_iteration": 2.7899231910705566 - }, - { - "auxiliary_loss_clip": 0.01108542, - "auxiliary_loss_mlp": 0.01034037, - "balance_loss_clip": 1.04274464, - "balance_loss_mlp": 1.02157927, - "epoch": 0.5746580490004509, - "flos": 17235757704960.0, - "grad_norm": 1.5016834383596844, - "language_loss": 0.67902005, - "learning_rate": 1.6158612277660514e-06, - "loss": 0.70044577, - "num_input_tokens_seen": 205844630, - "step": 9558, - "time_per_iteration": 2.762430429458618 - }, - { - "auxiliary_loss_clip": 0.01094279, - "auxiliary_loss_mlp": 0.01052047, - "balance_loss_clip": 1.04277229, - "balance_loss_mlp": 1.03471398, - "epoch": 0.5747181722531189, - "flos": 13187871348480.0, - "grad_norm": 2.4192829019987148, - "language_loss": 0.72013688, - "learning_rate": 1.615479024621659e-06, - "loss": 0.74160016, - "num_input_tokens_seen": 205860960, - "step": 9559, - "time_per_iteration": 2.757319688796997 - }, - { - "auxiliary_loss_clip": 0.01097547, - "auxiliary_loss_mlp": 0.00769026, - "balance_loss_clip": 1.04342794, - "balance_loss_mlp": 1.00012159, - "epoch": 0.5747782955057869, - "flos": 22963006581120.0, - "grad_norm": 1.6274858947785595, - "language_loss": 0.78883743, - "learning_rate": 1.6150968360583398e-06, - "loss": 0.8075031, - "num_input_tokens_seen": 205880675, - "step": 9560, - "time_per_iteration": 2.746260166168213 - }, - { - "auxiliary_loss_clip": 0.01052934, - "auxiliary_loss_mlp": 0.01029841, - "balance_loss_clip": 1.03918111, - "balance_loss_mlp": 1.0164957, - "epoch": 0.5748384187584549, - "flos": 23403235668480.0, - "grad_norm": 2.1977539095196903, - "language_loss": 0.64321613, - "learning_rate": 1.614714662090588e-06, - "loss": 0.6640439, - "num_input_tokens_seen": 205900050, - "step": 9561, - "time_per_iteration": 2.8124732971191406 - }, - { - "auxiliary_loss_clip": 0.01116845, - "auxiliary_loss_mlp": 0.01039625, - "balance_loss_clip": 1.04539895, - "balance_loss_mlp": 1.02567124, - "epoch": 0.5748985420111228, - "flos": 17785514338560.0, - "grad_norm": 2.0210299953328414, - "language_loss": 0.7193495, - "learning_rate": 1.6143325027328945e-06, - "loss": 0.74091417, - "num_input_tokens_seen": 205918855, - "step": 9562, - "time_per_iteration": 2.7868704795837402 - }, - { - "auxiliary_loss_clip": 0.01067199, - "auxiliary_loss_mlp": 0.01032486, - "balance_loss_clip": 1.03979492, - "balance_loss_mlp": 1.02039778, - "epoch": 0.5749586652637908, - "flos": 19866250408320.0, - "grad_norm": 1.4806264841650407, - "language_loss": 0.84100068, - "learning_rate": 1.613950357999751e-06, - "loss": 0.86199754, - "num_input_tokens_seen": 205936970, - "step": 9563, - "time_per_iteration": 2.7772703170776367 - }, - { - "auxiliary_loss_clip": 0.01073481, - "auxiliary_loss_mlp": 0.01039774, - "balance_loss_clip": 1.0434773, - "balance_loss_mlp": 1.02635074, - "epoch": 0.5750187885164587, - "flos": 21287235421440.0, - "grad_norm": 2.0689431633426802, - "language_loss": 0.5717746, - "learning_rate": 1.6135682279056488e-06, - "loss": 0.59290713, - "num_input_tokens_seen": 205954630, - "step": 9564, - "time_per_iteration": 2.8411808013916016 - }, - { - "auxiliary_loss_clip": 0.01092301, - "auxiliary_loss_mlp": 0.01036175, - "balance_loss_clip": 1.04144359, - "balance_loss_mlp": 1.0226326, - "epoch": 0.5750789117691267, - "flos": 18804658924800.0, - "grad_norm": 1.7191674250507119, - "language_loss": 0.76114881, - "learning_rate": 1.613186112465078e-06, - "loss": 0.78243363, - "num_input_tokens_seen": 205971510, - "step": 9565, - "time_per_iteration": 2.822044610977173 - }, - { - "auxiliary_loss_clip": 0.01002918, - "auxiliary_loss_mlp": 0.01012299, - "balance_loss_clip": 1.01532471, - "balance_loss_mlp": 1.01098824, - "epoch": 0.5751390350217946, - "flos": 70663224124800.0, - "grad_norm": 0.74248986424084, - "language_loss": 0.60725588, - "learning_rate": 1.6128040116925287e-06, - "loss": 0.62740809, - "num_input_tokens_seen": 206035125, - "step": 9566, - "time_per_iteration": 3.427154064178467 - }, - { - "auxiliary_loss_clip": 0.01093716, - "auxiliary_loss_mlp": 0.0103477, - "balance_loss_clip": 1.04347396, - "balance_loss_mlp": 1.02224672, - "epoch": 0.5751991582744627, - "flos": 14246338348800.0, - "grad_norm": 2.3384715191144214, - "language_loss": 0.75378191, - "learning_rate": 1.6124219256024901e-06, - "loss": 0.77506685, - "num_input_tokens_seen": 206052075, - "step": 9567, - "time_per_iteration": 2.8895022869110107 - }, - { - "auxiliary_loss_clip": 0.0110852, - "auxiliary_loss_mlp": 0.0103381, - "balance_loss_clip": 1.04461062, - "balance_loss_mlp": 1.02136469, - "epoch": 0.5752592815271306, - "flos": 18328160079360.0, - "grad_norm": 1.398692478003959, - "language_loss": 0.74487442, - "learning_rate": 1.6120398542094504e-06, - "loss": 0.7662977, - "num_input_tokens_seen": 206069970, - "step": 9568, - "time_per_iteration": 2.745008945465088 - }, - { - "auxiliary_loss_clip": 0.01122376, - "auxiliary_loss_mlp": 0.01031079, - "balance_loss_clip": 1.04557085, - "balance_loss_mlp": 1.01852036, - "epoch": 0.5753194047797986, - "flos": 20922742160640.0, - "grad_norm": 1.8288224744161317, - "language_loss": 0.71572077, - "learning_rate": 1.6116577975278994e-06, - "loss": 0.73725533, - "num_input_tokens_seen": 206088950, - "step": 9569, - "time_per_iteration": 2.9613218307495117 - }, - { - "auxiliary_loss_clip": 0.01113684, - "auxiliary_loss_mlp": 0.01037553, - "balance_loss_clip": 1.04693925, - "balance_loss_mlp": 1.02399325, - "epoch": 0.5753795280324665, - "flos": 19281804215040.0, - "grad_norm": 2.1991270484780916, - "language_loss": 0.55975366, - "learning_rate": 1.6112757555723223e-06, - "loss": 0.58126599, - "num_input_tokens_seen": 206107780, - "step": 9570, - "time_per_iteration": 2.6928811073303223 - }, - { - "auxiliary_loss_clip": 0.01118829, - "auxiliary_loss_mlp": 0.01034724, - "balance_loss_clip": 1.04458117, - "balance_loss_mlp": 1.02252328, - "epoch": 0.5754396512851345, - "flos": 21652877917440.0, - "grad_norm": 1.4030574698632734, - "language_loss": 0.64338309, - "learning_rate": 1.6108937283572082e-06, - "loss": 0.66491854, - "num_input_tokens_seen": 206127445, - "step": 9571, - "time_per_iteration": 2.635603427886963 - }, - { - "auxiliary_loss_clip": 0.01111717, - "auxiliary_loss_mlp": 0.01031618, - "balance_loss_clip": 1.04484558, - "balance_loss_mlp": 1.01890385, - "epoch": 0.5754997745378025, - "flos": 51021700179840.0, - "grad_norm": 1.5230879857727748, - "language_loss": 0.67137802, - "learning_rate": 1.6105117158970434e-06, - "loss": 0.69281137, - "num_input_tokens_seen": 206152005, - "step": 9572, - "time_per_iteration": 2.9080519676208496 - }, - { - "auxiliary_loss_clip": 0.01101219, - "auxiliary_loss_mlp": 0.01032315, - "balance_loss_clip": 1.04746473, - "balance_loss_mlp": 1.01870155, - "epoch": 0.5755598977904705, - "flos": 22856890826880.0, - "grad_norm": 1.7883651828614429, - "language_loss": 0.72390687, - "learning_rate": 1.6101297182063123e-06, - "loss": 0.74524224, - "num_input_tokens_seen": 206169875, - "step": 9573, - "time_per_iteration": 2.815703868865967 - }, - { - "auxiliary_loss_clip": 0.01118198, - "auxiliary_loss_mlp": 0.01031966, - "balance_loss_clip": 1.04730046, - "balance_loss_mlp": 1.0202539, - "epoch": 0.5756200210431385, - "flos": 38472824805120.0, - "grad_norm": 1.8637575754568128, - "language_loss": 0.76394922, - "learning_rate": 1.6097477352995022e-06, - "loss": 0.78545088, - "num_input_tokens_seen": 206192635, - "step": 9574, - "time_per_iteration": 2.778196096420288 - }, - { - "auxiliary_loss_clip": 0.01068081, - "auxiliary_loss_mlp": 0.01036908, - "balance_loss_clip": 1.03836775, - "balance_loss_mlp": 1.02201867, - "epoch": 0.5756801442958064, - "flos": 23910006700800.0, - "grad_norm": 2.572143968399992, - "language_loss": 0.66373074, - "learning_rate": 1.6093657671910968e-06, - "loss": 0.68478066, - "num_input_tokens_seen": 206211485, - "step": 9575, - "time_per_iteration": 2.780195951461792 - }, - { - "auxiliary_loss_clip": 0.01097887, - "auxiliary_loss_mlp": 0.01031317, - "balance_loss_clip": 1.04497039, - "balance_loss_mlp": 1.01917517, - "epoch": 0.5757402675484744, - "flos": 21105276099840.0, - "grad_norm": 1.5189421087528554, - "language_loss": 0.79787755, - "learning_rate": 1.6089838138955804e-06, - "loss": 0.81916952, - "num_input_tokens_seen": 206231740, - "step": 9576, - "time_per_iteration": 2.7809135913848877 - }, - { - "auxiliary_loss_clip": 0.01096091, - "auxiliary_loss_mlp": 0.0102674, - "balance_loss_clip": 1.0435828, - "balance_loss_mlp": 1.01512265, - "epoch": 0.5758003908011423, - "flos": 20559110826240.0, - "grad_norm": 1.7619408585744085, - "language_loss": 0.69726396, - "learning_rate": 1.6086018754274372e-06, - "loss": 0.71849227, - "num_input_tokens_seen": 206250975, - "step": 9577, - "time_per_iteration": 2.732150077819824 - }, - { - "auxiliary_loss_clip": 0.01111358, - "auxiliary_loss_mlp": 0.01035186, - "balance_loss_clip": 1.04446626, - "balance_loss_mlp": 1.02306843, - "epoch": 0.5758605140538103, - "flos": 16473015377280.0, - "grad_norm": 2.216832845639703, - "language_loss": 0.66558278, - "learning_rate": 1.6082199518011504e-06, - "loss": 0.6870482, - "num_input_tokens_seen": 206268800, - "step": 9578, - "time_per_iteration": 2.639571189880371 - }, - { - "auxiliary_loss_clip": 0.01091288, - "auxiliary_loss_mlp": 0.01032209, - "balance_loss_clip": 1.04414392, - "balance_loss_mlp": 1.01997256, - "epoch": 0.5759206373064782, - "flos": 21287558643840.0, - "grad_norm": 1.7735647320590846, - "language_loss": 0.72313404, - "learning_rate": 1.6078380430312016e-06, - "loss": 0.74436903, - "num_input_tokens_seen": 206287190, - "step": 9579, - "time_per_iteration": 2.6910343170166016 - }, - { - "auxiliary_loss_clip": 0.0110168, - "auxiliary_loss_mlp": 0.01035785, - "balance_loss_clip": 1.04436874, - "balance_loss_mlp": 1.02170634, - "epoch": 0.5759807605591463, - "flos": 26067879227520.0, - "grad_norm": 4.803146579630836, - "language_loss": 0.65395081, - "learning_rate": 1.6074561491320742e-06, - "loss": 0.67532551, - "num_input_tokens_seen": 206307020, - "step": 9580, - "time_per_iteration": 2.7227509021759033 - }, - { - "auxiliary_loss_clip": 0.01092842, - "auxiliary_loss_mlp": 0.01034767, - "balance_loss_clip": 1.04106581, - "balance_loss_mlp": 1.0212729, - "epoch": 0.5760408838118142, - "flos": 18873068376960.0, - "grad_norm": 1.9154940218320493, - "language_loss": 0.85214174, - "learning_rate": 1.6070742701182486e-06, - "loss": 0.87341785, - "num_input_tokens_seen": 206324095, - "step": 9581, - "time_per_iteration": 2.699432849884033 - }, - { - "auxiliary_loss_clip": 0.0113104, - "auxiliary_loss_mlp": 0.01036775, - "balance_loss_clip": 1.05060983, - "balance_loss_mlp": 1.02360821, - "epoch": 0.5761010070644822, - "flos": 15378134964480.0, - "grad_norm": 2.109676550381332, - "language_loss": 0.67354548, - "learning_rate": 1.6066924060042057e-06, - "loss": 0.69522369, - "num_input_tokens_seen": 206343210, - "step": 9582, - "time_per_iteration": 2.6381587982177734 - }, - { - "auxiliary_loss_clip": 0.01026383, - "auxiliary_loss_mlp": 0.01001724, - "balance_loss_clip": 1.01951599, - "balance_loss_mlp": 1.00040722, - "epoch": 0.5761611303171501, - "flos": 71471932882560.0, - "grad_norm": 0.6463341323488921, - "language_loss": 0.57134479, - "learning_rate": 1.6063105568044271e-06, - "loss": 0.59162581, - "num_input_tokens_seen": 206415935, - "step": 9583, - "time_per_iteration": 3.52109694480896 - }, - { - "auxiliary_loss_clip": 0.01090801, - "auxiliary_loss_mlp": 0.01030991, - "balance_loss_clip": 1.04208195, - "balance_loss_mlp": 1.01818216, - "epoch": 0.5762212535698181, - "flos": 16246167033600.0, - "grad_norm": 1.791358766979404, - "language_loss": 0.82729411, - "learning_rate": 1.6059287225333912e-06, - "loss": 0.84851205, - "num_input_tokens_seen": 206431900, - "step": 9584, - "time_per_iteration": 2.7258176803588867 - }, - { - "auxiliary_loss_clip": 0.0104221, - "auxiliary_loss_mlp": 0.01002028, - "balance_loss_clip": 1.0174526, - "balance_loss_mlp": 1.00080013, - "epoch": 0.5762813768224861, - "flos": 70185504216960.0, - "grad_norm": 0.623568426409687, - "language_loss": 0.49559212, - "learning_rate": 1.6055469032055773e-06, - "loss": 0.51603448, - "num_input_tokens_seen": 206501200, - "step": 9585, - "time_per_iteration": 7.823396682739258 - }, - { - "auxiliary_loss_clip": 0.01092491, - "auxiliary_loss_mlp": 0.01027016, - "balance_loss_clip": 1.04217815, - "balance_loss_mlp": 1.01516044, - "epoch": 0.5763415000751541, - "flos": 20518028645760.0, - "grad_norm": 1.574762209284147, - "language_loss": 0.85150623, - "learning_rate": 1.605165098835465e-06, - "loss": 0.87270141, - "num_input_tokens_seen": 206520575, - "step": 9586, - "time_per_iteration": 2.6869027614593506 - }, - { - "auxiliary_loss_clip": 0.0110803, - "auxiliary_loss_mlp": 0.01034855, - "balance_loss_clip": 1.04531455, - "balance_loss_mlp": 1.02091956, - "epoch": 0.5764016233278221, - "flos": 15815526877440.0, - "grad_norm": 2.1680790738732796, - "language_loss": 0.80101568, - "learning_rate": 1.6047833094375308e-06, - "loss": 0.8224445, - "num_input_tokens_seen": 206538060, - "step": 9587, - "time_per_iteration": 2.664121627807617 - }, - { - "auxiliary_loss_clip": 0.01091421, - "auxiliary_loss_mlp": 0.01037732, - "balance_loss_clip": 1.04280019, - "balance_loss_mlp": 1.02400517, - "epoch": 0.57646174658049, - "flos": 20772312001920.0, - "grad_norm": 1.6197519148440016, - "language_loss": 0.66023791, - "learning_rate": 1.6044015350262542e-06, - "loss": 0.68152946, - "num_input_tokens_seen": 206557320, - "step": 9588, - "time_per_iteration": 2.6596546173095703 - }, - { - "auxiliary_loss_clip": 0.01095166, - "auxiliary_loss_mlp": 0.01039726, - "balance_loss_clip": 1.04326534, - "balance_loss_mlp": 1.02583766, - "epoch": 0.576521869833158, - "flos": 23549930812800.0, - "grad_norm": 2.4954533064787383, - "language_loss": 0.78688884, - "learning_rate": 1.6040197756161104e-06, - "loss": 0.80823773, - "num_input_tokens_seen": 206575780, - "step": 9589, - "time_per_iteration": 2.799503803253174 - }, - { - "auxiliary_loss_clip": 0.01114482, - "auxiliary_loss_mlp": 0.01025254, - "balance_loss_clip": 1.041682, - "balance_loss_mlp": 1.01353538, - "epoch": 0.5765819930858259, - "flos": 20266582464000.0, - "grad_norm": 2.2193599120856304, - "language_loss": 0.79450285, - "learning_rate": 1.6036380312215762e-06, - "loss": 0.81590021, - "num_input_tokens_seen": 206594100, - "step": 9590, - "time_per_iteration": 4.355879545211792 - }, - { - "auxiliary_loss_clip": 0.01052935, - "auxiliary_loss_mlp": 0.00769289, - "balance_loss_clip": 1.03650951, - "balance_loss_mlp": 1.00013447, - "epoch": 0.5766421163384939, - "flos": 23148772744320.0, - "grad_norm": 1.8083193654510727, - "language_loss": 0.63346255, - "learning_rate": 1.6032563018571283e-06, - "loss": 0.65168482, - "num_input_tokens_seen": 206613325, - "step": 9591, - "time_per_iteration": 2.8449039459228516 - }, - { - "auxiliary_loss_clip": 0.01122211, - "auxiliary_loss_mlp": 0.00769941, - "balance_loss_clip": 1.04640627, - "balance_loss_mlp": 1.00013709, - "epoch": 0.5767022395911618, - "flos": 25848895962240.0, - "grad_norm": 2.331025746602298, - "language_loss": 0.78112143, - "learning_rate": 1.6028745875372406e-06, - "loss": 0.80004299, - "num_input_tokens_seen": 206634265, - "step": 9592, - "time_per_iteration": 2.7304346561431885 - }, - { - "auxiliary_loss_clip": 0.01004052, - "auxiliary_loss_mlp": 0.01021446, - "balance_loss_clip": 1.02547979, - "balance_loss_mlp": 1.01965749, - "epoch": 0.5767623628438299, - "flos": 68293299657600.0, - "grad_norm": 0.7436002967471621, - "language_loss": 0.59609032, - "learning_rate": 1.6024928882763885e-06, - "loss": 0.61634529, - "num_input_tokens_seen": 206696990, - "step": 9593, - "time_per_iteration": 3.461658477783203 - }, - { - "auxiliary_loss_clip": 0.01110844, - "auxiliary_loss_mlp": 0.01041399, - "balance_loss_clip": 1.042449, - "balance_loss_mlp": 1.02810097, - "epoch": 0.5768224860964978, - "flos": 30188448754560.0, - "grad_norm": 1.9449888897854992, - "language_loss": 0.71144432, - "learning_rate": 1.6021112040890463e-06, - "loss": 0.73296678, - "num_input_tokens_seen": 206717815, - "step": 9594, - "time_per_iteration": 2.8465657234191895 - }, - { - "auxiliary_loss_clip": 0.01085879, - "auxiliary_loss_mlp": 0.01033309, - "balance_loss_clip": 1.04293251, - "balance_loss_mlp": 1.02196598, - "epoch": 0.5768826093491658, - "flos": 17895041884800.0, - "grad_norm": 2.485745999068748, - "language_loss": 0.70693135, - "learning_rate": 1.6017295349896863e-06, - "loss": 0.72812331, - "num_input_tokens_seen": 206735985, - "step": 9595, - "time_per_iteration": 2.724013566970825 - }, - { - "auxiliary_loss_clip": 0.01120342, - "auxiliary_loss_mlp": 0.01030885, - "balance_loss_clip": 1.04522467, - "balance_loss_mlp": 1.01821947, - "epoch": 0.5769427326018337, - "flos": 17457183095040.0, - "grad_norm": 2.28937358102888, - "language_loss": 0.69969249, - "learning_rate": 1.6013478809927828e-06, - "loss": 0.72120476, - "num_input_tokens_seen": 206753370, - "step": 9596, - "time_per_iteration": 2.602410316467285 - }, - { - "auxiliary_loss_clip": 0.01097835, - "auxiliary_loss_mlp": 0.01033862, - "balance_loss_clip": 1.04560232, - "balance_loss_mlp": 1.01944959, - "epoch": 0.5770028558545017, - "flos": 39421728345600.0, - "grad_norm": 1.7463690567151626, - "language_loss": 0.67612261, - "learning_rate": 1.6009662421128074e-06, - "loss": 0.69743955, - "num_input_tokens_seen": 206777645, - "step": 9597, - "time_per_iteration": 2.9427249431610107 - }, - { - "auxiliary_loss_clip": 0.01096299, - "auxiliary_loss_mlp": 0.01033961, - "balance_loss_clip": 1.04274464, - "balance_loss_mlp": 1.02137804, - "epoch": 0.5770629791071697, - "flos": 21536383132800.0, - "grad_norm": 1.8692422611288704, - "language_loss": 0.81584179, - "learning_rate": 1.6005846183642323e-06, - "loss": 0.83714437, - "num_input_tokens_seen": 206794865, - "step": 9598, - "time_per_iteration": 2.748018503189087 - }, - { - "auxiliary_loss_clip": 0.01073806, - "auxiliary_loss_mlp": 0.01042323, - "balance_loss_clip": 1.03563309, - "balance_loss_mlp": 1.0270164, - "epoch": 0.5771231023598377, - "flos": 20886795624960.0, - "grad_norm": 1.6175391320992503, - "language_loss": 0.7306143, - "learning_rate": 1.6002030097615277e-06, - "loss": 0.7517755, - "num_input_tokens_seen": 206814095, - "step": 9599, - "time_per_iteration": 2.7712650299072266 - }, - { - "auxiliary_loss_clip": 0.01115679, - "auxiliary_loss_mlp": 0.01033218, - "balance_loss_clip": 1.04342914, - "balance_loss_mlp": 1.0211184, - "epoch": 0.5771832256125057, - "flos": 18077216688000.0, - "grad_norm": 3.919070780783451, - "language_loss": 0.78193593, - "learning_rate": 1.5998214163191663e-06, - "loss": 0.80342484, - "num_input_tokens_seen": 206832245, - "step": 9600, - "time_per_iteration": 2.6597604751586914 - }, - { - "auxiliary_loss_clip": 0.01113425, - "auxiliary_loss_mlp": 0.0077084, - "balance_loss_clip": 1.04604816, - "balance_loss_mlp": 1.00016284, - "epoch": 0.5772433488651736, - "flos": 26359078786560.0, - "grad_norm": 1.665079650983798, - "language_loss": 0.72689855, - "learning_rate": 1.5994398380516163e-06, - "loss": 0.74574125, - "num_input_tokens_seen": 206851535, - "step": 9601, - "time_per_iteration": 2.7263121604919434 - }, - { - "auxiliary_loss_clip": 0.01064473, - "auxiliary_loss_mlp": 0.01036032, - "balance_loss_clip": 1.04480124, - "balance_loss_mlp": 1.02311611, - "epoch": 0.5773034721178416, - "flos": 19680987035520.0, - "grad_norm": 2.0948856363437534, - "language_loss": 0.68606448, - "learning_rate": 1.599058274973348e-06, - "loss": 0.70706952, - "num_input_tokens_seen": 206870595, - "step": 9602, - "time_per_iteration": 2.8572375774383545 - }, - { - "auxiliary_loss_clip": 0.01088049, - "auxiliary_loss_mlp": 0.01035522, - "balance_loss_clip": 1.03997481, - "balance_loss_mlp": 1.02274275, - "epoch": 0.5773635953705095, - "flos": 25082885496960.0, - "grad_norm": 1.4139424352201144, - "language_loss": 0.73376763, - "learning_rate": 1.5986767270988297e-06, - "loss": 0.75500333, - "num_input_tokens_seen": 206892320, - "step": 9603, - "time_per_iteration": 2.816098928451538 - }, - { - "auxiliary_loss_clip": 0.01108536, - "auxiliary_loss_mlp": 0.01029532, - "balance_loss_clip": 1.0450983, - "balance_loss_mlp": 1.01732492, - "epoch": 0.5774237186231775, - "flos": 21032987978880.0, - "grad_norm": 1.7349679186761677, - "language_loss": 0.76407522, - "learning_rate": 1.5982951944425298e-06, - "loss": 0.78545588, - "num_input_tokens_seen": 206912485, - "step": 9604, - "time_per_iteration": 2.718163013458252 - }, - { - "auxiliary_loss_clip": 0.01086662, - "auxiliary_loss_mlp": 0.0103562, - "balance_loss_clip": 1.04304457, - "balance_loss_mlp": 1.02200651, - "epoch": 0.5774838418758454, - "flos": 15231727128960.0, - "grad_norm": 2.5247859182247026, - "language_loss": 0.83387136, - "learning_rate": 1.5979136770189174e-06, - "loss": 0.85509419, - "num_input_tokens_seen": 206929100, - "step": 9605, - "time_per_iteration": 2.8076066970825195 - }, - { - "auxiliary_loss_clip": 0.01096142, - "auxiliary_loss_mlp": 0.01031724, - "balance_loss_clip": 1.04626584, - "balance_loss_mlp": 1.01667333, - "epoch": 0.5775439651285135, - "flos": 23582609210880.0, - "grad_norm": 1.8595500746131972, - "language_loss": 0.77926147, - "learning_rate": 1.5975321748424581e-06, - "loss": 0.80054009, - "num_input_tokens_seen": 206947020, - "step": 9606, - "time_per_iteration": 2.7766621112823486 - }, - { - "auxiliary_loss_clip": 0.01117345, - "auxiliary_loss_mlp": 0.01035757, - "balance_loss_clip": 1.04331446, - "balance_loss_mlp": 1.02362752, - "epoch": 0.5776040883811814, - "flos": 18040515966720.0, - "grad_norm": 1.672602422897938, - "language_loss": 0.73896575, - "learning_rate": 1.597150687927619e-06, - "loss": 0.76049674, - "num_input_tokens_seen": 206964065, - "step": 9607, - "time_per_iteration": 2.6057968139648438 - }, - { - "auxiliary_loss_clip": 0.01076534, - "auxiliary_loss_mlp": 0.01034666, - "balance_loss_clip": 1.04220486, - "balance_loss_mlp": 1.02155876, - "epoch": 0.5776642116338494, - "flos": 18624638937600.0, - "grad_norm": 1.6326461875987317, - "language_loss": 0.69385672, - "learning_rate": 1.5967692162888664e-06, - "loss": 0.71496868, - "num_input_tokens_seen": 206981940, - "step": 9608, - "time_per_iteration": 2.784708023071289 - }, - { - "auxiliary_loss_clip": 0.01084539, - "auxiliary_loss_mlp": 0.01033053, - "balance_loss_clip": 1.03977787, - "balance_loss_mlp": 1.01979709, - "epoch": 0.5777243348865173, - "flos": 28402539517440.0, - "grad_norm": 1.6850838728782904, - "language_loss": 0.76766187, - "learning_rate": 1.596387759940665e-06, - "loss": 0.78883779, - "num_input_tokens_seen": 207002365, - "step": 9609, - "time_per_iteration": 2.7439122200012207 - }, - { - "auxiliary_loss_clip": 0.01090565, - "auxiliary_loss_mlp": 0.01033653, - "balance_loss_clip": 1.04297495, - "balance_loss_mlp": 1.02154744, - "epoch": 0.5777844581391853, - "flos": 24024705805440.0, - "grad_norm": 1.7626877282975804, - "language_loss": 0.76948774, - "learning_rate": 1.5960063188974808e-06, - "loss": 0.79072988, - "num_input_tokens_seen": 207021195, - "step": 9610, - "time_per_iteration": 2.748898506164551 - }, - { - "auxiliary_loss_clip": 0.0108266, - "auxiliary_loss_mlp": 0.01029905, - "balance_loss_clip": 1.03885353, - "balance_loss_mlp": 1.01625562, - "epoch": 0.5778445813918534, - "flos": 17777361951360.0, - "grad_norm": 2.997373910278609, - "language_loss": 0.68867594, - "learning_rate": 1.5956248931737777e-06, - "loss": 0.70980155, - "num_input_tokens_seen": 207037465, - "step": 9611, - "time_per_iteration": 2.7037806510925293 - }, - { - "auxiliary_loss_clip": 0.01103482, - "auxiliary_loss_mlp": 0.01028915, - "balance_loss_clip": 1.03957248, - "balance_loss_mlp": 1.01607609, - "epoch": 0.5779047046445213, - "flos": 22233194046720.0, - "grad_norm": 1.7435127822918648, - "language_loss": 0.83207917, - "learning_rate": 1.5952434827840185e-06, - "loss": 0.85340309, - "num_input_tokens_seen": 207054230, - "step": 9612, - "time_per_iteration": 2.6507790088653564 - }, - { - "auxiliary_loss_clip": 0.01119736, - "auxiliary_loss_mlp": 0.01030573, - "balance_loss_clip": 1.04522681, - "balance_loss_mlp": 1.01779914, - "epoch": 0.5779648278971893, - "flos": 21434361528960.0, - "grad_norm": 1.6430153650030166, - "language_loss": 0.79567391, - "learning_rate": 1.594862087742667e-06, - "loss": 0.81717706, - "num_input_tokens_seen": 207073150, - "step": 9613, - "time_per_iteration": 2.679202079772949 - }, - { - "auxiliary_loss_clip": 0.01107, - "auxiliary_loss_mlp": 0.01032074, - "balance_loss_clip": 1.04167032, - "balance_loss_mlp": 1.02013552, - "epoch": 0.5780249511498572, - "flos": 19026120228480.0, - "grad_norm": 1.7764623177151277, - "language_loss": 0.77572, - "learning_rate": 1.5944807080641863e-06, - "loss": 0.7971108, - "num_input_tokens_seen": 207090375, - "step": 9614, - "time_per_iteration": 2.6978790760040283 - }, - { - "auxiliary_loss_clip": 0.01086413, - "auxiliary_loss_mlp": 0.01033429, - "balance_loss_clip": 1.04169321, - "balance_loss_mlp": 1.020715, - "epoch": 0.5780850744025252, - "flos": 12124663752960.0, - "grad_norm": 2.2008207091737093, - "language_loss": 0.81598818, - "learning_rate": 1.5940993437630375e-06, - "loss": 0.83718669, - "num_input_tokens_seen": 207106030, - "step": 9615, - "time_per_iteration": 2.7248473167419434 - }, - { - "auxiliary_loss_clip": 0.01104516, - "auxiliary_loss_mlp": 0.01032639, - "balance_loss_clip": 1.03926682, - "balance_loss_mlp": 1.01978278, - "epoch": 0.5781451976551931, - "flos": 25044425009280.0, - "grad_norm": 1.4596798757523364, - "language_loss": 0.67086244, - "learning_rate": 1.5937179948536825e-06, - "loss": 0.69223398, - "num_input_tokens_seen": 207125435, - "step": 9616, - "time_per_iteration": 2.7597362995147705 - }, - { - "auxiliary_loss_clip": 0.01106834, - "auxiliary_loss_mlp": 0.01032261, - "balance_loss_clip": 1.04345763, - "balance_loss_mlp": 1.01935697, - "epoch": 0.5782053209078611, - "flos": 19245606284160.0, - "grad_norm": 1.6175721800228267, - "language_loss": 0.77521074, - "learning_rate": 1.5933366613505812e-06, - "loss": 0.79660165, - "num_input_tokens_seen": 207145095, - "step": 9617, - "time_per_iteration": 2.8377323150634766 - }, - { - "auxiliary_loss_clip": 0.01094943, - "auxiliary_loss_mlp": 0.01035216, - "balance_loss_clip": 1.04236281, - "balance_loss_mlp": 1.02231812, - "epoch": 0.578265444160529, - "flos": 25993831340160.0, - "grad_norm": 1.5155731004031996, - "language_loss": 0.75113726, - "learning_rate": 1.5929553432681947e-06, - "loss": 0.77243888, - "num_input_tokens_seen": 207166045, - "step": 9618, - "time_per_iteration": 2.665472984313965 - }, - { - "auxiliary_loss_clip": 0.0111694, - "auxiliary_loss_mlp": 0.01028064, - "balance_loss_clip": 1.04336691, - "balance_loss_mlp": 1.01594067, - "epoch": 0.5783255674131971, - "flos": 21798603394560.0, - "grad_norm": 2.8083861615500445, - "language_loss": 0.81775922, - "learning_rate": 1.5925740406209826e-06, - "loss": 0.83920932, - "num_input_tokens_seen": 207185290, - "step": 9619, - "time_per_iteration": 2.6156482696533203 - }, - { - "auxiliary_loss_clip": 0.01099184, - "auxiliary_loss_mlp": 0.01032562, - "balance_loss_clip": 1.04264188, - "balance_loss_mlp": 1.0207603, - "epoch": 0.578385690665865, - "flos": 24789746603520.0, - "grad_norm": 1.7083869874707343, - "language_loss": 0.72963226, - "learning_rate": 1.5921927534234039e-06, - "loss": 0.75094968, - "num_input_tokens_seen": 207205505, - "step": 9620, - "time_per_iteration": 2.7066376209259033 - }, - { - "auxiliary_loss_clip": 0.01096891, - "auxiliary_loss_mlp": 0.01030675, - "balance_loss_clip": 1.04079533, - "balance_loss_mlp": 1.01831877, - "epoch": 0.578445813918533, - "flos": 21212864311680.0, - "grad_norm": 8.221069459540734, - "language_loss": 0.76836628, - "learning_rate": 1.591811481689916e-06, - "loss": 0.78964192, - "num_input_tokens_seen": 207225315, - "step": 9621, - "time_per_iteration": 2.746229887008667 - }, - { - "auxiliary_loss_clip": 0.01054178, - "auxiliary_loss_mlp": 0.0104303, - "balance_loss_clip": 1.03465438, - "balance_loss_mlp": 1.02871835, - "epoch": 0.5785059371712009, - "flos": 25046795306880.0, - "grad_norm": 1.8397649270084009, - "language_loss": 0.70646143, - "learning_rate": 1.5914302254349787e-06, - "loss": 0.72743344, - "num_input_tokens_seen": 207247690, - "step": 9622, - "time_per_iteration": 2.7708969116210938 - }, - { - "auxiliary_loss_clip": 0.01024027, - "auxiliary_loss_mlp": 0.01003845, - "balance_loss_clip": 1.01965523, - "balance_loss_mlp": 1.00259304, - "epoch": 0.5785660604238689, - "flos": 70843172284800.0, - "grad_norm": 0.7693139889423115, - "language_loss": 0.55946988, - "learning_rate": 1.5910489846730476e-06, - "loss": 0.57974857, - "num_input_tokens_seen": 207301735, - "step": 9623, - "time_per_iteration": 3.2743892669677734 - }, - { - "auxiliary_loss_clip": 0.01084844, - "auxiliary_loss_mlp": 0.01037987, - "balance_loss_clip": 1.04244125, - "balance_loss_mlp": 1.02392614, - "epoch": 0.578626183676537, - "flos": 31649977244160.0, - "grad_norm": 2.0494784145389677, - "language_loss": 0.71381462, - "learning_rate": 1.5906677594185799e-06, - "loss": 0.73504293, - "num_input_tokens_seen": 207321240, - "step": 9624, - "time_per_iteration": 2.761348247528076 - }, - { - "auxiliary_loss_clip": 0.01084192, - "auxiliary_loss_mlp": 0.01039308, - "balance_loss_clip": 1.03928137, - "balance_loss_mlp": 1.02572453, - "epoch": 0.5786863069292049, - "flos": 21865181253120.0, - "grad_norm": 2.0143803075104687, - "language_loss": 0.82421607, - "learning_rate": 1.5902865496860322e-06, - "loss": 0.845451, - "num_input_tokens_seen": 207339540, - "step": 9625, - "time_per_iteration": 4.566919326782227 - }, - { - "auxiliary_loss_clip": 0.01116336, - "auxiliary_loss_mlp": 0.01033709, - "balance_loss_clip": 1.042328, - "balance_loss_mlp": 1.02037549, - "epoch": 0.5787464301818729, - "flos": 23364954748800.0, - "grad_norm": 1.438878240234706, - "language_loss": 0.70356315, - "learning_rate": 1.5899053554898591e-06, - "loss": 0.72506356, - "num_input_tokens_seen": 207360470, - "step": 9626, - "time_per_iteration": 2.6495361328125 - }, - { - "auxiliary_loss_clip": 0.01095761, - "auxiliary_loss_mlp": 0.01036775, - "balance_loss_clip": 1.0427779, - "balance_loss_mlp": 1.02442503, - "epoch": 0.5788065534345408, - "flos": 30004011394560.0, - "grad_norm": 1.470476031522724, - "language_loss": 0.72111934, - "learning_rate": 1.5895241768445166e-06, - "loss": 0.74244475, - "num_input_tokens_seen": 207383080, - "step": 9627, - "time_per_iteration": 2.8884880542755127 - }, - { - "auxiliary_loss_clip": 0.01104923, - "auxiliary_loss_mlp": 0.0103066, - "balance_loss_clip": 1.04045546, - "balance_loss_mlp": 1.01872754, - "epoch": 0.5788666766872088, - "flos": 24527849564160.0, - "grad_norm": 5.936898137308074, - "language_loss": 0.83902895, - "learning_rate": 1.589143013764458e-06, - "loss": 0.8603847, - "num_input_tokens_seen": 207401000, - "step": 9628, - "time_per_iteration": 2.746950626373291 - }, - { - "auxiliary_loss_clip": 0.01093971, - "auxiliary_loss_mlp": 0.01031789, - "balance_loss_clip": 1.03782499, - "balance_loss_mlp": 1.01856256, - "epoch": 0.5789267999398767, - "flos": 23732823888000.0, - "grad_norm": 1.5735702827765405, - "language_loss": 0.72260225, - "learning_rate": 1.5887618662641376e-06, - "loss": 0.74385989, - "num_input_tokens_seen": 207419230, - "step": 9629, - "time_per_iteration": 4.194722652435303 - }, - { - "auxiliary_loss_clip": 0.01096902, - "auxiliary_loss_mlp": 0.0103477, - "balance_loss_clip": 1.043715, - "balance_loss_mlp": 1.02154994, - "epoch": 0.5789869231925447, - "flos": 21135045496320.0, - "grad_norm": 2.2622526010485062, - "language_loss": 0.74250948, - "learning_rate": 1.5883807343580087e-06, - "loss": 0.76382619, - "num_input_tokens_seen": 207437615, - "step": 9630, - "time_per_iteration": 2.754213571548462 - }, - { - "auxiliary_loss_clip": 0.01083141, - "auxiliary_loss_mlp": 0.00770695, - "balance_loss_clip": 1.0400362, - "balance_loss_mlp": 1.00009274, - "epoch": 0.5790470464452127, - "flos": 21209632087680.0, - "grad_norm": 1.6843723839781237, - "language_loss": 0.78927267, - "learning_rate": 1.587999618060523e-06, - "loss": 0.8078109, - "num_input_tokens_seen": 207457270, - "step": 9631, - "time_per_iteration": 2.757955551147461 - }, - { - "auxiliary_loss_clip": 0.01116603, - "auxiliary_loss_mlp": 0.01029207, - "balance_loss_clip": 1.04169166, - "balance_loss_mlp": 1.01674962, - "epoch": 0.5791071696978807, - "flos": 23404384903680.0, - "grad_norm": 1.5220400196762927, - "language_loss": 0.75543463, - "learning_rate": 1.5876185173861333e-06, - "loss": 0.77689266, - "num_input_tokens_seen": 207477890, - "step": 9632, - "time_per_iteration": 2.5955679416656494 - }, - { - "auxiliary_loss_clip": 0.01090291, - "auxiliary_loss_mlp": 0.01030616, - "balance_loss_clip": 1.04132521, - "balance_loss_mlp": 1.01704419, - "epoch": 0.5791672929505486, - "flos": 24206521472640.0, - "grad_norm": 2.166079097569446, - "language_loss": 0.79483461, - "learning_rate": 1.5872374323492915e-06, - "loss": 0.81604362, - "num_input_tokens_seen": 207497670, - "step": 9633, - "time_per_iteration": 3.0309832096099854 - }, - { - "auxiliary_loss_clip": 0.01090489, - "auxiliary_loss_mlp": 0.0104029, - "balance_loss_clip": 1.04247785, - "balance_loss_mlp": 1.02621174, - "epoch": 0.5792274162032166, - "flos": 24348871071360.0, - "grad_norm": 1.6628345099755575, - "language_loss": 0.77489352, - "learning_rate": 1.5868563629644464e-06, - "loss": 0.79620135, - "num_input_tokens_seen": 207516105, - "step": 9634, - "time_per_iteration": 2.742804765701294 - }, - { - "auxiliary_loss_clip": 0.01103303, - "auxiliary_loss_mlp": 0.01039877, - "balance_loss_clip": 1.04325557, - "balance_loss_mlp": 1.0265131, - "epoch": 0.5792875394558845, - "flos": 20449403712000.0, - "grad_norm": 2.0206641079359695, - "language_loss": 0.63376474, - "learning_rate": 1.5864753092460502e-06, - "loss": 0.65519655, - "num_input_tokens_seen": 207533685, - "step": 9635, - "time_per_iteration": 2.758554220199585 - }, - { - "auxiliary_loss_clip": 0.01090702, - "auxiliary_loss_mlp": 0.01040857, - "balance_loss_clip": 1.0402782, - "balance_loss_mlp": 1.02797055, - "epoch": 0.5793476627085525, - "flos": 24060329118720.0, - "grad_norm": 1.4022803042470642, - "language_loss": 0.77229643, - "learning_rate": 1.5860942712085516e-06, - "loss": 0.793612, - "num_input_tokens_seen": 207552840, - "step": 9636, - "time_per_iteration": 2.6893904209136963 - }, - { - "auxiliary_loss_clip": 0.01087778, - "auxiliary_loss_mlp": 0.01033423, - "balance_loss_clip": 1.03770018, - "balance_loss_mlp": 1.02124608, - "epoch": 0.5794077859612206, - "flos": 22054287381120.0, - "grad_norm": 1.6516741793622702, - "language_loss": 0.68164212, - "learning_rate": 1.5857132488663998e-06, - "loss": 0.70285416, - "num_input_tokens_seen": 207572095, - "step": 9637, - "time_per_iteration": 2.7232043743133545 - }, - { - "auxiliary_loss_clip": 0.01076767, - "auxiliary_loss_mlp": 0.01035713, - "balance_loss_clip": 1.04049063, - "balance_loss_mlp": 1.02214098, - "epoch": 0.5794679092138885, - "flos": 11434855991040.0, - "grad_norm": 2.739438707467598, - "language_loss": 0.72531378, - "learning_rate": 1.585332242234043e-06, - "loss": 0.74643862, - "num_input_tokens_seen": 207587495, - "step": 9638, - "time_per_iteration": 2.819202423095703 - }, - { - "auxiliary_loss_clip": 0.01107966, - "auxiliary_loss_mlp": 0.0103288, - "balance_loss_clip": 1.04470587, - "balance_loss_mlp": 1.02056587, - "epoch": 0.5795280324665565, - "flos": 18880215183360.0, - "grad_norm": 1.716063507275685, - "language_loss": 0.72309893, - "learning_rate": 1.5849512513259291e-06, - "loss": 0.74450737, - "num_input_tokens_seen": 207606795, - "step": 9639, - "time_per_iteration": 2.683488130569458 - }, - { - "auxiliary_loss_clip": 0.01094721, - "auxiliary_loss_mlp": 0.01039725, - "balance_loss_clip": 1.0399698, - "balance_loss_mlp": 1.02682686, - "epoch": 0.5795881557192244, - "flos": 13005947940480.0, - "grad_norm": 1.8567608995858262, - "language_loss": 0.70044529, - "learning_rate": 1.5845702761565054e-06, - "loss": 0.72178972, - "num_input_tokens_seen": 207623620, - "step": 9640, - "time_per_iteration": 2.672945737838745 - }, - { - "auxiliary_loss_clip": 0.01096614, - "auxiliary_loss_mlp": 0.01042841, - "balance_loss_clip": 1.0413754, - "balance_loss_mlp": 1.02858996, - "epoch": 0.5796482789718924, - "flos": 19932397303680.0, - "grad_norm": 2.4123450370287958, - "language_loss": 0.7753675, - "learning_rate": 1.5841893167402183e-06, - "loss": 0.79676205, - "num_input_tokens_seen": 207639380, - "step": 9641, - "time_per_iteration": 2.688164472579956 - }, - { - "auxiliary_loss_clip": 0.01119399, - "auxiliary_loss_mlp": 0.01036698, - "balance_loss_clip": 1.04407382, - "balance_loss_mlp": 1.02385926, - "epoch": 0.5797084022245603, - "flos": 21650794928640.0, - "grad_norm": 1.8311937480298248, - "language_loss": 0.73798597, - "learning_rate": 1.5838083730915143e-06, - "loss": 0.75954694, - "num_input_tokens_seen": 207657915, - "step": 9642, - "time_per_iteration": 2.624521017074585 - }, - { - "auxiliary_loss_clip": 0.01102536, - "auxiliary_loss_mlp": 0.01038535, - "balance_loss_clip": 1.04526544, - "balance_loss_mlp": 1.02577972, - "epoch": 0.5797685254772283, - "flos": 26031573555840.0, - "grad_norm": 5.942363913556237, - "language_loss": 0.73259425, - "learning_rate": 1.5834274452248378e-06, - "loss": 0.75400496, - "num_input_tokens_seen": 207678620, - "step": 9643, - "time_per_iteration": 2.715672254562378 - }, - { - "auxiliary_loss_clip": 0.01121691, - "auxiliary_loss_mlp": 0.01033759, - "balance_loss_clip": 1.04416251, - "balance_loss_mlp": 1.02062845, - "epoch": 0.5798286487298963, - "flos": 22705167778560.0, - "grad_norm": 1.8659489070776951, - "language_loss": 0.67181957, - "learning_rate": 1.5830465331546352e-06, - "loss": 0.69337404, - "num_input_tokens_seen": 207696980, - "step": 9644, - "time_per_iteration": 2.6038551330566406 - }, - { - "auxiliary_loss_clip": 0.01116177, - "auxiliary_loss_mlp": 0.0103453, - "balance_loss_clip": 1.04553771, - "balance_loss_mlp": 1.02103531, - "epoch": 0.5798887719825643, - "flos": 23148988225920.0, - "grad_norm": 2.1679759651263044, - "language_loss": 0.85346615, - "learning_rate": 1.5826656368953496e-06, - "loss": 0.8749733, - "num_input_tokens_seen": 207714065, - "step": 9645, - "time_per_iteration": 2.667259931564331 - }, - { - "auxiliary_loss_clip": 0.01122251, - "auxiliary_loss_mlp": 0.01030168, - "balance_loss_clip": 1.04620934, - "balance_loss_mlp": 1.01735902, - "epoch": 0.5799488952352322, - "flos": 24426043441920.0, - "grad_norm": 2.1123906469300935, - "language_loss": 0.75605559, - "learning_rate": 1.5822847564614244e-06, - "loss": 0.77757978, - "num_input_tokens_seen": 207734720, - "step": 9646, - "time_per_iteration": 2.559659481048584 - }, - { - "auxiliary_loss_clip": 0.01099999, - "auxiliary_loss_mlp": 0.01037708, - "balance_loss_clip": 1.04342473, - "balance_loss_mlp": 1.02371335, - "epoch": 0.5800090184879002, - "flos": 38395903829760.0, - "grad_norm": 1.698650252646941, - "language_loss": 0.59495735, - "learning_rate": 1.5819038918673038e-06, - "loss": 0.61633444, - "num_input_tokens_seen": 207755435, - "step": 9647, - "time_per_iteration": 2.7939651012420654 - }, - { - "auxiliary_loss_clip": 0.0107788, - "auxiliary_loss_mlp": 0.0105249, - "balance_loss_clip": 1.04142165, - "balance_loss_mlp": 1.03642702, - "epoch": 0.5800691417405681, - "flos": 19784840232960.0, - "grad_norm": 1.6988187353884752, - "language_loss": 0.84499681, - "learning_rate": 1.5815230431274288e-06, - "loss": 0.86630046, - "num_input_tokens_seen": 207773570, - "step": 9648, - "time_per_iteration": 2.7750449180603027 - }, - { - "auxiliary_loss_clip": 0.01032269, - "auxiliary_loss_mlp": 0.01003411, - "balance_loss_clip": 1.01776171, - "balance_loss_mlp": 1.0021714, - "epoch": 0.5801292649932361, - "flos": 70314565783680.0, - "grad_norm": 0.8432525659417933, - "language_loss": 0.62929457, - "learning_rate": 1.581142210256242e-06, - "loss": 0.64965135, - "num_input_tokens_seen": 207830095, - "step": 9649, - "time_per_iteration": 3.21219801902771 - }, - { - "auxiliary_loss_clip": 0.01078275, - "auxiliary_loss_mlp": 0.0103905, - "balance_loss_clip": 1.03673697, - "balance_loss_mlp": 1.02525127, - "epoch": 0.5801893882459042, - "flos": 18734812928640.0, - "grad_norm": 1.587591091557097, - "language_loss": 0.82462633, - "learning_rate": 1.5807613932681857e-06, - "loss": 0.84579957, - "num_input_tokens_seen": 207848555, - "step": 9650, - "time_per_iteration": 2.8374016284942627 - }, - { - "auxiliary_loss_clip": 0.0108491, - "auxiliary_loss_mlp": 0.01036427, - "balance_loss_clip": 1.03912425, - "balance_loss_mlp": 1.0230515, - "epoch": 0.5802495114985721, - "flos": 15596507698560.0, - "grad_norm": 3.679017793776146, - "language_loss": 0.7786057, - "learning_rate": 1.580380592177698e-06, - "loss": 0.79981905, - "num_input_tokens_seen": 207867060, - "step": 9651, - "time_per_iteration": 2.728508949279785 - }, - { - "auxiliary_loss_clip": 0.01103104, - "auxiliary_loss_mlp": 0.01039294, - "balance_loss_clip": 1.04429924, - "balance_loss_mlp": 1.02555537, - "epoch": 0.5803096347512401, - "flos": 18255405081600.0, - "grad_norm": 1.8929228958840072, - "language_loss": 0.74471784, - "learning_rate": 1.5799998069992213e-06, - "loss": 0.76614177, - "num_input_tokens_seen": 207884520, - "step": 9652, - "time_per_iteration": 2.6977131366729736 - }, - { - "auxiliary_loss_clip": 0.01092621, - "auxiliary_loss_mlp": 0.01028533, - "balance_loss_clip": 1.04145324, - "balance_loss_mlp": 1.0150857, - "epoch": 0.580369758003908, - "flos": 22893160584960.0, - "grad_norm": 2.031010770866024, - "language_loss": 0.7703613, - "learning_rate": 1.579619037747193e-06, - "loss": 0.79157287, - "num_input_tokens_seen": 207905370, - "step": 9653, - "time_per_iteration": 2.7233431339263916 - }, - { - "auxiliary_loss_clip": 0.01121993, - "auxiliary_loss_mlp": 0.01034522, - "balance_loss_clip": 1.04465187, - "balance_loss_mlp": 1.02035964, - "epoch": 0.580429881256576, - "flos": 18697681244160.0, - "grad_norm": 1.9204408515131524, - "language_loss": 0.74248046, - "learning_rate": 1.5792382844360534e-06, - "loss": 0.76404566, - "num_input_tokens_seen": 207923790, - "step": 9654, - "time_per_iteration": 2.595330238342285 - }, - { - "auxiliary_loss_clip": 0.01054131, - "auxiliary_loss_mlp": 0.01037747, - "balance_loss_clip": 1.04102838, - "balance_loss_mlp": 1.02466965, - "epoch": 0.5804900045092439, - "flos": 24681978823680.0, - "grad_norm": 1.627345886244452, - "language_loss": 0.70138443, - "learning_rate": 1.5788575470802408e-06, - "loss": 0.72230321, - "num_input_tokens_seen": 207942335, - "step": 9655, - "time_per_iteration": 2.8097565174102783 - }, - { - "auxiliary_loss_clip": 0.01125048, - "auxiliary_loss_mlp": 0.01038459, - "balance_loss_clip": 1.04366922, - "balance_loss_mlp": 1.02495217, - "epoch": 0.580550127761912, - "flos": 23112790295040.0, - "grad_norm": 1.8908787804935243, - "language_loss": 0.69673449, - "learning_rate": 1.5784768256941915e-06, - "loss": 0.71836954, - "num_input_tokens_seen": 207961975, - "step": 9656, - "time_per_iteration": 2.6233110427856445 - }, - { - "auxiliary_loss_clip": 0.01107455, - "auxiliary_loss_mlp": 0.01034723, - "balance_loss_clip": 1.04619503, - "balance_loss_mlp": 1.02208686, - "epoch": 0.5806102510145799, - "flos": 18475681236480.0, - "grad_norm": 1.5577317145380594, - "language_loss": 0.71972537, - "learning_rate": 1.5780961202923433e-06, - "loss": 0.7411471, - "num_input_tokens_seen": 207979520, - "step": 9657, - "time_per_iteration": 2.616337537765503 - }, - { - "auxiliary_loss_clip": 0.01111294, - "auxiliary_loss_mlp": 0.01037621, - "balance_loss_clip": 1.04370785, - "balance_loss_mlp": 1.0237869, - "epoch": 0.5806703742672479, - "flos": 23915645136000.0, - "grad_norm": 1.9819747060784367, - "language_loss": 0.70975304, - "learning_rate": 1.5777154308891328e-06, - "loss": 0.73124212, - "num_input_tokens_seen": 207998375, - "step": 9658, - "time_per_iteration": 2.6383109092712402 - }, - { - "auxiliary_loss_clip": 0.01031383, - "auxiliary_loss_mlp": 0.01001283, - "balance_loss_clip": 1.01641989, - "balance_loss_mlp": 1.00009727, - "epoch": 0.5807304975199158, - "flos": 66311999412480.0, - "grad_norm": 0.7167527277810166, - "language_loss": 0.5357672, - "learning_rate": 1.5773347574989953e-06, - "loss": 0.55609381, - "num_input_tokens_seen": 208060605, - "step": 9659, - "time_per_iteration": 3.1848106384277344 - }, - { - "auxiliary_loss_clip": 0.0111162, - "auxiliary_loss_mlp": 0.01040087, - "balance_loss_clip": 1.04272866, - "balance_loss_mlp": 1.02638984, - "epoch": 0.5807906207725838, - "flos": 31722444933120.0, - "grad_norm": 1.8377682291636406, - "language_loss": 0.61835778, - "learning_rate": 1.576954100136366e-06, - "loss": 0.63987488, - "num_input_tokens_seen": 208080320, - "step": 9660, - "time_per_iteration": 2.7875893115997314 - }, - { - "auxiliary_loss_clip": 0.01108259, - "auxiliary_loss_mlp": 0.01035512, - "balance_loss_clip": 1.03933334, - "balance_loss_mlp": 1.02131391, - "epoch": 0.5808507440252517, - "flos": 23801161512960.0, - "grad_norm": 1.4582842247400174, - "language_loss": 0.65268373, - "learning_rate": 1.5765734588156797e-06, - "loss": 0.6741215, - "num_input_tokens_seen": 208099305, - "step": 9661, - "time_per_iteration": 2.640033721923828 - }, - { - "auxiliary_loss_clip": 0.01060469, - "auxiliary_loss_mlp": 0.01027812, - "balance_loss_clip": 1.03416336, - "balance_loss_mlp": 1.01562285, - "epoch": 0.5809108672779197, - "flos": 13698449222400.0, - "grad_norm": 13.818010552074016, - "language_loss": 0.74664855, - "learning_rate": 1.5761928335513704e-06, - "loss": 0.76753139, - "num_input_tokens_seen": 208116960, - "step": 9662, - "time_per_iteration": 2.78912091255188 - }, - { - "auxiliary_loss_clip": 0.0103935, - "auxiliary_loss_mlp": 0.01000149, - "balance_loss_clip": 1.01472378, - "balance_loss_mlp": 0.99883789, - "epoch": 0.5809709905305876, - "flos": 69134866381440.0, - "grad_norm": 0.8720581464390529, - "language_loss": 0.58341724, - "learning_rate": 1.5758122243578709e-06, - "loss": 0.60381216, - "num_input_tokens_seen": 208182190, - "step": 9663, - "time_per_iteration": 3.2206766605377197 - }, - { - "auxiliary_loss_clip": 0.01099545, - "auxiliary_loss_mlp": 0.01034444, - "balance_loss_clip": 1.04324317, - "balance_loss_mlp": 1.02127123, - "epoch": 0.5810311137832557, - "flos": 19827538525440.0, - "grad_norm": 2.2012699158511073, - "language_loss": 0.82044816, - "learning_rate": 1.5754316312496152e-06, - "loss": 0.84178805, - "num_input_tokens_seen": 208197015, - "step": 9664, - "time_per_iteration": 5.9192726612091064 - }, - { - "auxiliary_loss_clip": 0.01089768, - "auxiliary_loss_mlp": 0.00771212, - "balance_loss_clip": 1.03780138, - "balance_loss_mlp": 1.0000962, - "epoch": 0.5810912370359237, - "flos": 29238503719680.0, - "grad_norm": 4.331316838714664, - "language_loss": 0.81583905, - "learning_rate": 1.5750510542410337e-06, - "loss": 0.83444887, - "num_input_tokens_seen": 208215795, - "step": 9665, - "time_per_iteration": 2.7813103199005127 - }, - { - "auxiliary_loss_clip": 0.01104588, - "auxiliary_loss_mlp": 0.01035909, - "balance_loss_clip": 1.0461179, - "balance_loss_mlp": 1.02123475, - "epoch": 0.5811513602885916, - "flos": 22785572373120.0, - "grad_norm": 1.7229241789226792, - "language_loss": 0.81392443, - "learning_rate": 1.5746704933465599e-06, - "loss": 0.83532941, - "num_input_tokens_seen": 208234655, - "step": 9666, - "time_per_iteration": 2.7249464988708496 - }, - { - "auxiliary_loss_clip": 0.01101961, - "auxiliary_loss_mlp": 0.01035898, - "balance_loss_clip": 1.04181623, - "balance_loss_mlp": 1.02339292, - "epoch": 0.5812114835412596, - "flos": 18734346051840.0, - "grad_norm": 1.7975787773576042, - "language_loss": 0.80100554, - "learning_rate": 1.5742899485806227e-06, - "loss": 0.82238424, - "num_input_tokens_seen": 208251300, - "step": 9667, - "time_per_iteration": 2.600576639175415 - }, - { - "auxiliary_loss_clip": 0.01117108, - "auxiliary_loss_mlp": 0.01037273, - "balance_loss_clip": 1.04451418, - "balance_loss_mlp": 1.02237177, - "epoch": 0.5812716067939275, - "flos": 26431295080320.0, - "grad_norm": 1.4400303722288619, - "language_loss": 0.78809667, - "learning_rate": 1.573909419957653e-06, - "loss": 0.80964047, - "num_input_tokens_seen": 208272685, - "step": 9668, - "time_per_iteration": 4.22690486907959 - }, - { - "auxiliary_loss_clip": 0.01098312, - "auxiliary_loss_mlp": 0.01033665, - "balance_loss_clip": 1.04209864, - "balance_loss_mlp": 1.02148795, - "epoch": 0.5813317300465956, - "flos": 43397865285120.0, - "grad_norm": 1.8465293320084986, - "language_loss": 0.64245093, - "learning_rate": 1.5735289074920819e-06, - "loss": 0.66377068, - "num_input_tokens_seen": 208294315, - "step": 9669, - "time_per_iteration": 2.8652687072753906 - }, - { - "auxiliary_loss_clip": 0.01069091, - "auxiliary_loss_mlp": 0.01041038, - "balance_loss_clip": 1.03997946, - "balance_loss_mlp": 1.02672672, - "epoch": 0.5813918532992635, - "flos": 24785472885120.0, - "grad_norm": 1.4411692985545548, - "language_loss": 0.7307651, - "learning_rate": 1.5731484111983363e-06, - "loss": 0.75186646, - "num_input_tokens_seen": 208315610, - "step": 9670, - "time_per_iteration": 2.829456329345703 - }, - { - "auxiliary_loss_clip": 0.01086705, - "auxiliary_loss_mlp": 0.01034661, - "balance_loss_clip": 1.03999424, - "balance_loss_mlp": 1.02194691, - "epoch": 0.5814519765519315, - "flos": 22857357703680.0, - "grad_norm": 2.0479138475359844, - "language_loss": 0.7874738, - "learning_rate": 1.5727679310908464e-06, - "loss": 0.80868745, - "num_input_tokens_seen": 208334725, - "step": 9671, - "time_per_iteration": 2.7991318702697754 - }, - { - "auxiliary_loss_clip": 0.0107985, - "auxiliary_loss_mlp": 0.01044541, - "balance_loss_clip": 1.0416975, - "balance_loss_mlp": 1.02910936, - "epoch": 0.5815120998045994, - "flos": 24060831909120.0, - "grad_norm": 1.9838213735263186, - "language_loss": 0.61369407, - "learning_rate": 1.5723874671840399e-06, - "loss": 0.634938, - "num_input_tokens_seen": 208353825, - "step": 9672, - "time_per_iteration": 2.8498592376708984 - }, - { - "auxiliary_loss_clip": 0.01065855, - "auxiliary_loss_mlp": 0.01038617, - "balance_loss_clip": 1.04000103, - "balance_loss_mlp": 1.02496195, - "epoch": 0.5815722230572674, - "flos": 24279491952000.0, - "grad_norm": 2.0691966635939365, - "language_loss": 0.81397313, - "learning_rate": 1.572007019492342e-06, - "loss": 0.83501786, - "num_input_tokens_seen": 208374160, - "step": 9673, - "time_per_iteration": 2.8208439350128174 - }, - { - "auxiliary_loss_clip": 0.0108779, - "auxiliary_loss_mlp": 0.01038429, - "balance_loss_clip": 1.04342866, - "balance_loss_mlp": 1.0242784, - "epoch": 0.5816323463099353, - "flos": 22200371994240.0, - "grad_norm": 1.86389400550988, - "language_loss": 0.88404083, - "learning_rate": 1.5716265880301817e-06, - "loss": 0.905303, - "num_input_tokens_seen": 208392105, - "step": 9674, - "time_per_iteration": 2.7522170543670654 - }, - { - "auxiliary_loss_clip": 0.01120808, - "auxiliary_loss_mlp": 0.00770234, - "balance_loss_clip": 1.04347241, - "balance_loss_mlp": 1.00026846, - "epoch": 0.5816924695626033, - "flos": 24134448833280.0, - "grad_norm": 1.4106486697266074, - "language_loss": 0.78974068, - "learning_rate": 1.571246172811984e-06, - "loss": 0.80865109, - "num_input_tokens_seen": 208411755, - "step": 9675, - "time_per_iteration": 2.6588079929351807 - }, - { - "auxiliary_loss_clip": 0.01106314, - "auxiliary_loss_mlp": 0.01035578, - "balance_loss_clip": 1.04066849, - "balance_loss_mlp": 1.02178526, - "epoch": 0.5817525928152713, - "flos": 21324223451520.0, - "grad_norm": 1.863415006013356, - "language_loss": 0.70507479, - "learning_rate": 1.5708657738521748e-06, - "loss": 0.72649372, - "num_input_tokens_seen": 208429995, - "step": 9676, - "time_per_iteration": 2.64201283454895 - }, - { - "auxiliary_loss_clip": 0.01058756, - "auxiliary_loss_mlp": 0.01033649, - "balance_loss_clip": 1.0396111, - "balance_loss_mlp": 1.02030993, - "epoch": 0.5818127160679393, - "flos": 26934510666240.0, - "grad_norm": 2.6670948708651636, - "language_loss": 0.63821483, - "learning_rate": 1.5704853911651779e-06, - "loss": 0.65913892, - "num_input_tokens_seen": 208443655, - "step": 9677, - "time_per_iteration": 2.818047523498535 - }, - { - "auxiliary_loss_clip": 0.01020823, - "auxiliary_loss_mlp": 0.01010612, - "balance_loss_clip": 1.02114296, - "balance_loss_mlp": 1.00937831, - "epoch": 0.5818728393206073, - "flos": 63918626342400.0, - "grad_norm": 0.8047469836092298, - "language_loss": 0.54188442, - "learning_rate": 1.5701050247654182e-06, - "loss": 0.56219876, - "num_input_tokens_seen": 208498405, - "step": 9678, - "time_per_iteration": 3.2669215202331543 - }, - { - "auxiliary_loss_clip": 0.01019281, - "auxiliary_loss_mlp": 0.0100911, - "balance_loss_clip": 1.01330447, - "balance_loss_mlp": 1.00782299, - "epoch": 0.5819329625732752, - "flos": 64954108638720.0, - "grad_norm": 0.7377482843760589, - "language_loss": 0.56218177, - "learning_rate": 1.569724674667319e-06, - "loss": 0.58246571, - "num_input_tokens_seen": 208559075, - "step": 9679, - "time_per_iteration": 3.130009174346924 - }, - { - "auxiliary_loss_clip": 0.01118656, - "auxiliary_loss_mlp": 0.01031808, - "balance_loss_clip": 1.04236495, - "balance_loss_mlp": 1.01982164, - "epoch": 0.5819930858259432, - "flos": 21215270522880.0, - "grad_norm": 1.65967573029577, - "language_loss": 0.65638047, - "learning_rate": 1.5693443408853032e-06, - "loss": 0.67788512, - "num_input_tokens_seen": 208577770, - "step": 9680, - "time_per_iteration": 2.63765811920166 - }, - { - "auxiliary_loss_clip": 0.01095966, - "auxiliary_loss_mlp": 0.01030097, - "balance_loss_clip": 1.04104781, - "balance_loss_mlp": 1.01797342, - "epoch": 0.5820532090786111, - "flos": 19458520151040.0, - "grad_norm": 1.9145859585775957, - "language_loss": 0.83394265, - "learning_rate": 1.5689640234337933e-06, - "loss": 0.85520327, - "num_input_tokens_seen": 208595110, - "step": 9681, - "time_per_iteration": 2.6886913776397705 - }, - { - "auxiliary_loss_clip": 0.0112012, - "auxiliary_loss_mlp": 0.01033373, - "balance_loss_clip": 1.04263687, - "balance_loss_mlp": 1.02064157, - "epoch": 0.5821133323312792, - "flos": 17712615686400.0, - "grad_norm": 1.6180763493056738, - "language_loss": 0.76095504, - "learning_rate": 1.5685837223272109e-06, - "loss": 0.78248996, - "num_input_tokens_seen": 208612080, - "step": 9682, - "time_per_iteration": 2.616946220397949 - }, - { - "auxiliary_loss_clip": 0.01054825, - "auxiliary_loss_mlp": 0.01035748, - "balance_loss_clip": 1.03545356, - "balance_loss_mlp": 1.0205251, - "epoch": 0.5821734555839471, - "flos": 24571804832640.0, - "grad_norm": 1.897202579717977, - "language_loss": 0.7534517, - "learning_rate": 1.568203437579977e-06, - "loss": 0.77435744, - "num_input_tokens_seen": 208630235, - "step": 9683, - "time_per_iteration": 2.7519571781158447 - }, - { - "auxiliary_loss_clip": 0.01098515, - "auxiliary_loss_mlp": 0.01032751, - "balance_loss_clip": 1.04482961, - "balance_loss_mlp": 1.0191133, - "epoch": 0.5822335788366151, - "flos": 22382259488640.0, - "grad_norm": 1.7304603651050097, - "language_loss": 0.73967683, - "learning_rate": 1.5678231692065116e-06, - "loss": 0.76098949, - "num_input_tokens_seen": 208647925, - "step": 9684, - "time_per_iteration": 2.585839033126831 - }, - { - "auxiliary_loss_clip": 0.01095398, - "auxiliary_loss_mlp": 0.01040225, - "balance_loss_clip": 1.04306865, - "balance_loss_mlp": 1.02714145, - "epoch": 0.582293702089283, - "flos": 26722494639360.0, - "grad_norm": 1.9911340281622987, - "language_loss": 0.78017914, - "learning_rate": 1.5674429172212348e-06, - "loss": 0.80153537, - "num_input_tokens_seen": 208666180, - "step": 9685, - "time_per_iteration": 2.6262004375457764 - }, - { - "auxiliary_loss_clip": 0.01119541, - "auxiliary_loss_mlp": 0.01037721, - "balance_loss_clip": 1.04301238, - "balance_loss_mlp": 1.02463138, - "epoch": 0.582353825341951, - "flos": 17348661129600.0, - "grad_norm": 1.534499166945951, - "language_loss": 0.75514185, - "learning_rate": 1.5670626816385667e-06, - "loss": 0.7767145, - "num_input_tokens_seen": 208684240, - "step": 9686, - "time_per_iteration": 2.4799644947052 - }, - { - "auxiliary_loss_clip": 0.01029752, - "auxiliary_loss_mlp": 0.00999968, - "balance_loss_clip": 1.01506877, - "balance_loss_mlp": 0.99893057, - "epoch": 0.5824139485946189, - "flos": 55473261534720.0, - "grad_norm": 0.8130045203422185, - "language_loss": 0.57394326, - "learning_rate": 1.5666824624729244e-06, - "loss": 0.59424043, - "num_input_tokens_seen": 208736090, - "step": 9687, - "time_per_iteration": 2.9722440242767334 - }, - { - "auxiliary_loss_clip": 0.01079028, - "auxiliary_loss_mlp": 0.01038321, - "balance_loss_clip": 1.03950655, - "balance_loss_mlp": 1.02262747, - "epoch": 0.582474071847287, - "flos": 20303031790080.0, - "grad_norm": 1.7516030258378996, - "language_loss": 0.70063931, - "learning_rate": 1.566302259738727e-06, - "loss": 0.72181278, - "num_input_tokens_seen": 208754600, - "step": 9688, - "time_per_iteration": 2.802976369857788 - }, - { - "auxiliary_loss_clip": 0.01110989, - "auxiliary_loss_mlp": 0.01033418, - "balance_loss_clip": 1.04311526, - "balance_loss_mlp": 1.02075768, - "epoch": 0.5825341950999549, - "flos": 23878010661120.0, - "grad_norm": 2.126323858989827, - "language_loss": 0.65013343, - "learning_rate": 1.5659220734503918e-06, - "loss": 0.67157751, - "num_input_tokens_seen": 208773140, - "step": 9689, - "time_per_iteration": 2.6299288272857666 - }, - { - "auxiliary_loss_clip": 0.01095981, - "auxiliary_loss_mlp": 0.00770437, - "balance_loss_clip": 1.04142618, - "balance_loss_mlp": 1.00009274, - "epoch": 0.5825943183526229, - "flos": 23113041690240.0, - "grad_norm": 1.599269729220552, - "language_loss": 0.7352339, - "learning_rate": 1.5655419036223341e-06, - "loss": 0.75389808, - "num_input_tokens_seen": 208793410, - "step": 9690, - "time_per_iteration": 2.6903798580169678 - }, - { - "auxiliary_loss_clip": 0.01096107, - "auxiliary_loss_mlp": 0.01038726, - "balance_loss_clip": 1.03903055, - "balance_loss_mlp": 1.02372348, - "epoch": 0.5826544416052909, - "flos": 22857429530880.0, - "grad_norm": 1.61399606195473, - "language_loss": 0.75654376, - "learning_rate": 1.5651617502689717e-06, - "loss": 0.77789205, - "num_input_tokens_seen": 208811920, - "step": 9691, - "time_per_iteration": 2.7056210041046143 - }, - { - "auxiliary_loss_clip": 0.01109061, - "auxiliary_loss_mlp": 0.01032641, - "balance_loss_clip": 1.04082966, - "balance_loss_mlp": 1.01972461, - "epoch": 0.5827145648579588, - "flos": 31501845555840.0, - "grad_norm": 2.2562223304416755, - "language_loss": 0.80682158, - "learning_rate": 1.5647816134047184e-06, - "loss": 0.82823855, - "num_input_tokens_seen": 208834720, - "step": 9692, - "time_per_iteration": 2.7577641010284424 - }, - { - "auxiliary_loss_clip": 0.01028968, - "auxiliary_loss_mlp": 0.01002786, - "balance_loss_clip": 1.01420581, - "balance_loss_mlp": 1.00161159, - "epoch": 0.5827746881106268, - "flos": 69811817074560.0, - "grad_norm": 0.7560919402716259, - "language_loss": 0.5693723, - "learning_rate": 1.5644014930439907e-06, - "loss": 0.58968985, - "num_input_tokens_seen": 208898415, - "step": 9693, - "time_per_iteration": 3.145176887512207 - }, - { - "auxiliary_loss_clip": 0.01105496, - "auxiliary_loss_mlp": 0.0076985, - "balance_loss_clip": 1.04020321, - "balance_loss_mlp": 1.00010538, - "epoch": 0.5828348113632947, - "flos": 23112395245440.0, - "grad_norm": 2.61225629767126, - "language_loss": 0.79375291, - "learning_rate": 1.5640213892012025e-06, - "loss": 0.81250644, - "num_input_tokens_seen": 208919045, - "step": 9694, - "time_per_iteration": 2.7443995475769043 - }, - { - "auxiliary_loss_clip": 0.01083069, - "auxiliary_loss_mlp": 0.01042673, - "balance_loss_clip": 1.03822398, - "balance_loss_mlp": 1.02909541, - "epoch": 0.5828949346159628, - "flos": 21873082245120.0, - "grad_norm": 1.4254101237523094, - "language_loss": 0.76205015, - "learning_rate": 1.5636413018907656e-06, - "loss": 0.78330755, - "num_input_tokens_seen": 208939375, - "step": 9695, - "time_per_iteration": 2.688107490539551 - }, - { - "auxiliary_loss_clip": 0.01027446, - "auxiliary_loss_mlp": 0.01003052, - "balance_loss_clip": 1.01271224, - "balance_loss_mlp": 1.00191391, - "epoch": 0.5829550578686307, - "flos": 65962553950080.0, - "grad_norm": 0.7742487055111029, - "language_loss": 0.54982823, - "learning_rate": 1.563261231127095e-06, - "loss": 0.57013327, - "num_input_tokens_seen": 209004760, - "step": 9696, - "time_per_iteration": 3.239593029022217 - }, - { - "auxiliary_loss_clip": 0.0108245, - "auxiliary_loss_mlp": 0.01030212, - "balance_loss_clip": 1.04170382, - "balance_loss_mlp": 1.01751041, - "epoch": 0.5830151811212987, - "flos": 16289799079680.0, - "grad_norm": 2.124266497676036, - "language_loss": 0.76664579, - "learning_rate": 1.5628811769246021e-06, - "loss": 0.78777242, - "num_input_tokens_seen": 209022930, - "step": 9697, - "time_per_iteration": 2.6790308952331543 - }, - { - "auxiliary_loss_clip": 0.01121339, - "auxiliary_loss_mlp": 0.01035657, - "balance_loss_clip": 1.04233479, - "balance_loss_mlp": 1.02154899, - "epoch": 0.5830753043739666, - "flos": 24168851084160.0, - "grad_norm": 1.5579611092820027, - "language_loss": 0.77714729, - "learning_rate": 1.5625011392976991e-06, - "loss": 0.79871726, - "num_input_tokens_seen": 209043740, - "step": 9698, - "time_per_iteration": 2.635885715484619 - }, - { - "auxiliary_loss_clip": 0.01079274, - "auxiliary_loss_mlp": 0.01038337, - "balance_loss_clip": 1.0413661, - "balance_loss_mlp": 1.02498519, - "epoch": 0.5831354276266346, - "flos": 27059050097280.0, - "grad_norm": 1.5784163010462595, - "language_loss": 0.84167337, - "learning_rate": 1.5621211182607966e-06, - "loss": 0.86284947, - "num_input_tokens_seen": 209068885, - "step": 9699, - "time_per_iteration": 2.8312487602233887 - }, - { - "auxiliary_loss_clip": 0.01095092, - "auxiliary_loss_mlp": 0.010366, - "balance_loss_clip": 1.03954756, - "balance_loss_mlp": 1.02281952, - "epoch": 0.5831955508793025, - "flos": 23623475909760.0, - "grad_norm": 2.065302984121428, - "language_loss": 0.65489984, - "learning_rate": 1.561741113828305e-06, - "loss": 0.67621672, - "num_input_tokens_seen": 209087340, - "step": 9700, - "time_per_iteration": 2.784442901611328 - }, - { - "auxiliary_loss_clip": 0.01108875, - "auxiliary_loss_mlp": 0.01034575, - "balance_loss_clip": 1.04089403, - "balance_loss_mlp": 1.02150953, - "epoch": 0.5832556741319705, - "flos": 24973250209920.0, - "grad_norm": 1.5991522353668115, - "language_loss": 0.71547067, - "learning_rate": 1.5613611260146344e-06, - "loss": 0.73690522, - "num_input_tokens_seen": 209108840, - "step": 9701, - "time_per_iteration": 2.6895313262939453 - }, - { - "auxiliary_loss_clip": 0.01096283, - "auxiliary_loss_mlp": 0.01041435, - "balance_loss_clip": 1.04180253, - "balance_loss_mlp": 1.02841139, - "epoch": 0.5833157973846385, - "flos": 23221563655680.0, - "grad_norm": 1.6635802287235106, - "language_loss": 0.85541105, - "learning_rate": 1.5609811548341936e-06, - "loss": 0.87678826, - "num_input_tokens_seen": 209127985, - "step": 9702, - "time_per_iteration": 2.6746225357055664 - }, - { - "auxiliary_loss_clip": 0.01102319, - "auxiliary_loss_mlp": 0.01033634, - "balance_loss_clip": 1.04071856, - "balance_loss_mlp": 1.02131367, - "epoch": 0.5833759206373065, - "flos": 21977941023360.0, - "grad_norm": 1.4183987857502756, - "language_loss": 0.77847046, - "learning_rate": 1.560601200301392e-06, - "loss": 0.79983002, - "num_input_tokens_seen": 209146885, - "step": 9703, - "time_per_iteration": 4.3035502433776855 - }, - { - "auxiliary_loss_clip": 0.01122779, - "auxiliary_loss_mlp": 0.01034804, - "balance_loss_clip": 1.04359257, - "balance_loss_mlp": 1.0208385, - "epoch": 0.5834360438899745, - "flos": 21762405463680.0, - "grad_norm": 1.8064531110729998, - "language_loss": 0.71067387, - "learning_rate": 1.5602212624306366e-06, - "loss": 0.73224974, - "num_input_tokens_seen": 209166130, - "step": 9704, - "time_per_iteration": 4.107022762298584 - }, - { - "auxiliary_loss_clip": 0.01094563, - "auxiliary_loss_mlp": 0.01038062, - "balance_loss_clip": 1.04187346, - "balance_loss_mlp": 1.02561641, - "epoch": 0.5834961671426424, - "flos": 15992566035840.0, - "grad_norm": 1.6675564380890735, - "language_loss": 0.81363106, - "learning_rate": 1.559841341236335e-06, - "loss": 0.8349573, - "num_input_tokens_seen": 209183350, - "step": 9705, - "time_per_iteration": 2.7058465480804443 - }, - { - "auxiliary_loss_clip": 0.010702, - "auxiliary_loss_mlp": 0.01034129, - "balance_loss_clip": 1.03672004, - "balance_loss_mlp": 1.02125466, - "epoch": 0.5835562903953104, - "flos": 22818322598400.0, - "grad_norm": 1.7137147806220967, - "language_loss": 0.80614948, - "learning_rate": 1.5594614367328937e-06, - "loss": 0.82719278, - "num_input_tokens_seen": 209203945, - "step": 9706, - "time_per_iteration": 2.776280164718628 - }, - { - "auxiliary_loss_clip": 0.01105997, - "auxiliary_loss_mlp": 0.0103669, - "balance_loss_clip": 1.04129124, - "balance_loss_mlp": 1.02315402, - "epoch": 0.5836164136479783, - "flos": 48468056624640.0, - "grad_norm": 2.0771057832537414, - "language_loss": 0.74647468, - "learning_rate": 1.5590815489347187e-06, - "loss": 0.76790154, - "num_input_tokens_seen": 209227080, - "step": 9707, - "time_per_iteration": 2.857609272003174 - }, - { - "auxiliary_loss_clip": 0.01081909, - "auxiliary_loss_mlp": 0.01031553, - "balance_loss_clip": 1.03649998, - "balance_loss_mlp": 1.01878548, - "epoch": 0.5836765369006464, - "flos": 26905998245760.0, - "grad_norm": 2.7159127892637067, - "language_loss": 0.81819087, - "learning_rate": 1.5587016778562163e-06, - "loss": 0.83932543, - "num_input_tokens_seen": 209248170, - "step": 9708, - "time_per_iteration": 4.28432822227478 - }, - { - "auxiliary_loss_clip": 0.01102304, - "auxiliary_loss_mlp": 0.01032201, - "balance_loss_clip": 1.0439347, - "balance_loss_mlp": 1.01914191, - "epoch": 0.5837366601533143, - "flos": 20084048524800.0, - "grad_norm": 1.4146539482815383, - "language_loss": 0.78367102, - "learning_rate": 1.5583218235117896e-06, - "loss": 0.80501604, - "num_input_tokens_seen": 209267730, - "step": 9709, - "time_per_iteration": 2.6337647438049316 - }, - { - "auxiliary_loss_clip": 0.01017869, - "auxiliary_loss_mlp": 0.00999553, - "balance_loss_clip": 1.01163578, - "balance_loss_mlp": 0.99844998, - "epoch": 0.5837967834059823, - "flos": 65363885971200.0, - "grad_norm": 0.7723563596720286, - "language_loss": 0.5654794, - "learning_rate": 1.557941985915844e-06, - "loss": 0.58565366, - "num_input_tokens_seen": 209332510, - "step": 9710, - "time_per_iteration": 3.255643844604492 - }, - { - "auxiliary_loss_clip": 0.01084064, - "auxiliary_loss_mlp": 0.01035883, - "balance_loss_clip": 1.03939962, - "balance_loss_mlp": 1.02429581, - "epoch": 0.5838569066586502, - "flos": 25338641310720.0, - "grad_norm": 1.5220841159249796, - "language_loss": 0.6560964, - "learning_rate": 1.5575621650827833e-06, - "loss": 0.67729586, - "num_input_tokens_seen": 209353355, - "step": 9711, - "time_per_iteration": 2.7771286964416504 - }, - { - "auxiliary_loss_clip": 0.01124372, - "auxiliary_loss_mlp": 0.01037032, - "balance_loss_clip": 1.04342008, - "balance_loss_mlp": 1.02279854, - "epoch": 0.5839170299113182, - "flos": 22229243550720.0, - "grad_norm": 1.6457925309868888, - "language_loss": 0.78601259, - "learning_rate": 1.5571823610270085e-06, - "loss": 0.80762661, - "num_input_tokens_seen": 209370960, - "step": 9712, - "time_per_iteration": 2.6130564212799072 - }, - { - "auxiliary_loss_clip": 0.01079932, - "auxiliary_loss_mlp": 0.0077171, - "balance_loss_clip": 1.03610897, - "balance_loss_mlp": 1.00007439, - "epoch": 0.5839771531639861, - "flos": 22200012858240.0, - "grad_norm": 1.6123088749448828, - "language_loss": 0.73624194, - "learning_rate": 1.5568025737629234e-06, - "loss": 0.75475836, - "num_input_tokens_seen": 209390955, - "step": 9713, - "time_per_iteration": 2.752688407897949 - }, - { - "auxiliary_loss_clip": 0.01098855, - "auxiliary_loss_mlp": 0.0103448, - "balance_loss_clip": 1.03949571, - "balance_loss_mlp": 1.02000761, - "epoch": 0.5840372764166541, - "flos": 22419355259520.0, - "grad_norm": 2.057640389539287, - "language_loss": 0.69393289, - "learning_rate": 1.5564228033049292e-06, - "loss": 0.71526623, - "num_input_tokens_seen": 209410260, - "step": 9714, - "time_per_iteration": 2.697676181793213 - }, - { - "auxiliary_loss_clip": 0.01118564, - "auxiliary_loss_mlp": 0.01037008, - "balance_loss_clip": 1.04040492, - "balance_loss_mlp": 1.02368677, - "epoch": 0.5840973996693221, - "flos": 19828256797440.0, - "grad_norm": 1.733937894535342, - "language_loss": 0.80418617, - "learning_rate": 1.5560430496674268e-06, - "loss": 0.82574189, - "num_input_tokens_seen": 209429920, - "step": 9715, - "time_per_iteration": 2.5865848064422607 - }, - { - "auxiliary_loss_clip": 0.01094879, - "auxiliary_loss_mlp": 0.0103561, - "balance_loss_clip": 1.03690863, - "balance_loss_mlp": 1.02182388, - "epoch": 0.5841575229219901, - "flos": 21142982401920.0, - "grad_norm": 2.4772648960449586, - "language_loss": 0.72541732, - "learning_rate": 1.5556633128648167e-06, - "loss": 0.74672222, - "num_input_tokens_seen": 209449470, - "step": 9716, - "time_per_iteration": 2.760240077972412 - }, - { - "auxiliary_loss_clip": 0.01088946, - "auxiliary_loss_mlp": 0.01033627, - "balance_loss_clip": 1.03793585, - "balance_loss_mlp": 1.02124131, - "epoch": 0.5842176461746581, - "flos": 24640322025600.0, - "grad_norm": 1.7815945401286815, - "language_loss": 0.75058079, - "learning_rate": 1.5552835929114976e-06, - "loss": 0.7718066, - "num_input_tokens_seen": 209467695, - "step": 9717, - "time_per_iteration": 2.7470862865448 - }, - { - "auxiliary_loss_clip": 0.01109202, - "auxiliary_loss_mlp": 0.01038785, - "balance_loss_clip": 1.04155052, - "balance_loss_mlp": 1.02575004, - "epoch": 0.584277769427326, - "flos": 19131158574720.0, - "grad_norm": 3.2108802254609827, - "language_loss": 0.79614913, - "learning_rate": 1.5549038898218697e-06, - "loss": 0.81762898, - "num_input_tokens_seen": 209484250, - "step": 9718, - "time_per_iteration": 2.6843111515045166 - }, - { - "auxiliary_loss_clip": 0.01094695, - "auxiliary_loss_mlp": 0.01032977, - "balance_loss_clip": 1.03992128, - "balance_loss_mlp": 1.01880288, - "epoch": 0.584337892679994, - "flos": 22675111073280.0, - "grad_norm": 1.6948464280827684, - "language_loss": 0.67670137, - "learning_rate": 1.5545242036103306e-06, - "loss": 0.69797808, - "num_input_tokens_seen": 209502830, - "step": 9719, - "time_per_iteration": 2.658722400665283 - }, - { - "auxiliary_loss_clip": 0.01119777, - "auxiliary_loss_mlp": 0.01038004, - "balance_loss_clip": 1.04168653, - "balance_loss_mlp": 1.02466464, - "epoch": 0.5843980159326619, - "flos": 31284083352960.0, - "grad_norm": 1.997670996956063, - "language_loss": 0.75795102, - "learning_rate": 1.5541445342912786e-06, - "loss": 0.77952886, - "num_input_tokens_seen": 209525995, - "step": 9720, - "time_per_iteration": 2.6901891231536865 - }, - { - "auxiliary_loss_clip": 0.01082891, - "auxiliary_loss_mlp": 0.01039482, - "balance_loss_clip": 1.04280281, - "balance_loss_mlp": 1.02657783, - "epoch": 0.58445813918533, - "flos": 22748117466240.0, - "grad_norm": 1.7155190503214905, - "language_loss": 0.83123529, - "learning_rate": 1.5537648818791105e-06, - "loss": 0.85245907, - "num_input_tokens_seen": 209545895, - "step": 9721, - "time_per_iteration": 2.71907639503479 - }, - { - "auxiliary_loss_clip": 0.01037273, - "auxiliary_loss_mlp": 0.01006637, - "balance_loss_clip": 1.01290512, - "balance_loss_mlp": 1.00543344, - "epoch": 0.5845182624379979, - "flos": 60686556658560.0, - "grad_norm": 0.9400176499911559, - "language_loss": 0.7134223, - "learning_rate": 1.5533852463882226e-06, - "loss": 0.73386145, - "num_input_tokens_seen": 209602315, - "step": 9722, - "time_per_iteration": 3.1959645748138428 - }, - { - "auxiliary_loss_clip": 0.01099534, - "auxiliary_loss_mlp": 0.01040774, - "balance_loss_clip": 1.03890538, - "balance_loss_mlp": 1.02751184, - "epoch": 0.5845783856906659, - "flos": 16362446336640.0, - "grad_norm": 1.9834511811038693, - "language_loss": 0.89731622, - "learning_rate": 1.5530056278330113e-06, - "loss": 0.91871929, - "num_input_tokens_seen": 209617615, - "step": 9723, - "time_per_iteration": 2.592627763748169 - }, - { - "auxiliary_loss_clip": 0.01094383, - "auxiliary_loss_mlp": 0.01038255, - "balance_loss_clip": 1.04275918, - "balance_loss_mlp": 1.02554142, - "epoch": 0.5846385089433338, - "flos": 20083402080000.0, - "grad_norm": 1.398468813522248, - "language_loss": 0.68486446, - "learning_rate": 1.5526260262278709e-06, - "loss": 0.70619082, - "num_input_tokens_seen": 209637005, - "step": 9724, - "time_per_iteration": 2.655640125274658 - }, - { - "auxiliary_loss_clip": 0.01110347, - "auxiliary_loss_mlp": 0.01036604, - "balance_loss_clip": 1.04291487, - "balance_loss_mlp": 1.02341366, - "epoch": 0.5846986321960018, - "flos": 17311062568320.0, - "grad_norm": 1.717409456716096, - "language_loss": 0.86049938, - "learning_rate": 1.552246441587197e-06, - "loss": 0.88196886, - "num_input_tokens_seen": 209653170, - "step": 9725, - "time_per_iteration": 2.6035261154174805 - }, - { - "auxiliary_loss_clip": 0.01095255, - "auxiliary_loss_mlp": 0.010422, - "balance_loss_clip": 1.04249406, - "balance_loss_mlp": 1.02926588, - "epoch": 0.5847587554486697, - "flos": 17197907748480.0, - "grad_norm": 1.6193535846243259, - "language_loss": 0.82923484, - "learning_rate": 1.5518668739253821e-06, - "loss": 0.85060942, - "num_input_tokens_seen": 209671275, - "step": 9726, - "time_per_iteration": 2.655017137527466 - }, - { - "auxiliary_loss_clip": 0.01055108, - "auxiliary_loss_mlp": 0.00770936, - "balance_loss_clip": 1.03983736, - "balance_loss_mlp": 1.00008965, - "epoch": 0.5848188787013378, - "flos": 24529106540160.0, - "grad_norm": 1.736262693329601, - "language_loss": 0.66609311, - "learning_rate": 1.5514873232568206e-06, - "loss": 0.68435353, - "num_input_tokens_seen": 209690380, - "step": 9727, - "time_per_iteration": 2.820906639099121 - }, - { - "auxiliary_loss_clip": 0.01083507, - "auxiliary_loss_mlp": 0.01045274, - "balance_loss_clip": 1.03799105, - "balance_loss_mlp": 1.03056347, - "epoch": 0.5848790019540057, - "flos": 20628382204800.0, - "grad_norm": 1.7999573427153348, - "language_loss": 0.81628853, - "learning_rate": 1.5511077895959055e-06, - "loss": 0.83757633, - "num_input_tokens_seen": 209708845, - "step": 9728, - "time_per_iteration": 2.7597923278808594 - }, - { - "auxiliary_loss_clip": 0.01103874, - "auxiliary_loss_mlp": 0.01042076, - "balance_loss_clip": 1.03965843, - "balance_loss_mlp": 1.0296309, - "epoch": 0.5849391252066737, - "flos": 22418852469120.0, - "grad_norm": 2.078641796720901, - "language_loss": 0.77696002, - "learning_rate": 1.550728272957027e-06, - "loss": 0.79841954, - "num_input_tokens_seen": 209729000, - "step": 9729, - "time_per_iteration": 2.663864850997925 - }, - { - "auxiliary_loss_clip": 0.01102359, - "auxiliary_loss_mlp": 0.0103712, - "balance_loss_clip": 1.03954148, - "balance_loss_mlp": 1.022475, - "epoch": 0.5849992484593417, - "flos": 25410929431680.0, - "grad_norm": 1.8450519403802392, - "language_loss": 0.70192915, - "learning_rate": 1.5503487733545782e-06, - "loss": 0.72332394, - "num_input_tokens_seen": 209747435, - "step": 9730, - "time_per_iteration": 2.6668407917022705 - }, - { - "auxiliary_loss_clip": 0.01124849, - "auxiliary_loss_mlp": 0.01036601, - "balance_loss_clip": 1.04504502, - "balance_loss_mlp": 1.02224803, - "epoch": 0.5850593717120096, - "flos": 21065163586560.0, - "grad_norm": 1.6923527463370078, - "language_loss": 0.78973091, - "learning_rate": 1.5499692908029482e-06, - "loss": 0.81134546, - "num_input_tokens_seen": 209764910, - "step": 9731, - "time_per_iteration": 2.6093108654022217 - }, - { - "auxiliary_loss_clip": 0.01103256, - "auxiliary_loss_mlp": 0.01046113, - "balance_loss_clip": 1.04004776, - "balance_loss_mlp": 1.03114593, - "epoch": 0.5851194949646776, - "flos": 25301545539840.0, - "grad_norm": 2.322897025480009, - "language_loss": 0.70276213, - "learning_rate": 1.549589825316528e-06, - "loss": 0.7242558, - "num_input_tokens_seen": 209786115, - "step": 9732, - "time_per_iteration": 2.6483914852142334 - }, - { - "auxiliary_loss_clip": 0.01068434, - "auxiliary_loss_mlp": 0.01041994, - "balance_loss_clip": 1.03862739, - "balance_loss_mlp": 1.02584136, - "epoch": 0.5851796182173455, - "flos": 23587242065280.0, - "grad_norm": 1.8361177860467572, - "language_loss": 0.53096974, - "learning_rate": 1.5492103769097075e-06, - "loss": 0.55207402, - "num_input_tokens_seen": 209806095, - "step": 9733, - "time_per_iteration": 2.7837493419647217 - }, - { - "auxiliary_loss_clip": 0.0110623, - "auxiliary_loss_mlp": 0.01037809, - "balance_loss_clip": 1.04327631, - "balance_loss_mlp": 1.023206, - "epoch": 0.5852397414700136, - "flos": 24822712310400.0, - "grad_norm": 2.1555850580582945, - "language_loss": 0.87172639, - "learning_rate": 1.5488309455968739e-06, - "loss": 0.89316678, - "num_input_tokens_seen": 209823650, - "step": 9734, - "time_per_iteration": 2.647822618484497 - }, - { - "auxiliary_loss_clip": 0.0109023, - "auxiliary_loss_mlp": 0.01035437, - "balance_loss_clip": 1.03915906, - "balance_loss_mlp": 1.02305174, - "epoch": 0.5852998647226815, - "flos": 19937784343680.0, - "grad_norm": 1.6523754491187739, - "language_loss": 0.72117126, - "learning_rate": 1.5484515313924163e-06, - "loss": 0.74242795, - "num_input_tokens_seen": 209843220, - "step": 9735, - "time_per_iteration": 2.6707499027252197 - }, - { - "auxiliary_loss_clip": 0.01111823, - "auxiliary_loss_mlp": 0.01038537, - "balance_loss_clip": 1.04385519, - "balance_loss_mlp": 1.02448797, - "epoch": 0.5853599879753495, - "flos": 16720367408640.0, - "grad_norm": 5.660280505854459, - "language_loss": 0.74303764, - "learning_rate": 1.5480721343107217e-06, - "loss": 0.76454127, - "num_input_tokens_seen": 209854880, - "step": 9736, - "time_per_iteration": 2.6474769115448 - }, - { - "auxiliary_loss_clip": 0.01084732, - "auxiliary_loss_mlp": 0.01038896, - "balance_loss_clip": 1.03950977, - "balance_loss_mlp": 1.0241437, - "epoch": 0.5854201112280174, - "flos": 44456583680640.0, - "grad_norm": 1.705724680337342, - "language_loss": 0.7066859, - "learning_rate": 1.5476927543661772e-06, - "loss": 0.72792208, - "num_input_tokens_seen": 209877870, - "step": 9737, - "time_per_iteration": 2.8703529834747314 - }, - { - "auxiliary_loss_clip": 0.01079098, - "auxiliary_loss_mlp": 0.01042352, - "balance_loss_clip": 1.03875983, - "balance_loss_mlp": 1.02830887, - "epoch": 0.5854802344806854, - "flos": 20339193807360.0, - "grad_norm": 1.7465210824086157, - "language_loss": 0.82571793, - "learning_rate": 1.547313391573169e-06, - "loss": 0.84693247, - "num_input_tokens_seen": 209896690, - "step": 9738, - "time_per_iteration": 2.6930525302886963 - }, - { - "auxiliary_loss_clip": 0.01123353, - "auxiliary_loss_mlp": 0.00771973, - "balance_loss_clip": 1.04294574, - "balance_loss_mlp": 1.00014758, - "epoch": 0.5855403577333533, - "flos": 20921054221440.0, - "grad_norm": 1.6403149295747592, - "language_loss": 0.68084544, - "learning_rate": 1.546934045946082e-06, - "loss": 0.6997987, - "num_input_tokens_seen": 209914640, - "step": 9739, - "time_per_iteration": 2.6120223999023438 - }, - { - "auxiliary_loss_clip": 0.01122823, - "auxiliary_loss_mlp": 0.01028069, - "balance_loss_clip": 1.04343581, - "balance_loss_mlp": 1.01383555, - "epoch": 0.5856004809860214, - "flos": 20448649526400.0, - "grad_norm": 2.346965983276941, - "language_loss": 0.5878849, - "learning_rate": 1.5465547174993017e-06, - "loss": 0.60939384, - "num_input_tokens_seen": 209933375, - "step": 9740, - "time_per_iteration": 2.6393442153930664 - }, - { - "auxiliary_loss_clip": 0.01091861, - "auxiliary_loss_mlp": 0.01034284, - "balance_loss_clip": 1.03964174, - "balance_loss_mlp": 1.01996112, - "epoch": 0.5856606042386893, - "flos": 19640766781440.0, - "grad_norm": 1.8171598434150709, - "language_loss": 0.75508714, - "learning_rate": 1.5461754062472113e-06, - "loss": 0.77634859, - "num_input_tokens_seen": 209952055, - "step": 9741, - "time_per_iteration": 2.6550915241241455 - }, - { - "auxiliary_loss_clip": 0.01085436, - "auxiliary_loss_mlp": 0.01034709, - "balance_loss_clip": 1.03900838, - "balance_loss_mlp": 1.02109587, - "epoch": 0.5857207274913573, - "flos": 21686166846720.0, - "grad_norm": 1.6487285096737663, - "language_loss": 0.75935274, - "learning_rate": 1.5457961122041959e-06, - "loss": 0.78055418, - "num_input_tokens_seen": 209971190, - "step": 9742, - "time_per_iteration": 4.381955146789551 - }, - { - "auxiliary_loss_clip": 0.01098042, - "auxiliary_loss_mlp": 0.01033792, - "balance_loss_clip": 1.04340363, - "balance_loss_mlp": 1.0209775, - "epoch": 0.5857808507440253, - "flos": 23182708118400.0, - "grad_norm": 1.6035533638401356, - "language_loss": 0.74864548, - "learning_rate": 1.5454168353846369e-06, - "loss": 0.76996386, - "num_input_tokens_seen": 209990695, - "step": 9743, - "time_per_iteration": 5.72803258895874 - }, - { - "auxiliary_loss_clip": 0.01098389, - "auxiliary_loss_mlp": 0.01032176, - "balance_loss_clip": 1.04424453, - "balance_loss_mlp": 1.01949835, - "epoch": 0.5858409739966932, - "flos": 27235299156480.0, - "grad_norm": 1.98808093933083, - "language_loss": 0.81046313, - "learning_rate": 1.5450375758029172e-06, - "loss": 0.83176875, - "num_input_tokens_seen": 210010210, - "step": 9744, - "time_per_iteration": 2.7265267372131348 - }, - { - "auxiliary_loss_clip": 0.01094798, - "auxiliary_loss_mlp": 0.01030607, - "balance_loss_clip": 1.04087067, - "balance_loss_mlp": 1.01669562, - "epoch": 0.5859010972493612, - "flos": 27855512317440.0, - "grad_norm": 1.7065591540492446, - "language_loss": 0.71426034, - "learning_rate": 1.5446583334734183e-06, - "loss": 0.73551434, - "num_input_tokens_seen": 210030030, - "step": 9745, - "time_per_iteration": 2.737842082977295 - }, - { - "auxiliary_loss_clip": 0.01023206, - "auxiliary_loss_mlp": 0.01004158, - "balance_loss_clip": 1.01973987, - "balance_loss_mlp": 1.00301957, - "epoch": 0.5859612205020291, - "flos": 70007064428160.0, - "grad_norm": 0.7272764484566879, - "language_loss": 0.53267932, - "learning_rate": 1.5442791084105204e-06, - "loss": 0.552953, - "num_input_tokens_seen": 210094840, - "step": 9746, - "time_per_iteration": 3.3027215003967285 - }, - { - "auxiliary_loss_clip": 0.01094571, - "auxiliary_loss_mlp": 0.01035687, - "balance_loss_clip": 1.04237437, - "balance_loss_mlp": 1.02163196, - "epoch": 0.5860213437546972, - "flos": 24056019486720.0, - "grad_norm": 2.0261235602549466, - "language_loss": 0.73138428, - "learning_rate": 1.5438999006286054e-06, - "loss": 0.75268686, - "num_input_tokens_seen": 210114660, - "step": 9747, - "time_per_iteration": 4.224852085113525 - }, - { - "auxiliary_loss_clip": 0.01092652, - "auxiliary_loss_mlp": 0.01046673, - "balance_loss_clip": 1.03909874, - "balance_loss_mlp": 1.03123569, - "epoch": 0.5860814670073651, - "flos": 18947583141120.0, - "grad_norm": 1.867050340664373, - "language_loss": 0.81183696, - "learning_rate": 1.543520710142051e-06, - "loss": 0.83323026, - "num_input_tokens_seen": 210132770, - "step": 9748, - "time_per_iteration": 2.6568126678466797 - }, - { - "auxiliary_loss_clip": 0.01111974, - "auxiliary_loss_mlp": 0.01038317, - "balance_loss_clip": 1.04387689, - "balance_loss_mlp": 1.0241785, - "epoch": 0.5861415902600331, - "flos": 22561848512640.0, - "grad_norm": 1.7272716772059427, - "language_loss": 0.72221619, - "learning_rate": 1.5431415369652375e-06, - "loss": 0.7437191, - "num_input_tokens_seen": 210151895, - "step": 9749, - "time_per_iteration": 2.6895384788513184 - }, - { - "auxiliary_loss_clip": 0.01101508, - "auxiliary_loss_mlp": 0.01035837, - "balance_loss_clip": 1.04664361, - "balance_loss_mlp": 1.02205098, - "epoch": 0.586201713512701, - "flos": 14392027912320.0, - "grad_norm": 2.592210537631562, - "language_loss": 0.75040287, - "learning_rate": 1.5427623811125428e-06, - "loss": 0.77177632, - "num_input_tokens_seen": 210168040, - "step": 9750, - "time_per_iteration": 2.737083911895752 - }, - { - "auxiliary_loss_clip": 0.0108729, - "auxiliary_loss_mlp": 0.01036704, - "balance_loss_clip": 1.04378581, - "balance_loss_mlp": 1.02202928, - "epoch": 0.586261836765369, - "flos": 19498560837120.0, - "grad_norm": 1.8612157402372733, - "language_loss": 0.70927167, - "learning_rate": 1.542383242598344e-06, - "loss": 0.73051161, - "num_input_tokens_seen": 210187720, - "step": 9751, - "time_per_iteration": 2.7111241817474365 - }, - { - "auxiliary_loss_clip": 0.01125805, - "auxiliary_loss_mlp": 0.01043313, - "balance_loss_clip": 1.04531717, - "balance_loss_mlp": 1.02769637, - "epoch": 0.5863219600180369, - "flos": 20701819560960.0, - "grad_norm": 1.7129799601344229, - "language_loss": 0.74548101, - "learning_rate": 1.5420041214370184e-06, - "loss": 0.76717222, - "num_input_tokens_seen": 210206080, - "step": 9752, - "time_per_iteration": 2.626716136932373 - }, - { - "auxiliary_loss_clip": 0.01108046, - "auxiliary_loss_mlp": 0.01031989, - "balance_loss_clip": 1.04339004, - "balance_loss_mlp": 1.01842308, - "epoch": 0.586382083270705, - "flos": 19792130693760.0, - "grad_norm": 1.767262069370236, - "language_loss": 0.77331054, - "learning_rate": 1.541625017642943e-06, - "loss": 0.79471087, - "num_input_tokens_seen": 210225660, - "step": 9753, - "time_per_iteration": 2.6093239784240723 - }, - { - "auxiliary_loss_clip": 0.01116295, - "auxiliary_loss_mlp": 0.01029138, - "balance_loss_clip": 1.04288065, - "balance_loss_mlp": 1.01651943, - "epoch": 0.5864422065233729, - "flos": 16500558130560.0, - "grad_norm": 1.6790243104766265, - "language_loss": 0.70988512, - "learning_rate": 1.5412459312304927e-06, - "loss": 0.73133945, - "num_input_tokens_seen": 210242725, - "step": 9754, - "time_per_iteration": 2.5604028701782227 - }, - { - "auxiliary_loss_clip": 0.01095441, - "auxiliary_loss_mlp": 0.01034082, - "balance_loss_clip": 1.0401392, - "balance_loss_mlp": 1.0194732, - "epoch": 0.5865023297760409, - "flos": 20413277608320.0, - "grad_norm": 2.0857561604768065, - "language_loss": 0.72379315, - "learning_rate": 1.540866862214043e-06, - "loss": 0.7450884, - "num_input_tokens_seen": 210263225, - "step": 9755, - "time_per_iteration": 2.656785011291504 - }, - { - "auxiliary_loss_clip": 0.01012678, - "auxiliary_loss_mlp": 0.01004177, - "balance_loss_clip": 1.01731849, - "balance_loss_mlp": 1.00294328, - "epoch": 0.5865624530287089, - "flos": 63350769254400.0, - "grad_norm": 0.7450356800362308, - "language_loss": 0.56920898, - "learning_rate": 1.540487810607967e-06, - "loss": 0.58937752, - "num_input_tokens_seen": 210322310, - "step": 9756, - "time_per_iteration": 3.2905054092407227 - }, - { - "auxiliary_loss_clip": 0.01115752, - "auxiliary_loss_mlp": 0.01031709, - "balance_loss_clip": 1.04039788, - "balance_loss_mlp": 1.01922202, - "epoch": 0.5866225762813768, - "flos": 27016279977600.0, - "grad_norm": 11.015446509800649, - "language_loss": 0.76104087, - "learning_rate": 1.5401087764266396e-06, - "loss": 0.78251553, - "num_input_tokens_seen": 210340845, - "step": 9757, - "time_per_iteration": 2.6325418949127197 - }, - { - "auxiliary_loss_clip": 0.01021435, - "auxiliary_loss_mlp": 0.01009977, - "balance_loss_clip": 1.01624918, - "balance_loss_mlp": 1.00884426, - "epoch": 0.5866826995340448, - "flos": 72987038507520.0, - "grad_norm": 0.8546616305193999, - "language_loss": 0.60420328, - "learning_rate": 1.5397297596844337e-06, - "loss": 0.62451738, - "num_input_tokens_seen": 210397815, - "step": 9758, - "time_per_iteration": 3.227780342102051 - }, - { - "auxiliary_loss_clip": 0.0112535, - "auxiliary_loss_mlp": 0.01036264, - "balance_loss_clip": 1.0447619, - "balance_loss_mlp": 1.02245307, - "epoch": 0.5867428227867127, - "flos": 21285727050240.0, - "grad_norm": 2.191365428773927, - "language_loss": 0.71787071, - "learning_rate": 1.5393507603957212e-06, - "loss": 0.73948681, - "num_input_tokens_seen": 210413900, - "step": 9759, - "time_per_iteration": 2.593574047088623 - }, - { - "auxiliary_loss_clip": 0.01096792, - "auxiliary_loss_mlp": 0.0103787, - "balance_loss_clip": 1.04106188, - "balance_loss_mlp": 1.02525759, - "epoch": 0.5868029460393808, - "flos": 33468852188160.0, - "grad_norm": 1.6194048366561686, - "language_loss": 0.72730052, - "learning_rate": 1.5389717785748742e-06, - "loss": 0.74864709, - "num_input_tokens_seen": 210434110, - "step": 9760, - "time_per_iteration": 2.7872965335845947 - }, - { - "auxiliary_loss_clip": 0.01107006, - "auxiliary_loss_mlp": 0.01032523, - "balance_loss_clip": 1.04269731, - "balance_loss_mlp": 1.01910627, - "epoch": 0.5868630692920487, - "flos": 17889475276800.0, - "grad_norm": 1.9662195987833622, - "language_loss": 0.72611898, - "learning_rate": 1.5385928142362637e-06, - "loss": 0.74751425, - "num_input_tokens_seen": 210451685, - "step": 9761, - "time_per_iteration": 2.701533317565918 - }, - { - "auxiliary_loss_clip": 0.01106159, - "auxiliary_loss_mlp": 0.01036709, - "balance_loss_clip": 1.04491735, - "balance_loss_mlp": 1.02211809, - "epoch": 0.5869231925447167, - "flos": 21035035054080.0, - "grad_norm": 1.7395731063260564, - "language_loss": 0.75217378, - "learning_rate": 1.5382138673942597e-06, - "loss": 0.77360249, - "num_input_tokens_seen": 210470825, - "step": 9762, - "time_per_iteration": 2.721714496612549 - }, - { - "auxiliary_loss_clip": 0.01082216, - "auxiliary_loss_mlp": 0.01036155, - "balance_loss_clip": 1.03985929, - "balance_loss_mlp": 1.02164149, - "epoch": 0.5869833157973846, - "flos": 74738219293440.0, - "grad_norm": 4.660992958273475, - "language_loss": 0.72322762, - "learning_rate": 1.5378349380632317e-06, - "loss": 0.74441129, - "num_input_tokens_seen": 210500075, - "step": 9763, - "time_per_iteration": 3.1116628646850586 - }, - { - "auxiliary_loss_clip": 0.01101878, - "auxiliary_loss_mlp": 0.01034613, - "balance_loss_clip": 1.03773355, - "balance_loss_mlp": 1.02203679, - "epoch": 0.5870434390500526, - "flos": 17638998762240.0, - "grad_norm": 1.815727939349207, - "language_loss": 0.80352604, - "learning_rate": 1.53745602625755e-06, - "loss": 0.82489097, - "num_input_tokens_seen": 210518150, - "step": 9764, - "time_per_iteration": 2.682579278945923 - }, - { - "auxiliary_loss_clip": 0.01091583, - "auxiliary_loss_mlp": 0.01034941, - "balance_loss_clip": 1.04217017, - "balance_loss_mlp": 1.02132726, - "epoch": 0.5871035623027205, - "flos": 21506146859520.0, - "grad_norm": 1.83004906571999, - "language_loss": 0.79265928, - "learning_rate": 1.5370771319915819e-06, - "loss": 0.81392443, - "num_input_tokens_seen": 210537760, - "step": 9765, - "time_per_iteration": 2.6972546577453613 - }, - { - "auxiliary_loss_clip": 0.01088979, - "auxiliary_loss_mlp": 0.01039927, - "balance_loss_clip": 1.04256606, - "balance_loss_mlp": 1.02595556, - "epoch": 0.5871636855553886, - "flos": 13551861818880.0, - "grad_norm": 1.76294195099967, - "language_loss": 0.83693898, - "learning_rate": 1.5366982552796947e-06, - "loss": 0.85822797, - "num_input_tokens_seen": 210555515, - "step": 9766, - "time_per_iteration": 2.7466630935668945 - }, - { - "auxiliary_loss_clip": 0.01111118, - "auxiliary_loss_mlp": 0.01037087, - "balance_loss_clip": 1.04195547, - "balance_loss_mlp": 1.02393794, - "epoch": 0.5872238088080565, - "flos": 26212922346240.0, - "grad_norm": 1.5937380342892973, - "language_loss": 0.6981988, - "learning_rate": 1.536319396136257e-06, - "loss": 0.71968091, - "num_input_tokens_seen": 210575000, - "step": 9767, - "time_per_iteration": 2.6740965843200684 - }, - { - "auxiliary_loss_clip": 0.0110439, - "auxiliary_loss_mlp": 0.0077267, - "balance_loss_clip": 1.04049277, - "balance_loss_mlp": 1.00008368, - "epoch": 0.5872839320607245, - "flos": 30665198995200.0, - "grad_norm": 2.1136221747138095, - "language_loss": 0.6360091, - "learning_rate": 1.5359405545756336e-06, - "loss": 0.65477967, - "num_input_tokens_seen": 210595185, - "step": 9768, - "time_per_iteration": 2.7575178146362305 - }, - { - "auxiliary_loss_clip": 0.01037412, - "auxiliary_loss_mlp": 0.00751529, - "balance_loss_clip": 1.01318574, - "balance_loss_mlp": 0.99987358, - "epoch": 0.5873440553133924, - "flos": 60303570871680.0, - "grad_norm": 0.7223687744232398, - "language_loss": 0.53866827, - "learning_rate": 1.5355617306121914e-06, - "loss": 0.55655766, - "num_input_tokens_seen": 210653210, - "step": 9769, - "time_per_iteration": 3.1609816551208496 - }, - { - "auxiliary_loss_clip": 0.01084812, - "auxiliary_loss_mlp": 0.01042021, - "balance_loss_clip": 1.03922772, - "balance_loss_mlp": 1.02880073, - "epoch": 0.5874041785660604, - "flos": 21539292134400.0, - "grad_norm": 1.4066762666706196, - "language_loss": 0.70984697, - "learning_rate": 1.5351829242602945e-06, - "loss": 0.73111528, - "num_input_tokens_seen": 210673750, - "step": 9770, - "time_per_iteration": 2.7312963008880615 - }, - { - "auxiliary_loss_clip": 0.01073411, - "auxiliary_loss_mlp": 0.01035898, - "balance_loss_clip": 1.0386194, - "balance_loss_mlp": 1.02226591, - "epoch": 0.5874643018187284, - "flos": 24388947671040.0, - "grad_norm": 1.7359405395861034, - "language_loss": 0.681171, - "learning_rate": 1.5348041355343077e-06, - "loss": 0.70226407, - "num_input_tokens_seen": 210692960, - "step": 9771, - "time_per_iteration": 2.7748193740844727 - }, - { - "auxiliary_loss_clip": 0.01072231, - "auxiliary_loss_mlp": 0.01041976, - "balance_loss_clip": 1.03671551, - "balance_loss_mlp": 1.02564466, - "epoch": 0.5875244250713964, - "flos": 28147717457280.0, - "grad_norm": 1.5217173137024316, - "language_loss": 0.661672, - "learning_rate": 1.5344253644485954e-06, - "loss": 0.68281412, - "num_input_tokens_seen": 210714040, - "step": 9772, - "time_per_iteration": 2.841942071914673 - }, - { - "auxiliary_loss_clip": 0.01124952, - "auxiliary_loss_mlp": 0.01044932, - "balance_loss_clip": 1.045434, - "balance_loss_mlp": 1.03047216, - "epoch": 0.5875845483240644, - "flos": 25812410722560.0, - "grad_norm": 1.4922365157265927, - "language_loss": 0.74535245, - "learning_rate": 1.534046611017519e-06, - "loss": 0.76705128, - "num_input_tokens_seen": 210733710, - "step": 9773, - "time_per_iteration": 2.6284871101379395 - }, - { - "auxiliary_loss_clip": 0.01087977, - "auxiliary_loss_mlp": 0.0104147, - "balance_loss_clip": 1.04292727, - "balance_loss_mlp": 1.02706945, - "epoch": 0.5876446715767323, - "flos": 26906572863360.0, - "grad_norm": 1.947316209295704, - "language_loss": 0.52915788, - "learning_rate": 1.5336678752554421e-06, - "loss": 0.55045235, - "num_input_tokens_seen": 210753580, - "step": 9774, - "time_per_iteration": 2.7891509532928467 - }, - { - "auxiliary_loss_clip": 0.01113387, - "auxiliary_loss_mlp": 0.01039669, - "balance_loss_clip": 1.04437912, - "balance_loss_mlp": 1.02526808, - "epoch": 0.5877047948294003, - "flos": 36684832579200.0, - "grad_norm": 2.3607783176851824, - "language_loss": 0.64713901, - "learning_rate": 1.5332891571767264e-06, - "loss": 0.66866958, - "num_input_tokens_seen": 210773495, - "step": 9775, - "time_per_iteration": 2.771148920059204 - }, - { - "auxiliary_loss_clip": 0.01105141, - "auxiliary_loss_mlp": 0.01036995, - "balance_loss_clip": 1.04033184, - "balance_loss_mlp": 1.02344131, - "epoch": 0.5877649180820682, - "flos": 26724721282560.0, - "grad_norm": 1.636403069820384, - "language_loss": 0.73844278, - "learning_rate": 1.5329104567957326e-06, - "loss": 0.75986409, - "num_input_tokens_seen": 210793645, - "step": 9776, - "time_per_iteration": 2.690695285797119 - }, - { - "auxiliary_loss_clip": 0.01119488, - "auxiliary_loss_mlp": 0.01039689, - "balance_loss_clip": 1.0420121, - "balance_loss_mlp": 1.0264504, - "epoch": 0.5878250413347362, - "flos": 21032197879680.0, - "grad_norm": 1.5421458331894318, - "language_loss": 0.73914766, - "learning_rate": 1.532531774126821e-06, - "loss": 0.76073945, - "num_input_tokens_seen": 210813415, - "step": 9777, - "time_per_iteration": 2.6284945011138916 - }, - { - "auxiliary_loss_clip": 0.01083567, - "auxiliary_loss_mlp": 0.01038914, - "balance_loss_clip": 1.04067087, - "balance_loss_mlp": 1.02573574, - "epoch": 0.5878851645874041, - "flos": 25484259047040.0, - "grad_norm": 1.8412101918270336, - "language_loss": 0.74325955, - "learning_rate": 1.5321531091843512e-06, - "loss": 0.76448435, - "num_input_tokens_seen": 210833850, - "step": 9778, - "time_per_iteration": 2.7255308628082275 - }, - { - "auxiliary_loss_clip": 0.01072977, - "auxiliary_loss_mlp": 0.01040231, - "balance_loss_clip": 1.03567362, - "balance_loss_mlp": 1.0246737, - "epoch": 0.5879452878400722, - "flos": 23769129559680.0, - "grad_norm": 1.8337946976743424, - "language_loss": 0.70162809, - "learning_rate": 1.5317744619826824e-06, - "loss": 0.72276014, - "num_input_tokens_seen": 210853115, - "step": 9779, - "time_per_iteration": 2.715529680252075 - }, - { - "auxiliary_loss_clip": 0.01121839, - "auxiliary_loss_mlp": 0.00771635, - "balance_loss_clip": 1.04201186, - "balance_loss_mlp": 1.00009024, - "epoch": 0.5880054110927401, - "flos": 17824513530240.0, - "grad_norm": 2.202026224542238, - "language_loss": 0.66388619, - "learning_rate": 1.5313958325361727e-06, - "loss": 0.68282098, - "num_input_tokens_seen": 210872090, - "step": 9780, - "time_per_iteration": 2.628286361694336 - }, - { - "auxiliary_loss_clip": 0.01091434, - "auxiliary_loss_mlp": 0.01038369, - "balance_loss_clip": 1.04466867, - "balance_loss_mlp": 1.02406991, - "epoch": 0.5880655343454081, - "flos": 19463404400640.0, - "grad_norm": 1.8753551233884636, - "language_loss": 0.72474289, - "learning_rate": 1.5310172208591807e-06, - "loss": 0.74604088, - "num_input_tokens_seen": 210888490, - "step": 9781, - "time_per_iteration": 4.2804930210113525 - }, - { - "auxiliary_loss_clip": 0.01092565, - "auxiliary_loss_mlp": 0.00771373, - "balance_loss_clip": 1.04225159, - "balance_loss_mlp": 1.00005984, - "epoch": 0.588125657598076, - "flos": 21397588980480.0, - "grad_norm": 1.5003005055277707, - "language_loss": 0.70744377, - "learning_rate": 1.5306386269660622e-06, - "loss": 0.72608316, - "num_input_tokens_seen": 210908220, - "step": 9782, - "time_per_iteration": 4.278367519378662 - }, - { - "auxiliary_loss_clip": 0.01105689, - "auxiliary_loss_mlp": 0.01041859, - "balance_loss_clip": 1.03929675, - "balance_loss_mlp": 1.02716005, - "epoch": 0.588185780850744, - "flos": 16034653797120.0, - "grad_norm": 2.093864455539888, - "language_loss": 0.70450729, - "learning_rate": 1.5302600508711741e-06, - "loss": 0.72598279, - "num_input_tokens_seen": 210923945, - "step": 9783, - "time_per_iteration": 4.194809436798096 - }, - { - "auxiliary_loss_clip": 0.01085302, - "auxiliary_loss_mlp": 0.01036158, - "balance_loss_clip": 1.04440248, - "balance_loss_mlp": 1.02117932, - "epoch": 0.588245904103412, - "flos": 23728226947200.0, - "grad_norm": 2.1947417455944653, - "language_loss": 0.69071788, - "learning_rate": 1.5298814925888719e-06, - "loss": 0.71193242, - "num_input_tokens_seen": 210941955, - "step": 9784, - "time_per_iteration": 2.7187066078186035 - }, - { - "auxiliary_loss_clip": 0.01072816, - "auxiliary_loss_mlp": 0.01034537, - "balance_loss_clip": 1.03863633, - "balance_loss_mlp": 1.02094078, - "epoch": 0.58830602735608, - "flos": 33802534558080.0, - "grad_norm": 24.973572945721454, - "language_loss": 0.69460654, - "learning_rate": 1.5295029521335102e-06, - "loss": 0.71568, - "num_input_tokens_seen": 210963105, - "step": 9785, - "time_per_iteration": 2.878143548965454 - }, - { - "auxiliary_loss_clip": 0.01107899, - "auxiliary_loss_mlp": 0.01029541, - "balance_loss_clip": 1.04268789, - "balance_loss_mlp": 1.01706553, - "epoch": 0.588366150608748, - "flos": 17090714586240.0, - "grad_norm": 1.9508012380203874, - "language_loss": 0.77078086, - "learning_rate": 1.5291244295194448e-06, - "loss": 0.79215527, - "num_input_tokens_seen": 210978720, - "step": 9786, - "time_per_iteration": 2.6095898151397705 - }, - { - "auxiliary_loss_clip": 0.01101968, - "auxiliary_loss_mlp": 0.01029534, - "balance_loss_clip": 1.04132032, - "balance_loss_mlp": 1.01609302, - "epoch": 0.5884262738614159, - "flos": 22127186033280.0, - "grad_norm": 1.4529797212559594, - "language_loss": 0.79197991, - "learning_rate": 1.5287459247610276e-06, - "loss": 0.81329501, - "num_input_tokens_seen": 210998750, - "step": 9787, - "time_per_iteration": 4.223788261413574 - }, - { - "auxiliary_loss_clip": 0.01081001, - "auxiliary_loss_mlp": 0.01036004, - "balance_loss_clip": 1.04142892, - "balance_loss_mlp": 1.02382052, - "epoch": 0.5884863971140839, - "flos": 21031838743680.0, - "grad_norm": 2.5032495709629186, - "language_loss": 0.6604932, - "learning_rate": 1.5283674378726116e-06, - "loss": 0.68166327, - "num_input_tokens_seen": 211017550, - "step": 9788, - "time_per_iteration": 2.770289659500122 - }, - { - "auxiliary_loss_clip": 0.01089935, - "auxiliary_loss_mlp": 0.01038801, - "balance_loss_clip": 1.04031539, - "balance_loss_mlp": 1.02356613, - "epoch": 0.5885465203667518, - "flos": 23805112008960.0, - "grad_norm": 2.4491161231159495, - "language_loss": 0.80353689, - "learning_rate": 1.5279889688685506e-06, - "loss": 0.82482433, - "num_input_tokens_seen": 211034135, - "step": 9789, - "time_per_iteration": 2.7129344940185547 - }, - { - "auxiliary_loss_clip": 0.01088956, - "auxiliary_loss_mlp": 0.00771498, - "balance_loss_clip": 1.04013371, - "balance_loss_mlp": 0.99999416, - "epoch": 0.5886066436194198, - "flos": 18880574319360.0, - "grad_norm": 1.8752240370073765, - "language_loss": 0.7074194, - "learning_rate": 1.5276105177631944e-06, - "loss": 0.72602391, - "num_input_tokens_seen": 211053850, - "step": 9790, - "time_per_iteration": 2.7234628200531006 - }, - { - "auxiliary_loss_clip": 0.01082257, - "auxiliary_loss_mlp": 0.01034309, - "balance_loss_clip": 1.04143536, - "balance_loss_mlp": 1.02096915, - "epoch": 0.5886667668720877, - "flos": 24790141653120.0, - "grad_norm": 1.7147674530197825, - "language_loss": 0.83315635, - "learning_rate": 1.527232084570895e-06, - "loss": 0.85432208, - "num_input_tokens_seen": 211072165, - "step": 9791, - "time_per_iteration": 2.711566686630249 - }, - { - "auxiliary_loss_clip": 0.0110606, - "auxiliary_loss_mlp": 0.01044469, - "balance_loss_clip": 1.04232645, - "balance_loss_mlp": 1.0296278, - "epoch": 0.5887268901247558, - "flos": 21614381516160.0, - "grad_norm": 1.5737373299770356, - "language_loss": 0.7653091, - "learning_rate": 1.5268536693060026e-06, - "loss": 0.78681433, - "num_input_tokens_seen": 211089630, - "step": 9792, - "time_per_iteration": 2.634300947189331 - }, - { - "auxiliary_loss_clip": 0.0105802, - "auxiliary_loss_mlp": 0.01047083, - "balance_loss_clip": 1.03111851, - "balance_loss_mlp": 1.03123975, - "epoch": 0.5887870133774237, - "flos": 20481722974080.0, - "grad_norm": 2.6665803472381935, - "language_loss": 0.68956935, - "learning_rate": 1.5264752719828662e-06, - "loss": 0.7106204, - "num_input_tokens_seen": 211106120, - "step": 9793, - "time_per_iteration": 2.7154650688171387 - }, - { - "auxiliary_loss_clip": 0.01116924, - "auxiliary_loss_mlp": 0.01033012, - "balance_loss_clip": 1.04252207, - "balance_loss_mlp": 1.01923692, - "epoch": 0.5888471366300917, - "flos": 19206283870080.0, - "grad_norm": 1.9062241907170245, - "language_loss": 0.60218275, - "learning_rate": 1.5260968926158353e-06, - "loss": 0.62368208, - "num_input_tokens_seen": 211122450, - "step": 9794, - "time_per_iteration": 2.584721088409424 - }, - { - "auxiliary_loss_clip": 0.01087928, - "auxiliary_loss_mlp": 0.01038963, - "balance_loss_clip": 1.04045248, - "balance_loss_mlp": 1.0248251, - "epoch": 0.5889072598827596, - "flos": 19972904866560.0, - "grad_norm": 1.5367259931320274, - "language_loss": 0.65087652, - "learning_rate": 1.525718531219257e-06, - "loss": 0.67214543, - "num_input_tokens_seen": 211141765, - "step": 9795, - "time_per_iteration": 2.6578221321105957 - }, - { - "auxiliary_loss_clip": 0.01080946, - "auxiliary_loss_mlp": 0.01041808, - "balance_loss_clip": 1.03947282, - "balance_loss_mlp": 1.02942848, - "epoch": 0.5889673831354276, - "flos": 20741249715840.0, - "grad_norm": 1.5439612087123358, - "language_loss": 0.74185097, - "learning_rate": 1.5253401878074801e-06, - "loss": 0.76307845, - "num_input_tokens_seen": 211160475, - "step": 9796, - "time_per_iteration": 2.7106168270111084 - }, - { - "auxiliary_loss_clip": 0.01094109, - "auxiliary_loss_mlp": 0.01035258, - "balance_loss_clip": 1.03922713, - "balance_loss_mlp": 1.02194858, - "epoch": 0.5890275063880956, - "flos": 25300935008640.0, - "grad_norm": 1.398085740010997, - "language_loss": 0.82796204, - "learning_rate": 1.5249618623948507e-06, - "loss": 0.84925568, - "num_input_tokens_seen": 211180480, - "step": 9797, - "time_per_iteration": 2.7226924896240234 - }, - { - "auxiliary_loss_clip": 0.01089451, - "auxiliary_loss_mlp": 0.01032137, - "balance_loss_clip": 1.03643203, - "balance_loss_mlp": 1.01857686, - "epoch": 0.5890876296407636, - "flos": 11765377964160.0, - "grad_norm": 2.441249596431382, - "language_loss": 0.792216, - "learning_rate": 1.5245835549957152e-06, - "loss": 0.81343186, - "num_input_tokens_seen": 211198000, - "step": 9798, - "time_per_iteration": 2.661177396774292 - }, - { - "auxiliary_loss_clip": 0.01116784, - "auxiliary_loss_mlp": 0.01033567, - "balance_loss_clip": 1.04251814, - "balance_loss_mlp": 1.02085924, - "epoch": 0.5891477528934316, - "flos": 13589460380160.0, - "grad_norm": 4.031600606780585, - "language_loss": 0.74594498, - "learning_rate": 1.5242052656244186e-06, - "loss": 0.76744843, - "num_input_tokens_seen": 211214765, - "step": 9799, - "time_per_iteration": 2.597598075866699 - }, - { - "auxiliary_loss_clip": 0.0108372, - "auxiliary_loss_mlp": 0.01033117, - "balance_loss_clip": 1.03822446, - "balance_loss_mlp": 1.01848447, - "epoch": 0.5892078761460995, - "flos": 15049193189760.0, - "grad_norm": 1.9844034954522878, - "language_loss": 0.7639305, - "learning_rate": 1.5238269942953064e-06, - "loss": 0.78509891, - "num_input_tokens_seen": 211232335, - "step": 9800, - "time_per_iteration": 2.6959407329559326 - }, - { - "auxiliary_loss_clip": 0.01068975, - "auxiliary_loss_mlp": 0.01043567, - "balance_loss_clip": 1.03649104, - "balance_loss_mlp": 1.02863002, - "epoch": 0.5892679993987675, - "flos": 15778215624960.0, - "grad_norm": 2.091540130493628, - "language_loss": 0.78984964, - "learning_rate": 1.523448741022722e-06, - "loss": 0.81097507, - "num_input_tokens_seen": 211249985, - "step": 9801, - "time_per_iteration": 2.7329885959625244 - }, - { - "auxiliary_loss_clip": 0.01084752, - "auxiliary_loss_mlp": 0.01033442, - "balance_loss_clip": 1.04138374, - "balance_loss_mlp": 1.01958394, - "epoch": 0.5893281226514354, - "flos": 25265203954560.0, - "grad_norm": 1.6724920210450809, - "language_loss": 0.66076094, - "learning_rate": 1.5230705058210088e-06, - "loss": 0.68194282, - "num_input_tokens_seen": 211268425, - "step": 9802, - "time_per_iteration": 2.9191880226135254 - }, - { - "auxiliary_loss_clip": 0.01106682, - "auxiliary_loss_mlp": 0.01030935, - "balance_loss_clip": 1.04172468, - "balance_loss_mlp": 1.01782823, - "epoch": 0.5893882459041034, - "flos": 19458232842240.0, - "grad_norm": 1.576394450599596, - "language_loss": 0.78281248, - "learning_rate": 1.5226922887045108e-06, - "loss": 0.80418861, - "num_input_tokens_seen": 211286680, - "step": 9803, - "time_per_iteration": 2.6395671367645264 - }, - { - "auxiliary_loss_clip": 0.01110111, - "auxiliary_loss_mlp": 0.01036458, - "balance_loss_clip": 1.04354095, - "balance_loss_mlp": 1.0227071, - "epoch": 0.5894483691567713, - "flos": 20634056553600.0, - "grad_norm": 1.421228889325947, - "language_loss": 0.73083454, - "learning_rate": 1.5223140896875686e-06, - "loss": 0.75230026, - "num_input_tokens_seen": 211307700, - "step": 9804, - "time_per_iteration": 2.7451324462890625 - }, - { - "auxiliary_loss_clip": 0.01091882, - "auxiliary_loss_mlp": 0.01030959, - "balance_loss_clip": 1.04156733, - "balance_loss_mlp": 1.01769745, - "epoch": 0.5895084924094394, - "flos": 17778223877760.0, - "grad_norm": 1.6374324136970364, - "language_loss": 0.74669635, - "learning_rate": 1.5219359087845234e-06, - "loss": 0.76792479, - "num_input_tokens_seen": 211324835, - "step": 9805, - "time_per_iteration": 2.6853296756744385 - }, - { - "auxiliary_loss_clip": 0.01113863, - "auxiliary_loss_mlp": 0.00772031, - "balance_loss_clip": 1.04102564, - "balance_loss_mlp": 1.00008976, - "epoch": 0.5895686156621073, - "flos": 20121072468480.0, - "grad_norm": 1.677515475610003, - "language_loss": 0.77973545, - "learning_rate": 1.5215577460097174e-06, - "loss": 0.79859436, - "num_input_tokens_seen": 211344130, - "step": 9806, - "time_per_iteration": 2.6450774669647217 - }, - { - "auxiliary_loss_clip": 0.01117555, - "auxiliary_loss_mlp": 0.01031595, - "balance_loss_clip": 1.0410825, - "balance_loss_mlp": 1.01801682, - "epoch": 0.5896287389147753, - "flos": 20850058990080.0, - "grad_norm": 1.7162663032269994, - "language_loss": 0.76973009, - "learning_rate": 1.5211796013774887e-06, - "loss": 0.79122162, - "num_input_tokens_seen": 211362915, - "step": 9807, - "time_per_iteration": 2.5557191371917725 - }, - { - "auxiliary_loss_clip": 0.01115136, - "auxiliary_loss_mlp": 0.01029659, - "balance_loss_clip": 1.04593015, - "balance_loss_mlp": 1.01563966, - "epoch": 0.5896888621674432, - "flos": 14537897043840.0, - "grad_norm": 1.9630689597763404, - "language_loss": 0.74407029, - "learning_rate": 1.5208014749021786e-06, - "loss": 0.76551819, - "num_input_tokens_seen": 211380700, - "step": 9808, - "time_per_iteration": 2.649773359298706 - }, - { - "auxiliary_loss_clip": 0.01069554, - "auxiliary_loss_mlp": 0.01030007, - "balance_loss_clip": 1.03687882, - "balance_loss_mlp": 1.01540375, - "epoch": 0.5897489854201112, - "flos": 20886759711360.0, - "grad_norm": 2.8224307817464194, - "language_loss": 0.72173887, - "learning_rate": 1.5204233665981236e-06, - "loss": 0.74273449, - "num_input_tokens_seen": 211400095, - "step": 9809, - "time_per_iteration": 2.8795154094696045 - }, - { - "auxiliary_loss_clip": 0.01097105, - "auxiliary_loss_mlp": 0.01035609, - "balance_loss_clip": 1.03962231, - "balance_loss_mlp": 1.02155459, - "epoch": 0.5898091086727792, - "flos": 20011149872640.0, - "grad_norm": 1.9654509433248524, - "language_loss": 0.82251418, - "learning_rate": 1.5200452764796627e-06, - "loss": 0.84384131, - "num_input_tokens_seen": 211417810, - "step": 9810, - "time_per_iteration": 2.7300972938537598 - }, - { - "auxiliary_loss_clip": 0.01108515, - "auxiliary_loss_mlp": 0.01035386, - "balance_loss_clip": 1.04266787, - "balance_loss_mlp": 1.02213001, - "epoch": 0.5898692319254472, - "flos": 16253242012800.0, - "grad_norm": 2.8325616643541043, - "language_loss": 0.80945516, - "learning_rate": 1.5196672045611336e-06, - "loss": 0.83089411, - "num_input_tokens_seen": 211436020, - "step": 9811, - "time_per_iteration": 2.6033973693847656 - }, - { - "auxiliary_loss_clip": 0.01114433, - "auxiliary_loss_mlp": 0.01031742, - "balance_loss_clip": 1.04528522, - "balance_loss_mlp": 1.01666236, - "epoch": 0.5899293551781152, - "flos": 20448541785600.0, - "grad_norm": 2.9067717634400174, - "language_loss": 0.77026772, - "learning_rate": 1.5192891508568715e-06, - "loss": 0.79172945, - "num_input_tokens_seen": 211454335, - "step": 9812, - "time_per_iteration": 2.6283788681030273 - }, - { - "auxiliary_loss_clip": 0.01085179, - "auxiliary_loss_mlp": 0.01030145, - "balance_loss_clip": 1.04126084, - "balance_loss_mlp": 1.01832533, - "epoch": 0.5899894784307831, - "flos": 13881701433600.0, - "grad_norm": 2.0160065726104426, - "language_loss": 0.70596051, - "learning_rate": 1.5189111153812133e-06, - "loss": 0.72711378, - "num_input_tokens_seen": 211472775, - "step": 9813, - "time_per_iteration": 2.7235190868377686 - }, - { - "auxiliary_loss_clip": 0.01094818, - "auxiliary_loss_mlp": 0.01038761, - "balance_loss_clip": 1.04338694, - "balance_loss_mlp": 1.02489126, - "epoch": 0.5900496016834511, - "flos": 20083797129600.0, - "grad_norm": 10.075807478503481, - "language_loss": 0.72172022, - "learning_rate": 1.518533098148494e-06, - "loss": 0.74305606, - "num_input_tokens_seen": 211492195, - "step": 9814, - "time_per_iteration": 2.7245450019836426 - }, - { - "auxiliary_loss_clip": 0.01093647, - "auxiliary_loss_mlp": 0.01037117, - "balance_loss_clip": 1.04272461, - "balance_loss_mlp": 1.02348518, - "epoch": 0.590109724936119, - "flos": 20259148348800.0, - "grad_norm": 1.7959189057174523, - "language_loss": 0.78608483, - "learning_rate": 1.5181550991730476e-06, - "loss": 0.80739248, - "num_input_tokens_seen": 211510220, - "step": 9815, - "time_per_iteration": 2.624587297439575 - }, - { - "auxiliary_loss_clip": 0.0109595, - "auxiliary_loss_mlp": 0.0077231, - "balance_loss_clip": 1.04222631, - "balance_loss_mlp": 1.00011277, - "epoch": 0.590169848188787, - "flos": 24235069806720.0, - "grad_norm": 1.934955250523914, - "language_loss": 0.75605524, - "learning_rate": 1.5177771184692083e-06, - "loss": 0.77473778, - "num_input_tokens_seen": 211526260, - "step": 9816, - "time_per_iteration": 2.805889844894409 - }, - { - "auxiliary_loss_clip": 0.01120987, - "auxiliary_loss_mlp": 0.01039982, - "balance_loss_clip": 1.04457593, - "balance_loss_mlp": 1.02636814, - "epoch": 0.590229971441455, - "flos": 17784724239360.0, - "grad_norm": 1.761702620923252, - "language_loss": 0.81330854, - "learning_rate": 1.517399156051309e-06, - "loss": 0.8349182, - "num_input_tokens_seen": 211542890, - "step": 9817, - "time_per_iteration": 2.5694470405578613 - }, - { - "auxiliary_loss_clip": 0.0106411, - "auxiliary_loss_mlp": 0.01046757, - "balance_loss_clip": 1.03651428, - "balance_loss_mlp": 1.03204691, - "epoch": 0.590290094694123, - "flos": 22236893147520.0, - "grad_norm": 1.6227389463072333, - "language_loss": 0.7634322, - "learning_rate": 1.517021211933682e-06, - "loss": 0.78454089, - "num_input_tokens_seen": 211562685, - "step": 9818, - "time_per_iteration": 2.7369279861450195 - }, - { - "auxiliary_loss_clip": 0.0108334, - "auxiliary_loss_mlp": 0.01037737, - "balance_loss_clip": 1.04248178, - "balance_loss_mlp": 1.02451682, - "epoch": 0.5903502179467909, - "flos": 19098623831040.0, - "grad_norm": 2.2508579930127333, - "language_loss": 0.66751575, - "learning_rate": 1.5166432861306592e-06, - "loss": 0.68872648, - "num_input_tokens_seen": 211579960, - "step": 9819, - "time_per_iteration": 2.683518648147583 - }, - { - "auxiliary_loss_clip": 0.01121974, - "auxiliary_loss_mlp": 0.01034215, - "balance_loss_clip": 1.04451931, - "balance_loss_mlp": 1.02100039, - "epoch": 0.5904103411994589, - "flos": 24235500769920.0, - "grad_norm": 1.5861802995785013, - "language_loss": 0.78221858, - "learning_rate": 1.5162653786565714e-06, - "loss": 0.80378044, - "num_input_tokens_seen": 211599310, - "step": 9820, - "time_per_iteration": 2.67228627204895 - }, - { - "auxiliary_loss_clip": 0.01010393, - "auxiliary_loss_mlp": 0.01023264, - "balance_loss_clip": 1.01880515, - "balance_loss_mlp": 1.02123773, - "epoch": 0.5904704644521268, - "flos": 64876613045760.0, - "grad_norm": 0.9671648573222682, - "language_loss": 0.65189892, - "learning_rate": 1.5158874895257487e-06, - "loss": 0.67223543, - "num_input_tokens_seen": 211658790, - "step": 9821, - "time_per_iteration": 4.79486083984375 - }, - { - "auxiliary_loss_clip": 0.01079974, - "auxiliary_loss_mlp": 0.01038386, - "balance_loss_clip": 1.04072082, - "balance_loss_mlp": 1.0247488, - "epoch": 0.5905305877047948, - "flos": 19609991804160.0, - "grad_norm": 1.8549459171527238, - "language_loss": 0.61307114, - "learning_rate": 1.515509618752521e-06, - "loss": 0.63425475, - "num_input_tokens_seen": 211677240, - "step": 9822, - "time_per_iteration": 5.756153345108032 - }, - { - "auxiliary_loss_clip": 0.01122858, - "auxiliary_loss_mlp": 0.01041517, - "balance_loss_clip": 1.04382062, - "balance_loss_mlp": 1.02788556, - "epoch": 0.5905907109574628, - "flos": 18989634988800.0, - "grad_norm": 2.151764899841445, - "language_loss": 0.82442653, - "learning_rate": 1.5151317663512173e-06, - "loss": 0.84607029, - "num_input_tokens_seen": 211695485, - "step": 9823, - "time_per_iteration": 2.6660759449005127 - }, - { - "auxiliary_loss_clip": 0.01098497, - "auxiliary_loss_mlp": 0.01032384, - "balance_loss_clip": 1.04229546, - "balance_loss_mlp": 1.0183413, - "epoch": 0.5906508342101308, - "flos": 22200407907840.0, - "grad_norm": 1.984006151976339, - "language_loss": 0.72755098, - "learning_rate": 1.514753932336165e-06, - "loss": 0.74885976, - "num_input_tokens_seen": 211713090, - "step": 9824, - "time_per_iteration": 2.679081439971924 - }, - { - "auxiliary_loss_clip": 0.01095276, - "auxiliary_loss_mlp": 0.00772718, - "balance_loss_clip": 1.04067087, - "balance_loss_mlp": 1.00008035, - "epoch": 0.5907109574627988, - "flos": 20886687884160.0, - "grad_norm": 2.158910240340413, - "language_loss": 0.82870126, - "learning_rate": 1.514376116721693e-06, - "loss": 0.84738123, - "num_input_tokens_seen": 211732510, - "step": 9825, - "time_per_iteration": 2.719106674194336 - }, - { - "auxiliary_loss_clip": 0.0110445, - "auxiliary_loss_mlp": 0.01034274, - "balance_loss_clip": 1.04120886, - "balance_loss_mlp": 1.02252591, - "epoch": 0.5907710807154667, - "flos": 21506649649920.0, - "grad_norm": 1.7542204465206233, - "language_loss": 0.76779485, - "learning_rate": 1.5139983195221272e-06, - "loss": 0.78918207, - "num_input_tokens_seen": 211748695, - "step": 9826, - "time_per_iteration": 4.231219291687012 - }, - { - "auxiliary_loss_clip": 0.01094981, - "auxiliary_loss_mlp": 0.01031253, - "balance_loss_clip": 1.04213846, - "balance_loss_mlp": 1.01828933, - "epoch": 0.5908312039681347, - "flos": 22018376759040.0, - "grad_norm": 1.9593281360323977, - "language_loss": 0.72049826, - "learning_rate": 1.513620540751793e-06, - "loss": 0.74176061, - "num_input_tokens_seen": 211768545, - "step": 9827, - "time_per_iteration": 2.654449462890625 - }, - { - "auxiliary_loss_clip": 0.01073518, - "auxiliary_loss_mlp": 0.010335, - "balance_loss_clip": 1.03849053, - "balance_loss_mlp": 1.02111387, - "epoch": 0.5908913272208026, - "flos": 18479523991680.0, - "grad_norm": 1.6640399072146284, - "language_loss": 0.79552126, - "learning_rate": 1.5132427804250178e-06, - "loss": 0.8165915, - "num_input_tokens_seen": 211786665, - "step": 9828, - "time_per_iteration": 2.8060965538024902 - }, - { - "auxiliary_loss_clip": 0.01065495, - "auxiliary_loss_mlp": 0.01038324, - "balance_loss_clip": 1.04091191, - "balance_loss_mlp": 1.02510321, - "epoch": 0.5909514504734706, - "flos": 12312189682560.0, - "grad_norm": 1.8739746775685384, - "language_loss": 0.88231647, - "learning_rate": 1.5128650385561241e-06, - "loss": 0.90335464, - "num_input_tokens_seen": 211801215, - "step": 9829, - "time_per_iteration": 2.819425106048584 - }, - { - "auxiliary_loss_clip": 0.01023107, - "auxiliary_loss_mlp": 0.01007549, - "balance_loss_clip": 1.01821566, - "balance_loss_mlp": 1.00632119, - "epoch": 0.5910115737261386, - "flos": 70213262451840.0, - "grad_norm": 0.7698473487867592, - "language_loss": 0.57849222, - "learning_rate": 1.5124873151594376e-06, - "loss": 0.59879881, - "num_input_tokens_seen": 211857005, - "step": 9830, - "time_per_iteration": 3.1567955017089844 - }, - { - "auxiliary_loss_clip": 0.01114755, - "auxiliary_loss_mlp": 0.00772402, - "balance_loss_clip": 1.04340577, - "balance_loss_mlp": 1.0002377, - "epoch": 0.5910716969788066, - "flos": 22017766227840.0, - "grad_norm": 2.1363303387386723, - "language_loss": 0.75768107, - "learning_rate": 1.5121096102492812e-06, - "loss": 0.77655268, - "num_input_tokens_seen": 211876675, - "step": 9831, - "time_per_iteration": 2.7048380374908447 - }, - { - "auxiliary_loss_clip": 0.01089263, - "auxiliary_loss_mlp": 0.01034604, - "balance_loss_clip": 1.04322839, - "balance_loss_mlp": 1.02142549, - "epoch": 0.5911318202314745, - "flos": 21251648021760.0, - "grad_norm": 1.6552693507472749, - "language_loss": 0.77847427, - "learning_rate": 1.5117319238399767e-06, - "loss": 0.79971302, - "num_input_tokens_seen": 211895725, - "step": 9832, - "time_per_iteration": 2.716529369354248 - }, - { - "auxiliary_loss_clip": 0.01105775, - "auxiliary_loss_mlp": 0.01031029, - "balance_loss_clip": 1.04159164, - "balance_loss_mlp": 1.01780295, - "epoch": 0.5911919434841425, - "flos": 17821604528640.0, - "grad_norm": 1.9563179904860062, - "language_loss": 0.83245647, - "learning_rate": 1.511354255945847e-06, - "loss": 0.8538245, - "num_input_tokens_seen": 211913860, - "step": 9833, - "time_per_iteration": 2.641958236694336 - }, - { - "auxiliary_loss_clip": 0.0110771, - "auxiliary_loss_mlp": 0.01038041, - "balance_loss_clip": 1.04046118, - "balance_loss_mlp": 1.02435589, - "epoch": 0.5912520667368104, - "flos": 20374781207040.0, - "grad_norm": 1.5336556134798032, - "language_loss": 0.74267918, - "learning_rate": 1.5109766065812123e-06, - "loss": 0.76413667, - "num_input_tokens_seen": 211932880, - "step": 9834, - "time_per_iteration": 2.628453016281128 - }, - { - "auxiliary_loss_clip": 0.01119479, - "auxiliary_loss_mlp": 0.01034016, - "balance_loss_clip": 1.04244208, - "balance_loss_mlp": 1.02121329, - "epoch": 0.5913121899894784, - "flos": 17930557457280.0, - "grad_norm": 2.771797648904754, - "language_loss": 0.78298235, - "learning_rate": 1.5105989757603942e-06, - "loss": 0.80451727, - "num_input_tokens_seen": 211948625, - "step": 9835, - "time_per_iteration": 2.5689404010772705 - }, - { - "auxiliary_loss_clip": 0.01095655, - "auxiliary_loss_mlp": 0.01036768, - "balance_loss_clip": 1.03806067, - "balance_loss_mlp": 1.0237323, - "epoch": 0.5913723132421465, - "flos": 22126934638080.0, - "grad_norm": 1.8733256786117318, - "language_loss": 0.73799431, - "learning_rate": 1.5102213634977117e-06, - "loss": 0.75931853, - "num_input_tokens_seen": 211965355, - "step": 9836, - "time_per_iteration": 2.695117712020874 - }, - { - "auxiliary_loss_clip": 0.01083057, - "auxiliary_loss_mlp": 0.01035766, - "balance_loss_clip": 1.03943884, - "balance_loss_mlp": 1.02149653, - "epoch": 0.5914324364948144, - "flos": 15697918771200.0, - "grad_norm": 1.9392468028622023, - "language_loss": 0.82138634, - "learning_rate": 1.5098437698074841e-06, - "loss": 0.84257448, - "num_input_tokens_seen": 211982245, - "step": 9837, - "time_per_iteration": 2.6912343502044678 - }, - { - "auxiliary_loss_clip": 0.01078463, - "auxiliary_loss_mlp": 0.01035071, - "balance_loss_clip": 1.03632522, - "balance_loss_mlp": 1.02026534, - "epoch": 0.5914925597474824, - "flos": 22747327367040.0, - "grad_norm": 2.27741138864597, - "language_loss": 0.79637218, - "learning_rate": 1.5094661947040304e-06, - "loss": 0.81750751, - "num_input_tokens_seen": 212000250, - "step": 9838, - "time_per_iteration": 2.6449244022369385 - }, - { - "auxiliary_loss_clip": 0.010718, - "auxiliary_loss_mlp": 0.01039396, - "balance_loss_clip": 1.04010475, - "balance_loss_mlp": 1.02605057, - "epoch": 0.5915526830001503, - "flos": 18292788161280.0, - "grad_norm": 1.9685283368258655, - "language_loss": 0.69672906, - "learning_rate": 1.5090886382016673e-06, - "loss": 0.71784103, - "num_input_tokens_seen": 212017505, - "step": 9839, - "time_per_iteration": 2.76196026802063 - }, - { - "auxiliary_loss_clip": 0.01093291, - "auxiliary_loss_mlp": 0.01043789, - "balance_loss_clip": 1.04008913, - "balance_loss_mlp": 1.0308131, - "epoch": 0.5916128062528183, - "flos": 17019072910080.0, - "grad_norm": 2.7566603972322943, - "language_loss": 0.65802211, - "learning_rate": 1.5087111003147124e-06, - "loss": 0.67939293, - "num_input_tokens_seen": 212034595, - "step": 9840, - "time_per_iteration": 2.647179365158081 - }, - { - "auxiliary_loss_clip": 0.01095524, - "auxiliary_loss_mlp": 0.01030956, - "balance_loss_clip": 1.04105091, - "balance_loss_mlp": 1.0170027, - "epoch": 0.5916729295054862, - "flos": 24754231031040.0, - "grad_norm": 1.7835451737672352, - "language_loss": 0.81441593, - "learning_rate": 1.5083335810574813e-06, - "loss": 0.83568072, - "num_input_tokens_seen": 212055775, - "step": 9841, - "time_per_iteration": 2.693742036819458 - }, - { - "auxiliary_loss_clip": 0.01090733, - "auxiliary_loss_mlp": 0.01030377, - "balance_loss_clip": 1.04020691, - "balance_loss_mlp": 1.01772296, - "epoch": 0.5917330527581542, - "flos": 15958199698560.0, - "grad_norm": 1.7111294758223268, - "language_loss": 0.69152761, - "learning_rate": 1.507956080444291e-06, - "loss": 0.71273863, - "num_input_tokens_seen": 212074000, - "step": 9842, - "time_per_iteration": 2.6797986030578613 - }, - { - "auxiliary_loss_clip": 0.01093141, - "auxiliary_loss_mlp": 0.0103715, - "balance_loss_clip": 1.03811431, - "balance_loss_mlp": 1.02367949, - "epoch": 0.5917931760108222, - "flos": 23800730549760.0, - "grad_norm": 3.159007391867861, - "language_loss": 0.83409858, - "learning_rate": 1.5075785984894549e-06, - "loss": 0.85540152, - "num_input_tokens_seen": 212091415, - "step": 9843, - "time_per_iteration": 2.7194371223449707 - }, - { - "auxiliary_loss_clip": 0.01090728, - "auxiliary_loss_mlp": 0.01031987, - "balance_loss_clip": 1.03646731, - "balance_loss_mlp": 1.01762211, - "epoch": 0.5918532992634902, - "flos": 23249609199360.0, - "grad_norm": 5.395713728013965, - "language_loss": 0.81329596, - "learning_rate": 1.5072011352072875e-06, - "loss": 0.83452308, - "num_input_tokens_seen": 212105255, - "step": 9844, - "time_per_iteration": 2.7136270999908447 - }, - { - "auxiliary_loss_clip": 0.01068008, - "auxiliary_loss_mlp": 0.01030142, - "balance_loss_clip": 1.03874016, - "balance_loss_mlp": 1.01633775, - "epoch": 0.5919134225161581, - "flos": 19499853726720.0, - "grad_norm": 1.8542895008446525, - "language_loss": 0.74591327, - "learning_rate": 1.5068236906121032e-06, - "loss": 0.7668947, - "num_input_tokens_seen": 212122765, - "step": 9845, - "time_per_iteration": 2.781914710998535 - }, - { - "auxiliary_loss_clip": 0.01077949, - "auxiliary_loss_mlp": 0.0103435, - "balance_loss_clip": 1.03821266, - "balance_loss_mlp": 1.01837575, - "epoch": 0.5919735457688261, - "flos": 38800940567040.0, - "grad_norm": 1.69458434045341, - "language_loss": 0.63799906, - "learning_rate": 1.506446264718213e-06, - "loss": 0.65912199, - "num_input_tokens_seen": 212143960, - "step": 9846, - "time_per_iteration": 2.8427982330322266 - }, - { - "auxiliary_loss_clip": 0.01076538, - "auxiliary_loss_mlp": 0.00769552, - "balance_loss_clip": 1.03801441, - "balance_loss_mlp": 1.00004482, - "epoch": 0.592033669021494, - "flos": 22163994495360.0, - "grad_norm": 1.809865828874733, - "language_loss": 0.76013452, - "learning_rate": 1.506068857539931e-06, - "loss": 0.77859539, - "num_input_tokens_seen": 212162005, - "step": 9847, - "time_per_iteration": 2.737806797027588 - }, - { - "auxiliary_loss_clip": 0.01092495, - "auxiliary_loss_mlp": 0.01031315, - "balance_loss_clip": 1.03829622, - "balance_loss_mlp": 1.01720047, - "epoch": 0.592093792274162, - "flos": 22710985781760.0, - "grad_norm": 1.7217593328479819, - "language_loss": 0.62444723, - "learning_rate": 1.5056914690915667e-06, - "loss": 0.64568532, - "num_input_tokens_seen": 212181635, - "step": 9848, - "time_per_iteration": 2.768158197402954 - }, - { - "auxiliary_loss_clip": 0.01108627, - "auxiliary_loss_mlp": 0.01039243, - "balance_loss_clip": 1.04256344, - "balance_loss_mlp": 1.02609384, - "epoch": 0.59215391552683, - "flos": 22528954632960.0, - "grad_norm": 1.7269094299177161, - "language_loss": 0.75832105, - "learning_rate": 1.5053140993874312e-06, - "loss": 0.7797997, - "num_input_tokens_seen": 212201615, - "step": 9849, - "time_per_iteration": 2.6506807804107666 - }, - { - "auxiliary_loss_clip": 0.01095576, - "auxiliary_loss_mlp": 0.01036342, - "balance_loss_clip": 1.04088306, - "balance_loss_mlp": 1.02223945, - "epoch": 0.592214038779498, - "flos": 24499013921280.0, - "grad_norm": 2.077646783474588, - "language_loss": 0.75440395, - "learning_rate": 1.5049367484418353e-06, - "loss": 0.7757231, - "num_input_tokens_seen": 212219355, - "step": 9850, - "time_per_iteration": 2.738163471221924 - }, - { - "auxiliary_loss_clip": 0.01079223, - "auxiliary_loss_mlp": 0.01038556, - "balance_loss_clip": 1.0389607, - "balance_loss_mlp": 1.02532411, - "epoch": 0.592274162032166, - "flos": 21831353619840.0, - "grad_norm": 2.0657919494048094, - "language_loss": 0.75485742, - "learning_rate": 1.5045594162690868e-06, - "loss": 0.77603519, - "num_input_tokens_seen": 212236710, - "step": 9851, - "time_per_iteration": 2.7006642818450928 - }, - { - "auxiliary_loss_clip": 0.0109594, - "auxiliary_loss_mlp": 0.0103171, - "balance_loss_clip": 1.04149699, - "balance_loss_mlp": 1.01846635, - "epoch": 0.5923342852848339, - "flos": 24608146417920.0, - "grad_norm": 1.9468749498411155, - "language_loss": 0.7089386, - "learning_rate": 1.5041821028834954e-06, - "loss": 0.73021513, - "num_input_tokens_seen": 212256195, - "step": 9852, - "time_per_iteration": 2.706106424331665 - }, - { - "auxiliary_loss_clip": 0.01104361, - "auxiliary_loss_mlp": 0.0077249, - "balance_loss_clip": 1.04451549, - "balance_loss_mlp": 1.00008225, - "epoch": 0.5923944085375019, - "flos": 19938143479680.0, - "grad_norm": 1.600717143056076, - "language_loss": 0.80555183, - "learning_rate": 1.5038048082993685e-06, - "loss": 0.82432032, - "num_input_tokens_seen": 212274085, - "step": 9853, - "time_per_iteration": 2.7119646072387695 - }, - { - "auxiliary_loss_clip": 0.01088586, - "auxiliary_loss_mlp": 0.01028953, - "balance_loss_clip": 1.03719842, - "balance_loss_mlp": 1.01654959, - "epoch": 0.5924545317901698, - "flos": 28658510812800.0, - "grad_norm": 1.9598293021275044, - "language_loss": 0.67597294, - "learning_rate": 1.5034275325310124e-06, - "loss": 0.69714832, - "num_input_tokens_seen": 212295530, - "step": 9854, - "time_per_iteration": 2.7060039043426514 - }, - { - "auxiliary_loss_clip": 0.01081304, - "auxiliary_loss_mlp": 0.01029538, - "balance_loss_clip": 1.03990042, - "balance_loss_mlp": 1.01680636, - "epoch": 0.5925146550428378, - "flos": 19864885691520.0, - "grad_norm": 1.7821900938554989, - "language_loss": 0.88811159, - "learning_rate": 1.5030502755927344e-06, - "loss": 0.90921998, - "num_input_tokens_seen": 212313770, - "step": 9855, - "time_per_iteration": 2.749842882156372 - }, - { - "auxiliary_loss_clip": 0.01097397, - "auxiliary_loss_mlp": 0.01031382, - "balance_loss_clip": 1.04023433, - "balance_loss_mlp": 1.01912177, - "epoch": 0.5925747782955058, - "flos": 15122989681920.0, - "grad_norm": 1.7553886735756365, - "language_loss": 0.86097872, - "learning_rate": 1.5026730374988397e-06, - "loss": 0.8822664, - "num_input_tokens_seen": 212331525, - "step": 9856, - "time_per_iteration": 2.8213181495666504 - }, - { - "auxiliary_loss_clip": 0.0110594, - "auxiliary_loss_mlp": 0.01036211, - "balance_loss_clip": 1.03984308, - "balance_loss_mlp": 1.02389097, - "epoch": 0.5926349015481738, - "flos": 18405440190720.0, - "grad_norm": 3.6746631679389536, - "language_loss": 0.77349007, - "learning_rate": 1.5022958182636332e-06, - "loss": 0.79491156, - "num_input_tokens_seen": 212347295, - "step": 9857, - "time_per_iteration": 2.6580264568328857 - }, - { - "auxiliary_loss_clip": 0.0107388, - "auxiliary_loss_mlp": 0.01051977, - "balance_loss_clip": 1.03587079, - "balance_loss_mlp": 1.03689682, - "epoch": 0.5926950248008417, - "flos": 23111138269440.0, - "grad_norm": 2.383524132494838, - "language_loss": 0.64598405, - "learning_rate": 1.501918617901419e-06, - "loss": 0.66724265, - "num_input_tokens_seen": 212365750, - "step": 9858, - "time_per_iteration": 2.7002615928649902 - }, - { - "auxiliary_loss_clip": 0.01103608, - "auxiliary_loss_mlp": 0.01033595, - "balance_loss_clip": 1.04055738, - "balance_loss_mlp": 1.02088773, - "epoch": 0.5927551480535097, - "flos": 28033916192640.0, - "grad_norm": 1.88700094462338, - "language_loss": 0.77598989, - "learning_rate": 1.501541436426501e-06, - "loss": 0.79736185, - "num_input_tokens_seen": 212385300, - "step": 9859, - "time_per_iteration": 4.434144496917725 - }, - { - "auxiliary_loss_clip": 0.01078779, - "auxiliary_loss_mlp": 0.00771508, - "balance_loss_clip": 1.04448819, - "balance_loss_mlp": 1.00007796, - "epoch": 0.5928152713061776, - "flos": 21798675221760.0, - "grad_norm": 4.274702781757113, - "language_loss": 0.74740881, - "learning_rate": 1.5011642738531818e-06, - "loss": 0.7659117, - "num_input_tokens_seen": 212402140, - "step": 9860, - "time_per_iteration": 2.8576431274414062 - }, - { - "auxiliary_loss_clip": 0.01080315, - "auxiliary_loss_mlp": 0.01034538, - "balance_loss_clip": 1.04223692, - "balance_loss_mlp": 1.02289104, - "epoch": 0.5928753945588456, - "flos": 24316839118080.0, - "grad_norm": 1.6207851458155365, - "language_loss": 0.7622723, - "learning_rate": 1.500787130195763e-06, - "loss": 0.7834208, - "num_input_tokens_seen": 212421790, - "step": 9861, - "time_per_iteration": 5.779749393463135 - }, - { - "auxiliary_loss_clip": 0.01079641, - "auxiliary_loss_mlp": 0.01032307, - "balance_loss_clip": 1.03737628, - "balance_loss_mlp": 1.0201298, - "epoch": 0.5929355178115137, - "flos": 26464619923200.0, - "grad_norm": 2.31911103307255, - "language_loss": 0.70733476, - "learning_rate": 1.5004100054685465e-06, - "loss": 0.72845423, - "num_input_tokens_seen": 212442115, - "step": 9862, - "time_per_iteration": 2.7879045009613037 - }, - { - "auxiliary_loss_clip": 0.01057596, - "auxiliary_loss_mlp": 0.01034108, - "balance_loss_clip": 1.03278732, - "balance_loss_mlp": 1.02148342, - "epoch": 0.5929956410641816, - "flos": 24965995662720.0, - "grad_norm": 1.7884457502004503, - "language_loss": 0.78123254, - "learning_rate": 1.500032899685832e-06, - "loss": 0.80214959, - "num_input_tokens_seen": 212459535, - "step": 9863, - "time_per_iteration": 2.7296791076660156 - }, - { - "auxiliary_loss_clip": 0.01089944, - "auxiliary_loss_mlp": 0.01040962, - "balance_loss_clip": 1.03986549, - "balance_loss_mlp": 1.02770567, - "epoch": 0.5930557643168496, - "flos": 26208325405440.0, - "grad_norm": 2.4622472815237506, - "language_loss": 0.70487082, - "learning_rate": 1.499655812861921e-06, - "loss": 0.72617984, - "num_input_tokens_seen": 212479385, - "step": 9864, - "time_per_iteration": 2.6773011684417725 - }, - { - "auxiliary_loss_clip": 0.01089195, - "auxiliary_loss_mlp": 0.01036172, - "balance_loss_clip": 1.03835356, - "balance_loss_mlp": 1.0226891, - "epoch": 0.5931158875695175, - "flos": 27854937699840.0, - "grad_norm": 1.4468399758370936, - "language_loss": 0.67205417, - "learning_rate": 1.4992787450111112e-06, - "loss": 0.69330788, - "num_input_tokens_seen": 212500060, - "step": 9865, - "time_per_iteration": 4.260905981063843 - }, - { - "auxiliary_loss_clip": 0.01098878, - "auxiliary_loss_mlp": 0.0103771, - "balance_loss_clip": 1.04014802, - "balance_loss_mlp": 1.02411962, - "epoch": 0.5931760108221855, - "flos": 15413650536960.0, - "grad_norm": 1.9702875461989908, - "language_loss": 0.77913535, - "learning_rate": 1.4989016961477015e-06, - "loss": 0.80050123, - "num_input_tokens_seen": 212518590, - "step": 9866, - "time_per_iteration": 2.6692967414855957 - }, - { - "auxiliary_loss_clip": 0.01090663, - "auxiliary_loss_mlp": 0.01031022, - "balance_loss_clip": 1.04043937, - "balance_loss_mlp": 1.01891649, - "epoch": 0.5932361340748534, - "flos": 30188520581760.0, - "grad_norm": 2.3223854732809364, - "language_loss": 0.71955562, - "learning_rate": 1.4985246662859903e-06, - "loss": 0.74077249, - "num_input_tokens_seen": 212538190, - "step": 9867, - "time_per_iteration": 2.73850679397583 - }, - { - "auxiliary_loss_clip": 0.01094459, - "auxiliary_loss_mlp": 0.0103051, - "balance_loss_clip": 1.04182947, - "balance_loss_mlp": 1.01644969, - "epoch": 0.5932962573275214, - "flos": 20157557708160.0, - "grad_norm": 1.577108097655746, - "language_loss": 0.66789985, - "learning_rate": 1.4981476554402732e-06, - "loss": 0.68914956, - "num_input_tokens_seen": 212557820, - "step": 9868, - "time_per_iteration": 2.776890277862549 - }, - { - "auxiliary_loss_clip": 0.01060162, - "auxiliary_loss_mlp": 0.00771363, - "balance_loss_clip": 1.03597963, - "balance_loss_mlp": 1.00004768, - "epoch": 0.5933563805801894, - "flos": 25445906300160.0, - "grad_norm": 1.613226423561444, - "language_loss": 0.75353992, - "learning_rate": 1.4977706636248478e-06, - "loss": 0.77185524, - "num_input_tokens_seen": 212577645, - "step": 9869, - "time_per_iteration": 2.8630988597869873 - }, - { - "auxiliary_loss_clip": 0.010636, - "auxiliary_loss_mlp": 0.01038006, - "balance_loss_clip": 1.03897762, - "balance_loss_mlp": 1.02469635, - "epoch": 0.5934165038328574, - "flos": 59995740337920.0, - "grad_norm": 1.8583969258808255, - "language_loss": 0.74005115, - "learning_rate": 1.4973936908540091e-06, - "loss": 0.76106727, - "num_input_tokens_seen": 212603430, - "step": 9870, - "time_per_iteration": 3.0915732383728027 - }, - { - "auxiliary_loss_clip": 0.01071863, - "auxiliary_loss_mlp": 0.01030945, - "balance_loss_clip": 1.03705025, - "balance_loss_mlp": 1.01810646, - "epoch": 0.5934766270855253, - "flos": 24420548661120.0, - "grad_norm": 2.145127507644007, - "language_loss": 0.7232281, - "learning_rate": 1.4970167371420517e-06, - "loss": 0.7442562, - "num_input_tokens_seen": 212620730, - "step": 9871, - "time_per_iteration": 2.7629406452178955 - }, - { - "auxiliary_loss_clip": 0.01086004, - "auxiliary_loss_mlp": 0.01031261, - "balance_loss_clip": 1.04104018, - "balance_loss_mlp": 1.01764774, - "epoch": 0.5935367503381933, - "flos": 23513158264320.0, - "grad_norm": 2.0164353140130835, - "language_loss": 0.74587923, - "learning_rate": 1.496639802503271e-06, - "loss": 0.76705188, - "num_input_tokens_seen": 212639745, - "step": 9872, - "time_per_iteration": 2.74772310256958 - }, - { - "auxiliary_loss_clip": 0.01111382, - "auxiliary_loss_mlp": 0.01038485, - "balance_loss_clip": 1.04180598, - "balance_loss_mlp": 1.02414966, - "epoch": 0.5935968735908612, - "flos": 18948337326720.0, - "grad_norm": 2.3277369002939388, - "language_loss": 0.79620034, - "learning_rate": 1.4962628869519583e-06, - "loss": 0.81769902, - "num_input_tokens_seen": 212655915, - "step": 9873, - "time_per_iteration": 2.663547992706299 - }, - { - "auxiliary_loss_clip": 0.01108216, - "auxiliary_loss_mlp": 0.01034928, - "balance_loss_clip": 1.04269648, - "balance_loss_mlp": 1.021523, - "epoch": 0.5936569968435292, - "flos": 25483433034240.0, - "grad_norm": 1.6892324145577737, - "language_loss": 0.8490203, - "learning_rate": 1.4958859905024078e-06, - "loss": 0.87045169, - "num_input_tokens_seen": 212676115, - "step": 9874, - "time_per_iteration": 2.654606580734253 - }, - { - "auxiliary_loss_clip": 0.01019729, - "auxiliary_loss_mlp": 0.01001192, - "balance_loss_clip": 1.01379979, - "balance_loss_mlp": 0.99991626, - "epoch": 0.5937171200961973, - "flos": 66378361789440.0, - "grad_norm": 0.7079839888277836, - "language_loss": 0.59980857, - "learning_rate": 1.4955091131689115e-06, - "loss": 0.62001777, - "num_input_tokens_seen": 212737560, - "step": 9875, - "time_per_iteration": 3.3108227252960205 - }, - { - "auxiliary_loss_clip": 0.01094208, - "auxiliary_loss_mlp": 0.01033507, - "balance_loss_clip": 1.03624558, - "balance_loss_mlp": 1.01859426, - "epoch": 0.5937772433488652, - "flos": 14903467712640.0, - "grad_norm": 5.919714877847386, - "language_loss": 0.7768054, - "learning_rate": 1.4951322549657594e-06, - "loss": 0.79808253, - "num_input_tokens_seen": 212755365, - "step": 9876, - "time_per_iteration": 2.6835005283355713 - }, - { - "auxiliary_loss_clip": 0.01097876, - "auxiliary_loss_mlp": 0.01028372, - "balance_loss_clip": 1.03590453, - "balance_loss_mlp": 1.01630843, - "epoch": 0.5938373666015332, - "flos": 22561489376640.0, - "grad_norm": 1.528829961767438, - "language_loss": 0.75805295, - "learning_rate": 1.494755415907243e-06, - "loss": 0.77931547, - "num_input_tokens_seen": 212773875, - "step": 9877, - "time_per_iteration": 2.703756332397461 - }, - { - "auxiliary_loss_clip": 0.0110632, - "auxiliary_loss_mlp": 0.01028449, - "balance_loss_clip": 1.03964424, - "balance_loss_mlp": 1.01493096, - "epoch": 0.5938974898542011, - "flos": 18440883936000.0, - "grad_norm": 2.6694319382348666, - "language_loss": 0.81408948, - "learning_rate": 1.4943785960076522e-06, - "loss": 0.83543718, - "num_input_tokens_seen": 212790590, - "step": 9878, - "time_per_iteration": 2.6299495697021484 - }, - { - "auxiliary_loss_clip": 0.01090649, - "auxiliary_loss_mlp": 0.00772164, - "balance_loss_clip": 1.03885496, - "balance_loss_mlp": 1.00006008, - "epoch": 0.5939576131068691, - "flos": 45586728270720.0, - "grad_norm": 1.7408999007224344, - "language_loss": 0.71310401, - "learning_rate": 1.4940017952812754e-06, - "loss": 0.73173207, - "num_input_tokens_seen": 212812265, - "step": 9879, - "time_per_iteration": 2.9403438568115234 - }, - { - "auxiliary_loss_clip": 0.01107517, - "auxiliary_loss_mlp": 0.01037191, - "balance_loss_clip": 1.04333889, - "balance_loss_mlp": 1.02471602, - "epoch": 0.594017736359537, - "flos": 23587708942080.0, - "grad_norm": 1.6220417937962182, - "language_loss": 0.5754692, - "learning_rate": 1.493625013742401e-06, - "loss": 0.59691632, - "num_input_tokens_seen": 212831915, - "step": 9880, - "time_per_iteration": 2.722222089767456 - }, - { - "auxiliary_loss_clip": 0.01108825, - "auxiliary_loss_mlp": 0.01034905, - "balance_loss_clip": 1.04171181, - "balance_loss_mlp": 1.02144003, - "epoch": 0.594077859612205, - "flos": 29457235589760.0, - "grad_norm": 1.8505883622927, - "language_loss": 0.77141905, - "learning_rate": 1.4932482514053177e-06, - "loss": 0.79285634, - "num_input_tokens_seen": 212851350, - "step": 9881, - "time_per_iteration": 2.7424824237823486 - }, - { - "auxiliary_loss_clip": 0.01104617, - "auxiliary_loss_mlp": 0.01027481, - "balance_loss_clip": 1.0387702, - "balance_loss_mlp": 1.01456428, - "epoch": 0.594137982864873, - "flos": 16800089644800.0, - "grad_norm": 2.611625845648677, - "language_loss": 0.82625538, - "learning_rate": 1.4928715082843112e-06, - "loss": 0.84757638, - "num_input_tokens_seen": 212867995, - "step": 9882, - "time_per_iteration": 2.6125638484954834 - }, - { - "auxiliary_loss_clip": 0.01108328, - "auxiliary_loss_mlp": 0.01036545, - "balance_loss_clip": 1.04283643, - "balance_loss_mlp": 1.02419496, - "epoch": 0.594198106117541, - "flos": 12750263953920.0, - "grad_norm": 2.4545417723722434, - "language_loss": 0.79556072, - "learning_rate": 1.492494784393667e-06, - "loss": 0.81700939, - "num_input_tokens_seen": 212885220, - "step": 9883, - "time_per_iteration": 2.6739277839660645 - }, - { - "auxiliary_loss_clip": 0.01090609, - "auxiliary_loss_mlp": 0.00770805, - "balance_loss_clip": 1.04405499, - "balance_loss_mlp": 1.00010085, - "epoch": 0.5942582293702089, - "flos": 20996538652800.0, - "grad_norm": 2.530798381383893, - "language_loss": 0.7459439, - "learning_rate": 1.4921180797476725e-06, - "loss": 0.76455808, - "num_input_tokens_seen": 212903195, - "step": 9884, - "time_per_iteration": 2.720139503479004 - }, - { - "auxiliary_loss_clip": 0.01118755, - "auxiliary_loss_mlp": 0.01030314, - "balance_loss_clip": 1.04366493, - "balance_loss_mlp": 1.01757646, - "epoch": 0.5943183526228769, - "flos": 28291431772800.0, - "grad_norm": 2.040352336443274, - "language_loss": 0.66608262, - "learning_rate": 1.4917413943606106e-06, - "loss": 0.68757325, - "num_input_tokens_seen": 212923340, - "step": 9885, - "time_per_iteration": 2.6618847846984863 - }, - { - "auxiliary_loss_clip": 0.01093907, - "auxiliary_loss_mlp": 0.01041351, - "balance_loss_clip": 1.04138327, - "balance_loss_mlp": 1.02835155, - "epoch": 0.5943784758755448, - "flos": 26614619118720.0, - "grad_norm": 2.630158617128694, - "language_loss": 0.77534634, - "learning_rate": 1.4913647282467667e-06, - "loss": 0.79669893, - "num_input_tokens_seen": 212942755, - "step": 9886, - "time_per_iteration": 2.7532429695129395 - }, - { - "auxiliary_loss_clip": 0.01025813, - "auxiliary_loss_mlp": 0.01001276, - "balance_loss_clip": 1.01382208, - "balance_loss_mlp": 0.99997658, - "epoch": 0.5944385991282128, - "flos": 64190935347840.0, - "grad_norm": 0.9149518659336237, - "language_loss": 0.64530778, - "learning_rate": 1.490988081420423e-06, - "loss": 0.66557866, - "num_input_tokens_seen": 212999355, - "step": 9887, - "time_per_iteration": 3.060612440109253 - }, - { - "auxiliary_loss_clip": 0.01097622, - "auxiliary_loss_mlp": 0.01032109, - "balance_loss_clip": 1.03770781, - "balance_loss_mlp": 1.01940084, - "epoch": 0.5944987223808808, - "flos": 19571998193280.0, - "grad_norm": 1.6915419105373903, - "language_loss": 0.69181025, - "learning_rate": 1.4906114538958615e-06, - "loss": 0.71310759, - "num_input_tokens_seen": 213018570, - "step": 9888, - "time_per_iteration": 2.617629051208496 - }, - { - "auxiliary_loss_clip": 0.01088883, - "auxiliary_loss_mlp": 0.01034911, - "balance_loss_clip": 1.03844309, - "balance_loss_mlp": 1.02113008, - "epoch": 0.5945588456335488, - "flos": 26177586341760.0, - "grad_norm": 2.5005305893435685, - "language_loss": 0.79495192, - "learning_rate": 1.490234845687366e-06, - "loss": 0.81618989, - "num_input_tokens_seen": 213037735, - "step": 9889, - "time_per_iteration": 2.685150146484375 - }, - { - "auxiliary_loss_clip": 0.01080162, - "auxiliary_loss_mlp": 0.01026954, - "balance_loss_clip": 1.03793621, - "balance_loss_mlp": 1.01496744, - "epoch": 0.5946189688862168, - "flos": 20446494710400.0, - "grad_norm": 1.6110540672551508, - "language_loss": 0.70713383, - "learning_rate": 1.4898582568092154e-06, - "loss": 0.72820497, - "num_input_tokens_seen": 213057160, - "step": 9890, - "time_per_iteration": 2.7299606800079346 - }, - { - "auxiliary_loss_clip": 0.01088716, - "auxiliary_loss_mlp": 0.01032845, - "balance_loss_clip": 1.04451787, - "balance_loss_mlp": 1.01896358, - "epoch": 0.5946790921388847, - "flos": 13437521850240.0, - "grad_norm": 1.9451498476517268, - "language_loss": 0.69461864, - "learning_rate": 1.489481687275691e-06, - "loss": 0.71583426, - "num_input_tokens_seen": 213073630, - "step": 9891, - "time_per_iteration": 2.7253577709198 - }, - { - "auxiliary_loss_clip": 0.01104108, - "auxiliary_loss_mlp": 0.01040464, - "balance_loss_clip": 1.04076028, - "balance_loss_mlp": 1.02784514, - "epoch": 0.5947392153915527, - "flos": 20412272027520.0, - "grad_norm": 1.8738043279095635, - "language_loss": 0.53252602, - "learning_rate": 1.4891051371010726e-06, - "loss": 0.55397171, - "num_input_tokens_seen": 213092450, - "step": 9892, - "time_per_iteration": 2.630176067352295 - }, - { - "auxiliary_loss_clip": 0.01007775, - "auxiliary_loss_mlp": 0.01004642, - "balance_loss_clip": 1.01469183, - "balance_loss_mlp": 1.00331867, - "epoch": 0.5947993386442206, - "flos": 65619138994560.0, - "grad_norm": 0.662438980473289, - "language_loss": 0.54533142, - "learning_rate": 1.4887286062996375e-06, - "loss": 0.56545562, - "num_input_tokens_seen": 213155465, - "step": 9893, - "time_per_iteration": 3.3319764137268066 - }, - { - "auxiliary_loss_clip": 0.01079474, - "auxiliary_loss_mlp": 0.01035837, - "balance_loss_clip": 1.04197478, - "balance_loss_mlp": 1.02362406, - "epoch": 0.5948594618968887, - "flos": 23183103168000.0, - "grad_norm": 1.5803116085974762, - "language_loss": 0.74965519, - "learning_rate": 1.4883520948856658e-06, - "loss": 0.77080828, - "num_input_tokens_seen": 213174875, - "step": 9894, - "time_per_iteration": 2.708012104034424 - }, - { - "auxiliary_loss_clip": 0.01084394, - "auxiliary_loss_mlp": 0.01031071, - "balance_loss_clip": 1.04066491, - "balance_loss_mlp": 1.01860142, - "epoch": 0.5949195851495566, - "flos": 13626771632640.0, - "grad_norm": 1.7370359553625463, - "language_loss": 0.77732074, - "learning_rate": 1.487975602873434e-06, - "loss": 0.79847538, - "num_input_tokens_seen": 213192695, - "step": 9895, - "time_per_iteration": 2.6831347942352295 - }, - { - "auxiliary_loss_clip": 0.01067508, - "auxiliary_loss_mlp": 0.01037328, - "balance_loss_clip": 1.03781974, - "balance_loss_mlp": 1.0233922, - "epoch": 0.5949797084022246, - "flos": 19751012599680.0, - "grad_norm": 1.6095460497638086, - "language_loss": 0.79347014, - "learning_rate": 1.4875991302772182e-06, - "loss": 0.81451851, - "num_input_tokens_seen": 213211195, - "step": 9896, - "time_per_iteration": 2.7621328830718994 - }, - { - "auxiliary_loss_clip": 0.01106477, - "auxiliary_loss_mlp": 0.01035793, - "balance_loss_clip": 1.04062951, - "balance_loss_mlp": 1.02315736, - "epoch": 0.5950398316548925, - "flos": 25773878407680.0, - "grad_norm": 1.5421424712505716, - "language_loss": 0.83955193, - "learning_rate": 1.4872226771112954e-06, - "loss": 0.86097461, - "num_input_tokens_seen": 213231975, - "step": 9897, - "time_per_iteration": 2.7152647972106934 - }, - { - "auxiliary_loss_clip": 0.01092695, - "auxiliary_loss_mlp": 0.01037147, - "balance_loss_clip": 1.04191113, - "balance_loss_mlp": 1.02490425, - "epoch": 0.5950999549075605, - "flos": 23039029716480.0, - "grad_norm": 1.9245000057416703, - "language_loss": 0.70950294, - "learning_rate": 1.486846243389939e-06, - "loss": 0.73080134, - "num_input_tokens_seen": 213249760, - "step": 9898, - "time_per_iteration": 4.332275867462158 - }, - { - "auxiliary_loss_clip": 0.01105674, - "auxiliary_loss_mlp": 0.01044981, - "balance_loss_clip": 1.03863168, - "balance_loss_mlp": 1.02892375, - "epoch": 0.5951600781602284, - "flos": 32446367637120.0, - "grad_norm": 2.443382879492767, - "language_loss": 0.64050412, - "learning_rate": 1.4864698291274251e-06, - "loss": 0.66201067, - "num_input_tokens_seen": 213269890, - "step": 9899, - "time_per_iteration": 2.747209072113037 - }, - { - "auxiliary_loss_clip": 0.01117539, - "auxiliary_loss_mlp": 0.01028742, - "balance_loss_clip": 1.04378319, - "balance_loss_mlp": 1.01740563, - "epoch": 0.5952202014128964, - "flos": 23800874204160.0, - "grad_norm": 1.865552618204713, - "language_loss": 0.71956146, - "learning_rate": 1.4860934343380267e-06, - "loss": 0.74102432, - "num_input_tokens_seen": 213289400, - "step": 9900, - "time_per_iteration": 5.790768146514893 - }, - { - "auxiliary_loss_clip": 0.01114892, - "auxiliary_loss_mlp": 0.01032325, - "balance_loss_clip": 1.04192626, - "balance_loss_mlp": 1.01949835, - "epoch": 0.5952803246655644, - "flos": 22492182084480.0, - "grad_norm": 1.7457638078039162, - "language_loss": 0.84428406, - "learning_rate": 1.4857170590360169e-06, - "loss": 0.86575621, - "num_input_tokens_seen": 213308040, - "step": 9901, - "time_per_iteration": 2.7782936096191406 - }, - { - "auxiliary_loss_clip": 0.00993307, - "auxiliary_loss_mlp": 0.01008976, - "balance_loss_clip": 1.01768923, - "balance_loss_mlp": 1.00779581, - "epoch": 0.5953404479182324, - "flos": 51234688851840.0, - "grad_norm": 0.8002603783256921, - "language_loss": 0.58178693, - "learning_rate": 1.4853407032356674e-06, - "loss": 0.60180974, - "num_input_tokens_seen": 213358585, - "step": 9902, - "time_per_iteration": 3.245389699935913 - }, - { - "auxiliary_loss_clip": 0.01059574, - "auxiliary_loss_mlp": 0.01029206, - "balance_loss_clip": 1.03823233, - "balance_loss_mlp": 1.01596808, - "epoch": 0.5954005711709004, - "flos": 23112682554240.0, - "grad_norm": 2.326170730098328, - "language_loss": 0.77513373, - "learning_rate": 1.4849643669512503e-06, - "loss": 0.79602152, - "num_input_tokens_seen": 213379585, - "step": 9903, - "time_per_iteration": 2.938472032546997 - }, - { - "auxiliary_loss_clip": 0.01080471, - "auxiliary_loss_mlp": 0.01035506, - "balance_loss_clip": 1.04236233, - "balance_loss_mlp": 1.02275109, - "epoch": 0.5954606944235683, - "flos": 35954732736000.0, - "grad_norm": 3.664262182530453, - "language_loss": 0.7767508, - "learning_rate": 1.4845880501970362e-06, - "loss": 0.79791057, - "num_input_tokens_seen": 213401465, - "step": 9904, - "time_per_iteration": 4.397410869598389 - }, - { - "auxiliary_loss_clip": 0.01102001, - "auxiliary_loss_mlp": 0.01038114, - "balance_loss_clip": 1.04016399, - "balance_loss_mlp": 1.02507877, - "epoch": 0.5955208176762363, - "flos": 30443665864320.0, - "grad_norm": 1.9431813333035064, - "language_loss": 0.72943354, - "learning_rate": 1.4842117529872942e-06, - "loss": 0.7508347, - "num_input_tokens_seen": 213422720, - "step": 9905, - "time_per_iteration": 2.7936177253723145 - }, - { - "auxiliary_loss_clip": 0.01109363, - "auxiliary_loss_mlp": 0.01030507, - "balance_loss_clip": 1.04223228, - "balance_loss_mlp": 1.01717925, - "epoch": 0.5955809409289042, - "flos": 17640112083840.0, - "grad_norm": 1.9824269605474862, - "language_loss": 0.70172507, - "learning_rate": 1.483835475336295e-06, - "loss": 0.72312379, - "num_input_tokens_seen": 213439480, - "step": 9906, - "time_per_iteration": 2.6985738277435303 - }, - { - "auxiliary_loss_clip": 0.01106299, - "auxiliary_loss_mlp": 0.01032912, - "balance_loss_clip": 1.04149914, - "balance_loss_mlp": 1.01987052, - "epoch": 0.5956410641815723, - "flos": 24279887001600.0, - "grad_norm": 1.8692952809001842, - "language_loss": 0.75197554, - "learning_rate": 1.4834592172583057e-06, - "loss": 0.77336764, - "num_input_tokens_seen": 213458895, - "step": 9907, - "time_per_iteration": 2.6980481147766113 - }, - { - "auxiliary_loss_clip": 0.01088924, - "auxiliary_loss_mlp": 0.01032034, - "balance_loss_clip": 1.03741193, - "balance_loss_mlp": 1.0194813, - "epoch": 0.5957011874342402, - "flos": 35734277013120.0, - "grad_norm": 1.635771489703633, - "language_loss": 0.67245162, - "learning_rate": 1.483082978767595e-06, - "loss": 0.69366121, - "num_input_tokens_seen": 213481730, - "step": 9908, - "time_per_iteration": 2.7698655128479004 - }, - { - "auxiliary_loss_clip": 0.01040116, - "auxiliary_loss_mlp": 0.01031975, - "balance_loss_clip": 1.03187275, - "balance_loss_mlp": 1.0195055, - "epoch": 0.5957613106869082, - "flos": 21245004005760.0, - "grad_norm": 1.9181869047737456, - "language_loss": 0.76516539, - "learning_rate": 1.4827067598784298e-06, - "loss": 0.78588629, - "num_input_tokens_seen": 213497225, - "step": 9909, - "time_per_iteration": 2.8098058700561523 - }, - { - "auxiliary_loss_clip": 0.0103764, - "auxiliary_loss_mlp": 0.01004774, - "balance_loss_clip": 1.01340699, - "balance_loss_mlp": 1.00373673, - "epoch": 0.5958214339395761, - "flos": 65940969876480.0, - "grad_norm": 0.9280508663350204, - "language_loss": 0.73383075, - "learning_rate": 1.4823305606050753e-06, - "loss": 0.75425494, - "num_input_tokens_seen": 213556890, - "step": 9910, - "time_per_iteration": 3.228283166885376 - }, - { - "auxiliary_loss_clip": 0.0109102, - "auxiliary_loss_mlp": 0.01035168, - "balance_loss_clip": 1.03882253, - "balance_loss_mlp": 1.02188206, - "epoch": 0.5958815571922441, - "flos": 23218690567680.0, - "grad_norm": 2.4798653486938544, - "language_loss": 0.69676727, - "learning_rate": 1.481954380961799e-06, - "loss": 0.71802914, - "num_input_tokens_seen": 213575800, - "step": 9911, - "time_per_iteration": 2.6699378490448 - }, - { - "auxiliary_loss_clip": 0.01116036, - "auxiliary_loss_mlp": 0.01033392, - "balance_loss_clip": 1.04485154, - "balance_loss_mlp": 1.01942098, - "epoch": 0.595941680444912, - "flos": 16538623568640.0, - "grad_norm": 1.9669774674890577, - "language_loss": 0.65873277, - "learning_rate": 1.4815782209628631e-06, - "loss": 0.68022704, - "num_input_tokens_seen": 213592740, - "step": 9912, - "time_per_iteration": 2.642876386642456 - }, - { - "auxiliary_loss_clip": 0.0108881, - "auxiliary_loss_mlp": 0.01037146, - "balance_loss_clip": 1.04177618, - "balance_loss_mlp": 1.02360988, - "epoch": 0.59600180369758, - "flos": 27818883423360.0, - "grad_norm": 1.9028573243158677, - "language_loss": 0.73863906, - "learning_rate": 1.4812020806225337e-06, - "loss": 0.7598986, - "num_input_tokens_seen": 213611970, - "step": 9913, - "time_per_iteration": 2.860369920730591 - }, - { - "auxiliary_loss_clip": 0.01083137, - "auxiliary_loss_mlp": 0.00770309, - "balance_loss_clip": 1.03919995, - "balance_loss_mlp": 1.0000217, - "epoch": 0.596061926950248, - "flos": 29491566013440.0, - "grad_norm": 2.1966155200103907, - "language_loss": 0.79778421, - "learning_rate": 1.4808259599550738e-06, - "loss": 0.81631863, - "num_input_tokens_seen": 213632230, - "step": 9914, - "time_per_iteration": 2.790907382965088 - }, - { - "auxiliary_loss_clip": 0.01079867, - "auxiliary_loss_mlp": 0.01029281, - "balance_loss_clip": 1.03796613, - "balance_loss_mlp": 1.01610804, - "epoch": 0.596122050202916, - "flos": 16836790366080.0, - "grad_norm": 1.724717360749454, - "language_loss": 0.67540228, - "learning_rate": 1.4804498589747448e-06, - "loss": 0.69649374, - "num_input_tokens_seen": 213649645, - "step": 9915, - "time_per_iteration": 2.701197385787964 - }, - { - "auxiliary_loss_clip": 0.01088406, - "auxiliary_loss_mlp": 0.01030943, - "balance_loss_clip": 1.03837395, - "balance_loss_mlp": 1.0187242, - "epoch": 0.596182173455584, - "flos": 20996646393600.0, - "grad_norm": 1.462048268018942, - "language_loss": 0.78788066, - "learning_rate": 1.4800737776958095e-06, - "loss": 0.8090741, - "num_input_tokens_seen": 213668850, - "step": 9916, - "time_per_iteration": 2.7466511726379395 - }, - { - "auxiliary_loss_clip": 0.01093274, - "auxiliary_loss_mlp": 0.01031597, - "balance_loss_clip": 1.03742838, - "balance_loss_mlp": 1.01851332, - "epoch": 0.5962422967082519, - "flos": 16065680169600.0, - "grad_norm": 1.8319257164110343, - "language_loss": 0.8272475, - "learning_rate": 1.4796977161325286e-06, - "loss": 0.84849626, - "num_input_tokens_seen": 213685695, - "step": 9917, - "time_per_iteration": 2.6762564182281494 - }, - { - "auxiliary_loss_clip": 0.01090404, - "auxiliary_loss_mlp": 0.01034476, - "balance_loss_clip": 1.04083288, - "balance_loss_mlp": 1.02195954, - "epoch": 0.5963024199609199, - "flos": 12166966995840.0, - "grad_norm": 1.8036319058685593, - "language_loss": 0.76979315, - "learning_rate": 1.4793216742991625e-06, - "loss": 0.79104197, - "num_input_tokens_seen": 213703515, - "step": 9918, - "time_per_iteration": 2.707718849182129 - }, - { - "auxiliary_loss_clip": 0.01108865, - "auxiliary_loss_mlp": 0.01038575, - "balance_loss_clip": 1.04414129, - "balance_loss_mlp": 1.02538431, - "epoch": 0.5963625432135878, - "flos": 28074280101120.0, - "grad_norm": 2.6956936924639012, - "language_loss": 0.78955698, - "learning_rate": 1.4789456522099707e-06, - "loss": 0.8110314, - "num_input_tokens_seen": 213724170, - "step": 9919, - "time_per_iteration": 2.732933759689331 - }, - { - "auxiliary_loss_clip": 0.01091105, - "auxiliary_loss_mlp": 0.01037217, - "balance_loss_clip": 1.04111147, - "balance_loss_mlp": 1.02323401, - "epoch": 0.5964226664662559, - "flos": 19860324664320.0, - "grad_norm": 1.8773735409019414, - "language_loss": 0.77863061, - "learning_rate": 1.4785696498792122e-06, - "loss": 0.79991376, - "num_input_tokens_seen": 213740620, - "step": 9920, - "time_per_iteration": 2.6758365631103516 - }, - { - "auxiliary_loss_clip": 0.01105504, - "auxiliary_loss_mlp": 0.01037014, - "balance_loss_clip": 1.04226005, - "balance_loss_mlp": 1.02303123, - "epoch": 0.5964827897189238, - "flos": 12932618325120.0, - "grad_norm": 2.199993791667526, - "language_loss": 0.82559252, - "learning_rate": 1.4781936673211446e-06, - "loss": 0.84701777, - "num_input_tokens_seen": 213755390, - "step": 9921, - "time_per_iteration": 2.631972312927246 - }, - { - "auxiliary_loss_clip": 0.0110339, - "auxiliary_loss_mlp": 0.01032591, - "balance_loss_clip": 1.0396421, - "balance_loss_mlp": 1.01888192, - "epoch": 0.5965429129715918, - "flos": 18150797698560.0, - "grad_norm": 3.5044992121063103, - "language_loss": 0.80699342, - "learning_rate": 1.4778177045500252e-06, - "loss": 0.82835329, - "num_input_tokens_seen": 213773225, - "step": 9922, - "time_per_iteration": 2.646479606628418 - }, - { - "auxiliary_loss_clip": 0.01107944, - "auxiliary_loss_mlp": 0.00770214, - "balance_loss_clip": 1.04096532, - "balance_loss_mlp": 1.000036, - "epoch": 0.5966030362242597, - "flos": 21763231476480.0, - "grad_norm": 1.7423002236659255, - "language_loss": 0.77125442, - "learning_rate": 1.477441761580111e-06, - "loss": 0.79003608, - "num_input_tokens_seen": 213791860, - "step": 9923, - "time_per_iteration": 2.646597385406494 - }, - { - "auxiliary_loss_clip": 0.01105997, - "auxiliary_loss_mlp": 0.01038842, - "balance_loss_clip": 1.04343677, - "balance_loss_mlp": 1.02382815, - "epoch": 0.5966631594769277, - "flos": 18807208790400.0, - "grad_norm": 1.7872252192325138, - "language_loss": 0.76111019, - "learning_rate": 1.4770658384256573e-06, - "loss": 0.78255856, - "num_input_tokens_seen": 213809455, - "step": 9924, - "time_per_iteration": 2.784302234649658 - }, - { - "auxiliary_loss_clip": 0.01098024, - "auxiliary_loss_mlp": 0.0103727, - "balance_loss_clip": 1.03841281, - "balance_loss_mlp": 1.02270854, - "epoch": 0.5967232827295956, - "flos": 14064163545600.0, - "grad_norm": 2.5918588496554222, - "language_loss": 0.66627729, - "learning_rate": 1.4766899351009204e-06, - "loss": 0.6876303, - "num_input_tokens_seen": 213826615, - "step": 9925, - "time_per_iteration": 2.6964471340179443 - }, - { - "auxiliary_loss_clip": 0.01088743, - "auxiliary_loss_mlp": 0.01035345, - "balance_loss_clip": 1.04202008, - "balance_loss_mlp": 1.0219934, - "epoch": 0.5967834059822636, - "flos": 17238235743360.0, - "grad_norm": 2.607968523577736, - "language_loss": 0.71629661, - "learning_rate": 1.4763140516201528e-06, - "loss": 0.7375375, - "num_input_tokens_seen": 213844495, - "step": 9926, - "time_per_iteration": 2.739656448364258 - }, - { - "auxiliary_loss_clip": 0.01076071, - "auxiliary_loss_mlp": 0.00771823, - "balance_loss_clip": 1.04067254, - "balance_loss_mlp": 1.0001483, - "epoch": 0.5968435292349316, - "flos": 42520244284800.0, - "grad_norm": 1.798681806501109, - "language_loss": 0.70456839, - "learning_rate": 1.4759381879976088e-06, - "loss": 0.72304738, - "num_input_tokens_seen": 213869125, - "step": 9927, - "time_per_iteration": 2.9877870082855225 - }, - { - "auxiliary_loss_clip": 0.01071922, - "auxiliary_loss_mlp": 0.01029018, - "balance_loss_clip": 1.03775859, - "balance_loss_mlp": 1.01547647, - "epoch": 0.5969036524875996, - "flos": 37630898945280.0, - "grad_norm": 1.7276883821850428, - "language_loss": 0.63847625, - "learning_rate": 1.4755623442475415e-06, - "loss": 0.6594857, - "num_input_tokens_seen": 213891115, - "step": 9928, - "time_per_iteration": 2.889533042907715 - }, - { - "auxiliary_loss_clip": 0.01115406, - "auxiliary_loss_mlp": 0.0103325, - "balance_loss_clip": 1.04134023, - "balance_loss_mlp": 1.02103138, - "epoch": 0.5969637757402676, - "flos": 23148377694720.0, - "grad_norm": 1.6663701476220254, - "language_loss": 0.69803309, - "learning_rate": 1.4751865203842022e-06, - "loss": 0.71951973, - "num_input_tokens_seen": 213911925, - "step": 9929, - "time_per_iteration": 2.6571357250213623 - }, - { - "auxiliary_loss_clip": 0.01073832, - "auxiliary_loss_mlp": 0.01034603, - "balance_loss_clip": 1.04385591, - "balance_loss_mlp": 1.02244925, - "epoch": 0.5970238989929355, - "flos": 24020934877440.0, - "grad_norm": 1.7972325287685906, - "language_loss": 0.76839757, - "learning_rate": 1.4748107164218431e-06, - "loss": 0.78948194, - "num_input_tokens_seen": 213930715, - "step": 9930, - "time_per_iteration": 2.7475857734680176 - }, - { - "auxiliary_loss_clip": 0.0109514, - "auxiliary_loss_mlp": 0.01034752, - "balance_loss_clip": 1.04357862, - "balance_loss_mlp": 1.02017856, - "epoch": 0.5970840222456035, - "flos": 19426883247360.0, - "grad_norm": 1.7574249474808616, - "language_loss": 0.68748617, - "learning_rate": 1.4744349323747146e-06, - "loss": 0.70878506, - "num_input_tokens_seen": 213950015, - "step": 9931, - "time_per_iteration": 2.713695526123047 - }, - { - "auxiliary_loss_clip": 0.01025314, - "auxiliary_loss_mlp": 0.01000381, - "balance_loss_clip": 1.01468325, - "balance_loss_mlp": 0.99920666, - "epoch": 0.5971441454982714, - "flos": 62976615235200.0, - "grad_norm": 0.8553027537300191, - "language_loss": 0.64182514, - "learning_rate": 1.474059168257065e-06, - "loss": 0.66208208, - "num_input_tokens_seen": 214003330, - "step": 9932, - "time_per_iteration": 3.106821060180664 - }, - { - "auxiliary_loss_clip": 0.01084112, - "auxiliary_loss_mlp": 0.01032121, - "balance_loss_clip": 1.03818321, - "balance_loss_mlp": 1.01869833, - "epoch": 0.5972042687509395, - "flos": 20266223328000.0, - "grad_norm": 2.9993889514324463, - "language_loss": 0.73966062, - "learning_rate": 1.4736834240831454e-06, - "loss": 0.76082295, - "num_input_tokens_seen": 214021680, - "step": 9933, - "time_per_iteration": 2.718324899673462 - }, - { - "auxiliary_loss_clip": 0.01028586, - "auxiliary_loss_mlp": 0.01004687, - "balance_loss_clip": 1.02009809, - "balance_loss_mlp": 1.00334597, - "epoch": 0.5972643920036074, - "flos": 71652383832960.0, - "grad_norm": 0.6592973095113355, - "language_loss": 0.52000248, - "learning_rate": 1.473307699867203e-06, - "loss": 0.54033524, - "num_input_tokens_seen": 214090265, - "step": 9934, - "time_per_iteration": 3.265408515930176 - }, - { - "auxiliary_loss_clip": 0.01038691, - "auxiliary_loss_mlp": 0.01008472, - "balance_loss_clip": 1.01466894, - "balance_loss_mlp": 1.00733399, - "epoch": 0.5973245152562754, - "flos": 56892702263040.0, - "grad_norm": 0.8334850866606021, - "language_loss": 0.54153717, - "learning_rate": 1.4729319956234849e-06, - "loss": 0.5620088, - "num_input_tokens_seen": 214146375, - "step": 9935, - "time_per_iteration": 3.07120680809021 - }, - { - "auxiliary_loss_clip": 0.01095451, - "auxiliary_loss_mlp": 0.01033243, - "balance_loss_clip": 1.04008901, - "balance_loss_mlp": 1.01956391, - "epoch": 0.5973846385089433, - "flos": 24164361884160.0, - "grad_norm": 1.5706852760220016, - "language_loss": 0.66061485, - "learning_rate": 1.4725563113662394e-06, - "loss": 0.68190181, - "num_input_tokens_seen": 214165340, - "step": 9936, - "time_per_iteration": 2.724457263946533 - }, - { - "auxiliary_loss_clip": 0.01060903, - "auxiliary_loss_mlp": 0.01035654, - "balance_loss_clip": 1.03609622, - "balance_loss_mlp": 1.02246332, - "epoch": 0.5974447617616113, - "flos": 17670599752320.0, - "grad_norm": 1.9876387260879245, - "language_loss": 0.6771605, - "learning_rate": 1.4721806471097103e-06, - "loss": 0.69812608, - "num_input_tokens_seen": 214181360, - "step": 9937, - "time_per_iteration": 2.75978422164917 - }, - { - "auxiliary_loss_clip": 0.0111018, - "auxiliary_loss_mlp": 0.01032313, - "balance_loss_clip": 1.04208851, - "balance_loss_mlp": 1.01846123, - "epoch": 0.5975048850142792, - "flos": 22892514140160.0, - "grad_norm": 2.408863368051578, - "language_loss": 0.77660179, - "learning_rate": 1.4718050028681442e-06, - "loss": 0.79802668, - "num_input_tokens_seen": 214198525, - "step": 9938, - "time_per_iteration": 4.499311447143555 - }, - { - "auxiliary_loss_clip": 0.01105785, - "auxiliary_loss_mlp": 0.01034855, - "balance_loss_clip": 1.03925014, - "balance_loss_mlp": 1.02100301, - "epoch": 0.5975650082669473, - "flos": 24353108876160.0, - "grad_norm": 1.4410606641316148, - "language_loss": 0.75726342, - "learning_rate": 1.4714293786557855e-06, - "loss": 0.77866983, - "num_input_tokens_seen": 214218710, - "step": 9939, - "time_per_iteration": 4.202291011810303 - }, - { - "auxiliary_loss_clip": 0.01073866, - "auxiliary_loss_mlp": 0.01032947, - "balance_loss_clip": 1.04116249, - "balance_loss_mlp": 1.01718175, - "epoch": 0.5976251315196152, - "flos": 20923352691840.0, - "grad_norm": 4.812638761028828, - "language_loss": 0.68618965, - "learning_rate": 1.471053774486878e-06, - "loss": 0.70725775, - "num_input_tokens_seen": 214237800, - "step": 9940, - "time_per_iteration": 4.418368339538574 - }, - { - "auxiliary_loss_clip": 0.01090139, - "auxiliary_loss_mlp": 0.01036739, - "balance_loss_clip": 1.04158998, - "balance_loss_mlp": 1.02415049, - "epoch": 0.5976852547722832, - "flos": 35844594658560.0, - "grad_norm": 1.3494600203677949, - "language_loss": 0.70370513, - "learning_rate": 1.470678190375664e-06, - "loss": 0.72497392, - "num_input_tokens_seen": 214260355, - "step": 9941, - "time_per_iteration": 2.7807397842407227 - }, - { - "auxiliary_loss_clip": 0.01092498, - "auxiliary_loss_mlp": 0.01034522, - "balance_loss_clip": 1.03824401, - "balance_loss_mlp": 1.02123034, - "epoch": 0.5977453780249512, - "flos": 12855948744960.0, - "grad_norm": 1.9808022638780955, - "language_loss": 0.77407408, - "learning_rate": 1.470302626336386e-06, - "loss": 0.79534429, - "num_input_tokens_seen": 214277120, - "step": 9942, - "time_per_iteration": 2.6881802082061768 - }, - { - "auxiliary_loss_clip": 0.01071168, - "auxiliary_loss_mlp": 0.01037338, - "balance_loss_clip": 1.03963232, - "balance_loss_mlp": 1.02418923, - "epoch": 0.5978055012776191, - "flos": 20959155573120.0, - "grad_norm": 1.9541019064521015, - "language_loss": 0.76172185, - "learning_rate": 1.4699270823832857e-06, - "loss": 0.78280699, - "num_input_tokens_seen": 214295300, - "step": 9943, - "time_per_iteration": 4.4215734004974365 - }, - { - "auxiliary_loss_clip": 0.0105205, - "auxiliary_loss_mlp": 0.01034121, - "balance_loss_clip": 1.03876281, - "balance_loss_mlp": 1.02149105, - "epoch": 0.5978656245302871, - "flos": 34058003063040.0, - "grad_norm": 1.735048648764757, - "language_loss": 0.62473679, - "learning_rate": 1.4695515585306032e-06, - "loss": 0.64559853, - "num_input_tokens_seen": 214317050, - "step": 9944, - "time_per_iteration": 2.8701138496398926 - }, - { - "auxiliary_loss_clip": 0.0109987, - "auxiliary_loss_mlp": 0.0103879, - "balance_loss_clip": 1.04420114, - "balance_loss_mlp": 1.02530718, - "epoch": 0.597925747782955, - "flos": 37373275624320.0, - "grad_norm": 1.7121148929704375, - "language_loss": 0.72442955, - "learning_rate": 1.4691760547925795e-06, - "loss": 0.74581611, - "num_input_tokens_seen": 214337470, - "step": 9945, - "time_per_iteration": 2.7868094444274902 - }, - { - "auxiliary_loss_clip": 0.01063078, - "auxiliary_loss_mlp": 0.01035839, - "balance_loss_clip": 1.03817308, - "balance_loss_mlp": 1.02280903, - "epoch": 0.5979858710356231, - "flos": 25374803328000.0, - "grad_norm": 2.215747344961558, - "language_loss": 0.66905904, - "learning_rate": 1.4688005711834522e-06, - "loss": 0.6900481, - "num_input_tokens_seen": 214357975, - "step": 9946, - "time_per_iteration": 2.83195161819458 - }, - { - "auxiliary_loss_clip": 0.01104512, - "auxiliary_loss_mlp": 0.01042624, - "balance_loss_clip": 1.03969336, - "balance_loss_mlp": 1.0275619, - "epoch": 0.598045994288291, - "flos": 13698413308800.0, - "grad_norm": 1.928704516420183, - "language_loss": 0.88898396, - "learning_rate": 1.468425107717461e-06, - "loss": 0.91045535, - "num_input_tokens_seen": 214374125, - "step": 9947, - "time_per_iteration": 2.5993123054504395 - }, - { - "auxiliary_loss_clip": 0.01112155, - "auxiliary_loss_mlp": 0.01032443, - "balance_loss_clip": 1.04039431, - "balance_loss_mlp": 1.02080822, - "epoch": 0.598106117540959, - "flos": 21981352815360.0, - "grad_norm": 1.8699586676771087, - "language_loss": 0.72236538, - "learning_rate": 1.4680496644088432e-06, - "loss": 0.74381137, - "num_input_tokens_seen": 214393395, - "step": 9948, - "time_per_iteration": 2.6766860485076904 - }, - { - "auxiliary_loss_clip": 0.01093809, - "auxiliary_loss_mlp": 0.01035478, - "balance_loss_clip": 1.03969812, - "balance_loss_mlp": 1.02129257, - "epoch": 0.5981662407936269, - "flos": 20559362221440.0, - "grad_norm": 1.8848269321833362, - "language_loss": 0.89223683, - "learning_rate": 1.4676742412718347e-06, - "loss": 0.91352975, - "num_input_tokens_seen": 214411550, - "step": 9949, - "time_per_iteration": 2.731804370880127 - }, - { - "auxiliary_loss_clip": 0.01105698, - "auxiliary_loss_mlp": 0.01030419, - "balance_loss_clip": 1.0420059, - "balance_loss_mlp": 1.01814604, - "epoch": 0.5982263640462949, - "flos": 14063840323200.0, - "grad_norm": 2.0634992965968917, - "language_loss": 0.70250058, - "learning_rate": 1.467298838320673e-06, - "loss": 0.72386169, - "num_input_tokens_seen": 214429780, - "step": 9950, - "time_per_iteration": 2.666879415512085 - }, - { - "auxiliary_loss_clip": 0.01103442, - "auxiliary_loss_mlp": 0.01031809, - "balance_loss_clip": 1.0392406, - "balance_loss_mlp": 1.01904809, - "epoch": 0.5982864872989628, - "flos": 17707228646400.0, - "grad_norm": 1.610292824709656, - "language_loss": 0.78345191, - "learning_rate": 1.4669234555695921e-06, - "loss": 0.80480444, - "num_input_tokens_seen": 214447775, - "step": 9951, - "time_per_iteration": 2.624361753463745 - }, - { - "auxiliary_loss_clip": 0.01096152, - "auxiliary_loss_mlp": 0.01038536, - "balance_loss_clip": 1.0411104, - "balance_loss_mlp": 1.02471995, - "epoch": 0.5983466105516309, - "flos": 16764789553920.0, - "grad_norm": 1.4677439185999286, - "language_loss": 0.73951542, - "learning_rate": 1.4665480930328275e-06, - "loss": 0.76086229, - "num_input_tokens_seen": 214467245, - "step": 9952, - "time_per_iteration": 2.780212640762329 - }, - { - "auxiliary_loss_clip": 0.01097597, - "auxiliary_loss_mlp": 0.00771764, - "balance_loss_clip": 1.04058945, - "balance_loss_mlp": 1.0000577, - "epoch": 0.5984067338042988, - "flos": 20042714949120.0, - "grad_norm": 2.0876696722134493, - "language_loss": 0.79496032, - "learning_rate": 1.466172750724613e-06, - "loss": 0.81365395, - "num_input_tokens_seen": 214484385, - "step": 9953, - "time_per_iteration": 2.6629557609558105 - }, - { - "auxiliary_loss_clip": 0.01088175, - "auxiliary_loss_mlp": 0.01034264, - "balance_loss_clip": 1.04368794, - "balance_loss_mlp": 1.02172363, - "epoch": 0.5984668570569668, - "flos": 26319900026880.0, - "grad_norm": 1.571611875852805, - "language_loss": 0.69577867, - "learning_rate": 1.4657974286591807e-06, - "loss": 0.71700311, - "num_input_tokens_seen": 214503465, - "step": 9954, - "time_per_iteration": 2.772745132446289 - }, - { - "auxiliary_loss_clip": 0.01092663, - "auxiliary_loss_mlp": 0.0103537, - "balance_loss_clip": 1.03927422, - "balance_loss_mlp": 1.02299023, - "epoch": 0.5985269803096348, - "flos": 20593728558720.0, - "grad_norm": 1.8709505635033254, - "language_loss": 0.73055756, - "learning_rate": 1.4654221268507637e-06, - "loss": 0.75183785, - "num_input_tokens_seen": 214520725, - "step": 9955, - "time_per_iteration": 2.6827971935272217 - }, - { - "auxiliary_loss_clip": 0.01118308, - "auxiliary_loss_mlp": 0.01034246, - "balance_loss_clip": 1.04205883, - "balance_loss_mlp": 1.0209837, - "epoch": 0.5985871035623027, - "flos": 26865382942080.0, - "grad_norm": 1.5476020192092728, - "language_loss": 0.68627518, - "learning_rate": 1.4650468453135934e-06, - "loss": 0.70780075, - "num_input_tokens_seen": 214540675, - "step": 9956, - "time_per_iteration": 2.6055126190185547 - }, - { - "auxiliary_loss_clip": 0.01120333, - "auxiliary_loss_mlp": 0.01033532, - "balance_loss_clip": 1.0435667, - "balance_loss_mlp": 1.02041864, - "epoch": 0.5986472268149707, - "flos": 19609704495360.0, - "grad_norm": 5.767015828905461, - "language_loss": 0.74026513, - "learning_rate": 1.4646715840618999e-06, - "loss": 0.76180387, - "num_input_tokens_seen": 214559910, - "step": 9957, - "time_per_iteration": 2.670759677886963 - }, - { - "auxiliary_loss_clip": 0.01082315, - "auxiliary_loss_mlp": 0.01029692, - "balance_loss_clip": 1.04125023, - "balance_loss_mlp": 1.01696002, - "epoch": 0.5987073500676386, - "flos": 21794616984960.0, - "grad_norm": 2.0517993540808157, - "language_loss": 0.84612942, - "learning_rate": 1.4642963431099138e-06, - "loss": 0.86724949, - "num_input_tokens_seen": 214575960, - "step": 9958, - "time_per_iteration": 2.710693597793579 - }, - { - "auxiliary_loss_clip": 0.01088695, - "auxiliary_loss_mlp": 0.00771117, - "balance_loss_clip": 1.04130435, - "balance_loss_mlp": 1.00005364, - "epoch": 0.5987674733203067, - "flos": 24314361079680.0, - "grad_norm": 1.9589439151063424, - "language_loss": 0.6649909, - "learning_rate": 1.463921122471864e-06, - "loss": 0.68358904, - "num_input_tokens_seen": 214594230, - "step": 9959, - "time_per_iteration": 2.7052528858184814 - }, - { - "auxiliary_loss_clip": 0.0110604, - "auxiliary_loss_mlp": 0.01031714, - "balance_loss_clip": 1.04048181, - "balance_loss_mlp": 1.01915514, - "epoch": 0.5988275965729746, - "flos": 21320201128320.0, - "grad_norm": 1.6803724665796522, - "language_loss": 0.83453488, - "learning_rate": 1.4635459221619796e-06, - "loss": 0.85591239, - "num_input_tokens_seen": 214613130, - "step": 9960, - "time_per_iteration": 2.698373317718506 - }, - { - "auxiliary_loss_clip": 0.0110105, - "auxiliary_loss_mlp": 0.01026917, - "balance_loss_clip": 1.04384398, - "balance_loss_mlp": 1.01451361, - "epoch": 0.5988877198256426, - "flos": 25118041933440.0, - "grad_norm": 1.4637618649833892, - "language_loss": 0.79449862, - "learning_rate": 1.4631707421944868e-06, - "loss": 0.81577832, - "num_input_tokens_seen": 214634470, - "step": 9961, - "time_per_iteration": 2.763143539428711 - }, - { - "auxiliary_loss_clip": 0.01115923, - "auxiliary_loss_mlp": 0.01034214, - "balance_loss_clip": 1.04150534, - "balance_loss_mlp": 1.02107751, - "epoch": 0.5989478430783105, - "flos": 26429104350720.0, - "grad_norm": 1.7720947984672266, - "language_loss": 0.66938126, - "learning_rate": 1.4627955825836136e-06, - "loss": 0.69088268, - "num_input_tokens_seen": 214654030, - "step": 9962, - "time_per_iteration": 2.6398210525512695 - }, - { - "auxiliary_loss_clip": 0.01100963, - "auxiliary_loss_mlp": 0.01040353, - "balance_loss_clip": 1.03867447, - "balance_loss_mlp": 1.02583313, - "epoch": 0.5990079663309785, - "flos": 25778439434880.0, - "grad_norm": 1.3371562951805418, - "language_loss": 0.74043596, - "learning_rate": 1.4624204433435857e-06, - "loss": 0.76184916, - "num_input_tokens_seen": 214676985, - "step": 9963, - "time_per_iteration": 2.716456651687622 - }, - { - "auxiliary_loss_clip": 0.01105789, - "auxiliary_loss_mlp": 0.010335, - "balance_loss_clip": 1.04120398, - "balance_loss_mlp": 1.02003562, - "epoch": 0.5990680895836464, - "flos": 36831779118720.0, - "grad_norm": 1.8119605341465645, - "language_loss": 0.68010569, - "learning_rate": 1.4620453244886281e-06, - "loss": 0.70149863, - "num_input_tokens_seen": 214700105, - "step": 9964, - "time_per_iteration": 2.764112710952759 - }, - { - "auxiliary_loss_clip": 0.01082495, - "auxiliary_loss_mlp": 0.01029274, - "balance_loss_clip": 1.04189765, - "balance_loss_mlp": 1.0158987, - "epoch": 0.5991282128363145, - "flos": 24133550993280.0, - "grad_norm": 1.838028773427246, - "language_loss": 0.76536453, - "learning_rate": 1.4616702260329662e-06, - "loss": 0.78648221, - "num_input_tokens_seen": 214717885, - "step": 9965, - "time_per_iteration": 2.6872916221618652 - }, - { - "auxiliary_loss_clip": 0.01100107, - "auxiliary_loss_mlp": 0.01029644, - "balance_loss_clip": 1.03997707, - "balance_loss_mlp": 1.01664448, - "epoch": 0.5991883360889824, - "flos": 10304064956160.0, - "grad_norm": 1.881941118756219, - "language_loss": 0.77352554, - "learning_rate": 1.4612951479908229e-06, - "loss": 0.79482305, - "num_input_tokens_seen": 214733680, - "step": 9966, - "time_per_iteration": 2.645473003387451 - }, - { - "auxiliary_loss_clip": 0.01080024, - "auxiliary_loss_mlp": 0.01029432, - "balance_loss_clip": 1.04003799, - "balance_loss_mlp": 1.01742721, - "epoch": 0.5992484593416504, - "flos": 23951196622080.0, - "grad_norm": 1.4675663731632993, - "language_loss": 0.73089266, - "learning_rate": 1.460920090376422e-06, - "loss": 0.75198722, - "num_input_tokens_seen": 214753285, - "step": 9967, - "time_per_iteration": 2.7043392658233643 - }, - { - "auxiliary_loss_clip": 0.0111042, - "auxiliary_loss_mlp": 0.01035802, - "balance_loss_clip": 1.04168642, - "balance_loss_mlp": 1.02200305, - "epoch": 0.5993085825943184, - "flos": 11944105061760.0, - "grad_norm": 2.0432757361111724, - "language_loss": 0.68492925, - "learning_rate": 1.4605450532039847e-06, - "loss": 0.70639145, - "num_input_tokens_seen": 214767810, - "step": 9968, - "time_per_iteration": 2.618802070617676 - }, - { - "auxiliary_loss_clip": 0.01104497, - "auxiliary_loss_mlp": 0.01037187, - "balance_loss_clip": 1.03805614, - "balance_loss_mlp": 1.02315605, - "epoch": 0.5993687058469863, - "flos": 19026838500480.0, - "grad_norm": 1.5933947258371375, - "language_loss": 0.79251635, - "learning_rate": 1.4601700364877334e-06, - "loss": 0.81393319, - "num_input_tokens_seen": 214786040, - "step": 9969, - "time_per_iteration": 2.6758008003234863 - }, - { - "auxiliary_loss_clip": 0.01100647, - "auxiliary_loss_mlp": 0.01031223, - "balance_loss_clip": 1.03998137, - "balance_loss_mlp": 1.0176506, - "epoch": 0.5994288290996543, - "flos": 14282967242880.0, - "grad_norm": 1.6601112189929519, - "language_loss": 0.80936122, - "learning_rate": 1.4597950402418889e-06, - "loss": 0.83067989, - "num_input_tokens_seen": 214803110, - "step": 9970, - "time_per_iteration": 2.7434401512145996 - }, - { - "auxiliary_loss_clip": 0.01064445, - "auxiliary_loss_mlp": 0.01044271, - "balance_loss_clip": 1.0378437, - "balance_loss_mlp": 1.02879751, - "epoch": 0.5994889523523222, - "flos": 19206643006080.0, - "grad_norm": 2.015109530583561, - "language_loss": 0.61666113, - "learning_rate": 1.4594200644806697e-06, - "loss": 0.6377483, - "num_input_tokens_seen": 214819945, - "step": 9971, - "time_per_iteration": 2.6593470573425293 - }, - { - "auxiliary_loss_clip": 0.01112816, - "auxiliary_loss_mlp": 0.01033245, - "balance_loss_clip": 1.04096997, - "balance_loss_mlp": 1.02065659, - "epoch": 0.5995490756049903, - "flos": 28037040675840.0, - "grad_norm": 1.7466561522631148, - "language_loss": 0.79054534, - "learning_rate": 1.4590451092182962e-06, - "loss": 0.81200594, - "num_input_tokens_seen": 214838810, - "step": 9972, - "time_per_iteration": 2.657733917236328 - }, - { - "auxiliary_loss_clip": 0.01077287, - "auxiliary_loss_mlp": 0.0103561, - "balance_loss_clip": 1.03948355, - "balance_loss_mlp": 1.0220139, - "epoch": 0.5996091988576582, - "flos": 29052953038080.0, - "grad_norm": 2.7295276371688657, - "language_loss": 0.76414442, - "learning_rate": 1.4586701744689864e-06, - "loss": 0.78527337, - "num_input_tokens_seen": 214857040, - "step": 9973, - "time_per_iteration": 2.804370880126953 - }, - { - "auxiliary_loss_clip": 0.01080222, - "auxiliary_loss_mlp": 0.01031483, - "balance_loss_clip": 1.03798461, - "balance_loss_mlp": 1.01820862, - "epoch": 0.5996693221103262, - "flos": 20813968800000.0, - "grad_norm": 2.687412315258338, - "language_loss": 0.65429473, - "learning_rate": 1.4582952602469578e-06, - "loss": 0.6754117, - "num_input_tokens_seen": 214873375, - "step": 9974, - "time_per_iteration": 2.7193095684051514 - }, - { - "auxiliary_loss_clip": 0.01106109, - "auxiliary_loss_mlp": 0.01032556, - "balance_loss_clip": 1.0399034, - "balance_loss_mlp": 1.01984227, - "epoch": 0.5997294453629941, - "flos": 23768914078080.0, - "grad_norm": 1.3699302504221633, - "language_loss": 0.74378854, - "learning_rate": 1.457920366566428e-06, - "loss": 0.76517522, - "num_input_tokens_seen": 214893900, - "step": 9975, - "time_per_iteration": 2.6727962493896484 - }, - { - "auxiliary_loss_clip": 0.01117306, - "auxiliary_loss_mlp": 0.01031631, - "balance_loss_clip": 1.04184341, - "balance_loss_mlp": 1.01771951, - "epoch": 0.5997895686156621, - "flos": 20960017499520.0, - "grad_norm": 1.8689128111534072, - "language_loss": 0.77081978, - "learning_rate": 1.457545493441611e-06, - "loss": 0.79230917, - "num_input_tokens_seen": 214912110, - "step": 9976, - "time_per_iteration": 2.5855295658111572 - }, - { - "auxiliary_loss_clip": 0.01101132, - "auxiliary_loss_mlp": 0.01036343, - "balance_loss_clip": 1.04325271, - "balance_loss_mlp": 1.0225029, - "epoch": 0.59984969186833, - "flos": 28365443746560.0, - "grad_norm": 2.489782776024688, - "language_loss": 0.74998355, - "learning_rate": 1.4571706408867237e-06, - "loss": 0.77135837, - "num_input_tokens_seen": 214930140, - "step": 9977, - "time_per_iteration": 4.355423212051392 - }, - { - "auxiliary_loss_clip": 0.01081083, - "auxiliary_loss_mlp": 0.01029688, - "balance_loss_clip": 1.03771675, - "balance_loss_mlp": 1.01639032, - "epoch": 0.5999098151209981, - "flos": 22565906749440.0, - "grad_norm": 1.7961745328309484, - "language_loss": 0.69053113, - "learning_rate": 1.4567958089159802e-06, - "loss": 0.71163881, - "num_input_tokens_seen": 214949200, - "step": 9978, - "time_per_iteration": 2.687735080718994 - }, - { - "auxiliary_loss_clip": 0.01124045, - "auxiliary_loss_mlp": 0.01035056, - "balance_loss_clip": 1.04541636, - "balance_loss_mlp": 1.02081037, - "epoch": 0.599969938373666, - "flos": 18768712389120.0, - "grad_norm": 1.9378111201967976, - "language_loss": 0.81427479, - "learning_rate": 1.456420997543594e-06, - "loss": 0.8358658, - "num_input_tokens_seen": 214965775, - "step": 9979, - "time_per_iteration": 5.60455322265625 - }, - { - "auxiliary_loss_clip": 0.01113469, - "auxiliary_loss_mlp": 0.0103294, - "balance_loss_clip": 1.04139137, - "balance_loss_mlp": 1.02011895, - "epoch": 0.600030061626334, - "flos": 11327231865600.0, - "grad_norm": 2.0199004568827577, - "language_loss": 0.70054936, - "learning_rate": 1.4560462067837782e-06, - "loss": 0.72201335, - "num_input_tokens_seen": 214982480, - "step": 9980, - "time_per_iteration": 2.5815303325653076 - }, - { - "auxiliary_loss_clip": 0.01105293, - "auxiliary_loss_mlp": 0.01032543, - "balance_loss_clip": 1.03971553, - "balance_loss_mlp": 1.01786244, - "epoch": 0.600090184879002, - "flos": 16578664254720.0, - "grad_norm": 2.2746227330860327, - "language_loss": 0.686566, - "learning_rate": 1.4556714366507445e-06, - "loss": 0.70794439, - "num_input_tokens_seen": 214998110, - "step": 9981, - "time_per_iteration": 2.635133743286133 - }, - { - "auxiliary_loss_clip": 0.01106547, - "auxiliary_loss_mlp": 0.01036545, - "balance_loss_clip": 1.04316497, - "balance_loss_mlp": 1.02458215, - "epoch": 0.6001503081316699, - "flos": 23618627573760.0, - "grad_norm": 1.8281310539755133, - "language_loss": 0.78525096, - "learning_rate": 1.4552966871587048e-06, - "loss": 0.80668187, - "num_input_tokens_seen": 215017995, - "step": 9982, - "time_per_iteration": 4.6227052211761475 - }, - { - "auxiliary_loss_clip": 0.01066865, - "auxiliary_loss_mlp": 0.01043371, - "balance_loss_clip": 1.03895831, - "balance_loss_mlp": 1.02730179, - "epoch": 0.6002104313843379, - "flos": 20667668705280.0, - "grad_norm": 1.558592797835216, - "language_loss": 0.73127562, - "learning_rate": 1.4549219583218686e-06, - "loss": 0.75237799, - "num_input_tokens_seen": 215038285, - "step": 9983, - "time_per_iteration": 2.851017951965332 - }, - { - "auxiliary_loss_clip": 0.01075266, - "auxiliary_loss_mlp": 0.01033335, - "balance_loss_clip": 1.03699243, - "balance_loss_mlp": 1.01962018, - "epoch": 0.6002705546370058, - "flos": 22455229968000.0, - "grad_norm": 4.7484025968689325, - "language_loss": 0.78227878, - "learning_rate": 1.454547250154447e-06, - "loss": 0.80336481, - "num_input_tokens_seen": 215057825, - "step": 9984, - "time_per_iteration": 2.6935315132141113 - }, - { - "auxiliary_loss_clip": 0.01109117, - "auxiliary_loss_mlp": 0.01035425, - "balance_loss_clip": 1.04397178, - "balance_loss_mlp": 1.02215743, - "epoch": 0.6003306778896739, - "flos": 25191982080000.0, - "grad_norm": 1.729800567101094, - "language_loss": 0.83458543, - "learning_rate": 1.4541725626706485e-06, - "loss": 0.85603082, - "num_input_tokens_seen": 215077790, - "step": 9985, - "time_per_iteration": 2.7772903442382812 - }, - { - "auxiliary_loss_clip": 0.01106318, - "auxiliary_loss_mlp": 0.01039651, - "balance_loss_clip": 1.04176068, - "balance_loss_mlp": 1.02729487, - "epoch": 0.6003908011423418, - "flos": 26687733252480.0, - "grad_norm": 2.2153021552569956, - "language_loss": 0.71093589, - "learning_rate": 1.4537978958846809e-06, - "loss": 0.73239559, - "num_input_tokens_seen": 215097650, - "step": 9986, - "time_per_iteration": 2.794067859649658 - }, - { - "auxiliary_loss_clip": 0.0112089, - "auxiliary_loss_mlp": 0.00771497, - "balance_loss_clip": 1.04465151, - "balance_loss_mlp": 1.00010371, - "epoch": 0.6004509243950098, - "flos": 22565080736640.0, - "grad_norm": 1.3997582574427474, - "language_loss": 0.71425599, - "learning_rate": 1.4534232498107514e-06, - "loss": 0.73317981, - "num_input_tokens_seen": 215118235, - "step": 9987, - "time_per_iteration": 2.689911365509033 - }, - { - "auxiliary_loss_clip": 0.01096945, - "auxiliary_loss_mlp": 0.01039105, - "balance_loss_clip": 1.04330432, - "balance_loss_mlp": 1.02589071, - "epoch": 0.6005110476476777, - "flos": 19719303868800.0, - "grad_norm": 1.7371829608849618, - "language_loss": 0.84939432, - "learning_rate": 1.4530486244630673e-06, - "loss": 0.87075484, - "num_input_tokens_seen": 215136755, - "step": 9988, - "time_per_iteration": 2.7220449447631836 - }, - { - "auxiliary_loss_clip": 0.01108518, - "auxiliary_loss_mlp": 0.01035377, - "balance_loss_clip": 1.04211533, - "balance_loss_mlp": 1.02187085, - "epoch": 0.6005711709003457, - "flos": 17712543859200.0, - "grad_norm": 1.6453818743399957, - "language_loss": 0.65595025, - "learning_rate": 1.4526740198558346e-06, - "loss": 0.6773892, - "num_input_tokens_seen": 215155225, - "step": 9989, - "time_per_iteration": 2.708707809448242 - }, - { - "auxiliary_loss_clip": 0.0110487, - "auxiliary_loss_mlp": 0.01034775, - "balance_loss_clip": 1.04078543, - "balance_loss_mlp": 1.02239513, - "epoch": 0.6006312941530136, - "flos": 18514464946560.0, - "grad_norm": 1.5276583445435046, - "language_loss": 0.8036738, - "learning_rate": 1.452299436003257e-06, - "loss": 0.82507026, - "num_input_tokens_seen": 215174815, - "step": 9990, - "time_per_iteration": 2.6760056018829346 - }, - { - "auxiliary_loss_clip": 0.0107479, - "auxiliary_loss_mlp": 0.01031552, - "balance_loss_clip": 1.03909266, - "balance_loss_mlp": 1.01817632, - "epoch": 0.6006914174056817, - "flos": 21390837223680.0, - "grad_norm": 2.0016484487093833, - "language_loss": 0.8290872, - "learning_rate": 1.4519248729195403e-06, - "loss": 0.85015059, - "num_input_tokens_seen": 215192045, - "step": 9991, - "time_per_iteration": 2.6902015209198 - }, - { - "auxiliary_loss_clip": 0.01062355, - "auxiliary_loss_mlp": 0.01042556, - "balance_loss_clip": 1.03686535, - "balance_loss_mlp": 1.02867436, - "epoch": 0.6007515406583496, - "flos": 12750515349120.0, - "grad_norm": 1.9693626562875086, - "language_loss": 0.82834661, - "learning_rate": 1.4515503306188878e-06, - "loss": 0.84939575, - "num_input_tokens_seen": 215209885, - "step": 9992, - "time_per_iteration": 2.750401496887207 - }, - { - "auxiliary_loss_clip": 0.01095422, - "auxiliary_loss_mlp": 0.00771119, - "balance_loss_clip": 1.04209352, - "balance_loss_mlp": 1.0001328, - "epoch": 0.6008116639110176, - "flos": 19206894401280.0, - "grad_norm": 1.855753675619843, - "language_loss": 0.66424763, - "learning_rate": 1.4511758091155008e-06, - "loss": 0.68291306, - "num_input_tokens_seen": 215228150, - "step": 9993, - "time_per_iteration": 2.664606809616089 - }, - { - "auxiliary_loss_clip": 0.01080718, - "auxiliary_loss_mlp": 0.01034631, - "balance_loss_clip": 1.03863966, - "balance_loss_mlp": 1.02051032, - "epoch": 0.6008717871636855, - "flos": 17055342668160.0, - "grad_norm": 2.4957386160129182, - "language_loss": 0.80870563, - "learning_rate": 1.4508013084235826e-06, - "loss": 0.82985908, - "num_input_tokens_seen": 215243755, - "step": 9994, - "time_per_iteration": 2.640841007232666 - }, - { - "auxiliary_loss_clip": 0.01071985, - "auxiliary_loss_mlp": 0.01029355, - "balance_loss_clip": 1.03745914, - "balance_loss_mlp": 1.01653457, - "epoch": 0.6009319104163535, - "flos": 20298686244480.0, - "grad_norm": 1.874968253383489, - "language_loss": 0.72665036, - "learning_rate": 1.4504268285573337e-06, - "loss": 0.7476638, - "num_input_tokens_seen": 215262130, - "step": 9995, - "time_per_iteration": 2.694720506668091 - }, - { - "auxiliary_loss_clip": 0.01094635, - "auxiliary_loss_mlp": 0.01038473, - "balance_loss_clip": 1.03786469, - "balance_loss_mlp": 1.02479935, - "epoch": 0.6009920336690215, - "flos": 21836776573440.0, - "grad_norm": 1.6925252532660184, - "language_loss": 0.80807674, - "learning_rate": 1.4500523695309546e-06, - "loss": 0.82940787, - "num_input_tokens_seen": 215281785, - "step": 9996, - "time_per_iteration": 2.6821236610412598 - }, - { - "auxiliary_loss_clip": 0.01056059, - "auxiliary_loss_mlp": 0.01045573, - "balance_loss_clip": 1.0363729, - "balance_loss_mlp": 1.03094554, - "epoch": 0.6010521569216895, - "flos": 22596107109120.0, - "grad_norm": 2.5847377804090548, - "language_loss": 0.78435457, - "learning_rate": 1.4496779313586447e-06, - "loss": 0.80537087, - "num_input_tokens_seen": 215297550, - "step": 9997, - "time_per_iteration": 2.763819694519043 - }, - { - "auxiliary_loss_clip": 0.01106886, - "auxiliary_loss_mlp": 0.0103365, - "balance_loss_clip": 1.0403868, - "balance_loss_mlp": 1.01968443, - "epoch": 0.6011122801743575, - "flos": 19171702051200.0, - "grad_norm": 1.6202780199081332, - "language_loss": 0.73208427, - "learning_rate": 1.4493035140546028e-06, - "loss": 0.75348961, - "num_input_tokens_seen": 215316360, - "step": 9998, - "time_per_iteration": 2.642061471939087 - }, - { - "auxiliary_loss_clip": 0.01085494, - "auxiliary_loss_mlp": 0.01033235, - "balance_loss_clip": 1.03910601, - "balance_loss_mlp": 1.01992536, - "epoch": 0.6011724034270254, - "flos": 25010022758400.0, - "grad_norm": 1.482726748062067, - "language_loss": 0.72144544, - "learning_rate": 1.448929117633027e-06, - "loss": 0.74263275, - "num_input_tokens_seen": 215336405, - "step": 9999, - "time_per_iteration": 2.726409673690796 - }, - { - "auxiliary_loss_clip": 0.01067323, - "auxiliary_loss_mlp": 0.0103545, - "balance_loss_clip": 1.03762555, - "balance_loss_mlp": 1.02221787, - "epoch": 0.6012325266796934, - "flos": 21797669640960.0, - "grad_norm": 1.6696696942731026, - "language_loss": 0.78647506, - "learning_rate": 1.4485547421081142e-06, - "loss": 0.80750275, - "num_input_tokens_seen": 215356590, - "step": 10000, - "time_per_iteration": 2.8357326984405518 - }, - { - "auxiliary_loss_clip": 0.01121882, - "auxiliary_loss_mlp": 0.0103934, - "balance_loss_clip": 1.04357147, - "balance_loss_mlp": 1.02509475, - "epoch": 0.6012926499323613, - "flos": 19573003774080.0, - "grad_norm": 1.8951447876838274, - "language_loss": 0.7747916, - "learning_rate": 1.4481803874940608e-06, - "loss": 0.79640388, - "num_input_tokens_seen": 215374295, - "step": 10001, - "time_per_iteration": 2.623619556427002 - }, - { - "auxiliary_loss_clip": 0.01110485, - "auxiliary_loss_mlp": 0.01029908, - "balance_loss_clip": 1.04319382, - "balance_loss_mlp": 1.01584136, - "epoch": 0.6013527731850293, - "flos": 34860786076800.0, - "grad_norm": 1.8091026033907125, - "language_loss": 0.5879162, - "learning_rate": 1.4478060538050624e-06, - "loss": 0.60932016, - "num_input_tokens_seen": 215394535, - "step": 10002, - "time_per_iteration": 2.7854535579681396 - }, - { - "auxiliary_loss_clip": 0.01101715, - "auxiliary_loss_mlp": 0.01040915, - "balance_loss_clip": 1.04363275, - "balance_loss_mlp": 1.02503633, - "epoch": 0.6014128964376972, - "flos": 23291948355840.0, - "grad_norm": 1.7477200306776974, - "language_loss": 0.7803607, - "learning_rate": 1.447431741055314e-06, - "loss": 0.80178702, - "num_input_tokens_seen": 215414355, - "step": 10003, - "time_per_iteration": 2.717195987701416 - }, - { - "auxiliary_loss_clip": 0.01119246, - "auxiliary_loss_mlp": 0.01034591, - "balance_loss_clip": 1.04236484, - "balance_loss_mlp": 1.02104282, - "epoch": 0.6014730196903653, - "flos": 24820916630400.0, - "grad_norm": 2.556614535664238, - "language_loss": 0.77315271, - "learning_rate": 1.4470574492590091e-06, - "loss": 0.79469103, - "num_input_tokens_seen": 215428280, - "step": 10004, - "time_per_iteration": 2.7323880195617676 - }, - { - "auxiliary_loss_clip": 0.01103784, - "auxiliary_loss_mlp": 0.01030419, - "balance_loss_clip": 1.04035211, - "balance_loss_mlp": 1.01653695, - "epoch": 0.6015331429430332, - "flos": 23112359331840.0, - "grad_norm": 1.5669492652896668, - "language_loss": 0.72698373, - "learning_rate": 1.4466831784303408e-06, - "loss": 0.74832577, - "num_input_tokens_seen": 215448970, - "step": 10005, - "time_per_iteration": 2.6966609954833984 - }, - { - "auxiliary_loss_clip": 0.0111171, - "auxiliary_loss_mlp": 0.01028792, - "balance_loss_clip": 1.03977418, - "balance_loss_mlp": 1.01611972, - "epoch": 0.6015932661957012, - "flos": 19201363706880.0, - "grad_norm": 2.133433515954987, - "language_loss": 0.7512781, - "learning_rate": 1.4463089285835026e-06, - "loss": 0.77268308, - "num_input_tokens_seen": 215465260, - "step": 10006, - "time_per_iteration": 2.5414936542510986 - }, - { - "auxiliary_loss_clip": 0.01089042, - "auxiliary_loss_mlp": 0.01034372, - "balance_loss_clip": 1.03682578, - "balance_loss_mlp": 1.0206567, - "epoch": 0.6016533894483691, - "flos": 18113630100480.0, - "grad_norm": 2.222329085457676, - "language_loss": 0.73606133, - "learning_rate": 1.445934699732685e-06, - "loss": 0.75729549, - "num_input_tokens_seen": 215482725, - "step": 10007, - "time_per_iteration": 2.7956955432891846 - }, - { - "auxiliary_loss_clip": 0.0109466, - "auxiliary_loss_mlp": 0.01027479, - "balance_loss_clip": 1.04082942, - "balance_loss_mlp": 1.0153439, - "epoch": 0.6017135127010371, - "flos": 16216900427520.0, - "grad_norm": 1.6405140373840412, - "language_loss": 0.69996077, - "learning_rate": 1.4455604918920785e-06, - "loss": 0.72118211, - "num_input_tokens_seen": 215500420, - "step": 10008, - "time_per_iteration": 2.740049362182617 - }, - { - "auxiliary_loss_clip": 0.01104877, - "auxiliary_loss_mlp": 0.01024718, - "balance_loss_clip": 1.03994632, - "balance_loss_mlp": 1.01252937, - "epoch": 0.6017736359537051, - "flos": 23444246021760.0, - "grad_norm": 1.594791938839471, - "language_loss": 0.76377881, - "learning_rate": 1.4451863050758748e-06, - "loss": 0.78507471, - "num_input_tokens_seen": 215522260, - "step": 10009, - "time_per_iteration": 2.6797382831573486 - }, - { - "auxiliary_loss_clip": 0.0109029, - "auxiliary_loss_mlp": 0.00770516, - "balance_loss_clip": 1.03898764, - "balance_loss_mlp": 1.00010157, - "epoch": 0.601833759206373, - "flos": 23514056104320.0, - "grad_norm": 1.9797273750165876, - "language_loss": 0.74202949, - "learning_rate": 1.4448121392982608e-06, - "loss": 0.76063752, - "num_input_tokens_seen": 215541715, - "step": 10010, - "time_per_iteration": 2.7184016704559326 - }, - { - "auxiliary_loss_clip": 0.01028511, - "auxiliary_loss_mlp": 0.00998357, - "balance_loss_clip": 1.01324391, - "balance_loss_mlp": 0.99717093, - "epoch": 0.6018938824590411, - "flos": 63991668648960.0, - "grad_norm": 0.8045921055289736, - "language_loss": 0.55051792, - "learning_rate": 1.4444379945734268e-06, - "loss": 0.57078665, - "num_input_tokens_seen": 215603020, - "step": 10011, - "time_per_iteration": 3.2024238109588623 - }, - { - "auxiliary_loss_clip": 0.01107806, - "auxiliary_loss_mlp": 0.01034347, - "balance_loss_clip": 1.04110157, - "balance_loss_mlp": 1.02186561, - "epoch": 0.601954005711709, - "flos": 34640007131520.0, - "grad_norm": 1.3534958886387711, - "language_loss": 0.62085426, - "learning_rate": 1.44406387091556e-06, - "loss": 0.64227581, - "num_input_tokens_seen": 215625115, - "step": 10012, - "time_per_iteration": 2.756197452545166 - }, - { - "auxiliary_loss_clip": 0.01074106, - "auxiliary_loss_mlp": 0.01028149, - "balance_loss_clip": 1.03729844, - "balance_loss_mlp": 1.01547122, - "epoch": 0.602014128964377, - "flos": 19427062815360.0, - "grad_norm": 2.02443112791839, - "language_loss": 0.74996275, - "learning_rate": 1.4436897683388462e-06, - "loss": 0.77098525, - "num_input_tokens_seen": 215643730, - "step": 10013, - "time_per_iteration": 2.718114137649536 - }, - { - "auxiliary_loss_clip": 0.01109921, - "auxiliary_loss_mlp": 0.01028766, - "balance_loss_clip": 1.03983474, - "balance_loss_mlp": 1.01669037, - "epoch": 0.6020742522170449, - "flos": 28329389470080.0, - "grad_norm": 1.6563944160673858, - "language_loss": 0.81454921, - "learning_rate": 1.4433156868574732e-06, - "loss": 0.83593607, - "num_input_tokens_seen": 215664425, - "step": 10014, - "time_per_iteration": 2.6359105110168457 - }, - { - "auxiliary_loss_clip": 0.01089157, - "auxiliary_loss_mlp": 0.01030481, - "balance_loss_clip": 1.037884, - "balance_loss_mlp": 1.01777339, - "epoch": 0.6021343754697129, - "flos": 22747040058240.0, - "grad_norm": 1.540720048754759, - "language_loss": 0.72213233, - "learning_rate": 1.442941626485624e-06, - "loss": 0.74332869, - "num_input_tokens_seen": 215684280, - "step": 10015, - "time_per_iteration": 2.7502388954162598 - }, - { - "auxiliary_loss_clip": 0.01020446, - "auxiliary_loss_mlp": 0.01001943, - "balance_loss_clip": 1.01667976, - "balance_loss_mlp": 1.00080478, - "epoch": 0.6021944987223808, - "flos": 65752007402880.0, - "grad_norm": 0.8150448703202539, - "language_loss": 0.5473066, - "learning_rate": 1.4425675872374848e-06, - "loss": 0.56753051, - "num_input_tokens_seen": 215739780, - "step": 10016, - "time_per_iteration": 4.701697826385498 - }, - { - "auxiliary_loss_clip": 0.01094661, - "auxiliary_loss_mlp": 0.01029515, - "balance_loss_clip": 1.04152966, - "balance_loss_mlp": 1.01637208, - "epoch": 0.6022546219750489, - "flos": 16105182151680.0, - "grad_norm": 1.5504792190081969, - "language_loss": 0.82899499, - "learning_rate": 1.4421935691272381e-06, - "loss": 0.85023677, - "num_input_tokens_seen": 215757885, - "step": 10017, - "time_per_iteration": 2.636793851852417 - }, - { - "auxiliary_loss_clip": 0.01091797, - "auxiliary_loss_mlp": 0.01031972, - "balance_loss_clip": 1.0407809, - "balance_loss_mlp": 1.01946664, - "epoch": 0.6023147452277168, - "flos": 25512555985920.0, - "grad_norm": 1.7715837391634046, - "language_loss": 0.83621204, - "learning_rate": 1.4418195721690677e-06, - "loss": 0.85744977, - "num_input_tokens_seen": 215776415, - "step": 10018, - "time_per_iteration": 6.060548543930054 - }, - { - "auxiliary_loss_clip": 0.01093456, - "auxiliary_loss_mlp": 0.01038236, - "balance_loss_clip": 1.03801382, - "balance_loss_mlp": 1.02431202, - "epoch": 0.6023748684803848, - "flos": 22636075968000.0, - "grad_norm": 1.733441285539822, - "language_loss": 0.78400528, - "learning_rate": 1.4414455963771549e-06, - "loss": 0.80532229, - "num_input_tokens_seen": 215794865, - "step": 10019, - "time_per_iteration": 2.6781299114227295 - }, - { - "auxiliary_loss_clip": 0.01075209, - "auxiliary_loss_mlp": 0.00770827, - "balance_loss_clip": 1.03914475, - "balance_loss_mlp": 1.00017881, - "epoch": 0.6024349917330527, - "flos": 26210444307840.0, - "grad_norm": 2.381543125857722, - "language_loss": 0.73964417, - "learning_rate": 1.441071641765681e-06, - "loss": 0.7581045, - "num_input_tokens_seen": 215816840, - "step": 10020, - "time_per_iteration": 2.7956390380859375 - }, - { - "auxiliary_loss_clip": 0.01095191, - "auxiliary_loss_mlp": 0.01033486, - "balance_loss_clip": 1.04020286, - "balance_loss_mlp": 1.0205282, - "epoch": 0.6024951149857207, - "flos": 21251755762560.0, - "grad_norm": 2.1093873668761765, - "language_loss": 0.64171422, - "learning_rate": 1.4406977083488264e-06, - "loss": 0.663001, - "num_input_tokens_seen": 215836100, - "step": 10021, - "time_per_iteration": 4.23021388053894 - }, - { - "auxiliary_loss_clip": 0.01102751, - "auxiliary_loss_mlp": 0.01033404, - "balance_loss_clip": 1.03910637, - "balance_loss_mlp": 1.01996362, - "epoch": 0.6025552382383887, - "flos": 26943453152640.0, - "grad_norm": 1.41151849166176, - "language_loss": 0.80664903, - "learning_rate": 1.4403237961407704e-06, - "loss": 0.82801056, - "num_input_tokens_seen": 215858480, - "step": 10022, - "time_per_iteration": 2.6966497898101807 - }, - { - "auxiliary_loss_clip": 0.0110378, - "auxiliary_loss_mlp": 0.01030649, - "balance_loss_clip": 1.04190755, - "balance_loss_mlp": 1.0179832, - "epoch": 0.6026153614910567, - "flos": 31684379495040.0, - "grad_norm": 1.480979872703277, - "language_loss": 0.66483712, - "learning_rate": 1.439949905155693e-06, - "loss": 0.68618143, - "num_input_tokens_seen": 215879950, - "step": 10023, - "time_per_iteration": 2.691399574279785 - }, - { - "auxiliary_loss_clip": 0.01104501, - "auxiliary_loss_mlp": 0.01032886, - "balance_loss_clip": 1.03789723, - "balance_loss_mlp": 1.02022552, - "epoch": 0.6026754847437247, - "flos": 29312731175040.0, - "grad_norm": 2.162444553659901, - "language_loss": 0.74503481, - "learning_rate": 1.4395760354077707e-06, - "loss": 0.76640868, - "num_input_tokens_seen": 215899830, - "step": 10024, - "time_per_iteration": 2.7364046573638916 - }, - { - "auxiliary_loss_clip": 0.01104535, - "auxiliary_loss_mlp": 0.01036059, - "balance_loss_clip": 1.04094052, - "balance_loss_mlp": 1.02257693, - "epoch": 0.6027356079963926, - "flos": 23586775188480.0, - "grad_norm": 1.6406938647308078, - "language_loss": 0.72738647, - "learning_rate": 1.4392021869111815e-06, - "loss": 0.74879241, - "num_input_tokens_seen": 215920440, - "step": 10025, - "time_per_iteration": 2.6431972980499268 - }, - { - "auxiliary_loss_clip": 0.01119748, - "auxiliary_loss_mlp": 0.01037727, - "balance_loss_clip": 1.04081619, - "balance_loss_mlp": 1.02376747, - "epoch": 0.6027957312490606, - "flos": 20813753318400.0, - "grad_norm": 2.306954455043105, - "language_loss": 0.6677472, - "learning_rate": 1.4388283596801016e-06, - "loss": 0.68932194, - "num_input_tokens_seen": 215940535, - "step": 10026, - "time_per_iteration": 2.6187641620635986 - }, - { - "auxiliary_loss_clip": 0.0110922, - "auxiliary_loss_mlp": 0.01036818, - "balance_loss_clip": 1.03789234, - "balance_loss_mlp": 1.02471268, - "epoch": 0.6028558545017285, - "flos": 19935773182080.0, - "grad_norm": 1.830391575126131, - "language_loss": 0.80050242, - "learning_rate": 1.4384545537287061e-06, - "loss": 0.82196277, - "num_input_tokens_seen": 215958045, - "step": 10027, - "time_per_iteration": 2.576110601425171 - }, - { - "auxiliary_loss_clip": 0.01081954, - "auxiliary_loss_mlp": 0.01036412, - "balance_loss_clip": 1.03722823, - "balance_loss_mlp": 1.02301311, - "epoch": 0.6029159777543965, - "flos": 22820836550400.0, - "grad_norm": 2.0223053255723236, - "language_loss": 0.70934105, - "learning_rate": 1.438080769071171e-06, - "loss": 0.73052478, - "num_input_tokens_seen": 215977330, - "step": 10028, - "time_per_iteration": 2.7288432121276855 - }, - { - "auxiliary_loss_clip": 0.01084702, - "auxiliary_loss_mlp": 0.01035574, - "balance_loss_clip": 1.04540849, - "balance_loss_mlp": 1.02254987, - "epoch": 0.6029761010070644, - "flos": 23587242065280.0, - "grad_norm": 2.1142238314038595, - "language_loss": 0.84057522, - "learning_rate": 1.437707005721669e-06, - "loss": 0.86177796, - "num_input_tokens_seen": 215997865, - "step": 10029, - "time_per_iteration": 2.7901382446289062 - }, - { - "auxiliary_loss_clip": 0.0109278, - "auxiliary_loss_mlp": 0.01032236, - "balance_loss_clip": 1.0393126, - "balance_loss_mlp": 1.0201664, - "epoch": 0.6030362242597325, - "flos": 13662430859520.0, - "grad_norm": 2.2431865033670744, - "language_loss": 0.79994917, - "learning_rate": 1.437333263694373e-06, - "loss": 0.82119942, - "num_input_tokens_seen": 216016230, - "step": 10030, - "time_per_iteration": 2.780527114868164 - }, - { - "auxiliary_loss_clip": 0.01048723, - "auxiliary_loss_mlp": 0.01042121, - "balance_loss_clip": 1.03655624, - "balance_loss_mlp": 1.02806032, - "epoch": 0.6030963475124004, - "flos": 24422883045120.0, - "grad_norm": 1.9455803489075072, - "language_loss": 0.71241331, - "learning_rate": 1.4369595430034572e-06, - "loss": 0.73332179, - "num_input_tokens_seen": 216035785, - "step": 10031, - "time_per_iteration": 2.8193559646606445 - }, - { - "auxiliary_loss_clip": 0.0107322, - "auxiliary_loss_mlp": 0.01037048, - "balance_loss_clip": 1.0378077, - "balance_loss_mlp": 1.02281427, - "epoch": 0.6031564707650684, - "flos": 29644043247360.0, - "grad_norm": 2.2622695973651834, - "language_loss": 0.72744608, - "learning_rate": 1.4365858436630912e-06, - "loss": 0.74854881, - "num_input_tokens_seen": 216059555, - "step": 10032, - "time_per_iteration": 2.8426249027252197 - }, - { - "auxiliary_loss_clip": 0.0110112, - "auxiliary_loss_mlp": 0.01034912, - "balance_loss_clip": 1.04412532, - "balance_loss_mlp": 1.02163815, - "epoch": 0.6032165940177363, - "flos": 16618776768000.0, - "grad_norm": 1.8175959049184216, - "language_loss": 0.68774295, - "learning_rate": 1.4362121656874465e-06, - "loss": 0.70910323, - "num_input_tokens_seen": 216077235, - "step": 10033, - "time_per_iteration": 2.700209379196167 - }, - { - "auxiliary_loss_clip": 0.01089272, - "auxiliary_loss_mlp": 0.01037723, - "balance_loss_clip": 1.04015613, - "balance_loss_mlp": 1.02396595, - "epoch": 0.6032767172704043, - "flos": 17488173553920.0, - "grad_norm": 2.115327938975923, - "language_loss": 0.7568332, - "learning_rate": 1.4358385090906934e-06, - "loss": 0.77810311, - "num_input_tokens_seen": 216094985, - "step": 10034, - "time_per_iteration": 2.6627981662750244 - }, - { - "auxiliary_loss_clip": 0.01095189, - "auxiliary_loss_mlp": 0.01030387, - "balance_loss_clip": 1.04141998, - "balance_loss_mlp": 1.01710701, - "epoch": 0.6033368405230723, - "flos": 26832955939200.0, - "grad_norm": 3.2723425599009026, - "language_loss": 0.74862671, - "learning_rate": 1.4354648738870004e-06, - "loss": 0.7698825, - "num_input_tokens_seen": 216115905, - "step": 10035, - "time_per_iteration": 2.8429391384124756 - }, - { - "auxiliary_loss_clip": 0.01082466, - "auxiliary_loss_mlp": 0.01027098, - "balance_loss_clip": 1.03635907, - "balance_loss_mlp": 1.0147779, - "epoch": 0.6033969637757403, - "flos": 16909904499840.0, - "grad_norm": 1.7778569727517832, - "language_loss": 0.8656829, - "learning_rate": 1.435091260090536e-06, - "loss": 0.88677853, - "num_input_tokens_seen": 216132420, - "step": 10036, - "time_per_iteration": 2.7539496421813965 - }, - { - "auxiliary_loss_clip": 0.0107738, - "auxiliary_loss_mlp": 0.01034344, - "balance_loss_clip": 1.03851438, - "balance_loss_mlp": 1.02084994, - "epoch": 0.6034570870284083, - "flos": 22930076787840.0, - "grad_norm": 1.8216360444833892, - "language_loss": 0.70128858, - "learning_rate": 1.4347176677154676e-06, - "loss": 0.72240573, - "num_input_tokens_seen": 216149800, - "step": 10037, - "time_per_iteration": 2.6496496200561523 - }, - { - "auxiliary_loss_clip": 0.0109976, - "auxiliary_loss_mlp": 0.01037189, - "balance_loss_clip": 1.03967977, - "balance_loss_mlp": 1.02270496, - "epoch": 0.6035172102810762, - "flos": 23366319465600.0, - "grad_norm": 1.570748886934951, - "language_loss": 0.8512125, - "learning_rate": 1.4343440967759616e-06, - "loss": 0.87258202, - "num_input_tokens_seen": 216168200, - "step": 10038, - "time_per_iteration": 2.6828958988189697 - }, - { - "auxiliary_loss_clip": 0.01098827, - "auxiliary_loss_mlp": 0.01034673, - "balance_loss_clip": 1.04050255, - "balance_loss_mlp": 1.02128005, - "epoch": 0.6035773335337442, - "flos": 20887082933760.0, - "grad_norm": 2.3203593434406242, - "language_loss": 0.76504898, - "learning_rate": 1.4339705472861846e-06, - "loss": 0.78638399, - "num_input_tokens_seen": 216187105, - "step": 10039, - "time_per_iteration": 2.6590511798858643 - }, - { - "auxiliary_loss_clip": 0.01102907, - "auxiliary_loss_mlp": 0.01031911, - "balance_loss_clip": 1.03922081, - "balance_loss_mlp": 1.019382, - "epoch": 0.6036374567864121, - "flos": 24936298093440.0, - "grad_norm": 1.8339871345285923, - "language_loss": 0.71111763, - "learning_rate": 1.433597019260301e-06, - "loss": 0.73246586, - "num_input_tokens_seen": 216205440, - "step": 10040, - "time_per_iteration": 2.6712801456451416 - }, - { - "auxiliary_loss_clip": 0.01109688, - "auxiliary_loss_mlp": 0.01031148, - "balance_loss_clip": 1.04312241, - "balance_loss_mlp": 1.01598454, - "epoch": 0.6036975800390801, - "flos": 23148269953920.0, - "grad_norm": 2.0137364812654166, - "language_loss": 0.78602934, - "learning_rate": 1.433223512712475e-06, - "loss": 0.80743772, - "num_input_tokens_seen": 216223130, - "step": 10041, - "time_per_iteration": 2.670166015625 - }, - { - "auxiliary_loss_clip": 0.01096185, - "auxiliary_loss_mlp": 0.01029552, - "balance_loss_clip": 1.04166305, - "balance_loss_mlp": 1.01649821, - "epoch": 0.603757703291748, - "flos": 18660729127680.0, - "grad_norm": 1.7274066455029002, - "language_loss": 0.75525141, - "learning_rate": 1.4328500276568704e-06, - "loss": 0.77650881, - "num_input_tokens_seen": 216240260, - "step": 10042, - "time_per_iteration": 2.6106081008911133 - }, - { - "auxiliary_loss_clip": 0.0106962, - "auxiliary_loss_mlp": 0.01029029, - "balance_loss_clip": 1.03727007, - "balance_loss_mlp": 1.01701236, - "epoch": 0.6038178265444161, - "flos": 19682603147520.0, - "grad_norm": 1.9258503206144206, - "language_loss": 0.84721899, - "learning_rate": 1.4324765641076498e-06, - "loss": 0.86820555, - "num_input_tokens_seen": 216258510, - "step": 10043, - "time_per_iteration": 2.71673846244812 - }, - { - "auxiliary_loss_clip": 0.01081507, - "auxiliary_loss_mlp": 0.01040859, - "balance_loss_clip": 1.03832972, - "balance_loss_mlp": 1.02579701, - "epoch": 0.603877949797084, - "flos": 22638230784000.0, - "grad_norm": 1.8215258720973655, - "language_loss": 0.70104671, - "learning_rate": 1.432103122078974e-06, - "loss": 0.72227025, - "num_input_tokens_seen": 216277550, - "step": 10044, - "time_per_iteration": 2.7252089977264404 - }, - { - "auxiliary_loss_clip": 0.01106435, - "auxiliary_loss_mlp": 0.01032617, - "balance_loss_clip": 1.04218245, - "balance_loss_mlp": 1.01826382, - "epoch": 0.603938073049752, - "flos": 25447881548160.0, - "grad_norm": 1.9233339181851183, - "language_loss": 0.78067368, - "learning_rate": 1.4317297015850057e-06, - "loss": 0.80206418, - "num_input_tokens_seen": 216296690, - "step": 10045, - "time_per_iteration": 2.6885697841644287 - }, - { - "auxiliary_loss_clip": 0.01071663, - "auxiliary_loss_mlp": 0.01034579, - "balance_loss_clip": 1.04522324, - "balance_loss_mlp": 1.02084029, - "epoch": 0.6039981963024199, - "flos": 22340135813760.0, - "grad_norm": 1.7431481861658145, - "language_loss": 0.77048129, - "learning_rate": 1.4313563026399036e-06, - "loss": 0.79154372, - "num_input_tokens_seen": 216316110, - "step": 10046, - "time_per_iteration": 2.762124538421631 - }, - { - "auxiliary_loss_clip": 0.01061952, - "auxiliary_loss_mlp": 0.01040938, - "balance_loss_clip": 1.03495252, - "balance_loss_mlp": 1.02685905, - "epoch": 0.6040583195550879, - "flos": 20703148364160.0, - "grad_norm": 1.791420221750128, - "language_loss": 0.87246406, - "learning_rate": 1.430982925257827e-06, - "loss": 0.893493, - "num_input_tokens_seen": 216333855, - "step": 10047, - "time_per_iteration": 2.7445127964019775 - }, - { - "auxiliary_loss_clip": 0.01104302, - "auxiliary_loss_mlp": 0.01030965, - "balance_loss_clip": 1.04149449, - "balance_loss_mlp": 1.01879954, - "epoch": 0.604118442807756, - "flos": 27163118776320.0, - "grad_norm": 1.4945345993269403, - "language_loss": 0.75776327, - "learning_rate": 1.4306095694529358e-06, - "loss": 0.77911592, - "num_input_tokens_seen": 216354890, - "step": 10048, - "time_per_iteration": 2.730748414993286 - }, - { - "auxiliary_loss_clip": 0.01108329, - "auxiliary_loss_mlp": 0.01044251, - "balance_loss_clip": 1.04174399, - "balance_loss_mlp": 1.02869403, - "epoch": 0.6041785660604239, - "flos": 30881524654080.0, - "grad_norm": 2.2243998349441183, - "language_loss": 0.66556633, - "learning_rate": 1.430236235239386e-06, - "loss": 0.68709219, - "num_input_tokens_seen": 216376055, - "step": 10049, - "time_per_iteration": 2.6866142749786377 - }, - { - "auxiliary_loss_clip": 0.01089915, - "auxiliary_loss_mlp": 0.01042714, - "balance_loss_clip": 1.03830862, - "balance_loss_mlp": 1.02849865, - "epoch": 0.6042386893130919, - "flos": 19938215306880.0, - "grad_norm": 1.639569270992707, - "language_loss": 0.66928005, - "learning_rate": 1.429862922631336e-06, - "loss": 0.69060636, - "num_input_tokens_seen": 216396295, - "step": 10050, - "time_per_iteration": 2.744527816772461 - }, - { - "auxiliary_loss_clip": 0.01083354, - "auxiliary_loss_mlp": 0.01036031, - "balance_loss_clip": 1.03962123, - "balance_loss_mlp": 1.02269185, - "epoch": 0.6042988125657598, - "flos": 32415915882240.0, - "grad_norm": 1.7210161547813447, - "language_loss": 0.6963383, - "learning_rate": 1.4294896316429408e-06, - "loss": 0.71753216, - "num_input_tokens_seen": 216416605, - "step": 10051, - "time_per_iteration": 2.820204734802246 - }, - { - "auxiliary_loss_clip": 0.01100825, - "auxiliary_loss_mlp": 0.01032129, - "balance_loss_clip": 1.03741777, - "balance_loss_mlp": 1.01908135, - "epoch": 0.6043589358184278, - "flos": 17420805596160.0, - "grad_norm": 2.3541607325849987, - "language_loss": 0.64901161, - "learning_rate": 1.4291163622883553e-06, - "loss": 0.67034107, - "num_input_tokens_seen": 216435130, - "step": 10052, - "time_per_iteration": 2.682201385498047 - }, - { - "auxiliary_loss_clip": 0.01094174, - "auxiliary_loss_mlp": 0.01034325, - "balance_loss_clip": 1.0397222, - "balance_loss_mlp": 1.0204432, - "epoch": 0.6044190590710957, - "flos": 27672834723840.0, - "grad_norm": 1.5756389367941481, - "language_loss": 0.69104528, - "learning_rate": 1.4287431145817358e-06, - "loss": 0.71233022, - "num_input_tokens_seen": 216455640, - "step": 10053, - "time_per_iteration": 2.8296010494232178 - }, - { - "auxiliary_loss_clip": 0.01018297, - "auxiliary_loss_mlp": 0.01003475, - "balance_loss_clip": 1.01298642, - "balance_loss_mlp": 1.0022707, - "epoch": 0.6044791823237637, - "flos": 65316267515520.0, - "grad_norm": 0.7275681454160189, - "language_loss": 0.60339212, - "learning_rate": 1.4283698885372336e-06, - "loss": 0.62360984, - "num_input_tokens_seen": 216518130, - "step": 10054, - "time_per_iteration": 3.3135299682617188 - }, - { - "auxiliary_loss_clip": 0.01055185, - "auxiliary_loss_mlp": 0.01033215, - "balance_loss_clip": 1.03634906, - "balance_loss_mlp": 1.019768, - "epoch": 0.6045393055764317, - "flos": 24492369905280.0, - "grad_norm": 1.5749604097549974, - "language_loss": 0.8565892, - "learning_rate": 1.4279966841690027e-06, - "loss": 0.87747318, - "num_input_tokens_seen": 216536845, - "step": 10055, - "time_per_iteration": 2.803851842880249 - }, - { - "auxiliary_loss_clip": 0.0109594, - "auxiliary_loss_mlp": 0.01048723, - "balance_loss_clip": 1.04159987, - "balance_loss_mlp": 1.03321385, - "epoch": 0.6045994288290997, - "flos": 19054345340160.0, - "grad_norm": 2.24817202299257, - "language_loss": 0.74068117, - "learning_rate": 1.4276235014911952e-06, - "loss": 0.76212776, - "num_input_tokens_seen": 216551860, - "step": 10056, - "time_per_iteration": 4.305849313735962 - }, - { - "auxiliary_loss_clip": 0.01073635, - "auxiliary_loss_mlp": 0.01035962, - "balance_loss_clip": 1.03811693, - "balance_loss_mlp": 1.02309358, - "epoch": 0.6046595520817676, - "flos": 26576697335040.0, - "grad_norm": 1.7955377697616153, - "language_loss": 0.80028808, - "learning_rate": 1.4272503405179616e-06, - "loss": 0.82138407, - "num_input_tokens_seen": 216574775, - "step": 10057, - "time_per_iteration": 5.891208648681641 - }, - { - "auxiliary_loss_clip": 0.0111396, - "auxiliary_loss_mlp": 0.00770338, - "balance_loss_clip": 1.04094028, - "balance_loss_mlp": 1.00008702, - "epoch": 0.6047196753344356, - "flos": 13582277660160.0, - "grad_norm": 2.047185386836812, - "language_loss": 0.75578213, - "learning_rate": 1.4268772012634527e-06, - "loss": 0.77462518, - "num_input_tokens_seen": 216590100, - "step": 10058, - "time_per_iteration": 2.6869444847106934 - }, - { - "auxiliary_loss_clip": 0.0110179, - "auxiliary_loss_mlp": 0.01030868, - "balance_loss_clip": 1.03934133, - "balance_loss_mlp": 1.01811314, - "epoch": 0.6047797985871035, - "flos": 25520456977920.0, - "grad_norm": 1.9889135378311975, - "language_loss": 0.70937455, - "learning_rate": 1.4265040837418176e-06, - "loss": 0.73070109, - "num_input_tokens_seen": 216610145, - "step": 10059, - "time_per_iteration": 2.7275924682617188 - }, - { - "auxiliary_loss_clip": 0.01092569, - "auxiliary_loss_mlp": 0.0103084, - "balance_loss_clip": 1.03944898, - "balance_loss_mlp": 1.01753664, - "epoch": 0.6048399218397715, - "flos": 20520147548160.0, - "grad_norm": 1.7704655084920065, - "language_loss": 0.76338398, - "learning_rate": 1.4261309879672054e-06, - "loss": 0.78461802, - "num_input_tokens_seen": 216630625, - "step": 10060, - "time_per_iteration": 4.274925470352173 - }, - { - "auxiliary_loss_clip": 0.01104515, - "auxiliary_loss_mlp": 0.01034248, - "balance_loss_clip": 1.03981733, - "balance_loss_mlp": 1.02105165, - "epoch": 0.6049000450924396, - "flos": 20408788408320.0, - "grad_norm": 1.9626551853189032, - "language_loss": 0.73588789, - "learning_rate": 1.4257579139537628e-06, - "loss": 0.75727558, - "num_input_tokens_seen": 216649255, - "step": 10061, - "time_per_iteration": 2.6950912475585938 - }, - { - "auxiliary_loss_clip": 0.01076727, - "auxiliary_loss_mlp": 0.00771397, - "balance_loss_clip": 1.04075074, - "balance_loss_mlp": 1.00014019, - "epoch": 0.6049601683451075, - "flos": 20741357456640.0, - "grad_norm": 2.92695225177956, - "language_loss": 0.67823231, - "learning_rate": 1.425384861715639e-06, - "loss": 0.69671357, - "num_input_tokens_seen": 216668100, - "step": 10062, - "time_per_iteration": 2.7427420616149902 - }, - { - "auxiliary_loss_clip": 0.01099001, - "auxiliary_loss_mlp": 0.010396, - "balance_loss_clip": 1.03907073, - "balance_loss_mlp": 1.02500868, - "epoch": 0.6050202915977755, - "flos": 20083114771200.0, - "grad_norm": 2.0011992400768173, - "language_loss": 0.71559471, - "learning_rate": 1.425011831266978e-06, - "loss": 0.73698068, - "num_input_tokens_seen": 216686125, - "step": 10063, - "time_per_iteration": 2.652628183364868 - }, - { - "auxiliary_loss_clip": 0.01111808, - "auxiliary_loss_mlp": 0.01037973, - "balance_loss_clip": 1.03926516, - "balance_loss_mlp": 1.02516413, - "epoch": 0.6050804148504434, - "flos": 15960821391360.0, - "grad_norm": 1.8208827458989, - "language_loss": 0.84698188, - "learning_rate": 1.424638822621926e-06, - "loss": 0.86847973, - "num_input_tokens_seen": 216704265, - "step": 10064, - "time_per_iteration": 2.6407761573791504 - }, - { - "auxiliary_loss_clip": 0.01105098, - "auxiliary_loss_mlp": 0.01032994, - "balance_loss_clip": 1.04044116, - "balance_loss_mlp": 1.01974392, - "epoch": 0.6051405381031114, - "flos": 17456644391040.0, - "grad_norm": 2.095191883416591, - "language_loss": 0.79596299, - "learning_rate": 1.4242658357946278e-06, - "loss": 0.81734389, - "num_input_tokens_seen": 216721765, - "step": 10065, - "time_per_iteration": 2.633913040161133 - }, - { - "auxiliary_loss_clip": 0.01067386, - "auxiliary_loss_mlp": 0.01033127, - "balance_loss_clip": 1.03866124, - "balance_loss_mlp": 1.0181725, - "epoch": 0.6052006613557793, - "flos": 11400130517760.0, - "grad_norm": 2.398871193370657, - "language_loss": 0.78276229, - "learning_rate": 1.423892870799226e-06, - "loss": 0.80376744, - "num_input_tokens_seen": 216738295, - "step": 10066, - "time_per_iteration": 2.729074001312256 - }, - { - "auxiliary_loss_clip": 0.01059487, - "auxiliary_loss_mlp": 0.01033515, - "balance_loss_clip": 1.03963447, - "balance_loss_mlp": 1.01981831, - "epoch": 0.6052607846084473, - "flos": 24750998807040.0, - "grad_norm": 1.7528217462877862, - "language_loss": 0.7308799, - "learning_rate": 1.4235199276498655e-06, - "loss": 0.75180995, - "num_input_tokens_seen": 216759875, - "step": 10067, - "time_per_iteration": 2.81003999710083 - }, - { - "auxiliary_loss_clip": 0.01094022, - "auxiliary_loss_mlp": 0.00770796, - "balance_loss_clip": 1.04127932, - "balance_loss_mlp": 1.00018191, - "epoch": 0.6053209078611153, - "flos": 20741141975040.0, - "grad_norm": 1.357631083448857, - "language_loss": 0.68994391, - "learning_rate": 1.4231470063606863e-06, - "loss": 0.70859212, - "num_input_tokens_seen": 216780705, - "step": 10068, - "time_per_iteration": 2.7258529663085938 - }, - { - "auxiliary_loss_clip": 0.010988, - "auxiliary_loss_mlp": 0.01031117, - "balance_loss_clip": 1.03859472, - "balance_loss_mlp": 1.01821876, - "epoch": 0.6053810311137833, - "flos": 18953149749120.0, - "grad_norm": 3.7091992376991096, - "language_loss": 0.870857, - "learning_rate": 1.4227741069458303e-06, - "loss": 0.89215624, - "num_input_tokens_seen": 216797625, - "step": 10069, - "time_per_iteration": 2.57892107963562 - }, - { - "auxiliary_loss_clip": 0.01081389, - "auxiliary_loss_mlp": 0.01029042, - "balance_loss_clip": 1.03757524, - "balance_loss_mlp": 1.01611388, - "epoch": 0.6054411543664512, - "flos": 23951124794880.0, - "grad_norm": 1.6595378120531261, - "language_loss": 0.83174849, - "learning_rate": 1.4224012294194387e-06, - "loss": 0.85285282, - "num_input_tokens_seen": 216817610, - "step": 10070, - "time_per_iteration": 2.7172200679779053 - }, - { - "auxiliary_loss_clip": 0.01100339, - "auxiliary_loss_mlp": 0.01034986, - "balance_loss_clip": 1.04162169, - "balance_loss_mlp": 1.02189064, - "epoch": 0.6055012776191192, - "flos": 20593979953920.0, - "grad_norm": 1.9849870448156475, - "language_loss": 0.85964417, - "learning_rate": 1.4220283737956496e-06, - "loss": 0.88099742, - "num_input_tokens_seen": 216836835, - "step": 10071, - "time_per_iteration": 2.677682638168335 - }, - { - "auxiliary_loss_clip": 0.01109082, - "auxiliary_loss_mlp": 0.01035663, - "balance_loss_clip": 1.04172432, - "balance_loss_mlp": 1.02102959, - "epoch": 0.6055614008717871, - "flos": 30298191782400.0, - "grad_norm": 1.8218197035918635, - "language_loss": 0.77151179, - "learning_rate": 1.421655540088603e-06, - "loss": 0.79295927, - "num_input_tokens_seen": 216856760, - "step": 10072, - "time_per_iteration": 2.806692123413086 - }, - { - "auxiliary_loss_clip": 0.01094577, - "auxiliary_loss_mlp": 0.01028599, - "balance_loss_clip": 1.0381639, - "balance_loss_mlp": 1.01447272, - "epoch": 0.6056215241244551, - "flos": 27125017424640.0, - "grad_norm": 1.5487316274587832, - "language_loss": 0.74428165, - "learning_rate": 1.4212827283124367e-06, - "loss": 0.76551342, - "num_input_tokens_seen": 216878795, - "step": 10073, - "time_per_iteration": 2.746279239654541 - }, - { - "auxiliary_loss_clip": 0.00997245, - "auxiliary_loss_mlp": 0.01001533, - "balance_loss_clip": 1.01025248, - "balance_loss_mlp": 1.00035894, - "epoch": 0.6056816473771232, - "flos": 56007323925120.0, - "grad_norm": 0.7538510449367495, - "language_loss": 0.55113828, - "learning_rate": 1.4209099384812863e-06, - "loss": 0.57112598, - "num_input_tokens_seen": 216937800, - "step": 10074, - "time_per_iteration": 3.3036320209503174 - }, - { - "auxiliary_loss_clip": 0.01075201, - "auxiliary_loss_mlp": 0.01042355, - "balance_loss_clip": 1.03847015, - "balance_loss_mlp": 1.02714372, - "epoch": 0.6057417706297911, - "flos": 23549499849600.0, - "grad_norm": 1.7766669021243995, - "language_loss": 0.81689596, - "learning_rate": 1.4205371706092894e-06, - "loss": 0.83807153, - "num_input_tokens_seen": 216955280, - "step": 10075, - "time_per_iteration": 2.731048583984375 - }, - { - "auxiliary_loss_clip": 0.01107881, - "auxiliary_loss_mlp": 0.01025575, - "balance_loss_clip": 1.04031885, - "balance_loss_mlp": 1.01165175, - "epoch": 0.6058018938824591, - "flos": 27744296832000.0, - "grad_norm": 1.740054911914685, - "language_loss": 0.77907681, - "learning_rate": 1.4201644247105813e-06, - "loss": 0.80041134, - "num_input_tokens_seen": 216976950, - "step": 10076, - "time_per_iteration": 2.6934380531311035 - }, - { - "auxiliary_loss_clip": 0.01106108, - "auxiliary_loss_mlp": 0.01036344, - "balance_loss_clip": 1.03907084, - "balance_loss_mlp": 1.02240217, - "epoch": 0.605862017135127, - "flos": 22783381643520.0, - "grad_norm": 1.6512555736365901, - "language_loss": 0.72421932, - "learning_rate": 1.4197917007992964e-06, - "loss": 0.74564385, - "num_input_tokens_seen": 216996945, - "step": 10077, - "time_per_iteration": 2.6461181640625 - }, - { - "auxiliary_loss_clip": 0.01117207, - "auxiliary_loss_mlp": 0.0103146, - "balance_loss_clip": 1.04170644, - "balance_loss_mlp": 1.01762605, - "epoch": 0.605922140387795, - "flos": 21215019127680.0, - "grad_norm": 1.9059777517343863, - "language_loss": 0.55426162, - "learning_rate": 1.4194189988895682e-06, - "loss": 0.57574832, - "num_input_tokens_seen": 217016580, - "step": 10078, - "time_per_iteration": 2.6261439323425293 - }, - { - "auxiliary_loss_clip": 0.01073319, - "auxiliary_loss_mlp": 0.01031331, - "balance_loss_clip": 1.03767908, - "balance_loss_mlp": 1.0181284, - "epoch": 0.6059822636404629, - "flos": 27268372604160.0, - "grad_norm": 1.6895659179757812, - "language_loss": 0.70538819, - "learning_rate": 1.4190463189955297e-06, - "loss": 0.72643465, - "num_input_tokens_seen": 217037300, - "step": 10079, - "time_per_iteration": 2.830202102661133 - }, - { - "auxiliary_loss_clip": 0.01092187, - "auxiliary_loss_mlp": 0.01039196, - "balance_loss_clip": 1.03862, - "balance_loss_mlp": 1.02637529, - "epoch": 0.606042386893131, - "flos": 20631327120000.0, - "grad_norm": 1.6859252666783793, - "language_loss": 0.6267547, - "learning_rate": 1.4186736611313131e-06, - "loss": 0.64806855, - "num_input_tokens_seen": 217055805, - "step": 10080, - "time_per_iteration": 2.6813855171203613 - }, - { - "auxiliary_loss_clip": 0.01094103, - "auxiliary_loss_mlp": 0.01031366, - "balance_loss_clip": 1.03858209, - "balance_loss_mlp": 1.01722753, - "epoch": 0.6061025101457989, - "flos": 23002293081600.0, - "grad_norm": 2.6314265017345613, - "language_loss": 0.71340102, - "learning_rate": 1.4183010253110492e-06, - "loss": 0.73465574, - "num_input_tokens_seen": 217074175, - "step": 10081, - "time_per_iteration": 2.750216007232666 - }, - { - "auxiliary_loss_clip": 0.01091896, - "auxiliary_loss_mlp": 0.01029512, - "balance_loss_clip": 1.03969479, - "balance_loss_mlp": 1.01624978, - "epoch": 0.6061626333984669, - "flos": 29898937134720.0, - "grad_norm": 1.724175069330151, - "language_loss": 0.69190812, - "learning_rate": 1.4179284115488691e-06, - "loss": 0.71312225, - "num_input_tokens_seen": 217095695, - "step": 10082, - "time_per_iteration": 2.7279422283172607 - }, - { - "auxiliary_loss_clip": 0.01117243, - "auxiliary_loss_mlp": 0.01032658, - "balance_loss_clip": 1.04338622, - "balance_loss_mlp": 1.01974726, - "epoch": 0.6062227566511348, - "flos": 25009196745600.0, - "grad_norm": 1.3736157370589637, - "language_loss": 0.65741009, - "learning_rate": 1.4175558198589015e-06, - "loss": 0.67890906, - "num_input_tokens_seen": 217116260, - "step": 10083, - "time_per_iteration": 2.6431922912597656 - }, - { - "auxiliary_loss_clip": 0.01104697, - "auxiliary_loss_mlp": 0.01033772, - "balance_loss_clip": 1.03986526, - "balance_loss_mlp": 1.02053976, - "epoch": 0.6062828799038028, - "flos": 19463943104640.0, - "grad_norm": 1.8569136538666067, - "language_loss": 0.74291378, - "learning_rate": 1.4171832502552764e-06, - "loss": 0.7642985, - "num_input_tokens_seen": 217134465, - "step": 10084, - "time_per_iteration": 2.693331003189087 - }, - { - "auxiliary_loss_clip": 0.01089491, - "auxiliary_loss_mlp": 0.01040114, - "balance_loss_clip": 1.03806448, - "balance_loss_mlp": 1.02654219, - "epoch": 0.6063430031564707, - "flos": 13589568120960.0, - "grad_norm": 14.01820477469797, - "language_loss": 0.72177935, - "learning_rate": 1.4168107027521204e-06, - "loss": 0.74307537, - "num_input_tokens_seen": 217149920, - "step": 10085, - "time_per_iteration": 2.6207504272460938 - }, - { - "auxiliary_loss_clip": 0.01115179, - "auxiliary_loss_mlp": 0.0103546, - "balance_loss_clip": 1.04101026, - "balance_loss_mlp": 1.02325344, - "epoch": 0.6064031264091387, - "flos": 23255499029760.0, - "grad_norm": 1.9650382613535748, - "language_loss": 0.76113385, - "learning_rate": 1.4164381773635605e-06, - "loss": 0.78264022, - "num_input_tokens_seen": 217168165, - "step": 10086, - "time_per_iteration": 2.6350982189178467 - }, - { - "auxiliary_loss_clip": 0.01079834, - "auxiliary_loss_mlp": 0.01033915, - "balance_loss_clip": 1.03654695, - "balance_loss_mlp": 1.02082586, - "epoch": 0.6064632496618068, - "flos": 22458462192000.0, - "grad_norm": 1.6281495100420569, - "language_loss": 0.72623181, - "learning_rate": 1.4160656741037246e-06, - "loss": 0.74736929, - "num_input_tokens_seen": 217190070, - "step": 10087, - "time_per_iteration": 2.7133493423461914 - }, - { - "auxiliary_loss_clip": 0.01101404, - "auxiliary_loss_mlp": 0.0103704, - "balance_loss_clip": 1.03922224, - "balance_loss_mlp": 1.02555394, - "epoch": 0.6065233729144747, - "flos": 25118652464640.0, - "grad_norm": 1.8336458297983596, - "language_loss": 0.83669853, - "learning_rate": 1.4156931929867355e-06, - "loss": 0.85808301, - "num_input_tokens_seen": 217209370, - "step": 10088, - "time_per_iteration": 2.6913206577301025 - }, - { - "auxiliary_loss_clip": 0.01058404, - "auxiliary_loss_mlp": 0.00771924, - "balance_loss_clip": 1.03367972, - "balance_loss_mlp": 1.00013125, - "epoch": 0.6065834961671427, - "flos": 23477355383040.0, - "grad_norm": 2.41510818695702, - "language_loss": 0.7150932, - "learning_rate": 1.4153207340267201e-06, - "loss": 0.73339653, - "num_input_tokens_seen": 217226990, - "step": 10089, - "time_per_iteration": 2.71724271774292 - }, - { - "auxiliary_loss_clip": 0.01104996, - "auxiliary_loss_mlp": 0.01039267, - "balance_loss_clip": 1.04092312, - "balance_loss_mlp": 1.02694106, - "epoch": 0.6066436194198106, - "flos": 17019396132480.0, - "grad_norm": 3.755310304725579, - "language_loss": 0.82807851, - "learning_rate": 1.4149482972378009e-06, - "loss": 0.84952104, - "num_input_tokens_seen": 217244585, - "step": 10090, - "time_per_iteration": 2.600306510925293 - }, - { - "auxiliary_loss_clip": 0.01082916, - "auxiliary_loss_mlp": 0.01036874, - "balance_loss_clip": 1.04005432, - "balance_loss_mlp": 1.02280176, - "epoch": 0.6067037426724786, - "flos": 18514752255360.0, - "grad_norm": 2.395523786898732, - "language_loss": 0.75284386, - "learning_rate": 1.4145758826341e-06, - "loss": 0.77404171, - "num_input_tokens_seen": 217263435, - "step": 10091, - "time_per_iteration": 2.7555627822875977 - }, - { - "auxiliary_loss_clip": 0.0111346, - "auxiliary_loss_mlp": 0.01037619, - "balance_loss_clip": 1.04098213, - "balance_loss_mlp": 1.02436924, - "epoch": 0.6067638659251465, - "flos": 22345989730560.0, - "grad_norm": 1.5349996815844518, - "language_loss": 0.79607046, - "learning_rate": 1.4142034902297415e-06, - "loss": 0.81758124, - "num_input_tokens_seen": 217283725, - "step": 10092, - "time_per_iteration": 2.607757568359375 - }, - { - "auxiliary_loss_clip": 0.01094482, - "auxiliary_loss_mlp": 0.01037242, - "balance_loss_clip": 1.03954625, - "balance_loss_mlp": 1.02349734, - "epoch": 0.6068239891778145, - "flos": 12451019748480.0, - "grad_norm": 1.7756923536136626, - "language_loss": 0.7618677, - "learning_rate": 1.4138311200388444e-06, - "loss": 0.78318495, - "num_input_tokens_seen": 217301120, - "step": 10093, - "time_per_iteration": 2.730297327041626 - }, - { - "auxiliary_loss_clip": 0.01088328, - "auxiliary_loss_mlp": 0.01043446, - "balance_loss_clip": 1.0393225, - "balance_loss_mlp": 1.02897358, - "epoch": 0.6068841124304825, - "flos": 23185868515200.0, - "grad_norm": 1.8396370870528131, - "language_loss": 0.87565696, - "learning_rate": 1.4134587720755304e-06, - "loss": 0.89697462, - "num_input_tokens_seen": 217319585, - "step": 10094, - "time_per_iteration": 2.7664146423339844 - }, - { - "auxiliary_loss_clip": 0.01107836, - "auxiliary_loss_mlp": 0.01029861, - "balance_loss_clip": 1.04203224, - "balance_loss_mlp": 1.01675439, - "epoch": 0.6069442356831505, - "flos": 18587902302720.0, - "grad_norm": 1.805883260375072, - "language_loss": 0.71895981, - "learning_rate": 1.413086446353919e-06, - "loss": 0.74033689, - "num_input_tokens_seen": 217338880, - "step": 10095, - "time_per_iteration": 2.610901355743408 - }, - { - "auxiliary_loss_clip": 0.01089454, - "auxiliary_loss_mlp": 0.01034743, - "balance_loss_clip": 1.03730071, - "balance_loss_mlp": 1.02213049, - "epoch": 0.6070043589358184, - "flos": 20960340721920.0, - "grad_norm": 1.8353844932279613, - "language_loss": 0.76935136, - "learning_rate": 1.4127141428881273e-06, - "loss": 0.79059333, - "num_input_tokens_seen": 217357480, - "step": 10096, - "time_per_iteration": 5.823329925537109 - }, - { - "auxiliary_loss_clip": 0.01119601, - "auxiliary_loss_mlp": 0.01041655, - "balance_loss_clip": 1.04269695, - "balance_loss_mlp": 1.02889967, - "epoch": 0.6070644821884864, - "flos": 11692443398400.0, - "grad_norm": 2.030764189672632, - "language_loss": 0.80070782, - "learning_rate": 1.4123418616922749e-06, - "loss": 0.82232034, - "num_input_tokens_seen": 217374575, - "step": 10097, - "time_per_iteration": 2.63212513923645 - }, - { - "auxiliary_loss_clip": 0.01090335, - "auxiliary_loss_mlp": 0.01032018, - "balance_loss_clip": 1.04231095, - "balance_loss_mlp": 1.01897645, - "epoch": 0.6071246054411543, - "flos": 19310568030720.0, - "grad_norm": 1.5236568833124404, - "language_loss": 0.67320025, - "learning_rate": 1.411969602780478e-06, - "loss": 0.69442379, - "num_input_tokens_seen": 217392950, - "step": 10098, - "time_per_iteration": 2.6840009689331055 - }, - { - "auxiliary_loss_clip": 0.01114691, - "auxiliary_loss_mlp": 0.01029516, - "balance_loss_clip": 1.04036307, - "balance_loss_mlp": 1.0169934, - "epoch": 0.6071847286938223, - "flos": 17749029098880.0, - "grad_norm": 2.4274073378556125, - "language_loss": 0.80730307, - "learning_rate": 1.4115973661668523e-06, - "loss": 0.82874513, - "num_input_tokens_seen": 217412145, - "step": 10099, - "time_per_iteration": 2.5781733989715576 - }, - { - "auxiliary_loss_clip": 0.01085094, - "auxiliary_loss_mlp": 0.01039748, - "balance_loss_clip": 1.03784657, - "balance_loss_mlp": 1.02517462, - "epoch": 0.6072448519464904, - "flos": 22637512512000.0, - "grad_norm": 2.246118750219277, - "language_loss": 0.70420504, - "learning_rate": 1.4112251518655133e-06, - "loss": 0.7254535, - "num_input_tokens_seen": 217432080, - "step": 10100, - "time_per_iteration": 4.310024738311768 - }, - { - "auxiliary_loss_clip": 0.01077866, - "auxiliary_loss_mlp": 0.01036569, - "balance_loss_clip": 1.03830409, - "balance_loss_mlp": 1.02207279, - "epoch": 0.6073049751991583, - "flos": 19537308633600.0, - "grad_norm": 1.6047311801163284, - "language_loss": 0.70821762, - "learning_rate": 1.4108529598905764e-06, - "loss": 0.72936189, - "num_input_tokens_seen": 217450945, - "step": 10101, - "time_per_iteration": 2.726445198059082 - }, - { - "auxiliary_loss_clip": 0.01084441, - "auxiliary_loss_mlp": 0.01034143, - "balance_loss_clip": 1.03571582, - "balance_loss_mlp": 1.02082181, - "epoch": 0.6073650984518263, - "flos": 28294233033600.0, - "grad_norm": 2.197032989023165, - "language_loss": 0.69728243, - "learning_rate": 1.410480790256154e-06, - "loss": 0.71846825, - "num_input_tokens_seen": 217473105, - "step": 10102, - "time_per_iteration": 2.7282192707061768 - }, - { - "auxiliary_loss_clip": 0.0111817, - "auxiliary_loss_mlp": 0.01035861, - "balance_loss_clip": 1.04134989, - "balance_loss_mlp": 1.0230341, - "epoch": 0.6074252217044942, - "flos": 25664422688640.0, - "grad_norm": 1.8635985471124068, - "language_loss": 0.73704481, - "learning_rate": 1.4101086429763589e-06, - "loss": 0.7585851, - "num_input_tokens_seen": 217491780, - "step": 10103, - "time_per_iteration": 2.6332626342773438 - }, - { - "auxiliary_loss_clip": 0.01077723, - "auxiliary_loss_mlp": 0.01037617, - "balance_loss_clip": 1.04122865, - "balance_loss_mlp": 1.02333558, - "epoch": 0.6074853449571622, - "flos": 22857106308480.0, - "grad_norm": 1.5666292395017738, - "language_loss": 0.76782012, - "learning_rate": 1.4097365180653032e-06, - "loss": 0.78897351, - "num_input_tokens_seen": 217510605, - "step": 10104, - "time_per_iteration": 2.7046008110046387 - }, - { - "auxiliary_loss_clip": 0.01012823, - "auxiliary_loss_mlp": 0.01009652, - "balance_loss_clip": 1.01738811, - "balance_loss_mlp": 1.00849557, - "epoch": 0.6075454682098301, - "flos": 67111406547840.0, - "grad_norm": 0.7409971394494129, - "language_loss": 0.55891275, - "learning_rate": 1.4093644155370977e-06, - "loss": 0.57913756, - "num_input_tokens_seen": 217574815, - "step": 10105, - "time_per_iteration": 3.2526538372039795 - }, - { - "auxiliary_loss_clip": 0.01030607, - "auxiliary_loss_mlp": 0.01011283, - "balance_loss_clip": 1.01659429, - "balance_loss_mlp": 1.01022172, - "epoch": 0.6076055914624982, - "flos": 70712024751360.0, - "grad_norm": 0.768019180696257, - "language_loss": 0.56802553, - "learning_rate": 1.4089923354058533e-06, - "loss": 0.58844441, - "num_input_tokens_seen": 217632375, - "step": 10106, - "time_per_iteration": 3.158289909362793 - }, - { - "auxiliary_loss_clip": 0.01063356, - "auxiliary_loss_mlp": 0.01035278, - "balance_loss_clip": 1.03482223, - "balance_loss_mlp": 1.02204537, - "epoch": 0.6076657147151661, - "flos": 28364545906560.0, - "grad_norm": 1.5438087958158528, - "language_loss": 0.68604589, - "learning_rate": 1.4086202776856784e-06, - "loss": 0.7070322, - "num_input_tokens_seen": 217653055, - "step": 10107, - "time_per_iteration": 2.922015905380249 - }, - { - "auxiliary_loss_clip": 0.01104951, - "auxiliary_loss_mlp": 0.01029821, - "balance_loss_clip": 1.03881001, - "balance_loss_mlp": 1.01635098, - "epoch": 0.6077258379678341, - "flos": 15049767807360.0, - "grad_norm": 1.8478390173687478, - "language_loss": 0.81575567, - "learning_rate": 1.4082482423906815e-06, - "loss": 0.83710343, - "num_input_tokens_seen": 217671520, - "step": 10108, - "time_per_iteration": 2.6345651149749756 - }, - { - "auxiliary_loss_clip": 0.01090498, - "auxiliary_loss_mlp": 0.01037826, - "balance_loss_clip": 1.03763413, - "balance_loss_mlp": 1.02306151, - "epoch": 0.607785961220502, - "flos": 36167251553280.0, - "grad_norm": 2.15332165440763, - "language_loss": 0.71337903, - "learning_rate": 1.4078762295349714e-06, - "loss": 0.73466218, - "num_input_tokens_seen": 217691880, - "step": 10109, - "time_per_iteration": 2.874757766723633 - }, - { - "auxiliary_loss_clip": 0.01090295, - "auxiliary_loss_mlp": 0.01033773, - "balance_loss_clip": 1.03903341, - "balance_loss_mlp": 1.02175713, - "epoch": 0.60784608447317, - "flos": 22524249951360.0, - "grad_norm": 1.6052444437933584, - "language_loss": 0.79990447, - "learning_rate": 1.407504239132653e-06, - "loss": 0.82114512, - "num_input_tokens_seen": 217710530, - "step": 10110, - "time_per_iteration": 2.6963181495666504 - }, - { - "auxiliary_loss_clip": 0.01089001, - "auxiliary_loss_mlp": 0.01029745, - "balance_loss_clip": 1.03760231, - "balance_loss_mlp": 1.01529717, - "epoch": 0.6079062077258379, - "flos": 23841166285440.0, - "grad_norm": 2.270664246588292, - "language_loss": 0.70269084, - "learning_rate": 1.4071322711978338e-06, - "loss": 0.72387832, - "num_input_tokens_seen": 217728650, - "step": 10111, - "time_per_iteration": 2.6903553009033203 - }, - { - "auxiliary_loss_clip": 0.01085414, - "auxiliary_loss_mlp": 0.010291, - "balance_loss_clip": 1.04066074, - "balance_loss_mlp": 1.01539087, - "epoch": 0.6079663309785059, - "flos": 23367037737600.0, - "grad_norm": 1.6556748056408641, - "language_loss": 0.65621054, - "learning_rate": 1.4067603257446186e-06, - "loss": 0.67735571, - "num_input_tokens_seen": 217747135, - "step": 10112, - "time_per_iteration": 2.7705774307250977 - }, - { - "auxiliary_loss_clip": 0.01029897, - "auxiliary_loss_mlp": 0.00999602, - "balance_loss_clip": 1.01457083, - "balance_loss_mlp": 0.99854136, - "epoch": 0.6080264542311739, - "flos": 71382873110400.0, - "grad_norm": 0.6359208638260742, - "language_loss": 0.49526292, - "learning_rate": 1.4063884027871105e-06, - "loss": 0.51555794, - "num_input_tokens_seen": 217811860, - "step": 10113, - "time_per_iteration": 3.2169973850250244 - }, - { - "auxiliary_loss_clip": 0.01030037, - "auxiliary_loss_mlp": 0.01000401, - "balance_loss_clip": 1.01493645, - "balance_loss_mlp": 0.99929249, - "epoch": 0.6080865774838419, - "flos": 66529833442560.0, - "grad_norm": 0.8386978497659568, - "language_loss": 0.56947362, - "learning_rate": 1.4060165023394147e-06, - "loss": 0.58977795, - "num_input_tokens_seen": 217866510, - "step": 10114, - "time_per_iteration": 3.1260786056518555 - }, - { - "auxiliary_loss_clip": 0.01118489, - "auxiliary_loss_mlp": 0.01029714, - "balance_loss_clip": 1.04061675, - "balance_loss_mlp": 1.01540279, - "epoch": 0.6081467007365099, - "flos": 19207935895680.0, - "grad_norm": 2.0279729583270405, - "language_loss": 0.70046329, - "learning_rate": 1.4056446244156317e-06, - "loss": 0.72194529, - "num_input_tokens_seen": 217885650, - "step": 10115, - "time_per_iteration": 2.627066135406494 - }, - { - "auxiliary_loss_clip": 0.01076474, - "auxiliary_loss_mlp": 0.01030702, - "balance_loss_clip": 1.03560662, - "balance_loss_mlp": 1.01668298, - "epoch": 0.6082068239891778, - "flos": 24167737762560.0, - "grad_norm": 1.5787360311779992, - "language_loss": 0.72676456, - "learning_rate": 1.4052727690298642e-06, - "loss": 0.74783635, - "num_input_tokens_seen": 217905300, - "step": 10116, - "time_per_iteration": 2.713207721710205 - }, - { - "auxiliary_loss_clip": 0.01090032, - "auxiliary_loss_mlp": 0.01036221, - "balance_loss_clip": 1.03843713, - "balance_loss_mlp": 1.02108169, - "epoch": 0.6082669472418458, - "flos": 37413316310400.0, - "grad_norm": 1.6151215779769803, - "language_loss": 0.53940326, - "learning_rate": 1.4049009361962138e-06, - "loss": 0.56066579, - "num_input_tokens_seen": 217927845, - "step": 10117, - "time_per_iteration": 2.809150218963623 - }, - { - "auxiliary_loss_clip": 0.01097513, - "auxiliary_loss_mlp": 0.01030118, - "balance_loss_clip": 1.04143286, - "balance_loss_mlp": 1.01718414, - "epoch": 0.6083270704945137, - "flos": 15085534775040.0, - "grad_norm": 1.724080776440041, - "language_loss": 0.70168173, - "learning_rate": 1.4045291259287786e-06, - "loss": 0.72295797, - "num_input_tokens_seen": 217946145, - "step": 10118, - "time_per_iteration": 2.6340367794036865 - }, - { - "auxiliary_loss_clip": 0.01051915, - "auxiliary_loss_mlp": 0.01030313, - "balance_loss_clip": 1.03519964, - "balance_loss_mlp": 1.01717043, - "epoch": 0.6083871937471818, - "flos": 20668458804480.0, - "grad_norm": 1.7207126950990799, - "language_loss": 0.74843824, - "learning_rate": 1.4041573382416588e-06, - "loss": 0.76926053, - "num_input_tokens_seen": 217965190, - "step": 10119, - "time_per_iteration": 2.7610390186309814 - }, - { - "auxiliary_loss_clip": 0.01102909, - "auxiliary_loss_mlp": 0.01034672, - "balance_loss_clip": 1.04056787, - "balance_loss_mlp": 1.02195883, - "epoch": 0.6084473169998497, - "flos": 21506901045120.0, - "grad_norm": 1.7294665557102438, - "language_loss": 0.67426908, - "learning_rate": 1.4037855731489525e-06, - "loss": 0.69564486, - "num_input_tokens_seen": 217983625, - "step": 10120, - "time_per_iteration": 2.6205523014068604 - }, - { - "auxiliary_loss_clip": 0.01108129, - "auxiliary_loss_mlp": 0.01033833, - "balance_loss_clip": 1.04188108, - "balance_loss_mlp": 1.02035594, - "epoch": 0.6085074402525177, - "flos": 26870051710080.0, - "grad_norm": 1.6306465435700652, - "language_loss": 0.74561995, - "learning_rate": 1.4034138306647571e-06, - "loss": 0.76703954, - "num_input_tokens_seen": 218006005, - "step": 10121, - "time_per_iteration": 2.6655447483062744 - }, - { - "auxiliary_loss_clip": 0.01103879, - "auxiliary_loss_mlp": 0.01034712, - "balance_loss_clip": 1.03920245, - "balance_loss_mlp": 1.02181315, - "epoch": 0.6085675635051856, - "flos": 10889839952640.0, - "grad_norm": 1.8102237735068374, - "language_loss": 0.80563319, - "learning_rate": 1.4030421108031685e-06, - "loss": 0.8270191, - "num_input_tokens_seen": 218024195, - "step": 10122, - "time_per_iteration": 2.5725269317626953 - }, - { - "auxiliary_loss_clip": 0.011003, - "auxiliary_loss_mlp": 0.01033892, - "balance_loss_clip": 1.03930187, - "balance_loss_mlp": 1.01991475, - "epoch": 0.6086276867578536, - "flos": 34862186707200.0, - "grad_norm": 2.5216051585049994, - "language_loss": 0.55656278, - "learning_rate": 1.402670413578284e-06, - "loss": 0.5779047, - "num_input_tokens_seen": 218047190, - "step": 10123, - "time_per_iteration": 2.7452590465545654 - }, - { - "auxiliary_loss_clip": 0.01107373, - "auxiliary_loss_mlp": 0.01041375, - "balance_loss_clip": 1.0430057, - "balance_loss_mlp": 1.02773786, - "epoch": 0.6086878100105215, - "flos": 20047706939520.0, - "grad_norm": 2.4791520044019526, - "language_loss": 0.73864502, - "learning_rate": 1.4022987390041965e-06, - "loss": 0.76013255, - "num_input_tokens_seen": 218065945, - "step": 10124, - "time_per_iteration": 2.6622564792633057 - }, - { - "auxiliary_loss_clip": 0.01089528, - "auxiliary_loss_mlp": 0.01035903, - "balance_loss_clip": 1.03544164, - "balance_loss_mlp": 1.0215143, - "epoch": 0.6087479332631895, - "flos": 18332469711360.0, - "grad_norm": 2.9318658727845577, - "language_loss": 0.65483487, - "learning_rate": 1.4019270870950006e-06, - "loss": 0.67608917, - "num_input_tokens_seen": 218085285, - "step": 10125, - "time_per_iteration": 2.677290439605713 - }, - { - "auxiliary_loss_clip": 0.01116071, - "auxiliary_loss_mlp": 0.01033414, - "balance_loss_clip": 1.04222536, - "balance_loss_mlp": 1.0202589, - "epoch": 0.6088080565158575, - "flos": 24493411399680.0, - "grad_norm": 1.769901084210043, - "language_loss": 0.76367819, - "learning_rate": 1.40155545786479e-06, - "loss": 0.785173, - "num_input_tokens_seen": 218104735, - "step": 10126, - "time_per_iteration": 2.6574339866638184 - }, - { - "auxiliary_loss_clip": 0.01079175, - "auxiliary_loss_mlp": 0.01032387, - "balance_loss_clip": 1.04002953, - "balance_loss_mlp": 1.01883876, - "epoch": 0.6088681797685255, - "flos": 10269016260480.0, - "grad_norm": 2.3378560015936705, - "language_loss": 0.70790273, - "learning_rate": 1.4011838513276558e-06, - "loss": 0.72901833, - "num_input_tokens_seen": 218121855, - "step": 10127, - "time_per_iteration": 2.6849265098571777 - }, - { - "auxiliary_loss_clip": 0.01121141, - "auxiliary_loss_mlp": 0.01035296, - "balance_loss_clip": 1.04394543, - "balance_loss_mlp": 1.02121782, - "epoch": 0.6089283030211935, - "flos": 21973703218560.0, - "grad_norm": 2.1716351382875874, - "language_loss": 0.72938377, - "learning_rate": 1.400812267497691e-06, - "loss": 0.75094813, - "num_input_tokens_seen": 218137325, - "step": 10128, - "time_per_iteration": 2.5779154300689697 - }, - { - "auxiliary_loss_clip": 0.01065888, - "auxiliary_loss_mlp": 0.01033014, - "balance_loss_clip": 1.03911877, - "balance_loss_mlp": 1.02046144, - "epoch": 0.6089884262738614, - "flos": 17785191116160.0, - "grad_norm": 2.2560816683992075, - "language_loss": 0.7314086, - "learning_rate": 1.4004407063889842e-06, - "loss": 0.7523976, - "num_input_tokens_seen": 218155530, - "step": 10129, - "time_per_iteration": 2.765955924987793 - }, - { - "auxiliary_loss_clip": 0.01113573, - "auxiliary_loss_mlp": 0.01033476, - "balance_loss_clip": 1.03910589, - "balance_loss_mlp": 1.02067268, - "epoch": 0.6090485495265294, - "flos": 36910423946880.0, - "grad_norm": 1.6122727527780822, - "language_loss": 0.65641886, - "learning_rate": 1.400069168015626e-06, - "loss": 0.67788941, - "num_input_tokens_seen": 218182535, - "step": 10130, - "time_per_iteration": 2.78676438331604 - }, - { - "auxiliary_loss_clip": 0.01086426, - "auxiliary_loss_mlp": 0.0103051, - "balance_loss_clip": 1.03784049, - "balance_loss_mlp": 1.01903617, - "epoch": 0.6091086727791973, - "flos": 19899036547200.0, - "grad_norm": 1.8425930589011128, - "language_loss": 0.76978183, - "learning_rate": 1.3996976523917054e-06, - "loss": 0.79095113, - "num_input_tokens_seen": 218201740, - "step": 10131, - "time_per_iteration": 2.5955772399902344 - }, - { - "auxiliary_loss_clip": 0.0108451, - "auxiliary_loss_mlp": 0.01035484, - "balance_loss_clip": 1.04026079, - "balance_loss_mlp": 1.02387881, - "epoch": 0.6091687960318654, - "flos": 22163635359360.0, - "grad_norm": 1.697349608419957, - "language_loss": 0.76859689, - "learning_rate": 1.3993261595313093e-06, - "loss": 0.78979683, - "num_input_tokens_seen": 218219800, - "step": 10132, - "time_per_iteration": 2.7611875534057617 - }, - { - "auxiliary_loss_clip": 0.01112693, - "auxiliary_loss_mlp": 0.01033853, - "balance_loss_clip": 1.04171348, - "balance_loss_mlp": 1.02192605, - "epoch": 0.6092289192845333, - "flos": 21465280160640.0, - "grad_norm": 1.734329950775569, - "language_loss": 0.75766826, - "learning_rate": 1.3989546894485261e-06, - "loss": 0.77913374, - "num_input_tokens_seen": 218237585, - "step": 10133, - "time_per_iteration": 2.5794837474823 - }, - { - "auxiliary_loss_clip": 0.0110335, - "auxiliary_loss_mlp": 0.01034974, - "balance_loss_clip": 1.03942025, - "balance_loss_mlp": 1.0217123, - "epoch": 0.6092890425372013, - "flos": 28694924225280.0, - "grad_norm": 1.7908978482931064, - "language_loss": 0.63917655, - "learning_rate": 1.3985832421574414e-06, - "loss": 0.66055977, - "num_input_tokens_seen": 218258700, - "step": 10134, - "time_per_iteration": 2.7197823524475098 - }, - { - "auxiliary_loss_clip": 0.01091736, - "auxiliary_loss_mlp": 0.01033364, - "balance_loss_clip": 1.04008186, - "balance_loss_mlp": 1.02060866, - "epoch": 0.6093491657898692, - "flos": 20813178700800.0, - "grad_norm": 1.9213179565189793, - "language_loss": 0.7841872, - "learning_rate": 1.3982118176721397e-06, - "loss": 0.80543816, - "num_input_tokens_seen": 218275655, - "step": 10135, - "time_per_iteration": 4.243841171264648 - }, - { - "auxiliary_loss_clip": 0.01093049, - "auxiliary_loss_mlp": 0.01030115, - "balance_loss_clip": 1.04067171, - "balance_loss_mlp": 1.0183785, - "epoch": 0.6094092890425372, - "flos": 25446983708160.0, - "grad_norm": 1.9609713951304055, - "language_loss": 0.72346425, - "learning_rate": 1.3978404160067069e-06, - "loss": 0.7446959, - "num_input_tokens_seen": 218295720, - "step": 10136, - "time_per_iteration": 4.175207853317261 - }, - { - "auxiliary_loss_clip": 0.01118097, - "auxiliary_loss_mlp": 0.01029914, - "balance_loss_clip": 1.04258895, - "balance_loss_mlp": 1.01715255, - "epoch": 0.6094694122952051, - "flos": 35621265847680.0, - "grad_norm": 1.7802525821484743, - "language_loss": 0.74853754, - "learning_rate": 1.3974690371752253e-06, - "loss": 0.77001762, - "num_input_tokens_seen": 218316745, - "step": 10137, - "time_per_iteration": 2.7007157802581787 - }, - { - "auxiliary_loss_clip": 0.01100831, - "auxiliary_loss_mlp": 0.0104803, - "balance_loss_clip": 1.0380677, - "balance_loss_mlp": 1.03291392, - "epoch": 0.6095295355478731, - "flos": 24456962073600.0, - "grad_norm": 2.07495429210998, - "language_loss": 0.80021697, - "learning_rate": 1.3970976811917785e-06, - "loss": 0.82170558, - "num_input_tokens_seen": 218335385, - "step": 10138, - "time_per_iteration": 2.642719268798828 - }, - { - "auxiliary_loss_clip": 0.01085336, - "auxiliary_loss_mlp": 0.01035484, - "balance_loss_clip": 1.03812051, - "balance_loss_mlp": 1.02355671, - "epoch": 0.6095896588005411, - "flos": 15633208419840.0, - "grad_norm": 2.0335546536806746, - "language_loss": 0.81230104, - "learning_rate": 1.3967263480704481e-06, - "loss": 0.83350921, - "num_input_tokens_seen": 218353320, - "step": 10139, - "time_per_iteration": 4.268277645111084 - }, - { - "auxiliary_loss_clip": 0.01077185, - "auxiliary_loss_mlp": 0.01037802, - "balance_loss_clip": 1.03831828, - "balance_loss_mlp": 1.02411103, - "epoch": 0.6096497820532091, - "flos": 15550577182080.0, - "grad_norm": 2.12947943365166, - "language_loss": 0.83466005, - "learning_rate": 1.396355037825315e-06, - "loss": 0.85580993, - "num_input_tokens_seen": 218365620, - "step": 10140, - "time_per_iteration": 2.8175792694091797 - }, - { - "auxiliary_loss_clip": 0.01105576, - "auxiliary_loss_mlp": 0.01033997, - "balance_loss_clip": 1.04053175, - "balance_loss_mlp": 1.02132499, - "epoch": 0.6097099053058771, - "flos": 24204474397440.0, - "grad_norm": 1.6865480520512064, - "language_loss": 0.7552228, - "learning_rate": 1.3959837504704592e-06, - "loss": 0.77661854, - "num_input_tokens_seen": 218383785, - "step": 10141, - "time_per_iteration": 2.6393468379974365 - }, - { - "auxiliary_loss_clip": 0.01087905, - "auxiliary_loss_mlp": 0.01037932, - "balance_loss_clip": 1.03879404, - "balance_loss_mlp": 1.02400196, - "epoch": 0.609770028558545, - "flos": 19570238426880.0, - "grad_norm": 2.2429886109126955, - "language_loss": 0.76329374, - "learning_rate": 1.3956124860199603e-06, - "loss": 0.7845521, - "num_input_tokens_seen": 218399055, - "step": 10142, - "time_per_iteration": 2.6924803256988525 - }, - { - "auxiliary_loss_clip": 0.01117011, - "auxiliary_loss_mlp": 0.01034845, - "balance_loss_clip": 1.04226887, - "balance_loss_mlp": 1.02116001, - "epoch": 0.609830151811213, - "flos": 23949185460480.0, - "grad_norm": 1.9503172342998385, - "language_loss": 0.77012557, - "learning_rate": 1.3952412444878964e-06, - "loss": 0.7916441, - "num_input_tokens_seen": 218419120, - "step": 10143, - "time_per_iteration": 2.8441388607025146 - }, - { - "auxiliary_loss_clip": 0.01100288, - "auxiliary_loss_mlp": 0.01040669, - "balance_loss_clip": 1.0388006, - "balance_loss_mlp": 1.02585077, - "epoch": 0.6098902750638809, - "flos": 16179732829440.0, - "grad_norm": 1.761002506839972, - "language_loss": 0.75323224, - "learning_rate": 1.3948700258883448e-06, - "loss": 0.77464181, - "num_input_tokens_seen": 218435290, - "step": 10144, - "time_per_iteration": 2.6133413314819336 - }, - { - "auxiliary_loss_clip": 0.01087547, - "auxiliary_loss_mlp": 0.01034684, - "balance_loss_clip": 1.03644156, - "balance_loss_mlp": 1.02106476, - "epoch": 0.609950398316549, - "flos": 44526393763200.0, - "grad_norm": 2.2237996959363566, - "language_loss": 0.72757131, - "learning_rate": 1.394498830235383e-06, - "loss": 0.7487936, - "num_input_tokens_seen": 218457880, - "step": 10145, - "time_per_iteration": 2.939194679260254 - }, - { - "auxiliary_loss_clip": 0.01090456, - "auxiliary_loss_mlp": 0.0103494, - "balance_loss_clip": 1.03637707, - "balance_loss_mlp": 1.02223277, - "epoch": 0.6100105215692169, - "flos": 23221743223680.0, - "grad_norm": 1.7269520496185313, - "language_loss": 0.69230616, - "learning_rate": 1.3941276575430862e-06, - "loss": 0.7135601, - "num_input_tokens_seen": 218475930, - "step": 10146, - "time_per_iteration": 2.6565699577331543 - }, - { - "auxiliary_loss_clip": 0.01068091, - "auxiliary_loss_mlp": 0.00769179, - "balance_loss_clip": 1.03681684, - "balance_loss_mlp": 1.00011373, - "epoch": 0.6100706448218849, - "flos": 15012564295680.0, - "grad_norm": 1.635331048644393, - "language_loss": 0.77205098, - "learning_rate": 1.3937565078255289e-06, - "loss": 0.79042363, - "num_input_tokens_seen": 218493675, - "step": 10147, - "time_per_iteration": 2.7440903186798096 - }, - { - "auxiliary_loss_clip": 0.01093041, - "auxiliary_loss_mlp": 0.01032822, - "balance_loss_clip": 1.03794551, - "balance_loss_mlp": 1.01953053, - "epoch": 0.6101307680745528, - "flos": 19639976682240.0, - "grad_norm": 2.1080947760938895, - "language_loss": 0.78184944, - "learning_rate": 1.393385381096786e-06, - "loss": 0.80310804, - "num_input_tokens_seen": 218511780, - "step": 10148, - "time_per_iteration": 2.638685703277588 - }, - { - "auxiliary_loss_clip": 0.01080447, - "auxiliary_loss_mlp": 0.01036994, - "balance_loss_clip": 1.0334245, - "balance_loss_mlp": 1.02181888, - "epoch": 0.6101908913272208, - "flos": 29935566028800.0, - "grad_norm": 2.0657205711801776, - "language_loss": 0.54227436, - "learning_rate": 1.39301427737093e-06, - "loss": 0.56344879, - "num_input_tokens_seen": 218531850, - "step": 10149, - "time_per_iteration": 2.800041437149048 - }, - { - "auxiliary_loss_clip": 0.01092603, - "auxiliary_loss_mlp": 0.01036713, - "balance_loss_clip": 1.04295909, - "balance_loss_mlp": 1.02440453, - "epoch": 0.6102510145798887, - "flos": 21798639308160.0, - "grad_norm": 1.8291736341547842, - "language_loss": 0.8044911, - "learning_rate": 1.3926431966620333e-06, - "loss": 0.82578421, - "num_input_tokens_seen": 218551245, - "step": 10150, - "time_per_iteration": 2.725109577178955 - }, - { - "auxiliary_loss_clip": 0.01091495, - "auxiliary_loss_mlp": 0.01041201, - "balance_loss_clip": 1.04189467, - "balance_loss_mlp": 1.02752137, - "epoch": 0.6103111378325567, - "flos": 20706129192960.0, - "grad_norm": 3.373563414576853, - "language_loss": 0.68982595, - "learning_rate": 1.3922721389841684e-06, - "loss": 0.71115291, - "num_input_tokens_seen": 218571365, - "step": 10151, - "time_per_iteration": 2.672344923019409 - }, - { - "auxiliary_loss_clip": 0.01114149, - "auxiliary_loss_mlp": 0.01030926, - "balance_loss_clip": 1.03988385, - "balance_loss_mlp": 1.01889241, - "epoch": 0.6103712610852247, - "flos": 29381643417600.0, - "grad_norm": 1.7910960351729457, - "language_loss": 0.7080698, - "learning_rate": 1.3919011043514036e-06, - "loss": 0.72952056, - "num_input_tokens_seen": 218588315, - "step": 10152, - "time_per_iteration": 2.687704086303711 - }, - { - "auxiliary_loss_clip": 0.01081357, - "auxiliary_loss_mlp": 0.01034717, - "balance_loss_clip": 1.04208827, - "balance_loss_mlp": 1.02176535, - "epoch": 0.6104313843378927, - "flos": 20813035046400.0, - "grad_norm": 1.9308044180202404, - "language_loss": 0.77972472, - "learning_rate": 1.391530092777811e-06, - "loss": 0.80088544, - "num_input_tokens_seen": 218605940, - "step": 10153, - "time_per_iteration": 2.737981081008911 - }, - { - "auxiliary_loss_clip": 0.01090347, - "auxiliary_loss_mlp": 0.01032715, - "balance_loss_clip": 1.03806591, - "balance_loss_mlp": 1.01951218, - "epoch": 0.6104915075905607, - "flos": 26578457101440.0, - "grad_norm": 3.840439775750313, - "language_loss": 0.79736745, - "learning_rate": 1.3911591042774573e-06, - "loss": 0.81859809, - "num_input_tokens_seen": 218626100, - "step": 10154, - "time_per_iteration": 2.737769365310669 - }, - { - "auxiliary_loss_clip": 0.01105395, - "auxiliary_loss_mlp": 0.01031455, - "balance_loss_clip": 1.04265189, - "balance_loss_mlp": 1.01911139, - "epoch": 0.6105516308432286, - "flos": 23915788790400.0, - "grad_norm": 1.6402345774234983, - "language_loss": 0.70273185, - "learning_rate": 1.3907881388644116e-06, - "loss": 0.72410041, - "num_input_tokens_seen": 218645060, - "step": 10155, - "time_per_iteration": 2.710547924041748 - }, - { - "auxiliary_loss_clip": 0.0110624, - "auxiliary_loss_mlp": 0.01033447, - "balance_loss_clip": 1.04239929, - "balance_loss_mlp": 1.01990473, - "epoch": 0.6106117540958966, - "flos": 31577365900800.0, - "grad_norm": 1.4885347094481511, - "language_loss": 0.71531796, - "learning_rate": 1.3904171965527413e-06, - "loss": 0.73671484, - "num_input_tokens_seen": 218667690, - "step": 10156, - "time_per_iteration": 2.7332398891448975 - }, - { - "auxiliary_loss_clip": 0.0109286, - "auxiliary_loss_mlp": 0.01032773, - "balance_loss_clip": 1.04169869, - "balance_loss_mlp": 1.01951134, - "epoch": 0.6106718773485645, - "flos": 19608160210560.0, - "grad_norm": 1.5588894396348068, - "language_loss": 0.6765914, - "learning_rate": 1.3900462773565114e-06, - "loss": 0.69784772, - "num_input_tokens_seen": 218687505, - "step": 10157, - "time_per_iteration": 2.7539916038513184 - }, - { - "auxiliary_loss_clip": 0.01075332, - "auxiliary_loss_mlp": 0.01028524, - "balance_loss_clip": 1.03566861, - "balance_loss_mlp": 1.01551235, - "epoch": 0.6107320006012326, - "flos": 17123895774720.0, - "grad_norm": 1.7948221929891892, - "language_loss": 0.72670758, - "learning_rate": 1.3896753812897877e-06, - "loss": 0.74774617, - "num_input_tokens_seen": 218705315, - "step": 10158, - "time_per_iteration": 2.7469441890716553 - }, - { - "auxiliary_loss_clip": 0.01103253, - "auxiliary_loss_mlp": 0.01033152, - "balance_loss_clip": 1.0429678, - "balance_loss_mlp": 1.02017009, - "epoch": 0.6107921238539005, - "flos": 30148228500480.0, - "grad_norm": 1.507227050717275, - "language_loss": 0.69370097, - "learning_rate": 1.389304508366635e-06, - "loss": 0.715065, - "num_input_tokens_seen": 218725735, - "step": 10159, - "time_per_iteration": 2.7083382606506348 - }, - { - "auxiliary_loss_clip": 0.01118821, - "auxiliary_loss_mlp": 0.01031902, - "balance_loss_clip": 1.04300821, - "balance_loss_mlp": 1.01859236, - "epoch": 0.6108522471065685, - "flos": 18440273404800.0, - "grad_norm": 1.9516164322769225, - "language_loss": 0.78660917, - "learning_rate": 1.3889336586011167e-06, - "loss": 0.80811644, - "num_input_tokens_seen": 218743215, - "step": 10160, - "time_per_iteration": 2.5400774478912354 - }, - { - "auxiliary_loss_clip": 0.01029498, - "auxiliary_loss_mlp": 0.01003038, - "balance_loss_clip": 1.01565576, - "balance_loss_mlp": 1.00198889, - "epoch": 0.6109123703592364, - "flos": 64135454791680.0, - "grad_norm": 0.8179177002663486, - "language_loss": 0.61458665, - "learning_rate": 1.388562832007295e-06, - "loss": 0.63491201, - "num_input_tokens_seen": 218806440, - "step": 10161, - "time_per_iteration": 3.3134469985961914 - }, - { - "auxiliary_loss_clip": 0.01099659, - "auxiliary_loss_mlp": 0.00772317, - "balance_loss_clip": 1.04388893, - "balance_loss_mlp": 1.00015724, - "epoch": 0.6109724936119044, - "flos": 20667848273280.0, - "grad_norm": 4.3292370915840666, - "language_loss": 0.76713967, - "learning_rate": 1.3881920285992324e-06, - "loss": 0.78585941, - "num_input_tokens_seen": 218825720, - "step": 10162, - "time_per_iteration": 2.666212797164917 - }, - { - "auxiliary_loss_clip": 0.01115754, - "auxiliary_loss_mlp": 0.01032204, - "balance_loss_clip": 1.04164445, - "balance_loss_mlp": 1.0187993, - "epoch": 0.6110326168645723, - "flos": 31351882273920.0, - "grad_norm": 1.703540773348326, - "language_loss": 0.71334386, - "learning_rate": 1.3878212483909888e-06, - "loss": 0.73482347, - "num_input_tokens_seen": 218847735, - "step": 10163, - "time_per_iteration": 2.65462327003479 - }, - { - "auxiliary_loss_clip": 0.01112689, - "auxiliary_loss_mlp": 0.0102818, - "balance_loss_clip": 1.03985834, - "balance_loss_mlp": 1.01618207, - "epoch": 0.6110927401172404, - "flos": 25003378742400.0, - "grad_norm": 1.8985707771161122, - "language_loss": 0.59787023, - "learning_rate": 1.387450491396625e-06, - "loss": 0.61927891, - "num_input_tokens_seen": 218866585, - "step": 10164, - "time_per_iteration": 2.5967462062835693 - }, - { - "auxiliary_loss_clip": 0.01098803, - "auxiliary_loss_mlp": 0.01031481, - "balance_loss_clip": 1.04045308, - "balance_loss_mlp": 1.01886845, - "epoch": 0.6111528633699083, - "flos": 26248078782720.0, - "grad_norm": 1.6376390692210252, - "language_loss": 0.75717723, - "learning_rate": 1.3870797576302003e-06, - "loss": 0.77848011, - "num_input_tokens_seen": 218885560, - "step": 10165, - "time_per_iteration": 2.706014633178711 - }, - { - "auxiliary_loss_clip": 0.01092416, - "auxiliary_loss_mlp": 0.01029472, - "balance_loss_clip": 1.04052818, - "balance_loss_mlp": 1.01629317, - "epoch": 0.6112129866225763, - "flos": 22382474970240.0, - "grad_norm": 1.6019347929518222, - "language_loss": 0.79179376, - "learning_rate": 1.3867090471057719e-06, - "loss": 0.81301266, - "num_input_tokens_seen": 218905055, - "step": 10166, - "time_per_iteration": 2.648865222930908 - }, - { - "auxiliary_loss_clip": 0.01089634, - "auxiliary_loss_mlp": 0.01029582, - "balance_loss_clip": 1.03888917, - "balance_loss_mlp": 1.01634979, - "epoch": 0.6112731098752443, - "flos": 25227892702080.0, - "grad_norm": 1.8171337399867642, - "language_loss": 0.67561293, - "learning_rate": 1.3863383598373987e-06, - "loss": 0.69680506, - "num_input_tokens_seen": 218924030, - "step": 10167, - "time_per_iteration": 2.700876474380493 - }, - { - "auxiliary_loss_clip": 0.01114313, - "auxiliary_loss_mlp": 0.01035637, - "balance_loss_clip": 1.04177189, - "balance_loss_mlp": 1.02360249, - "epoch": 0.6113332331279122, - "flos": 22893160584960.0, - "grad_norm": 1.916507906954157, - "language_loss": 0.79281151, - "learning_rate": 1.3859676958391364e-06, - "loss": 0.81431103, - "num_input_tokens_seen": 218943750, - "step": 10168, - "time_per_iteration": 2.7144253253936768 - }, - { - "auxiliary_loss_clip": 0.01121355, - "auxiliary_loss_mlp": 0.01040045, - "balance_loss_clip": 1.04141057, - "balance_loss_mlp": 1.02516127, - "epoch": 0.6113933563805802, - "flos": 18620329305600.0, - "grad_norm": 5.812497502727784, - "language_loss": 0.85299641, - "learning_rate": 1.3855970551250398e-06, - "loss": 0.87461042, - "num_input_tokens_seen": 218957585, - "step": 10169, - "time_per_iteration": 2.5470833778381348 - }, - { - "auxiliary_loss_clip": 0.01112463, - "auxiliary_loss_mlp": 0.01031208, - "balance_loss_clip": 1.03939247, - "balance_loss_mlp": 1.01953125, - "epoch": 0.6114534796332481, - "flos": 41866275317760.0, - "grad_norm": 1.6486085762416796, - "language_loss": 0.78718483, - "learning_rate": 1.3852264377091652e-06, - "loss": 0.80862153, - "num_input_tokens_seen": 218980025, - "step": 10170, - "time_per_iteration": 2.729773998260498 - }, - { - "auxiliary_loss_clip": 0.01098191, - "auxiliary_loss_mlp": 0.01039397, - "balance_loss_clip": 1.03988242, - "balance_loss_mlp": 1.02454388, - "epoch": 0.6115136028859162, - "flos": 21908454163200.0, - "grad_norm": 2.373771480482072, - "language_loss": 0.68857706, - "learning_rate": 1.3848558436055651e-06, - "loss": 0.70995295, - "num_input_tokens_seen": 218998200, - "step": 10171, - "time_per_iteration": 2.8418185710906982 - }, - { - "auxiliary_loss_clip": 0.01084168, - "auxiliary_loss_mlp": 0.01037331, - "balance_loss_clip": 1.03639293, - "balance_loss_mlp": 1.0224179, - "epoch": 0.6115737261385841, - "flos": 28804846821120.0, - "grad_norm": 1.5357621569118813, - "language_loss": 0.79195881, - "learning_rate": 1.3844852728282934e-06, - "loss": 0.81317377, - "num_input_tokens_seen": 219017910, - "step": 10172, - "time_per_iteration": 2.7578020095825195 - }, - { - "auxiliary_loss_clip": 0.01083831, - "auxiliary_loss_mlp": 0.0103911, - "balance_loss_clip": 1.03985405, - "balance_loss_mlp": 1.02511525, - "epoch": 0.6116338493912521, - "flos": 21251468453760.0, - "grad_norm": 2.0161581722139252, - "language_loss": 0.67301053, - "learning_rate": 1.3841147253914022e-06, - "loss": 0.69423997, - "num_input_tokens_seen": 219037730, - "step": 10173, - "time_per_iteration": 2.767425298690796 - }, - { - "auxiliary_loss_clip": 0.01093328, - "auxiliary_loss_mlp": 0.01039514, - "balance_loss_clip": 1.0412842, - "balance_loss_mlp": 1.02572155, - "epoch": 0.61169397264392, - "flos": 17530189488000.0, - "grad_norm": 1.7178312614749116, - "language_loss": 0.55863279, - "learning_rate": 1.3837442013089416e-06, - "loss": 0.57996118, - "num_input_tokens_seen": 219056755, - "step": 10174, - "time_per_iteration": 4.367191314697266 - }, - { - "auxiliary_loss_clip": 0.01098909, - "auxiliary_loss_mlp": 0.0103905, - "balance_loss_clip": 1.0425694, - "balance_loss_mlp": 1.02503705, - "epoch": 0.611754095896588, - "flos": 23951555758080.0, - "grad_norm": 1.9312072143196408, - "language_loss": 0.66054702, - "learning_rate": 1.3833737005949628e-06, - "loss": 0.68192655, - "num_input_tokens_seen": 219076985, - "step": 10175, - "time_per_iteration": 4.3369996547698975 - }, - { - "auxiliary_loss_clip": 0.0110119, - "auxiliary_loss_mlp": 0.00770739, - "balance_loss_clip": 1.03765738, - "balance_loss_mlp": 1.00009918, - "epoch": 0.6118142191492559, - "flos": 25994872834560.0, - "grad_norm": 2.243694545574497, - "language_loss": 0.83143312, - "learning_rate": 1.3830032232635154e-06, - "loss": 0.85015237, - "num_input_tokens_seen": 219096050, - "step": 10176, - "time_per_iteration": 2.6386196613311768 - }, - { - "auxiliary_loss_clip": 0.01097242, - "auxiliary_loss_mlp": 0.01040776, - "balance_loss_clip": 1.04172039, - "balance_loss_mlp": 1.02604187, - "epoch": 0.611874342401924, - "flos": 24603190341120.0, - "grad_norm": 1.9428160095597935, - "language_loss": 0.77491206, - "learning_rate": 1.3826327693286474e-06, - "loss": 0.79629225, - "num_input_tokens_seen": 219112665, - "step": 10177, - "time_per_iteration": 2.68098521232605 - }, - { - "auxiliary_loss_clip": 0.01100764, - "auxiliary_loss_mlp": 0.00771744, - "balance_loss_clip": 1.03818965, - "balance_loss_mlp": 1.00019312, - "epoch": 0.6119344656545919, - "flos": 15887132640000.0, - "grad_norm": 3.7169342629070505, - "language_loss": 0.75467336, - "learning_rate": 1.3822623388044065e-06, - "loss": 0.77339846, - "num_input_tokens_seen": 219129120, - "step": 10178, - "time_per_iteration": 2.600816011428833 - }, - { - "auxiliary_loss_clip": 0.01088953, - "auxiliary_loss_mlp": 0.01045893, - "balance_loss_clip": 1.03788781, - "balance_loss_mlp": 1.03069353, - "epoch": 0.6119945889072599, - "flos": 21652877917440.0, - "grad_norm": 1.6054240792862575, - "language_loss": 0.67197716, - "learning_rate": 1.3818919317048402e-06, - "loss": 0.69332558, - "num_input_tokens_seen": 219148950, - "step": 10179, - "time_per_iteration": 4.199966669082642 - }, - { - "auxiliary_loss_clip": 0.0109746, - "auxiliary_loss_mlp": 0.01034998, - "balance_loss_clip": 1.0424819, - "balance_loss_mlp": 1.02241588, - "epoch": 0.6120547121599279, - "flos": 13772533023360.0, - "grad_norm": 1.7918927990683708, - "language_loss": 0.83621407, - "learning_rate": 1.3815215480439933e-06, - "loss": 0.85753864, - "num_input_tokens_seen": 219165585, - "step": 10180, - "time_per_iteration": 2.617266893386841 - }, - { - "auxiliary_loss_clip": 0.01117181, - "auxiliary_loss_mlp": 0.01032718, - "balance_loss_clip": 1.04272151, - "balance_loss_mlp": 1.01881814, - "epoch": 0.6121148354125958, - "flos": 20079164275200.0, - "grad_norm": 1.5733186243311148, - "language_loss": 0.7745713, - "learning_rate": 1.3811511878359113e-06, - "loss": 0.79607022, - "num_input_tokens_seen": 219183280, - "step": 10181, - "time_per_iteration": 2.542682409286499 - }, - { - "auxiliary_loss_clip": 0.01117399, - "auxiliary_loss_mlp": 0.01035122, - "balance_loss_clip": 1.041821, - "balance_loss_mlp": 1.02228367, - "epoch": 0.6121749586652638, - "flos": 13471313569920.0, - "grad_norm": 15.811946001131306, - "language_loss": 0.80652797, - "learning_rate": 1.3807808510946384e-06, - "loss": 0.82805324, - "num_input_tokens_seen": 219197200, - "step": 10182, - "time_per_iteration": 2.6980040073394775 - }, - { - "auxiliary_loss_clip": 0.01077836, - "auxiliary_loss_mlp": 0.01037108, - "balance_loss_clip": 1.03717065, - "balance_loss_mlp": 1.02501428, - "epoch": 0.6122350819179317, - "flos": 20120533764480.0, - "grad_norm": 1.5906642026710172, - "language_loss": 0.82815677, - "learning_rate": 1.3804105378342177e-06, - "loss": 0.84930623, - "num_input_tokens_seen": 219216825, - "step": 10183, - "time_per_iteration": 2.808246612548828 - }, - { - "auxiliary_loss_clip": 0.01025033, - "auxiliary_loss_mlp": 0.01005337, - "balance_loss_clip": 1.01312232, - "balance_loss_mlp": 1.00417471, - "epoch": 0.6122952051705998, - "flos": 65429242767360.0, - "grad_norm": 0.7045561187177276, - "language_loss": 0.62833804, - "learning_rate": 1.3800402480686914e-06, - "loss": 0.64864177, - "num_input_tokens_seen": 219283795, - "step": 10184, - "time_per_iteration": 3.2871408462524414 - }, - { - "auxiliary_loss_clip": 0.01108097, - "auxiliary_loss_mlp": 0.01037833, - "balance_loss_clip": 1.042454, - "balance_loss_mlp": 1.02517307, - "epoch": 0.6123553284232677, - "flos": 20376253664640.0, - "grad_norm": 1.792613488461195, - "language_loss": 0.82103658, - "learning_rate": 1.379669981812101e-06, - "loss": 0.8424958, - "num_input_tokens_seen": 219302385, - "step": 10185, - "time_per_iteration": 2.623692750930786 - }, - { - "auxiliary_loss_clip": 0.0109256, - "auxiliary_loss_mlp": 0.01038333, - "balance_loss_clip": 1.04070401, - "balance_loss_mlp": 1.02442169, - "epoch": 0.6124154516759357, - "flos": 23987645948160.0, - "grad_norm": 1.7206353448570937, - "language_loss": 0.74358237, - "learning_rate": 1.3792997390784868e-06, - "loss": 0.76489139, - "num_input_tokens_seen": 219319765, - "step": 10186, - "time_per_iteration": 2.657557725906372 - }, - { - "auxiliary_loss_clip": 0.01099771, - "auxiliary_loss_mlp": 0.0103362, - "balance_loss_clip": 1.03756428, - "balance_loss_mlp": 1.021294, - "epoch": 0.6124755749286036, - "flos": 21468799693440.0, - "grad_norm": 1.5881045533275502, - "language_loss": 0.7818836, - "learning_rate": 1.3789295198818895e-06, - "loss": 0.80321753, - "num_input_tokens_seen": 219337440, - "step": 10187, - "time_per_iteration": 2.625558376312256 - }, - { - "auxiliary_loss_clip": 0.01113087, - "auxiliary_loss_mlp": 0.01033487, - "balance_loss_clip": 1.03851271, - "balance_loss_mlp": 1.02038562, - "epoch": 0.6125356981812716, - "flos": 23879195809920.0, - "grad_norm": 1.8256616870215527, - "language_loss": 0.83049744, - "learning_rate": 1.3785593242363462e-06, - "loss": 0.85196316, - "num_input_tokens_seen": 219357525, - "step": 10188, - "time_per_iteration": 2.6045219898223877 - }, - { - "auxiliary_loss_clip": 0.0108702, - "auxiliary_loss_mlp": 0.0103141, - "balance_loss_clip": 1.04232693, - "balance_loss_mlp": 1.01822519, - "epoch": 0.6125958214339395, - "flos": 14425604150400.0, - "grad_norm": 1.7058207723590004, - "language_loss": 0.7547375, - "learning_rate": 1.378189152155896e-06, - "loss": 0.77592176, - "num_input_tokens_seen": 219374855, - "step": 10189, - "time_per_iteration": 2.7627220153808594 - }, - { - "auxiliary_loss_clip": 0.01101171, - "auxiliary_loss_mlp": 0.0104025, - "balance_loss_clip": 1.03780556, - "balance_loss_mlp": 1.02642107, - "epoch": 0.6126559446866076, - "flos": 23259090389760.0, - "grad_norm": 1.513309715943079, - "language_loss": 0.74214786, - "learning_rate": 1.3778190036545758e-06, - "loss": 0.76356208, - "num_input_tokens_seen": 219394740, - "step": 10190, - "time_per_iteration": 2.617075204849243 - }, - { - "auxiliary_loss_clip": 0.01104454, - "auxiliary_loss_mlp": 0.0103683, - "balance_loss_clip": 1.04099751, - "balance_loss_mlp": 1.02338362, - "epoch": 0.6127160679392755, - "flos": 26864808324480.0, - "grad_norm": 1.7858486096662998, - "language_loss": 0.68623936, - "learning_rate": 1.3774488787464207e-06, - "loss": 0.70765221, - "num_input_tokens_seen": 219413755, - "step": 10191, - "time_per_iteration": 2.681180477142334 - }, - { - "auxiliary_loss_clip": 0.0110296, - "auxiliary_loss_mlp": 0.0103819, - "balance_loss_clip": 1.0385741, - "balance_loss_mlp": 1.02425456, - "epoch": 0.6127761911919435, - "flos": 26396425952640.0, - "grad_norm": 2.13769200790618, - "language_loss": 0.73452723, - "learning_rate": 1.377078777445467e-06, - "loss": 0.75593865, - "num_input_tokens_seen": 219433560, - "step": 10192, - "time_per_iteration": 2.6742324829101562 - }, - { - "auxiliary_loss_clip": 0.01075917, - "auxiliary_loss_mlp": 0.01033242, - "balance_loss_clip": 1.03988755, - "balance_loss_mlp": 1.02090943, - "epoch": 0.6128363144446115, - "flos": 22634747164800.0, - "grad_norm": 2.0088299944144636, - "language_loss": 0.83632165, - "learning_rate": 1.3767086997657478e-06, - "loss": 0.85741329, - "num_input_tokens_seen": 219452640, - "step": 10193, - "time_per_iteration": 2.701087474822998 - }, - { - "auxiliary_loss_clip": 0.01082703, - "auxiliary_loss_mlp": 0.01035438, - "balance_loss_clip": 1.03853893, - "balance_loss_mlp": 1.02231348, - "epoch": 0.6128964376972794, - "flos": 26759051706240.0, - "grad_norm": 2.1771802645074105, - "language_loss": 0.6991539, - "learning_rate": 1.3763386457212979e-06, - "loss": 0.72033525, - "num_input_tokens_seen": 219468585, - "step": 10194, - "time_per_iteration": 2.6878440380096436 - }, - { - "auxiliary_loss_clip": 0.01010189, - "auxiliary_loss_mlp": 0.01003845, - "balance_loss_clip": 1.01538479, - "balance_loss_mlp": 1.002653, - "epoch": 0.6129565609499474, - "flos": 65567929178880.0, - "grad_norm": 0.8185640373649049, - "language_loss": 0.58629549, - "learning_rate": 1.375968615326149e-06, - "loss": 0.60643584, - "num_input_tokens_seen": 219523015, - "step": 10195, - "time_per_iteration": 3.05383038520813 - }, - { - "auxiliary_loss_clip": 0.01095455, - "auxiliary_loss_mlp": 0.01035767, - "balance_loss_clip": 1.04045796, - "balance_loss_mlp": 1.02244508, - "epoch": 0.6130166842026153, - "flos": 16362087200640.0, - "grad_norm": 2.135532256863793, - "language_loss": 0.69762802, - "learning_rate": 1.3755986085943324e-06, - "loss": 0.71894026, - "num_input_tokens_seen": 219539980, - "step": 10196, - "time_per_iteration": 2.6125218868255615 - }, - { - "auxiliary_loss_clip": 0.01089403, - "auxiliary_loss_mlp": 0.01036656, - "balance_loss_clip": 1.03711545, - "balance_loss_mlp": 1.02356637, - "epoch": 0.6130768074552834, - "flos": 23652455207040.0, - "grad_norm": 1.7041901113683988, - "language_loss": 0.71497107, - "learning_rate": 1.3752286255398788e-06, - "loss": 0.73623163, - "num_input_tokens_seen": 219556980, - "step": 10197, - "time_per_iteration": 2.687622547149658 - }, - { - "auxiliary_loss_clip": 0.01102107, - "auxiliary_loss_mlp": 0.01046474, - "balance_loss_clip": 1.03892088, - "balance_loss_mlp": 1.03226423, - "epoch": 0.6131369307079513, - "flos": 20047455544320.0, - "grad_norm": 2.1425841144655533, - "language_loss": 0.79149073, - "learning_rate": 1.3748586661768191e-06, - "loss": 0.81297648, - "num_input_tokens_seen": 219576410, - "step": 10198, - "time_per_iteration": 2.6697170734405518 - }, - { - "auxiliary_loss_clip": 0.01092328, - "auxiliary_loss_mlp": 0.01031323, - "balance_loss_clip": 1.04398417, - "balance_loss_mlp": 1.01794744, - "epoch": 0.6131970539606193, - "flos": 22672166158080.0, - "grad_norm": 1.4352269101197792, - "language_loss": 0.74505019, - "learning_rate": 1.374488730519181e-06, - "loss": 0.76628667, - "num_input_tokens_seen": 219597180, - "step": 10199, - "time_per_iteration": 2.789501905441284 - }, - { - "auxiliary_loss_clip": 0.01092976, - "auxiliary_loss_mlp": 0.01040283, - "balance_loss_clip": 1.03864002, - "balance_loss_mlp": 1.02596581, - "epoch": 0.6132571772132872, - "flos": 26870913636480.0, - "grad_norm": 2.276152956596312, - "language_loss": 0.62111485, - "learning_rate": 1.374118818580993e-06, - "loss": 0.64244747, - "num_input_tokens_seen": 219617630, - "step": 10200, - "time_per_iteration": 2.7012946605682373 - }, - { - "auxiliary_loss_clip": 0.01092122, - "auxiliary_loss_mlp": 0.01030786, - "balance_loss_clip": 1.04091394, - "balance_loss_mlp": 1.01772022, - "epoch": 0.6133173004659552, - "flos": 22892657794560.0, - "grad_norm": 2.1392566641947464, - "language_loss": 0.6911571, - "learning_rate": 1.3737489303762822e-06, - "loss": 0.71238619, - "num_input_tokens_seen": 219637025, - "step": 10201, - "time_per_iteration": 2.7815003395080566 - }, - { - "auxiliary_loss_clip": 0.01091125, - "auxiliary_loss_mlp": 0.01031281, - "balance_loss_clip": 1.03879607, - "balance_loss_mlp": 1.018466, - "epoch": 0.6133774237186231, - "flos": 20485098852480.0, - "grad_norm": 1.7663162719665984, - "language_loss": 0.83417988, - "learning_rate": 1.3733790659190746e-06, - "loss": 0.85540396, - "num_input_tokens_seen": 219656625, - "step": 10202, - "time_per_iteration": 2.6809394359588623 - }, - { - "auxiliary_loss_clip": 0.01037873, - "auxiliary_loss_mlp": 0.0100084, - "balance_loss_clip": 1.01421046, - "balance_loss_mlp": 0.99977362, - "epoch": 0.6134375469712912, - "flos": 69413065217280.0, - "grad_norm": 0.8928245444729744, - "language_loss": 0.67083746, - "learning_rate": 1.3730092252233953e-06, - "loss": 0.69122458, - "num_input_tokens_seen": 219718090, - "step": 10203, - "time_per_iteration": 3.153150796890259 - }, - { - "auxiliary_loss_clip": 0.01107329, - "auxiliary_loss_mlp": 0.01030437, - "balance_loss_clip": 1.04093874, - "balance_loss_mlp": 1.01783061, - "epoch": 0.6134976702239591, - "flos": 41281541815680.0, - "grad_norm": 1.5881826993460113, - "language_loss": 0.61211205, - "learning_rate": 1.37263940830327e-06, - "loss": 0.63348967, - "num_input_tokens_seen": 219740100, - "step": 10204, - "time_per_iteration": 2.8730733394622803 - }, - { - "auxiliary_loss_clip": 0.01079745, - "auxiliary_loss_mlp": 0.0102996, - "balance_loss_clip": 1.03856349, - "balance_loss_mlp": 1.0171572, - "epoch": 0.6135577934766271, - "flos": 22346600261760.0, - "grad_norm": 1.8494988248574857, - "language_loss": 0.72484612, - "learning_rate": 1.3722696151727204e-06, - "loss": 0.74594319, - "num_input_tokens_seen": 219761225, - "step": 10205, - "time_per_iteration": 2.789635419845581 - }, - { - "auxiliary_loss_clip": 0.0110225, - "auxiliary_loss_mlp": 0.01027505, - "balance_loss_clip": 1.04025602, - "balance_loss_mlp": 1.01416492, - "epoch": 0.6136179167292951, - "flos": 23728155120000.0, - "grad_norm": 1.726684216662775, - "language_loss": 0.76188898, - "learning_rate": 1.3718998458457701e-06, - "loss": 0.78318655, - "num_input_tokens_seen": 219780085, - "step": 10206, - "time_per_iteration": 2.6312029361724854 - }, - { - "auxiliary_loss_clip": 0.01085288, - "auxiliary_loss_mlp": 0.01031734, - "balance_loss_clip": 1.04444122, - "balance_loss_mlp": 1.0182395, - "epoch": 0.613678039981963, - "flos": 26024678144640.0, - "grad_norm": 2.2620215533288013, - "language_loss": 0.7565456, - "learning_rate": 1.3715301003364407e-06, - "loss": 0.7777158, - "num_input_tokens_seen": 219797895, - "step": 10207, - "time_per_iteration": 2.768277645111084 - }, - { - "auxiliary_loss_clip": 0.01103864, - "auxiliary_loss_mlp": 0.01034203, - "balance_loss_clip": 1.04067349, - "balance_loss_mlp": 1.02212703, - "epoch": 0.613738163234631, - "flos": 9859957200000.0, - "grad_norm": 2.399068150435751, - "language_loss": 0.83005822, - "learning_rate": 1.3711603786587525e-06, - "loss": 0.85143888, - "num_input_tokens_seen": 219811295, - "step": 10208, - "time_per_iteration": 2.5925726890563965 - }, - { - "auxiliary_loss_clip": 0.0109897, - "auxiliary_loss_mlp": 0.01034142, - "balance_loss_clip": 1.04265046, - "balance_loss_mlp": 1.01999795, - "epoch": 0.613798286487299, - "flos": 33182070001920.0, - "grad_norm": 1.8170662176874706, - "language_loss": 0.72382063, - "learning_rate": 1.3707906808267265e-06, - "loss": 0.74515176, - "num_input_tokens_seen": 219832735, - "step": 10209, - "time_per_iteration": 2.7966833114624023 - }, - { - "auxiliary_loss_clip": 0.01115107, - "auxiliary_loss_mlp": 0.01038487, - "balance_loss_clip": 1.04209638, - "balance_loss_mlp": 1.02545118, - "epoch": 0.613858409739967, - "flos": 25627901535360.0, - "grad_norm": 1.6402518788547593, - "language_loss": 0.74474829, - "learning_rate": 1.37042100685438e-06, - "loss": 0.76628423, - "num_input_tokens_seen": 219852755, - "step": 10210, - "time_per_iteration": 2.615272045135498 - }, - { - "auxiliary_loss_clip": 0.01010153, - "auxiliary_loss_mlp": 0.01001177, - "balance_loss_clip": 1.01308346, - "balance_loss_mlp": 0.99999064, - "epoch": 0.6139185329926349, - "flos": 67192313932800.0, - "grad_norm": 0.8597503962544338, - "language_loss": 0.64958251, - "learning_rate": 1.3700513567557325e-06, - "loss": 0.66969585, - "num_input_tokens_seen": 219922785, - "step": 10211, - "time_per_iteration": 3.410182476043701 - }, - { - "auxiliary_loss_clip": 0.01093321, - "auxiliary_loss_mlp": 0.00771551, - "balance_loss_clip": 1.03993869, - "balance_loss_mlp": 1.00015092, - "epoch": 0.6139786562453029, - "flos": 21543637680000.0, - "grad_norm": 2.0754424248675893, - "language_loss": 0.7585628, - "learning_rate": 1.369681730544801e-06, - "loss": 0.77721149, - "num_input_tokens_seen": 219942215, - "step": 10212, - "time_per_iteration": 3.0132839679718018 - }, - { - "auxiliary_loss_clip": 0.01087691, - "auxiliary_loss_mlp": 0.01041947, - "balance_loss_clip": 1.03709769, - "balance_loss_mlp": 1.02745092, - "epoch": 0.6140387794979708, - "flos": 26068489758720.0, - "grad_norm": 1.8964126815157365, - "language_loss": 0.74028683, - "learning_rate": 1.3693121282356009e-06, - "loss": 0.76158321, - "num_input_tokens_seen": 219963830, - "step": 10213, - "time_per_iteration": 2.757840871810913 - }, - { - "auxiliary_loss_clip": 0.01100654, - "auxiliary_loss_mlp": 0.01037388, - "balance_loss_clip": 1.04215121, - "balance_loss_mlp": 1.02341056, - "epoch": 0.6140989027506388, - "flos": 23694614795520.0, - "grad_norm": 1.4673821315924696, - "language_loss": 0.73059738, - "learning_rate": 1.3689425498421483e-06, - "loss": 0.7519778, - "num_input_tokens_seen": 219983815, - "step": 10214, - "time_per_iteration": 5.874944686889648 - }, - { - "auxiliary_loss_clip": 0.01119065, - "auxiliary_loss_mlp": 0.01032618, - "balance_loss_clip": 1.04233837, - "balance_loss_mlp": 1.01859856, - "epoch": 0.6141590260033067, - "flos": 22231721589120.0, - "grad_norm": 1.979046579810642, - "language_loss": 0.74642611, - "learning_rate": 1.3685729953784572e-06, - "loss": 0.76794291, - "num_input_tokens_seen": 220003165, - "step": 10215, - "time_per_iteration": 4.103458404541016 - }, - { - "auxiliary_loss_clip": 0.0110334, - "auxiliary_loss_mlp": 0.01036271, - "balance_loss_clip": 1.04110682, - "balance_loss_mlp": 1.02308035, - "epoch": 0.6142191492559748, - "flos": 23871653953920.0, - "grad_norm": 3.0132083118300526, - "language_loss": 0.78161263, - "learning_rate": 1.368203464858542e-06, - "loss": 0.80300874, - "num_input_tokens_seen": 220021015, - "step": 10216, - "time_per_iteration": 2.6554577350616455 - }, - { - "auxiliary_loss_clip": 0.01116166, - "auxiliary_loss_mlp": 0.01038341, - "balance_loss_clip": 1.04212427, - "balance_loss_mlp": 1.02428079, - "epoch": 0.6142792725086427, - "flos": 15042513260160.0, - "grad_norm": 2.385226690327553, - "language_loss": 0.80102211, - "learning_rate": 1.3678339582964147e-06, - "loss": 0.82256722, - "num_input_tokens_seen": 220035780, - "step": 10217, - "time_per_iteration": 2.5665090084075928 - }, - { - "auxiliary_loss_clip": 0.01096361, - "auxiliary_loss_mlp": 0.01032764, - "balance_loss_clip": 1.04036403, - "balance_loss_mlp": 1.0193646, - "epoch": 0.6143393957613107, - "flos": 23330947547520.0, - "grad_norm": 2.363056906877031, - "language_loss": 0.7822212, - "learning_rate": 1.3674644757060865e-06, - "loss": 0.80351239, - "num_input_tokens_seen": 220054280, - "step": 10218, - "time_per_iteration": 2.659820795059204 - }, - { - "auxiliary_loss_clip": 0.01108038, - "auxiliary_loss_mlp": 0.01034908, - "balance_loss_clip": 1.04290485, - "balance_loss_mlp": 1.02203321, - "epoch": 0.6143995190139786, - "flos": 20117086058880.0, - "grad_norm": 1.5950804577882065, - "language_loss": 0.8189528, - "learning_rate": 1.367095017101569e-06, - "loss": 0.84038228, - "num_input_tokens_seen": 220074120, - "step": 10219, - "time_per_iteration": 4.207094192504883 - }, - { - "auxiliary_loss_clip": 0.01098839, - "auxiliary_loss_mlp": 0.01035321, - "balance_loss_clip": 1.0370295, - "balance_loss_mlp": 1.02146316, - "epoch": 0.6144596422666466, - "flos": 42303559489920.0, - "grad_norm": 2.5627103076938424, - "language_loss": 0.66738832, - "learning_rate": 1.3667255824968717e-06, - "loss": 0.68872988, - "num_input_tokens_seen": 220096320, - "step": 10220, - "time_per_iteration": 2.7829878330230713 - }, - { - "auxiliary_loss_clip": 0.01103534, - "auxiliary_loss_mlp": 0.01029408, - "balance_loss_clip": 1.03913307, - "balance_loss_mlp": 1.01669455, - "epoch": 0.6145197655193146, - "flos": 21573622558080.0, - "grad_norm": 1.8637833274966709, - "language_loss": 0.71766376, - "learning_rate": 1.3663561719060041e-06, - "loss": 0.73899317, - "num_input_tokens_seen": 220114850, - "step": 10221, - "time_per_iteration": 2.621060609817505 - }, - { - "auxiliary_loss_clip": 0.01066987, - "auxiliary_loss_mlp": 0.01030601, - "balance_loss_clip": 1.03472996, - "balance_loss_mlp": 1.01779163, - "epoch": 0.6145798887719826, - "flos": 21471098163840.0, - "grad_norm": 1.725179067455254, - "language_loss": 0.79747754, - "learning_rate": 1.3659867853429735e-06, - "loss": 0.81845343, - "num_input_tokens_seen": 220133395, - "step": 10222, - "time_per_iteration": 2.7557356357574463 - }, - { - "auxiliary_loss_clip": 0.01092387, - "auxiliary_loss_mlp": 0.01042666, - "balance_loss_clip": 1.04074025, - "balance_loss_mlp": 1.02842045, - "epoch": 0.6146400120246506, - "flos": 20777016683520.0, - "grad_norm": 1.8633750333173091, - "language_loss": 0.76163048, - "learning_rate": 1.365617422821788e-06, - "loss": 0.78298092, - "num_input_tokens_seen": 220152790, - "step": 10223, - "time_per_iteration": 2.649580717086792 - }, - { - "auxiliary_loss_clip": 0.01093219, - "auxiliary_loss_mlp": 0.01034751, - "balance_loss_clip": 1.04123545, - "balance_loss_mlp": 1.02193058, - "epoch": 0.6147001352773185, - "flos": 13881306384000.0, - "grad_norm": 1.8872928812493461, - "language_loss": 0.78260607, - "learning_rate": 1.3652480843564535e-06, - "loss": 0.80388576, - "num_input_tokens_seen": 220169535, - "step": 10224, - "time_per_iteration": 2.6781771183013916 - }, - { - "auxiliary_loss_clip": 0.01076582, - "auxiliary_loss_mlp": 0.01032552, - "balance_loss_clip": 1.03500175, - "balance_loss_mlp": 1.02076793, - "epoch": 0.6147602585299865, - "flos": 56641791807360.0, - "grad_norm": 1.4371349679447827, - "language_loss": 0.66419935, - "learning_rate": 1.3648787699609746e-06, - "loss": 0.68529069, - "num_input_tokens_seen": 220195305, - "step": 10225, - "time_per_iteration": 3.0390100479125977 - }, - { - "auxiliary_loss_clip": 0.01103654, - "auxiliary_loss_mlp": 0.00771954, - "balance_loss_clip": 1.04197466, - "balance_loss_mlp": 1.00015104, - "epoch": 0.6148203817826544, - "flos": 32817217605120.0, - "grad_norm": 2.067542776960223, - "language_loss": 0.6355052, - "learning_rate": 1.364509479649357e-06, - "loss": 0.65426129, - "num_input_tokens_seen": 220215040, - "step": 10226, - "time_per_iteration": 2.744330644607544 - }, - { - "auxiliary_loss_clip": 0.01090925, - "auxiliary_loss_mlp": 0.01037806, - "balance_loss_clip": 1.03825569, - "balance_loss_mlp": 1.02304804, - "epoch": 0.6148805050353224, - "flos": 18332038748160.0, - "grad_norm": 1.7718988021259403, - "language_loss": 0.75872779, - "learning_rate": 1.3641402134356037e-06, - "loss": 0.78001511, - "num_input_tokens_seen": 220234205, - "step": 10227, - "time_per_iteration": 2.7481887340545654 - }, - { - "auxiliary_loss_clip": 0.01054701, - "auxiliary_loss_mlp": 0.01043082, - "balance_loss_clip": 1.03239739, - "balance_loss_mlp": 1.02689981, - "epoch": 0.6149406282879903, - "flos": 14063983977600.0, - "grad_norm": 2.209409032208413, - "language_loss": 0.62177163, - "learning_rate": 1.3637709713337164e-06, - "loss": 0.64274943, - "num_input_tokens_seen": 220252730, - "step": 10228, - "time_per_iteration": 2.797832489013672 - }, - { - "auxiliary_loss_clip": 0.0109079, - "auxiliary_loss_mlp": 0.01033221, - "balance_loss_clip": 1.03737903, - "balance_loss_mlp": 1.0200839, - "epoch": 0.6150007515406584, - "flos": 25190186400000.0, - "grad_norm": 2.3158396173840683, - "language_loss": 0.74483359, - "learning_rate": 1.3634017533576985e-06, - "loss": 0.7660737, - "num_input_tokens_seen": 220273345, - "step": 10229, - "time_per_iteration": 2.7949423789978027 - }, - { - "auxiliary_loss_clip": 0.01118363, - "auxiliary_loss_mlp": 0.01039286, - "balance_loss_clip": 1.04305434, - "balance_loss_mlp": 1.02533805, - "epoch": 0.6150608747933263, - "flos": 21945262625280.0, - "grad_norm": 1.6423781673268174, - "language_loss": 0.7801019, - "learning_rate": 1.3630325595215493e-06, - "loss": 0.80167842, - "num_input_tokens_seen": 220293845, - "step": 10230, - "time_per_iteration": 2.666316509246826 - }, - { - "auxiliary_loss_clip": 0.01086667, - "auxiliary_loss_mlp": 0.01029888, - "balance_loss_clip": 1.03686535, - "balance_loss_mlp": 1.01674509, - "epoch": 0.6151209980459943, - "flos": 30117453523200.0, - "grad_norm": 1.4482184431954421, - "language_loss": 0.73085076, - "learning_rate": 1.36266338983927e-06, - "loss": 0.75201631, - "num_input_tokens_seen": 220316070, - "step": 10231, - "time_per_iteration": 2.7693657875061035 - }, - { - "auxiliary_loss_clip": 0.01095915, - "auxiliary_loss_mlp": 0.01033904, - "balance_loss_clip": 1.04084241, - "balance_loss_mlp": 1.02099395, - "epoch": 0.6151811212986622, - "flos": 30008356940160.0, - "grad_norm": 1.525819080770735, - "language_loss": 0.69824755, - "learning_rate": 1.362294244324858e-06, - "loss": 0.71954578, - "num_input_tokens_seen": 220335695, - "step": 10232, - "time_per_iteration": 2.682452917098999 - }, - { - "auxiliary_loss_clip": 0.01099274, - "auxiliary_loss_mlp": 0.00770809, - "balance_loss_clip": 1.03777719, - "balance_loss_mlp": 1.00007868, - "epoch": 0.6152412445513302, - "flos": 18872888808960.0, - "grad_norm": 2.3038424014240215, - "language_loss": 0.91654289, - "learning_rate": 1.3619251229923126e-06, - "loss": 0.93524379, - "num_input_tokens_seen": 220353720, - "step": 10233, - "time_per_iteration": 2.6199569702148438 - }, - { - "auxiliary_loss_clip": 0.01083051, - "auxiliary_loss_mlp": 0.01033569, - "balance_loss_clip": 1.04041195, - "balance_loss_mlp": 1.02191687, - "epoch": 0.6153013678039982, - "flos": 25703601448320.0, - "grad_norm": 1.8226041312601646, - "language_loss": 0.71622181, - "learning_rate": 1.3615560258556306e-06, - "loss": 0.73738801, - "num_input_tokens_seen": 220372515, - "step": 10234, - "time_per_iteration": 2.6806395053863525 - }, - { - "auxiliary_loss_clip": 0.01107194, - "auxiliary_loss_mlp": 0.00771951, - "balance_loss_clip": 1.04099405, - "balance_loss_mlp": 1.0002284, - "epoch": 0.6153614910566662, - "flos": 28510271383680.0, - "grad_norm": 2.918285420802953, - "language_loss": 0.66839552, - "learning_rate": 1.3611869529288077e-06, - "loss": 0.68718696, - "num_input_tokens_seen": 220393490, - "step": 10235, - "time_per_iteration": 2.896367073059082 - }, - { - "auxiliary_loss_clip": 0.01102816, - "auxiliary_loss_mlp": 0.0103213, - "balance_loss_clip": 1.04112911, - "balance_loss_mlp": 1.01878452, - "epoch": 0.6154216143093342, - "flos": 23549787158400.0, - "grad_norm": 1.534901762766115, - "language_loss": 0.81011724, - "learning_rate": 1.3608179042258398e-06, - "loss": 0.83146667, - "num_input_tokens_seen": 220412855, - "step": 10236, - "time_per_iteration": 2.679506301879883 - }, - { - "auxiliary_loss_clip": 0.01117813, - "auxiliary_loss_mlp": 0.01032266, - "balance_loss_clip": 1.04047644, - "balance_loss_mlp": 1.01949906, - "epoch": 0.6154817375620021, - "flos": 22748081552640.0, - "grad_norm": 1.522081781804378, - "language_loss": 0.80553526, - "learning_rate": 1.360448879760721e-06, - "loss": 0.82703608, - "num_input_tokens_seen": 220433440, - "step": 10237, - "time_per_iteration": 2.6127498149871826 - }, - { - "auxiliary_loss_clip": 0.011004, - "auxiliary_loss_mlp": 0.01042204, - "balance_loss_clip": 1.04215753, - "balance_loss_mlp": 1.02890038, - "epoch": 0.6155418608146701, - "flos": 27162975121920.0, - "grad_norm": 1.7653521660078044, - "language_loss": 0.75694555, - "learning_rate": 1.3600798795474449e-06, - "loss": 0.77837157, - "num_input_tokens_seen": 220453445, - "step": 10238, - "time_per_iteration": 2.7021820545196533 - }, - { - "auxiliary_loss_clip": 0.00990356, - "auxiliary_loss_mlp": 0.01013988, - "balance_loss_clip": 1.01446486, - "balance_loss_mlp": 1.01235473, - "epoch": 0.615601984067338, - "flos": 68811165014400.0, - "grad_norm": 0.760761219232036, - "language_loss": 0.57602662, - "learning_rate": 1.3597109036000036e-06, - "loss": 0.59607005, - "num_input_tokens_seen": 220509730, - "step": 10239, - "time_per_iteration": 3.3009963035583496 - }, - { - "auxiliary_loss_clip": 0.01096252, - "auxiliary_loss_mlp": 0.0103361, - "balance_loss_clip": 1.03823948, - "balance_loss_mlp": 1.01997805, - "epoch": 0.615662107320006, - "flos": 15517144598400.0, - "grad_norm": 1.7796767695280624, - "language_loss": 0.77439094, - "learning_rate": 1.3593419519323892e-06, - "loss": 0.79568958, - "num_input_tokens_seen": 220527295, - "step": 10240, - "time_per_iteration": 2.7327582836151123 - }, - { - "auxiliary_loss_clip": 0.011174, - "auxiliary_loss_mlp": 0.01036437, - "balance_loss_clip": 1.04190874, - "balance_loss_mlp": 1.02288342, - "epoch": 0.615722230572674, - "flos": 21063691128960.0, - "grad_norm": 3.4544456151934315, - "language_loss": 0.73013711, - "learning_rate": 1.3589730245585922e-06, - "loss": 0.75167549, - "num_input_tokens_seen": 220542730, - "step": 10241, - "time_per_iteration": 2.6023552417755127 - }, - { - "auxiliary_loss_clip": 0.01112719, - "auxiliary_loss_mlp": 0.01028871, - "balance_loss_clip": 1.03958392, - "balance_loss_mlp": 1.01619887, - "epoch": 0.615782353825342, - "flos": 23256791919360.0, - "grad_norm": 1.6070807308789545, - "language_loss": 0.72045815, - "learning_rate": 1.3586041214926018e-06, - "loss": 0.7418741, - "num_input_tokens_seen": 220562995, - "step": 10242, - "time_per_iteration": 2.6226117610931396 - }, - { - "auxiliary_loss_clip": 0.0110498, - "auxiliary_loss_mlp": 0.01029697, - "balance_loss_clip": 1.04025722, - "balance_loss_mlp": 1.01723933, - "epoch": 0.6158424770780099, - "flos": 21103911383040.0, - "grad_norm": 3.2758585328662693, - "language_loss": 0.72332186, - "learning_rate": 1.3582352427484086e-06, - "loss": 0.74466866, - "num_input_tokens_seen": 220581775, - "step": 10243, - "time_per_iteration": 2.6781527996063232 - }, - { - "auxiliary_loss_clip": 0.01030422, - "auxiliary_loss_mlp": 0.01003075, - "balance_loss_clip": 1.01600218, - "balance_loss_mlp": 1.00200224, - "epoch": 0.6159026003306779, - "flos": 70333276769280.0, - "grad_norm": 0.7877801586989086, - "language_loss": 0.56873554, - "learning_rate": 1.3578663883399984e-06, - "loss": 0.5890705, - "num_input_tokens_seen": 220646395, - "step": 10244, - "time_per_iteration": 3.2125418186187744 - }, - { - "auxiliary_loss_clip": 0.01114981, - "auxiliary_loss_mlp": 0.01034292, - "balance_loss_clip": 1.03982329, - "balance_loss_mlp": 1.02022541, - "epoch": 0.6159627235833458, - "flos": 33874355802240.0, - "grad_norm": 1.5742269245602847, - "language_loss": 0.63524461, - "learning_rate": 1.3574975582813593e-06, - "loss": 0.65673733, - "num_input_tokens_seen": 220668335, - "step": 10245, - "time_per_iteration": 2.7619571685791016 - }, - { - "auxiliary_loss_clip": 0.01065921, - "auxiliary_loss_mlp": 0.01029863, - "balance_loss_clip": 1.03640854, - "balance_loss_mlp": 1.01676226, - "epoch": 0.6160228468360138, - "flos": 26575440359040.0, - "grad_norm": 2.04251017264565, - "language_loss": 0.79142463, - "learning_rate": 1.3571287525864771e-06, - "loss": 0.81238246, - "num_input_tokens_seen": 220688915, - "step": 10246, - "time_per_iteration": 2.799443483352661 - }, - { - "auxiliary_loss_clip": 0.01079892, - "auxiliary_loss_mlp": 0.00772846, - "balance_loss_clip": 1.03852773, - "balance_loss_mlp": 1.00013709, - "epoch": 0.6160829700886818, - "flos": 17193274894080.0, - "grad_norm": 3.4946061818357115, - "language_loss": 0.87453389, - "learning_rate": 1.3567599712693368e-06, - "loss": 0.89306134, - "num_input_tokens_seen": 220703465, - "step": 10247, - "time_per_iteration": 2.652655839920044 - }, - { - "auxiliary_loss_clip": 0.01044965, - "auxiliary_loss_mlp": 0.01035448, - "balance_loss_clip": 1.03624761, - "balance_loss_mlp": 1.02157784, - "epoch": 0.6161430933413498, - "flos": 23623547736960.0, - "grad_norm": 1.6669970799602325, - "language_loss": 0.79791045, - "learning_rate": 1.3563912143439235e-06, - "loss": 0.81871456, - "num_input_tokens_seen": 220722090, - "step": 10248, - "time_per_iteration": 2.742093563079834 - }, - { - "auxiliary_loss_clip": 0.01068661, - "auxiliary_loss_mlp": 0.010344, - "balance_loss_clip": 1.03618228, - "balance_loss_mlp": 1.02193117, - "epoch": 0.6162032165940178, - "flos": 23002436736000.0, - "grad_norm": 3.2255403010195884, - "language_loss": 0.87085855, - "learning_rate": 1.3560224818242191e-06, - "loss": 0.89188921, - "num_input_tokens_seen": 220741075, - "step": 10249, - "time_per_iteration": 2.7385706901550293 - }, - { - "auxiliary_loss_clip": 0.01115811, - "auxiliary_loss_mlp": 0.01026714, - "balance_loss_clip": 1.04125154, - "balance_loss_mlp": 1.01251006, - "epoch": 0.6162633398466857, - "flos": 39421979740800.0, - "grad_norm": 2.234106446125174, - "language_loss": 0.69080746, - "learning_rate": 1.3556537737242072e-06, - "loss": 0.71223265, - "num_input_tokens_seen": 220763395, - "step": 10250, - "time_per_iteration": 2.736942768096924 - }, - { - "auxiliary_loss_clip": 0.0108508, - "auxiliary_loss_mlp": 0.0102783, - "balance_loss_clip": 1.03718221, - "balance_loss_mlp": 1.01533055, - "epoch": 0.6163234630993537, - "flos": 19244672530560.0, - "grad_norm": 1.84490130709099, - "language_loss": 0.74013072, - "learning_rate": 1.3552850900578692e-06, - "loss": 0.76125979, - "num_input_tokens_seen": 220780640, - "step": 10251, - "time_per_iteration": 2.736994504928589 - }, - { - "auxiliary_loss_clip": 0.01098297, - "auxiliary_loss_mlp": 0.01035781, - "balance_loss_clip": 1.03710103, - "balance_loss_mlp": 1.02119529, - "epoch": 0.6163835863520216, - "flos": 15961791058560.0, - "grad_norm": 2.3749552307580615, - "language_loss": 0.68138051, - "learning_rate": 1.3549164308391844e-06, - "loss": 0.7027213, - "num_input_tokens_seen": 220797960, - "step": 10252, - "time_per_iteration": 2.5879385471343994 - }, - { - "auxiliary_loss_clip": 0.00977001, - "auxiliary_loss_mlp": 0.01000711, - "balance_loss_clip": 1.01395059, - "balance_loss_mlp": 0.9993996, - "epoch": 0.6164437096046896, - "flos": 68103834393600.0, - "grad_norm": 0.8911370167619598, - "language_loss": 0.57833099, - "learning_rate": 1.3545477960821333e-06, - "loss": 0.59810811, - "num_input_tokens_seen": 220856930, - "step": 10253, - "time_per_iteration": 6.5962769985198975 - }, - { - "auxiliary_loss_clip": 0.0109176, - "auxiliary_loss_mlp": 0.01033043, - "balance_loss_clip": 1.03666162, - "balance_loss_mlp": 1.01960826, - "epoch": 0.6165038328573575, - "flos": 21361211481600.0, - "grad_norm": 1.506953433371801, - "language_loss": 0.80028725, - "learning_rate": 1.3541791858006946e-06, - "loss": 0.82153523, - "num_input_tokens_seen": 220877595, - "step": 10254, - "time_per_iteration": 4.457768678665161 - }, - { - "auxiliary_loss_clip": 0.01092373, - "auxiliary_loss_mlp": 0.01030068, - "balance_loss_clip": 1.04135227, - "balance_loss_mlp": 1.01689541, - "epoch": 0.6165639561100256, - "flos": 21101972048640.0, - "grad_norm": 2.217497401179692, - "language_loss": 0.80495244, - "learning_rate": 1.353810600008846e-06, - "loss": 0.82617688, - "num_input_tokens_seen": 220896880, - "step": 10255, - "time_per_iteration": 2.730621814727783 - }, - { - "auxiliary_loss_clip": 0.010977, - "auxiliary_loss_mlp": 0.0103255, - "balance_loss_clip": 1.04147696, - "balance_loss_mlp": 1.01882291, - "epoch": 0.6166240793626935, - "flos": 25338533569920.0, - "grad_norm": 2.145694668444534, - "language_loss": 0.65628386, - "learning_rate": 1.3534420387205646e-06, - "loss": 0.67758632, - "num_input_tokens_seen": 220916425, - "step": 10256, - "time_per_iteration": 2.7114098072052 - }, - { - "auxiliary_loss_clip": 0.01103834, - "auxiliary_loss_mlp": 0.01031477, - "balance_loss_clip": 1.04223847, - "balance_loss_mlp": 1.01924038, - "epoch": 0.6166842026153615, - "flos": 19682639061120.0, - "grad_norm": 1.5926214774863399, - "language_loss": 0.7198689, - "learning_rate": 1.353073501949825e-06, - "loss": 0.74122202, - "num_input_tokens_seen": 220935050, - "step": 10257, - "time_per_iteration": 2.633733034133911 - }, - { - "auxiliary_loss_clip": 0.01096088, - "auxiliary_loss_mlp": 0.01034844, - "balance_loss_clip": 1.04075146, - "balance_loss_mlp": 1.02102792, - "epoch": 0.6167443258680294, - "flos": 19318361281920.0, - "grad_norm": 1.5727725354833466, - "language_loss": 0.72232676, - "learning_rate": 1.3527049897106034e-06, - "loss": 0.74363607, - "num_input_tokens_seen": 220953085, - "step": 10258, - "time_per_iteration": 4.227793455123901 - }, - { - "auxiliary_loss_clip": 0.010877, - "auxiliary_loss_mlp": 0.01041882, - "balance_loss_clip": 1.03643775, - "balance_loss_mlp": 1.02724326, - "epoch": 0.6168044491206974, - "flos": 25265239868160.0, - "grad_norm": 2.5764422484709026, - "language_loss": 0.63939095, - "learning_rate": 1.3523365020168735e-06, - "loss": 0.66068673, - "num_input_tokens_seen": 220969050, - "step": 10259, - "time_per_iteration": 2.66133713722229 - }, - { - "auxiliary_loss_clip": 0.01079598, - "auxiliary_loss_mlp": 0.01032477, - "balance_loss_clip": 1.04043519, - "balance_loss_mlp": 1.01882792, - "epoch": 0.6168645723733654, - "flos": 13219903301760.0, - "grad_norm": 1.7806797732317314, - "language_loss": 0.71367824, - "learning_rate": 1.3519680388826084e-06, - "loss": 0.73479903, - "num_input_tokens_seen": 220985825, - "step": 10260, - "time_per_iteration": 2.7046947479248047 - }, - { - "auxiliary_loss_clip": 0.01112627, - "auxiliary_loss_mlp": 0.010363, - "balance_loss_clip": 1.04544723, - "balance_loss_mlp": 1.02161956, - "epoch": 0.6169246956260334, - "flos": 26652038112000.0, - "grad_norm": 2.1654324787038366, - "language_loss": 0.68724519, - "learning_rate": 1.3515996003217803e-06, - "loss": 0.70873445, - "num_input_tokens_seen": 221004465, - "step": 10261, - "time_per_iteration": 2.6891751289367676 - }, - { - "auxiliary_loss_clip": 0.01077329, - "auxiliary_loss_mlp": 0.01039226, - "balance_loss_clip": 1.03780389, - "balance_loss_mlp": 1.02766895, - "epoch": 0.6169848188787014, - "flos": 23148413608320.0, - "grad_norm": 2.004758584780846, - "language_loss": 0.71780062, - "learning_rate": 1.3512311863483602e-06, - "loss": 0.73896611, - "num_input_tokens_seen": 221023260, - "step": 10262, - "time_per_iteration": 2.7089951038360596 - }, - { - "auxiliary_loss_clip": 0.01096265, - "auxiliary_loss_mlp": 0.01036729, - "balance_loss_clip": 1.0397017, - "balance_loss_mlp": 1.02370548, - "epoch": 0.6170449421313693, - "flos": 23331917214720.0, - "grad_norm": 1.9399509227658047, - "language_loss": 0.70199084, - "learning_rate": 1.3508627969763188e-06, - "loss": 0.72332084, - "num_input_tokens_seen": 221043090, - "step": 10263, - "time_per_iteration": 2.750321865081787 - }, - { - "auxiliary_loss_clip": 0.01051355, - "auxiliary_loss_mlp": 0.01030928, - "balance_loss_clip": 1.03560829, - "balance_loss_mlp": 1.01777899, - "epoch": 0.6171050653840373, - "flos": 15851617067520.0, - "grad_norm": 2.2572438712768217, - "language_loss": 0.75942671, - "learning_rate": 1.3504944322196244e-06, - "loss": 0.78024954, - "num_input_tokens_seen": 221061435, - "step": 10264, - "time_per_iteration": 2.868535041809082 - }, - { - "auxiliary_loss_clip": 0.0111535, - "auxiliary_loss_mlp": 0.01034326, - "balance_loss_clip": 1.04105282, - "balance_loss_mlp": 1.0207653, - "epoch": 0.6171651886367052, - "flos": 20045516209920.0, - "grad_norm": 2.372576687009926, - "language_loss": 0.85552394, - "learning_rate": 1.350126092092247e-06, - "loss": 0.87702072, - "num_input_tokens_seen": 221078705, - "step": 10265, - "time_per_iteration": 2.8565142154693604 - }, - { - "auxiliary_loss_clip": 0.01067477, - "auxiliary_loss_mlp": 0.01039322, - "balance_loss_clip": 1.04373622, - "balance_loss_mlp": 1.0262332, - "epoch": 0.6172253118893732, - "flos": 26432695710720.0, - "grad_norm": 1.8305416019911092, - "language_loss": 0.64584678, - "learning_rate": 1.349757776608153e-06, - "loss": 0.66691476, - "num_input_tokens_seen": 221099245, - "step": 10266, - "time_per_iteration": 2.8642327785491943 - }, - { - "auxiliary_loss_clip": 0.01077105, - "auxiliary_loss_mlp": 0.01033494, - "balance_loss_clip": 1.03542173, - "balance_loss_mlp": 1.02038074, - "epoch": 0.6172854351420412, - "flos": 22632879657600.0, - "grad_norm": 1.5801224931645446, - "language_loss": 0.75690526, - "learning_rate": 1.3493894857813094e-06, - "loss": 0.77801126, - "num_input_tokens_seen": 221116930, - "step": 10267, - "time_per_iteration": 2.6700358390808105 - }, - { - "auxiliary_loss_clip": 0.01085691, - "auxiliary_loss_mlp": 0.01032961, - "balance_loss_clip": 1.03821349, - "balance_loss_mlp": 1.01927543, - "epoch": 0.6173455583947092, - "flos": 21212936138880.0, - "grad_norm": 1.8670913933452218, - "language_loss": 0.75156605, - "learning_rate": 1.3490212196256818e-06, - "loss": 0.77275252, - "num_input_tokens_seen": 221137660, - "step": 10268, - "time_per_iteration": 2.696876287460327 - }, - { - "auxiliary_loss_clip": 0.01094834, - "auxiliary_loss_mlp": 0.01028833, - "balance_loss_clip": 1.03917122, - "balance_loss_mlp": 1.01574397, - "epoch": 0.6174056816473771, - "flos": 19500284689920.0, - "grad_norm": 1.6535000846549075, - "language_loss": 0.75516117, - "learning_rate": 1.3486529781552342e-06, - "loss": 0.77639782, - "num_input_tokens_seen": 221156225, - "step": 10269, - "time_per_iteration": 2.602811098098755 - }, - { - "auxiliary_loss_clip": 0.01112983, - "auxiliary_loss_mlp": 0.01032291, - "balance_loss_clip": 1.03888416, - "balance_loss_mlp": 1.01934433, - "epoch": 0.6174658049000451, - "flos": 15997342544640.0, - "grad_norm": 2.0658565412775864, - "language_loss": 0.76633871, - "learning_rate": 1.3482847613839318e-06, - "loss": 0.78779137, - "num_input_tokens_seen": 221173820, - "step": 10270, - "time_per_iteration": 2.4974937438964844 - }, - { - "auxiliary_loss_clip": 0.01094367, - "auxiliary_loss_mlp": 0.01029373, - "balance_loss_clip": 1.03897905, - "balance_loss_mlp": 1.01614046, - "epoch": 0.617525928152713, - "flos": 21903893136000.0, - "grad_norm": 1.7132984501088018, - "language_loss": 0.82571089, - "learning_rate": 1.347916569325736e-06, - "loss": 0.84694827, - "num_input_tokens_seen": 221191815, - "step": 10271, - "time_per_iteration": 2.5579023361206055 - }, - { - "auxiliary_loss_clip": 0.01117578, - "auxiliary_loss_mlp": 0.00770278, - "balance_loss_clip": 1.04181647, - "balance_loss_mlp": 1.00026119, - "epoch": 0.617586051405381, - "flos": 21105958458240.0, - "grad_norm": 1.7753710890796277, - "language_loss": 0.77119303, - "learning_rate": 1.3475484019946093e-06, - "loss": 0.79007161, - "num_input_tokens_seen": 221211205, - "step": 10272, - "time_per_iteration": 2.5040929317474365 - }, - { - "auxiliary_loss_clip": 0.01010193, - "auxiliary_loss_mlp": 0.01008445, - "balance_loss_clip": 1.01500225, - "balance_loss_mlp": 1.00734258, - "epoch": 0.617646174658049, - "flos": 58610776665600.0, - "grad_norm": 0.8102559733678494, - "language_loss": 0.59036177, - "learning_rate": 1.347180259404513e-06, - "loss": 0.61054814, - "num_input_tokens_seen": 221268430, - "step": 10273, - "time_per_iteration": 3.0667202472686768 - }, - { - "auxiliary_loss_clip": 0.0108364, - "auxiliary_loss_mlp": 0.01039906, - "balance_loss_clip": 1.03496802, - "balance_loss_mlp": 1.02545786, - "epoch": 0.617706297910717, - "flos": 13878684691200.0, - "grad_norm": 2.411144915020525, - "language_loss": 0.73045421, - "learning_rate": 1.3468121415694059e-06, - "loss": 0.75168967, - "num_input_tokens_seen": 221281930, - "step": 10274, - "time_per_iteration": 2.608651638031006 - }, - { - "auxiliary_loss_clip": 0.0110423, - "auxiliary_loss_mlp": 0.00770133, - "balance_loss_clip": 1.04004967, - "balance_loss_mlp": 1.00015223, - "epoch": 0.617766421163385, - "flos": 19208438686080.0, - "grad_norm": 2.134780547516878, - "language_loss": 0.77694172, - "learning_rate": 1.3464440485032484e-06, - "loss": 0.79568529, - "num_input_tokens_seen": 221301605, - "step": 10275, - "time_per_iteration": 2.588878631591797 - }, - { - "auxiliary_loss_clip": 0.01073523, - "auxiliary_loss_mlp": 0.01029844, - "balance_loss_clip": 1.03674793, - "balance_loss_mlp": 1.01733303, - "epoch": 0.6178265444160529, - "flos": 22565978576640.0, - "grad_norm": 2.554653383498776, - "language_loss": 0.79304695, - "learning_rate": 1.346075980219998e-06, - "loss": 0.8140806, - "num_input_tokens_seen": 221320105, - "step": 10276, - "time_per_iteration": 2.704596757888794 - }, - { - "auxiliary_loss_clip": 0.0104785, - "auxiliary_loss_mlp": 0.0103935, - "balance_loss_clip": 1.03442883, - "balance_loss_mlp": 1.02518225, - "epoch": 0.6178866676687209, - "flos": 11984289402240.0, - "grad_norm": 1.984181156670454, - "language_loss": 0.80967486, - "learning_rate": 1.345707936733612e-06, - "loss": 0.83054686, - "num_input_tokens_seen": 221335915, - "step": 10277, - "time_per_iteration": 2.7356364727020264 - }, - { - "auxiliary_loss_clip": 0.01088845, - "auxiliary_loss_mlp": 0.01030881, - "balance_loss_clip": 1.04154968, - "balance_loss_mlp": 1.01682067, - "epoch": 0.6179467909213888, - "flos": 20991510748800.0, - "grad_norm": 1.5775634797191704, - "language_loss": 0.81279171, - "learning_rate": 1.3453399180580466e-06, - "loss": 0.83398896, - "num_input_tokens_seen": 221353965, - "step": 10278, - "time_per_iteration": 2.703054666519165 - }, - { - "auxiliary_loss_clip": 0.0106686, - "auxiliary_loss_mlp": 0.00769812, - "balance_loss_clip": 1.03503084, - "balance_loss_mlp": 1.00006652, - "epoch": 0.6180069141740568, - "flos": 25338102606720.0, - "grad_norm": 1.5156506321196916, - "language_loss": 0.74347699, - "learning_rate": 1.3449719242072567e-06, - "loss": 0.76184368, - "num_input_tokens_seen": 221374080, - "step": 10279, - "time_per_iteration": 2.777080774307251 - }, - { - "auxiliary_loss_clip": 0.01096628, - "auxiliary_loss_mlp": 0.010318, - "balance_loss_clip": 1.03583622, - "balance_loss_mlp": 1.01950932, - "epoch": 0.6180670374267248, - "flos": 19645722858240.0, - "grad_norm": 1.5230976022896776, - "language_loss": 0.70880997, - "learning_rate": 1.3446039551951975e-06, - "loss": 0.73009425, - "num_input_tokens_seen": 221392910, - "step": 10280, - "time_per_iteration": 2.682345151901245 - }, - { - "auxiliary_loss_clip": 0.01116485, - "auxiliary_loss_mlp": 0.01035107, - "balance_loss_clip": 1.04136443, - "balance_loss_mlp": 1.02197635, - "epoch": 0.6181271606793928, - "flos": 19464876858240.0, - "grad_norm": 1.5388475151652443, - "language_loss": 0.72637439, - "learning_rate": 1.3442360110358215e-06, - "loss": 0.74789023, - "num_input_tokens_seen": 221410990, - "step": 10281, - "time_per_iteration": 2.546891927719116 - }, - { - "auxiliary_loss_clip": 0.01091569, - "auxiliary_loss_mlp": 0.0102895, - "balance_loss_clip": 1.04059482, - "balance_loss_mlp": 1.01733923, - "epoch": 0.6181872839320607, - "flos": 25594289383680.0, - "grad_norm": 1.5263826245103997, - "language_loss": 0.76680994, - "learning_rate": 1.3438680917430827e-06, - "loss": 0.78801513, - "num_input_tokens_seen": 221431020, - "step": 10282, - "time_per_iteration": 2.6794841289520264 - }, - { - "auxiliary_loss_clip": 0.0108706, - "auxiliary_loss_mlp": 0.01034652, - "balance_loss_clip": 1.03559065, - "balance_loss_mlp": 1.01857102, - "epoch": 0.6182474071847287, - "flos": 25551806572800.0, - "grad_norm": 1.675077981875324, - "language_loss": 0.69088876, - "learning_rate": 1.343500197330931e-06, - "loss": 0.71210587, - "num_input_tokens_seen": 221453235, - "step": 10283, - "time_per_iteration": 2.704653263092041 - }, - { - "auxiliary_loss_clip": 0.01110364, - "auxiliary_loss_mlp": 0.01029861, - "balance_loss_clip": 1.03980327, - "balance_loss_mlp": 1.01613414, - "epoch": 0.6183075304373966, - "flos": 22123738327680.0, - "grad_norm": 1.6796519430341141, - "language_loss": 0.75191927, - "learning_rate": 1.3431323278133176e-06, - "loss": 0.77332163, - "num_input_tokens_seen": 221472560, - "step": 10284, - "time_per_iteration": 2.613283395767212 - }, - { - "auxiliary_loss_clip": 0.010977, - "auxiliary_loss_mlp": 0.01036897, - "balance_loss_clip": 1.04041815, - "balance_loss_mlp": 1.02422476, - "epoch": 0.6183676536900646, - "flos": 22455589104000.0, - "grad_norm": 1.4535785054838537, - "language_loss": 0.75249875, - "learning_rate": 1.3427644832041922e-06, - "loss": 0.77384472, - "num_input_tokens_seen": 221492835, - "step": 10285, - "time_per_iteration": 2.661404848098755 - }, - { - "auxiliary_loss_clip": 0.01076492, - "auxiliary_loss_mlp": 0.01032852, - "balance_loss_clip": 1.03464127, - "balance_loss_mlp": 1.02047253, - "epoch": 0.6184277769427327, - "flos": 23364128736000.0, - "grad_norm": 1.9348516071602069, - "language_loss": 0.72801822, - "learning_rate": 1.342396663517503e-06, - "loss": 0.74911165, - "num_input_tokens_seen": 221511870, - "step": 10286, - "time_per_iteration": 2.7692575454711914 - }, - { - "auxiliary_loss_clip": 0.01112181, - "auxiliary_loss_mlp": 0.01029502, - "balance_loss_clip": 1.03996992, - "balance_loss_mlp": 1.01705098, - "epoch": 0.6184879001954006, - "flos": 22711057608960.0, - "grad_norm": 1.6994058202973141, - "language_loss": 0.76147521, - "learning_rate": 1.342028868767199e-06, - "loss": 0.78289199, - "num_input_tokens_seen": 221529915, - "step": 10287, - "time_per_iteration": 2.737244129180908 - }, - { - "auxiliary_loss_clip": 0.01075986, - "auxiliary_loss_mlp": 0.01033857, - "balance_loss_clip": 1.038939, - "balance_loss_mlp": 1.02116728, - "epoch": 0.6185480234480686, - "flos": 23841920471040.0, - "grad_norm": 1.661792493637227, - "language_loss": 0.73342609, - "learning_rate": 1.3416610989672262e-06, - "loss": 0.75452453, - "num_input_tokens_seen": 221549745, - "step": 10288, - "time_per_iteration": 2.738234281539917 - }, - { - "auxiliary_loss_clip": 0.01099888, - "auxiliary_loss_mlp": 0.01035399, - "balance_loss_clip": 1.03925002, - "balance_loss_mlp": 1.0233885, - "epoch": 0.6186081467007365, - "flos": 45477595774080.0, - "grad_norm": 1.4788464659042324, - "language_loss": 0.72843671, - "learning_rate": 1.3412933541315296e-06, - "loss": 0.7497896, - "num_input_tokens_seen": 221572455, - "step": 10289, - "time_per_iteration": 2.870210886001587 - }, - { - "auxiliary_loss_clip": 0.01088106, - "auxiliary_loss_mlp": 0.01030969, - "balance_loss_clip": 1.0376215, - "balance_loss_mlp": 1.01749849, - "epoch": 0.6186682699534045, - "flos": 23550864566400.0, - "grad_norm": 1.4742798847115595, - "language_loss": 0.79430723, - "learning_rate": 1.340925634274056e-06, - "loss": 0.81549788, - "num_input_tokens_seen": 221591325, - "step": 10290, - "time_per_iteration": 2.7061526775360107 - }, - { - "auxiliary_loss_clip": 0.01104029, - "auxiliary_loss_mlp": 0.01033504, - "balance_loss_clip": 1.03934646, - "balance_loss_mlp": 1.02068937, - "epoch": 0.6187283932060724, - "flos": 25774201630080.0, - "grad_norm": 1.6274786697127714, - "language_loss": 0.81492877, - "learning_rate": 1.3405579394087475e-06, - "loss": 0.83630407, - "num_input_tokens_seen": 221611640, - "step": 10291, - "time_per_iteration": 2.664706230163574 - }, - { - "auxiliary_loss_clip": 0.01114199, - "auxiliary_loss_mlp": 0.01034165, - "balance_loss_clip": 1.04050338, - "balance_loss_mlp": 1.02185655, - "epoch": 0.6187885164587404, - "flos": 25265203954560.0, - "grad_norm": 1.5926453232151345, - "language_loss": 0.77492392, - "learning_rate": 1.3401902695495487e-06, - "loss": 0.79640758, - "num_input_tokens_seen": 221631225, - "step": 10292, - "time_per_iteration": 4.222437381744385 - }, - { - "auxiliary_loss_clip": 0.01085532, - "auxiliary_loss_mlp": 0.01041109, - "balance_loss_clip": 1.03610599, - "balance_loss_mlp": 1.02526617, - "epoch": 0.6188486397114084, - "flos": 26250772302720.0, - "grad_norm": 2.004631291368857, - "language_loss": 0.7354871, - "learning_rate": 1.339822624710401e-06, - "loss": 0.75675344, - "num_input_tokens_seen": 221651035, - "step": 10293, - "time_per_iteration": 4.283612251281738 - }, - { - "auxiliary_loss_clip": 0.01083695, - "auxiliary_loss_mlp": 0.0077033, - "balance_loss_clip": 1.03986382, - "balance_loss_mlp": 1.00014317, - "epoch": 0.6189087629640764, - "flos": 20923388605440.0, - "grad_norm": 1.9118403389506524, - "language_loss": 0.8346625, - "learning_rate": 1.3394550049052454e-06, - "loss": 0.85320276, - "num_input_tokens_seen": 221671300, - "step": 10294, - "time_per_iteration": 4.339020013809204 - }, - { - "auxiliary_loss_clip": 0.01097661, - "auxiliary_loss_mlp": 0.01034696, - "balance_loss_clip": 1.04166722, - "balance_loss_mlp": 1.02219725, - "epoch": 0.6189688862167443, - "flos": 14829814874880.0, - "grad_norm": 2.141454584748579, - "language_loss": 0.706837, - "learning_rate": 1.3390874101480225e-06, - "loss": 0.72816062, - "num_input_tokens_seen": 221687320, - "step": 10295, - "time_per_iteration": 2.631901264190674 - }, - { - "auxiliary_loss_clip": 0.01115282, - "auxiliary_loss_mlp": 0.01039296, - "balance_loss_clip": 1.04228771, - "balance_loss_mlp": 1.02599859, - "epoch": 0.6190290094694123, - "flos": 24285058560000.0, - "grad_norm": 1.7512650583676883, - "language_loss": 0.70329851, - "learning_rate": 1.3387198404526705e-06, - "loss": 0.72484434, - "num_input_tokens_seen": 221710175, - "step": 10296, - "time_per_iteration": 2.689392566680908 - }, - { - "auxiliary_loss_clip": 0.01081279, - "auxiliary_loss_mlp": 0.01034769, - "balance_loss_clip": 1.03957784, - "balance_loss_mlp": 1.02048767, - "epoch": 0.6190891327220802, - "flos": 22529457423360.0, - "grad_norm": 1.9634695381003797, - "language_loss": 0.71536231, - "learning_rate": 1.3383522958331287e-06, - "loss": 0.73652285, - "num_input_tokens_seen": 221728145, - "step": 10297, - "time_per_iteration": 2.7065582275390625 - }, - { - "auxiliary_loss_clip": 0.01036404, - "auxiliary_loss_mlp": 0.01000643, - "balance_loss_clip": 1.01235867, - "balance_loss_mlp": 0.99964732, - "epoch": 0.6191492559747482, - "flos": 67729357152000.0, - "grad_norm": 0.8790158844538737, - "language_loss": 0.64109659, - "learning_rate": 1.3379847763033345e-06, - "loss": 0.66146708, - "num_input_tokens_seen": 221786100, - "step": 10298, - "time_per_iteration": 4.634017467498779 - }, - { - "auxiliary_loss_clip": 0.01116645, - "auxiliary_loss_mlp": 0.01033648, - "balance_loss_clip": 1.04158056, - "balance_loss_mlp": 1.02121425, - "epoch": 0.6192093792274163, - "flos": 22346672088960.0, - "grad_norm": 1.7807348336033566, - "language_loss": 0.74117303, - "learning_rate": 1.3376172818772236e-06, - "loss": 0.762676, - "num_input_tokens_seen": 221806450, - "step": 10299, - "time_per_iteration": 2.6040680408477783 - }, - { - "auxiliary_loss_clip": 0.01108454, - "auxiliary_loss_mlp": 0.01030477, - "balance_loss_clip": 1.0418222, - "balance_loss_mlp": 1.01792383, - "epoch": 0.6192695024800842, - "flos": 13553944807680.0, - "grad_norm": 1.8290075775776669, - "language_loss": 0.68678868, - "learning_rate": 1.337249812568732e-06, - "loss": 0.70817792, - "num_input_tokens_seen": 221823330, - "step": 10300, - "time_per_iteration": 2.641167163848877 - }, - { - "auxiliary_loss_clip": 0.01101551, - "auxiliary_loss_mlp": 0.00770748, - "balance_loss_clip": 1.04044676, - "balance_loss_mlp": 1.00015926, - "epoch": 0.6193296257327522, - "flos": 17415310815360.0, - "grad_norm": 1.7786248978132038, - "language_loss": 0.66813135, - "learning_rate": 1.3368823683917939e-06, - "loss": 0.68685436, - "num_input_tokens_seen": 221839360, - "step": 10301, - "time_per_iteration": 2.639004945755005 - }, - { - "auxiliary_loss_clip": 0.01072819, - "auxiliary_loss_mlp": 0.01035838, - "balance_loss_clip": 1.03622746, - "balance_loss_mlp": 1.02365446, - "epoch": 0.6193897489854201, - "flos": 31101118450560.0, - "grad_norm": 1.5932766793388897, - "language_loss": 0.72753853, - "learning_rate": 1.3365149493603424e-06, - "loss": 0.74862504, - "num_input_tokens_seen": 221859465, - "step": 10302, - "time_per_iteration": 2.7263267040252686 - }, - { - "auxiliary_loss_clip": 0.01090931, - "auxiliary_loss_mlp": 0.01030697, - "balance_loss_clip": 1.0426929, - "balance_loss_mlp": 1.01734614, - "epoch": 0.6194498722380881, - "flos": 19134031662720.0, - "grad_norm": 1.7635802463343486, - "language_loss": 0.80626869, - "learning_rate": 1.3361475554883107e-06, - "loss": 0.82748497, - "num_input_tokens_seen": 221878555, - "step": 10303, - "time_per_iteration": 2.674865961074829 - }, - { - "auxiliary_loss_clip": 0.01117513, - "auxiliary_loss_mlp": 0.01032597, - "balance_loss_clip": 1.04101253, - "balance_loss_mlp": 1.01882231, - "epoch": 0.619509995490756, - "flos": 21835088634240.0, - "grad_norm": 4.546006861231834, - "language_loss": 0.76722652, - "learning_rate": 1.3357801867896307e-06, - "loss": 0.78872764, - "num_input_tokens_seen": 221898790, - "step": 10304, - "time_per_iteration": 2.578068256378174 - }, - { - "auxiliary_loss_clip": 0.01085456, - "auxiliary_loss_mlp": 0.0103497, - "balance_loss_clip": 1.04078317, - "balance_loss_mlp": 1.02160096, - "epoch": 0.619570118743424, - "flos": 23806548552960.0, - "grad_norm": 2.037308303130727, - "language_loss": 0.77085918, - "learning_rate": 1.3354128432782324e-06, - "loss": 0.79206347, - "num_input_tokens_seen": 221918875, - "step": 10305, - "time_per_iteration": 2.6557652950286865 - }, - { - "auxiliary_loss_clip": 0.01112573, - "auxiliary_loss_mlp": 0.01033009, - "balance_loss_clip": 1.04317331, - "balance_loss_mlp": 1.01832271, - "epoch": 0.619630241996092, - "flos": 21101612912640.0, - "grad_norm": 1.5905815409224004, - "language_loss": 0.7876581, - "learning_rate": 1.335045524968045e-06, - "loss": 0.80911398, - "num_input_tokens_seen": 221937895, - "step": 10306, - "time_per_iteration": 2.58312726020813 - }, - { - "auxiliary_loss_clip": 0.01056494, - "auxiliary_loss_mlp": 0.01030878, - "balance_loss_clip": 1.03859866, - "balance_loss_mlp": 1.0192728, - "epoch": 0.61969036524876, - "flos": 27308269635840.0, - "grad_norm": 1.649742748314876, - "language_loss": 0.80246294, - "learning_rate": 1.3346782318729988e-06, - "loss": 0.82333666, - "num_input_tokens_seen": 221955920, - "step": 10307, - "time_per_iteration": 2.7693941593170166 - }, - { - "auxiliary_loss_clip": 0.01001046, - "auxiliary_loss_mlp": 0.01015241, - "balance_loss_clip": 1.01444507, - "balance_loss_mlp": 1.0141207, - "epoch": 0.6197504885014279, - "flos": 51648955384320.0, - "grad_norm": 0.8068090756771118, - "language_loss": 0.59387553, - "learning_rate": 1.3343109640070203e-06, - "loss": 0.61403841, - "num_input_tokens_seen": 222011405, - "step": 10308, - "time_per_iteration": 3.2183339595794678 - }, - { - "auxiliary_loss_clip": 0.01087174, - "auxiliary_loss_mlp": 0.01030837, - "balance_loss_clip": 1.03852654, - "balance_loss_mlp": 1.01956522, - "epoch": 0.6198106117540959, - "flos": 30557107992960.0, - "grad_norm": 1.7201847601109612, - "language_loss": 0.67907512, - "learning_rate": 1.333943721384037e-06, - "loss": 0.70025527, - "num_input_tokens_seen": 222034545, - "step": 10309, - "time_per_iteration": 2.728565216064453 - }, - { - "auxiliary_loss_clip": 0.01083478, - "auxiliary_loss_mlp": 0.0103687, - "balance_loss_clip": 1.03543091, - "balance_loss_mlp": 1.02430511, - "epoch": 0.6198707350067638, - "flos": 18909733184640.0, - "grad_norm": 1.5362872726536445, - "language_loss": 0.72323126, - "learning_rate": 1.3335765040179746e-06, - "loss": 0.74443471, - "num_input_tokens_seen": 222052690, - "step": 10310, - "time_per_iteration": 2.7349348068237305 - }, - { - "auxiliary_loss_clip": 0.01098291, - "auxiliary_loss_mlp": 0.01037346, - "balance_loss_clip": 1.04345024, - "balance_loss_mlp": 1.02295148, - "epoch": 0.6199308582594318, - "flos": 21433858738560.0, - "grad_norm": 2.3493071886977948, - "language_loss": 0.79078376, - "learning_rate": 1.3332093119227573e-06, - "loss": 0.81214017, - "num_input_tokens_seen": 222069095, - "step": 10311, - "time_per_iteration": 2.682654857635498 - }, - { - "auxiliary_loss_clip": 0.01081352, - "auxiliary_loss_mlp": 0.01035506, - "balance_loss_clip": 1.0394609, - "balance_loss_mlp": 1.02252364, - "epoch": 0.6199909815120999, - "flos": 18407379525120.0, - "grad_norm": 1.7307569913604643, - "language_loss": 0.72513938, - "learning_rate": 1.3328421451123105e-06, - "loss": 0.74630797, - "num_input_tokens_seen": 222087360, - "step": 10312, - "time_per_iteration": 2.677211284637451 - }, - { - "auxiliary_loss_clip": 0.01071298, - "auxiliary_loss_mlp": 0.01034778, - "balance_loss_clip": 1.04210687, - "balance_loss_mlp": 1.02137268, - "epoch": 0.6200511047647678, - "flos": 21466860359040.0, - "grad_norm": 3.7235217852030926, - "language_loss": 0.72115338, - "learning_rate": 1.3324750036005557e-06, - "loss": 0.74221408, - "num_input_tokens_seen": 222106130, - "step": 10313, - "time_per_iteration": 2.7689011096954346 - }, - { - "auxiliary_loss_clip": 0.01108898, - "auxiliary_loss_mlp": 0.01033235, - "balance_loss_clip": 1.04191053, - "balance_loss_mlp": 1.01971102, - "epoch": 0.6201112280174358, - "flos": 18215903099520.0, - "grad_norm": 1.7819666620639945, - "language_loss": 0.78249431, - "learning_rate": 1.332107887401416e-06, - "loss": 0.80391562, - "num_input_tokens_seen": 222123125, - "step": 10314, - "time_per_iteration": 2.618197441101074 - }, - { - "auxiliary_loss_clip": 0.01102699, - "auxiliary_loss_mlp": 0.01031591, - "balance_loss_clip": 1.03891587, - "balance_loss_mlp": 1.01907969, - "epoch": 0.6201713512701037, - "flos": 20011185786240.0, - "grad_norm": 1.747606387674539, - "language_loss": 0.78019774, - "learning_rate": 1.331740796528812e-06, - "loss": 0.80154061, - "num_input_tokens_seen": 222140655, - "step": 10315, - "time_per_iteration": 2.6219210624694824 - }, - { - "auxiliary_loss_clip": 0.01081861, - "auxiliary_loss_mlp": 0.01033596, - "balance_loss_clip": 1.0434972, - "balance_loss_mlp": 1.02088857, - "epoch": 0.6202314745227717, - "flos": 22487692884480.0, - "grad_norm": 2.153515207542012, - "language_loss": 0.76050055, - "learning_rate": 1.3313737309966641e-06, - "loss": 0.78165507, - "num_input_tokens_seen": 222160450, - "step": 10316, - "time_per_iteration": 2.766108989715576 - }, - { - "auxiliary_loss_clip": 0.01115322, - "auxiliary_loss_mlp": 0.01032068, - "balance_loss_clip": 1.03810644, - "balance_loss_mlp": 1.01903796, - "epoch": 0.6202915977754396, - "flos": 26828682220800.0, - "grad_norm": 2.0292313073024366, - "language_loss": 0.77797258, - "learning_rate": 1.3310066908188915e-06, - "loss": 0.79944646, - "num_input_tokens_seen": 222179170, - "step": 10317, - "time_per_iteration": 2.66479754447937 - }, - { - "auxiliary_loss_clip": 0.01017104, - "auxiliary_loss_mlp": 0.01000773, - "balance_loss_clip": 1.01230764, - "balance_loss_mlp": 0.99964064, - "epoch": 0.6203517210281076, - "flos": 62742694890240.0, - "grad_norm": 0.6983272901342329, - "language_loss": 0.59043646, - "learning_rate": 1.3306396760094122e-06, - "loss": 0.61061525, - "num_input_tokens_seen": 222242660, - "step": 10318, - "time_per_iteration": 3.26334547996521 - }, - { - "auxiliary_loss_clip": 0.01087685, - "auxiliary_loss_mlp": 0.01036361, - "balance_loss_clip": 1.04098892, - "balance_loss_mlp": 1.02262819, - "epoch": 0.6204118442807756, - "flos": 23404277162880.0, - "grad_norm": 1.7402353399266621, - "language_loss": 0.77895933, - "learning_rate": 1.330272686582143e-06, - "loss": 0.80019981, - "num_input_tokens_seen": 222262170, - "step": 10319, - "time_per_iteration": 2.729206085205078 - }, - { - "auxiliary_loss_clip": 0.01095977, - "auxiliary_loss_mlp": 0.01036689, - "balance_loss_clip": 1.04197454, - "balance_loss_mlp": 1.02473831, - "epoch": 0.6204719675334436, - "flos": 20193647898240.0, - "grad_norm": 1.990293472142164, - "language_loss": 0.66651958, - "learning_rate": 1.3299057225510013e-06, - "loss": 0.6878463, - "num_input_tokens_seen": 222280375, - "step": 10320, - "time_per_iteration": 2.6254241466522217 - }, - { - "auxiliary_loss_clip": 0.0107265, - "auxiliary_loss_mlp": 0.01032632, - "balance_loss_clip": 1.03743291, - "balance_loss_mlp": 1.02023411, - "epoch": 0.6205320907861115, - "flos": 13188050916480.0, - "grad_norm": 1.82656973457559, - "language_loss": 0.76147729, - "learning_rate": 1.3295387839299013e-06, - "loss": 0.78253013, - "num_input_tokens_seen": 222297325, - "step": 10321, - "time_per_iteration": 2.7273271083831787 - }, - { - "auxiliary_loss_clip": 0.01086085, - "auxiliary_loss_mlp": 0.01026791, - "balance_loss_clip": 1.03792763, - "balance_loss_mlp": 1.01485252, - "epoch": 0.6205922140387795, - "flos": 20668386977280.0, - "grad_norm": 1.806601811467465, - "language_loss": 0.73700678, - "learning_rate": 1.329171870732758e-06, - "loss": 0.75813556, - "num_input_tokens_seen": 222317095, - "step": 10322, - "time_per_iteration": 2.699514627456665 - }, - { - "auxiliary_loss_clip": 0.01074398, - "auxiliary_loss_mlp": 0.01028622, - "balance_loss_clip": 1.03568387, - "balance_loss_mlp": 1.01665354, - "epoch": 0.6206523372914474, - "flos": 23877831093120.0, - "grad_norm": 1.7201277094728098, - "language_loss": 0.72919118, - "learning_rate": 1.3288049829734845e-06, - "loss": 0.75022137, - "num_input_tokens_seen": 222337055, - "step": 10323, - "time_per_iteration": 2.743650436401367 - }, - { - "auxiliary_loss_clip": 0.01111352, - "auxiliary_loss_mlp": 0.0103222, - "balance_loss_clip": 1.04181314, - "balance_loss_mlp": 1.0182364, - "epoch": 0.6207124605441154, - "flos": 13406603218560.0, - "grad_norm": 2.6397698912445495, - "language_loss": 0.58581293, - "learning_rate": 1.3284381206659933e-06, - "loss": 0.60724854, - "num_input_tokens_seen": 222354515, - "step": 10324, - "time_per_iteration": 2.624112129211426 - }, - { - "auxiliary_loss_clip": 0.0107635, - "auxiliary_loss_mlp": 0.01039987, - "balance_loss_clip": 1.03851843, - "balance_loss_mlp": 1.02483535, - "epoch": 0.6207725837967835, - "flos": 18916341287040.0, - "grad_norm": 1.9960731674186785, - "language_loss": 0.77214384, - "learning_rate": 1.3280712838241956e-06, - "loss": 0.79330719, - "num_input_tokens_seen": 222372755, - "step": 10325, - "time_per_iteration": 2.7152631282806396 - }, - { - "auxiliary_loss_clip": 0.01106149, - "auxiliary_loss_mlp": 0.01030383, - "balance_loss_clip": 1.03993106, - "balance_loss_mlp": 1.01689494, - "epoch": 0.6208327070494514, - "flos": 23980211832960.0, - "grad_norm": 1.8479718801200147, - "language_loss": 0.72421134, - "learning_rate": 1.327704472462003e-06, - "loss": 0.74557668, - "num_input_tokens_seen": 222391380, - "step": 10326, - "time_per_iteration": 2.7786142826080322 - }, - { - "auxiliary_loss_clip": 0.01108733, - "auxiliary_loss_mlp": 0.01040278, - "balance_loss_clip": 1.04103386, - "balance_loss_mlp": 1.02686155, - "epoch": 0.6208928303021194, - "flos": 22820405587200.0, - "grad_norm": 2.631988552550178, - "language_loss": 0.74086714, - "learning_rate": 1.3273376865933234e-06, - "loss": 0.76235723, - "num_input_tokens_seen": 222411165, - "step": 10327, - "time_per_iteration": 2.6204168796539307 - }, - { - "auxiliary_loss_clip": 0.01090969, - "auxiliary_loss_mlp": 0.01032322, - "balance_loss_clip": 1.03982306, - "balance_loss_mlp": 1.01871443, - "epoch": 0.6209529535547873, - "flos": 17564519911680.0, - "grad_norm": 1.9488386802913455, - "language_loss": 0.79213655, - "learning_rate": 1.326970926232066e-06, - "loss": 0.81336939, - "num_input_tokens_seen": 222428110, - "step": 10328, - "time_per_iteration": 2.678966522216797 - }, - { - "auxiliary_loss_clip": 0.01080917, - "auxiliary_loss_mlp": 0.01040936, - "balance_loss_clip": 1.03594792, - "balance_loss_mlp": 1.02738202, - "epoch": 0.6210130768074553, - "flos": 22011912311040.0, - "grad_norm": 1.6747137440925206, - "language_loss": 0.77850568, - "learning_rate": 1.3266041913921396e-06, - "loss": 0.79972422, - "num_input_tokens_seen": 222446385, - "step": 10329, - "time_per_iteration": 2.7247962951660156 - }, - { - "auxiliary_loss_clip": 0.01022383, - "auxiliary_loss_mlp": 0.01002444, - "balance_loss_clip": 1.00971746, - "balance_loss_mlp": 1.00120986, - "epoch": 0.6210732000601232, - "flos": 63676873854720.0, - "grad_norm": 0.8323168859834922, - "language_loss": 0.62231028, - "learning_rate": 1.3262374820874484e-06, - "loss": 0.64255857, - "num_input_tokens_seen": 222502150, - "step": 10330, - "time_per_iteration": 3.1397132873535156 - }, - { - "auxiliary_loss_clip": 0.01109711, - "auxiliary_loss_mlp": 0.01039515, - "balance_loss_clip": 1.04052687, - "balance_loss_mlp": 1.02538919, - "epoch": 0.6211333233127913, - "flos": 24243365848320.0, - "grad_norm": 1.916638297562339, - "language_loss": 0.77865416, - "learning_rate": 1.3258707983319002e-06, - "loss": 0.80014634, - "num_input_tokens_seen": 222519880, - "step": 10331, - "time_per_iteration": 4.165555715560913 - }, - { - "auxiliary_loss_clip": 0.01119225, - "auxiliary_loss_mlp": 0.01036042, - "balance_loss_clip": 1.04211998, - "balance_loss_mlp": 1.0226016, - "epoch": 0.6211934465654592, - "flos": 16943803960320.0, - "grad_norm": 2.274669690788456, - "language_loss": 0.67796123, - "learning_rate": 1.3255041401393992e-06, - "loss": 0.69951391, - "num_input_tokens_seen": 222538545, - "step": 10332, - "time_per_iteration": 4.209641933441162 - }, - { - "auxiliary_loss_clip": 0.01082735, - "auxiliary_loss_mlp": 0.01033197, - "balance_loss_clip": 1.03757524, - "balance_loss_mlp": 1.0202266, - "epoch": 0.6212535698181272, - "flos": 15267386355840.0, - "grad_norm": 1.6414227257739276, - "language_loss": 0.76285797, - "learning_rate": 1.3251375075238476e-06, - "loss": 0.78401732, - "num_input_tokens_seen": 222556935, - "step": 10333, - "time_per_iteration": 4.338353157043457 - }, - { - "auxiliary_loss_clip": 0.01086354, - "auxiliary_loss_mlp": 0.01035943, - "balance_loss_clip": 1.03819084, - "balance_loss_mlp": 1.02344966, - "epoch": 0.6213136930707951, - "flos": 13443950384640.0, - "grad_norm": 2.217560323857708, - "language_loss": 0.69773704, - "learning_rate": 1.3247709004991507e-06, - "loss": 0.71896005, - "num_input_tokens_seen": 222574035, - "step": 10334, - "time_per_iteration": 2.6839816570281982 - }, - { - "auxiliary_loss_clip": 0.01092709, - "auxiliary_loss_mlp": 0.00770618, - "balance_loss_clip": 1.03960049, - "balance_loss_mlp": 1.00011337, - "epoch": 0.6213738163234631, - "flos": 18111223889280.0, - "grad_norm": 1.6672758368774196, - "language_loss": 0.69724143, - "learning_rate": 1.3244043190792078e-06, - "loss": 0.71587467, - "num_input_tokens_seen": 222592290, - "step": 10335, - "time_per_iteration": 2.6737349033355713 - }, - { - "auxiliary_loss_clip": 0.01059124, - "auxiliary_loss_mlp": 0.01035916, - "balance_loss_clip": 1.03123188, - "balance_loss_mlp": 1.02301764, - "epoch": 0.621433939576131, - "flos": 25337348421120.0, - "grad_norm": 1.5976161024349493, - "language_loss": 0.79976332, - "learning_rate": 1.3240377632779213e-06, - "loss": 0.82071376, - "num_input_tokens_seen": 222612805, - "step": 10336, - "time_per_iteration": 2.747412919998169 - }, - { - "auxiliary_loss_clip": 0.01113717, - "auxiliary_loss_mlp": 0.01036201, - "balance_loss_clip": 1.04143834, - "balance_loss_mlp": 1.02375555, - "epoch": 0.621494062828799, - "flos": 22565619440640.0, - "grad_norm": 1.7008650000144097, - "language_loss": 0.73422229, - "learning_rate": 1.3236712331091907e-06, - "loss": 0.75572157, - "num_input_tokens_seen": 222632260, - "step": 10337, - "time_per_iteration": 4.168013334274292 - }, - { - "auxiliary_loss_clip": 0.01118051, - "auxiliary_loss_mlp": 0.01039175, - "balance_loss_clip": 1.04091513, - "balance_loss_mlp": 1.0258832, - "epoch": 0.621554186081467, - "flos": 27417976750080.0, - "grad_norm": 4.811980339506567, - "language_loss": 0.63192534, - "learning_rate": 1.3233047285869145e-06, - "loss": 0.65349758, - "num_input_tokens_seen": 222653570, - "step": 10338, - "time_per_iteration": 2.640453815460205 - }, - { - "auxiliary_loss_clip": 0.01103195, - "auxiliary_loss_mlp": 0.01037115, - "balance_loss_clip": 1.0407145, - "balance_loss_mlp": 1.0245744, - "epoch": 0.621614309334135, - "flos": 22346815743360.0, - "grad_norm": 1.5973259219647309, - "language_loss": 0.71490097, - "learning_rate": 1.322938249724991e-06, - "loss": 0.73630404, - "num_input_tokens_seen": 222672480, - "step": 10339, - "time_per_iteration": 2.6346054077148438 - }, - { - "auxiliary_loss_clip": 0.01062852, - "auxiliary_loss_mlp": 0.01037431, - "balance_loss_clip": 1.03612769, - "balance_loss_mlp": 1.02370453, - "epoch": 0.621674432586803, - "flos": 19281229597440.0, - "grad_norm": 1.7281695006377986, - "language_loss": 0.69872439, - "learning_rate": 1.3225717965373166e-06, - "loss": 0.71972716, - "num_input_tokens_seen": 222691200, - "step": 10340, - "time_per_iteration": 2.7176573276519775 - }, - { - "auxiliary_loss_clip": 0.01067449, - "auxiliary_loss_mlp": 0.01032185, - "balance_loss_clip": 1.03537023, - "balance_loss_mlp": 1.01955473, - "epoch": 0.6217345558394709, - "flos": 21609533180160.0, - "grad_norm": 2.160368660473176, - "language_loss": 0.68745732, - "learning_rate": 1.322205369037788e-06, - "loss": 0.70845366, - "num_input_tokens_seen": 222709975, - "step": 10341, - "time_per_iteration": 2.667415142059326 - }, - { - "auxiliary_loss_clip": 0.01105428, - "auxiliary_loss_mlp": 0.01033663, - "balance_loss_clip": 1.04163766, - "balance_loss_mlp": 1.01951921, - "epoch": 0.6217946790921389, - "flos": 18004102554240.0, - "grad_norm": 1.857842108735868, - "language_loss": 0.8084417, - "learning_rate": 1.321838967240299e-06, - "loss": 0.82983261, - "num_input_tokens_seen": 222729005, - "step": 10342, - "time_per_iteration": 2.6358642578125 - }, - { - "auxiliary_loss_clip": 0.01016012, - "auxiliary_loss_mlp": 0.01001969, - "balance_loss_clip": 1.01067889, - "balance_loss_mlp": 1.00081241, - "epoch": 0.6218548023448068, - "flos": 61973631768960.0, - "grad_norm": 0.7777664565041693, - "language_loss": 0.57339287, - "learning_rate": 1.3214725911587452e-06, - "loss": 0.59357268, - "num_input_tokens_seen": 222786090, - "step": 10343, - "time_per_iteration": 3.105703830718994 - }, - { - "auxiliary_loss_clip": 0.01071779, - "auxiliary_loss_mlp": 0.01031556, - "balance_loss_clip": 1.03384042, - "balance_loss_mlp": 1.01972461, - "epoch": 0.6219149255974749, - "flos": 25739152934400.0, - "grad_norm": 1.873733183159078, - "language_loss": 0.7244643, - "learning_rate": 1.3211062408070184e-06, - "loss": 0.74549764, - "num_input_tokens_seen": 222806100, - "step": 10344, - "time_per_iteration": 2.7128279209136963 - }, - { - "auxiliary_loss_clip": 0.01106863, - "auxiliary_loss_mlp": 0.01045674, - "balance_loss_clip": 1.04245842, - "balance_loss_mlp": 1.03368115, - "epoch": 0.6219750488501428, - "flos": 25411073086080.0, - "grad_norm": 3.095022336982982, - "language_loss": 0.60327411, - "learning_rate": 1.3207399161990105e-06, - "loss": 0.62479943, - "num_input_tokens_seen": 222826575, - "step": 10345, - "time_per_iteration": 2.741757392883301 - }, - { - "auxiliary_loss_clip": 0.01048609, - "auxiliary_loss_mlp": 0.01041234, - "balance_loss_clip": 1.03204262, - "balance_loss_mlp": 1.02753103, - "epoch": 0.6220351721028108, - "flos": 20047383717120.0, - "grad_norm": 1.8310337674001005, - "language_loss": 0.77749038, - "learning_rate": 1.320373617348614e-06, - "loss": 0.79838884, - "num_input_tokens_seen": 222845285, - "step": 10346, - "time_per_iteration": 2.770772695541382 - }, - { - "auxiliary_loss_clip": 0.01080995, - "auxiliary_loss_mlp": 0.01037279, - "balance_loss_clip": 1.03780663, - "balance_loss_mlp": 1.02326, - "epoch": 0.6220952953554787, - "flos": 27488397363840.0, - "grad_norm": 1.684158236808197, - "language_loss": 0.71739966, - "learning_rate": 1.3200073442697171e-06, - "loss": 0.73858243, - "num_input_tokens_seen": 222864575, - "step": 10347, - "time_per_iteration": 2.708918333053589 - }, - { - "auxiliary_loss_clip": 0.01099172, - "auxiliary_loss_mlp": 0.01031988, - "balance_loss_clip": 1.03707337, - "balance_loss_mlp": 1.01956046, - "epoch": 0.6221554186081467, - "flos": 19207612673280.0, - "grad_norm": 1.7479247707562864, - "language_loss": 0.71972638, - "learning_rate": 1.3196410969762108e-06, - "loss": 0.74103796, - "num_input_tokens_seen": 222884420, - "step": 10348, - "time_per_iteration": 2.7594058513641357 - }, - { - "auxiliary_loss_clip": 0.01001862, - "auxiliary_loss_mlp": 0.01006112, - "balance_loss_clip": 1.01154137, - "balance_loss_mlp": 1.00479472, - "epoch": 0.6222155418608146, - "flos": 62950939989120.0, - "grad_norm": 0.816855091094188, - "language_loss": 0.54121429, - "learning_rate": 1.3192748754819815e-06, - "loss": 0.56129414, - "num_input_tokens_seen": 222944690, - "step": 10349, - "time_per_iteration": 3.2531776428222656 - }, - { - "auxiliary_loss_clip": 0.0107704, - "auxiliary_loss_mlp": 0.01030722, - "balance_loss_clip": 1.03621447, - "balance_loss_mlp": 1.01792502, - "epoch": 0.6222756651134826, - "flos": 22601099099520.0, - "grad_norm": 2.4846967665996234, - "language_loss": 0.69486421, - "learning_rate": 1.3189086798009173e-06, - "loss": 0.71594191, - "num_input_tokens_seen": 222962990, - "step": 10350, - "time_per_iteration": 2.7475686073303223 - }, - { - "auxiliary_loss_clip": 0.01116919, - "auxiliary_loss_mlp": 0.01038055, - "balance_loss_clip": 1.04172456, - "balance_loss_mlp": 1.02536559, - "epoch": 0.6223357883661506, - "flos": 21142228216320.0, - "grad_norm": 1.8297714166368801, - "language_loss": 0.5704937, - "learning_rate": 1.3185425099469046e-06, - "loss": 0.59204346, - "num_input_tokens_seen": 222980715, - "step": 10351, - "time_per_iteration": 2.675811290740967 - }, - { - "auxiliary_loss_clip": 0.01024035, - "auxiliary_loss_mlp": 0.01004222, - "balance_loss_clip": 1.01215839, - "balance_loss_mlp": 1.00262439, - "epoch": 0.6223959116188186, - "flos": 63765071700480.0, - "grad_norm": 0.8048031710876978, - "language_loss": 0.61121249, - "learning_rate": 1.3181763659338276e-06, - "loss": 0.63149512, - "num_input_tokens_seen": 223040685, - "step": 10352, - "time_per_iteration": 3.2121970653533936 - }, - { - "auxiliary_loss_clip": 0.01111121, - "auxiliary_loss_mlp": 0.01037194, - "balance_loss_clip": 1.03907847, - "balance_loss_mlp": 1.02456367, - "epoch": 0.6224560348714866, - "flos": 22565727181440.0, - "grad_norm": 2.8594267132643267, - "language_loss": 0.82211882, - "learning_rate": 1.3178102477755714e-06, - "loss": 0.84360194, - "num_input_tokens_seen": 223059000, - "step": 10353, - "time_per_iteration": 2.6481454372406006 - }, - { - "auxiliary_loss_clip": 0.01097506, - "auxiliary_loss_mlp": 0.01033284, - "balance_loss_clip": 1.03879428, - "balance_loss_mlp": 1.02166736, - "epoch": 0.6225161581241545, - "flos": 24097748112000.0, - "grad_norm": 1.6101266746131675, - "language_loss": 0.75329089, - "learning_rate": 1.3174441554860195e-06, - "loss": 0.77459884, - "num_input_tokens_seen": 223079345, - "step": 10354, - "time_per_iteration": 2.672100067138672 - }, - { - "auxiliary_loss_clip": 0.01071329, - "auxiliary_loss_mlp": 0.01033068, - "balance_loss_clip": 1.03829408, - "balance_loss_mlp": 1.02011561, - "epoch": 0.6225762813768225, - "flos": 20443513881600.0, - "grad_norm": 1.4917034506382563, - "language_loss": 0.78818482, - "learning_rate": 1.3170780890790528e-06, - "loss": 0.80922878, - "num_input_tokens_seen": 223097880, - "step": 10355, - "time_per_iteration": 2.6894590854644775 - }, - { - "auxiliary_loss_clip": 0.0110748, - "auxiliary_loss_mlp": 0.01038653, - "balance_loss_clip": 1.04353356, - "balance_loss_mlp": 1.0261302, - "epoch": 0.6226364046294904, - "flos": 27198131558400.0, - "grad_norm": 1.5243384390478247, - "language_loss": 0.7810744, - "learning_rate": 1.3167120485685538e-06, - "loss": 0.80253577, - "num_input_tokens_seen": 223118185, - "step": 10356, - "time_per_iteration": 2.662597417831421 - }, - { - "auxiliary_loss_clip": 0.01095206, - "auxiliary_loss_mlp": 0.00771022, - "balance_loss_clip": 1.03841674, - "balance_loss_mlp": 1.0001657, - "epoch": 0.6226965278821585, - "flos": 20445776438400.0, - "grad_norm": 1.8782312562736863, - "language_loss": 0.6801585, - "learning_rate": 1.3163460339684024e-06, - "loss": 0.69882077, - "num_input_tokens_seen": 223137600, - "step": 10357, - "time_per_iteration": 2.630401611328125 - }, - { - "auxiliary_loss_clip": 0.01095487, - "auxiliary_loss_mlp": 0.01037985, - "balance_loss_clip": 1.03887713, - "balance_loss_mlp": 1.02341211, - "epoch": 0.6227566511348264, - "flos": 22162737519360.0, - "grad_norm": 2.8474094143077453, - "language_loss": 0.76153404, - "learning_rate": 1.3159800452924778e-06, - "loss": 0.78286874, - "num_input_tokens_seen": 223154360, - "step": 10358, - "time_per_iteration": 2.661013126373291 - }, - { - "auxiliary_loss_clip": 0.01092746, - "auxiliary_loss_mlp": 0.01033714, - "balance_loss_clip": 1.03905225, - "balance_loss_mlp": 1.02091646, - "epoch": 0.6228167743874944, - "flos": 18040875102720.0, - "grad_norm": 2.1492109037016287, - "language_loss": 0.82438827, - "learning_rate": 1.3156140825546588e-06, - "loss": 0.84565282, - "num_input_tokens_seen": 223172255, - "step": 10359, - "time_per_iteration": 2.75612211227417 - }, - { - "auxiliary_loss_clip": 0.01084816, - "auxiliary_loss_mlp": 0.0105208, - "balance_loss_clip": 1.03617096, - "balance_loss_mlp": 1.0374589, - "epoch": 0.6228768976401623, - "flos": 17742851959680.0, - "grad_norm": 3.2541550800674046, - "language_loss": 0.73383337, - "learning_rate": 1.315248145768822e-06, - "loss": 0.75520235, - "num_input_tokens_seen": 223186965, - "step": 10360, - "time_per_iteration": 2.761385440826416 - }, - { - "auxiliary_loss_clip": 0.01103199, - "auxiliary_loss_mlp": 0.01038017, - "balance_loss_clip": 1.03837395, - "balance_loss_mlp": 1.025244, - "epoch": 0.6229370208928303, - "flos": 17894934144000.0, - "grad_norm": 1.937323368007563, - "language_loss": 0.77496618, - "learning_rate": 1.3148822349488442e-06, - "loss": 0.79637837, - "num_input_tokens_seen": 223206045, - "step": 10361, - "time_per_iteration": 2.7078726291656494 - }, - { - "auxiliary_loss_clip": 0.0107034, - "auxiliary_loss_mlp": 0.01029878, - "balance_loss_clip": 1.0354948, - "balance_loss_mlp": 1.01774836, - "epoch": 0.6229971441454982, - "flos": 17347763289600.0, - "grad_norm": 2.088996135555703, - "language_loss": 0.6762352, - "learning_rate": 1.3145163501086005e-06, - "loss": 0.69723737, - "num_input_tokens_seen": 223224820, - "step": 10362, - "time_per_iteration": 2.693016529083252 - }, - { - "auxiliary_loss_clip": 0.01095554, - "auxiliary_loss_mlp": 0.01033637, - "balance_loss_clip": 1.03886461, - "balance_loss_mlp": 1.02005267, - "epoch": 0.6230572673981662, - "flos": 29241376807680.0, - "grad_norm": 2.450509773967882, - "language_loss": 0.67575699, - "learning_rate": 1.3141504912619658e-06, - "loss": 0.6970489, - "num_input_tokens_seen": 223243205, - "step": 10363, - "time_per_iteration": 2.7115700244903564 - }, - { - "auxiliary_loss_clip": 0.01068138, - "auxiliary_loss_mlp": 0.01032085, - "balance_loss_clip": 1.03868961, - "balance_loss_mlp": 1.01858449, - "epoch": 0.6231173906508342, - "flos": 16325961096960.0, - "grad_norm": 1.7878512911378444, - "language_loss": 0.86638474, - "learning_rate": 1.3137846584228127e-06, - "loss": 0.88738704, - "num_input_tokens_seen": 223261370, - "step": 10364, - "time_per_iteration": 2.6732850074768066 - }, - { - "auxiliary_loss_clip": 0.01017483, - "auxiliary_loss_mlp": 0.01010257, - "balance_loss_clip": 1.01340818, - "balance_loss_mlp": 1.00900543, - "epoch": 0.6231775139035022, - "flos": 68702032517760.0, - "grad_norm": 0.8935233084209503, - "language_loss": 0.60708529, - "learning_rate": 1.313418851605015e-06, - "loss": 0.62736267, - "num_input_tokens_seen": 223315050, - "step": 10365, - "time_per_iteration": 3.2580301761627197 - }, - { - "auxiliary_loss_clip": 0.01085426, - "auxiliary_loss_mlp": 0.007721, - "balance_loss_clip": 1.04356837, - "balance_loss_mlp": 1.00019813, - "epoch": 0.6232376371561702, - "flos": 19821038163840.0, - "grad_norm": 1.9797808373338666, - "language_loss": 0.75283766, - "learning_rate": 1.3130530708224427e-06, - "loss": 0.77141291, - "num_input_tokens_seen": 223332130, - "step": 10366, - "time_per_iteration": 2.695686101913452 - }, - { - "auxiliary_loss_clip": 0.01107257, - "auxiliary_loss_mlp": 0.01040192, - "balance_loss_clip": 1.04238236, - "balance_loss_mlp": 1.0269959, - "epoch": 0.6232977604088381, - "flos": 23258264376960.0, - "grad_norm": 3.5413788647782978, - "language_loss": 0.76049531, - "learning_rate": 1.3126873160889665e-06, - "loss": 0.78196979, - "num_input_tokens_seen": 223351605, - "step": 10367, - "time_per_iteration": 2.6170830726623535 - }, - { - "auxiliary_loss_clip": 0.01102139, - "auxiliary_loss_mlp": 0.01034539, - "balance_loss_clip": 1.04015589, - "balance_loss_mlp": 1.02192068, - "epoch": 0.6233578836615061, - "flos": 21106425335040.0, - "grad_norm": 1.5257334476599056, - "language_loss": 0.78428042, - "learning_rate": 1.312321587418457e-06, - "loss": 0.80564719, - "num_input_tokens_seen": 223372090, - "step": 10368, - "time_per_iteration": 2.625438928604126 - }, - { - "auxiliary_loss_clip": 0.01052163, - "auxiliary_loss_mlp": 0.01035756, - "balance_loss_clip": 1.03783369, - "balance_loss_mlp": 1.02115321, - "epoch": 0.623418006914174, - "flos": 23769416868480.0, - "grad_norm": 2.0197111691245735, - "language_loss": 0.68460292, - "learning_rate": 1.3119558848247811e-06, - "loss": 0.70548213, - "num_input_tokens_seen": 223390110, - "step": 10369, - "time_per_iteration": 2.808359146118164 - }, - { - "auxiliary_loss_clip": 0.01117993, - "auxiliary_loss_mlp": 0.0103678, - "balance_loss_clip": 1.04215741, - "balance_loss_mlp": 1.02325583, - "epoch": 0.6234781301668421, - "flos": 17890480857600.0, - "grad_norm": 2.044462972771541, - "language_loss": 0.88031048, - "learning_rate": 1.3115902083218072e-06, - "loss": 0.90185821, - "num_input_tokens_seen": 223404205, - "step": 10370, - "time_per_iteration": 4.117987155914307 - }, - { - "auxiliary_loss_clip": 0.0111332, - "auxiliary_loss_mlp": 0.01029208, - "balance_loss_clip": 1.039994, - "balance_loss_mlp": 1.01634502, - "epoch": 0.62353825341951, - "flos": 26175503352960.0, - "grad_norm": 1.608857921427384, - "language_loss": 0.66079128, - "learning_rate": 1.311224557923402e-06, - "loss": 0.68221653, - "num_input_tokens_seen": 223424855, - "step": 10371, - "time_per_iteration": 4.359363079071045 - }, - { - "auxiliary_loss_clip": 0.01098316, - "auxiliary_loss_mlp": 0.01029741, - "balance_loss_clip": 1.03844571, - "balance_loss_mlp": 1.01849937, - "epoch": 0.623598376672178, - "flos": 31139902160640.0, - "grad_norm": 1.3363320294252738, - "language_loss": 0.77749312, - "learning_rate": 1.3108589336434298e-06, - "loss": 0.79877365, - "num_input_tokens_seen": 223447225, - "step": 10372, - "time_per_iteration": 4.2803263664245605 - }, - { - "auxiliary_loss_clip": 0.01105747, - "auxiliary_loss_mlp": 0.01033888, - "balance_loss_clip": 1.0399971, - "balance_loss_mlp": 1.02063167, - "epoch": 0.6236584999248459, - "flos": 23730202195200.0, - "grad_norm": 1.86692873433382, - "language_loss": 0.77388912, - "learning_rate": 1.3104933354957568e-06, - "loss": 0.79528546, - "num_input_tokens_seen": 223467520, - "step": 10373, - "time_per_iteration": 2.6164214611053467 - }, - { - "auxiliary_loss_clip": 0.01099988, - "auxiliary_loss_mlp": 0.01029626, - "balance_loss_clip": 1.03910232, - "balance_loss_mlp": 1.0177052, - "epoch": 0.6237186231775139, - "flos": 21762764599680.0, - "grad_norm": 1.5661441130214229, - "language_loss": 0.69628543, - "learning_rate": 1.3101277634942448e-06, - "loss": 0.71758157, - "num_input_tokens_seen": 223488130, - "step": 10374, - "time_per_iteration": 2.620152711868286 - }, - { - "auxiliary_loss_clip": 0.0109877, - "auxiliary_loss_mlp": 0.01027687, - "balance_loss_clip": 1.04083633, - "balance_loss_mlp": 1.01481199, - "epoch": 0.6237787464301818, - "flos": 14939486075520.0, - "grad_norm": 1.8629467261116164, - "language_loss": 0.77406085, - "learning_rate": 1.3097622176527577e-06, - "loss": 0.7953254, - "num_input_tokens_seen": 223505105, - "step": 10375, - "time_per_iteration": 2.662888526916504 - }, - { - "auxiliary_loss_clip": 0.0108805, - "auxiliary_loss_mlp": 0.01027669, - "balance_loss_clip": 1.03999758, - "balance_loss_mlp": 1.01531863, - "epoch": 0.6238388696828499, - "flos": 35590311302400.0, - "grad_norm": 1.5320519249858895, - "language_loss": 0.70062512, - "learning_rate": 1.3093966979851566e-06, - "loss": 0.72178227, - "num_input_tokens_seen": 223528065, - "step": 10376, - "time_per_iteration": 2.7455239295959473 - }, - { - "auxiliary_loss_clip": 0.01087005, - "auxiliary_loss_mlp": 0.01030618, - "balance_loss_clip": 1.04036319, - "balance_loss_mlp": 1.01622987, - "epoch": 0.6238989929355178, - "flos": 23623511823360.0, - "grad_norm": 1.5317768555363875, - "language_loss": 0.76383424, - "learning_rate": 1.309031204505301e-06, - "loss": 0.78501046, - "num_input_tokens_seen": 223547305, - "step": 10377, - "time_per_iteration": 4.217595338821411 - }, - { - "auxiliary_loss_clip": 0.01095365, - "auxiliary_loss_mlp": 0.01032947, - "balance_loss_clip": 1.04230881, - "balance_loss_mlp": 1.02149701, - "epoch": 0.6239591161881858, - "flos": 22087468569600.0, - "grad_norm": 1.9922863755635154, - "language_loss": 0.68561447, - "learning_rate": 1.308665737227052e-06, - "loss": 0.70689762, - "num_input_tokens_seen": 223567205, - "step": 10378, - "time_per_iteration": 2.668548822402954 - }, - { - "auxiliary_loss_clip": 0.01089219, - "auxiliary_loss_mlp": 0.01032012, - "balance_loss_clip": 1.03835332, - "balance_loss_mlp": 1.01904845, - "epoch": 0.6240192394408538, - "flos": 24535930124160.0, - "grad_norm": 1.8244104489721222, - "language_loss": 0.76516432, - "learning_rate": 1.3083002961642675e-06, - "loss": 0.78637671, - "num_input_tokens_seen": 223586560, - "step": 10379, - "time_per_iteration": 2.636387825012207 - }, - { - "auxiliary_loss_clip": 0.01091775, - "auxiliary_loss_mlp": 0.01029494, - "balance_loss_clip": 1.03986144, - "balance_loss_mlp": 1.01667941, - "epoch": 0.6240793626935217, - "flos": 27931930502400.0, - "grad_norm": 1.3063592721987374, - "language_loss": 0.79515195, - "learning_rate": 1.3079348813308051e-06, - "loss": 0.81636459, - "num_input_tokens_seen": 223610595, - "step": 10380, - "time_per_iteration": 2.7264626026153564 - }, - { - "auxiliary_loss_clip": 0.01098611, - "auxiliary_loss_mlp": 0.01032438, - "balance_loss_clip": 1.04053771, - "balance_loss_mlp": 1.02064252, - "epoch": 0.6241394859461897, - "flos": 22892514140160.0, - "grad_norm": 1.5486607590861352, - "language_loss": 0.80008709, - "learning_rate": 1.3075694927405207e-06, - "loss": 0.82139754, - "num_input_tokens_seen": 223630230, - "step": 10381, - "time_per_iteration": 2.6646101474761963 - }, - { - "auxiliary_loss_clip": 0.01089557, - "auxiliary_loss_mlp": 0.01035129, - "balance_loss_clip": 1.03794694, - "balance_loss_mlp": 1.0213902, - "epoch": 0.6241996091988576, - "flos": 12750766744320.0, - "grad_norm": 2.2250038803258256, - "language_loss": 0.74777293, - "learning_rate": 1.3072041304072718e-06, - "loss": 0.76901984, - "num_input_tokens_seen": 223648360, - "step": 10382, - "time_per_iteration": 2.7230777740478516 - }, - { - "auxiliary_loss_clip": 0.01101818, - "auxiliary_loss_mlp": 0.01025191, - "balance_loss_clip": 1.03977156, - "balance_loss_mlp": 1.01332331, - "epoch": 0.6242597324515257, - "flos": 25851302173440.0, - "grad_norm": 1.6487787752646939, - "language_loss": 0.78440118, - "learning_rate": 1.306838794344911e-06, - "loss": 0.80567122, - "num_input_tokens_seen": 223671255, - "step": 10383, - "time_per_iteration": 2.7347943782806396 - }, - { - "auxiliary_loss_clip": 0.01078794, - "auxiliary_loss_mlp": 0.01030474, - "balance_loss_clip": 1.03457248, - "balance_loss_mlp": 1.01803493, - "epoch": 0.6243198557041936, - "flos": 19937712516480.0, - "grad_norm": 1.7448881929287328, - "language_loss": 0.74959773, - "learning_rate": 1.3064734845672925e-06, - "loss": 0.77069044, - "num_input_tokens_seen": 223689860, - "step": 10384, - "time_per_iteration": 2.715670347213745 - }, - { - "auxiliary_loss_clip": 0.01090865, - "auxiliary_loss_mlp": 0.01039296, - "balance_loss_clip": 1.03685331, - "balance_loss_mlp": 1.02441823, - "epoch": 0.6243799789568616, - "flos": 18406194376320.0, - "grad_norm": 1.703443113253697, - "language_loss": 0.66354865, - "learning_rate": 1.3061082010882694e-06, - "loss": 0.68485022, - "num_input_tokens_seen": 223707835, - "step": 10385, - "time_per_iteration": 2.6395132541656494 - }, - { - "auxiliary_loss_clip": 0.01017413, - "auxiliary_loss_mlp": 0.00999729, - "balance_loss_clip": 1.01207745, - "balance_loss_mlp": 0.998501, - "epoch": 0.6244401022095295, - "flos": 66027587523840.0, - "grad_norm": 0.7616108367019777, - "language_loss": 0.6200667, - "learning_rate": 1.305742943921692e-06, - "loss": 0.64023811, - "num_input_tokens_seen": 223771875, - "step": 10386, - "time_per_iteration": 3.2555150985717773 - }, - { - "auxiliary_loss_clip": 0.01103744, - "auxiliary_loss_mlp": 0.01032194, - "balance_loss_clip": 1.03913903, - "balance_loss_mlp": 1.01928985, - "epoch": 0.6245002254621975, - "flos": 24571266128640.0, - "grad_norm": 2.488369520959267, - "language_loss": 0.7205711, - "learning_rate": 1.3053777130814128e-06, - "loss": 0.74193048, - "num_input_tokens_seen": 223788895, - "step": 10387, - "time_per_iteration": 2.6242222785949707 - }, - { - "auxiliary_loss_clip": 0.01111553, - "auxiliary_loss_mlp": 0.01040034, - "balance_loss_clip": 1.04189062, - "balance_loss_mlp": 1.0254066, - "epoch": 0.6245603487148654, - "flos": 29168837291520.0, - "grad_norm": 2.3305483255195787, - "language_loss": 0.65657806, - "learning_rate": 1.3050125085812798e-06, - "loss": 0.67809391, - "num_input_tokens_seen": 223810385, - "step": 10388, - "time_per_iteration": 2.659313440322876 - }, - { - "auxiliary_loss_clip": 0.0107602, - "auxiliary_loss_mlp": 0.01029774, - "balance_loss_clip": 1.03905761, - "balance_loss_mlp": 1.01803207, - "epoch": 0.6246204719675335, - "flos": 14790097411200.0, - "grad_norm": 1.9152677822128796, - "language_loss": 0.79151481, - "learning_rate": 1.3046473304351417e-06, - "loss": 0.81257272, - "num_input_tokens_seen": 223826040, - "step": 10389, - "time_per_iteration": 2.6531307697296143 - }, - { - "auxiliary_loss_clip": 0.0108775, - "auxiliary_loss_mlp": 0.0103435, - "balance_loss_clip": 1.036906, - "balance_loss_mlp": 1.02176762, - "epoch": 0.6246805952202014, - "flos": 12493538472960.0, - "grad_norm": 4.707823306169989, - "language_loss": 0.60542148, - "learning_rate": 1.3042821786568475e-06, - "loss": 0.62664247, - "num_input_tokens_seen": 223842300, - "step": 10390, - "time_per_iteration": 2.6380884647369385 - }, - { - "auxiliary_loss_clip": 0.01095689, - "auxiliary_loss_mlp": 0.01032943, - "balance_loss_clip": 1.03998685, - "balance_loss_mlp": 1.02047336, - "epoch": 0.6247407184728694, - "flos": 12786677366400.0, - "grad_norm": 1.9478919515008288, - "language_loss": 0.76811498, - "learning_rate": 1.3039170532602416e-06, - "loss": 0.78940129, - "num_input_tokens_seen": 223858320, - "step": 10391, - "time_per_iteration": 2.6485612392425537 - }, - { - "auxiliary_loss_clip": 0.01095815, - "auxiliary_loss_mlp": 0.01034177, - "balance_loss_clip": 1.0409584, - "balance_loss_mlp": 1.02074265, - "epoch": 0.6248008417255374, - "flos": 40629188960640.0, - "grad_norm": 1.4703588614112992, - "language_loss": 0.64372337, - "learning_rate": 1.3035519542591718e-06, - "loss": 0.66502333, - "num_input_tokens_seen": 223883545, - "step": 10392, - "time_per_iteration": 2.8461811542510986 - }, - { - "auxiliary_loss_clip": 0.01096988, - "auxiliary_loss_mlp": 0.01034326, - "balance_loss_clip": 1.04133046, - "balance_loss_mlp": 1.02083135, - "epoch": 0.6248609649782053, - "flos": 19902017376000.0, - "grad_norm": 1.871769291735266, - "language_loss": 0.76746744, - "learning_rate": 1.3031868816674819e-06, - "loss": 0.78878057, - "num_input_tokens_seen": 223901445, - "step": 10393, - "time_per_iteration": 2.637190818786621 - }, - { - "auxiliary_loss_clip": 0.01078713, - "auxiliary_loss_mlp": 0.00772119, - "balance_loss_clip": 1.03866291, - "balance_loss_mlp": 1.00009847, - "epoch": 0.6249210882308733, - "flos": 19682746801920.0, - "grad_norm": 1.7077234803990555, - "language_loss": 0.82370424, - "learning_rate": 1.3028218354990142e-06, - "loss": 0.84221256, - "num_input_tokens_seen": 223920170, - "step": 10394, - "time_per_iteration": 2.6997132301330566 - }, - { - "auxiliary_loss_clip": 0.01095186, - "auxiliary_loss_mlp": 0.01037496, - "balance_loss_clip": 1.03878772, - "balance_loss_mlp": 1.02421618, - "epoch": 0.6249812114835412, - "flos": 13990726189440.0, - "grad_norm": 1.9873009659143388, - "language_loss": 0.75021064, - "learning_rate": 1.3024568157676128e-06, - "loss": 0.77153742, - "num_input_tokens_seen": 223936495, - "step": 10395, - "time_per_iteration": 2.6623713970184326 - }, - { - "auxiliary_loss_clip": 0.01095635, - "auxiliary_loss_mlp": 0.01033862, - "balance_loss_clip": 1.03662229, - "balance_loss_mlp": 1.0203023, - "epoch": 0.6250413347362093, - "flos": 14530031965440.0, - "grad_norm": 3.229511376831138, - "language_loss": 0.72134733, - "learning_rate": 1.302091822487119e-06, - "loss": 0.74264228, - "num_input_tokens_seen": 223950070, - "step": 10396, - "time_per_iteration": 2.677992820739746 - }, - { - "auxiliary_loss_clip": 0.01075755, - "auxiliary_loss_mlp": 0.0103795, - "balance_loss_clip": 1.04014516, - "balance_loss_mlp": 1.0248127, - "epoch": 0.6251014579888772, - "flos": 22963006581120.0, - "grad_norm": 1.7904379273274065, - "language_loss": 0.75906593, - "learning_rate": 1.3017268556713732e-06, - "loss": 0.78020298, - "num_input_tokens_seen": 223970065, - "step": 10397, - "time_per_iteration": 2.722014904022217 - }, - { - "auxiliary_loss_clip": 0.01092491, - "auxiliary_loss_mlp": 0.0103722, - "balance_loss_clip": 1.04022741, - "balance_loss_mlp": 1.02372003, - "epoch": 0.6251615812415452, - "flos": 28111232217600.0, - "grad_norm": 4.888327827010162, - "language_loss": 0.74880314, - "learning_rate": 1.3013619153342154e-06, - "loss": 0.77010036, - "num_input_tokens_seen": 223990315, - "step": 10398, - "time_per_iteration": 2.7456398010253906 - }, - { - "auxiliary_loss_clip": 0.01117793, - "auxiliary_loss_mlp": 0.01031154, - "balance_loss_clip": 1.03983879, - "balance_loss_mlp": 1.01699233, - "epoch": 0.6252217044942131, - "flos": 26724469887360.0, - "grad_norm": 1.9767586095703997, - "language_loss": 0.73813987, - "learning_rate": 1.300997001489483e-06, - "loss": 0.75962937, - "num_input_tokens_seen": 224009960, - "step": 10399, - "time_per_iteration": 2.6542532444000244 - }, - { - "auxiliary_loss_clip": 0.01077509, - "auxiliary_loss_mlp": 0.01036637, - "balance_loss_clip": 1.03692555, - "balance_loss_mlp": 1.02285028, - "epoch": 0.6252818277468811, - "flos": 20006768413440.0, - "grad_norm": 1.7877165034058586, - "language_loss": 0.74266648, - "learning_rate": 1.3006321141510147e-06, - "loss": 0.76380795, - "num_input_tokens_seen": 224028870, - "step": 10400, - "time_per_iteration": 2.6837148666381836 - }, - { - "auxiliary_loss_clip": 0.0101245, - "auxiliary_loss_mlp": 0.01001226, - "balance_loss_clip": 1.01475704, - "balance_loss_mlp": 0.99997389, - "epoch": 0.625341950999549, - "flos": 59278285059840.0, - "grad_norm": 0.8429284892848663, - "language_loss": 0.56419927, - "learning_rate": 1.3002672533326465e-06, - "loss": 0.58433604, - "num_input_tokens_seen": 224094140, - "step": 10401, - "time_per_iteration": 3.3155579566955566 - }, - { - "auxiliary_loss_clip": 0.01107517, - "auxiliary_loss_mlp": 0.01034196, - "balance_loss_clip": 1.04071486, - "balance_loss_mlp": 1.02067709, - "epoch": 0.625402074252217, - "flos": 20157090831360.0, - "grad_norm": 2.04205601235836, - "language_loss": 0.83276439, - "learning_rate": 1.2999024190482146e-06, - "loss": 0.85418153, - "num_input_tokens_seen": 224113235, - "step": 10402, - "time_per_iteration": 2.691084146499634 - }, - { - "auxiliary_loss_clip": 0.01036621, - "auxiliary_loss_mlp": 0.01034014, - "balance_loss_clip": 1.036587, - "balance_loss_mlp": 1.02084088, - "epoch": 0.625462197504885, - "flos": 29132531619840.0, - "grad_norm": 2.64185876470146, - "language_loss": 0.69291663, - "learning_rate": 1.2995376113115527e-06, - "loss": 0.71362293, - "num_input_tokens_seen": 224134530, - "step": 10403, - "time_per_iteration": 2.9650638103485107 - }, - { - "auxiliary_loss_clip": 0.01081288, - "auxiliary_loss_mlp": 0.01031215, - "balance_loss_clip": 1.03741455, - "balance_loss_mlp": 1.01692796, - "epoch": 0.625522320757553, - "flos": 26104436294400.0, - "grad_norm": 1.773424214610222, - "language_loss": 0.71938539, - "learning_rate": 1.2991728301364954e-06, - "loss": 0.74051046, - "num_input_tokens_seen": 224154170, - "step": 10404, - "time_per_iteration": 3.032392978668213 - }, - { - "auxiliary_loss_clip": 0.01071553, - "auxiliary_loss_mlp": 0.01037364, - "balance_loss_clip": 1.03673673, - "balance_loss_mlp": 1.02419138, - "epoch": 0.625582444010221, - "flos": 20630967984000.0, - "grad_norm": 1.988268046568807, - "language_loss": 0.69859874, - "learning_rate": 1.2988080755368742e-06, - "loss": 0.71968794, - "num_input_tokens_seen": 224172730, - "step": 10405, - "time_per_iteration": 2.752593994140625 - }, - { - "auxiliary_loss_clip": 0.01088298, - "auxiliary_loss_mlp": 0.01038031, - "balance_loss_clip": 1.03901088, - "balance_loss_mlp": 1.02447712, - "epoch": 0.6256425672628889, - "flos": 20521512264960.0, - "grad_norm": 1.8903848634840759, - "language_loss": 0.7935456, - "learning_rate": 1.2984433475265207e-06, - "loss": 0.81480896, - "num_input_tokens_seen": 224192620, - "step": 10406, - "time_per_iteration": 2.6944150924682617 - }, - { - "auxiliary_loss_clip": 0.01078593, - "auxiliary_loss_mlp": 0.01035645, - "balance_loss_clip": 1.0391295, - "balance_loss_mlp": 1.02321792, - "epoch": 0.6257026905155569, - "flos": 29529200488320.0, - "grad_norm": 1.7747095604461551, - "language_loss": 0.68853474, - "learning_rate": 1.2980786461192666e-06, - "loss": 0.70967722, - "num_input_tokens_seen": 224214660, - "step": 10407, - "time_per_iteration": 2.7394134998321533 - }, - { - "auxiliary_loss_clip": 0.01101618, - "auxiliary_loss_mlp": 0.00769457, - "balance_loss_clip": 1.03912544, - "balance_loss_mlp": 1.00006318, - "epoch": 0.6257628137682248, - "flos": 24024885373440.0, - "grad_norm": 1.6542698687790116, - "language_loss": 0.8580991, - "learning_rate": 1.2977139713289398e-06, - "loss": 0.87680984, - "num_input_tokens_seen": 224234170, - "step": 10408, - "time_per_iteration": 2.647240400314331 - }, - { - "auxiliary_loss_clip": 0.01090915, - "auxiliary_loss_mlp": 0.00769522, - "balance_loss_clip": 1.03742266, - "balance_loss_mlp": 1.00007892, - "epoch": 0.6258229370208929, - "flos": 20850956830080.0, - "grad_norm": 1.8769352919555562, - "language_loss": 0.79664773, - "learning_rate": 1.2973493231693699e-06, - "loss": 0.81525207, - "num_input_tokens_seen": 224253115, - "step": 10409, - "time_per_iteration": 5.298889636993408 - }, - { - "auxiliary_loss_clip": 0.01091226, - "auxiliary_loss_mlp": 0.01033686, - "balance_loss_clip": 1.03762126, - "balance_loss_mlp": 1.02168143, - "epoch": 0.6258830602735608, - "flos": 22231542021120.0, - "grad_norm": 2.146507314015339, - "language_loss": 0.69629455, - "learning_rate": 1.2969847016543845e-06, - "loss": 0.71754372, - "num_input_tokens_seen": 224271375, - "step": 10410, - "time_per_iteration": 2.7642364501953125 - }, - { - "auxiliary_loss_clip": 0.01066453, - "auxiliary_loss_mlp": 0.01032082, - "balance_loss_clip": 1.03571606, - "balance_loss_mlp": 1.01986265, - "epoch": 0.6259431835262288, - "flos": 25076887925760.0, - "grad_norm": 2.4453810502825153, - "language_loss": 0.67605823, - "learning_rate": 1.2966201067978086e-06, - "loss": 0.6970436, - "num_input_tokens_seen": 224290315, - "step": 10411, - "time_per_iteration": 4.3257997035980225 - }, - { - "auxiliary_loss_clip": 0.0106799, - "auxiliary_loss_mlp": 0.01040597, - "balance_loss_clip": 1.03715658, - "balance_loss_mlp": 1.02818179, - "epoch": 0.6260033067788967, - "flos": 28252288926720.0, - "grad_norm": 1.954494979108325, - "language_loss": 0.69357151, - "learning_rate": 1.2962555386134702e-06, - "loss": 0.71465743, - "num_input_tokens_seen": 224310545, - "step": 10412, - "time_per_iteration": 4.512540578842163 - }, - { - "auxiliary_loss_clip": 0.01080692, - "auxiliary_loss_mlp": 0.0104025, - "balance_loss_clip": 1.03551555, - "balance_loss_mlp": 1.02700531, - "epoch": 0.6260634300315647, - "flos": 23367432787200.0, - "grad_norm": 1.4726479761814617, - "language_loss": 0.6975283, - "learning_rate": 1.2958909971151908e-06, - "loss": 0.71873772, - "num_input_tokens_seen": 224331115, - "step": 10413, - "time_per_iteration": 2.715327262878418 - }, - { - "auxiliary_loss_clip": 0.01083008, - "auxiliary_loss_mlp": 0.01034077, - "balance_loss_clip": 1.03659189, - "balance_loss_mlp": 1.01976025, - "epoch": 0.6261235532842326, - "flos": 18035308494720.0, - "grad_norm": 2.5748151630879277, - "language_loss": 0.80629605, - "learning_rate": 1.295526482316796e-06, - "loss": 0.82746685, - "num_input_tokens_seen": 224347525, - "step": 10414, - "time_per_iteration": 2.7809388637542725 - }, - { - "auxiliary_loss_clip": 0.0110639, - "auxiliary_loss_mlp": 0.01037432, - "balance_loss_clip": 1.04208875, - "balance_loss_mlp": 1.0249393, - "epoch": 0.6261836765369007, - "flos": 22011265866240.0, - "grad_norm": 1.7429212772998885, - "language_loss": 0.74786866, - "learning_rate": 1.2951619942321083e-06, - "loss": 0.7693069, - "num_input_tokens_seen": 224367045, - "step": 10415, - "time_per_iteration": 2.790271282196045 - }, - { - "auxiliary_loss_clip": 0.01062067, - "auxiliary_loss_mlp": 0.01034612, - "balance_loss_clip": 1.03746879, - "balance_loss_mlp": 1.0215826, - "epoch": 0.6262437997895686, - "flos": 24936010784640.0, - "grad_norm": 1.5794864494822807, - "language_loss": 0.74193609, - "learning_rate": 1.2947975328749472e-06, - "loss": 0.76290286, - "num_input_tokens_seen": 224388860, - "step": 10416, - "time_per_iteration": 2.7647581100463867 - }, - { - "auxiliary_loss_clip": 0.01086432, - "auxiliary_loss_mlp": 0.01033307, - "balance_loss_clip": 1.04012477, - "balance_loss_mlp": 1.02101088, - "epoch": 0.6263039230422366, - "flos": 31608428186880.0, - "grad_norm": 1.6472166500534797, - "language_loss": 0.84573495, - "learning_rate": 1.2944330982591352e-06, - "loss": 0.86693239, - "num_input_tokens_seen": 224409645, - "step": 10417, - "time_per_iteration": 4.274592638015747 - }, - { - "auxiliary_loss_clip": 0.01105981, - "auxiliary_loss_mlp": 0.0103493, - "balance_loss_clip": 1.04019403, - "balance_loss_mlp": 1.02186441, - "epoch": 0.6263640462949046, - "flos": 17639465639040.0, - "grad_norm": 2.0790985994239066, - "language_loss": 0.56728101, - "learning_rate": 1.2940686903984904e-06, - "loss": 0.58869016, - "num_input_tokens_seen": 224428530, - "step": 10418, - "time_per_iteration": 2.691500186920166 - }, - { - "auxiliary_loss_clip": 0.01110622, - "auxiliary_loss_mlp": 0.0104313, - "balance_loss_clip": 1.04013753, - "balance_loss_mlp": 1.0293498, - "epoch": 0.6264241695475725, - "flos": 19974951941760.0, - "grad_norm": 1.8736530467564598, - "language_loss": 0.8455261, - "learning_rate": 1.2937043093068316e-06, - "loss": 0.86706358, - "num_input_tokens_seen": 224447175, - "step": 10419, - "time_per_iteration": 2.739027261734009 - }, - { - "auxiliary_loss_clip": 0.01119559, - "auxiliary_loss_mlp": 0.01031671, - "balance_loss_clip": 1.04406238, - "balance_loss_mlp": 1.01907599, - "epoch": 0.6264842928002405, - "flos": 27344323912320.0, - "grad_norm": 1.509247263381085, - "language_loss": 0.6426456, - "learning_rate": 1.2933399549979762e-06, - "loss": 0.66415787, - "num_input_tokens_seen": 224469445, - "step": 10420, - "time_per_iteration": 2.7180798053741455 - }, - { - "auxiliary_loss_clip": 0.01076087, - "auxiliary_loss_mlp": 0.01035851, - "balance_loss_clip": 1.03824723, - "balance_loss_mlp": 1.02204061, - "epoch": 0.6265444160529084, - "flos": 22997265177600.0, - "grad_norm": 2.1707304020443527, - "language_loss": 0.86138391, - "learning_rate": 1.292975627485741e-06, - "loss": 0.88250327, - "num_input_tokens_seen": 224486590, - "step": 10421, - "time_per_iteration": 2.7487831115722656 - }, - { - "auxiliary_loss_clip": 0.01078665, - "auxiliary_loss_mlp": 0.01036628, - "balance_loss_clip": 1.03799725, - "balance_loss_mlp": 1.02374697, - "epoch": 0.6266045393055765, - "flos": 19938323047680.0, - "grad_norm": 2.422674057917065, - "language_loss": 0.79407763, - "learning_rate": 1.2926113267839403e-06, - "loss": 0.81523055, - "num_input_tokens_seen": 224502795, - "step": 10422, - "time_per_iteration": 2.8828704357147217 - }, - { - "auxiliary_loss_clip": 0.01104293, - "auxiliary_loss_mlp": 0.01027022, - "balance_loss_clip": 1.04006767, - "balance_loss_mlp": 1.01370621, - "epoch": 0.6266646625582444, - "flos": 24389091325440.0, - "grad_norm": 2.2930026415354368, - "language_loss": 0.74455339, - "learning_rate": 1.292247052906389e-06, - "loss": 0.76586652, - "num_input_tokens_seen": 224522300, - "step": 10423, - "time_per_iteration": 2.7208752632141113 - }, - { - "auxiliary_loss_clip": 0.01114032, - "auxiliary_loss_mlp": 0.01028546, - "balance_loss_clip": 1.04019392, - "balance_loss_mlp": 1.01625562, - "epoch": 0.6267247858109124, - "flos": 14683802088960.0, - "grad_norm": 1.9557551713522223, - "language_loss": 0.7775594, - "learning_rate": 1.2918828058669004e-06, - "loss": 0.79898518, - "num_input_tokens_seen": 224538260, - "step": 10424, - "time_per_iteration": 2.592926263809204 - }, - { - "auxiliary_loss_clip": 0.01113819, - "auxiliary_loss_mlp": 0.01032907, - "balance_loss_clip": 1.04032254, - "balance_loss_mlp": 1.01879287, - "epoch": 0.6267849090635803, - "flos": 24929977299840.0, - "grad_norm": 2.1677847187028605, - "language_loss": 0.6903978, - "learning_rate": 1.2915185856792868e-06, - "loss": 0.71186507, - "num_input_tokens_seen": 224559155, - "step": 10425, - "time_per_iteration": 2.668877363204956 - }, - { - "auxiliary_loss_clip": 0.01089804, - "auxiliary_loss_mlp": 0.01029639, - "balance_loss_clip": 1.03939557, - "balance_loss_mlp": 1.01808131, - "epoch": 0.6268450323162483, - "flos": 25337851211520.0, - "grad_norm": 1.4857408938723873, - "language_loss": 0.74492955, - "learning_rate": 1.2911543923573598e-06, - "loss": 0.76612389, - "num_input_tokens_seen": 224578660, - "step": 10426, - "time_per_iteration": 2.720566987991333 - }, - { - "auxiliary_loss_clip": 0.01106657, - "auxiliary_loss_mlp": 0.00770492, - "balance_loss_clip": 1.04118848, - "balance_loss_mlp": 1.00016105, - "epoch": 0.6269051555689162, - "flos": 26177299032960.0, - "grad_norm": 2.445291482107416, - "language_loss": 0.80835652, - "learning_rate": 1.290790225914929e-06, - "loss": 0.82712793, - "num_input_tokens_seen": 224599080, - "step": 10427, - "time_per_iteration": 2.6930294036865234 - }, - { - "auxiliary_loss_clip": 0.01083192, - "auxiliary_loss_mlp": 0.01039458, - "balance_loss_clip": 1.03919089, - "balance_loss_mlp": 1.02608228, - "epoch": 0.6269652788215843, - "flos": 18256877539200.0, - "grad_norm": 2.002033794251086, - "language_loss": 0.68361104, - "learning_rate": 1.2904260863658034e-06, - "loss": 0.70483756, - "num_input_tokens_seen": 224614225, - "step": 10428, - "time_per_iteration": 2.750072717666626 - }, - { - "auxiliary_loss_clip": 0.01070825, - "auxiliary_loss_mlp": 0.01048713, - "balance_loss_clip": 1.03721058, - "balance_loss_mlp": 1.03428292, - "epoch": 0.6270254020742522, - "flos": 11765413877760.0, - "grad_norm": 1.948024958379765, - "language_loss": 0.71860063, - "learning_rate": 1.2900619737237928e-06, - "loss": 0.73979598, - "num_input_tokens_seen": 224632365, - "step": 10429, - "time_per_iteration": 2.746628761291504 - }, - { - "auxiliary_loss_clip": 0.01109377, - "auxiliary_loss_mlp": 0.01032535, - "balance_loss_clip": 1.04220653, - "balance_loss_mlp": 1.01867652, - "epoch": 0.6270855253269202, - "flos": 23475631530240.0, - "grad_norm": 1.6097875593140534, - "language_loss": 0.79522586, - "learning_rate": 1.2896978880027023e-06, - "loss": 0.81664503, - "num_input_tokens_seen": 224651125, - "step": 10430, - "time_per_iteration": 2.7708442211151123 - }, - { - "auxiliary_loss_clip": 0.01033801, - "auxiliary_loss_mlp": 0.01002127, - "balance_loss_clip": 1.01011229, - "balance_loss_mlp": 1.00103593, - "epoch": 0.6271456485795882, - "flos": 70064520232320.0, - "grad_norm": 1.3411395578732954, - "language_loss": 0.59105575, - "learning_rate": 1.2893338292163393e-06, - "loss": 0.61141503, - "num_input_tokens_seen": 224716115, - "step": 10431, - "time_per_iteration": 3.284141778945923 - }, - { - "auxiliary_loss_clip": 0.01016087, - "auxiliary_loss_mlp": 0.01003696, - "balance_loss_clip": 1.01267934, - "balance_loss_mlp": 1.00251579, - "epoch": 0.6272057718322561, - "flos": 65156718280320.0, - "grad_norm": 0.8756941222650257, - "language_loss": 0.63814843, - "learning_rate": 1.2889697973785095e-06, - "loss": 0.65834618, - "num_input_tokens_seen": 224782930, - "step": 10432, - "time_per_iteration": 3.315559148788452 - }, - { - "auxiliary_loss_clip": 0.0109102, - "auxiliary_loss_mlp": 0.01033306, - "balance_loss_clip": 1.03992772, - "balance_loss_mlp": 1.02161813, - "epoch": 0.6272658950849241, - "flos": 24389342720640.0, - "grad_norm": 1.881228339897183, - "language_loss": 0.64901084, - "learning_rate": 1.2886057925030153e-06, - "loss": 0.67025411, - "num_input_tokens_seen": 224802010, - "step": 10433, - "time_per_iteration": 2.7182137966156006 - }, - { - "auxiliary_loss_clip": 0.01108511, - "auxiliary_loss_mlp": 0.01033022, - "balance_loss_clip": 1.04193711, - "balance_loss_mlp": 1.01966476, - "epoch": 0.627326018337592, - "flos": 17966001202560.0, - "grad_norm": 2.029162826422426, - "language_loss": 0.61656857, - "learning_rate": 1.2882418146036612e-06, - "loss": 0.63798386, - "num_input_tokens_seen": 224818875, - "step": 10434, - "time_per_iteration": 2.698272228240967 - }, - { - "auxiliary_loss_clip": 0.0107895, - "auxiliary_loss_mlp": 0.01026455, - "balance_loss_clip": 1.03706336, - "balance_loss_mlp": 1.01392627, - "epoch": 0.6273861415902601, - "flos": 20230097224320.0, - "grad_norm": 1.7060876035395582, - "language_loss": 0.84624016, - "learning_rate": 1.2878778636942484e-06, - "loss": 0.86729419, - "num_input_tokens_seen": 224837790, - "step": 10435, - "time_per_iteration": 2.7053635120391846 - }, - { - "auxiliary_loss_clip": 0.01033575, - "auxiliary_loss_mlp": 0.01005985, - "balance_loss_clip": 1.00981998, - "balance_loss_mlp": 1.00484645, - "epoch": 0.627446264842928, - "flos": 64953210798720.0, - "grad_norm": 0.7308695189229724, - "language_loss": 0.61571616, - "learning_rate": 1.2875139397885786e-06, - "loss": 0.63611174, - "num_input_tokens_seen": 224899685, - "step": 10436, - "time_per_iteration": 3.1732895374298096 - }, - { - "auxiliary_loss_clip": 0.01099296, - "auxiliary_loss_mlp": 0.01040375, - "balance_loss_clip": 1.04577446, - "balance_loss_mlp": 1.02651119, - "epoch": 0.627506388095596, - "flos": 23584261236480.0, - "grad_norm": 1.4615085745823022, - "language_loss": 0.77539217, - "learning_rate": 1.2871500429004523e-06, - "loss": 0.79678893, - "num_input_tokens_seen": 224918650, - "step": 10437, - "time_per_iteration": 2.8112289905548096 - }, - { - "auxiliary_loss_clip": 0.0102524, - "auxiliary_loss_mlp": 0.01007069, - "balance_loss_clip": 1.01128411, - "balance_loss_mlp": 1.00595462, - "epoch": 0.6275665113482639, - "flos": 67583631674880.0, - "grad_norm": 0.7245410806399479, - "language_loss": 0.54275799, - "learning_rate": 1.2867861730436667e-06, - "loss": 0.56308109, - "num_input_tokens_seen": 224981575, - "step": 10438, - "time_per_iteration": 3.1365692615509033 - }, - { - "auxiliary_loss_clip": 0.01063228, - "auxiliary_loss_mlp": 0.01041641, - "balance_loss_clip": 1.03674674, - "balance_loss_mlp": 1.02898097, - "epoch": 0.6276266346009319, - "flos": 27636924101760.0, - "grad_norm": 1.7255538562739963, - "language_loss": 0.84122932, - "learning_rate": 1.2864223302320214e-06, - "loss": 0.86227804, - "num_input_tokens_seen": 225000820, - "step": 10439, - "time_per_iteration": 2.909126043319702 - }, - { - "auxiliary_loss_clip": 0.01077398, - "auxiliary_loss_mlp": 0.01044046, - "balance_loss_clip": 1.04187262, - "balance_loss_mlp": 1.03006864, - "epoch": 0.6276867578535998, - "flos": 22746142218240.0, - "grad_norm": 2.0752652164499783, - "language_loss": 0.80063027, - "learning_rate": 1.2860585144793128e-06, - "loss": 0.8218447, - "num_input_tokens_seen": 225017585, - "step": 10440, - "time_per_iteration": 2.7793238162994385 - }, - { - "auxiliary_loss_clip": 0.01059905, - "auxiliary_loss_mlp": 0.01030462, - "balance_loss_clip": 1.03476882, - "balance_loss_mlp": 1.01888728, - "epoch": 0.6277468811062679, - "flos": 24644200694400.0, - "grad_norm": 1.357982638723412, - "language_loss": 0.74566025, - "learning_rate": 1.285694725799337e-06, - "loss": 0.76656389, - "num_input_tokens_seen": 225039085, - "step": 10441, - "time_per_iteration": 2.9267096519470215 - }, - { - "auxiliary_loss_clip": 0.01095865, - "auxiliary_loss_mlp": 0.0103067, - "balance_loss_clip": 1.03701901, - "balance_loss_mlp": 1.01759267, - "epoch": 0.6278070043589358, - "flos": 19678975873920.0, - "grad_norm": 2.0708219033723316, - "language_loss": 0.72098005, - "learning_rate": 1.2853309642058884e-06, - "loss": 0.74224538, - "num_input_tokens_seen": 225058105, - "step": 10442, - "time_per_iteration": 2.6998653411865234 - }, - { - "auxiliary_loss_clip": 0.01081918, - "auxiliary_loss_mlp": 0.01030205, - "balance_loss_clip": 1.03865194, - "balance_loss_mlp": 1.01750898, - "epoch": 0.6278671276116038, - "flos": 22121834906880.0, - "grad_norm": 1.6030154795021492, - "language_loss": 0.7134285, - "learning_rate": 1.284967229712762e-06, - "loss": 0.73454976, - "num_input_tokens_seen": 225077605, - "step": 10443, - "time_per_iteration": 2.8322415351867676 - }, - { - "auxiliary_loss_clip": 0.0111667, - "auxiliary_loss_mlp": 0.01031963, - "balance_loss_clip": 1.04252923, - "balance_loss_mlp": 1.01954722, - "epoch": 0.6279272508642717, - "flos": 23038562839680.0, - "grad_norm": 2.1504215551644523, - "language_loss": 0.73254573, - "learning_rate": 1.2846035223337492e-06, - "loss": 0.75403202, - "num_input_tokens_seen": 225097775, - "step": 10444, - "time_per_iteration": 2.6936285495758057 - }, - { - "auxiliary_loss_clip": 0.01085082, - "auxiliary_loss_mlp": 0.01032194, - "balance_loss_clip": 1.04689062, - "balance_loss_mlp": 1.01936126, - "epoch": 0.6279873741169397, - "flos": 19824090819840.0, - "grad_norm": 2.0098765769795697, - "language_loss": 0.724576, - "learning_rate": 1.2842398420826423e-06, - "loss": 0.74574882, - "num_input_tokens_seen": 225115585, - "step": 10445, - "time_per_iteration": 2.7513034343719482 - }, - { - "auxiliary_loss_clip": 0.01101735, - "auxiliary_loss_mlp": 0.01029744, - "balance_loss_clip": 1.03916216, - "balance_loss_mlp": 1.0170486, - "epoch": 0.6280474973696077, - "flos": 23915393740800.0, - "grad_norm": 1.5354377153299141, - "language_loss": 0.692366, - "learning_rate": 1.2838761889732331e-06, - "loss": 0.71368074, - "num_input_tokens_seen": 225135575, - "step": 10446, - "time_per_iteration": 2.7197511196136475 - }, - { - "auxiliary_loss_clip": 0.01075612, - "auxiliary_loss_mlp": 0.0103331, - "balance_loss_clip": 1.03858328, - "balance_loss_mlp": 1.01901674, - "epoch": 0.6281076206222757, - "flos": 17967976450560.0, - "grad_norm": 2.0624649000071638, - "language_loss": 0.73082191, - "learning_rate": 1.2835125630193102e-06, - "loss": 0.75191116, - "num_input_tokens_seen": 225154230, - "step": 10447, - "time_per_iteration": 2.8416759967803955 - }, - { - "auxiliary_loss_clip": 0.01024228, - "auxiliary_loss_mlp": 0.00999654, - "balance_loss_clip": 1.00985765, - "balance_loss_mlp": 0.99855727, - "epoch": 0.6281677438749437, - "flos": 66778370622720.0, - "grad_norm": 0.6739953142314802, - "language_loss": 0.52296638, - "learning_rate": 1.2831489642346626e-06, - "loss": 0.54320526, - "num_input_tokens_seen": 225213650, - "step": 10448, - "time_per_iteration": 5.136569976806641 - }, - { - "auxiliary_loss_clip": 0.01089733, - "auxiliary_loss_mlp": 0.01050472, - "balance_loss_clip": 1.0385865, - "balance_loss_mlp": 1.03579164, - "epoch": 0.6282278671276116, - "flos": 11656173640320.0, - "grad_norm": 2.2865528324647744, - "language_loss": 0.91361725, - "learning_rate": 1.282785392633079e-06, - "loss": 0.93501937, - "num_input_tokens_seen": 225230135, - "step": 10449, - "time_per_iteration": 2.7638633251190186 - }, - { - "auxiliary_loss_clip": 0.01112884, - "auxiliary_loss_mlp": 0.01032015, - "balance_loss_clip": 1.03918815, - "balance_loss_mlp": 1.02023697, - "epoch": 0.6282879903802796, - "flos": 42741597847680.0, - "grad_norm": 1.5879286033336677, - "language_loss": 0.60231853, - "learning_rate": 1.2824218482283438e-06, - "loss": 0.6237675, - "num_input_tokens_seen": 225253520, - "step": 10450, - "time_per_iteration": 4.464092493057251 - }, - { - "auxiliary_loss_clip": 0.01089139, - "auxiliary_loss_mlp": 0.01032278, - "balance_loss_clip": 1.04133666, - "balance_loss_mlp": 1.01986873, - "epoch": 0.6283481136329475, - "flos": 20009210538240.0, - "grad_norm": 1.522481037470791, - "language_loss": 0.76846904, - "learning_rate": 1.2820583310342452e-06, - "loss": 0.78968322, - "num_input_tokens_seen": 225272460, - "step": 10451, - "time_per_iteration": 4.40496563911438 - }, - { - "auxiliary_loss_clip": 0.01090661, - "auxiliary_loss_mlp": 0.01030764, - "balance_loss_clip": 1.03676105, - "balance_loss_mlp": 1.01773453, - "epoch": 0.6284082368856155, - "flos": 21904431840000.0, - "grad_norm": 1.614739235308552, - "language_loss": 0.77571416, - "learning_rate": 1.281694841064566e-06, - "loss": 0.79692847, - "num_input_tokens_seen": 225291700, - "step": 10452, - "time_per_iteration": 2.7239017486572266 - }, - { - "auxiliary_loss_clip": 0.01088221, - "auxiliary_loss_mlp": 0.01034824, - "balance_loss_clip": 1.04302955, - "balance_loss_mlp": 1.02150226, - "epoch": 0.6284683601382834, - "flos": 25484187219840.0, - "grad_norm": 1.7878849951641813, - "language_loss": 0.72469395, - "learning_rate": 1.2813313783330904e-06, - "loss": 0.74592441, - "num_input_tokens_seen": 225311470, - "step": 10453, - "time_per_iteration": 2.9393930435180664 - }, - { - "auxiliary_loss_clip": 0.01053587, - "auxiliary_loss_mlp": 0.01040763, - "balance_loss_clip": 1.03172648, - "balance_loss_mlp": 1.02527809, - "epoch": 0.6285284833909515, - "flos": 16538695395840.0, - "grad_norm": 1.709886822132608, - "language_loss": 0.80723816, - "learning_rate": 1.2809679428536013e-06, - "loss": 0.82818168, - "num_input_tokens_seen": 225328385, - "step": 10454, - "time_per_iteration": 2.8191676139831543 - }, - { - "auxiliary_loss_clip": 0.01086328, - "auxiliary_loss_mlp": 0.01036988, - "balance_loss_clip": 1.04401016, - "balance_loss_mlp": 1.02476287, - "epoch": 0.6285886066436194, - "flos": 22820692896000.0, - "grad_norm": 1.9883426544542775, - "language_loss": 0.82205665, - "learning_rate": 1.2806045346398792e-06, - "loss": 0.84328985, - "num_input_tokens_seen": 225348415, - "step": 10455, - "time_per_iteration": 2.778773784637451 - }, - { - "auxiliary_loss_clip": 0.01066143, - "auxiliary_loss_mlp": 0.00771548, - "balance_loss_clip": 1.03564739, - "balance_loss_mlp": 1.00019312, - "epoch": 0.6286487298962874, - "flos": 24715734629760.0, - "grad_norm": 1.5354473458638056, - "language_loss": 0.81757617, - "learning_rate": 1.280241153705706e-06, - "loss": 0.83595306, - "num_input_tokens_seen": 225367740, - "step": 10456, - "time_per_iteration": 4.4299633502960205 - }, - { - "auxiliary_loss_clip": 0.0108958, - "auxiliary_loss_mlp": 0.01031242, - "balance_loss_clip": 1.04148746, - "balance_loss_mlp": 1.01731229, - "epoch": 0.6287088531489553, - "flos": 20740818752640.0, - "grad_norm": 1.6813486630133685, - "language_loss": 0.71938455, - "learning_rate": 1.27987780006486e-06, - "loss": 0.74059272, - "num_input_tokens_seen": 225388405, - "step": 10457, - "time_per_iteration": 2.7010886669158936 - }, - { - "auxiliary_loss_clip": 0.0110824, - "auxiliary_loss_mlp": 0.01035135, - "balance_loss_clip": 1.03882265, - "balance_loss_mlp": 1.02124166, - "epoch": 0.6287689764016233, - "flos": 23070630706560.0, - "grad_norm": 1.8855739678870833, - "language_loss": 0.79754472, - "learning_rate": 1.2795144737311202e-06, - "loss": 0.81897843, - "num_input_tokens_seen": 225408360, - "step": 10458, - "time_per_iteration": 2.826195478439331 - }, - { - "auxiliary_loss_clip": 0.01110415, - "auxiliary_loss_mlp": 0.01033556, - "balance_loss_clip": 1.0434413, - "balance_loss_mlp": 1.02032971, - "epoch": 0.6288290996542913, - "flos": 32233669251840.0, - "grad_norm": 1.613153759988395, - "language_loss": 0.61056519, - "learning_rate": 1.2791511747182635e-06, - "loss": 0.63200486, - "num_input_tokens_seen": 225431310, - "step": 10459, - "time_per_iteration": 2.8198750019073486 - }, - { - "auxiliary_loss_clip": 0.01090967, - "auxiliary_loss_mlp": 0.0103353, - "balance_loss_clip": 1.03930306, - "balance_loss_mlp": 1.02109075, - "epoch": 0.6288892229069593, - "flos": 24641327606400.0, - "grad_norm": 1.6884168463635612, - "language_loss": 0.78966278, - "learning_rate": 1.2787879030400666e-06, - "loss": 0.81090778, - "num_input_tokens_seen": 225450385, - "step": 10460, - "time_per_iteration": 2.8095743656158447 - }, - { - "auxiliary_loss_clip": 0.01074125, - "auxiliary_loss_mlp": 0.01031631, - "balance_loss_clip": 1.0369761, - "balance_loss_mlp": 1.01822627, - "epoch": 0.6289493461596273, - "flos": 17858341163520.0, - "grad_norm": 1.6519482013468527, - "language_loss": 0.73814094, - "learning_rate": 1.2784246587103047e-06, - "loss": 0.75919855, - "num_input_tokens_seen": 225467325, - "step": 10461, - "time_per_iteration": 2.754106044769287 - }, - { - "auxiliary_loss_clip": 0.01093245, - "auxiliary_loss_mlp": 0.01040397, - "balance_loss_clip": 1.03983331, - "balance_loss_mlp": 1.02764726, - "epoch": 0.6290094694122952, - "flos": 22345379199360.0, - "grad_norm": 1.7440118950274472, - "language_loss": 0.69962513, - "learning_rate": 1.2780614417427523e-06, - "loss": 0.72096151, - "num_input_tokens_seen": 225487370, - "step": 10462, - "time_per_iteration": 2.721280574798584 - }, - { - "auxiliary_loss_clip": 0.01109582, - "auxiliary_loss_mlp": 0.01030961, - "balance_loss_clip": 1.04013419, - "balance_loss_mlp": 1.01948082, - "epoch": 0.6290695926649632, - "flos": 28402431776640.0, - "grad_norm": 2.4371122708038846, - "language_loss": 0.7249735, - "learning_rate": 1.2776982521511821e-06, - "loss": 0.74637896, - "num_input_tokens_seen": 225506915, - "step": 10463, - "time_per_iteration": 2.7322490215301514 - }, - { - "auxiliary_loss_clip": 0.01094633, - "auxiliary_loss_mlp": 0.0104, - "balance_loss_clip": 1.04333925, - "balance_loss_mlp": 1.02713692, - "epoch": 0.6291297159176311, - "flos": 21505464501120.0, - "grad_norm": 1.7167597419504528, - "language_loss": 0.72533494, - "learning_rate": 1.2773350899493665e-06, - "loss": 0.74668121, - "num_input_tokens_seen": 225525670, - "step": 10464, - "time_per_iteration": 2.7556610107421875 - }, - { - "auxiliary_loss_clip": 0.01086904, - "auxiliary_loss_mlp": 0.01034283, - "balance_loss_clip": 1.04166722, - "balance_loss_mlp": 1.02168906, - "epoch": 0.6291898391702991, - "flos": 12203308581120.0, - "grad_norm": 1.750105989459617, - "language_loss": 0.69012117, - "learning_rate": 1.2769719551510768e-06, - "loss": 0.71133304, - "num_input_tokens_seen": 225542235, - "step": 10465, - "time_per_iteration": 2.6720523834228516 - }, - { - "auxiliary_loss_clip": 0.01026598, - "auxiliary_loss_mlp": 0.01001492, - "balance_loss_clip": 1.0124836, - "balance_loss_mlp": 1.00023413, - "epoch": 0.629249962422967, - "flos": 69299479434240.0, - "grad_norm": 0.6784608705879751, - "language_loss": 0.59741104, - "learning_rate": 1.2766088477700832e-06, - "loss": 0.61769187, - "num_input_tokens_seen": 225607185, - "step": 10466, - "time_per_iteration": 3.353839635848999 - }, - { - "auxiliary_loss_clip": 0.01073177, - "auxiliary_loss_mlp": 0.01032178, - "balance_loss_clip": 1.03545153, - "balance_loss_mlp": 1.02020311, - "epoch": 0.6293100856756351, - "flos": 40077888042240.0, - "grad_norm": 1.835286938356158, - "language_loss": 0.64667165, - "learning_rate": 1.276245767820154e-06, - "loss": 0.66772521, - "num_input_tokens_seen": 225628785, - "step": 10467, - "time_per_iteration": 2.921297550201416 - }, - { - "auxiliary_loss_clip": 0.01014455, - "auxiliary_loss_mlp": 0.01000173, - "balance_loss_clip": 1.01132929, - "balance_loss_mlp": 0.9989695, - "epoch": 0.629370208928303, - "flos": 67501108177920.0, - "grad_norm": 0.7915302961276658, - "language_loss": 0.56811368, - "learning_rate": 1.2758827153150586e-06, - "loss": 0.58825994, - "num_input_tokens_seen": 225678980, - "step": 10468, - "time_per_iteration": 3.01094126701355 - }, - { - "auxiliary_loss_clip": 0.00999481, - "auxiliary_loss_mlp": 0.00999518, - "balance_loss_clip": 1.01559901, - "balance_loss_mlp": 0.9980635, - "epoch": 0.629430332180971, - "flos": 60660450449280.0, - "grad_norm": 0.7367622716998392, - "language_loss": 0.57934558, - "learning_rate": 1.2755196902685626e-06, - "loss": 0.59933555, - "num_input_tokens_seen": 225740295, - "step": 10469, - "time_per_iteration": 3.254342555999756 - }, - { - "auxiliary_loss_clip": 0.01032056, - "auxiliary_loss_mlp": 0.01005271, - "balance_loss_clip": 1.02417684, - "balance_loss_mlp": 1.00394154, - "epoch": 0.6294904554336389, - "flos": 66869764778880.0, - "grad_norm": 0.6802993920043705, - "language_loss": 0.5213244, - "learning_rate": 1.2751566926944329e-06, - "loss": 0.54169762, - "num_input_tokens_seen": 225805615, - "step": 10470, - "time_per_iteration": 3.2833499908447266 - }, - { - "auxiliary_loss_clip": 0.01099474, - "auxiliary_loss_mlp": 0.0103723, - "balance_loss_clip": 1.03933227, - "balance_loss_mlp": 1.02434301, - "epoch": 0.6295505786863069, - "flos": 42522794150400.0, - "grad_norm": 1.6833251005433751, - "language_loss": 0.7409395, - "learning_rate": 1.2747937226064342e-06, - "loss": 0.76230645, - "num_input_tokens_seen": 225826585, - "step": 10471, - "time_per_iteration": 2.839749574661255 - }, - { - "auxiliary_loss_clip": 0.0108924, - "auxiliary_loss_mlp": 0.0103139, - "balance_loss_clip": 1.0421524, - "balance_loss_mlp": 1.01881981, - "epoch": 0.629610701938975, - "flos": 17384140788480.0, - "grad_norm": 1.8072062146815357, - "language_loss": 0.63223195, - "learning_rate": 1.2744307800183297e-06, - "loss": 0.65343827, - "num_input_tokens_seen": 225844095, - "step": 10472, - "time_per_iteration": 2.72947359085083 - }, - { - "auxiliary_loss_clip": 0.01121891, - "auxiliary_loss_mlp": 0.01039125, - "balance_loss_clip": 1.04511738, - "balance_loss_mlp": 1.02616739, - "epoch": 0.6296708251916429, - "flos": 24242934885120.0, - "grad_norm": 1.6320866537592498, - "language_loss": 0.69356817, - "learning_rate": 1.2740678649438828e-06, - "loss": 0.71517837, - "num_input_tokens_seen": 225864310, - "step": 10473, - "time_per_iteration": 2.68420672416687 - }, - { - "auxiliary_loss_clip": 0.01090218, - "auxiliary_loss_mlp": 0.0103276, - "balance_loss_clip": 1.03732657, - "balance_loss_mlp": 1.02030838, - "epoch": 0.6297309484443109, - "flos": 19278536077440.0, - "grad_norm": 1.63494515725041, - "language_loss": 0.7420494, - "learning_rate": 1.2737049773968554e-06, - "loss": 0.7632792, - "num_input_tokens_seen": 225883830, - "step": 10474, - "time_per_iteration": 2.7413995265960693 - }, - { - "auxiliary_loss_clip": 0.01090194, - "auxiliary_loss_mlp": 0.00769939, - "balance_loss_clip": 1.03743196, - "balance_loss_mlp": 1.0001384, - "epoch": 0.6297910716969788, - "flos": 30662685043200.0, - "grad_norm": 1.4351205807606953, - "language_loss": 0.66564953, - "learning_rate": 1.2733421173910081e-06, - "loss": 0.68425083, - "num_input_tokens_seen": 225905755, - "step": 10475, - "time_per_iteration": 2.7660322189331055 - }, - { - "auxiliary_loss_clip": 0.0106541, - "auxiliary_loss_mlp": 0.01030607, - "balance_loss_clip": 1.03863168, - "balance_loss_mlp": 1.01878738, - "epoch": 0.6298511949496468, - "flos": 14423018371200.0, - "grad_norm": 1.9836644906797416, - "language_loss": 0.9036352, - "learning_rate": 1.272979284940101e-06, - "loss": 0.92459542, - "num_input_tokens_seen": 225922155, - "step": 10476, - "time_per_iteration": 2.758232593536377 - }, - { - "auxiliary_loss_clip": 0.01114316, - "auxiliary_loss_mlp": 0.01035706, - "balance_loss_clip": 1.04105282, - "balance_loss_mlp": 1.02374947, - "epoch": 0.6299113182023147, - "flos": 23514163845120.0, - "grad_norm": 5.4120485720423055, - "language_loss": 0.75543785, - "learning_rate": 1.2726164800578913e-06, - "loss": 0.77693808, - "num_input_tokens_seen": 225941060, - "step": 10477, - "time_per_iteration": 2.689332962036133 - }, - { - "auxiliary_loss_clip": 0.01100017, - "auxiliary_loss_mlp": 0.01035097, - "balance_loss_clip": 1.03945518, - "balance_loss_mlp": 1.02181101, - "epoch": 0.6299714414549827, - "flos": 22674500542080.0, - "grad_norm": 1.792423931833335, - "language_loss": 0.70299745, - "learning_rate": 1.272253702758138e-06, - "loss": 0.7243486, - "num_input_tokens_seen": 225960870, - "step": 10478, - "time_per_iteration": 2.641702651977539 - }, - { - "auxiliary_loss_clip": 0.011102, - "auxiliary_loss_mlp": 0.01032825, - "balance_loss_clip": 1.04167068, - "balance_loss_mlp": 1.01943791, - "epoch": 0.6300315647076506, - "flos": 14501735026560.0, - "grad_norm": 2.1836774795585012, - "language_loss": 0.66761291, - "learning_rate": 1.2718909530545974e-06, - "loss": 0.68904316, - "num_input_tokens_seen": 225977895, - "step": 10479, - "time_per_iteration": 2.6688246726989746 - }, - { - "auxiliary_loss_clip": 0.01090005, - "auxiliary_loss_mlp": 0.0077118, - "balance_loss_clip": 1.03907907, - "balance_loss_mlp": 1.0001682, - "epoch": 0.6300916879603187, - "flos": 21871681614720.0, - "grad_norm": 2.512846896597075, - "language_loss": 0.73645091, - "learning_rate": 1.2715282309610245e-06, - "loss": 0.7550627, - "num_input_tokens_seen": 225997835, - "step": 10480, - "time_per_iteration": 2.7305657863616943 - }, - { - "auxiliary_loss_clip": 0.011053, - "auxiliary_loss_mlp": 0.01035523, - "balance_loss_clip": 1.04060471, - "balance_loss_mlp": 1.02189767, - "epoch": 0.6301518112129866, - "flos": 21834047139840.0, - "grad_norm": 1.8722485238317301, - "language_loss": 0.79015726, - "learning_rate": 1.2711655364911744e-06, - "loss": 0.81156552, - "num_input_tokens_seen": 226017620, - "step": 10481, - "time_per_iteration": 2.687849283218384 - }, - { - "auxiliary_loss_clip": 0.01021696, - "auxiliary_loss_mlp": 0.01011899, - "balance_loss_clip": 1.01580834, - "balance_loss_mlp": 1.01079035, - "epoch": 0.6302119344656546, - "flos": 44334237957120.0, - "grad_norm": 0.8976146461078123, - "language_loss": 0.61833119, - "learning_rate": 1.2708028696588e-06, - "loss": 0.63866711, - "num_input_tokens_seen": 226068755, - "step": 10482, - "time_per_iteration": 3.008683681488037 - }, - { - "auxiliary_loss_clip": 0.01109585, - "auxiliary_loss_mlp": 0.01034347, - "balance_loss_clip": 1.04106355, - "balance_loss_mlp": 1.02004182, - "epoch": 0.6302720577183225, - "flos": 11217919800960.0, - "grad_norm": 2.2108979789482635, - "language_loss": 0.8277266, - "learning_rate": 1.2704402304776541e-06, - "loss": 0.84916592, - "num_input_tokens_seen": 226084395, - "step": 10483, - "time_per_iteration": 2.623480796813965 - }, - { - "auxiliary_loss_clip": 0.01094195, - "auxiliary_loss_mlp": 0.01042488, - "balance_loss_clip": 1.03946197, - "balance_loss_mlp": 1.03022778, - "epoch": 0.6303321809709905, - "flos": 27964932122880.0, - "grad_norm": 1.5219185358756147, - "language_loss": 0.72691327, - "learning_rate": 1.270077618961487e-06, - "loss": 0.74828005, - "num_input_tokens_seen": 226105890, - "step": 10484, - "time_per_iteration": 2.7577946186065674 - }, - { - "auxiliary_loss_clip": 0.0108643, - "auxiliary_loss_mlp": 0.01033065, - "balance_loss_clip": 1.04017258, - "balance_loss_mlp": 1.01970792, - "epoch": 0.6303923042236586, - "flos": 28220759763840.0, - "grad_norm": 2.7543040419083606, - "language_loss": 0.74625325, - "learning_rate": 1.2697150351240506e-06, - "loss": 0.76744819, - "num_input_tokens_seen": 226126760, - "step": 10485, - "time_per_iteration": 2.8124029636383057 - }, - { - "auxiliary_loss_clip": 0.01093712, - "auxiliary_loss_mlp": 0.00771476, - "balance_loss_clip": 1.04156017, - "balance_loss_mlp": 1.00019419, - "epoch": 0.6304524274763265, - "flos": 27631034271360.0, - "grad_norm": 1.7508926529215563, - "language_loss": 0.81359017, - "learning_rate": 1.269352478979093e-06, - "loss": 0.83224207, - "num_input_tokens_seen": 226147315, - "step": 10486, - "time_per_iteration": 2.8222594261169434 - }, - { - "auxiliary_loss_clip": 0.0109264, - "auxiliary_loss_mlp": 0.01040277, - "balance_loss_clip": 1.04081047, - "balance_loss_mlp": 1.02773643, - "epoch": 0.6305125507289945, - "flos": 17311313963520.0, - "grad_norm": 1.7524407832841304, - "language_loss": 0.63269603, - "learning_rate": 1.2689899505403628e-06, - "loss": 0.6540252, - "num_input_tokens_seen": 226165935, - "step": 10487, - "time_per_iteration": 2.629199743270874 - }, - { - "auxiliary_loss_clip": 0.01116472, - "auxiliary_loss_mlp": 0.01040106, - "balance_loss_clip": 1.04161322, - "balance_loss_mlp": 1.0270344, - "epoch": 0.6305726739816624, - "flos": 25808280658560.0, - "grad_norm": 1.6120412913951392, - "language_loss": 0.66997957, - "learning_rate": 1.2686274498216065e-06, - "loss": 0.69154537, - "num_input_tokens_seen": 226186890, - "step": 10488, - "time_per_iteration": 4.3398730754852295 - }, - { - "auxiliary_loss_clip": 0.01096551, - "auxiliary_loss_mlp": 0.01032615, - "balance_loss_clip": 1.04035902, - "balance_loss_mlp": 1.02013993, - "epoch": 0.6306327972343304, - "flos": 21797454159360.0, - "grad_norm": 1.6559636367213997, - "language_loss": 0.67318177, - "learning_rate": 1.2682649768365706e-06, - "loss": 0.69447345, - "num_input_tokens_seen": 226206710, - "step": 10489, - "time_per_iteration": 4.3245344161987305 - }, - { - "auxiliary_loss_clip": 0.01079741, - "auxiliary_loss_mlp": 0.01044256, - "balance_loss_clip": 1.03847003, - "balance_loss_mlp": 1.02838886, - "epoch": 0.6306929204869983, - "flos": 20777375819520.0, - "grad_norm": 1.8294067402999528, - "language_loss": 0.6980201, - "learning_rate": 1.2679025315990007e-06, - "loss": 0.7192601, - "num_input_tokens_seen": 226225565, - "step": 10490, - "time_per_iteration": 2.7364768981933594 - }, - { - "auxiliary_loss_clip": 0.0109348, - "auxiliary_loss_mlp": 0.01037174, - "balance_loss_clip": 1.03807712, - "balance_loss_mlp": 1.02385783, - "epoch": 0.6307530437396663, - "flos": 23654214973440.0, - "grad_norm": 3.3228808138384545, - "language_loss": 0.78209651, - "learning_rate": 1.2675401141226393e-06, - "loss": 0.80340308, - "num_input_tokens_seen": 226243680, - "step": 10491, - "time_per_iteration": 4.192841053009033 - }, - { - "auxiliary_loss_clip": 0.01089569, - "auxiliary_loss_mlp": 0.01036924, - "balance_loss_clip": 1.03836989, - "balance_loss_mlp": 1.02435327, - "epoch": 0.6308131669923343, - "flos": 24719002767360.0, - "grad_norm": 2.408793546436542, - "language_loss": 0.55951095, - "learning_rate": 1.2671777244212308e-06, - "loss": 0.58077586, - "num_input_tokens_seen": 226264345, - "step": 10492, - "time_per_iteration": 2.7634830474853516 - }, - { - "auxiliary_loss_clip": 0.01118182, - "auxiliary_loss_mlp": 0.01040842, - "balance_loss_clip": 1.04113233, - "balance_loss_mlp": 1.026793, - "epoch": 0.6308732902450023, - "flos": 22565403959040.0, - "grad_norm": 1.8001504389218699, - "language_loss": 0.64376915, - "learning_rate": 1.2668153625085168e-06, - "loss": 0.66535938, - "num_input_tokens_seen": 226283165, - "step": 10493, - "time_per_iteration": 2.617398977279663 - }, - { - "auxiliary_loss_clip": 0.01079208, - "auxiliary_loss_mlp": 0.01031715, - "balance_loss_clip": 1.03931165, - "balance_loss_mlp": 1.01834536, - "epoch": 0.6309334134976702, - "flos": 24644200694400.0, - "grad_norm": 1.3815551057795799, - "language_loss": 0.82869065, - "learning_rate": 1.2664530283982367e-06, - "loss": 0.84979987, - "num_input_tokens_seen": 226304080, - "step": 10494, - "time_per_iteration": 2.9209089279174805 - }, - { - "auxiliary_loss_clip": 0.01102712, - "auxiliary_loss_mlp": 0.01035887, - "balance_loss_clip": 1.04531574, - "balance_loss_mlp": 1.02259517, - "epoch": 0.6309935367503382, - "flos": 41427949651200.0, - "grad_norm": 1.8103540070682869, - "language_loss": 0.79647011, - "learning_rate": 1.2660907221041317e-06, - "loss": 0.81785613, - "num_input_tokens_seen": 226325925, - "step": 10495, - "time_per_iteration": 2.913984775543213 - }, - { - "auxiliary_loss_clip": 0.0108712, - "auxiliary_loss_mlp": 0.01035788, - "balance_loss_clip": 1.03742623, - "balance_loss_mlp": 1.02182817, - "epoch": 0.6310536600030061, - "flos": 15118931445120.0, - "grad_norm": 1.9837558740535257, - "language_loss": 0.70338362, - "learning_rate": 1.2657284436399403e-06, - "loss": 0.72461271, - "num_input_tokens_seen": 226344190, - "step": 10496, - "time_per_iteration": 4.195697546005249 - }, - { - "auxiliary_loss_clip": 0.01097081, - "auxiliary_loss_mlp": 0.01036726, - "balance_loss_clip": 1.04069757, - "balance_loss_mlp": 1.02359533, - "epoch": 0.6311137832556741, - "flos": 15231619388160.0, - "grad_norm": 2.0479454703454616, - "language_loss": 0.79674435, - "learning_rate": 1.2653661930193997e-06, - "loss": 0.81808245, - "num_input_tokens_seen": 226361520, - "step": 10497, - "time_per_iteration": 2.7244081497192383 - }, - { - "auxiliary_loss_clip": 0.01080809, - "auxiliary_loss_mlp": 0.01033562, - "balance_loss_clip": 1.03673339, - "balance_loss_mlp": 1.02134275, - "epoch": 0.6311739065083422, - "flos": 22018664067840.0, - "grad_norm": 1.9007003272679206, - "language_loss": 0.73755234, - "learning_rate": 1.265003970256247e-06, - "loss": 0.75869608, - "num_input_tokens_seen": 226381920, - "step": 10498, - "time_per_iteration": 2.702826976776123 - }, - { - "auxiliary_loss_clip": 0.01106258, - "auxiliary_loss_mlp": 0.01033967, - "balance_loss_clip": 1.03932881, - "balance_loss_mlp": 1.02077663, - "epoch": 0.6312340297610101, - "flos": 22710770300160.0, - "grad_norm": 2.137540621016438, - "language_loss": 0.70001101, - "learning_rate": 1.264641775364217e-06, - "loss": 0.72141325, - "num_input_tokens_seen": 226400035, - "step": 10499, - "time_per_iteration": 2.6359314918518066 - }, - { - "auxiliary_loss_clip": 0.01105058, - "auxiliary_loss_mlp": 0.0104466, - "balance_loss_clip": 1.04247713, - "balance_loss_mlp": 1.03126705, - "epoch": 0.6312941530136781, - "flos": 24280102483200.0, - "grad_norm": 1.7496076109467864, - "language_loss": 0.69836605, - "learning_rate": 1.2642796083570448e-06, - "loss": 0.7198633, - "num_input_tokens_seen": 226418280, - "step": 10500, - "time_per_iteration": 2.6434264183044434 - }, - { - "auxiliary_loss_clip": 0.01117728, - "auxiliary_loss_mlp": 0.01037176, - "balance_loss_clip": 1.04233432, - "balance_loss_mlp": 1.02433133, - "epoch": 0.631354276266346, - "flos": 21725956137600.0, - "grad_norm": 1.767641766149829, - "language_loss": 0.74439371, - "learning_rate": 1.2639174692484634e-06, - "loss": 0.76594275, - "num_input_tokens_seen": 226436650, - "step": 10501, - "time_per_iteration": 2.6442511081695557 - }, - { - "auxiliary_loss_clip": 0.01104233, - "auxiliary_loss_mlp": 0.00770378, - "balance_loss_clip": 1.04097271, - "balance_loss_mlp": 1.00013256, - "epoch": 0.631414399519014, - "flos": 24025100855040.0, - "grad_norm": 2.125617189575791, - "language_loss": 0.75111711, - "learning_rate": 1.2635553580522053e-06, - "loss": 0.76986325, - "num_input_tokens_seen": 226456275, - "step": 10502, - "time_per_iteration": 2.6732592582702637 - }, - { - "auxiliary_loss_clip": 0.01108933, - "auxiliary_loss_mlp": 0.01052555, - "balance_loss_clip": 1.04151106, - "balance_loss_mlp": 1.03879273, - "epoch": 0.6314745227716819, - "flos": 24315797623680.0, - "grad_norm": 2.013663319345679, - "language_loss": 0.85323668, - "learning_rate": 1.2631932747820022e-06, - "loss": 0.87485158, - "num_input_tokens_seen": 226473610, - "step": 10503, - "time_per_iteration": 2.7602460384368896 - }, - { - "auxiliary_loss_clip": 0.01084517, - "auxiliary_loss_mlp": 0.01034434, - "balance_loss_clip": 1.03906107, - "balance_loss_mlp": 1.02097487, - "epoch": 0.6315346460243499, - "flos": 23366391292800.0, - "grad_norm": 1.6896389995545142, - "language_loss": 0.86806571, - "learning_rate": 1.2628312194515838e-06, - "loss": 0.88925523, - "num_input_tokens_seen": 226493665, - "step": 10504, - "time_per_iteration": 2.6560161113739014 - }, - { - "auxiliary_loss_clip": 0.0108443, - "auxiliary_loss_mlp": 0.0103934, - "balance_loss_clip": 1.0409503, - "balance_loss_mlp": 1.02557158, - "epoch": 0.6315947692770179, - "flos": 20260333497600.0, - "grad_norm": 1.5595011849504998, - "language_loss": 0.76756787, - "learning_rate": 1.2624691920746793e-06, - "loss": 0.78880554, - "num_input_tokens_seen": 226511625, - "step": 10505, - "time_per_iteration": 2.7035913467407227 - }, - { - "auxiliary_loss_clip": 0.01073251, - "auxiliary_loss_mlp": 0.01035629, - "balance_loss_clip": 1.03666878, - "balance_loss_mlp": 1.02143097, - "epoch": 0.6316548925296859, - "flos": 25265850399360.0, - "grad_norm": 2.3166055953098774, - "language_loss": 0.81818491, - "learning_rate": 1.2621071926650166e-06, - "loss": 0.83927369, - "num_input_tokens_seen": 226530085, - "step": 10506, - "time_per_iteration": 2.762647867202759 - }, - { - "auxiliary_loss_clip": 0.01118108, - "auxiliary_loss_mlp": 0.01035819, - "balance_loss_clip": 1.0422647, - "balance_loss_mlp": 1.02248573, - "epoch": 0.6317150157823538, - "flos": 22930579578240.0, - "grad_norm": 1.8490757285143165, - "language_loss": 0.74521178, - "learning_rate": 1.2617452212363238e-06, - "loss": 0.76675105, - "num_input_tokens_seen": 226548115, - "step": 10507, - "time_per_iteration": 2.598595380783081 - }, - { - "auxiliary_loss_clip": 0.01094729, - "auxiliary_loss_mlp": 0.01038809, - "balance_loss_clip": 1.04198813, - "balance_loss_mlp": 1.02511764, - "epoch": 0.6317751390350218, - "flos": 22527051212160.0, - "grad_norm": 2.137138504509131, - "language_loss": 0.67884028, - "learning_rate": 1.2613832778023258e-06, - "loss": 0.7001757, - "num_input_tokens_seen": 226567955, - "step": 10508, - "time_per_iteration": 2.6457536220550537 - }, - { - "auxiliary_loss_clip": 0.01081753, - "auxiliary_loss_mlp": 0.01033704, - "balance_loss_clip": 1.03684628, - "balance_loss_mlp": 1.02029264, - "epoch": 0.6318352622876897, - "flos": 23294749616640.0, - "grad_norm": 1.726891076070715, - "language_loss": 0.70810485, - "learning_rate": 1.2610213623767478e-06, - "loss": 0.72925943, - "num_input_tokens_seen": 226588205, - "step": 10509, - "time_per_iteration": 2.7340633869171143 - }, - { - "auxiliary_loss_clip": 0.01100032, - "auxiliary_loss_mlp": 0.01030079, - "balance_loss_clip": 1.0408802, - "balance_loss_mlp": 1.01750255, - "epoch": 0.6318953855403577, - "flos": 20704082117760.0, - "grad_norm": 2.059347572016265, - "language_loss": 0.79585326, - "learning_rate": 1.2606594749733143e-06, - "loss": 0.81715441, - "num_input_tokens_seen": 226606965, - "step": 10510, - "time_per_iteration": 2.7126991748809814 - }, - { - "auxiliary_loss_clip": 0.01073398, - "auxiliary_loss_mlp": 0.00771235, - "balance_loss_clip": 1.03949821, - "balance_loss_mlp": 1.00013995, - "epoch": 0.6319555087930258, - "flos": 22820046451200.0, - "grad_norm": 2.029248251908187, - "language_loss": 0.70844626, - "learning_rate": 1.2602976156057469e-06, - "loss": 0.72689259, - "num_input_tokens_seen": 226627845, - "step": 10511, - "time_per_iteration": 2.862959384918213 - }, - { - "auxiliary_loss_clip": 0.01113995, - "auxiliary_loss_mlp": 0.01035402, - "balance_loss_clip": 1.04076004, - "balance_loss_mlp": 1.02298617, - "epoch": 0.6320156320456937, - "flos": 19970929618560.0, - "grad_norm": 1.5814642404723724, - "language_loss": 0.80147332, - "learning_rate": 1.2599357842877684e-06, - "loss": 0.82296729, - "num_input_tokens_seen": 226645855, - "step": 10512, - "time_per_iteration": 2.599238872528076 - }, - { - "auxiliary_loss_clip": 0.01104767, - "auxiliary_loss_mlp": 0.01033707, - "balance_loss_clip": 1.04045844, - "balance_loss_mlp": 1.01971221, - "epoch": 0.6320757552983617, - "flos": 27013406889600.0, - "grad_norm": 2.290319172186619, - "language_loss": 0.70844841, - "learning_rate": 1.2595739810330994e-06, - "loss": 0.72983325, - "num_input_tokens_seen": 226665375, - "step": 10513, - "time_per_iteration": 2.706372022628784 - }, - { - "auxiliary_loss_clip": 0.01107929, - "auxiliary_loss_mlp": 0.01034926, - "balance_loss_clip": 1.03973472, - "balance_loss_mlp": 1.02081192, - "epoch": 0.6321358785510296, - "flos": 23695943598720.0, - "grad_norm": 2.242079914271032, - "language_loss": 0.6665644, - "learning_rate": 1.259212205855459e-06, - "loss": 0.68799293, - "num_input_tokens_seen": 226685270, - "step": 10514, - "time_per_iteration": 2.6768577098846436 - }, - { - "auxiliary_loss_clip": 0.01080896, - "auxiliary_loss_mlp": 0.0103395, - "balance_loss_clip": 1.03646874, - "balance_loss_mlp": 1.02114093, - "epoch": 0.6321960018036976, - "flos": 25995231970560.0, - "grad_norm": 1.8993538704282873, - "language_loss": 0.74367702, - "learning_rate": 1.2588504587685663e-06, - "loss": 0.76482546, - "num_input_tokens_seen": 226705325, - "step": 10515, - "time_per_iteration": 2.8709843158721924 - }, - { - "auxiliary_loss_clip": 0.01089992, - "auxiliary_loss_mlp": 0.01031214, - "balance_loss_clip": 1.04074252, - "balance_loss_mlp": 1.01873255, - "epoch": 0.6322561250563655, - "flos": 22821016118400.0, - "grad_norm": 1.7638160656735167, - "language_loss": 0.90024698, - "learning_rate": 1.2584887397861379e-06, - "loss": 0.92145908, - "num_input_tokens_seen": 226723815, - "step": 10516, - "time_per_iteration": 2.691826343536377 - }, - { - "auxiliary_loss_clip": 0.0112538, - "auxiliary_loss_mlp": 0.01036003, - "balance_loss_clip": 1.04528499, - "balance_loss_mlp": 1.02075589, - "epoch": 0.6323162483090335, - "flos": 18988413926400.0, - "grad_norm": 1.6560830086526979, - "language_loss": 0.81829578, - "learning_rate": 1.2581270489218911e-06, - "loss": 0.83990955, - "num_input_tokens_seen": 226741550, - "step": 10517, - "time_per_iteration": 2.620199203491211 - }, - { - "auxiliary_loss_clip": 0.01061827, - "auxiliary_loss_mlp": 0.01039321, - "balance_loss_clip": 1.03930223, - "balance_loss_mlp": 1.02642882, - "epoch": 0.6323763715617015, - "flos": 19865173000320.0, - "grad_norm": 1.7035542921935394, - "language_loss": 0.7784009, - "learning_rate": 1.257765386189541e-06, - "loss": 0.79941237, - "num_input_tokens_seen": 226761115, - "step": 10518, - "time_per_iteration": 2.91979718208313 - }, - { - "auxiliary_loss_clip": 0.01096755, - "auxiliary_loss_mlp": 0.0103349, - "balance_loss_clip": 1.03876209, - "balance_loss_mlp": 1.02090716, - "epoch": 0.6324364948143695, - "flos": 22782699285120.0, - "grad_norm": 1.44276453327461, - "language_loss": 0.85200572, - "learning_rate": 1.2574037516028018e-06, - "loss": 0.87330812, - "num_input_tokens_seen": 226782225, - "step": 10519, - "time_per_iteration": 2.74233078956604 - }, - { - "auxiliary_loss_clip": 0.01088566, - "auxiliary_loss_mlp": 0.01039518, - "balance_loss_clip": 1.03878999, - "balance_loss_mlp": 1.02666724, - "epoch": 0.6324966180670374, - "flos": 22235923480320.0, - "grad_norm": 2.1806676145694692, - "language_loss": 0.71964407, - "learning_rate": 1.2570421451753867e-06, - "loss": 0.74092495, - "num_input_tokens_seen": 226802375, - "step": 10520, - "time_per_iteration": 2.682180404663086 - }, - { - "auxiliary_loss_clip": 0.01103452, - "auxiliary_loss_mlp": 0.01035272, - "balance_loss_clip": 1.03956473, - "balance_loss_mlp": 1.02224886, - "epoch": 0.6325567413197054, - "flos": 21689183589120.0, - "grad_norm": 1.7702779314390575, - "language_loss": 0.71439731, - "learning_rate": 1.2566805669210081e-06, - "loss": 0.73578453, - "num_input_tokens_seen": 226822165, - "step": 10521, - "time_per_iteration": 2.657323122024536 - }, - { - "auxiliary_loss_clip": 0.01076504, - "auxiliary_loss_mlp": 0.01041724, - "balance_loss_clip": 1.03893948, - "balance_loss_mlp": 1.0255115, - "epoch": 0.6326168645723733, - "flos": 19937137898880.0, - "grad_norm": 1.7329974565509776, - "language_loss": 0.721259, - "learning_rate": 1.256319016853377e-06, - "loss": 0.74244124, - "num_input_tokens_seen": 226841645, - "step": 10522, - "time_per_iteration": 2.746037721633911 - }, - { - "auxiliary_loss_clip": 0.01074288, - "auxiliary_loss_mlp": 0.01034292, - "balance_loss_clip": 1.04106843, - "balance_loss_mlp": 1.02167988, - "epoch": 0.6326769878250413, - "flos": 20230348619520.0, - "grad_norm": 1.8934714872441534, - "language_loss": 0.81941485, - "learning_rate": 1.2559574949862023e-06, - "loss": 0.84050065, - "num_input_tokens_seen": 226860355, - "step": 10523, - "time_per_iteration": 2.761061906814575 - }, - { - "auxiliary_loss_clip": 0.01103759, - "auxiliary_loss_mlp": 0.01030499, - "balance_loss_clip": 1.03989744, - "balance_loss_mlp": 1.01712918, - "epoch": 0.6327371110777094, - "flos": 20775759707520.0, - "grad_norm": 2.3750030560810163, - "language_loss": 0.73983908, - "learning_rate": 1.255596001333195e-06, - "loss": 0.76118159, - "num_input_tokens_seen": 226878390, - "step": 10524, - "time_per_iteration": 2.677591323852539 - }, - { - "auxiliary_loss_clip": 0.01101897, - "auxiliary_loss_mlp": 0.01041422, - "balance_loss_clip": 1.04099619, - "balance_loss_mlp": 1.02719402, - "epoch": 0.6327972343303773, - "flos": 30336544529280.0, - "grad_norm": 1.9503552038514373, - "language_loss": 0.84243858, - "learning_rate": 1.2552345359080615e-06, - "loss": 0.86387181, - "num_input_tokens_seen": 226898420, - "step": 10525, - "time_per_iteration": 2.7905821800231934 - }, - { - "auxiliary_loss_clip": 0.0108609, - "auxiliary_loss_mlp": 0.01030251, - "balance_loss_clip": 1.03651416, - "balance_loss_mlp": 1.01617217, - "epoch": 0.6328573575830453, - "flos": 17092258871040.0, - "grad_norm": 1.6646724041083503, - "language_loss": 0.6700424, - "learning_rate": 1.2548730987245093e-06, - "loss": 0.6912058, - "num_input_tokens_seen": 226916305, - "step": 10526, - "time_per_iteration": 2.658766031265259 - }, - { - "auxiliary_loss_clip": 0.01111357, - "auxiliary_loss_mlp": 0.01036081, - "balance_loss_clip": 1.04416919, - "balance_loss_mlp": 1.02141845, - "epoch": 0.6329174808357132, - "flos": 25047154442880.0, - "grad_norm": 2.0355958158409346, - "language_loss": 0.73648405, - "learning_rate": 1.254511689796244e-06, - "loss": 0.75795841, - "num_input_tokens_seen": 226937705, - "step": 10527, - "time_per_iteration": 5.2298712730407715 - }, - { - "auxiliary_loss_clip": 0.01105368, - "auxiliary_loss_mlp": 0.01035127, - "balance_loss_clip": 1.04319382, - "balance_loss_mlp": 1.02256858, - "epoch": 0.6329776040883812, - "flos": 16836826279680.0, - "grad_norm": 2.5914253744426614, - "language_loss": 0.71704459, - "learning_rate": 1.2541503091369693e-06, - "loss": 0.73844951, - "num_input_tokens_seen": 226954880, - "step": 10528, - "time_per_iteration": 2.6561360359191895 - }, - { - "auxiliary_loss_clip": 0.01104345, - "auxiliary_loss_mlp": 0.01031745, - "balance_loss_clip": 1.04158008, - "balance_loss_mlp": 1.01763082, - "epoch": 0.6330377273410491, - "flos": 13516705382400.0, - "grad_norm": 1.8004698597026916, - "language_loss": 0.66514266, - "learning_rate": 1.2537889567603905e-06, - "loss": 0.68650359, - "num_input_tokens_seen": 226972595, - "step": 10529, - "time_per_iteration": 4.169236421585083 - }, - { - "auxiliary_loss_clip": 0.01109158, - "auxiliary_loss_mlp": 0.01033414, - "balance_loss_clip": 1.0410428, - "balance_loss_mlp": 1.01895428, - "epoch": 0.6330978505937171, - "flos": 21538825257600.0, - "grad_norm": 2.1634257180763545, - "language_loss": 0.75199169, - "learning_rate": 1.2534276326802092e-06, - "loss": 0.77341741, - "num_input_tokens_seen": 226991910, - "step": 10530, - "time_per_iteration": 4.1243627071380615 - }, - { - "auxiliary_loss_clip": 0.0111004, - "auxiliary_loss_mlp": 0.00770904, - "balance_loss_clip": 1.04529655, - "balance_loss_mlp": 1.00030541, - "epoch": 0.6331579738463851, - "flos": 25009484054400.0, - "grad_norm": 1.5033967127528767, - "language_loss": 0.73765004, - "learning_rate": 1.2530663369101259e-06, - "loss": 0.75645947, - "num_input_tokens_seen": 227010175, - "step": 10531, - "time_per_iteration": 2.757310152053833 - }, - { - "auxiliary_loss_clip": 0.010819, - "auxiliary_loss_mlp": 0.0103456, - "balance_loss_clip": 1.0428102, - "balance_loss_mlp": 1.02120292, - "epoch": 0.6332180970990531, - "flos": 14976007228800.0, - "grad_norm": 2.152892996011048, - "language_loss": 0.79560679, - "learning_rate": 1.2527050694638432e-06, - "loss": 0.81677139, - "num_input_tokens_seen": 227025540, - "step": 10532, - "time_per_iteration": 2.693357229232788 - }, - { - "auxiliary_loss_clip": 0.01106096, - "auxiliary_loss_mlp": 0.01033111, - "balance_loss_clip": 1.04273748, - "balance_loss_mlp": 1.02105284, - "epoch": 0.633278220351721, - "flos": 22706963458560.0, - "grad_norm": 1.5569394240480623, - "language_loss": 0.74720097, - "learning_rate": 1.2523438303550582e-06, - "loss": 0.76859295, - "num_input_tokens_seen": 227045520, - "step": 10533, - "time_per_iteration": 2.6261446475982666 - }, - { - "auxiliary_loss_clip": 0.01096787, - "auxiliary_loss_mlp": 0.01038782, - "balance_loss_clip": 1.04458022, - "balance_loss_mlp": 1.02473903, - "epoch": 0.633338343604389, - "flos": 12602922364800.0, - "grad_norm": 2.379717167307364, - "language_loss": 0.77104855, - "learning_rate": 1.2519826195974706e-06, - "loss": 0.79240417, - "num_input_tokens_seen": 227059420, - "step": 10534, - "time_per_iteration": 2.6211531162261963 - }, - { - "auxiliary_loss_clip": 0.01080216, - "auxiliary_loss_mlp": 0.01043157, - "balance_loss_clip": 1.03751063, - "balance_loss_mlp": 1.02861977, - "epoch": 0.6333984668570569, - "flos": 25960111447680.0, - "grad_norm": 1.7545098866738538, - "language_loss": 0.86108071, - "learning_rate": 1.251621437204777e-06, - "loss": 0.88231444, - "num_input_tokens_seen": 227081310, - "step": 10535, - "time_per_iteration": 4.269057035446167 - }, - { - "auxiliary_loss_clip": 0.01110282, - "auxiliary_loss_mlp": 0.01037711, - "balance_loss_clip": 1.04232645, - "balance_loss_mlp": 1.02399635, - "epoch": 0.6334585901097249, - "flos": 23659242877440.0, - "grad_norm": 1.7414784178378062, - "language_loss": 0.76938647, - "learning_rate": 1.2512602831906733e-06, - "loss": 0.79086637, - "num_input_tokens_seen": 227100365, - "step": 10536, - "time_per_iteration": 2.6666407585144043 - }, - { - "auxiliary_loss_clip": 0.01102168, - "auxiliary_loss_mlp": 0.01038535, - "balance_loss_clip": 1.04189527, - "balance_loss_mlp": 1.02443218, - "epoch": 0.633518713362393, - "flos": 28760496503040.0, - "grad_norm": 2.0392502828924353, - "language_loss": 0.60273743, - "learning_rate": 1.250899157568855e-06, - "loss": 0.62414443, - "num_input_tokens_seen": 227119680, - "step": 10537, - "time_per_iteration": 2.7295584678649902 - }, - { - "auxiliary_loss_clip": 0.01012372, - "auxiliary_loss_mlp": 0.01000462, - "balance_loss_clip": 1.01797509, - "balance_loss_mlp": 0.99907935, - "epoch": 0.6335788366150609, - "flos": 70420322401920.0, - "grad_norm": 0.7714446209447136, - "language_loss": 0.52451682, - "learning_rate": 1.2505380603530155e-06, - "loss": 0.54464519, - "num_input_tokens_seen": 227184465, - "step": 10538, - "time_per_iteration": 3.3442068099975586 - }, - { - "auxiliary_loss_clip": 0.01100864, - "auxiliary_loss_mlp": 0.01035126, - "balance_loss_clip": 1.04384637, - "balance_loss_mlp": 1.02057028, - "epoch": 0.6336389598677289, - "flos": 23732069702400.0, - "grad_norm": 1.8384221769935791, - "language_loss": 0.83274323, - "learning_rate": 1.250176991556848e-06, - "loss": 0.85410309, - "num_input_tokens_seen": 227202185, - "step": 10539, - "time_per_iteration": 2.696904182434082 - }, - { - "auxiliary_loss_clip": 0.01090255, - "auxiliary_loss_mlp": 0.01032507, - "balance_loss_clip": 1.03990459, - "balance_loss_mlp": 1.01798737, - "epoch": 0.6336990831203968, - "flos": 29276676898560.0, - "grad_norm": 1.6347430731245383, - "language_loss": 0.86721331, - "learning_rate": 1.2498159511940438e-06, - "loss": 0.88844097, - "num_input_tokens_seen": 227222020, - "step": 10540, - "time_per_iteration": 2.7495079040527344 - }, - { - "auxiliary_loss_clip": 0.01091229, - "auxiliary_loss_mlp": 0.01034107, - "balance_loss_clip": 1.04014003, - "balance_loss_mlp": 1.02244198, - "epoch": 0.6337592063730648, - "flos": 29096836479360.0, - "grad_norm": 2.116079588237037, - "language_loss": 0.7269882, - "learning_rate": 1.2494549392782943e-06, - "loss": 0.74824154, - "num_input_tokens_seen": 227240885, - "step": 10541, - "time_per_iteration": 2.750035285949707 - }, - { - "auxiliary_loss_clip": 0.01111525, - "auxiliary_loss_mlp": 0.010355, - "balance_loss_clip": 1.04309511, - "balance_loss_mlp": 1.02114677, - "epoch": 0.6338193296257327, - "flos": 34706477249280.0, - "grad_norm": 2.608261813881904, - "language_loss": 0.85043848, - "learning_rate": 1.2490939558232887e-06, - "loss": 0.87190866, - "num_input_tokens_seen": 227257880, - "step": 10542, - "time_per_iteration": 2.7066802978515625 - }, - { - "auxiliary_loss_clip": 0.01107251, - "auxiliary_loss_mlp": 0.01033519, - "balance_loss_clip": 1.04289162, - "balance_loss_mlp": 1.01898777, - "epoch": 0.6338794528784008, - "flos": 16687581269760.0, - "grad_norm": 1.8074408618170101, - "language_loss": 0.77832586, - "learning_rate": 1.2487330008427153e-06, - "loss": 0.79973352, - "num_input_tokens_seen": 227274840, - "step": 10543, - "time_per_iteration": 2.6362385749816895 - }, - { - "auxiliary_loss_clip": 0.01065317, - "auxiliary_loss_mlp": 0.0104211, - "balance_loss_clip": 1.04040122, - "balance_loss_mlp": 1.02933073, - "epoch": 0.6339395761310687, - "flos": 22346600261760.0, - "grad_norm": 1.5926861927585991, - "language_loss": 0.73305023, - "learning_rate": 1.2483720743502618e-06, - "loss": 0.75412452, - "num_input_tokens_seen": 227294835, - "step": 10544, - "time_per_iteration": 2.7428245544433594 - }, - { - "auxiliary_loss_clip": 0.01089874, - "auxiliary_loss_mlp": 0.01039428, - "balance_loss_clip": 1.04020858, - "balance_loss_mlp": 1.02617836, - "epoch": 0.6339996993837367, - "flos": 18551812112640.0, - "grad_norm": 4.4072583606750895, - "language_loss": 0.68668348, - "learning_rate": 1.2480111763596144e-06, - "loss": 0.70797652, - "num_input_tokens_seen": 227314935, - "step": 10545, - "time_per_iteration": 2.8335583209991455 - }, - { - "auxiliary_loss_clip": 0.01092777, - "auxiliary_loss_mlp": 0.01037678, - "balance_loss_clip": 1.03954399, - "balance_loss_mlp": 1.02418935, - "epoch": 0.6340598226364046, - "flos": 12969498614400.0, - "grad_norm": 1.9287987147307617, - "language_loss": 0.70950794, - "learning_rate": 1.2476503068844592e-06, - "loss": 0.73081255, - "num_input_tokens_seen": 227332905, - "step": 10546, - "time_per_iteration": 2.6343114376068115 - }, - { - "auxiliary_loss_clip": 0.01103009, - "auxiliary_loss_mlp": 0.01031279, - "balance_loss_clip": 1.0436604, - "balance_loss_mlp": 1.01867259, - "epoch": 0.6341199458890726, - "flos": 26687984647680.0, - "grad_norm": 1.2499026086544156, - "language_loss": 0.77873629, - "learning_rate": 1.2472894659384792e-06, - "loss": 0.80007923, - "num_input_tokens_seen": 227354915, - "step": 10547, - "time_per_iteration": 2.704674005508423 - }, - { - "auxiliary_loss_clip": 0.01072985, - "auxiliary_loss_mlp": 0.0104046, - "balance_loss_clip": 1.03441143, - "balance_loss_mlp": 1.02732289, - "epoch": 0.6341800691417405, - "flos": 18734274224640.0, - "grad_norm": 1.6184133650868997, - "language_loss": 0.62827075, - "learning_rate": 1.2469286535353578e-06, - "loss": 0.64940524, - "num_input_tokens_seen": 227372990, - "step": 10548, - "time_per_iteration": 2.7401933670043945 - }, - { - "auxiliary_loss_clip": 0.01089619, - "auxiliary_loss_mlp": 0.01038454, - "balance_loss_clip": 1.03783989, - "balance_loss_mlp": 1.02509081, - "epoch": 0.6342401923944085, - "flos": 26249443499520.0, - "grad_norm": 2.3059628412520308, - "language_loss": 0.62195736, - "learning_rate": 1.2465678696887785e-06, - "loss": 0.64323807, - "num_input_tokens_seen": 227393270, - "step": 10549, - "time_per_iteration": 2.825896739959717 - }, - { - "auxiliary_loss_clip": 0.0106782, - "auxiliary_loss_mlp": 0.01035303, - "balance_loss_clip": 1.0408318, - "balance_loss_mlp": 1.02268422, - "epoch": 0.6343003156470765, - "flos": 24680937329280.0, - "grad_norm": 1.71498279606421, - "language_loss": 0.73401284, - "learning_rate": 1.2462071144124197e-06, - "loss": 0.75504404, - "num_input_tokens_seen": 227413630, - "step": 10550, - "time_per_iteration": 2.780163049697876 - }, - { - "auxiliary_loss_clip": 0.0100437, - "auxiliary_loss_mlp": 0.01001031, - "balance_loss_clip": 1.0126493, - "balance_loss_mlp": 0.99974936, - "epoch": 0.6343604388997445, - "flos": 69805352626560.0, - "grad_norm": 0.6910389749764038, - "language_loss": 0.57719415, - "learning_rate": 1.2458463877199638e-06, - "loss": 0.59724814, - "num_input_tokens_seen": 227476630, - "step": 10551, - "time_per_iteration": 3.286808729171753 - }, - { - "auxiliary_loss_clip": 0.01082742, - "auxiliary_loss_mlp": 0.01030085, - "balance_loss_clip": 1.04196656, - "balance_loss_mlp": 1.01796162, - "epoch": 0.6344205621524125, - "flos": 21982430223360.0, - "grad_norm": 1.74505505177434, - "language_loss": 0.67322063, - "learning_rate": 1.2454856896250881e-06, - "loss": 0.69434893, - "num_input_tokens_seen": 227496060, - "step": 10552, - "time_per_iteration": 2.7764453887939453 - }, - { - "auxiliary_loss_clip": 0.01080056, - "auxiliary_loss_mlp": 0.01034576, - "balance_loss_clip": 1.03920615, - "balance_loss_mlp": 1.02086091, - "epoch": 0.6344806854050804, - "flos": 20448865008000.0, - "grad_norm": 1.5562703117807677, - "language_loss": 0.81798071, - "learning_rate": 1.24512502014147e-06, - "loss": 0.839127, - "num_input_tokens_seen": 227513440, - "step": 10553, - "time_per_iteration": 2.7851717472076416 - }, - { - "auxiliary_loss_clip": 0.01106231, - "auxiliary_loss_mlp": 0.0103609, - "balance_loss_clip": 1.04020214, - "balance_loss_mlp": 1.02246475, - "epoch": 0.6345408086577484, - "flos": 40510611187200.0, - "grad_norm": 1.7532654974316204, - "language_loss": 0.5476743, - "learning_rate": 1.2447643792827879e-06, - "loss": 0.56909752, - "num_input_tokens_seen": 227535395, - "step": 10554, - "time_per_iteration": 2.79447078704834 - }, - { - "auxiliary_loss_clip": 0.01096611, - "auxiliary_loss_mlp": 0.01034981, - "balance_loss_clip": 1.0413723, - "balance_loss_mlp": 1.02187991, - "epoch": 0.6346009319104163, - "flos": 21361319222400.0, - "grad_norm": 2.4671241924977583, - "language_loss": 0.70400488, - "learning_rate": 1.2444037670627153e-06, - "loss": 0.72532082, - "num_input_tokens_seen": 227554545, - "step": 10555, - "time_per_iteration": 2.6849427223205566 - }, - { - "auxiliary_loss_clip": 0.01017602, - "auxiliary_loss_mlp": 0.01006112, - "balance_loss_clip": 1.0127604, - "balance_loss_mlp": 1.00490761, - "epoch": 0.6346610551630844, - "flos": 71365419100800.0, - "grad_norm": 0.773594882523352, - "language_loss": 0.55296588, - "learning_rate": 1.2440431834949276e-06, - "loss": 0.57320297, - "num_input_tokens_seen": 227608575, - "step": 10556, - "time_per_iteration": 3.1463379859924316 - }, - { - "auxiliary_loss_clip": 0.01095791, - "auxiliary_loss_mlp": 0.01031445, - "balance_loss_clip": 1.0396291, - "balance_loss_mlp": 1.01756358, - "epoch": 0.6347211784157523, - "flos": 25411504049280.0, - "grad_norm": 2.5502749141285848, - "language_loss": 0.67922962, - "learning_rate": 1.2436826285930985e-06, - "loss": 0.70050198, - "num_input_tokens_seen": 227628175, - "step": 10557, - "time_per_iteration": 2.693422794342041 - }, - { - "auxiliary_loss_clip": 0.0108673, - "auxiliary_loss_mlp": 0.01038794, - "balance_loss_clip": 1.03953815, - "balance_loss_mlp": 1.02604496, - "epoch": 0.6347813016684203, - "flos": 15742735966080.0, - "grad_norm": 1.602709548432784, - "language_loss": 0.70369065, - "learning_rate": 1.2433221023709002e-06, - "loss": 0.72494584, - "num_input_tokens_seen": 227645330, - "step": 10558, - "time_per_iteration": 2.671268939971924 - }, - { - "auxiliary_loss_clip": 0.01083073, - "auxiliary_loss_mlp": 0.01034562, - "balance_loss_clip": 1.03938115, - "balance_loss_mlp": 1.02120471, - "epoch": 0.6348414249210882, - "flos": 21464777370240.0, - "grad_norm": 1.4417814449763804, - "language_loss": 0.78316975, - "learning_rate": 1.2429616048420031e-06, - "loss": 0.80434608, - "num_input_tokens_seen": 227665250, - "step": 10559, - "time_per_iteration": 2.7575199604034424 - }, - { - "auxiliary_loss_clip": 0.01090706, - "auxiliary_loss_mlp": 0.01041541, - "balance_loss_clip": 1.03786755, - "balance_loss_mlp": 1.02740252, - "epoch": 0.6349015481737562, - "flos": 21653057485440.0, - "grad_norm": 1.8349318523473441, - "language_loss": 0.67984653, - "learning_rate": 1.242601136020078e-06, - "loss": 0.70116907, - "num_input_tokens_seen": 227685070, - "step": 10560, - "time_per_iteration": 2.6403374671936035 - }, - { - "auxiliary_loss_clip": 0.01089304, - "auxiliary_loss_mlp": 0.01045217, - "balance_loss_clip": 1.03931737, - "balance_loss_mlp": 1.03085184, - "epoch": 0.6349616714264241, - "flos": 22194984954240.0, - "grad_norm": 1.606240636171636, - "language_loss": 0.76797289, - "learning_rate": 1.2422406959187939e-06, - "loss": 0.78931808, - "num_input_tokens_seen": 227704430, - "step": 10561, - "time_per_iteration": 2.7372517585754395 - }, - { - "auxiliary_loss_clip": 0.01093461, - "auxiliary_loss_mlp": 0.01035075, - "balance_loss_clip": 1.03962195, - "balance_loss_mlp": 1.02203918, - "epoch": 0.6350217946790921, - "flos": 25410354814080.0, - "grad_norm": 2.1365474752692966, - "language_loss": 0.71962273, - "learning_rate": 1.2418802845518178e-06, - "loss": 0.74090809, - "num_input_tokens_seen": 227724920, - "step": 10562, - "time_per_iteration": 2.7133450508117676 - }, - { - "auxiliary_loss_clip": 0.01105126, - "auxiliary_loss_mlp": 0.01034495, - "balance_loss_clip": 1.04334474, - "balance_loss_mlp": 1.02005243, - "epoch": 0.63508191793176, - "flos": 19718944732800.0, - "grad_norm": 2.0107972952363413, - "language_loss": 0.80757058, - "learning_rate": 1.2415199019328185e-06, - "loss": 0.8289668, - "num_input_tokens_seen": 227743400, - "step": 10563, - "time_per_iteration": 2.6585617065429688 - }, - { - "auxiliary_loss_clip": 0.01091086, - "auxiliary_loss_mlp": 0.01038953, - "balance_loss_clip": 1.04419041, - "balance_loss_mlp": 1.02567887, - "epoch": 0.6351420411844281, - "flos": 18186923802240.0, - "grad_norm": 2.444256209228289, - "language_loss": 0.81206977, - "learning_rate": 1.2411595480754597e-06, - "loss": 0.83337021, - "num_input_tokens_seen": 227759990, - "step": 10564, - "time_per_iteration": 2.705941915512085 - }, - { - "auxiliary_loss_clip": 0.01087784, - "auxiliary_loss_mlp": 0.01045814, - "balance_loss_clip": 1.04181719, - "balance_loss_mlp": 1.03100812, - "epoch": 0.6352021644370961, - "flos": 33726511422720.0, - "grad_norm": 1.5889053443954093, - "language_loss": 0.72453761, - "learning_rate": 1.240799222993407e-06, - "loss": 0.74587357, - "num_input_tokens_seen": 227780835, - "step": 10565, - "time_per_iteration": 2.765345335006714 - }, - { - "auxiliary_loss_clip": 0.01102461, - "auxiliary_loss_mlp": 0.01033958, - "balance_loss_clip": 1.04256928, - "balance_loss_mlp": 1.01919961, - "epoch": 0.635262287689764, - "flos": 20374781207040.0, - "grad_norm": 2.121063161403432, - "language_loss": 0.69596386, - "learning_rate": 1.240438926700324e-06, - "loss": 0.71732807, - "num_input_tokens_seen": 227798580, - "step": 10566, - "time_per_iteration": 4.550225496292114 - }, - { - "auxiliary_loss_clip": 0.01103568, - "auxiliary_loss_mlp": 0.01033127, - "balance_loss_clip": 1.04312527, - "balance_loss_mlp": 1.0210278, - "epoch": 0.635322410942432, - "flos": 27525421307520.0, - "grad_norm": 1.5800197118440122, - "language_loss": 0.69619238, - "learning_rate": 1.2400786592098725e-06, - "loss": 0.71755934, - "num_input_tokens_seen": 227819210, - "step": 10567, - "time_per_iteration": 2.6888957023620605 - }, - { - "auxiliary_loss_clip": 0.01100039, - "auxiliary_loss_mlp": 0.01031317, - "balance_loss_clip": 1.04216862, - "balance_loss_mlp": 1.01925862, - "epoch": 0.6353825341950999, - "flos": 21543601766400.0, - "grad_norm": 2.2757897203537976, - "language_loss": 0.8449024, - "learning_rate": 1.2397184205357154e-06, - "loss": 0.86621594, - "num_input_tokens_seen": 227838340, - "step": 10568, - "time_per_iteration": 4.255465030670166 - }, - { - "auxiliary_loss_clip": 0.01056215, - "auxiliary_loss_mlp": 0.01041007, - "balance_loss_clip": 1.03819847, - "balance_loss_mlp": 1.026559, - "epoch": 0.635442657447768, - "flos": 31759756185600.0, - "grad_norm": 1.8323936037096342, - "language_loss": 0.84063637, - "learning_rate": 1.2393582106915113e-06, - "loss": 0.86160862, - "num_input_tokens_seen": 227859170, - "step": 10569, - "time_per_iteration": 4.377737760543823 - }, - { - "auxiliary_loss_clip": 0.01104285, - "auxiliary_loss_mlp": 0.01032738, - "balance_loss_clip": 1.04183245, - "balance_loss_mlp": 1.01939797, - "epoch": 0.6355027807004359, - "flos": 19828831415040.0, - "grad_norm": 1.6700504081300207, - "language_loss": 0.69352221, - "learning_rate": 1.2389980296909198e-06, - "loss": 0.71489245, - "num_input_tokens_seen": 227878545, - "step": 10570, - "time_per_iteration": 2.6112160682678223 - }, - { - "auxiliary_loss_clip": 0.01107497, - "auxiliary_loss_mlp": 0.01036944, - "balance_loss_clip": 1.04085815, - "balance_loss_mlp": 1.02342606, - "epoch": 0.6355629039531039, - "flos": 30372383324160.0, - "grad_norm": 1.7288699826037912, - "language_loss": 0.65762198, - "learning_rate": 1.2386378775476e-06, - "loss": 0.67906642, - "num_input_tokens_seen": 227898875, - "step": 10571, - "time_per_iteration": 2.7335216999053955 - }, - { - "auxiliary_loss_clip": 0.01113018, - "auxiliary_loss_mlp": 0.01029154, - "balance_loss_clip": 1.04446983, - "balance_loss_mlp": 1.01616585, - "epoch": 0.6356230272057718, - "flos": 17932065828480.0, - "grad_norm": 1.9788287371045428, - "language_loss": 0.71541518, - "learning_rate": 1.2382777542752074e-06, - "loss": 0.73683691, - "num_input_tokens_seen": 227917130, - "step": 10572, - "time_per_iteration": 2.6052427291870117 - }, - { - "auxiliary_loss_clip": 0.01084769, - "auxiliary_loss_mlp": 0.01034, - "balance_loss_clip": 1.04089427, - "balance_loss_mlp": 1.02181661, - "epoch": 0.6356831504584398, - "flos": 25375844822400.0, - "grad_norm": 1.6900483013767176, - "language_loss": 0.81165767, - "learning_rate": 1.2379176598873992e-06, - "loss": 0.83284533, - "num_input_tokens_seen": 227939550, - "step": 10573, - "time_per_iteration": 2.8153634071350098 - }, - { - "auxiliary_loss_clip": 0.0109877, - "auxiliary_loss_mlp": 0.01033009, - "balance_loss_clip": 1.04272556, - "balance_loss_mlp": 1.02006316, - "epoch": 0.6357432737111077, - "flos": 46500331720320.0, - "grad_norm": 1.6632630908080246, - "language_loss": 0.68936265, - "learning_rate": 1.2375575943978303e-06, - "loss": 0.71068037, - "num_input_tokens_seen": 227962200, - "step": 10574, - "time_per_iteration": 4.407367467880249 - }, - { - "auxiliary_loss_clip": 0.01116558, - "auxiliary_loss_mlp": 0.01031438, - "balance_loss_clip": 1.04334235, - "balance_loss_mlp": 1.01825356, - "epoch": 0.6358033969637757, - "flos": 17274361847040.0, - "grad_norm": 2.216480993085757, - "language_loss": 0.86364478, - "learning_rate": 1.2371975578201525e-06, - "loss": 0.88512474, - "num_input_tokens_seen": 227979270, - "step": 10575, - "time_per_iteration": 2.59047532081604 - }, - { - "auxiliary_loss_clip": 0.01116011, - "auxiliary_loss_mlp": 0.01037179, - "balance_loss_clip": 1.04200649, - "balance_loss_mlp": 1.02420902, - "epoch": 0.6358635202164437, - "flos": 27125520215040.0, - "grad_norm": 1.527365029746322, - "language_loss": 0.72139943, - "learning_rate": 1.2368375501680204e-06, - "loss": 0.74293131, - "num_input_tokens_seen": 228000550, - "step": 10576, - "time_per_iteration": 2.6213035583496094 - }, - { - "auxiliary_loss_clip": 0.01094385, - "auxiliary_loss_mlp": 0.0103256, - "balance_loss_clip": 1.0408107, - "balance_loss_mlp": 1.01913691, - "epoch": 0.6359236434691117, - "flos": 27525205825920.0, - "grad_norm": 1.587362724967965, - "language_loss": 0.69232905, - "learning_rate": 1.236477571455085e-06, - "loss": 0.71359849, - "num_input_tokens_seen": 228022005, - "step": 10577, - "time_per_iteration": 2.6874570846557617 - }, - { - "auxiliary_loss_clip": 0.01076719, - "auxiliary_loss_mlp": 0.01031904, - "balance_loss_clip": 1.04086065, - "balance_loss_mlp": 1.01938713, - "epoch": 0.6359837667217797, - "flos": 39348290989440.0, - "grad_norm": 1.631898217557544, - "language_loss": 0.71984881, - "learning_rate": 1.2361176216949964e-06, - "loss": 0.74093509, - "num_input_tokens_seen": 228043770, - "step": 10578, - "time_per_iteration": 2.956587314605713 - }, - { - "auxiliary_loss_clip": 0.01011581, - "auxiliary_loss_mlp": 0.00752167, - "balance_loss_clip": 1.01532173, - "balance_loss_mlp": 0.99992144, - "epoch": 0.6360438899744476, - "flos": 56413797206400.0, - "grad_norm": 0.7005664562343583, - "language_loss": 0.5446803, - "learning_rate": 1.2357577009014044e-06, - "loss": 0.56231779, - "num_input_tokens_seen": 228104985, - "step": 10579, - "time_per_iteration": 3.3165230751037598 - }, - { - "auxiliary_loss_clip": 0.01090928, - "auxiliary_loss_mlp": 0.01034048, - "balance_loss_clip": 1.03814209, - "balance_loss_mlp": 1.02082229, - "epoch": 0.6361040132271156, - "flos": 24973106555520.0, - "grad_norm": 1.557921238837489, - "language_loss": 0.77395153, - "learning_rate": 1.2353978090879568e-06, - "loss": 0.7952013, - "num_input_tokens_seen": 228125620, - "step": 10580, - "time_per_iteration": 2.712324857711792 - }, - { - "auxiliary_loss_clip": 0.01087081, - "auxiliary_loss_mlp": 0.00770805, - "balance_loss_clip": 1.04100418, - "balance_loss_mlp": 1.00011897, - "epoch": 0.6361641364797835, - "flos": 23259198130560.0, - "grad_norm": 2.013936086375126, - "language_loss": 0.66709065, - "learning_rate": 1.235037946268301e-06, - "loss": 0.68566948, - "num_input_tokens_seen": 228143495, - "step": 10581, - "time_per_iteration": 2.7856929302215576 - }, - { - "auxiliary_loss_clip": 0.01102449, - "auxiliary_loss_mlp": 0.01034551, - "balance_loss_clip": 1.0404247, - "balance_loss_mlp": 1.02227867, - "epoch": 0.6362242597324516, - "flos": 25994513698560.0, - "grad_norm": 1.9398130134586062, - "language_loss": 0.68718088, - "learning_rate": 1.2346781124560828e-06, - "loss": 0.70855093, - "num_input_tokens_seen": 228166500, - "step": 10582, - "time_per_iteration": 2.737300395965576 - }, - { - "auxiliary_loss_clip": 0.01089734, - "auxiliary_loss_mlp": 0.01038152, - "balance_loss_clip": 1.04106402, - "balance_loss_mlp": 1.02545059, - "epoch": 0.6362843829851195, - "flos": 25703242312320.0, - "grad_norm": 2.1615330133159305, - "language_loss": 0.84382987, - "learning_rate": 1.2343183076649473e-06, - "loss": 0.86510873, - "num_input_tokens_seen": 228185325, - "step": 10583, - "time_per_iteration": 2.736928939819336 - }, - { - "auxiliary_loss_clip": 0.01094529, - "auxiliary_loss_mlp": 0.01034443, - "balance_loss_clip": 1.04331303, - "balance_loss_mlp": 1.02157402, - "epoch": 0.6363445062377875, - "flos": 20522912895360.0, - "grad_norm": 1.8294448915060182, - "language_loss": 0.75581825, - "learning_rate": 1.233958531908538e-06, - "loss": 0.77710795, - "num_input_tokens_seen": 228204050, - "step": 10584, - "time_per_iteration": 2.66745662689209 - }, - { - "auxiliary_loss_clip": 0.01092434, - "auxiliary_loss_mlp": 0.01035865, - "balance_loss_clip": 1.04142356, - "balance_loss_mlp": 1.02158976, - "epoch": 0.6364046294904554, - "flos": 19463799450240.0, - "grad_norm": 1.8372541511316505, - "language_loss": 0.72750449, - "learning_rate": 1.2335987852004985e-06, - "loss": 0.74878752, - "num_input_tokens_seen": 228222430, - "step": 10585, - "time_per_iteration": 2.7207906246185303 - }, - { - "auxiliary_loss_clip": 0.01078843, - "auxiliary_loss_mlp": 0.01028745, - "balance_loss_clip": 1.03947806, - "balance_loss_mlp": 1.01638353, - "epoch": 0.6364647527431234, - "flos": 20995892208000.0, - "grad_norm": 1.8754451190030996, - "language_loss": 0.82982284, - "learning_rate": 1.2332390675544697e-06, - "loss": 0.85089874, - "num_input_tokens_seen": 228241925, - "step": 10586, - "time_per_iteration": 2.883169174194336 - }, - { - "auxiliary_loss_clip": 0.01104026, - "auxiliary_loss_mlp": 0.01024669, - "balance_loss_clip": 1.04210103, - "balance_loss_mlp": 1.01253915, - "epoch": 0.6365248759957913, - "flos": 25770789838080.0, - "grad_norm": 2.4347749012599382, - "language_loss": 0.72591609, - "learning_rate": 1.2328793789840918e-06, - "loss": 0.74720299, - "num_input_tokens_seen": 228262535, - "step": 10587, - "time_per_iteration": 2.696120500564575 - }, - { - "auxiliary_loss_clip": 0.01095392, - "auxiliary_loss_mlp": 0.01030465, - "balance_loss_clip": 1.04264998, - "balance_loss_mlp": 1.01770997, - "epoch": 0.6365849992484593, - "flos": 22455589104000.0, - "grad_norm": 2.0432270596750395, - "language_loss": 0.77210999, - "learning_rate": 1.2325197195030058e-06, - "loss": 0.79336858, - "num_input_tokens_seen": 228281340, - "step": 10588, - "time_per_iteration": 2.7811734676361084 - }, - { - "auxiliary_loss_clip": 0.0106633, - "auxiliary_loss_mlp": 0.01028817, - "balance_loss_clip": 1.03860903, - "balance_loss_mlp": 1.0154599, - "epoch": 0.6366451225011273, - "flos": 19025689265280.0, - "grad_norm": 1.4710865244749312, - "language_loss": 0.79949176, - "learning_rate": 1.2321600891248478e-06, - "loss": 0.82044327, - "num_input_tokens_seen": 228300865, - "step": 10589, - "time_per_iteration": 2.8011467456817627 - }, - { - "auxiliary_loss_clip": 0.01093718, - "auxiliary_loss_mlp": 0.01032855, - "balance_loss_clip": 1.03902805, - "balance_loss_mlp": 1.02014768, - "epoch": 0.6367052457537953, - "flos": 25228395492480.0, - "grad_norm": 2.226066060883624, - "language_loss": 0.67151499, - "learning_rate": 1.231800487863257e-06, - "loss": 0.69278073, - "num_input_tokens_seen": 228320815, - "step": 10590, - "time_per_iteration": 2.709080934524536 - }, - { - "auxiliary_loss_clip": 0.01111263, - "auxiliary_loss_mlp": 0.01033159, - "balance_loss_clip": 1.04165292, - "balance_loss_mlp": 1.01980138, - "epoch": 0.6367653690064633, - "flos": 19208438686080.0, - "grad_norm": 2.18709267526875, - "language_loss": 0.78891504, - "learning_rate": 1.2314409157318685e-06, - "loss": 0.81035924, - "num_input_tokens_seen": 228339065, - "step": 10591, - "time_per_iteration": 2.636992931365967 - }, - { - "auxiliary_loss_clip": 0.01092014, - "auxiliary_loss_mlp": 0.01028959, - "balance_loss_clip": 1.04065537, - "balance_loss_mlp": 1.01711535, - "epoch": 0.6368254922591312, - "flos": 23546806329600.0, - "grad_norm": 1.430576733389061, - "language_loss": 0.89153397, - "learning_rate": 1.231081372744317e-06, - "loss": 0.91274369, - "num_input_tokens_seen": 228359210, - "step": 10592, - "time_per_iteration": 2.7107973098754883 - }, - { - "auxiliary_loss_clip": 0.01099214, - "auxiliary_loss_mlp": 0.01027902, - "balance_loss_clip": 1.03750551, - "balance_loss_mlp": 1.01598144, - "epoch": 0.6368856155117992, - "flos": 26467313443200.0, - "grad_norm": 1.4034572445207882, - "language_loss": 0.68212253, - "learning_rate": 1.2307218589142376e-06, - "loss": 0.7033937, - "num_input_tokens_seen": 228379630, - "step": 10593, - "time_per_iteration": 2.807321786880493 - }, - { - "auxiliary_loss_clip": 0.01061371, - "auxiliary_loss_mlp": 0.01042752, - "balance_loss_clip": 1.03203607, - "balance_loss_mlp": 1.02891731, - "epoch": 0.6369457387644671, - "flos": 33692432394240.0, - "grad_norm": 1.761330533007529, - "language_loss": 0.63678664, - "learning_rate": 1.2303623742552618e-06, - "loss": 0.65782785, - "num_input_tokens_seen": 228401410, - "step": 10594, - "time_per_iteration": 2.856600046157837 - }, - { - "auxiliary_loss_clip": 0.01023648, - "auxiliary_loss_mlp": 0.01001204, - "balance_loss_clip": 1.01176047, - "balance_loss_mlp": 0.99982756, - "epoch": 0.6370058620171352, - "flos": 70908600908160.0, - "grad_norm": 0.7623002997880329, - "language_loss": 0.54635006, - "learning_rate": 1.230002918781022e-06, - "loss": 0.56659859, - "num_input_tokens_seen": 228470335, - "step": 10595, - "time_per_iteration": 3.2980732917785645 - }, - { - "auxiliary_loss_clip": 0.01118729, - "auxiliary_loss_mlp": 0.01042081, - "balance_loss_clip": 1.04251242, - "balance_loss_mlp": 1.02855635, - "epoch": 0.6370659852698031, - "flos": 21141940907520.0, - "grad_norm": 2.0781706076151445, - "language_loss": 0.67100823, - "learning_rate": 1.2296434925051493e-06, - "loss": 0.69261628, - "num_input_tokens_seen": 228490765, - "step": 10596, - "time_per_iteration": 2.6011126041412354 - }, - { - "auxiliary_loss_clip": 0.01099686, - "auxiliary_loss_mlp": 0.01037794, - "balance_loss_clip": 1.04006338, - "balance_loss_mlp": 1.02463365, - "epoch": 0.6371261085224711, - "flos": 20193288762240.0, - "grad_norm": 2.011756808968462, - "language_loss": 0.7937991, - "learning_rate": 1.2292840954412718e-06, - "loss": 0.81517392, - "num_input_tokens_seen": 228509700, - "step": 10597, - "time_per_iteration": 2.6972439289093018 - }, - { - "auxiliary_loss_clip": 0.01108387, - "auxiliary_loss_mlp": 0.01037543, - "balance_loss_clip": 1.04363835, - "balance_loss_mlp": 1.02541316, - "epoch": 0.637186231775139, - "flos": 19683536901120.0, - "grad_norm": 1.60919791295429, - "language_loss": 0.74850726, - "learning_rate": 1.2289247276030189e-06, - "loss": 0.76996648, - "num_input_tokens_seen": 228529050, - "step": 10598, - "time_per_iteration": 2.6332266330718994 - }, - { - "auxiliary_loss_clip": 0.01084454, - "auxiliary_loss_mlp": 0.00771297, - "balance_loss_clip": 1.03999043, - "balance_loss_mlp": 1.00013983, - "epoch": 0.637246355027807, - "flos": 13071196995840.0, - "grad_norm": 1.9548116793493355, - "language_loss": 0.68556929, - "learning_rate": 1.2285653890040176e-06, - "loss": 0.70412678, - "num_input_tokens_seen": 228544665, - "step": 10599, - "time_per_iteration": 2.6878466606140137 - }, - { - "auxiliary_loss_clip": 0.01077983, - "auxiliary_loss_mlp": 0.01031504, - "balance_loss_clip": 1.03724337, - "balance_loss_mlp": 1.01745534, - "epoch": 0.6373064782804749, - "flos": 18222654856320.0, - "grad_norm": 2.0583135447897933, - "language_loss": 0.80303937, - "learning_rate": 1.2282060796578942e-06, - "loss": 0.82413423, - "num_input_tokens_seen": 228562060, - "step": 10600, - "time_per_iteration": 2.653907060623169 - }, - { - "auxiliary_loss_clip": 0.01101937, - "auxiliary_loss_mlp": 0.01036294, - "balance_loss_clip": 1.03776395, - "balance_loss_mlp": 1.02380645, - "epoch": 0.637366601533143, - "flos": 24498475217280.0, - "grad_norm": 1.4639641102491714, - "language_loss": 0.79828721, - "learning_rate": 1.2278467995782732e-06, - "loss": 0.81966954, - "num_input_tokens_seen": 228582550, - "step": 10601, - "time_per_iteration": 2.797588586807251 - }, - { - "auxiliary_loss_clip": 0.01085997, - "auxiliary_loss_mlp": 0.01032519, - "balance_loss_clip": 1.04335141, - "balance_loss_mlp": 1.01989436, - "epoch": 0.6374267247858109, - "flos": 26359042872960.0, - "grad_norm": 2.3452009289064737, - "language_loss": 0.6766789, - "learning_rate": 1.2274875487787797e-06, - "loss": 0.69786406, - "num_input_tokens_seen": 228604960, - "step": 10602, - "time_per_iteration": 2.742664098739624 - }, - { - "auxiliary_loss_clip": 0.01037986, - "auxiliary_loss_mlp": 0.01033695, - "balance_loss_clip": 1.03193176, - "balance_loss_mlp": 1.02034974, - "epoch": 0.6374868480384789, - "flos": 20371728551040.0, - "grad_norm": 2.210099504390163, - "language_loss": 0.79618657, - "learning_rate": 1.2271283272730354e-06, - "loss": 0.81690341, - "num_input_tokens_seen": 228622195, - "step": 10603, - "time_per_iteration": 2.8134090900421143 - }, - { - "auxiliary_loss_clip": 0.0107315, - "auxiliary_loss_mlp": 0.00770892, - "balance_loss_clip": 1.03933704, - "balance_loss_mlp": 1.00014615, - "epoch": 0.6375469712911469, - "flos": 20996251344000.0, - "grad_norm": 1.8573318102619591, - "language_loss": 0.76802522, - "learning_rate": 1.2267691350746621e-06, - "loss": 0.78646559, - "num_input_tokens_seen": 228639735, - "step": 10604, - "time_per_iteration": 2.7761478424072266 - }, - { - "auxiliary_loss_clip": 0.01095415, - "auxiliary_loss_mlp": 0.01031172, - "balance_loss_clip": 1.03836191, - "balance_loss_mlp": 1.01792753, - "epoch": 0.6376070945438148, - "flos": 19715748422400.0, - "grad_norm": 1.6662789413728705, - "language_loss": 0.76640069, - "learning_rate": 1.226409972197281e-06, - "loss": 0.78766656, - "num_input_tokens_seen": 228658195, - "step": 10605, - "time_per_iteration": 4.650303602218628 - }, - { - "auxiliary_loss_clip": 0.01057897, - "auxiliary_loss_mlp": 0.01038795, - "balance_loss_clip": 1.03824091, - "balance_loss_mlp": 1.02234411, - "epoch": 0.6376672177964828, - "flos": 21506757390720.0, - "grad_norm": 1.7802518386545212, - "language_loss": 0.65565449, - "learning_rate": 1.2260508386545106e-06, - "loss": 0.67662132, - "num_input_tokens_seen": 228677415, - "step": 10606, - "time_per_iteration": 2.8175783157348633 - }, - { - "auxiliary_loss_clip": 0.01090718, - "auxiliary_loss_mlp": 0.01037026, - "balance_loss_clip": 1.04083657, - "balance_loss_mlp": 1.02489638, - "epoch": 0.6377273410491507, - "flos": 18843873598080.0, - "grad_norm": 1.601218417819437, - "language_loss": 0.75069982, - "learning_rate": 1.225691734459971e-06, - "loss": 0.77197731, - "num_input_tokens_seen": 228696450, - "step": 10607, - "time_per_iteration": 2.6365914344787598 - }, - { - "auxiliary_loss_clip": 0.01091801, - "auxiliary_loss_mlp": 0.01037938, - "balance_loss_clip": 1.04039049, - "balance_loss_mlp": 1.02553403, - "epoch": 0.6377874643018188, - "flos": 53062970181120.0, - "grad_norm": 1.5840122270167216, - "language_loss": 0.65928984, - "learning_rate": 1.225332659627278e-06, - "loss": 0.68058717, - "num_input_tokens_seen": 228721600, - "step": 10608, - "time_per_iteration": 4.558081150054932 - }, - { - "auxiliary_loss_clip": 0.00982544, - "auxiliary_loss_mlp": 0.01007387, - "balance_loss_clip": 1.01596785, - "balance_loss_mlp": 1.00617146, - "epoch": 0.6378475875544867, - "flos": 65135026465920.0, - "grad_norm": 0.7133010996130292, - "language_loss": 0.51879215, - "learning_rate": 1.2249736141700475e-06, - "loss": 0.53869152, - "num_input_tokens_seen": 228784535, - "step": 10609, - "time_per_iteration": 3.3632545471191406 - }, - { - "auxiliary_loss_clip": 0.0109935, - "auxiliary_loss_mlp": 0.01025243, - "balance_loss_clip": 1.03736722, - "balance_loss_mlp": 1.01379943, - "epoch": 0.6379077108071547, - "flos": 23002759958400.0, - "grad_norm": 1.6332455111471063, - "language_loss": 0.74713194, - "learning_rate": 1.2246145981018965e-06, - "loss": 0.7683779, - "num_input_tokens_seen": 228804110, - "step": 10610, - "time_per_iteration": 3.2196428775787354 - }, - { - "auxiliary_loss_clip": 0.0101651, - "auxiliary_loss_mlp": 0.0100476, - "balance_loss_clip": 1.01297092, - "balance_loss_mlp": 1.00353765, - "epoch": 0.6379678340598226, - "flos": 67601947610880.0, - "grad_norm": 0.8493432056950548, - "language_loss": 0.63061231, - "learning_rate": 1.2242556114364364e-06, - "loss": 0.65082502, - "num_input_tokens_seen": 228867705, - "step": 10611, - "time_per_iteration": 3.272512435913086 - }, - { - "auxiliary_loss_clip": 0.01103402, - "auxiliary_loss_mlp": 0.01034119, - "balance_loss_clip": 1.04139113, - "balance_loss_mlp": 1.0207442, - "epoch": 0.6380279573124906, - "flos": 29680061610240.0, - "grad_norm": 1.8312259315457267, - "language_loss": 0.72302759, - "learning_rate": 1.223896654187282e-06, - "loss": 0.74440277, - "num_input_tokens_seen": 228889215, - "step": 10612, - "time_per_iteration": 2.7299270629882812 - }, - { - "auxiliary_loss_clip": 0.01015421, - "auxiliary_loss_mlp": 0.0100432, - "balance_loss_clip": 1.0106107, - "balance_loss_mlp": 1.00311053, - "epoch": 0.6380880805651585, - "flos": 66484046580480.0, - "grad_norm": 0.7098749409658618, - "language_loss": 0.57844174, - "learning_rate": 1.2235377263680446e-06, - "loss": 0.59863913, - "num_input_tokens_seen": 228948465, - "step": 10613, - "time_per_iteration": 4.943511009216309 - }, - { - "auxiliary_loss_clip": 0.01071494, - "auxiliary_loss_mlp": 0.01035158, - "balance_loss_clip": 1.03659904, - "balance_loss_mlp": 1.02168155, - "epoch": 0.6381482038178266, - "flos": 23914998691200.0, - "grad_norm": 1.7198956941454036, - "language_loss": 0.75381726, - "learning_rate": 1.2231788279923334e-06, - "loss": 0.77488375, - "num_input_tokens_seen": 228967955, - "step": 10614, - "time_per_iteration": 2.8167922496795654 - }, - { - "auxiliary_loss_clip": 0.01094834, - "auxiliary_loss_mlp": 0.00770691, - "balance_loss_clip": 1.04056311, - "balance_loss_mlp": 1.00018597, - "epoch": 0.6382083270704945, - "flos": 24243042625920.0, - "grad_norm": 1.8795242058434967, - "language_loss": 0.79825491, - "learning_rate": 1.2228199590737599e-06, - "loss": 0.81691015, - "num_input_tokens_seen": 228985495, - "step": 10615, - "time_per_iteration": 2.769399642944336 - }, - { - "auxiliary_loss_clip": 0.01013557, - "auxiliary_loss_mlp": 0.01001876, - "balance_loss_clip": 1.01154137, - "balance_loss_mlp": 1.00048769, - "epoch": 0.6382684503231625, - "flos": 70775552931840.0, - "grad_norm": 0.6556730902042093, - "language_loss": 0.55564505, - "learning_rate": 1.2224611196259305e-06, - "loss": 0.57579941, - "num_input_tokens_seen": 229052995, - "step": 10616, - "time_per_iteration": 3.277085542678833 - }, - { - "auxiliary_loss_clip": 0.01086789, - "auxiliary_loss_mlp": 0.01036908, - "balance_loss_clip": 1.0364368, - "balance_loss_mlp": 1.0233475, - "epoch": 0.6383285735758305, - "flos": 16544836621440.0, - "grad_norm": 1.9142103073146424, - "language_loss": 0.83900499, - "learning_rate": 1.2221023096624538e-06, - "loss": 0.86024189, - "num_input_tokens_seen": 229071030, - "step": 10617, - "time_per_iteration": 2.712834119796753 - }, - { - "auxiliary_loss_clip": 0.0110772, - "auxiliary_loss_mlp": 0.0104261, - "balance_loss_clip": 1.04189885, - "balance_loss_mlp": 1.02821589, - "epoch": 0.6383886968284984, - "flos": 14427651225600.0, - "grad_norm": 1.8904429928249138, - "language_loss": 0.87499708, - "learning_rate": 1.221743529196936e-06, - "loss": 0.89650035, - "num_input_tokens_seen": 229088275, - "step": 10618, - "time_per_iteration": 2.6345932483673096 - }, - { - "auxiliary_loss_clip": 0.01068321, - "auxiliary_loss_mlp": 0.01032015, - "balance_loss_clip": 1.04150379, - "balance_loss_mlp": 1.02012992, - "epoch": 0.6384488200811664, - "flos": 17929659617280.0, - "grad_norm": 1.7304686428232843, - "language_loss": 0.73287666, - "learning_rate": 1.2213847782429806e-06, - "loss": 0.75388002, - "num_input_tokens_seen": 229105190, - "step": 10619, - "time_per_iteration": 2.777869701385498 - }, - { - "auxiliary_loss_clip": 0.0109667, - "auxiliary_loss_mlp": 0.01037459, - "balance_loss_clip": 1.04080129, - "balance_loss_mlp": 1.02271247, - "epoch": 0.6385089433338343, - "flos": 18515578268160.0, - "grad_norm": 1.9267832317981652, - "language_loss": 0.76312691, - "learning_rate": 1.221026056814193e-06, - "loss": 0.78446817, - "num_input_tokens_seen": 229122290, - "step": 10620, - "time_per_iteration": 2.701122760772705 - }, - { - "auxiliary_loss_clip": 0.01093794, - "auxiliary_loss_mlp": 0.01029286, - "balance_loss_clip": 1.04239035, - "balance_loss_mlp": 1.01672101, - "epoch": 0.6385690665865024, - "flos": 24753620499840.0, - "grad_norm": 2.5441546745937114, - "language_loss": 0.70669818, - "learning_rate": 1.2206673649241752e-06, - "loss": 0.727929, - "num_input_tokens_seen": 229141620, - "step": 10621, - "time_per_iteration": 2.7129428386688232 - }, - { - "auxiliary_loss_clip": 0.01085349, - "auxiliary_loss_mlp": 0.0102653, - "balance_loss_clip": 1.03596258, - "balance_loss_mlp": 1.01482916, - "epoch": 0.6386291898391703, - "flos": 20120569678080.0, - "grad_norm": 1.616578696475536, - "language_loss": 0.77862823, - "learning_rate": 1.220308702586529e-06, - "loss": 0.79974699, - "num_input_tokens_seen": 229161570, - "step": 10622, - "time_per_iteration": 2.722543954849243 - }, - { - "auxiliary_loss_clip": 0.01075591, - "auxiliary_loss_mlp": 0.01030912, - "balance_loss_clip": 1.03845859, - "balance_loss_mlp": 1.01837754, - "epoch": 0.6386893130918383, - "flos": 16867278034560.0, - "grad_norm": 1.771071416148221, - "language_loss": 0.74746549, - "learning_rate": 1.2199500698148546e-06, - "loss": 0.76853049, - "num_input_tokens_seen": 229178465, - "step": 10623, - "time_per_iteration": 2.728158712387085 - }, - { - "auxiliary_loss_clip": 0.0109049, - "auxiliary_loss_mlp": 0.01029194, - "balance_loss_clip": 1.03953004, - "balance_loss_mlp": 1.01796472, - "epoch": 0.6387494363445062, - "flos": 22966274718720.0, - "grad_norm": 1.3721054330124807, - "language_loss": 0.76588684, - "learning_rate": 1.2195914666227527e-06, - "loss": 0.78708369, - "num_input_tokens_seen": 229198975, - "step": 10624, - "time_per_iteration": 2.833406925201416 - }, - { - "auxiliary_loss_clip": 0.0105041, - "auxiliary_loss_mlp": 0.0103608, - "balance_loss_clip": 1.03588271, - "balance_loss_mlp": 1.02247274, - "epoch": 0.6388095595971742, - "flos": 22857716839680.0, - "grad_norm": 1.873995828783276, - "language_loss": 0.80408549, - "learning_rate": 1.21923289302382e-06, - "loss": 0.82495034, - "num_input_tokens_seen": 229218825, - "step": 10625, - "time_per_iteration": 2.810683488845825 - }, - { - "auxiliary_loss_clip": 0.01094331, - "auxiliary_loss_mlp": 0.01033337, - "balance_loss_clip": 1.04317892, - "balance_loss_mlp": 1.02039063, - "epoch": 0.6388696828498421, - "flos": 17311529445120.0, - "grad_norm": 1.9242726484746675, - "language_loss": 0.72490007, - "learning_rate": 1.218874349031654e-06, - "loss": 0.74617672, - "num_input_tokens_seen": 229236060, - "step": 10626, - "time_per_iteration": 2.667686939239502 - }, - { - "auxiliary_loss_clip": 0.01093032, - "auxiliary_loss_mlp": 0.01033337, - "balance_loss_clip": 1.03836656, - "balance_loss_mlp": 1.02036738, - "epoch": 0.6389298061025102, - "flos": 17128636369920.0, - "grad_norm": 1.8547762721762564, - "language_loss": 0.72446245, - "learning_rate": 1.2185158346598517e-06, - "loss": 0.74572611, - "num_input_tokens_seen": 229255160, - "step": 10627, - "time_per_iteration": 2.681147575378418 - }, - { - "auxiliary_loss_clip": 0.01095264, - "auxiliary_loss_mlp": 0.01034256, - "balance_loss_clip": 1.04398704, - "balance_loss_mlp": 1.01995111, - "epoch": 0.6389899293551781, - "flos": 27710971989120.0, - "grad_norm": 1.6812239823438198, - "language_loss": 0.67369878, - "learning_rate": 1.2181573499220064e-06, - "loss": 0.69499397, - "num_input_tokens_seen": 229278705, - "step": 10628, - "time_per_iteration": 2.7938716411590576 - }, - { - "auxiliary_loss_clip": 0.0111173, - "auxiliary_loss_mlp": 0.01029902, - "balance_loss_clip": 1.04083705, - "balance_loss_mlp": 1.01804066, - "epoch": 0.6390500526078461, - "flos": 21215701486080.0, - "grad_norm": 1.7139884939852632, - "language_loss": 0.68161869, - "learning_rate": 1.2177988948317135e-06, - "loss": 0.703035, - "num_input_tokens_seen": 229299990, - "step": 10629, - "time_per_iteration": 2.644061803817749 - }, - { - "auxiliary_loss_clip": 0.01079014, - "auxiliary_loss_mlp": 0.01040793, - "balance_loss_clip": 1.03948665, - "balance_loss_mlp": 1.02554584, - "epoch": 0.6391101758605141, - "flos": 21581056673280.0, - "grad_norm": 1.5487398291576047, - "language_loss": 0.75722307, - "learning_rate": 1.2174404694025646e-06, - "loss": 0.77842116, - "num_input_tokens_seen": 229319230, - "step": 10630, - "time_per_iteration": 2.7381680011749268 - }, - { - "auxiliary_loss_clip": 0.01089485, - "auxiliary_loss_mlp": 0.01035881, - "balance_loss_clip": 1.03773403, - "balance_loss_mlp": 1.02401352, - "epoch": 0.639170299113182, - "flos": 19900473091200.0, - "grad_norm": 1.4699321095065776, - "language_loss": 0.7028895, - "learning_rate": 1.2170820736481511e-06, - "loss": 0.72414321, - "num_input_tokens_seen": 229338600, - "step": 10631, - "time_per_iteration": 2.76301908493042 - }, - { - "auxiliary_loss_clip": 0.01010735, - "auxiliary_loss_mlp": 0.01020885, - "balance_loss_clip": 1.00987029, - "balance_loss_mlp": 1.01946056, - "epoch": 0.63923042236585, - "flos": 69877604833920.0, - "grad_norm": 1.2867788563374962, - "language_loss": 0.62960958, - "learning_rate": 1.2167237075820646e-06, - "loss": 0.64992577, - "num_input_tokens_seen": 229402420, - "step": 10632, - "time_per_iteration": 3.23628306388855 - }, - { - "auxiliary_loss_clip": 0.01092617, - "auxiliary_loss_mlp": 0.01034269, - "balance_loss_clip": 1.04134142, - "balance_loss_mlp": 1.02143598, - "epoch": 0.639290545618518, - "flos": 22674823764480.0, - "grad_norm": 11.316815321652387, - "language_loss": 0.66998363, - "learning_rate": 1.216365371217893e-06, - "loss": 0.69125253, - "num_input_tokens_seen": 229419185, - "step": 10633, - "time_per_iteration": 2.719403028488159 - }, - { - "auxiliary_loss_clip": 0.01051248, - "auxiliary_loss_mlp": 0.01028599, - "balance_loss_clip": 1.04067874, - "balance_loss_mlp": 1.01645792, - "epoch": 0.639350668871186, - "flos": 19829190551040.0, - "grad_norm": 2.281228369443932, - "language_loss": 0.81935, - "learning_rate": 1.216007064569225e-06, - "loss": 0.84014845, - "num_input_tokens_seen": 229436735, - "step": 10634, - "time_per_iteration": 2.8779945373535156 - }, - { - "auxiliary_loss_clip": 0.01089506, - "auxiliary_loss_mlp": 0.01036012, - "balance_loss_clip": 1.0404712, - "balance_loss_mlp": 1.02211165, - "epoch": 0.6394107921238539, - "flos": 20553328736640.0, - "grad_norm": 1.5224758560315717, - "language_loss": 0.74918383, - "learning_rate": 1.2156487876496483e-06, - "loss": 0.77043903, - "num_input_tokens_seen": 229455595, - "step": 10635, - "time_per_iteration": 2.7275381088256836 - }, - { - "auxiliary_loss_clip": 0.0110297, - "auxiliary_loss_mlp": 0.01033837, - "balance_loss_clip": 1.04365182, - "balance_loss_mlp": 1.02071238, - "epoch": 0.6394709153765219, - "flos": 25774991729280.0, - "grad_norm": 1.6416528841405902, - "language_loss": 0.71164483, - "learning_rate": 1.2152905404727475e-06, - "loss": 0.73301286, - "num_input_tokens_seen": 229476230, - "step": 10636, - "time_per_iteration": 2.6989855766296387 - }, - { - "auxiliary_loss_clip": 0.0109626, - "auxiliary_loss_mlp": 0.01037788, - "balance_loss_clip": 1.04154992, - "balance_loss_mlp": 1.02471662, - "epoch": 0.6395310386291898, - "flos": 17530153574400.0, - "grad_norm": 1.863216274856941, - "language_loss": 0.73810291, - "learning_rate": 1.2149323230521085e-06, - "loss": 0.7594434, - "num_input_tokens_seen": 229494300, - "step": 10637, - "time_per_iteration": 2.7064554691314697 - }, - { - "auxiliary_loss_clip": 0.01102986, - "auxiliary_loss_mlp": 0.01035332, - "balance_loss_clip": 1.04232454, - "balance_loss_mlp": 1.0214324, - "epoch": 0.6395911618818578, - "flos": 18588225525120.0, - "grad_norm": 1.8583759044592125, - "language_loss": 0.77674294, - "learning_rate": 1.2145741354013143e-06, - "loss": 0.7981261, - "num_input_tokens_seen": 229512985, - "step": 10638, - "time_per_iteration": 2.742272138595581 - }, - { - "auxiliary_loss_clip": 0.01092544, - "auxiliary_loss_mlp": 0.01035401, - "balance_loss_clip": 1.039186, - "balance_loss_mlp": 1.02218056, - "epoch": 0.6396512851345257, - "flos": 28366557068160.0, - "grad_norm": 1.7706841809309422, - "language_loss": 0.81434906, - "learning_rate": 1.2142159775339478e-06, - "loss": 0.83562851, - "num_input_tokens_seen": 229534270, - "step": 10639, - "time_per_iteration": 2.7076473236083984 - }, - { - "auxiliary_loss_clip": 0.0101793, - "auxiliary_loss_mlp": 0.0099976, - "balance_loss_clip": 1.01366258, - "balance_loss_mlp": 0.9985556, - "epoch": 0.6397114083871938, - "flos": 70724307202560.0, - "grad_norm": 0.8066832194631076, - "language_loss": 0.58980644, - "learning_rate": 1.21385784946359e-06, - "loss": 0.60998333, - "num_input_tokens_seen": 229596455, - "step": 10640, - "time_per_iteration": 3.175328254699707 - }, - { - "auxiliary_loss_clip": 0.01081778, - "auxiliary_loss_mlp": 0.01030879, - "balance_loss_clip": 1.03485847, - "balance_loss_mlp": 1.01876175, - "epoch": 0.6397715316398617, - "flos": 18142537570560.0, - "grad_norm": 1.8250663746988522, - "language_loss": 0.78291178, - "learning_rate": 1.2134997512038215e-06, - "loss": 0.80403835, - "num_input_tokens_seen": 229612860, - "step": 10641, - "time_per_iteration": 2.6736910343170166 - }, - { - "auxiliary_loss_clip": 0.01069736, - "auxiliary_loss_mlp": 0.01041571, - "balance_loss_clip": 1.03781104, - "balance_loss_mlp": 1.02828479, - "epoch": 0.6398316548925297, - "flos": 25739512070400.0, - "grad_norm": 1.5814049726496198, - "language_loss": 0.63194126, - "learning_rate": 1.2131416827682209e-06, - "loss": 0.65305436, - "num_input_tokens_seen": 229633960, - "step": 10642, - "time_per_iteration": 2.840916156768799 - }, - { - "auxiliary_loss_clip": 0.01004085, - "auxiliary_loss_mlp": 0.01008093, - "balance_loss_clip": 1.00885439, - "balance_loss_mlp": 1.00666296, - "epoch": 0.6398917781451977, - "flos": 71214234756480.0, - "grad_norm": 0.9138015475084418, - "language_loss": 0.55936515, - "learning_rate": 1.2127836441703667e-06, - "loss": 0.57948697, - "num_input_tokens_seen": 229686730, - "step": 10643, - "time_per_iteration": 3.134157419204712 - }, - { - "auxiliary_loss_clip": 0.01082549, - "auxiliary_loss_mlp": 0.01028613, - "balance_loss_clip": 1.03844333, - "balance_loss_mlp": 1.01577973, - "epoch": 0.6399519013978656, - "flos": 20521835487360.0, - "grad_norm": 2.4755783411685055, - "language_loss": 0.76844835, - "learning_rate": 1.2124256354238358e-06, - "loss": 0.78955996, - "num_input_tokens_seen": 229704800, - "step": 10644, - "time_per_iteration": 2.750016212463379 - }, - { - "auxiliary_loss_clip": 0.01083772, - "auxiliary_loss_mlp": 0.0103714, - "balance_loss_clip": 1.04259241, - "balance_loss_mlp": 1.02343059, - "epoch": 0.6400120246505336, - "flos": 24460840742400.0, - "grad_norm": 1.476966637211995, - "language_loss": 0.82139534, - "learning_rate": 1.212067656542203e-06, - "loss": 0.84260446, - "num_input_tokens_seen": 229725265, - "step": 10645, - "time_per_iteration": 4.434756755828857 - }, - { - "auxiliary_loss_clip": 0.01108206, - "auxiliary_loss_mlp": 0.01043381, - "balance_loss_clip": 1.0400579, - "balance_loss_mlp": 1.02844369, - "epoch": 0.6400721479032015, - "flos": 28366090191360.0, - "grad_norm": 1.9873684481859661, - "language_loss": 0.73491621, - "learning_rate": 1.2117097075390447e-06, - "loss": 0.75643206, - "num_input_tokens_seen": 229744840, - "step": 10646, - "time_per_iteration": 2.790422201156616 - }, - { - "auxiliary_loss_clip": 0.01076409, - "auxiliary_loss_mlp": 0.01036032, - "balance_loss_clip": 1.037462, - "balance_loss_mlp": 1.02220368, - "epoch": 0.6401322711558696, - "flos": 17816540711040.0, - "grad_norm": 2.1141413827607227, - "language_loss": 0.79825467, - "learning_rate": 1.2113517884279327e-06, - "loss": 0.81937909, - "num_input_tokens_seen": 229759095, - "step": 10647, - "time_per_iteration": 6.299994707107544 - }, - { - "auxiliary_loss_clip": 0.0106918, - "auxiliary_loss_mlp": 0.01033575, - "balance_loss_clip": 1.03744197, - "balance_loss_mlp": 1.02105761, - "epoch": 0.6401923944085375, - "flos": 26030855283840.0, - "grad_norm": 1.5992559976065106, - "language_loss": 0.75935119, - "learning_rate": 1.2109938992224399e-06, - "loss": 0.7803787, - "num_input_tokens_seen": 229777750, - "step": 10648, - "time_per_iteration": 2.823535680770874 - }, - { - "auxiliary_loss_clip": 0.01088631, - "auxiliary_loss_mlp": 0.01035586, - "balance_loss_clip": 1.03901458, - "balance_loss_mlp": 1.02278256, - "epoch": 0.6402525176612055, - "flos": 23586451966080.0, - "grad_norm": 3.2506814778416566, - "language_loss": 0.78615916, - "learning_rate": 1.210636039936138e-06, - "loss": 0.80740136, - "num_input_tokens_seen": 229796785, - "step": 10649, - "time_per_iteration": 2.7334954738616943 - }, - { - "auxiliary_loss_clip": 0.01058756, - "auxiliary_loss_mlp": 0.01037312, - "balance_loss_clip": 1.03965068, - "balance_loss_mlp": 1.02403259, - "epoch": 0.6403126409138734, - "flos": 18041413806720.0, - "grad_norm": 4.7583637580681515, - "language_loss": 0.75450838, - "learning_rate": 1.2102782105825956e-06, - "loss": 0.77546906, - "num_input_tokens_seen": 229815425, - "step": 10650, - "time_per_iteration": 2.834925651550293 - }, - { - "auxiliary_loss_clip": 0.01114658, - "auxiliary_loss_mlp": 0.01038182, - "balance_loss_clip": 1.04058218, - "balance_loss_mlp": 1.02501488, - "epoch": 0.6403727641665414, - "flos": 21979485308160.0, - "grad_norm": 1.5877577982319235, - "language_loss": 0.7111091, - "learning_rate": 1.2099204111753833e-06, - "loss": 0.73263752, - "num_input_tokens_seen": 229834545, - "step": 10651, - "time_per_iteration": 2.599517345428467 - }, - { - "auxiliary_loss_clip": 0.01082313, - "auxiliary_loss_mlp": 0.01041331, - "balance_loss_clip": 1.03811073, - "balance_loss_mlp": 1.02803898, - "epoch": 0.6404328874192093, - "flos": 24895539135360.0, - "grad_norm": 2.6398543727492494, - "language_loss": 0.63837707, - "learning_rate": 1.2095626417280684e-06, - "loss": 0.65961355, - "num_input_tokens_seen": 229849175, - "step": 10652, - "time_per_iteration": 4.367003679275513 - }, - { - "auxiliary_loss_clip": 0.0109017, - "auxiliary_loss_mlp": 0.01029675, - "balance_loss_clip": 1.03734291, - "balance_loss_mlp": 1.01728261, - "epoch": 0.6404930106718774, - "flos": 17597198309760.0, - "grad_norm": 2.0413197407443247, - "language_loss": 0.79417443, - "learning_rate": 1.2092049022542168e-06, - "loss": 0.81537288, - "num_input_tokens_seen": 229865400, - "step": 10653, - "time_per_iteration": 2.672642707824707 - }, - { - "auxiliary_loss_clip": 0.01089835, - "auxiliary_loss_mlp": 0.01057293, - "balance_loss_clip": 1.03523707, - "balance_loss_mlp": 1.04088974, - "epoch": 0.6405531339245453, - "flos": 20157880930560.0, - "grad_norm": 2.1735639110567884, - "language_loss": 0.70573318, - "learning_rate": 1.2088471927673952e-06, - "loss": 0.72720444, - "num_input_tokens_seen": 229882945, - "step": 10654, - "time_per_iteration": 2.6905150413513184 - }, - { - "auxiliary_loss_clip": 0.01109265, - "auxiliary_loss_mlp": 0.01041023, - "balance_loss_clip": 1.04214334, - "balance_loss_mlp": 1.02721834, - "epoch": 0.6406132571772133, - "flos": 21942281796480.0, - "grad_norm": 1.704852134606112, - "language_loss": 0.73023099, - "learning_rate": 1.2084895132811666e-06, - "loss": 0.75173384, - "num_input_tokens_seen": 229901590, - "step": 10655, - "time_per_iteration": 2.6235902309417725 - }, - { - "auxiliary_loss_clip": 0.01082305, - "auxiliary_loss_mlp": 0.01040345, - "balance_loss_clip": 1.04245615, - "balance_loss_mlp": 1.0268271, - "epoch": 0.6406733804298813, - "flos": 28768002445440.0, - "grad_norm": 1.5348114269310231, - "language_loss": 0.82592511, - "learning_rate": 1.2081318638090952e-06, - "loss": 0.84715158, - "num_input_tokens_seen": 229922535, - "step": 10656, - "time_per_iteration": 2.786027193069458 - }, - { - "auxiliary_loss_clip": 0.01057312, - "auxiliary_loss_mlp": 0.01037289, - "balance_loss_clip": 1.034778, - "balance_loss_mlp": 1.02465284, - "epoch": 0.6407335036825492, - "flos": 17457183095040.0, - "grad_norm": 2.2686127713919566, - "language_loss": 0.72339928, - "learning_rate": 1.2077742443647433e-06, - "loss": 0.74434525, - "num_input_tokens_seen": 229939575, - "step": 10657, - "time_per_iteration": 2.7300093173980713 - }, - { - "auxiliary_loss_clip": 0.01080913, - "auxiliary_loss_mlp": 0.01039634, - "balance_loss_clip": 1.03770339, - "balance_loss_mlp": 1.0274924, - "epoch": 0.6407936269352172, - "flos": 22125282612480.0, - "grad_norm": 2.024621973540982, - "language_loss": 0.77556098, - "learning_rate": 1.2074166549616707e-06, - "loss": 0.7967664, - "num_input_tokens_seen": 229958840, - "step": 10658, - "time_per_iteration": 2.7543232440948486 - }, - { - "auxiliary_loss_clip": 0.01119551, - "auxiliary_loss_mlp": 0.01041614, - "balance_loss_clip": 1.04269636, - "balance_loss_mlp": 1.02797651, - "epoch": 0.6408537501878852, - "flos": 23110635479040.0, - "grad_norm": 2.31675003494523, - "language_loss": 0.76086068, - "learning_rate": 1.2070590956134386e-06, - "loss": 0.78247231, - "num_input_tokens_seen": 229979680, - "step": 10659, - "time_per_iteration": 2.64536190032959 - }, - { - "auxiliary_loss_clip": 0.01105159, - "auxiliary_loss_mlp": 0.01032937, - "balance_loss_clip": 1.04132307, - "balance_loss_mlp": 1.01971078, - "epoch": 0.6409138734405532, - "flos": 16472440759680.0, - "grad_norm": 1.82994064834737, - "language_loss": 0.78033829, - "learning_rate": 1.2067015663336046e-06, - "loss": 0.80171925, - "num_input_tokens_seen": 229996830, - "step": 10660, - "time_per_iteration": 2.6234161853790283 - }, - { - "auxiliary_loss_clip": 0.01092799, - "auxiliary_loss_mlp": 0.01035048, - "balance_loss_clip": 1.03941202, - "balance_loss_mlp": 1.02086258, - "epoch": 0.6409739966932211, - "flos": 22777922776320.0, - "grad_norm": 1.735823034314566, - "language_loss": 0.68326354, - "learning_rate": 1.206344067135727e-06, - "loss": 0.70454198, - "num_input_tokens_seen": 230015115, - "step": 10661, - "time_per_iteration": 2.7175955772399902 - }, - { - "auxiliary_loss_clip": 0.01114459, - "auxiliary_loss_mlp": 0.01038734, - "balance_loss_clip": 1.04276872, - "balance_loss_mlp": 1.02682471, - "epoch": 0.6410341199458891, - "flos": 25152049134720.0, - "grad_norm": 1.9252684871674384, - "language_loss": 0.75755298, - "learning_rate": 1.205986598033362e-06, - "loss": 0.77908492, - "num_input_tokens_seen": 230035515, - "step": 10662, - "time_per_iteration": 2.633653402328491 - }, - { - "auxiliary_loss_clip": 0.01098112, - "auxiliary_loss_mlp": 0.01035568, - "balance_loss_clip": 1.03684235, - "balance_loss_mlp": 1.02221704, - "epoch": 0.641094243198557, - "flos": 27046193028480.0, - "grad_norm": 2.784052529669845, - "language_loss": 0.70107532, - "learning_rate": 1.2056291590400644e-06, - "loss": 0.72241217, - "num_input_tokens_seen": 230054355, - "step": 10663, - "time_per_iteration": 2.7310519218444824 - }, - { - "auxiliary_loss_clip": 0.01083056, - "auxiliary_loss_mlp": 0.0104552, - "balance_loss_clip": 1.04077351, - "balance_loss_mlp": 1.03102446, - "epoch": 0.641154366451225, - "flos": 25374551932800.0, - "grad_norm": 1.9822481402863719, - "language_loss": 0.67971885, - "learning_rate": 1.205271750169389e-06, - "loss": 0.70100462, - "num_input_tokens_seen": 230074605, - "step": 10664, - "time_per_iteration": 2.773348093032837 - }, - { - "auxiliary_loss_clip": 0.01087025, - "auxiliary_loss_mlp": 0.01033843, - "balance_loss_clip": 1.03581822, - "balance_loss_mlp": 1.02188087, - "epoch": 0.6412144897038929, - "flos": 25153342024320.0, - "grad_norm": 1.8870991168532496, - "language_loss": 0.66328347, - "learning_rate": 1.2049143714348881e-06, - "loss": 0.68449211, - "num_input_tokens_seen": 230093820, - "step": 10665, - "time_per_iteration": 2.6490859985351562 - }, - { - "auxiliary_loss_clip": 0.01103479, - "auxiliary_loss_mlp": 0.01027966, - "balance_loss_clip": 1.04036629, - "balance_loss_mlp": 1.01522827, - "epoch": 0.641274612956561, - "flos": 23440762402560.0, - "grad_norm": 1.6713056871656586, - "language_loss": 0.6435259, - "learning_rate": 1.2045570228501145e-06, - "loss": 0.66484034, - "num_input_tokens_seen": 230114285, - "step": 10666, - "time_per_iteration": 2.667050361633301 - }, - { - "auxiliary_loss_clip": 0.01105312, - "auxiliary_loss_mlp": 0.01033422, - "balance_loss_clip": 1.04096031, - "balance_loss_mlp": 1.02103066, - "epoch": 0.6413347362092289, - "flos": 19427493778560.0, - "grad_norm": 1.5002235875983176, - "language_loss": 0.70960593, - "learning_rate": 1.2041997044286176e-06, - "loss": 0.73099327, - "num_input_tokens_seen": 230132760, - "step": 10667, - "time_per_iteration": 2.701289176940918 - }, - { - "auxiliary_loss_clip": 0.01066227, - "auxiliary_loss_mlp": 0.00773491, - "balance_loss_clip": 1.0367496, - "balance_loss_mlp": 1.00030184, - "epoch": 0.6413948594618969, - "flos": 17196578945280.0, - "grad_norm": 2.416405769977824, - "language_loss": 0.77665913, - "learning_rate": 1.2038424161839484e-06, - "loss": 0.79505634, - "num_input_tokens_seen": 230149690, - "step": 10668, - "time_per_iteration": 2.746056079864502 - }, - { - "auxiliary_loss_clip": 0.01108161, - "auxiliary_loss_mlp": 0.01036614, - "balance_loss_clip": 1.04348612, - "balance_loss_mlp": 1.02366185, - "epoch": 0.6414549827145648, - "flos": 22269787027200.0, - "grad_norm": 1.4845911693701175, - "language_loss": 0.67707181, - "learning_rate": 1.2034851581296544e-06, - "loss": 0.69851947, - "num_input_tokens_seen": 230166950, - "step": 10669, - "time_per_iteration": 2.7345635890960693 - }, - { - "auxiliary_loss_clip": 0.0111572, - "auxiliary_loss_mlp": 0.01038211, - "balance_loss_clip": 1.04545701, - "balance_loss_mlp": 1.02449608, - "epoch": 0.6415151059672328, - "flos": 19640192163840.0, - "grad_norm": 2.894165174832574, - "language_loss": 0.78665972, - "learning_rate": 1.2031279302792825e-06, - "loss": 0.80819899, - "num_input_tokens_seen": 230184785, - "step": 10670, - "time_per_iteration": 2.6661479473114014 - }, - { - "auxiliary_loss_clip": 0.01081535, - "auxiliary_loss_mlp": 0.01035319, - "balance_loss_clip": 1.03874564, - "balance_loss_mlp": 1.02164531, - "epoch": 0.6415752292199008, - "flos": 14865833237760.0, - "grad_norm": 2.1933536907134554, - "language_loss": 0.88588488, - "learning_rate": 1.20277073264638e-06, - "loss": 0.90705341, - "num_input_tokens_seen": 230201385, - "step": 10671, - "time_per_iteration": 2.641057252883911 - }, - { - "auxiliary_loss_clip": 0.01104202, - "auxiliary_loss_mlp": 0.01028531, - "balance_loss_clip": 1.04201674, - "balance_loss_mlp": 1.01649058, - "epoch": 0.6416353524725688, - "flos": 13735580906880.0, - "grad_norm": 1.6223655469146963, - "language_loss": 0.68986869, - "learning_rate": 1.2024135652444907e-06, - "loss": 0.71119601, - "num_input_tokens_seen": 230220380, - "step": 10672, - "time_per_iteration": 2.6609199047088623 - }, - { - "auxiliary_loss_clip": 0.01111137, - "auxiliary_loss_mlp": 0.01033932, - "balance_loss_clip": 1.04236984, - "balance_loss_mlp": 1.01922166, - "epoch": 0.6416954757252368, - "flos": 24534924543360.0, - "grad_norm": 2.291371400531435, - "language_loss": 0.73951614, - "learning_rate": 1.2020564280871593e-06, - "loss": 0.76096678, - "num_input_tokens_seen": 230239845, - "step": 10673, - "time_per_iteration": 2.7125818729400635 - }, - { - "auxiliary_loss_clip": 0.01076968, - "auxiliary_loss_mlp": 0.01038267, - "balance_loss_clip": 1.03657365, - "balance_loss_mlp": 1.02410507, - "epoch": 0.6417555989779047, - "flos": 27710002321920.0, - "grad_norm": 25.869198527491033, - "language_loss": 0.69720078, - "learning_rate": 1.2016993211879283e-06, - "loss": 0.71835309, - "num_input_tokens_seen": 230262420, - "step": 10674, - "time_per_iteration": 2.8267860412597656 - }, - { - "auxiliary_loss_clip": 0.01119164, - "auxiliary_loss_mlp": 0.01029301, - "balance_loss_clip": 1.04007125, - "balance_loss_mlp": 1.01571679, - "epoch": 0.6418157222305727, - "flos": 20556632787840.0, - "grad_norm": 1.784339148090001, - "language_loss": 0.66459048, - "learning_rate": 1.201342244560338e-06, - "loss": 0.68607509, - "num_input_tokens_seen": 230279950, - "step": 10675, - "time_per_iteration": 2.6572489738464355 - }, - { - "auxiliary_loss_clip": 0.01117705, - "auxiliary_loss_mlp": 0.01037266, - "balance_loss_clip": 1.04312348, - "balance_loss_mlp": 1.02500582, - "epoch": 0.6418758454832406, - "flos": 22601530062720.0, - "grad_norm": 1.859703676283548, - "language_loss": 0.66479051, - "learning_rate": 1.2009851982179307e-06, - "loss": 0.68634021, - "num_input_tokens_seen": 230299705, - "step": 10676, - "time_per_iteration": 2.6424221992492676 - }, - { - "auxiliary_loss_clip": 0.01119453, - "auxiliary_loss_mlp": 0.01034897, - "balance_loss_clip": 1.04334652, - "balance_loss_mlp": 1.02030003, - "epoch": 0.6419359687359086, - "flos": 27375098889600.0, - "grad_norm": 1.821732847085161, - "language_loss": 0.75731808, - "learning_rate": 1.2006281821742446e-06, - "loss": 0.77886158, - "num_input_tokens_seen": 230320030, - "step": 10677, - "time_per_iteration": 2.651279926300049 - }, - { - "auxiliary_loss_clip": 0.01017238, - "auxiliary_loss_mlp": 0.0100428, - "balance_loss_clip": 1.01344991, - "balance_loss_mlp": 1.00320745, - "epoch": 0.6419960919885765, - "flos": 67251924552960.0, - "grad_norm": 0.7863000332751263, - "language_loss": 0.60634637, - "learning_rate": 1.200271196442818e-06, - "loss": 0.62656152, - "num_input_tokens_seen": 230381495, - "step": 10678, - "time_per_iteration": 3.29689359664917 - }, - { - "auxiliary_loss_clip": 0.01100247, - "auxiliary_loss_mlp": 0.01035517, - "balance_loss_clip": 1.03918314, - "balance_loss_mlp": 1.02296972, - "epoch": 0.6420562152412446, - "flos": 19901873721600.0, - "grad_norm": 1.6874144871208372, - "language_loss": 0.6772809, - "learning_rate": 1.1999142410371875e-06, - "loss": 0.69863856, - "num_input_tokens_seen": 230401385, - "step": 10679, - "time_per_iteration": 2.656188488006592 - }, - { - "auxiliary_loss_clip": 0.01103127, - "auxiliary_loss_mlp": 0.01041549, - "balance_loss_clip": 1.04055119, - "balance_loss_mlp": 1.02634931, - "epoch": 0.6421163384939125, - "flos": 24790177566720.0, - "grad_norm": 2.4808394593739123, - "language_loss": 0.73067611, - "learning_rate": 1.1995573159708897e-06, - "loss": 0.75212288, - "num_input_tokens_seen": 230421340, - "step": 10680, - "time_per_iteration": 2.6635870933532715 - }, - { - "auxiliary_loss_clip": 0.01079924, - "auxiliary_loss_mlp": 0.01028158, - "balance_loss_clip": 1.03821039, - "balance_loss_mlp": 1.01660097, - "epoch": 0.6421764617465805, - "flos": 25592816926080.0, - "grad_norm": 1.6629690093206273, - "language_loss": 0.67730248, - "learning_rate": 1.1992004212574582e-06, - "loss": 0.69838333, - "num_input_tokens_seen": 230441270, - "step": 10681, - "time_per_iteration": 2.7426977157592773 - }, - { - "auxiliary_loss_clip": 0.0111386, - "auxiliary_loss_mlp": 0.01031892, - "balance_loss_clip": 1.04021406, - "balance_loss_mlp": 1.01944685, - "epoch": 0.6422365849992484, - "flos": 14134727813760.0, - "grad_norm": 1.7354882322045777, - "language_loss": 0.74501145, - "learning_rate": 1.198843556910427e-06, - "loss": 0.76646894, - "num_input_tokens_seen": 230457455, - "step": 10682, - "time_per_iteration": 2.5474164485931396 - }, - { - "auxiliary_loss_clip": 0.01051042, - "auxiliary_loss_mlp": 0.01032531, - "balance_loss_clip": 1.03735995, - "balance_loss_mlp": 1.02086592, - "epoch": 0.6422967082519164, - "flos": 22383911514240.0, - "grad_norm": 1.4579009699070558, - "language_loss": 0.79108202, - "learning_rate": 1.1984867229433287e-06, - "loss": 0.81191772, - "num_input_tokens_seen": 230478955, - "step": 10683, - "time_per_iteration": 2.913137435913086 - }, - { - "auxiliary_loss_clip": 0.01118799, - "auxiliary_loss_mlp": 0.01035941, - "balance_loss_clip": 1.04291272, - "balance_loss_mlp": 1.0225358, - "epoch": 0.6423568315045844, - "flos": 14647927380480.0, - "grad_norm": 1.7236127231650058, - "language_loss": 0.67390025, - "learning_rate": 1.1981299193696941e-06, - "loss": 0.69544768, - "num_input_tokens_seen": 230496425, - "step": 10684, - "time_per_iteration": 4.21756386756897 - }, - { - "auxiliary_loss_clip": 0.0110472, - "auxiliary_loss_mlp": 0.0103372, - "balance_loss_clip": 1.04010284, - "balance_loss_mlp": 1.02044034, - "epoch": 0.6424169547572524, - "flos": 26833925606400.0, - "grad_norm": 2.002909718847722, - "language_loss": 0.7144649, - "learning_rate": 1.1977731462030533e-06, - "loss": 0.73584938, - "num_input_tokens_seen": 230516245, - "step": 10685, - "time_per_iteration": 2.715785026550293 - }, - { - "auxiliary_loss_clip": 0.0107774, - "auxiliary_loss_mlp": 0.01037662, - "balance_loss_clip": 1.03614187, - "balance_loss_mlp": 1.02484107, - "epoch": 0.6424770780099204, - "flos": 22707430335360.0, - "grad_norm": 1.5191327003401023, - "language_loss": 0.75144935, - "learning_rate": 1.197416403456935e-06, - "loss": 0.77260327, - "num_input_tokens_seen": 230534745, - "step": 10686, - "time_per_iteration": 4.366745948791504 - }, - { - "auxiliary_loss_clip": 0.01082252, - "auxiliary_loss_mlp": 0.01034259, - "balance_loss_clip": 1.04008722, - "balance_loss_mlp": 1.01991844, - "epoch": 0.6425372012625883, - "flos": 28469512425600.0, - "grad_norm": 6.424850822093427, - "language_loss": 0.68726957, - "learning_rate": 1.197059691144867e-06, - "loss": 0.7084347, - "num_input_tokens_seen": 230555895, - "step": 10687, - "time_per_iteration": 4.32355523109436 - }, - { - "auxiliary_loss_clip": 0.01092278, - "auxiliary_loss_mlp": 0.0103296, - "balance_loss_clip": 1.03951168, - "balance_loss_mlp": 1.02028227, - "epoch": 0.6425973245152563, - "flos": 29351694453120.0, - "grad_norm": 1.9785933660475024, - "language_loss": 0.66424388, - "learning_rate": 1.1967030092803767e-06, - "loss": 0.68549621, - "num_input_tokens_seen": 230577460, - "step": 10688, - "time_per_iteration": 2.8096606731414795 - }, - { - "auxiliary_loss_clip": 0.01114997, - "auxiliary_loss_mlp": 0.01034078, - "balance_loss_clip": 1.04043853, - "balance_loss_mlp": 1.02081013, - "epoch": 0.6426574477679242, - "flos": 16430388912000.0, - "grad_norm": 1.653295180436115, - "language_loss": 0.73148823, - "learning_rate": 1.1963463578769876e-06, - "loss": 0.75297892, - "num_input_tokens_seen": 230595030, - "step": 10689, - "time_per_iteration": 2.5335159301757812 - }, - { - "auxiliary_loss_clip": 0.01097981, - "auxiliary_loss_mlp": 0.01032097, - "balance_loss_clip": 1.04061198, - "balance_loss_mlp": 1.0200088, - "epoch": 0.6427175710205922, - "flos": 21835914647040.0, - "grad_norm": 2.974297200312542, - "language_loss": 0.72271609, - "learning_rate": 1.195989736948226e-06, - "loss": 0.74401689, - "num_input_tokens_seen": 230615135, - "step": 10690, - "time_per_iteration": 2.678732395172119 - }, - { - "auxiliary_loss_clip": 0.01087197, - "auxiliary_loss_mlp": 0.01034747, - "balance_loss_clip": 1.03962326, - "balance_loss_mlp": 1.02202129, - "epoch": 0.6427776942732601, - "flos": 17786627660160.0, - "grad_norm": 1.747376446154191, - "language_loss": 0.77734852, - "learning_rate": 1.1956331465076143e-06, - "loss": 0.79856801, - "num_input_tokens_seen": 230631965, - "step": 10691, - "time_per_iteration": 2.659553050994873 - }, - { - "auxiliary_loss_clip": 0.01094577, - "auxiliary_loss_mlp": 0.01035823, - "balance_loss_clip": 1.03965449, - "balance_loss_mlp": 1.02299619, - "epoch": 0.6428378175259282, - "flos": 15085893911040.0, - "grad_norm": 1.8605559166150418, - "language_loss": 0.74422169, - "learning_rate": 1.1952765865686738e-06, - "loss": 0.76552576, - "num_input_tokens_seen": 230649565, - "step": 10692, - "time_per_iteration": 4.251460790634155 - }, - { - "auxiliary_loss_clip": 0.01104664, - "auxiliary_loss_mlp": 0.01034677, - "balance_loss_clip": 1.04084218, - "balance_loss_mlp": 1.02164721, - "epoch": 0.6428979407785961, - "flos": 23841776816640.0, - "grad_norm": 1.9248860914210837, - "language_loss": 0.61550558, - "learning_rate": 1.1949200571449263e-06, - "loss": 0.63689899, - "num_input_tokens_seen": 230669265, - "step": 10693, - "time_per_iteration": 2.6779651641845703 - }, - { - "auxiliary_loss_clip": 0.01080488, - "auxiliary_loss_mlp": 0.0102922, - "balance_loss_clip": 1.04029202, - "balance_loss_mlp": 1.016065, - "epoch": 0.6429580640312641, - "flos": 32926852892160.0, - "grad_norm": 2.329079095224612, - "language_loss": 0.59532356, - "learning_rate": 1.1945635582498903e-06, - "loss": 0.61642069, - "num_input_tokens_seen": 230690575, - "step": 10694, - "time_per_iteration": 2.8363914489746094 - }, - { - "auxiliary_loss_clip": 0.01089804, - "auxiliary_loss_mlp": 0.01035527, - "balance_loss_clip": 1.03853726, - "balance_loss_mlp": 1.02255106, - "epoch": 0.643018187283932, - "flos": 21068359896960.0, - "grad_norm": 1.4014414192812676, - "language_loss": 0.80109406, - "learning_rate": 1.1942070898970853e-06, - "loss": 0.82234728, - "num_input_tokens_seen": 230709420, - "step": 10695, - "time_per_iteration": 2.6794557571411133 - }, - { - "auxiliary_loss_clip": 0.01116687, - "auxiliary_loss_mlp": 0.01040293, - "balance_loss_clip": 1.04089379, - "balance_loss_mlp": 1.02677488, - "epoch": 0.6430783105366, - "flos": 26724649455360.0, - "grad_norm": 1.7759454400987778, - "language_loss": 0.73687971, - "learning_rate": 1.1938506521000285e-06, - "loss": 0.75844944, - "num_input_tokens_seen": 230729350, - "step": 10696, - "time_per_iteration": 2.7068281173706055 - }, - { - "auxiliary_loss_clip": 0.01078835, - "auxiliary_loss_mlp": 0.01029709, - "balance_loss_clip": 1.03717327, - "balance_loss_mlp": 1.01736438, - "epoch": 0.643138433789268, - "flos": 23696841438720.0, - "grad_norm": 1.6299732646475602, - "language_loss": 0.75820529, - "learning_rate": 1.1934942448722347e-06, - "loss": 0.7792908, - "num_input_tokens_seen": 230749220, - "step": 10697, - "time_per_iteration": 2.8328888416290283 - }, - { - "auxiliary_loss_clip": 0.01091041, - "auxiliary_loss_mlp": 0.01032937, - "balance_loss_clip": 1.03859711, - "balance_loss_mlp": 1.02061689, - "epoch": 0.643198557041936, - "flos": 34202184255360.0, - "grad_norm": 1.3945921589698136, - "language_loss": 0.65932959, - "learning_rate": 1.1931378682272208e-06, - "loss": 0.68056941, - "num_input_tokens_seen": 230770245, - "step": 10698, - "time_per_iteration": 2.784822702407837 - }, - { - "auxiliary_loss_clip": 0.01036478, - "auxiliary_loss_mlp": 0.01005901, - "balance_loss_clip": 1.01277423, - "balance_loss_mlp": 1.00470889, - "epoch": 0.643258680294604, - "flos": 67626473621760.0, - "grad_norm": 0.8642865572859256, - "language_loss": 0.63445872, - "learning_rate": 1.1927815221784996e-06, - "loss": 0.65488249, - "num_input_tokens_seen": 230837030, - "step": 10699, - "time_per_iteration": 3.1397321224212646 - }, - { - "auxiliary_loss_clip": 0.01103425, - "auxiliary_loss_mlp": 0.01028666, - "balance_loss_clip": 1.04155254, - "balance_loss_mlp": 1.01698923, - "epoch": 0.6433188035472719, - "flos": 25185984508800.0, - "grad_norm": 1.8812795881876412, - "language_loss": 0.69277722, - "learning_rate": 1.1924252067395838e-06, - "loss": 0.71409816, - "num_input_tokens_seen": 230856845, - "step": 10700, - "time_per_iteration": 2.6566555500030518 - }, - { - "auxiliary_loss_clip": 0.01115928, - "auxiliary_loss_mlp": 0.01028377, - "balance_loss_clip": 1.04087234, - "balance_loss_mlp": 1.01547289, - "epoch": 0.6433789267999399, - "flos": 24973573432320.0, - "grad_norm": 2.050726314143076, - "language_loss": 0.7285673, - "learning_rate": 1.1920689219239855e-06, - "loss": 0.75001037, - "num_input_tokens_seen": 230878785, - "step": 10701, - "time_per_iteration": 2.7663381099700928 - }, - { - "auxiliary_loss_clip": 0.01106257, - "auxiliary_loss_mlp": 0.01031843, - "balance_loss_clip": 1.03919315, - "balance_loss_mlp": 1.01695347, - "epoch": 0.6434390500526078, - "flos": 17566028282880.0, - "grad_norm": 1.983939492381853, - "language_loss": 0.82094157, - "learning_rate": 1.1917126677452144e-06, - "loss": 0.84232259, - "num_input_tokens_seen": 230895445, - "step": 10702, - "time_per_iteration": 2.634734630584717 - }, - { - "auxiliary_loss_clip": 0.01084567, - "auxiliary_loss_mlp": 0.01040406, - "balance_loss_clip": 1.03733373, - "balance_loss_mlp": 1.02802002, - "epoch": 0.6434991733052758, - "flos": 20843594542080.0, - "grad_norm": 2.1366744665576536, - "language_loss": 0.74528348, - "learning_rate": 1.1913564442167798e-06, - "loss": 0.76653326, - "num_input_tokens_seen": 230911375, - "step": 10703, - "time_per_iteration": 2.712024688720703 - }, - { - "auxiliary_loss_clip": 0.00980042, - "auxiliary_loss_mlp": 0.01002542, - "balance_loss_clip": 1.00990796, - "balance_loss_mlp": 1.00124288, - "epoch": 0.6435592965579437, - "flos": 66094596345600.0, - "grad_norm": 0.6668164665543085, - "language_loss": 0.54507017, - "learning_rate": 1.1910002513521898e-06, - "loss": 0.56489605, - "num_input_tokens_seen": 230975990, - "step": 10704, - "time_per_iteration": 3.391496419906616 - }, - { - "auxiliary_loss_clip": 0.01074279, - "auxiliary_loss_mlp": 0.01024183, - "balance_loss_clip": 1.03965342, - "balance_loss_mlp": 1.01269126, - "epoch": 0.6436194198106118, - "flos": 23768842250880.0, - "grad_norm": 1.6398007726436414, - "language_loss": 0.76942575, - "learning_rate": 1.1906440891649519e-06, - "loss": 0.79041034, - "num_input_tokens_seen": 230997110, - "step": 10705, - "time_per_iteration": 3.151123523712158 - }, - { - "auxiliary_loss_clip": 0.01080341, - "auxiliary_loss_mlp": 0.01040696, - "balance_loss_clip": 1.03794503, - "balance_loss_mlp": 1.02824438, - "epoch": 0.6436795430632797, - "flos": 20230312705920.0, - "grad_norm": 1.6220851966206657, - "language_loss": 0.78966212, - "learning_rate": 1.1902879576685708e-06, - "loss": 0.81087244, - "num_input_tokens_seen": 231015590, - "step": 10706, - "time_per_iteration": 2.7351467609405518 - }, - { - "auxiliary_loss_clip": 0.01073614, - "auxiliary_loss_mlp": 0.01037334, - "balance_loss_clip": 1.03537798, - "balance_loss_mlp": 1.02350581, - "epoch": 0.6437396663159477, - "flos": 20301846641280.0, - "grad_norm": 1.995060337991945, - "language_loss": 0.80729055, - "learning_rate": 1.1899318568765518e-06, - "loss": 0.82840002, - "num_input_tokens_seen": 231033800, - "step": 10707, - "time_per_iteration": 2.8090367317199707 - }, - { - "auxiliary_loss_clip": 0.01102074, - "auxiliary_loss_mlp": 0.01034341, - "balance_loss_clip": 1.03903484, - "balance_loss_mlp": 1.02176499, - "epoch": 0.6437997895686156, - "flos": 23878585278720.0, - "grad_norm": 1.8783086721412918, - "language_loss": 0.85947567, - "learning_rate": 1.1895757868023978e-06, - "loss": 0.88083982, - "num_input_tokens_seen": 231053160, - "step": 10708, - "time_per_iteration": 2.7102444171905518 - }, - { - "auxiliary_loss_clip": 0.01070026, - "auxiliary_loss_mlp": 0.0104392, - "balance_loss_clip": 1.04000461, - "balance_loss_mlp": 1.02895367, - "epoch": 0.6438599128212836, - "flos": 18989275852800.0, - "grad_norm": 2.169380763975439, - "language_loss": 0.65262228, - "learning_rate": 1.1892197474596106e-06, - "loss": 0.67376173, - "num_input_tokens_seen": 231069470, - "step": 10709, - "time_per_iteration": 2.6978535652160645 - }, - { - "auxiliary_loss_clip": 0.01115477, - "auxiliary_loss_mlp": 0.01032813, - "balance_loss_clip": 1.04076731, - "balance_loss_mlp": 1.02048671, - "epoch": 0.6439200360739517, - "flos": 24096347481600.0, - "grad_norm": 1.8116959175260157, - "language_loss": 0.80929708, - "learning_rate": 1.1888637388616929e-06, - "loss": 0.83077991, - "num_input_tokens_seen": 231088205, - "step": 10710, - "time_per_iteration": 2.6809825897216797 - }, - { - "auxiliary_loss_clip": 0.0110175, - "auxiliary_loss_mlp": 0.01032632, - "balance_loss_clip": 1.03748906, - "balance_loss_mlp": 1.0203414, - "epoch": 0.6439801593266196, - "flos": 31902141697920.0, - "grad_norm": 2.6140299044708106, - "language_loss": 0.6634506, - "learning_rate": 1.1885077610221425e-06, - "loss": 0.68479443, - "num_input_tokens_seen": 231107850, - "step": 10711, - "time_per_iteration": 2.71571946144104 - }, - { - "auxiliary_loss_clip": 0.01077359, - "auxiliary_loss_mlp": 0.01033147, - "balance_loss_clip": 1.0414753, - "balance_loss_mlp": 1.02000391, - "epoch": 0.6440402825792876, - "flos": 27125879351040.0, - "grad_norm": 2.2683722533974437, - "language_loss": 0.78656554, - "learning_rate": 1.1881518139544597e-06, - "loss": 0.80767059, - "num_input_tokens_seen": 231127200, - "step": 10712, - "time_per_iteration": 2.785280466079712 - }, - { - "auxiliary_loss_clip": 0.01103094, - "auxiliary_loss_mlp": 0.01037973, - "balance_loss_clip": 1.03856206, - "balance_loss_mlp": 1.02487159, - "epoch": 0.6441004058319555, - "flos": 20667704618880.0, - "grad_norm": 1.6337129224497011, - "language_loss": 0.82845241, - "learning_rate": 1.1877958976721417e-06, - "loss": 0.84986305, - "num_input_tokens_seen": 231146360, - "step": 10713, - "time_per_iteration": 2.6682519912719727 - }, - { - "auxiliary_loss_clip": 0.01111989, - "auxiliary_loss_mlp": 0.0103674, - "balance_loss_clip": 1.04118943, - "balance_loss_mlp": 1.02455091, - "epoch": 0.6441605290846235, - "flos": 26026006947840.0, - "grad_norm": 1.377683768387238, - "language_loss": 0.78550875, - "learning_rate": 1.187440012188684e-06, - "loss": 0.80699605, - "num_input_tokens_seen": 231168350, - "step": 10714, - "time_per_iteration": 2.6294350624084473 - }, - { - "auxiliary_loss_clip": 0.01081537, - "auxiliary_loss_mlp": 0.01031396, - "balance_loss_clip": 1.03937292, - "balance_loss_mlp": 1.01982093, - "epoch": 0.6442206523372914, - "flos": 24899489631360.0, - "grad_norm": 1.6804962466974145, - "language_loss": 0.8137539, - "learning_rate": 1.187084157517583e-06, - "loss": 0.83488327, - "num_input_tokens_seen": 231188385, - "step": 10715, - "time_per_iteration": 2.7179040908813477 - }, - { - "auxiliary_loss_clip": 0.01083275, - "auxiliary_loss_mlp": 0.01033506, - "balance_loss_clip": 1.03462327, - "balance_loss_mlp": 1.02041125, - "epoch": 0.6442807755899594, - "flos": 25156322853120.0, - "grad_norm": 2.56330690161098, - "language_loss": 0.81656396, - "learning_rate": 1.186728333672332e-06, - "loss": 0.83773172, - "num_input_tokens_seen": 231209880, - "step": 10716, - "time_per_iteration": 2.71616268157959 - }, - { - "auxiliary_loss_clip": 0.01080679, - "auxiliary_loss_mlp": 0.01037142, - "balance_loss_clip": 1.03870273, - "balance_loss_mlp": 1.02335536, - "epoch": 0.6443408988426274, - "flos": 27344503480320.0, - "grad_norm": 2.019166193158946, - "language_loss": 0.78575444, - "learning_rate": 1.186372540666424e-06, - "loss": 0.80693269, - "num_input_tokens_seen": 231230765, - "step": 10717, - "time_per_iteration": 2.7821998596191406 - }, - { - "auxiliary_loss_clip": 0.01111081, - "auxiliary_loss_mlp": 0.01033784, - "balance_loss_clip": 1.03954279, - "balance_loss_mlp": 1.0215416, - "epoch": 0.6444010220952954, - "flos": 27928339142400.0, - "grad_norm": 1.554880211694131, - "language_loss": 0.68287563, - "learning_rate": 1.1860167785133513e-06, - "loss": 0.70432431, - "num_input_tokens_seen": 231252350, - "step": 10718, - "time_per_iteration": 2.619870662689209 - }, - { - "auxiliary_loss_clip": 0.01025406, - "auxiliary_loss_mlp": 0.01008951, - "balance_loss_clip": 1.01146674, - "balance_loss_mlp": 1.00788391, - "epoch": 0.6444611453479633, - "flos": 71215024855680.0, - "grad_norm": 0.7631804630715925, - "language_loss": 0.49633595, - "learning_rate": 1.185661047226603e-06, - "loss": 0.51667953, - "num_input_tokens_seen": 231313865, - "step": 10719, - "time_per_iteration": 3.3252131938934326 - }, - { - "auxiliary_loss_clip": 0.01118591, - "auxiliary_loss_mlp": 0.01039818, - "balance_loss_clip": 1.04287648, - "balance_loss_mlp": 1.02602601, - "epoch": 0.6445212686006313, - "flos": 22705131864960.0, - "grad_norm": 2.1022111741366603, - "language_loss": 0.77604353, - "learning_rate": 1.18530534681967e-06, - "loss": 0.79762757, - "num_input_tokens_seen": 231331710, - "step": 10720, - "time_per_iteration": 2.6171679496765137 - }, - { - "auxiliary_loss_clip": 0.01094489, - "auxiliary_loss_mlp": 0.01034779, - "balance_loss_clip": 1.04128611, - "balance_loss_mlp": 1.02126074, - "epoch": 0.6445813918532992, - "flos": 21178821196800.0, - "grad_norm": 1.7066840296237504, - "language_loss": 0.76980746, - "learning_rate": 1.18494967730604e-06, - "loss": 0.79110014, - "num_input_tokens_seen": 231350705, - "step": 10721, - "time_per_iteration": 2.8883464336395264 - }, - { - "auxiliary_loss_clip": 0.01077386, - "auxiliary_loss_mlp": 0.01035031, - "balance_loss_clip": 1.03889298, - "balance_loss_mlp": 1.02178049, - "epoch": 0.6446415151059672, - "flos": 25191910252800.0, - "grad_norm": 2.156937552750908, - "language_loss": 0.73425972, - "learning_rate": 1.1845940386991995e-06, - "loss": 0.75538391, - "num_input_tokens_seen": 231369550, - "step": 10722, - "time_per_iteration": 3.0992050170898438 - }, - { - "auxiliary_loss_clip": 0.0111233, - "auxiliary_loss_mlp": 0.01033624, - "balance_loss_clip": 1.03991735, - "balance_loss_mlp": 1.02135682, - "epoch": 0.6447016383586353, - "flos": 25302227898240.0, - "grad_norm": 1.8325068766714112, - "language_loss": 0.77818036, - "learning_rate": 1.184238431012635e-06, - "loss": 0.79963994, - "num_input_tokens_seen": 231389285, - "step": 10723, - "time_per_iteration": 2.6199328899383545 - }, - { - "auxiliary_loss_clip": 0.01104393, - "auxiliary_loss_mlp": 0.01038636, - "balance_loss_clip": 1.03816402, - "balance_loss_mlp": 1.02488565, - "epoch": 0.6447617616113032, - "flos": 27703142824320.0, - "grad_norm": 2.2443871002503903, - "language_loss": 0.58686608, - "learning_rate": 1.1838828542598312e-06, - "loss": 0.60829639, - "num_input_tokens_seen": 231408820, - "step": 10724, - "time_per_iteration": 4.554950475692749 - }, - { - "auxiliary_loss_clip": 0.01102176, - "auxiliary_loss_mlp": 0.01033682, - "balance_loss_clip": 1.0418992, - "balance_loss_mlp": 1.02188635, - "epoch": 0.6448218848639712, - "flos": 23039101543680.0, - "grad_norm": 1.7131170240074274, - "language_loss": 0.83707219, - "learning_rate": 1.183527308454271e-06, - "loss": 0.8584308, - "num_input_tokens_seen": 231428100, - "step": 10725, - "time_per_iteration": 2.5963871479034424 - }, - { - "auxiliary_loss_clip": 0.01089104, - "auxiliary_loss_mlp": 0.01037801, - "balance_loss_clip": 1.03586388, - "balance_loss_mlp": 1.02444363, - "epoch": 0.6448820081166391, - "flos": 24496104919680.0, - "grad_norm": 1.7945503193220944, - "language_loss": 0.82327414, - "learning_rate": 1.1831717936094368e-06, - "loss": 0.84454322, - "num_input_tokens_seen": 231445810, - "step": 10726, - "time_per_iteration": 6.177702188491821 - }, - { - "auxiliary_loss_clip": 0.0110184, - "auxiliary_loss_mlp": 0.01037744, - "balance_loss_clip": 1.03911293, - "balance_loss_mlp": 1.02391601, - "epoch": 0.6449421313693071, - "flos": 22419283432320.0, - "grad_norm": 5.950779634023435, - "language_loss": 0.81306756, - "learning_rate": 1.1828163097388108e-06, - "loss": 0.83446342, - "num_input_tokens_seen": 231463570, - "step": 10727, - "time_per_iteration": 2.646756172180176 - }, - { - "auxiliary_loss_clip": 0.01114052, - "auxiliary_loss_mlp": 0.01035116, - "balance_loss_clip": 1.04432821, - "balance_loss_mlp": 1.02101326, - "epoch": 0.645002254621975, - "flos": 20225715765120.0, - "grad_norm": 2.0767423550252047, - "language_loss": 0.79137063, - "learning_rate": 1.1824608568558717e-06, - "loss": 0.81286234, - "num_input_tokens_seen": 231482155, - "step": 10728, - "time_per_iteration": 2.6014702320098877 - }, - { - "auxiliary_loss_clip": 0.01018281, - "auxiliary_loss_mlp": 0.01043432, - "balance_loss_clip": 1.03341746, - "balance_loss_mlp": 1.02857876, - "epoch": 0.645062377874643, - "flos": 27855440490240.0, - "grad_norm": 1.6698019924695346, - "language_loss": 0.74069214, - "learning_rate": 1.1821054349740988e-06, - "loss": 0.76130933, - "num_input_tokens_seen": 231502465, - "step": 10729, - "time_per_iteration": 2.9942080974578857 - }, - { - "auxiliary_loss_clip": 0.01072033, - "auxiliary_loss_mlp": 0.01034846, - "balance_loss_clip": 1.03895199, - "balance_loss_mlp": 1.0206902, - "epoch": 0.645122501127311, - "flos": 25301509626240.0, - "grad_norm": 1.675292027703949, - "language_loss": 0.66314375, - "learning_rate": 1.1817500441069706e-06, - "loss": 0.68421257, - "num_input_tokens_seen": 231522740, - "step": 10730, - "time_per_iteration": 3.029480218887329 - }, - { - "auxiliary_loss_clip": 0.01053326, - "auxiliary_loss_mlp": 0.01035886, - "balance_loss_clip": 1.03969455, - "balance_loss_mlp": 1.02077615, - "epoch": 0.645182624379979, - "flos": 18807352444800.0, - "grad_norm": 1.6301580114634824, - "language_loss": 0.63516945, - "learning_rate": 1.1813946842679614e-06, - "loss": 0.65606159, - "num_input_tokens_seen": 231542050, - "step": 10731, - "time_per_iteration": 4.425801038742065 - }, - { - "auxiliary_loss_clip": 0.01111857, - "auxiliary_loss_mlp": 0.01032419, - "balance_loss_clip": 1.03885424, - "balance_loss_mlp": 1.01941907, - "epoch": 0.6452427476326469, - "flos": 18332182402560.0, - "grad_norm": 1.6688797138193545, - "language_loss": 0.68021357, - "learning_rate": 1.1810393554705492e-06, - "loss": 0.70165634, - "num_input_tokens_seen": 231560380, - "step": 10732, - "time_per_iteration": 2.531669855117798 - }, - { - "auxiliary_loss_clip": 0.01104232, - "auxiliary_loss_mlp": 0.01036786, - "balance_loss_clip": 1.04108346, - "balance_loss_mlp": 1.0236969, - "epoch": 0.6453028708853149, - "flos": 22784746360320.0, - "grad_norm": 2.2675077381725557, - "language_loss": 0.75637865, - "learning_rate": 1.1806840577282055e-06, - "loss": 0.77778876, - "num_input_tokens_seen": 231580810, - "step": 10733, - "time_per_iteration": 2.6263926029205322 - }, - { - "auxiliary_loss_clip": 0.01104718, - "auxiliary_loss_mlp": 0.01039721, - "balance_loss_clip": 1.03942811, - "balance_loss_mlp": 1.02548099, - "epoch": 0.6453629941379828, - "flos": 23945989150080.0, - "grad_norm": 2.5422080980889903, - "language_loss": 0.66799378, - "learning_rate": 1.1803287910544048e-06, - "loss": 0.6894381, - "num_input_tokens_seen": 231600585, - "step": 10734, - "time_per_iteration": 2.639566421508789 - }, - { - "auxiliary_loss_clip": 0.01113842, - "auxiliary_loss_mlp": 0.01041504, - "balance_loss_clip": 1.04339838, - "balance_loss_mlp": 1.028898, - "epoch": 0.6454231173906508, - "flos": 17676381841920.0, - "grad_norm": 1.794099580406708, - "language_loss": 0.73622543, - "learning_rate": 1.1799735554626191e-06, - "loss": 0.75777888, - "num_input_tokens_seen": 231618765, - "step": 10735, - "time_per_iteration": 2.5158708095550537 - }, - { - "auxiliary_loss_clip": 0.01052163, - "auxiliary_loss_mlp": 0.00771954, - "balance_loss_clip": 1.03596699, - "balance_loss_mlp": 1.00020361, - "epoch": 0.6454832406433189, - "flos": 23292774368640.0, - "grad_norm": 1.8433870916344732, - "language_loss": 0.74927819, - "learning_rate": 1.1796183509663176e-06, - "loss": 0.76751935, - "num_input_tokens_seen": 231638525, - "step": 10736, - "time_per_iteration": 2.781177282333374 - }, - { - "auxiliary_loss_clip": 0.01109179, - "auxiliary_loss_mlp": 0.01033432, - "balance_loss_clip": 1.04235053, - "balance_loss_mlp": 1.01909697, - "epoch": 0.6455433638959868, - "flos": 20157198572160.0, - "grad_norm": 1.9123509169430688, - "language_loss": 0.70616424, - "learning_rate": 1.1792631775789708e-06, - "loss": 0.72759038, - "num_input_tokens_seen": 231656785, - "step": 10737, - "time_per_iteration": 2.5800046920776367 - }, - { - "auxiliary_loss_clip": 0.0102545, - "auxiliary_loss_mlp": 0.01002929, - "balance_loss_clip": 1.01085997, - "balance_loss_mlp": 1.00164151, - "epoch": 0.6456034871486548, - "flos": 66532922012160.0, - "grad_norm": 0.7817772178911736, - "language_loss": 0.58405674, - "learning_rate": 1.1789080353140464e-06, - "loss": 0.60434055, - "num_input_tokens_seen": 231719075, - "step": 10738, - "time_per_iteration": 3.238203287124634 - }, - { - "auxiliary_loss_clip": 0.01079809, - "auxiliary_loss_mlp": 0.01029827, - "balance_loss_clip": 1.0387454, - "balance_loss_mlp": 1.01666009, - "epoch": 0.6456636104013227, - "flos": 24206090509440.0, - "grad_norm": 1.920167100598176, - "language_loss": 0.74507523, - "learning_rate": 1.1785529241850118e-06, - "loss": 0.76617157, - "num_input_tokens_seen": 231737810, - "step": 10739, - "time_per_iteration": 2.704909324645996 - }, - { - "auxiliary_loss_clip": 0.01096514, - "auxiliary_loss_mlp": 0.00771409, - "balance_loss_clip": 1.04137897, - "balance_loss_mlp": 1.00027609, - "epoch": 0.6457237336539907, - "flos": 23624086440960.0, - "grad_norm": 1.8028230929667255, - "language_loss": 0.70776832, - "learning_rate": 1.1781978442053324e-06, - "loss": 0.72644746, - "num_input_tokens_seen": 231756140, - "step": 10740, - "time_per_iteration": 2.6947245597839355 - }, - { - "auxiliary_loss_clip": 0.01016337, - "auxiliary_loss_mlp": 0.01004394, - "balance_loss_clip": 1.01068592, - "balance_loss_mlp": 1.00314224, - "epoch": 0.6457838569066586, - "flos": 65846023251840.0, - "grad_norm": 0.8728350789543404, - "language_loss": 0.55255193, - "learning_rate": 1.1778427953884733e-06, - "loss": 0.57275927, - "num_input_tokens_seen": 231823665, - "step": 10741, - "time_per_iteration": 3.214613676071167 - }, - { - "auxiliary_loss_clip": 0.01113695, - "auxiliary_loss_mlp": 0.01034634, - "balance_loss_clip": 1.04090226, - "balance_loss_mlp": 1.02212918, - "epoch": 0.6458439801593266, - "flos": 22381972179840.0, - "grad_norm": 1.5851201591734638, - "language_loss": 0.80647045, - "learning_rate": 1.1774877777478977e-06, - "loss": 0.8279537, - "num_input_tokens_seen": 231844500, - "step": 10742, - "time_per_iteration": 2.6147494316101074 - }, - { - "auxiliary_loss_clip": 0.01089275, - "auxiliary_loss_mlp": 0.01034555, - "balance_loss_clip": 1.03800607, - "balance_loss_mlp": 1.02160883, - "epoch": 0.6459041034119946, - "flos": 24789243813120.0, - "grad_norm": 1.493920390788815, - "language_loss": 0.81934315, - "learning_rate": 1.1771327912970678e-06, - "loss": 0.84058142, - "num_input_tokens_seen": 231864510, - "step": 10743, - "time_per_iteration": 2.7598674297332764 - }, - { - "auxiliary_loss_clip": 0.01088471, - "auxiliary_loss_mlp": 0.01032232, - "balance_loss_clip": 1.03786039, - "balance_loss_mlp": 1.01933324, - "epoch": 0.6459642266646626, - "flos": 18325358818560.0, - "grad_norm": 5.256757204998113, - "language_loss": 0.7177366, - "learning_rate": 1.1767778360494453e-06, - "loss": 0.73894364, - "num_input_tokens_seen": 231881555, - "step": 10744, - "time_per_iteration": 2.620422840118408 - }, - { - "auxiliary_loss_clip": 0.01114623, - "auxiliary_loss_mlp": 0.01029271, - "balance_loss_clip": 1.04074514, - "balance_loss_mlp": 1.01683736, - "epoch": 0.6460243499173305, - "flos": 43581368891520.0, - "grad_norm": 1.6850885635931934, - "language_loss": 0.66688418, - "learning_rate": 1.1764229120184896e-06, - "loss": 0.68832302, - "num_input_tokens_seen": 231905945, - "step": 10745, - "time_per_iteration": 2.7924861907958984 - }, - { - "auxiliary_loss_clip": 0.01101668, - "auxiliary_loss_mlp": 0.01034902, - "balance_loss_clip": 1.03878927, - "balance_loss_mlp": 1.02122271, - "epoch": 0.6460844731699985, - "flos": 19244026085760.0, - "grad_norm": 2.3841357931880536, - "language_loss": 0.73933601, - "learning_rate": 1.1760680192176597e-06, - "loss": 0.76070166, - "num_input_tokens_seen": 231922535, - "step": 10746, - "time_per_iteration": 2.607113838195801 - }, - { - "auxiliary_loss_clip": 0.01106683, - "auxiliary_loss_mlp": 0.01035848, - "balance_loss_clip": 1.04162467, - "balance_loss_mlp": 1.02289009, - "epoch": 0.6461445964226664, - "flos": 27453348668160.0, - "grad_norm": 1.3562492191561222, - "language_loss": 0.66809833, - "learning_rate": 1.175713157660413e-06, - "loss": 0.6895237, - "num_input_tokens_seen": 231944800, - "step": 10747, - "time_per_iteration": 2.7339725494384766 - }, - { - "auxiliary_loss_clip": 0.01082798, - "auxiliary_loss_mlp": 0.0104212, - "balance_loss_clip": 1.03962016, - "balance_loss_mlp": 1.02953124, - "epoch": 0.6462047196753344, - "flos": 20295489934080.0, - "grad_norm": 1.7696623956762259, - "language_loss": 0.67370367, - "learning_rate": 1.1753583273602056e-06, - "loss": 0.69495285, - "num_input_tokens_seen": 231962970, - "step": 10748, - "time_per_iteration": 2.733555555343628 - }, - { - "auxiliary_loss_clip": 0.01117812, - "auxiliary_loss_mlp": 0.01044313, - "balance_loss_clip": 1.04119956, - "balance_loss_mlp": 1.03015089, - "epoch": 0.6462648429280025, - "flos": 22018340845440.0, - "grad_norm": 1.9035207458082712, - "language_loss": 0.75889313, - "learning_rate": 1.1750035283304937e-06, - "loss": 0.78051442, - "num_input_tokens_seen": 231981195, - "step": 10749, - "time_per_iteration": 2.6402747631073 - }, - { - "auxiliary_loss_clip": 0.01075833, - "auxiliary_loss_mlp": 0.01041632, - "balance_loss_clip": 1.03445184, - "balance_loss_mlp": 1.02752352, - "epoch": 0.6463249661806704, - "flos": 27781141207680.0, - "grad_norm": 1.5147294862876182, - "language_loss": 0.77007931, - "learning_rate": 1.17464876058473e-06, - "loss": 0.79125392, - "num_input_tokens_seen": 232001735, - "step": 10750, - "time_per_iteration": 2.7375411987304688 - }, - { - "auxiliary_loss_clip": 0.01097872, - "auxiliary_loss_mlp": 0.01038153, - "balance_loss_clip": 1.03953791, - "balance_loss_mlp": 1.02282298, - "epoch": 0.6463850894333384, - "flos": 22050588280320.0, - "grad_norm": 2.1693323351013496, - "language_loss": 0.68254787, - "learning_rate": 1.1742940241363683e-06, - "loss": 0.70390815, - "num_input_tokens_seen": 232019830, - "step": 10751, - "time_per_iteration": 2.757457733154297 - }, - { - "auxiliary_loss_clip": 0.01088079, - "auxiliary_loss_mlp": 0.01032447, - "balance_loss_clip": 1.03963614, - "balance_loss_mlp": 1.0185945, - "epoch": 0.6464452126860063, - "flos": 21106245767040.0, - "grad_norm": 1.9208554879181607, - "language_loss": 0.71538639, - "learning_rate": 1.1739393189988604e-06, - "loss": 0.73659164, - "num_input_tokens_seen": 232039625, - "step": 10752, - "time_per_iteration": 2.702068328857422 - }, - { - "auxiliary_loss_clip": 0.0108316, - "auxiliary_loss_mlp": 0.0104047, - "balance_loss_clip": 1.03569722, - "balance_loss_mlp": 1.02468061, - "epoch": 0.6465053359386743, - "flos": 16028045694720.0, - "grad_norm": 1.6304463713193273, - "language_loss": 0.78174138, - "learning_rate": 1.1735846451856554e-06, - "loss": 0.80297774, - "num_input_tokens_seen": 232055855, - "step": 10753, - "time_per_iteration": 2.679288387298584 - }, - { - "auxiliary_loss_clip": 0.01114663, - "auxiliary_loss_mlp": 0.01041928, - "balance_loss_clip": 1.04108715, - "balance_loss_mlp": 1.02888012, - "epoch": 0.6465654591913422, - "flos": 23398674641280.0, - "grad_norm": 1.8389919923642137, - "language_loss": 0.85325253, - "learning_rate": 1.1732300027102041e-06, - "loss": 0.87481844, - "num_input_tokens_seen": 232073475, - "step": 10754, - "time_per_iteration": 2.7047979831695557 - }, - { - "auxiliary_loss_clip": 0.01089928, - "auxiliary_loss_mlp": 0.01033615, - "balance_loss_clip": 1.0371294, - "balance_loss_mlp": 1.02018571, - "epoch": 0.6466255824440102, - "flos": 15377273038080.0, - "grad_norm": 2.0086067487297203, - "language_loss": 0.596542, - "learning_rate": 1.1728753915859541e-06, - "loss": 0.61777741, - "num_input_tokens_seen": 232091090, - "step": 10755, - "time_per_iteration": 2.660458564758301 - }, - { - "auxiliary_loss_clip": 0.01070404, - "auxiliary_loss_mlp": 0.01034574, - "balance_loss_clip": 1.03757024, - "balance_loss_mlp": 1.02103186, - "epoch": 0.6466857056966782, - "flos": 16252846963200.0, - "grad_norm": 2.348911212047805, - "language_loss": 0.68158704, - "learning_rate": 1.1725208118263518e-06, - "loss": 0.70263684, - "num_input_tokens_seen": 232107320, - "step": 10756, - "time_per_iteration": 2.667661190032959 - }, - { - "auxiliary_loss_clip": 0.0107653, - "auxiliary_loss_mlp": 0.01039991, - "balance_loss_clip": 1.03933072, - "balance_loss_mlp": 1.02511406, - "epoch": 0.6467458289493462, - "flos": 21178246579200.0, - "grad_norm": 2.3037886815422772, - "language_loss": 0.74333578, - "learning_rate": 1.172166263444844e-06, - "loss": 0.76450104, - "num_input_tokens_seen": 232123930, - "step": 10757, - "time_per_iteration": 2.752260446548462 - }, - { - "auxiliary_loss_clip": 0.01064083, - "auxiliary_loss_mlp": 0.01037594, - "balance_loss_clip": 1.0400213, - "balance_loss_mlp": 1.02434397, - "epoch": 0.6468059522020141, - "flos": 17968299672960.0, - "grad_norm": 1.4896032445983383, - "language_loss": 0.74085969, - "learning_rate": 1.1718117464548734e-06, - "loss": 0.76187646, - "num_input_tokens_seen": 232142905, - "step": 10758, - "time_per_iteration": 2.752277135848999 - }, - { - "auxiliary_loss_clip": 0.01078484, - "auxiliary_loss_mlp": 0.0103444, - "balance_loss_clip": 1.04134357, - "balance_loss_mlp": 1.02081478, - "epoch": 0.6468660754546821, - "flos": 17890157635200.0, - "grad_norm": 1.5569302711566517, - "language_loss": 0.67830229, - "learning_rate": 1.1714572608698845e-06, - "loss": 0.69943154, - "num_input_tokens_seen": 232162230, - "step": 10759, - "time_per_iteration": 2.6961419582366943 - }, - { - "auxiliary_loss_clip": 0.01078582, - "auxiliary_loss_mlp": 0.01038565, - "balance_loss_clip": 1.03437579, - "balance_loss_mlp": 1.02430177, - "epoch": 0.64692619870735, - "flos": 22600991358720.0, - "grad_norm": 1.7675629477863553, - "language_loss": 0.75511646, - "learning_rate": 1.1711028067033197e-06, - "loss": 0.77628791, - "num_input_tokens_seen": 232182700, - "step": 10760, - "time_per_iteration": 2.7628531455993652 - }, - { - "auxiliary_loss_clip": 0.01088869, - "auxiliary_loss_mlp": 0.01035724, - "balance_loss_clip": 1.03735101, - "balance_loss_mlp": 1.02188993, - "epoch": 0.646986321960018, - "flos": 49600786993920.0, - "grad_norm": 1.635479063212096, - "language_loss": 0.65361971, - "learning_rate": 1.1707483839686194e-06, - "loss": 0.6748656, - "num_input_tokens_seen": 232208235, - "step": 10761, - "time_per_iteration": 2.939115047454834 - }, - { - "auxiliary_loss_clip": 0.01069611, - "auxiliary_loss_mlp": 0.01035372, - "balance_loss_clip": 1.03998923, - "balance_loss_mlp": 1.02115035, - "epoch": 0.6470464452126861, - "flos": 21908454163200.0, - "grad_norm": 2.1978879485100014, - "language_loss": 0.6946497, - "learning_rate": 1.1703939926792235e-06, - "loss": 0.71569955, - "num_input_tokens_seen": 232228720, - "step": 10762, - "time_per_iteration": 4.4654014110565186 - }, - { - "auxiliary_loss_clip": 0.01117949, - "auxiliary_loss_mlp": 0.01037436, - "balance_loss_clip": 1.04075444, - "balance_loss_mlp": 1.02360213, - "epoch": 0.647106568465354, - "flos": 18106124158080.0, - "grad_norm": 1.972655429723057, - "language_loss": 0.82998466, - "learning_rate": 1.1700396328485705e-06, - "loss": 0.85153854, - "num_input_tokens_seen": 232244655, - "step": 10763, - "time_per_iteration": 2.592090129852295 - }, - { - "auxiliary_loss_clip": 0.0103456, - "auxiliary_loss_mlp": 0.01005031, - "balance_loss_clip": 1.01049972, - "balance_loss_mlp": 1.00385058, - "epoch": 0.647166691718022, - "flos": 69480038125440.0, - "grad_norm": 0.712357320853497, - "language_loss": 0.57828617, - "learning_rate": 1.1696853044900978e-06, - "loss": 0.59868205, - "num_input_tokens_seen": 232308685, - "step": 10764, - "time_per_iteration": 3.3077809810638428 - }, - { - "auxiliary_loss_clip": 0.01077866, - "auxiliary_loss_mlp": 0.01033689, - "balance_loss_clip": 1.03704214, - "balance_loss_mlp": 1.02015924, - "epoch": 0.6472268149706899, - "flos": 34095170661120.0, - "grad_norm": 2.021573071850794, - "language_loss": 0.6068002, - "learning_rate": 1.1693310076172413e-06, - "loss": 0.62791574, - "num_input_tokens_seen": 232327520, - "step": 10765, - "time_per_iteration": 2.940326690673828 - }, - { - "auxiliary_loss_clip": 0.01113775, - "auxiliary_loss_mlp": 0.01033181, - "balance_loss_clip": 1.04050612, - "balance_loss_mlp": 1.02059865, - "epoch": 0.6472869382233579, - "flos": 28111232217600.0, - "grad_norm": 1.7427036976648405, - "language_loss": 0.62848121, - "learning_rate": 1.168976742243437e-06, - "loss": 0.64995074, - "num_input_tokens_seen": 232349025, - "step": 10766, - "time_per_iteration": 5.861475229263306 - }, - { - "auxiliary_loss_clip": 0.01090186, - "auxiliary_loss_mlp": 0.01036411, - "balance_loss_clip": 1.04002905, - "balance_loss_mlp": 1.02172494, - "epoch": 0.6473470614760258, - "flos": 22492146170880.0, - "grad_norm": 2.0617673547917255, - "language_loss": 0.75767088, - "learning_rate": 1.1686225083821174e-06, - "loss": 0.77893686, - "num_input_tokens_seen": 232367835, - "step": 10767, - "time_per_iteration": 2.7045323848724365 - }, - { - "auxiliary_loss_clip": 0.01096864, - "auxiliary_loss_mlp": 0.01033099, - "balance_loss_clip": 1.03984213, - "balance_loss_mlp": 1.02028418, - "epoch": 0.6474071847286939, - "flos": 14538938538240.0, - "grad_norm": 1.9988107632557572, - "language_loss": 0.78334147, - "learning_rate": 1.1682683060467153e-06, - "loss": 0.80464113, - "num_input_tokens_seen": 232385840, - "step": 10768, - "time_per_iteration": 2.603180170059204 - }, - { - "auxiliary_loss_clip": 0.01056997, - "auxiliary_loss_mlp": 0.01034297, - "balance_loss_clip": 1.03838003, - "balance_loss_mlp": 1.02096355, - "epoch": 0.6474673079813618, - "flos": 24098214988800.0, - "grad_norm": 1.607650242718932, - "language_loss": 0.71857584, - "learning_rate": 1.167914135250663e-06, - "loss": 0.73948884, - "num_input_tokens_seen": 232406205, - "step": 10769, - "time_per_iteration": 2.7530863285064697 - }, - { - "auxiliary_loss_clip": 0.01113406, - "auxiliary_loss_mlp": 0.0103478, - "balance_loss_clip": 1.04209769, - "balance_loss_mlp": 1.02214372, - "epoch": 0.6475274312340298, - "flos": 14976186796800.0, - "grad_norm": 1.9573022312706896, - "language_loss": 0.71980953, - "learning_rate": 1.1675599960073895e-06, - "loss": 0.74129134, - "num_input_tokens_seen": 232424995, - "step": 10770, - "time_per_iteration": 4.22503137588501 - }, - { - "auxiliary_loss_clip": 0.01073177, - "auxiliary_loss_mlp": 0.01031965, - "balance_loss_clip": 1.03501081, - "balance_loss_mlp": 1.01759458, - "epoch": 0.6475875544866977, - "flos": 25045322849280.0, - "grad_norm": 1.5542236081497367, - "language_loss": 0.73281699, - "learning_rate": 1.167205888330325e-06, - "loss": 0.75386834, - "num_input_tokens_seen": 232445870, - "step": 10771, - "time_per_iteration": 2.841069459915161 - }, - { - "auxiliary_loss_clip": 0.01074703, - "auxiliary_loss_mlp": 0.0103808, - "balance_loss_clip": 1.03516805, - "balance_loss_mlp": 1.02413297, - "epoch": 0.6476476777393657, - "flos": 16472153450880.0, - "grad_norm": 1.9087232907246778, - "language_loss": 0.74044871, - "learning_rate": 1.1668518122328958e-06, - "loss": 0.76157653, - "num_input_tokens_seen": 232464285, - "step": 10772, - "time_per_iteration": 2.775754690170288 - }, - { - "auxiliary_loss_clip": 0.01088465, - "auxiliary_loss_mlp": 0.01031281, - "balance_loss_clip": 1.03951991, - "balance_loss_mlp": 1.01950288, - "epoch": 0.6477078009920336, - "flos": 25812267068160.0, - "grad_norm": 1.563820733818388, - "language_loss": 0.8277418, - "learning_rate": 1.1664977677285305e-06, - "loss": 0.84893924, - "num_input_tokens_seen": 232485815, - "step": 10773, - "time_per_iteration": 2.7739098072052 - }, - { - "auxiliary_loss_clip": 0.01100228, - "auxiliary_loss_mlp": 0.00769385, - "balance_loss_clip": 1.03956735, - "balance_loss_mlp": 1.00008345, - "epoch": 0.6477679242447016, - "flos": 17676130446720.0, - "grad_norm": 1.451687382466444, - "language_loss": 0.78496003, - "learning_rate": 1.1661437548306524e-06, - "loss": 0.80365622, - "num_input_tokens_seen": 232504875, - "step": 10774, - "time_per_iteration": 2.7035605907440186 - }, - { - "auxiliary_loss_clip": 0.01104625, - "auxiliary_loss_mlp": 0.01040629, - "balance_loss_clip": 1.04012299, - "balance_loss_mlp": 1.02751637, - "epoch": 0.6478280474973696, - "flos": 21032305620480.0, - "grad_norm": 2.3182968489247986, - "language_loss": 0.68886763, - "learning_rate": 1.1657897735526867e-06, - "loss": 0.71032017, - "num_input_tokens_seen": 232521945, - "step": 10775, - "time_per_iteration": 2.7283878326416016 - }, - { - "auxiliary_loss_clip": 0.01078255, - "auxiliary_loss_mlp": 0.0104184, - "balance_loss_clip": 1.03620017, - "balance_loss_mlp": 1.02827358, - "epoch": 0.6478881707500376, - "flos": 21616931381760.0, - "grad_norm": 1.867125007130101, - "language_loss": 0.65918481, - "learning_rate": 1.1654358239080574e-06, - "loss": 0.68038571, - "num_input_tokens_seen": 232541500, - "step": 10776, - "time_per_iteration": 2.792161226272583 - }, - { - "auxiliary_loss_clip": 0.01086281, - "auxiliary_loss_mlp": 0.01040573, - "balance_loss_clip": 1.03693199, - "balance_loss_mlp": 1.0267868, - "epoch": 0.6479482940027056, - "flos": 18442571875200.0, - "grad_norm": 2.7363901491618297, - "language_loss": 0.7900703, - "learning_rate": 1.1650819059101839e-06, - "loss": 0.81133884, - "num_input_tokens_seen": 232559720, - "step": 10777, - "time_per_iteration": 2.6817147731781006 - }, - { - "auxiliary_loss_clip": 0.01101857, - "auxiliary_loss_mlp": 0.01033898, - "balance_loss_clip": 1.04061663, - "balance_loss_mlp": 1.0203439, - "epoch": 0.6480084172553735, - "flos": 22164066322560.0, - "grad_norm": 2.418675876930909, - "language_loss": 0.73090535, - "learning_rate": 1.1647280195724896e-06, - "loss": 0.75226295, - "num_input_tokens_seen": 232579370, - "step": 10778, - "time_per_iteration": 2.7519023418426514 - }, - { - "auxiliary_loss_clip": 0.01098704, - "auxiliary_loss_mlp": 0.01030563, - "balance_loss_clip": 1.03796005, - "balance_loss_mlp": 1.01817703, - "epoch": 0.6480685405080415, - "flos": 24316228586880.0, - "grad_norm": 1.4697687567373847, - "language_loss": 0.78067875, - "learning_rate": 1.1643741649083923e-06, - "loss": 0.80197144, - "num_input_tokens_seen": 232600495, - "step": 10779, - "time_per_iteration": 2.667295455932617 - }, - { - "auxiliary_loss_clip": 0.01021608, - "auxiliary_loss_mlp": 0.01004834, - "balance_loss_clip": 1.00979376, - "balance_loss_mlp": 1.00352228, - "epoch": 0.6481286637607094, - "flos": 59891207760000.0, - "grad_norm": 0.722667977363254, - "language_loss": 0.59406435, - "learning_rate": 1.1640203419313095e-06, - "loss": 0.61432874, - "num_input_tokens_seen": 232663165, - "step": 10780, - "time_per_iteration": 3.146688461303711 - }, - { - "auxiliary_loss_clip": 0.01013668, - "auxiliary_loss_mlp": 0.01032512, - "balance_loss_clip": 1.03276062, - "balance_loss_mlp": 1.02043653, - "epoch": 0.6481887870133775, - "flos": 25484187219840.0, - "grad_norm": 1.9346405521822077, - "language_loss": 0.79079604, - "learning_rate": 1.1636665506546599e-06, - "loss": 0.81125784, - "num_input_tokens_seen": 232683385, - "step": 10781, - "time_per_iteration": 3.1543314456939697 - }, - { - "auxiliary_loss_clip": 0.01117668, - "auxiliary_loss_mlp": 0.01036085, - "balance_loss_clip": 1.04143655, - "balance_loss_mlp": 1.02158904, - "epoch": 0.6482489102660454, - "flos": 19930206574080.0, - "grad_norm": 2.567868177502946, - "language_loss": 0.79041505, - "learning_rate": 1.1633127910918578e-06, - "loss": 0.81195259, - "num_input_tokens_seen": 232699095, - "step": 10782, - "time_per_iteration": 2.8998003005981445 - }, - { - "auxiliary_loss_clip": 0.01106141, - "auxiliary_loss_mlp": 0.007711, - "balance_loss_clip": 1.04090714, - "balance_loss_mlp": 1.0001415, - "epoch": 0.6483090335187134, - "flos": 26979471515520.0, - "grad_norm": 2.672580630052252, - "language_loss": 0.64563107, - "learning_rate": 1.1629590632563187e-06, - "loss": 0.66440344, - "num_input_tokens_seen": 232717920, - "step": 10783, - "time_per_iteration": 2.807725191116333 - }, - { - "auxiliary_loss_clip": 0.01119847, - "auxiliary_loss_mlp": 0.01038004, - "balance_loss_clip": 1.04234159, - "balance_loss_mlp": 1.02316856, - "epoch": 0.6483691567713813, - "flos": 25077965333760.0, - "grad_norm": 1.6110368507909019, - "language_loss": 0.88390124, - "learning_rate": 1.1626053671614561e-06, - "loss": 0.90547979, - "num_input_tokens_seen": 232737605, - "step": 10784, - "time_per_iteration": 2.640153169631958 - }, - { - "auxiliary_loss_clip": 0.01089797, - "auxiliary_loss_mlp": 0.01033124, - "balance_loss_clip": 1.03887093, - "balance_loss_mlp": 1.02020776, - "epoch": 0.6484292800240493, - "flos": 16105972250880.0, - "grad_norm": 2.090784466794914, - "language_loss": 0.72988814, - "learning_rate": 1.1622517028206815e-06, - "loss": 0.75111735, - "num_input_tokens_seen": 232755110, - "step": 10785, - "time_per_iteration": 2.6515488624572754 - }, - { - "auxiliary_loss_clip": 0.01078138, - "auxiliary_loss_mlp": 0.0103076, - "balance_loss_clip": 1.03758073, - "balance_loss_mlp": 1.01802194, - "epoch": 0.6484894032767172, - "flos": 28840398307200.0, - "grad_norm": 1.5672388778764104, - "language_loss": 0.69397259, - "learning_rate": 1.1618980702474071e-06, - "loss": 0.71506155, - "num_input_tokens_seen": 232779040, - "step": 10786, - "time_per_iteration": 2.831984519958496 - }, - { - "auxiliary_loss_clip": 0.01075224, - "auxiliary_loss_mlp": 0.01031746, - "balance_loss_clip": 1.03817129, - "balance_loss_mlp": 1.01922286, - "epoch": 0.6485495265293852, - "flos": 30227052896640.0, - "grad_norm": 2.0612082804403404, - "language_loss": 0.71243078, - "learning_rate": 1.161544469455041e-06, - "loss": 0.73350048, - "num_input_tokens_seen": 232800515, - "step": 10787, - "time_per_iteration": 2.793691635131836 - }, - { - "auxiliary_loss_clip": 0.0111836, - "auxiliary_loss_mlp": 0.01035294, - "balance_loss_clip": 1.0412823, - "balance_loss_mlp": 1.0220623, - "epoch": 0.6486096497820532, - "flos": 20082181017600.0, - "grad_norm": 1.9333037316798733, - "language_loss": 0.84715712, - "learning_rate": 1.1611909004569934e-06, - "loss": 0.86869359, - "num_input_tokens_seen": 232818450, - "step": 10788, - "time_per_iteration": 2.606229543685913 - }, - { - "auxiliary_loss_clip": 0.01078244, - "auxiliary_loss_mlp": 0.01034763, - "balance_loss_clip": 1.04034448, - "balance_loss_mlp": 1.02126873, - "epoch": 0.6486697730347212, - "flos": 17129067333120.0, - "grad_norm": 2.006310721450953, - "language_loss": 0.7757296, - "learning_rate": 1.1608373632666708e-06, - "loss": 0.79685968, - "num_input_tokens_seen": 232834785, - "step": 10789, - "time_per_iteration": 2.689147710800171 - }, - { - "auxiliary_loss_clip": 0.01096496, - "auxiliary_loss_mlp": 0.01031686, - "balance_loss_clip": 1.03580093, - "balance_loss_mlp": 1.01941395, - "epoch": 0.6487298962873892, - "flos": 38911940570880.0, - "grad_norm": 1.6467685685264215, - "language_loss": 0.75511503, - "learning_rate": 1.160483857897479e-06, - "loss": 0.77639687, - "num_input_tokens_seen": 232856050, - "step": 10790, - "time_per_iteration": 2.8264946937561035 - }, - { - "auxiliary_loss_clip": 0.01113527, - "auxiliary_loss_mlp": 0.01036831, - "balance_loss_clip": 1.04156542, - "balance_loss_mlp": 1.02490366, - "epoch": 0.6487900195400571, - "flos": 11947840076160.0, - "grad_norm": 2.307183406251666, - "language_loss": 0.60332596, - "learning_rate": 1.160130384362823e-06, - "loss": 0.62482953, - "num_input_tokens_seen": 232873945, - "step": 10791, - "time_per_iteration": 2.5990047454833984 - }, - { - "auxiliary_loss_clip": 0.01076606, - "auxiliary_loss_mlp": 0.01034239, - "balance_loss_clip": 1.03773832, - "balance_loss_mlp": 1.0215373, - "epoch": 0.6488501427927251, - "flos": 22344445445760.0, - "grad_norm": 1.759760291391278, - "language_loss": 0.86496675, - "learning_rate": 1.1597769426761082e-06, - "loss": 0.88607526, - "num_input_tokens_seen": 232892160, - "step": 10792, - "time_per_iteration": 2.771683692932129 - }, - { - "auxiliary_loss_clip": 0.01093434, - "auxiliary_loss_mlp": 0.01039713, - "balance_loss_clip": 1.03958428, - "balance_loss_mlp": 1.02602792, - "epoch": 0.648910266045393, - "flos": 22236282616320.0, - "grad_norm": 2.0358486422598445, - "language_loss": 0.78231007, - "learning_rate": 1.159423532850735e-06, - "loss": 0.8036415, - "num_input_tokens_seen": 232911725, - "step": 10793, - "time_per_iteration": 2.67922043800354 - }, - { - "auxiliary_loss_clip": 0.0108252, - "auxiliary_loss_mlp": 0.01032713, - "balance_loss_clip": 1.0395385, - "balance_loss_mlp": 1.0193671, - "epoch": 0.6489703892980611, - "flos": 25301258231040.0, - "grad_norm": 2.0060089316964667, - "language_loss": 0.75005889, - "learning_rate": 1.1590701549001055e-06, - "loss": 0.77121115, - "num_input_tokens_seen": 232929085, - "step": 10794, - "time_per_iteration": 2.740185022354126 - }, - { - "auxiliary_loss_clip": 0.01102066, - "auxiliary_loss_mlp": 0.00770842, - "balance_loss_clip": 1.03801179, - "balance_loss_mlp": 1.00016379, - "epoch": 0.649030512550729, - "flos": 24571912573440.0, - "grad_norm": 1.6388436552304226, - "language_loss": 0.70095515, - "learning_rate": 1.158716808837621e-06, - "loss": 0.71968424, - "num_input_tokens_seen": 232949455, - "step": 10795, - "time_per_iteration": 2.7056167125701904 - }, - { - "auxiliary_loss_clip": 0.01092893, - "auxiliary_loss_mlp": 0.01034868, - "balance_loss_clip": 1.03938341, - "balance_loss_mlp": 1.02145672, - "epoch": 0.649090635803397, - "flos": 26244702904320.0, - "grad_norm": 1.931230622678825, - "language_loss": 0.54384381, - "learning_rate": 1.158363494676679e-06, - "loss": 0.56512141, - "num_input_tokens_seen": 232969445, - "step": 10796, - "time_per_iteration": 2.70178484916687 - }, - { - "auxiliary_loss_clip": 0.0110304, - "auxiliary_loss_mlp": 0.010382, - "balance_loss_clip": 1.04058564, - "balance_loss_mlp": 1.02635705, - "epoch": 0.6491507590560649, - "flos": 24937375501440.0, - "grad_norm": 1.521654875765255, - "language_loss": 0.77584833, - "learning_rate": 1.1580102124306775e-06, - "loss": 0.7972607, - "num_input_tokens_seen": 232988900, - "step": 10797, - "time_per_iteration": 2.740236759185791 - }, - { - "auxiliary_loss_clip": 0.010649, - "auxiliary_loss_mlp": 0.01033495, - "balance_loss_clip": 1.03765631, - "balance_loss_mlp": 1.02110291, - "epoch": 0.6492108823087329, - "flos": 19499781899520.0, - "grad_norm": 2.1886950551197835, - "language_loss": 0.7017765, - "learning_rate": 1.1576569621130134e-06, - "loss": 0.72276044, - "num_input_tokens_seen": 233005060, - "step": 10798, - "time_per_iteration": 2.7228379249572754 - }, - { - "auxiliary_loss_clip": 0.01059107, - "auxiliary_loss_mlp": 0.01032641, - "balance_loss_clip": 1.03400683, - "balance_loss_mlp": 1.02048159, - "epoch": 0.6492710055614008, - "flos": 19719303868800.0, - "grad_norm": 1.8018305819700693, - "language_loss": 0.76899987, - "learning_rate": 1.1573037437370811e-06, - "loss": 0.78991735, - "num_input_tokens_seen": 233023375, - "step": 10799, - "time_per_iteration": 2.7452025413513184 - }, - { - "auxiliary_loss_clip": 0.01102121, - "auxiliary_loss_mlp": 0.0103607, - "balance_loss_clip": 1.03952456, - "balance_loss_mlp": 1.02255809, - "epoch": 0.6493311288140688, - "flos": 24317018686080.0, - "grad_norm": 1.8690878603480447, - "language_loss": 0.71881801, - "learning_rate": 1.1569505573162755e-06, - "loss": 0.74019992, - "num_input_tokens_seen": 233043130, - "step": 10800, - "time_per_iteration": 2.681090831756592 - }, - { - "auxiliary_loss_clip": 0.01025406, - "auxiliary_loss_mlp": 0.01015193, - "balance_loss_clip": 1.01085913, - "balance_loss_mlp": 1.01379859, - "epoch": 0.6493912520667368, - "flos": 70934635290240.0, - "grad_norm": 0.7781340996665279, - "language_loss": 0.60211796, - "learning_rate": 1.1565974028639897e-06, - "loss": 0.62252396, - "num_input_tokens_seen": 233110560, - "step": 10801, - "time_per_iteration": 3.3247601985931396 - }, - { - "auxiliary_loss_clip": 0.01104473, - "auxiliary_loss_mlp": 0.01042076, - "balance_loss_clip": 1.04024768, - "balance_loss_mlp": 1.02764034, - "epoch": 0.6494513753194048, - "flos": 25337779384320.0, - "grad_norm": 2.523744267443401, - "language_loss": 0.78645104, - "learning_rate": 1.156244280393614e-06, - "loss": 0.80791658, - "num_input_tokens_seen": 233130080, - "step": 10802, - "time_per_iteration": 4.631081581115723 - }, - { - "auxiliary_loss_clip": 0.01114091, - "auxiliary_loss_mlp": 0.01039322, - "balance_loss_clip": 1.03890288, - "balance_loss_mlp": 1.02562487, - "epoch": 0.6495114985720728, - "flos": 24681978823680.0, - "grad_norm": 1.6103480042358926, - "language_loss": 0.74409741, - "learning_rate": 1.155891189918541e-06, - "loss": 0.76563156, - "num_input_tokens_seen": 233150235, - "step": 10803, - "time_per_iteration": 2.6966469287872314 - }, - { - "auxiliary_loss_clip": 0.01052817, - "auxiliary_loss_mlp": 0.01033534, - "balance_loss_clip": 1.03642201, - "balance_loss_mlp": 1.02049232, - "epoch": 0.6495716218247407, - "flos": 23651162317440.0, - "grad_norm": 2.357483246632454, - "language_loss": 0.70044661, - "learning_rate": 1.1555381314521578e-06, - "loss": 0.72131014, - "num_input_tokens_seen": 233166710, - "step": 10804, - "time_per_iteration": 2.8469581604003906 - }, - { - "auxiliary_loss_clip": 0.01100022, - "auxiliary_loss_mlp": 0.01031932, - "balance_loss_clip": 1.03885949, - "balance_loss_mlp": 1.01822269, - "epoch": 0.6496317450774087, - "flos": 22346169298560.0, - "grad_norm": 1.6424372411167527, - "language_loss": 0.72557664, - "learning_rate": 1.1551851050078537e-06, - "loss": 0.74689615, - "num_input_tokens_seen": 233185445, - "step": 10805, - "time_per_iteration": 4.559306621551514 - }, - { - "auxiliary_loss_clip": 0.01088097, - "auxiliary_loss_mlp": 0.01030943, - "balance_loss_clip": 1.03999364, - "balance_loss_mlp": 1.01886106, - "epoch": 0.6496918683300766, - "flos": 30518647505280.0, - "grad_norm": 2.1225421947180467, - "language_loss": 0.65710378, - "learning_rate": 1.1548321105990155e-06, - "loss": 0.67829412, - "num_input_tokens_seen": 233205805, - "step": 10806, - "time_per_iteration": 4.271615266799927 - }, - { - "auxiliary_loss_clip": 0.01093074, - "auxiliary_loss_mlp": 0.00771144, - "balance_loss_clip": 1.03741765, - "balance_loss_mlp": 1.00009441, - "epoch": 0.6497519915827447, - "flos": 12458992567680.0, - "grad_norm": 1.9214718172589236, - "language_loss": 0.78912604, - "learning_rate": 1.1544791482390275e-06, - "loss": 0.80776823, - "num_input_tokens_seen": 233224215, - "step": 10807, - "time_per_iteration": 2.7781808376312256 - }, - { - "auxiliary_loss_clip": 0.01014724, - "auxiliary_loss_mlp": 0.0100172, - "balance_loss_clip": 1.0100404, - "balance_loss_mlp": 1.00033116, - "epoch": 0.6498121148354126, - "flos": 69093748287360.0, - "grad_norm": 0.7866075869002591, - "language_loss": 0.58888513, - "learning_rate": 1.1541262179412745e-06, - "loss": 0.60904956, - "num_input_tokens_seen": 233294440, - "step": 10808, - "time_per_iteration": 3.3867762088775635 - }, - { - "auxiliary_loss_clip": 0.01091297, - "auxiliary_loss_mlp": 0.01027122, - "balance_loss_clip": 1.04009056, - "balance_loss_mlp": 1.01453352, - "epoch": 0.6498722380880806, - "flos": 36897135914880.0, - "grad_norm": 1.7443140014145102, - "language_loss": 0.63562334, - "learning_rate": 1.1537733197191415e-06, - "loss": 0.65680754, - "num_input_tokens_seen": 233316125, - "step": 10809, - "time_per_iteration": 4.545352220535278 - }, - { - "auxiliary_loss_clip": 0.01101385, - "auxiliary_loss_mlp": 0.00769706, - "balance_loss_clip": 1.04086709, - "balance_loss_mlp": 1.00011587, - "epoch": 0.6499323613407485, - "flos": 29017760688000.0, - "grad_norm": 1.6271930156290193, - "language_loss": 0.81576955, - "learning_rate": 1.153420453586008e-06, - "loss": 0.8344804, - "num_input_tokens_seen": 233336140, - "step": 10810, - "time_per_iteration": 2.6756200790405273 - }, - { - "auxiliary_loss_clip": 0.01071315, - "auxiliary_loss_mlp": 0.01036036, - "balance_loss_clip": 1.0380795, - "balance_loss_mlp": 1.02466989, - "epoch": 0.6499924845934165, - "flos": 20119240874880.0, - "grad_norm": 1.6231866882582067, - "language_loss": 0.72109252, - "learning_rate": 1.1530676195552561e-06, - "loss": 0.74216604, - "num_input_tokens_seen": 233356095, - "step": 10811, - "time_per_iteration": 2.6948235034942627 - }, - { - "auxiliary_loss_clip": 0.01053868, - "auxiliary_loss_mlp": 0.01028108, - "balance_loss_clip": 1.04128838, - "balance_loss_mlp": 1.01610339, - "epoch": 0.6500526078460844, - "flos": 24421338760320.0, - "grad_norm": 1.6351468205414483, - "language_loss": 0.77842551, - "learning_rate": 1.1527148176402649e-06, - "loss": 0.79924524, - "num_input_tokens_seen": 233376830, - "step": 10812, - "time_per_iteration": 2.8678853511810303 - }, - { - "auxiliary_loss_clip": 0.01098947, - "auxiliary_loss_mlp": 0.01036383, - "balance_loss_clip": 1.04008079, - "balance_loss_mlp": 1.02321005, - "epoch": 0.6501127310987524, - "flos": 23331019374720.0, - "grad_norm": 1.6938636909154852, - "language_loss": 0.85069716, - "learning_rate": 1.152362047854413e-06, - "loss": 0.8720504, - "num_input_tokens_seen": 233395275, - "step": 10813, - "time_per_iteration": 2.618603467941284 - }, - { - "auxiliary_loss_clip": 0.01071283, - "auxiliary_loss_mlp": 0.01035396, - "balance_loss_clip": 1.03572655, - "balance_loss_mlp": 1.02187157, - "epoch": 0.6501728543514204, - "flos": 18697824898560.0, - "grad_norm": 2.609145629781726, - "language_loss": 0.79691541, - "learning_rate": 1.1520093102110764e-06, - "loss": 0.8179822, - "num_input_tokens_seen": 233413345, - "step": 10814, - "time_per_iteration": 2.742004156112671 - }, - { - "auxiliary_loss_clip": 0.01064254, - "auxiliary_loss_mlp": 0.00773576, - "balance_loss_clip": 1.03794754, - "balance_loss_mlp": 1.00018024, - "epoch": 0.6502329776040884, - "flos": 44199858199680.0, - "grad_norm": 1.9285039571390825, - "language_loss": 0.65348196, - "learning_rate": 1.1516566047236328e-06, - "loss": 0.67186022, - "num_input_tokens_seen": 233436105, - "step": 10815, - "time_per_iteration": 2.967710256576538 - }, - { - "auxiliary_loss_clip": 0.01118333, - "auxiliary_loss_mlp": 0.01032665, - "balance_loss_clip": 1.04089963, - "balance_loss_mlp": 1.01759648, - "epoch": 0.6502931008567564, - "flos": 14574741419520.0, - "grad_norm": 1.878543508830568, - "language_loss": 0.75245708, - "learning_rate": 1.1513039314054546e-06, - "loss": 0.77396703, - "num_input_tokens_seen": 233452320, - "step": 10816, - "time_per_iteration": 2.619370698928833 - }, - { - "auxiliary_loss_clip": 0.01085538, - "auxiliary_loss_mlp": 0.01031729, - "balance_loss_clip": 1.03892541, - "balance_loss_mlp": 1.01897991, - "epoch": 0.6503532241094243, - "flos": 21395003201280.0, - "grad_norm": 1.8185101411846911, - "language_loss": 0.73227775, - "learning_rate": 1.1509512902699174e-06, - "loss": 0.75345039, - "num_input_tokens_seen": 233469920, - "step": 10817, - "time_per_iteration": 2.758009672164917 - }, - { - "auxiliary_loss_clip": 0.01071537, - "auxiliary_loss_mlp": 0.01046459, - "balance_loss_clip": 1.03518438, - "balance_loss_mlp": 1.03168857, - "epoch": 0.6504133473620923, - "flos": 74740840986240.0, - "grad_norm": 1.5063120441652318, - "language_loss": 0.72075009, - "learning_rate": 1.1505986813303916e-06, - "loss": 0.74193007, - "num_input_tokens_seen": 233499780, - "step": 10818, - "time_per_iteration": 3.143178701400757 - }, - { - "auxiliary_loss_clip": 0.01085148, - "auxiliary_loss_mlp": 0.01030336, - "balance_loss_clip": 1.03872418, - "balance_loss_mlp": 1.01738429, - "epoch": 0.6504734706147602, - "flos": 19713270384000.0, - "grad_norm": 2.002053752481776, - "language_loss": 0.65038371, - "learning_rate": 1.150246104600249e-06, - "loss": 0.67153859, - "num_input_tokens_seen": 233518235, - "step": 10819, - "time_per_iteration": 2.704205274581909 - }, - { - "auxiliary_loss_clip": 0.01077923, - "auxiliary_loss_mlp": 0.01031636, - "balance_loss_clip": 1.03569567, - "balance_loss_mlp": 1.01811743, - "epoch": 0.6505335938674283, - "flos": 25556870390400.0, - "grad_norm": 1.8302178372953948, - "language_loss": 0.83782417, - "learning_rate": 1.14989356009286e-06, - "loss": 0.85891974, - "num_input_tokens_seen": 233535215, - "step": 10820, - "time_per_iteration": 2.762343645095825 - }, - { - "auxiliary_loss_clip": 0.01106479, - "auxiliary_loss_mlp": 0.01030319, - "balance_loss_clip": 1.03934109, - "balance_loss_mlp": 1.01703274, - "epoch": 0.6505937171200962, - "flos": 17821424960640.0, - "grad_norm": 2.074138898013104, - "language_loss": 0.77881086, - "learning_rate": 1.1495410478215914e-06, - "loss": 0.80017889, - "num_input_tokens_seen": 233552775, - "step": 10821, - "time_per_iteration": 2.6239891052246094 - }, - { - "auxiliary_loss_clip": 0.01077516, - "auxiliary_loss_mlp": 0.01028396, - "balance_loss_clip": 1.03843164, - "balance_loss_mlp": 1.01721418, - "epoch": 0.6506538403727642, - "flos": 20668135582080.0, - "grad_norm": 1.4292101756111668, - "language_loss": 0.80072695, - "learning_rate": 1.1491885677998126e-06, - "loss": 0.82178605, - "num_input_tokens_seen": 233572080, - "step": 10822, - "time_per_iteration": 2.7913742065429688 - }, - { - "auxiliary_loss_clip": 0.01084959, - "auxiliary_loss_mlp": 0.01029649, - "balance_loss_clip": 1.04204702, - "balance_loss_mlp": 1.01634574, - "epoch": 0.6507139636254321, - "flos": 11721422695680.0, - "grad_norm": 2.216597297898186, - "language_loss": 0.8719157, - "learning_rate": 1.1488361200408883e-06, - "loss": 0.89306176, - "num_input_tokens_seen": 233589155, - "step": 10823, - "time_per_iteration": 2.7045187950134277 - }, - { - "auxiliary_loss_clip": 0.01114569, - "auxiliary_loss_mlp": 0.01031767, - "balance_loss_clip": 1.0398941, - "balance_loss_mlp": 1.01913643, - "epoch": 0.6507740868781001, - "flos": 26761745226240.0, - "grad_norm": 1.6940233286010407, - "language_loss": 0.66299087, - "learning_rate": 1.148483704558183e-06, - "loss": 0.6844542, - "num_input_tokens_seen": 233608180, - "step": 10824, - "time_per_iteration": 2.609870433807373 - }, - { - "auxiliary_loss_clip": 0.01096015, - "auxiliary_loss_mlp": 0.01031659, - "balance_loss_clip": 1.04038215, - "balance_loss_mlp": 1.01846242, - "epoch": 0.650834210130768, - "flos": 16471722487680.0, - "grad_norm": 5.416027486189251, - "language_loss": 0.87431592, - "learning_rate": 1.1481313213650607e-06, - "loss": 0.89559269, - "num_input_tokens_seen": 233625750, - "step": 10825, - "time_per_iteration": 2.649099588394165 - }, - { - "auxiliary_loss_clip": 0.01092468, - "auxiliary_loss_mlp": 0.01028826, - "balance_loss_clip": 1.03650379, - "balance_loss_mlp": 1.01514649, - "epoch": 0.650894333383436, - "flos": 17128672283520.0, - "grad_norm": 2.103621809336841, - "language_loss": 0.73180604, - "learning_rate": 1.147778970474885e-06, - "loss": 0.75301898, - "num_input_tokens_seen": 233644235, - "step": 10826, - "time_per_iteration": 2.6394810676574707 - }, - { - "auxiliary_loss_clip": 0.01104739, - "auxiliary_loss_mlp": 0.01027729, - "balance_loss_clip": 1.04116881, - "balance_loss_mlp": 1.01562333, - "epoch": 0.650954456636104, - "flos": 18734238311040.0, - "grad_norm": 1.7744084173415924, - "language_loss": 0.68743241, - "learning_rate": 1.1474266519010157e-06, - "loss": 0.70875704, - "num_input_tokens_seen": 233662845, - "step": 10827, - "time_per_iteration": 2.5662622451782227 - }, - { - "auxiliary_loss_clip": 0.01089545, - "auxiliary_loss_mlp": 0.01031977, - "balance_loss_clip": 1.03715336, - "balance_loss_mlp": 1.02000248, - "epoch": 0.651014579888772, - "flos": 24528244613760.0, - "grad_norm": 1.7280110593006797, - "language_loss": 0.76715839, - "learning_rate": 1.1470743656568136e-06, - "loss": 0.78837359, - "num_input_tokens_seen": 233681990, - "step": 10828, - "time_per_iteration": 2.6430130004882812 - }, - { - "auxiliary_loss_clip": 0.01101657, - "auxiliary_loss_mlp": 0.0102849, - "balance_loss_clip": 1.0396359, - "balance_loss_mlp": 1.01659322, - "epoch": 0.65107470314144, - "flos": 24061083304320.0, - "grad_norm": 2.028448280689147, - "language_loss": 0.89382976, - "learning_rate": 1.1467221117556362e-06, - "loss": 0.91513121, - "num_input_tokens_seen": 233698930, - "step": 10829, - "time_per_iteration": 2.676887273788452 - }, - { - "auxiliary_loss_clip": 0.01033575, - "auxiliary_loss_mlp": 0.01003174, - "balance_loss_clip": 1.00994611, - "balance_loss_mlp": 1.00192249, - "epoch": 0.6511348263941079, - "flos": 72480734352000.0, - "grad_norm": 0.6385058987930536, - "language_loss": 0.55351257, - "learning_rate": 1.1463698902108428e-06, - "loss": 0.57388008, - "num_input_tokens_seen": 233769825, - "step": 10830, - "time_per_iteration": 3.283604383468628 - }, - { - "auxiliary_loss_clip": 0.01080445, - "auxiliary_loss_mlp": 0.01033977, - "balance_loss_clip": 1.03753436, - "balance_loss_mlp": 1.02031004, - "epoch": 0.6511949496467759, - "flos": 23367684182400.0, - "grad_norm": 2.2496423265989263, - "language_loss": 0.74632305, - "learning_rate": 1.1460177010357878e-06, - "loss": 0.76746726, - "num_input_tokens_seen": 233787095, - "step": 10831, - "time_per_iteration": 2.6958060264587402 - }, - { - "auxiliary_loss_clip": 0.01016148, - "auxiliary_loss_mlp": 0.01001305, - "balance_loss_clip": 1.01118171, - "balance_loss_mlp": 0.99989206, - "epoch": 0.6512550728994438, - "flos": 67333191073920.0, - "grad_norm": 0.6457874133081085, - "language_loss": 0.50977135, - "learning_rate": 1.145665544243828e-06, - "loss": 0.52994585, - "num_input_tokens_seen": 233853050, - "step": 10832, - "time_per_iteration": 3.3019638061523438 - }, - { - "auxiliary_loss_clip": 0.01094456, - "auxiliary_loss_mlp": 0.0103476, - "balance_loss_clip": 1.03838396, - "balance_loss_mlp": 1.02121806, - "epoch": 0.6513151961521119, - "flos": 21141689512320.0, - "grad_norm": 2.261964071454772, - "language_loss": 0.83006239, - "learning_rate": 1.145313419848316e-06, - "loss": 0.85135454, - "num_input_tokens_seen": 233871385, - "step": 10833, - "time_per_iteration": 2.643763542175293 - }, - { - "auxiliary_loss_clip": 0.01096358, - "auxiliary_loss_mlp": 0.01034172, - "balance_loss_clip": 1.04303241, - "balance_loss_mlp": 1.02144599, - "epoch": 0.6513753194047798, - "flos": 15158828476800.0, - "grad_norm": 2.0262015833742937, - "language_loss": 0.83040363, - "learning_rate": 1.1449613278626049e-06, - "loss": 0.85170895, - "num_input_tokens_seen": 233888175, - "step": 10834, - "time_per_iteration": 2.696136713027954 - }, - { - "auxiliary_loss_clip": 0.01102331, - "auxiliary_loss_mlp": 0.01040155, - "balance_loss_clip": 1.039487, - "balance_loss_mlp": 1.02702951, - "epoch": 0.6514354426574478, - "flos": 30226621933440.0, - "grad_norm": 1.5116925476060534, - "language_loss": 0.7712391, - "learning_rate": 1.1446092683000455e-06, - "loss": 0.79266393, - "num_input_tokens_seen": 233911470, - "step": 10835, - "time_per_iteration": 2.733752965927124 - }, - { - "auxiliary_loss_clip": 0.01087811, - "auxiliary_loss_mlp": 0.01038329, - "balance_loss_clip": 1.0393815, - "balance_loss_mlp": 1.02551985, - "epoch": 0.6514955659101157, - "flos": 24205587719040.0, - "grad_norm": 1.603053369126082, - "language_loss": 0.77712744, - "learning_rate": 1.1442572411739882e-06, - "loss": 0.79838884, - "num_input_tokens_seen": 233932135, - "step": 10836, - "time_per_iteration": 2.7181618213653564 - }, - { - "auxiliary_loss_clip": 0.01076915, - "auxiliary_loss_mlp": 0.01034338, - "balance_loss_clip": 1.037691, - "balance_loss_mlp": 1.02143383, - "epoch": 0.6515556891627837, - "flos": 12377761960320.0, - "grad_norm": 2.035005351868823, - "language_loss": 0.82812917, - "learning_rate": 1.143905246497783e-06, - "loss": 0.84924167, - "num_input_tokens_seen": 233947880, - "step": 10837, - "time_per_iteration": 2.6514079570770264 - }, - { - "auxiliary_loss_clip": 0.01073313, - "auxiliary_loss_mlp": 0.01035471, - "balance_loss_clip": 1.03897333, - "balance_loss_mlp": 1.0211482, - "epoch": 0.6516158124154516, - "flos": 49601217957120.0, - "grad_norm": 1.8965490798746285, - "language_loss": 0.5910452, - "learning_rate": 1.1435532842847758e-06, - "loss": 0.61213303, - "num_input_tokens_seen": 233971475, - "step": 10838, - "time_per_iteration": 2.955751419067383 - }, - { - "auxiliary_loss_clip": 0.01033147, - "auxiliary_loss_mlp": 0.01008878, - "balance_loss_clip": 1.0095979, - "balance_loss_mlp": 1.00770998, - "epoch": 0.6516759356681197, - "flos": 59702748076800.0, - "grad_norm": 0.7683915325325666, - "language_loss": 0.60835862, - "learning_rate": 1.1432013545483147e-06, - "loss": 0.62877893, - "num_input_tokens_seen": 234030690, - "step": 10839, - "time_per_iteration": 3.200835943222046 - }, - { - "auxiliary_loss_clip": 0.0109233, - "auxiliary_loss_mlp": 0.0103157, - "balance_loss_clip": 1.04093075, - "balance_loss_mlp": 1.01998901, - "epoch": 0.6517360589207876, - "flos": 37450807130880.0, - "grad_norm": 1.7743025760939068, - "language_loss": 0.67926049, - "learning_rate": 1.1428494573017439e-06, - "loss": 0.70049942, - "num_input_tokens_seen": 234052470, - "step": 10840, - "time_per_iteration": 2.8348867893218994 - }, - { - "auxiliary_loss_clip": 0.01067745, - "auxiliary_loss_mlp": 0.01034413, - "balance_loss_clip": 1.03654337, - "balance_loss_mlp": 1.02269483, - "epoch": 0.6517961821734556, - "flos": 25374911068800.0, - "grad_norm": 2.0615754511911306, - "language_loss": 0.73519421, - "learning_rate": 1.1424975925584071e-06, - "loss": 0.75621581, - "num_input_tokens_seen": 234071495, - "step": 10841, - "time_per_iteration": 4.435396671295166 - }, - { - "auxiliary_loss_clip": 0.01114891, - "auxiliary_loss_mlp": 0.01038031, - "balance_loss_clip": 1.03930378, - "balance_loss_mlp": 1.02487588, - "epoch": 0.6518563054261236, - "flos": 28766996864640.0, - "grad_norm": 1.4942272074667713, - "language_loss": 0.62317944, - "learning_rate": 1.142145760331648e-06, - "loss": 0.64470863, - "num_input_tokens_seen": 234092325, - "step": 10842, - "time_per_iteration": 2.6767518520355225 - }, - { - "auxiliary_loss_clip": 0.01024949, - "auxiliary_loss_mlp": 0.01006106, - "balance_loss_clip": 1.01075029, - "balance_loss_mlp": 1.00497305, - "epoch": 0.6519164286787915, - "flos": 68924750797440.0, - "grad_norm": 0.8104047899585891, - "language_loss": 0.5617612, - "learning_rate": 1.141793960634807e-06, - "loss": 0.58207178, - "num_input_tokens_seen": 234148005, - "step": 10843, - "time_per_iteration": 3.0310990810394287 - }, - { - "auxiliary_loss_clip": 0.01104455, - "auxiliary_loss_mlp": 0.01039452, - "balance_loss_clip": 1.03846788, - "balance_loss_mlp": 1.02576053, - "epoch": 0.6519765519314595, - "flos": 20441933683200.0, - "grad_norm": 1.5675649945193708, - "language_loss": 0.82750475, - "learning_rate": 1.1414421934812253e-06, - "loss": 0.84894383, - "num_input_tokens_seen": 234164280, - "step": 10844, - "time_per_iteration": 5.7787792682647705 - }, - { - "auxiliary_loss_clip": 0.01104311, - "auxiliary_loss_mlp": 0.01034465, - "balance_loss_clip": 1.04057419, - "balance_loss_mlp": 1.02136445, - "epoch": 0.6520366751841274, - "flos": 28402970480640.0, - "grad_norm": 1.85573565019848, - "language_loss": 0.59983897, - "learning_rate": 1.1410904588842421e-06, - "loss": 0.62122673, - "num_input_tokens_seen": 234185090, - "step": 10845, - "time_per_iteration": 2.7293028831481934 - }, - { - "auxiliary_loss_clip": 0.0110391, - "auxiliary_loss_mlp": 0.01032078, - "balance_loss_clip": 1.04017997, - "balance_loss_mlp": 1.01897073, - "epoch": 0.6520967984367955, - "flos": 22273414300800.0, - "grad_norm": 1.668141485329768, - "language_loss": 0.79591072, - "learning_rate": 1.140738756857194e-06, - "loss": 0.81727064, - "num_input_tokens_seen": 234204050, - "step": 10846, - "time_per_iteration": 2.6495091915130615 - }, - { - "auxiliary_loss_clip": 0.01025275, - "auxiliary_loss_mlp": 0.01003438, - "balance_loss_clip": 1.01079941, - "balance_loss_mlp": 1.00228775, - "epoch": 0.6521569216894634, - "flos": 68917140092160.0, - "grad_norm": 0.709011283257112, - "language_loss": 0.60191703, - "learning_rate": 1.1403870874134192e-06, - "loss": 0.62220418, - "num_input_tokens_seen": 234269790, - "step": 10847, - "time_per_iteration": 3.282104730606079 - }, - { - "auxiliary_loss_clip": 0.0111717, - "auxiliary_loss_mlp": 0.01037772, - "balance_loss_clip": 1.0412842, - "balance_loss_mlp": 1.02495718, - "epoch": 0.6522170449421314, - "flos": 29130520458240.0, - "grad_norm": 1.5919105635369972, - "language_loss": 0.81118578, - "learning_rate": 1.1400354505662514e-06, - "loss": 0.8327353, - "num_input_tokens_seen": 234290135, - "step": 10848, - "time_per_iteration": 2.6569244861602783 - }, - { - "auxiliary_loss_clip": 0.01084019, - "auxiliary_loss_mlp": 0.01035374, - "balance_loss_clip": 1.03738701, - "balance_loss_mlp": 1.02265429, - "epoch": 0.6522771681947993, - "flos": 26651930371200.0, - "grad_norm": 2.586521111897064, - "language_loss": 0.74449492, - "learning_rate": 1.1396838463290263e-06, - "loss": 0.7656889, - "num_input_tokens_seen": 234309535, - "step": 10849, - "time_per_iteration": 4.26736044883728 - }, - { - "auxiliary_loss_clip": 0.0106317, - "auxiliary_loss_mlp": 0.01031804, - "balance_loss_clip": 1.03691697, - "balance_loss_mlp": 1.0188818, - "epoch": 0.6523372914474673, - "flos": 25739763465600.0, - "grad_norm": 1.4022053902069738, - "language_loss": 0.67808872, - "learning_rate": 1.1393322747150752e-06, - "loss": 0.69903851, - "num_input_tokens_seen": 234328755, - "step": 10850, - "time_per_iteration": 2.8357365131378174 - }, - { - "auxiliary_loss_clip": 0.01089828, - "auxiliary_loss_mlp": 0.00769863, - "balance_loss_clip": 1.03987718, - "balance_loss_mlp": 1.00014496, - "epoch": 0.6523974147001352, - "flos": 24827345164800.0, - "grad_norm": 1.627745472842777, - "language_loss": 0.66696799, - "learning_rate": 1.1389807357377313e-06, - "loss": 0.68556488, - "num_input_tokens_seen": 234348655, - "step": 10851, - "time_per_iteration": 2.702782154083252 - }, - { - "auxiliary_loss_clip": 0.01092324, - "auxiliary_loss_mlp": 0.0103014, - "balance_loss_clip": 1.04054999, - "balance_loss_mlp": 1.01776636, - "epoch": 0.6524575379528033, - "flos": 26317637470080.0, - "grad_norm": 2.9837115627224238, - "language_loss": 0.73833734, - "learning_rate": 1.1386292294103235e-06, - "loss": 0.75956196, - "num_input_tokens_seen": 234367445, - "step": 10852, - "time_per_iteration": 2.7116212844848633 - }, - { - "auxiliary_loss_clip": 0.0109357, - "auxiliary_loss_mlp": 0.01030172, - "balance_loss_clip": 1.04287267, - "balance_loss_mlp": 1.01617694, - "epoch": 0.6525176612054712, - "flos": 19494143464320.0, - "grad_norm": 1.9044884730623952, - "language_loss": 0.66662163, - "learning_rate": 1.1382777557461812e-06, - "loss": 0.68785906, - "num_input_tokens_seen": 234384825, - "step": 10853, - "time_per_iteration": 2.7027504444122314 - }, - { - "auxiliary_loss_clip": 0.01002155, - "auxiliary_loss_mlp": 0.01000193, - "balance_loss_clip": 1.01079071, - "balance_loss_mlp": 0.99902517, - "epoch": 0.6525777844581392, - "flos": 71706894721920.0, - "grad_norm": 0.7271722971933409, - "language_loss": 0.62995195, - "learning_rate": 1.137926314758634e-06, - "loss": 0.64997554, - "num_input_tokens_seen": 234450630, - "step": 10854, - "time_per_iteration": 3.330467462539673 - }, - { - "auxiliary_loss_clip": 0.01098588, - "auxiliary_loss_mlp": 0.01040453, - "balance_loss_clip": 1.03749895, - "balance_loss_mlp": 1.02501512, - "epoch": 0.6526379077108072, - "flos": 26653115520000.0, - "grad_norm": 1.9818066069545293, - "language_loss": 0.77810514, - "learning_rate": 1.1375749064610072e-06, - "loss": 0.79949546, - "num_input_tokens_seen": 234473505, - "step": 10855, - "time_per_iteration": 2.856804132461548 - }, - { - "auxiliary_loss_clip": 0.01073699, - "auxiliary_loss_mlp": 0.01028598, - "balance_loss_clip": 1.03438473, - "balance_loss_mlp": 1.01601565, - "epoch": 0.6526980309634751, - "flos": 22820369673600.0, - "grad_norm": 1.8477737717286657, - "language_loss": 0.78975284, - "learning_rate": 1.1372235308666256e-06, - "loss": 0.81077588, - "num_input_tokens_seen": 234492485, - "step": 10856, - "time_per_iteration": 2.7385408878326416 - }, - { - "auxiliary_loss_clip": 0.01114282, - "auxiliary_loss_mlp": 0.01034182, - "balance_loss_clip": 1.04025459, - "balance_loss_mlp": 1.0199244, - "epoch": 0.6527581542161431, - "flos": 28365048696960.0, - "grad_norm": 3.158979628826276, - "language_loss": 0.73701787, - "learning_rate": 1.136872187988815e-06, - "loss": 0.75850254, - "num_input_tokens_seen": 234512645, - "step": 10857, - "time_per_iteration": 2.6843883991241455 - }, - { - "auxiliary_loss_clip": 0.01090082, - "auxiliary_loss_mlp": 0.01035453, - "balance_loss_clip": 1.03591764, - "balance_loss_mlp": 1.02337718, - "epoch": 0.652818277468811, - "flos": 18369206346240.0, - "grad_norm": 3.7655949608052257, - "language_loss": 0.6289376, - "learning_rate": 1.1365208778408965e-06, - "loss": 0.65019298, - "num_input_tokens_seen": 234529310, - "step": 10858, - "time_per_iteration": 2.72822904586792 - }, - { - "auxiliary_loss_clip": 0.01110966, - "auxiliary_loss_mlp": 0.01034439, - "balance_loss_clip": 1.03902686, - "balance_loss_mlp": 1.02228558, - "epoch": 0.6528784007214791, - "flos": 18036170421120.0, - "grad_norm": 1.6211282430818235, - "language_loss": 0.78672451, - "learning_rate": 1.1361696004361939e-06, - "loss": 0.80817854, - "num_input_tokens_seen": 234546685, - "step": 10859, - "time_per_iteration": 2.5962581634521484 - }, - { - "auxiliary_loss_clip": 0.01104671, - "auxiliary_loss_mlp": 0.01033239, - "balance_loss_clip": 1.03923452, - "balance_loss_mlp": 1.02013731, - "epoch": 0.652938523974147, - "flos": 22382008093440.0, - "grad_norm": 1.697122178276391, - "language_loss": 0.67908686, - "learning_rate": 1.1358183557880256e-06, - "loss": 0.70046592, - "num_input_tokens_seen": 234566255, - "step": 10860, - "time_per_iteration": 2.7275006771087646 - }, - { - "auxiliary_loss_clip": 0.01105971, - "auxiliary_loss_mlp": 0.01029587, - "balance_loss_clip": 1.04165852, - "balance_loss_mlp": 1.01677179, - "epoch": 0.652998647226815, - "flos": 16764035368320.0, - "grad_norm": 2.149849017639803, - "language_loss": 0.67175591, - "learning_rate": 1.135467143909712e-06, - "loss": 0.69311142, - "num_input_tokens_seen": 234585405, - "step": 10861, - "time_per_iteration": 2.700737237930298 - }, - { - "auxiliary_loss_clip": 0.01093061, - "auxiliary_loss_mlp": 0.01034965, - "balance_loss_clip": 1.03918886, - "balance_loss_mlp": 1.02101707, - "epoch": 0.6530587704794829, - "flos": 35772522019200.0, - "grad_norm": 1.8900448169789823, - "language_loss": 0.64973295, - "learning_rate": 1.135115964814572e-06, - "loss": 0.67101324, - "num_input_tokens_seen": 234608095, - "step": 10862, - "time_per_iteration": 2.8191120624542236 - }, - { - "auxiliary_loss_clip": 0.01090214, - "auxiliary_loss_mlp": 0.01035944, - "balance_loss_clip": 1.03788185, - "balance_loss_mlp": 1.02351046, - "epoch": 0.6531188937321509, - "flos": 19316134638720.0, - "grad_norm": 1.7201949909347662, - "language_loss": 0.77214205, - "learning_rate": 1.13476481851592e-06, - "loss": 0.79340369, - "num_input_tokens_seen": 234627335, - "step": 10863, - "time_per_iteration": 2.7301394939422607 - }, - { - "auxiliary_loss_clip": 0.01086865, - "auxiliary_loss_mlp": 0.01035498, - "balance_loss_clip": 1.03934371, - "balance_loss_mlp": 1.0234524, - "epoch": 0.6531790169848188, - "flos": 22893771116160.0, - "grad_norm": 5.89922160085871, - "language_loss": 0.74717021, - "learning_rate": 1.1344137050270739e-06, - "loss": 0.76839387, - "num_input_tokens_seen": 234646540, - "step": 10864, - "time_per_iteration": 2.694638729095459 - }, - { - "auxiliary_loss_clip": 0.01101868, - "auxiliary_loss_mlp": 0.01037064, - "balance_loss_clip": 1.03954864, - "balance_loss_mlp": 1.02464223, - "epoch": 0.6532391402374869, - "flos": 29563530912000.0, - "grad_norm": 1.7565530493907513, - "language_loss": 0.86014044, - "learning_rate": 1.1340626243613458e-06, - "loss": 0.88152981, - "num_input_tokens_seen": 234665470, - "step": 10865, - "time_per_iteration": 2.6702401638031006 - }, - { - "auxiliary_loss_clip": 0.01084878, - "auxiliary_loss_mlp": 0.00771127, - "balance_loss_clip": 1.0366689, - "balance_loss_mlp": 1.00016713, - "epoch": 0.6532992634901548, - "flos": 23105463920640.0, - "grad_norm": 1.5997360666048854, - "language_loss": 0.81537604, - "learning_rate": 1.133711576532051e-06, - "loss": 0.8339361, - "num_input_tokens_seen": 234683955, - "step": 10866, - "time_per_iteration": 2.7677865028381348 - }, - { - "auxiliary_loss_clip": 0.01092326, - "auxiliary_loss_mlp": 0.01027552, - "balance_loss_clip": 1.04049444, - "balance_loss_mlp": 1.0153923, - "epoch": 0.6533593867428228, - "flos": 26067340523520.0, - "grad_norm": 1.499689557141503, - "language_loss": 0.82382023, - "learning_rate": 1.1333605615524995e-06, - "loss": 0.84501904, - "num_input_tokens_seen": 234704595, - "step": 10867, - "time_per_iteration": 2.67887020111084 - }, - { - "auxiliary_loss_clip": 0.01086387, - "auxiliary_loss_mlp": 0.01028819, - "balance_loss_clip": 1.03923059, - "balance_loss_mlp": 1.01656437, - "epoch": 0.6534195099954908, - "flos": 21212469262080.0, - "grad_norm": 1.9931778054716736, - "language_loss": 0.81410849, - "learning_rate": 1.1330095794360016e-06, - "loss": 0.83526063, - "num_input_tokens_seen": 234724090, - "step": 10868, - "time_per_iteration": 2.692563533782959 - }, - { - "auxiliary_loss_clip": 0.01085283, - "auxiliary_loss_mlp": 0.0103014, - "balance_loss_clip": 1.04046869, - "balance_loss_mlp": 1.01654446, - "epoch": 0.6534796332481587, - "flos": 19646584784640.0, - "grad_norm": 1.7926198693955093, - "language_loss": 0.79652596, - "learning_rate": 1.1326586301958675e-06, - "loss": 0.81768018, - "num_input_tokens_seen": 234742560, - "step": 10869, - "time_per_iteration": 2.6747188568115234 - }, - { - "auxiliary_loss_clip": 0.01107733, - "auxiliary_loss_mlp": 0.01034253, - "balance_loss_clip": 1.04306769, - "balance_loss_mlp": 1.02144957, - "epoch": 0.6535397565008267, - "flos": 24022479162240.0, - "grad_norm": 1.9247655195442634, - "language_loss": 0.72409803, - "learning_rate": 1.1323077138454063e-06, - "loss": 0.74551791, - "num_input_tokens_seen": 234762315, - "step": 10870, - "time_per_iteration": 2.6496713161468506 - }, - { - "auxiliary_loss_clip": 0.01073837, - "auxiliary_loss_mlp": 0.01040127, - "balance_loss_clip": 1.0374316, - "balance_loss_mlp": 1.02689457, - "epoch": 0.6535998797534947, - "flos": 24602759377920.0, - "grad_norm": 2.0567680865886797, - "language_loss": 0.7481339, - "learning_rate": 1.1319568303979221e-06, - "loss": 0.76927352, - "num_input_tokens_seen": 234781300, - "step": 10871, - "time_per_iteration": 2.738467216491699 - }, - { - "auxiliary_loss_clip": 0.01094755, - "auxiliary_loss_mlp": 0.00768767, - "balance_loss_clip": 1.04057598, - "balance_loss_mlp": 1.00008535, - "epoch": 0.6536600030061627, - "flos": 23364164649600.0, - "grad_norm": 1.631721616705098, - "language_loss": 0.55669373, - "learning_rate": 1.1316059798667227e-06, - "loss": 0.57532895, - "num_input_tokens_seen": 234801040, - "step": 10872, - "time_per_iteration": 2.7837493419647217 - }, - { - "auxiliary_loss_clip": 0.01089558, - "auxiliary_loss_mlp": 0.01033451, - "balance_loss_clip": 1.03836048, - "balance_loss_mlp": 1.02150071, - "epoch": 0.6537201262588306, - "flos": 23878477537920.0, - "grad_norm": 1.5206380793292014, - "language_loss": 0.74701464, - "learning_rate": 1.1312551622651112e-06, - "loss": 0.76824474, - "num_input_tokens_seen": 234821415, - "step": 10873, - "time_per_iteration": 2.6991825103759766 - }, - { - "auxiliary_loss_clip": 0.01103837, - "auxiliary_loss_mlp": 0.01031753, - "balance_loss_clip": 1.04124331, - "balance_loss_mlp": 1.01923621, - "epoch": 0.6537802495114986, - "flos": 24354760901760.0, - "grad_norm": 1.5572607769752447, - "language_loss": 0.75670367, - "learning_rate": 1.1309043776063917e-06, - "loss": 0.7780596, - "num_input_tokens_seen": 234843795, - "step": 10874, - "time_per_iteration": 2.78080153465271 - }, - { - "auxiliary_loss_clip": 0.01071596, - "auxiliary_loss_mlp": 0.01032474, - "balance_loss_clip": 1.03871393, - "balance_loss_mlp": 1.01939058, - "epoch": 0.6538403727641665, - "flos": 27996892248960.0, - "grad_norm": 1.5478335962993721, - "language_loss": 0.81636667, - "learning_rate": 1.1305536259038642e-06, - "loss": 0.83740735, - "num_input_tokens_seen": 234862350, - "step": 10875, - "time_per_iteration": 2.8029510974884033 - }, - { - "auxiliary_loss_clip": 0.01113458, - "auxiliary_loss_mlp": 0.01038052, - "balance_loss_clip": 1.03928709, - "balance_loss_mlp": 1.0257194, - "epoch": 0.6539004960168345, - "flos": 27563594486400.0, - "grad_norm": 1.7147154744114859, - "language_loss": 0.70016718, - "learning_rate": 1.1302029071708314e-06, - "loss": 0.72168231, - "num_input_tokens_seen": 234881790, - "step": 10876, - "time_per_iteration": 2.7378597259521484 - }, - { - "auxiliary_loss_clip": 0.01019889, - "auxiliary_loss_mlp": 0.01040083, - "balance_loss_clip": 1.03454161, - "balance_loss_mlp": 1.02664804, - "epoch": 0.6539606192695024, - "flos": 14530067879040.0, - "grad_norm": 5.2813318768904, - "language_loss": 0.79471064, - "learning_rate": 1.1298522214205908e-06, - "loss": 0.81531036, - "num_input_tokens_seen": 234897775, - "step": 10877, - "time_per_iteration": 2.9654347896575928 - }, - { - "auxiliary_loss_clip": 0.0109536, - "auxiliary_loss_mlp": 0.00770832, - "balance_loss_clip": 1.04007304, - "balance_loss_mlp": 1.00019956, - "epoch": 0.6540207425221705, - "flos": 21616356764160.0, - "grad_norm": 10.000647074298708, - "language_loss": 0.79720318, - "learning_rate": 1.1295015686664408e-06, - "loss": 0.81586516, - "num_input_tokens_seen": 234918395, - "step": 10878, - "time_per_iteration": 3.0778963565826416 - }, - { - "auxiliary_loss_clip": 0.01091014, - "auxiliary_loss_mlp": 0.01032504, - "balance_loss_clip": 1.03766847, - "balance_loss_mlp": 1.01898539, - "epoch": 0.6540808657748384, - "flos": 17668983640320.0, - "grad_norm": 1.8035849841716871, - "language_loss": 0.84622979, - "learning_rate": 1.1291509489216797e-06, - "loss": 0.8674649, - "num_input_tokens_seen": 234936260, - "step": 10879, - "time_per_iteration": 2.668922185897827 - }, - { - "auxiliary_loss_clip": 0.01093903, - "auxiliary_loss_mlp": 0.01030306, - "balance_loss_clip": 1.03903461, - "balance_loss_mlp": 1.01730609, - "epoch": 0.6541409890275064, - "flos": 14538292093440.0, - "grad_norm": 2.263757202052665, - "language_loss": 0.71778309, - "learning_rate": 1.128800362199601e-06, - "loss": 0.73902524, - "num_input_tokens_seen": 234952110, - "step": 10880, - "time_per_iteration": 2.662271499633789 - }, - { - "auxiliary_loss_clip": 0.0107269, - "auxiliary_loss_mlp": 0.01037728, - "balance_loss_clip": 1.03594911, - "balance_loss_mlp": 1.02518129, - "epoch": 0.6542011122801744, - "flos": 17165301177600.0, - "grad_norm": 2.432806470924959, - "language_loss": 0.8439703, - "learning_rate": 1.1284498085135005e-06, - "loss": 0.86507452, - "num_input_tokens_seen": 234970810, - "step": 10881, - "time_per_iteration": 4.583907127380371 - }, - { - "auxiliary_loss_clip": 0.01081012, - "auxiliary_loss_mlp": 0.01035122, - "balance_loss_clip": 1.03797197, - "balance_loss_mlp": 1.02026868, - "epoch": 0.6542612355328423, - "flos": 18186600579840.0, - "grad_norm": 1.797675187581498, - "language_loss": 0.78180546, - "learning_rate": 1.1280992878766699e-06, - "loss": 0.80296683, - "num_input_tokens_seen": 234989565, - "step": 10882, - "time_per_iteration": 2.7273218631744385 - }, - { - "auxiliary_loss_clip": 0.01117869, - "auxiliary_loss_mlp": 0.01030964, - "balance_loss_clip": 1.04191113, - "balance_loss_mlp": 1.01693869, - "epoch": 0.6543213587855103, - "flos": 19792453916160.0, - "grad_norm": 2.0896373641472716, - "language_loss": 0.82002509, - "learning_rate": 1.1277488003024024e-06, - "loss": 0.84151345, - "num_input_tokens_seen": 235007955, - "step": 10883, - "time_per_iteration": 2.6430859565734863 - }, - { - "auxiliary_loss_clip": 0.01063765, - "auxiliary_loss_mlp": 0.01039023, - "balance_loss_clip": 1.03828621, - "balance_loss_mlp": 1.02518272, - "epoch": 0.6543814820381783, - "flos": 21105096531840.0, - "grad_norm": 2.092498099252334, - "language_loss": 0.85347474, - "learning_rate": 1.127398345803988e-06, - "loss": 0.8745026, - "num_input_tokens_seen": 235024860, - "step": 10884, - "time_per_iteration": 6.071943998336792 - }, - { - "auxiliary_loss_clip": 0.01092231, - "auxiliary_loss_mlp": 0.01036915, - "balance_loss_clip": 1.03901005, - "balance_loss_mlp": 1.02371883, - "epoch": 0.6544416052908463, - "flos": 20194042947840.0, - "grad_norm": 2.4941716916648367, - "language_loss": 0.79124463, - "learning_rate": 1.127047924394715e-06, - "loss": 0.81253612, - "num_input_tokens_seen": 235043815, - "step": 10885, - "time_per_iteration": 2.675748586654663 - }, - { - "auxiliary_loss_clip": 0.01074538, - "auxiliary_loss_mlp": 0.01031139, - "balance_loss_clip": 1.03618622, - "balance_loss_mlp": 1.01794887, - "epoch": 0.6545017285435142, - "flos": 23368258800000.0, - "grad_norm": 1.8639137549782854, - "language_loss": 0.72277772, - "learning_rate": 1.1266975360878722e-06, - "loss": 0.7438345, - "num_input_tokens_seen": 235062985, - "step": 10886, - "time_per_iteration": 2.750396490097046 - }, - { - "auxiliary_loss_clip": 0.0109826, - "auxiliary_loss_mlp": 0.01029695, - "balance_loss_clip": 1.04163647, - "balance_loss_mlp": 1.01777434, - "epoch": 0.6545618517961822, - "flos": 19134714021120.0, - "grad_norm": 1.7570103692481698, - "language_loss": 0.77918178, - "learning_rate": 1.1263471808967468e-06, - "loss": 0.80046129, - "num_input_tokens_seen": 235081670, - "step": 10887, - "time_per_iteration": 2.6504671573638916 - }, - { - "auxiliary_loss_clip": 0.01087762, - "auxiliary_loss_mlp": 0.01034009, - "balance_loss_clip": 1.03893995, - "balance_loss_mlp": 1.02152205, - "epoch": 0.6546219750488501, - "flos": 14938624149120.0, - "grad_norm": 3.1473995780079567, - "language_loss": 0.78907198, - "learning_rate": 1.1259968588346234e-06, - "loss": 0.81028962, - "num_input_tokens_seen": 235098510, - "step": 10888, - "time_per_iteration": 4.194061040878296 - }, - { - "auxiliary_loss_clip": 0.01101212, - "auxiliary_loss_mlp": 0.01030296, - "balance_loss_clip": 1.03934383, - "balance_loss_mlp": 1.0185833, - "epoch": 0.6546820983015181, - "flos": 36320518886400.0, - "grad_norm": 1.6496831253156983, - "language_loss": 0.66765958, - "learning_rate": 1.1256465699147874e-06, - "loss": 0.68897462, - "num_input_tokens_seen": 235119990, - "step": 10889, - "time_per_iteration": 2.784081220626831 - }, - { - "auxiliary_loss_clip": 0.01087306, - "auxiliary_loss_mlp": 0.01041216, - "balance_loss_clip": 1.03762484, - "balance_loss_mlp": 1.02561128, - "epoch": 0.654742221554186, - "flos": 20411446014720.0, - "grad_norm": 1.423388332820949, - "language_loss": 0.7975992, - "learning_rate": 1.1252963141505203e-06, - "loss": 0.81888443, - "num_input_tokens_seen": 235139255, - "step": 10890, - "time_per_iteration": 2.630934000015259 - }, - { - "auxiliary_loss_clip": 0.01103288, - "auxiliary_loss_mlp": 0.00771276, - "balance_loss_clip": 1.0388689, - "balance_loss_mlp": 1.00018215, - "epoch": 0.6548023448068541, - "flos": 24863650836480.0, - "grad_norm": 4.747441832744551, - "language_loss": 0.66281724, - "learning_rate": 1.1249460915551052e-06, - "loss": 0.6815629, - "num_input_tokens_seen": 235158455, - "step": 10891, - "time_per_iteration": 2.7071638107299805 - }, - { - "auxiliary_loss_clip": 0.01100507, - "auxiliary_loss_mlp": 0.01034408, - "balance_loss_clip": 1.03802693, - "balance_loss_mlp": 1.02253485, - "epoch": 0.654862468059522, - "flos": 21427573858560.0, - "grad_norm": 1.8230572175778426, - "language_loss": 0.79398739, - "learning_rate": 1.1245959021418214e-06, - "loss": 0.81533659, - "num_input_tokens_seen": 235177350, - "step": 10892, - "time_per_iteration": 2.7039225101470947 - }, - { - "auxiliary_loss_clip": 0.01109845, - "auxiliary_loss_mlp": 0.01032726, - "balance_loss_clip": 1.04345989, - "balance_loss_mlp": 1.01996517, - "epoch": 0.65492259131219, - "flos": 26577846570240.0, - "grad_norm": 1.9941624602256833, - "language_loss": 0.7830174, - "learning_rate": 1.1242457459239497e-06, - "loss": 0.80444312, - "num_input_tokens_seen": 235196435, - "step": 10893, - "time_per_iteration": 2.6736834049224854 - }, - { - "auxiliary_loss_clip": 0.01119127, - "auxiliary_loss_mlp": 0.01033009, - "balance_loss_clip": 1.04234505, - "balance_loss_mlp": 1.01919901, - "epoch": 0.6549827145648579, - "flos": 21501334437120.0, - "grad_norm": 1.6280761795880925, - "language_loss": 0.70089674, - "learning_rate": 1.123895622914766e-06, - "loss": 0.72241807, - "num_input_tokens_seen": 235215430, - "step": 10894, - "time_per_iteration": 2.5782406330108643 - }, - { - "auxiliary_loss_clip": 0.01108084, - "auxiliary_loss_mlp": 0.01033898, - "balance_loss_clip": 1.03990614, - "balance_loss_mlp": 1.02057683, - "epoch": 0.6550428378175259, - "flos": 22594275515520.0, - "grad_norm": 3.549181275643373, - "language_loss": 0.63655615, - "learning_rate": 1.123545533127549e-06, - "loss": 0.65797597, - "num_input_tokens_seen": 235232015, - "step": 10895, - "time_per_iteration": 2.629176139831543 - }, - { - "auxiliary_loss_clip": 0.0109961, - "auxiliary_loss_mlp": 0.01034607, - "balance_loss_clip": 1.03651488, - "balance_loss_mlp": 1.02231681, - "epoch": 0.655102961070194, - "flos": 12823809050880.0, - "grad_norm": 1.94601425933446, - "language_loss": 0.78524303, - "learning_rate": 1.1231954765755722e-06, - "loss": 0.80658519, - "num_input_tokens_seen": 235248115, - "step": 10896, - "time_per_iteration": 2.5840821266174316 - }, - { - "auxiliary_loss_clip": 0.01092224, - "auxiliary_loss_mlp": 0.01033088, - "balance_loss_clip": 1.04114115, - "balance_loss_mlp": 1.02101183, - "epoch": 0.6551630843228619, - "flos": 24791075406720.0, - "grad_norm": 1.3806195961019156, - "language_loss": 0.70286167, - "learning_rate": 1.1228454532721111e-06, - "loss": 0.72411478, - "num_input_tokens_seen": 235270785, - "step": 10897, - "time_per_iteration": 2.7511391639709473 - }, - { - "auxiliary_loss_clip": 0.01117369, - "auxiliary_loss_mlp": 0.01034749, - "balance_loss_clip": 1.0412488, - "balance_loss_mlp": 1.02182722, - "epoch": 0.6552232075755299, - "flos": 16724461559040.0, - "grad_norm": 1.8561946448451885, - "language_loss": 0.75477493, - "learning_rate": 1.1224954632304391e-06, - "loss": 0.77629614, - "num_input_tokens_seen": 235287905, - "step": 10898, - "time_per_iteration": 2.5865721702575684 - }, - { - "auxiliary_loss_clip": 0.0109408, - "auxiliary_loss_mlp": 0.01033257, - "balance_loss_clip": 1.03979027, - "balance_loss_mlp": 1.0210743, - "epoch": 0.6552833308281978, - "flos": 22016473338240.0, - "grad_norm": 3.2174058784853634, - "language_loss": 0.73745394, - "learning_rate": 1.122145506463827e-06, - "loss": 0.75872725, - "num_input_tokens_seen": 235305525, - "step": 10899, - "time_per_iteration": 2.6415457725524902 - }, - { - "auxiliary_loss_clip": 0.01092854, - "auxiliary_loss_mlp": 0.0103035, - "balance_loss_clip": 1.0398674, - "balance_loss_mlp": 1.0178864, - "epoch": 0.6553434540808658, - "flos": 24863399441280.0, - "grad_norm": 1.7775828030787661, - "language_loss": 0.5608502, - "learning_rate": 1.1217955829855443e-06, - "loss": 0.58208227, - "num_input_tokens_seen": 235324415, - "step": 10900, - "time_per_iteration": 2.6782078742980957 - }, - { - "auxiliary_loss_clip": 0.0110767, - "auxiliary_loss_mlp": 0.01036194, - "balance_loss_clip": 1.04541218, - "balance_loss_mlp": 1.02259791, - "epoch": 0.6554035773335337, - "flos": 23221060865280.0, - "grad_norm": 1.7239848151303507, - "language_loss": 0.76706004, - "learning_rate": 1.1214456928088622e-06, - "loss": 0.78849864, - "num_input_tokens_seen": 235341595, - "step": 10901, - "time_per_iteration": 2.6912708282470703 - }, - { - "auxiliary_loss_clip": 0.01116025, - "auxiliary_loss_mlp": 0.01030659, - "balance_loss_clip": 1.04228628, - "balance_loss_mlp": 1.01741457, - "epoch": 0.6554637005862017, - "flos": 22783597125120.0, - "grad_norm": 1.8287933063935295, - "language_loss": 0.73178118, - "learning_rate": 1.1210958359470463e-06, - "loss": 0.7532481, - "num_input_tokens_seen": 235361700, - "step": 10902, - "time_per_iteration": 2.602215528488159 - }, - { - "auxiliary_loss_clip": 0.01116289, - "auxiliary_loss_mlp": 0.0103284, - "balance_loss_clip": 1.04363585, - "balance_loss_mlp": 1.020293, - "epoch": 0.6555238238388696, - "flos": 21507224267520.0, - "grad_norm": 2.8262041202402806, - "language_loss": 0.68081355, - "learning_rate": 1.1207460124133645e-06, - "loss": 0.7023049, - "num_input_tokens_seen": 235382065, - "step": 10903, - "time_per_iteration": 2.6410489082336426 - }, - { - "auxiliary_loss_clip": 0.01095479, - "auxiliary_loss_mlp": 0.00772021, - "balance_loss_clip": 1.0381676, - "balance_loss_mlp": 1.00024486, - "epoch": 0.6555839470915377, - "flos": 30519473518080.0, - "grad_norm": 1.6908937242491595, - "language_loss": 0.66551757, - "learning_rate": 1.1203962222210832e-06, - "loss": 0.6841926, - "num_input_tokens_seen": 235402130, - "step": 10904, - "time_per_iteration": 2.790280342102051 - }, - { - "auxiliary_loss_clip": 0.01106834, - "auxiliary_loss_mlp": 0.01041591, - "balance_loss_clip": 1.0399909, - "balance_loss_mlp": 1.02686858, - "epoch": 0.6556440703442056, - "flos": 24642943718400.0, - "grad_norm": 1.7449585350931947, - "language_loss": 0.90588987, - "learning_rate": 1.120046465383464e-06, - "loss": 0.92737412, - "num_input_tokens_seen": 235420435, - "step": 10905, - "time_per_iteration": 2.6630730628967285 - }, - { - "auxiliary_loss_clip": 0.01101239, - "auxiliary_loss_mlp": 0.01036194, - "balance_loss_clip": 1.0387404, - "balance_loss_mlp": 1.02384353, - "epoch": 0.6557041935968736, - "flos": 23732464752000.0, - "grad_norm": 1.68326433592196, - "language_loss": 0.75189042, - "learning_rate": 1.1196967419137721e-06, - "loss": 0.77326465, - "num_input_tokens_seen": 235439960, - "step": 10906, - "time_per_iteration": 2.808749198913574 - }, - { - "auxiliary_loss_clip": 0.01120903, - "auxiliary_loss_mlp": 0.01039658, - "balance_loss_clip": 1.04417121, - "balance_loss_mlp": 1.02620482, - "epoch": 0.6557643168495415, - "flos": 11102753819520.0, - "grad_norm": 2.6025393297474753, - "language_loss": 0.74533153, - "learning_rate": 1.119347051825267e-06, - "loss": 0.76693714, - "num_input_tokens_seen": 235457495, - "step": 10907, - "time_per_iteration": 2.593248128890991 - }, - { - "auxiliary_loss_clip": 0.01074084, - "auxiliary_loss_mlp": 0.01033534, - "balance_loss_clip": 1.03740346, - "balance_loss_mlp": 1.01887107, - "epoch": 0.6558244401022095, - "flos": 30191034533760.0, - "grad_norm": 1.4237999067012654, - "language_loss": 0.72347319, - "learning_rate": 1.118997395131211e-06, - "loss": 0.74454939, - "num_input_tokens_seen": 235479525, - "step": 10908, - "time_per_iteration": 2.82675838470459 - }, - { - "auxiliary_loss_clip": 0.01119224, - "auxiliary_loss_mlp": 0.01039345, - "balance_loss_clip": 1.04407787, - "balance_loss_mlp": 1.02501035, - "epoch": 0.6558845633548775, - "flos": 17931060247680.0, - "grad_norm": 2.1324653040060206, - "language_loss": 0.81237155, - "learning_rate": 1.118647771844861e-06, - "loss": 0.83395725, - "num_input_tokens_seen": 235496305, - "step": 10909, - "time_per_iteration": 2.5471675395965576 - }, - { - "auxiliary_loss_clip": 0.01118639, - "auxiliary_loss_mlp": 0.01037445, - "balance_loss_clip": 1.04318082, - "balance_loss_mlp": 1.02355766, - "epoch": 0.6559446866075455, - "flos": 21904144531200.0, - "grad_norm": 2.016309466872126, - "language_loss": 0.6391021, - "learning_rate": 1.1182981819794767e-06, - "loss": 0.66066295, - "num_input_tokens_seen": 235512545, - "step": 10910, - "time_per_iteration": 2.5981180667877197 - }, - { - "auxiliary_loss_clip": 0.01094899, - "auxiliary_loss_mlp": 0.01035785, - "balance_loss_clip": 1.03948653, - "balance_loss_mlp": 1.02022815, - "epoch": 0.6560048098602135, - "flos": 14127976056960.0, - "grad_norm": 3.167812850459713, - "language_loss": 0.75653553, - "learning_rate": 1.117948625548313e-06, - "loss": 0.7778424, - "num_input_tokens_seen": 235526045, - "step": 10911, - "time_per_iteration": 2.6054794788360596 - }, - { - "auxiliary_loss_clip": 0.01110901, - "auxiliary_loss_mlp": 0.01032832, - "balance_loss_clip": 1.03947508, - "balance_loss_mlp": 1.02068496, - "epoch": 0.6560649331128814, - "flos": 18807567926400.0, - "grad_norm": 1.6537881729795834, - "language_loss": 0.75314403, - "learning_rate": 1.1175991025646265e-06, - "loss": 0.77458137, - "num_input_tokens_seen": 235545285, - "step": 10912, - "time_per_iteration": 2.5621368885040283 - }, - { - "auxiliary_loss_clip": 0.01080239, - "auxiliary_loss_mlp": 0.00773337, - "balance_loss_clip": 1.04076517, - "balance_loss_mlp": 1.00024402, - "epoch": 0.6561250563655494, - "flos": 17053618815360.0, - "grad_norm": 1.7152126223100395, - "language_loss": 0.77399373, - "learning_rate": 1.1172496130416697e-06, - "loss": 0.79252946, - "num_input_tokens_seen": 235563150, - "step": 10913, - "time_per_iteration": 2.6770215034484863 - }, - { - "auxiliary_loss_clip": 0.01082486, - "auxiliary_loss_mlp": 0.01031683, - "balance_loss_clip": 1.03641891, - "balance_loss_mlp": 1.0197978, - "epoch": 0.6561851796182173, - "flos": 22637656166400.0, - "grad_norm": 1.7806335935721003, - "language_loss": 0.71243644, - "learning_rate": 1.1169001569926961e-06, - "loss": 0.73357815, - "num_input_tokens_seen": 235582535, - "step": 10914, - "time_per_iteration": 2.667307138442993 - }, - { - "auxiliary_loss_clip": 0.01083296, - "auxiliary_loss_mlp": 0.01037173, - "balance_loss_clip": 1.03966224, - "balance_loss_mlp": 1.02398872, - "epoch": 0.6562453028708853, - "flos": 19239213663360.0, - "grad_norm": 1.6513290970886485, - "language_loss": 0.73859835, - "learning_rate": 1.116550734430958e-06, - "loss": 0.75980306, - "num_input_tokens_seen": 235601490, - "step": 10915, - "time_per_iteration": 2.6983346939086914 - }, - { - "auxiliary_loss_clip": 0.01073456, - "auxiliary_loss_mlp": 0.01034672, - "balance_loss_clip": 1.03744984, - "balance_loss_mlp": 1.02053952, - "epoch": 0.6563054261235532, - "flos": 23801305167360.0, - "grad_norm": 1.7082646806866446, - "language_loss": 0.79868412, - "learning_rate": 1.1162013453697042e-06, - "loss": 0.81976539, - "num_input_tokens_seen": 235619165, - "step": 10916, - "time_per_iteration": 2.7007508277893066 - }, - { - "auxiliary_loss_clip": 0.01085821, - "auxiliary_loss_mlp": 0.01034159, - "balance_loss_clip": 1.03703237, - "balance_loss_mlp": 1.02174914, - "epoch": 0.6563655493762213, - "flos": 19240039676160.0, - "grad_norm": 7.0038314681057265, - "language_loss": 0.76291168, - "learning_rate": 1.1158519898221831e-06, - "loss": 0.78411144, - "num_input_tokens_seen": 235637115, - "step": 10917, - "time_per_iteration": 2.6554038524627686 - }, - { - "auxiliary_loss_clip": 0.01114484, - "auxiliary_loss_mlp": 0.00770758, - "balance_loss_clip": 1.04096055, - "balance_loss_mlp": 1.00018668, - "epoch": 0.6564256726288892, - "flos": 25556439427200.0, - "grad_norm": 1.7912511669436304, - "language_loss": 0.69599342, - "learning_rate": 1.1155026678016445e-06, - "loss": 0.7148459, - "num_input_tokens_seen": 235656330, - "step": 10918, - "time_per_iteration": 2.658940315246582 - }, - { - "auxiliary_loss_clip": 0.0108095, - "auxiliary_loss_mlp": 0.01038011, - "balance_loss_clip": 1.04091477, - "balance_loss_mlp": 1.02542877, - "epoch": 0.6564857958815572, - "flos": 22200623389440.0, - "grad_norm": 1.5721628638219425, - "language_loss": 0.76389003, - "learning_rate": 1.115153379321332e-06, - "loss": 0.78507966, - "num_input_tokens_seen": 235674510, - "step": 10919, - "time_per_iteration": 2.8179666996002197 - }, - { - "auxiliary_loss_clip": 0.01024309, - "auxiliary_loss_mlp": 0.00751654, - "balance_loss_clip": 1.01056981, - "balance_loss_mlp": 0.99972719, - "epoch": 0.6565459191342251, - "flos": 58123144604160.0, - "grad_norm": 0.7147618349733724, - "language_loss": 0.52982259, - "learning_rate": 1.1148041243944931e-06, - "loss": 0.54758221, - "num_input_tokens_seen": 235735050, - "step": 10920, - "time_per_iteration": 4.864136457443237 - }, - { - "auxiliary_loss_clip": 0.01102705, - "auxiliary_loss_mlp": 0.01032103, - "balance_loss_clip": 1.03955173, - "balance_loss_mlp": 1.01899636, - "epoch": 0.6566060423868931, - "flos": 30809631582720.0, - "grad_norm": 1.4970588684029809, - "language_loss": 0.65309536, - "learning_rate": 1.1144549030343697e-06, - "loss": 0.67444336, - "num_input_tokens_seen": 235757545, - "step": 10921, - "time_per_iteration": 2.6399025917053223 - }, - { - "auxiliary_loss_clip": 0.01088773, - "auxiliary_loss_mlp": 0.01042354, - "balance_loss_clip": 1.03777099, - "balance_loss_mlp": 1.02691627, - "epoch": 0.6566661656395612, - "flos": 23367432787200.0, - "grad_norm": 1.7149236463781705, - "language_loss": 0.81306088, - "learning_rate": 1.114105715254205e-06, - "loss": 0.83437216, - "num_input_tokens_seen": 235777265, - "step": 10922, - "time_per_iteration": 2.6043496131896973 - }, - { - "auxiliary_loss_clip": 0.0105706, - "auxiliary_loss_mlp": 0.00773782, - "balance_loss_clip": 1.03730524, - "balance_loss_mlp": 1.00019729, - "epoch": 0.6567262888922291, - "flos": 25735597488000.0, - "grad_norm": 1.8848622596547697, - "language_loss": 0.71114737, - "learning_rate": 1.1137565610672414e-06, - "loss": 0.72945583, - "num_input_tokens_seen": 235796565, - "step": 10923, - "time_per_iteration": 4.080937385559082 - }, - { - "auxiliary_loss_clip": 0.01080403, - "auxiliary_loss_mlp": 0.01035157, - "balance_loss_clip": 1.04141772, - "balance_loss_mlp": 1.02234805, - "epoch": 0.6567864121448971, - "flos": 17123716206720.0, - "grad_norm": 1.9659077727339813, - "language_loss": 0.80819428, - "learning_rate": 1.1134074404867169e-06, - "loss": 0.82934988, - "num_input_tokens_seen": 235814805, - "step": 10924, - "time_per_iteration": 4.207550287246704 - }, - { - "auxiliary_loss_clip": 0.01098058, - "auxiliary_loss_mlp": 0.01028766, - "balance_loss_clip": 1.03715539, - "balance_loss_mlp": 1.01637435, - "epoch": 0.656846535397565, - "flos": 22419319345920.0, - "grad_norm": 4.832574898603098, - "language_loss": 0.7250914, - "learning_rate": 1.1130583535258717e-06, - "loss": 0.74635959, - "num_input_tokens_seen": 235833405, - "step": 10925, - "time_per_iteration": 2.637345790863037 - }, - { - "auxiliary_loss_clip": 0.01101634, - "auxiliary_loss_mlp": 0.01030115, - "balance_loss_clip": 1.03830063, - "balance_loss_mlp": 1.01710916, - "epoch": 0.656906658650233, - "flos": 17704535126400.0, - "grad_norm": 2.262744420383479, - "language_loss": 0.72445238, - "learning_rate": 1.112709300197942e-06, - "loss": 0.74576986, - "num_input_tokens_seen": 235848530, - "step": 10926, - "time_per_iteration": 2.6307756900787354 - }, - { - "auxiliary_loss_clip": 0.0106886, - "auxiliary_loss_mlp": 0.01034286, - "balance_loss_clip": 1.03765988, - "balance_loss_mlp": 1.02080905, - "epoch": 0.6569667819029009, - "flos": 21175158009600.0, - "grad_norm": 1.7200943700135627, - "language_loss": 0.72494638, - "learning_rate": 1.1123602805161656e-06, - "loss": 0.74597794, - "num_input_tokens_seen": 235867225, - "step": 10927, - "time_per_iteration": 4.311558246612549 - }, - { - "auxiliary_loss_clip": 0.01005194, - "auxiliary_loss_mlp": 0.01007222, - "balance_loss_clip": 1.01187444, - "balance_loss_mlp": 1.00603569, - "epoch": 0.6570269051555689, - "flos": 68761897511040.0, - "grad_norm": 0.7266677598408974, - "language_loss": 0.64416504, - "learning_rate": 1.112011294493775e-06, - "loss": 0.66428924, - "num_input_tokens_seen": 235932925, - "step": 10928, - "time_per_iteration": 3.2423789501190186 - }, - { - "auxiliary_loss_clip": 0.01100905, - "auxiliary_loss_mlp": 0.01034907, - "balance_loss_clip": 1.03707099, - "balance_loss_mlp": 1.02176392, - "epoch": 0.6570870284082369, - "flos": 26319289495680.0, - "grad_norm": 1.7795232563837846, - "language_loss": 0.77698803, - "learning_rate": 1.1116623421440063e-06, - "loss": 0.79834616, - "num_input_tokens_seen": 235952680, - "step": 10929, - "time_per_iteration": 2.6381664276123047 - }, - { - "auxiliary_loss_clip": 0.01078467, - "auxiliary_loss_mlp": 0.01030363, - "balance_loss_clip": 1.03687572, - "balance_loss_mlp": 1.01705337, - "epoch": 0.6571471516609049, - "flos": 26174749167360.0, - "grad_norm": 2.3625903698766826, - "language_loss": 0.65178704, - "learning_rate": 1.1113134234800895e-06, - "loss": 0.67287529, - "num_input_tokens_seen": 235972075, - "step": 10930, - "time_per_iteration": 2.7424116134643555 - }, - { - "auxiliary_loss_clip": 0.01063728, - "auxiliary_loss_mlp": 0.01033942, - "balance_loss_clip": 1.03379416, - "balance_loss_mlp": 1.02037621, - "epoch": 0.6572072749135728, - "flos": 20376253664640.0, - "grad_norm": 1.690752691180261, - "language_loss": 0.70888293, - "learning_rate": 1.110964538515258e-06, - "loss": 0.72985959, - "num_input_tokens_seen": 235990340, - "step": 10931, - "time_per_iteration": 2.7526936531066895 - }, - { - "auxiliary_loss_clip": 0.01070712, - "auxiliary_loss_mlp": 0.01038764, - "balance_loss_clip": 1.03789568, - "balance_loss_mlp": 1.02569246, - "epoch": 0.6572673981662408, - "flos": 17128744110720.0, - "grad_norm": 2.7494594651926763, - "language_loss": 0.68903434, - "learning_rate": 1.1106156872627393e-06, - "loss": 0.71012914, - "num_input_tokens_seen": 236007470, - "step": 10932, - "time_per_iteration": 2.699676036834717 - }, - { - "auxiliary_loss_clip": 0.01088862, - "auxiliary_loss_mlp": 0.0077114, - "balance_loss_clip": 1.03621304, - "balance_loss_mlp": 1.00018311, - "epoch": 0.6573275214189087, - "flos": 41275113281280.0, - "grad_norm": 1.7103641293724658, - "language_loss": 0.80041671, - "learning_rate": 1.1102668697357626e-06, - "loss": 0.8190167, - "num_input_tokens_seen": 236029030, - "step": 10933, - "time_per_iteration": 2.884944200515747 - }, - { - "auxiliary_loss_clip": 0.01066755, - "auxiliary_loss_mlp": 0.01038188, - "balance_loss_clip": 1.03784192, - "balance_loss_mlp": 1.02397847, - "epoch": 0.6573876446715767, - "flos": 22890143842560.0, - "grad_norm": 1.944468432565168, - "language_loss": 0.73796332, - "learning_rate": 1.1099180859475571e-06, - "loss": 0.75901282, - "num_input_tokens_seen": 236047160, - "step": 10934, - "time_per_iteration": 2.689169406890869 - }, - { - "auxiliary_loss_clip": 0.01097012, - "auxiliary_loss_mlp": 0.01038397, - "balance_loss_clip": 1.0375042, - "balance_loss_mlp": 1.02410352, - "epoch": 0.6574477679242448, - "flos": 44018150273280.0, - "grad_norm": 1.510657094056813, - "language_loss": 0.76061821, - "learning_rate": 1.1095693359113454e-06, - "loss": 0.78197235, - "num_input_tokens_seen": 236069215, - "step": 10935, - "time_per_iteration": 2.798928737640381 - }, - { - "auxiliary_loss_clip": 0.01075783, - "auxiliary_loss_mlp": 0.01039916, - "balance_loss_clip": 1.03844082, - "balance_loss_mlp": 1.02543783, - "epoch": 0.6575078911769127, - "flos": 24571517523840.0, - "grad_norm": 1.6442083694725653, - "language_loss": 0.78311378, - "learning_rate": 1.1092206196403538e-06, - "loss": 0.80427074, - "num_input_tokens_seen": 236088335, - "step": 10936, - "time_per_iteration": 2.718698263168335 - }, - { - "auxiliary_loss_clip": 0.01065449, - "auxiliary_loss_mlp": 0.01032937, - "balance_loss_clip": 1.0363667, - "balance_loss_mlp": 1.02052104, - "epoch": 0.6575680144295807, - "flos": 20924035050240.0, - "grad_norm": 1.7517271883506782, - "language_loss": 0.68920904, - "learning_rate": 1.1088719371478056e-06, - "loss": 0.71019292, - "num_input_tokens_seen": 236108540, - "step": 10937, - "time_per_iteration": 2.7036542892456055 - }, - { - "auxiliary_loss_clip": 0.01087739, - "auxiliary_loss_mlp": 0.0103126, - "balance_loss_clip": 1.03832746, - "balance_loss_mlp": 1.01813471, - "epoch": 0.6576281376822486, - "flos": 10925642833920.0, - "grad_norm": 2.652931448732022, - "language_loss": 0.6823296, - "learning_rate": 1.1085232884469236e-06, - "loss": 0.70351958, - "num_input_tokens_seen": 236124495, - "step": 10938, - "time_per_iteration": 2.6599676609039307 - }, - { - "auxiliary_loss_clip": 0.01085941, - "auxiliary_loss_mlp": 0.01033843, - "balance_loss_clip": 1.03766704, - "balance_loss_mlp": 1.02009773, - "epoch": 0.6576882609349166, - "flos": 19281552819840.0, - "grad_norm": 3.453384337403157, - "language_loss": 0.71610057, - "learning_rate": 1.108174673550927e-06, - "loss": 0.73729843, - "num_input_tokens_seen": 236142550, - "step": 10939, - "time_per_iteration": 2.650425672531128 - }, - { - "auxiliary_loss_clip": 0.01092138, - "auxiliary_loss_mlp": 0.00771382, - "balance_loss_clip": 1.03735209, - "balance_loss_mlp": 1.00023603, - "epoch": 0.6577483841875845, - "flos": 20220544206720.0, - "grad_norm": 2.2437103704575345, - "language_loss": 0.77729875, - "learning_rate": 1.107826092473037e-06, - "loss": 0.79593396, - "num_input_tokens_seen": 236156620, - "step": 10940, - "time_per_iteration": 2.669313669204712 - }, - { - "auxiliary_loss_clip": 0.01071259, - "auxiliary_loss_mlp": 0.01031827, - "balance_loss_clip": 1.03549123, - "balance_loss_mlp": 1.01780236, - "epoch": 0.6578085074402525, - "flos": 34751078962560.0, - "grad_norm": 2.3851655144351818, - "language_loss": 0.68552613, - "learning_rate": 1.107477545226471e-06, - "loss": 0.70655704, - "num_input_tokens_seen": 236177095, - "step": 10941, - "time_per_iteration": 2.8323819637298584 - }, - { - "auxiliary_loss_clip": 0.01098124, - "auxiliary_loss_mlp": 0.00771304, - "balance_loss_clip": 1.03532124, - "balance_loss_mlp": 1.00012338, - "epoch": 0.6578686306929205, - "flos": 23470998675840.0, - "grad_norm": 2.4287401057679436, - "language_loss": 0.68286288, - "learning_rate": 1.1071290318244448e-06, - "loss": 0.70155716, - "num_input_tokens_seen": 236194695, - "step": 10942, - "time_per_iteration": 2.662338972091675 - }, - { - "auxiliary_loss_clip": 0.01082673, - "auxiliary_loss_mlp": 0.01036222, - "balance_loss_clip": 1.03803504, - "balance_loss_mlp": 1.02132106, - "epoch": 0.6579287539455885, - "flos": 18077073033600.0, - "grad_norm": 1.9182303150374724, - "language_loss": 0.71618617, - "learning_rate": 1.1067805522801753e-06, - "loss": 0.73737514, - "num_input_tokens_seen": 236213885, - "step": 10943, - "time_per_iteration": 2.6217944622039795 - }, - { - "auxiliary_loss_clip": 0.01070671, - "auxiliary_loss_mlp": 0.01032935, - "balance_loss_clip": 1.03640389, - "balance_loss_mlp": 1.01936865, - "epoch": 0.6579888771982564, - "flos": 28661383900800.0, - "grad_norm": 1.8289069022809952, - "language_loss": 0.59149086, - "learning_rate": 1.1064321066068778e-06, - "loss": 0.61252689, - "num_input_tokens_seen": 236237315, - "step": 10944, - "time_per_iteration": 2.8202292919158936 - }, - { - "auxiliary_loss_clip": 0.01109311, - "auxiliary_loss_mlp": 0.01034463, - "balance_loss_clip": 1.04082966, - "balance_loss_mlp": 1.02081347, - "epoch": 0.6580490004509244, - "flos": 25046543911680.0, - "grad_norm": 1.5174772565974388, - "language_loss": 0.7224496, - "learning_rate": 1.1060836948177646e-06, - "loss": 0.74388736, - "num_input_tokens_seen": 236256345, - "step": 10945, - "time_per_iteration": 2.658428430557251 - }, - { - "auxiliary_loss_clip": 0.0109325, - "auxiliary_loss_mlp": 0.0102876, - "balance_loss_clip": 1.04045701, - "balance_loss_mlp": 1.0164274, - "epoch": 0.6581091237035923, - "flos": 43508793461760.0, - "grad_norm": 1.5303954795060517, - "language_loss": 0.70540607, - "learning_rate": 1.105735316926046e-06, - "loss": 0.72662616, - "num_input_tokens_seen": 236281890, - "step": 10946, - "time_per_iteration": 2.859764814376831 - }, - { - "auxiliary_loss_clip": 0.01103097, - "auxiliary_loss_mlp": 0.01034987, - "balance_loss_clip": 1.04042983, - "balance_loss_mlp": 1.02167702, - "epoch": 0.6581692469562603, - "flos": 22415404763520.0, - "grad_norm": 2.072130981046482, - "language_loss": 0.82211119, - "learning_rate": 1.105386972944934e-06, - "loss": 0.84349203, - "num_input_tokens_seen": 236298370, - "step": 10947, - "time_per_iteration": 2.630653142929077 - }, - { - "auxiliary_loss_clip": 0.01056612, - "auxiliary_loss_mlp": 0.00771489, - "balance_loss_clip": 1.0330416, - "balance_loss_mlp": 1.0001905, - "epoch": 0.6582293702089284, - "flos": 24859772167680.0, - "grad_norm": 1.881732401940151, - "language_loss": 0.77187896, - "learning_rate": 1.1050386628876385e-06, - "loss": 0.79015994, - "num_input_tokens_seen": 236317380, - "step": 10948, - "time_per_iteration": 2.7764172554016113 - }, - { - "auxiliary_loss_clip": 0.01105319, - "auxiliary_loss_mlp": 0.01030601, - "balance_loss_clip": 1.04180968, - "balance_loss_mlp": 1.01791072, - "epoch": 0.6582894934615963, - "flos": 23039676161280.0, - "grad_norm": 2.2574860884284793, - "language_loss": 0.79085296, - "learning_rate": 1.1046903867673655e-06, - "loss": 0.81221217, - "num_input_tokens_seen": 236336210, - "step": 10949, - "time_per_iteration": 2.7244157791137695 - }, - { - "auxiliary_loss_clip": 0.0102471, - "auxiliary_loss_mlp": 0.01003119, - "balance_loss_clip": 1.01120281, - "balance_loss_mlp": 1.00195682, - "epoch": 0.6583496167142643, - "flos": 72551980978560.0, - "grad_norm": 0.7330189150463328, - "language_loss": 0.6181432, - "learning_rate": 1.104342144597323e-06, - "loss": 0.63842142, - "num_input_tokens_seen": 236403090, - "step": 10950, - "time_per_iteration": 3.2641515731811523 - }, - { - "auxiliary_loss_clip": 0.01100983, - "auxiliary_loss_mlp": 0.01032251, - "balance_loss_clip": 1.0385226, - "balance_loss_mlp": 1.02026415, - "epoch": 0.6584097399669322, - "flos": 13078846592640.0, - "grad_norm": 2.3980091828088144, - "language_loss": 0.67179585, - "learning_rate": 1.1039939363907178e-06, - "loss": 0.69312811, - "num_input_tokens_seen": 236420475, - "step": 10951, - "time_per_iteration": 2.619748115539551 - }, - { - "auxiliary_loss_clip": 0.01100086, - "auxiliary_loss_mlp": 0.01034456, - "balance_loss_clip": 1.03776073, - "balance_loss_mlp": 1.02158761, - "epoch": 0.6584698632196002, - "flos": 28693164458880.0, - "grad_norm": 1.4089441578543043, - "language_loss": 0.76300871, - "learning_rate": 1.1036457621607504e-06, - "loss": 0.78435409, - "num_input_tokens_seen": 236441915, - "step": 10952, - "time_per_iteration": 2.7250633239746094 - }, - { - "auxiliary_loss_clip": 0.0111349, - "auxiliary_loss_mlp": 0.0103139, - "balance_loss_clip": 1.04090011, - "balance_loss_mlp": 1.018188, - "epoch": 0.6585299864722681, - "flos": 14319272914560.0, - "grad_norm": 1.8443663213164305, - "language_loss": 0.73402822, - "learning_rate": 1.1032976219206257e-06, - "loss": 0.75547707, - "num_input_tokens_seen": 236460340, - "step": 10953, - "time_per_iteration": 2.566080331802368 - }, - { - "auxiliary_loss_clip": 0.01082894, - "auxiliary_loss_mlp": 0.0104307, - "balance_loss_clip": 1.03907454, - "balance_loss_mlp": 1.02891934, - "epoch": 0.6585901097249361, - "flos": 26797907243520.0, - "grad_norm": 2.1744380051357, - "language_loss": 0.78487962, - "learning_rate": 1.102949515683546e-06, - "loss": 0.80613929, - "num_input_tokens_seen": 236478280, - "step": 10954, - "time_per_iteration": 2.724165678024292 - }, - { - "auxiliary_loss_clip": 0.01088368, - "auxiliary_loss_mlp": 0.01037943, - "balance_loss_clip": 1.03427434, - "balance_loss_mlp": 1.0242219, - "epoch": 0.658650232977604, - "flos": 18733124989440.0, - "grad_norm": 2.555140209313338, - "language_loss": 0.69544291, - "learning_rate": 1.1026014434627096e-06, - "loss": 0.71670604, - "num_input_tokens_seen": 236493225, - "step": 10955, - "time_per_iteration": 2.6414260864257812 - }, - { - "auxiliary_loss_clip": 0.01082497, - "auxiliary_loss_mlp": 0.01033927, - "balance_loss_clip": 1.03517938, - "balance_loss_mlp": 1.02191079, - "epoch": 0.6587103562302721, - "flos": 24753440931840.0, - "grad_norm": 2.1706102915019434, - "language_loss": 0.80620337, - "learning_rate": 1.1022534052713172e-06, - "loss": 0.82736766, - "num_input_tokens_seen": 236514420, - "step": 10956, - "time_per_iteration": 2.679706335067749 - }, - { - "auxiliary_loss_clip": 0.01104337, - "auxiliary_loss_mlp": 0.01038231, - "balance_loss_clip": 1.04236186, - "balance_loss_mlp": 1.02459347, - "epoch": 0.65877047948294, - "flos": 22346133384960.0, - "grad_norm": 2.024941431440732, - "language_loss": 0.81428325, - "learning_rate": 1.1019054011225648e-06, - "loss": 0.83570898, - "num_input_tokens_seen": 236532785, - "step": 10957, - "time_per_iteration": 2.7104432582855225 - }, - { - "auxiliary_loss_clip": 0.01091788, - "auxiliary_loss_mlp": 0.01030278, - "balance_loss_clip": 1.04065537, - "balance_loss_mlp": 1.01872087, - "epoch": 0.658830602735608, - "flos": 45180542298240.0, - "grad_norm": 1.6614910080791612, - "language_loss": 0.75887316, - "learning_rate": 1.1015574310296506e-06, - "loss": 0.78009385, - "num_input_tokens_seen": 236553330, - "step": 10958, - "time_per_iteration": 2.829876661300659 - }, - { - "auxiliary_loss_clip": 0.01070256, - "auxiliary_loss_mlp": 0.01040301, - "balance_loss_clip": 1.0364852, - "balance_loss_mlp": 1.02578747, - "epoch": 0.6588907259882759, - "flos": 19901622326400.0, - "grad_norm": 1.76623385890274, - "language_loss": 0.74976909, - "learning_rate": 1.1012094950057678e-06, - "loss": 0.77087468, - "num_input_tokens_seen": 236572960, - "step": 10959, - "time_per_iteration": 4.3221375942230225 - }, - { - "auxiliary_loss_clip": 0.01103616, - "auxiliary_loss_mlp": 0.01030743, - "balance_loss_clip": 1.03967154, - "balance_loss_mlp": 1.01826799, - "epoch": 0.6589508492409439, - "flos": 24133766474880.0, - "grad_norm": 1.6028003647190308, - "language_loss": 0.6497494, - "learning_rate": 1.1008615930641107e-06, - "loss": 0.67109299, - "num_input_tokens_seen": 236594090, - "step": 10960, - "time_per_iteration": 2.685056209564209 - }, - { - "auxiliary_loss_clip": 0.01119947, - "auxiliary_loss_mlp": 0.01034422, - "balance_loss_clip": 1.04166222, - "balance_loss_mlp": 1.0203135, - "epoch": 0.659010972493612, - "flos": 18222906251520.0, - "grad_norm": 3.156226944144234, - "language_loss": 0.81759185, - "learning_rate": 1.1005137252178734e-06, - "loss": 0.83913553, - "num_input_tokens_seen": 236610190, - "step": 10961, - "time_per_iteration": 2.6374056339263916 - }, - { - "auxiliary_loss_clip": 0.01076452, - "auxiliary_loss_mlp": 0.01033701, - "balance_loss_clip": 1.03810012, - "balance_loss_mlp": 1.01989698, - "epoch": 0.6590710957462799, - "flos": 27600007898880.0, - "grad_norm": 1.7436713822775258, - "language_loss": 0.73479664, - "learning_rate": 1.1001658914802453e-06, - "loss": 0.75589824, - "num_input_tokens_seen": 236631575, - "step": 10962, - "time_per_iteration": 4.275976181030273 - }, - { - "auxiliary_loss_clip": 0.0109814, - "auxiliary_loss_mlp": 0.01033187, - "balance_loss_clip": 1.03807235, - "balance_loss_mlp": 1.01996064, - "epoch": 0.6591312189989479, - "flos": 20302959962880.0, - "grad_norm": 1.9404531224692678, - "language_loss": 0.80004346, - "learning_rate": 1.0998180918644165e-06, - "loss": 0.82135677, - "num_input_tokens_seen": 236649815, - "step": 10963, - "time_per_iteration": 4.260782480239868 - }, - { - "auxiliary_loss_clip": 0.01062785, - "auxiliary_loss_mlp": 0.00769293, - "balance_loss_clip": 1.0372498, - "balance_loss_mlp": 1.00011432, - "epoch": 0.6591913422516158, - "flos": 12312943868160.0, - "grad_norm": 1.8441045997478804, - "language_loss": 0.78224564, - "learning_rate": 1.0994703263835754e-06, - "loss": 0.80056643, - "num_input_tokens_seen": 236668335, - "step": 10964, - "time_per_iteration": 2.6830945014953613 - }, - { - "auxiliary_loss_clip": 0.01075287, - "auxiliary_loss_mlp": 0.01040262, - "balance_loss_clip": 1.03438485, - "balance_loss_mlp": 1.02721417, - "epoch": 0.6592514655042838, - "flos": 25884591102720.0, - "grad_norm": 1.683709186180651, - "language_loss": 0.73955643, - "learning_rate": 1.0991225950509106e-06, - "loss": 0.76071191, - "num_input_tokens_seen": 236688945, - "step": 10965, - "time_per_iteration": 2.687619924545288 - }, - { - "auxiliary_loss_clip": 0.01081038, - "auxiliary_loss_mlp": 0.01038566, - "balance_loss_clip": 1.03631306, - "balance_loss_mlp": 1.02412999, - "epoch": 0.6593115887569517, - "flos": 14063624841600.0, - "grad_norm": 2.0913085470177943, - "language_loss": 0.73648584, - "learning_rate": 1.0987748978796067e-06, - "loss": 0.75768185, - "num_input_tokens_seen": 236707055, - "step": 10966, - "time_per_iteration": 2.6525564193725586 - }, - { - "auxiliary_loss_clip": 0.01102724, - "auxiliary_loss_mlp": 0.01032944, - "balance_loss_clip": 1.03741455, - "balance_loss_mlp": 1.01951456, - "epoch": 0.6593717120096197, - "flos": 24717925359360.0, - "grad_norm": 1.533295813226106, - "language_loss": 0.76610076, - "learning_rate": 1.0984272348828487e-06, - "loss": 0.78745747, - "num_input_tokens_seen": 236725900, - "step": 10967, - "time_per_iteration": 4.112145900726318 - }, - { - "auxiliary_loss_clip": 0.01023116, - "auxiliary_loss_mlp": 0.0100237, - "balance_loss_clip": 1.00873816, - "balance_loss_mlp": 1.00111854, - "epoch": 0.6594318352622877, - "flos": 55558083502080.0, - "grad_norm": 0.6961608375444348, - "language_loss": 0.48445863, - "learning_rate": 1.0980796060738221e-06, - "loss": 0.50471348, - "num_input_tokens_seen": 236788415, - "step": 10968, - "time_per_iteration": 3.0989933013916016 - }, - { - "auxiliary_loss_clip": 0.01066259, - "auxiliary_loss_mlp": 0.01036629, - "balance_loss_clip": 1.03324318, - "balance_loss_mlp": 1.02168036, - "epoch": 0.6594919585149557, - "flos": 17456931699840.0, - "grad_norm": 1.7813410381881563, - "language_loss": 0.79142725, - "learning_rate": 1.0977320114657058e-06, - "loss": 0.81245613, - "num_input_tokens_seen": 236805155, - "step": 10969, - "time_per_iteration": 2.6929845809936523 - }, - { - "auxiliary_loss_clip": 0.01103958, - "auxiliary_loss_mlp": 0.01031334, - "balance_loss_clip": 1.0396595, - "balance_loss_mlp": 1.01903188, - "epoch": 0.6595520817676236, - "flos": 18223229473920.0, - "grad_norm": 2.1653605986578137, - "language_loss": 0.65524602, - "learning_rate": 1.0973844510716817e-06, - "loss": 0.67659903, - "num_input_tokens_seen": 236824360, - "step": 10970, - "time_per_iteration": 2.5729503631591797 - }, - { - "auxiliary_loss_clip": 0.01098998, - "auxiliary_loss_mlp": 0.01031128, - "balance_loss_clip": 1.03612995, - "balance_loss_mlp": 1.01827741, - "epoch": 0.6596122050202916, - "flos": 22199761463040.0, - "grad_norm": 1.6607954000770715, - "language_loss": 0.7680558, - "learning_rate": 1.0970369249049308e-06, - "loss": 0.78935707, - "num_input_tokens_seen": 236844640, - "step": 10971, - "time_per_iteration": 2.699892997741699 - }, - { - "auxiliary_loss_clip": 0.01045077, - "auxiliary_loss_mlp": 0.01047077, - "balance_loss_clip": 1.03190637, - "balance_loss_mlp": 1.03174686, - "epoch": 0.6596723282729595, - "flos": 14173834746240.0, - "grad_norm": 2.880961149913922, - "language_loss": 0.70055163, - "learning_rate": 1.096689432978629e-06, - "loss": 0.72147322, - "num_input_tokens_seen": 236861160, - "step": 10972, - "time_per_iteration": 2.7359213829040527 - }, - { - "auxiliary_loss_clip": 0.01101135, - "auxiliary_loss_mlp": 0.01025815, - "balance_loss_clip": 1.03941655, - "balance_loss_mlp": 1.01266074, - "epoch": 0.6597324515256275, - "flos": 30553193410560.0, - "grad_norm": 9.926316428888306, - "language_loss": 0.55695325, - "learning_rate": 1.0963419753059556e-06, - "loss": 0.57822275, - "num_input_tokens_seen": 236880465, - "step": 10973, - "time_per_iteration": 2.69612455368042 - }, - { - "auxiliary_loss_clip": 0.01099195, - "auxiliary_loss_mlp": 0.01040169, - "balance_loss_clip": 1.0419203, - "balance_loss_mlp": 1.02660263, - "epoch": 0.6597925747782956, - "flos": 17639860688640.0, - "grad_norm": 2.5012890193080026, - "language_loss": 0.78572869, - "learning_rate": 1.0959945519000839e-06, - "loss": 0.80712223, - "num_input_tokens_seen": 236897730, - "step": 10974, - "time_per_iteration": 2.6455633640289307 - }, - { - "auxiliary_loss_clip": 0.01100482, - "auxiliary_loss_mlp": 0.01037502, - "balance_loss_clip": 1.04022431, - "balance_loss_mlp": 1.02422214, - "epoch": 0.6598526980309635, - "flos": 22819112697600.0, - "grad_norm": 2.251661999993696, - "language_loss": 0.68701649, - "learning_rate": 1.0956471627741906e-06, - "loss": 0.70839626, - "num_input_tokens_seen": 236917300, - "step": 10975, - "time_per_iteration": 2.6761295795440674 - }, - { - "auxiliary_loss_clip": 0.01097399, - "auxiliary_loss_mlp": 0.01032564, - "balance_loss_clip": 1.03912926, - "balance_loss_mlp": 1.02060747, - "epoch": 0.6599128212836315, - "flos": 21068036674560.0, - "grad_norm": 1.6540937029958567, - "language_loss": 0.70881736, - "learning_rate": 1.0952998079414464e-06, - "loss": 0.73011696, - "num_input_tokens_seen": 236935590, - "step": 10976, - "time_per_iteration": 2.5975265502929688 - }, - { - "auxiliary_loss_clip": 0.01083365, - "auxiliary_loss_mlp": 0.01033377, - "balance_loss_clip": 1.03734148, - "balance_loss_mlp": 1.02016902, - "epoch": 0.6599729445362994, - "flos": 22163527618560.0, - "grad_norm": 1.6096140121507374, - "language_loss": 0.67765009, - "learning_rate": 1.0949524874150243e-06, - "loss": 0.69881749, - "num_input_tokens_seen": 236952830, - "step": 10977, - "time_per_iteration": 2.676992177963257 - }, - { - "auxiliary_loss_clip": 0.01079353, - "auxiliary_loss_mlp": 0.01037069, - "balance_loss_clip": 1.03872538, - "balance_loss_mlp": 1.02254331, - "epoch": 0.6600330677889674, - "flos": 18150079426560.0, - "grad_norm": 2.028840451789988, - "language_loss": 0.80975902, - "learning_rate": 1.0946052012080952e-06, - "loss": 0.8309232, - "num_input_tokens_seen": 236971930, - "step": 10978, - "time_per_iteration": 2.670058488845825 - }, - { - "auxiliary_loss_clip": 0.01084138, - "auxiliary_loss_mlp": 0.01037844, - "balance_loss_clip": 1.03935933, - "balance_loss_mlp": 1.02446318, - "epoch": 0.6600931910416353, - "flos": 18150115340160.0, - "grad_norm": 3.3630669376979534, - "language_loss": 0.67552471, - "learning_rate": 1.0942579493338278e-06, - "loss": 0.69674456, - "num_input_tokens_seen": 236989920, - "step": 10979, - "time_per_iteration": 2.6543848514556885 - }, - { - "auxiliary_loss_clip": 0.01082232, - "auxiliary_loss_mlp": 0.01035503, - "balance_loss_clip": 1.03750384, - "balance_loss_mlp": 1.02135265, - "epoch": 0.6601533142943034, - "flos": 17420733768960.0, - "grad_norm": 2.7062652296553793, - "language_loss": 0.7310946, - "learning_rate": 1.0939107318053889e-06, - "loss": 0.75227201, - "num_input_tokens_seen": 237006570, - "step": 10980, - "time_per_iteration": 2.614719867706299 - }, - { - "auxiliary_loss_clip": 0.01075162, - "auxiliary_loss_mlp": 0.01033537, - "balance_loss_clip": 1.0369494, - "balance_loss_mlp": 1.02132368, - "epoch": 0.6602134375469713, - "flos": 28219574615040.0, - "grad_norm": 1.6769637422208983, - "language_loss": 0.72674447, - "learning_rate": 1.0935635486359459e-06, - "loss": 0.74783146, - "num_input_tokens_seen": 237028415, - "step": 10981, - "time_per_iteration": 2.7521674633026123 - }, - { - "auxiliary_loss_clip": 0.01059889, - "auxiliary_loss_mlp": 0.0103708, - "balance_loss_clip": 1.03629518, - "balance_loss_mlp": 1.02407432, - "epoch": 0.6602735607996393, - "flos": 29418056830080.0, - "grad_norm": 2.169047564074697, - "language_loss": 0.68625891, - "learning_rate": 1.0932163998386647e-06, - "loss": 0.70722854, - "num_input_tokens_seen": 237046595, - "step": 10982, - "time_per_iteration": 2.791590690612793 - }, - { - "auxiliary_loss_clip": 0.01102094, - "auxiliary_loss_mlp": 0.01028932, - "balance_loss_clip": 1.03903246, - "balance_loss_mlp": 1.01600397, - "epoch": 0.6603336840523072, - "flos": 18588045957120.0, - "grad_norm": 1.9479050528854345, - "language_loss": 0.69151658, - "learning_rate": 1.0928692854267075e-06, - "loss": 0.71282685, - "num_input_tokens_seen": 237066150, - "step": 10983, - "time_per_iteration": 2.662109851837158 - }, - { - "auxiliary_loss_clip": 0.01102705, - "auxiliary_loss_mlp": 0.01032804, - "balance_loss_clip": 1.03690076, - "balance_loss_mlp": 1.0190587, - "epoch": 0.6603938073049752, - "flos": 33254860913280.0, - "grad_norm": 1.7348773084229319, - "language_loss": 0.70333445, - "learning_rate": 1.092522205413239e-06, - "loss": 0.72468954, - "num_input_tokens_seen": 237087060, - "step": 10984, - "time_per_iteration": 2.732595443725586 - }, - { - "auxiliary_loss_clip": 0.01077924, - "auxiliary_loss_mlp": 0.01038628, - "balance_loss_clip": 1.03689432, - "balance_loss_mlp": 1.02587259, - "epoch": 0.6604539305576431, - "flos": 17384284442880.0, - "grad_norm": 1.6767760179184985, - "language_loss": 0.83797729, - "learning_rate": 1.0921751598114193e-06, - "loss": 0.85914278, - "num_input_tokens_seen": 237103825, - "step": 10985, - "time_per_iteration": 2.654433250427246 - }, - { - "auxiliary_loss_clip": 0.01105556, - "auxiliary_loss_mlp": 0.01034686, - "balance_loss_clip": 1.0407331, - "balance_loss_mlp": 1.02094078, - "epoch": 0.6605140538103111, - "flos": 21251145231360.0, - "grad_norm": 2.384704611416695, - "language_loss": 0.74183935, - "learning_rate": 1.0918281486344077e-06, - "loss": 0.76324177, - "num_input_tokens_seen": 237121740, - "step": 10986, - "time_per_iteration": 2.6019506454467773 - }, - { - "auxiliary_loss_clip": 0.01100549, - "auxiliary_loss_mlp": 0.01029651, - "balance_loss_clip": 1.03883743, - "balance_loss_mlp": 1.01647878, - "epoch": 0.6605741770629792, - "flos": 13881701433600.0, - "grad_norm": 1.9122697335713108, - "language_loss": 0.78908652, - "learning_rate": 1.0914811718953636e-06, - "loss": 0.81038857, - "num_input_tokens_seen": 237139565, - "step": 10987, - "time_per_iteration": 2.5722427368164062 - }, - { - "auxiliary_loss_clip": 0.01008768, - "auxiliary_loss_mlp": 0.01002668, - "balance_loss_clip": 1.00836062, - "balance_loss_mlp": 1.0013566, - "epoch": 0.6606343003156471, - "flos": 69316215171840.0, - "grad_norm": 0.8094121865469099, - "language_loss": 0.541363, - "learning_rate": 1.0911342296074454e-06, - "loss": 0.5614773, - "num_input_tokens_seen": 237201055, - "step": 10988, - "time_per_iteration": 3.272397994995117 - }, - { - "auxiliary_loss_clip": 0.01053267, - "auxiliary_loss_mlp": 0.01036624, - "balance_loss_clip": 1.03639925, - "balance_loss_mlp": 1.02483392, - "epoch": 0.6606944235683151, - "flos": 27272394927360.0, - "grad_norm": 1.725996965304981, - "language_loss": 0.77469909, - "learning_rate": 1.0907873217838077e-06, - "loss": 0.79559803, - "num_input_tokens_seen": 237221805, - "step": 10989, - "time_per_iteration": 2.911433458328247 - }, - { - "auxiliary_loss_clip": 0.01092952, - "auxiliary_loss_mlp": 0.01034291, - "balance_loss_clip": 1.04096937, - "balance_loss_mlp": 1.02172589, - "epoch": 0.660754546820983, - "flos": 13772820332160.0, - "grad_norm": 2.2526328276614542, - "language_loss": 0.77053428, - "learning_rate": 1.0904404484376064e-06, - "loss": 0.7918067, - "num_input_tokens_seen": 237238270, - "step": 10990, - "time_per_iteration": 2.6875393390655518 - }, - { - "auxiliary_loss_clip": 0.01116631, - "auxiliary_loss_mlp": 0.01032525, - "balance_loss_clip": 1.04041815, - "balance_loss_mlp": 1.01960862, - "epoch": 0.660814670073651, - "flos": 15705209232000.0, - "grad_norm": 4.452653785760573, - "language_loss": 0.60725391, - "learning_rate": 1.0900936095819937e-06, - "loss": 0.62874544, - "num_input_tokens_seen": 237255400, - "step": 10991, - "time_per_iteration": 2.581926107406616 - }, - { - "auxiliary_loss_clip": 0.01088945, - "auxiliary_loss_mlp": 0.01037016, - "balance_loss_clip": 1.03823137, - "balance_loss_mlp": 1.02305102, - "epoch": 0.6608747933263189, - "flos": 20850023076480.0, - "grad_norm": 2.2752499400269057, - "language_loss": 0.68441308, - "learning_rate": 1.0897468052301234e-06, - "loss": 0.70567274, - "num_input_tokens_seen": 237273105, - "step": 10992, - "time_per_iteration": 2.6633994579315186 - }, - { - "auxiliary_loss_clip": 0.01102357, - "auxiliary_loss_mlp": 0.01033607, - "balance_loss_clip": 1.03874791, - "balance_loss_mlp": 1.02007651, - "epoch": 0.660934916578987, - "flos": 20632117219200.0, - "grad_norm": 1.7286431682231886, - "language_loss": 0.87802613, - "learning_rate": 1.0894000353951444e-06, - "loss": 0.89938569, - "num_input_tokens_seen": 237292650, - "step": 10993, - "time_per_iteration": 2.618743419647217 - }, - { - "auxiliary_loss_clip": 0.01111168, - "auxiliary_loss_mlp": 0.01033402, - "balance_loss_clip": 1.04143643, - "balance_loss_mlp": 1.01837611, - "epoch": 0.6609950398316549, - "flos": 25113588647040.0, - "grad_norm": 1.7020728160261662, - "language_loss": 0.66939056, - "learning_rate": 1.0890533000902078e-06, - "loss": 0.69083625, - "num_input_tokens_seen": 237312865, - "step": 10994, - "time_per_iteration": 2.694892406463623 - }, - { - "auxiliary_loss_clip": 0.01078298, - "auxiliary_loss_mlp": 0.01039322, - "balance_loss_clip": 1.03795636, - "balance_loss_mlp": 1.02551126, - "epoch": 0.6610551630843229, - "flos": 18661196004480.0, - "grad_norm": 2.5249476876910277, - "language_loss": 0.77071732, - "learning_rate": 1.0887065993284626e-06, - "loss": 0.79189348, - "num_input_tokens_seen": 237331210, - "step": 10995, - "time_per_iteration": 2.6232664585113525 - }, - { - "auxiliary_loss_clip": 0.01093968, - "auxiliary_loss_mlp": 0.01029631, - "balance_loss_clip": 1.03934228, - "balance_loss_mlp": 1.01722097, - "epoch": 0.6611152863369908, - "flos": 23258192549760.0, - "grad_norm": 1.8438791376239891, - "language_loss": 0.74463415, - "learning_rate": 1.088359933123053e-06, - "loss": 0.76587015, - "num_input_tokens_seen": 237349455, - "step": 10996, - "time_per_iteration": 2.628135919570923 - }, - { - "auxiliary_loss_clip": 0.01115792, - "auxiliary_loss_mlp": 0.01034651, - "balance_loss_clip": 1.04123545, - "balance_loss_mlp": 1.02159739, - "epoch": 0.6611754095896588, - "flos": 22159720776960.0, - "grad_norm": 1.8400435689118084, - "language_loss": 0.69207805, - "learning_rate": 1.088013301487126e-06, - "loss": 0.71358246, - "num_input_tokens_seen": 237367100, - "step": 10997, - "time_per_iteration": 2.5729880332946777 - }, - { - "auxiliary_loss_clip": 0.01095929, - "auxiliary_loss_mlp": 0.01033874, - "balance_loss_clip": 1.0389818, - "balance_loss_mlp": 1.02096367, - "epoch": 0.6612355328423267, - "flos": 13991228979840.0, - "grad_norm": 2.212339587573469, - "language_loss": 0.68443197, - "learning_rate": 1.0876667044338269e-06, - "loss": 0.70572996, - "num_input_tokens_seen": 237384840, - "step": 10998, - "time_per_iteration": 4.240036249160767 - }, - { - "auxiliary_loss_clip": 0.01026396, - "auxiliary_loss_mlp": 0.01003226, - "balance_loss_clip": 1.01201963, - "balance_loss_mlp": 1.00200462, - "epoch": 0.6612956560949947, - "flos": 61453716359040.0, - "grad_norm": 0.6556172869742106, - "language_loss": 0.51124817, - "learning_rate": 1.087320141976297e-06, - "loss": 0.53154439, - "num_input_tokens_seen": 237443355, - "step": 10999, - "time_per_iteration": 3.0903005599975586 - }, - { - "auxiliary_loss_clip": 0.01117437, - "auxiliary_loss_mlp": 0.00771071, - "balance_loss_clip": 1.04025114, - "balance_loss_mlp": 1.00016904, - "epoch": 0.6613557793476627, - "flos": 21616644072960.0, - "grad_norm": 2.396543073072743, - "language_loss": 0.70902514, - "learning_rate": 1.086973614127679e-06, - "loss": 0.72791028, - "num_input_tokens_seen": 237459205, - "step": 11000, - "time_per_iteration": 2.5685982704162598 - }, - { - "auxiliary_loss_clip": 0.01082819, - "auxiliary_loss_mlp": 0.01036908, - "balance_loss_clip": 1.03847837, - "balance_loss_mlp": 1.024737, - "epoch": 0.6614159026003307, - "flos": 34020117192960.0, - "grad_norm": 1.430398099595452, - "language_loss": 0.65089309, - "learning_rate": 1.0866271209011133e-06, - "loss": 0.67209029, - "num_input_tokens_seen": 237483580, - "step": 11001, - "time_per_iteration": 4.2755303382873535 - }, - { - "auxiliary_loss_clip": 0.01112876, - "auxiliary_loss_mlp": 0.0103109, - "balance_loss_clip": 1.03954029, - "balance_loss_mlp": 1.01845384, - "epoch": 0.6614760258529987, - "flos": 24097281235200.0, - "grad_norm": 1.7701672836009255, - "language_loss": 0.7300179, - "learning_rate": 1.086280662309739e-06, - "loss": 0.75145757, - "num_input_tokens_seen": 237502860, - "step": 11002, - "time_per_iteration": 2.6314847469329834 - }, - { - "auxiliary_loss_clip": 0.01097492, - "auxiliary_loss_mlp": 0.01037063, - "balance_loss_clip": 1.03688526, - "balance_loss_mlp": 1.02355647, - "epoch": 0.6615361491056666, - "flos": 14903790935040.0, - "grad_norm": 1.9389438141435231, - "language_loss": 0.79010653, - "learning_rate": 1.0859342383666928e-06, - "loss": 0.81145215, - "num_input_tokens_seen": 237521030, - "step": 11003, - "time_per_iteration": 4.314274072647095 - }, - { - "auxiliary_loss_clip": 0.01104366, - "auxiliary_loss_mlp": 0.01038348, - "balance_loss_clip": 1.03993845, - "balance_loss_mlp": 1.02454972, - "epoch": 0.6615962723583346, - "flos": 15304877176320.0, - "grad_norm": 1.933163608906101, - "language_loss": 0.69039351, - "learning_rate": 1.0855878490851119e-06, - "loss": 0.7118206, - "num_input_tokens_seen": 237539585, - "step": 11004, - "time_per_iteration": 2.6783957481384277 - }, - { - "auxiliary_loss_clip": 0.01104574, - "auxiliary_loss_mlp": 0.0103687, - "balance_loss_clip": 1.03920364, - "balance_loss_mlp": 1.02226102, - "epoch": 0.6616563956110025, - "flos": 18732586285440.0, - "grad_norm": 2.0685835155239487, - "language_loss": 0.69767517, - "learning_rate": 1.085241494478132e-06, - "loss": 0.71908963, - "num_input_tokens_seen": 237557655, - "step": 11005, - "time_per_iteration": 2.5958964824676514 - }, - { - "auxiliary_loss_clip": 0.01094809, - "auxiliary_loss_mlp": 0.01030023, - "balance_loss_clip": 1.04032111, - "balance_loss_mlp": 1.01691008, - "epoch": 0.6617165188636706, - "flos": 24495063425280.0, - "grad_norm": 4.5323320504778035, - "language_loss": 0.78211862, - "learning_rate": 1.0848951745588855e-06, - "loss": 0.80336696, - "num_input_tokens_seen": 237577000, - "step": 11006, - "time_per_iteration": 4.20892596244812 - }, - { - "auxiliary_loss_clip": 0.01102255, - "auxiliary_loss_mlp": 0.01033472, - "balance_loss_clip": 1.03898382, - "balance_loss_mlp": 1.02004886, - "epoch": 0.6617766421163385, - "flos": 22379673709440.0, - "grad_norm": 1.4341781143462713, - "language_loss": 0.76336843, - "learning_rate": 1.0845488893405068e-06, - "loss": 0.78472567, - "num_input_tokens_seen": 237597960, - "step": 11007, - "time_per_iteration": 2.6313998699188232 - }, - { - "auxiliary_loss_clip": 0.0110241, - "auxiliary_loss_mlp": 0.01033667, - "balance_loss_clip": 1.0410744, - "balance_loss_mlp": 1.02089977, - "epoch": 0.6618367653690065, - "flos": 20850418126080.0, - "grad_norm": 1.678556210667641, - "language_loss": 0.78647077, - "learning_rate": 1.0842026388361248e-06, - "loss": 0.80783153, - "num_input_tokens_seen": 237616385, - "step": 11008, - "time_per_iteration": 2.6336562633514404 - }, - { - "auxiliary_loss_clip": 0.01117118, - "auxiliary_loss_mlp": 0.01030319, - "balance_loss_clip": 1.0386076, - "balance_loss_mlp": 1.01620448, - "epoch": 0.6618968886216744, - "flos": 17712328377600.0, - "grad_norm": 1.8062458144067923, - "language_loss": 0.81780714, - "learning_rate": 1.0838564230588715e-06, - "loss": 0.83928156, - "num_input_tokens_seen": 237634930, - "step": 11009, - "time_per_iteration": 2.559891939163208 - }, - { - "auxiliary_loss_clip": 0.01003698, - "auxiliary_loss_mlp": 0.01000096, - "balance_loss_clip": 1.01631284, - "balance_loss_mlp": 0.99864715, - "epoch": 0.6619570118743424, - "flos": 67035347498880.0, - "grad_norm": 1.1306824373429385, - "language_loss": 0.67373979, - "learning_rate": 1.0835102420218735e-06, - "loss": 0.69377768, - "num_input_tokens_seen": 237693175, - "step": 11010, - "time_per_iteration": 3.1341817378997803 - }, - { - "auxiliary_loss_clip": 0.01103659, - "auxiliary_loss_mlp": 0.01034485, - "balance_loss_clip": 1.03835106, - "balance_loss_mlp": 1.02063894, - "epoch": 0.6620171351270103, - "flos": 18660908695680.0, - "grad_norm": 1.5388019077167303, - "language_loss": 0.71031803, - "learning_rate": 1.0831640957382593e-06, - "loss": 0.73169947, - "num_input_tokens_seen": 237713160, - "step": 11011, - "time_per_iteration": 2.6373953819274902 - }, - { - "auxiliary_loss_clip": 0.01106184, - "auxiliary_loss_mlp": 0.01032454, - "balance_loss_clip": 1.04299128, - "balance_loss_mlp": 1.01964521, - "epoch": 0.6620772583796783, - "flos": 24170503109760.0, - "grad_norm": 1.4417744086263622, - "language_loss": 0.7236765, - "learning_rate": 1.0828179842211557e-06, - "loss": 0.74506283, - "num_input_tokens_seen": 237733600, - "step": 11012, - "time_per_iteration": 2.6834990978240967 - }, - { - "auxiliary_loss_clip": 0.01098433, - "auxiliary_loss_mlp": 0.01034539, - "balance_loss_clip": 1.03888941, - "balance_loss_mlp": 1.02273691, - "epoch": 0.6621373816323463, - "flos": 23623547736960.0, - "grad_norm": 1.657176750213381, - "language_loss": 0.79366904, - "learning_rate": 1.0824719074836845e-06, - "loss": 0.81499881, - "num_input_tokens_seen": 237752135, - "step": 11013, - "time_per_iteration": 2.6497538089752197 - }, - { - "auxiliary_loss_clip": 0.01092428, - "auxiliary_loss_mlp": 0.01032971, - "balance_loss_clip": 1.03971934, - "balance_loss_mlp": 1.01944637, - "epoch": 0.6621975048850143, - "flos": 18442212739200.0, - "grad_norm": 2.6791842321865698, - "language_loss": 0.70635635, - "learning_rate": 1.082125865538971e-06, - "loss": 0.72761035, - "num_input_tokens_seen": 237770735, - "step": 11014, - "time_per_iteration": 2.6886751651763916 - }, - { - "auxiliary_loss_clip": 0.01083433, - "auxiliary_loss_mlp": 0.00768947, - "balance_loss_clip": 1.03894365, - "balance_loss_mlp": 1.00011313, - "epoch": 0.6622576281376823, - "flos": 14063876236800.0, - "grad_norm": 1.8642341672837748, - "language_loss": 0.77003562, - "learning_rate": 1.081779858400137e-06, - "loss": 0.78855944, - "num_input_tokens_seen": 237789005, - "step": 11015, - "time_per_iteration": 2.7417409420013428 - }, - { - "auxiliary_loss_clip": 0.01104344, - "auxiliary_loss_mlp": 0.0077007, - "balance_loss_clip": 1.04066467, - "balance_loss_mlp": 1.00019598, - "epoch": 0.6623177513903502, - "flos": 17018965169280.0, - "grad_norm": 1.678948777257364, - "language_loss": 0.82612354, - "learning_rate": 1.0814338860803021e-06, - "loss": 0.84486771, - "num_input_tokens_seen": 237807740, - "step": 11016, - "time_per_iteration": 2.6134469509124756 - }, - { - "auxiliary_loss_clip": 0.01098949, - "auxiliary_loss_mlp": 0.01033634, - "balance_loss_clip": 1.03807402, - "balance_loss_mlp": 1.02006221, - "epoch": 0.6623778746430182, - "flos": 17271021882240.0, - "grad_norm": 1.953011458286016, - "language_loss": 0.69714379, - "learning_rate": 1.0810879485925864e-06, - "loss": 0.71846962, - "num_input_tokens_seen": 237826340, - "step": 11017, - "time_per_iteration": 2.58854079246521 - }, - { - "auxiliary_loss_clip": 0.01083899, - "auxiliary_loss_mlp": 0.01039946, - "balance_loss_clip": 1.0361867, - "balance_loss_mlp": 1.02632689, - "epoch": 0.6624379978956861, - "flos": 48792688767360.0, - "grad_norm": 1.7400500770773162, - "language_loss": 0.774885, - "learning_rate": 1.0807420459501084e-06, - "loss": 0.79612345, - "num_input_tokens_seen": 237848305, - "step": 11018, - "time_per_iteration": 2.9582974910736084 - }, - { - "auxiliary_loss_clip": 0.01091037, - "auxiliary_loss_mlp": 0.01042104, - "balance_loss_clip": 1.03768778, - "balance_loss_mlp": 1.02916956, - "epoch": 0.6624981211483542, - "flos": 18952431477120.0, - "grad_norm": 2.014925244928839, - "language_loss": 0.83705002, - "learning_rate": 1.0803961781659841e-06, - "loss": 0.85838139, - "num_input_tokens_seen": 237867020, - "step": 11019, - "time_per_iteration": 2.684549331665039 - }, - { - "auxiliary_loss_clip": 0.01097432, - "auxiliary_loss_mlp": 0.00772198, - "balance_loss_clip": 1.03844643, - "balance_loss_mlp": 1.00007081, - "epoch": 0.6625582444010221, - "flos": 23256576437760.0, - "grad_norm": 1.6087102704367435, - "language_loss": 0.71948653, - "learning_rate": 1.080050345253328e-06, - "loss": 0.73818284, - "num_input_tokens_seen": 237886710, - "step": 11020, - "time_per_iteration": 2.6002566814422607 - }, - { - "auxiliary_loss_clip": 0.01092653, - "auxiliary_loss_mlp": 0.01030916, - "balance_loss_clip": 1.03763211, - "balance_loss_mlp": 1.01636672, - "epoch": 0.6626183676536901, - "flos": 21394823633280.0, - "grad_norm": 1.6700673315170445, - "language_loss": 0.72552252, - "learning_rate": 1.0797045472252554e-06, - "loss": 0.74675822, - "num_input_tokens_seen": 237904795, - "step": 11021, - "time_per_iteration": 2.677899122238159 - }, - { - "auxiliary_loss_clip": 0.01087084, - "auxiliary_loss_mlp": 0.0104125, - "balance_loss_clip": 1.03822863, - "balance_loss_mlp": 1.02790403, - "epoch": 0.662678490906358, - "flos": 14571293713920.0, - "grad_norm": 2.016335698142833, - "language_loss": 0.83232486, - "learning_rate": 1.0793587840948793e-06, - "loss": 0.85360825, - "num_input_tokens_seen": 237921320, - "step": 11022, - "time_per_iteration": 2.62428879737854 - }, - { - "auxiliary_loss_clip": 0.01099654, - "auxiliary_loss_mlp": 0.01034417, - "balance_loss_clip": 1.04019356, - "balance_loss_mlp": 1.01928318, - "epoch": 0.662738614159026, - "flos": 15992350554240.0, - "grad_norm": 2.476624679148487, - "language_loss": 0.72735739, - "learning_rate": 1.0790130558753099e-06, - "loss": 0.74869806, - "num_input_tokens_seen": 237933525, - "step": 11023, - "time_per_iteration": 2.632291316986084 - }, - { - "auxiliary_loss_clip": 0.01079183, - "auxiliary_loss_mlp": 0.01035009, - "balance_loss_clip": 1.03499722, - "balance_loss_mlp": 1.02165151, - "epoch": 0.6627987374116939, - "flos": 19536338966400.0, - "grad_norm": 1.8699789342451163, - "language_loss": 0.75085115, - "learning_rate": 1.0786673625796574e-06, - "loss": 0.7719931, - "num_input_tokens_seen": 237953395, - "step": 11024, - "time_per_iteration": 2.7034032344818115 - }, - { - "auxiliary_loss_clip": 0.01083517, - "auxiliary_loss_mlp": 0.01031822, - "balance_loss_clip": 1.0384872, - "balance_loss_mlp": 1.01755285, - "epoch": 0.662858860664362, - "flos": 15702838934400.0, - "grad_norm": 2.491473090515614, - "language_loss": 0.69829249, - "learning_rate": 1.0783217042210306e-06, - "loss": 0.71944588, - "num_input_tokens_seen": 237971445, - "step": 11025, - "time_per_iteration": 2.7056894302368164 - }, - { - "auxiliary_loss_clip": 0.01118609, - "auxiliary_loss_mlp": 0.01038933, - "balance_loss_clip": 1.04383016, - "balance_loss_mlp": 1.02548599, - "epoch": 0.6629189839170299, - "flos": 20154289570560.0, - "grad_norm": 1.5605432120454088, - "language_loss": 0.79108787, - "learning_rate": 1.0779760808125379e-06, - "loss": 0.81266326, - "num_input_tokens_seen": 237989965, - "step": 11026, - "time_per_iteration": 2.6094040870666504 - }, - { - "auxiliary_loss_clip": 0.01104761, - "auxiliary_loss_mlp": 0.01030194, - "balance_loss_clip": 1.04092979, - "balance_loss_mlp": 1.01790905, - "epoch": 0.6629791071696979, - "flos": 20915415786240.0, - "grad_norm": 1.5667845463950276, - "language_loss": 0.75913531, - "learning_rate": 1.0776304923672842e-06, - "loss": 0.7804848, - "num_input_tokens_seen": 238006820, - "step": 11027, - "time_per_iteration": 2.6272130012512207 - }, - { - "auxiliary_loss_clip": 0.01088271, - "auxiliary_loss_mlp": 0.01038744, - "balance_loss_clip": 1.03918552, - "balance_loss_mlp": 1.02465403, - "epoch": 0.6630392304223659, - "flos": 20846898593280.0, - "grad_norm": 2.126605601662929, - "language_loss": 0.703035, - "learning_rate": 1.0772849388983742e-06, - "loss": 0.72430521, - "num_input_tokens_seen": 238022560, - "step": 11028, - "time_per_iteration": 2.7173945903778076 - }, - { - "auxiliary_loss_clip": 0.01103236, - "auxiliary_loss_mlp": 0.01033808, - "balance_loss_clip": 1.03955865, - "balance_loss_mlp": 1.0220722, - "epoch": 0.6630993536750338, - "flos": 20995820380800.0, - "grad_norm": 1.8721020554211893, - "language_loss": 0.79606169, - "learning_rate": 1.0769394204189138e-06, - "loss": 0.81743217, - "num_input_tokens_seen": 238041895, - "step": 11029, - "time_per_iteration": 2.5954697132110596 - }, - { - "auxiliary_loss_clip": 0.01116256, - "auxiliary_loss_mlp": 0.01035331, - "balance_loss_clip": 1.03937316, - "balance_loss_mlp": 1.02168214, - "epoch": 0.6631594769277018, - "flos": 18259032355200.0, - "grad_norm": 2.1557545389807617, - "language_loss": 0.76608872, - "learning_rate": 1.0765939369420012e-06, - "loss": 0.78760457, - "num_input_tokens_seen": 238060445, - "step": 11030, - "time_per_iteration": 2.5441596508026123 - }, - { - "auxiliary_loss_clip": 0.01113502, - "auxiliary_loss_mlp": 0.01035912, - "balance_loss_clip": 1.04352438, - "balance_loss_mlp": 1.02144003, - "epoch": 0.6632196001803697, - "flos": 17820491207040.0, - "grad_norm": 2.2370976778546803, - "language_loss": 0.75485003, - "learning_rate": 1.0762484884807391e-06, - "loss": 0.77634418, - "num_input_tokens_seen": 238077080, - "step": 11031, - "time_per_iteration": 2.607260227203369 - }, - { - "auxiliary_loss_clip": 0.01106421, - "auxiliary_loss_mlp": 0.01038808, - "balance_loss_clip": 1.04007494, - "balance_loss_mlp": 1.02508116, - "epoch": 0.6632797234330378, - "flos": 12670182581760.0, - "grad_norm": 4.999518522839319, - "language_loss": 0.74670291, - "learning_rate": 1.075903075048228e-06, - "loss": 0.76815522, - "num_input_tokens_seen": 238091045, - "step": 11032, - "time_per_iteration": 2.594426393508911 - }, - { - "auxiliary_loss_clip": 0.01072119, - "auxiliary_loss_mlp": 0.01033962, - "balance_loss_clip": 1.0367676, - "balance_loss_mlp": 1.02086639, - "epoch": 0.6633398466857057, - "flos": 23584728113280.0, - "grad_norm": 1.76988392785946, - "language_loss": 0.80491328, - "learning_rate": 1.0755576966575635e-06, - "loss": 0.82597411, - "num_input_tokens_seen": 238110220, - "step": 11033, - "time_per_iteration": 2.7742807865142822 - }, - { - "auxiliary_loss_clip": 0.01098023, - "auxiliary_loss_mlp": 0.01031641, - "balance_loss_clip": 1.04221106, - "balance_loss_mlp": 1.01806927, - "epoch": 0.6633999699383737, - "flos": 20631686256000.0, - "grad_norm": 1.7697764735120445, - "language_loss": 0.80480468, - "learning_rate": 1.0752123533218451e-06, - "loss": 0.82610136, - "num_input_tokens_seen": 238130400, - "step": 11034, - "time_per_iteration": 2.72609543800354 - }, - { - "auxiliary_loss_clip": 0.01098853, - "auxiliary_loss_mlp": 0.01029417, - "balance_loss_clip": 1.03850234, - "balance_loss_mlp": 1.01725149, - "epoch": 0.6634600931910416, - "flos": 21797095023360.0, - "grad_norm": 1.6912859958234545, - "language_loss": 0.7568692, - "learning_rate": 1.074867045054166e-06, - "loss": 0.77815193, - "num_input_tokens_seen": 238148165, - "step": 11035, - "time_per_iteration": 2.6851565837860107 - }, - { - "auxiliary_loss_clip": 0.01080784, - "auxiliary_loss_mlp": 0.01029042, - "balance_loss_clip": 1.03555465, - "balance_loss_mlp": 1.01562476, - "epoch": 0.6635202164437096, - "flos": 18732873594240.0, - "grad_norm": 1.9570572428830155, - "language_loss": 0.8271299, - "learning_rate": 1.074521771867622e-06, - "loss": 0.84822816, - "num_input_tokens_seen": 238166360, - "step": 11036, - "time_per_iteration": 2.6795291900634766 - }, - { - "auxiliary_loss_clip": 0.01034271, - "auxiliary_loss_mlp": 0.01004373, - "balance_loss_clip": 1.01085413, - "balance_loss_mlp": 1.00327635, - "epoch": 0.6635803396963775, - "flos": 60222771227520.0, - "grad_norm": 0.7751211897866269, - "language_loss": 0.52259576, - "learning_rate": 1.0741765337753044e-06, - "loss": 0.54298222, - "num_input_tokens_seen": 238227630, - "step": 11037, - "time_per_iteration": 4.7726218700408936 - }, - { - "auxiliary_loss_clip": 0.01060431, - "auxiliary_loss_mlp": 0.01041224, - "balance_loss_clip": 1.03799784, - "balance_loss_mlp": 1.0276525, - "epoch": 0.6636404629490456, - "flos": 29167041611520.0, - "grad_norm": 1.5502874196412986, - "language_loss": 0.79120708, - "learning_rate": 1.0738313307903052e-06, - "loss": 0.81222361, - "num_input_tokens_seen": 238248435, - "step": 11038, - "time_per_iteration": 2.8115994930267334 - }, - { - "auxiliary_loss_clip": 0.01082049, - "auxiliary_loss_mlp": 0.01043022, - "balance_loss_clip": 1.03767705, - "balance_loss_mlp": 1.02863979, - "epoch": 0.6637005862017135, - "flos": 38907702766080.0, - "grad_norm": 1.791707577314863, - "language_loss": 0.63976014, - "learning_rate": 1.073486162925716e-06, - "loss": 0.66101086, - "num_input_tokens_seen": 238268755, - "step": 11039, - "time_per_iteration": 2.8266031742095947 - }, - { - "auxiliary_loss_clip": 0.0107412, - "auxiliary_loss_mlp": 0.01031845, - "balance_loss_clip": 1.03814256, - "balance_loss_mlp": 1.01877952, - "epoch": 0.6637607094543815, - "flos": 22783345729920.0, - "grad_norm": 1.6823578045159262, - "language_loss": 0.63401222, - "learning_rate": 1.0731410301946237e-06, - "loss": 0.65507191, - "num_input_tokens_seen": 238290120, - "step": 11040, - "time_per_iteration": 2.705897569656372 - }, - { - "auxiliary_loss_clip": 0.01074324, - "auxiliary_loss_mlp": 0.01044015, - "balance_loss_clip": 1.03504574, - "balance_loss_mlp": 1.02977514, - "epoch": 0.6638208327070495, - "flos": 18114096977280.0, - "grad_norm": 1.8896789484535716, - "language_loss": 0.71718216, - "learning_rate": 1.0727959326101161e-06, - "loss": 0.73836553, - "num_input_tokens_seen": 238309290, - "step": 11041, - "time_per_iteration": 4.213087320327759 - }, - { - "auxiliary_loss_clip": 0.01097087, - "auxiliary_loss_mlp": 0.01048475, - "balance_loss_clip": 1.03642857, - "balance_loss_mlp": 1.03349042, - "epoch": 0.6638809559597174, - "flos": 29424880414080.0, - "grad_norm": 2.2565600398451795, - "language_loss": 0.61915213, - "learning_rate": 1.0724508701852806e-06, - "loss": 0.64060771, - "num_input_tokens_seen": 238327280, - "step": 11042, - "time_per_iteration": 4.279943943023682 - }, - { - "auxiliary_loss_clip": 0.011055, - "auxiliary_loss_mlp": 0.01031149, - "balance_loss_clip": 1.03810656, - "balance_loss_mlp": 1.01686156, - "epoch": 0.6639410792123854, - "flos": 28072699902720.0, - "grad_norm": 2.105682360594448, - "language_loss": 0.68285942, - "learning_rate": 1.0721058429331998e-06, - "loss": 0.7042259, - "num_input_tokens_seen": 238346330, - "step": 11043, - "time_per_iteration": 2.6422598361968994 - }, - { - "auxiliary_loss_clip": 0.01101764, - "auxiliary_loss_mlp": 0.0103036, - "balance_loss_clip": 1.04116786, - "balance_loss_mlp": 1.018767, - "epoch": 0.6640012024650533, - "flos": 25556367600000.0, - "grad_norm": 1.5365440611155503, - "language_loss": 0.83934712, - "learning_rate": 1.0717608508669587e-06, - "loss": 0.8606683, - "num_input_tokens_seen": 238364650, - "step": 11044, - "time_per_iteration": 2.732520341873169 - }, - { - "auxiliary_loss_clip": 0.01073049, - "auxiliary_loss_mlp": 0.01031878, - "balance_loss_clip": 1.03586829, - "balance_loss_mlp": 1.0185442, - "epoch": 0.6640613257177214, - "flos": 14866946559360.0, - "grad_norm": 2.1294485287076315, - "language_loss": 0.6951791, - "learning_rate": 1.0714158939996392e-06, - "loss": 0.71622837, - "num_input_tokens_seen": 238381630, - "step": 11045, - "time_per_iteration": 2.6816322803497314 - }, - { - "auxiliary_loss_clip": 0.01104183, - "auxiliary_loss_mlp": 0.01027582, - "balance_loss_clip": 1.04048705, - "balance_loss_mlp": 1.0148927, - "epoch": 0.6641214489703893, - "flos": 23221096778880.0, - "grad_norm": 2.227953338696249, - "language_loss": 0.64640826, - "learning_rate": 1.0710709723443235e-06, - "loss": 0.66772592, - "num_input_tokens_seen": 238402595, - "step": 11046, - "time_per_iteration": 4.160333156585693 - }, - { - "auxiliary_loss_clip": 0.01085109, - "auxiliary_loss_mlp": 0.01027972, - "balance_loss_clip": 1.03931284, - "balance_loss_mlp": 1.01488853, - "epoch": 0.6641815722230573, - "flos": 37742617221120.0, - "grad_norm": 1.6669339663762488, - "language_loss": 0.71004307, - "learning_rate": 1.070726085914088e-06, - "loss": 0.73117387, - "num_input_tokens_seen": 238426860, - "step": 11047, - "time_per_iteration": 2.8554368019104004 - }, - { - "auxiliary_loss_clip": 0.01049735, - "auxiliary_loss_mlp": 0.01036523, - "balance_loss_clip": 1.04015899, - "balance_loss_mlp": 1.02316511, - "epoch": 0.6642416954757252, - "flos": 17931132074880.0, - "grad_norm": 1.8883257209384914, - "language_loss": 0.77274108, - "learning_rate": 1.0703812347220126e-06, - "loss": 0.79360354, - "num_input_tokens_seen": 238443990, - "step": 11048, - "time_per_iteration": 2.755452871322632 - }, - { - "auxiliary_loss_clip": 0.01010482, - "auxiliary_loss_mlp": 0.01002664, - "balance_loss_clip": 1.01594365, - "balance_loss_mlp": 1.00137699, - "epoch": 0.6643018187283932, - "flos": 51995384104320.0, - "grad_norm": 0.747851272534148, - "language_loss": 0.55009979, - "learning_rate": 1.0700364187811745e-06, - "loss": 0.57023126, - "num_input_tokens_seen": 238503045, - "step": 11049, - "time_per_iteration": 3.232647180557251 - }, - { - "auxiliary_loss_clip": 0.01103139, - "auxiliary_loss_mlp": 0.01032284, - "balance_loss_clip": 1.04035759, - "balance_loss_mlp": 1.02035105, - "epoch": 0.6643619419810611, - "flos": 30226657847040.0, - "grad_norm": 1.8987287972691187, - "language_loss": 0.63542056, - "learning_rate": 1.069691638104648e-06, - "loss": 0.65677476, - "num_input_tokens_seen": 238527320, - "step": 11050, - "time_per_iteration": 2.712871551513672 - }, - { - "auxiliary_loss_clip": 0.01110292, - "auxiliary_loss_mlp": 0.01033953, - "balance_loss_clip": 1.03804648, - "balance_loss_mlp": 1.02145386, - "epoch": 0.6644220652337292, - "flos": 22966131064320.0, - "grad_norm": 2.56878578960884, - "language_loss": 0.78747934, - "learning_rate": 1.0693468927055085e-06, - "loss": 0.80892181, - "num_input_tokens_seen": 238546030, - "step": 11051, - "time_per_iteration": 2.5602593421936035 - }, - { - "auxiliary_loss_clip": 0.01090775, - "auxiliary_loss_mlp": 0.01036923, - "balance_loss_clip": 1.04075074, - "balance_loss_mlp": 1.02409577, - "epoch": 0.6644821884863971, - "flos": 21142228216320.0, - "grad_norm": 1.6830071971795009, - "language_loss": 0.85365808, - "learning_rate": 1.0690021825968276e-06, - "loss": 0.87493503, - "num_input_tokens_seen": 238564175, - "step": 11052, - "time_per_iteration": 2.6400978565216064 - }, - { - "auxiliary_loss_clip": 0.0106864, - "auxiliary_loss_mlp": 0.01035785, - "balance_loss_clip": 1.03640008, - "balance_loss_mlp": 1.02115774, - "epoch": 0.6645423117390651, - "flos": 20192821885440.0, - "grad_norm": 2.468702862512036, - "language_loss": 0.7442345, - "learning_rate": 1.0686575077916776e-06, - "loss": 0.7652787, - "num_input_tokens_seen": 238581010, - "step": 11053, - "time_per_iteration": 2.7525177001953125 - }, - { - "auxiliary_loss_clip": 0.01081443, - "auxiliary_loss_mlp": 0.0103047, - "balance_loss_clip": 1.03704178, - "balance_loss_mlp": 1.01803088, - "epoch": 0.6646024349917331, - "flos": 24351959640960.0, - "grad_norm": 1.6350550334521685, - "language_loss": 0.7937814, - "learning_rate": 1.0683128683031278e-06, - "loss": 0.81490058, - "num_input_tokens_seen": 238601365, - "step": 11054, - "time_per_iteration": 2.6874406337738037 - }, - { - "auxiliary_loss_clip": 0.01067976, - "auxiliary_loss_mlp": 0.01035163, - "balance_loss_clip": 1.03798532, - "balance_loss_mlp": 1.02267623, - "epoch": 0.664662558244401, - "flos": 18806706000000.0, - "grad_norm": 1.6423825875919162, - "language_loss": 0.73928297, - "learning_rate": 1.0679682641442472e-06, - "loss": 0.76031435, - "num_input_tokens_seen": 238619850, - "step": 11055, - "time_per_iteration": 2.733832597732544 - }, - { - "auxiliary_loss_clip": 0.01082031, - "auxiliary_loss_mlp": 0.01043702, - "balance_loss_clip": 1.03823996, - "balance_loss_mlp": 1.02983165, - "epoch": 0.664722681497069, - "flos": 18952790613120.0, - "grad_norm": 1.8844406603153, - "language_loss": 0.7300725, - "learning_rate": 1.0676236953281042e-06, - "loss": 0.75132978, - "num_input_tokens_seen": 238637635, - "step": 11056, - "time_per_iteration": 2.6787209510803223 - }, - { - "auxiliary_loss_clip": 0.01069462, - "auxiliary_loss_mlp": 0.01036287, - "balance_loss_clip": 1.0367837, - "balance_loss_mlp": 1.02314389, - "epoch": 0.6647828047497369, - "flos": 19571279921280.0, - "grad_norm": 3.230794817750296, - "language_loss": 0.69325733, - "learning_rate": 1.0672791618677641e-06, - "loss": 0.71431488, - "num_input_tokens_seen": 238656200, - "step": 11057, - "time_per_iteration": 2.749843120574951 - }, - { - "auxiliary_loss_clip": 0.01103707, - "auxiliary_loss_mlp": 0.01033971, - "balance_loss_clip": 1.03987014, - "balance_loss_mlp": 1.0206548, - "epoch": 0.664842928002405, - "flos": 23149455102720.0, - "grad_norm": 1.6131185292636203, - "language_loss": 0.80123711, - "learning_rate": 1.066934663776291e-06, - "loss": 0.82261384, - "num_input_tokens_seen": 238675005, - "step": 11058, - "time_per_iteration": 2.6598408222198486 - }, - { - "auxiliary_loss_clip": 0.01008973, - "auxiliary_loss_mlp": 0.01008338, - "balance_loss_clip": 1.01433647, - "balance_loss_mlp": 1.00715828, - "epoch": 0.6649030512550729, - "flos": 65244913148160.0, - "grad_norm": 0.802003162122869, - "language_loss": 0.62611187, - "learning_rate": 1.0665902010667496e-06, - "loss": 0.64628494, - "num_input_tokens_seen": 238731425, - "step": 11059, - "time_per_iteration": 3.12062668800354 - }, - { - "auxiliary_loss_clip": 0.01102046, - "auxiliary_loss_mlp": 0.01038723, - "balance_loss_clip": 1.03967965, - "balance_loss_mlp": 1.026546, - "epoch": 0.6649631745077409, - "flos": 20194797133440.0, - "grad_norm": 1.442710173280966, - "language_loss": 0.7869736, - "learning_rate": 1.0662457737522008e-06, - "loss": 0.80838132, - "num_input_tokens_seen": 238752020, - "step": 11060, - "time_per_iteration": 2.776430606842041 - }, - { - "auxiliary_loss_clip": 0.01082742, - "auxiliary_loss_mlp": 0.01038039, - "balance_loss_clip": 1.03887463, - "balance_loss_mlp": 1.02412772, - "epoch": 0.6650232977604088, - "flos": 17238558965760.0, - "grad_norm": 1.6774634080954063, - "language_loss": 0.78738892, - "learning_rate": 1.0659013818457055e-06, - "loss": 0.80859673, - "num_input_tokens_seen": 238769665, - "step": 11061, - "time_per_iteration": 2.6786346435546875 - }, - { - "auxiliary_loss_clip": 0.01092682, - "auxiliary_loss_mlp": 0.01030082, - "balance_loss_clip": 1.0417732, - "balance_loss_mlp": 1.01765454, - "epoch": 0.6650834210130768, - "flos": 10006867825920.0, - "grad_norm": 2.291207929066884, - "language_loss": 0.56939697, - "learning_rate": 1.0655570253603243e-06, - "loss": 0.59062469, - "num_input_tokens_seen": 238782180, - "step": 11062, - "time_per_iteration": 2.6440412998199463 - }, - { - "auxiliary_loss_clip": 0.01100317, - "auxiliary_loss_mlp": 0.01037299, - "balance_loss_clip": 1.03600216, - "balance_loss_mlp": 1.02142608, - "epoch": 0.6651435442657447, - "flos": 10452088903680.0, - "grad_norm": 1.8230256032266374, - "language_loss": 0.75959098, - "learning_rate": 1.0652127043091144e-06, - "loss": 0.78096718, - "num_input_tokens_seen": 238800315, - "step": 11063, - "time_per_iteration": 2.592930555343628 - }, - { - "auxiliary_loss_clip": 0.01056354, - "auxiliary_loss_mlp": 0.01044348, - "balance_loss_clip": 1.03860426, - "balance_loss_mlp": 1.03033507, - "epoch": 0.6652036675184128, - "flos": 22344229964160.0, - "grad_norm": 1.2698232462033214, - "language_loss": 0.70678842, - "learning_rate": 1.0648684187051316e-06, - "loss": 0.72779548, - "num_input_tokens_seen": 238822250, - "step": 11064, - "time_per_iteration": 2.800218105316162 - }, - { - "auxiliary_loss_clip": 0.01032183, - "auxiliary_loss_mlp": 0.01006383, - "balance_loss_clip": 1.00864732, - "balance_loss_mlp": 1.00513113, - "epoch": 0.6652637907710807, - "flos": 52909633998720.0, - "grad_norm": 0.8463523903026119, - "language_loss": 0.629758, - "learning_rate": 1.0645241685614322e-06, - "loss": 0.65014362, - "num_input_tokens_seen": 238877190, - "step": 11065, - "time_per_iteration": 3.1035780906677246 - }, - { - "auxiliary_loss_clip": 0.01099093, - "auxiliary_loss_mlp": 0.01039736, - "balance_loss_clip": 1.03762209, - "balance_loss_mlp": 1.02464366, - "epoch": 0.6653239140237487, - "flos": 23104637907840.0, - "grad_norm": 1.610155502063491, - "language_loss": 0.62464315, - "learning_rate": 1.0641799538910708e-06, - "loss": 0.64603138, - "num_input_tokens_seen": 238896010, - "step": 11066, - "time_per_iteration": 2.6371681690216064 - }, - { - "auxiliary_loss_clip": 0.01074468, - "auxiliary_loss_mlp": 0.01041028, - "balance_loss_clip": 1.03320074, - "balance_loss_mlp": 1.02528, - "epoch": 0.6653840372764167, - "flos": 25959393175680.0, - "grad_norm": 1.5735109104273866, - "language_loss": 0.70316392, - "learning_rate": 1.0638357747070985e-06, - "loss": 0.72431886, - "num_input_tokens_seen": 238918990, - "step": 11067, - "time_per_iteration": 2.712170362472534 - }, - { - "auxiliary_loss_clip": 0.01015121, - "auxiliary_loss_mlp": 0.0100891, - "balance_loss_clip": 1.01019919, - "balance_loss_mlp": 1.00739563, - "epoch": 0.6654441605290846, - "flos": 66041985899520.0, - "grad_norm": 0.9248325292472583, - "language_loss": 0.72063255, - "learning_rate": 1.0634916310225684e-06, - "loss": 0.74087286, - "num_input_tokens_seen": 238975735, - "step": 11068, - "time_per_iteration": 3.188148021697998 - }, - { - "auxiliary_loss_clip": 0.01006694, - "auxiliary_loss_mlp": 0.01006942, - "balance_loss_clip": 1.01129699, - "balance_loss_mlp": 1.00560117, - "epoch": 0.6655042837817526, - "flos": 65196112521600.0, - "grad_norm": 0.8265951746379137, - "language_loss": 0.57727754, - "learning_rate": 1.0631475228505285e-06, - "loss": 0.5974139, - "num_input_tokens_seen": 239042360, - "step": 11069, - "time_per_iteration": 3.3526012897491455 - }, - { - "auxiliary_loss_clip": 0.01011659, - "auxiliary_loss_mlp": 0.0100159, - "balance_loss_clip": 1.00811982, - "balance_loss_mlp": 1.00046349, - "epoch": 0.6655644070344205, - "flos": 69008746752000.0, - "grad_norm": 0.7554433068818003, - "language_loss": 0.63502038, - "learning_rate": 1.062803450204029e-06, - "loss": 0.65515292, - "num_input_tokens_seen": 239109410, - "step": 11070, - "time_per_iteration": 3.189624071121216 - }, - { - "auxiliary_loss_clip": 0.0111185, - "auxiliary_loss_mlp": 0.01029573, - "balance_loss_clip": 1.03767705, - "balance_loss_mlp": 1.01622725, - "epoch": 0.6656245302870886, - "flos": 36315562809600.0, - "grad_norm": 1.5957526683817405, - "language_loss": 0.58635205, - "learning_rate": 1.062459413096116e-06, - "loss": 0.60776627, - "num_input_tokens_seen": 239135345, - "step": 11071, - "time_per_iteration": 2.7373464107513428 - }, - { - "auxiliary_loss_clip": 0.01107113, - "auxiliary_loss_mlp": 0.01030576, - "balance_loss_clip": 1.04254675, - "balance_loss_mlp": 1.01822627, - "epoch": 0.6656846535397565, - "flos": 21794832466560.0, - "grad_norm": 1.7851142792546852, - "language_loss": 0.72693968, - "learning_rate": 1.0621154115398364e-06, - "loss": 0.74831653, - "num_input_tokens_seen": 239154340, - "step": 11072, - "time_per_iteration": 2.6327590942382812 - }, - { - "auxiliary_loss_clip": 0.01103867, - "auxiliary_loss_mlp": 0.01032627, - "balance_loss_clip": 1.04155874, - "balance_loss_mlp": 1.01859617, - "epoch": 0.6657447767924245, - "flos": 37487615592960.0, - "grad_norm": 1.879864387077726, - "language_loss": 0.70789611, - "learning_rate": 1.0617714455482353e-06, - "loss": 0.72926104, - "num_input_tokens_seen": 239177815, - "step": 11073, - "time_per_iteration": 2.704252243041992 - }, - { - "auxiliary_loss_clip": 0.01084232, - "auxiliary_loss_mlp": 0.0103114, - "balance_loss_clip": 1.03998876, - "balance_loss_mlp": 1.01784229, - "epoch": 0.6658049000450924, - "flos": 16837688206080.0, - "grad_norm": 2.6568090318066475, - "language_loss": 0.56073666, - "learning_rate": 1.061427515134354e-06, - "loss": 0.5818904, - "num_input_tokens_seen": 239195735, - "step": 11074, - "time_per_iteration": 2.6551811695098877 - }, - { - "auxiliary_loss_clip": 0.01116885, - "auxiliary_loss_mlp": 0.00770661, - "balance_loss_clip": 1.04282713, - "balance_loss_mlp": 1.00006819, - "epoch": 0.6658650232977604, - "flos": 33510975863040.0, - "grad_norm": 1.424580138870233, - "language_loss": 0.7252624, - "learning_rate": 1.061083620311235e-06, - "loss": 0.74413788, - "num_input_tokens_seen": 239217535, - "step": 11075, - "time_per_iteration": 2.7062625885009766 - }, - { - "auxiliary_loss_clip": 0.01100028, - "auxiliary_loss_mlp": 0.01032017, - "balance_loss_clip": 1.03886509, - "balance_loss_mlp": 1.01981592, - "epoch": 0.6659251465504283, - "flos": 37706311549440.0, - "grad_norm": 1.4599897176092982, - "language_loss": 0.66246772, - "learning_rate": 1.0607397610919202e-06, - "loss": 0.68378824, - "num_input_tokens_seen": 239241975, - "step": 11076, - "time_per_iteration": 2.804659605026245 - }, - { - "auxiliary_loss_clip": 0.01087468, - "auxiliary_loss_mlp": 0.01032867, - "balance_loss_clip": 1.03394532, - "balance_loss_mlp": 1.01870489, - "epoch": 0.6659852698030964, - "flos": 24893420232960.0, - "grad_norm": 1.6180459271945493, - "language_loss": 0.75299704, - "learning_rate": 1.0603959374894468e-06, - "loss": 0.77420044, - "num_input_tokens_seen": 239262025, - "step": 11077, - "time_per_iteration": 4.274590253829956 - }, - { - "auxiliary_loss_clip": 0.0108965, - "auxiliary_loss_mlp": 0.01031149, - "balance_loss_clip": 1.0374043, - "balance_loss_mlp": 1.01802957, - "epoch": 0.6660453930557643, - "flos": 24352821567360.0, - "grad_norm": 1.5713803954899295, - "language_loss": 0.66825247, - "learning_rate": 1.0600521495168538e-06, - "loss": 0.68946046, - "num_input_tokens_seen": 239282775, - "step": 11078, - "time_per_iteration": 2.7334680557250977 - }, - { - "auxiliary_loss_clip": 0.01115428, - "auxiliary_loss_mlp": 0.01033982, - "balance_loss_clip": 1.03945637, - "balance_loss_mlp": 1.01990974, - "epoch": 0.6661055163084323, - "flos": 10597814380800.0, - "grad_norm": 2.400926553792791, - "language_loss": 0.69900686, - "learning_rate": 1.0597083971871783e-06, - "loss": 0.72050095, - "num_input_tokens_seen": 239299775, - "step": 11079, - "time_per_iteration": 2.6223835945129395 - }, - { - "auxiliary_loss_clip": 0.01089448, - "auxiliary_loss_mlp": 0.01030137, - "balance_loss_clip": 1.03717136, - "balance_loss_mlp": 1.01738119, - "epoch": 0.6661656395611003, - "flos": 24057491944320.0, - "grad_norm": 1.61546827465866, - "language_loss": 0.80478466, - "learning_rate": 1.0593646805134544e-06, - "loss": 0.82598048, - "num_input_tokens_seen": 239319660, - "step": 11080, - "time_per_iteration": 4.228775978088379 - }, - { - "auxiliary_loss_clip": 0.01075927, - "auxiliary_loss_mlp": 0.01033579, - "balance_loss_clip": 1.03583407, - "balance_loss_mlp": 1.02147329, - "epoch": 0.6662257628137682, - "flos": 23036192542080.0, - "grad_norm": 1.8384302926010723, - "language_loss": 0.78062707, - "learning_rate": 1.0590209995087157e-06, - "loss": 0.80172205, - "num_input_tokens_seen": 239339215, - "step": 11081, - "time_per_iteration": 4.32209324836731 - }, - { - "auxiliary_loss_clip": 0.01076143, - "auxiliary_loss_mlp": 0.01039547, - "balance_loss_clip": 1.03748226, - "balance_loss_mlp": 1.02387714, - "epoch": 0.6662858860664362, - "flos": 24754446512640.0, - "grad_norm": 1.6809862267344533, - "language_loss": 0.80329323, - "learning_rate": 1.0586773541859946e-06, - "loss": 0.82445014, - "num_input_tokens_seen": 239358545, - "step": 11082, - "time_per_iteration": 2.7251505851745605 - }, - { - "auxiliary_loss_clip": 0.01076739, - "auxiliary_loss_mlp": 0.01033286, - "balance_loss_clip": 1.04017997, - "balance_loss_mlp": 1.02098405, - "epoch": 0.6663460093191041, - "flos": 20009066883840.0, - "grad_norm": 1.4477945081554633, - "language_loss": 0.83849418, - "learning_rate": 1.0583337445583234e-06, - "loss": 0.85959446, - "num_input_tokens_seen": 239376665, - "step": 11083, - "time_per_iteration": 2.669404983520508 - }, - { - "auxiliary_loss_clip": 0.01079397, - "auxiliary_loss_mlp": 0.01036057, - "balance_loss_clip": 1.04023921, - "balance_loss_mlp": 1.02203834, - "epoch": 0.6664061325717722, - "flos": 17821389047040.0, - "grad_norm": 2.7255574695502216, - "language_loss": 0.85510308, - "learning_rate": 1.057990170638731e-06, - "loss": 0.87625766, - "num_input_tokens_seen": 239394345, - "step": 11084, - "time_per_iteration": 2.663749933242798 - }, - { - "auxiliary_loss_clip": 0.01094685, - "auxiliary_loss_mlp": 0.01031242, - "balance_loss_clip": 1.03857958, - "balance_loss_mlp": 1.01727629, - "epoch": 0.6664662558244401, - "flos": 18076893465600.0, - "grad_norm": 2.199200259512602, - "language_loss": 0.73457599, - "learning_rate": 1.0576466324402452e-06, - "loss": 0.75583529, - "num_input_tokens_seen": 239410605, - "step": 11085, - "time_per_iteration": 4.193335771560669 - }, - { - "auxiliary_loss_clip": 0.01087888, - "auxiliary_loss_mlp": 0.01031014, - "balance_loss_clip": 1.03528535, - "balance_loss_mlp": 1.01760268, - "epoch": 0.6665263790771081, - "flos": 21574197175680.0, - "grad_norm": 1.9746802098097909, - "language_loss": 0.80359179, - "learning_rate": 1.057303129975894e-06, - "loss": 0.82478082, - "num_input_tokens_seen": 239427155, - "step": 11086, - "time_per_iteration": 2.6708765029907227 - }, - { - "auxiliary_loss_clip": 0.01090857, - "auxiliary_loss_mlp": 0.01032379, - "balance_loss_clip": 1.03936315, - "balance_loss_mlp": 1.018646, - "epoch": 0.666586502329776, - "flos": 24206629213440.0, - "grad_norm": 1.7936971088038383, - "language_loss": 0.74496621, - "learning_rate": 1.056959663258702e-06, - "loss": 0.76619852, - "num_input_tokens_seen": 239445510, - "step": 11087, - "time_per_iteration": 2.7366881370544434 - }, - { - "auxiliary_loss_clip": 0.01101311, - "auxiliary_loss_mlp": 0.01035633, - "balance_loss_clip": 1.03835797, - "balance_loss_mlp": 1.02250183, - "epoch": 0.666646625582444, - "flos": 22200515648640.0, - "grad_norm": 1.692056233669114, - "language_loss": 0.64937711, - "learning_rate": 1.0566162323016939e-06, - "loss": 0.67074656, - "num_input_tokens_seen": 239464805, - "step": 11088, - "time_per_iteration": 2.652937412261963 - }, - { - "auxiliary_loss_clip": 0.01099844, - "auxiliary_loss_mlp": 0.01029761, - "balance_loss_clip": 1.03648591, - "balance_loss_mlp": 1.01637387, - "epoch": 0.6667067488351119, - "flos": 18259930195200.0, - "grad_norm": 2.239140495962673, - "language_loss": 0.64203691, - "learning_rate": 1.0562728371178928e-06, - "loss": 0.66333294, - "num_input_tokens_seen": 239483890, - "step": 11089, - "time_per_iteration": 2.6637988090515137 - }, - { - "auxiliary_loss_clip": 0.01113447, - "auxiliary_loss_mlp": 0.01031785, - "balance_loss_clip": 1.03998184, - "balance_loss_mlp": 1.01876771, - "epoch": 0.66676687208778, - "flos": 17236547804160.0, - "grad_norm": 2.535090345981802, - "language_loss": 0.80804038, - "learning_rate": 1.0559294777203221e-06, - "loss": 0.82949275, - "num_input_tokens_seen": 239500080, - "step": 11090, - "time_per_iteration": 2.581758737564087 - }, - { - "auxiliary_loss_clip": 0.01092289, - "auxiliary_loss_mlp": 0.01035587, - "balance_loss_clip": 1.03686905, - "balance_loss_mlp": 1.02217007, - "epoch": 0.6668269953404479, - "flos": 19752197748480.0, - "grad_norm": 1.9927096976475185, - "language_loss": 0.77528715, - "learning_rate": 1.0555861541219984e-06, - "loss": 0.79656601, - "num_input_tokens_seen": 239517335, - "step": 11091, - "time_per_iteration": 2.673798084259033 - }, - { - "auxiliary_loss_clip": 0.01114388, - "auxiliary_loss_mlp": 0.01033443, - "balance_loss_clip": 1.04016709, - "balance_loss_mlp": 1.02024066, - "epoch": 0.6668871185931159, - "flos": 20558428467840.0, - "grad_norm": 1.9227343143607547, - "language_loss": 0.79361308, - "learning_rate": 1.0552428663359425e-06, - "loss": 0.81509137, - "num_input_tokens_seen": 239536240, - "step": 11092, - "time_per_iteration": 2.6783652305603027 - }, - { - "auxiliary_loss_clip": 0.01010839, - "auxiliary_loss_mlp": 0.01001852, - "balance_loss_clip": 1.01392734, - "balance_loss_mlp": 1.00064206, - "epoch": 0.6669472418457839, - "flos": 58088167735680.0, - "grad_norm": 1.5742465893545905, - "language_loss": 0.57764924, - "learning_rate": 1.0548996143751724e-06, - "loss": 0.59777617, - "num_input_tokens_seen": 239598000, - "step": 11093, - "time_per_iteration": 3.25225567817688 - }, - { - "auxiliary_loss_clip": 0.011126, - "auxiliary_loss_mlp": 0.01032323, - "balance_loss_clip": 1.03999138, - "balance_loss_mlp": 1.01957977, - "epoch": 0.6670073650984518, - "flos": 26065113880320.0, - "grad_norm": 1.5604249547045095, - "language_loss": 0.76737595, - "learning_rate": 1.054556398252703e-06, - "loss": 0.78882521, - "num_input_tokens_seen": 239617650, - "step": 11094, - "time_per_iteration": 2.6441400051116943 - }, - { - "auxiliary_loss_clip": 0.01114242, - "auxiliary_loss_mlp": 0.01034632, - "balance_loss_clip": 1.03926766, - "balance_loss_mlp": 1.02063107, - "epoch": 0.6670674883511198, - "flos": 32416849635840.0, - "grad_norm": 1.725805849880736, - "language_loss": 0.73280704, - "learning_rate": 1.05421321798155e-06, - "loss": 0.75429583, - "num_input_tokens_seen": 239639825, - "step": 11095, - "time_per_iteration": 2.6807525157928467 - }, - { - "auxiliary_loss_clip": 0.01100599, - "auxiliary_loss_mlp": 0.01038236, - "balance_loss_clip": 1.03832078, - "balance_loss_mlp": 1.02496827, - "epoch": 0.6671276116037878, - "flos": 18037786533120.0, - "grad_norm": 1.9301541125816652, - "language_loss": 0.73262459, - "learning_rate": 1.053870073574727e-06, - "loss": 0.75401294, - "num_input_tokens_seen": 239656300, - "step": 11096, - "time_per_iteration": 2.568824052810669 - }, - { - "auxiliary_loss_clip": 0.01069521, - "auxiliary_loss_mlp": 0.01032338, - "balance_loss_clip": 1.03659153, - "balance_loss_mlp": 1.01915956, - "epoch": 0.6671877348564558, - "flos": 23767046570880.0, - "grad_norm": 2.803880463620154, - "language_loss": 0.64528841, - "learning_rate": 1.0535269650452456e-06, - "loss": 0.66630697, - "num_input_tokens_seen": 239676655, - "step": 11097, - "time_per_iteration": 2.7534751892089844 - }, - { - "auxiliary_loss_clip": 0.01101343, - "auxiliary_loss_mlp": 0.01036605, - "balance_loss_clip": 1.03823709, - "balance_loss_mlp": 1.0242486, - "epoch": 0.6672478581091237, - "flos": 20918360701440.0, - "grad_norm": 1.9121192931903639, - "language_loss": 0.75842595, - "learning_rate": 1.0531838924061158e-06, - "loss": 0.77980542, - "num_input_tokens_seen": 239695430, - "step": 11098, - "time_per_iteration": 2.6095056533813477 - }, - { - "auxiliary_loss_clip": 0.01115287, - "auxiliary_loss_mlp": 0.0103552, - "balance_loss_clip": 1.04045045, - "balance_loss_mlp": 1.02328897, - "epoch": 0.6673079813617917, - "flos": 27855799626240.0, - "grad_norm": 1.5630193182693057, - "language_loss": 0.74190086, - "learning_rate": 1.0528408556703476e-06, - "loss": 0.76340902, - "num_input_tokens_seen": 239717070, - "step": 11099, - "time_per_iteration": 2.673234224319458 - }, - { - "auxiliary_loss_clip": 0.01098732, - "auxiliary_loss_mlp": 0.01036855, - "balance_loss_clip": 1.03607726, - "balance_loss_mlp": 1.02412391, - "epoch": 0.6673681046144596, - "flos": 21616859554560.0, - "grad_norm": 1.7967972361910232, - "language_loss": 0.78233874, - "learning_rate": 1.0524978548509502e-06, - "loss": 0.80369455, - "num_input_tokens_seen": 239737105, - "step": 11100, - "time_per_iteration": 2.637829303741455 - }, - { - "auxiliary_loss_clip": 0.01112293, - "auxiliary_loss_mlp": 0.01037899, - "balance_loss_clip": 1.03913033, - "balance_loss_mlp": 1.02564454, - "epoch": 0.6674282278671276, - "flos": 20889884194560.0, - "grad_norm": 3.1933899226541804, - "language_loss": 0.60124767, - "learning_rate": 1.0521548899609288e-06, - "loss": 0.62274957, - "num_input_tokens_seen": 239757835, - "step": 11101, - "time_per_iteration": 2.649627685546875 - }, - { - "auxiliary_loss_clip": 0.01098761, - "auxiliary_loss_mlp": 0.01034633, - "balance_loss_clip": 1.03970337, - "balance_loss_mlp": 1.02054238, - "epoch": 0.6674883511197955, - "flos": 23624194181760.0, - "grad_norm": 2.1447614079629362, - "language_loss": 0.71100485, - "learning_rate": 1.0518119610132884e-06, - "loss": 0.73233879, - "num_input_tokens_seen": 239775425, - "step": 11102, - "time_per_iteration": 2.7131104469299316 - }, - { - "auxiliary_loss_clip": 0.01103363, - "auxiliary_loss_mlp": 0.01031628, - "balance_loss_clip": 1.03698874, - "balance_loss_mlp": 1.01878357, - "epoch": 0.6675484743724636, - "flos": 19609668581760.0, - "grad_norm": 1.3386493038394256, - "language_loss": 0.84490895, - "learning_rate": 1.051469068021034e-06, - "loss": 0.8662588, - "num_input_tokens_seen": 239794605, - "step": 11103, - "time_per_iteration": 2.630141496658325 - }, - { - "auxiliary_loss_clip": 0.01091051, - "auxiliary_loss_mlp": 0.01027938, - "balance_loss_clip": 1.03639507, - "balance_loss_mlp": 1.01571894, - "epoch": 0.6676085976251315, - "flos": 14319452482560.0, - "grad_norm": 1.8538250094473767, - "language_loss": 0.77889514, - "learning_rate": 1.0511262109971668e-06, - "loss": 0.80008507, - "num_input_tokens_seen": 239812135, - "step": 11104, - "time_per_iteration": 2.7340710163116455 - }, - { - "auxiliary_loss_clip": 0.01067144, - "auxiliary_loss_mlp": 0.01029925, - "balance_loss_clip": 1.03659081, - "balance_loss_mlp": 1.01740217, - "epoch": 0.6676687208777995, - "flos": 38104596529920.0, - "grad_norm": 5.138036678415969, - "language_loss": 0.58146316, - "learning_rate": 1.0507833899546889e-06, - "loss": 0.60243386, - "num_input_tokens_seen": 239835845, - "step": 11105, - "time_per_iteration": 2.882567882537842 - }, - { - "auxiliary_loss_clip": 0.01107097, - "auxiliary_loss_mlp": 0.0103608, - "balance_loss_clip": 1.03966367, - "balance_loss_mlp": 1.02172112, - "epoch": 0.6677288441304675, - "flos": 23981576549760.0, - "grad_norm": 6.297152012729004, - "language_loss": 0.73476273, - "learning_rate": 1.0504406049066e-06, - "loss": 0.75619453, - "num_input_tokens_seen": 239853820, - "step": 11106, - "time_per_iteration": 2.6627464294433594 - }, - { - "auxiliary_loss_clip": 0.01113601, - "auxiliary_loss_mlp": 0.0103128, - "balance_loss_clip": 1.0392319, - "balance_loss_mlp": 1.01777363, - "epoch": 0.6677889673831354, - "flos": 24170682677760.0, - "grad_norm": 1.6820711130448331, - "language_loss": 0.76552516, - "learning_rate": 1.0500978558659e-06, - "loss": 0.78697395, - "num_input_tokens_seen": 239873365, - "step": 11107, - "time_per_iteration": 2.655085325241089 - }, - { - "auxiliary_loss_clip": 0.01089336, - "auxiliary_loss_mlp": 0.010326, - "balance_loss_clip": 1.03778529, - "balance_loss_mlp": 1.01969552, - "epoch": 0.6678490906358034, - "flos": 22309648145280.0, - "grad_norm": 2.264065486271505, - "language_loss": 0.90136391, - "learning_rate": 1.049755142845583e-06, - "loss": 0.92258334, - "num_input_tokens_seen": 239891215, - "step": 11108, - "time_per_iteration": 2.7129766941070557 - }, - { - "auxiliary_loss_clip": 0.01083707, - "auxiliary_loss_mlp": 0.01029485, - "balance_loss_clip": 1.04215026, - "balance_loss_mlp": 1.01795101, - "epoch": 0.6679092138884714, - "flos": 36898752026880.0, - "grad_norm": 1.413392892629677, - "language_loss": 0.82960904, - "learning_rate": 1.049412465858646e-06, - "loss": 0.85074097, - "num_input_tokens_seen": 239913490, - "step": 11109, - "time_per_iteration": 2.867154121398926 - }, - { - "auxiliary_loss_clip": 0.01087234, - "auxiliary_loss_mlp": 0.01035101, - "balance_loss_clip": 1.03826952, - "balance_loss_mlp": 1.02132595, - "epoch": 0.6679693371411394, - "flos": 18150294908160.0, - "grad_norm": 2.421344403388597, - "language_loss": 0.70021516, - "learning_rate": 1.0490698249180847e-06, - "loss": 0.72143853, - "num_input_tokens_seen": 239931565, - "step": 11110, - "time_per_iteration": 2.6291885375976562 - }, - { - "auxiliary_loss_clip": 0.01087492, - "auxiliary_loss_mlp": 0.01037588, - "balance_loss_clip": 1.03955197, - "balance_loss_mlp": 1.02289498, - "epoch": 0.6680294603938073, - "flos": 27198167472000.0, - "grad_norm": 1.5840834743354089, - "language_loss": 0.73441553, - "learning_rate": 1.04872722003689e-06, - "loss": 0.75566632, - "num_input_tokens_seen": 239952395, - "step": 11111, - "time_per_iteration": 2.677231788635254 - }, - { - "auxiliary_loss_clip": 0.01110772, - "auxiliary_loss_mlp": 0.01031184, - "balance_loss_clip": 1.0385406, - "balance_loss_mlp": 1.01907229, - "epoch": 0.6680895836464753, - "flos": 21725309692800.0, - "grad_norm": 6.009810258732459, - "language_loss": 0.65599185, - "learning_rate": 1.0483846512280553e-06, - "loss": 0.67741144, - "num_input_tokens_seen": 239968910, - "step": 11112, - "time_per_iteration": 2.5904297828674316 - }, - { - "auxiliary_loss_clip": 0.01086609, - "auxiliary_loss_mlp": 0.01030827, - "balance_loss_clip": 1.03706861, - "balance_loss_mlp": 1.01755929, - "epoch": 0.6681497068991432, - "flos": 19646477043840.0, - "grad_norm": 1.8628832000622026, - "language_loss": 0.6369822, - "learning_rate": 1.048042118504569e-06, - "loss": 0.65815663, - "num_input_tokens_seen": 239987680, - "step": 11113, - "time_per_iteration": 2.623263359069824 - }, - { - "auxiliary_loss_clip": 0.01072141, - "auxiliary_loss_mlp": 0.01037464, - "balance_loss_clip": 1.04164052, - "balance_loss_mlp": 1.02563202, - "epoch": 0.6682098301518112, - "flos": 17419153570560.0, - "grad_norm": 1.9634476729216852, - "language_loss": 0.6540277, - "learning_rate": 1.047699621879422e-06, - "loss": 0.67512381, - "num_input_tokens_seen": 240005790, - "step": 11114, - "time_per_iteration": 2.865252733230591 - }, - { - "auxiliary_loss_clip": 0.0110424, - "auxiliary_loss_mlp": 0.0103987, - "balance_loss_clip": 1.03883052, - "balance_loss_mlp": 1.0267992, - "epoch": 0.6682699534044791, - "flos": 22599016110720.0, - "grad_norm": 1.6562280172476918, - "language_loss": 0.78432989, - "learning_rate": 1.0473571613655998e-06, - "loss": 0.80577099, - "num_input_tokens_seen": 240025895, - "step": 11115, - "time_per_iteration": 2.7281594276428223 - }, - { - "auxiliary_loss_clip": 0.0105862, - "auxiliary_loss_mlp": 0.00771764, - "balance_loss_clip": 1.0309999, - "balance_loss_mlp": 1.00021195, - "epoch": 0.6683300766571472, - "flos": 24863686750080.0, - "grad_norm": 1.6526033494173815, - "language_loss": 0.79655063, - "learning_rate": 1.0470147369760896e-06, - "loss": 0.81485444, - "num_input_tokens_seen": 240044880, - "step": 11116, - "time_per_iteration": 4.51043963432312 - }, - { - "auxiliary_loss_clip": 0.01084566, - "auxiliary_loss_mlp": 0.01036408, - "balance_loss_clip": 1.03999686, - "balance_loss_mlp": 1.02240658, - "epoch": 0.6683901999098151, - "flos": 27126633536640.0, - "grad_norm": 2.4411111020753347, - "language_loss": 0.7904433, - "learning_rate": 1.0466723487238768e-06, - "loss": 0.81165314, - "num_input_tokens_seen": 240065785, - "step": 11117, - "time_per_iteration": 2.748905897140503 - }, - { - "auxiliary_loss_clip": 0.01069081, - "auxiliary_loss_mlp": 0.01033165, - "balance_loss_clip": 1.03828013, - "balance_loss_mlp": 1.01844835, - "epoch": 0.6684503231624831, - "flos": 20739023072640.0, - "grad_norm": 3.4807828142340815, - "language_loss": 0.65610313, - "learning_rate": 1.0463299966219441e-06, - "loss": 0.67712557, - "num_input_tokens_seen": 240085130, - "step": 11118, - "time_per_iteration": 2.707383871078491 - }, - { - "auxiliary_loss_clip": 0.01091583, - "auxiliary_loss_mlp": 0.01033843, - "balance_loss_clip": 1.03924751, - "balance_loss_mlp": 1.02176738, - "epoch": 0.668510446415151, - "flos": 21762189982080.0, - "grad_norm": 1.4374358637877027, - "language_loss": 0.68942273, - "learning_rate": 1.0459876806832727e-06, - "loss": 0.71067697, - "num_input_tokens_seen": 240105495, - "step": 11119, - "time_per_iteration": 2.6769771575927734 - }, - { - "auxiliary_loss_clip": 0.01086506, - "auxiliary_loss_mlp": 0.01033952, - "balance_loss_clip": 1.03629112, - "balance_loss_mlp": 1.02011752, - "epoch": 0.668570569667819, - "flos": 30191250015360.0, - "grad_norm": 1.6841707968514, - "language_loss": 0.67587042, - "learning_rate": 1.0456454009208448e-06, - "loss": 0.69707495, - "num_input_tokens_seen": 240125455, - "step": 11120, - "time_per_iteration": 5.847496509552002 - }, - { - "auxiliary_loss_clip": 0.01082761, - "auxiliary_loss_mlp": 0.01034422, - "balance_loss_clip": 1.03859222, - "balance_loss_mlp": 1.02105224, - "epoch": 0.668630692920487, - "flos": 24170646764160.0, - "grad_norm": 1.5497664343001825, - "language_loss": 0.72015131, - "learning_rate": 1.045303157347638e-06, - "loss": 0.74132311, - "num_input_tokens_seen": 240143870, - "step": 11121, - "time_per_iteration": 2.763155698776245 - }, - { - "auxiliary_loss_clip": 0.01090844, - "auxiliary_loss_mlp": 0.01037875, - "balance_loss_clip": 1.03589582, - "balance_loss_mlp": 1.02405834, - "epoch": 0.668690816173155, - "flos": 17457147181440.0, - "grad_norm": 2.929304268957898, - "language_loss": 0.70167738, - "learning_rate": 1.0449609499766316e-06, - "loss": 0.72296458, - "num_input_tokens_seen": 240161020, - "step": 11122, - "time_per_iteration": 2.657095432281494 - }, - { - "auxiliary_loss_clip": 0.0105491, - "auxiliary_loss_mlp": 0.00772515, - "balance_loss_clip": 1.03529811, - "balance_loss_mlp": 1.00017619, - "epoch": 0.668750939425823, - "flos": 25005102595200.0, - "grad_norm": 1.8472771024518286, - "language_loss": 0.71752214, - "learning_rate": 1.0446187788208015e-06, - "loss": 0.73579645, - "num_input_tokens_seen": 240179820, - "step": 11123, - "time_per_iteration": 2.811048984527588 - }, - { - "auxiliary_loss_clip": 0.01096616, - "auxiliary_loss_mlp": 0.01042035, - "balance_loss_clip": 1.04108119, - "balance_loss_mlp": 1.02759266, - "epoch": 0.6688110626784909, - "flos": 24096778444800.0, - "grad_norm": 1.6363097123873878, - "language_loss": 0.79147661, - "learning_rate": 1.0442766438931244e-06, - "loss": 0.81286311, - "num_input_tokens_seen": 240200130, - "step": 11124, - "time_per_iteration": 4.317869663238525 - }, - { - "auxiliary_loss_clip": 0.01089397, - "auxiliary_loss_mlp": 0.01041307, - "balance_loss_clip": 1.04114437, - "balance_loss_mlp": 1.02808654, - "epoch": 0.6688711859311589, - "flos": 21759532375680.0, - "grad_norm": 1.733456144830199, - "language_loss": 0.74266189, - "learning_rate": 1.0439345452065716e-06, - "loss": 0.76396894, - "num_input_tokens_seen": 240217945, - "step": 11125, - "time_per_iteration": 2.67317795753479 - }, - { - "auxiliary_loss_clip": 0.01076985, - "auxiliary_loss_mlp": 0.0104133, - "balance_loss_clip": 1.0369091, - "balance_loss_mlp": 1.02759719, - "epoch": 0.6689313091838268, - "flos": 22929645824640.0, - "grad_norm": 2.098915501123677, - "language_loss": 0.67166436, - "learning_rate": 1.043592482774116e-06, - "loss": 0.69284761, - "num_input_tokens_seen": 240237220, - "step": 11126, - "time_per_iteration": 2.739659547805786 - }, - { - "auxiliary_loss_clip": 0.01096554, - "auxiliary_loss_mlp": 0.01031996, - "balance_loss_clip": 1.03653789, - "balance_loss_mlp": 1.01875162, - "epoch": 0.6689914324364948, - "flos": 20886149180160.0, - "grad_norm": 1.7642293623874703, - "language_loss": 0.71071386, - "learning_rate": 1.0432504566087305e-06, - "loss": 0.73199928, - "num_input_tokens_seen": 240256000, - "step": 11127, - "time_per_iteration": 2.729490041732788 - }, - { - "auxiliary_loss_clip": 0.01093813, - "auxiliary_loss_mlp": 0.01034088, - "balance_loss_clip": 1.03839648, - "balance_loss_mlp": 1.01959229, - "epoch": 0.6690515556891627, - "flos": 22748225207040.0, - "grad_norm": 1.9937177709857246, - "language_loss": 0.80368018, - "learning_rate": 1.0429084667233827e-06, - "loss": 0.82495916, - "num_input_tokens_seen": 240275845, - "step": 11128, - "time_per_iteration": 2.6976559162139893 - }, - { - "auxiliary_loss_clip": 0.01114736, - "auxiliary_loss_mlp": 0.01031549, - "balance_loss_clip": 1.03945661, - "balance_loss_mlp": 1.01769066, - "epoch": 0.6691116789418308, - "flos": 23331450337920.0, - "grad_norm": 1.7224977753706385, - "language_loss": 0.80861622, - "learning_rate": 1.0425665131310427e-06, - "loss": 0.83007908, - "num_input_tokens_seen": 240294095, - "step": 11129, - "time_per_iteration": 2.6617815494537354 - }, - { - "auxiliary_loss_clip": 0.01091652, - "auxiliary_loss_mlp": 0.010401, - "balance_loss_clip": 1.03546023, - "balance_loss_mlp": 1.02758944, - "epoch": 0.6691718021944987, - "flos": 32447014081920.0, - "grad_norm": 1.6214077068942991, - "language_loss": 0.70471781, - "learning_rate": 1.0422245958446762e-06, - "loss": 0.72603536, - "num_input_tokens_seen": 240313460, - "step": 11130, - "time_per_iteration": 2.715178966522217 - }, - { - "auxiliary_loss_clip": 0.01088381, - "auxiliary_loss_mlp": 0.0103708, - "balance_loss_clip": 1.03720927, - "balance_loss_mlp": 1.02462888, - "epoch": 0.6692319254471667, - "flos": 23731602825600.0, - "grad_norm": 2.6655548100703643, - "language_loss": 0.70267725, - "learning_rate": 1.0418827148772486e-06, - "loss": 0.72393191, - "num_input_tokens_seen": 240333540, - "step": 11131, - "time_per_iteration": 2.747252941131592 - }, - { - "auxiliary_loss_clip": 0.01104604, - "auxiliary_loss_mlp": 0.01034247, - "balance_loss_clip": 1.03865063, - "balance_loss_mlp": 1.01924431, - "epoch": 0.6692920486998346, - "flos": 14427902620800.0, - "grad_norm": 2.56171206247206, - "language_loss": 0.65588742, - "learning_rate": 1.0415408702417243e-06, - "loss": 0.6772759, - "num_input_tokens_seen": 240350085, - "step": 11132, - "time_per_iteration": 2.697385311126709 - }, - { - "auxiliary_loss_clip": 0.01102641, - "auxiliary_loss_mlp": 0.01034541, - "balance_loss_clip": 1.03688669, - "balance_loss_mlp": 1.01992595, - "epoch": 0.6693521719525026, - "flos": 21507475662720.0, - "grad_norm": 1.866615287346346, - "language_loss": 0.74370456, - "learning_rate": 1.0411990619510661e-06, - "loss": 0.7650764, - "num_input_tokens_seen": 240370015, - "step": 11133, - "time_per_iteration": 2.7032175064086914 - }, - { - "auxiliary_loss_clip": 0.01110623, - "auxiliary_loss_mlp": 0.01036691, - "balance_loss_clip": 1.04268622, - "balance_loss_mlp": 1.0218854, - "epoch": 0.6694122952051706, - "flos": 25406943022080.0, - "grad_norm": 1.7566380678066518, - "language_loss": 0.66696709, - "learning_rate": 1.0408572900182363e-06, - "loss": 0.6884402, - "num_input_tokens_seen": 240390770, - "step": 11134, - "time_per_iteration": 2.7601702213287354 - }, - { - "auxiliary_loss_clip": 0.01106772, - "auxiliary_loss_mlp": 0.01043022, - "balance_loss_clip": 1.04027784, - "balance_loss_mlp": 1.0275842, - "epoch": 0.6694724184578386, - "flos": 25661729168640.0, - "grad_norm": 1.8684519143911829, - "language_loss": 0.77561742, - "learning_rate": 1.0405155544561943e-06, - "loss": 0.79711533, - "num_input_tokens_seen": 240409590, - "step": 11135, - "time_per_iteration": 2.6581594944000244 - }, - { - "auxiliary_loss_clip": 0.01104169, - "auxiliary_loss_mlp": 0.01034806, - "balance_loss_clip": 1.04034281, - "balance_loss_mlp": 1.02143669, - "epoch": 0.6695325417105066, - "flos": 17709311635200.0, - "grad_norm": 1.6117039898518706, - "language_loss": 0.74245167, - "learning_rate": 1.040173855277898e-06, - "loss": 0.76384139, - "num_input_tokens_seen": 240428180, - "step": 11136, - "time_per_iteration": 2.7073006629943848 - }, - { - "auxiliary_loss_clip": 0.01109339, - "auxiliary_loss_mlp": 0.01037889, - "balance_loss_clip": 1.04210007, - "balance_loss_mlp": 1.0232265, - "epoch": 0.6695926649631745, - "flos": 24460050643200.0, - "grad_norm": 1.7129981010638282, - "language_loss": 0.62248957, - "learning_rate": 1.0398321924963061e-06, - "loss": 0.64396185, - "num_input_tokens_seen": 240447815, - "step": 11137, - "time_per_iteration": 2.6636767387390137 - }, - { - "auxiliary_loss_clip": 0.01114546, - "auxiliary_loss_mlp": 0.01028612, - "balance_loss_clip": 1.04025912, - "balance_loss_mlp": 1.01526093, - "epoch": 0.6696527882158425, - "flos": 24280138396800.0, - "grad_norm": 1.688540284250028, - "language_loss": 0.66006732, - "learning_rate": 1.0394905661243724e-06, - "loss": 0.68149894, - "num_input_tokens_seen": 240468635, - "step": 11138, - "time_per_iteration": 2.608583688735962 - }, - { - "auxiliary_loss_clip": 0.01077908, - "auxiliary_loss_mlp": 0.01040221, - "balance_loss_clip": 1.0351193, - "balance_loss_mlp": 1.02686357, - "epoch": 0.6697129114685104, - "flos": 23002759958400.0, - "grad_norm": 1.6525815819397558, - "language_loss": 0.73112983, - "learning_rate": 1.039148976175053e-06, - "loss": 0.75231111, - "num_input_tokens_seen": 240488550, - "step": 11139, - "time_per_iteration": 2.6988184452056885 - }, - { - "auxiliary_loss_clip": 0.01073576, - "auxiliary_loss_mlp": 0.01036448, - "balance_loss_clip": 1.0351299, - "balance_loss_mlp": 1.02378786, - "epoch": 0.6697730347211784, - "flos": 22638123043200.0, - "grad_norm": 1.9643022468042264, - "language_loss": 0.70518827, - "learning_rate": 1.0388074226613016e-06, - "loss": 0.72628856, - "num_input_tokens_seen": 240508330, - "step": 11140, - "time_per_iteration": 2.782379150390625 - }, - { - "auxiliary_loss_clip": 0.01103316, - "auxiliary_loss_mlp": 0.01029096, - "balance_loss_clip": 1.0356679, - "balance_loss_mlp": 1.01478446, - "epoch": 0.6698331579738463, - "flos": 28877242682880.0, - "grad_norm": 1.8179612458816414, - "language_loss": 0.75826752, - "learning_rate": 1.0384659055960691e-06, - "loss": 0.77959162, - "num_input_tokens_seen": 240528470, - "step": 11141, - "time_per_iteration": 2.662597417831421 - }, - { - "auxiliary_loss_clip": 0.01103859, - "auxiliary_loss_mlp": 0.01038503, - "balance_loss_clip": 1.03954339, - "balance_loss_mlp": 1.02456141, - "epoch": 0.6698932812265144, - "flos": 24207096090240.0, - "grad_norm": 1.817558320872016, - "language_loss": 0.81910652, - "learning_rate": 1.0381244249923052e-06, - "loss": 0.84053016, - "num_input_tokens_seen": 240547815, - "step": 11142, - "time_per_iteration": 2.6364564895629883 - }, - { - "auxiliary_loss_clip": 0.01063471, - "auxiliary_loss_mlp": 0.01030688, - "balance_loss_clip": 1.03567362, - "balance_loss_mlp": 1.01705074, - "epoch": 0.6699534044791823, - "flos": 22090269830400.0, - "grad_norm": 1.605847382893669, - "language_loss": 0.70027417, - "learning_rate": 1.037782980862959e-06, - "loss": 0.72121578, - "num_input_tokens_seen": 240567765, - "step": 11143, - "time_per_iteration": 2.738811492919922 - }, - { - "auxiliary_loss_clip": 0.01071446, - "auxiliary_loss_mlp": 0.00771315, - "balance_loss_clip": 1.03594804, - "balance_loss_mlp": 1.00014567, - "epoch": 0.6700135277318503, - "flos": 25192377129600.0, - "grad_norm": 1.4724413771665843, - "language_loss": 0.70065033, - "learning_rate": 1.0374415732209796e-06, - "loss": 0.71907794, - "num_input_tokens_seen": 240590750, - "step": 11144, - "time_per_iteration": 2.85090708732605 - }, - { - "auxiliary_loss_clip": 0.01087354, - "auxiliary_loss_mlp": 0.01033347, - "balance_loss_clip": 1.0364095, - "balance_loss_mlp": 1.02025223, - "epoch": 0.6700736509845182, - "flos": 23440187784960.0, - "grad_norm": 1.6283494272446573, - "language_loss": 0.74419498, - "learning_rate": 1.0371002020793114e-06, - "loss": 0.76540208, - "num_input_tokens_seen": 240608875, - "step": 11145, - "time_per_iteration": 2.9192864894866943 - }, - { - "auxiliary_loss_clip": 0.0109431, - "auxiliary_loss_mlp": 0.01030391, - "balance_loss_clip": 1.03830147, - "balance_loss_mlp": 1.01683688, - "epoch": 0.6701337742371862, - "flos": 24389953251840.0, - "grad_norm": 5.654995580149679, - "language_loss": 0.7114135, - "learning_rate": 1.0367588674509008e-06, - "loss": 0.73266053, - "num_input_tokens_seen": 240628565, - "step": 11146, - "time_per_iteration": 2.7690348625183105 - }, - { - "auxiliary_loss_clip": 0.01109374, - "auxiliary_loss_mlp": 0.00770286, - "balance_loss_clip": 1.03855777, - "balance_loss_mlp": 1.00021374, - "epoch": 0.6701938974898543, - "flos": 14793652857600.0, - "grad_norm": 1.9526898160613644, - "language_loss": 0.78687358, - "learning_rate": 1.0364175693486905e-06, - "loss": 0.80567014, - "num_input_tokens_seen": 240646325, - "step": 11147, - "time_per_iteration": 2.6259043216705322 - }, - { - "auxiliary_loss_clip": 0.01104856, - "auxiliary_loss_mlp": 0.0077075, - "balance_loss_clip": 1.04050827, - "balance_loss_mlp": 1.00021648, - "epoch": 0.6702540207425222, - "flos": 20154002261760.0, - "grad_norm": 2.133465120376381, - "language_loss": 0.70325512, - "learning_rate": 1.0360763077856218e-06, - "loss": 0.72201115, - "num_input_tokens_seen": 240666145, - "step": 11148, - "time_per_iteration": 2.6906309127807617 - }, - { - "auxiliary_loss_clip": 0.01094652, - "auxiliary_loss_mlp": 0.01033466, - "balance_loss_clip": 1.03719747, - "balance_loss_mlp": 1.02005529, - "epoch": 0.6703141439951902, - "flos": 21214157201280.0, - "grad_norm": 2.1690530349128148, - "language_loss": 0.7037127, - "learning_rate": 1.035735082774636e-06, - "loss": 0.72499388, - "num_input_tokens_seen": 240685570, - "step": 11149, - "time_per_iteration": 2.6307806968688965 - }, - { - "auxiliary_loss_clip": 0.01092611, - "auxiliary_loss_mlp": 0.010296, - "balance_loss_clip": 1.03670847, - "balance_loss_mlp": 1.01705897, - "epoch": 0.6703742672478581, - "flos": 23112538899840.0, - "grad_norm": 1.6662323590997945, - "language_loss": 0.73725748, - "learning_rate": 1.0353938943286727e-06, - "loss": 0.75847954, - "num_input_tokens_seen": 240706945, - "step": 11150, - "time_per_iteration": 2.6917827129364014 - }, - { - "auxiliary_loss_clip": 0.01103639, - "auxiliary_loss_mlp": 0.01036073, - "balance_loss_clip": 1.0409379, - "balance_loss_mlp": 1.02276325, - "epoch": 0.6704343905005261, - "flos": 22528918719360.0, - "grad_norm": 1.705366168717962, - "language_loss": 0.78539407, - "learning_rate": 1.035052742460671e-06, - "loss": 0.80679119, - "num_input_tokens_seen": 240727990, - "step": 11151, - "time_per_iteration": 2.6567208766937256 - }, - { - "auxiliary_loss_clip": 0.00987579, - "auxiliary_loss_mlp": 0.01000572, - "balance_loss_clip": 1.01053739, - "balance_loss_mlp": 0.99935037, - "epoch": 0.670494513753194, - "flos": 64793158773120.0, - "grad_norm": 0.7884890805336543, - "language_loss": 0.55364567, - "learning_rate": 1.0347116271835643e-06, - "loss": 0.57352722, - "num_input_tokens_seen": 240790380, - "step": 11152, - "time_per_iteration": 3.3006503582000732 - }, - { - "auxiliary_loss_clip": 0.0109132, - "auxiliary_loss_mlp": 0.01038631, - "balance_loss_clip": 1.03844714, - "balance_loss_mlp": 1.025244, - "epoch": 0.670554637005862, - "flos": 23511506238720.0, - "grad_norm": 1.9985135771335918, - "language_loss": 0.80859494, - "learning_rate": 1.0343705485102896e-06, - "loss": 0.82989448, - "num_input_tokens_seen": 240811545, - "step": 11153, - "time_per_iteration": 2.7756435871124268 - }, - { - "auxiliary_loss_clip": 0.01076408, - "auxiliary_loss_mlp": 0.00771693, - "balance_loss_clip": 1.03820157, - "balance_loss_mlp": 1.00020981, - "epoch": 0.67061476025853, - "flos": 19463404400640.0, - "grad_norm": 1.6080859471988709, - "language_loss": 0.76408523, - "learning_rate": 1.0340295064537814e-06, - "loss": 0.78256631, - "num_input_tokens_seen": 240831380, - "step": 11154, - "time_per_iteration": 2.8628106117248535 - }, - { - "auxiliary_loss_clip": 0.01094529, - "auxiliary_loss_mlp": 0.01041911, - "balance_loss_clip": 1.03737462, - "balance_loss_mlp": 1.02754045, - "epoch": 0.670674883511198, - "flos": 20519967980160.0, - "grad_norm": 1.6589905225029438, - "language_loss": 0.76200944, - "learning_rate": 1.0336885010269702e-06, - "loss": 0.78337383, - "num_input_tokens_seen": 240851855, - "step": 11155, - "time_per_iteration": 4.394611120223999 - }, - { - "auxiliary_loss_clip": 0.01115828, - "auxiliary_loss_mlp": 0.01036265, - "balance_loss_clip": 1.04136384, - "balance_loss_mlp": 1.02283049, - "epoch": 0.6707350067638659, - "flos": 25483971738240.0, - "grad_norm": 2.252293716833977, - "language_loss": 0.82174289, - "learning_rate": 1.0333475322427878e-06, - "loss": 0.8432638, - "num_input_tokens_seen": 240869980, - "step": 11156, - "time_per_iteration": 2.672253370285034 - }, - { - "auxiliary_loss_clip": 0.01114074, - "auxiliary_loss_mlp": 0.01037802, - "balance_loss_clip": 1.04081774, - "balance_loss_mlp": 1.02488017, - "epoch": 0.6707951300165339, - "flos": 22273450214400.0, - "grad_norm": 1.89603770151681, - "language_loss": 0.7505753, - "learning_rate": 1.033006600114165e-06, - "loss": 0.77209401, - "num_input_tokens_seen": 240888680, - "step": 11157, - "time_per_iteration": 2.6131577491760254 - }, - { - "auxiliary_loss_clip": 0.01109055, - "auxiliary_loss_mlp": 0.01042973, - "balance_loss_clip": 1.04226005, - "balance_loss_mlp": 1.02867961, - "epoch": 0.6708552532692018, - "flos": 23984593292160.0, - "grad_norm": 1.7747922187460388, - "language_loss": 0.74478519, - "learning_rate": 1.0326657046540282e-06, - "loss": 0.76630545, - "num_input_tokens_seen": 240909050, - "step": 11158, - "time_per_iteration": 2.7293169498443604 - }, - { - "auxiliary_loss_clip": 0.01118082, - "auxiliary_loss_mlp": 0.01037488, - "balance_loss_clip": 1.04157019, - "balance_loss_mlp": 1.02339745, - "epoch": 0.6709153765218698, - "flos": 24937519155840.0, - "grad_norm": 1.5675836402135142, - "language_loss": 0.81520784, - "learning_rate": 1.0323248458753044e-06, - "loss": 0.8367635, - "num_input_tokens_seen": 240930035, - "step": 11159, - "time_per_iteration": 5.697297811508179 - }, - { - "auxiliary_loss_clip": 0.01093112, - "auxiliary_loss_mlp": 0.01031466, - "balance_loss_clip": 1.037853, - "balance_loss_mlp": 1.01822233, - "epoch": 0.6709754997745379, - "flos": 17530225401600.0, - "grad_norm": 1.775658111941971, - "language_loss": 0.76943409, - "learning_rate": 1.0319840237909193e-06, - "loss": 0.79067993, - "num_input_tokens_seen": 240948895, - "step": 11160, - "time_per_iteration": 2.649531602859497 - }, - { - "auxiliary_loss_clip": 0.01088534, - "auxiliary_loss_mlp": 0.01033293, - "balance_loss_clip": 1.03823304, - "balance_loss_mlp": 1.01970327, - "epoch": 0.6710356230272058, - "flos": 22090880361600.0, - "grad_norm": 1.7750116462358165, - "language_loss": 0.73715007, - "learning_rate": 1.0316432384137978e-06, - "loss": 0.75836837, - "num_input_tokens_seen": 240967770, - "step": 11161, - "time_per_iteration": 2.677884817123413 - }, - { - "auxiliary_loss_clip": 0.01093874, - "auxiliary_loss_mlp": 0.01041282, - "balance_loss_clip": 1.03686976, - "balance_loss_mlp": 1.0268575, - "epoch": 0.6710957462798738, - "flos": 24206449645440.0, - "grad_norm": 1.942474500054277, - "language_loss": 0.68453658, - "learning_rate": 1.0313024897568618e-06, - "loss": 0.70588821, - "num_input_tokens_seen": 240988985, - "step": 11162, - "time_per_iteration": 2.7426352500915527 - }, - { - "auxiliary_loss_clip": 0.01089967, - "auxiliary_loss_mlp": 0.01042721, - "balance_loss_clip": 1.03566909, - "balance_loss_mlp": 1.02965569, - "epoch": 0.6711558695325417, - "flos": 19093955063040.0, - "grad_norm": 2.157920195613674, - "language_loss": 0.70179218, - "learning_rate": 1.030961777833032e-06, - "loss": 0.72311902, - "num_input_tokens_seen": 241005455, - "step": 11163, - "time_per_iteration": 2.6737561225891113 - }, - { - "auxiliary_loss_clip": 0.01113094, - "auxiliary_loss_mlp": 0.0103505, - "balance_loss_clip": 1.0411427, - "balance_loss_mlp": 1.02216315, - "epoch": 0.6712159927852097, - "flos": 25557875971200.0, - "grad_norm": 1.7583635951984506, - "language_loss": 0.75421375, - "learning_rate": 1.0306211026552291e-06, - "loss": 0.7756952, - "num_input_tokens_seen": 241026175, - "step": 11164, - "time_per_iteration": 4.2939674854278564 - }, - { - "auxiliary_loss_clip": 0.01115198, - "auxiliary_loss_mlp": 0.01033198, - "balance_loss_clip": 1.0404532, - "balance_loss_mlp": 1.01967335, - "epoch": 0.6712761160378776, - "flos": 22228812587520.0, - "grad_norm": 1.9153842218638528, - "language_loss": 0.65245664, - "learning_rate": 1.0302804642363704e-06, - "loss": 0.67394054, - "num_input_tokens_seen": 241044040, - "step": 11165, - "time_per_iteration": 2.6558966636657715 - }, - { - "auxiliary_loss_clip": 0.01112642, - "auxiliary_loss_mlp": 0.01036219, - "balance_loss_clip": 1.03975642, - "balance_loss_mlp": 1.02284431, - "epoch": 0.6713362392905456, - "flos": 22455517276800.0, - "grad_norm": 2.4551304238389218, - "language_loss": 0.71630502, - "learning_rate": 1.0299398625893738e-06, - "loss": 0.73779362, - "num_input_tokens_seen": 241063615, - "step": 11166, - "time_per_iteration": 2.594005823135376 - }, - { - "auxiliary_loss_clip": 0.01113176, - "auxiliary_loss_mlp": 0.01030674, - "balance_loss_clip": 1.04087472, - "balance_loss_mlp": 1.01834142, - "epoch": 0.6713963625432136, - "flos": 25630200005760.0, - "grad_norm": 2.7039163117890728, - "language_loss": 0.77024722, - "learning_rate": 1.0295992977271546e-06, - "loss": 0.79168576, - "num_input_tokens_seen": 241082520, - "step": 11167, - "time_per_iteration": 2.630964517593384 - }, - { - "auxiliary_loss_clip": 0.01101695, - "auxiliary_loss_mlp": 0.01040458, - "balance_loss_clip": 1.03634501, - "balance_loss_mlp": 1.02711856, - "epoch": 0.6714564857958816, - "flos": 35006475640320.0, - "grad_norm": 1.6082371290328819, - "language_loss": 0.68865132, - "learning_rate": 1.029258769662629e-06, - "loss": 0.71007288, - "num_input_tokens_seen": 241103505, - "step": 11168, - "time_per_iteration": 2.845033884048462 - }, - { - "auxiliary_loss_clip": 0.01078889, - "auxiliary_loss_mlp": 0.01043458, - "balance_loss_clip": 1.03778422, - "balance_loss_mlp": 1.02867651, - "epoch": 0.6715166090485495, - "flos": 26279931168000.0, - "grad_norm": 1.9394421042077383, - "language_loss": 0.73349601, - "learning_rate": 1.0289182784087068e-06, - "loss": 0.75471944, - "num_input_tokens_seen": 241122885, - "step": 11169, - "time_per_iteration": 2.9264886379241943 - }, - { - "auxiliary_loss_clip": 0.0110554, - "auxiliary_loss_mlp": 0.01039147, - "balance_loss_clip": 1.0378871, - "balance_loss_mlp": 1.02427554, - "epoch": 0.6715767323012175, - "flos": 15924156583680.0, - "grad_norm": 1.9283403176707277, - "language_loss": 0.76306462, - "learning_rate": 1.0285778239783005e-06, - "loss": 0.78451145, - "num_input_tokens_seen": 241140865, - "step": 11170, - "time_per_iteration": 2.649400472640991 - }, - { - "auxiliary_loss_clip": 0.01095301, - "auxiliary_loss_mlp": 0.0103096, - "balance_loss_clip": 1.03898799, - "balance_loss_mlp": 1.01709008, - "epoch": 0.6716368555538854, - "flos": 17491441691520.0, - "grad_norm": 1.924665288480993, - "language_loss": 0.74140078, - "learning_rate": 1.0282374063843212e-06, - "loss": 0.76266336, - "num_input_tokens_seen": 241158225, - "step": 11171, - "time_per_iteration": 2.672985076904297 - }, - { - "auxiliary_loss_clip": 0.0107518, - "auxiliary_loss_mlp": 0.01054034, - "balance_loss_clip": 1.03710866, - "balance_loss_mlp": 1.03831053, - "epoch": 0.6716969788065534, - "flos": 16761521416320.0, - "grad_norm": 1.4921239292463526, - "language_loss": 0.86225343, - "learning_rate": 1.0278970256396762e-06, - "loss": 0.88354552, - "num_input_tokens_seen": 241175215, - "step": 11172, - "time_per_iteration": 2.720012664794922 - }, - { - "auxiliary_loss_clip": 0.01098137, - "auxiliary_loss_mlp": 0.01041037, - "balance_loss_clip": 1.0346545, - "balance_loss_mlp": 1.02693462, - "epoch": 0.6717571020592215, - "flos": 22709800632960.0, - "grad_norm": 1.8099463790548698, - "language_loss": 0.63222194, - "learning_rate": 1.0275566817572733e-06, - "loss": 0.65361369, - "num_input_tokens_seen": 241195250, - "step": 11173, - "time_per_iteration": 2.6705803871154785 - }, - { - "auxiliary_loss_clip": 0.0111084, - "auxiliary_loss_mlp": 0.01040058, - "balance_loss_clip": 1.03873289, - "balance_loss_mlp": 1.02487719, - "epoch": 0.6718172253118894, - "flos": 18734094656640.0, - "grad_norm": 2.1708594678401707, - "language_loss": 0.71347594, - "learning_rate": 1.02721637475002e-06, - "loss": 0.73498487, - "num_input_tokens_seen": 241210720, - "step": 11174, - "time_per_iteration": 2.602283477783203 - }, - { - "auxiliary_loss_clip": 0.01075457, - "auxiliary_loss_mlp": 0.01030548, - "balance_loss_clip": 1.03783953, - "balance_loss_mlp": 1.01738167, - "epoch": 0.6718773485645574, - "flos": 15632526061440.0, - "grad_norm": 2.052442882823656, - "language_loss": 0.67971045, - "learning_rate": 1.0268761046308178e-06, - "loss": 0.7007705, - "num_input_tokens_seen": 241227395, - "step": 11175, - "time_per_iteration": 2.669154644012451 - }, - { - "auxiliary_loss_clip": 0.01085, - "auxiliary_loss_mlp": 0.01037471, - "balance_loss_clip": 1.0389663, - "balance_loss_mlp": 1.02479339, - "epoch": 0.6719374718172253, - "flos": 19354774694400.0, - "grad_norm": 2.182967446535966, - "language_loss": 0.7362026, - "learning_rate": 1.0265358714125714e-06, - "loss": 0.75742733, - "num_input_tokens_seen": 241246355, - "step": 11176, - "time_per_iteration": 2.644695997238159 - }, - { - "auxiliary_loss_clip": 0.01093824, - "auxiliary_loss_mlp": 0.01037825, - "balance_loss_clip": 1.03961146, - "balance_loss_mlp": 1.02334082, - "epoch": 0.6719975950698933, - "flos": 21981316901760.0, - "grad_norm": 1.8406147660483967, - "language_loss": 0.72720611, - "learning_rate": 1.026195675108182e-06, - "loss": 0.74852264, - "num_input_tokens_seen": 241264180, - "step": 11177, - "time_per_iteration": 2.6863327026367188 - }, - { - "auxiliary_loss_clip": 0.01115157, - "auxiliary_loss_mlp": 0.01038577, - "balance_loss_clip": 1.03991175, - "balance_loss_mlp": 1.0244683, - "epoch": 0.6720577183225612, - "flos": 25228072270080.0, - "grad_norm": 2.150822621130827, - "language_loss": 0.76274478, - "learning_rate": 1.025855515730551e-06, - "loss": 0.78428215, - "num_input_tokens_seen": 241282245, - "step": 11178, - "time_per_iteration": 2.580979108810425 - }, - { - "auxiliary_loss_clip": 0.01106474, - "auxiliary_loss_mlp": 0.0103827, - "balance_loss_clip": 1.04109895, - "balance_loss_mlp": 1.02494228, - "epoch": 0.6721178415752292, - "flos": 16945886949120.0, - "grad_norm": 1.6631958135032512, - "language_loss": 0.69917423, - "learning_rate": 1.0255153932925766e-06, - "loss": 0.72062165, - "num_input_tokens_seen": 241300745, - "step": 11179, - "time_per_iteration": 2.765749454498291 - }, - { - "auxiliary_loss_clip": 0.01067075, - "auxiliary_loss_mlp": 0.01035482, - "balance_loss_clip": 1.03598237, - "balance_loss_mlp": 1.02269685, - "epoch": 0.6721779648278972, - "flos": 21541375123200.0, - "grad_norm": 1.5427953976374715, - "language_loss": 0.74147439, - "learning_rate": 1.0251753078071557e-06, - "loss": 0.76249993, - "num_input_tokens_seen": 241319320, - "step": 11180, - "time_per_iteration": 2.7570419311523438 - }, - { - "auxiliary_loss_clip": 0.01094967, - "auxiliary_loss_mlp": 0.01032611, - "balance_loss_clip": 1.03934419, - "balance_loss_mlp": 1.01931906, - "epoch": 0.6722380880805652, - "flos": 22605444645120.0, - "grad_norm": 1.3453936001041888, - "language_loss": 0.75262862, - "learning_rate": 1.0248352592871848e-06, - "loss": 0.77390438, - "num_input_tokens_seen": 241342225, - "step": 11181, - "time_per_iteration": 2.805821418762207 - }, - { - "auxiliary_loss_clip": 0.0109711, - "auxiliary_loss_mlp": 0.0103353, - "balance_loss_clip": 1.03977168, - "balance_loss_mlp": 1.0209651, - "epoch": 0.6722982113332331, - "flos": 15925269905280.0, - "grad_norm": 4.685407340613367, - "language_loss": 0.74491268, - "learning_rate": 1.0244952477455585e-06, - "loss": 0.76621902, - "num_input_tokens_seen": 241358240, - "step": 11182, - "time_per_iteration": 2.7147958278656006 - }, - { - "auxiliary_loss_clip": 0.01098785, - "auxiliary_loss_mlp": 0.01033728, - "balance_loss_clip": 1.03787458, - "balance_loss_mlp": 1.02139592, - "epoch": 0.6723583345859011, - "flos": 20596170683520.0, - "grad_norm": 2.0288719371623323, - "language_loss": 0.69882548, - "learning_rate": 1.0241552731951699e-06, - "loss": 0.72015059, - "num_input_tokens_seen": 241378420, - "step": 11183, - "time_per_iteration": 2.6687538623809814 - }, - { - "auxiliary_loss_clip": 0.01064932, - "auxiliary_loss_mlp": 0.01033349, - "balance_loss_clip": 1.0361743, - "balance_loss_mlp": 1.01995015, - "epoch": 0.672418457838569, - "flos": 21725848396800.0, - "grad_norm": 2.97348360718205, - "language_loss": 0.77805459, - "learning_rate": 1.0238153356489112e-06, - "loss": 0.7990374, - "num_input_tokens_seen": 241397185, - "step": 11184, - "time_per_iteration": 2.777731418609619 - }, - { - "auxiliary_loss_clip": 0.0109739, - "auxiliary_loss_mlp": 0.00775757, - "balance_loss_clip": 1.04143977, - "balance_loss_mlp": 1.00022709, - "epoch": 0.672478581091237, - "flos": 21470379891840.0, - "grad_norm": 3.9325636134414426, - "language_loss": 0.66277105, - "learning_rate": 1.0234754351196743e-06, - "loss": 0.68150252, - "num_input_tokens_seen": 241415785, - "step": 11185, - "time_per_iteration": 2.737527370452881 - }, - { - "auxiliary_loss_clip": 0.01076626, - "auxiliary_loss_mlp": 0.01036011, - "balance_loss_clip": 1.03503013, - "balance_loss_mlp": 1.02205157, - "epoch": 0.6725387043439051, - "flos": 30846763267200.0, - "grad_norm": 1.5938972508624505, - "language_loss": 0.80483949, - "learning_rate": 1.023135571620345e-06, - "loss": 0.82596588, - "num_input_tokens_seen": 241437390, - "step": 11186, - "time_per_iteration": 2.8201353549957275 - }, - { - "auxiliary_loss_clip": 0.01101545, - "auxiliary_loss_mlp": 0.01035806, - "balance_loss_clip": 1.04061747, - "balance_loss_mlp": 1.02350974, - "epoch": 0.672598827596573, - "flos": 24055947659520.0, - "grad_norm": 2.88496393330639, - "language_loss": 0.80385649, - "learning_rate": 1.022795745163813e-06, - "loss": 0.82523, - "num_input_tokens_seen": 241458085, - "step": 11187, - "time_per_iteration": 2.7198538780212402 - }, - { - "auxiliary_loss_clip": 0.0107469, - "auxiliary_loss_mlp": 0.01033866, - "balance_loss_clip": 1.04410124, - "balance_loss_mlp": 1.01917362, - "epoch": 0.672658950849241, - "flos": 21871861182720.0, - "grad_norm": 1.9454261923533847, - "language_loss": 0.7059114, - "learning_rate": 1.022455955762965e-06, - "loss": 0.7269969, - "num_input_tokens_seen": 241476880, - "step": 11188, - "time_per_iteration": 2.7985453605651855 - }, - { - "auxiliary_loss_clip": 0.01054991, - "auxiliary_loss_mlp": 0.01036775, - "balance_loss_clip": 1.04298103, - "balance_loss_mlp": 1.02394819, - "epoch": 0.6727190741019089, - "flos": 23222102359680.0, - "grad_norm": 1.8365043403177213, - "language_loss": 0.7589345, - "learning_rate": 1.0221162034306842e-06, - "loss": 0.77985215, - "num_input_tokens_seen": 241496535, - "step": 11189, - "time_per_iteration": 2.905705213546753 - }, - { - "auxiliary_loss_clip": 0.01116413, - "auxiliary_loss_mlp": 0.01032806, - "balance_loss_clip": 1.0382818, - "balance_loss_mlp": 1.01785755, - "epoch": 0.6727791973545769, - "flos": 15778610674560.0, - "grad_norm": 2.0168522444965986, - "language_loss": 0.75364029, - "learning_rate": 1.0217764881798562e-06, - "loss": 0.77513248, - "num_input_tokens_seen": 241513465, - "step": 11190, - "time_per_iteration": 2.833767890930176 - }, - { - "auxiliary_loss_clip": 0.01048034, - "auxiliary_loss_mlp": 0.0103557, - "balance_loss_clip": 1.03332615, - "balance_loss_mlp": 1.02153933, - "epoch": 0.6728393206072448, - "flos": 21249852341760.0, - "grad_norm": 2.7169326773783236, - "language_loss": 0.77364898, - "learning_rate": 1.0214368100233612e-06, - "loss": 0.79448497, - "num_input_tokens_seen": 241534125, - "step": 11191, - "time_per_iteration": 2.782000780105591 - }, - { - "auxiliary_loss_clip": 0.01111788, - "auxiliary_loss_mlp": 0.01034042, - "balance_loss_clip": 1.03986657, - "balance_loss_mlp": 1.02100623, - "epoch": 0.6728994438599128, - "flos": 32123279779200.0, - "grad_norm": 2.4096830802466416, - "language_loss": 0.8635608, - "learning_rate": 1.0210971689740802e-06, - "loss": 0.88501906, - "num_input_tokens_seen": 241556340, - "step": 11192, - "time_per_iteration": 2.7193620204925537 - }, - { - "auxiliary_loss_clip": 0.01104606, - "auxiliary_loss_mlp": 0.0103762, - "balance_loss_clip": 1.03892374, - "balance_loss_mlp": 1.023458, - "epoch": 0.6729595671125808, - "flos": 23112359331840.0, - "grad_norm": 2.0040177590782906, - "language_loss": 0.75960791, - "learning_rate": 1.0207575650448923e-06, - "loss": 0.78103018, - "num_input_tokens_seen": 241575185, - "step": 11193, - "time_per_iteration": 2.713738441467285 - }, - { - "auxiliary_loss_clip": 0.01081133, - "auxiliary_loss_mlp": 0.01033198, - "balance_loss_clip": 1.04058063, - "balance_loss_mlp": 1.02000737, - "epoch": 0.6730196903652488, - "flos": 14611406227200.0, - "grad_norm": 1.775580074575331, - "language_loss": 0.78365123, - "learning_rate": 1.0204179982486758e-06, - "loss": 0.80479455, - "num_input_tokens_seen": 241592970, - "step": 11194, - "time_per_iteration": 4.453005075454712 - }, - { - "auxiliary_loss_clip": 0.0110231, - "auxiliary_loss_mlp": 0.01028753, - "balance_loss_clip": 1.03805304, - "balance_loss_mlp": 1.01621783, - "epoch": 0.6730798136179167, - "flos": 21105922544640.0, - "grad_norm": 1.9211443871049516, - "language_loss": 0.89955217, - "learning_rate": 1.0200784685983075e-06, - "loss": 0.92086279, - "num_input_tokens_seen": 241610245, - "step": 11195, - "time_per_iteration": 2.6450841426849365 - }, - { - "auxiliary_loss_clip": 0.01101967, - "auxiliary_loss_mlp": 0.01032017, - "balance_loss_clip": 1.03769374, - "balance_loss_mlp": 1.01929736, - "epoch": 0.6731399368705847, - "flos": 28986267438720.0, - "grad_norm": 1.64687980974086, - "language_loss": 0.72439396, - "learning_rate": 1.019738976106662e-06, - "loss": 0.74573386, - "num_input_tokens_seen": 241630350, - "step": 11196, - "time_per_iteration": 2.685826063156128 - }, - { - "auxiliary_loss_clip": 0.00973165, - "auxiliary_loss_mlp": 0.01004254, - "balance_loss_clip": 1.01249313, - "balance_loss_mlp": 1.00303793, - "epoch": 0.6732000601232526, - "flos": 64743708723840.0, - "grad_norm": 0.7752886509100162, - "language_loss": 0.5652535, - "learning_rate": 1.0193995207866123e-06, - "loss": 0.58502769, - "num_input_tokens_seen": 241692380, - "step": 11197, - "time_per_iteration": 3.259193181991577 - }, - { - "auxiliary_loss_clip": 0.01093274, - "auxiliary_loss_mlp": 0.01029344, - "balance_loss_clip": 1.04169464, - "balance_loss_mlp": 1.01701725, - "epoch": 0.6732601833759206, - "flos": 17201642762880.0, - "grad_norm": 2.055708631074821, - "language_loss": 0.7532202, - "learning_rate": 1.0190601026510312e-06, - "loss": 0.77444637, - "num_input_tokens_seen": 241710430, - "step": 11198, - "time_per_iteration": 5.827820777893066 - }, - { - "auxiliary_loss_clip": 0.01103142, - "auxiliary_loss_mlp": 0.01033637, - "balance_loss_clip": 1.03708792, - "balance_loss_mlp": 1.01949286, - "epoch": 0.6733203066285887, - "flos": 18658861620480.0, - "grad_norm": 2.036459352353542, - "language_loss": 0.81907552, - "learning_rate": 1.0187207217127892e-06, - "loss": 0.84044337, - "num_input_tokens_seen": 241724775, - "step": 11199, - "time_per_iteration": 2.5949244499206543 - }, - { - "auxiliary_loss_clip": 0.01059201, - "auxiliary_loss_mlp": 0.01036318, - "balance_loss_clip": 1.03536808, - "balance_loss_mlp": 1.0218575, - "epoch": 0.6733804298812566, - "flos": 35809330481280.0, - "grad_norm": 1.7500176126376645, - "language_loss": 0.7166037, - "learning_rate": 1.0183813779847552e-06, - "loss": 0.73755884, - "num_input_tokens_seen": 241744440, - "step": 11200, - "time_per_iteration": 2.9160830974578857 - }, - { - "auxiliary_loss_clip": 0.01115381, - "auxiliary_loss_mlp": 0.01035903, - "balance_loss_clip": 1.04130912, - "balance_loss_mlp": 1.02295125, - "epoch": 0.6734405531339246, - "flos": 61638833099520.0, - "grad_norm": 2.371327555495297, - "language_loss": 0.64769435, - "learning_rate": 1.0180420714797987e-06, - "loss": 0.66920727, - "num_input_tokens_seen": 241771705, - "step": 11201, - "time_per_iteration": 2.9968230724334717 - }, - { - "auxiliary_loss_clip": 0.01096465, - "auxiliary_loss_mlp": 0.01040904, - "balance_loss_clip": 1.04056644, - "balance_loss_mlp": 1.02676558, - "epoch": 0.6735006763865925, - "flos": 20522338277760.0, - "grad_norm": 2.019287776053706, - "language_loss": 0.63276017, - "learning_rate": 1.0177028022107856e-06, - "loss": 0.65413386, - "num_input_tokens_seen": 241790830, - "step": 11202, - "time_per_iteration": 2.7302961349487305 - }, - { - "auxiliary_loss_clip": 0.01112496, - "auxiliary_loss_mlp": 0.01028107, - "balance_loss_clip": 1.03865552, - "balance_loss_mlp": 1.01558411, - "epoch": 0.6735607996392605, - "flos": 13918869031680.0, - "grad_norm": 1.915253440556218, - "language_loss": 0.74535716, - "learning_rate": 1.0173635701905796e-06, - "loss": 0.76676321, - "num_input_tokens_seen": 241808165, - "step": 11203, - "time_per_iteration": 4.089365243911743 - }, - { - "auxiliary_loss_clip": 0.01098401, - "auxiliary_loss_mlp": 0.01034017, - "balance_loss_clip": 1.04094148, - "balance_loss_mlp": 1.01900887, - "epoch": 0.6736209228919284, - "flos": 18807244704000.0, - "grad_norm": 1.6291352462615132, - "language_loss": 0.67681134, - "learning_rate": 1.0170243754320456e-06, - "loss": 0.6981355, - "num_input_tokens_seen": 241826925, - "step": 11204, - "time_per_iteration": 2.6192142963409424 - }, - { - "auxiliary_loss_clip": 0.01110427, - "auxiliary_loss_mlp": 0.01034559, - "balance_loss_clip": 1.04293954, - "balance_loss_mlp": 1.02012277, - "epoch": 0.6736810461445965, - "flos": 20373129181440.0, - "grad_norm": 1.6630781718701608, - "language_loss": 0.74060369, - "learning_rate": 1.0166852179480465e-06, - "loss": 0.76205349, - "num_input_tokens_seen": 241845525, - "step": 11205, - "time_per_iteration": 2.6068971157073975 - }, - { - "auxiliary_loss_clip": 0.01109012, - "auxiliary_loss_mlp": 0.01037861, - "balance_loss_clip": 1.03733087, - "balance_loss_mlp": 1.02507019, - "epoch": 0.6737411693972644, - "flos": 30007530927360.0, - "grad_norm": 1.5764181927902094, - "language_loss": 0.71426833, - "learning_rate": 1.0163460977514416e-06, - "loss": 0.73573703, - "num_input_tokens_seen": 241866815, - "step": 11206, - "time_per_iteration": 2.6492159366607666 - }, - { - "auxiliary_loss_clip": 0.0107907, - "auxiliary_loss_mlp": 0.0077308, - "balance_loss_clip": 1.03777742, - "balance_loss_mlp": 1.00019574, - "epoch": 0.6738012926499324, - "flos": 25447342844160.0, - "grad_norm": 3.2743303537758712, - "language_loss": 0.67471528, - "learning_rate": 1.016007014855092e-06, - "loss": 0.69323683, - "num_input_tokens_seen": 241887050, - "step": 11207, - "time_per_iteration": 2.7261955738067627 - }, - { - "auxiliary_loss_clip": 0.01062123, - "auxiliary_loss_mlp": 0.01037554, - "balance_loss_clip": 1.035918, - "balance_loss_mlp": 1.02464974, - "epoch": 0.6738614159026003, - "flos": 20776873029120.0, - "grad_norm": 1.9421642242153492, - "language_loss": 0.73736989, - "learning_rate": 1.0156679692718553e-06, - "loss": 0.7583667, - "num_input_tokens_seen": 241904280, - "step": 11208, - "time_per_iteration": 2.7930853366851807 - }, - { - "auxiliary_loss_clip": 0.01097466, - "auxiliary_loss_mlp": 0.01047913, - "balance_loss_clip": 1.03587651, - "balance_loss_mlp": 1.03142679, - "epoch": 0.6739215391552683, - "flos": 19566898462080.0, - "grad_norm": 2.600803881105225, - "language_loss": 0.75433391, - "learning_rate": 1.0153289610145867e-06, - "loss": 0.77578771, - "num_input_tokens_seen": 241919190, - "step": 11209, - "time_per_iteration": 2.626483678817749 - }, - { - "auxiliary_loss_clip": 0.01073019, - "auxiliary_loss_mlp": 0.01034768, - "balance_loss_clip": 1.03657913, - "balance_loss_mlp": 1.02250111, - "epoch": 0.6739816624079362, - "flos": 24388193485440.0, - "grad_norm": 1.7906778547342261, - "language_loss": 0.66272515, - "learning_rate": 1.0149899900961428e-06, - "loss": 0.68380302, - "num_input_tokens_seen": 241940525, - "step": 11210, - "time_per_iteration": 2.711866617202759 - }, - { - "auxiliary_loss_clip": 0.01108754, - "auxiliary_loss_mlp": 0.01032447, - "balance_loss_clip": 1.03769946, - "balance_loss_mlp": 1.02072227, - "epoch": 0.6740417856606042, - "flos": 22528164533760.0, - "grad_norm": 2.164396283420133, - "language_loss": 0.80170596, - "learning_rate": 1.014651056529377e-06, - "loss": 0.82311797, - "num_input_tokens_seen": 241959290, - "step": 11211, - "time_per_iteration": 2.650737762451172 - }, - { - "auxiliary_loss_clip": 0.01065338, - "auxiliary_loss_mlp": 0.01034108, - "balance_loss_clip": 1.03782678, - "balance_loss_mlp": 1.02107227, - "epoch": 0.6741019089132723, - "flos": 25775458606080.0, - "grad_norm": 1.391978533622519, - "language_loss": 0.76499903, - "learning_rate": 1.014312160327143e-06, - "loss": 0.78599358, - "num_input_tokens_seen": 241980715, - "step": 11212, - "time_per_iteration": 2.778548240661621 - }, - { - "auxiliary_loss_clip": 0.0107247, - "auxiliary_loss_mlp": 0.00773763, - "balance_loss_clip": 1.03496337, - "balance_loss_mlp": 1.00017405, - "epoch": 0.6741620321659402, - "flos": 21105671149440.0, - "grad_norm": 1.7101000867736138, - "language_loss": 0.7758128, - "learning_rate": 1.0139733015022905e-06, - "loss": 0.79427516, - "num_input_tokens_seen": 241999985, - "step": 11213, - "time_per_iteration": 2.7835280895233154 - }, - { - "auxiliary_loss_clip": 0.01061037, - "auxiliary_loss_mlp": 0.01033911, - "balance_loss_clip": 1.03824186, - "balance_loss_mlp": 1.0204041, - "epoch": 0.6742221554186082, - "flos": 20740423703040.0, - "grad_norm": 1.981711873743371, - "language_loss": 0.67612016, - "learning_rate": 1.0136344800676685e-06, - "loss": 0.69706964, - "num_input_tokens_seen": 242018990, - "step": 11214, - "time_per_iteration": 2.9053549766540527 - }, - { - "auxiliary_loss_clip": 0.01113738, - "auxiliary_loss_mlp": 0.00770067, - "balance_loss_clip": 1.03925085, - "balance_loss_mlp": 1.00014567, - "epoch": 0.6742822786712761, - "flos": 37774146384000.0, - "grad_norm": 1.8488975905792826, - "language_loss": 0.72834229, - "learning_rate": 1.0132956960361263e-06, - "loss": 0.74718034, - "num_input_tokens_seen": 242039340, - "step": 11215, - "time_per_iteration": 2.7654783725738525 - }, - { - "auxiliary_loss_clip": 0.0110075, - "auxiliary_loss_mlp": 0.00770504, - "balance_loss_clip": 1.03589749, - "balance_loss_mlp": 1.00019991, - "epoch": 0.6743424019239441, - "flos": 37263891732480.0, - "grad_norm": 2.067200737701415, - "language_loss": 0.67394143, - "learning_rate": 1.0129569494205096e-06, - "loss": 0.69265401, - "num_input_tokens_seen": 242062215, - "step": 11216, - "time_per_iteration": 2.7729885578155518 - }, - { - "auxiliary_loss_clip": 0.01032166, - "auxiliary_loss_mlp": 0.01006139, - "balance_loss_clip": 1.00926828, - "balance_loss_mlp": 1.0051198, - "epoch": 0.674402525176612, - "flos": 65997746300160.0, - "grad_norm": 0.6926580332237084, - "language_loss": 0.56280029, - "learning_rate": 1.0126182402336646e-06, - "loss": 0.58318341, - "num_input_tokens_seen": 242131130, - "step": 11217, - "time_per_iteration": 3.255324125289917 - }, - { - "auxiliary_loss_clip": 0.01099919, - "auxiliary_loss_mlp": 0.0103498, - "balance_loss_clip": 1.0376718, - "balance_loss_mlp": 1.02188516, - "epoch": 0.67446264842928, - "flos": 26461208131200.0, - "grad_norm": 1.7874095302934647, - "language_loss": 0.74496436, - "learning_rate": 1.0122795684884363e-06, - "loss": 0.76631337, - "num_input_tokens_seen": 242149720, - "step": 11218, - "time_per_iteration": 2.672130823135376 - }, - { - "auxiliary_loss_clip": 0.01080832, - "auxiliary_loss_mlp": 0.01049632, - "balance_loss_clip": 1.03884029, - "balance_loss_mlp": 1.03438509, - "epoch": 0.674522771681948, - "flos": 23732392924800.0, - "grad_norm": 1.6252161995703833, - "language_loss": 0.65911674, - "learning_rate": 1.0119409341976639e-06, - "loss": 0.68042141, - "num_input_tokens_seen": 242168875, - "step": 11219, - "time_per_iteration": 2.734159469604492 - }, - { - "auxiliary_loss_clip": 0.01070647, - "auxiliary_loss_mlp": 0.01046329, - "balance_loss_clip": 1.0323844, - "balance_loss_mlp": 1.03093362, - "epoch": 0.674582894934616, - "flos": 24754338771840.0, - "grad_norm": 1.8389940842715735, - "language_loss": 0.75087273, - "learning_rate": 1.0116023373741904e-06, - "loss": 0.77204245, - "num_input_tokens_seen": 242188465, - "step": 11220, - "time_per_iteration": 2.6810474395751953 - }, - { - "auxiliary_loss_clip": 0.01097202, - "auxiliary_loss_mlp": 0.01035624, - "balance_loss_clip": 1.03908563, - "balance_loss_mlp": 1.02207017, - "epoch": 0.6746430181872839, - "flos": 24826626892800.0, - "grad_norm": 1.6228841440103556, - "language_loss": 0.70216316, - "learning_rate": 1.0112637780308554e-06, - "loss": 0.72349143, - "num_input_tokens_seen": 242208675, - "step": 11221, - "time_per_iteration": 2.655421733856201 - }, - { - "auxiliary_loss_clip": 0.01076344, - "auxiliary_loss_mlp": 0.01033323, - "balance_loss_clip": 1.0356853, - "balance_loss_mlp": 1.02112806, - "epoch": 0.6747031414399519, - "flos": 16873491087360.0, - "grad_norm": 2.1723447923231554, - "language_loss": 0.58043802, - "learning_rate": 1.010925256180498e-06, - "loss": 0.60153466, - "num_input_tokens_seen": 242227440, - "step": 11222, - "time_per_iteration": 2.698503255844116 - }, - { - "auxiliary_loss_clip": 0.01100055, - "auxiliary_loss_mlp": 0.01035283, - "balance_loss_clip": 1.03881896, - "balance_loss_mlp": 1.02253962, - "epoch": 0.6747632646926198, - "flos": 22784925928320.0, - "grad_norm": 2.113432638374052, - "language_loss": 0.76298106, - "learning_rate": 1.0105867718359528e-06, - "loss": 0.78433442, - "num_input_tokens_seen": 242245240, - "step": 11223, - "time_per_iteration": 2.6607108116149902 - }, - { - "auxiliary_loss_clip": 0.01108219, - "auxiliary_loss_mlp": 0.01036403, - "balance_loss_clip": 1.04148507, - "balance_loss_mlp": 1.02318275, - "epoch": 0.6748233879452878, - "flos": 20046090827520.0, - "grad_norm": 1.767291040158093, - "language_loss": 0.75444579, - "learning_rate": 1.0102483250100574e-06, - "loss": 0.77589202, - "num_input_tokens_seen": 242263435, - "step": 11224, - "time_per_iteration": 2.7242133617401123 - }, - { - "auxiliary_loss_clip": 0.01060708, - "auxiliary_loss_mlp": 0.01032334, - "balance_loss_clip": 1.03886676, - "balance_loss_mlp": 1.02131319, - "epoch": 0.6748835111979558, - "flos": 23002831785600.0, - "grad_norm": 1.693566744371799, - "language_loss": 0.6366834, - "learning_rate": 1.0099099157156445e-06, - "loss": 0.65761381, - "num_input_tokens_seen": 242282765, - "step": 11225, - "time_per_iteration": 2.8525750637054443 - }, - { - "auxiliary_loss_clip": 0.01108343, - "auxiliary_loss_mlp": 0.00768466, - "balance_loss_clip": 1.03901696, - "balance_loss_mlp": 1.00013793, - "epoch": 0.6749436344506238, - "flos": 12197311009920.0, - "grad_norm": 2.4278333466029163, - "language_loss": 0.63865972, - "learning_rate": 1.0095715439655462e-06, - "loss": 0.65742779, - "num_input_tokens_seen": 242298980, - "step": 11226, - "time_per_iteration": 2.5835680961608887 - }, - { - "auxiliary_loss_clip": 0.01105473, - "auxiliary_loss_mlp": 0.01037357, - "balance_loss_clip": 1.04047155, - "balance_loss_mlp": 1.02423763, - "epoch": 0.6750037577032918, - "flos": 11873720361600.0, - "grad_norm": 2.1970918660293717, - "language_loss": 0.71417314, - "learning_rate": 1.0092332097725945e-06, - "loss": 0.73560148, - "num_input_tokens_seen": 242315420, - "step": 11227, - "time_per_iteration": 2.5965003967285156 - }, - { - "auxiliary_loss_clip": 0.01082342, - "auxiliary_loss_mlp": 0.01039904, - "balance_loss_clip": 1.0346818, - "balance_loss_mlp": 1.02601051, - "epoch": 0.6750638809559597, - "flos": 17019611614080.0, - "grad_norm": 1.9754224773045619, - "language_loss": 0.71259153, - "learning_rate": 1.0088949131496183e-06, - "loss": 0.733814, - "num_input_tokens_seen": 242332805, - "step": 11228, - "time_per_iteration": 2.6131396293640137 - }, - { - "auxiliary_loss_clip": 0.01010708, - "auxiliary_loss_mlp": 0.01005158, - "balance_loss_clip": 1.01072896, - "balance_loss_mlp": 1.00386512, - "epoch": 0.6751240042086277, - "flos": 70951011891840.0, - "grad_norm": 0.7503769026779433, - "language_loss": 0.5320974, - "learning_rate": 1.0085566541094482e-06, - "loss": 0.55225611, - "num_input_tokens_seen": 242396160, - "step": 11229, - "time_per_iteration": 3.22717022895813 - }, - { - "auxiliary_loss_clip": 0.01101526, - "auxiliary_loss_mlp": 0.01034212, - "balance_loss_clip": 1.03896284, - "balance_loss_mlp": 1.02234495, - "epoch": 0.6751841274612956, - "flos": 22675146986880.0, - "grad_norm": 1.7298636476457805, - "language_loss": 0.8039158, - "learning_rate": 1.0082184326649072e-06, - "loss": 0.82527316, - "num_input_tokens_seen": 242414660, - "step": 11230, - "time_per_iteration": 2.6328141689300537 - }, - { - "auxiliary_loss_clip": 0.01082067, - "auxiliary_loss_mlp": 0.01035691, - "balance_loss_clip": 1.03726006, - "balance_loss_mlp": 1.02402639, - "epoch": 0.6752442507139637, - "flos": 21288636051840.0, - "grad_norm": 1.6008618014341174, - "language_loss": 0.65935898, - "learning_rate": 1.0078802488288228e-06, - "loss": 0.68053663, - "num_input_tokens_seen": 242434225, - "step": 11231, - "time_per_iteration": 2.626856803894043 - }, - { - "auxiliary_loss_clip": 0.01078317, - "auxiliary_loss_mlp": 0.01042803, - "balance_loss_clip": 1.04247785, - "balance_loss_mlp": 1.02774644, - "epoch": 0.6753043739666316, - "flos": 28256921781120.0, - "grad_norm": 2.0251672391245936, - "language_loss": 0.66539383, - "learning_rate": 1.0075421026140198e-06, - "loss": 0.68660504, - "num_input_tokens_seen": 242454355, - "step": 11232, - "time_per_iteration": 2.743908166885376 - }, - { - "auxiliary_loss_clip": 0.01066681, - "auxiliary_loss_mlp": 0.0103246, - "balance_loss_clip": 1.03211284, - "balance_loss_mlp": 1.01948404, - "epoch": 0.6753644972192996, - "flos": 21360349555200.0, - "grad_norm": 1.6294007960486003, - "language_loss": 0.72326458, - "learning_rate": 1.0072039940333188e-06, - "loss": 0.74425602, - "num_input_tokens_seen": 242474935, - "step": 11233, - "time_per_iteration": 4.338082790374756 - }, - { - "auxiliary_loss_clip": 0.01103097, - "auxiliary_loss_mlp": 0.01037684, - "balance_loss_clip": 1.03895485, - "balance_loss_mlp": 1.02474928, - "epoch": 0.6754246204719675, - "flos": 26541971861760.0, - "grad_norm": 1.5686096839287218, - "language_loss": 0.76833057, - "learning_rate": 1.0068659230995418e-06, - "loss": 0.7897383, - "num_input_tokens_seen": 242495530, - "step": 11234, - "time_per_iteration": 2.6492395401000977 - }, - { - "auxiliary_loss_clip": 0.01111909, - "auxiliary_loss_mlp": 0.01036672, - "balance_loss_clip": 1.03924251, - "balance_loss_mlp": 1.02342129, - "epoch": 0.6754847437246355, - "flos": 25556690822400.0, - "grad_norm": 1.5014850027131166, - "language_loss": 0.75410771, - "learning_rate": 1.0065278898255101e-06, - "loss": 0.77559352, - "num_input_tokens_seen": 242514550, - "step": 11235, - "time_per_iteration": 2.5974621772766113 - }, - { - "auxiliary_loss_clip": 0.01025646, - "auxiliary_loss_mlp": 0.0100208, - "balance_loss_clip": 1.01184058, - "balance_loss_mlp": 1.00095963, - "epoch": 0.6755448669773034, - "flos": 59513318726400.0, - "grad_norm": 0.7779431781811396, - "language_loss": 0.51255912, - "learning_rate": 1.0061898942240387e-06, - "loss": 0.53283638, - "num_input_tokens_seen": 242569200, - "step": 11236, - "time_per_iteration": 3.1667306423187256 - }, - { - "auxiliary_loss_clip": 0.0107986, - "auxiliary_loss_mlp": 0.01032431, - "balance_loss_clip": 1.03790748, - "balance_loss_mlp": 1.01711285, - "epoch": 0.6756049902299714, - "flos": 23294534135040.0, - "grad_norm": 2.192780802483493, - "language_loss": 0.75628972, - "learning_rate": 1.0058519363079464e-06, - "loss": 0.77741265, - "num_input_tokens_seen": 242586950, - "step": 11237, - "time_per_iteration": 5.957702159881592 - }, - { - "auxiliary_loss_clip": 0.01086462, - "auxiliary_loss_mlp": 0.01041243, - "balance_loss_clip": 1.03836346, - "balance_loss_mlp": 1.0282433, - "epoch": 0.6756651134826394, - "flos": 31575426566400.0, - "grad_norm": 2.7350155461999184, - "language_loss": 0.77482605, - "learning_rate": 1.0055140160900482e-06, - "loss": 0.79610306, - "num_input_tokens_seen": 242607380, - "step": 11238, - "time_per_iteration": 2.7448818683624268 - }, - { - "auxiliary_loss_clip": 0.01099837, - "auxiliary_loss_mlp": 0.01036444, - "balance_loss_clip": 1.03648901, - "balance_loss_mlp": 1.0227412, - "epoch": 0.6757252367353074, - "flos": 27272287186560.0, - "grad_norm": 1.6539290066506784, - "language_loss": 0.66314852, - "learning_rate": 1.0051761335831587e-06, - "loss": 0.6845113, - "num_input_tokens_seen": 242628025, - "step": 11239, - "time_per_iteration": 2.740363597869873 - }, - { - "auxiliary_loss_clip": 0.01089775, - "auxiliary_loss_mlp": 0.01030806, - "balance_loss_clip": 1.04055905, - "balance_loss_mlp": 1.0182538, - "epoch": 0.6757853599879754, - "flos": 16830900535680.0, - "grad_norm": 1.7720867918116858, - "language_loss": 0.82882285, - "learning_rate": 1.0048382888000898e-06, - "loss": 0.85002863, - "num_input_tokens_seen": 242643825, - "step": 11240, - "time_per_iteration": 2.7659623622894287 - }, - { - "auxiliary_loss_clip": 0.01090669, - "auxiliary_loss_mlp": 0.01035174, - "balance_loss_clip": 1.04133797, - "balance_loss_mlp": 1.01949787, - "epoch": 0.6758454832406433, - "flos": 23220055284480.0, - "grad_norm": 2.676956533168836, - "language_loss": 0.74727547, - "learning_rate": 1.0045004817536525e-06, - "loss": 0.76853395, - "num_input_tokens_seen": 242661820, - "step": 11241, - "time_per_iteration": 2.7259037494659424 - }, - { - "auxiliary_loss_clip": 0.01064722, - "auxiliary_loss_mlp": 0.0103699, - "balance_loss_clip": 1.03947997, - "balance_loss_mlp": 1.02388871, - "epoch": 0.6759056064933113, - "flos": 16289547684480.0, - "grad_norm": 2.2859314322063415, - "language_loss": 0.80506319, - "learning_rate": 1.0041627124566572e-06, - "loss": 0.82608032, - "num_input_tokens_seen": 242679890, - "step": 11242, - "time_per_iteration": 2.7591724395751953 - }, - { - "auxiliary_loss_clip": 0.01095714, - "auxiliary_loss_mlp": 0.01047852, - "balance_loss_clip": 1.03617179, - "balance_loss_mlp": 1.03376102, - "epoch": 0.6759657297459792, - "flos": 25922297404800.0, - "grad_norm": 1.8958528418461225, - "language_loss": 0.72530574, - "learning_rate": 1.0038249809219109e-06, - "loss": 0.74674141, - "num_input_tokens_seen": 242699495, - "step": 11243, - "time_per_iteration": 4.2785255908966064 - }, - { - "auxiliary_loss_clip": 0.01102771, - "auxiliary_loss_mlp": 0.01038727, - "balance_loss_clip": 1.03992796, - "balance_loss_mlp": 1.02621591, - "epoch": 0.6760258529986473, - "flos": 23000820624000.0, - "grad_norm": 3.620795649046328, - "language_loss": 0.72916102, - "learning_rate": 1.003487287162221e-06, - "loss": 0.75057596, - "num_input_tokens_seen": 242719500, - "step": 11244, - "time_per_iteration": 2.656297445297241 - }, - { - "auxiliary_loss_clip": 0.01115915, - "auxiliary_loss_mlp": 0.01045105, - "balance_loss_clip": 1.04072213, - "balance_loss_mlp": 1.03150368, - "epoch": 0.6760859762513152, - "flos": 20959335141120.0, - "grad_norm": 2.083059893523475, - "language_loss": 0.86242104, - "learning_rate": 1.003149631190393e-06, - "loss": 0.8840313, - "num_input_tokens_seen": 242738325, - "step": 11245, - "time_per_iteration": 2.6280319690704346 - }, - { - "auxiliary_loss_clip": 0.01117876, - "auxiliary_loss_mlp": 0.0077189, - "balance_loss_clip": 1.04022503, - "balance_loss_mlp": 1.00016975, - "epoch": 0.6761460995039832, - "flos": 23622937205760.0, - "grad_norm": 2.1743867677621918, - "language_loss": 0.73484135, - "learning_rate": 1.0028120130192327e-06, - "loss": 0.753739, - "num_input_tokens_seen": 242756620, - "step": 11246, - "time_per_iteration": 2.696730375289917 - }, - { - "auxiliary_loss_clip": 0.0109861, - "auxiliary_loss_mlp": 0.01029896, - "balance_loss_clip": 1.03731704, - "balance_loss_mlp": 1.01679528, - "epoch": 0.6762062227566511, - "flos": 20770875457920.0, - "grad_norm": 1.7495113919795662, - "language_loss": 0.87749994, - "learning_rate": 1.002474432661539e-06, - "loss": 0.89878494, - "num_input_tokens_seen": 242774505, - "step": 11247, - "time_per_iteration": 2.6828203201293945 - }, - { - "auxiliary_loss_clip": 0.01009927, - "auxiliary_loss_mlp": 0.01001384, - "balance_loss_clip": 1.00954247, - "balance_loss_mlp": 1.00016785, - "epoch": 0.6762663460093191, - "flos": 52818099166080.0, - "grad_norm": 0.8307013339921004, - "language_loss": 0.53909206, - "learning_rate": 1.002136890130115e-06, - "loss": 0.55920517, - "num_input_tokens_seen": 242828645, - "step": 11248, - "time_per_iteration": 3.2222228050231934 - }, - { - "auxiliary_loss_clip": 0.01057434, - "auxiliary_loss_mlp": 0.01030146, - "balance_loss_clip": 1.0432303, - "balance_loss_mlp": 1.01780176, - "epoch": 0.676326469261987, - "flos": 23696302734720.0, - "grad_norm": 1.557725146566793, - "language_loss": 0.73398393, - "learning_rate": 1.001799385437761e-06, - "loss": 0.75485975, - "num_input_tokens_seen": 242850100, - "step": 11249, - "time_per_iteration": 2.8122363090515137 - }, - { - "auxiliary_loss_clip": 0.01102856, - "auxiliary_loss_mlp": 0.01036455, - "balance_loss_clip": 1.03738058, - "balance_loss_mlp": 1.02277553, - "epoch": 0.676386592514655, - "flos": 14063732582400.0, - "grad_norm": 2.1313223732491506, - "language_loss": 0.73983771, - "learning_rate": 1.0014619185972732e-06, - "loss": 0.76123083, - "num_input_tokens_seen": 242867775, - "step": 11250, - "time_per_iteration": 2.697199583053589 - }, - { - "auxiliary_loss_clip": 0.01113481, - "auxiliary_loss_mlp": 0.01031768, - "balance_loss_clip": 1.03948021, - "balance_loss_mlp": 1.01904869, - "epoch": 0.676446715767323, - "flos": 20412236113920.0, - "grad_norm": 1.816015271011089, - "language_loss": 0.75130785, - "learning_rate": 1.0011244896214497e-06, - "loss": 0.77276027, - "num_input_tokens_seen": 242886865, - "step": 11251, - "time_per_iteration": 2.6333305835723877 - }, - { - "auxiliary_loss_clip": 0.01078452, - "auxiliary_loss_mlp": 0.01031314, - "balance_loss_clip": 1.04010725, - "balance_loss_mlp": 1.0182966, - "epoch": 0.676506839019991, - "flos": 21288241002240.0, - "grad_norm": 1.551518166422534, - "language_loss": 0.69901943, - "learning_rate": 1.0007870985230873e-06, - "loss": 0.72011709, - "num_input_tokens_seen": 242906705, - "step": 11252, - "time_per_iteration": 2.9181244373321533 - }, - { - "auxiliary_loss_clip": 0.01064839, - "auxiliary_loss_mlp": 0.01033228, - "balance_loss_clip": 1.03892565, - "balance_loss_mlp": 1.02052665, - "epoch": 0.676566962272659, - "flos": 29932477459200.0, - "grad_norm": 1.6718962617994413, - "language_loss": 0.66779602, - "learning_rate": 1.0004497453149765e-06, - "loss": 0.68877667, - "num_input_tokens_seen": 242925215, - "step": 11253, - "time_per_iteration": 2.8428003787994385 - }, - { - "auxiliary_loss_clip": 0.01070699, - "auxiliary_loss_mlp": 0.00775318, - "balance_loss_clip": 1.03454018, - "balance_loss_mlp": 1.00019038, - "epoch": 0.6766270855253269, - "flos": 17931203902080.0, - "grad_norm": 1.5527696111799332, - "language_loss": 0.7722379, - "learning_rate": 1.0001124300099115e-06, - "loss": 0.79069805, - "num_input_tokens_seen": 242944750, - "step": 11254, - "time_per_iteration": 2.712817668914795 - }, - { - "auxiliary_loss_clip": 0.0110248, - "auxiliary_loss_mlp": 0.01035428, - "balance_loss_clip": 1.03869474, - "balance_loss_mlp": 1.02183247, - "epoch": 0.6766872087779949, - "flos": 23104853389440.0, - "grad_norm": 3.950409887802226, - "language_loss": 0.72361761, - "learning_rate": 9.997751526206835e-07, - "loss": 0.74499667, - "num_input_tokens_seen": 242963860, - "step": 11255, - "time_per_iteration": 2.6217257976531982 - }, - { - "auxiliary_loss_clip": 0.01061354, - "auxiliary_loss_mlp": 0.00771432, - "balance_loss_clip": 1.03389072, - "balance_loss_mlp": 1.00019884, - "epoch": 0.6767473320306628, - "flos": 26213137827840.0, - "grad_norm": 2.6592328120058584, - "language_loss": 0.75315595, - "learning_rate": 9.994379131600828e-07, - "loss": 0.7714839, - "num_input_tokens_seen": 242983050, - "step": 11256, - "time_per_iteration": 2.834801435470581 - }, - { - "auxiliary_loss_clip": 0.01105322, - "auxiliary_loss_mlp": 0.01036203, - "balance_loss_clip": 1.04157603, - "balance_loss_mlp": 1.0230726, - "epoch": 0.6768074552833309, - "flos": 18368739469440.0, - "grad_norm": 2.0954099595982836, - "language_loss": 0.6498003, - "learning_rate": 9.991007116408965e-07, - "loss": 0.67121565, - "num_input_tokens_seen": 243001125, - "step": 11257, - "time_per_iteration": 2.6306867599487305 - }, - { - "auxiliary_loss_clip": 0.01067487, - "auxiliary_loss_mlp": 0.01033246, - "balance_loss_clip": 1.04190707, - "balance_loss_mlp": 1.02090788, - "epoch": 0.6768675785359988, - "flos": 23039927556480.0, - "grad_norm": 1.6004297573343516, - "language_loss": 0.75491571, - "learning_rate": 9.987635480759109e-07, - "loss": 0.77592301, - "num_input_tokens_seen": 243021865, - "step": 11258, - "time_per_iteration": 2.9189696311950684 - }, - { - "auxiliary_loss_clip": 0.01089351, - "auxiliary_loss_mlp": 0.01036704, - "balance_loss_clip": 1.03900814, - "balance_loss_mlp": 1.02450931, - "epoch": 0.6769277017886668, - "flos": 33036524092800.0, - "grad_norm": 1.5823758287987741, - "language_loss": 0.66676503, - "learning_rate": 9.984264224779127e-07, - "loss": 0.68802559, - "num_input_tokens_seen": 243042970, - "step": 11259, - "time_per_iteration": 2.7564451694488525 - }, - { - "auxiliary_loss_clip": 0.01090564, - "auxiliary_loss_mlp": 0.01035059, - "balance_loss_clip": 1.03726125, - "balance_loss_mlp": 1.02206516, - "epoch": 0.6769878250413347, - "flos": 20848406964480.0, - "grad_norm": 2.48292750681471, - "language_loss": 0.85291499, - "learning_rate": 9.980893348596839e-07, - "loss": 0.8741712, - "num_input_tokens_seen": 243058470, - "step": 11260, - "time_per_iteration": 2.660332441329956 - }, - { - "auxiliary_loss_clip": 0.01085932, - "auxiliary_loss_mlp": 0.01039752, - "balance_loss_clip": 1.03486264, - "balance_loss_mlp": 1.02588189, - "epoch": 0.6770479482940027, - "flos": 15595968994560.0, - "grad_norm": 2.613252024528438, - "language_loss": 0.77209002, - "learning_rate": 9.977522852340081e-07, - "loss": 0.79334688, - "num_input_tokens_seen": 243076630, - "step": 11261, - "time_per_iteration": 2.6410372257232666 - }, - { - "auxiliary_loss_clip": 0.0109228, - "auxiliary_loss_mlp": 0.01040148, - "balance_loss_clip": 1.03776324, - "balance_loss_mlp": 1.02687383, - "epoch": 0.6771080715466706, - "flos": 18621011664000.0, - "grad_norm": 2.010792691714421, - "language_loss": 0.87528884, - "learning_rate": 9.97415273613666e-07, - "loss": 0.89661312, - "num_input_tokens_seen": 243092260, - "step": 11262, - "time_per_iteration": 2.6288645267486572 - }, - { - "auxiliary_loss_clip": 0.01089821, - "auxiliary_loss_mlp": 0.01035622, - "balance_loss_clip": 1.03876138, - "balance_loss_mlp": 1.02234757, - "epoch": 0.6771681947993387, - "flos": 12495441893760.0, - "grad_norm": 1.942743373156589, - "language_loss": 0.74668461, - "learning_rate": 9.97078300011439e-07, - "loss": 0.76793909, - "num_input_tokens_seen": 243109405, - "step": 11263, - "time_per_iteration": 2.666969060897827 - }, - { - "auxiliary_loss_clip": 0.01107967, - "auxiliary_loss_mlp": 0.01034392, - "balance_loss_clip": 1.04032826, - "balance_loss_mlp": 1.02013433, - "epoch": 0.6772283180520066, - "flos": 22236964974720.0, - "grad_norm": 3.4280923778329435, - "language_loss": 0.67490625, - "learning_rate": 9.967413644401016e-07, - "loss": 0.69632983, - "num_input_tokens_seen": 243128135, - "step": 11264, - "time_per_iteration": 2.620027780532837 - }, - { - "auxiliary_loss_clip": 0.01092011, - "auxiliary_loss_mlp": 0.01036919, - "balance_loss_clip": 1.04065371, - "balance_loss_mlp": 1.02333474, - "epoch": 0.6772884413046746, - "flos": 16143139848960.0, - "grad_norm": 1.9352746586576008, - "language_loss": 0.7301234, - "learning_rate": 9.964044669124324e-07, - "loss": 0.75141263, - "num_input_tokens_seen": 243146785, - "step": 11265, - "time_per_iteration": 2.638399600982666 - }, - { - "auxiliary_loss_clip": 0.0106857, - "auxiliary_loss_mlp": 0.01046347, - "balance_loss_clip": 1.03400207, - "balance_loss_mlp": 1.03247142, - "epoch": 0.6773485645573426, - "flos": 19135755515520.0, - "grad_norm": 2.1206255290710594, - "language_loss": 0.61617583, - "learning_rate": 9.96067607441207e-07, - "loss": 0.63732499, - "num_input_tokens_seen": 243165275, - "step": 11266, - "time_per_iteration": 2.6741204261779785 - }, - { - "auxiliary_loss_clip": 0.01086026, - "auxiliary_loss_mlp": 0.01036436, - "balance_loss_clip": 1.04107964, - "balance_loss_mlp": 1.02305472, - "epoch": 0.6774086878100105, - "flos": 14136918543360.0, - "grad_norm": 1.7639590037989088, - "language_loss": 0.7056399, - "learning_rate": 9.957307860391976e-07, - "loss": 0.72686452, - "num_input_tokens_seen": 243182845, - "step": 11267, - "time_per_iteration": 2.701676607131958 - }, - { - "auxiliary_loss_clip": 0.01112717, - "auxiliary_loss_mlp": 0.01034596, - "balance_loss_clip": 1.03907073, - "balance_loss_mlp": 1.02175152, - "epoch": 0.6774688110626785, - "flos": 22197067943040.0, - "grad_norm": 3.4663937575357093, - "language_loss": 0.71232986, - "learning_rate": 9.953940027191785e-07, - "loss": 0.73380297, - "num_input_tokens_seen": 243201475, - "step": 11268, - "time_per_iteration": 2.582158327102661 - }, - { - "auxiliary_loss_clip": 0.01089704, - "auxiliary_loss_mlp": 0.01038381, - "balance_loss_clip": 1.03727484, - "balance_loss_mlp": 1.02395701, - "epoch": 0.6775289343153464, - "flos": 23039963470080.0, - "grad_norm": 1.54975098078917, - "language_loss": 0.76582503, - "learning_rate": 9.950572574939194e-07, - "loss": 0.78710592, - "num_input_tokens_seen": 243221850, - "step": 11269, - "time_per_iteration": 2.6784608364105225 - }, - { - "auxiliary_loss_clip": 0.01079985, - "auxiliary_loss_mlp": 0.01039406, - "balance_loss_clip": 1.03688645, - "balance_loss_mlp": 1.02560711, - "epoch": 0.6775890575680145, - "flos": 18293506433280.0, - "grad_norm": 3.9513063189541895, - "language_loss": 0.74380577, - "learning_rate": 9.94720550376189e-07, - "loss": 0.76499963, - "num_input_tokens_seen": 243239855, - "step": 11270, - "time_per_iteration": 2.82761812210083 - }, - { - "auxiliary_loss_clip": 0.01059034, - "auxiliary_loss_mlp": 0.0104238, - "balance_loss_clip": 1.03957486, - "balance_loss_mlp": 1.02821255, - "epoch": 0.6776491808206824, - "flos": 25336450581120.0, - "grad_norm": 1.7088059356464216, - "language_loss": 0.73103487, - "learning_rate": 9.94383881378756e-07, - "loss": 0.75204897, - "num_input_tokens_seen": 243260085, - "step": 11271, - "time_per_iteration": 2.7955849170684814 - }, - { - "auxiliary_loss_clip": 0.01113021, - "auxiliary_loss_mlp": 0.01036559, - "balance_loss_clip": 1.03949916, - "balance_loss_mlp": 1.02401781, - "epoch": 0.6777093040733504, - "flos": 26028233591040.0, - "grad_norm": 3.147727016102492, - "language_loss": 0.68212342, - "learning_rate": 9.94047250514387e-07, - "loss": 0.70361924, - "num_input_tokens_seen": 243280065, - "step": 11272, - "time_per_iteration": 2.637103796005249 - }, - { - "auxiliary_loss_clip": 0.01103771, - "auxiliary_loss_mlp": 0.01035716, - "balance_loss_clip": 1.0390712, - "balance_loss_mlp": 1.02126229, - "epoch": 0.6777694273260183, - "flos": 18003599763840.0, - "grad_norm": 1.7828232071604915, - "language_loss": 0.73829705, - "learning_rate": 9.937106577958481e-07, - "loss": 0.75969195, - "num_input_tokens_seen": 243297775, - "step": 11273, - "time_per_iteration": 4.394399642944336 - }, - { - "auxiliary_loss_clip": 0.01094453, - "auxiliary_loss_mlp": 0.01041712, - "balance_loss_clip": 1.03916848, - "balance_loss_mlp": 1.02846813, - "epoch": 0.6778295505786863, - "flos": 23441085624960.0, - "grad_norm": 1.8919224773021028, - "language_loss": 0.70701563, - "learning_rate": 9.933741032359015e-07, - "loss": 0.72837734, - "num_input_tokens_seen": 243315760, - "step": 11274, - "time_per_iteration": 2.5985240936279297 - }, - { - "auxiliary_loss_clip": 0.01114225, - "auxiliary_loss_mlp": 0.01033716, - "balance_loss_clip": 1.0387696, - "balance_loss_mlp": 1.02027476, - "epoch": 0.6778896738313542, - "flos": 19098408349440.0, - "grad_norm": 4.65357993377392, - "language_loss": 0.6543079, - "learning_rate": 9.930375868473093e-07, - "loss": 0.67578733, - "num_input_tokens_seen": 243335715, - "step": 11275, - "time_per_iteration": 2.6151697635650635 - }, - { - "auxiliary_loss_clip": 0.01106727, - "auxiliary_loss_mlp": 0.01033632, - "balance_loss_clip": 1.04250121, - "balance_loss_mlp": 1.02126956, - "epoch": 0.6779497970840223, - "flos": 26103933504000.0, - "grad_norm": 1.5789952929470612, - "language_loss": 0.72767758, - "learning_rate": 9.927011086428335e-07, - "loss": 0.74908113, - "num_input_tokens_seen": 243356935, - "step": 11276, - "time_per_iteration": 5.899662017822266 - }, - { - "auxiliary_loss_clip": 0.01087765, - "auxiliary_loss_mlp": 0.00771415, - "balance_loss_clip": 1.03646386, - "balance_loss_mlp": 1.00016904, - "epoch": 0.6780099203366902, - "flos": 19719232041600.0, - "grad_norm": 1.681215818326951, - "language_loss": 0.76681376, - "learning_rate": 9.923646686352317e-07, - "loss": 0.78540558, - "num_input_tokens_seen": 243375625, - "step": 11277, - "time_per_iteration": 2.6914784908294678 - }, - { - "auxiliary_loss_clip": 0.01092848, - "auxiliary_loss_mlp": 0.01033178, - "balance_loss_clip": 1.03808713, - "balance_loss_mlp": 1.01976132, - "epoch": 0.6780700435893582, - "flos": 18214538382720.0, - "grad_norm": 2.7540725669591724, - "language_loss": 0.83632004, - "learning_rate": 9.920282668372627e-07, - "loss": 0.8575803, - "num_input_tokens_seen": 243390195, - "step": 11278, - "time_per_iteration": 2.637618064880371 - }, - { - "auxiliary_loss_clip": 0.01085002, - "auxiliary_loss_mlp": 0.00769413, - "balance_loss_clip": 1.04043651, - "balance_loss_mlp": 1.00012565, - "epoch": 0.6781301668420262, - "flos": 25376239872000.0, - "grad_norm": 1.5336771376537068, - "language_loss": 0.70519423, - "learning_rate": 9.916919032616844e-07, - "loss": 0.72373849, - "num_input_tokens_seen": 243411690, - "step": 11279, - "time_per_iteration": 2.7715609073638916 - }, - { - "auxiliary_loss_clip": 0.01105152, - "auxiliary_loss_mlp": 0.01035574, - "balance_loss_clip": 1.03994751, - "balance_loss_mlp": 1.0217998, - "epoch": 0.6781902900946941, - "flos": 24020432087040.0, - "grad_norm": 1.8650529100630782, - "language_loss": 0.73610586, - "learning_rate": 9.913555779212485e-07, - "loss": 0.75751317, - "num_input_tokens_seen": 243430280, - "step": 11280, - "time_per_iteration": 2.734544277191162 - }, - { - "auxiliary_loss_clip": 0.01103265, - "auxiliary_loss_mlp": 0.01036072, - "balance_loss_clip": 1.03754926, - "balance_loss_mlp": 1.02211285, - "epoch": 0.6782504133473621, - "flos": 19646764352640.0, - "grad_norm": 2.0625858122456178, - "language_loss": 0.70312506, - "learning_rate": 9.910192908287104e-07, - "loss": 0.72451842, - "num_input_tokens_seen": 243448690, - "step": 11281, - "time_per_iteration": 2.622098684310913 - }, - { - "auxiliary_loss_clip": 0.01111077, - "auxiliary_loss_mlp": 0.01028525, - "balance_loss_clip": 1.04020238, - "balance_loss_mlp": 1.01619864, - "epoch": 0.67831053660003, - "flos": 24932742647040.0, - "grad_norm": 1.4839814095064274, - "language_loss": 0.63879716, - "learning_rate": 9.906830419968217e-07, - "loss": 0.66019315, - "num_input_tokens_seen": 243470695, - "step": 11282, - "time_per_iteration": 4.292442798614502 - }, - { - "auxiliary_loss_clip": 0.01075036, - "auxiliary_loss_mlp": 0.01049322, - "balance_loss_clip": 1.03349972, - "balance_loss_mlp": 1.03204811, - "epoch": 0.6783706598526981, - "flos": 31208383440000.0, - "grad_norm": 1.7556170346158129, - "language_loss": 0.74497384, - "learning_rate": 9.90346831438334e-07, - "loss": 0.76621741, - "num_input_tokens_seen": 243493345, - "step": 11283, - "time_per_iteration": 2.9562923908233643 - }, - { - "auxiliary_loss_clip": 0.01103456, - "auxiliary_loss_mlp": 0.01029151, - "balance_loss_clip": 1.04012847, - "balance_loss_mlp": 1.01659822, - "epoch": 0.678430783105366, - "flos": 35441317687680.0, - "grad_norm": 1.5708851854862296, - "language_loss": 0.56767416, - "learning_rate": 9.900106591659948e-07, - "loss": 0.58900023, - "num_input_tokens_seen": 243515670, - "step": 11284, - "time_per_iteration": 2.8391168117523193 - }, - { - "auxiliary_loss_clip": 0.01090169, - "auxiliary_loss_mlp": 0.0103128, - "balance_loss_clip": 1.03865993, - "balance_loss_mlp": 1.01850688, - "epoch": 0.678490906358034, - "flos": 14428800460800.0, - "grad_norm": 1.928659697893085, - "language_loss": 0.75430858, - "learning_rate": 9.896745251925535e-07, - "loss": 0.77552313, - "num_input_tokens_seen": 243533625, - "step": 11285, - "time_per_iteration": 2.8025879859924316 - }, - { - "auxiliary_loss_clip": 0.01113003, - "auxiliary_loss_mlp": 0.01032581, - "balance_loss_clip": 1.04134154, - "balance_loss_mlp": 1.01964653, - "epoch": 0.6785510296107019, - "flos": 24311236596480.0, - "grad_norm": 1.7863771120930665, - "language_loss": 0.66262901, - "learning_rate": 9.893384295307557e-07, - "loss": 0.68408483, - "num_input_tokens_seen": 243553040, - "step": 11286, - "time_per_iteration": 2.6879425048828125 - }, - { - "auxiliary_loss_clip": 0.0109176, - "auxiliary_loss_mlp": 0.01029833, - "balance_loss_clip": 1.03810883, - "balance_loss_mlp": 1.01649332, - "epoch": 0.6786111528633699, - "flos": 26977244872320.0, - "grad_norm": 2.710702139669138, - "language_loss": 0.5293864, - "learning_rate": 9.890023721933447e-07, - "loss": 0.55060238, - "num_input_tokens_seen": 243572590, - "step": 11287, - "time_per_iteration": 2.6729018688201904 - }, - { - "auxiliary_loss_clip": 0.01070232, - "auxiliary_loss_mlp": 0.01039812, - "balance_loss_clip": 1.03733265, - "balance_loss_mlp": 1.0263176, - "epoch": 0.6786712761160378, - "flos": 24317557390080.0, - "grad_norm": 1.5085934530827387, - "language_loss": 0.77353847, - "learning_rate": 9.886663531930655e-07, - "loss": 0.79463893, - "num_input_tokens_seen": 243594140, - "step": 11288, - "time_per_iteration": 2.76521897315979 - }, - { - "auxiliary_loss_clip": 0.01106153, - "auxiliary_loss_mlp": 0.0103743, - "balance_loss_clip": 1.04171705, - "balance_loss_mlp": 1.0247159, - "epoch": 0.6787313993687059, - "flos": 22930435923840.0, - "grad_norm": 1.9499442288864346, - "language_loss": 0.73456311, - "learning_rate": 9.883303725426593e-07, - "loss": 0.75599885, - "num_input_tokens_seen": 243615170, - "step": 11289, - "time_per_iteration": 2.6673424243927 - }, - { - "auxiliary_loss_clip": 0.01114362, - "auxiliary_loss_mlp": 0.010388, - "balance_loss_clip": 1.04031169, - "balance_loss_mlp": 1.02534115, - "epoch": 0.6787915226213738, - "flos": 26868435598080.0, - "grad_norm": 1.6821989273437945, - "language_loss": 0.80101818, - "learning_rate": 9.879944302548682e-07, - "loss": 0.82254982, - "num_input_tokens_seen": 243635675, - "step": 11290, - "time_per_iteration": 2.632082223892212 - }, - { - "auxiliary_loss_clip": 0.01101296, - "auxiliary_loss_mlp": 0.010341, - "balance_loss_clip": 1.04066992, - "balance_loss_mlp": 1.02134442, - "epoch": 0.6788516458740418, - "flos": 20008851402240.0, - "grad_norm": 1.599385548142358, - "language_loss": 0.75065523, - "learning_rate": 9.87658526342428e-07, - "loss": 0.77200925, - "num_input_tokens_seen": 243654950, - "step": 11291, - "time_per_iteration": 2.6852645874023438 - }, - { - "auxiliary_loss_clip": 0.01096412, - "auxiliary_loss_mlp": 0.00771696, - "balance_loss_clip": 1.04071581, - "balance_loss_mlp": 1.0002079, - "epoch": 0.6789117691267098, - "flos": 28727099832960.0, - "grad_norm": 1.9085592005378407, - "language_loss": 0.75479198, - "learning_rate": 9.873226608180785e-07, - "loss": 0.77347308, - "num_input_tokens_seen": 243674970, - "step": 11292, - "time_per_iteration": 2.699632167816162 - }, - { - "auxiliary_loss_clip": 0.01073788, - "auxiliary_loss_mlp": 0.01034074, - "balance_loss_clip": 1.03495204, - "balance_loss_mlp": 1.02013278, - "epoch": 0.6789718923793777, - "flos": 23403451150080.0, - "grad_norm": 2.0284676657461858, - "language_loss": 0.84163547, - "learning_rate": 9.869868336945556e-07, - "loss": 0.86271405, - "num_input_tokens_seen": 243693440, - "step": 11293, - "time_per_iteration": 2.719419240951538 - }, - { - "auxiliary_loss_clip": 0.01119964, - "auxiliary_loss_mlp": 0.01040774, - "balance_loss_clip": 1.04201078, - "balance_loss_mlp": 1.02618265, - "epoch": 0.6790320156320457, - "flos": 20448865008000.0, - "grad_norm": 2.5902571187100722, - "language_loss": 0.79863316, - "learning_rate": 9.866510449845929e-07, - "loss": 0.8202405, - "num_input_tokens_seen": 243710055, - "step": 11294, - "time_per_iteration": 2.5868771076202393 - }, - { - "auxiliary_loss_clip": 0.0109056, - "auxiliary_loss_mlp": 0.01027861, - "balance_loss_clip": 1.03927612, - "balance_loss_mlp": 1.01579142, - "epoch": 0.6790921388847136, - "flos": 24167199058560.0, - "grad_norm": 1.662741005119297, - "language_loss": 0.79054183, - "learning_rate": 9.86315294700924e-07, - "loss": 0.81172609, - "num_input_tokens_seen": 243728635, - "step": 11295, - "time_per_iteration": 2.677510976791382 - }, - { - "auxiliary_loss_clip": 0.0108512, - "auxiliary_loss_mlp": 0.01031535, - "balance_loss_clip": 1.03939927, - "balance_loss_mlp": 1.02034116, - "epoch": 0.6791522621373817, - "flos": 21908095027200.0, - "grad_norm": 1.7946887652261734, - "language_loss": 0.71118504, - "learning_rate": 9.859795828562823e-07, - "loss": 0.7323516, - "num_input_tokens_seen": 243748330, - "step": 11296, - "time_per_iteration": 2.7060418128967285 - }, - { - "auxiliary_loss_clip": 0.01100933, - "auxiliary_loss_mlp": 0.01032318, - "balance_loss_clip": 1.03921032, - "balance_loss_mlp": 1.01968789, - "epoch": 0.6792123853900496, - "flos": 24826519152000.0, - "grad_norm": 1.4998043731898119, - "language_loss": 0.70986772, - "learning_rate": 9.856439094633949e-07, - "loss": 0.73120022, - "num_input_tokens_seen": 243769380, - "step": 11297, - "time_per_iteration": 2.6602540016174316 - }, - { - "auxiliary_loss_clip": 0.01086842, - "auxiliary_loss_mlp": 0.01036373, - "balance_loss_clip": 1.03981018, - "balance_loss_mlp": 1.02242589, - "epoch": 0.6792725086427176, - "flos": 17566279678080.0, - "grad_norm": 2.428106974293634, - "language_loss": 0.66109335, - "learning_rate": 9.853082745349918e-07, - "loss": 0.68232548, - "num_input_tokens_seen": 243785510, - "step": 11298, - "time_per_iteration": 2.694490671157837 - }, - { - "auxiliary_loss_clip": 0.01105001, - "auxiliary_loss_mlp": 0.01027328, - "balance_loss_clip": 1.03936362, - "balance_loss_mlp": 1.0155797, - "epoch": 0.6793326318953855, - "flos": 26941837040640.0, - "grad_norm": 1.6664116325381613, - "language_loss": 0.71988988, - "learning_rate": 9.84972678083801e-07, - "loss": 0.7412132, - "num_input_tokens_seen": 243805545, - "step": 11299, - "time_per_iteration": 2.713809013366699 - }, - { - "auxiliary_loss_clip": 0.0111669, - "auxiliary_loss_mlp": 0.01035745, - "balance_loss_clip": 1.04250383, - "balance_loss_mlp": 1.02194023, - "epoch": 0.6793927551480535, - "flos": 24318275662080.0, - "grad_norm": 1.2656496116170863, - "language_loss": 0.77410668, - "learning_rate": 9.846371201225488e-07, - "loss": 0.79563105, - "num_input_tokens_seen": 243825185, - "step": 11300, - "time_per_iteration": 2.6434032917022705 - }, - { - "auxiliary_loss_clip": 0.01101279, - "auxiliary_loss_mlp": 0.01035248, - "balance_loss_clip": 1.0383904, - "balance_loss_mlp": 1.02223039, - "epoch": 0.6794528784007214, - "flos": 11436615757440.0, - "grad_norm": 1.7784061043917232, - "language_loss": 0.63197196, - "learning_rate": 9.843016006639577e-07, - "loss": 0.65333718, - "num_input_tokens_seen": 243841600, - "step": 11301, - "time_per_iteration": 2.5723037719726562 - }, - { - "auxiliary_loss_clip": 0.0110239, - "auxiliary_loss_mlp": 0.01032092, - "balance_loss_clip": 1.03951216, - "balance_loss_mlp": 1.01922345, - "epoch": 0.6795130016533895, - "flos": 25229688382080.0, - "grad_norm": 1.6293976559584973, - "language_loss": 0.82879919, - "learning_rate": 9.839661197207525e-07, - "loss": 0.85014397, - "num_input_tokens_seen": 243862250, - "step": 11302, - "time_per_iteration": 2.8143625259399414 - }, - { - "auxiliary_loss_clip": 0.01105417, - "auxiliary_loss_mlp": 0.01036251, - "balance_loss_clip": 1.039042, - "balance_loss_mlp": 1.02304244, - "epoch": 0.6795731249060574, - "flos": 18296415434880.0, - "grad_norm": 2.345439766685576, - "language_loss": 0.69651306, - "learning_rate": 9.83630677305654e-07, - "loss": 0.71792972, - "num_input_tokens_seen": 243880560, - "step": 11303, - "time_per_iteration": 2.685213565826416 - }, - { - "auxiliary_loss_clip": 0.01084889, - "auxiliary_loss_mlp": 0.01035911, - "balance_loss_clip": 1.03954554, - "balance_loss_mlp": 1.02285755, - "epoch": 0.6796332481587254, - "flos": 20300374183680.0, - "grad_norm": 2.3397839406401864, - "language_loss": 0.70244884, - "learning_rate": 9.832952734313813e-07, - "loss": 0.72365683, - "num_input_tokens_seen": 243900635, - "step": 11304, - "time_per_iteration": 2.7310903072357178 - }, - { - "auxiliary_loss_clip": 0.01105112, - "auxiliary_loss_mlp": 0.01033253, - "balance_loss_clip": 1.04138947, - "balance_loss_mlp": 1.0197227, - "epoch": 0.6796933714113934, - "flos": 23586847015680.0, - "grad_norm": 2.2457086045709107, - "language_loss": 0.72435552, - "learning_rate": 9.829599081106536e-07, - "loss": 0.74573922, - "num_input_tokens_seen": 243920160, - "step": 11305, - "time_per_iteration": 2.651684522628784 - }, - { - "auxiliary_loss_clip": 0.01091817, - "auxiliary_loss_mlp": 0.01031313, - "balance_loss_clip": 1.03978157, - "balance_loss_mlp": 1.01869428, - "epoch": 0.6797534946640613, - "flos": 27119917693440.0, - "grad_norm": 2.0035628154788268, - "language_loss": 0.66448355, - "learning_rate": 9.826245813561882e-07, - "loss": 0.68571484, - "num_input_tokens_seen": 243939015, - "step": 11306, - "time_per_iteration": 2.655308723449707 - }, - { - "auxiliary_loss_clip": 0.01089759, - "auxiliary_loss_mlp": 0.0103013, - "balance_loss_clip": 1.03967357, - "balance_loss_mlp": 1.0164274, - "epoch": 0.6798136179167293, - "flos": 22127437428480.0, - "grad_norm": 1.6430540606311845, - "language_loss": 0.80062962, - "learning_rate": 9.822892931807021e-07, - "loss": 0.82182848, - "num_input_tokens_seen": 243958470, - "step": 11307, - "time_per_iteration": 2.661414861679077 - }, - { - "auxiliary_loss_clip": 0.01087499, - "auxiliary_loss_mlp": 0.01040608, - "balance_loss_clip": 1.03799939, - "balance_loss_mlp": 1.0259217, - "epoch": 0.6798737411693972, - "flos": 17488640430720.0, - "grad_norm": 1.645939087248785, - "language_loss": 0.89180249, - "learning_rate": 9.819540435969066e-07, - "loss": 0.91308355, - "num_input_tokens_seen": 243975450, - "step": 11308, - "time_per_iteration": 2.677755117416382 - }, - { - "auxiliary_loss_clip": 0.01075745, - "auxiliary_loss_mlp": 0.01043456, - "balance_loss_clip": 1.03412437, - "balance_loss_mlp": 1.02860808, - "epoch": 0.6799338644220653, - "flos": 22892262744960.0, - "grad_norm": 2.440998746341053, - "language_loss": 0.70999914, - "learning_rate": 9.816188326175154e-07, - "loss": 0.73119116, - "num_input_tokens_seen": 243994355, - "step": 11309, - "time_per_iteration": 2.716607093811035 - }, - { - "auxiliary_loss_clip": 0.01084669, - "auxiliary_loss_mlp": 0.01038527, - "balance_loss_clip": 1.0404917, - "balance_loss_mlp": 1.02482367, - "epoch": 0.6799939876747332, - "flos": 23180409648000.0, - "grad_norm": 1.9482500712222228, - "language_loss": 0.84240967, - "learning_rate": 9.812836602552411e-07, - "loss": 0.86364162, - "num_input_tokens_seen": 244011620, - "step": 11310, - "time_per_iteration": 2.722900152206421 - }, - { - "auxiliary_loss_clip": 0.01085075, - "auxiliary_loss_mlp": 0.01035424, - "balance_loss_clip": 1.03975177, - "balance_loss_mlp": 1.0229609, - "epoch": 0.6800541109274012, - "flos": 19499925553920.0, - "grad_norm": 2.93005287761764, - "language_loss": 0.83355272, - "learning_rate": 9.80948526522792e-07, - "loss": 0.85475767, - "num_input_tokens_seen": 244029925, - "step": 11311, - "time_per_iteration": 2.6596853733062744 - }, - { - "auxiliary_loss_clip": 0.01066687, - "auxiliary_loss_mlp": 0.01032083, - "balance_loss_clip": 1.03414321, - "balance_loss_mlp": 1.01699138, - "epoch": 0.6801142341800691, - "flos": 22277652105600.0, - "grad_norm": 2.3870630839480045, - "language_loss": 0.76332116, - "learning_rate": 9.806134314328767e-07, - "loss": 0.78430879, - "num_input_tokens_seen": 244051225, - "step": 11312, - "time_per_iteration": 4.449703693389893 - }, - { - "auxiliary_loss_clip": 0.01032073, - "auxiliary_loss_mlp": 0.01012541, - "balance_loss_clip": 1.00875306, - "balance_loss_mlp": 1.0114975, - "epoch": 0.6801743574327371, - "flos": 68714817759360.0, - "grad_norm": 0.6670891966515724, - "language_loss": 0.57208383, - "learning_rate": 9.802783749982038e-07, - "loss": 0.59253001, - "num_input_tokens_seen": 244115930, - "step": 11313, - "time_per_iteration": 3.371553897857666 - }, - { - "auxiliary_loss_clip": 0.01103732, - "auxiliary_loss_mlp": 0.01030366, - "balance_loss_clip": 1.03782439, - "balance_loss_mlp": 1.01666939, - "epoch": 0.680234480685405, - "flos": 29460467813760.0, - "grad_norm": 2.3056583415168075, - "language_loss": 0.69011742, - "learning_rate": 9.799433572314754e-07, - "loss": 0.71145844, - "num_input_tokens_seen": 244137320, - "step": 11314, - "time_per_iteration": 2.697596549987793 - }, - { - "auxiliary_loss_clip": 0.01097203, - "auxiliary_loss_mlp": 0.01032148, - "balance_loss_clip": 1.0348376, - "balance_loss_mlp": 1.01988137, - "epoch": 0.6802946039380731, - "flos": 15916866122880.0, - "grad_norm": 1.7422328614888773, - "language_loss": 0.81493306, - "learning_rate": 9.796083781453972e-07, - "loss": 0.83622658, - "num_input_tokens_seen": 244152755, - "step": 11315, - "time_per_iteration": 5.966327905654907 - }, - { - "auxiliary_loss_clip": 0.01074474, - "auxiliary_loss_mlp": 0.01028329, - "balance_loss_clip": 1.04119301, - "balance_loss_mlp": 1.01551998, - "epoch": 0.680354727190741, - "flos": 22018664067840.0, - "grad_norm": 1.6310079102471389, - "language_loss": 0.6954093, - "learning_rate": 9.792734377526718e-07, - "loss": 0.71643734, - "num_input_tokens_seen": 244171480, - "step": 11316, - "time_per_iteration": 2.767069101333618 - }, - { - "auxiliary_loss_clip": 0.01101612, - "auxiliary_loss_mlp": 0.01031421, - "balance_loss_clip": 1.04027951, - "balance_loss_mlp": 1.01897597, - "epoch": 0.680414850443409, - "flos": 18441494467200.0, - "grad_norm": 2.220463387251609, - "language_loss": 0.66746044, - "learning_rate": 9.789385360660003e-07, - "loss": 0.6887908, - "num_input_tokens_seen": 244187920, - "step": 11317, - "time_per_iteration": 2.6441752910614014 - }, - { - "auxiliary_loss_clip": 0.01104685, - "auxiliary_loss_mlp": 0.01039234, - "balance_loss_clip": 1.04243541, - "balance_loss_mlp": 1.02681887, - "epoch": 0.680474973696077, - "flos": 26358611909760.0, - "grad_norm": 1.689359585188632, - "language_loss": 0.74998158, - "learning_rate": 9.78603673098082e-07, - "loss": 0.77142078, - "num_input_tokens_seen": 244209565, - "step": 11318, - "time_per_iteration": 2.6722664833068848 - }, - { - "auxiliary_loss_clip": 0.01082639, - "auxiliary_loss_mlp": 0.01031826, - "balance_loss_clip": 1.03599584, - "balance_loss_mlp": 1.01942801, - "epoch": 0.6805350969487449, - "flos": 18333116156160.0, - "grad_norm": 1.901183060662594, - "language_loss": 0.67961919, - "learning_rate": 9.782688488616143e-07, - "loss": 0.70076376, - "num_input_tokens_seen": 244228015, - "step": 11319, - "time_per_iteration": 2.6768836975097656 - }, - { - "auxiliary_loss_clip": 0.01075168, - "auxiliary_loss_mlp": 0.00771315, - "balance_loss_clip": 1.04308462, - "balance_loss_mlp": 1.00015819, - "epoch": 0.6805952202014129, - "flos": 19937497034880.0, - "grad_norm": 2.0018434908969054, - "language_loss": 0.7674346, - "learning_rate": 9.779340633692945e-07, - "loss": 0.7858994, - "num_input_tokens_seen": 244245615, - "step": 11320, - "time_per_iteration": 2.7204952239990234 - }, - { - "auxiliary_loss_clip": 0.01085122, - "auxiliary_loss_mlp": 0.01032538, - "balance_loss_clip": 1.03866565, - "balance_loss_mlp": 1.01947236, - "epoch": 0.6806553434540809, - "flos": 25224301342080.0, - "grad_norm": 1.7764880825865026, - "language_loss": 0.74452037, - "learning_rate": 9.77599316633817e-07, - "loss": 0.76569694, - "num_input_tokens_seen": 244263625, - "step": 11321, - "time_per_iteration": 4.261602878570557 - }, - { - "auxiliary_loss_clip": 0.01093792, - "auxiliary_loss_mlp": 0.0103627, - "balance_loss_clip": 1.04168797, - "balance_loss_mlp": 1.02327621, - "epoch": 0.6807154667067489, - "flos": 17785586165760.0, - "grad_norm": 1.8638596765379807, - "language_loss": 0.72978008, - "learning_rate": 9.772646086678758e-07, - "loss": 0.75108075, - "num_input_tokens_seen": 244282745, - "step": 11322, - "time_per_iteration": 2.672649383544922 - }, - { - "auxiliary_loss_clip": 0.0106289, - "auxiliary_loss_mlp": 0.00772149, - "balance_loss_clip": 1.03630495, - "balance_loss_mlp": 1.00025296, - "epoch": 0.6807755899594168, - "flos": 22199905117440.0, - "grad_norm": 1.6392607041064693, - "language_loss": 0.78432202, - "learning_rate": 9.769299394841638e-07, - "loss": 0.80267245, - "num_input_tokens_seen": 244303770, - "step": 11323, - "time_per_iteration": 2.9052011966705322 - }, - { - "auxiliary_loss_clip": 0.01000379, - "auxiliary_loss_mlp": 0.01000283, - "balance_loss_clip": 1.00907302, - "balance_loss_mlp": 0.99898928, - "epoch": 0.6808357132120848, - "flos": 68631073200000.0, - "grad_norm": 0.7447348872303097, - "language_loss": 0.57086504, - "learning_rate": 9.765953090953714e-07, - "loss": 0.59087169, - "num_input_tokens_seen": 244355910, - "step": 11324, - "time_per_iteration": 3.0236268043518066 - }, - { - "auxiliary_loss_clip": 0.01094828, - "auxiliary_loss_mlp": 0.01037923, - "balance_loss_clip": 1.04058325, - "balance_loss_mlp": 1.02427304, - "epoch": 0.6808958364647527, - "flos": 23843357015040.0, - "grad_norm": 2.178701691002947, - "language_loss": 0.68127519, - "learning_rate": 9.76260717514186e-07, - "loss": 0.70260274, - "num_input_tokens_seen": 244376610, - "step": 11325, - "time_per_iteration": 2.6579439640045166 - }, - { - "auxiliary_loss_clip": 0.01104202, - "auxiliary_loss_mlp": 0.01032415, - "balance_loss_clip": 1.03789818, - "balance_loss_mlp": 1.01858699, - "epoch": 0.6809559597174207, - "flos": 17711717846400.0, - "grad_norm": 2.3301645499310655, - "language_loss": 0.70840496, - "learning_rate": 9.759261647532974e-07, - "loss": 0.72977114, - "num_input_tokens_seen": 244393000, - "step": 11326, - "time_per_iteration": 2.599581718444824 - }, - { - "auxiliary_loss_clip": 0.0111401, - "auxiliary_loss_mlp": 0.01033979, - "balance_loss_clip": 1.03960943, - "balance_loss_mlp": 1.02112806, - "epoch": 0.6810160829700886, - "flos": 22491894775680.0, - "grad_norm": 1.9162803943824422, - "language_loss": 0.73098135, - "learning_rate": 9.75591650825392e-07, - "loss": 0.75246119, - "num_input_tokens_seen": 244409515, - "step": 11327, - "time_per_iteration": 2.6066248416900635 - }, - { - "auxiliary_loss_clip": 0.01099529, - "auxiliary_loss_mlp": 0.01030672, - "balance_loss_clip": 1.03761911, - "balance_loss_mlp": 1.01766074, - "epoch": 0.6810762062227567, - "flos": 16832875783680.0, - "grad_norm": 1.8101774590253075, - "language_loss": 0.774257, - "learning_rate": 9.752571757431526e-07, - "loss": 0.79555899, - "num_input_tokens_seen": 244427165, - "step": 11328, - "time_per_iteration": 2.6368680000305176 - }, - { - "auxiliary_loss_clip": 0.01114029, - "auxiliary_loss_mlp": 0.01029553, - "balance_loss_clip": 1.03958964, - "balance_loss_mlp": 1.01668477, - "epoch": 0.6811363294754246, - "flos": 12714676554240.0, - "grad_norm": 2.0540376759628183, - "language_loss": 0.64911687, - "learning_rate": 9.74922739519265e-07, - "loss": 0.67055273, - "num_input_tokens_seen": 244445705, - "step": 11329, - "time_per_iteration": 2.573288679122925 - }, - { - "auxiliary_loss_clip": 0.0105984, - "auxiliary_loss_mlp": 0.00771154, - "balance_loss_clip": 1.03904939, - "balance_loss_mlp": 1.00018847, - "epoch": 0.6811964527280926, - "flos": 17711969241600.0, - "grad_norm": 1.9764612571544942, - "language_loss": 0.79003155, - "learning_rate": 9.745883421664096e-07, - "loss": 0.80834144, - "num_input_tokens_seen": 244460415, - "step": 11330, - "time_per_iteration": 2.773776054382324 - }, - { - "auxiliary_loss_clip": 0.01103225, - "auxiliary_loss_mlp": 0.01032506, - "balance_loss_clip": 1.03934312, - "balance_loss_mlp": 1.01867759, - "epoch": 0.6812565759807605, - "flos": 24863471268480.0, - "grad_norm": 2.2394798931993467, - "language_loss": 0.6390332, - "learning_rate": 9.742539836972665e-07, - "loss": 0.66039056, - "num_input_tokens_seen": 244480555, - "step": 11331, - "time_per_iteration": 2.648928165435791 - }, - { - "auxiliary_loss_clip": 0.01066258, - "auxiliary_loss_mlp": 0.01041405, - "balance_loss_clip": 1.0375067, - "balance_loss_mlp": 1.02576447, - "epoch": 0.6813166992334285, - "flos": 17166019449600.0, - "grad_norm": 1.609984813626945, - "language_loss": 0.72195572, - "learning_rate": 9.739196641245148e-07, - "loss": 0.74303234, - "num_input_tokens_seen": 244498540, - "step": 11332, - "time_per_iteration": 2.713545322418213 - }, - { - "auxiliary_loss_clip": 0.01103166, - "auxiliary_loss_mlp": 0.01035791, - "balance_loss_clip": 1.03957558, - "balance_loss_mlp": 1.02199841, - "epoch": 0.6813768224860965, - "flos": 18843550375680.0, - "grad_norm": 2.168847998609439, - "language_loss": 0.74432015, - "learning_rate": 9.735853834608326e-07, - "loss": 0.76570976, - "num_input_tokens_seen": 244517015, - "step": 11333, - "time_per_iteration": 2.6175012588500977 - }, - { - "auxiliary_loss_clip": 0.01105297, - "auxiliary_loss_mlp": 0.01033437, - "balance_loss_clip": 1.04025042, - "balance_loss_mlp": 1.01950097, - "epoch": 0.6814369457387645, - "flos": 24532733813760.0, - "grad_norm": 1.5744091613936917, - "language_loss": 0.71911031, - "learning_rate": 9.732511417188963e-07, - "loss": 0.74049771, - "num_input_tokens_seen": 244537450, - "step": 11334, - "time_per_iteration": 2.6650426387786865 - }, - { - "auxiliary_loss_clip": 0.0109758, - "auxiliary_loss_mlp": 0.01035782, - "balance_loss_clip": 1.0405134, - "balance_loss_mlp": 1.02300835, - "epoch": 0.6814970689914325, - "flos": 18222978078720.0, - "grad_norm": 1.6636732632213396, - "language_loss": 0.85627699, - "learning_rate": 9.729169389113791e-07, - "loss": 0.87761062, - "num_input_tokens_seen": 244555640, - "step": 11335, - "time_per_iteration": 2.6842143535614014 - }, - { - "auxiliary_loss_clip": 0.01094419, - "auxiliary_loss_mlp": 0.01029844, - "balance_loss_clip": 1.03577423, - "balance_loss_mlp": 1.01767254, - "epoch": 0.6815571922441004, - "flos": 25228790542080.0, - "grad_norm": 1.7085573075393508, - "language_loss": 0.82023531, - "learning_rate": 9.725827750509542e-07, - "loss": 0.84147793, - "num_input_tokens_seen": 244574005, - "step": 11336, - "time_per_iteration": 2.6614902019500732 - }, - { - "auxiliary_loss_clip": 0.01068778, - "auxiliary_loss_mlp": 0.01036536, - "balance_loss_clip": 1.03518701, - "balance_loss_mlp": 1.02383399, - "epoch": 0.6816173154967684, - "flos": 19456078026240.0, - "grad_norm": 2.367816368546713, - "language_loss": 0.81341016, - "learning_rate": 9.72248650150294e-07, - "loss": 0.83446324, - "num_input_tokens_seen": 244591395, - "step": 11337, - "time_per_iteration": 2.657960891723633 - }, - { - "auxiliary_loss_clip": 0.01066549, - "auxiliary_loss_mlp": 0.01031579, - "balance_loss_clip": 1.03811026, - "balance_loss_mlp": 1.01948464, - "epoch": 0.6816774387494363, - "flos": 17931455297280.0, - "grad_norm": 1.783864414164821, - "language_loss": 0.72693783, - "learning_rate": 9.719145642220673e-07, - "loss": 0.74791908, - "num_input_tokens_seen": 244610400, - "step": 11338, - "time_per_iteration": 2.7979798316955566 - }, - { - "auxiliary_loss_clip": 0.0107103, - "auxiliary_loss_mlp": 0.0103912, - "balance_loss_clip": 1.03607392, - "balance_loss_mlp": 1.02586377, - "epoch": 0.6817375620021043, - "flos": 22233014478720.0, - "grad_norm": 1.4931481110033054, - "language_loss": 0.775159, - "learning_rate": 9.715805172789435e-07, - "loss": 0.79626048, - "num_input_tokens_seen": 244630400, - "step": 11339, - "time_per_iteration": 2.795623540878296 - }, - { - "auxiliary_loss_clip": 0.01078643, - "auxiliary_loss_mlp": 0.01038418, - "balance_loss_clip": 1.03534794, - "balance_loss_mlp": 1.02518606, - "epoch": 0.6817976852547722, - "flos": 25374408278400.0, - "grad_norm": 2.310939550899193, - "language_loss": 0.70462239, - "learning_rate": 9.712465093335901e-07, - "loss": 0.72579294, - "num_input_tokens_seen": 244649155, - "step": 11340, - "time_per_iteration": 2.7247865200042725 - }, - { - "auxiliary_loss_clip": 0.01095095, - "auxiliary_loss_mlp": 0.01039919, - "balance_loss_clip": 1.04094124, - "balance_loss_mlp": 1.02693105, - "epoch": 0.6818578085074403, - "flos": 22265764704000.0, - "grad_norm": 2.4098119524731483, - "language_loss": 0.83344734, - "learning_rate": 9.709125403986722e-07, - "loss": 0.85479748, - "num_input_tokens_seen": 244665470, - "step": 11341, - "time_per_iteration": 2.693506956100464 - }, - { - "auxiliary_loss_clip": 0.01081074, - "auxiliary_loss_mlp": 0.01039727, - "balance_loss_clip": 1.03870487, - "balance_loss_mlp": 1.02477837, - "epoch": 0.6819179317601082, - "flos": 19318145800320.0, - "grad_norm": 2.2255629522007907, - "language_loss": 0.68262535, - "learning_rate": 9.705786104868531e-07, - "loss": 0.70383334, - "num_input_tokens_seen": 244684390, - "step": 11342, - "time_per_iteration": 2.73895263671875 - }, - { - "auxiliary_loss_clip": 0.01057789, - "auxiliary_loss_mlp": 0.01030928, - "balance_loss_clip": 1.0362978, - "balance_loss_mlp": 1.01732635, - "epoch": 0.6819780550127762, - "flos": 21104126864640.0, - "grad_norm": 1.497693268832665, - "language_loss": 0.74835187, - "learning_rate": 9.702447196107963e-07, - "loss": 0.76923907, - "num_input_tokens_seen": 244703370, - "step": 11343, - "time_per_iteration": 2.713353157043457 - }, - { - "auxiliary_loss_clip": 0.0107318, - "auxiliary_loss_mlp": 0.01047273, - "balance_loss_clip": 1.03880191, - "balance_loss_mlp": 1.03244925, - "epoch": 0.6820381782654441, - "flos": 29716403195520.0, - "grad_norm": 2.1197347783426714, - "language_loss": 0.79880822, - "learning_rate": 9.699108677831639e-07, - "loss": 0.82001281, - "num_input_tokens_seen": 244723325, - "step": 11344, - "time_per_iteration": 2.794928550720215 - }, - { - "auxiliary_loss_clip": 0.01076417, - "auxiliary_loss_mlp": 0.01035037, - "balance_loss_clip": 1.03809428, - "balance_loss_mlp": 1.02200782, - "epoch": 0.6820983015181121, - "flos": 29242130993280.0, - "grad_norm": 2.387575724266914, - "language_loss": 0.66499114, - "learning_rate": 9.695770550166136e-07, - "loss": 0.68610573, - "num_input_tokens_seen": 244745650, - "step": 11345, - "time_per_iteration": 2.7620160579681396 - }, - { - "auxiliary_loss_clip": 0.01095586, - "auxiliary_loss_mlp": 0.01037386, - "balance_loss_clip": 1.04086328, - "balance_loss_mlp": 1.02385557, - "epoch": 0.6821584247707801, - "flos": 18871775487360.0, - "grad_norm": 2.5184537784822347, - "language_loss": 0.64879942, - "learning_rate": 9.692432813238054e-07, - "loss": 0.67012918, - "num_input_tokens_seen": 244760270, - "step": 11346, - "time_per_iteration": 2.6018569469451904 - }, - { - "auxiliary_loss_clip": 0.01047778, - "auxiliary_loss_mlp": 0.00774999, - "balance_loss_clip": 1.02989614, - "balance_loss_mlp": 1.00022125, - "epoch": 0.6822185480234481, - "flos": 21324582587520.0, - "grad_norm": 1.6732433495926922, - "language_loss": 0.78631318, - "learning_rate": 9.689095467173952e-07, - "loss": 0.80454087, - "num_input_tokens_seen": 244779565, - "step": 11347, - "time_per_iteration": 2.7881743907928467 - }, - { - "auxiliary_loss_clip": 0.01023846, - "auxiliary_loss_mlp": 0.01003144, - "balance_loss_clip": 1.01006222, - "balance_loss_mlp": 1.00196934, - "epoch": 0.6822786712761161, - "flos": 63488306430720.0, - "grad_norm": 0.7197437780259376, - "language_loss": 0.5252403, - "learning_rate": 9.685758512100378e-07, - "loss": 0.54551017, - "num_input_tokens_seen": 244838480, - "step": 11348, - "time_per_iteration": 3.159472942352295 - }, - { - "auxiliary_loss_clip": 0.01111335, - "auxiliary_loss_mlp": 0.0103669, - "balance_loss_clip": 1.03910565, - "balance_loss_mlp": 1.02423859, - "epoch": 0.682338794528784, - "flos": 21068934514560.0, - "grad_norm": 1.699268260200451, - "language_loss": 0.79743314, - "learning_rate": 9.682421948143873e-07, - "loss": 0.8189134, - "num_input_tokens_seen": 244855265, - "step": 11349, - "time_per_iteration": 2.6090118885040283 - }, - { - "auxiliary_loss_clip": 0.01107133, - "auxiliary_loss_mlp": 0.01033347, - "balance_loss_clip": 1.03977346, - "balance_loss_mlp": 1.01808834, - "epoch": 0.682398917781452, - "flos": 36283243547520.0, - "grad_norm": 1.8874948598089236, - "language_loss": 0.73788822, - "learning_rate": 9.67908577543096e-07, - "loss": 0.75929302, - "num_input_tokens_seen": 244875555, - "step": 11350, - "time_per_iteration": 2.819202184677124 - }, - { - "auxiliary_loss_clip": 0.01113228, - "auxiliary_loss_mlp": 0.01032526, - "balance_loss_clip": 1.04043305, - "balance_loss_mlp": 1.01912093, - "epoch": 0.6824590410341199, - "flos": 24859197550080.0, - "grad_norm": 1.5956944903953967, - "language_loss": 0.79352248, - "learning_rate": 9.675749994088161e-07, - "loss": 0.81498003, - "num_input_tokens_seen": 244895270, - "step": 11351, - "time_per_iteration": 4.24803614616394 - }, - { - "auxiliary_loss_clip": 0.01100964, - "auxiliary_loss_mlp": 0.01038048, - "balance_loss_clip": 1.03889775, - "balance_loss_mlp": 1.02563834, - "epoch": 0.6825191642867879, - "flos": 22452392793600.0, - "grad_norm": 1.5926332734392936, - "language_loss": 0.73048198, - "learning_rate": 9.672414604241954e-07, - "loss": 0.75187206, - "num_input_tokens_seen": 244914535, - "step": 11352, - "time_per_iteration": 2.608426094055176 - }, - { - "auxiliary_loss_clip": 0.01066712, - "auxiliary_loss_mlp": 0.01039687, - "balance_loss_clip": 1.03466344, - "balance_loss_mlp": 1.0250721, - "epoch": 0.6825792875394558, - "flos": 29424377623680.0, - "grad_norm": 1.4647464086643525, - "language_loss": 0.79812884, - "learning_rate": 9.669079606018814e-07, - "loss": 0.81919283, - "num_input_tokens_seen": 244936095, - "step": 11353, - "time_per_iteration": 2.789823532104492 - }, - { - "auxiliary_loss_clip": 0.0110206, - "auxiliary_loss_mlp": 0.01030724, - "balance_loss_clip": 1.03809357, - "balance_loss_mlp": 1.01783192, - "epoch": 0.6826394107921239, - "flos": 18770974945920.0, - "grad_norm": 1.6494881368897465, - "language_loss": 0.78637832, - "learning_rate": 9.665744999545218e-07, - "loss": 0.80770618, - "num_input_tokens_seen": 244955290, - "step": 11354, - "time_per_iteration": 2.621384382247925 - }, - { - "auxiliary_loss_clip": 0.0105434, - "auxiliary_loss_mlp": 0.01030514, - "balance_loss_clip": 1.03731966, - "balance_loss_mlp": 1.01798463, - "epoch": 0.6826995340447918, - "flos": 16617591619200.0, - "grad_norm": 2.3579555752139107, - "language_loss": 0.61690813, - "learning_rate": 9.662410784947599e-07, - "loss": 0.63775671, - "num_input_tokens_seen": 244972935, - "step": 11355, - "time_per_iteration": 4.416518449783325 - }, - { - "auxiliary_loss_clip": 0.01059431, - "auxiliary_loss_mlp": 0.01031412, - "balance_loss_clip": 1.03248143, - "balance_loss_mlp": 1.01780415, - "epoch": 0.6827596572974598, - "flos": 20848299223680.0, - "grad_norm": 2.0827591580165525, - "language_loss": 0.81958997, - "learning_rate": 9.659076962352398e-07, - "loss": 0.84049839, - "num_input_tokens_seen": 244989440, - "step": 11356, - "time_per_iteration": 2.772223949432373 - }, - { - "auxiliary_loss_clip": 0.0109731, - "auxiliary_loss_mlp": 0.01033608, - "balance_loss_clip": 1.04186547, - "balance_loss_mlp": 1.01991129, - "epoch": 0.6828197805501277, - "flos": 22748081552640.0, - "grad_norm": 1.8123732324115438, - "language_loss": 0.78407294, - "learning_rate": 9.655743531886052e-07, - "loss": 0.80538213, - "num_input_tokens_seen": 245007830, - "step": 11357, - "time_per_iteration": 2.7943849563598633 - }, - { - "auxiliary_loss_clip": 0.01014132, - "auxiliary_loss_mlp": 0.01026339, - "balance_loss_clip": 1.008708, - "balance_loss_mlp": 1.02481925, - "epoch": 0.6828799038027957, - "flos": 71646565829760.0, - "grad_norm": 0.8539086135635664, - "language_loss": 0.59534943, - "learning_rate": 9.65241049367493e-07, - "loss": 0.61575413, - "num_input_tokens_seen": 245070720, - "step": 11358, - "time_per_iteration": 3.2622344493865967 - }, - { - "auxiliary_loss_clip": 0.0107463, - "auxiliary_loss_mlp": 0.01049087, - "balance_loss_clip": 1.03272629, - "balance_loss_mlp": 1.03269577, - "epoch": 0.6829400270554637, - "flos": 19829154637440.0, - "grad_norm": 1.8877966802390573, - "language_loss": 0.78321809, - "learning_rate": 9.64907784784544e-07, - "loss": 0.80445516, - "num_input_tokens_seen": 245089070, - "step": 11359, - "time_per_iteration": 2.7041001319885254 - }, - { - "auxiliary_loss_clip": 0.01102262, - "auxiliary_loss_mlp": 0.01035361, - "balance_loss_clip": 1.03907776, - "balance_loss_mlp": 1.02245069, - "epoch": 0.6830001503081317, - "flos": 21980634543360.0, - "grad_norm": 1.8916230773559268, - "language_loss": 0.81728172, - "learning_rate": 9.645745594523958e-07, - "loss": 0.83865792, - "num_input_tokens_seen": 245106500, - "step": 11360, - "time_per_iteration": 2.7257988452911377 - }, - { - "auxiliary_loss_clip": 0.01103488, - "auxiliary_loss_mlp": 0.01039736, - "balance_loss_clip": 1.04132986, - "balance_loss_mlp": 1.02476335, - "epoch": 0.6830602735607997, - "flos": 24316767290880.0, - "grad_norm": 1.928922588936481, - "language_loss": 0.75214481, - "learning_rate": 9.642413733836844e-07, - "loss": 0.77357709, - "num_input_tokens_seen": 245125260, - "step": 11361, - "time_per_iteration": 4.145911931991577 - }, - { - "auxiliary_loss_clip": 0.01014146, - "auxiliary_loss_mlp": 0.01006013, - "balance_loss_clip": 1.01607728, - "balance_loss_mlp": 1.00464237, - "epoch": 0.6831203968134676, - "flos": 57690062323200.0, - "grad_norm": 0.8723971229353714, - "language_loss": 0.59647572, - "learning_rate": 9.639082265910437e-07, - "loss": 0.6166774, - "num_input_tokens_seen": 245188730, - "step": 11362, - "time_per_iteration": 3.303649425506592 - }, - { - "auxiliary_loss_clip": 0.01085969, - "auxiliary_loss_mlp": 0.01032083, - "balance_loss_clip": 1.03544044, - "balance_loss_mlp": 1.01791525, - "epoch": 0.6831805200661356, - "flos": 14388436552320.0, - "grad_norm": 2.2849011537380384, - "language_loss": 0.75293076, - "learning_rate": 9.635751190871074e-07, - "loss": 0.77411127, - "num_input_tokens_seen": 245205065, - "step": 11363, - "time_per_iteration": 2.646646499633789 - }, - { - "auxiliary_loss_clip": 0.0109026, - "auxiliary_loss_mlp": 0.01039827, - "balance_loss_clip": 1.03785634, - "balance_loss_mlp": 1.02562308, - "epoch": 0.6832406433188035, - "flos": 22820297846400.0, - "grad_norm": 2.373792593478636, - "language_loss": 0.89238822, - "learning_rate": 9.632420508845063e-07, - "loss": 0.91368914, - "num_input_tokens_seen": 245224265, - "step": 11364, - "time_per_iteration": 2.7119343280792236 - }, - { - "auxiliary_loss_clip": 0.0108884, - "auxiliary_loss_mlp": 0.01037187, - "balance_loss_clip": 1.03655684, - "balance_loss_mlp": 1.02403259, - "epoch": 0.6833007665714715, - "flos": 17561718650880.0, - "grad_norm": 2.1030126962068634, - "language_loss": 0.88149464, - "learning_rate": 9.629090219958697e-07, - "loss": 0.9027549, - "num_input_tokens_seen": 245243360, - "step": 11365, - "time_per_iteration": 2.702363967895508 - }, - { - "auxiliary_loss_clip": 0.01078844, - "auxiliary_loss_mlp": 0.01042641, - "balance_loss_clip": 1.03964448, - "balance_loss_mlp": 1.02709591, - "epoch": 0.6833608898241395, - "flos": 22445928345600.0, - "grad_norm": 2.424437854190435, - "language_loss": 0.81156111, - "learning_rate": 9.625760324338272e-07, - "loss": 0.83277589, - "num_input_tokens_seen": 245256350, - "step": 11366, - "time_per_iteration": 2.674567937850952 - }, - { - "auxiliary_loss_clip": 0.01093776, - "auxiliary_loss_mlp": 0.01032035, - "balance_loss_clip": 1.03835893, - "balance_loss_mlp": 1.0188446, - "epoch": 0.6834210130768075, - "flos": 24534637234560.0, - "grad_norm": 6.88774223787515, - "language_loss": 0.76857549, - "learning_rate": 9.622430822110062e-07, - "loss": 0.78983361, - "num_input_tokens_seen": 245277575, - "step": 11367, - "time_per_iteration": 2.759528160095215 - }, - { - "auxiliary_loss_clip": 0.01087848, - "auxiliary_loss_mlp": 0.01037587, - "balance_loss_clip": 1.03855908, - "balance_loss_mlp": 1.0238061, - "epoch": 0.6834811363294754, - "flos": 20047132321920.0, - "grad_norm": 1.645021355885407, - "language_loss": 0.69147146, - "learning_rate": 9.619101713400312e-07, - "loss": 0.71272576, - "num_input_tokens_seen": 245296615, - "step": 11368, - "time_per_iteration": 2.730281352996826 - }, - { - "auxiliary_loss_clip": 0.01073853, - "auxiliary_loss_mlp": 0.01036705, - "balance_loss_clip": 1.03326845, - "balance_loss_mlp": 1.02335334, - "epoch": 0.6835412595821434, - "flos": 24790752184320.0, - "grad_norm": 2.698591110002851, - "language_loss": 0.73457599, - "learning_rate": 9.615772998335261e-07, - "loss": 0.75568151, - "num_input_tokens_seen": 245316275, - "step": 11369, - "time_per_iteration": 2.7577805519104004 - }, - { - "auxiliary_loss_clip": 0.01098451, - "auxiliary_loss_mlp": 0.01031594, - "balance_loss_clip": 1.03844953, - "balance_loss_mlp": 1.01816475, - "epoch": 0.6836013828348113, - "flos": 19500356517120.0, - "grad_norm": 1.7788685178761994, - "language_loss": 0.79114872, - "learning_rate": 9.612444677041138e-07, - "loss": 0.81244916, - "num_input_tokens_seen": 245334595, - "step": 11370, - "time_per_iteration": 2.6396684646606445 - }, - { - "auxiliary_loss_clip": 0.01022242, - "auxiliary_loss_mlp": 0.01001045, - "balance_loss_clip": 1.00799215, - "balance_loss_mlp": 0.99983543, - "epoch": 0.6836615060874793, - "flos": 58363999251840.0, - "grad_norm": 0.7422193722806905, - "language_loss": 0.59737813, - "learning_rate": 9.609116749644162e-07, - "loss": 0.61761105, - "num_input_tokens_seen": 245389750, - "step": 11371, - "time_per_iteration": 3.0740749835968018 - }, - { - "auxiliary_loss_clip": 0.01085535, - "auxiliary_loss_mlp": 0.01029543, - "balance_loss_clip": 1.03905046, - "balance_loss_mlp": 1.01730061, - "epoch": 0.6837216293401474, - "flos": 12166895168640.0, - "grad_norm": 1.4865479653921647, - "language_loss": 0.63814664, - "learning_rate": 9.605789216270511e-07, - "loss": 0.65929747, - "num_input_tokens_seen": 245407530, - "step": 11372, - "time_per_iteration": 2.7413337230682373 - }, - { - "auxiliary_loss_clip": 0.0110109, - "auxiliary_loss_mlp": 0.01031341, - "balance_loss_clip": 1.03960001, - "balance_loss_mlp": 1.01804972, - "epoch": 0.6837817525928153, - "flos": 22127581082880.0, - "grad_norm": 1.4899002874098461, - "language_loss": 0.71882284, - "learning_rate": 9.602462077046375e-07, - "loss": 0.74014717, - "num_input_tokens_seen": 245427000, - "step": 11373, - "time_per_iteration": 2.6774277687072754 - }, - { - "auxiliary_loss_clip": 0.01004865, - "auxiliary_loss_mlp": 0.01001461, - "balance_loss_clip": 1.00957799, - "balance_loss_mlp": 1.00026858, - "epoch": 0.6838418758454833, - "flos": 65005928985600.0, - "grad_norm": 1.2536847263503932, - "language_loss": 0.56630689, - "learning_rate": 9.599135332097935e-07, - "loss": 0.58637011, - "num_input_tokens_seen": 245491620, - "step": 11374, - "time_per_iteration": 3.388324499130249 - }, - { - "auxiliary_loss_clip": 0.01107854, - "auxiliary_loss_mlp": 0.01029868, - "balance_loss_clip": 1.04247069, - "balance_loss_mlp": 1.01605177, - "epoch": 0.6839019990981512, - "flos": 21030833162880.0, - "grad_norm": 1.4678437510466378, - "language_loss": 0.74034035, - "learning_rate": 9.595808981551312e-07, - "loss": 0.76171762, - "num_input_tokens_seen": 245511285, - "step": 11375, - "time_per_iteration": 2.6397507190704346 - }, - { - "auxiliary_loss_clip": 0.01095867, - "auxiliary_loss_mlp": 0.01034414, - "balance_loss_clip": 1.04174185, - "balance_loss_mlp": 1.02130103, - "epoch": 0.6839621223508192, - "flos": 24935543907840.0, - "grad_norm": 1.7532880441573435, - "language_loss": 0.70852029, - "learning_rate": 9.592483025532651e-07, - "loss": 0.72982311, - "num_input_tokens_seen": 245532910, - "step": 11376, - "time_per_iteration": 2.693699598312378 - }, - { - "auxiliary_loss_clip": 0.01115191, - "auxiliary_loss_mlp": 0.01033917, - "balance_loss_clip": 1.03887844, - "balance_loss_mlp": 1.02039814, - "epoch": 0.6840222456034871, - "flos": 26358827391360.0, - "grad_norm": 2.037488504873538, - "language_loss": 0.74301463, - "learning_rate": 9.58915746416808e-07, - "loss": 0.76450574, - "num_input_tokens_seen": 245550540, - "step": 11377, - "time_per_iteration": 2.5986266136169434 - }, - { - "auxiliary_loss_clip": 0.01014709, - "auxiliary_loss_mlp": 0.010028, - "balance_loss_clip": 1.00959396, - "balance_loss_mlp": 1.00172734, - "epoch": 0.6840823688561551, - "flos": 65988336936960.0, - "grad_norm": 0.7236208934827679, - "language_loss": 0.56872022, - "learning_rate": 9.585832297583707e-07, - "loss": 0.58889532, - "num_input_tokens_seen": 245619570, - "step": 11378, - "time_per_iteration": 3.304108142852783 - }, - { - "auxiliary_loss_clip": 0.01114944, - "auxiliary_loss_mlp": 0.01038581, - "balance_loss_clip": 1.04010487, - "balance_loss_mlp": 1.02452612, - "epoch": 0.684142492108823, - "flos": 21397588980480.0, - "grad_norm": 1.9771846674075895, - "language_loss": 0.78299057, - "learning_rate": 9.58250752590561e-07, - "loss": 0.80452579, - "num_input_tokens_seen": 245637980, - "step": 11379, - "time_per_iteration": 2.5876471996307373 - }, - { - "auxiliary_loss_clip": 0.01110374, - "auxiliary_loss_mlp": 0.0102753, - "balance_loss_clip": 1.04125404, - "balance_loss_mlp": 1.01560271, - "epoch": 0.6842026153614911, - "flos": 18801426700800.0, - "grad_norm": 1.9586312359498843, - "language_loss": 0.69083488, - "learning_rate": 9.57918314925988e-07, - "loss": 0.71221387, - "num_input_tokens_seen": 245655690, - "step": 11380, - "time_per_iteration": 2.565652847290039 - }, - { - "auxiliary_loss_clip": 0.0109036, - "auxiliary_loss_mlp": 0.01036194, - "balance_loss_clip": 1.03853393, - "balance_loss_mlp": 1.02266991, - "epoch": 0.684262738614159, - "flos": 19646405216640.0, - "grad_norm": 1.9286622353610317, - "language_loss": 0.78519118, - "learning_rate": 9.575859167772568e-07, - "loss": 0.80645669, - "num_input_tokens_seen": 245671525, - "step": 11381, - "time_per_iteration": 2.6301379203796387 - }, - { - "auxiliary_loss_clip": 0.010226, - "auxiliary_loss_mlp": 0.01003847, - "balance_loss_clip": 1.00947046, - "balance_loss_mlp": 1.00290525, - "epoch": 0.684322861866827, - "flos": 62354462739840.0, - "grad_norm": 0.864991455722599, - "language_loss": 0.67092407, - "learning_rate": 9.572535581569713e-07, - "loss": 0.69118857, - "num_input_tokens_seen": 245724115, - "step": 11382, - "time_per_iteration": 3.0039761066436768 - }, - { - "auxiliary_loss_clip": 0.01021817, - "auxiliary_loss_mlp": 0.01001983, - "balance_loss_clip": 1.0083313, - "balance_loss_mlp": 1.000862, - "epoch": 0.6843829851194949, - "flos": 65805048812160.0, - "grad_norm": 0.8192420417585807, - "language_loss": 0.58103538, - "learning_rate": 9.569212390777356e-07, - "loss": 0.60127336, - "num_input_tokens_seen": 245789245, - "step": 11383, - "time_per_iteration": 3.165360450744629 - }, - { - "auxiliary_loss_clip": 0.01062418, - "auxiliary_loss_mlp": 0.01038341, - "balance_loss_clip": 1.03478217, - "balance_loss_mlp": 1.02372622, - "epoch": 0.6844431083721629, - "flos": 27855153181440.0, - "grad_norm": 2.857238801522205, - "language_loss": 0.80316836, - "learning_rate": 9.565889595521517e-07, - "loss": 0.82417595, - "num_input_tokens_seen": 245812420, - "step": 11384, - "time_per_iteration": 2.770827054977417 - }, - { - "auxiliary_loss_clip": 0.01103805, - "auxiliary_loss_mlp": 0.01030338, - "balance_loss_clip": 1.03886342, - "balance_loss_mlp": 1.01740408, - "epoch": 0.684503231624831, - "flos": 18255010032000.0, - "grad_norm": 2.2679652674144157, - "language_loss": 0.77255201, - "learning_rate": 9.562567195928187e-07, - "loss": 0.79389346, - "num_input_tokens_seen": 245829135, - "step": 11385, - "time_per_iteration": 2.591132164001465 - }, - { - "auxiliary_loss_clip": 0.0108167, - "auxiliary_loss_mlp": 0.01042801, - "balance_loss_clip": 1.0381335, - "balance_loss_mlp": 1.02736902, - "epoch": 0.6845633548774989, - "flos": 17639681120640.0, - "grad_norm": 2.065101540426227, - "language_loss": 0.84796238, - "learning_rate": 9.55924519212335e-07, - "loss": 0.86920702, - "num_input_tokens_seen": 245847140, - "step": 11386, - "time_per_iteration": 2.6891727447509766 - }, - { - "auxiliary_loss_clip": 0.01103811, - "auxiliary_loss_mlp": 0.01041499, - "balance_loss_clip": 1.04075646, - "balance_loss_mlp": 1.02887416, - "epoch": 0.6846234781301669, - "flos": 20807576179200.0, - "grad_norm": 2.1990004125382634, - "language_loss": 0.83455414, - "learning_rate": 9.555923584232984e-07, - "loss": 0.85600722, - "num_input_tokens_seen": 245862855, - "step": 11387, - "time_per_iteration": 2.61997127532959 - }, - { - "auxiliary_loss_clip": 0.01092977, - "auxiliary_loss_mlp": 0.01030654, - "balance_loss_clip": 1.03438902, - "balance_loss_mlp": 1.01760602, - "epoch": 0.6846836013828348, - "flos": 36101176485120.0, - "grad_norm": 1.848194295156486, - "language_loss": 0.72119319, - "learning_rate": 9.552602372383047e-07, - "loss": 0.74242949, - "num_input_tokens_seen": 245885415, - "step": 11388, - "time_per_iteration": 2.7075023651123047 - }, - { - "auxiliary_loss_clip": 0.01098153, - "auxiliary_loss_mlp": 0.01027593, - "balance_loss_clip": 1.04095197, - "balance_loss_mlp": 1.01512408, - "epoch": 0.6847437246355028, - "flos": 43142468607360.0, - "grad_norm": 2.050560945832389, - "language_loss": 0.6225087, - "learning_rate": 9.549281556699469e-07, - "loss": 0.64376616, - "num_input_tokens_seen": 245906285, - "step": 11389, - "time_per_iteration": 2.8079371452331543 - }, - { - "auxiliary_loss_clip": 0.01011672, - "auxiliary_loss_mlp": 0.01004667, - "balance_loss_clip": 1.00851202, - "balance_loss_mlp": 1.00345695, - "epoch": 0.6848038478881707, - "flos": 71663729552640.0, - "grad_norm": 0.7413355837031695, - "language_loss": 0.5598197, - "learning_rate": 9.54596113730818e-07, - "loss": 0.57998312, - "num_input_tokens_seen": 245967620, - "step": 11390, - "time_per_iteration": 5.026982307434082 - }, - { - "auxiliary_loss_clip": 0.01076744, - "auxiliary_loss_mlp": 0.00771583, - "balance_loss_clip": 1.03878915, - "balance_loss_mlp": 1.00011551, - "epoch": 0.6848639711408387, - "flos": 19937820257280.0, - "grad_norm": 1.8276249174915487, - "language_loss": 0.87787604, - "learning_rate": 9.542641114335109e-07, - "loss": 0.89635926, - "num_input_tokens_seen": 245985075, - "step": 11391, - "time_per_iteration": 2.7455759048461914 - }, - { - "auxiliary_loss_clip": 0.01073324, - "auxiliary_loss_mlp": 0.01040704, - "balance_loss_clip": 1.03858793, - "balance_loss_mlp": 1.0274303, - "epoch": 0.6849240943935067, - "flos": 26867501844480.0, - "grad_norm": 1.7326264104251545, - "language_loss": 0.79257655, - "learning_rate": 9.539321487906117e-07, - "loss": 0.81371683, - "num_input_tokens_seen": 246003560, - "step": 11392, - "time_per_iteration": 2.8595845699310303 - }, - { - "auxiliary_loss_clip": 0.0108908, - "auxiliary_loss_mlp": 0.01032172, - "balance_loss_clip": 1.03764129, - "balance_loss_mlp": 1.01933873, - "epoch": 0.6849842176461747, - "flos": 13735365425280.0, - "grad_norm": 2.218400680256619, - "language_loss": 0.71076894, - "learning_rate": 9.536002258147104e-07, - "loss": 0.7319814, - "num_input_tokens_seen": 246019600, - "step": 11393, - "time_per_iteration": 2.680263042449951 - }, - { - "auxiliary_loss_clip": 0.01075845, - "auxiliary_loss_mlp": 0.01032815, - "balance_loss_clip": 1.03768921, - "balance_loss_mlp": 1.01831901, - "epoch": 0.6850443408988426, - "flos": 24973070641920.0, - "grad_norm": 1.8123459031815148, - "language_loss": 0.64661837, - "learning_rate": 9.532683425183936e-07, - "loss": 0.66770494, - "num_input_tokens_seen": 246038920, - "step": 11394, - "time_per_iteration": 4.561980724334717 - }, - { - "auxiliary_loss_clip": 0.01087026, - "auxiliary_loss_mlp": 0.00773484, - "balance_loss_clip": 1.03753853, - "balance_loss_mlp": 1.00009871, - "epoch": 0.6851044641515106, - "flos": 27744225004800.0, - "grad_norm": 2.9988719811827633, - "language_loss": 0.80739737, - "learning_rate": 9.529364989142468e-07, - "loss": 0.82600248, - "num_input_tokens_seen": 246060490, - "step": 11395, - "time_per_iteration": 2.758030891418457 - }, - { - "auxiliary_loss_clip": 0.01077162, - "auxiliary_loss_mlp": 0.01035315, - "balance_loss_clip": 1.03991926, - "balance_loss_mlp": 1.02056861, - "epoch": 0.6851645874041785, - "flos": 24351061800960.0, - "grad_norm": 1.836466665804894, - "language_loss": 0.73088896, - "learning_rate": 9.526046950148527e-07, - "loss": 0.75201374, - "num_input_tokens_seen": 246081465, - "step": 11396, - "time_per_iteration": 2.781780481338501 - }, - { - "auxiliary_loss_clip": 0.01084632, - "auxiliary_loss_mlp": 0.01031663, - "balance_loss_clip": 1.03876483, - "balance_loss_mlp": 1.01705348, - "epoch": 0.6852247106568465, - "flos": 15077849264640.0, - "grad_norm": 3.468595562954195, - "language_loss": 0.79397655, - "learning_rate": 9.522729308327931e-07, - "loss": 0.81513953, - "num_input_tokens_seen": 246096110, - "step": 11397, - "time_per_iteration": 2.759290933609009 - }, - { - "auxiliary_loss_clip": 0.01035311, - "auxiliary_loss_mlp": 0.01038237, - "balance_loss_clip": 1.03174019, - "balance_loss_mlp": 1.02346683, - "epoch": 0.6852848339095146, - "flos": 18770005278720.0, - "grad_norm": 1.7538021552670298, - "language_loss": 0.71620733, - "learning_rate": 9.519412063806493e-07, - "loss": 0.73694277, - "num_input_tokens_seen": 246114785, - "step": 11398, - "time_per_iteration": 2.8469512462615967 - }, - { - "auxiliary_loss_clip": 0.01063012, - "auxiliary_loss_mlp": 0.01031401, - "balance_loss_clip": 1.03693652, - "balance_loss_mlp": 1.01927114, - "epoch": 0.6853449571621825, - "flos": 27854363082240.0, - "grad_norm": 1.5995227781124475, - "language_loss": 0.70539916, - "learning_rate": 9.516095216709996e-07, - "loss": 0.72634327, - "num_input_tokens_seen": 246136375, - "step": 11399, - "time_per_iteration": 2.8067455291748047 - }, - { - "auxiliary_loss_clip": 0.01099638, - "auxiliary_loss_mlp": 0.01034955, - "balance_loss_clip": 1.03867149, - "balance_loss_mlp": 1.02175879, - "epoch": 0.6854050804148505, - "flos": 18150510389760.0, - "grad_norm": 1.5522963452984355, - "language_loss": 0.7023446, - "learning_rate": 9.512778767164217e-07, - "loss": 0.72369051, - "num_input_tokens_seen": 246155090, - "step": 11400, - "time_per_iteration": 4.245120286941528 - }, - { - "auxiliary_loss_clip": 0.01077599, - "auxiliary_loss_mlp": 0.01038032, - "balance_loss_clip": 1.04081964, - "balance_loss_mlp": 1.02140248, - "epoch": 0.6854652036675184, - "flos": 16326212492160.0, - "grad_norm": 2.0326109813245217, - "language_loss": 0.77846044, - "learning_rate": 9.509462715294927e-07, - "loss": 0.79961675, - "num_input_tokens_seen": 246172645, - "step": 11401, - "time_per_iteration": 2.758004665374756 - }, - { - "auxiliary_loss_clip": 0.01113766, - "auxiliary_loss_mlp": 0.01037682, - "balance_loss_clip": 1.04050303, - "balance_loss_mlp": 1.02477169, - "epoch": 0.6855253269201864, - "flos": 14940814878720.0, - "grad_norm": 1.868317908345602, - "language_loss": 0.75315881, - "learning_rate": 9.50614706122786e-07, - "loss": 0.77467334, - "num_input_tokens_seen": 246189055, - "step": 11402, - "time_per_iteration": 2.562199115753174 - }, - { - "auxiliary_loss_clip": 0.0109933, - "auxiliary_loss_mlp": 0.01041955, - "balance_loss_clip": 1.03740358, - "balance_loss_mlp": 1.02720892, - "epoch": 0.6855854501728543, - "flos": 23037736826880.0, - "grad_norm": 1.5371325501517963, - "language_loss": 0.72588831, - "learning_rate": 9.502831805088742e-07, - "loss": 0.74730122, - "num_input_tokens_seen": 246207990, - "step": 11403, - "time_per_iteration": 2.677266836166382 - }, - { - "auxiliary_loss_clip": 0.01114001, - "auxiliary_loss_mlp": 0.01034831, - "balance_loss_clip": 1.04157901, - "balance_loss_mlp": 1.02145004, - "epoch": 0.6856455734255223, - "flos": 13253623194240.0, - "grad_norm": 2.0747553095420175, - "language_loss": 0.81451255, - "learning_rate": 9.499516947003294e-07, - "loss": 0.83600086, - "num_input_tokens_seen": 246221595, - "step": 11404, - "time_per_iteration": 2.5857958793640137 - }, - { - "auxiliary_loss_clip": 0.01086293, - "auxiliary_loss_mlp": 0.01040032, - "balance_loss_clip": 1.0375222, - "balance_loss_mlp": 1.0263046, - "epoch": 0.6857056966781903, - "flos": 23333461499520.0, - "grad_norm": 1.3945702093947172, - "language_loss": 0.77848321, - "learning_rate": 9.496202487097222e-07, - "loss": 0.79974639, - "num_input_tokens_seen": 246242970, - "step": 11405, - "time_per_iteration": 2.743281364440918 - }, - { - "auxiliary_loss_clip": 0.01023454, - "auxiliary_loss_mlp": 0.00999881, - "balance_loss_clip": 1.00911474, - "balance_loss_mlp": 0.99873084, - "epoch": 0.6857658199308583, - "flos": 61852647784320.0, - "grad_norm": 0.7882286280493239, - "language_loss": 0.60976082, - "learning_rate": 9.492888425496199e-07, - "loss": 0.62999415, - "num_input_tokens_seen": 246300405, - "step": 11406, - "time_per_iteration": 3.236720085144043 - }, - { - "auxiliary_loss_clip": 0.01080565, - "auxiliary_loss_mlp": 0.01035523, - "balance_loss_clip": 1.03731775, - "balance_loss_mlp": 1.02062762, - "epoch": 0.6858259431835262, - "flos": 16654543735680.0, - "grad_norm": 1.6671355551751728, - "language_loss": 0.76914632, - "learning_rate": 9.489574762325907e-07, - "loss": 0.79030716, - "num_input_tokens_seen": 246318780, - "step": 11407, - "time_per_iteration": 2.7857916355133057 - }, - { - "auxiliary_loss_clip": 0.01092831, - "auxiliary_loss_mlp": 0.01039174, - "balance_loss_clip": 1.0389874, - "balance_loss_mlp": 1.02427292, - "epoch": 0.6858860664361942, - "flos": 21872974504320.0, - "grad_norm": 2.9798515710303572, - "language_loss": 0.71276259, - "learning_rate": 9.486261497711991e-07, - "loss": 0.7340827, - "num_input_tokens_seen": 246339405, - "step": 11408, - "time_per_iteration": 2.8327853679656982 - }, - { - "auxiliary_loss_clip": 0.01104322, - "auxiliary_loss_mlp": 0.01031824, - "balance_loss_clip": 1.03901792, - "balance_loss_mlp": 1.01819825, - "epoch": 0.6859461896888621, - "flos": 15267637751040.0, - "grad_norm": 1.7652749442295346, - "language_loss": 0.70438635, - "learning_rate": 9.482948631780087e-07, - "loss": 0.72574776, - "num_input_tokens_seen": 246357055, - "step": 11409, - "time_per_iteration": 2.6262388229370117 - }, - { - "auxiliary_loss_clip": 0.01069373, - "auxiliary_loss_mlp": 0.01029456, - "balance_loss_clip": 1.03979826, - "balance_loss_mlp": 1.01718974, - "epoch": 0.6860063129415301, - "flos": 18620293392000.0, - "grad_norm": 1.5800008029842278, - "language_loss": 0.78244615, - "learning_rate": 9.479636164655825e-07, - "loss": 0.80343449, - "num_input_tokens_seen": 246374050, - "step": 11410, - "time_per_iteration": 2.742436408996582 - }, - { - "auxiliary_loss_clip": 0.01104718, - "auxiliary_loss_mlp": 0.0103935, - "balance_loss_clip": 1.03746653, - "balance_loss_mlp": 1.02479458, - "epoch": 0.6860664361941982, - "flos": 23951376190080.0, - "grad_norm": 1.9022000774970669, - "language_loss": 0.71458399, - "learning_rate": 9.476324096464821e-07, - "loss": 0.73602462, - "num_input_tokens_seen": 246392910, - "step": 11411, - "time_per_iteration": 2.7334024906158447 - }, - { - "auxiliary_loss_clip": 0.01062107, - "auxiliary_loss_mlp": 0.01047011, - "balance_loss_clip": 1.03538156, - "balance_loss_mlp": 1.03152537, - "epoch": 0.6861265594468661, - "flos": 20407782827520.0, - "grad_norm": 2.454164229167477, - "language_loss": 0.70101523, - "learning_rate": 9.473012427332654e-07, - "loss": 0.7221064, - "num_input_tokens_seen": 246411540, - "step": 11412, - "time_per_iteration": 2.830611228942871 - }, - { - "auxiliary_loss_clip": 0.01114643, - "auxiliary_loss_mlp": 0.01034015, - "balance_loss_clip": 1.03966832, - "balance_loss_mlp": 1.02018094, - "epoch": 0.6861866826995341, - "flos": 11428571111040.0, - "grad_norm": 3.537487518671294, - "language_loss": 0.71493286, - "learning_rate": 9.469701157384919e-07, - "loss": 0.73641944, - "num_input_tokens_seen": 246423295, - "step": 11413, - "time_per_iteration": 2.5294950008392334 - }, - { - "auxiliary_loss_clip": 0.01104826, - "auxiliary_loss_mlp": 0.01034314, - "balance_loss_clip": 1.03952384, - "balance_loss_mlp": 1.02099848, - "epoch": 0.686246805952202, - "flos": 15997593939840.0, - "grad_norm": 1.8251318339835605, - "language_loss": 0.73947906, - "learning_rate": 9.466390286747164e-07, - "loss": 0.7608704, - "num_input_tokens_seen": 246441045, - "step": 11414, - "time_per_iteration": 2.5965075492858887 - }, - { - "auxiliary_loss_clip": 0.01090896, - "auxiliary_loss_mlp": 0.01033258, - "balance_loss_clip": 1.03907096, - "balance_loss_mlp": 1.01883936, - "epoch": 0.68630692920487, - "flos": 19826712512640.0, - "grad_norm": 2.434529931317787, - "language_loss": 0.8682794, - "learning_rate": 9.46307981554495e-07, - "loss": 0.88952088, - "num_input_tokens_seen": 246456905, - "step": 11415, - "time_per_iteration": 2.6476597785949707 - }, - { - "auxiliary_loss_clip": 0.01106277, - "auxiliary_loss_mlp": 0.01036888, - "balance_loss_clip": 1.04034388, - "balance_loss_mlp": 1.02316129, - "epoch": 0.6863670524575379, - "flos": 26286216048000.0, - "grad_norm": 1.8704963128355632, - "language_loss": 0.67290139, - "learning_rate": 9.459769743903801e-07, - "loss": 0.69433296, - "num_input_tokens_seen": 246477545, - "step": 11416, - "time_per_iteration": 2.658177137374878 - }, - { - "auxiliary_loss_clip": 0.01090013, - "auxiliary_loss_mlp": 0.01041407, - "balance_loss_clip": 1.03826094, - "balance_loss_mlp": 1.02668476, - "epoch": 0.686427175710206, - "flos": 19173138595200.0, - "grad_norm": 1.424267552511055, - "language_loss": 0.76128805, - "learning_rate": 9.456460071949237e-07, - "loss": 0.78260225, - "num_input_tokens_seen": 246496705, - "step": 11417, - "time_per_iteration": 2.6901679039001465 - }, - { - "auxiliary_loss_clip": 0.01087664, - "auxiliary_loss_mlp": 0.01036409, - "balance_loss_clip": 1.03694177, - "balance_loss_mlp": 1.02199018, - "epoch": 0.6864872989628739, - "flos": 18916628595840.0, - "grad_norm": 2.686574302160358, - "language_loss": 0.7732662, - "learning_rate": 9.45315079980678e-07, - "loss": 0.79450691, - "num_input_tokens_seen": 246514860, - "step": 11418, - "time_per_iteration": 2.755699872970581 - }, - { - "auxiliary_loss_clip": 0.01066399, - "auxiliary_loss_mlp": 0.01031853, - "balance_loss_clip": 1.03764701, - "balance_loss_mlp": 1.01901984, - "epoch": 0.6865474222155419, - "flos": 25956196865280.0, - "grad_norm": 1.6068340325317958, - "language_loss": 0.76434135, - "learning_rate": 9.449841927601887e-07, - "loss": 0.78532386, - "num_input_tokens_seen": 246536145, - "step": 11419, - "time_per_iteration": 2.865663766860962 - }, - { - "auxiliary_loss_clip": 0.01111545, - "auxiliary_loss_mlp": 0.0103699, - "balance_loss_clip": 1.03836358, - "balance_loss_mlp": 1.02422845, - "epoch": 0.6866075454682098, - "flos": 18478087447680.0, - "grad_norm": 2.1745569847310624, - "language_loss": 0.71438152, - "learning_rate": 9.446533455460044e-07, - "loss": 0.7358669, - "num_input_tokens_seen": 246553265, - "step": 11420, - "time_per_iteration": 2.6367876529693604 - }, - { - "auxiliary_loss_clip": 0.01071734, - "auxiliary_loss_mlp": 0.01035393, - "balance_loss_clip": 1.03420091, - "balance_loss_mlp": 1.02145147, - "epoch": 0.6866676687208778, - "flos": 34239998298240.0, - "grad_norm": 1.4612378577280256, - "language_loss": 0.74987674, - "learning_rate": 9.443225383506712e-07, - "loss": 0.77094799, - "num_input_tokens_seen": 246575130, - "step": 11421, - "time_per_iteration": 2.905451774597168 - }, - { - "auxiliary_loss_clip": 0.01099049, - "auxiliary_loss_mlp": 0.01031312, - "balance_loss_clip": 1.03840101, - "balance_loss_mlp": 1.01820481, - "epoch": 0.6867277919735457, - "flos": 21721754246400.0, - "grad_norm": 1.8216780462844224, - "language_loss": 0.76901162, - "learning_rate": 9.439917711867338e-07, - "loss": 0.79031521, - "num_input_tokens_seen": 246593095, - "step": 11422, - "time_per_iteration": 2.7650146484375 - }, - { - "auxiliary_loss_clip": 0.01107124, - "auxiliary_loss_mlp": 0.01039503, - "balance_loss_clip": 1.04101586, - "balance_loss_mlp": 1.02516782, - "epoch": 0.6867879152262137, - "flos": 24097999507200.0, - "grad_norm": 1.6784649507913934, - "language_loss": 0.77082187, - "learning_rate": 9.436610440667334e-07, - "loss": 0.79228812, - "num_input_tokens_seen": 246612165, - "step": 11423, - "time_per_iteration": 2.8607351779937744 - }, - { - "auxiliary_loss_clip": 0.01082395, - "auxiliary_loss_mlp": 0.01033174, - "balance_loss_clip": 1.03814936, - "balance_loss_mlp": 1.01943564, - "epoch": 0.6868480384788818, - "flos": 21615818060160.0, - "grad_norm": 1.4803784362494392, - "language_loss": 0.72793746, - "learning_rate": 9.433303570032129e-07, - "loss": 0.74909317, - "num_input_tokens_seen": 246632065, - "step": 11424, - "time_per_iteration": 2.8126673698425293 - }, - { - "auxiliary_loss_clip": 0.01092944, - "auxiliary_loss_mlp": 0.0103129, - "balance_loss_clip": 1.03935122, - "balance_loss_mlp": 1.01783705, - "epoch": 0.6869081617315497, - "flos": 26286144220800.0, - "grad_norm": 1.8921444035478678, - "language_loss": 0.65257877, - "learning_rate": 9.429997100087112e-07, - "loss": 0.67382109, - "num_input_tokens_seen": 246651245, - "step": 11425, - "time_per_iteration": 2.7407920360565186 - }, - { - "auxiliary_loss_clip": 0.01073701, - "auxiliary_loss_mlp": 0.01028234, - "balance_loss_clip": 1.03880644, - "balance_loss_mlp": 1.01543677, - "epoch": 0.6869682849842177, - "flos": 21105096531840.0, - "grad_norm": 1.3754232219458198, - "language_loss": 0.71719813, - "learning_rate": 9.426691030957657e-07, - "loss": 0.73821747, - "num_input_tokens_seen": 246672225, - "step": 11426, - "time_per_iteration": 2.821906089782715 - }, - { - "auxiliary_loss_clip": 0.01060498, - "auxiliary_loss_mlp": 0.01034142, - "balance_loss_clip": 1.03450513, - "balance_loss_mlp": 1.02006936, - "epoch": 0.6870284082368856, - "flos": 17092653920640.0, - "grad_norm": 2.015308300418605, - "language_loss": 0.84978002, - "learning_rate": 9.423385362769136e-07, - "loss": 0.87072647, - "num_input_tokens_seen": 246688385, - "step": 11427, - "time_per_iteration": 2.769426107406616 - }, - { - "auxiliary_loss_clip": 0.01100329, - "auxiliary_loss_mlp": 0.01033991, - "balance_loss_clip": 1.03816628, - "balance_loss_mlp": 1.02096152, - "epoch": 0.6870885314895536, - "flos": 27308090067840.0, - "grad_norm": 1.7434629559161423, - "language_loss": 0.76254469, - "learning_rate": 9.420080095646909e-07, - "loss": 0.78388786, - "num_input_tokens_seen": 246710730, - "step": 11428, - "time_per_iteration": 2.708268165588379 - }, - { - "auxiliary_loss_clip": 0.01079241, - "auxiliary_loss_mlp": 0.01041381, - "balance_loss_clip": 1.03692877, - "balance_loss_mlp": 1.02690864, - "epoch": 0.6871486547422215, - "flos": 20814543417600.0, - "grad_norm": 2.419770929596293, - "language_loss": 0.73118293, - "learning_rate": 9.4167752297163e-07, - "loss": 0.75238913, - "num_input_tokens_seen": 246730350, - "step": 11429, - "time_per_iteration": 2.7956650257110596 - }, - { - "auxiliary_loss_clip": 0.01089951, - "auxiliary_loss_mlp": 0.01029631, - "balance_loss_clip": 1.03737235, - "balance_loss_mlp": 1.01640451, - "epoch": 0.6872087779948896, - "flos": 30154118330880.0, - "grad_norm": 1.9861165427275798, - "language_loss": 0.83426887, - "learning_rate": 9.413470765102643e-07, - "loss": 0.8554647, - "num_input_tokens_seen": 246751700, - "step": 11430, - "time_per_iteration": 4.525273084640503 - }, - { - "auxiliary_loss_clip": 0.01105193, - "auxiliary_loss_mlp": 0.01038794, - "balance_loss_clip": 1.03941309, - "balance_loss_mlp": 1.02549052, - "epoch": 0.6872689012475575, - "flos": 20704584908160.0, - "grad_norm": 2.206529961181577, - "language_loss": 0.7042433, - "learning_rate": 9.410166701931225e-07, - "loss": 0.72568321, - "num_input_tokens_seen": 246769860, - "step": 11431, - "time_per_iteration": 2.6291654109954834 - }, - { - "auxiliary_loss_clip": 0.01093068, - "auxiliary_loss_mlp": 0.00771593, - "balance_loss_clip": 1.03726888, - "balance_loss_mlp": 1.0001148, - "epoch": 0.6873290245002255, - "flos": 25520852027520.0, - "grad_norm": 1.7240375281978666, - "language_loss": 0.80058414, - "learning_rate": 9.406863040327355e-07, - "loss": 0.81923079, - "num_input_tokens_seen": 246789905, - "step": 11432, - "time_per_iteration": 2.7238457202911377 - }, - { - "auxiliary_loss_clip": 0.01089362, - "auxiliary_loss_mlp": 0.01029871, - "balance_loss_clip": 1.03868675, - "balance_loss_mlp": 1.01700783, - "epoch": 0.6873891477528934, - "flos": 25191479289600.0, - "grad_norm": 1.5401718085798923, - "language_loss": 0.67718959, - "learning_rate": 9.403559780416295e-07, - "loss": 0.6983819, - "num_input_tokens_seen": 246808815, - "step": 11433, - "time_per_iteration": 4.300631999969482 - }, - { - "auxiliary_loss_clip": 0.01108222, - "auxiliary_loss_mlp": 0.01044912, - "balance_loss_clip": 1.04331732, - "balance_loss_mlp": 1.03123283, - "epoch": 0.6874492710055614, - "flos": 35152380685440.0, - "grad_norm": 1.9633714481574007, - "language_loss": 0.73058158, - "learning_rate": 9.400256922323309e-07, - "loss": 0.75211298, - "num_input_tokens_seen": 246829775, - "step": 11434, - "time_per_iteration": 4.712211608886719 - }, - { - "auxiliary_loss_clip": 0.0107867, - "auxiliary_loss_mlp": 0.01034388, - "balance_loss_clip": 1.04082966, - "balance_loss_mlp": 1.02101231, - "epoch": 0.6875093942582293, - "flos": 17822215059840.0, - "grad_norm": 1.6101742183302694, - "language_loss": 0.80406773, - "learning_rate": 9.396954466173657e-07, - "loss": 0.82519835, - "num_input_tokens_seen": 246848045, - "step": 11435, - "time_per_iteration": 2.644397735595703 - }, - { - "auxiliary_loss_clip": 0.01116024, - "auxiliary_loss_mlp": 0.01035166, - "balance_loss_clip": 1.04015982, - "balance_loss_mlp": 1.02111077, - "epoch": 0.6875695175108973, - "flos": 20704548994560.0, - "grad_norm": 3.274458448067563, - "language_loss": 0.81117046, - "learning_rate": 9.393652412092538e-07, - "loss": 0.83268237, - "num_input_tokens_seen": 246866095, - "step": 11436, - "time_per_iteration": 2.600048303604126 - }, - { - "auxiliary_loss_clip": 0.0106725, - "auxiliary_loss_mlp": 0.0104019, - "balance_loss_clip": 1.03428948, - "balance_loss_mlp": 1.02743411, - "epoch": 0.6876296407635654, - "flos": 25374013228800.0, - "grad_norm": 1.9842620224172498, - "language_loss": 0.82207173, - "learning_rate": 9.390350760205183e-07, - "loss": 0.84314615, - "num_input_tokens_seen": 246883975, - "step": 11437, - "time_per_iteration": 2.7188313007354736 - }, - { - "auxiliary_loss_clip": 0.01097489, - "auxiliary_loss_mlp": 0.01042761, - "balance_loss_clip": 1.03876507, - "balance_loss_mlp": 1.02794886, - "epoch": 0.6876897640162333, - "flos": 23222317841280.0, - "grad_norm": 4.685984752688369, - "language_loss": 0.78381348, - "learning_rate": 9.387049510636793e-07, - "loss": 0.80521595, - "num_input_tokens_seen": 246901560, - "step": 11438, - "time_per_iteration": 2.6525228023529053 - }, - { - "auxiliary_loss_clip": 0.01108734, - "auxiliary_loss_mlp": 0.0103476, - "balance_loss_clip": 1.03871489, - "balance_loss_mlp": 1.02167058, - "epoch": 0.6877498872689013, - "flos": 27124335066240.0, - "grad_norm": 1.647979155501369, - "language_loss": 0.72087812, - "learning_rate": 9.383748663512554e-07, - "loss": 0.74231309, - "num_input_tokens_seen": 246922655, - "step": 11439, - "time_per_iteration": 4.218140363693237 - }, - { - "auxiliary_loss_clip": 0.01101936, - "auxiliary_loss_mlp": 0.01030006, - "balance_loss_clip": 1.03944337, - "balance_loss_mlp": 1.01658285, - "epoch": 0.6878100105215692, - "flos": 11581658876160.0, - "grad_norm": 1.9534001671179906, - "language_loss": 0.75862855, - "learning_rate": 9.380448218957623e-07, - "loss": 0.779948, - "num_input_tokens_seen": 246940100, - "step": 11440, - "time_per_iteration": 2.580472946166992 - }, - { - "auxiliary_loss_clip": 0.01066967, - "auxiliary_loss_mlp": 0.01040415, - "balance_loss_clip": 1.03528094, - "balance_loss_mlp": 1.02684307, - "epoch": 0.6878701337742372, - "flos": 20303175444480.0, - "grad_norm": 7.861818924260737, - "language_loss": 0.71750253, - "learning_rate": 9.377148177097167e-07, - "loss": 0.73857641, - "num_input_tokens_seen": 246958545, - "step": 11441, - "time_per_iteration": 2.706754207611084 - }, - { - "auxiliary_loss_clip": 0.01074524, - "auxiliary_loss_mlp": 0.01043281, - "balance_loss_clip": 1.03488159, - "balance_loss_mlp": 1.02677059, - "epoch": 0.6879302570269051, - "flos": 13840080549120.0, - "grad_norm": 1.6357806540454092, - "language_loss": 0.66401327, - "learning_rate": 9.373848538056317e-07, - "loss": 0.68519139, - "num_input_tokens_seen": 246974805, - "step": 11442, - "time_per_iteration": 2.7559654712677 - }, - { - "auxiliary_loss_clip": 0.0109822, - "auxiliary_loss_mlp": 0.01033105, - "balance_loss_clip": 1.03951812, - "balance_loss_mlp": 1.02001595, - "epoch": 0.6879903802795732, - "flos": 21324654414720.0, - "grad_norm": 4.42004898936703, - "language_loss": 0.69321597, - "learning_rate": 9.370549301960189e-07, - "loss": 0.71452922, - "num_input_tokens_seen": 246992505, - "step": 11443, - "time_per_iteration": 2.6616227626800537 - }, - { - "auxiliary_loss_clip": 0.0109609, - "auxiliary_loss_mlp": 0.01035788, - "balance_loss_clip": 1.03986192, - "balance_loss_mlp": 1.02196562, - "epoch": 0.6880505035322411, - "flos": 25152049134720.0, - "grad_norm": 2.6937329387099784, - "language_loss": 0.76372284, - "learning_rate": 9.367250468933893e-07, - "loss": 0.78504163, - "num_input_tokens_seen": 247013370, - "step": 11444, - "time_per_iteration": 2.8355183601379395 - }, - { - "auxiliary_loss_clip": 0.01110169, - "auxiliary_loss_mlp": 0.01032915, - "balance_loss_clip": 1.03819597, - "balance_loss_mlp": 1.02007592, - "epoch": 0.6881106267849091, - "flos": 23215530170880.0, - "grad_norm": 2.350463307106156, - "language_loss": 0.76555073, - "learning_rate": 9.363952039102536e-07, - "loss": 0.78698158, - "num_input_tokens_seen": 247029855, - "step": 11445, - "time_per_iteration": 2.567321300506592 - }, - { - "auxiliary_loss_clip": 0.01022025, - "auxiliary_loss_mlp": 0.01003467, - "balance_loss_clip": 1.00763083, - "balance_loss_mlp": 1.00232887, - "epoch": 0.688170750037577, - "flos": 48484397312640.0, - "grad_norm": 0.815591807379434, - "language_loss": 0.58349764, - "learning_rate": 9.360654012591183e-07, - "loss": 0.60375261, - "num_input_tokens_seen": 247085030, - "step": 11446, - "time_per_iteration": 3.1823232173919678 - }, - { - "auxiliary_loss_clip": 0.01102524, - "auxiliary_loss_mlp": 0.01031252, - "balance_loss_clip": 1.03622508, - "balance_loss_mlp": 1.01726246, - "epoch": 0.688230873290245, - "flos": 22783633038720.0, - "grad_norm": 1.4577025181029204, - "language_loss": 0.75851154, - "learning_rate": 9.357356389524886e-07, - "loss": 0.77984923, - "num_input_tokens_seen": 247104840, - "step": 11447, - "time_per_iteration": 2.6292076110839844 - }, - { - "auxiliary_loss_clip": 0.01092756, - "auxiliary_loss_mlp": 0.01038788, - "balance_loss_clip": 1.0371995, - "balance_loss_mlp": 1.02566266, - "epoch": 0.6882909965429129, - "flos": 22455660931200.0, - "grad_norm": 1.9523079882919305, - "language_loss": 0.73051161, - "learning_rate": 9.354059170028705e-07, - "loss": 0.75182706, - "num_input_tokens_seen": 247121905, - "step": 11448, - "time_per_iteration": 2.6177000999450684 - }, - { - "auxiliary_loss_clip": 0.01100637, - "auxiliary_loss_mlp": 0.01044689, - "balance_loss_clip": 1.0369277, - "balance_loss_mlp": 1.02910876, - "epoch": 0.688351119795581, - "flos": 26214143408640.0, - "grad_norm": 1.5228707353550825, - "language_loss": 0.74738759, - "learning_rate": 9.350762354227673e-07, - "loss": 0.76884079, - "num_input_tokens_seen": 247142375, - "step": 11449, - "time_per_iteration": 2.601680040359497 - }, - { - "auxiliary_loss_clip": 0.01111281, - "auxiliary_loss_mlp": 0.01034157, - "balance_loss_clip": 1.03867829, - "balance_loss_mlp": 1.02147889, - "epoch": 0.6884112430482489, - "flos": 22565260304640.0, - "grad_norm": 3.717332242852324, - "language_loss": 0.69703102, - "learning_rate": 9.34746594224679e-07, - "loss": 0.71848536, - "num_input_tokens_seen": 247161095, - "step": 11450, - "time_per_iteration": 2.664257764816284 - }, - { - "auxiliary_loss_clip": 0.0107707, - "auxiliary_loss_mlp": 0.01038466, - "balance_loss_clip": 1.03789186, - "balance_loss_mlp": 1.02427959, - "epoch": 0.6884713663009169, - "flos": 17341047446400.0, - "grad_norm": 1.8597549906829547, - "language_loss": 0.75942892, - "learning_rate": 9.344169934211068e-07, - "loss": 0.78058428, - "num_input_tokens_seen": 247178565, - "step": 11451, - "time_per_iteration": 2.6398167610168457 - }, - { - "auxiliary_loss_clip": 0.01101483, - "auxiliary_loss_mlp": 0.010314, - "balance_loss_clip": 1.03904259, - "balance_loss_mlp": 1.01854348, - "epoch": 0.6885314895535849, - "flos": 26470832976000.0, - "grad_norm": 1.4408172988825247, - "language_loss": 0.69557142, - "learning_rate": 9.340874330245505e-07, - "loss": 0.71690023, - "num_input_tokens_seen": 247202345, - "step": 11452, - "time_per_iteration": 2.6441712379455566 - }, - { - "auxiliary_loss_clip": 0.01112297, - "auxiliary_loss_mlp": 0.01036775, - "balance_loss_clip": 1.03905725, - "balance_loss_mlp": 1.02143824, - "epoch": 0.6885916128062528, - "flos": 20521548178560.0, - "grad_norm": 1.603751678545201, - "language_loss": 0.71996975, - "learning_rate": 9.337579130475042e-07, - "loss": 0.74146044, - "num_input_tokens_seen": 247219240, - "step": 11453, - "time_per_iteration": 2.564039707183838 - }, - { - "auxiliary_loss_clip": 0.010232, - "auxiliary_loss_mlp": 0.0075158, - "balance_loss_clip": 1.00928593, - "balance_loss_mlp": 0.99959499, - "epoch": 0.6886517360589208, - "flos": 70715795679360.0, - "grad_norm": 0.7798992537715281, - "language_loss": 0.50685745, - "learning_rate": 9.334284335024644e-07, - "loss": 0.52460527, - "num_input_tokens_seen": 247272010, - "step": 11454, - "time_per_iteration": 3.016122341156006 - }, - { - "auxiliary_loss_clip": 0.01097098, - "auxiliary_loss_mlp": 0.01035719, - "balance_loss_clip": 1.03854132, - "balance_loss_mlp": 1.02329731, - "epoch": 0.6887118593115887, - "flos": 17893533513600.0, - "grad_norm": 2.526020135416449, - "language_loss": 0.75680363, - "learning_rate": 9.330989944019263e-07, - "loss": 0.77813178, - "num_input_tokens_seen": 247290630, - "step": 11455, - "time_per_iteration": 2.7730109691619873 - }, - { - "auxiliary_loss_clip": 0.01092116, - "auxiliary_loss_mlp": 0.0103676, - "balance_loss_clip": 1.03623128, - "balance_loss_mlp": 1.02249074, - "epoch": 0.6887719825642568, - "flos": 17453017117440.0, - "grad_norm": 2.7328430061690154, - "language_loss": 0.7254653, - "learning_rate": 9.327695957583803e-07, - "loss": 0.74675405, - "num_input_tokens_seen": 247304800, - "step": 11456, - "time_per_iteration": 2.7660651206970215 - }, - { - "auxiliary_loss_clip": 0.0108935, - "auxiliary_loss_mlp": 0.01035233, - "balance_loss_clip": 1.03873277, - "balance_loss_mlp": 1.02247739, - "epoch": 0.6888321058169247, - "flos": 23070199743360.0, - "grad_norm": 2.090937721500204, - "language_loss": 0.81322861, - "learning_rate": 9.32440237584319e-07, - "loss": 0.83447444, - "num_input_tokens_seen": 247323450, - "step": 11457, - "time_per_iteration": 2.691455841064453 - }, - { - "auxiliary_loss_clip": 0.01105328, - "auxiliary_loss_mlp": 0.00771348, - "balance_loss_clip": 1.04052448, - "balance_loss_mlp": 1.00017715, - "epoch": 0.6888922290695927, - "flos": 23368833417600.0, - "grad_norm": 1.548184255846192, - "language_loss": 0.76552927, - "learning_rate": 9.321109198922301e-07, - "loss": 0.78429604, - "num_input_tokens_seen": 247343845, - "step": 11458, - "time_per_iteration": 2.6362695693969727 - }, - { - "auxiliary_loss_clip": 0.01113281, - "auxiliary_loss_mlp": 0.01034444, - "balance_loss_clip": 1.03937232, - "balance_loss_mlp": 1.02138472, - "epoch": 0.6889523523222606, - "flos": 17631636474240.0, - "grad_norm": 2.7369612879197986, - "language_loss": 0.67654693, - "learning_rate": 9.31781642694603e-07, - "loss": 0.69802415, - "num_input_tokens_seen": 247356650, - "step": 11459, - "time_per_iteration": 2.6157007217407227 - }, - { - "auxiliary_loss_clip": 0.01064164, - "auxiliary_loss_mlp": 0.01032258, - "balance_loss_clip": 1.03582239, - "balance_loss_mlp": 1.01958048, - "epoch": 0.6890124755749286, - "flos": 25228144097280.0, - "grad_norm": 1.4844709645177188, - "language_loss": 0.68446231, - "learning_rate": 9.314524060039221e-07, - "loss": 0.70542651, - "num_input_tokens_seen": 247377340, - "step": 11460, - "time_per_iteration": 2.7714388370513916 - }, - { - "auxiliary_loss_clip": 0.01087273, - "auxiliary_loss_mlp": 0.01033379, - "balance_loss_clip": 1.03934288, - "balance_loss_mlp": 1.01844215, - "epoch": 0.6890725988275965, - "flos": 20230240878720.0, - "grad_norm": 1.8579339278918177, - "language_loss": 0.77017105, - "learning_rate": 9.311232098326731e-07, - "loss": 0.7913776, - "num_input_tokens_seen": 247395805, - "step": 11461, - "time_per_iteration": 2.7195050716400146 - }, - { - "auxiliary_loss_clip": 0.01091784, - "auxiliary_loss_mlp": 0.01037341, - "balance_loss_clip": 1.03789628, - "balance_loss_mlp": 1.02331018, - "epoch": 0.6891327220802645, - "flos": 14535311264640.0, - "grad_norm": 1.7919419635412812, - "language_loss": 0.6962589, - "learning_rate": 9.307940541933401e-07, - "loss": 0.71755016, - "num_input_tokens_seen": 247413165, - "step": 11462, - "time_per_iteration": 2.695122718811035 - }, - { - "auxiliary_loss_clip": 0.01105224, - "auxiliary_loss_mlp": 0.01028927, - "balance_loss_clip": 1.04118133, - "balance_loss_mlp": 1.01500297, - "epoch": 0.6891928453329325, - "flos": 21139139646720.0, - "grad_norm": 1.4465330715019271, - "language_loss": 0.8737253, - "learning_rate": 9.304649390984034e-07, - "loss": 0.89506674, - "num_input_tokens_seen": 247433140, - "step": 11463, - "time_per_iteration": 2.746290922164917 - }, - { - "auxiliary_loss_clip": 0.01064548, - "auxiliary_loss_mlp": 0.01030124, - "balance_loss_clip": 1.04010975, - "balance_loss_mlp": 1.01829851, - "epoch": 0.6892529685856005, - "flos": 17858520731520.0, - "grad_norm": 1.5297822834727555, - "language_loss": 0.68426907, - "learning_rate": 9.301358645603428e-07, - "loss": 0.70521581, - "num_input_tokens_seen": 247451265, - "step": 11464, - "time_per_iteration": 2.8325612545013428 - }, - { - "auxiliary_loss_clip": 0.01102764, - "auxiliary_loss_mlp": 0.01040883, - "balance_loss_clip": 1.03917408, - "balance_loss_mlp": 1.02711463, - "epoch": 0.6893130918382685, - "flos": 29934811843200.0, - "grad_norm": 2.288958108481903, - "language_loss": 0.65110016, - "learning_rate": 9.298068305916373e-07, - "loss": 0.67253661, - "num_input_tokens_seen": 247471645, - "step": 11465, - "time_per_iteration": 2.815046787261963 - }, - { - "auxiliary_loss_clip": 0.01104457, - "auxiliary_loss_mlp": 0.01038209, - "balance_loss_clip": 1.03854775, - "balance_loss_mlp": 1.02463746, - "epoch": 0.6893732150909364, - "flos": 24388516707840.0, - "grad_norm": 1.3495813204241554, - "language_loss": 0.72669965, - "learning_rate": 9.294778372047649e-07, - "loss": 0.74812633, - "num_input_tokens_seen": 247491170, - "step": 11466, - "time_per_iteration": 2.671194314956665 - }, - { - "auxiliary_loss_clip": 0.01114766, - "auxiliary_loss_mlp": 0.01034591, - "balance_loss_clip": 1.04005003, - "balance_loss_mlp": 1.02122736, - "epoch": 0.6894333383436044, - "flos": 16982874979200.0, - "grad_norm": 1.6856701084963044, - "language_loss": 0.71847236, - "learning_rate": 9.291488844121995e-07, - "loss": 0.73996592, - "num_input_tokens_seen": 247509005, - "step": 11467, - "time_per_iteration": 2.759052276611328 - }, - { - "auxiliary_loss_clip": 0.01096068, - "auxiliary_loss_mlp": 0.01036799, - "balance_loss_clip": 1.03972626, - "balance_loss_mlp": 1.02171886, - "epoch": 0.6894934615962723, - "flos": 18985540838400.0, - "grad_norm": 1.978085572567592, - "language_loss": 0.80877995, - "learning_rate": 9.288199722264156e-07, - "loss": 0.83010864, - "num_input_tokens_seen": 247527050, - "step": 11468, - "time_per_iteration": 2.8261470794677734 - }, - { - "auxiliary_loss_clip": 0.01116061, - "auxiliary_loss_mlp": 0.01034579, - "balance_loss_clip": 1.04050148, - "balance_loss_mlp": 1.02103066, - "epoch": 0.6895535848489404, - "flos": 34531664734080.0, - "grad_norm": 1.489529294726542, - "language_loss": 0.66164148, - "learning_rate": 9.284911006598875e-07, - "loss": 0.68314791, - "num_input_tokens_seen": 247547765, - "step": 11469, - "time_per_iteration": 5.082685232162476 - }, - { - "auxiliary_loss_clip": 0.01023211, - "auxiliary_loss_mlp": 0.01004328, - "balance_loss_clip": 1.00959301, - "balance_loss_mlp": 1.00309992, - "epoch": 0.6896137081016083, - "flos": 50075852273280.0, - "grad_norm": 0.7983802511717295, - "language_loss": 0.55211931, - "learning_rate": 9.281622697250824e-07, - "loss": 0.57239467, - "num_input_tokens_seen": 247603515, - "step": 11470, - "time_per_iteration": 3.123518228530884 - }, - { - "auxiliary_loss_clip": 0.01098666, - "auxiliary_loss_mlp": 0.01034034, - "balance_loss_clip": 1.03866851, - "balance_loss_mlp": 1.02299523, - "epoch": 0.6896738313542763, - "flos": 19938215306880.0, - "grad_norm": 1.7748421149249738, - "language_loss": 0.78111279, - "learning_rate": 9.278334794344715e-07, - "loss": 0.80243975, - "num_input_tokens_seen": 247622110, - "step": 11471, - "time_per_iteration": 2.6707584857940674 - }, - { - "auxiliary_loss_clip": 0.01088217, - "auxiliary_loss_mlp": 0.01034739, - "balance_loss_clip": 1.03501463, - "balance_loss_mlp": 1.02104771, - "epoch": 0.6897339546069442, - "flos": 21725489260800.0, - "grad_norm": 1.724757958239778, - "language_loss": 0.78451025, - "learning_rate": 9.275047298005232e-07, - "loss": 0.80573976, - "num_input_tokens_seen": 247641905, - "step": 11472, - "time_per_iteration": 4.256728887557983 - }, - { - "auxiliary_loss_clip": 0.01081643, - "auxiliary_loss_mlp": 0.01032054, - "balance_loss_clip": 1.03641033, - "balance_loss_mlp": 1.0195905, - "epoch": 0.6897940778596122, - "flos": 19826497031040.0, - "grad_norm": 1.5995976978854818, - "language_loss": 0.76272285, - "learning_rate": 9.271760208357024e-07, - "loss": 0.78385979, - "num_input_tokens_seen": 247660945, - "step": 11473, - "time_per_iteration": 4.321485757827759 - }, - { - "auxiliary_loss_clip": 0.01070517, - "auxiliary_loss_mlp": 0.0105009, - "balance_loss_clip": 1.03430462, - "balance_loss_mlp": 1.03352571, - "epoch": 0.6898542011122801, - "flos": 17310056987520.0, - "grad_norm": 1.7861232918293928, - "language_loss": 0.75359839, - "learning_rate": 9.268473525524751e-07, - "loss": 0.77480447, - "num_input_tokens_seen": 247678395, - "step": 11474, - "time_per_iteration": 2.788238525390625 - }, - { - "auxiliary_loss_clip": 0.01068006, - "auxiliary_loss_mlp": 0.01032395, - "balance_loss_clip": 1.04364872, - "balance_loss_mlp": 1.01921058, - "epoch": 0.6899143243649482, - "flos": 24754051463040.0, - "grad_norm": 1.4663281053614279, - "language_loss": 0.74502885, - "learning_rate": 9.26518724963303e-07, - "loss": 0.76603287, - "num_input_tokens_seen": 247698380, - "step": 11475, - "time_per_iteration": 2.878188371658325 - }, - { - "auxiliary_loss_clip": 0.01084179, - "auxiliary_loss_mlp": 0.01035391, - "balance_loss_clip": 1.03779638, - "balance_loss_mlp": 1.02154493, - "epoch": 0.6899744476176161, - "flos": 17234536642560.0, - "grad_norm": 1.9957028062650322, - "language_loss": 0.88603026, - "learning_rate": 9.261901380806491e-07, - "loss": 0.90722603, - "num_input_tokens_seen": 247716370, - "step": 11476, - "time_per_iteration": 2.7922370433807373 - }, - { - "auxiliary_loss_clip": 0.01112551, - "auxiliary_loss_mlp": 0.01037017, - "balance_loss_clip": 1.03934443, - "balance_loss_mlp": 1.02450645, - "epoch": 0.6900345708702841, - "flos": 25410678036480.0, - "grad_norm": 1.5288697914631357, - "language_loss": 0.70166922, - "learning_rate": 9.258615919169724e-07, - "loss": 0.72316492, - "num_input_tokens_seen": 247737335, - "step": 11477, - "time_per_iteration": 2.780515193939209 - }, - { - "auxiliary_loss_clip": 0.01107191, - "auxiliary_loss_mlp": 0.01045383, - "balance_loss_clip": 1.03964376, - "balance_loss_mlp": 1.03081584, - "epoch": 0.6900946941229521, - "flos": 23434190213760.0, - "grad_norm": 2.1987152086234723, - "language_loss": 0.68323863, - "learning_rate": 9.255330864847313e-07, - "loss": 0.70476437, - "num_input_tokens_seen": 247756680, - "step": 11478, - "time_per_iteration": 4.340089559555054 - }, - { - "auxiliary_loss_clip": 0.01104632, - "auxiliary_loss_mlp": 0.0103447, - "balance_loss_clip": 1.04020643, - "balance_loss_mlp": 1.02203012, - "epoch": 0.69015481737562, - "flos": 17820096157440.0, - "grad_norm": 1.918426525633328, - "language_loss": 0.76238775, - "learning_rate": 9.252046217963843e-07, - "loss": 0.78377873, - "num_input_tokens_seen": 247774265, - "step": 11479, - "time_per_iteration": 2.7662193775177 - }, - { - "auxiliary_loss_clip": 0.01104072, - "auxiliary_loss_mlp": 0.01030752, - "balance_loss_clip": 1.03842354, - "balance_loss_mlp": 1.01716816, - "epoch": 0.690214940628288, - "flos": 17456500736640.0, - "grad_norm": 1.8031624410020608, - "language_loss": 0.78769386, - "learning_rate": 9.248761978643856e-07, - "loss": 0.8090421, - "num_input_tokens_seen": 247792395, - "step": 11480, - "time_per_iteration": 2.6917519569396973 - }, - { - "auxiliary_loss_clip": 0.01071212, - "auxiliary_loss_mlp": 0.01033409, - "balance_loss_clip": 1.03474808, - "balance_loss_mlp": 1.01971793, - "epoch": 0.6902750638809559, - "flos": 29566691308800.0, - "grad_norm": 2.1117215547556922, - "language_loss": 0.75273913, - "learning_rate": 9.245478147011885e-07, - "loss": 0.77378535, - "num_input_tokens_seen": 247811985, - "step": 11481, - "time_per_iteration": 2.914005994796753 - }, - { - "auxiliary_loss_clip": 0.01078232, - "auxiliary_loss_mlp": 0.01031748, - "balance_loss_clip": 1.03950965, - "balance_loss_mlp": 1.01795578, - "epoch": 0.690335187133624, - "flos": 25557121785600.0, - "grad_norm": 1.8140875397528662, - "language_loss": 0.69146681, - "learning_rate": 9.24219472319246e-07, - "loss": 0.71256661, - "num_input_tokens_seen": 247831880, - "step": 11482, - "time_per_iteration": 2.888972759246826 - }, - { - "auxiliary_loss_clip": 0.01114892, - "auxiliary_loss_mlp": 0.01033455, - "balance_loss_clip": 1.04087675, - "balance_loss_mlp": 1.02031827, - "epoch": 0.6903953103862919, - "flos": 22488447070080.0, - "grad_norm": 1.4863828280794367, - "language_loss": 0.82752049, - "learning_rate": 9.238911707310096e-07, - "loss": 0.84900403, - "num_input_tokens_seen": 247851170, - "step": 11483, - "time_per_iteration": 2.6664347648620605 - }, - { - "auxiliary_loss_clip": 0.01116625, - "auxiliary_loss_mlp": 0.01030991, - "balance_loss_clip": 1.0412333, - "balance_loss_mlp": 1.01880169, - "epoch": 0.6904554336389599, - "flos": 26100521712000.0, - "grad_norm": 1.9326210731008662, - "language_loss": 0.65550387, - "learning_rate": 9.235629099489273e-07, - "loss": 0.67697996, - "num_input_tokens_seen": 247868950, - "step": 11484, - "time_per_iteration": 2.629709005355835 - }, - { - "auxiliary_loss_clip": 0.01079245, - "auxiliary_loss_mlp": 0.01044412, - "balance_loss_clip": 1.03618813, - "balance_loss_mlp": 1.03035724, - "epoch": 0.6905155568916278, - "flos": 31171754545920.0, - "grad_norm": 1.4648771757692296, - "language_loss": 0.7359699, - "learning_rate": 9.232346899854479e-07, - "loss": 0.75720656, - "num_input_tokens_seen": 247889805, - "step": 11485, - "time_per_iteration": 2.780137300491333 - }, - { - "auxiliary_loss_clip": 0.01100883, - "auxiliary_loss_mlp": 0.00771626, - "balance_loss_clip": 1.04121161, - "balance_loss_mlp": 1.00017738, - "epoch": 0.6905756801442958, - "flos": 17639681120640.0, - "grad_norm": 1.7496856130724467, - "language_loss": 0.84967637, - "learning_rate": 9.22906510853017e-07, - "loss": 0.86840141, - "num_input_tokens_seen": 247908585, - "step": 11486, - "time_per_iteration": 2.6427667140960693 - }, - { - "auxiliary_loss_clip": 0.01053468, - "auxiliary_loss_mlp": 0.01037616, - "balance_loss_clip": 1.03498769, - "balance_loss_mlp": 1.02395463, - "epoch": 0.6906358033969637, - "flos": 22343691260160.0, - "grad_norm": 1.463253599304318, - "language_loss": 0.72599518, - "learning_rate": 9.225783725640786e-07, - "loss": 0.74690592, - "num_input_tokens_seen": 247928480, - "step": 11487, - "time_per_iteration": 2.8396995067596436 - }, - { - "auxiliary_loss_clip": 0.01016718, - "auxiliary_loss_mlp": 0.0100075, - "balance_loss_clip": 1.01205254, - "balance_loss_mlp": 0.99957544, - "epoch": 0.6906959266496318, - "flos": 69747789081600.0, - "grad_norm": 0.9486957802927981, - "language_loss": 0.66587651, - "learning_rate": 9.222502751310759e-07, - "loss": 0.68605119, - "num_input_tokens_seen": 247988855, - "step": 11488, - "time_per_iteration": 3.256028175354004 - }, - { - "auxiliary_loss_clip": 0.01090242, - "auxiliary_loss_mlp": 0.01035444, - "balance_loss_clip": 1.039554, - "balance_loss_mlp": 1.02100825, - "epoch": 0.6907560499022997, - "flos": 21434253788160.0, - "grad_norm": 1.736123733035723, - "language_loss": 0.74721605, - "learning_rate": 9.219222185664519e-07, - "loss": 0.76847291, - "num_input_tokens_seen": 248007685, - "step": 11489, - "time_per_iteration": 2.6813058853149414 - }, - { - "auxiliary_loss_clip": 0.01102738, - "auxiliary_loss_mlp": 0.01041104, - "balance_loss_clip": 1.03759074, - "balance_loss_mlp": 1.0267272, - "epoch": 0.6908161731549677, - "flos": 14392207480320.0, - "grad_norm": 2.0464811594474006, - "language_loss": 0.62228811, - "learning_rate": 9.215942028826445e-07, - "loss": 0.64372647, - "num_input_tokens_seen": 248025145, - "step": 11490, - "time_per_iteration": 2.7024333477020264 - }, - { - "auxiliary_loss_clip": 0.01090002, - "auxiliary_loss_mlp": 0.01032703, - "balance_loss_clip": 1.03779197, - "balance_loss_mlp": 1.01960242, - "epoch": 0.6908762964076357, - "flos": 20010970304640.0, - "grad_norm": 1.709286193075208, - "language_loss": 0.72809607, - "learning_rate": 9.212662280920937e-07, - "loss": 0.74932313, - "num_input_tokens_seen": 248043750, - "step": 11491, - "time_per_iteration": 2.746288537979126 - }, - { - "auxiliary_loss_clip": 0.01089559, - "auxiliary_loss_mlp": 0.00771788, - "balance_loss_clip": 1.03801966, - "balance_loss_mlp": 1.00016296, - "epoch": 0.6909364196603036, - "flos": 28769079853440.0, - "grad_norm": 1.39649539646883, - "language_loss": 0.70297456, - "learning_rate": 9.20938294207235e-07, - "loss": 0.72158802, - "num_input_tokens_seen": 248065765, - "step": 11492, - "time_per_iteration": 2.7897520065307617 - }, - { - "auxiliary_loss_clip": 0.010831, - "auxiliary_loss_mlp": 0.01033716, - "balance_loss_clip": 1.04620051, - "balance_loss_mlp": 1.0190773, - "epoch": 0.6909965429129716, - "flos": 22528128620160.0, - "grad_norm": 1.7344123027630052, - "language_loss": 0.74773538, - "learning_rate": 9.206104012405049e-07, - "loss": 0.76890349, - "num_input_tokens_seen": 248083810, - "step": 11493, - "time_per_iteration": 2.9563519954681396 - }, - { - "auxiliary_loss_clip": 0.01114123, - "auxiliary_loss_mlp": 0.01030007, - "balance_loss_clip": 1.04090369, - "balance_loss_mlp": 1.01648879, - "epoch": 0.6910566661656395, - "flos": 18405942981120.0, - "grad_norm": 1.7115108202132974, - "language_loss": 0.74647975, - "learning_rate": 9.20282549204336e-07, - "loss": 0.76792109, - "num_input_tokens_seen": 248103185, - "step": 11494, - "time_per_iteration": 2.606947422027588 - }, - { - "auxiliary_loss_clip": 0.01086005, - "auxiliary_loss_mlp": 0.01030061, - "balance_loss_clip": 1.03727913, - "balance_loss_mlp": 1.01682854, - "epoch": 0.6911167894183076, - "flos": 30773972355840.0, - "grad_norm": 1.4748735208604244, - "language_loss": 0.68749166, - "learning_rate": 9.19954738111161e-07, - "loss": 0.70865232, - "num_input_tokens_seen": 248125665, - "step": 11495, - "time_per_iteration": 2.768889904022217 - }, - { - "auxiliary_loss_clip": 0.01089976, - "auxiliary_loss_mlp": 0.01029957, - "balance_loss_clip": 1.03646207, - "balance_loss_mlp": 1.01640916, - "epoch": 0.6911769126709755, - "flos": 13735724561280.0, - "grad_norm": 1.8085674885564547, - "language_loss": 0.74088383, - "learning_rate": 9.196269679734119e-07, - "loss": 0.76208317, - "num_input_tokens_seen": 248142545, - "step": 11496, - "time_per_iteration": 2.6374707221984863 - }, - { - "auxiliary_loss_clip": 0.01075882, - "auxiliary_loss_mlp": 0.01033532, - "balance_loss_clip": 1.03445745, - "balance_loss_mlp": 1.02084804, - "epoch": 0.6912370359236435, - "flos": 17566854295680.0, - "grad_norm": 2.1445438478171, - "language_loss": 0.80236906, - "learning_rate": 9.19299238803515e-07, - "loss": 0.82346314, - "num_input_tokens_seen": 248160225, - "step": 11497, - "time_per_iteration": 2.6873879432678223 - }, - { - "auxiliary_loss_clip": 0.01074496, - "auxiliary_loss_mlp": 0.01037916, - "balance_loss_clip": 1.03591168, - "balance_loss_mlp": 1.02401567, - "epoch": 0.6912971591763114, - "flos": 22090772620800.0, - "grad_norm": 1.4245028324847169, - "language_loss": 0.8060286, - "learning_rate": 9.189715506138993e-07, - "loss": 0.82715273, - "num_input_tokens_seen": 248180430, - "step": 11498, - "time_per_iteration": 2.7175493240356445 - }, - { - "auxiliary_loss_clip": 0.01099715, - "auxiliary_loss_mlp": 0.01033226, - "balance_loss_clip": 1.03920996, - "balance_loss_mlp": 1.01955223, - "epoch": 0.6913572824289794, - "flos": 29971476650880.0, - "grad_norm": 1.5050738051892152, - "language_loss": 0.86088848, - "learning_rate": 9.186439034169915e-07, - "loss": 0.88221788, - "num_input_tokens_seen": 248202365, - "step": 11499, - "time_per_iteration": 2.7579431533813477 - }, - { - "auxiliary_loss_clip": 0.01080625, - "auxiliary_loss_mlp": 0.00771124, - "balance_loss_clip": 1.040236, - "balance_loss_mlp": 1.00014019, - "epoch": 0.6914174056816473, - "flos": 20448936835200.0, - "grad_norm": 1.7961404954828535, - "language_loss": 0.75816536, - "learning_rate": 9.183162972252145e-07, - "loss": 0.77668285, - "num_input_tokens_seen": 248221750, - "step": 11500, - "time_per_iteration": 2.658766031265259 - }, - { - "auxiliary_loss_clip": 0.01058615, - "auxiliary_loss_mlp": 0.01050016, - "balance_loss_clip": 1.03728688, - "balance_loss_mlp": 1.03423262, - "epoch": 0.6914775289343154, - "flos": 21282530739840.0, - "grad_norm": 1.8214656574654693, - "language_loss": 0.77514184, - "learning_rate": 9.179887320509921e-07, - "loss": 0.79622817, - "num_input_tokens_seen": 248239535, - "step": 11501, - "time_per_iteration": 2.751330614089966 - }, - { - "auxiliary_loss_clip": 0.01099448, - "auxiliary_loss_mlp": 0.01040566, - "balance_loss_clip": 1.03807986, - "balance_loss_mlp": 1.02625489, - "epoch": 0.6915376521869833, - "flos": 23878118401920.0, - "grad_norm": 1.7335303734743124, - "language_loss": 0.73580784, - "learning_rate": 9.176612079067458e-07, - "loss": 0.75720799, - "num_input_tokens_seen": 248259055, - "step": 11502, - "time_per_iteration": 2.8098790645599365 - }, - { - "auxiliary_loss_clip": 0.01041175, - "auxiliary_loss_mlp": 0.01044606, - "balance_loss_clip": 1.034199, - "balance_loss_mlp": 1.02875125, - "epoch": 0.6915977754396513, - "flos": 11510268595200.0, - "grad_norm": 2.5749254426128743, - "language_loss": 0.73368824, - "learning_rate": 9.173337248048953e-07, - "loss": 0.75454605, - "num_input_tokens_seen": 248276765, - "step": 11503, - "time_per_iteration": 2.747083902359009 - }, - { - "auxiliary_loss_clip": 0.01098455, - "auxiliary_loss_mlp": 0.01041122, - "balance_loss_clip": 1.03777838, - "balance_loss_mlp": 1.02701986, - "epoch": 0.6916578986923193, - "flos": 22601278667520.0, - "grad_norm": 1.7356607503629284, - "language_loss": 0.77010226, - "learning_rate": 9.170062827578575e-07, - "loss": 0.79149806, - "num_input_tokens_seen": 248295310, - "step": 11504, - "time_per_iteration": 2.706209182739258 - }, - { - "auxiliary_loss_clip": 0.01069336, - "auxiliary_loss_mlp": 0.01039527, - "balance_loss_clip": 1.0342164, - "balance_loss_mlp": 1.02457845, - "epoch": 0.6917180219449872, - "flos": 23477355383040.0, - "grad_norm": 1.7715532639399074, - "language_loss": 0.73565066, - "learning_rate": 9.166788817780499e-07, - "loss": 0.75673938, - "num_input_tokens_seen": 248315230, - "step": 11505, - "time_per_iteration": 2.725203514099121 - }, - { - "auxiliary_loss_clip": 0.01054739, - "auxiliary_loss_mlp": 0.00772936, - "balance_loss_clip": 1.03434849, - "balance_loss_mlp": 1.00009656, - "epoch": 0.6917781451976552, - "flos": 23732536579200.0, - "grad_norm": 1.8090122394504842, - "language_loss": 0.88027036, - "learning_rate": 9.163515218778886e-07, - "loss": 0.89854711, - "num_input_tokens_seen": 248332980, - "step": 11506, - "time_per_iteration": 2.796102285385132 - }, - { - "auxiliary_loss_clip": 0.01086001, - "auxiliary_loss_mlp": 0.01030005, - "balance_loss_clip": 1.03935504, - "balance_loss_mlp": 1.01724994, - "epoch": 0.6918382684503231, - "flos": 31466760946560.0, - "grad_norm": 2.045878343291588, - "language_loss": 0.7011205, - "learning_rate": 9.160242030697856e-07, - "loss": 0.72228056, - "num_input_tokens_seen": 248352865, - "step": 11507, - "time_per_iteration": 2.755439043045044 - }, - { - "auxiliary_loss_clip": 0.01086914, - "auxiliary_loss_mlp": 0.01036889, - "balance_loss_clip": 1.03763783, - "balance_loss_mlp": 1.02344775, - "epoch": 0.6918983917029912, - "flos": 21650471706240.0, - "grad_norm": 1.853503068489786, - "language_loss": 0.76915097, - "learning_rate": 9.156969253661538e-07, - "loss": 0.79038906, - "num_input_tokens_seen": 248371125, - "step": 11508, - "time_per_iteration": 4.521030426025391 - }, - { - "auxiliary_loss_clip": 0.0109627, - "auxiliary_loss_mlp": 0.01034219, - "balance_loss_clip": 1.03752184, - "balance_loss_mlp": 1.02148128, - "epoch": 0.6919585149556591, - "flos": 25550082720000.0, - "grad_norm": 1.8821969374944694, - "language_loss": 0.75171518, - "learning_rate": 9.153696887794027e-07, - "loss": 0.77302009, - "num_input_tokens_seen": 248390455, - "step": 11509, - "time_per_iteration": 2.69903826713562 - }, - { - "auxiliary_loss_clip": 0.01062313, - "auxiliary_loss_mlp": 0.01036661, - "balance_loss_clip": 1.03829181, - "balance_loss_mlp": 1.02342892, - "epoch": 0.6920186382083271, - "flos": 23659781581440.0, - "grad_norm": 1.9874772496775723, - "language_loss": 0.64212132, - "learning_rate": 9.150424933219425e-07, - "loss": 0.66311103, - "num_input_tokens_seen": 248411305, - "step": 11510, - "time_per_iteration": 2.848520278930664 - }, - { - "auxiliary_loss_clip": 0.0108123, - "auxiliary_loss_mlp": 0.01034683, - "balance_loss_clip": 1.03798079, - "balance_loss_mlp": 1.02002048, - "epoch": 0.692078761460995, - "flos": 19061959023360.0, - "grad_norm": 1.835249008120565, - "language_loss": 0.75375962, - "learning_rate": 9.147153390061788e-07, - "loss": 0.77491868, - "num_input_tokens_seen": 248430190, - "step": 11511, - "time_per_iteration": 4.174523115158081 - }, - { - "auxiliary_loss_clip": 0.01084843, - "auxiliary_loss_mlp": 0.01029668, - "balance_loss_clip": 1.04214227, - "balance_loss_mlp": 1.01793194, - "epoch": 0.692138884713663, - "flos": 29023291382400.0, - "grad_norm": 1.7047662296255404, - "language_loss": 0.62659085, - "learning_rate": 9.143882258445184e-07, - "loss": 0.64773595, - "num_input_tokens_seen": 248450830, - "step": 11512, - "time_per_iteration": 4.534400224685669 - }, - { - "auxiliary_loss_clip": 0.01080139, - "auxiliary_loss_mlp": 0.01036917, - "balance_loss_clip": 1.03771234, - "balance_loss_mlp": 1.02366054, - "epoch": 0.6921990079663309, - "flos": 14757849976320.0, - "grad_norm": 1.7738066365158425, - "language_loss": 0.82885146, - "learning_rate": 9.140611538493666e-07, - "loss": 0.85002202, - "num_input_tokens_seen": 248468585, - "step": 11513, - "time_per_iteration": 2.744152545928955 - }, - { - "auxiliary_loss_clip": 0.01050332, - "auxiliary_loss_mlp": 0.0103433, - "balance_loss_clip": 1.03769469, - "balance_loss_mlp": 1.02236128, - "epoch": 0.692259131218999, - "flos": 23841848643840.0, - "grad_norm": 1.4085469853389758, - "language_loss": 0.78494793, - "learning_rate": 9.137341230331233e-07, - "loss": 0.8057946, - "num_input_tokens_seen": 248490535, - "step": 11514, - "time_per_iteration": 2.7933335304260254 - }, - { - "auxiliary_loss_clip": 0.0106844, - "auxiliary_loss_mlp": 0.01038567, - "balance_loss_clip": 1.03552842, - "balance_loss_mlp": 1.0250721, - "epoch": 0.6923192544716669, - "flos": 19135073157120.0, - "grad_norm": 2.196765924687951, - "language_loss": 0.75278533, - "learning_rate": 9.134071334081907e-07, - "loss": 0.77385533, - "num_input_tokens_seen": 248508575, - "step": 11515, - "time_per_iteration": 2.7745299339294434 - }, - { - "auxiliary_loss_clip": 0.01070009, - "auxiliary_loss_mlp": 0.01033102, - "balance_loss_clip": 1.03992462, - "balance_loss_mlp": 1.02032304, - "epoch": 0.6923793777243349, - "flos": 28074639237120.0, - "grad_norm": 1.799388111244089, - "language_loss": 0.53198493, - "learning_rate": 9.130801849869694e-07, - "loss": 0.55301601, - "num_input_tokens_seen": 248527025, - "step": 11516, - "time_per_iteration": 2.775190830230713 - }, - { - "auxiliary_loss_clip": 0.01097274, - "auxiliary_loss_mlp": 0.01037809, - "balance_loss_clip": 1.03787732, - "balance_loss_mlp": 1.02451098, - "epoch": 0.6924395009770029, - "flos": 16581250033920.0, - "grad_norm": 1.6962423082360507, - "language_loss": 0.72982675, - "learning_rate": 9.127532777818557e-07, - "loss": 0.75117755, - "num_input_tokens_seen": 248544275, - "step": 11517, - "time_per_iteration": 2.598116397857666 - }, - { - "auxiliary_loss_clip": 0.0111384, - "auxiliary_loss_mlp": 0.01037211, - "balance_loss_clip": 1.03925538, - "balance_loss_mlp": 1.02354932, - "epoch": 0.6924996242296708, - "flos": 16655297921280.0, - "grad_norm": 1.6338598791065129, - "language_loss": 0.76462078, - "learning_rate": 9.124264118052465e-07, - "loss": 0.78613126, - "num_input_tokens_seen": 248561870, - "step": 11518, - "time_per_iteration": 4.141700983047485 - }, - { - "auxiliary_loss_clip": 0.0110627, - "auxiliary_loss_mlp": 0.01040853, - "balance_loss_clip": 1.04075885, - "balance_loss_mlp": 1.02653027, - "epoch": 0.6925597474823388, - "flos": 34754167532160.0, - "grad_norm": 1.3592469216685072, - "language_loss": 0.64467025, - "learning_rate": 9.120995870695376e-07, - "loss": 0.66614151, - "num_input_tokens_seen": 248588190, - "step": 11519, - "time_per_iteration": 2.8347549438476562 - }, - { - "auxiliary_loss_clip": 0.01080573, - "auxiliary_loss_mlp": 0.0103987, - "balance_loss_clip": 1.03696394, - "balance_loss_mlp": 1.02670944, - "epoch": 0.6926198707350067, - "flos": 21871717528320.0, - "grad_norm": 2.1051263263306805, - "language_loss": 0.62538528, - "learning_rate": 9.117728035871212e-07, - "loss": 0.64658964, - "num_input_tokens_seen": 248606460, - "step": 11520, - "time_per_iteration": 2.7294435501098633 - }, - { - "auxiliary_loss_clip": 0.01075792, - "auxiliary_loss_mlp": 0.01037449, - "balance_loss_clip": 1.03631949, - "balance_loss_mlp": 1.0228461, - "epoch": 0.6926799939876748, - "flos": 13006271162880.0, - "grad_norm": 2.2378150496595013, - "language_loss": 0.77924216, - "learning_rate": 9.114460613703887e-07, - "loss": 0.80037463, - "num_input_tokens_seen": 248623715, - "step": 11521, - "time_per_iteration": 2.717240571975708 - }, - { - "auxiliary_loss_clip": 0.01100684, - "auxiliary_loss_mlp": 0.0103794, - "balance_loss_clip": 1.03691578, - "balance_loss_mlp": 1.02260375, - "epoch": 0.6927401172403427, - "flos": 16761234107520.0, - "grad_norm": 2.442345109030128, - "language_loss": 0.81992316, - "learning_rate": 9.111193604317304e-07, - "loss": 0.84130937, - "num_input_tokens_seen": 248640575, - "step": 11522, - "time_per_iteration": 2.6045098304748535 - }, - { - "auxiliary_loss_clip": 0.01100284, - "auxiliary_loss_mlp": 0.01034079, - "balance_loss_clip": 1.04276228, - "balance_loss_mlp": 1.02152598, - "epoch": 0.6928002404930107, - "flos": 25705648523520.0, - "grad_norm": 1.8984649858129847, - "language_loss": 0.76575756, - "learning_rate": 9.107927007835361e-07, - "loss": 0.78710121, - "num_input_tokens_seen": 248663535, - "step": 11523, - "time_per_iteration": 2.6705586910247803 - }, - { - "auxiliary_loss_clip": 0.01082858, - "auxiliary_loss_mlp": 0.01035266, - "balance_loss_clip": 1.03894114, - "balance_loss_mlp": 1.02276123, - "epoch": 0.6928603637456786, - "flos": 18588261438720.0, - "grad_norm": 2.087470687803226, - "language_loss": 0.68297094, - "learning_rate": 9.104660824381915e-07, - "loss": 0.70415223, - "num_input_tokens_seen": 248681125, - "step": 11524, - "time_per_iteration": 2.6786375045776367 - }, - { - "auxiliary_loss_clip": 0.0108268, - "auxiliary_loss_mlp": 0.01033665, - "balance_loss_clip": 1.03927469, - "balance_loss_mlp": 1.01960993, - "epoch": 0.6929204869983466, - "flos": 22200874784640.0, - "grad_norm": 1.782896915319788, - "language_loss": 0.64250147, - "learning_rate": 9.101395054080815e-07, - "loss": 0.66366494, - "num_input_tokens_seen": 248700555, - "step": 11525, - "time_per_iteration": 2.709665536880493 - }, - { - "auxiliary_loss_clip": 0.01076674, - "auxiliary_loss_mlp": 0.01040353, - "balance_loss_clip": 1.04186177, - "balance_loss_mlp": 1.02660835, - "epoch": 0.6929806102510145, - "flos": 17894754576000.0, - "grad_norm": 2.1892792904192366, - "language_loss": 0.70518214, - "learning_rate": 9.098129697055907e-07, - "loss": 0.72635239, - "num_input_tokens_seen": 248716095, - "step": 11526, - "time_per_iteration": 2.7389345169067383 - }, - { - "auxiliary_loss_clip": 0.01089418, - "auxiliary_loss_mlp": 0.01034739, - "balance_loss_clip": 1.03708529, - "balance_loss_mlp": 1.02210879, - "epoch": 0.6930407335036826, - "flos": 19755178577280.0, - "grad_norm": 2.017152131296503, - "language_loss": 0.76394051, - "learning_rate": 9.094864753431022e-07, - "loss": 0.78518212, - "num_input_tokens_seen": 248735330, - "step": 11527, - "time_per_iteration": 2.675387382507324 - }, - { - "auxiliary_loss_clip": 0.01084801, - "auxiliary_loss_mlp": 0.01040813, - "balance_loss_clip": 1.03604603, - "balance_loss_mlp": 1.02701497, - "epoch": 0.6931008567563505, - "flos": 21544248211200.0, - "grad_norm": 1.6619978055585172, - "language_loss": 0.7924946, - "learning_rate": 9.091600223329952e-07, - "loss": 0.81375074, - "num_input_tokens_seen": 248754530, - "step": 11528, - "time_per_iteration": 2.708937883377075 - }, - { - "auxiliary_loss_clip": 0.01097731, - "auxiliary_loss_mlp": 0.01032598, - "balance_loss_clip": 1.03879142, - "balance_loss_mlp": 1.02049828, - "epoch": 0.6931609800090185, - "flos": 26250018117120.0, - "grad_norm": 1.5147905000718478, - "language_loss": 0.76348805, - "learning_rate": 9.088336106876491e-07, - "loss": 0.78479135, - "num_input_tokens_seen": 248775825, - "step": 11529, - "time_per_iteration": 2.7546539306640625 - }, - { - "auxiliary_loss_clip": 0.01110971, - "auxiliary_loss_mlp": 0.00770303, - "balance_loss_clip": 1.03999567, - "balance_loss_mlp": 1.00013018, - "epoch": 0.6932211032616865, - "flos": 32343376366080.0, - "grad_norm": 1.6406393226660527, - "language_loss": 0.7214883, - "learning_rate": 9.085072404194436e-07, - "loss": 0.74030107, - "num_input_tokens_seen": 248796180, - "step": 11530, - "time_per_iteration": 2.6844561100006104 - }, - { - "auxiliary_loss_clip": 0.01098446, - "auxiliary_loss_mlp": 0.01035346, - "balance_loss_clip": 1.04138708, - "balance_loss_mlp": 1.02000356, - "epoch": 0.6932812265143544, - "flos": 22049079909120.0, - "grad_norm": 1.6484845997572906, - "language_loss": 0.78485453, - "learning_rate": 9.081809115407513e-07, - "loss": 0.80619252, - "num_input_tokens_seen": 248814735, - "step": 11531, - "time_per_iteration": 2.753316879272461 - }, - { - "auxiliary_loss_clip": 0.010964, - "auxiliary_loss_mlp": 0.01038589, - "balance_loss_clip": 1.03926003, - "balance_loss_mlp": 1.02656698, - "epoch": 0.6933413497670224, - "flos": 26256626219520.0, - "grad_norm": 1.5040049491252714, - "language_loss": 0.69552708, - "learning_rate": 9.078546240639484e-07, - "loss": 0.71687698, - "num_input_tokens_seen": 248839140, - "step": 11532, - "time_per_iteration": 2.7001755237579346 - }, - { - "auxiliary_loss_clip": 0.01087082, - "auxiliary_loss_mlp": 0.01032046, - "balance_loss_clip": 1.03650141, - "balance_loss_mlp": 1.01820564, - "epoch": 0.6934014730196904, - "flos": 19573003774080.0, - "grad_norm": 1.314927604950551, - "language_loss": 0.6689446, - "learning_rate": 9.075283780014082e-07, - "loss": 0.69013584, - "num_input_tokens_seen": 248858300, - "step": 11533, - "time_per_iteration": 2.761096239089966 - }, - { - "auxiliary_loss_clip": 0.01089563, - "auxiliary_loss_mlp": 0.01038191, - "balance_loss_clip": 1.04126263, - "balance_loss_mlp": 1.02426171, - "epoch": 0.6934615962723584, - "flos": 22119249127680.0, - "grad_norm": 3.125205881661687, - "language_loss": 0.58564359, - "learning_rate": 9.072021733655007e-07, - "loss": 0.60692114, - "num_input_tokens_seen": 248876310, - "step": 11534, - "time_per_iteration": 2.6929404735565186 - }, - { - "auxiliary_loss_clip": 0.01078734, - "auxiliary_loss_mlp": 0.01030158, - "balance_loss_clip": 1.03795767, - "balance_loss_mlp": 1.01613939, - "epoch": 0.6935217195250263, - "flos": 21360816432000.0, - "grad_norm": 2.25045203731707, - "language_loss": 0.71212113, - "learning_rate": 9.068760101685971e-07, - "loss": 0.73321003, - "num_input_tokens_seen": 248895650, - "step": 11535, - "time_per_iteration": 2.68656849861145 - }, - { - "auxiliary_loss_clip": 0.01013917, - "auxiliary_loss_mlp": 0.01003832, - "balance_loss_clip": 1.00924766, - "balance_loss_mlp": 1.00264609, - "epoch": 0.6935818427776943, - "flos": 64063813115520.0, - "grad_norm": 0.7110018734854711, - "language_loss": 0.59062427, - "learning_rate": 9.065498884230638e-07, - "loss": 0.61080176, - "num_input_tokens_seen": 248963920, - "step": 11536, - "time_per_iteration": 3.347024917602539 - }, - { - "auxiliary_loss_clip": 0.0110154, - "auxiliary_loss_mlp": 0.00771293, - "balance_loss_clip": 1.04176164, - "balance_loss_mlp": 1.00036359, - "epoch": 0.6936419660303622, - "flos": 20302564913280.0, - "grad_norm": 1.511578579133692, - "language_loss": 0.72917026, - "learning_rate": 9.062238081412692e-07, - "loss": 0.74789858, - "num_input_tokens_seen": 248983380, - "step": 11537, - "time_per_iteration": 2.7138421535491943 - }, - { - "auxiliary_loss_clip": 0.01022423, - "auxiliary_loss_mlp": 0.00751474, - "balance_loss_clip": 1.0083034, - "balance_loss_mlp": 0.99969625, - "epoch": 0.6937020892830302, - "flos": 67182581347200.0, - "grad_norm": 0.7456734981947979, - "language_loss": 0.55525714, - "learning_rate": 9.058977693355767e-07, - "loss": 0.57299614, - "num_input_tokens_seen": 249044680, - "step": 11538, - "time_per_iteration": 3.1686036586761475 - }, - { - "auxiliary_loss_clip": 0.01097095, - "auxiliary_loss_mlp": 0.01037153, - "balance_loss_clip": 1.03834343, - "balance_loss_mlp": 1.02519631, - "epoch": 0.6937622125356981, - "flos": 23878190229120.0, - "grad_norm": 1.582889813468805, - "language_loss": 0.77747178, - "learning_rate": 9.055717720183505e-07, - "loss": 0.79881424, - "num_input_tokens_seen": 249061060, - "step": 11539, - "time_per_iteration": 2.7487149238586426 - }, - { - "auxiliary_loss_clip": 0.01088793, - "auxiliary_loss_mlp": 0.01029242, - "balance_loss_clip": 1.03841698, - "balance_loss_mlp": 1.01741016, - "epoch": 0.6938223357883662, - "flos": 28730619365760.0, - "grad_norm": 1.696359380020658, - "language_loss": 0.63957608, - "learning_rate": 9.05245816201953e-07, - "loss": 0.66075647, - "num_input_tokens_seen": 249081430, - "step": 11540, - "time_per_iteration": 2.897141456604004 - }, - { - "auxiliary_loss_clip": 0.01064567, - "auxiliary_loss_mlp": 0.01031848, - "balance_loss_clip": 1.0352695, - "balance_loss_mlp": 1.01913404, - "epoch": 0.6938824590410341, - "flos": 28655027193600.0, - "grad_norm": 1.5143087108308135, - "language_loss": 0.86776996, - "learning_rate": 9.049199018987437e-07, - "loss": 0.8887341, - "num_input_tokens_seen": 249103020, - "step": 11541, - "time_per_iteration": 2.790721893310547 - }, - { - "auxiliary_loss_clip": 0.01113533, - "auxiliary_loss_mlp": 0.00771014, - "balance_loss_clip": 1.04010653, - "balance_loss_mlp": 1.00017405, - "epoch": 0.6939425822937021, - "flos": 18983062800000.0, - "grad_norm": 2.0467106914623483, - "language_loss": 0.84313244, - "learning_rate": 9.04594029121081e-07, - "loss": 0.86197793, - "num_input_tokens_seen": 249120810, - "step": 11542, - "time_per_iteration": 2.6897356510162354 - }, - { - "auxiliary_loss_clip": 0.01101602, - "auxiliary_loss_mlp": 0.0103373, - "balance_loss_clip": 1.03908658, - "balance_loss_mlp": 1.01946616, - "epoch": 0.6940027055463701, - "flos": 23075838178560.0, - "grad_norm": 1.712845406510252, - "language_loss": 0.75460529, - "learning_rate": 9.04268197881323e-07, - "loss": 0.7759586, - "num_input_tokens_seen": 249138050, - "step": 11543, - "time_per_iteration": 2.6957714557647705 - }, - { - "auxiliary_loss_clip": 0.01092628, - "auxiliary_loss_mlp": 0.01030945, - "balance_loss_clip": 1.03984666, - "balance_loss_mlp": 1.01842248, - "epoch": 0.694062828799038, - "flos": 18186564666240.0, - "grad_norm": 1.7601740067431768, - "language_loss": 0.76118124, - "learning_rate": 9.039424081918241e-07, - "loss": 0.782417, - "num_input_tokens_seen": 249155570, - "step": 11544, - "time_per_iteration": 2.6654560565948486 - }, - { - "auxiliary_loss_clip": 0.01059106, - "auxiliary_loss_mlp": 0.01041973, - "balance_loss_clip": 1.03483558, - "balance_loss_mlp": 1.02701259, - "epoch": 0.694122952051706, - "flos": 17821532701440.0, - "grad_norm": 1.7077891138664472, - "language_loss": 0.71304005, - "learning_rate": 9.036166600649388e-07, - "loss": 0.73405087, - "num_input_tokens_seen": 249172960, - "step": 11545, - "time_per_iteration": 2.6869020462036133 - }, - { - "auxiliary_loss_clip": 0.0109854, - "auxiliary_loss_mlp": 0.01030108, - "balance_loss_clip": 1.04018188, - "balance_loss_mlp": 1.01828814, - "epoch": 0.694183075304374, - "flos": 21215306436480.0, - "grad_norm": 1.7532682541368763, - "language_loss": 0.79367101, - "learning_rate": 9.0329095351302e-07, - "loss": 0.8149575, - "num_input_tokens_seen": 249192450, - "step": 11546, - "time_per_iteration": 2.6320011615753174 - }, - { - "auxiliary_loss_clip": 0.01080505, - "auxiliary_loss_mlp": 0.01029935, - "balance_loss_clip": 1.03777122, - "balance_loss_mlp": 1.01704824, - "epoch": 0.694243198557042, - "flos": 24060508686720.0, - "grad_norm": 1.4008683277346297, - "language_loss": 0.78635859, - "learning_rate": 9.029652885484194e-07, - "loss": 0.80746305, - "num_input_tokens_seen": 249214320, - "step": 11547, - "time_per_iteration": 2.7307076454162598 - }, - { - "auxiliary_loss_clip": 0.010916, - "auxiliary_loss_mlp": 0.00771764, - "balance_loss_clip": 1.04151332, - "balance_loss_mlp": 1.00021195, - "epoch": 0.6943033218097099, - "flos": 21141869080320.0, - "grad_norm": 2.101396590702846, - "language_loss": 0.80507267, - "learning_rate": 9.026396651834834e-07, - "loss": 0.82370633, - "num_input_tokens_seen": 249230925, - "step": 11548, - "time_per_iteration": 4.426462650299072 - }, - { - "auxiliary_loss_clip": 0.01032364, - "auxiliary_loss_mlp": 0.0075149, - "balance_loss_clip": 1.00922537, - "balance_loss_mlp": 0.99970454, - "epoch": 0.6943634450623779, - "flos": 57812015975040.0, - "grad_norm": 0.6903286764237632, - "language_loss": 0.53703904, - "learning_rate": 9.023140834305613e-07, - "loss": 0.55487758, - "num_input_tokens_seen": 249293975, - "step": 11549, - "time_per_iteration": 3.1308066844940186 - }, - { - "auxiliary_loss_clip": 0.01093982, - "auxiliary_loss_mlp": 0.01035759, - "balance_loss_clip": 1.03542507, - "balance_loss_mlp": 1.02189505, - "epoch": 0.6944235683150458, - "flos": 30590684231040.0, - "grad_norm": 1.55426436192092, - "language_loss": 0.73198104, - "learning_rate": 9.01988543302e-07, - "loss": 0.75327837, - "num_input_tokens_seen": 249315285, - "step": 11550, - "time_per_iteration": 5.8028564453125 - }, - { - "auxiliary_loss_clip": 0.010896, - "auxiliary_loss_mlp": 0.01039664, - "balance_loss_clip": 1.04099548, - "balance_loss_mlp": 1.02650332, - "epoch": 0.6944836915677138, - "flos": 19719447523200.0, - "grad_norm": 1.9506864007678324, - "language_loss": 0.74081314, - "learning_rate": 9.016630448101425e-07, - "loss": 0.76210582, - "num_input_tokens_seen": 249333505, - "step": 11551, - "time_per_iteration": 2.665813446044922 - }, - { - "auxiliary_loss_clip": 0.01114588, - "auxiliary_loss_mlp": 0.01038306, - "balance_loss_clip": 1.0404079, - "balance_loss_mlp": 1.0249548, - "epoch": 0.6945438148203817, - "flos": 24863579009280.0, - "grad_norm": 1.5863003603219143, - "language_loss": 0.84288925, - "learning_rate": 9.01337587967333e-07, - "loss": 0.86441821, - "num_input_tokens_seen": 249354180, - "step": 11552, - "time_per_iteration": 2.8407604694366455 - }, - { - "auxiliary_loss_clip": 0.01112485, - "auxiliary_loss_mlp": 0.01035825, - "balance_loss_clip": 1.03997219, - "balance_loss_mlp": 1.02287877, - "epoch": 0.6946039380730498, - "flos": 33326646243840.0, - "grad_norm": 1.6205787984736582, - "language_loss": 0.6727165, - "learning_rate": 9.010121727859117e-07, - "loss": 0.69419956, - "num_input_tokens_seen": 249377035, - "step": 11553, - "time_per_iteration": 2.7572171688079834 - }, - { - "auxiliary_loss_clip": 0.01097133, - "auxiliary_loss_mlp": 0.01031798, - "balance_loss_clip": 1.04150629, - "balance_loss_mlp": 1.0176357, - "epoch": 0.6946640613257177, - "flos": 20850956830080.0, - "grad_norm": 2.0885031059024017, - "language_loss": 0.79817116, - "learning_rate": 9.006867992782195e-07, - "loss": 0.81946045, - "num_input_tokens_seen": 249396155, - "step": 11554, - "time_per_iteration": 2.721204996109009 - }, - { - "auxiliary_loss_clip": 0.01101639, - "auxiliary_loss_mlp": 0.01028417, - "balance_loss_clip": 1.03683937, - "balance_loss_mlp": 1.01538706, - "epoch": 0.6947241845783857, - "flos": 19354846521600.0, - "grad_norm": 5.498909507177023, - "language_loss": 0.72485244, - "learning_rate": 9.003614674565934e-07, - "loss": 0.746153, - "num_input_tokens_seen": 249414555, - "step": 11555, - "time_per_iteration": 2.5764734745025635 - }, - { - "auxiliary_loss_clip": 0.01075985, - "auxiliary_loss_mlp": 0.01033395, - "balance_loss_clip": 1.0355674, - "balance_loss_mlp": 1.02071118, - "epoch": 0.6947843078310536, - "flos": 27120240915840.0, - "grad_norm": 1.691992683709007, - "language_loss": 0.78099442, - "learning_rate": 9.000361773333705e-07, - "loss": 0.80208826, - "num_input_tokens_seen": 249433570, - "step": 11556, - "time_per_iteration": 2.709371328353882 - }, - { - "auxiliary_loss_clip": 0.01053238, - "auxiliary_loss_mlp": 0.01042754, - "balance_loss_clip": 1.03567553, - "balance_loss_mlp": 1.02977192, - "epoch": 0.6948444310837216, - "flos": 28585109370240.0, - "grad_norm": 2.67608324512941, - "language_loss": 0.6078257, - "learning_rate": 8.997109289208869e-07, - "loss": 0.62878561, - "num_input_tokens_seen": 249453735, - "step": 11557, - "time_per_iteration": 2.802755832672119 - }, - { - "auxiliary_loss_clip": 0.01091412, - "auxiliary_loss_mlp": 0.01036617, - "balance_loss_clip": 1.04582477, - "balance_loss_mlp": 1.02432072, - "epoch": 0.6949045543363896, - "flos": 15669262696320.0, - "grad_norm": 1.8868639353826757, - "language_loss": 0.85245895, - "learning_rate": 8.993857222314752e-07, - "loss": 0.87373924, - "num_input_tokens_seen": 249470805, - "step": 11558, - "time_per_iteration": 4.191239595413208 - }, - { - "auxiliary_loss_clip": 0.01103665, - "auxiliary_loss_mlp": 0.01036679, - "balance_loss_clip": 1.03848016, - "balance_loss_mlp": 1.02259421, - "epoch": 0.6949646775890576, - "flos": 23259413612160.0, - "grad_norm": 1.6011995670577914, - "language_loss": 0.70525056, - "learning_rate": 8.990605572774664e-07, - "loss": 0.72665399, - "num_input_tokens_seen": 249491150, - "step": 11559, - "time_per_iteration": 2.7076830863952637 - }, - { - "auxiliary_loss_clip": 0.01078357, - "auxiliary_loss_mlp": 0.01032244, - "balance_loss_clip": 1.03816533, - "balance_loss_mlp": 1.01998925, - "epoch": 0.6950248008417256, - "flos": 22382546797440.0, - "grad_norm": 2.0259020832909234, - "language_loss": 0.78594178, - "learning_rate": 8.987354340711921e-07, - "loss": 0.80704772, - "num_input_tokens_seen": 249511560, - "step": 11560, - "time_per_iteration": 2.7197508811950684 - }, - { - "auxiliary_loss_clip": 0.01087442, - "auxiliary_loss_mlp": 0.01034646, - "balance_loss_clip": 1.03931344, - "balance_loss_mlp": 1.0221293, - "epoch": 0.6950849240943935, - "flos": 23477355383040.0, - "grad_norm": 1.532648657325296, - "language_loss": 0.76758087, - "learning_rate": 8.9841035262498e-07, - "loss": 0.78880179, - "num_input_tokens_seen": 249531910, - "step": 11561, - "time_per_iteration": 2.707702159881592 - }, - { - "auxiliary_loss_clip": 0.01108982, - "auxiliary_loss_mlp": 0.01032613, - "balance_loss_clip": 1.03717422, - "balance_loss_mlp": 1.01877272, - "epoch": 0.6951450473470615, - "flos": 17420554200960.0, - "grad_norm": 1.812422200416747, - "language_loss": 0.78550988, - "learning_rate": 8.980853129511577e-07, - "loss": 0.80692589, - "num_input_tokens_seen": 249550300, - "step": 11562, - "time_per_iteration": 2.5765740871429443 - }, - { - "auxiliary_loss_clip": 0.01104346, - "auxiliary_loss_mlp": 0.01034539, - "balance_loss_clip": 1.0394088, - "balance_loss_mlp": 1.02134836, - "epoch": 0.6952051705997294, - "flos": 20485745297280.0, - "grad_norm": 1.9668484309221967, - "language_loss": 0.69117391, - "learning_rate": 8.977603150620515e-07, - "loss": 0.7125628, - "num_input_tokens_seen": 249567740, - "step": 11563, - "time_per_iteration": 2.6727218627929688 - }, - { - "auxiliary_loss_clip": 0.01090765, - "auxiliary_loss_mlp": 0.0102846, - "balance_loss_clip": 1.03766811, - "balance_loss_mlp": 1.0160023, - "epoch": 0.6952652938523974, - "flos": 13989541040640.0, - "grad_norm": 2.495686990019142, - "language_loss": 0.73530227, - "learning_rate": 8.974353589699846e-07, - "loss": 0.75649452, - "num_input_tokens_seen": 249582700, - "step": 11564, - "time_per_iteration": 2.576385259628296 - }, - { - "auxiliary_loss_clip": 0.01083646, - "auxiliary_loss_mlp": 0.01038821, - "balance_loss_clip": 1.04269266, - "balance_loss_mlp": 1.02250147, - "epoch": 0.6953254171050653, - "flos": 30953956429440.0, - "grad_norm": 1.8121742039667086, - "language_loss": 0.71753776, - "learning_rate": 8.971104446872785e-07, - "loss": 0.73876244, - "num_input_tokens_seen": 249602920, - "step": 11565, - "time_per_iteration": 2.732823133468628 - }, - { - "auxiliary_loss_clip": 0.01016312, - "auxiliary_loss_mlp": 0.01000486, - "balance_loss_clip": 1.01167345, - "balance_loss_mlp": 0.99898958, - "epoch": 0.6953855403577334, - "flos": 61670257499520.0, - "grad_norm": 0.9560441236848968, - "language_loss": 0.58358735, - "learning_rate": 8.96785572226255e-07, - "loss": 0.60375541, - "num_input_tokens_seen": 249660400, - "step": 11566, - "time_per_iteration": 3.0193676948547363 - }, - { - "auxiliary_loss_clip": 0.01081084, - "auxiliary_loss_mlp": 0.01031181, - "balance_loss_clip": 1.04008102, - "balance_loss_mlp": 1.01653004, - "epoch": 0.6954456636104013, - "flos": 23039029716480.0, - "grad_norm": 1.9855993328717996, - "language_loss": 0.7417689, - "learning_rate": 8.964607415992338e-07, - "loss": 0.76289153, - "num_input_tokens_seen": 249679335, - "step": 11567, - "time_per_iteration": 2.72933030128479 - }, - { - "auxiliary_loss_clip": 0.01081196, - "auxiliary_loss_mlp": 0.01034089, - "balance_loss_clip": 1.03550458, - "balance_loss_mlp": 1.02039182, - "epoch": 0.6955057868630693, - "flos": 23918518224000.0, - "grad_norm": 1.2846819146580761, - "language_loss": 0.76948917, - "learning_rate": 8.961359528185313e-07, - "loss": 0.79064202, - "num_input_tokens_seen": 249701805, - "step": 11568, - "time_per_iteration": 2.715871572494507 - }, - { - "auxiliary_loss_clip": 0.01096832, - "auxiliary_loss_mlp": 0.01035387, - "balance_loss_clip": 1.04105902, - "balance_loss_mlp": 1.02265501, - "epoch": 0.6955659101157372, - "flos": 22594634651520.0, - "grad_norm": 1.619102090378134, - "language_loss": 0.72502244, - "learning_rate": 8.958112058964649e-07, - "loss": 0.74634463, - "num_input_tokens_seen": 249720550, - "step": 11569, - "time_per_iteration": 2.645249366760254 - }, - { - "auxiliary_loss_clip": 0.01091211, - "auxiliary_loss_mlp": 0.01033237, - "balance_loss_clip": 1.04227805, - "balance_loss_mlp": 1.01993299, - "epoch": 0.6956260333684052, - "flos": 24572523104640.0, - "grad_norm": 1.7249170948582337, - "language_loss": 0.76852113, - "learning_rate": 8.954865008453471e-07, - "loss": 0.78976554, - "num_input_tokens_seen": 249740325, - "step": 11570, - "time_per_iteration": 2.7455241680145264 - }, - { - "auxiliary_loss_clip": 0.01102536, - "auxiliary_loss_mlp": 0.01035635, - "balance_loss_clip": 1.03880751, - "balance_loss_mlp": 1.0223434, - "epoch": 0.6956861566210732, - "flos": 25846058787840.0, - "grad_norm": 2.852635379776112, - "language_loss": 0.7431376, - "learning_rate": 8.95161837677493e-07, - "loss": 0.76451933, - "num_input_tokens_seen": 249760570, - "step": 11571, - "time_per_iteration": 2.6448328495025635 - }, - { - "auxiliary_loss_clip": 0.01094888, - "auxiliary_loss_mlp": 0.01033197, - "balance_loss_clip": 1.03635645, - "balance_loss_mlp": 1.01997066, - "epoch": 0.6957462798737412, - "flos": 15301393557120.0, - "grad_norm": 1.759907053555304, - "language_loss": 0.74442685, - "learning_rate": 8.948372164052118e-07, - "loss": 0.76570773, - "num_input_tokens_seen": 249778290, - "step": 11572, - "time_per_iteration": 2.6260786056518555 - }, - { - "auxiliary_loss_clip": 0.01089599, - "auxiliary_loss_mlp": 0.01029265, - "balance_loss_clip": 1.03659272, - "balance_loss_mlp": 1.01614022, - "epoch": 0.6958064031264092, - "flos": 36246830135040.0, - "grad_norm": 1.9436396296550662, - "language_loss": 0.7025919, - "learning_rate": 8.94512637040814e-07, - "loss": 0.72378051, - "num_input_tokens_seen": 249800925, - "step": 11573, - "time_per_iteration": 2.783256769180298 - }, - { - "auxiliary_loss_clip": 0.0109259, - "auxiliary_loss_mlp": 0.01036465, - "balance_loss_clip": 1.04023504, - "balance_loss_mlp": 1.02252948, - "epoch": 0.6958665263790771, - "flos": 19208725994880.0, - "grad_norm": 2.0706527554899137, - "language_loss": 0.75003505, - "learning_rate": 8.941880995966095e-07, - "loss": 0.77132565, - "num_input_tokens_seen": 249820500, - "step": 11574, - "time_per_iteration": 2.684457540512085 - }, - { - "auxiliary_loss_clip": 0.01077067, - "auxiliary_loss_mlp": 0.01034435, - "balance_loss_clip": 1.03447127, - "balance_loss_mlp": 1.02117276, - "epoch": 0.6959266496317451, - "flos": 21795838047360.0, - "grad_norm": 1.601224427976484, - "language_loss": 0.74403846, - "learning_rate": 8.938636040849014e-07, - "loss": 0.76515353, - "num_input_tokens_seen": 249839845, - "step": 11575, - "time_per_iteration": 2.7856502532958984 - }, - { - "auxiliary_loss_clip": 0.01102844, - "auxiliary_loss_mlp": 0.0103293, - "balance_loss_clip": 1.03945291, - "balance_loss_mlp": 1.01965618, - "epoch": 0.695986772884413, - "flos": 20558248899840.0, - "grad_norm": 1.7874437641987468, - "language_loss": 0.78887069, - "learning_rate": 8.935391505179966e-07, - "loss": 0.81022847, - "num_input_tokens_seen": 249857400, - "step": 11576, - "time_per_iteration": 2.6610217094421387 - }, - { - "auxiliary_loss_clip": 0.01068698, - "auxiliary_loss_mlp": 0.01032851, - "balance_loss_clip": 1.03670073, - "balance_loss_mlp": 1.02041745, - "epoch": 0.696046896137081, - "flos": 14936217937920.0, - "grad_norm": 2.693764444619471, - "language_loss": 0.567918, - "learning_rate": 8.932147389081985e-07, - "loss": 0.58893347, - "num_input_tokens_seen": 249871645, - "step": 11577, - "time_per_iteration": 2.666973114013672 - }, - { - "auxiliary_loss_clip": 0.01034011, - "auxiliary_loss_mlp": 0.01034025, - "balance_loss_clip": 1.03415358, - "balance_loss_mlp": 1.02244925, - "epoch": 0.696107019389749, - "flos": 30740216549760.0, - "grad_norm": 1.3814908758376254, - "language_loss": 0.77030635, - "learning_rate": 8.928903692678081e-07, - "loss": 0.79098672, - "num_input_tokens_seen": 249894215, - "step": 11578, - "time_per_iteration": 2.8858745098114014 - }, - { - "auxiliary_loss_clip": 0.01078498, - "auxiliary_loss_mlp": 0.01037884, - "balance_loss_clip": 1.03798914, - "balance_loss_mlp": 1.02474117, - "epoch": 0.696167142642417, - "flos": 20776729374720.0, - "grad_norm": 1.8210944500658799, - "language_loss": 0.79498136, - "learning_rate": 8.925660416091254e-07, - "loss": 0.81614518, - "num_input_tokens_seen": 249912850, - "step": 11579, - "time_per_iteration": 2.664579153060913 - }, - { - "auxiliary_loss_clip": 0.01072667, - "auxiliary_loss_mlp": 0.01030035, - "balance_loss_clip": 1.03569424, - "balance_loss_mlp": 1.01685047, - "epoch": 0.6962272658950849, - "flos": 22565152563840.0, - "grad_norm": 1.691551451223947, - "language_loss": 0.72261667, - "learning_rate": 8.922417559444502e-07, - "loss": 0.7436437, - "num_input_tokens_seen": 249932650, - "step": 11580, - "time_per_iteration": 2.61865496635437 - }, - { - "auxiliary_loss_clip": 0.01096209, - "auxiliary_loss_mlp": 0.0103361, - "balance_loss_clip": 1.04078209, - "balance_loss_mlp": 1.01977623, - "epoch": 0.6962873891477529, - "flos": 22200156512640.0, - "grad_norm": 2.100362129300219, - "language_loss": 0.65822804, - "learning_rate": 8.919175122860787e-07, - "loss": 0.67952627, - "num_input_tokens_seen": 249951205, - "step": 11581, - "time_per_iteration": 2.559589385986328 - }, - { - "auxiliary_loss_clip": 0.0111328, - "auxiliary_loss_mlp": 0.01033054, - "balance_loss_clip": 1.03978491, - "balance_loss_mlp": 1.02038205, - "epoch": 0.6963475124004208, - "flos": 12489695717760.0, - "grad_norm": 1.963411305318447, - "language_loss": 0.76478052, - "learning_rate": 8.915933106463056e-07, - "loss": 0.78624392, - "num_input_tokens_seen": 249967045, - "step": 11582, - "time_per_iteration": 2.4627199172973633 - }, - { - "auxiliary_loss_clip": 0.01086826, - "auxiliary_loss_mlp": 0.01032933, - "balance_loss_clip": 1.03617883, - "balance_loss_mlp": 1.0209347, - "epoch": 0.6964076356530888, - "flos": 17165085696000.0, - "grad_norm": 2.040960017039001, - "language_loss": 0.69914186, - "learning_rate": 8.91269151037425e-07, - "loss": 0.72033942, - "num_input_tokens_seen": 249984565, - "step": 11583, - "time_per_iteration": 2.5302255153656006 - }, - { - "auxiliary_loss_clip": 0.01087174, - "auxiliary_loss_mlp": 0.01035499, - "balance_loss_clip": 1.0447557, - "balance_loss_mlp": 1.02192092, - "epoch": 0.6964677589057569, - "flos": 19937317466880.0, - "grad_norm": 4.628034393643301, - "language_loss": 0.8247509, - "learning_rate": 8.909450334717301e-07, - "loss": 0.84597766, - "num_input_tokens_seen": 250004235, - "step": 11584, - "time_per_iteration": 2.6446306705474854 - }, - { - "auxiliary_loss_clip": 0.01064623, - "auxiliary_loss_mlp": 0.01039599, - "balance_loss_clip": 1.04226518, - "balance_loss_mlp": 1.02518678, - "epoch": 0.6965278821584248, - "flos": 22784064001920.0, - "grad_norm": 2.204044356312338, - "language_loss": 0.80097932, - "learning_rate": 8.906209579615107e-07, - "loss": 0.82202154, - "num_input_tokens_seen": 250017645, - "step": 11585, - "time_per_iteration": 2.739288568496704 - }, - { - "auxiliary_loss_clip": 0.01109133, - "auxiliary_loss_mlp": 0.01033696, - "balance_loss_clip": 1.03880882, - "balance_loss_mlp": 1.02158463, - "epoch": 0.6965880054110928, - "flos": 20047563285120.0, - "grad_norm": 1.6413269485286424, - "language_loss": 0.7727071, - "learning_rate": 8.90296924519055e-07, - "loss": 0.79413539, - "num_input_tokens_seen": 250037640, - "step": 11586, - "time_per_iteration": 2.624624013900757 - }, - { - "auxiliary_loss_clip": 0.01098392, - "auxiliary_loss_mlp": 0.01031126, - "balance_loss_clip": 1.03904796, - "balance_loss_mlp": 1.0188241, - "epoch": 0.6966481286637607, - "flos": 21908238681600.0, - "grad_norm": 1.6313830444330661, - "language_loss": 0.78602171, - "learning_rate": 8.899729331566519e-07, - "loss": 0.8073169, - "num_input_tokens_seen": 250056490, - "step": 11587, - "time_per_iteration": 4.355099439620972 - }, - { - "auxiliary_loss_clip": 0.01088702, - "auxiliary_loss_mlp": 0.01033313, - "balance_loss_clip": 1.04040754, - "balance_loss_mlp": 1.02037311, - "epoch": 0.6967082519164287, - "flos": 15633172506240.0, - "grad_norm": 1.7929628743992274, - "language_loss": 0.72862899, - "learning_rate": 8.896489838865857e-07, - "loss": 0.74984908, - "num_input_tokens_seen": 250074285, - "step": 11588, - "time_per_iteration": 2.609231472015381 - }, - { - "auxiliary_loss_clip": 0.01084626, - "auxiliary_loss_mlp": 0.01027673, - "balance_loss_clip": 1.03783381, - "balance_loss_mlp": 1.01608634, - "epoch": 0.6967683751690966, - "flos": 24024598064640.0, - "grad_norm": 1.8203274112017525, - "language_loss": 0.75158805, - "learning_rate": 8.893250767211413e-07, - "loss": 0.77271104, - "num_input_tokens_seen": 250093350, - "step": 11589, - "time_per_iteration": 4.13300895690918 - }, - { - "auxiliary_loss_clip": 0.01092018, - "auxiliary_loss_mlp": 0.01029816, - "balance_loss_clip": 1.0398705, - "balance_loss_mlp": 1.01764512, - "epoch": 0.6968284984217646, - "flos": 31024700265600.0, - "grad_norm": 2.003391247755446, - "language_loss": 0.63547194, - "learning_rate": 8.890012116726012e-07, - "loss": 0.6566903, - "num_input_tokens_seen": 250114170, - "step": 11590, - "time_per_iteration": 4.382747411727905 - }, - { - "auxiliary_loss_clip": 0.0099554, - "auxiliary_loss_mlp": 0.01020589, - "balance_loss_clip": 1.01743388, - "balance_loss_mlp": 1.01875329, - "epoch": 0.6968886216744326, - "flos": 67622990002560.0, - "grad_norm": 0.7509859568380568, - "language_loss": 0.61225605, - "learning_rate": 8.88677388753248e-07, - "loss": 0.63241732, - "num_input_tokens_seen": 250178250, - "step": 11591, - "time_per_iteration": 3.3300459384918213 - }, - { - "auxiliary_loss_clip": 0.01070341, - "auxiliary_loss_mlp": 0.00770828, - "balance_loss_clip": 1.04759347, - "balance_loss_mlp": 1.0002979, - "epoch": 0.6969487449271006, - "flos": 24863686750080.0, - "grad_norm": 3.0435355552778893, - "language_loss": 0.69087148, - "learning_rate": 8.883536079753582e-07, - "loss": 0.70928317, - "num_input_tokens_seen": 250198420, - "step": 11592, - "time_per_iteration": 2.8862390518188477 - }, - { - "auxiliary_loss_clip": 0.010765, - "auxiliary_loss_mlp": 0.01030332, - "balance_loss_clip": 1.03645754, - "balance_loss_mlp": 1.01758289, - "epoch": 0.6970088681797685, - "flos": 28767858791040.0, - "grad_norm": 1.559625176666528, - "language_loss": 0.6217618, - "learning_rate": 8.880298693512109e-07, - "loss": 0.64283013, - "num_input_tokens_seen": 250220650, - "step": 11593, - "time_per_iteration": 2.743360757827759 - }, - { - "auxiliary_loss_clip": 0.01085759, - "auxiliary_loss_mlp": 0.01027148, - "balance_loss_clip": 1.0385946, - "balance_loss_mlp": 1.01526248, - "epoch": 0.6970689914324365, - "flos": 27308556944640.0, - "grad_norm": 4.989009563072009, - "language_loss": 0.54315436, - "learning_rate": 8.877061728930832e-07, - "loss": 0.56428343, - "num_input_tokens_seen": 250241750, - "step": 11594, - "time_per_iteration": 2.738746404647827 - }, - { - "auxiliary_loss_clip": 0.01100892, - "auxiliary_loss_mlp": 0.01029271, - "balance_loss_clip": 1.04011106, - "balance_loss_mlp": 1.01718903, - "epoch": 0.6971291146851044, - "flos": 19136258305920.0, - "grad_norm": 1.879143273787494, - "language_loss": 0.76764494, - "learning_rate": 8.87382518613248e-07, - "loss": 0.78894663, - "num_input_tokens_seen": 250259445, - "step": 11595, - "time_per_iteration": 2.6188101768493652 - }, - { - "auxiliary_loss_clip": 0.01091633, - "auxiliary_loss_mlp": 0.00771425, - "balance_loss_clip": 1.04053104, - "balance_loss_mlp": 1.00017834, - "epoch": 0.6971892379377724, - "flos": 14610508387200.0, - "grad_norm": 3.031219644590871, - "language_loss": 0.71711326, - "learning_rate": 8.870589065239793e-07, - "loss": 0.73574388, - "num_input_tokens_seen": 250275640, - "step": 11596, - "time_per_iteration": 4.301288843154907 - }, - { - "auxiliary_loss_clip": 0.01114621, - "auxiliary_loss_mlp": 0.0103195, - "balance_loss_clip": 1.04142773, - "balance_loss_mlp": 1.0187242, - "epoch": 0.6972493611904405, - "flos": 22307457415680.0, - "grad_norm": 1.6942281967354775, - "language_loss": 0.76373446, - "learning_rate": 8.867353366375492e-07, - "loss": 0.78520012, - "num_input_tokens_seen": 250296435, - "step": 11597, - "time_per_iteration": 2.6642062664031982 - }, - { - "auxiliary_loss_clip": 0.0110086, - "auxiliary_loss_mlp": 0.01034036, - "balance_loss_clip": 1.03868103, - "balance_loss_mlp": 1.02113771, - "epoch": 0.6973094844431084, - "flos": 17420374632960.0, - "grad_norm": 1.9087026852292102, - "language_loss": 0.74286294, - "learning_rate": 8.864118089662267e-07, - "loss": 0.76421189, - "num_input_tokens_seen": 250314035, - "step": 11598, - "time_per_iteration": 2.6227927207946777 - }, - { - "auxiliary_loss_clip": 0.01097599, - "auxiliary_loss_mlp": 0.01035152, - "balance_loss_clip": 1.04099619, - "balance_loss_mlp": 1.02143049, - "epoch": 0.6973696076957764, - "flos": 27235370983680.0, - "grad_norm": 1.7329159354231054, - "language_loss": 0.89613545, - "learning_rate": 8.860883235222791e-07, - "loss": 0.91746294, - "num_input_tokens_seen": 250332995, - "step": 11599, - "time_per_iteration": 2.6820266246795654 - }, - { - "auxiliary_loss_clip": 0.0111129, - "auxiliary_loss_mlp": 0.01041078, - "balance_loss_clip": 1.04336691, - "balance_loss_mlp": 1.02599812, - "epoch": 0.6974297309484443, - "flos": 22018089450240.0, - "grad_norm": 2.876326408269763, - "language_loss": 0.69646597, - "learning_rate": 8.85764880317974e-07, - "loss": 0.71798968, - "num_input_tokens_seen": 250352120, - "step": 11600, - "time_per_iteration": 2.6357643604278564 - }, - { - "auxiliary_loss_clip": 0.01071835, - "auxiliary_loss_mlp": 0.01034549, - "balance_loss_clip": 1.03591609, - "balance_loss_mlp": 1.02162719, - "epoch": 0.6974898542011123, - "flos": 28366449327360.0, - "grad_norm": 1.8870788782898071, - "language_loss": 0.77037942, - "learning_rate": 8.854414793655771e-07, - "loss": 0.79144335, - "num_input_tokens_seen": 250371705, - "step": 11601, - "time_per_iteration": 2.767747402191162 - }, - { - "auxiliary_loss_clip": 0.01095268, - "auxiliary_loss_mlp": 0.00769859, - "balance_loss_clip": 1.03727877, - "balance_loss_mlp": 1.0001725, - "epoch": 0.6975499774537802, - "flos": 15232050351360.0, - "grad_norm": 1.9951550875439237, - "language_loss": 0.7223537, - "learning_rate": 8.851181206773508e-07, - "loss": 0.74100494, - "num_input_tokens_seen": 250390485, - "step": 11602, - "time_per_iteration": 2.7312278747558594 - }, - { - "auxiliary_loss_clip": 0.0109282, - "auxiliary_loss_mlp": 0.00770776, - "balance_loss_clip": 1.03932607, - "balance_loss_mlp": 1.00030899, - "epoch": 0.6976101007064482, - "flos": 22157422306560.0, - "grad_norm": 2.0827075826583115, - "language_loss": 0.76365876, - "learning_rate": 8.847948042655567e-07, - "loss": 0.78229469, - "num_input_tokens_seen": 250407020, - "step": 11603, - "time_per_iteration": 2.689286231994629 - }, - { - "auxiliary_loss_clip": 0.01062872, - "auxiliary_loss_mlp": 0.01031896, - "balance_loss_clip": 1.03511834, - "balance_loss_mlp": 1.01951063, - "epoch": 0.6976702239591162, - "flos": 22273522041600.0, - "grad_norm": 1.585564011892126, - "language_loss": 0.62287712, - "learning_rate": 8.844715301424557e-07, - "loss": 0.64382482, - "num_input_tokens_seen": 250425880, - "step": 11604, - "time_per_iteration": 2.7053442001342773 - }, - { - "auxiliary_loss_clip": 0.01097384, - "auxiliary_loss_mlp": 0.01033963, - "balance_loss_clip": 1.0394032, - "balance_loss_mlp": 1.01989651, - "epoch": 0.6977303472117842, - "flos": 25848608653440.0, - "grad_norm": 2.5387493951629954, - "language_loss": 0.82072401, - "learning_rate": 8.841482983203057e-07, - "loss": 0.8420375, - "num_input_tokens_seen": 250442925, - "step": 11605, - "time_per_iteration": 2.62129545211792 - }, - { - "auxiliary_loss_clip": 0.01101547, - "auxiliary_loss_mlp": 0.01035684, - "balance_loss_clip": 1.03945065, - "balance_loss_mlp": 1.02364397, - "epoch": 0.6977904704644521, - "flos": 20959586536320.0, - "grad_norm": 1.5446115393296036, - "language_loss": 0.70200372, - "learning_rate": 8.838251088113638e-07, - "loss": 0.72337604, - "num_input_tokens_seen": 250461220, - "step": 11606, - "time_per_iteration": 2.5922529697418213 - }, - { - "auxiliary_loss_clip": 0.01092847, - "auxiliary_loss_mlp": 0.01030242, - "balance_loss_clip": 1.03967309, - "balance_loss_mlp": 1.01759934, - "epoch": 0.6978505937171201, - "flos": 22055041566720.0, - "grad_norm": 2.256488814952284, - "language_loss": 0.82503331, - "learning_rate": 8.835019616278856e-07, - "loss": 0.84626418, - "num_input_tokens_seen": 250480975, - "step": 11607, - "time_per_iteration": 2.6494314670562744 - }, - { - "auxiliary_loss_clip": 0.0109393, - "auxiliary_loss_mlp": 0.01035205, - "balance_loss_clip": 1.04016328, - "balance_loss_mlp": 1.02121639, - "epoch": 0.697910716969788, - "flos": 20043720529920.0, - "grad_norm": 1.8306370219101196, - "language_loss": 0.78975695, - "learning_rate": 8.831788567821265e-07, - "loss": 0.81104833, - "num_input_tokens_seen": 250497980, - "step": 11608, - "time_per_iteration": 2.6512763500213623 - }, - { - "auxiliary_loss_clip": 0.01095127, - "auxiliary_loss_mlp": 0.0103495, - "balance_loss_clip": 1.03882265, - "balance_loss_mlp": 1.02155733, - "epoch": 0.697970840222456, - "flos": 15888245961600.0, - "grad_norm": 2.0581522063930104, - "language_loss": 0.89782465, - "learning_rate": 8.828557942863357e-07, - "loss": 0.91912538, - "num_input_tokens_seen": 250511910, - "step": 11609, - "time_per_iteration": 2.608104944229126 - }, - { - "auxiliary_loss_clip": 0.01078996, - "auxiliary_loss_mlp": 0.01028801, - "balance_loss_clip": 1.03936923, - "balance_loss_mlp": 1.0155983, - "epoch": 0.698030963475124, - "flos": 21215629658880.0, - "grad_norm": 1.5744545790773263, - "language_loss": 0.63836277, - "learning_rate": 8.82532774152765e-07, - "loss": 0.65944076, - "num_input_tokens_seen": 250531090, - "step": 11610, - "time_per_iteration": 2.743638038635254 - }, - { - "auxiliary_loss_clip": 0.0108087, - "auxiliary_loss_mlp": 0.01031692, - "balance_loss_clip": 1.03968239, - "balance_loss_mlp": 1.01942515, - "epoch": 0.698091086727792, - "flos": 33759728524800.0, - "grad_norm": 1.9401424134223804, - "language_loss": 0.84452772, - "learning_rate": 8.822097963936643e-07, - "loss": 0.86565328, - "num_input_tokens_seen": 250551565, - "step": 11611, - "time_per_iteration": 2.840013265609741 - }, - { - "auxiliary_loss_clip": 0.01102996, - "auxiliary_loss_mlp": 0.01033816, - "balance_loss_clip": 1.03944659, - "balance_loss_mlp": 1.02075076, - "epoch": 0.69815120998046, - "flos": 15887850912000.0, - "grad_norm": 1.998962318660509, - "language_loss": 0.70772141, - "learning_rate": 8.818868610212793e-07, - "loss": 0.7290895, - "num_input_tokens_seen": 250569625, - "step": 11612, - "time_per_iteration": 2.625783681869507 - }, - { - "auxiliary_loss_clip": 0.01094811, - "auxiliary_loss_mlp": 0.01031465, - "balance_loss_clip": 1.03740323, - "balance_loss_mlp": 1.01831591, - "epoch": 0.6982113332331279, - "flos": 18947044437120.0, - "grad_norm": 1.6349871835857621, - "language_loss": 0.80866611, - "learning_rate": 8.815639680478573e-07, - "loss": 0.82992887, - "num_input_tokens_seen": 250586960, - "step": 11613, - "time_per_iteration": 2.601461887359619 - }, - { - "auxiliary_loss_clip": 0.0110142, - "auxiliary_loss_mlp": 0.01035984, - "balance_loss_clip": 1.03952324, - "balance_loss_mlp": 1.02403307, - "epoch": 0.6982714564857959, - "flos": 24389594115840.0, - "grad_norm": 1.8644816544131795, - "language_loss": 0.75648409, - "learning_rate": 8.812411174856411e-07, - "loss": 0.77785814, - "num_input_tokens_seen": 250605080, - "step": 11614, - "time_per_iteration": 2.5961225032806396 - }, - { - "auxiliary_loss_clip": 0.01054504, - "auxiliary_loss_mlp": 0.01034441, - "balance_loss_clip": 1.04426599, - "balance_loss_mlp": 1.0214231, - "epoch": 0.6983315797384638, - "flos": 20083725302400.0, - "grad_norm": 2.0397367451391033, - "language_loss": 0.77191758, - "learning_rate": 8.809183093468746e-07, - "loss": 0.79280698, - "num_input_tokens_seen": 250623965, - "step": 11615, - "time_per_iteration": 2.9072482585906982 - }, - { - "auxiliary_loss_clip": 0.01083429, - "auxiliary_loss_mlp": 0.01032942, - "balance_loss_clip": 1.03927791, - "balance_loss_mlp": 1.0201925, - "epoch": 0.6983917029911318, - "flos": 13512431664000.0, - "grad_norm": 1.8809513154033768, - "language_loss": 0.73199165, - "learning_rate": 8.80595543643797e-07, - "loss": 0.75315541, - "num_input_tokens_seen": 250640675, - "step": 11616, - "time_per_iteration": 2.961540937423706 - }, - { - "auxiliary_loss_clip": 0.01114861, - "auxiliary_loss_mlp": 0.01037909, - "balance_loss_clip": 1.04299331, - "balance_loss_mlp": 1.02498102, - "epoch": 0.6984518262437998, - "flos": 22018412672640.0, - "grad_norm": 1.6387107954550246, - "language_loss": 0.84313893, - "learning_rate": 8.802728203886487e-07, - "loss": 0.86466658, - "num_input_tokens_seen": 250660295, - "step": 11617, - "time_per_iteration": 2.586671829223633 - }, - { - "auxiliary_loss_clip": 0.01074632, - "auxiliary_loss_mlp": 0.01043262, - "balance_loss_clip": 1.03665471, - "balance_loss_mlp": 1.02901638, - "epoch": 0.6985119494964678, - "flos": 18770615809920.0, - "grad_norm": 2.7737807419222102, - "language_loss": 0.59687322, - "learning_rate": 8.799501395936682e-07, - "loss": 0.61805212, - "num_input_tokens_seen": 250678155, - "step": 11618, - "time_per_iteration": 2.6617705821990967 - }, - { - "auxiliary_loss_clip": 0.01090766, - "auxiliary_loss_mlp": 0.01037543, - "balance_loss_clip": 1.04022956, - "balance_loss_mlp": 1.02521658, - "epoch": 0.6985720727491357, - "flos": 22382834106240.0, - "grad_norm": 1.6558560034629248, - "language_loss": 0.83071142, - "learning_rate": 8.796275012710903e-07, - "loss": 0.85199451, - "num_input_tokens_seen": 250697230, - "step": 11619, - "time_per_iteration": 2.6859943866729736 - }, - { - "auxiliary_loss_clip": 0.0109875, - "auxiliary_loss_mlp": 0.01029593, - "balance_loss_clip": 1.04005098, - "balance_loss_mlp": 1.01863778, - "epoch": 0.6986321960018037, - "flos": 39567884785920.0, - "grad_norm": 1.8289485555178766, - "language_loss": 0.67044997, - "learning_rate": 8.793049054331494e-07, - "loss": 0.69173336, - "num_input_tokens_seen": 250719865, - "step": 11620, - "time_per_iteration": 2.7397263050079346 - }, - { - "auxiliary_loss_clip": 0.01062849, - "auxiliary_loss_mlp": 0.01030294, - "balance_loss_clip": 1.03714263, - "balance_loss_mlp": 1.01732397, - "epoch": 0.6986923192544716, - "flos": 17967725055360.0, - "grad_norm": 2.0135534332411353, - "language_loss": 0.72860134, - "learning_rate": 8.789823520920794e-07, - "loss": 0.7495327, - "num_input_tokens_seen": 250736565, - "step": 11621, - "time_per_iteration": 2.731579303741455 - }, - { - "auxiliary_loss_clip": 0.01060219, - "auxiliary_loss_mlp": 0.01043603, - "balance_loss_clip": 1.03684866, - "balance_loss_mlp": 1.02971494, - "epoch": 0.6987524425071396, - "flos": 25594325297280.0, - "grad_norm": 2.5508212623422573, - "language_loss": 0.68065464, - "learning_rate": 8.7865984126011e-07, - "loss": 0.70169282, - "num_input_tokens_seen": 250757235, - "step": 11622, - "time_per_iteration": 2.7399957180023193 - }, - { - "auxiliary_loss_clip": 0.01044503, - "auxiliary_loss_mlp": 0.01029486, - "balance_loss_clip": 1.03589344, - "balance_loss_mlp": 1.01729143, - "epoch": 0.6988125657598077, - "flos": 17530081747200.0, - "grad_norm": 4.399012607705736, - "language_loss": 0.62462044, - "learning_rate": 8.783373729494721e-07, - "loss": 0.64536035, - "num_input_tokens_seen": 250775585, - "step": 11623, - "time_per_iteration": 2.712000846862793 - }, - { - "auxiliary_loss_clip": 0.01115272, - "auxiliary_loss_mlp": 0.01029213, - "balance_loss_clip": 1.03893709, - "balance_loss_mlp": 1.01590955, - "epoch": 0.6988726890124756, - "flos": 39165721136640.0, - "grad_norm": 1.8283272495674747, - "language_loss": 0.6077522, - "learning_rate": 8.780149471723932e-07, - "loss": 0.62919706, - "num_input_tokens_seen": 250795725, - "step": 11624, - "time_per_iteration": 2.765336275100708 - }, - { - "auxiliary_loss_clip": 0.01103178, - "auxiliary_loss_mlp": 0.01043279, - "balance_loss_clip": 1.0379796, - "balance_loss_mlp": 1.02959347, - "epoch": 0.6989328122651436, - "flos": 20193468330240.0, - "grad_norm": 3.187254618002262, - "language_loss": 0.78135574, - "learning_rate": 8.776925639411017e-07, - "loss": 0.80282032, - "num_input_tokens_seen": 250814555, - "step": 11625, - "time_per_iteration": 2.673025608062744 - }, - { - "auxiliary_loss_clip": 0.01074244, - "auxiliary_loss_mlp": 0.01033485, - "balance_loss_clip": 1.03602779, - "balance_loss_mlp": 1.0217849, - "epoch": 0.6989929355178115, - "flos": 21834873152640.0, - "grad_norm": 2.0895075471451903, - "language_loss": 0.65869164, - "learning_rate": 8.773702232678188e-07, - "loss": 0.67976898, - "num_input_tokens_seen": 250833105, - "step": 11626, - "time_per_iteration": 4.503946542739868 - }, - { - "auxiliary_loss_clip": 0.01092456, - "auxiliary_loss_mlp": 0.00770949, - "balance_loss_clip": 1.03971887, - "balance_loss_mlp": 1.000265, - "epoch": 0.6990530587704795, - "flos": 26322880855680.0, - "grad_norm": 1.5700426870038287, - "language_loss": 0.70198143, - "learning_rate": 8.770479251647697e-07, - "loss": 0.72061551, - "num_input_tokens_seen": 250852570, - "step": 11627, - "time_per_iteration": 2.7615082263946533 - }, - { - "auxiliary_loss_clip": 0.01110072, - "auxiliary_loss_mlp": 0.01029931, - "balance_loss_clip": 1.04070234, - "balance_loss_mlp": 1.01854658, - "epoch": 0.6991131820231474, - "flos": 19828975069440.0, - "grad_norm": 1.7298548393631112, - "language_loss": 0.6256994, - "learning_rate": 8.767256696441768e-07, - "loss": 0.64709944, - "num_input_tokens_seen": 250870500, - "step": 11628, - "time_per_iteration": 2.5734152793884277 - }, - { - "auxiliary_loss_clip": 0.01103325, - "auxiliary_loss_mlp": 0.01035699, - "balance_loss_clip": 1.03856647, - "balance_loss_mlp": 1.02237749, - "epoch": 0.6991733052758154, - "flos": 33984817102080.0, - "grad_norm": 2.290690432267753, - "language_loss": 0.67708141, - "learning_rate": 8.764034567182581e-07, - "loss": 0.69847167, - "num_input_tokens_seen": 250892745, - "step": 11629, - "time_per_iteration": 5.866469621658325 - }, - { - "auxiliary_loss_clip": 0.01112912, - "auxiliary_loss_mlp": 0.01036564, - "balance_loss_clip": 1.04074121, - "balance_loss_mlp": 1.02318311, - "epoch": 0.6992334285284834, - "flos": 15633136592640.0, - "grad_norm": 1.543023133812331, - "language_loss": 0.72312945, - "learning_rate": 8.760812863992337e-07, - "loss": 0.74462426, - "num_input_tokens_seen": 250910225, - "step": 11630, - "time_per_iteration": 2.657487392425537 - }, - { - "auxiliary_loss_clip": 0.01113352, - "auxiliary_loss_mlp": 0.01034612, - "balance_loss_clip": 1.04170883, - "balance_loss_mlp": 1.02198827, - "epoch": 0.6992935517811514, - "flos": 21726279360000.0, - "grad_norm": 1.59875756731347, - "language_loss": 0.73932934, - "learning_rate": 8.757591586993196e-07, - "loss": 0.76080894, - "num_input_tokens_seen": 250929715, - "step": 11631, - "time_per_iteration": 2.5861480236053467 - }, - { - "auxiliary_loss_clip": 0.01104832, - "auxiliary_loss_mlp": 0.01034444, - "balance_loss_clip": 1.0415566, - "balance_loss_mlp": 1.02083039, - "epoch": 0.6993536750338193, - "flos": 20115254465280.0, - "grad_norm": 2.102391376159487, - "language_loss": 0.89547968, - "learning_rate": 8.7543707363073e-07, - "loss": 0.91687244, - "num_input_tokens_seen": 250944230, - "step": 11632, - "time_per_iteration": 2.590348482131958 - }, - { - "auxiliary_loss_clip": 0.01094827, - "auxiliary_loss_mlp": 0.01040482, - "balance_loss_clip": 1.04256976, - "balance_loss_mlp": 1.02795899, - "epoch": 0.6994137982864873, - "flos": 22010547594240.0, - "grad_norm": 1.8034038956889087, - "language_loss": 0.80041152, - "learning_rate": 8.751150312056792e-07, - "loss": 0.82176459, - "num_input_tokens_seen": 250961865, - "step": 11633, - "time_per_iteration": 2.681643486022949 - }, - { - "auxiliary_loss_clip": 0.011161, - "auxiliary_loss_mlp": 0.01037638, - "balance_loss_clip": 1.04005051, - "balance_loss_mlp": 1.02334523, - "epoch": 0.6994739215391552, - "flos": 25519020433920.0, - "grad_norm": 1.9182761585422314, - "language_loss": 0.67464936, - "learning_rate": 8.747930314363794e-07, - "loss": 0.69618672, - "num_input_tokens_seen": 250982025, - "step": 11634, - "time_per_iteration": 2.604487419128418 - }, - { - "auxiliary_loss_clip": 0.01010044, - "auxiliary_loss_mlp": 0.01002813, - "balance_loss_clip": 1.01407039, - "balance_loss_mlp": 1.00143051, - "epoch": 0.6995340447918232, - "flos": 59128357691520.0, - "grad_norm": 0.6847666166760457, - "language_loss": 0.53152555, - "learning_rate": 8.744710743350412e-07, - "loss": 0.5516541, - "num_input_tokens_seen": 251046900, - "step": 11635, - "time_per_iteration": 4.9191200733184814 - }, - { - "auxiliary_loss_clip": 0.01086524, - "auxiliary_loss_mlp": 0.01034637, - "balance_loss_clip": 1.03990149, - "balance_loss_mlp": 1.02167284, - "epoch": 0.6995941680444913, - "flos": 17967832796160.0, - "grad_norm": 1.5113357083257617, - "language_loss": 0.81950343, - "learning_rate": 8.741491599138726e-07, - "loss": 0.84071505, - "num_input_tokens_seen": 251065050, - "step": 11636, - "time_per_iteration": 2.6814749240875244 - }, - { - "auxiliary_loss_clip": 0.01114034, - "auxiliary_loss_mlp": 0.01030856, - "balance_loss_clip": 1.04003048, - "balance_loss_mlp": 1.01799953, - "epoch": 0.6996542912971592, - "flos": 21980095839360.0, - "grad_norm": 4.210307970710786, - "language_loss": 0.83255941, - "learning_rate": 8.738272881850801e-07, - "loss": 0.85400826, - "num_input_tokens_seen": 251083355, - "step": 11637, - "time_per_iteration": 2.6101019382476807 - }, - { - "auxiliary_loss_clip": 0.01063351, - "auxiliary_loss_mlp": 0.01039835, - "balance_loss_clip": 1.0357244, - "balance_loss_mlp": 1.0266149, - "epoch": 0.6997144145498272, - "flos": 11686158518400.0, - "grad_norm": 1.88143316282286, - "language_loss": 0.68318653, - "learning_rate": 8.735054591608704e-07, - "loss": 0.70421839, - "num_input_tokens_seen": 251096420, - "step": 11638, - "time_per_iteration": 2.757967233657837 - }, - { - "auxiliary_loss_clip": 0.0110744, - "auxiliary_loss_mlp": 0.01034716, - "balance_loss_clip": 1.04054809, - "balance_loss_mlp": 1.02038121, - "epoch": 0.6997745378024951, - "flos": 29607162958080.0, - "grad_norm": 2.0225103573047334, - "language_loss": 0.77908248, - "learning_rate": 8.731836728534459e-07, - "loss": 0.80050403, - "num_input_tokens_seen": 251115410, - "step": 11639, - "time_per_iteration": 2.7171573638916016 - }, - { - "auxiliary_loss_clip": 0.01088431, - "auxiliary_loss_mlp": 0.01044388, - "balance_loss_clip": 1.03905129, - "balance_loss_mlp": 1.03095889, - "epoch": 0.6998346610551631, - "flos": 20886616056960.0, - "grad_norm": 2.0145862528244542, - "language_loss": 0.82033116, - "learning_rate": 8.728619292750093e-07, - "loss": 0.84165937, - "num_input_tokens_seen": 251133530, - "step": 11640, - "time_per_iteration": 2.746412515640259 - }, - { - "auxiliary_loss_clip": 0.01079412, - "auxiliary_loss_mlp": 0.01033499, - "balance_loss_clip": 1.03800678, - "balance_loss_mlp": 1.02089286, - "epoch": 0.699894784307831, - "flos": 27163046949120.0, - "grad_norm": 1.988818239892088, - "language_loss": 0.75212121, - "learning_rate": 8.725402284377619e-07, - "loss": 0.77325034, - "num_input_tokens_seen": 251153985, - "step": 11641, - "time_per_iteration": 2.789306879043579 - }, - { - "auxiliary_loss_clip": 0.01089337, - "auxiliary_loss_mlp": 0.01024848, - "balance_loss_clip": 1.03791595, - "balance_loss_mlp": 1.01126993, - "epoch": 0.699954907560499, - "flos": 20923640000640.0, - "grad_norm": 1.9133228013475547, - "language_loss": 0.77589947, - "learning_rate": 8.722185703539022e-07, - "loss": 0.79704136, - "num_input_tokens_seen": 251173225, - "step": 11642, - "time_per_iteration": 2.6469504833221436 - }, - { - "auxiliary_loss_clip": 0.01110134, - "auxiliary_loss_mlp": 0.01039612, - "balance_loss_clip": 1.04202175, - "balance_loss_mlp": 1.02436519, - "epoch": 0.700015030813167, - "flos": 28657792540800.0, - "grad_norm": 1.9777967243613577, - "language_loss": 0.74846154, - "learning_rate": 8.718969550356266e-07, - "loss": 0.76995897, - "num_input_tokens_seen": 251192485, - "step": 11643, - "time_per_iteration": 2.6794352531433105 - }, - { - "auxiliary_loss_clip": 0.01079698, - "auxiliary_loss_mlp": 0.01030119, - "balance_loss_clip": 1.03809059, - "balance_loss_mlp": 1.01665401, - "epoch": 0.700075154065835, - "flos": 29205286617600.0, - "grad_norm": 1.5319096526083835, - "language_loss": 0.60467082, - "learning_rate": 8.715753824951315e-07, - "loss": 0.62576902, - "num_input_tokens_seen": 251214965, - "step": 11644, - "time_per_iteration": 2.7573509216308594 - }, - { - "auxiliary_loss_clip": 0.01098759, - "auxiliary_loss_mlp": 0.0103069, - "balance_loss_clip": 1.03691936, - "balance_loss_mlp": 1.01848316, - "epoch": 0.7001352773185029, - "flos": 23112431159040.0, - "grad_norm": 2.3431210800913203, - "language_loss": 0.81582069, - "learning_rate": 8.712538527446119e-07, - "loss": 0.83711517, - "num_input_tokens_seen": 251234500, - "step": 11645, - "time_per_iteration": 2.6731204986572266 - }, - { - "auxiliary_loss_clip": 0.01102676, - "auxiliary_loss_mlp": 0.01031812, - "balance_loss_clip": 1.03974915, - "balance_loss_mlp": 1.01880574, - "epoch": 0.7001954005711709, - "flos": 21322858734720.0, - "grad_norm": 2.1357575492399143, - "language_loss": 0.68504727, - "learning_rate": 8.709323657962584e-07, - "loss": 0.70639217, - "num_input_tokens_seen": 251254360, - "step": 11646, - "time_per_iteration": 2.622621774673462 - }, - { - "auxiliary_loss_clip": 0.01096745, - "auxiliary_loss_mlp": 0.01045056, - "balance_loss_clip": 1.03817129, - "balance_loss_mlp": 1.03125119, - "epoch": 0.7002555238238388, - "flos": 24535822383360.0, - "grad_norm": 1.6406688140332686, - "language_loss": 0.71264708, - "learning_rate": 8.706109216622635e-07, - "loss": 0.73406506, - "num_input_tokens_seen": 251274790, - "step": 11647, - "time_per_iteration": 2.627837896347046 - }, - { - "auxiliary_loss_clip": 0.01105019, - "auxiliary_loss_mlp": 0.01036652, - "balance_loss_clip": 1.041466, - "balance_loss_mlp": 1.02333069, - "epoch": 0.7003156470765068, - "flos": 39056552726400.0, - "grad_norm": 1.5459607229268986, - "language_loss": 0.71446347, - "learning_rate": 8.702895203548155e-07, - "loss": 0.7358802, - "num_input_tokens_seen": 251296275, - "step": 11648, - "time_per_iteration": 2.753802537918091 - }, - { - "auxiliary_loss_clip": 0.01057301, - "auxiliary_loss_mlp": 0.01037599, - "balance_loss_clip": 1.03298402, - "balance_loss_mlp": 1.02418768, - "epoch": 0.7003757703291749, - "flos": 28804092635520.0, - "grad_norm": 1.667578697289853, - "language_loss": 0.77163005, - "learning_rate": 8.699681618861014e-07, - "loss": 0.79257905, - "num_input_tokens_seen": 251317375, - "step": 11649, - "time_per_iteration": 2.7761147022247314 - }, - { - "auxiliary_loss_clip": 0.01090081, - "auxiliary_loss_mlp": 0.01033039, - "balance_loss_clip": 1.03838015, - "balance_loss_mlp": 1.02049243, - "epoch": 0.7004358935818428, - "flos": 15953854152960.0, - "grad_norm": 2.229815779289175, - "language_loss": 0.787054, - "learning_rate": 8.69646846268308e-07, - "loss": 0.80828524, - "num_input_tokens_seen": 251333570, - "step": 11650, - "time_per_iteration": 2.651338815689087 - }, - { - "auxiliary_loss_clip": 0.01087246, - "auxiliary_loss_mlp": 0.01026359, - "balance_loss_clip": 1.03717887, - "balance_loss_mlp": 1.01436639, - "epoch": 0.7004960168345108, - "flos": 20411984718720.0, - "grad_norm": 2.805466583174802, - "language_loss": 0.78653586, - "learning_rate": 8.693255735136194e-07, - "loss": 0.8076719, - "num_input_tokens_seen": 251351070, - "step": 11651, - "time_per_iteration": 2.650684118270874 - }, - { - "auxiliary_loss_clip": 0.01078764, - "auxiliary_loss_mlp": 0.01048293, - "balance_loss_clip": 1.03785074, - "balance_loss_mlp": 1.03431594, - "epoch": 0.7005561400871787, - "flos": 17347547808000.0, - "grad_norm": 1.6343941081799256, - "language_loss": 0.69484842, - "learning_rate": 8.690043436342198e-07, - "loss": 0.71611905, - "num_input_tokens_seen": 251370005, - "step": 11652, - "time_per_iteration": 2.807304859161377 - }, - { - "auxiliary_loss_clip": 0.01104104, - "auxiliary_loss_mlp": 0.0103143, - "balance_loss_clip": 1.04046094, - "balance_loss_mlp": 1.01811981, - "epoch": 0.7006162633398467, - "flos": 25302120157440.0, - "grad_norm": 1.3561275324415532, - "language_loss": 0.74232221, - "learning_rate": 8.686831566422874e-07, - "loss": 0.7636776, - "num_input_tokens_seen": 251391210, - "step": 11653, - "time_per_iteration": 2.696967601776123 - }, - { - "auxiliary_loss_clip": 0.01087115, - "auxiliary_loss_mlp": 0.01035907, - "balance_loss_clip": 1.03951633, - "balance_loss_mlp": 1.02182245, - "epoch": 0.7006763865925146, - "flos": 20668997508480.0, - "grad_norm": 2.1100512473261808, - "language_loss": 0.70994234, - "learning_rate": 8.68362012550003e-07, - "loss": 0.73117256, - "num_input_tokens_seen": 251411505, - "step": 11654, - "time_per_iteration": 2.6838555335998535 - }, - { - "auxiliary_loss_clip": 0.01066217, - "auxiliary_loss_mlp": 0.01033137, - "balance_loss_clip": 1.03629136, - "balance_loss_mlp": 1.0182364, - "epoch": 0.7007365098451827, - "flos": 20046449963520.0, - "grad_norm": 2.5073875946500093, - "language_loss": 0.73771894, - "learning_rate": 8.680409113695453e-07, - "loss": 0.75871241, - "num_input_tokens_seen": 251428975, - "step": 11655, - "time_per_iteration": 2.7410359382629395 - }, - { - "auxiliary_loss_clip": 0.01111257, - "auxiliary_loss_mlp": 0.01039302, - "balance_loss_clip": 1.04240656, - "balance_loss_mlp": 1.02404356, - "epoch": 0.7007966330978506, - "flos": 20777375819520.0, - "grad_norm": 1.9005607339243875, - "language_loss": 0.70418394, - "learning_rate": 8.677198531130889e-07, - "loss": 0.72568953, - "num_input_tokens_seen": 251446940, - "step": 11656, - "time_per_iteration": 2.731491804122925 - }, - { - "auxiliary_loss_clip": 0.01066256, - "auxiliary_loss_mlp": 0.01032163, - "balance_loss_clip": 1.03605461, - "balance_loss_mlp": 1.0202893, - "epoch": 0.7008567563505186, - "flos": 29638189330560.0, - "grad_norm": 1.6373792531081708, - "language_loss": 0.7814554, - "learning_rate": 8.673988377928092e-07, - "loss": 0.80243957, - "num_input_tokens_seen": 251466205, - "step": 11657, - "time_per_iteration": 2.77717924118042 - }, - { - "auxiliary_loss_clip": 0.01118749, - "auxiliary_loss_mlp": 0.010372, - "balance_loss_clip": 1.04163647, - "balance_loss_mlp": 1.02257895, - "epoch": 0.7009168796031865, - "flos": 17092007475840.0, - "grad_norm": 1.9768682726826163, - "language_loss": 0.78330362, - "learning_rate": 8.670778654208797e-07, - "loss": 0.8048631, - "num_input_tokens_seen": 251484820, - "step": 11658, - "time_per_iteration": 2.6049365997314453 - }, - { - "auxiliary_loss_clip": 0.01086248, - "auxiliary_loss_mlp": 0.0103209, - "balance_loss_clip": 1.03589261, - "balance_loss_mlp": 1.01928139, - "epoch": 0.7009770028558545, - "flos": 20448972748800.0, - "grad_norm": 5.565674692031433, - "language_loss": 0.82623971, - "learning_rate": 8.667569360094713e-07, - "loss": 0.84742308, - "num_input_tokens_seen": 251502670, - "step": 11659, - "time_per_iteration": 2.686923027038574 - }, - { - "auxiliary_loss_clip": 0.01069607, - "auxiliary_loss_mlp": 0.01030844, - "balance_loss_clip": 1.03661668, - "balance_loss_mlp": 1.0180831, - "epoch": 0.7010371261085224, - "flos": 19245139407360.0, - "grad_norm": 1.865164954565192, - "language_loss": 0.6914413, - "learning_rate": 8.664360495707526e-07, - "loss": 0.71244586, - "num_input_tokens_seen": 251521630, - "step": 11660, - "time_per_iteration": 2.6798696517944336 - }, - { - "auxiliary_loss_clip": 0.01114876, - "auxiliary_loss_mlp": 0.01039123, - "balance_loss_clip": 1.03920841, - "balance_loss_mlp": 1.02499104, - "epoch": 0.7010972493611904, - "flos": 22127581082880.0, - "grad_norm": 1.6974567931874, - "language_loss": 0.81309623, - "learning_rate": 8.661152061168924e-07, - "loss": 0.83463621, - "num_input_tokens_seen": 251540105, - "step": 11661, - "time_per_iteration": 2.665506601333618 - }, - { - "auxiliary_loss_clip": 0.01100544, - "auxiliary_loss_mlp": 0.01036779, - "balance_loss_clip": 1.03771257, - "balance_loss_mlp": 1.02428651, - "epoch": 0.7011573726138585, - "flos": 31391132860800.0, - "grad_norm": 1.6549167062780274, - "language_loss": 0.79250038, - "learning_rate": 8.657944056600579e-07, - "loss": 0.81387359, - "num_input_tokens_seen": 251560530, - "step": 11662, - "time_per_iteration": 2.747738838195801 - }, - { - "auxiliary_loss_clip": 0.01099278, - "auxiliary_loss_mlp": 0.01034891, - "balance_loss_clip": 1.03757107, - "balance_loss_mlp": 1.02009749, - "epoch": 0.7012174958665264, - "flos": 18150582216960.0, - "grad_norm": 1.8518490996849224, - "language_loss": 0.83547205, - "learning_rate": 8.654736482124134e-07, - "loss": 0.85681379, - "num_input_tokens_seen": 251577930, - "step": 11663, - "time_per_iteration": 2.631399631500244 - }, - { - "auxiliary_loss_clip": 0.01021926, - "auxiliary_loss_mlp": 0.00999736, - "balance_loss_clip": 1.00981212, - "balance_loss_mlp": 0.99871653, - "epoch": 0.7012776191191944, - "flos": 60651256567680.0, - "grad_norm": 0.8199034033651936, - "language_loss": 0.5377062, - "learning_rate": 8.651529337861209e-07, - "loss": 0.55792284, - "num_input_tokens_seen": 251638820, - "step": 11664, - "time_per_iteration": 3.219939708709717 - }, - { - "auxiliary_loss_clip": 0.01091352, - "auxiliary_loss_mlp": 0.01036957, - "balance_loss_clip": 1.03675961, - "balance_loss_mlp": 1.02283645, - "epoch": 0.7013377423718623, - "flos": 27198598435200.0, - "grad_norm": 25.234897895477353, - "language_loss": 0.78593969, - "learning_rate": 8.64832262393344e-07, - "loss": 0.80722272, - "num_input_tokens_seen": 251658070, - "step": 11665, - "time_per_iteration": 2.7333061695098877 - }, - { - "auxiliary_loss_clip": 0.01097626, - "auxiliary_loss_mlp": 0.01033659, - "balance_loss_clip": 1.03675759, - "balance_loss_mlp": 1.02039695, - "epoch": 0.7013978656245303, - "flos": 16543543731840.0, - "grad_norm": 3.3099112213098576, - "language_loss": 0.76706922, - "learning_rate": 8.645116340462404e-07, - "loss": 0.78838205, - "num_input_tokens_seen": 251671575, - "step": 11666, - "time_per_iteration": 4.164456844329834 - }, - { - "auxiliary_loss_clip": 0.0109964, - "auxiliary_loss_mlp": 0.01033563, - "balance_loss_clip": 1.03881526, - "balance_loss_mlp": 1.02059937, - "epoch": 0.7014579888771982, - "flos": 23143780753920.0, - "grad_norm": 1.9172711089554313, - "language_loss": 0.81507015, - "learning_rate": 8.641910487569695e-07, - "loss": 0.83640218, - "num_input_tokens_seen": 251689350, - "step": 11667, - "time_per_iteration": 2.6507012844085693 - }, - { - "auxiliary_loss_clip": 0.01080493, - "auxiliary_loss_mlp": 0.01039617, - "balance_loss_clip": 1.03758526, - "balance_loss_mlp": 1.02586019, - "epoch": 0.7015181121298663, - "flos": 25082095397760.0, - "grad_norm": 4.617945846615331, - "language_loss": 0.65072989, - "learning_rate": 8.638705065376879e-07, - "loss": 0.67193091, - "num_input_tokens_seen": 251704635, - "step": 11668, - "time_per_iteration": 4.238234758377075 - }, - { - "auxiliary_loss_clip": 0.01094365, - "auxiliary_loss_mlp": 0.0102871, - "balance_loss_clip": 1.03865385, - "balance_loss_mlp": 1.01505494, - "epoch": 0.7015782353825342, - "flos": 23327894891520.0, - "grad_norm": 2.259598520998094, - "language_loss": 0.7661069, - "learning_rate": 8.635500074005519e-07, - "loss": 0.78733766, - "num_input_tokens_seen": 251723035, - "step": 11669, - "time_per_iteration": 4.344635248184204 - }, - { - "auxiliary_loss_clip": 0.01013949, - "auxiliary_loss_mlp": 0.0100684, - "balance_loss_clip": 1.00989032, - "balance_loss_mlp": 1.00561166, - "epoch": 0.7016383586352022, - "flos": 70397161107840.0, - "grad_norm": 0.6970325216386312, - "language_loss": 0.54508567, - "learning_rate": 8.632295513577122e-07, - "loss": 0.56529355, - "num_input_tokens_seen": 251791630, - "step": 11670, - "time_per_iteration": 3.3269011974334717 - }, - { - "auxiliary_loss_clip": 0.01088398, - "auxiliary_loss_mlp": 0.01044417, - "balance_loss_clip": 1.04069412, - "balance_loss_mlp": 1.03119707, - "epoch": 0.7016984818878701, - "flos": 19792274348160.0, - "grad_norm": 1.7508124659386841, - "language_loss": 0.81738812, - "learning_rate": 8.629091384213218e-07, - "loss": 0.83871627, - "num_input_tokens_seen": 251809840, - "step": 11671, - "time_per_iteration": 2.622065544128418 - }, - { - "auxiliary_loss_clip": 0.0110729, - "auxiliary_loss_mlp": 0.01033835, - "balance_loss_clip": 1.04194474, - "balance_loss_mlp": 1.02070975, - "epoch": 0.7017586051405381, - "flos": 12896923184640.0, - "grad_norm": 2.274862917429984, - "language_loss": 0.75504148, - "learning_rate": 8.625887686035313e-07, - "loss": 0.77645272, - "num_input_tokens_seen": 251827550, - "step": 11672, - "time_per_iteration": 2.6540980339050293 - }, - { - "auxiliary_loss_clip": 0.01096652, - "auxiliary_loss_mlp": 0.01034604, - "balance_loss_clip": 1.03700793, - "balance_loss_mlp": 1.02045953, - "epoch": 0.701818728393206, - "flos": 18332828847360.0, - "grad_norm": 1.5989851774558104, - "language_loss": 0.87145984, - "learning_rate": 8.622684419164883e-07, - "loss": 0.8927725, - "num_input_tokens_seen": 251844880, - "step": 11673, - "time_per_iteration": 2.8490025997161865 - }, - { - "auxiliary_loss_clip": 0.01096229, - "auxiliary_loss_mlp": 0.01029311, - "balance_loss_clip": 1.0356524, - "balance_loss_mlp": 1.01583493, - "epoch": 0.701878851645874, - "flos": 17384212615680.0, - "grad_norm": 1.8214270702877817, - "language_loss": 0.73174304, - "learning_rate": 8.619481583723399e-07, - "loss": 0.75299847, - "num_input_tokens_seen": 251861025, - "step": 11674, - "time_per_iteration": 2.679823160171509 - }, - { - "auxiliary_loss_clip": 0.01096759, - "auxiliary_loss_mlp": 0.00769911, - "balance_loss_clip": 1.04201114, - "balance_loss_mlp": 1.00022173, - "epoch": 0.701938974898542, - "flos": 23915501481600.0, - "grad_norm": 1.622926679018359, - "language_loss": 0.72171724, - "learning_rate": 8.616279179832329e-07, - "loss": 0.74038392, - "num_input_tokens_seen": 251880175, - "step": 11675, - "time_per_iteration": 4.312408447265625 - }, - { - "auxiliary_loss_clip": 0.01074264, - "auxiliary_loss_mlp": 0.01030849, - "balance_loss_clip": 1.03631043, - "balance_loss_mlp": 1.01713443, - "epoch": 0.70199909815121, - "flos": 21795586652160.0, - "grad_norm": 2.2219041729549143, - "language_loss": 0.51501888, - "learning_rate": 8.613077207613078e-07, - "loss": 0.53606999, - "num_input_tokens_seen": 251899005, - "step": 11676, - "time_per_iteration": 2.6962332725524902 - }, - { - "auxiliary_loss_clip": 0.01010504, - "auxiliary_loss_mlp": 0.0075156, - "balance_loss_clip": 1.00856614, - "balance_loss_mlp": 0.99960405, - "epoch": 0.702059221403878, - "flos": 71715047109120.0, - "grad_norm": 0.7294989296672769, - "language_loss": 0.59194738, - "learning_rate": 8.609875667187079e-07, - "loss": 0.60956806, - "num_input_tokens_seen": 251966790, - "step": 11677, - "time_per_iteration": 3.283904552459717 - }, - { - "auxiliary_loss_clip": 0.01100162, - "auxiliary_loss_mlp": 0.01038032, - "balance_loss_clip": 1.03780138, - "balance_loss_mlp": 1.02260053, - "epoch": 0.7021193446565459, - "flos": 28111052649600.0, - "grad_norm": 2.3043069619356333, - "language_loss": 0.62869537, - "learning_rate": 8.606674558675737e-07, - "loss": 0.65007722, - "num_input_tokens_seen": 251989315, - "step": 11678, - "time_per_iteration": 2.683986186981201 - }, - { - "auxiliary_loss_clip": 0.01114626, - "auxiliary_loss_mlp": 0.01034461, - "balance_loss_clip": 1.04092705, - "balance_loss_mlp": 1.02130055, - "epoch": 0.7021794679092139, - "flos": 22924905229440.0, - "grad_norm": 1.616922175472628, - "language_loss": 0.79195565, - "learning_rate": 8.603473882200444e-07, - "loss": 0.81344652, - "num_input_tokens_seen": 252006620, - "step": 11679, - "time_per_iteration": 2.6574866771698 - }, - { - "auxiliary_loss_clip": 0.01084266, - "auxiliary_loss_mlp": 0.01048047, - "balance_loss_clip": 1.03683782, - "balance_loss_mlp": 1.03429675, - "epoch": 0.7022395911618818, - "flos": 18077827219200.0, - "grad_norm": 2.0679583940680746, - "language_loss": 0.70934772, - "learning_rate": 8.600273637882567e-07, - "loss": 0.73067081, - "num_input_tokens_seen": 252024570, - "step": 11680, - "time_per_iteration": 2.7358908653259277 - }, - { - "auxiliary_loss_clip": 0.01074807, - "auxiliary_loss_mlp": 0.010398, - "balance_loss_clip": 1.03587687, - "balance_loss_mlp": 1.02517891, - "epoch": 0.7022997144145499, - "flos": 16034294661120.0, - "grad_norm": 1.6247051825758914, - "language_loss": 0.74976349, - "learning_rate": 8.597073825843446e-07, - "loss": 0.77090955, - "num_input_tokens_seen": 252042775, - "step": 11681, - "time_per_iteration": 2.774574041366577 - }, - { - "auxiliary_loss_clip": 0.01094616, - "auxiliary_loss_mlp": 0.0103624, - "balance_loss_clip": 1.03856039, - "balance_loss_mlp": 1.0238483, - "epoch": 0.7023598376672178, - "flos": 26468678160000.0, - "grad_norm": 1.575109913537797, - "language_loss": 0.76865822, - "learning_rate": 8.593874446204434e-07, - "loss": 0.78996682, - "num_input_tokens_seen": 252063690, - "step": 11682, - "time_per_iteration": 2.7486395835876465 - }, - { - "auxiliary_loss_clip": 0.01082555, - "auxiliary_loss_mlp": 0.00772032, - "balance_loss_clip": 1.03884804, - "balance_loss_mlp": 1.00019991, - "epoch": 0.7024199609198858, - "flos": 17055917285760.0, - "grad_norm": 2.008069790408466, - "language_loss": 0.737746, - "learning_rate": 8.590675499086841e-07, - "loss": 0.75629187, - "num_input_tokens_seen": 252080335, - "step": 11683, - "time_per_iteration": 2.744171142578125 - }, - { - "auxiliary_loss_clip": 0.01079915, - "auxiliary_loss_mlp": 0.01035548, - "balance_loss_clip": 1.03894246, - "balance_loss_mlp": 1.02157617, - "epoch": 0.7024800841725537, - "flos": 25849039616640.0, - "grad_norm": 1.8760496906578064, - "language_loss": 0.71592307, - "learning_rate": 8.587476984611976e-07, - "loss": 0.73707771, - "num_input_tokens_seen": 252101075, - "step": 11684, - "time_per_iteration": 2.88992977142334 - }, - { - "auxiliary_loss_clip": 0.01104368, - "auxiliary_loss_mlp": 0.01036296, - "balance_loss_clip": 1.03960109, - "balance_loss_mlp": 1.02242017, - "epoch": 0.7025402074252217, - "flos": 23513014609920.0, - "grad_norm": 1.7667874173043265, - "language_loss": 0.71676773, - "learning_rate": 8.584278902901128e-07, - "loss": 0.73817438, - "num_input_tokens_seen": 252120510, - "step": 11685, - "time_per_iteration": 2.7373046875 - }, - { - "auxiliary_loss_clip": 0.01101099, - "auxiliary_loss_mlp": 0.01033615, - "balance_loss_clip": 1.03761411, - "balance_loss_mlp": 1.02074599, - "epoch": 0.7026003306778896, - "flos": 20150985519360.0, - "grad_norm": 2.3305068980612695, - "language_loss": 0.84660101, - "learning_rate": 8.581081254075582e-07, - "loss": 0.86794817, - "num_input_tokens_seen": 252137590, - "step": 11686, - "time_per_iteration": 2.6728014945983887 - }, - { - "auxiliary_loss_clip": 0.0101853, - "auxiliary_loss_mlp": 0.01001761, - "balance_loss_clip": 1.00980115, - "balance_loss_mlp": 1.00045574, - "epoch": 0.7026604539305576, - "flos": 64772400712320.0, - "grad_norm": 1.2905405547920359, - "language_loss": 0.69901091, - "learning_rate": 8.577884038256566e-07, - "loss": 0.71921384, - "num_input_tokens_seen": 252199830, - "step": 11687, - "time_per_iteration": 3.3107638359069824 - }, - { - "auxiliary_loss_clip": 0.01076554, - "auxiliary_loss_mlp": 0.01032291, - "balance_loss_clip": 1.03496408, - "balance_loss_mlp": 1.01832569, - "epoch": 0.7027205771832256, - "flos": 21871466133120.0, - "grad_norm": 1.932037995437553, - "language_loss": 0.7684707, - "learning_rate": 8.574687255565329e-07, - "loss": 0.78955913, - "num_input_tokens_seen": 252217200, - "step": 11688, - "time_per_iteration": 2.7459444999694824 - }, - { - "auxiliary_loss_clip": 0.0111428, - "auxiliary_loss_mlp": 0.0103517, - "balance_loss_clip": 1.04030085, - "balance_loss_mlp": 1.02199149, - "epoch": 0.7027807004358936, - "flos": 23367791923200.0, - "grad_norm": 2.3717928399741117, - "language_loss": 0.68631124, - "learning_rate": 8.571490906123107e-07, - "loss": 0.70780575, - "num_input_tokens_seen": 252236105, - "step": 11689, - "time_per_iteration": 2.615769624710083 - }, - { - "auxiliary_loss_clip": 0.01092147, - "auxiliary_loss_mlp": 0.01039659, - "balance_loss_clip": 1.03881717, - "balance_loss_mlp": 1.02628398, - "epoch": 0.7028408236885616, - "flos": 15304266645120.0, - "grad_norm": 2.2678896966391293, - "language_loss": 0.79724276, - "learning_rate": 8.568294990051086e-07, - "loss": 0.81856084, - "num_input_tokens_seen": 252253315, - "step": 11690, - "time_per_iteration": 2.752448081970215 - }, - { - "auxiliary_loss_clip": 0.01114987, - "auxiliary_loss_mlp": 0.01035613, - "balance_loss_clip": 1.04160511, - "balance_loss_mlp": 1.02232075, - "epoch": 0.7029009469412295, - "flos": 22018197191040.0, - "grad_norm": 1.8737745525801948, - "language_loss": 0.76049984, - "learning_rate": 8.56509950747047e-07, - "loss": 0.78200579, - "num_input_tokens_seen": 252272765, - "step": 11691, - "time_per_iteration": 2.6119184494018555 - }, - { - "auxiliary_loss_clip": 0.0108875, - "auxiliary_loss_mlp": 0.01032492, - "balance_loss_clip": 1.03811002, - "balance_loss_mlp": 1.01972437, - "epoch": 0.7029610701938975, - "flos": 21835519597440.0, - "grad_norm": 9.903733322151682, - "language_loss": 0.81749791, - "learning_rate": 8.561904458502429e-07, - "loss": 0.83871031, - "num_input_tokens_seen": 252290510, - "step": 11692, - "time_per_iteration": 2.69521427154541 - }, - { - "auxiliary_loss_clip": 0.0108957, - "auxiliary_loss_mlp": 0.01032565, - "balance_loss_clip": 1.03853154, - "balance_loss_mlp": 1.01875424, - "epoch": 0.7030211934465654, - "flos": 19135647774720.0, - "grad_norm": 1.5579267825971022, - "language_loss": 0.76325333, - "learning_rate": 8.558709843268111e-07, - "loss": 0.78447467, - "num_input_tokens_seen": 252309365, - "step": 11693, - "time_per_iteration": 2.6727678775787354 - }, - { - "auxiliary_loss_clip": 0.01089511, - "auxiliary_loss_mlp": 0.01037963, - "balance_loss_clip": 1.04170704, - "balance_loss_mlp": 1.02457595, - "epoch": 0.7030813166992335, - "flos": 38546010766080.0, - "grad_norm": 1.4223920880081815, - "language_loss": 0.68617809, - "learning_rate": 8.55551566188866e-07, - "loss": 0.70745289, - "num_input_tokens_seen": 252333010, - "step": 11694, - "time_per_iteration": 2.858931541442871 - }, - { - "auxiliary_loss_clip": 0.0111374, - "auxiliary_loss_mlp": 0.01034494, - "balance_loss_clip": 1.03941369, - "balance_loss_mlp": 1.02133918, - "epoch": 0.7031414399519014, - "flos": 14720897859840.0, - "grad_norm": 2.09445306191262, - "language_loss": 0.75264633, - "learning_rate": 8.552321914485203e-07, - "loss": 0.77412868, - "num_input_tokens_seen": 252351330, - "step": 11695, - "time_per_iteration": 2.631002902984619 - }, - { - "auxiliary_loss_clip": 0.01092725, - "auxiliary_loss_mlp": 0.01042661, - "balance_loss_clip": 1.04262757, - "balance_loss_mlp": 1.02838016, - "epoch": 0.7032015632045694, - "flos": 14027247342720.0, - "grad_norm": 2.095793582645169, - "language_loss": 0.73874116, - "learning_rate": 8.549128601178852e-07, - "loss": 0.760095, - "num_input_tokens_seen": 252369580, - "step": 11696, - "time_per_iteration": 2.7669694423675537 - }, - { - "auxiliary_loss_clip": 0.01097157, - "auxiliary_loss_mlp": 0.01032154, - "balance_loss_clip": 1.0388881, - "balance_loss_mlp": 1.01825988, - "epoch": 0.7032616864572373, - "flos": 27637175496960.0, - "grad_norm": 1.7096974428239413, - "language_loss": 0.75290072, - "learning_rate": 8.545935722090693e-07, - "loss": 0.77419376, - "num_input_tokens_seen": 252390525, - "step": 11697, - "time_per_iteration": 2.763500928878784 - }, - { - "auxiliary_loss_clip": 0.01063183, - "auxiliary_loss_mlp": 0.01044698, - "balance_loss_clip": 1.03909528, - "balance_loss_mlp": 1.02815211, - "epoch": 0.7033218097099053, - "flos": 17967294092160.0, - "grad_norm": 1.8043330597848055, - "language_loss": 0.81064868, - "learning_rate": 8.542743277341793e-07, - "loss": 0.8317275, - "num_input_tokens_seen": 252407470, - "step": 11698, - "time_per_iteration": 2.869485378265381 - }, - { - "auxiliary_loss_clip": 0.01087007, - "auxiliary_loss_mlp": 0.01041792, - "balance_loss_clip": 1.03696036, - "balance_loss_mlp": 1.02600873, - "epoch": 0.7033819329625732, - "flos": 19501721233920.0, - "grad_norm": 1.788545707110732, - "language_loss": 0.84702611, - "learning_rate": 8.539551267053222e-07, - "loss": 0.86831409, - "num_input_tokens_seen": 252427025, - "step": 11699, - "time_per_iteration": 2.664696216583252 - }, - { - "auxiliary_loss_clip": 0.01097664, - "auxiliary_loss_mlp": 0.01034973, - "balance_loss_clip": 1.03910065, - "balance_loss_mlp": 1.02029788, - "epoch": 0.7034420562152413, - "flos": 23987645948160.0, - "grad_norm": 2.037813318278341, - "language_loss": 0.78878331, - "learning_rate": 8.53635969134601e-07, - "loss": 0.81010973, - "num_input_tokens_seen": 252445410, - "step": 11700, - "time_per_iteration": 2.6491341590881348 - }, - { - "auxiliary_loss_clip": 0.01104199, - "auxiliary_loss_mlp": 0.01030571, - "balance_loss_clip": 1.04006886, - "balance_loss_mlp": 1.01655197, - "epoch": 0.7035021794679092, - "flos": 35043427756800.0, - "grad_norm": 1.9483737724471917, - "language_loss": 0.74603212, - "learning_rate": 8.533168550341186e-07, - "loss": 0.76737982, - "num_input_tokens_seen": 252463905, - "step": 11701, - "time_per_iteration": 2.75663423538208 - }, - { - "auxiliary_loss_clip": 0.0110842, - "auxiliary_loss_mlp": 0.01031935, - "balance_loss_clip": 1.04136193, - "balance_loss_mlp": 1.01701057, - "epoch": 0.7035623027205772, - "flos": 10997428164480.0, - "grad_norm": 2.4684106998326913, - "language_loss": 0.84602612, - "learning_rate": 8.529977844159769e-07, - "loss": 0.86742967, - "num_input_tokens_seen": 252478655, - "step": 11702, - "time_per_iteration": 2.691843032836914 - }, - { - "auxiliary_loss_clip": 0.01114954, - "auxiliary_loss_mlp": 0.01040813, - "balance_loss_clip": 1.03995621, - "balance_loss_mlp": 1.02679968, - "epoch": 0.7036224259732452, - "flos": 23623727304960.0, - "grad_norm": 2.401456792119983, - "language_loss": 0.61207104, - "learning_rate": 8.526787572922738e-07, - "loss": 0.63362873, - "num_input_tokens_seen": 252498740, - "step": 11703, - "time_per_iteration": 2.6257216930389404 - }, - { - "auxiliary_loss_clip": 0.01112246, - "auxiliary_loss_mlp": 0.01030363, - "balance_loss_clip": 1.03811026, - "balance_loss_mlp": 1.01622462, - "epoch": 0.7036825492259131, - "flos": 31686175175040.0, - "grad_norm": 1.8586997056929888, - "language_loss": 0.61509585, - "learning_rate": 8.523597736751067e-07, - "loss": 0.63652194, - "num_input_tokens_seen": 252517800, - "step": 11704, - "time_per_iteration": 2.6846559047698975 - }, - { - "auxiliary_loss_clip": 0.01096047, - "auxiliary_loss_mlp": 0.01032429, - "balance_loss_clip": 1.0392369, - "balance_loss_mlp": 1.02010882, - "epoch": 0.7037426724785811, - "flos": 30192866127360.0, - "grad_norm": 1.6136715073341366, - "language_loss": 0.70614809, - "learning_rate": 8.520408335765719e-07, - "loss": 0.72743285, - "num_input_tokens_seen": 252539620, - "step": 11705, - "time_per_iteration": 4.335879564285278 - }, - { - "auxiliary_loss_clip": 0.01103119, - "auxiliary_loss_mlp": 0.01036985, - "balance_loss_clip": 1.04067218, - "balance_loss_mlp": 1.02324617, - "epoch": 0.703802795731249, - "flos": 24311523905280.0, - "grad_norm": 1.8826981498859905, - "language_loss": 0.61822981, - "learning_rate": 8.517219370087645e-07, - "loss": 0.63963085, - "num_input_tokens_seen": 252557300, - "step": 11706, - "time_per_iteration": 2.593494176864624 - }, - { - "auxiliary_loss_clip": 0.01106671, - "auxiliary_loss_mlp": 0.01030439, - "balance_loss_clip": 1.04123783, - "balance_loss_mlp": 1.01777911, - "epoch": 0.7038629189839171, - "flos": 22528954632960.0, - "grad_norm": 2.04643987571859, - "language_loss": 0.67915642, - "learning_rate": 8.514030839837756e-07, - "loss": 0.70052749, - "num_input_tokens_seen": 252576715, - "step": 11707, - "time_per_iteration": 2.7215147018432617 - }, - { - "auxiliary_loss_clip": 0.01112969, - "auxiliary_loss_mlp": 0.01030922, - "balance_loss_clip": 1.04011774, - "balance_loss_mlp": 1.01814246, - "epoch": 0.703923042236585, - "flos": 26250484993920.0, - "grad_norm": 1.7862705599659854, - "language_loss": 0.76583481, - "learning_rate": 8.510842745136974e-07, - "loss": 0.78727371, - "num_input_tokens_seen": 252596190, - "step": 11708, - "time_per_iteration": 4.144139051437378 - }, - { - "auxiliary_loss_clip": 0.01090944, - "auxiliary_loss_mlp": 0.01034476, - "balance_loss_clip": 1.03818655, - "balance_loss_mlp": 1.02149391, - "epoch": 0.703983165489253, - "flos": 19390254353280.0, - "grad_norm": 1.9126700939118615, - "language_loss": 0.72069716, - "learning_rate": 8.50765508610619e-07, - "loss": 0.74195135, - "num_input_tokens_seen": 252613410, - "step": 11709, - "time_per_iteration": 4.317174911499023 - }, - { - "auxiliary_loss_clip": 0.01103216, - "auxiliary_loss_mlp": 0.01032072, - "balance_loss_clip": 1.04039192, - "balance_loss_mlp": 1.01939988, - "epoch": 0.7040432887419209, - "flos": 16683630773760.0, - "grad_norm": 2.4201158088388337, - "language_loss": 0.78757358, - "learning_rate": 8.504467862866267e-07, - "loss": 0.80892646, - "num_input_tokens_seen": 252629150, - "step": 11710, - "time_per_iteration": 2.6521589756011963 - }, - { - "auxiliary_loss_clip": 0.01106607, - "auxiliary_loss_mlp": 0.01035061, - "balance_loss_clip": 1.04086101, - "balance_loss_mlp": 1.02094674, - "epoch": 0.7041034119945889, - "flos": 21141402203520.0, - "grad_norm": 1.5478201961623483, - "language_loss": 0.77396274, - "learning_rate": 8.501281075538076e-07, - "loss": 0.7953794, - "num_input_tokens_seen": 252648225, - "step": 11711, - "time_per_iteration": 2.673774242401123 - }, - { - "auxiliary_loss_clip": 0.01077655, - "auxiliary_loss_mlp": 0.01031566, - "balance_loss_clip": 1.03709733, - "balance_loss_mlp": 1.01935935, - "epoch": 0.7041635352472568, - "flos": 16910299549440.0, - "grad_norm": 2.4253841205918945, - "language_loss": 0.74240053, - "learning_rate": 8.498094724242457e-07, - "loss": 0.7634927, - "num_input_tokens_seen": 252665380, - "step": 11712, - "time_per_iteration": 2.7232208251953125 - }, - { - "auxiliary_loss_clip": 0.00994093, - "auxiliary_loss_mlp": 0.01000365, - "balance_loss_clip": 1.0117116, - "balance_loss_mlp": 0.99926871, - "epoch": 0.7042236584999249, - "flos": 71681219475840.0, - "grad_norm": 0.8868310854542714, - "language_loss": 0.64613295, - "learning_rate": 8.494908809100247e-07, - "loss": 0.66607749, - "num_input_tokens_seen": 252727950, - "step": 11713, - "time_per_iteration": 3.285946846008301 - }, - { - "auxiliary_loss_clip": 0.01098435, - "auxiliary_loss_mlp": 0.01032407, - "balance_loss_clip": 1.03652644, - "balance_loss_mlp": 1.01949096, - "epoch": 0.7042837817525928, - "flos": 28658187590400.0, - "grad_norm": 2.013493705916732, - "language_loss": 0.73046267, - "learning_rate": 8.49172333023225e-07, - "loss": 0.75177109, - "num_input_tokens_seen": 252746770, - "step": 11714, - "time_per_iteration": 4.236090898513794 - }, - { - "auxiliary_loss_clip": 0.01087938, - "auxiliary_loss_mlp": 0.0077181, - "balance_loss_clip": 1.03839374, - "balance_loss_mlp": 1.00026381, - "epoch": 0.7043439050052608, - "flos": 19753562465280.0, - "grad_norm": 1.7093963248736088, - "language_loss": 0.79507661, - "learning_rate": 8.488538287759248e-07, - "loss": 0.81367409, - "num_input_tokens_seen": 252765610, - "step": 11715, - "time_per_iteration": 2.6581244468688965 - }, - { - "auxiliary_loss_clip": 0.01084772, - "auxiliary_loss_mlp": 0.0104235, - "balance_loss_clip": 1.03780401, - "balance_loss_mlp": 1.02811062, - "epoch": 0.7044040282579288, - "flos": 11538529620480.0, - "grad_norm": 2.525582703515887, - "language_loss": 0.71628869, - "learning_rate": 8.485353681802037e-07, - "loss": 0.73755985, - "num_input_tokens_seen": 252781610, - "step": 11716, - "time_per_iteration": 2.6553633213043213 - }, - { - "auxiliary_loss_clip": 0.0108292, - "auxiliary_loss_mlp": 0.01035705, - "balance_loss_clip": 1.04328799, - "balance_loss_mlp": 1.02210903, - "epoch": 0.7044641515105967, - "flos": 33656126722560.0, - "grad_norm": 1.9767167849127631, - "language_loss": 0.66507739, - "learning_rate": 8.482169512481358e-07, - "loss": 0.68626368, - "num_input_tokens_seen": 252800600, - "step": 11717, - "time_per_iteration": 2.8526103496551514 - }, - { - "auxiliary_loss_clip": 0.01115695, - "auxiliary_loss_mlp": 0.01028921, - "balance_loss_clip": 1.04154098, - "balance_loss_mlp": 1.01575446, - "epoch": 0.7045242747632647, - "flos": 26723859356160.0, - "grad_norm": 1.3833751412057287, - "language_loss": 0.74381793, - "learning_rate": 8.478985779917967e-07, - "loss": 0.76526403, - "num_input_tokens_seen": 252822310, - "step": 11718, - "time_per_iteration": 2.6526429653167725 - }, - { - "auxiliary_loss_clip": 0.0110132, - "auxiliary_loss_mlp": 0.01033293, - "balance_loss_clip": 1.039011, - "balance_loss_mlp": 1.02069247, - "epoch": 0.7045843980159326, - "flos": 26797655848320.0, - "grad_norm": 2.395703855617911, - "language_loss": 0.79719883, - "learning_rate": 8.475802484232606e-07, - "loss": 0.81854498, - "num_input_tokens_seen": 252842355, - "step": 11719, - "time_per_iteration": 2.66690731048584 - }, - { - "auxiliary_loss_clip": 0.01105187, - "auxiliary_loss_mlp": 0.01041375, - "balance_loss_clip": 1.04220223, - "balance_loss_mlp": 1.02782118, - "epoch": 0.7046445212686007, - "flos": 41574824363520.0, - "grad_norm": 1.733178571450649, - "language_loss": 0.65760505, - "learning_rate": 8.472619625545951e-07, - "loss": 0.67907059, - "num_input_tokens_seen": 252866785, - "step": 11720, - "time_per_iteration": 2.808574914932251 - }, - { - "auxiliary_loss_clip": 0.0109618, - "auxiliary_loss_mlp": 0.0103147, - "balance_loss_clip": 1.04084325, - "balance_loss_mlp": 1.01776671, - "epoch": 0.7047046445212686, - "flos": 15560166113280.0, - "grad_norm": 2.168655366214879, - "language_loss": 0.80443633, - "learning_rate": 8.46943720397872e-07, - "loss": 0.8257128, - "num_input_tokens_seen": 252881870, - "step": 11721, - "time_per_iteration": 2.7525858879089355 - }, - { - "auxiliary_loss_clip": 0.01001442, - "auxiliary_loss_mlp": 0.00998843, - "balance_loss_clip": 1.0093838, - "balance_loss_mlp": 0.99760932, - "epoch": 0.7047647677739366, - "flos": 70410269571840.0, - "grad_norm": 0.7632832348458642, - "language_loss": 0.64800274, - "learning_rate": 8.466255219651582e-07, - "loss": 0.66800559, - "num_input_tokens_seen": 252951300, - "step": 11722, - "time_per_iteration": 3.413194179534912 - }, - { - "auxiliary_loss_clip": 0.010923, - "auxiliary_loss_mlp": 0.01034777, - "balance_loss_clip": 1.03915524, - "balance_loss_mlp": 1.02249277, - "epoch": 0.7048248910266045, - "flos": 23660032976640.0, - "grad_norm": 1.7290381066238394, - "language_loss": 0.65823722, - "learning_rate": 8.463073672685211e-07, - "loss": 0.67950797, - "num_input_tokens_seen": 252971400, - "step": 11723, - "time_per_iteration": 2.668208360671997 - }, - { - "auxiliary_loss_clip": 0.01083668, - "auxiliary_loss_mlp": 0.01031266, - "balance_loss_clip": 1.03798199, - "balance_loss_mlp": 1.01790833, - "epoch": 0.7048850142792725, - "flos": 21397158017280.0, - "grad_norm": 1.6832847250880896, - "language_loss": 0.80916411, - "learning_rate": 8.459892563200235e-07, - "loss": 0.8303135, - "num_input_tokens_seen": 252989475, - "step": 11724, - "time_per_iteration": 2.7347311973571777 - }, - { - "auxiliary_loss_clip": 0.01104162, - "auxiliary_loss_mlp": 0.01035842, - "balance_loss_clip": 1.03983295, - "balance_loss_mlp": 1.02229953, - "epoch": 0.7049451375319404, - "flos": 21648101408640.0, - "grad_norm": 1.7664791855809618, - "language_loss": 0.7323097, - "learning_rate": 8.456711891317296e-07, - "loss": 0.75370979, - "num_input_tokens_seen": 253007220, - "step": 11725, - "time_per_iteration": 2.654641628265381 - }, - { - "auxiliary_loss_clip": 0.01066947, - "auxiliary_loss_mlp": 0.01039139, - "balance_loss_clip": 1.03276384, - "balance_loss_mlp": 1.02378523, - "epoch": 0.7050052607846085, - "flos": 14866802904960.0, - "grad_norm": 2.572506179811176, - "language_loss": 0.78501201, - "learning_rate": 8.453531657156998e-07, - "loss": 0.80607283, - "num_input_tokens_seen": 253025410, - "step": 11726, - "time_per_iteration": 2.7266314029693604 - }, - { - "auxiliary_loss_clip": 0.01093418, - "auxiliary_loss_mlp": 0.01038851, - "balance_loss_clip": 1.03780484, - "balance_loss_mlp": 1.02567255, - "epoch": 0.7050653840372764, - "flos": 19241763528960.0, - "grad_norm": 2.180783221878792, - "language_loss": 0.70615113, - "learning_rate": 8.450351860839931e-07, - "loss": 0.72747386, - "num_input_tokens_seen": 253043305, - "step": 11727, - "time_per_iteration": 2.6545214653015137 - }, - { - "auxiliary_loss_clip": 0.0110651, - "auxiliary_loss_mlp": 0.00770675, - "balance_loss_clip": 1.03787398, - "balance_loss_mlp": 1.00010693, - "epoch": 0.7051255072899444, - "flos": 27780422935680.0, - "grad_norm": 1.6658028000538345, - "language_loss": 0.68843293, - "learning_rate": 8.44717250248668e-07, - "loss": 0.7072047, - "num_input_tokens_seen": 253062790, - "step": 11728, - "time_per_iteration": 2.7480993270874023 - }, - { - "auxiliary_loss_clip": 0.01080875, - "auxiliary_loss_mlp": 0.00771073, - "balance_loss_clip": 1.03778076, - "balance_loss_mlp": 1.00025976, - "epoch": 0.7051856305426124, - "flos": 27892033470720.0, - "grad_norm": 1.6755629992737011, - "language_loss": 0.73434365, - "learning_rate": 8.443993582217803e-07, - "loss": 0.75286305, - "num_input_tokens_seen": 253082055, - "step": 11729, - "time_per_iteration": 2.762924909591675 - }, - { - "auxiliary_loss_clip": 0.01101913, - "auxiliary_loss_mlp": 0.01033478, - "balance_loss_clip": 1.04192829, - "balance_loss_mlp": 1.01929784, - "epoch": 0.7052457537952803, - "flos": 25043563082880.0, - "grad_norm": 1.5899274462099697, - "language_loss": 0.77899098, - "learning_rate": 8.440815100153862e-07, - "loss": 0.80034494, - "num_input_tokens_seen": 253102575, - "step": 11730, - "time_per_iteration": 2.6950738430023193 - }, - { - "auxiliary_loss_clip": 0.01113225, - "auxiliary_loss_mlp": 0.01040637, - "balance_loss_clip": 1.0385406, - "balance_loss_mlp": 1.02747023, - "epoch": 0.7053058770479483, - "flos": 21871717528320.0, - "grad_norm": 1.9830603646297307, - "language_loss": 0.62745416, - "learning_rate": 8.437637056415359e-07, - "loss": 0.64899278, - "num_input_tokens_seen": 253121290, - "step": 11731, - "time_per_iteration": 2.588109016418457 - }, - { - "auxiliary_loss_clip": 0.01058245, - "auxiliary_loss_mlp": 0.01032885, - "balance_loss_clip": 1.0364511, - "balance_loss_mlp": 1.01818633, - "epoch": 0.7053660003006162, - "flos": 16398716094720.0, - "grad_norm": 1.9950553391193862, - "language_loss": 0.74299359, - "learning_rate": 8.434459451122815e-07, - "loss": 0.76390493, - "num_input_tokens_seen": 253139720, - "step": 11732, - "time_per_iteration": 2.6930272579193115 - }, - { - "auxiliary_loss_clip": 0.01102907, - "auxiliary_loss_mlp": 0.01034149, - "balance_loss_clip": 1.04072869, - "balance_loss_mlp": 1.0211252, - "epoch": 0.7054261235532843, - "flos": 22711560399360.0, - "grad_norm": 1.5877435619523543, - "language_loss": 0.71181738, - "learning_rate": 8.431282284396735e-07, - "loss": 0.73318791, - "num_input_tokens_seen": 253160250, - "step": 11733, - "time_per_iteration": 2.6677498817443848 - }, - { - "auxiliary_loss_clip": 0.01077175, - "auxiliary_loss_mlp": 0.01034316, - "balance_loss_clip": 1.03793108, - "balance_loss_mlp": 1.02100611, - "epoch": 0.7054862468059522, - "flos": 13589711775360.0, - "grad_norm": 1.9036570704847118, - "language_loss": 0.73538595, - "learning_rate": 8.428105556357583e-07, - "loss": 0.75650084, - "num_input_tokens_seen": 253178710, - "step": 11734, - "time_per_iteration": 2.660600185394287 - }, - { - "auxiliary_loss_clip": 0.01080202, - "auxiliary_loss_mlp": 0.01045919, - "balance_loss_clip": 1.03852844, - "balance_loss_mlp": 1.02982593, - "epoch": 0.7055463700586202, - "flos": 15880704105600.0, - "grad_norm": 2.4268970887811685, - "language_loss": 0.6969564, - "learning_rate": 8.424929267125829e-07, - "loss": 0.71821761, - "num_input_tokens_seen": 253194805, - "step": 11735, - "time_per_iteration": 2.6809842586517334 - }, - { - "auxiliary_loss_clip": 0.01084684, - "auxiliary_loss_mlp": 0.01039756, - "balance_loss_clip": 1.03542256, - "balance_loss_mlp": 1.02454448, - "epoch": 0.7056064933112881, - "flos": 23076161400960.0, - "grad_norm": 2.1216259487349918, - "language_loss": 0.72249383, - "learning_rate": 8.421753416821933e-07, - "loss": 0.74373823, - "num_input_tokens_seen": 253213895, - "step": 11736, - "time_per_iteration": 2.7173984050750732 - }, - { - "auxiliary_loss_clip": 0.01093397, - "auxiliary_loss_mlp": 0.01028697, - "balance_loss_clip": 1.03913403, - "balance_loss_mlp": 1.01618576, - "epoch": 0.7056666165639561, - "flos": 24057168721920.0, - "grad_norm": 1.8303036629192204, - "language_loss": 0.68867785, - "learning_rate": 8.41857800556629e-07, - "loss": 0.70989877, - "num_input_tokens_seen": 253231620, - "step": 11737, - "time_per_iteration": 2.7358338832855225 - }, - { - "auxiliary_loss_clip": 0.01082807, - "auxiliary_loss_mlp": 0.01039951, - "balance_loss_clip": 1.04143405, - "balance_loss_mlp": 1.02608716, - "epoch": 0.705726739816624, - "flos": 17493237371520.0, - "grad_norm": 2.0769803991045848, - "language_loss": 0.67978764, - "learning_rate": 8.415403033479332e-07, - "loss": 0.70101517, - "num_input_tokens_seen": 253249590, - "step": 11738, - "time_per_iteration": 2.7042016983032227 - }, - { - "auxiliary_loss_clip": 0.01114904, - "auxiliary_loss_mlp": 0.01037563, - "balance_loss_clip": 1.04074073, - "balance_loss_mlp": 1.02349627, - "epoch": 0.7057868630692921, - "flos": 51350426472960.0, - "grad_norm": 1.923264037015028, - "language_loss": 0.75011027, - "learning_rate": 8.41222850068145e-07, - "loss": 0.77163494, - "num_input_tokens_seen": 253273870, - "step": 11739, - "time_per_iteration": 2.9135007858276367 - }, - { - "auxiliary_loss_clip": 0.01084303, - "auxiliary_loss_mlp": 0.00770885, - "balance_loss_clip": 1.03611875, - "balance_loss_mlp": 1.00016105, - "epoch": 0.70584698632196, - "flos": 26102963836800.0, - "grad_norm": 1.6688083494293096, - "language_loss": 0.71504521, - "learning_rate": 8.409054407293032e-07, - "loss": 0.7335971, - "num_input_tokens_seen": 253293720, - "step": 11740, - "time_per_iteration": 2.7146854400634766 - }, - { - "auxiliary_loss_clip": 0.01081608, - "auxiliary_loss_mlp": 0.01029234, - "balance_loss_clip": 1.03897929, - "balance_loss_mlp": 1.01712787, - "epoch": 0.705907109574628, - "flos": 21543134889600.0, - "grad_norm": 1.6427960474243053, - "language_loss": 0.81782758, - "learning_rate": 8.405880753434434e-07, - "loss": 0.83893597, - "num_input_tokens_seen": 253313700, - "step": 11741, - "time_per_iteration": 2.7265563011169434 - }, - { - "auxiliary_loss_clip": 0.01091272, - "auxiliary_loss_mlp": 0.01033057, - "balance_loss_clip": 1.03843784, - "balance_loss_mlp": 1.01918101, - "epoch": 0.705967232827296, - "flos": 22710842127360.0, - "grad_norm": 1.7910600093045685, - "language_loss": 0.77970088, - "learning_rate": 8.402707539225993e-07, - "loss": 0.80094415, - "num_input_tokens_seen": 253332425, - "step": 11742, - "time_per_iteration": 2.744617462158203 - }, - { - "auxiliary_loss_clip": 0.01119104, - "auxiliary_loss_mlp": 0.01034721, - "balance_loss_clip": 1.04196119, - "balance_loss_mlp": 1.02049327, - "epoch": 0.7060273560799639, - "flos": 28691225124480.0, - "grad_norm": 1.5236916078434313, - "language_loss": 0.64199877, - "learning_rate": 8.39953476478805e-07, - "loss": 0.66353697, - "num_input_tokens_seen": 253353620, - "step": 11743, - "time_per_iteration": 2.6587469577789307 - }, - { - "auxiliary_loss_clip": 0.01087403, - "auxiliary_loss_mlp": 0.01037922, - "balance_loss_clip": 1.03589988, - "balance_loss_mlp": 1.02340233, - "epoch": 0.7060874793326319, - "flos": 15706178899200.0, - "grad_norm": 1.87431643891437, - "language_loss": 0.65725398, - "learning_rate": 8.396362430240902e-07, - "loss": 0.67850721, - "num_input_tokens_seen": 253370930, - "step": 11744, - "time_per_iteration": 2.651118278503418 - }, - { - "auxiliary_loss_clip": 0.01100616, - "auxiliary_loss_mlp": 0.01034598, - "balance_loss_clip": 1.03792346, - "balance_loss_mlp": 1.02068591, - "epoch": 0.7061476025852998, - "flos": 21506757390720.0, - "grad_norm": 1.7030797660453072, - "language_loss": 0.63694406, - "learning_rate": 8.393190535704857e-07, - "loss": 0.65829617, - "num_input_tokens_seen": 253389810, - "step": 11745, - "time_per_iteration": 4.299595832824707 - }, - { - "auxiliary_loss_clip": 0.01077796, - "auxiliary_loss_mlp": 0.01034157, - "balance_loss_clip": 1.03395259, - "balance_loss_mlp": 1.02075148, - "epoch": 0.7062077258379679, - "flos": 28181832399360.0, - "grad_norm": 1.8209890328260383, - "language_loss": 0.71859854, - "learning_rate": 8.390019081300188e-07, - "loss": 0.73971808, - "num_input_tokens_seen": 253408685, - "step": 11746, - "time_per_iteration": 2.736166000366211 - }, - { - "auxiliary_loss_clip": 0.01057236, - "auxiliary_loss_mlp": 0.01033707, - "balance_loss_clip": 1.03863013, - "balance_loss_mlp": 1.02068353, - "epoch": 0.7062678490906358, - "flos": 27853680723840.0, - "grad_norm": 1.467695639044188, - "language_loss": 0.79042476, - "learning_rate": 8.386848067147175e-07, - "loss": 0.81133419, - "num_input_tokens_seen": 253429685, - "step": 11747, - "time_per_iteration": 4.4075751304626465 - }, - { - "auxiliary_loss_clip": 0.01099667, - "auxiliary_loss_mlp": 0.0103307, - "balance_loss_clip": 1.03842044, - "balance_loss_mlp": 1.02088141, - "epoch": 0.7063279723433038, - "flos": 23184862934400.0, - "grad_norm": 1.8618400783249733, - "language_loss": 0.65024734, - "learning_rate": 8.383677493366031e-07, - "loss": 0.67157471, - "num_input_tokens_seen": 253448260, - "step": 11748, - "time_per_iteration": 4.207107305526733 - }, - { - "auxiliary_loss_clip": 0.01067203, - "auxiliary_loss_mlp": 0.01036794, - "balance_loss_clip": 1.03578413, - "balance_loss_mlp": 1.02373433, - "epoch": 0.7063880955959717, - "flos": 20188655907840.0, - "grad_norm": 2.0013045839241337, - "language_loss": 0.79624116, - "learning_rate": 8.380507360077003e-07, - "loss": 0.81728113, - "num_input_tokens_seen": 253467725, - "step": 11749, - "time_per_iteration": 2.8175888061523438 - }, - { - "auxiliary_loss_clip": 0.01033129, - "auxiliary_loss_mlp": 0.01002303, - "balance_loss_clip": 1.01007652, - "balance_loss_mlp": 1.001194, - "epoch": 0.7064482188486397, - "flos": 63668182763520.0, - "grad_norm": 0.7992789353290359, - "language_loss": 0.54014421, - "learning_rate": 8.377337667400304e-07, - "loss": 0.56049848, - "num_input_tokens_seen": 253526940, - "step": 11750, - "time_per_iteration": 3.154975175857544 - }, - { - "auxiliary_loss_clip": 0.01092158, - "auxiliary_loss_mlp": 0.01037282, - "balance_loss_clip": 1.03940368, - "balance_loss_mlp": 1.0240972, - "epoch": 0.7065083421013076, - "flos": 25191227894400.0, - "grad_norm": 1.6870361976430077, - "language_loss": 0.78464556, - "learning_rate": 8.37416841545612e-07, - "loss": 0.80593991, - "num_input_tokens_seen": 253546160, - "step": 11751, - "time_per_iteration": 2.732318878173828 - }, - { - "auxiliary_loss_clip": 0.01074241, - "auxiliary_loss_mlp": 0.01032324, - "balance_loss_clip": 1.03658986, - "balance_loss_mlp": 1.02024841, - "epoch": 0.7065684653539757, - "flos": 22893699288960.0, - "grad_norm": 2.1027740219742928, - "language_loss": 0.67992324, - "learning_rate": 8.370999604364634e-07, - "loss": 0.70098889, - "num_input_tokens_seen": 253565505, - "step": 11752, - "time_per_iteration": 2.810976505279541 - }, - { - "auxiliary_loss_clip": 0.01058629, - "auxiliary_loss_mlp": 0.00771253, - "balance_loss_clip": 1.03738487, - "balance_loss_mlp": 1.00034094, - "epoch": 0.7066285886066436, - "flos": 23550254035200.0, - "grad_norm": 2.005268773121489, - "language_loss": 0.7630111, - "learning_rate": 8.367831234246025e-07, - "loss": 0.7813099, - "num_input_tokens_seen": 253585125, - "step": 11753, - "time_per_iteration": 2.7646682262420654 - }, - { - "auxiliary_loss_clip": 0.01082791, - "auxiliary_loss_mlp": 0.00770025, - "balance_loss_clip": 1.0389396, - "balance_loss_mlp": 1.00026608, - "epoch": 0.7066887118593116, - "flos": 21069293650560.0, - "grad_norm": 1.5279654497487496, - "language_loss": 0.70828259, - "learning_rate": 8.364663305220405e-07, - "loss": 0.72681069, - "num_input_tokens_seen": 253604815, - "step": 11754, - "time_per_iteration": 4.359536170959473 - }, - { - "auxiliary_loss_clip": 0.01072435, - "auxiliary_loss_mlp": 0.01043933, - "balance_loss_clip": 1.03650284, - "balance_loss_mlp": 1.02949619, - "epoch": 0.7067488351119796, - "flos": 21176307244800.0, - "grad_norm": 2.2420199206717104, - "language_loss": 0.89593709, - "learning_rate": 8.361495817407919e-07, - "loss": 0.91710079, - "num_input_tokens_seen": 253622855, - "step": 11755, - "time_per_iteration": 2.682011365890503 - }, - { - "auxiliary_loss_clip": 0.01088944, - "auxiliary_loss_mlp": 0.00770375, - "balance_loss_clip": 1.03828812, - "balance_loss_mlp": 1.00012851, - "epoch": 0.7068089583646475, - "flos": 20449224144000.0, - "grad_norm": 1.7866584888812729, - "language_loss": 0.79776525, - "learning_rate": 8.358328770928678e-07, - "loss": 0.81635845, - "num_input_tokens_seen": 253642760, - "step": 11756, - "time_per_iteration": 2.7065372467041016 - }, - { - "auxiliary_loss_clip": 0.00998647, - "auxiliary_loss_mlp": 0.01001672, - "balance_loss_clip": 1.01305819, - "balance_loss_mlp": 1.00066495, - "epoch": 0.7068690816173155, - "flos": 59109179829120.0, - "grad_norm": 1.037725439626294, - "language_loss": 0.60347986, - "learning_rate": 8.355162165902785e-07, - "loss": 0.62348306, - "num_input_tokens_seen": 253695685, - "step": 11757, - "time_per_iteration": 3.031812906265259 - }, - { - "auxiliary_loss_clip": 0.0107753, - "auxiliary_loss_mlp": 0.01034781, - "balance_loss_clip": 1.03763008, - "balance_loss_mlp": 1.02182341, - "epoch": 0.7069292048699835, - "flos": 16251554073600.0, - "grad_norm": 1.6437193466007092, - "language_loss": 0.80430943, - "learning_rate": 8.351996002450307e-07, - "loss": 0.82543254, - "num_input_tokens_seen": 253713305, - "step": 11758, - "time_per_iteration": 2.655449628829956 - }, - { - "auxiliary_loss_clip": 0.01071031, - "auxiliary_loss_mlp": 0.00770922, - "balance_loss_clip": 1.03758502, - "balance_loss_mlp": 1.000157, - "epoch": 0.7069893281226515, - "flos": 41172768455040.0, - "grad_norm": 2.774739921576683, - "language_loss": 0.77785885, - "learning_rate": 8.348830280691304e-07, - "loss": 0.79627836, - "num_input_tokens_seen": 253736100, - "step": 11759, - "time_per_iteration": 2.8444526195526123 - }, - { - "auxiliary_loss_clip": 0.01101914, - "auxiliary_loss_mlp": 0.01031651, - "balance_loss_clip": 1.03790534, - "balance_loss_mlp": 1.01810324, - "epoch": 0.7070494513753194, - "flos": 24207275658240.0, - "grad_norm": 1.9103417618984617, - "language_loss": 0.67560184, - "learning_rate": 8.34566500074583e-07, - "loss": 0.69693744, - "num_input_tokens_seen": 253757350, - "step": 11760, - "time_per_iteration": 2.715236186981201 - }, - { - "auxiliary_loss_clip": 0.01076213, - "auxiliary_loss_mlp": 0.01033273, - "balance_loss_clip": 1.03878856, - "balance_loss_mlp": 1.02084506, - "epoch": 0.7071095746279874, - "flos": 20185675079040.0, - "grad_norm": 1.9224750289593278, - "language_loss": 0.80442196, - "learning_rate": 8.342500162733899e-07, - "loss": 0.82551688, - "num_input_tokens_seen": 253772855, - "step": 11761, - "time_per_iteration": 2.6564581394195557 - }, - { - "auxiliary_loss_clip": 0.01086413, - "auxiliary_loss_mlp": 0.01043874, - "balance_loss_clip": 1.03556442, - "balance_loss_mlp": 1.02777457, - "epoch": 0.7071696978806553, - "flos": 18183045133440.0, - "grad_norm": 2.457600250400544, - "language_loss": 0.75026697, - "learning_rate": 8.33933576677553e-07, - "loss": 0.77156985, - "num_input_tokens_seen": 253790360, - "step": 11762, - "time_per_iteration": 2.6615617275238037 - }, - { - "auxiliary_loss_clip": 0.01087856, - "auxiliary_loss_mlp": 0.01032971, - "balance_loss_clip": 1.0366205, - "balance_loss_mlp": 1.0203824, - "epoch": 0.7072298211333233, - "flos": 24131719399680.0, - "grad_norm": 1.9130475183821334, - "language_loss": 0.76827163, - "learning_rate": 8.336171812990724e-07, - "loss": 0.78947991, - "num_input_tokens_seen": 253810585, - "step": 11763, - "time_per_iteration": 2.7182300090789795 - }, - { - "auxiliary_loss_clip": 0.01083565, - "auxiliary_loss_mlp": 0.00771257, - "balance_loss_clip": 1.03937089, - "balance_loss_mlp": 1.0002048, - "epoch": 0.7072899443859912, - "flos": 27198418867200.0, - "grad_norm": 2.255368812760523, - "language_loss": 0.78756404, - "learning_rate": 8.333008301499453e-07, - "loss": 0.80611229, - "num_input_tokens_seen": 253829080, - "step": 11764, - "time_per_iteration": 2.7578113079071045 - }, - { - "auxiliary_loss_clip": 0.01064836, - "auxiliary_loss_mlp": 0.01037677, - "balance_loss_clip": 1.03533673, - "balance_loss_mlp": 1.02406919, - "epoch": 0.7073500676386593, - "flos": 16435596384000.0, - "grad_norm": 1.727630740916822, - "language_loss": 0.79465842, - "learning_rate": 8.32984523242167e-07, - "loss": 0.81568348, - "num_input_tokens_seen": 253846780, - "step": 11765, - "time_per_iteration": 2.7866904735565186 - }, - { - "auxiliary_loss_clip": 0.01109005, - "auxiliary_loss_mlp": 0.01028153, - "balance_loss_clip": 1.03910017, - "balance_loss_mlp": 1.017102, - "epoch": 0.7074101908913272, - "flos": 27673732563840.0, - "grad_norm": 1.7516856530265033, - "language_loss": 0.68398869, - "learning_rate": 8.326682605877324e-07, - "loss": 0.70536023, - "num_input_tokens_seen": 253867075, - "step": 11766, - "time_per_iteration": 2.701338768005371 - }, - { - "auxiliary_loss_clip": 0.01090324, - "auxiliary_loss_mlp": 0.01038357, - "balance_loss_clip": 1.03629494, - "balance_loss_mlp": 1.02530944, - "epoch": 0.7074703141439952, - "flos": 22238078296320.0, - "grad_norm": 2.1562679089059245, - "language_loss": 0.63844228, - "learning_rate": 8.323520421986352e-07, - "loss": 0.65972912, - "num_input_tokens_seen": 253885790, - "step": 11767, - "time_per_iteration": 2.682774543762207 - }, - { - "auxiliary_loss_clip": 0.01101727, - "auxiliary_loss_mlp": 0.01027229, - "balance_loss_clip": 1.03870296, - "balance_loss_mlp": 1.01452112, - "epoch": 0.7075304373966632, - "flos": 29643217234560.0, - "grad_norm": 1.942959612416706, - "language_loss": 0.5247106, - "learning_rate": 8.320358680868646e-07, - "loss": 0.54600012, - "num_input_tokens_seen": 253907070, - "step": 11768, - "time_per_iteration": 2.753188133239746 - }, - { - "auxiliary_loss_clip": 0.01088433, - "auxiliary_loss_mlp": 0.00770686, - "balance_loss_clip": 1.03836966, - "balance_loss_mlp": 1.00015306, - "epoch": 0.7075905606493311, - "flos": 19755214490880.0, - "grad_norm": 1.6006404938341818, - "language_loss": 0.75532562, - "learning_rate": 8.317197382644119e-07, - "loss": 0.77391684, - "num_input_tokens_seen": 253927290, - "step": 11769, - "time_per_iteration": 2.7288544178009033 - }, - { - "auxiliary_loss_clip": 0.01013903, - "auxiliary_loss_mlp": 0.01015517, - "balance_loss_clip": 1.00881553, - "balance_loss_mlp": 1.01409209, - "epoch": 0.7076506839019991, - "flos": 65716132694400.0, - "grad_norm": 0.8537079866047984, - "language_loss": 0.61981726, - "learning_rate": 8.314036527432637e-07, - "loss": 0.64011145, - "num_input_tokens_seen": 253983440, - "step": 11770, - "time_per_iteration": 3.14176607131958 - }, - { - "auxiliary_loss_clip": 0.01078902, - "auxiliary_loss_mlp": 0.01036173, - "balance_loss_clip": 1.03566778, - "balance_loss_mlp": 1.02311409, - "epoch": 0.707710807154667, - "flos": 23765286804480.0, - "grad_norm": 1.7412518327218227, - "language_loss": 0.76630473, - "learning_rate": 8.310876115354055e-07, - "loss": 0.7874555, - "num_input_tokens_seen": 254003825, - "step": 11771, - "time_per_iteration": 2.8118982315063477 - }, - { - "auxiliary_loss_clip": 0.01097524, - "auxiliary_loss_mlp": 0.01028735, - "balance_loss_clip": 1.03753102, - "balance_loss_mlp": 1.01656938, - "epoch": 0.7077709304073351, - "flos": 21251360712960.0, - "grad_norm": 1.6484169639122062, - "language_loss": 0.70931244, - "learning_rate": 8.307716146528221e-07, - "loss": 0.73057497, - "num_input_tokens_seen": 254023345, - "step": 11772, - "time_per_iteration": 2.6268296241760254 - }, - { - "auxiliary_loss_clip": 0.01063148, - "auxiliary_loss_mlp": 0.01033223, - "balance_loss_clip": 1.03564882, - "balance_loss_mlp": 1.01975262, - "epoch": 0.707831053660003, - "flos": 20740746925440.0, - "grad_norm": 1.8398984713825175, - "language_loss": 0.69417477, - "learning_rate": 8.30455662107496e-07, - "loss": 0.71513855, - "num_input_tokens_seen": 254041815, - "step": 11773, - "time_per_iteration": 2.7778313159942627 - }, - { - "auxiliary_loss_clip": 0.01104178, - "auxiliary_loss_mlp": 0.01034694, - "balance_loss_clip": 1.03965759, - "balance_loss_mlp": 1.02178395, - "epoch": 0.707891176912671, - "flos": 21980993679360.0, - "grad_norm": 1.4556965212861144, - "language_loss": 0.7014932, - "learning_rate": 8.301397539114095e-07, - "loss": 0.72288191, - "num_input_tokens_seen": 254062065, - "step": 11774, - "time_per_iteration": 2.68330979347229 - }, - { - "auxiliary_loss_clip": 0.01081938, - "auxiliary_loss_mlp": 0.01028977, - "balance_loss_clip": 1.04048491, - "balance_loss_mlp": 1.01713347, - "epoch": 0.7079513001653389, - "flos": 21068970428160.0, - "grad_norm": 1.5670559751490778, - "language_loss": 0.74400485, - "learning_rate": 8.298238900765407e-07, - "loss": 0.76511401, - "num_input_tokens_seen": 254080605, - "step": 11775, - "time_per_iteration": 2.672057628631592 - }, - { - "auxiliary_loss_clip": 0.01074662, - "auxiliary_loss_mlp": 0.00770567, - "balance_loss_clip": 1.03893805, - "balance_loss_mlp": 1.00032187, - "epoch": 0.7080114234180069, - "flos": 18040659621120.0, - "grad_norm": 1.9150110614912736, - "language_loss": 0.86714977, - "learning_rate": 8.295080706148665e-07, - "loss": 0.88560206, - "num_input_tokens_seen": 254098710, - "step": 11776, - "time_per_iteration": 2.68167781829834 - }, - { - "auxiliary_loss_clip": 0.01093973, - "auxiliary_loss_mlp": 0.01034049, - "balance_loss_clip": 1.03666544, - "balance_loss_mlp": 1.02201438, - "epoch": 0.7080715466706748, - "flos": 15122271409920.0, - "grad_norm": 1.4933339123942304, - "language_loss": 0.75204122, - "learning_rate": 8.291922955383641e-07, - "loss": 0.77332139, - "num_input_tokens_seen": 254117200, - "step": 11777, - "time_per_iteration": 2.617124319076538 - }, - { - "auxiliary_loss_clip": 0.0109467, - "auxiliary_loss_mlp": 0.01033372, - "balance_loss_clip": 1.04061341, - "balance_loss_mlp": 1.02046156, - "epoch": 0.7081316699233429, - "flos": 14422802889600.0, - "grad_norm": 2.468930422112918, - "language_loss": 0.8228538, - "learning_rate": 8.288765648590066e-07, - "loss": 0.84413421, - "num_input_tokens_seen": 254132115, - "step": 11778, - "time_per_iteration": 2.7087488174438477 - }, - { - "auxiliary_loss_clip": 0.01082719, - "auxiliary_loss_mlp": 0.0103363, - "balance_loss_clip": 1.03583169, - "balance_loss_mlp": 1.02246594, - "epoch": 0.7081917931760108, - "flos": 23222389668480.0, - "grad_norm": 1.6098616985666978, - "language_loss": 0.85021019, - "learning_rate": 8.285608785887673e-07, - "loss": 0.87137371, - "num_input_tokens_seen": 254152285, - "step": 11779, - "time_per_iteration": 2.6744067668914795 - }, - { - "auxiliary_loss_clip": 0.0108855, - "auxiliary_loss_mlp": 0.01032676, - "balance_loss_clip": 1.03944063, - "balance_loss_mlp": 1.01993263, - "epoch": 0.7082519164286788, - "flos": 39308429871360.0, - "grad_norm": 2.2882732237177326, - "language_loss": 0.72005677, - "learning_rate": 8.28245236739618e-07, - "loss": 0.74126905, - "num_input_tokens_seen": 254172805, - "step": 11780, - "time_per_iteration": 2.8406546115875244 - }, - { - "auxiliary_loss_clip": 0.0105972, - "auxiliary_loss_mlp": 0.010316, - "balance_loss_clip": 1.03477693, - "balance_loss_mlp": 1.01896429, - "epoch": 0.7083120396813467, - "flos": 21651154064640.0, - "grad_norm": 1.4192183070754045, - "language_loss": 0.73349321, - "learning_rate": 8.279296393235256e-07, - "loss": 0.75440645, - "num_input_tokens_seen": 254191890, - "step": 11781, - "time_per_iteration": 2.8251590728759766 - }, - { - "auxiliary_loss_clip": 0.01099337, - "auxiliary_loss_mlp": 0.01032456, - "balance_loss_clip": 1.0398531, - "balance_loss_mlp": 1.02066612, - "epoch": 0.7083721629340147, - "flos": 17567033863680.0, - "grad_norm": 1.678957129171523, - "language_loss": 0.77408248, - "learning_rate": 8.276140863524585e-07, - "loss": 0.79540044, - "num_input_tokens_seen": 254210150, - "step": 11782, - "time_per_iteration": 2.6758499145507812 - }, - { - "auxiliary_loss_clip": 0.0108554, - "auxiliary_loss_mlp": 0.01029552, - "balance_loss_clip": 1.03717136, - "balance_loss_mlp": 1.01893663, - "epoch": 0.7084322861866827, - "flos": 29350509304320.0, - "grad_norm": 1.5187238607788938, - "language_loss": 0.69871926, - "learning_rate": 8.272985778383828e-07, - "loss": 0.71987015, - "num_input_tokens_seen": 254233015, - "step": 11783, - "time_per_iteration": 2.8378536701202393 - }, - { - "auxiliary_loss_clip": 0.01073688, - "auxiliary_loss_mlp": 0.01028822, - "balance_loss_clip": 1.03804398, - "balance_loss_mlp": 1.01622784, - "epoch": 0.7084924094393507, - "flos": 20194294343040.0, - "grad_norm": 1.5952065311243613, - "language_loss": 0.78930736, - "learning_rate": 8.269831137932632e-07, - "loss": 0.81033248, - "num_input_tokens_seen": 254251345, - "step": 11784, - "time_per_iteration": 4.362036943435669 - }, - { - "auxiliary_loss_clip": 0.01111276, - "auxiliary_loss_mlp": 0.0103335, - "balance_loss_clip": 1.04004228, - "balance_loss_mlp": 1.02080894, - "epoch": 0.7085525326920187, - "flos": 23477211728640.0, - "grad_norm": 2.08544088937736, - "language_loss": 0.77696943, - "learning_rate": 8.266676942290609e-07, - "loss": 0.79841572, - "num_input_tokens_seen": 254269905, - "step": 11785, - "time_per_iteration": 2.5937209129333496 - }, - { - "auxiliary_loss_clip": 0.01085039, - "auxiliary_loss_mlp": 0.01034205, - "balance_loss_clip": 1.0364778, - "balance_loss_mlp": 1.02091932, - "epoch": 0.7086126559446866, - "flos": 25958818558080.0, - "grad_norm": 1.9412182789537995, - "language_loss": 0.78004217, - "learning_rate": 8.26352319157738e-07, - "loss": 0.80123466, - "num_input_tokens_seen": 254289990, - "step": 11786, - "time_per_iteration": 4.211290121078491 - }, - { - "auxiliary_loss_clip": 0.01113302, - "auxiliary_loss_mlp": 0.01030804, - "balance_loss_clip": 1.03969085, - "balance_loss_mlp": 1.01798928, - "epoch": 0.7086727791973546, - "flos": 26724793109760.0, - "grad_norm": 1.9518945204498503, - "language_loss": 0.78987539, - "learning_rate": 8.260369885912526e-07, - "loss": 0.81131643, - "num_input_tokens_seen": 254309085, - "step": 11787, - "time_per_iteration": 4.215709447860718 - }, - { - "auxiliary_loss_clip": 0.01100936, - "auxiliary_loss_mlp": 0.01032335, - "balance_loss_clip": 1.0393877, - "balance_loss_mlp": 1.02004457, - "epoch": 0.7087329024500225, - "flos": 21683365585920.0, - "grad_norm": 1.963463516719362, - "language_loss": 0.76586342, - "learning_rate": 8.257217025415615e-07, - "loss": 0.7871961, - "num_input_tokens_seen": 254327045, - "step": 11788, - "time_per_iteration": 2.6296236515045166 - }, - { - "auxiliary_loss_clip": 0.0107305, - "auxiliary_loss_mlp": 0.01037785, - "balance_loss_clip": 1.03411317, - "balance_loss_mlp": 1.02229357, - "epoch": 0.7087930257026905, - "flos": 17931060247680.0, - "grad_norm": 1.8171493958934544, - "language_loss": 0.67838019, - "learning_rate": 8.254064610206212e-07, - "loss": 0.69948852, - "num_input_tokens_seen": 254344585, - "step": 11789, - "time_per_iteration": 2.664304733276367 - }, - { - "auxiliary_loss_clip": 0.0105779, - "auxiliary_loss_mlp": 0.01034389, - "balance_loss_clip": 1.03825188, - "balance_loss_mlp": 1.02094257, - "epoch": 0.7088531489553584, - "flos": 18911528864640.0, - "grad_norm": 1.6253389079610685, - "language_loss": 0.77915251, - "learning_rate": 8.250912640403858e-07, - "loss": 0.80007434, - "num_input_tokens_seen": 254362470, - "step": 11790, - "time_per_iteration": 2.745398759841919 - }, - { - "auxiliary_loss_clip": 0.01093327, - "auxiliary_loss_mlp": 0.01033055, - "balance_loss_clip": 1.03802967, - "balance_loss_mlp": 1.01917279, - "epoch": 0.7089132722080265, - "flos": 27380880979200.0, - "grad_norm": 2.6743877386072046, - "language_loss": 0.70789683, - "learning_rate": 8.247761116128085e-07, - "loss": 0.72916067, - "num_input_tokens_seen": 254383190, - "step": 11791, - "time_per_iteration": 2.7536044120788574 - }, - { - "auxiliary_loss_clip": 0.0110278, - "auxiliary_loss_mlp": 0.01035298, - "balance_loss_clip": 1.04042172, - "balance_loss_mlp": 1.02178025, - "epoch": 0.7089733954606944, - "flos": 22162917087360.0, - "grad_norm": 1.574032400084743, - "language_loss": 0.82329011, - "learning_rate": 8.244610037498376e-07, - "loss": 0.84467089, - "num_input_tokens_seen": 254403115, - "step": 11792, - "time_per_iteration": 2.658579111099243 - }, - { - "auxiliary_loss_clip": 0.01071076, - "auxiliary_loss_mlp": 0.01032048, - "balance_loss_clip": 1.03814042, - "balance_loss_mlp": 1.01898229, - "epoch": 0.7090335187133624, - "flos": 24425827960320.0, - "grad_norm": 2.4251661207172406, - "language_loss": 0.64878172, - "learning_rate": 8.241459404634232e-07, - "loss": 0.66981292, - "num_input_tokens_seen": 254421875, - "step": 11793, - "time_per_iteration": 4.261074066162109 - }, - { - "auxiliary_loss_clip": 0.01097375, - "auxiliary_loss_mlp": 0.01035971, - "balance_loss_clip": 1.03896296, - "balance_loss_mlp": 1.02244079, - "epoch": 0.7090936419660303, - "flos": 21835232288640.0, - "grad_norm": 5.81708329493613, - "language_loss": 0.70618987, - "learning_rate": 8.238309217655133e-07, - "loss": 0.72752333, - "num_input_tokens_seen": 254440765, - "step": 11794, - "time_per_iteration": 2.6876423358917236 - }, - { - "auxiliary_loss_clip": 0.01091573, - "auxiliary_loss_mlp": 0.01037583, - "balance_loss_clip": 1.04035616, - "balance_loss_mlp": 1.02522707, - "epoch": 0.7091537652186983, - "flos": 20082360585600.0, - "grad_norm": 1.8634208156139904, - "language_loss": 0.76080108, - "learning_rate": 8.23515947668052e-07, - "loss": 0.78209263, - "num_input_tokens_seen": 254459480, - "step": 11795, - "time_per_iteration": 2.63566255569458 - }, - { - "auxiliary_loss_clip": 0.01075226, - "auxiliary_loss_mlp": 0.01033753, - "balance_loss_clip": 1.03935122, - "balance_loss_mlp": 1.02176011, - "epoch": 0.7092138884713663, - "flos": 13151565676800.0, - "grad_norm": 2.3261568456734816, - "language_loss": 0.75312549, - "learning_rate": 8.232010181829838e-07, - "loss": 0.77421528, - "num_input_tokens_seen": 254473985, - "step": 11796, - "time_per_iteration": 2.716097116470337 - }, - { - "auxiliary_loss_clip": 0.01103014, - "auxiliary_loss_mlp": 0.01042156, - "balance_loss_clip": 1.03999233, - "balance_loss_mlp": 1.02640212, - "epoch": 0.7092740117240343, - "flos": 21645982506240.0, - "grad_norm": 1.559472378141648, - "language_loss": 0.74076355, - "learning_rate": 8.228861333222523e-07, - "loss": 0.76221526, - "num_input_tokens_seen": 254492135, - "step": 11797, - "time_per_iteration": 2.6320409774780273 - }, - { - "auxiliary_loss_clip": 0.0106907, - "auxiliary_loss_mlp": 0.01035432, - "balance_loss_clip": 1.03879786, - "balance_loss_mlp": 1.02290332, - "epoch": 0.7093341349767023, - "flos": 21032521102080.0, - "grad_norm": 1.5130058981356065, - "language_loss": 0.79285604, - "learning_rate": 8.225712930977953e-07, - "loss": 0.81390107, - "num_input_tokens_seen": 254512865, - "step": 11798, - "time_per_iteration": 2.7878127098083496 - }, - { - "auxiliary_loss_clip": 0.01079128, - "auxiliary_loss_mlp": 0.01040993, - "balance_loss_clip": 1.03444886, - "balance_loss_mlp": 1.02752233, - "epoch": 0.7093942582293702, - "flos": 22017658487040.0, - "grad_norm": 1.8215442061382334, - "language_loss": 0.6698848, - "learning_rate": 8.222564975215529e-07, - "loss": 0.69108605, - "num_input_tokens_seen": 254532605, - "step": 11799, - "time_per_iteration": 2.6869001388549805 - }, - { - "auxiliary_loss_clip": 0.01112483, - "auxiliary_loss_mlp": 0.01028449, - "balance_loss_clip": 1.03966284, - "balance_loss_mlp": 1.01535368, - "epoch": 0.7094543814820382, - "flos": 27235586465280.0, - "grad_norm": 1.576526567313424, - "language_loss": 0.81716406, - "learning_rate": 8.219417466054622e-07, - "loss": 0.8385734, - "num_input_tokens_seen": 254553780, - "step": 11800, - "time_per_iteration": 2.658963680267334 - }, - { - "auxiliary_loss_clip": 0.01088302, - "auxiliary_loss_mlp": 0.01033764, - "balance_loss_clip": 1.03797638, - "balance_loss_mlp": 1.02189112, - "epoch": 0.7095145047347061, - "flos": 12089148180480.0, - "grad_norm": 1.9061714801555572, - "language_loss": 0.86517024, - "learning_rate": 8.21627040361459e-07, - "loss": 0.88639092, - "num_input_tokens_seen": 254567510, - "step": 11801, - "time_per_iteration": 2.6748046875 - }, - { - "auxiliary_loss_clip": 0.0111264, - "auxiliary_loss_mlp": 0.0103384, - "balance_loss_clip": 1.03984725, - "balance_loss_mlp": 1.02127492, - "epoch": 0.7095746279873741, - "flos": 19383789905280.0, - "grad_norm": 1.9051932445720021, - "language_loss": 0.7623291, - "learning_rate": 8.213123788014758e-07, - "loss": 0.78379387, - "num_input_tokens_seen": 254585565, - "step": 11802, - "time_per_iteration": 2.618805170059204 - }, - { - "auxiliary_loss_clip": 0.01097308, - "auxiliary_loss_mlp": 0.01046076, - "balance_loss_clip": 1.03798604, - "balance_loss_mlp": 1.03312433, - "epoch": 0.709634751240042, - "flos": 21360600950400.0, - "grad_norm": 1.6390429241877804, - "language_loss": 0.81943619, - "learning_rate": 8.209977619374462e-07, - "loss": 0.84087008, - "num_input_tokens_seen": 254603465, - "step": 11803, - "time_per_iteration": 2.68537974357605 - }, - { - "auxiliary_loss_clip": 0.01112366, - "auxiliary_loss_mlp": 0.01034568, - "balance_loss_clip": 1.03814209, - "balance_loss_mlp": 1.02085912, - "epoch": 0.7096948744927101, - "flos": 13917037438080.0, - "grad_norm": 2.293538036404514, - "language_loss": 0.67322147, - "learning_rate": 8.206831897812995e-07, - "loss": 0.69469082, - "num_input_tokens_seen": 254620500, - "step": 11804, - "time_per_iteration": 2.642585277557373 - }, - { - "auxiliary_loss_clip": 0.01097953, - "auxiliary_loss_mlp": 0.01028551, - "balance_loss_clip": 1.03816247, - "balance_loss_mlp": 1.01730967, - "epoch": 0.709754997745378, - "flos": 30298335436800.0, - "grad_norm": 1.964223724359439, - "language_loss": 0.78081644, - "learning_rate": 8.203686623449637e-07, - "loss": 0.80208147, - "num_input_tokens_seen": 254638565, - "step": 11805, - "time_per_iteration": 2.720667600631714 - }, - { - "auxiliary_loss_clip": 0.01091353, - "auxiliary_loss_mlp": 0.00771338, - "balance_loss_clip": 1.03825855, - "balance_loss_mlp": 1.00015116, - "epoch": 0.709815120998046, - "flos": 18515147304960.0, - "grad_norm": 9.192055527956402, - "language_loss": 0.79064679, - "learning_rate": 8.200541796403667e-07, - "loss": 0.80927366, - "num_input_tokens_seen": 254657505, - "step": 11806, - "time_per_iteration": 2.681230306625366 - }, - { - "auxiliary_loss_clip": 0.01083674, - "auxiliary_loss_mlp": 0.01041523, - "balance_loss_clip": 1.03755128, - "balance_loss_mlp": 1.02857089, - "epoch": 0.7098752442507139, - "flos": 22272588288000.0, - "grad_norm": 2.0898066634684573, - "language_loss": 0.56422603, - "learning_rate": 8.197397416794332e-07, - "loss": 0.58547801, - "num_input_tokens_seen": 254674730, - "step": 11807, - "time_per_iteration": 2.734550714492798 - }, - { - "auxiliary_loss_clip": 0.01114828, - "auxiliary_loss_mlp": 0.01043868, - "balance_loss_clip": 1.03833497, - "balance_loss_mlp": 1.03099334, - "epoch": 0.7099353675033819, - "flos": 19275447507840.0, - "grad_norm": 2.0105184465729464, - "language_loss": 0.68802261, - "learning_rate": 8.194253484740882e-07, - "loss": 0.70960963, - "num_input_tokens_seen": 254691665, - "step": 11808, - "time_per_iteration": 2.6423966884613037 - }, - { - "auxiliary_loss_clip": 0.01098171, - "auxiliary_loss_mlp": 0.01032542, - "balance_loss_clip": 1.03855026, - "balance_loss_mlp": 1.02025712, - "epoch": 0.70999549075605, - "flos": 21908525990400.0, - "grad_norm": 2.1402280143316834, - "language_loss": 0.71625412, - "learning_rate": 8.191110000362513e-07, - "loss": 0.73756123, - "num_input_tokens_seen": 254711610, - "step": 11809, - "time_per_iteration": 2.627044200897217 - }, - { - "auxiliary_loss_clip": 0.01031591, - "auxiliary_loss_mlp": 0.0100231, - "balance_loss_clip": 1.00862455, - "balance_loss_mlp": 1.00124347, - "epoch": 0.7100556140087179, - "flos": 70456053456000.0, - "grad_norm": 0.7494928075068129, - "language_loss": 0.5943656, - "learning_rate": 8.187966963778435e-07, - "loss": 0.61470461, - "num_input_tokens_seen": 254772615, - "step": 11810, - "time_per_iteration": 3.2029061317443848 - }, - { - "auxiliary_loss_clip": 0.01033991, - "auxiliary_loss_mlp": 0.01048321, - "balance_loss_clip": 1.03268588, - "balance_loss_mlp": 1.03488612, - "epoch": 0.7101157372613859, - "flos": 23039568420480.0, - "grad_norm": 2.72885983888825, - "language_loss": 0.74159658, - "learning_rate": 8.18482437510784e-07, - "loss": 0.7624197, - "num_input_tokens_seen": 254791375, - "step": 11811, - "time_per_iteration": 2.8374974727630615 - }, - { - "auxiliary_loss_clip": 0.01073985, - "auxiliary_loss_mlp": 0.01027518, - "balance_loss_clip": 1.03985, - "balance_loss_mlp": 1.015275, - "epoch": 0.7101758605140538, - "flos": 23185329811200.0, - "grad_norm": 2.0242190076468223, - "language_loss": 0.83632278, - "learning_rate": 8.181682234469882e-07, - "loss": 0.85733783, - "num_input_tokens_seen": 254809300, - "step": 11812, - "time_per_iteration": 2.757760763168335 - }, - { - "auxiliary_loss_clip": 0.01114938, - "auxiliary_loss_mlp": 0.01031671, - "balance_loss_clip": 1.04106176, - "balance_loss_mlp": 1.01833153, - "epoch": 0.7102359837667218, - "flos": 23696123166720.0, - "grad_norm": 1.5968911597601785, - "language_loss": 0.6982094, - "learning_rate": 8.178540541983716e-07, - "loss": 0.71967542, - "num_input_tokens_seen": 254829325, - "step": 11813, - "time_per_iteration": 2.593907594680786 - }, - { - "auxiliary_loss_clip": 0.01109186, - "auxiliary_loss_mlp": 0.01028954, - "balance_loss_clip": 1.03841877, - "balance_loss_mlp": 1.01689565, - "epoch": 0.7102961070193897, - "flos": 19391116279680.0, - "grad_norm": 1.9874956852039145, - "language_loss": 0.81565011, - "learning_rate": 8.175399297768495e-07, - "loss": 0.83703148, - "num_input_tokens_seen": 254847690, - "step": 11814, - "time_per_iteration": 2.5443472862243652 - }, - { - "auxiliary_loss_clip": 0.01112342, - "auxiliary_loss_mlp": 0.01033274, - "balance_loss_clip": 1.04030275, - "balance_loss_mlp": 1.02032149, - "epoch": 0.7103562302720577, - "flos": 21507511576320.0, - "grad_norm": 2.7298772348158074, - "language_loss": 0.75506926, - "learning_rate": 8.172258501943301e-07, - "loss": 0.77652538, - "num_input_tokens_seen": 254865960, - "step": 11815, - "time_per_iteration": 2.5428481101989746 - }, - { - "auxiliary_loss_clip": 0.01067291, - "auxiliary_loss_mlp": 0.01031233, - "balance_loss_clip": 1.03749645, - "balance_loss_mlp": 1.01881731, - "epoch": 0.7104163535247257, - "flos": 14535059869440.0, - "grad_norm": 1.7974303130693923, - "language_loss": 0.78488684, - "learning_rate": 8.16911815462725e-07, - "loss": 0.80587208, - "num_input_tokens_seen": 254882815, - "step": 11816, - "time_per_iteration": 2.7543234825134277 - }, - { - "auxiliary_loss_clip": 0.01085859, - "auxiliary_loss_mlp": 0.01038436, - "balance_loss_clip": 1.03845906, - "balance_loss_mlp": 1.02579379, - "epoch": 0.7104764767773937, - "flos": 11400310085760.0, - "grad_norm": 1.72092645420432, - "language_loss": 0.86654431, - "learning_rate": 8.165978255939426e-07, - "loss": 0.88778722, - "num_input_tokens_seen": 254898705, - "step": 11817, - "time_per_iteration": 2.6052346229553223 - }, - { - "auxiliary_loss_clip": 0.01064818, - "auxiliary_loss_mlp": 0.01029492, - "balance_loss_clip": 1.0393579, - "balance_loss_mlp": 1.01749921, - "epoch": 0.7105366000300616, - "flos": 11690432236800.0, - "grad_norm": 2.3052427315849964, - "language_loss": 0.848396, - "learning_rate": 8.162838805998897e-07, - "loss": 0.86933911, - "num_input_tokens_seen": 254913665, - "step": 11818, - "time_per_iteration": 2.664659023284912 - }, - { - "auxiliary_loss_clip": 0.01111214, - "auxiliary_loss_mlp": 0.01029578, - "balance_loss_clip": 1.03756952, - "balance_loss_mlp": 1.01640534, - "epoch": 0.7105967232827296, - "flos": 19354020508800.0, - "grad_norm": 2.1251714303337006, - "language_loss": 0.76013577, - "learning_rate": 8.159699804924709e-07, - "loss": 0.78154367, - "num_input_tokens_seen": 254932140, - "step": 11819, - "time_per_iteration": 2.5721442699432373 - }, - { - "auxiliary_loss_clip": 0.01069448, - "auxiliary_loss_mlp": 0.01034158, - "balance_loss_clip": 1.0366652, - "balance_loss_mlp": 1.01895833, - "epoch": 0.7106568465353975, - "flos": 22930400010240.0, - "grad_norm": 1.554661416155005, - "language_loss": 0.70843577, - "learning_rate": 8.156561252835883e-07, - "loss": 0.7294718, - "num_input_tokens_seen": 254951580, - "step": 11820, - "time_per_iteration": 2.7395031452178955 - }, - { - "auxiliary_loss_clip": 0.01101119, - "auxiliary_loss_mlp": 0.01029233, - "balance_loss_clip": 1.03955543, - "balance_loss_mlp": 1.01675773, - "epoch": 0.7107169697880655, - "flos": 19099665325440.0, - "grad_norm": 1.8332126356579708, - "language_loss": 0.75666863, - "learning_rate": 8.153423149851449e-07, - "loss": 0.7779721, - "num_input_tokens_seen": 254969425, - "step": 11821, - "time_per_iteration": 2.6001696586608887 - }, - { - "auxiliary_loss_clip": 0.00987426, - "auxiliary_loss_mlp": 0.00999944, - "balance_loss_clip": 1.01348448, - "balance_loss_mlp": 0.99880552, - "epoch": 0.7107770930407336, - "flos": 63638054231040.0, - "grad_norm": 0.7717757980179868, - "language_loss": 0.5505957, - "learning_rate": 8.150285496090388e-07, - "loss": 0.57046944, - "num_input_tokens_seen": 255032680, - "step": 11822, - "time_per_iteration": 3.295065402984619 - }, - { - "auxiliary_loss_clip": 0.0109566, - "auxiliary_loss_mlp": 0.01026855, - "balance_loss_clip": 1.03837609, - "balance_loss_mlp": 1.01429629, - "epoch": 0.7108372162934015, - "flos": 22054466949120.0, - "grad_norm": 1.928380251227047, - "language_loss": 0.60496062, - "learning_rate": 8.147148291671688e-07, - "loss": 0.62618577, - "num_input_tokens_seen": 255054400, - "step": 11823, - "time_per_iteration": 2.6415092945098877 - }, - { - "auxiliary_loss_clip": 0.01099793, - "auxiliary_loss_mlp": 0.01032476, - "balance_loss_clip": 1.03883433, - "balance_loss_mlp": 1.02019811, - "epoch": 0.7108973395460695, - "flos": 19135144984320.0, - "grad_norm": 2.0421558606422434, - "language_loss": 0.71593511, - "learning_rate": 8.144011536714322e-07, - "loss": 0.73725778, - "num_input_tokens_seen": 255072785, - "step": 11824, - "time_per_iteration": 4.298635244369507 - }, - { - "auxiliary_loss_clip": 0.01077795, - "auxiliary_loss_mlp": 0.00772624, - "balance_loss_clip": 1.03366399, - "balance_loss_mlp": 1.00021195, - "epoch": 0.7109574627987374, - "flos": 17894431353600.0, - "grad_norm": 2.7239449344013322, - "language_loss": 0.72674167, - "learning_rate": 8.140875231337223e-07, - "loss": 0.74524581, - "num_input_tokens_seen": 255091820, - "step": 11825, - "time_per_iteration": 2.652414083480835 - }, - { - "auxiliary_loss_clip": 0.01081872, - "auxiliary_loss_mlp": 0.01031095, - "balance_loss_clip": 1.03761208, - "balance_loss_mlp": 1.01838112, - "epoch": 0.7110175860514054, - "flos": 28979623422720.0, - "grad_norm": 1.6201501744547915, - "language_loss": 0.79405123, - "learning_rate": 8.137739375659321e-07, - "loss": 0.8151809, - "num_input_tokens_seen": 255111720, - "step": 11826, - "time_per_iteration": 4.22081995010376 - }, - { - "auxiliary_loss_clip": 0.01098598, - "auxiliary_loss_mlp": 0.01034594, - "balance_loss_clip": 1.03932214, - "balance_loss_mlp": 1.02239263, - "epoch": 0.7110777093040733, - "flos": 26173312623360.0, - "grad_norm": 1.462780118765175, - "language_loss": 0.8310101, - "learning_rate": 8.134603969799527e-07, - "loss": 0.85234201, - "num_input_tokens_seen": 255133495, - "step": 11827, - "time_per_iteration": 4.226747512817383 - }, - { - "auxiliary_loss_clip": 0.01079454, - "auxiliary_loss_mlp": 0.01032813, - "balance_loss_clip": 1.03688717, - "balance_loss_mlp": 1.01972437, - "epoch": 0.7111378325567413, - "flos": 26869943969280.0, - "grad_norm": 1.4792451308451542, - "language_loss": 0.6237936, - "learning_rate": 8.131469013876748e-07, - "loss": 0.6449163, - "num_input_tokens_seen": 255156880, - "step": 11828, - "time_per_iteration": 2.7983498573303223 - }, - { - "auxiliary_loss_clip": 0.01111659, - "auxiliary_loss_mlp": 0.01034434, - "balance_loss_clip": 1.03956318, - "balance_loss_mlp": 1.02155936, - "epoch": 0.7111979558094093, - "flos": 27271820309760.0, - "grad_norm": 1.434450077194213, - "language_loss": 0.72024685, - "learning_rate": 8.128334508009846e-07, - "loss": 0.7417078, - "num_input_tokens_seen": 255178920, - "step": 11829, - "time_per_iteration": 2.6990365982055664 - }, - { - "auxiliary_loss_clip": 0.01111652, - "auxiliary_loss_mlp": 0.01034605, - "balance_loss_clip": 1.04012764, - "balance_loss_mlp": 1.02254748, - "epoch": 0.7112580790620773, - "flos": 25046938961280.0, - "grad_norm": 1.7220593364674301, - "language_loss": 0.80250454, - "learning_rate": 8.125200452317697e-07, - "loss": 0.8239671, - "num_input_tokens_seen": 255198095, - "step": 11830, - "time_per_iteration": 2.573199987411499 - }, - { - "auxiliary_loss_clip": 0.01099477, - "auxiliary_loss_mlp": 0.0103532, - "balance_loss_clip": 1.0376153, - "balance_loss_mlp": 1.02277327, - "epoch": 0.7113182023147452, - "flos": 21646628951040.0, - "grad_norm": 1.7248457668834243, - "language_loss": 0.84357107, - "learning_rate": 8.122066846919138e-07, - "loss": 0.86491901, - "num_input_tokens_seen": 255215860, - "step": 11831, - "time_per_iteration": 2.6142139434814453 - }, - { - "auxiliary_loss_clip": 0.01088822, - "auxiliary_loss_mlp": 0.01032858, - "balance_loss_clip": 1.0360502, - "balance_loss_mlp": 1.02048969, - "epoch": 0.7113783255674132, - "flos": 20996287257600.0, - "grad_norm": 3.6930845590637986, - "language_loss": 0.77417958, - "learning_rate": 8.118933691932985e-07, - "loss": 0.79539645, - "num_input_tokens_seen": 255235425, - "step": 11832, - "time_per_iteration": 2.6712517738342285 - }, - { - "auxiliary_loss_clip": 0.01020951, - "auxiliary_loss_mlp": 0.01006539, - "balance_loss_clip": 1.00784588, - "balance_loss_mlp": 1.00549638, - "epoch": 0.7114384488200811, - "flos": 66771080161920.0, - "grad_norm": 0.7440594945981316, - "language_loss": 0.56577992, - "learning_rate": 8.115800987478059e-07, - "loss": 0.5860548, - "num_input_tokens_seen": 255291680, - "step": 11833, - "time_per_iteration": 4.970557689666748 - }, - { - "auxiliary_loss_clip": 0.01063684, - "auxiliary_loss_mlp": 0.01035215, - "balance_loss_clip": 1.03803515, - "balance_loss_mlp": 1.02324665, - "epoch": 0.7114985720727491, - "flos": 25010058672000.0, - "grad_norm": 1.530403195160948, - "language_loss": 0.70702851, - "learning_rate": 8.11266873367315e-07, - "loss": 0.72801757, - "num_input_tokens_seen": 255313880, - "step": 11834, - "time_per_iteration": 2.814005136489868 - }, - { - "auxiliary_loss_clip": 0.0111468, - "auxiliary_loss_mlp": 0.01035977, - "balance_loss_clip": 1.04097462, - "balance_loss_mlp": 1.02278614, - "epoch": 0.7115586953254172, - "flos": 21470128496640.0, - "grad_norm": 2.039356893023201, - "language_loss": 0.79006612, - "learning_rate": 8.10953693063704e-07, - "loss": 0.81157267, - "num_input_tokens_seen": 255332390, - "step": 11835, - "time_per_iteration": 2.6193342208862305 - }, - { - "auxiliary_loss_clip": 0.01098428, - "auxiliary_loss_mlp": 0.01030957, - "balance_loss_clip": 1.0383265, - "balance_loss_mlp": 1.01929832, - "epoch": 0.7116188185780851, - "flos": 28622600190720.0, - "grad_norm": 1.4832382343509314, - "language_loss": 0.75895661, - "learning_rate": 8.10640557848848e-07, - "loss": 0.78025043, - "num_input_tokens_seen": 255354025, - "step": 11836, - "time_per_iteration": 2.796912670135498 - }, - { - "auxiliary_loss_clip": 0.01041174, - "auxiliary_loss_mlp": 0.01035442, - "balance_loss_clip": 1.03577304, - "balance_loss_mlp": 1.02302051, - "epoch": 0.7116789418307531, - "flos": 25293608634240.0, - "grad_norm": 1.738152097420041, - "language_loss": 0.69952178, - "learning_rate": 8.103274677346208e-07, - "loss": 0.72028792, - "num_input_tokens_seen": 255371400, - "step": 11837, - "time_per_iteration": 2.850287914276123 - }, - { - "auxiliary_loss_clip": 0.01104188, - "auxiliary_loss_mlp": 0.01038013, - "balance_loss_clip": 1.04023147, - "balance_loss_mlp": 1.02389264, - "epoch": 0.711739065083421, - "flos": 25557301353600.0, - "grad_norm": 1.8562944097025111, - "language_loss": 0.61769348, - "learning_rate": 8.100144227328958e-07, - "loss": 0.63911551, - "num_input_tokens_seen": 255390710, - "step": 11838, - "time_per_iteration": 2.6722800731658936 - }, - { - "auxiliary_loss_clip": 0.01103036, - "auxiliary_loss_mlp": 0.01032537, - "balance_loss_clip": 1.04213476, - "balance_loss_mlp": 1.02000856, - "epoch": 0.711799188336089, - "flos": 26140993361280.0, - "grad_norm": 2.198172519021995, - "language_loss": 0.67758644, - "learning_rate": 8.097014228555426e-07, - "loss": 0.69894218, - "num_input_tokens_seen": 255408790, - "step": 11839, - "time_per_iteration": 2.700693130493164 - }, - { - "auxiliary_loss_clip": 0.01113567, - "auxiliary_loss_mlp": 0.01032497, - "balance_loss_clip": 1.04118514, - "balance_loss_mlp": 1.02025414, - "epoch": 0.7118593115887569, - "flos": 21140648017920.0, - "grad_norm": 2.000863214598685, - "language_loss": 0.84081334, - "learning_rate": 8.093884681144305e-07, - "loss": 0.86227405, - "num_input_tokens_seen": 255426280, - "step": 11840, - "time_per_iteration": 2.6260175704956055 - }, - { - "auxiliary_loss_clip": 0.01089291, - "auxiliary_loss_mlp": 0.01032161, - "balance_loss_clip": 1.03793836, - "balance_loss_mlp": 1.01938784, - "epoch": 0.711919434841425, - "flos": 14975684006400.0, - "grad_norm": 1.8693362232508501, - "language_loss": 0.76592988, - "learning_rate": 8.090755585214277e-07, - "loss": 0.78714442, - "num_input_tokens_seen": 255442935, - "step": 11841, - "time_per_iteration": 2.7380130290985107 - }, - { - "auxiliary_loss_clip": 0.01097544, - "auxiliary_loss_mlp": 0.01031388, - "balance_loss_clip": 1.0421263, - "balance_loss_mlp": 1.01840663, - "epoch": 0.7119795580940929, - "flos": 16508997826560.0, - "grad_norm": 2.0423814070424546, - "language_loss": 0.75526315, - "learning_rate": 8.087626940883994e-07, - "loss": 0.77655244, - "num_input_tokens_seen": 255460925, - "step": 11842, - "time_per_iteration": 2.7132010459899902 - }, - { - "auxiliary_loss_clip": 0.01025805, - "auxiliary_loss_mlp": 0.01005384, - "balance_loss_clip": 1.01843143, - "balance_loss_mlp": 1.00406706, - "epoch": 0.7120396813467609, - "flos": 66570736055040.0, - "grad_norm": 0.7903612051800185, - "language_loss": 0.61607522, - "learning_rate": 8.084498748272082e-07, - "loss": 0.63638717, - "num_input_tokens_seen": 255521360, - "step": 11843, - "time_per_iteration": 3.199335813522339 - }, - { - "auxiliary_loss_clip": 0.01110982, - "auxiliary_loss_mlp": 0.01027269, - "balance_loss_clip": 1.04004669, - "balance_loss_mlp": 1.01506245, - "epoch": 0.7120998045994288, - "flos": 26432731624320.0, - "grad_norm": 2.817805590014094, - "language_loss": 0.80302823, - "learning_rate": 8.081371007497171e-07, - "loss": 0.82441074, - "num_input_tokens_seen": 255541435, - "step": 11844, - "time_per_iteration": 2.7244338989257812 - }, - { - "auxiliary_loss_clip": 0.010573, - "auxiliary_loss_mlp": 0.01034033, - "balance_loss_clip": 1.03133631, - "balance_loss_mlp": 1.02053213, - "epoch": 0.7121599278520968, - "flos": 16427982700800.0, - "grad_norm": 2.6971267365188565, - "language_loss": 0.79268605, - "learning_rate": 8.078243718677873e-07, - "loss": 0.81359935, - "num_input_tokens_seen": 255558505, - "step": 11845, - "time_per_iteration": 2.7217719554901123 - }, - { - "auxiliary_loss_clip": 0.01094755, - "auxiliary_loss_mlp": 0.0103426, - "balance_loss_clip": 1.03866315, - "balance_loss_mlp": 1.02122474, - "epoch": 0.7122200511047647, - "flos": 28949889939840.0, - "grad_norm": 2.005574335935101, - "language_loss": 0.77602625, - "learning_rate": 8.075116881932762e-07, - "loss": 0.79731637, - "num_input_tokens_seen": 255577815, - "step": 11846, - "time_per_iteration": 2.64569354057312 - }, - { - "auxiliary_loss_clip": 0.01101916, - "auxiliary_loss_mlp": 0.01033988, - "balance_loss_clip": 1.03883851, - "balance_loss_mlp": 1.0209887, - "epoch": 0.7122801743574327, - "flos": 16471866142080.0, - "grad_norm": 1.8418760265221825, - "language_loss": 0.58981413, - "learning_rate": 8.071990497380421e-07, - "loss": 0.61117315, - "num_input_tokens_seen": 255595885, - "step": 11847, - "time_per_iteration": 2.626909017562866 - }, - { - "auxiliary_loss_clip": 0.01095645, - "auxiliary_loss_mlp": 0.00769201, - "balance_loss_clip": 1.03844142, - "balance_loss_mlp": 1.00012553, - "epoch": 0.7123402976101008, - "flos": 20631039811200.0, - "grad_norm": 2.0944282784493353, - "language_loss": 0.71676862, - "learning_rate": 8.068864565139395e-07, - "loss": 0.73541707, - "num_input_tokens_seen": 255616750, - "step": 11848, - "time_per_iteration": 2.7139625549316406 - }, - { - "auxiliary_loss_clip": 0.01023376, - "auxiliary_loss_mlp": 0.01000864, - "balance_loss_clip": 1.00891995, - "balance_loss_mlp": 0.99969596, - "epoch": 0.7124004208627687, - "flos": 62325734837760.0, - "grad_norm": 0.8463803916761699, - "language_loss": 0.62977934, - "learning_rate": 8.065739085328211e-07, - "loss": 0.65002173, - "num_input_tokens_seen": 255677900, - "step": 11849, - "time_per_iteration": 3.1411380767822266 - }, - { - "auxiliary_loss_clip": 0.01083662, - "auxiliary_loss_mlp": 0.01037172, - "balance_loss_clip": 1.03620315, - "balance_loss_mlp": 1.0243752, - "epoch": 0.7124605441154367, - "flos": 39675975788160.0, - "grad_norm": 2.803971224135637, - "language_loss": 0.63841069, - "learning_rate": 8.0626140580654e-07, - "loss": 0.65961903, - "num_input_tokens_seen": 255699140, - "step": 11850, - "time_per_iteration": 2.923384428024292 - }, - { - "auxiliary_loss_clip": 0.01102405, - "auxiliary_loss_mlp": 0.01032425, - "balance_loss_clip": 1.03889465, - "balance_loss_mlp": 1.01953292, - "epoch": 0.7125206673681046, - "flos": 28181868312960.0, - "grad_norm": 1.6311022275087306, - "language_loss": 0.69985723, - "learning_rate": 8.05948948346946e-07, - "loss": 0.72120547, - "num_input_tokens_seen": 255719640, - "step": 11851, - "time_per_iteration": 2.7820382118225098 - }, - { - "auxiliary_loss_clip": 0.0110311, - "auxiliary_loss_mlp": 0.01034212, - "balance_loss_clip": 1.04154539, - "balance_loss_mlp": 1.02258956, - "epoch": 0.7125807906207726, - "flos": 26176939896960.0, - "grad_norm": 1.8696019158411576, - "language_loss": 0.83187509, - "learning_rate": 8.056365361658882e-07, - "loss": 0.8532483, - "num_input_tokens_seen": 255740450, - "step": 11852, - "time_per_iteration": 2.6444952487945557 - }, - { - "auxiliary_loss_clip": 0.01100225, - "auxiliary_loss_mlp": 0.00771762, - "balance_loss_clip": 1.03667736, - "balance_loss_mlp": 1.00029016, - "epoch": 0.7126409138734405, - "flos": 17157328358400.0, - "grad_norm": 2.349353252161211, - "language_loss": 0.73249555, - "learning_rate": 8.053241692752126e-07, - "loss": 0.75121546, - "num_input_tokens_seen": 255758070, - "step": 11853, - "time_per_iteration": 2.637211799621582 - }, - { - "auxiliary_loss_clip": 0.0107018, - "auxiliary_loss_mlp": 0.0103944, - "balance_loss_clip": 1.03552818, - "balance_loss_mlp": 1.02707744, - "epoch": 0.7127010371261085, - "flos": 18769933451520.0, - "grad_norm": 1.913315807088991, - "language_loss": 0.92358422, - "learning_rate": 8.050118476867635e-07, - "loss": 0.94468045, - "num_input_tokens_seen": 255775685, - "step": 11854, - "time_per_iteration": 2.7072691917419434 - }, - { - "auxiliary_loss_clip": 0.01098797, - "auxiliary_loss_mlp": 0.01033688, - "balance_loss_clip": 1.03843451, - "balance_loss_mlp": 1.02162969, - "epoch": 0.7127611603787765, - "flos": 20376433232640.0, - "grad_norm": 1.737694299359858, - "language_loss": 0.7940923, - "learning_rate": 8.046995714123856e-07, - "loss": 0.81541711, - "num_input_tokens_seen": 255794750, - "step": 11855, - "time_per_iteration": 2.6459240913391113 - }, - { - "auxiliary_loss_clip": 0.01062363, - "auxiliary_loss_mlp": 0.0103668, - "balance_loss_clip": 1.03427172, - "balance_loss_mlp": 1.02277398, - "epoch": 0.7128212836314445, - "flos": 20449008662400.0, - "grad_norm": 1.6847297518773263, - "language_loss": 0.72626299, - "learning_rate": 8.043873404639192e-07, - "loss": 0.74725342, - "num_input_tokens_seen": 255813325, - "step": 11856, - "time_per_iteration": 2.798802614212036 - }, - { - "auxiliary_loss_clip": 0.01105236, - "auxiliary_loss_mlp": 0.01030789, - "balance_loss_clip": 1.0418961, - "balance_loss_mlp": 1.01811683, - "epoch": 0.7128814068841124, - "flos": 23440834229760.0, - "grad_norm": 1.7617515399603183, - "language_loss": 0.70205921, - "learning_rate": 8.040751548532046e-07, - "loss": 0.72341949, - "num_input_tokens_seen": 255832470, - "step": 11857, - "time_per_iteration": 2.7193527221679688 - }, - { - "auxiliary_loss_clip": 0.01097533, - "auxiliary_loss_mlp": 0.01029351, - "balance_loss_clip": 1.03706014, - "balance_loss_mlp": 1.01644111, - "epoch": 0.7129415301367804, - "flos": 18222942165120.0, - "grad_norm": 2.6735250437319684, - "language_loss": 0.85148036, - "learning_rate": 8.03763014592081e-07, - "loss": 0.87274927, - "num_input_tokens_seen": 255849740, - "step": 11858, - "time_per_iteration": 2.640803813934326 - }, - { - "auxiliary_loss_clip": 0.01116792, - "auxiliary_loss_mlp": 0.01033094, - "balance_loss_clip": 1.04105759, - "balance_loss_mlp": 1.020136, - "epoch": 0.7130016533894483, - "flos": 15523896355200.0, - "grad_norm": 1.6211685141896377, - "language_loss": 0.80374736, - "learning_rate": 8.034509196923829e-07, - "loss": 0.82524627, - "num_input_tokens_seen": 255866975, - "step": 11859, - "time_per_iteration": 2.600557565689087 - }, - { - "auxiliary_loss_clip": 0.01088815, - "auxiliary_loss_mlp": 0.01031931, - "balance_loss_clip": 1.03991199, - "balance_loss_mlp": 1.01981378, - "epoch": 0.7130617766421163, - "flos": 57115668960000.0, - "grad_norm": 1.670734379003671, - "language_loss": 0.68986422, - "learning_rate": 8.031388701659456e-07, - "loss": 0.71107167, - "num_input_tokens_seen": 255892915, - "step": 11860, - "time_per_iteration": 3.0131988525390625 - }, - { - "auxiliary_loss_clip": 0.01101154, - "auxiliary_loss_mlp": 0.01031024, - "balance_loss_clip": 1.03928113, - "balance_loss_mlp": 1.01791143, - "epoch": 0.7131218998947844, - "flos": 19788252024960.0, - "grad_norm": 1.6914333481475103, - "language_loss": 0.64537835, - "learning_rate": 8.028268660246023e-07, - "loss": 0.66670012, - "num_input_tokens_seen": 255911480, - "step": 11861, - "time_per_iteration": 2.609196424484253 - }, - { - "auxiliary_loss_clip": 0.01095274, - "auxiliary_loss_mlp": 0.01033317, - "balance_loss_clip": 1.04040623, - "balance_loss_mlp": 1.01967335, - "epoch": 0.7131820231474523, - "flos": 26651894457600.0, - "grad_norm": 1.5298656478489163, - "language_loss": 0.66931856, - "learning_rate": 8.025149072801849e-07, - "loss": 0.69060439, - "num_input_tokens_seen": 255931140, - "step": 11862, - "time_per_iteration": 2.7272536754608154 - }, - { - "auxiliary_loss_clip": 0.01084067, - "auxiliary_loss_mlp": 0.01040707, - "balance_loss_clip": 1.03703427, - "balance_loss_mlp": 1.02913761, - "epoch": 0.7132421464001203, - "flos": 29205609840000.0, - "grad_norm": 1.958177792409317, - "language_loss": 0.66627884, - "learning_rate": 8.022029939445214e-07, - "loss": 0.68752658, - "num_input_tokens_seen": 255951665, - "step": 11863, - "time_per_iteration": 4.389364957809448 - }, - { - "auxiliary_loss_clip": 0.01071831, - "auxiliary_loss_mlp": 0.01047442, - "balance_loss_clip": 1.03993106, - "balance_loss_mlp": 1.03224277, - "epoch": 0.7133022696527882, - "flos": 23073611535360.0, - "grad_norm": 1.9615071733998684, - "language_loss": 0.65745306, - "learning_rate": 8.018911260294414e-07, - "loss": 0.67864573, - "num_input_tokens_seen": 255970055, - "step": 11864, - "time_per_iteration": 2.7246596813201904 - }, - { - "auxiliary_loss_clip": 0.01101997, - "auxiliary_loss_mlp": 0.01032961, - "balance_loss_clip": 1.03820133, - "balance_loss_mlp": 1.01960993, - "epoch": 0.7133623929054562, - "flos": 17457111267840.0, - "grad_norm": 1.8809252452747804, - "language_loss": 0.86299706, - "learning_rate": 8.015793035467697e-07, - "loss": 0.8843466, - "num_input_tokens_seen": 255987720, - "step": 11865, - "time_per_iteration": 4.186030149459839 - }, - { - "auxiliary_loss_clip": 0.01071299, - "auxiliary_loss_mlp": 0.01037479, - "balance_loss_clip": 1.0331533, - "balance_loss_mlp": 1.02338219, - "epoch": 0.7134225161581241, - "flos": 19536554448000.0, - "grad_norm": 4.424017900151165, - "language_loss": 0.75215453, - "learning_rate": 8.012675265083304e-07, - "loss": 0.77324229, - "num_input_tokens_seen": 256005490, - "step": 11866, - "time_per_iteration": 4.38300085067749 - }, - { - "auxiliary_loss_clip": 0.01075897, - "auxiliary_loss_mlp": 0.01035618, - "balance_loss_clip": 1.03858542, - "balance_loss_mlp": 1.02196276, - "epoch": 0.7134826394107922, - "flos": 26250089944320.0, - "grad_norm": 1.787273089908781, - "language_loss": 0.70222098, - "learning_rate": 8.009557949259464e-07, - "loss": 0.72333616, - "num_input_tokens_seen": 256026030, - "step": 11867, - "time_per_iteration": 2.7252299785614014 - }, - { - "auxiliary_loss_clip": 0.0109972, - "auxiliary_loss_mlp": 0.01030726, - "balance_loss_clip": 1.03978539, - "balance_loss_mlp": 1.01921654, - "epoch": 0.7135427626634601, - "flos": 15815311395840.0, - "grad_norm": 2.323638504940392, - "language_loss": 0.72056556, - "learning_rate": 8.006441088114397e-07, - "loss": 0.74186999, - "num_input_tokens_seen": 256043680, - "step": 11868, - "time_per_iteration": 2.6166346073150635 - }, - { - "auxiliary_loss_clip": 0.01063174, - "auxiliary_loss_mlp": 0.01035209, - "balance_loss_clip": 1.03661656, - "balance_loss_mlp": 1.02014136, - "epoch": 0.7136028859161281, - "flos": 18223409041920.0, - "grad_norm": 2.386444069797043, - "language_loss": 0.66029108, - "learning_rate": 8.003324681766286e-07, - "loss": 0.68127489, - "num_input_tokens_seen": 256059705, - "step": 11869, - "time_per_iteration": 2.6557157039642334 - }, - { - "auxiliary_loss_clip": 0.01086038, - "auxiliary_loss_mlp": 0.01027932, - "balance_loss_clip": 1.03453624, - "balance_loss_mlp": 1.01540327, - "epoch": 0.713663009168796, - "flos": 24314827956480.0, - "grad_norm": 1.5122287108134371, - "language_loss": 0.77901238, - "learning_rate": 8.000208730333298e-07, - "loss": 0.80015206, - "num_input_tokens_seen": 256079785, - "step": 11870, - "time_per_iteration": 2.767284870147705 - }, - { - "auxiliary_loss_clip": 0.01062535, - "auxiliary_loss_mlp": 0.01035444, - "balance_loss_clip": 1.03716147, - "balance_loss_mlp": 1.02176499, - "epoch": 0.713723132421464, - "flos": 26538488242560.0, - "grad_norm": 1.7572988726243002, - "language_loss": 0.81102479, - "learning_rate": 7.997093233933597e-07, - "loss": 0.83200461, - "num_input_tokens_seen": 256099000, - "step": 11871, - "time_per_iteration": 2.7799062728881836 - }, - { - "auxiliary_loss_clip": 0.01081304, - "auxiliary_loss_mlp": 0.01037741, - "balance_loss_clip": 1.03814363, - "balance_loss_mlp": 1.02452111, - "epoch": 0.7137832556741319, - "flos": 19865675790720.0, - "grad_norm": 1.5739267518019031, - "language_loss": 0.78791887, - "learning_rate": 7.993978192685331e-07, - "loss": 0.80910927, - "num_input_tokens_seen": 256117985, - "step": 11872, - "time_per_iteration": 4.27405309677124 - }, - { - "auxiliary_loss_clip": 0.01104458, - "auxiliary_loss_mlp": 0.01029654, - "balance_loss_clip": 1.04009414, - "balance_loss_mlp": 1.0162369, - "epoch": 0.7138433789267999, - "flos": 21688932193920.0, - "grad_norm": 2.3871550053143893, - "language_loss": 0.84496498, - "learning_rate": 7.990863606706606e-07, - "loss": 0.86630619, - "num_input_tokens_seen": 256134350, - "step": 11873, - "time_per_iteration": 2.6260197162628174 - }, - { - "auxiliary_loss_clip": 0.01073276, - "auxiliary_loss_mlp": 0.01032331, - "balance_loss_clip": 1.03462076, - "balance_loss_mlp": 1.02040982, - "epoch": 0.713903502179468, - "flos": 17602729004160.0, - "grad_norm": 2.5229231415013116, - "language_loss": 0.86355793, - "learning_rate": 7.987749476115539e-07, - "loss": 0.88461399, - "num_input_tokens_seen": 256150610, - "step": 11874, - "time_per_iteration": 2.680554151535034 - }, - { - "auxiliary_loss_clip": 0.01103576, - "auxiliary_loss_mlp": 0.01031521, - "balance_loss_clip": 1.0389179, - "balance_loss_mlp": 1.01873553, - "epoch": 0.7139636254321359, - "flos": 18040336398720.0, - "grad_norm": 1.760053080674637, - "language_loss": 0.8337326, - "learning_rate": 7.984635801030228e-07, - "loss": 0.85508358, - "num_input_tokens_seen": 256168620, - "step": 11875, - "time_per_iteration": 2.597926616668701 - }, - { - "auxiliary_loss_clip": 0.01091056, - "auxiliary_loss_mlp": 0.01038348, - "balance_loss_clip": 1.03766298, - "balance_loss_mlp": 1.02233863, - "epoch": 0.7140237486848039, - "flos": 23331127115520.0, - "grad_norm": 1.7238625463047035, - "language_loss": 0.69539726, - "learning_rate": 7.981522581568721e-07, - "loss": 0.71669132, - "num_input_tokens_seen": 256186700, - "step": 11876, - "time_per_iteration": 2.7075090408325195 - }, - { - "auxiliary_loss_clip": 0.01115515, - "auxiliary_loss_mlp": 0.01036257, - "balance_loss_clip": 1.04096556, - "balance_loss_mlp": 1.02292919, - "epoch": 0.7140838719374718, - "flos": 16837077674880.0, - "grad_norm": 1.7495986259479304, - "language_loss": 0.78027952, - "learning_rate": 7.978409817849079e-07, - "loss": 0.80179715, - "num_input_tokens_seen": 256205390, - "step": 11877, - "time_per_iteration": 2.579984188079834 - }, - { - "auxiliary_loss_clip": 0.01100542, - "auxiliary_loss_mlp": 0.01039441, - "balance_loss_clip": 1.03897512, - "balance_loss_mlp": 1.02755046, - "epoch": 0.7141439951901398, - "flos": 21142012734720.0, - "grad_norm": 2.00893168794746, - "language_loss": 0.69702816, - "learning_rate": 7.97529750998934e-07, - "loss": 0.71842802, - "num_input_tokens_seen": 256224575, - "step": 11878, - "time_per_iteration": 2.7117369174957275 - }, - { - "auxiliary_loss_clip": 0.01075067, - "auxiliary_loss_mlp": 0.01035543, - "balance_loss_clip": 1.03836131, - "balance_loss_mlp": 1.024194, - "epoch": 0.7142041184428077, - "flos": 24717709877760.0, - "grad_norm": 1.9345471164629369, - "language_loss": 0.67564619, - "learning_rate": 7.972185658107535e-07, - "loss": 0.69675231, - "num_input_tokens_seen": 256242130, - "step": 11879, - "time_per_iteration": 2.781487226486206 - }, - { - "auxiliary_loss_clip": 0.01052886, - "auxiliary_loss_mlp": 0.01039936, - "balance_loss_clip": 1.03587782, - "balance_loss_mlp": 1.02522612, - "epoch": 0.7142642416954758, - "flos": 21908202768000.0, - "grad_norm": 2.4025708755379136, - "language_loss": 0.68949473, - "learning_rate": 7.969074262321646e-07, - "loss": 0.71042299, - "num_input_tokens_seen": 256261920, - "step": 11880, - "time_per_iteration": 2.7956559658050537 - }, - { - "auxiliary_loss_clip": 0.01085326, - "auxiliary_loss_mlp": 0.01038627, - "balance_loss_clip": 1.03614664, - "balance_loss_mlp": 1.02517402, - "epoch": 0.7143243649481437, - "flos": 20805636844800.0, - "grad_norm": 2.7211845383040263, - "language_loss": 0.80758023, - "learning_rate": 7.965963322749674e-07, - "loss": 0.82881975, - "num_input_tokens_seen": 256277970, - "step": 11881, - "time_per_iteration": 2.7760164737701416 - }, - { - "auxiliary_loss_clip": 0.01069489, - "auxiliary_loss_mlp": 0.01031682, - "balance_loss_clip": 1.03435218, - "balance_loss_mlp": 1.01974893, - "epoch": 0.7143844882008117, - "flos": 27235011847680.0, - "grad_norm": 1.9142544481843773, - "language_loss": 0.63496864, - "learning_rate": 7.962852839509579e-07, - "loss": 0.65598035, - "num_input_tokens_seen": 256298205, - "step": 11882, - "time_per_iteration": 2.8055615425109863 - }, - { - "auxiliary_loss_clip": 0.01115484, - "auxiliary_loss_mlp": 0.01033467, - "balance_loss_clip": 1.0405947, - "balance_loss_mlp": 1.02086067, - "epoch": 0.7144446114534796, - "flos": 17929623703680.0, - "grad_norm": 1.6563668139876793, - "language_loss": 0.68799591, - "learning_rate": 7.959742812719304e-07, - "loss": 0.70948541, - "num_input_tokens_seen": 256316685, - "step": 11883, - "time_per_iteration": 2.6891119480133057 - }, - { - "auxiliary_loss_clip": 0.0110208, - "auxiliary_loss_mlp": 0.01037358, - "balance_loss_clip": 1.04018784, - "balance_loss_mlp": 1.02402401, - "epoch": 0.7145047347061476, - "flos": 20740962407040.0, - "grad_norm": 1.7218148321096673, - "language_loss": 0.77569342, - "learning_rate": 7.956633242496788e-07, - "loss": 0.79708779, - "num_input_tokens_seen": 256334205, - "step": 11884, - "time_per_iteration": 2.6530849933624268 - }, - { - "auxiliary_loss_clip": 0.01107156, - "auxiliary_loss_mlp": 0.01036925, - "balance_loss_clip": 1.0385685, - "balance_loss_mlp": 1.02221453, - "epoch": 0.7145648579588155, - "flos": 21178605715200.0, - "grad_norm": 4.109766479944614, - "language_loss": 0.73748314, - "learning_rate": 7.953524128959954e-07, - "loss": 0.75892401, - "num_input_tokens_seen": 256353340, - "step": 11885, - "time_per_iteration": 2.8627066612243652 - }, - { - "auxiliary_loss_clip": 0.01014823, - "auxiliary_loss_mlp": 0.00999083, - "balance_loss_clip": 1.01118517, - "balance_loss_mlp": 0.9980278, - "epoch": 0.7146249812114835, - "flos": 64784539509120.0, - "grad_norm": 0.8971094917641942, - "language_loss": 0.66321898, - "learning_rate": 7.95041547222669e-07, - "loss": 0.68335795, - "num_input_tokens_seen": 256411550, - "step": 11886, - "time_per_iteration": 3.2624523639678955 - }, - { - "auxiliary_loss_clip": 0.01068235, - "auxiliary_loss_mlp": 0.01029631, - "balance_loss_clip": 1.03730834, - "balance_loss_mlp": 1.01627326, - "epoch": 0.7146851044641516, - "flos": 18113881495680.0, - "grad_norm": 1.637061044438449, - "language_loss": 0.74940675, - "learning_rate": 7.947307272414874e-07, - "loss": 0.77038538, - "num_input_tokens_seen": 256430360, - "step": 11887, - "time_per_iteration": 2.951922655105591 - }, - { - "auxiliary_loss_clip": 0.0110054, - "auxiliary_loss_mlp": 0.01027491, - "balance_loss_clip": 1.03856289, - "balance_loss_mlp": 1.01542068, - "epoch": 0.7147452277168195, - "flos": 19243846517760.0, - "grad_norm": 1.834582654468692, - "language_loss": 0.71475005, - "learning_rate": 7.944199529642372e-07, - "loss": 0.73603028, - "num_input_tokens_seen": 256449750, - "step": 11888, - "time_per_iteration": 2.7142348289489746 - }, - { - "auxiliary_loss_clip": 0.01097744, - "auxiliary_loss_mlp": 0.0103845, - "balance_loss_clip": 1.03603697, - "balance_loss_mlp": 1.02444923, - "epoch": 0.7148053509694875, - "flos": 23764712186880.0, - "grad_norm": 1.9131125822464334, - "language_loss": 0.84173727, - "learning_rate": 7.941092244027041e-07, - "loss": 0.86309922, - "num_input_tokens_seen": 256467330, - "step": 11889, - "time_per_iteration": 2.7939958572387695 - }, - { - "auxiliary_loss_clip": 0.01066177, - "auxiliary_loss_mlp": 0.01028337, - "balance_loss_clip": 1.04017806, - "balance_loss_mlp": 1.01598644, - "epoch": 0.7148654742221554, - "flos": 22485322586880.0, - "grad_norm": 1.7213621841277236, - "language_loss": 0.76025808, - "learning_rate": 7.937985415686695e-07, - "loss": 0.78120321, - "num_input_tokens_seen": 256485705, - "step": 11890, - "time_per_iteration": 2.909778594970703 - }, - { - "auxiliary_loss_clip": 0.0106853, - "auxiliary_loss_mlp": 0.01036534, - "balance_loss_clip": 1.03322911, - "balance_loss_mlp": 1.0240227, - "epoch": 0.7149255974748234, - "flos": 24679213476480.0, - "grad_norm": 1.510956653160521, - "language_loss": 0.74061215, - "learning_rate": 7.934879044739147e-07, - "loss": 0.76166284, - "num_input_tokens_seen": 256504755, - "step": 11891, - "time_per_iteration": 2.870742082595825 - }, - { - "auxiliary_loss_clip": 0.01069165, - "auxiliary_loss_mlp": 0.01036831, - "balance_loss_clip": 1.03776526, - "balance_loss_mlp": 1.0234617, - "epoch": 0.7149857207274913, - "flos": 18405583845120.0, - "grad_norm": 2.1855656268859565, - "language_loss": 0.67586207, - "learning_rate": 7.931773131302211e-07, - "loss": 0.69692206, - "num_input_tokens_seen": 256523670, - "step": 11892, - "time_per_iteration": 2.879074811935425 - }, - { - "auxiliary_loss_clip": 0.01078901, - "auxiliary_loss_mlp": 0.01034356, - "balance_loss_clip": 1.03972173, - "balance_loss_mlp": 1.02015805, - "epoch": 0.7150458439801594, - "flos": 24969515195520.0, - "grad_norm": 1.7990304927260297, - "language_loss": 0.737535, - "learning_rate": 7.928667675493632e-07, - "loss": 0.75866759, - "num_input_tokens_seen": 256542225, - "step": 11893, - "time_per_iteration": 2.797793388366699 - }, - { - "auxiliary_loss_clip": 0.01118028, - "auxiliary_loss_mlp": 0.01031131, - "balance_loss_clip": 1.04243374, - "balance_loss_mlp": 1.01739264, - "epoch": 0.7151059672328273, - "flos": 16690777580160.0, - "grad_norm": 2.922265419299714, - "language_loss": 0.67378318, - "learning_rate": 7.925562677431185e-07, - "loss": 0.69527477, - "num_input_tokens_seen": 256560730, - "step": 11894, - "time_per_iteration": 2.6411194801330566 - }, - { - "auxiliary_loss_clip": 0.01079135, - "auxiliary_loss_mlp": 0.01032012, - "balance_loss_clip": 1.04023933, - "balance_loss_mlp": 1.01957238, - "epoch": 0.7151660904854953, - "flos": 27271820309760.0, - "grad_norm": 1.6674046722753406, - "language_loss": 0.77498591, - "learning_rate": 7.922458137232613e-07, - "loss": 0.7960974, - "num_input_tokens_seen": 256580505, - "step": 11895, - "time_per_iteration": 2.9311444759368896 - }, - { - "auxiliary_loss_clip": 0.01102223, - "auxiliary_loss_mlp": 0.01031828, - "balance_loss_clip": 1.03921759, - "balance_loss_mlp": 1.0176903, - "epoch": 0.7152262137381632, - "flos": 18332254229760.0, - "grad_norm": 1.8566798780150704, - "language_loss": 0.69233418, - "learning_rate": 7.919354055015643e-07, - "loss": 0.71367466, - "num_input_tokens_seen": 256597330, - "step": 11896, - "time_per_iteration": 2.708909034729004 - }, - { - "auxiliary_loss_clip": 0.010908, - "auxiliary_loss_mlp": 0.01041603, - "balance_loss_clip": 1.03788733, - "balance_loss_mlp": 1.02761424, - "epoch": 0.7152863369908312, - "flos": 21799285752960.0, - "grad_norm": 2.0196702259188952, - "language_loss": 0.86874604, - "learning_rate": 7.91625043089798e-07, - "loss": 0.89007008, - "num_input_tokens_seen": 256616030, - "step": 11897, - "time_per_iteration": 2.8452200889587402 - }, - { - "auxiliary_loss_clip": 0.01091656, - "auxiliary_loss_mlp": 0.01035463, - "balance_loss_clip": 1.03988373, - "balance_loss_mlp": 1.0220046, - "epoch": 0.7153464602434991, - "flos": 22158427887360.0, - "grad_norm": 3.4189922155736965, - "language_loss": 0.7799052, - "learning_rate": 7.913147264997304e-07, - "loss": 0.80117643, - "num_input_tokens_seen": 256635570, - "step": 11898, - "time_per_iteration": 2.73362398147583 - }, - { - "auxiliary_loss_clip": 0.01089871, - "auxiliary_loss_mlp": 0.01033056, - "balance_loss_clip": 1.03692102, - "balance_loss_mlp": 1.01879835, - "epoch": 0.7154065834961671, - "flos": 24716057852160.0, - "grad_norm": 2.2668196785220895, - "language_loss": 0.73072803, - "learning_rate": 7.910044557431302e-07, - "loss": 0.7519573, - "num_input_tokens_seen": 256655290, - "step": 11899, - "time_per_iteration": 2.7390663623809814 - }, - { - "auxiliary_loss_clip": 0.01101493, - "auxiliary_loss_mlp": 0.01034602, - "balance_loss_clip": 1.03773189, - "balance_loss_mlp": 1.02130437, - "epoch": 0.7154667067488351, - "flos": 22601494149120.0, - "grad_norm": 5.969579255187867, - "language_loss": 0.75829309, - "learning_rate": 7.906942308317614e-07, - "loss": 0.77965403, - "num_input_tokens_seen": 256671605, - "step": 11900, - "time_per_iteration": 2.6649601459503174 - }, - { - "auxiliary_loss_clip": 0.01103632, - "auxiliary_loss_mlp": 0.01030943, - "balance_loss_clip": 1.04124916, - "balance_loss_mlp": 1.01839614, - "epoch": 0.7155268300015031, - "flos": 18771154513920.0, - "grad_norm": 1.8695849033514778, - "language_loss": 0.80723226, - "learning_rate": 7.903840517773886e-07, - "loss": 0.828578, - "num_input_tokens_seen": 256689680, - "step": 11901, - "time_per_iteration": 2.7060022354125977 - }, - { - "auxiliary_loss_clip": 0.01080211, - "auxiliary_loss_mlp": 0.01038068, - "balance_loss_clip": 1.03678465, - "balance_loss_mlp": 1.02424598, - "epoch": 0.7155869532541711, - "flos": 18296343607680.0, - "grad_norm": 1.8343268513832525, - "language_loss": 0.81889194, - "learning_rate": 7.900739185917744e-07, - "loss": 0.84007472, - "num_input_tokens_seen": 256707760, - "step": 11902, - "time_per_iteration": 2.7816693782806396 - }, - { - "auxiliary_loss_clip": 0.01069017, - "auxiliary_loss_mlp": 0.01030025, - "balance_loss_clip": 1.03530717, - "balance_loss_mlp": 1.01750159, - "epoch": 0.715647076506839, - "flos": 11980805783040.0, - "grad_norm": 1.7267279020747466, - "language_loss": 0.68092871, - "learning_rate": 7.897638312866785e-07, - "loss": 0.70191914, - "num_input_tokens_seen": 256724150, - "step": 11903, - "time_per_iteration": 4.382705926895142 - }, - { - "auxiliary_loss_clip": 0.0106915, - "auxiliary_loss_mlp": 0.01031243, - "balance_loss_clip": 1.03447473, - "balance_loss_mlp": 1.01918483, - "epoch": 0.715707199759507, - "flos": 18951641377920.0, - "grad_norm": 5.343255365048286, - "language_loss": 0.75641096, - "learning_rate": 7.894537898738589e-07, - "loss": 0.77741492, - "num_input_tokens_seen": 256742780, - "step": 11904, - "time_per_iteration": 4.288340330123901 - }, - { - "auxiliary_loss_clip": 0.01091072, - "auxiliary_loss_mlp": 0.01039419, - "balance_loss_clip": 1.03938174, - "balance_loss_mlp": 1.02566779, - "epoch": 0.7157673230121749, - "flos": 15304410299520.0, - "grad_norm": 2.088773074445301, - "language_loss": 0.72025734, - "learning_rate": 7.891437943650727e-07, - "loss": 0.74156225, - "num_input_tokens_seen": 256761355, - "step": 11905, - "time_per_iteration": 4.343631267547607 - }, - { - "auxiliary_loss_clip": 0.01077244, - "auxiliary_loss_mlp": 0.0103248, - "balance_loss_clip": 1.03842819, - "balance_loss_mlp": 1.02001703, - "epoch": 0.715827446264843, - "flos": 23221850964480.0, - "grad_norm": 1.657779728748099, - "language_loss": 0.779338, - "learning_rate": 7.88833844772076e-07, - "loss": 0.8004353, - "num_input_tokens_seen": 256781335, - "step": 11906, - "time_per_iteration": 2.8104159832000732 - }, - { - "auxiliary_loss_clip": 0.01014211, - "auxiliary_loss_mlp": 0.0099711, - "balance_loss_clip": 1.01162815, - "balance_loss_mlp": 0.99602473, - "epoch": 0.7158875695175109, - "flos": 60975421833600.0, - "grad_norm": 0.7366961855147857, - "language_loss": 0.55325353, - "learning_rate": 7.885239411066205e-07, - "loss": 0.57336664, - "num_input_tokens_seen": 256838890, - "step": 11907, - "time_per_iteration": 3.1521129608154297 - }, - { - "auxiliary_loss_clip": 0.01094066, - "auxiliary_loss_mlp": 0.01039845, - "balance_loss_clip": 1.03540492, - "balance_loss_mlp": 1.02677381, - "epoch": 0.7159476927701789, - "flos": 17128780024320.0, - "grad_norm": 1.89939740443007, - "language_loss": 0.69593656, - "learning_rate": 7.882140833804593e-07, - "loss": 0.71727568, - "num_input_tokens_seen": 256858145, - "step": 11908, - "time_per_iteration": 2.6724538803100586 - }, - { - "auxiliary_loss_clip": 0.01059783, - "auxiliary_loss_mlp": 0.01036603, - "balance_loss_clip": 1.03303337, - "balance_loss_mlp": 1.02254832, - "epoch": 0.7160078160228468, - "flos": 22490601886080.0, - "grad_norm": 1.6751841094057447, - "language_loss": 0.71237969, - "learning_rate": 7.879042716053415e-07, - "loss": 0.7333436, - "num_input_tokens_seen": 256878545, - "step": 11909, - "time_per_iteration": 2.779273509979248 - }, - { - "auxiliary_loss_clip": 0.01099917, - "auxiliary_loss_mlp": 0.01030028, - "balance_loss_clip": 1.0387938, - "balance_loss_mlp": 1.01755881, - "epoch": 0.7160679392755148, - "flos": 30590935626240.0, - "grad_norm": 1.4959151522362304, - "language_loss": 0.75010902, - "learning_rate": 7.875945057930144e-07, - "loss": 0.7714085, - "num_input_tokens_seen": 256899920, - "step": 11910, - "time_per_iteration": 2.7424912452697754 - }, - { - "auxiliary_loss_clip": 0.01085268, - "auxiliary_loss_mlp": 0.01034213, - "balance_loss_clip": 1.0382638, - "balance_loss_mlp": 1.02263737, - "epoch": 0.7161280625281827, - "flos": 21323648833920.0, - "grad_norm": 1.5302691845486787, - "language_loss": 0.76587963, - "learning_rate": 7.872847859552251e-07, - "loss": 0.78707445, - "num_input_tokens_seen": 256918460, - "step": 11911, - "time_per_iteration": 4.259274244308472 - }, - { - "auxiliary_loss_clip": 0.01069944, - "auxiliary_loss_mlp": 0.01043229, - "balance_loss_clip": 1.03755224, - "balance_loss_mlp": 1.02831018, - "epoch": 0.7161881857808508, - "flos": 61860078921600.0, - "grad_norm": 1.649161828071685, - "language_loss": 0.58413363, - "learning_rate": 7.869751121037192e-07, - "loss": 0.60526532, - "num_input_tokens_seen": 256942015, - "step": 11912, - "time_per_iteration": 3.1699318885803223 - }, - { - "auxiliary_loss_clip": 0.01101612, - "auxiliary_loss_mlp": 0.01031486, - "balance_loss_clip": 1.04070008, - "balance_loss_mlp": 1.01849806, - "epoch": 0.7162483090335187, - "flos": 20812101292800.0, - "grad_norm": 1.859500164824888, - "language_loss": 0.7810173, - "learning_rate": 7.866654842502376e-07, - "loss": 0.80234826, - "num_input_tokens_seen": 256961065, - "step": 11913, - "time_per_iteration": 2.704882860183716 - }, - { - "auxiliary_loss_clip": 0.01087765, - "auxiliary_loss_mlp": 0.0102754, - "balance_loss_clip": 1.03807175, - "balance_loss_mlp": 1.01646566, - "epoch": 0.7163084322861867, - "flos": 24097532630400.0, - "grad_norm": 1.6076637682641197, - "language_loss": 0.74075729, - "learning_rate": 7.863559024065234e-07, - "loss": 0.76191038, - "num_input_tokens_seen": 256982165, - "step": 11914, - "time_per_iteration": 2.7636988162994385 - }, - { - "auxiliary_loss_clip": 0.01075409, - "auxiliary_loss_mlp": 0.01033103, - "balance_loss_clip": 1.036044, - "balance_loss_mlp": 1.02074111, - "epoch": 0.7163685555388547, - "flos": 20080888128000.0, - "grad_norm": 1.6922973692533387, - "language_loss": 0.74138194, - "learning_rate": 7.860463665843143e-07, - "loss": 0.76246703, - "num_input_tokens_seen": 256999825, - "step": 11915, - "time_per_iteration": 2.816134452819824 - }, - { - "auxiliary_loss_clip": 0.01111475, - "auxiliary_loss_mlp": 0.01032503, - "balance_loss_clip": 1.0383029, - "balance_loss_mlp": 1.02015853, - "epoch": 0.7164286787915226, - "flos": 17456967613440.0, - "grad_norm": 2.8016306362793353, - "language_loss": 0.80886412, - "learning_rate": 7.85736876795349e-07, - "loss": 0.83030391, - "num_input_tokens_seen": 257017450, - "step": 11916, - "time_per_iteration": 2.666930675506592 - }, - { - "auxiliary_loss_clip": 0.01033862, - "auxiliary_loss_mlp": 0.01034435, - "balance_loss_clip": 1.03228307, - "balance_loss_mlp": 1.0218699, - "epoch": 0.7164888020441906, - "flos": 19718908819200.0, - "grad_norm": 1.9058816458994292, - "language_loss": 0.6875428, - "learning_rate": 7.854274330513626e-07, - "loss": 0.70822579, - "num_input_tokens_seen": 257035465, - "step": 11917, - "time_per_iteration": 3.0043599605560303 - }, - { - "auxiliary_loss_clip": 0.0108964, - "auxiliary_loss_mlp": 0.01035669, - "balance_loss_clip": 1.03826666, - "balance_loss_mlp": 1.0224905, - "epoch": 0.7165489252968585, - "flos": 21470523546240.0, - "grad_norm": 2.1418903876984614, - "language_loss": 0.75930321, - "learning_rate": 7.851180353640896e-07, - "loss": 0.78055626, - "num_input_tokens_seen": 257053750, - "step": 11918, - "time_per_iteration": 2.8666863441467285 - }, - { - "auxiliary_loss_clip": 0.01012914, - "auxiliary_loss_mlp": 0.0100742, - "balance_loss_clip": 1.00994635, - "balance_loss_mlp": 1.00643027, - "epoch": 0.7166090485495266, - "flos": 69928060464000.0, - "grad_norm": 0.6290817017745445, - "language_loss": 0.53839982, - "learning_rate": 7.848086837452639e-07, - "loss": 0.55860317, - "num_input_tokens_seen": 257121215, - "step": 11919, - "time_per_iteration": 3.3189728260040283 - }, - { - "auxiliary_loss_clip": 0.01090721, - "auxiliary_loss_mlp": 0.0103132, - "balance_loss_clip": 1.03968215, - "balance_loss_mlp": 1.01944053, - "epoch": 0.7166691718021945, - "flos": 27343892949120.0, - "grad_norm": 2.4558245905246188, - "language_loss": 0.68792629, - "learning_rate": 7.844993782066132e-07, - "loss": 0.70914674, - "num_input_tokens_seen": 257143370, - "step": 11920, - "time_per_iteration": 2.7760236263275146 - }, - { - "auxiliary_loss_clip": 0.01093244, - "auxiliary_loss_mlp": 0.01042352, - "balance_loss_clip": 1.03837049, - "balance_loss_mlp": 1.02936387, - "epoch": 0.7167292950548625, - "flos": 30408868563840.0, - "grad_norm": 1.7838996304195904, - "language_loss": 0.75269383, - "learning_rate": 7.841901187598678e-07, - "loss": 0.77404976, - "num_input_tokens_seen": 257162160, - "step": 11921, - "time_per_iteration": 2.775209426879883 - }, - { - "auxiliary_loss_clip": 0.01081729, - "auxiliary_loss_mlp": 0.01036086, - "balance_loss_clip": 1.04076838, - "balance_loss_mlp": 1.02052867, - "epoch": 0.7167894183075304, - "flos": 14571257800320.0, - "grad_norm": 2.2701090477680546, - "language_loss": 0.75837505, - "learning_rate": 7.83880905416755e-07, - "loss": 0.77955317, - "num_input_tokens_seen": 257179300, - "step": 11922, - "time_per_iteration": 2.7607452869415283 - }, - { - "auxiliary_loss_clip": 0.01014406, - "auxiliary_loss_mlp": 0.01014898, - "balance_loss_clip": 1.00970268, - "balance_loss_mlp": 1.01383746, - "epoch": 0.7168495415601984, - "flos": 64110674407680.0, - "grad_norm": 0.7523286809102585, - "language_loss": 0.55089313, - "learning_rate": 7.83571738189001e-07, - "loss": 0.57118618, - "num_input_tokens_seen": 257235470, - "step": 11923, - "time_per_iteration": 3.0676429271698 - }, - { - "auxiliary_loss_clip": 0.01080014, - "auxiliary_loss_mlp": 0.01037915, - "balance_loss_clip": 1.03623641, - "balance_loss_mlp": 1.024611, - "epoch": 0.7169096648128663, - "flos": 24681440119680.0, - "grad_norm": 1.4525334689031153, - "language_loss": 0.7698282, - "learning_rate": 7.832626170883279e-07, - "loss": 0.79100752, - "num_input_tokens_seen": 257255850, - "step": 11924, - "time_per_iteration": 2.823679208755493 - }, - { - "auxiliary_loss_clip": 0.01078337, - "auxiliary_loss_mlp": 0.01034494, - "balance_loss_clip": 1.0381155, - "balance_loss_mlp": 1.02288294, - "epoch": 0.7169697880655344, - "flos": 20667525050880.0, - "grad_norm": 1.7352538037253364, - "language_loss": 0.68109524, - "learning_rate": 7.829535421264588e-07, - "loss": 0.70222354, - "num_input_tokens_seen": 257275425, - "step": 11925, - "time_per_iteration": 2.7586591243743896 - }, - { - "auxiliary_loss_clip": 0.01080533, - "auxiliary_loss_mlp": 0.01032844, - "balance_loss_clip": 1.03722239, - "balance_loss_mlp": 1.02085745, - "epoch": 0.7170299113182023, - "flos": 21032700670080.0, - "grad_norm": 1.565689357795704, - "language_loss": 0.77380347, - "learning_rate": 7.826445133151133e-07, - "loss": 0.79493719, - "num_input_tokens_seen": 257295740, - "step": 11926, - "time_per_iteration": 2.777597188949585 - }, - { - "auxiliary_loss_clip": 0.01099959, - "auxiliary_loss_mlp": 0.00771085, - "balance_loss_clip": 1.03791356, - "balance_loss_mlp": 1.00019264, - "epoch": 0.7170900345708703, - "flos": 22893304239360.0, - "grad_norm": 1.9891447928832446, - "language_loss": 0.77106082, - "learning_rate": 7.823355306660093e-07, - "loss": 0.78977132, - "num_input_tokens_seen": 257315970, - "step": 11927, - "time_per_iteration": 2.722008228302002 - }, - { - "auxiliary_loss_clip": 0.01103176, - "auxiliary_loss_mlp": 0.01032942, - "balance_loss_clip": 1.04161656, - "balance_loss_mlp": 1.01948345, - "epoch": 0.7171501578235383, - "flos": 15518688883200.0, - "grad_norm": 1.5109458575320354, - "language_loss": 0.69240952, - "learning_rate": 7.820265941908642e-07, - "loss": 0.71377075, - "num_input_tokens_seen": 257334230, - "step": 11928, - "time_per_iteration": 2.685173511505127 - }, - { - "auxiliary_loss_clip": 0.01063212, - "auxiliary_loss_mlp": 0.01033233, - "balance_loss_clip": 1.03615737, - "balance_loss_mlp": 1.02093053, - "epoch": 0.7172102810762062, - "flos": 26104292640000.0, - "grad_norm": 1.8437632186543573, - "language_loss": 0.64895999, - "learning_rate": 7.817177039013931e-07, - "loss": 0.66992444, - "num_input_tokens_seen": 257352145, - "step": 11929, - "time_per_iteration": 2.811458110809326 - }, - { - "auxiliary_loss_clip": 0.01084354, - "auxiliary_loss_mlp": 0.0103302, - "balance_loss_clip": 1.03474772, - "balance_loss_mlp": 1.0201571, - "epoch": 0.7172704043288742, - "flos": 21506649649920.0, - "grad_norm": 4.025535473729134, - "language_loss": 0.70036447, - "learning_rate": 7.81408859809308e-07, - "loss": 0.72153819, - "num_input_tokens_seen": 257371460, - "step": 11930, - "time_per_iteration": 2.7018861770629883 - }, - { - "auxiliary_loss_clip": 0.01073615, - "auxiliary_loss_mlp": 0.01032596, - "balance_loss_clip": 1.03261399, - "balance_loss_mlp": 1.01994824, - "epoch": 0.7173305275815421, - "flos": 18770939032320.0, - "grad_norm": 1.865130875534894, - "language_loss": 0.80753005, - "learning_rate": 7.811000619263219e-07, - "loss": 0.82859218, - "num_input_tokens_seen": 257390800, - "step": 11931, - "time_per_iteration": 2.814512252807617 - }, - { - "auxiliary_loss_clip": 0.01099893, - "auxiliary_loss_mlp": 0.01032133, - "balance_loss_clip": 1.03860784, - "balance_loss_mlp": 1.02030206, - "epoch": 0.7173906508342102, - "flos": 16179876483840.0, - "grad_norm": 2.1237811167102967, - "language_loss": 0.77989686, - "learning_rate": 7.80791310264143e-07, - "loss": 0.80121714, - "num_input_tokens_seen": 257407495, - "step": 11932, - "time_per_iteration": 2.643590211868286 - }, - { - "auxiliary_loss_clip": 0.01094325, - "auxiliary_loss_mlp": 0.01031086, - "balance_loss_clip": 1.03725207, - "balance_loss_mlp": 1.01856303, - "epoch": 0.7174507740868781, - "flos": 26613864933120.0, - "grad_norm": 1.4329540611911684, - "language_loss": 0.75208265, - "learning_rate": 7.804826048344803e-07, - "loss": 0.77333677, - "num_input_tokens_seen": 257429675, - "step": 11933, - "time_per_iteration": 2.73256254196167 - }, - { - "auxiliary_loss_clip": 0.01118631, - "auxiliary_loss_mlp": 0.01038608, - "balance_loss_clip": 1.04044771, - "balance_loss_mlp": 1.02359951, - "epoch": 0.7175108973395461, - "flos": 18432911116800.0, - "grad_norm": 2.5273912434143537, - "language_loss": 0.69165599, - "learning_rate": 7.801739456490388e-07, - "loss": 0.71322834, - "num_input_tokens_seen": 257442765, - "step": 11934, - "time_per_iteration": 2.63053822517395 - }, - { - "auxiliary_loss_clip": 0.01101966, - "auxiliary_loss_mlp": 0.01034522, - "balance_loss_clip": 1.03851914, - "balance_loss_mlp": 1.02134395, - "epoch": 0.717571020592214, - "flos": 23914962777600.0, - "grad_norm": 2.3786346670781886, - "language_loss": 0.86663944, - "learning_rate": 7.798653327195237e-07, - "loss": 0.88800436, - "num_input_tokens_seen": 257459310, - "step": 11935, - "time_per_iteration": 2.7059433460235596 - }, - { - "auxiliary_loss_clip": 0.01068502, - "auxiliary_loss_mlp": 0.01030899, - "balance_loss_clip": 1.03335261, - "balance_loss_mlp": 1.01750588, - "epoch": 0.717631143844882, - "flos": 38256930109440.0, - "grad_norm": 1.5650811923001593, - "language_loss": 0.73900878, - "learning_rate": 7.795567660576388e-07, - "loss": 0.76000285, - "num_input_tokens_seen": 257484750, - "step": 11936, - "time_per_iteration": 2.8850317001342773 - }, - { - "auxiliary_loss_clip": 0.01029429, - "auxiliary_loss_mlp": 0.01001743, - "balance_loss_clip": 1.00656271, - "balance_loss_mlp": 1.00076544, - "epoch": 0.7176912670975499, - "flos": 65515896328320.0, - "grad_norm": 0.7545285974494826, - "language_loss": 0.55848956, - "learning_rate": 7.79248245675082e-07, - "loss": 0.57880127, - "num_input_tokens_seen": 257543110, - "step": 11937, - "time_per_iteration": 3.1446144580841064 - }, - { - "auxiliary_loss_clip": 0.01104456, - "auxiliary_loss_mlp": 0.01037308, - "balance_loss_clip": 1.03975892, - "balance_loss_mlp": 1.02318776, - "epoch": 0.717751390350218, - "flos": 31281066610560.0, - "grad_norm": 1.8325127153814165, - "language_loss": 0.54673332, - "learning_rate": 7.789397715835542e-07, - "loss": 0.568151, - "num_input_tokens_seen": 257567410, - "step": 11938, - "time_per_iteration": 2.7281179428100586 - }, - { - "auxiliary_loss_clip": 0.01098499, - "auxiliary_loss_mlp": 0.01031442, - "balance_loss_clip": 1.03886163, - "balance_loss_mlp": 1.01891303, - "epoch": 0.7178115136028859, - "flos": 19859031774720.0, - "grad_norm": 1.5418999350026745, - "language_loss": 0.76693535, - "learning_rate": 7.786313437947527e-07, - "loss": 0.78823477, - "num_input_tokens_seen": 257586270, - "step": 11939, - "time_per_iteration": 2.681007146835327 - }, - { - "auxiliary_loss_clip": 0.01013928, - "auxiliary_loss_mlp": 0.01000101, - "balance_loss_clip": 1.01088846, - "balance_loss_mlp": 0.99894488, - "epoch": 0.7178716368555539, - "flos": 64348655967360.0, - "grad_norm": 0.7513743107787466, - "language_loss": 0.61356354, - "learning_rate": 7.783229623203738e-07, - "loss": 0.63370389, - "num_input_tokens_seen": 257647415, - "step": 11940, - "time_per_iteration": 3.202899694442749 - }, - { - "auxiliary_loss_clip": 0.01071936, - "auxiliary_loss_mlp": 0.01033376, - "balance_loss_clip": 1.03435445, - "balance_loss_mlp": 1.02100182, - "epoch": 0.7179317601082219, - "flos": 26762607152640.0, - "grad_norm": 1.8000940083228283, - "language_loss": 0.58835107, - "learning_rate": 7.780146271721097e-07, - "loss": 0.60940421, - "num_input_tokens_seen": 257669795, - "step": 11941, - "time_per_iteration": 2.8157269954681396 - }, - { - "auxiliary_loss_clip": 0.01090967, - "auxiliary_loss_mlp": 0.01035406, - "balance_loss_clip": 1.03997254, - "balance_loss_mlp": 1.02213192, - "epoch": 0.7179918833608898, - "flos": 23513804709120.0, - "grad_norm": 1.9761608738591345, - "language_loss": 0.79027683, - "learning_rate": 7.777063383616543e-07, - "loss": 0.8115406, - "num_input_tokens_seen": 257687415, - "step": 11942, - "time_per_iteration": 4.7641441822052 - }, - { - "auxiliary_loss_clip": 0.01101717, - "auxiliary_loss_mlp": 0.01043851, - "balance_loss_clip": 1.03940737, - "balance_loss_mlp": 1.03082132, - "epoch": 0.7180520066135578, - "flos": 17165588486400.0, - "grad_norm": 2.14920903348502, - "language_loss": 0.66369361, - "learning_rate": 7.773980959006968e-07, - "loss": 0.68514931, - "num_input_tokens_seen": 257706215, - "step": 11943, - "time_per_iteration": 4.182480335235596 - }, - { - "auxiliary_loss_clip": 0.01111064, - "auxiliary_loss_mlp": 0.01032443, - "balance_loss_clip": 1.03972828, - "balance_loss_mlp": 1.01943135, - "epoch": 0.7181121298662257, - "flos": 17566638814080.0, - "grad_norm": 1.806449010910671, - "language_loss": 0.79078984, - "learning_rate": 7.770898998009254e-07, - "loss": 0.81222498, - "num_input_tokens_seen": 257724740, - "step": 11944, - "time_per_iteration": 2.5949878692626953 - }, - { - "auxiliary_loss_clip": 0.01088381, - "auxiliary_loss_mlp": 0.00771584, - "balance_loss_clip": 1.0390811, - "balance_loss_mlp": 1.00018096, - "epoch": 0.7181722531188938, - "flos": 11947660508160.0, - "grad_norm": 2.453862625605413, - "language_loss": 0.63021427, - "learning_rate": 7.767817500740277e-07, - "loss": 0.64881396, - "num_input_tokens_seen": 257742060, - "step": 11945, - "time_per_iteration": 4.4570722579956055 - }, - { - "auxiliary_loss_clip": 0.01016433, - "auxiliary_loss_mlp": 0.01004566, - "balance_loss_clip": 1.00740266, - "balance_loss_mlp": 1.00340927, - "epoch": 0.7182323763715617, - "flos": 65503649790720.0, - "grad_norm": 0.7009639524775984, - "language_loss": 0.51063281, - "learning_rate": 7.76473646731689e-07, - "loss": 0.53084278, - "num_input_tokens_seen": 257802250, - "step": 11946, - "time_per_iteration": 3.083326816558838 - }, - { - "auxiliary_loss_clip": 0.01082274, - "auxiliary_loss_mlp": 0.01035503, - "balance_loss_clip": 1.03858232, - "balance_loss_mlp": 1.02061403, - "epoch": 0.7182924996242297, - "flos": 20630932070400.0, - "grad_norm": 1.6221308546961208, - "language_loss": 0.74305403, - "learning_rate": 7.761655897855925e-07, - "loss": 0.7642318, - "num_input_tokens_seen": 257821155, - "step": 11947, - "time_per_iteration": 2.690142869949341 - }, - { - "auxiliary_loss_clip": 0.01063215, - "auxiliary_loss_mlp": 0.00770856, - "balance_loss_clip": 1.03264832, - "balance_loss_mlp": 1.0000999, - "epoch": 0.7183526228768976, - "flos": 16216433550720.0, - "grad_norm": 1.4641702489475559, - "language_loss": 0.72301382, - "learning_rate": 7.758575792474187e-07, - "loss": 0.74135453, - "num_input_tokens_seen": 257839905, - "step": 11948, - "time_per_iteration": 2.722843647003174 - }, - { - "auxiliary_loss_clip": 0.01090958, - "auxiliary_loss_mlp": 0.01044843, - "balance_loss_clip": 1.03650224, - "balance_loss_mlp": 1.0302515, - "epoch": 0.7184127461295656, - "flos": 22232655342720.0, - "grad_norm": 1.5800605567869153, - "language_loss": 0.71426845, - "learning_rate": 7.755496151288483e-07, - "loss": 0.73562646, - "num_input_tokens_seen": 257860055, - "step": 11949, - "time_per_iteration": 2.6724255084991455 - }, - { - "auxiliary_loss_clip": 0.01110775, - "auxiliary_loss_mlp": 0.00770919, - "balance_loss_clip": 1.03964746, - "balance_loss_mlp": 1.00022686, - "epoch": 0.7184728693822335, - "flos": 27344503480320.0, - "grad_norm": 2.2408917866135116, - "language_loss": 0.76207352, - "learning_rate": 7.752416974415598e-07, - "loss": 0.78089041, - "num_input_tokens_seen": 257879315, - "step": 11950, - "time_per_iteration": 4.192263603210449 - }, - { - "auxiliary_loss_clip": 0.011156, - "auxiliary_loss_mlp": 0.01034636, - "balance_loss_clip": 1.04076946, - "balance_loss_mlp": 1.02039647, - "epoch": 0.7185329926349016, - "flos": 16508530949760.0, - "grad_norm": 2.236939541243443, - "language_loss": 0.67911047, - "learning_rate": 7.749338261972282e-07, - "loss": 0.70061278, - "num_input_tokens_seen": 257896570, - "step": 11951, - "time_per_iteration": 2.506354808807373 - }, - { - "auxiliary_loss_clip": 0.01093328, - "auxiliary_loss_mlp": 0.01038497, - "balance_loss_clip": 1.03931642, - "balance_loss_mlp": 1.02329814, - "epoch": 0.7185931158875695, - "flos": 23951052967680.0, - "grad_norm": 1.74286410335133, - "language_loss": 0.78158391, - "learning_rate": 7.746260014075286e-07, - "loss": 0.8029021, - "num_input_tokens_seen": 257916855, - "step": 11952, - "time_per_iteration": 2.660937547683716 - }, - { - "auxiliary_loss_clip": 0.01106031, - "auxiliary_loss_mlp": 0.01036025, - "balance_loss_clip": 1.03961015, - "balance_loss_mlp": 1.02241182, - "epoch": 0.7186532391402375, - "flos": 26542007775360.0, - "grad_norm": 1.8142092778297234, - "language_loss": 0.74966663, - "learning_rate": 7.743182230841352e-07, - "loss": 0.77108717, - "num_input_tokens_seen": 257937140, - "step": 11953, - "time_per_iteration": 2.64990234375 - }, - { - "auxiliary_loss_clip": 0.01104406, - "auxiliary_loss_mlp": 0.010347, - "balance_loss_clip": 1.03859532, - "balance_loss_mlp": 1.0209074, - "epoch": 0.7187133623929055, - "flos": 22383049587840.0, - "grad_norm": 1.8633986860843366, - "language_loss": 0.73231012, - "learning_rate": 7.740104912387164e-07, - "loss": 0.75370121, - "num_input_tokens_seen": 257956785, - "step": 11954, - "time_per_iteration": 2.667728900909424 - }, - { - "auxiliary_loss_clip": 0.01092336, - "auxiliary_loss_mlp": 0.01037743, - "balance_loss_clip": 1.04056668, - "balance_loss_mlp": 1.02468944, - "epoch": 0.7187734856455734, - "flos": 15779580341760.0, - "grad_norm": 1.6371467452088548, - "language_loss": 0.7436921, - "learning_rate": 7.737028058829425e-07, - "loss": 0.76499295, - "num_input_tokens_seen": 257975455, - "step": 11955, - "time_per_iteration": 2.750943660736084 - }, - { - "auxiliary_loss_clip": 0.01077053, - "auxiliary_loss_mlp": 0.01034821, - "balance_loss_clip": 1.03667569, - "balance_loss_mlp": 1.02145171, - "epoch": 0.7188336088982414, - "flos": 31759612531200.0, - "grad_norm": 1.63362456002572, - "language_loss": 0.73112231, - "learning_rate": 7.733951670284817e-07, - "loss": 0.75224108, - "num_input_tokens_seen": 257996850, - "step": 11956, - "time_per_iteration": 2.7964000701904297 - }, - { - "auxiliary_loss_clip": 0.01027108, - "auxiliary_loss_mlp": 0.01054242, - "balance_loss_clip": 1.0295012, - "balance_loss_mlp": 1.0388875, - "epoch": 0.7188937321509093, - "flos": 21465208333440.0, - "grad_norm": 1.634055582279059, - "language_loss": 0.71066529, - "learning_rate": 7.730875746869987e-07, - "loss": 0.73147881, - "num_input_tokens_seen": 258016145, - "step": 11957, - "time_per_iteration": 2.920449733734131 - }, - { - "auxiliary_loss_clip": 0.01066083, - "auxiliary_loss_mlp": 0.01046033, - "balance_loss_clip": 1.03746307, - "balance_loss_mlp": 1.03144193, - "epoch": 0.7189538554035774, - "flos": 27271497087360.0, - "grad_norm": 1.9298649142974575, - "language_loss": 0.73817873, - "learning_rate": 7.727800288701582e-07, - "loss": 0.75929987, - "num_input_tokens_seen": 258035420, - "step": 11958, - "time_per_iteration": 2.8204050064086914 - }, - { - "auxiliary_loss_clip": 0.01097894, - "auxiliary_loss_mlp": 0.01043657, - "balance_loss_clip": 1.03673959, - "balance_loss_mlp": 1.03006124, - "epoch": 0.7190139786562453, - "flos": 21580625710080.0, - "grad_norm": 1.5794968369614186, - "language_loss": 0.83998394, - "learning_rate": 7.724725295896215e-07, - "loss": 0.86139941, - "num_input_tokens_seen": 258053520, - "step": 11959, - "time_per_iteration": 2.7135143280029297 - }, - { - "auxiliary_loss_clip": 0.01118944, - "auxiliary_loss_mlp": 0.01033809, - "balance_loss_clip": 1.04263496, - "balance_loss_mlp": 1.0193491, - "epoch": 0.7190741019089133, - "flos": 26721237663360.0, - "grad_norm": 1.6672676962556263, - "language_loss": 0.81917083, - "learning_rate": 7.7216507685705e-07, - "loss": 0.84069836, - "num_input_tokens_seen": 258073020, - "step": 11960, - "time_per_iteration": 2.6510887145996094 - }, - { - "auxiliary_loss_clip": 0.01085237, - "auxiliary_loss_mlp": 0.01040267, - "balance_loss_clip": 1.03664184, - "balance_loss_mlp": 1.02624774, - "epoch": 0.7191342251615812, - "flos": 26104759516800.0, - "grad_norm": 1.541177269995967, - "language_loss": 0.77309084, - "learning_rate": 7.718576706841013e-07, - "loss": 0.79434586, - "num_input_tokens_seen": 258093155, - "step": 11961, - "time_per_iteration": 2.720644950866699 - }, - { - "auxiliary_loss_clip": 0.01093865, - "auxiliary_loss_mlp": 0.01034927, - "balance_loss_clip": 1.03698349, - "balance_loss_mlp": 1.02280951, - "epoch": 0.7191943484142492, - "flos": 22967028904320.0, - "grad_norm": 1.422930099710146, - "language_loss": 0.75150669, - "learning_rate": 7.715503110824326e-07, - "loss": 0.7727946, - "num_input_tokens_seen": 258113905, - "step": 11962, - "time_per_iteration": 2.602642774581909 - }, - { - "auxiliary_loss_clip": 0.01101563, - "auxiliary_loss_mlp": 0.01033205, - "balance_loss_clip": 1.03852582, - "balance_loss_mlp": 1.01952553, - "epoch": 0.7192544716669171, - "flos": 22565332131840.0, - "grad_norm": 1.6830971031616218, - "language_loss": 0.74998534, - "learning_rate": 7.712429980637001e-07, - "loss": 0.77133304, - "num_input_tokens_seen": 258132820, - "step": 11963, - "time_per_iteration": 2.6065595149993896 - }, - { - "auxiliary_loss_clip": 0.01076507, - "auxiliary_loss_mlp": 0.0103598, - "balance_loss_clip": 1.03903389, - "balance_loss_mlp": 1.02130532, - "epoch": 0.7193145949195852, - "flos": 18982200873600.0, - "grad_norm": 2.2290722742706253, - "language_loss": 0.80742419, - "learning_rate": 7.709357316395564e-07, - "loss": 0.82854903, - "num_input_tokens_seen": 258148055, - "step": 11964, - "time_per_iteration": 2.623037338256836 - }, - { - "auxiliary_loss_clip": 0.0110166, - "auxiliary_loss_mlp": 0.01035653, - "balance_loss_clip": 1.03931797, - "balance_loss_mlp": 1.02267718, - "epoch": 0.7193747181722531, - "flos": 18004246208640.0, - "grad_norm": 1.8511533341084931, - "language_loss": 0.74847329, - "learning_rate": 7.70628511821652e-07, - "loss": 0.76984644, - "num_input_tokens_seen": 258165995, - "step": 11965, - "time_per_iteration": 2.6308131217956543 - }, - { - "auxiliary_loss_clip": 0.01088669, - "auxiliary_loss_mlp": 0.0103597, - "balance_loss_clip": 1.04116011, - "balance_loss_mlp": 1.02225494, - "epoch": 0.7194348414249211, - "flos": 24389414547840.0, - "grad_norm": 1.5072398598153138, - "language_loss": 0.77484959, - "learning_rate": 7.703213386216377e-07, - "loss": 0.79609603, - "num_input_tokens_seen": 258186165, - "step": 11966, - "time_per_iteration": 2.7064943313598633 - }, - { - "auxiliary_loss_clip": 0.0108693, - "auxiliary_loss_mlp": 0.01040354, - "balance_loss_clip": 1.03570664, - "balance_loss_mlp": 1.02598929, - "epoch": 0.7194949646775891, - "flos": 22163455791360.0, - "grad_norm": 2.094523780207328, - "language_loss": 0.72974217, - "learning_rate": 7.700142120511619e-07, - "loss": 0.75101507, - "num_input_tokens_seen": 258204595, - "step": 11967, - "time_per_iteration": 2.6798341274261475 - }, - { - "auxiliary_loss_clip": 0.01084414, - "auxiliary_loss_mlp": 0.01030462, - "balance_loss_clip": 1.03810835, - "balance_loss_mlp": 1.01876187, - "epoch": 0.719555087930257, - "flos": 20266366982400.0, - "grad_norm": 1.6400995747939784, - "language_loss": 0.81876254, - "learning_rate": 7.6970713212187e-07, - "loss": 0.83991134, - "num_input_tokens_seen": 258223110, - "step": 11968, - "time_per_iteration": 2.5945241451263428 - }, - { - "auxiliary_loss_clip": 0.01090809, - "auxiliary_loss_mlp": 0.01030819, - "balance_loss_clip": 1.03921008, - "balance_loss_mlp": 1.01730037, - "epoch": 0.719615211182925, - "flos": 24716309247360.0, - "grad_norm": 6.059293757732166, - "language_loss": 0.76039946, - "learning_rate": 7.69400098845407e-07, - "loss": 0.78161573, - "num_input_tokens_seen": 258242660, - "step": 11969, - "time_per_iteration": 2.669769763946533 - }, - { - "auxiliary_loss_clip": 0.01071764, - "auxiliary_loss_mlp": 0.01035075, - "balance_loss_clip": 1.03422332, - "balance_loss_mlp": 1.02085924, - "epoch": 0.719675334435593, - "flos": 20009641501440.0, - "grad_norm": 1.7540280095141672, - "language_loss": 0.71060121, - "learning_rate": 7.69093112233417e-07, - "loss": 0.7316696, - "num_input_tokens_seen": 258261850, - "step": 11970, - "time_per_iteration": 2.679556131362915 - }, - { - "auxiliary_loss_clip": 0.01013659, - "auxiliary_loss_mlp": 0.01009131, - "balance_loss_clip": 1.00968122, - "balance_loss_mlp": 1.00800419, - "epoch": 0.719735457688261, - "flos": 44199861177600.0, - "grad_norm": 0.9164669258671052, - "language_loss": 0.60825729, - "learning_rate": 7.68786172297538e-07, - "loss": 0.6284852, - "num_input_tokens_seen": 258312570, - "step": 11971, - "time_per_iteration": 3.07918381690979 - }, - { - "auxiliary_loss_clip": 0.01119878, - "auxiliary_loss_mlp": 0.0103657, - "balance_loss_clip": 1.04122591, - "balance_loss_mlp": 1.02223504, - "epoch": 0.7197955809409289, - "flos": 16802890905600.0, - "grad_norm": 2.0890119632055772, - "language_loss": 0.80200607, - "learning_rate": 7.684792790494105e-07, - "loss": 0.82357055, - "num_input_tokens_seen": 258331600, - "step": 11972, - "time_per_iteration": 2.6157615184783936 - }, - { - "auxiliary_loss_clip": 0.01094231, - "auxiliary_loss_mlp": 0.01036827, - "balance_loss_clip": 1.03909624, - "balance_loss_mlp": 1.02286744, - "epoch": 0.7198557041935969, - "flos": 24535391420160.0, - "grad_norm": 1.4459296159534718, - "language_loss": 0.75361621, - "learning_rate": 7.681724325006733e-07, - "loss": 0.77492678, - "num_input_tokens_seen": 258351785, - "step": 11973, - "time_per_iteration": 2.7092697620391846 - }, - { - "auxiliary_loss_clip": 0.00998126, - "auxiliary_loss_mlp": 0.01000341, - "balance_loss_clip": 1.01353586, - "balance_loss_mlp": 0.99922049, - "epoch": 0.7199158274462648, - "flos": 70710839602560.0, - "grad_norm": 0.8513128948563679, - "language_loss": 0.5708431, - "learning_rate": 7.6786563266296e-07, - "loss": 0.5908277, - "num_input_tokens_seen": 258404035, - "step": 11974, - "time_per_iteration": 3.085857391357422 - }, - { - "auxiliary_loss_clip": 0.01087282, - "auxiliary_loss_mlp": 0.01034258, - "balance_loss_clip": 1.03747392, - "balance_loss_mlp": 1.02043021, - "epoch": 0.7199759506989328, - "flos": 29347995352320.0, - "grad_norm": 2.3096725812803225, - "language_loss": 0.61059892, - "learning_rate": 7.675588795479062e-07, - "loss": 0.6318143, - "num_input_tokens_seen": 258424850, - "step": 11975, - "time_per_iteration": 2.7332818508148193 - }, - { - "auxiliary_loss_clip": 0.01100807, - "auxiliary_loss_mlp": 0.01033871, - "balance_loss_clip": 1.03652167, - "balance_loss_mlp": 1.02041817, - "epoch": 0.7200360739516007, - "flos": 24640465680000.0, - "grad_norm": 2.671508087455807, - "language_loss": 0.67916059, - "learning_rate": 7.672521731671425e-07, - "loss": 0.7005074, - "num_input_tokens_seen": 258445485, - "step": 11976, - "time_per_iteration": 2.6940202713012695 - }, - { - "auxiliary_loss_clip": 0.0108397, - "auxiliary_loss_mlp": 0.01030323, - "balance_loss_clip": 1.03955865, - "balance_loss_mlp": 1.0175494, - "epoch": 0.7200961972042688, - "flos": 20812855478400.0, - "grad_norm": 1.8443077153848637, - "language_loss": 0.67261469, - "learning_rate": 7.669455135323004e-07, - "loss": 0.69375765, - "num_input_tokens_seen": 258464505, - "step": 11977, - "time_per_iteration": 2.6581647396087646 - }, - { - "auxiliary_loss_clip": 0.01091707, - "auxiliary_loss_mlp": 0.01036372, - "balance_loss_clip": 1.03710294, - "balance_loss_mlp": 1.02315187, - "epoch": 0.7201563204569367, - "flos": 31245910174080.0, - "grad_norm": 1.5443170627433962, - "language_loss": 0.75495118, - "learning_rate": 7.666389006550074e-07, - "loss": 0.776232, - "num_input_tokens_seen": 258487190, - "step": 11978, - "time_per_iteration": 2.8164350986480713 - }, - { - "auxiliary_loss_clip": 0.0111045, - "auxiliary_loss_mlp": 0.01033615, - "balance_loss_clip": 1.03798056, - "balance_loss_mlp": 1.02009642, - "epoch": 0.7202164437096047, - "flos": 26651391667200.0, - "grad_norm": 2.011628151794158, - "language_loss": 0.78906727, - "learning_rate": 7.663323345468908e-07, - "loss": 0.81050789, - "num_input_tokens_seen": 258503790, - "step": 11979, - "time_per_iteration": 2.603609323501587 - }, - { - "auxiliary_loss_clip": 0.01100805, - "auxiliary_loss_mlp": 0.01032476, - "balance_loss_clip": 1.03782308, - "balance_loss_mlp": 1.01863027, - "epoch": 0.7202765669622727, - "flos": 25959608657280.0, - "grad_norm": 1.489458439869756, - "language_loss": 0.64516908, - "learning_rate": 7.660258152195767e-07, - "loss": 0.66650194, - "num_input_tokens_seen": 258527335, - "step": 11980, - "time_per_iteration": 2.6712260246276855 - }, - { - "auxiliary_loss_clip": 0.01106474, - "auxiliary_loss_mlp": 0.01037898, - "balance_loss_clip": 1.04096806, - "balance_loss_mlp": 1.02322936, - "epoch": 0.7203366902149406, - "flos": 28512354372480.0, - "grad_norm": 3.283132344520263, - "language_loss": 0.67034644, - "learning_rate": 7.657193426846871e-07, - "loss": 0.69179016, - "num_input_tokens_seen": 258546690, - "step": 11981, - "time_per_iteration": 4.248534202575684 - }, - { - "auxiliary_loss_clip": 0.01080413, - "auxiliary_loss_mlp": 0.01035174, - "balance_loss_clip": 1.03540182, - "balance_loss_mlp": 1.02077293, - "epoch": 0.7203968134676086, - "flos": 21106030285440.0, - "grad_norm": 1.9200957279106055, - "language_loss": 0.74228042, - "learning_rate": 7.65412916953843e-07, - "loss": 0.76343632, - "num_input_tokens_seen": 258566340, - "step": 11982, - "time_per_iteration": 2.6612656116485596 - }, - { - "auxiliary_loss_clip": 0.01082612, - "auxiliary_loss_mlp": 0.00771666, - "balance_loss_clip": 1.03610659, - "balance_loss_mlp": 1.00010824, - "epoch": 0.7204569367202766, - "flos": 18332146488960.0, - "grad_norm": 1.9444187102114145, - "language_loss": 0.65890288, - "learning_rate": 7.65106538038665e-07, - "loss": 0.67744565, - "num_input_tokens_seen": 258584455, - "step": 11983, - "time_per_iteration": 5.959589004516602 - }, - { - "auxiliary_loss_clip": 0.01084437, - "auxiliary_loss_mlp": 0.01035638, - "balance_loss_clip": 1.04208398, - "balance_loss_mlp": 1.02224469, - "epoch": 0.7205170599729446, - "flos": 23255103980160.0, - "grad_norm": 1.5232420204646802, - "language_loss": 0.66515326, - "learning_rate": 7.648002059507715e-07, - "loss": 0.68635398, - "num_input_tokens_seen": 258604725, - "step": 11984, - "time_per_iteration": 2.6606063842773438 - }, - { - "auxiliary_loss_clip": 0.01102672, - "auxiliary_loss_mlp": 0.01035615, - "balance_loss_clip": 1.03870726, - "balance_loss_mlp": 1.02119064, - "epoch": 0.7205771832256125, - "flos": 20120892900480.0, - "grad_norm": 1.688320312491579, - "language_loss": 0.74081761, - "learning_rate": 7.644939207017771e-07, - "loss": 0.76220047, - "num_input_tokens_seen": 258622885, - "step": 11985, - "time_per_iteration": 2.6758813858032227 - }, - { - "auxiliary_loss_clip": 0.01100706, - "auxiliary_loss_mlp": 0.01032026, - "balance_loss_clip": 1.03882444, - "balance_loss_mlp": 1.01896691, - "epoch": 0.7206373064782805, - "flos": 27703250565120.0, - "grad_norm": 2.1824579845147287, - "language_loss": 0.62681192, - "learning_rate": 7.641876823032977e-07, - "loss": 0.64813924, - "num_input_tokens_seen": 258644305, - "step": 11986, - "time_per_iteration": 2.6787214279174805 - }, - { - "auxiliary_loss_clip": 0.01094506, - "auxiliary_loss_mlp": 0.01035959, - "balance_loss_clip": 1.0400337, - "balance_loss_mlp": 1.02129614, - "epoch": 0.7206974297309484, - "flos": 17968156018560.0, - "grad_norm": 1.6774381581209574, - "language_loss": 0.72387213, - "learning_rate": 7.638814907669455e-07, - "loss": 0.74517679, - "num_input_tokens_seen": 258661775, - "step": 11987, - "time_per_iteration": 2.6494300365448 - }, - { - "auxiliary_loss_clip": 0.01091554, - "auxiliary_loss_mlp": 0.01036778, - "balance_loss_clip": 1.03807402, - "balance_loss_mlp": 1.0230689, - "epoch": 0.7207575529836164, - "flos": 16983162288000.0, - "grad_norm": 2.0154158747708886, - "language_loss": 0.78542352, - "learning_rate": 7.635753461043301e-07, - "loss": 0.80670691, - "num_input_tokens_seen": 258679830, - "step": 11988, - "time_per_iteration": 2.7818825244903564 - }, - { - "auxiliary_loss_clip": 0.01112006, - "auxiliary_loss_mlp": 0.0103674, - "balance_loss_clip": 1.03854907, - "balance_loss_mlp": 1.02319229, - "epoch": 0.7208176762362843, - "flos": 18727594295040.0, - "grad_norm": 2.5683487455576013, - "language_loss": 0.78912222, - "learning_rate": 7.632692483270618e-07, - "loss": 0.8106097, - "num_input_tokens_seen": 258697415, - "step": 11989, - "time_per_iteration": 4.105331659317017 - }, - { - "auxiliary_loss_clip": 0.01110244, - "auxiliary_loss_mlp": 0.01035931, - "balance_loss_clip": 1.03845143, - "balance_loss_mlp": 1.02281189, - "epoch": 0.7208777994889524, - "flos": 18734489706240.0, - "grad_norm": 1.667538370245498, - "language_loss": 0.8218925, - "learning_rate": 7.629631974467481e-07, - "loss": 0.84335428, - "num_input_tokens_seen": 258716755, - "step": 11990, - "time_per_iteration": 2.59250545501709 - }, - { - "auxiliary_loss_clip": 0.01084798, - "auxiliary_loss_mlp": 0.01039501, - "balance_loss_clip": 1.03765297, - "balance_loss_mlp": 1.0263406, - "epoch": 0.7209379227416203, - "flos": 14793437376000.0, - "grad_norm": 2.0017944237848146, - "language_loss": 0.76018798, - "learning_rate": 7.626571934749931e-07, - "loss": 0.78143102, - "num_input_tokens_seen": 258733270, - "step": 11991, - "time_per_iteration": 2.6581742763519287 - }, - { - "auxiliary_loss_clip": 0.01069068, - "auxiliary_loss_mlp": 0.01036802, - "balance_loss_clip": 1.03637481, - "balance_loss_mlp": 1.02277708, - "epoch": 0.7209980459942883, - "flos": 29636860527360.0, - "grad_norm": 1.4417836781723634, - "language_loss": 0.7278806, - "learning_rate": 7.623512364234022e-07, - "loss": 0.74893934, - "num_input_tokens_seen": 258755270, - "step": 11992, - "time_per_iteration": 2.762066602706909 - }, - { - "auxiliary_loss_clip": 0.01101853, - "auxiliary_loss_mlp": 0.01035181, - "balance_loss_clip": 1.03684831, - "balance_loss_mlp": 1.0217396, - "epoch": 0.7210581692469563, - "flos": 23477175815040.0, - "grad_norm": 1.590664380995942, - "language_loss": 0.66213107, - "learning_rate": 7.620453263035755e-07, - "loss": 0.68350136, - "num_input_tokens_seen": 258775340, - "step": 11993, - "time_per_iteration": 2.669746160507202 - }, - { - "auxiliary_loss_clip": 0.01103083, - "auxiliary_loss_mlp": 0.01034792, - "balance_loss_clip": 1.03803623, - "balance_loss_mlp": 1.02193534, - "epoch": 0.7211182924996242, - "flos": 26099839353600.0, - "grad_norm": 3.884072112544962, - "language_loss": 0.65876019, - "learning_rate": 7.61739463127115e-07, - "loss": 0.68013895, - "num_input_tokens_seen": 258794580, - "step": 11994, - "time_per_iteration": 2.6249778270721436 - }, - { - "auxiliary_loss_clip": 0.01103021, - "auxiliary_loss_mlp": 0.01036805, - "balance_loss_clip": 1.03799295, - "balance_loss_mlp": 1.02208841, - "epoch": 0.7211784157522922, - "flos": 17712076982400.0, - "grad_norm": 2.8589170011893006, - "language_loss": 0.67324853, - "learning_rate": 7.614336469056172e-07, - "loss": 0.69464678, - "num_input_tokens_seen": 258812330, - "step": 11995, - "time_per_iteration": 2.5577452182769775 - }, - { - "auxiliary_loss_clip": 0.01084316, - "auxiliary_loss_mlp": 0.01033821, - "balance_loss_clip": 1.03543901, - "balance_loss_mlp": 1.01986206, - "epoch": 0.7212385390049602, - "flos": 24423637230720.0, - "grad_norm": 2.2331481184505537, - "language_loss": 0.79888833, - "learning_rate": 7.6112787765068e-07, - "loss": 0.82006973, - "num_input_tokens_seen": 258831770, - "step": 11996, - "time_per_iteration": 2.6798765659332275 - }, - { - "auxiliary_loss_clip": 0.01112754, - "auxiliary_loss_mlp": 0.01038784, - "balance_loss_clip": 1.03908491, - "balance_loss_mlp": 1.02556992, - "epoch": 0.7212986622576282, - "flos": 28147250580480.0, - "grad_norm": 1.9315796052948224, - "language_loss": 0.81023175, - "learning_rate": 7.60822155373899e-07, - "loss": 0.83174717, - "num_input_tokens_seen": 258849090, - "step": 11997, - "time_per_iteration": 2.656759023666382 - }, - { - "auxiliary_loss_clip": 0.01114647, - "auxiliary_loss_mlp": 0.0103518, - "balance_loss_clip": 1.03915894, - "balance_loss_mlp": 1.02126861, - "epoch": 0.7213587855102961, - "flos": 21835770992640.0, - "grad_norm": 1.8930751745760046, - "language_loss": 0.67190164, - "learning_rate": 7.605164800868646e-07, - "loss": 0.69339991, - "num_input_tokens_seen": 258868230, - "step": 11998, - "time_per_iteration": 2.6269752979278564 - }, - { - "auxiliary_loss_clip": 0.01113247, - "auxiliary_loss_mlp": 0.01032004, - "balance_loss_clip": 1.0402633, - "balance_loss_mlp": 1.01999927, - "epoch": 0.7214189087629641, - "flos": 14611549881600.0, - "grad_norm": 2.2123816168992287, - "language_loss": 0.72197175, - "learning_rate": 7.602108518011696e-07, - "loss": 0.74342418, - "num_input_tokens_seen": 258885525, - "step": 11999, - "time_per_iteration": 2.7030436992645264 - }, - { - "auxiliary_loss_clip": 0.01095225, - "auxiliary_loss_mlp": 0.01030017, - "balance_loss_clip": 1.03975248, - "balance_loss_mlp": 1.01632595, - "epoch": 0.721479032015632, - "flos": 19390864884480.0, - "grad_norm": 2.1896556782870986, - "language_loss": 0.82891619, - "learning_rate": 7.599052705284039e-07, - "loss": 0.85016865, - "num_input_tokens_seen": 258903245, - "step": 12000, - "time_per_iteration": 2.72419810295105 - }, - { - "auxiliary_loss_clip": 0.0110488, - "auxiliary_loss_mlp": 0.01036877, - "balance_loss_clip": 1.04077649, - "balance_loss_mlp": 1.02337074, - "epoch": 0.7215391552683, - "flos": 18512884748160.0, - "grad_norm": 2.238210081957985, - "language_loss": 0.77015889, - "learning_rate": 7.59599736280154e-07, - "loss": 0.79157639, - "num_input_tokens_seen": 258921245, - "step": 12001, - "time_per_iteration": 2.6786983013153076 - }, - { - "auxiliary_loss_clip": 0.01096613, - "auxiliary_loss_mlp": 0.01041613, - "balance_loss_clip": 1.03922153, - "balance_loss_mlp": 1.02826142, - "epoch": 0.721599278520968, - "flos": 23258731253760.0, - "grad_norm": 1.7647688561278618, - "language_loss": 0.81434, - "learning_rate": 7.592942490680066e-07, - "loss": 0.83572221, - "num_input_tokens_seen": 258939425, - "step": 12002, - "time_per_iteration": 2.766787052154541 - }, - { - "auxiliary_loss_clip": 0.01103657, - "auxiliary_loss_mlp": 0.0102914, - "balance_loss_clip": 1.03956521, - "balance_loss_mlp": 1.01506686, - "epoch": 0.721659401773636, - "flos": 39199045979520.0, - "grad_norm": 1.90156490746599, - "language_loss": 0.62442046, - "learning_rate": 7.589888089035462e-07, - "loss": 0.64574844, - "num_input_tokens_seen": 258960710, - "step": 12003, - "time_per_iteration": 2.7572412490844727 - }, - { - "auxiliary_loss_clip": 0.01114647, - "auxiliary_loss_mlp": 0.01033073, - "balance_loss_clip": 1.0397718, - "balance_loss_mlp": 1.019418, - "epoch": 0.7217195250263039, - "flos": 14939917038720.0, - "grad_norm": 2.6609118210523146, - "language_loss": 0.6843828, - "learning_rate": 7.586834157983544e-07, - "loss": 0.70586002, - "num_input_tokens_seen": 258978475, - "step": 12004, - "time_per_iteration": 2.553619623184204 - }, - { - "auxiliary_loss_clip": 0.01013578, - "auxiliary_loss_mlp": 0.01003303, - "balance_loss_clip": 1.01591694, - "balance_loss_mlp": 1.0020926, - "epoch": 0.7217796482789719, - "flos": 70869206666880.0, - "grad_norm": 0.858251890961465, - "language_loss": 0.54091179, - "learning_rate": 7.583780697640112e-07, - "loss": 0.56108057, - "num_input_tokens_seen": 259037520, - "step": 12005, - "time_per_iteration": 3.186676502227783 - }, - { - "auxiliary_loss_clip": 0.0107998, - "auxiliary_loss_mlp": 0.01033092, - "balance_loss_clip": 1.03859079, - "balance_loss_mlp": 1.0192821, - "epoch": 0.7218397715316398, - "flos": 37451525402880.0, - "grad_norm": 1.66711169237072, - "language_loss": 0.63384253, - "learning_rate": 7.580727708120962e-07, - "loss": 0.65497327, - "num_input_tokens_seen": 259061325, - "step": 12006, - "time_per_iteration": 2.8096885681152344 - }, - { - "auxiliary_loss_clip": 0.01084341, - "auxiliary_loss_mlp": 0.01034652, - "balance_loss_clip": 1.03541422, - "balance_loss_mlp": 1.02141964, - "epoch": 0.7218998947843078, - "flos": 22710662559360.0, - "grad_norm": 1.8415091001444905, - "language_loss": 0.91831303, - "learning_rate": 7.577675189541865e-07, - "loss": 0.93950289, - "num_input_tokens_seen": 259078135, - "step": 12007, - "time_per_iteration": 2.636061668395996 - }, - { - "auxiliary_loss_clip": 0.01074819, - "auxiliary_loss_mlp": 0.01038291, - "balance_loss_clip": 1.03386235, - "balance_loss_mlp": 1.02249599, - "epoch": 0.7219600180369758, - "flos": 12167182477440.0, - "grad_norm": 1.9560042300828953, - "language_loss": 0.64139968, - "learning_rate": 7.574623142018568e-07, - "loss": 0.66253078, - "num_input_tokens_seen": 259095910, - "step": 12008, - "time_per_iteration": 2.6658670902252197 - }, - { - "auxiliary_loss_clip": 0.0110234, - "auxiliary_loss_mlp": 0.0103902, - "balance_loss_clip": 1.03860354, - "balance_loss_mlp": 1.02491176, - "epoch": 0.7220201412896438, - "flos": 22596573985920.0, - "grad_norm": 1.9949931171952824, - "language_loss": 0.78768408, - "learning_rate": 7.57157156566681e-07, - "loss": 0.80909771, - "num_input_tokens_seen": 259114225, - "step": 12009, - "time_per_iteration": 2.6496176719665527 - }, - { - "auxiliary_loss_clip": 0.01103715, - "auxiliary_loss_mlp": 0.01040084, - "balance_loss_clip": 1.04009509, - "balance_loss_mlp": 1.02490854, - "epoch": 0.7220802645423118, - "flos": 26718651884160.0, - "grad_norm": 1.8397913257632763, - "language_loss": 0.64088428, - "learning_rate": 7.568520460602297e-07, - "loss": 0.66232234, - "num_input_tokens_seen": 259134660, - "step": 12010, - "time_per_iteration": 2.7039434909820557 - }, - { - "auxiliary_loss_clip": 0.01112341, - "auxiliary_loss_mlp": 0.01028267, - "balance_loss_clip": 1.0384059, - "balance_loss_mlp": 1.01517224, - "epoch": 0.7221403877949797, - "flos": 24420548661120.0, - "grad_norm": 2.031062192481546, - "language_loss": 0.7745133, - "learning_rate": 7.565469826940742e-07, - "loss": 0.79591942, - "num_input_tokens_seen": 259153300, - "step": 12011, - "time_per_iteration": 2.6566684246063232 - }, - { - "auxiliary_loss_clip": 0.01095954, - "auxiliary_loss_mlp": 0.01036039, - "balance_loss_clip": 1.03788853, - "balance_loss_mlp": 1.02336133, - "epoch": 0.7222005110476477, - "flos": 23514379326720.0, - "grad_norm": 2.0943143808042617, - "language_loss": 0.78936207, - "learning_rate": 7.56241966479781e-07, - "loss": 0.81068206, - "num_input_tokens_seen": 259172115, - "step": 12012, - "time_per_iteration": 2.6651875972747803 - }, - { - "auxiliary_loss_clip": 0.0109279, - "auxiliary_loss_mlp": 0.01031271, - "balance_loss_clip": 1.03982329, - "balance_loss_mlp": 1.01809883, - "epoch": 0.7222606343003156, - "flos": 23112538899840.0, - "grad_norm": 1.7259096547472548, - "language_loss": 0.75816202, - "learning_rate": 7.559369974289171e-07, - "loss": 0.77940267, - "num_input_tokens_seen": 259191345, - "step": 12013, - "time_per_iteration": 2.6666300296783447 - }, - { - "auxiliary_loss_clip": 0.01112282, - "auxiliary_loss_mlp": 0.01027778, - "balance_loss_clip": 1.03951406, - "balance_loss_mlp": 1.01493895, - "epoch": 0.7223207575529836, - "flos": 24351169541760.0, - "grad_norm": 1.5900887482073394, - "language_loss": 0.76009625, - "learning_rate": 7.556320755530484e-07, - "loss": 0.78149676, - "num_input_tokens_seen": 259211700, - "step": 12014, - "time_per_iteration": 2.8077309131622314 - }, - { - "auxiliary_loss_clip": 0.01103939, - "auxiliary_loss_mlp": 0.01031964, - "balance_loss_clip": 1.03792763, - "balance_loss_mlp": 1.01870835, - "epoch": 0.7223808808056515, - "flos": 28330179569280.0, - "grad_norm": 1.5772479389327612, - "language_loss": 0.86851835, - "learning_rate": 7.553272008637346e-07, - "loss": 0.88987738, - "num_input_tokens_seen": 259233825, - "step": 12015, - "time_per_iteration": 2.658083915710449 - }, - { - "auxiliary_loss_clip": 0.01099282, - "auxiliary_loss_mlp": 0.01033999, - "balance_loss_clip": 1.0388813, - "balance_loss_mlp": 1.02105308, - "epoch": 0.7224410040583196, - "flos": 21069437304960.0, - "grad_norm": 1.834690814791336, - "language_loss": 0.7801137, - "learning_rate": 7.55022373372538e-07, - "loss": 0.80144656, - "num_input_tokens_seen": 259253055, - "step": 12016, - "time_per_iteration": 2.623483180999756 - }, - { - "auxiliary_loss_clip": 0.01067391, - "auxiliary_loss_mlp": 0.0105171, - "balance_loss_clip": 1.03403831, - "balance_loss_mlp": 1.03612971, - "epoch": 0.7225011273109875, - "flos": 26795429205120.0, - "grad_norm": 1.3753282936745013, - "language_loss": 0.77807558, - "learning_rate": 7.547175930910186e-07, - "loss": 0.79926664, - "num_input_tokens_seen": 259273420, - "step": 12017, - "time_per_iteration": 2.7652459144592285 - }, - { - "auxiliary_loss_clip": 0.01109706, - "auxiliary_loss_mlp": 0.01031666, - "balance_loss_clip": 1.03881669, - "balance_loss_mlp": 1.01943493, - "epoch": 0.7225612505636555, - "flos": 23583578878080.0, - "grad_norm": 1.9142448581528158, - "language_loss": 0.73780286, - "learning_rate": 7.54412860030732e-07, - "loss": 0.75921661, - "num_input_tokens_seen": 259291000, - "step": 12018, - "time_per_iteration": 2.640007495880127 - }, - { - "auxiliary_loss_clip": 0.01084854, - "auxiliary_loss_mlp": 0.01034783, - "balance_loss_clip": 1.04522383, - "balance_loss_mlp": 1.02281451, - "epoch": 0.7226213738163234, - "flos": 20777627214720.0, - "grad_norm": 4.152096025533445, - "language_loss": 0.77579439, - "learning_rate": 7.541081742032347e-07, - "loss": 0.79699075, - "num_input_tokens_seen": 259312390, - "step": 12019, - "time_per_iteration": 2.6887192726135254 - }, - { - "auxiliary_loss_clip": 0.01087897, - "auxiliary_loss_mlp": 0.01029897, - "balance_loss_clip": 1.03979766, - "balance_loss_mlp": 1.01615798, - "epoch": 0.7226814970689914, - "flos": 32635832901120.0, - "grad_norm": 1.8249624922907017, - "language_loss": 0.73749167, - "learning_rate": 7.53803535620081e-07, - "loss": 0.75866961, - "num_input_tokens_seen": 259332645, - "step": 12020, - "time_per_iteration": 2.714838743209839 - }, - { - "auxiliary_loss_clip": 0.01096548, - "auxiliary_loss_mlp": 0.01033021, - "balance_loss_clip": 1.03796768, - "balance_loss_mlp": 1.0203011, - "epoch": 0.7227416203216595, - "flos": 22454368041600.0, - "grad_norm": 1.8291980950612234, - "language_loss": 0.77410042, - "learning_rate": 7.534989442928219e-07, - "loss": 0.79539609, - "num_input_tokens_seen": 259353810, - "step": 12021, - "time_per_iteration": 4.313388347625732 - }, - { - "auxiliary_loss_clip": 0.01074387, - "auxiliary_loss_mlp": 0.0103505, - "balance_loss_clip": 1.03570378, - "balance_loss_mlp": 1.02155018, - "epoch": 0.7228017435743274, - "flos": 21652303299840.0, - "grad_norm": 1.8872518659613802, - "language_loss": 0.68324184, - "learning_rate": 7.531944002330073e-07, - "loss": 0.70433629, - "num_input_tokens_seen": 259372460, - "step": 12022, - "time_per_iteration": 2.7648468017578125 - }, - { - "auxiliary_loss_clip": 0.01102722, - "auxiliary_loss_mlp": 0.0103106, - "balance_loss_clip": 1.03769839, - "balance_loss_mlp": 1.01741076, - "epoch": 0.7228618668269954, - "flos": 29533474206720.0, - "grad_norm": 1.7890580535020497, - "language_loss": 0.69560903, - "learning_rate": 7.528899034521858e-07, - "loss": 0.71694684, - "num_input_tokens_seen": 259393275, - "step": 12023, - "time_per_iteration": 5.942451000213623 - }, - { - "auxiliary_loss_clip": 0.01082247, - "auxiliary_loss_mlp": 0.01030033, - "balance_loss_clip": 1.03305829, - "balance_loss_mlp": 1.0162704, - "epoch": 0.7229219900796633, - "flos": 27453815544960.0, - "grad_norm": 1.630981256405689, - "language_loss": 0.71236169, - "learning_rate": 7.525854539619052e-07, - "loss": 0.73348451, - "num_input_tokens_seen": 259416205, - "step": 12024, - "time_per_iteration": 2.673879861831665 - }, - { - "auxiliary_loss_clip": 0.01079579, - "auxiliary_loss_mlp": 0.01035111, - "balance_loss_clip": 1.0382725, - "balance_loss_mlp": 1.02249229, - "epoch": 0.7229821133323313, - "flos": 16289368116480.0, - "grad_norm": 2.2051730456809655, - "language_loss": 0.75628078, - "learning_rate": 7.522810517737089e-07, - "loss": 0.77742761, - "num_input_tokens_seen": 259433115, - "step": 12025, - "time_per_iteration": 2.7355802059173584 - }, - { - "auxiliary_loss_clip": 0.01099666, - "auxiliary_loss_mlp": 0.01030116, - "balance_loss_clip": 1.03707576, - "balance_loss_mlp": 1.01740193, - "epoch": 0.7230422365849992, - "flos": 20412343854720.0, - "grad_norm": 2.068852797373043, - "language_loss": 0.76397157, - "learning_rate": 7.519766968991395e-07, - "loss": 0.78526938, - "num_input_tokens_seen": 259450475, - "step": 12026, - "time_per_iteration": 2.6082088947296143 - }, - { - "auxiliary_loss_clip": 0.01102144, - "auxiliary_loss_mlp": 0.01042375, - "balance_loss_clip": 1.0383482, - "balance_loss_mlp": 1.02952373, - "epoch": 0.7231023598376672, - "flos": 25593499284480.0, - "grad_norm": 1.9477752433448912, - "language_loss": 0.6773926, - "learning_rate": 7.516723893497388e-07, - "loss": 0.69883776, - "num_input_tokens_seen": 259469355, - "step": 12027, - "time_per_iteration": 2.6620283126831055 - }, - { - "auxiliary_loss_clip": 0.01062411, - "auxiliary_loss_mlp": 0.01030772, - "balance_loss_clip": 1.0401032, - "balance_loss_mlp": 1.0175457, - "epoch": 0.7231624830903352, - "flos": 25149607009920.0, - "grad_norm": 2.2693920109033403, - "language_loss": 0.79310131, - "learning_rate": 7.513681291370469e-07, - "loss": 0.81403315, - "num_input_tokens_seen": 259486565, - "step": 12028, - "time_per_iteration": 4.312790870666504 - }, - { - "auxiliary_loss_clip": 0.01071831, - "auxiliary_loss_mlp": 0.01030546, - "balance_loss_clip": 1.03564012, - "balance_loss_mlp": 1.01683056, - "epoch": 0.7232226063430032, - "flos": 21725740656000.0, - "grad_norm": 1.7649088716190047, - "language_loss": 0.8226198, - "learning_rate": 7.510639162726e-07, - "loss": 0.84364355, - "num_input_tokens_seen": 259505070, - "step": 12029, - "time_per_iteration": 2.6882169246673584 - }, - { - "auxiliary_loss_clip": 0.01012512, - "auxiliary_loss_mlp": 0.01001695, - "balance_loss_clip": 1.01107883, - "balance_loss_mlp": 1.00054455, - "epoch": 0.7232827295956711, - "flos": 68436798491520.0, - "grad_norm": 0.8099058839034723, - "language_loss": 0.61733758, - "learning_rate": 7.507597507679347e-07, - "loss": 0.63747966, - "num_input_tokens_seen": 259569135, - "step": 12030, - "time_per_iteration": 3.252488136291504 - }, - { - "auxiliary_loss_clip": 0.01094272, - "auxiliary_loss_mlp": 0.01037532, - "balance_loss_clip": 1.03575993, - "balance_loss_mlp": 1.02277446, - "epoch": 0.7233428528483391, - "flos": 20192642317440.0, - "grad_norm": 1.6655467134622794, - "language_loss": 0.77807963, - "learning_rate": 7.504556326345859e-07, - "loss": 0.79939759, - "num_input_tokens_seen": 259587035, - "step": 12031, - "time_per_iteration": 2.6133508682250977 - }, - { - "auxiliary_loss_clip": 0.01102197, - "auxiliary_loss_mlp": 0.01030343, - "balance_loss_clip": 1.0374577, - "balance_loss_mlp": 1.01696777, - "epoch": 0.723402976101007, - "flos": 23949472769280.0, - "grad_norm": 1.9738785195921462, - "language_loss": 0.81575108, - "learning_rate": 7.501515618840834e-07, - "loss": 0.83707643, - "num_input_tokens_seen": 259606140, - "step": 12032, - "time_per_iteration": 2.7112133502960205 - }, - { - "auxiliary_loss_clip": 0.01075376, - "auxiliary_loss_mlp": 0.01037925, - "balance_loss_clip": 1.03567076, - "balance_loss_mlp": 1.02435255, - "epoch": 0.723463099353675, - "flos": 20813394182400.0, - "grad_norm": 1.776312475495692, - "language_loss": 0.75339031, - "learning_rate": 7.498475385279592e-07, - "loss": 0.77452338, - "num_input_tokens_seen": 259624275, - "step": 12033, - "time_per_iteration": 2.718799114227295 - }, - { - "auxiliary_loss_clip": 0.01077923, - "auxiliary_loss_mlp": 0.01029541, - "balance_loss_clip": 1.03677177, - "balance_loss_mlp": 1.01704192, - "epoch": 0.723523222606343, - "flos": 19098013299840.0, - "grad_norm": 1.7129862588080287, - "language_loss": 0.75157291, - "learning_rate": 7.495435625777423e-07, - "loss": 0.7726475, - "num_input_tokens_seen": 259643465, - "step": 12034, - "time_per_iteration": 2.6831793785095215 - }, - { - "auxiliary_loss_clip": 0.01089243, - "auxiliary_loss_mlp": 0.01032235, - "balance_loss_clip": 1.03759241, - "balance_loss_mlp": 1.01996899, - "epoch": 0.723583345859011, - "flos": 26506994993280.0, - "grad_norm": 1.842898991016843, - "language_loss": 0.80809641, - "learning_rate": 7.492396340449578e-07, - "loss": 0.82931113, - "num_input_tokens_seen": 259662500, - "step": 12035, - "time_per_iteration": 2.695371627807617 - }, - { - "auxiliary_loss_clip": 0.01050925, - "auxiliary_loss_mlp": 0.01037786, - "balance_loss_clip": 1.03530586, - "balance_loss_mlp": 1.0243392, - "epoch": 0.723643469111679, - "flos": 16033863697920.0, - "grad_norm": 2.241481195422046, - "language_loss": 0.61241198, - "learning_rate": 7.489357529411326e-07, - "loss": 0.63329911, - "num_input_tokens_seen": 259680140, - "step": 12036, - "time_per_iteration": 2.809441566467285 - }, - { - "auxiliary_loss_clip": 0.01095223, - "auxiliary_loss_mlp": 0.01037212, - "balance_loss_clip": 1.03603697, - "balance_loss_mlp": 1.02554715, - "epoch": 0.7237035923643469, - "flos": 21945549934080.0, - "grad_norm": 1.6262385259954, - "language_loss": 0.67594683, - "learning_rate": 7.486319192777883e-07, - "loss": 0.69727111, - "num_input_tokens_seen": 259700160, - "step": 12037, - "time_per_iteration": 2.7354328632354736 - }, - { - "auxiliary_loss_clip": 0.0111287, - "auxiliary_loss_mlp": 0.01037592, - "balance_loss_clip": 1.03997326, - "balance_loss_mlp": 1.02422309, - "epoch": 0.7237637156170149, - "flos": 23583112001280.0, - "grad_norm": 2.066772048559837, - "language_loss": 0.72353923, - "learning_rate": 7.483281330664479e-07, - "loss": 0.74504387, - "num_input_tokens_seen": 259720525, - "step": 12038, - "time_per_iteration": 2.704622983932495 - }, - { - "auxiliary_loss_clip": 0.01111581, - "auxiliary_loss_mlp": 0.01034396, - "balance_loss_clip": 1.0390476, - "balance_loss_mlp": 1.02059746, - "epoch": 0.7238238388696828, - "flos": 20594698225920.0, - "grad_norm": 1.734011651040034, - "language_loss": 0.72293609, - "learning_rate": 7.480243943186293e-07, - "loss": 0.74439585, - "num_input_tokens_seen": 259738680, - "step": 12039, - "time_per_iteration": 2.6200029850006104 - }, - { - "auxiliary_loss_clip": 0.01112988, - "auxiliary_loss_mlp": 0.01033651, - "balance_loss_clip": 1.03924608, - "balance_loss_mlp": 1.02135432, - "epoch": 0.7238839621223508, - "flos": 24207024263040.0, - "grad_norm": 1.7505923285041294, - "language_loss": 0.76183081, - "learning_rate": 7.477207030458513e-07, - "loss": 0.78329718, - "num_input_tokens_seen": 259758790, - "step": 12040, - "time_per_iteration": 2.560269832611084 - }, - { - "auxiliary_loss_clip": 0.01079576, - "auxiliary_loss_mlp": 0.01035383, - "balance_loss_clip": 1.03573811, - "balance_loss_mlp": 1.0221684, - "epoch": 0.7239440853750188, - "flos": 14209745368320.0, - "grad_norm": 2.0682435916617075, - "language_loss": 0.7625649, - "learning_rate": 7.474170592596301e-07, - "loss": 0.78371453, - "num_input_tokens_seen": 259777370, - "step": 12041, - "time_per_iteration": 2.714940309524536 - }, - { - "auxiliary_loss_clip": 0.01102621, - "auxiliary_loss_mlp": 0.01029545, - "balance_loss_clip": 1.0374378, - "balance_loss_mlp": 1.01699817, - "epoch": 0.7240042086276868, - "flos": 21614812479360.0, - "grad_norm": 2.6117170122590636, - "language_loss": 0.63805127, - "learning_rate": 7.471134629714797e-07, - "loss": 0.65937293, - "num_input_tokens_seen": 259794665, - "step": 12042, - "time_per_iteration": 2.6314237117767334 - }, - { - "auxiliary_loss_clip": 0.01075777, - "auxiliary_loss_mlp": 0.01034099, - "balance_loss_clip": 1.03741169, - "balance_loss_mlp": 1.02075338, - "epoch": 0.7240643318803547, - "flos": 23331450337920.0, - "grad_norm": 1.8128616053031077, - "language_loss": 0.83376384, - "learning_rate": 7.468099141929116e-07, - "loss": 0.85486257, - "num_input_tokens_seen": 259811110, - "step": 12043, - "time_per_iteration": 2.676255226135254 - }, - { - "auxiliary_loss_clip": 0.01079486, - "auxiliary_loss_mlp": 0.01030427, - "balance_loss_clip": 1.03760707, - "balance_loss_mlp": 1.01697433, - "epoch": 0.7241244551330227, - "flos": 24024849459840.0, - "grad_norm": 1.7443833351104767, - "language_loss": 0.64167023, - "learning_rate": 7.465064129354379e-07, - "loss": 0.66276932, - "num_input_tokens_seen": 259831080, - "step": 12044, - "time_per_iteration": 2.7761828899383545 - }, - { - "auxiliary_loss_clip": 0.0111317, - "auxiliary_loss_mlp": 0.01032715, - "balance_loss_clip": 1.04010856, - "balance_loss_mlp": 1.01904798, - "epoch": 0.7241845783856906, - "flos": 18730323728640.0, - "grad_norm": 1.9383242043113957, - "language_loss": 0.81468868, - "learning_rate": 7.462029592105658e-07, - "loss": 0.83614755, - "num_input_tokens_seen": 259850135, - "step": 12045, - "time_per_iteration": 2.5996835231781006 - }, - { - "auxiliary_loss_clip": 0.01108154, - "auxiliary_loss_mlp": 0.01032292, - "balance_loss_clip": 1.03746927, - "balance_loss_mlp": 1.01956022, - "epoch": 0.7242447016383586, - "flos": 19498668577920.0, - "grad_norm": 1.5954644621567537, - "language_loss": 0.71763444, - "learning_rate": 7.458995530298034e-07, - "loss": 0.73903888, - "num_input_tokens_seen": 259868185, - "step": 12046, - "time_per_iteration": 2.5615580081939697 - }, - { - "auxiliary_loss_clip": 0.01075175, - "auxiliary_loss_mlp": 0.01033472, - "balance_loss_clip": 1.03313971, - "balance_loss_mlp": 1.01897645, - "epoch": 0.7243048248910267, - "flos": 22163491704960.0, - "grad_norm": 2.0910154490498125, - "language_loss": 0.71177173, - "learning_rate": 7.455961944046553e-07, - "loss": 0.73285818, - "num_input_tokens_seen": 259887055, - "step": 12047, - "time_per_iteration": 2.700878381729126 - }, - { - "auxiliary_loss_clip": 0.01086391, - "auxiliary_loss_mlp": 0.01041787, - "balance_loss_clip": 1.03794575, - "balance_loss_mlp": 1.02800667, - "epoch": 0.7243649481436946, - "flos": 27672762896640.0, - "grad_norm": 1.5839782384796177, - "language_loss": 0.70204568, - "learning_rate": 7.45292883346627e-07, - "loss": 0.72332752, - "num_input_tokens_seen": 259908295, - "step": 12048, - "time_per_iteration": 2.690060615539551 - }, - { - "auxiliary_loss_clip": 0.01011684, - "auxiliary_loss_mlp": 0.01004259, - "balance_loss_clip": 1.00705278, - "balance_loss_mlp": 1.0028162, - "epoch": 0.7244250713963626, - "flos": 63244545759360.0, - "grad_norm": 0.8298796504425336, - "language_loss": 0.53679693, - "learning_rate": 7.449896198672168e-07, - "loss": 0.55695641, - "num_input_tokens_seen": 259968475, - "step": 12049, - "time_per_iteration": 3.2119057178497314 - }, - { - "auxiliary_loss_clip": 0.01088982, - "auxiliary_loss_mlp": 0.01032865, - "balance_loss_clip": 1.03676033, - "balance_loss_mlp": 1.01766598, - "epoch": 0.7244851946490305, - "flos": 17967114524160.0, - "grad_norm": 2.0687483221381897, - "language_loss": 0.59396434, - "learning_rate": 7.446864039779258e-07, - "loss": 0.61518282, - "num_input_tokens_seen": 259984865, - "step": 12050, - "time_per_iteration": 2.632354736328125 - }, - { - "auxiliary_loss_clip": 0.0099629, - "auxiliary_loss_mlp": 0.01011839, - "balance_loss_clip": 1.0111258, - "balance_loss_mlp": 1.01062906, - "epoch": 0.7245453179016985, - "flos": 70943649603840.0, - "grad_norm": 0.7230865999860119, - "language_loss": 0.53218287, - "learning_rate": 7.443832356902528e-07, - "loss": 0.55226415, - "num_input_tokens_seen": 260046735, - "step": 12051, - "time_per_iteration": 3.2180604934692383 - }, - { - "auxiliary_loss_clip": 0.01097618, - "auxiliary_loss_mlp": 0.01032159, - "balance_loss_clip": 1.03679287, - "balance_loss_mlp": 1.02010143, - "epoch": 0.7246054411543664, - "flos": 24568464867840.0, - "grad_norm": 1.7070120237628115, - "language_loss": 0.72170782, - "learning_rate": 7.440801150156927e-07, - "loss": 0.74300563, - "num_input_tokens_seen": 260067950, - "step": 12052, - "time_per_iteration": 2.6380202770233154 - }, - { - "auxiliary_loss_clip": 0.01099407, - "auxiliary_loss_mlp": 0.01034919, - "balance_loss_clip": 1.03736925, - "balance_loss_mlp": 1.01992285, - "epoch": 0.7246655644070344, - "flos": 32338312548480.0, - "grad_norm": 1.8571757187229716, - "language_loss": 0.74080825, - "learning_rate": 7.437770419657415e-07, - "loss": 0.76215148, - "num_input_tokens_seen": 260087730, - "step": 12053, - "time_per_iteration": 2.691523790359497 - }, - { - "auxiliary_loss_clip": 0.01072566, - "auxiliary_loss_mlp": 0.0103532, - "balance_loss_clip": 1.03622317, - "balance_loss_mlp": 1.02119958, - "epoch": 0.7247256876597024, - "flos": 21872471713920.0, - "grad_norm": 1.7294141781477532, - "language_loss": 0.78110063, - "learning_rate": 7.434740165518898e-07, - "loss": 0.80217946, - "num_input_tokens_seen": 260107760, - "step": 12054, - "time_per_iteration": 2.658952236175537 - }, - { - "auxiliary_loss_clip": 0.01077648, - "auxiliary_loss_mlp": 0.01035486, - "balance_loss_clip": 1.03661764, - "balance_loss_mlp": 1.02215791, - "epoch": 0.7247858109123704, - "flos": 16213093585920.0, - "grad_norm": 2.4200013582642437, - "language_loss": 0.67830694, - "learning_rate": 7.431710387856301e-07, - "loss": 0.69943827, - "num_input_tokens_seen": 260123660, - "step": 12055, - "time_per_iteration": 2.646244525909424 - }, - { - "auxiliary_loss_clip": 0.01080369, - "auxiliary_loss_mlp": 0.01036731, - "balance_loss_clip": 1.03789568, - "balance_loss_mlp": 1.02451193, - "epoch": 0.7248459341650383, - "flos": 20850705434880.0, - "grad_norm": 1.6702264045613682, - "language_loss": 0.74097568, - "learning_rate": 7.428681086784496e-07, - "loss": 0.76214665, - "num_input_tokens_seen": 260142690, - "step": 12056, - "time_per_iteration": 2.7628982067108154 - }, - { - "auxiliary_loss_clip": 0.01108663, - "auxiliary_loss_mlp": 0.010276, - "balance_loss_clip": 1.03835511, - "balance_loss_mlp": 1.01454699, - "epoch": 0.7249060574177063, - "flos": 25921794614400.0, - "grad_norm": 1.66863868022831, - "language_loss": 0.70870286, - "learning_rate": 7.425652262418368e-07, - "loss": 0.73006552, - "num_input_tokens_seen": 260162590, - "step": 12057, - "time_per_iteration": 2.71063232421875 - }, - { - "auxiliary_loss_clip": 0.01058179, - "auxiliary_loss_mlp": 0.01044744, - "balance_loss_clip": 1.03556621, - "balance_loss_mlp": 1.03009939, - "epoch": 0.7249661806703742, - "flos": 17345536646400.0, - "grad_norm": 1.8439836916669041, - "language_loss": 0.6237672, - "learning_rate": 7.42262391487277e-07, - "loss": 0.64479643, - "num_input_tokens_seen": 260181065, - "step": 12058, - "time_per_iteration": 2.8430051803588867 - }, - { - "auxiliary_loss_clip": 0.01070122, - "auxiliary_loss_mlp": 0.01031873, - "balance_loss_clip": 1.03506172, - "balance_loss_mlp": 1.01852131, - "epoch": 0.7250263039230422, - "flos": 19574153009280.0, - "grad_norm": 1.8897334856820058, - "language_loss": 0.74905157, - "learning_rate": 7.419596044262535e-07, - "loss": 0.77007163, - "num_input_tokens_seen": 260200330, - "step": 12059, - "time_per_iteration": 2.832826614379883 - }, - { - "auxiliary_loss_clip": 0.01098356, - "auxiliary_loss_mlp": 0.01033461, - "balance_loss_clip": 1.03746486, - "balance_loss_mlp": 1.02145672, - "epoch": 0.7250864271757103, - "flos": 21976648133760.0, - "grad_norm": 1.8419617438371911, - "language_loss": 0.79300022, - "learning_rate": 7.416568650702472e-07, - "loss": 0.81431836, - "num_input_tokens_seen": 260219975, - "step": 12060, - "time_per_iteration": 4.281320095062256 - }, - { - "auxiliary_loss_clip": 0.01100606, - "auxiliary_loss_mlp": 0.01026628, - "balance_loss_clip": 1.03860307, - "balance_loss_mlp": 1.01334846, - "epoch": 0.7251465504283782, - "flos": 25012608537600.0, - "grad_norm": 1.7927785216248016, - "language_loss": 0.76260906, - "learning_rate": 7.413541734307393e-07, - "loss": 0.78388143, - "num_input_tokens_seen": 260242025, - "step": 12061, - "time_per_iteration": 2.748656749725342 - }, - { - "auxiliary_loss_clip": 0.01108857, - "auxiliary_loss_mlp": 0.00769754, - "balance_loss_clip": 1.03873777, - "balance_loss_mlp": 1.00011206, - "epoch": 0.7252066736810462, - "flos": 16690131135360.0, - "grad_norm": 1.7879167066361221, - "language_loss": 0.81589133, - "learning_rate": 7.410515295192068e-07, - "loss": 0.83467746, - "num_input_tokens_seen": 260260015, - "step": 12062, - "time_per_iteration": 4.1720802783966064 - }, - { - "auxiliary_loss_clip": 0.0106197, - "auxiliary_loss_mlp": 0.01035204, - "balance_loss_clip": 1.03478372, - "balance_loss_mlp": 1.0198977, - "epoch": 0.7252667969337141, - "flos": 25703026830720.0, - "grad_norm": 2.017910234455411, - "language_loss": 0.69402146, - "learning_rate": 7.407489333471262e-07, - "loss": 0.71499324, - "num_input_tokens_seen": 260278635, - "step": 12063, - "time_per_iteration": 4.450777769088745 - }, - { - "auxiliary_loss_clip": 0.01076449, - "auxiliary_loss_mlp": 0.01034693, - "balance_loss_clip": 1.03741121, - "balance_loss_mlp": 1.02178848, - "epoch": 0.7253269201863821, - "flos": 18259930195200.0, - "grad_norm": 1.4900878050946833, - "language_loss": 0.69918656, - "learning_rate": 7.40446384925973e-07, - "loss": 0.72029793, - "num_input_tokens_seen": 260298510, - "step": 12064, - "time_per_iteration": 2.7114603519439697 - }, - { - "auxiliary_loss_clip": 0.01091634, - "auxiliary_loss_mlp": 0.01035443, - "balance_loss_clip": 1.03896451, - "balance_loss_mlp": 1.02210331, - "epoch": 0.72538704343905, - "flos": 20411805150720.0, - "grad_norm": 1.7705588276559046, - "language_loss": 0.90465009, - "learning_rate": 7.401438842672192e-07, - "loss": 0.92592084, - "num_input_tokens_seen": 260317405, - "step": 12065, - "time_per_iteration": 2.723996877670288 - }, - { - "auxiliary_loss_clip": 0.01020643, - "auxiliary_loss_mlp": 0.01001515, - "balance_loss_clip": 1.00699556, - "balance_loss_mlp": 1.00026369, - "epoch": 0.725447166691718, - "flos": 70151209706880.0, - "grad_norm": 0.6554583314348987, - "language_loss": 0.56083691, - "learning_rate": 7.398414313823349e-07, - "loss": 0.58105844, - "num_input_tokens_seen": 260388085, - "step": 12066, - "time_per_iteration": 3.332350254058838 - }, - { - "auxiliary_loss_clip": 0.01062291, - "auxiliary_loss_mlp": 0.01030469, - "balance_loss_clip": 1.03549218, - "balance_loss_mlp": 1.01799369, - "epoch": 0.725507289944386, - "flos": 27052334254080.0, - "grad_norm": 1.7495752784177439, - "language_loss": 0.76740146, - "learning_rate": 7.395390262827897e-07, - "loss": 0.78832901, - "num_input_tokens_seen": 260406165, - "step": 12067, - "time_per_iteration": 2.815978765487671 - }, - { - "auxiliary_loss_clip": 0.0101369, - "auxiliary_loss_mlp": 0.01006237, - "balance_loss_clip": 1.01036, - "balance_loss_mlp": 1.0050863, - "epoch": 0.725567413197054, - "flos": 62921924778240.0, - "grad_norm": 0.722755917983848, - "language_loss": 0.56971467, - "learning_rate": 7.392366689800515e-07, - "loss": 0.58991396, - "num_input_tokens_seen": 260461365, - "step": 12068, - "time_per_iteration": 4.744567394256592 - }, - { - "auxiliary_loss_clip": 0.0099354, - "auxiliary_loss_mlp": 0.01007822, - "balance_loss_clip": 1.00846553, - "balance_loss_mlp": 1.00654685, - "epoch": 0.7256275364497219, - "flos": 60295957188480.0, - "grad_norm": 0.663737486882956, - "language_loss": 0.55370045, - "learning_rate": 7.389343594855848e-07, - "loss": 0.57371408, - "num_input_tokens_seen": 260523795, - "step": 12069, - "time_per_iteration": 3.275995969772339 - }, - { - "auxiliary_loss_clip": 0.01077438, - "auxiliary_loss_mlp": 0.01027102, - "balance_loss_clip": 1.03855562, - "balance_loss_mlp": 1.01507938, - "epoch": 0.7256876597023899, - "flos": 24498511130880.0, - "grad_norm": 1.6562852272905184, - "language_loss": 0.79984176, - "learning_rate": 7.38632097810854e-07, - "loss": 0.82088709, - "num_input_tokens_seen": 260544765, - "step": 12070, - "time_per_iteration": 2.806398391723633 - }, - { - "auxiliary_loss_clip": 0.01083416, - "auxiliary_loss_mlp": 0.01036302, - "balance_loss_clip": 1.03607607, - "balance_loss_mlp": 1.02395165, - "epoch": 0.7257477829550578, - "flos": 24352749740160.0, - "grad_norm": 1.8683198427961691, - "language_loss": 0.71817708, - "learning_rate": 7.383298839673197e-07, - "loss": 0.73937428, - "num_input_tokens_seen": 260564340, - "step": 12071, - "time_per_iteration": 2.7380881309509277 - }, - { - "auxiliary_loss_clip": 0.01108781, - "auxiliary_loss_mlp": 0.01039283, - "balance_loss_clip": 1.03857553, - "balance_loss_mlp": 1.02693939, - "epoch": 0.7258079062077258, - "flos": 17202217380480.0, - "grad_norm": 2.1132155235444183, - "language_loss": 0.70214903, - "learning_rate": 7.380277179664436e-07, - "loss": 0.72362965, - "num_input_tokens_seen": 260582565, - "step": 12072, - "time_per_iteration": 2.639300584793091 - }, - { - "auxiliary_loss_clip": 0.01075383, - "auxiliary_loss_mlp": 0.01033628, - "balance_loss_clip": 1.03398466, - "balance_loss_mlp": 1.01966858, - "epoch": 0.7258680294603939, - "flos": 21580338401280.0, - "grad_norm": 1.7211132025466964, - "language_loss": 0.78522944, - "learning_rate": 7.377255998196821e-07, - "loss": 0.80631953, - "num_input_tokens_seen": 260601700, - "step": 12073, - "time_per_iteration": 2.707505226135254 - }, - { - "auxiliary_loss_clip": 0.01089188, - "auxiliary_loss_mlp": 0.01031416, - "balance_loss_clip": 1.03761029, - "balance_loss_mlp": 1.0188278, - "epoch": 0.7259281527130618, - "flos": 34855399036800.0, - "grad_norm": 1.5601813308837964, - "language_loss": 0.70586532, - "learning_rate": 7.374235295384923e-07, - "loss": 0.72707134, - "num_input_tokens_seen": 260623040, - "step": 12074, - "time_per_iteration": 2.7605321407318115 - }, - { - "auxiliary_loss_clip": 0.01089374, - "auxiliary_loss_mlp": 0.01031427, - "balance_loss_clip": 1.03541577, - "balance_loss_mlp": 1.01786137, - "epoch": 0.7259882759657298, - "flos": 25404644551680.0, - "grad_norm": 1.7787306902519031, - "language_loss": 0.74126077, - "learning_rate": 7.371215071343302e-07, - "loss": 0.76246876, - "num_input_tokens_seen": 260642735, - "step": 12075, - "time_per_iteration": 2.809924840927124 - }, - { - "auxiliary_loss_clip": 0.01102235, - "auxiliary_loss_mlp": 0.01037145, - "balance_loss_clip": 1.03854585, - "balance_loss_mlp": 1.02345967, - "epoch": 0.7260483992183977, - "flos": 62953630531200.0, - "grad_norm": 2.761502875821282, - "language_loss": 0.63991046, - "learning_rate": 7.368195326186458e-07, - "loss": 0.6613043, - "num_input_tokens_seen": 260669935, - "step": 12076, - "time_per_iteration": 3.073396921157837 - }, - { - "auxiliary_loss_clip": 0.01073377, - "auxiliary_loss_mlp": 0.01030745, - "balance_loss_clip": 1.03426909, - "balance_loss_mlp": 1.01711977, - "epoch": 0.7261085224710657, - "flos": 26467528924800.0, - "grad_norm": 1.967529180708395, - "language_loss": 0.78661555, - "learning_rate": 7.365176060028912e-07, - "loss": 0.80765676, - "num_input_tokens_seen": 260689605, - "step": 12077, - "time_per_iteration": 2.748734712600708 - }, - { - "auxiliary_loss_clip": 0.01030217, - "auxiliary_loss_mlp": 0.00751512, - "balance_loss_clip": 1.00731969, - "balance_loss_mlp": 0.99968779, - "epoch": 0.7261686457237336, - "flos": 66772732187520.0, - "grad_norm": 0.8834354289567558, - "language_loss": 0.64973843, - "learning_rate": 7.362157272985163e-07, - "loss": 0.66755569, - "num_input_tokens_seen": 260748265, - "step": 12078, - "time_per_iteration": 3.1502130031585693 - }, - { - "auxiliary_loss_clip": 0.01023011, - "auxiliary_loss_mlp": 0.01002876, - "balance_loss_clip": 1.00983262, - "balance_loss_mlp": 1.00162983, - "epoch": 0.7262287689764017, - "flos": 69999594399360.0, - "grad_norm": 0.7148369654201937, - "language_loss": 0.59227604, - "learning_rate": 7.359138965169671e-07, - "loss": 0.61253494, - "num_input_tokens_seen": 260816715, - "step": 12079, - "time_per_iteration": 3.2680857181549072 - }, - { - "auxiliary_loss_clip": 0.01064199, - "auxiliary_loss_mlp": 0.01033019, - "balance_loss_clip": 1.03485882, - "balance_loss_mlp": 1.01984644, - "epoch": 0.7262888922290696, - "flos": 23805435231360.0, - "grad_norm": 2.2028126662157383, - "language_loss": 0.64762789, - "learning_rate": 7.356121136696895e-07, - "loss": 0.66860008, - "num_input_tokens_seen": 260836765, - "step": 12080, - "time_per_iteration": 2.718738317489624 - }, - { - "auxiliary_loss_clip": 0.01064639, - "auxiliary_loss_mlp": 0.01029211, - "balance_loss_clip": 1.03282523, - "balance_loss_mlp": 1.01555538, - "epoch": 0.7263490154817376, - "flos": 19500320603520.0, - "grad_norm": 2.4686396191281235, - "language_loss": 0.69309068, - "learning_rate": 7.35310378768128e-07, - "loss": 0.71402919, - "num_input_tokens_seen": 260854610, - "step": 12081, - "time_per_iteration": 2.869288444519043 - }, - { - "auxiliary_loss_clip": 0.01114886, - "auxiliary_loss_mlp": 0.01031024, - "balance_loss_clip": 1.04031432, - "balance_loss_mlp": 1.01794684, - "epoch": 0.7264091387344055, - "flos": 16286243633280.0, - "grad_norm": 1.842145300936274, - "language_loss": 0.81440926, - "learning_rate": 7.350086918237237e-07, - "loss": 0.83586842, - "num_input_tokens_seen": 260871620, - "step": 12082, - "time_per_iteration": 2.558000087738037 - }, - { - "auxiliary_loss_clip": 0.01104122, - "auxiliary_loss_mlp": 0.01037212, - "balance_loss_clip": 1.0367763, - "balance_loss_mlp": 1.02259684, - "epoch": 0.7264692619870735, - "flos": 24352031468160.0, - "grad_norm": 1.7329186007952004, - "language_loss": 0.77324694, - "learning_rate": 7.347070528479158e-07, - "loss": 0.79466033, - "num_input_tokens_seen": 260890490, - "step": 12083, - "time_per_iteration": 2.707674741744995 - }, - { - "auxiliary_loss_clip": 0.01114141, - "auxiliary_loss_mlp": 0.01032066, - "balance_loss_clip": 1.04018736, - "balance_loss_mlp": 1.01889968, - "epoch": 0.7265293852397414, - "flos": 25119478477440.0, - "grad_norm": 1.8409046940193436, - "language_loss": 0.73034543, - "learning_rate": 7.344054618521433e-07, - "loss": 0.75180745, - "num_input_tokens_seen": 260909700, - "step": 12084, - "time_per_iteration": 2.656688928604126 - }, - { - "auxiliary_loss_clip": 0.01114376, - "auxiliary_loss_mlp": 0.01036848, - "balance_loss_clip": 1.03960419, - "balance_loss_mlp": 1.02362156, - "epoch": 0.7265895084924094, - "flos": 22638230784000.0, - "grad_norm": 3.047460171891373, - "language_loss": 0.7778368, - "learning_rate": 7.34103918847843e-07, - "loss": 0.79934901, - "num_input_tokens_seen": 260929090, - "step": 12085, - "time_per_iteration": 2.645911693572998 - }, - { - "auxiliary_loss_clip": 0.01099641, - "auxiliary_loss_mlp": 0.01034439, - "balance_loss_clip": 1.03661323, - "balance_loss_mlp": 1.02154636, - "epoch": 0.7266496317450775, - "flos": 23368222886400.0, - "grad_norm": 1.5977221637963412, - "language_loss": 0.72068805, - "learning_rate": 7.338024238464493e-07, - "loss": 0.74202883, - "num_input_tokens_seen": 260946615, - "step": 12086, - "time_per_iteration": 2.6855533123016357 - }, - { - "auxiliary_loss_clip": 0.01073096, - "auxiliary_loss_mlp": 0.01041748, - "balance_loss_clip": 1.03401077, - "balance_loss_mlp": 1.02729964, - "epoch": 0.7267097549977454, - "flos": 28074603323520.0, - "grad_norm": 1.6297510133590103, - "language_loss": 0.6963405, - "learning_rate": 7.335009768593938e-07, - "loss": 0.71748894, - "num_input_tokens_seen": 260968515, - "step": 12087, - "time_per_iteration": 2.8121585845947266 - }, - { - "auxiliary_loss_clip": 0.01115392, - "auxiliary_loss_mlp": 0.01035484, - "balance_loss_clip": 1.04074097, - "balance_loss_mlp": 1.02153099, - "epoch": 0.7267698782504134, - "flos": 22195523658240.0, - "grad_norm": 5.160414648565969, - "language_loss": 0.79164052, - "learning_rate": 7.331995778981088e-07, - "loss": 0.81314927, - "num_input_tokens_seen": 260986790, - "step": 12088, - "time_per_iteration": 2.563143491744995 - }, - { - "auxiliary_loss_clip": 0.01097059, - "auxiliary_loss_mlp": 0.01037688, - "balance_loss_clip": 1.03751171, - "balance_loss_mlp": 1.02490282, - "epoch": 0.7268300015030813, - "flos": 18514859996160.0, - "grad_norm": 1.723527946831352, - "language_loss": 0.73941064, - "learning_rate": 7.328982269740221e-07, - "loss": 0.76075816, - "num_input_tokens_seen": 261004925, - "step": 12089, - "time_per_iteration": 2.6264712810516357 - }, - { - "auxiliary_loss_clip": 0.01088906, - "auxiliary_loss_mlp": 0.01035753, - "balance_loss_clip": 1.03559196, - "balance_loss_mlp": 1.02308106, - "epoch": 0.7268901247557493, - "flos": 23986029836160.0, - "grad_norm": 1.6147699540484286, - "language_loss": 0.70883548, - "learning_rate": 7.325969240985616e-07, - "loss": 0.73008209, - "num_input_tokens_seen": 261023895, - "step": 12090, - "time_per_iteration": 2.674154281616211 - }, - { - "auxiliary_loss_clip": 0.01057949, - "auxiliary_loss_mlp": 0.01033382, - "balance_loss_clip": 1.03447902, - "balance_loss_mlp": 1.01989388, - "epoch": 0.7269502480084172, - "flos": 32088087429120.0, - "grad_norm": 1.7900263733785062, - "language_loss": 0.7724641, - "learning_rate": 7.322956692831528e-07, - "loss": 0.7933774, - "num_input_tokens_seen": 261045445, - "step": 12091, - "time_per_iteration": 2.837162494659424 - }, - { - "auxiliary_loss_clip": 0.0109404, - "auxiliary_loss_mlp": 0.00771553, - "balance_loss_clip": 1.03523159, - "balance_loss_mlp": 1.00019574, - "epoch": 0.7270103712610853, - "flos": 19062785036160.0, - "grad_norm": 2.0691872442271415, - "language_loss": 0.71682477, - "learning_rate": 7.319944625392205e-07, - "loss": 0.73548067, - "num_input_tokens_seen": 261064275, - "step": 12092, - "time_per_iteration": 2.6305599212646484 - }, - { - "auxiliary_loss_clip": 0.01101746, - "auxiliary_loss_mlp": 0.01033427, - "balance_loss_clip": 1.03929043, - "balance_loss_mlp": 1.02035582, - "epoch": 0.7270704945137532, - "flos": 34532921710080.0, - "grad_norm": 2.2398684774576156, - "language_loss": 0.61100423, - "learning_rate": 7.31693303878184e-07, - "loss": 0.63235605, - "num_input_tokens_seen": 261083310, - "step": 12093, - "time_per_iteration": 2.750157117843628 - }, - { - "auxiliary_loss_clip": 0.01090608, - "auxiliary_loss_mlp": 0.01037448, - "balance_loss_clip": 1.03955996, - "balance_loss_mlp": 1.02412009, - "epoch": 0.7271306177664212, - "flos": 21507583403520.0, - "grad_norm": 1.6663796185948798, - "language_loss": 0.75200593, - "learning_rate": 7.313921933114644e-07, - "loss": 0.77328646, - "num_input_tokens_seen": 261103460, - "step": 12094, - "time_per_iteration": 2.63088059425354 - }, - { - "auxiliary_loss_clip": 0.01076646, - "auxiliary_loss_mlp": 0.01031624, - "balance_loss_clip": 1.03417659, - "balance_loss_mlp": 1.01941681, - "epoch": 0.7271907410190891, - "flos": 22272444633600.0, - "grad_norm": 1.8443350683921131, - "language_loss": 0.84625936, - "learning_rate": 7.310911308504808e-07, - "loss": 0.867342, - "num_input_tokens_seen": 261121375, - "step": 12095, - "time_per_iteration": 2.7300918102264404 - }, - { - "auxiliary_loss_clip": 0.010978, - "auxiliary_loss_mlp": 0.01037417, - "balance_loss_clip": 1.03561294, - "balance_loss_mlp": 1.02383316, - "epoch": 0.7272508642717571, - "flos": 22893124671360.0, - "grad_norm": 2.27024179817087, - "language_loss": 0.77610254, - "learning_rate": 7.307901165066479e-07, - "loss": 0.79745466, - "num_input_tokens_seen": 261141105, - "step": 12096, - "time_per_iteration": 2.754016399383545 - }, - { - "auxiliary_loss_clip": 0.01113914, - "auxiliary_loss_mlp": 0.01037152, - "balance_loss_clip": 1.04082382, - "balance_loss_mlp": 1.02434897, - "epoch": 0.727310987524425, - "flos": 11655886331520.0, - "grad_norm": 1.96001611615308, - "language_loss": 0.72508037, - "learning_rate": 7.30489150291381e-07, - "loss": 0.74659109, - "num_input_tokens_seen": 261159255, - "step": 12097, - "time_per_iteration": 2.57547664642334 - }, - { - "auxiliary_loss_clip": 0.01101296, - "auxiliary_loss_mlp": 0.00771623, - "balance_loss_clip": 1.03833079, - "balance_loss_mlp": 1.00024211, - "epoch": 0.727371110777093, - "flos": 24535319592960.0, - "grad_norm": 1.744636039852928, - "language_loss": 0.77178752, - "learning_rate": 7.301882322160935e-07, - "loss": 0.79051673, - "num_input_tokens_seen": 261177960, - "step": 12098, - "time_per_iteration": 2.697739601135254 - }, - { - "auxiliary_loss_clip": 0.01090376, - "auxiliary_loss_mlp": 0.01033826, - "balance_loss_clip": 1.03530288, - "balance_loss_mlp": 1.02023625, - "epoch": 0.7274312340297611, - "flos": 74739835405440.0, - "grad_norm": 1.6470814614885703, - "language_loss": 0.67452812, - "learning_rate": 7.298873622921952e-07, - "loss": 0.69577014, - "num_input_tokens_seen": 261205660, - "step": 12099, - "time_per_iteration": 4.734724283218384 - }, - { - "auxiliary_loss_clip": 0.01100384, - "auxiliary_loss_mlp": 0.01040178, - "balance_loss_clip": 1.0354315, - "balance_loss_mlp": 1.02401924, - "epoch": 0.727491357282429, - "flos": 22342865247360.0, - "grad_norm": 1.6470477467852347, - "language_loss": 0.72511584, - "learning_rate": 7.29586540531095e-07, - "loss": 0.74652147, - "num_input_tokens_seen": 261225185, - "step": 12100, - "time_per_iteration": 2.6307733058929443 - }, - { - "auxiliary_loss_clip": 0.01101803, - "auxiliary_loss_mlp": 0.01038394, - "balance_loss_clip": 1.03856468, - "balance_loss_mlp": 1.02577031, - "epoch": 0.727551480535097, - "flos": 23297550877440.0, - "grad_norm": 1.4604095726641635, - "language_loss": 0.74780536, - "learning_rate": 7.292857669442005e-07, - "loss": 0.76920736, - "num_input_tokens_seen": 261247965, - "step": 12101, - "time_per_iteration": 2.6731035709381104 - }, - { - "auxiliary_loss_clip": 0.01070063, - "auxiliary_loss_mlp": 0.01029767, - "balance_loss_clip": 1.03622627, - "balance_loss_mlp": 1.01775718, - "epoch": 0.7276116037877649, - "flos": 21470559459840.0, - "grad_norm": 1.7882931756264577, - "language_loss": 0.82550085, - "learning_rate": 7.289850415429177e-07, - "loss": 0.8464992, - "num_input_tokens_seen": 261267585, - "step": 12102, - "time_per_iteration": 5.8568243980407715 - }, - { - "auxiliary_loss_clip": 0.01100092, - "auxiliary_loss_mlp": 0.01035417, - "balance_loss_clip": 1.03823566, - "balance_loss_mlp": 1.02270937, - "epoch": 0.7276717270404329, - "flos": 21464059098240.0, - "grad_norm": 2.5021196197746396, - "language_loss": 0.81821334, - "learning_rate": 7.286843643386495e-07, - "loss": 0.83956838, - "num_input_tokens_seen": 261285200, - "step": 12103, - "time_per_iteration": 2.619070291519165 - }, - { - "auxiliary_loss_clip": 0.0109026, - "auxiliary_loss_mlp": 0.01027412, - "balance_loss_clip": 1.03774977, - "balance_loss_mlp": 1.01372027, - "epoch": 0.7277318502931008, - "flos": 16837221329280.0, - "grad_norm": 1.6323298348226507, - "language_loss": 0.66439486, - "learning_rate": 7.283837353427968e-07, - "loss": 0.68557155, - "num_input_tokens_seen": 261303645, - "step": 12104, - "time_per_iteration": 2.7373523712158203 - }, - { - "auxiliary_loss_clip": 0.01079506, - "auxiliary_loss_mlp": 0.01033638, - "balance_loss_clip": 1.03706837, - "balance_loss_mlp": 1.02034616, - "epoch": 0.7277919735457689, - "flos": 33400550476800.0, - "grad_norm": 3.4364169718839324, - "language_loss": 0.66114849, - "learning_rate": 7.280831545667611e-07, - "loss": 0.68227994, - "num_input_tokens_seen": 261323265, - "step": 12105, - "time_per_iteration": 2.767533302307129 - }, - { - "auxiliary_loss_clip": 0.01115684, - "auxiliary_loss_mlp": 0.01034833, - "balance_loss_clip": 1.04181576, - "balance_loss_mlp": 1.02132106, - "epoch": 0.7278520967984368, - "flos": 19206499351680.0, - "grad_norm": 3.014598256639034, - "language_loss": 0.75495023, - "learning_rate": 7.27782622021939e-07, - "loss": 0.7764554, - "num_input_tokens_seen": 261339745, - "step": 12106, - "time_per_iteration": 2.595414161682129 - }, - { - "auxiliary_loss_clip": 0.01103034, - "auxiliary_loss_mlp": 0.01033288, - "balance_loss_clip": 1.03735209, - "balance_loss_mlp": 1.01898909, - "epoch": 0.7279122200511048, - "flos": 34094667870720.0, - "grad_norm": 2.1351092676673162, - "language_loss": 0.70326072, - "learning_rate": 7.274821377197273e-07, - "loss": 0.72462392, - "num_input_tokens_seen": 261359310, - "step": 12107, - "time_per_iteration": 4.187346935272217 - }, - { - "auxiliary_loss_clip": 0.01094591, - "auxiliary_loss_mlp": 0.0103929, - "balance_loss_clip": 1.03660846, - "balance_loss_mlp": 1.02583683, - "epoch": 0.7279723433037727, - "flos": 54599049348480.0, - "grad_norm": 1.7543215604249431, - "language_loss": 0.75391257, - "learning_rate": 7.271817016715205e-07, - "loss": 0.77525139, - "num_input_tokens_seen": 261384640, - "step": 12108, - "time_per_iteration": 2.922069549560547 - }, - { - "auxiliary_loss_clip": 0.01111137, - "auxiliary_loss_mlp": 0.01031166, - "balance_loss_clip": 1.03809679, - "balance_loss_mlp": 1.01802313, - "epoch": 0.7280324665564407, - "flos": 36137482156800.0, - "grad_norm": 1.5176447474285724, - "language_loss": 0.67057818, - "learning_rate": 7.268813138887124e-07, - "loss": 0.69200122, - "num_input_tokens_seen": 261405290, - "step": 12109, - "time_per_iteration": 2.691226005554199 - }, - { - "auxiliary_loss_clip": 0.01073593, - "auxiliary_loss_mlp": 0.01033469, - "balance_loss_clip": 1.03573251, - "balance_loss_mlp": 1.01958656, - "epoch": 0.7280925898091086, - "flos": 11618539165440.0, - "grad_norm": 2.3584964062920646, - "language_loss": 0.63489443, - "learning_rate": 7.265809743826912e-07, - "loss": 0.65596509, - "num_input_tokens_seen": 261419710, - "step": 12110, - "time_per_iteration": 2.7957284450531006 - }, - { - "auxiliary_loss_clip": 0.01079859, - "auxiliary_loss_mlp": 0.01029854, - "balance_loss_clip": 1.03503799, - "balance_loss_mlp": 1.01581717, - "epoch": 0.7281527130617766, - "flos": 34277094069120.0, - "grad_norm": 2.403450287181842, - "language_loss": 0.58412719, - "learning_rate": 7.26280683164847e-07, - "loss": 0.60522431, - "num_input_tokens_seen": 261442385, - "step": 12111, - "time_per_iteration": 2.8229284286499023 - }, - { - "auxiliary_loss_clip": 0.01063232, - "auxiliary_loss_mlp": 0.0103291, - "balance_loss_clip": 1.03874135, - "balance_loss_mlp": 1.01915908, - "epoch": 0.7282128363144446, - "flos": 13918043018880.0, - "grad_norm": 2.038144887813222, - "language_loss": 0.73754865, - "learning_rate": 7.259804402465677e-07, - "loss": 0.75851005, - "num_input_tokens_seen": 261459805, - "step": 12112, - "time_per_iteration": 2.7780139446258545 - }, - { - "auxiliary_loss_clip": 0.01098263, - "auxiliary_loss_mlp": 0.01031686, - "balance_loss_clip": 1.03572726, - "balance_loss_mlp": 1.01943743, - "epoch": 0.7282729595671126, - "flos": 20777627214720.0, - "grad_norm": 2.316952642046255, - "language_loss": 0.66911846, - "learning_rate": 7.25680245639237e-07, - "loss": 0.69041795, - "num_input_tokens_seen": 261477175, - "step": 12113, - "time_per_iteration": 2.6054317951202393 - }, - { - "auxiliary_loss_clip": 0.01073794, - "auxiliary_loss_mlp": 0.01034736, - "balance_loss_clip": 1.03603506, - "balance_loss_mlp": 1.02081203, - "epoch": 0.7283330828197806, - "flos": 16325422392960.0, - "grad_norm": 2.2071094228181716, - "language_loss": 0.73312247, - "learning_rate": 7.253800993542399e-07, - "loss": 0.75420773, - "num_input_tokens_seen": 261494990, - "step": 12114, - "time_per_iteration": 2.779949188232422 - }, - { - "auxiliary_loss_clip": 0.0108015, - "auxiliary_loss_mlp": 0.01031985, - "balance_loss_clip": 1.03596735, - "balance_loss_mlp": 1.01860976, - "epoch": 0.7283932060724485, - "flos": 27490193043840.0, - "grad_norm": 2.0284186088728604, - "language_loss": 0.68312764, - "learning_rate": 7.250800014029564e-07, - "loss": 0.70424896, - "num_input_tokens_seen": 261514445, - "step": 12115, - "time_per_iteration": 2.7396066188812256 - }, - { - "auxiliary_loss_clip": 0.01112838, - "auxiliary_loss_mlp": 0.01035969, - "balance_loss_clip": 1.03786767, - "balance_loss_mlp": 1.02284992, - "epoch": 0.7284533293251165, - "flos": 18367877543040.0, - "grad_norm": 1.7392304859469863, - "language_loss": 0.60055017, - "learning_rate": 7.247799517967674e-07, - "loss": 0.62203836, - "num_input_tokens_seen": 261533565, - "step": 12116, - "time_per_iteration": 2.6416893005371094 - }, - { - "auxiliary_loss_clip": 0.01101571, - "auxiliary_loss_mlp": 0.010328, - "balance_loss_clip": 1.03989601, - "balance_loss_mlp": 1.01943648, - "epoch": 0.7285134525777844, - "flos": 21725525174400.0, - "grad_norm": 1.8456050280461243, - "language_loss": 0.73165786, - "learning_rate": 7.2447995054705e-07, - "loss": 0.75300157, - "num_input_tokens_seen": 261553795, - "step": 12117, - "time_per_iteration": 2.680856704711914 - }, - { - "auxiliary_loss_clip": 0.01096697, - "auxiliary_loss_mlp": 0.01032842, - "balance_loss_clip": 1.03561711, - "balance_loss_mlp": 1.01907897, - "epoch": 0.7285735758304525, - "flos": 20741357456640.0, - "grad_norm": 1.892233976782661, - "language_loss": 0.69420332, - "learning_rate": 7.241799976651807e-07, - "loss": 0.71549869, - "num_input_tokens_seen": 261572565, - "step": 12118, - "time_per_iteration": 2.689328908920288 - }, - { - "auxiliary_loss_clip": 0.01054191, - "auxiliary_loss_mlp": 0.01039747, - "balance_loss_clip": 1.03333414, - "balance_loss_mlp": 1.026968, - "epoch": 0.7286336990831204, - "flos": 17310954827520.0, - "grad_norm": 6.128645472594502, - "language_loss": 0.84134108, - "learning_rate": 7.238800931625346e-07, - "loss": 0.86228043, - "num_input_tokens_seen": 261590910, - "step": 12119, - "time_per_iteration": 2.811901330947876 - }, - { - "auxiliary_loss_clip": 0.01112084, - "auxiliary_loss_mlp": 0.01029087, - "balance_loss_clip": 1.03825903, - "balance_loss_mlp": 1.01655173, - "epoch": 0.7286938223357884, - "flos": 19787390098560.0, - "grad_norm": 2.0681771064873544, - "language_loss": 0.81878972, - "learning_rate": 7.235802370504831e-07, - "loss": 0.84020138, - "num_input_tokens_seen": 261606005, - "step": 12120, - "time_per_iteration": 2.6672909259796143 - }, - { - "auxiliary_loss_clip": 0.01072804, - "auxiliary_loss_mlp": 0.01040557, - "balance_loss_clip": 1.03617036, - "balance_loss_mlp": 1.02706861, - "epoch": 0.7287539455884563, - "flos": 15340859625600.0, - "grad_norm": 1.933953511288546, - "language_loss": 0.7878201, - "learning_rate": 7.232804293403963e-07, - "loss": 0.8089537, - "num_input_tokens_seen": 261622305, - "step": 12121, - "time_per_iteration": 2.6193950176239014 - }, - { - "auxiliary_loss_clip": 0.01111609, - "auxiliary_loss_mlp": 0.01036655, - "balance_loss_clip": 1.0360496, - "balance_loss_mlp": 1.02327943, - "epoch": 0.7288140688411243, - "flos": 25192484870400.0, - "grad_norm": 1.533681893436525, - "language_loss": 0.69097638, - "learning_rate": 7.229806700436441e-07, - "loss": 0.71245903, - "num_input_tokens_seen": 261642465, - "step": 12122, - "time_per_iteration": 2.650777578353882 - }, - { - "auxiliary_loss_clip": 0.01064636, - "auxiliary_loss_mlp": 0.01033566, - "balance_loss_clip": 1.03321254, - "balance_loss_mlp": 1.02150214, - "epoch": 0.7288741920937922, - "flos": 23984162328960.0, - "grad_norm": 1.9841747121514857, - "language_loss": 0.87224233, - "learning_rate": 7.226809591715923e-07, - "loss": 0.89322436, - "num_input_tokens_seen": 261661420, - "step": 12123, - "time_per_iteration": 2.767803907394409 - }, - { - "auxiliary_loss_clip": 0.01077874, - "auxiliary_loss_mlp": 0.01035309, - "balance_loss_clip": 1.0370611, - "balance_loss_mlp": 1.02279758, - "epoch": 0.7289343153464602, - "flos": 22744921155840.0, - "grad_norm": 19.006549121525065, - "language_loss": 0.8255595, - "learning_rate": 7.223812967356065e-07, - "loss": 0.84669125, - "num_input_tokens_seen": 261680865, - "step": 12124, - "time_per_iteration": 2.7401580810546875 - }, - { - "auxiliary_loss_clip": 0.01082733, - "auxiliary_loss_mlp": 0.01032589, - "balance_loss_clip": 1.03729665, - "balance_loss_mlp": 1.01955354, - "epoch": 0.7289944385991282, - "flos": 24900028335360.0, - "grad_norm": 2.2469511782017726, - "language_loss": 0.67069578, - "learning_rate": 7.220816827470499e-07, - "loss": 0.69184899, - "num_input_tokens_seen": 261701455, - "step": 12125, - "time_per_iteration": 2.681535243988037 - }, - { - "auxiliary_loss_clip": 0.01104267, - "auxiliary_loss_mlp": 0.01038084, - "balance_loss_clip": 1.03742492, - "balance_loss_mlp": 1.02412462, - "epoch": 0.7290545618517962, - "flos": 22967064817920.0, - "grad_norm": 2.039401823737763, - "language_loss": 0.74920547, - "learning_rate": 7.217821172172855e-07, - "loss": 0.77062899, - "num_input_tokens_seen": 261721260, - "step": 12126, - "time_per_iteration": 2.6920571327209473 - }, - { - "auxiliary_loss_clip": 0.01016131, - "auxiliary_loss_mlp": 0.00997812, - "balance_loss_clip": 1.01327682, - "balance_loss_mlp": 0.99669784, - "epoch": 0.7291146851044642, - "flos": 61901523216000.0, - "grad_norm": 0.8366377087030958, - "language_loss": 0.5864383, - "learning_rate": 7.2148260015767e-07, - "loss": 0.60657775, - "num_input_tokens_seen": 261779370, - "step": 12127, - "time_per_iteration": 3.1948511600494385 - }, - { - "auxiliary_loss_clip": 0.01076598, - "auxiliary_loss_mlp": 0.01031063, - "balance_loss_clip": 1.03621507, - "balance_loss_mlp": 1.01911807, - "epoch": 0.7291748083571321, - "flos": 23330947547520.0, - "grad_norm": 2.1989684199567376, - "language_loss": 0.68995476, - "learning_rate": 7.21183131579562e-07, - "loss": 0.71103132, - "num_input_tokens_seen": 261798050, - "step": 12128, - "time_per_iteration": 2.761828899383545 - }, - { - "auxiliary_loss_clip": 0.01085147, - "auxiliary_loss_mlp": 0.01035559, - "balance_loss_clip": 1.03663111, - "balance_loss_mlp": 1.02137899, - "epoch": 0.7292349316098001, - "flos": 28330000001280.0, - "grad_norm": 1.8229974773388113, - "language_loss": 0.65319067, - "learning_rate": 7.20883711494319e-07, - "loss": 0.67439777, - "num_input_tokens_seen": 261817660, - "step": 12129, - "time_per_iteration": 2.7223851680755615 - }, - { - "auxiliary_loss_clip": 0.01108826, - "auxiliary_loss_mlp": 0.01030564, - "balance_loss_clip": 1.03813577, - "balance_loss_mlp": 1.01728415, - "epoch": 0.729295054862468, - "flos": 24132222190080.0, - "grad_norm": 1.987746290779436, - "language_loss": 0.74474001, - "learning_rate": 7.205843399132927e-07, - "loss": 0.7661339, - "num_input_tokens_seen": 261837935, - "step": 12130, - "time_per_iteration": 2.624861001968384 - }, - { - "auxiliary_loss_clip": 0.01084684, - "auxiliary_loss_mlp": 0.01036003, - "balance_loss_clip": 1.03371596, - "balance_loss_mlp": 1.02260351, - "epoch": 0.7293551781151361, - "flos": 22816239609600.0, - "grad_norm": 1.9230016702733295, - "language_loss": 0.69777483, - "learning_rate": 7.202850168478374e-07, - "loss": 0.71898174, - "num_input_tokens_seen": 261857575, - "step": 12131, - "time_per_iteration": 2.686483383178711 - }, - { - "auxiliary_loss_clip": 0.01075038, - "auxiliary_loss_mlp": 0.01032777, - "balance_loss_clip": 1.03706694, - "balance_loss_mlp": 1.02072525, - "epoch": 0.729415301367804, - "flos": 22126683242880.0, - "grad_norm": 1.5997534699121376, - "language_loss": 0.77348047, - "learning_rate": 7.199857423093025e-07, - "loss": 0.79455858, - "num_input_tokens_seen": 261877265, - "step": 12132, - "time_per_iteration": 2.7391042709350586 - }, - { - "auxiliary_loss_clip": 0.0110301, - "auxiliary_loss_mlp": 0.01038259, - "balance_loss_clip": 1.03978968, - "balance_loss_mlp": 1.02559876, - "epoch": 0.729475424620472, - "flos": 12349608675840.0, - "grad_norm": 2.2281458510507797, - "language_loss": 0.78860861, - "learning_rate": 7.196865163090358e-07, - "loss": 0.81002128, - "num_input_tokens_seen": 261893695, - "step": 12133, - "time_per_iteration": 2.5943353176116943 - }, - { - "auxiliary_loss_clip": 0.01060968, - "auxiliary_loss_mlp": 0.01032725, - "balance_loss_clip": 1.03212547, - "balance_loss_mlp": 1.01933742, - "epoch": 0.7295355478731399, - "flos": 22195308176640.0, - "grad_norm": 2.7553273898402333, - "language_loss": 0.72054434, - "learning_rate": 7.193873388583846e-07, - "loss": 0.7414813, - "num_input_tokens_seen": 261911825, - "step": 12134, - "time_per_iteration": 2.764251470565796 - }, - { - "auxiliary_loss_clip": 0.01091285, - "auxiliary_loss_mlp": 0.01040465, - "balance_loss_clip": 1.03840399, - "balance_loss_mlp": 1.02753675, - "epoch": 0.7295956711258079, - "flos": 23222030532480.0, - "grad_norm": 2.1447336349614203, - "language_loss": 0.71251649, - "learning_rate": 7.190882099686939e-07, - "loss": 0.73383397, - "num_input_tokens_seen": 261931190, - "step": 12135, - "time_per_iteration": 2.7322559356689453 - }, - { - "auxiliary_loss_clip": 0.01077251, - "auxiliary_loss_mlp": 0.01035683, - "balance_loss_clip": 1.03486896, - "balance_loss_mlp": 1.02259374, - "epoch": 0.7296557943784758, - "flos": 31869104163840.0, - "grad_norm": 2.309450763309982, - "language_loss": 0.61924529, - "learning_rate": 7.187891296513075e-07, - "loss": 0.64037454, - "num_input_tokens_seen": 261951240, - "step": 12136, - "time_per_iteration": 2.7608072757720947 - }, - { - "auxiliary_loss_clip": 0.01094465, - "auxiliary_loss_mlp": 0.00770512, - "balance_loss_clip": 1.03708506, - "balance_loss_mlp": 1.00022686, - "epoch": 0.7297159176311439, - "flos": 26651714889600.0, - "grad_norm": 1.8756317332834676, - "language_loss": 0.74414635, - "learning_rate": 7.184900979175654e-07, - "loss": 0.76279616, - "num_input_tokens_seen": 261971605, - "step": 12137, - "time_per_iteration": 2.6699535846710205 - }, - { - "auxiliary_loss_clip": 0.01104052, - "auxiliary_loss_mlp": 0.00771068, - "balance_loss_clip": 1.04109406, - "balance_loss_mlp": 1.00024545, - "epoch": 0.7297760408838118, - "flos": 24749562263040.0, - "grad_norm": 1.6416252206910797, - "language_loss": 0.74556518, - "learning_rate": 7.181911147788069e-07, - "loss": 0.76431638, - "num_input_tokens_seen": 261990830, - "step": 12138, - "time_per_iteration": 2.6462252140045166 - }, - { - "auxiliary_loss_clip": 0.01073993, - "auxiliary_loss_mlp": 0.01030576, - "balance_loss_clip": 1.03440869, - "balance_loss_mlp": 1.01832712, - "epoch": 0.7298361641364798, - "flos": 18073768982400.0, - "grad_norm": 2.2048130444672527, - "language_loss": 0.71792364, - "learning_rate": 7.178921802463702e-07, - "loss": 0.73896933, - "num_input_tokens_seen": 262008190, - "step": 12139, - "time_per_iteration": 2.637579917907715 - }, - { - "auxiliary_loss_clip": 0.01094798, - "auxiliary_loss_mlp": 0.01029337, - "balance_loss_clip": 1.03654766, - "balance_loss_mlp": 1.01727343, - "epoch": 0.7298962873891478, - "flos": 29895597169920.0, - "grad_norm": 1.5727231241692394, - "language_loss": 0.73340857, - "learning_rate": 7.175932943315898e-07, - "loss": 0.75464988, - "num_input_tokens_seen": 262030460, - "step": 12140, - "time_per_iteration": 4.322738170623779 - }, - { - "auxiliary_loss_clip": 0.01086242, - "auxiliary_loss_mlp": 0.01033553, - "balance_loss_clip": 1.03733993, - "balance_loss_mlp": 1.02028465, - "epoch": 0.7299564106418157, - "flos": 32266096254720.0, - "grad_norm": 2.108634462016176, - "language_loss": 0.55439997, - "learning_rate": 7.172944570458003e-07, - "loss": 0.57559788, - "num_input_tokens_seen": 262050830, - "step": 12141, - "time_per_iteration": 4.280510425567627 - }, - { - "auxiliary_loss_clip": 0.01072661, - "auxiliary_loss_mlp": 0.01030923, - "balance_loss_clip": 1.03414416, - "balance_loss_mlp": 1.0185132, - "epoch": 0.7300165338944837, - "flos": 22930292269440.0, - "grad_norm": 1.6200088413354243, - "language_loss": 0.72661757, - "learning_rate": 7.169956684003342e-07, - "loss": 0.74765337, - "num_input_tokens_seen": 262071245, - "step": 12142, - "time_per_iteration": 4.36347508430481 - }, - { - "auxiliary_loss_clip": 0.01109011, - "auxiliary_loss_mlp": 0.01039998, - "balance_loss_clip": 1.03754866, - "balance_loss_mlp": 1.02798176, - "epoch": 0.7300766571471516, - "flos": 19828795501440.0, - "grad_norm": 1.8395683964833187, - "language_loss": 0.73354667, - "learning_rate": 7.16696928406521e-07, - "loss": 0.75503671, - "num_input_tokens_seen": 262087525, - "step": 12143, - "time_per_iteration": 2.562661647796631 - }, - { - "auxiliary_loss_clip": 0.01072117, - "auxiliary_loss_mlp": 0.01036251, - "balance_loss_clip": 1.0354147, - "balance_loss_mlp": 1.02270293, - "epoch": 0.7301367803998197, - "flos": 24347829576960.0, - "grad_norm": 11.58693755368333, - "language_loss": 0.67069697, - "learning_rate": 7.163982370756882e-07, - "loss": 0.69178069, - "num_input_tokens_seen": 262107355, - "step": 12144, - "time_per_iteration": 2.7019169330596924 - }, - { - "auxiliary_loss_clip": 0.01087218, - "auxiliary_loss_mlp": 0.01031157, - "balance_loss_clip": 1.03756452, - "balance_loss_mlp": 1.01808596, - "epoch": 0.7301969036524876, - "flos": 15304518040320.0, - "grad_norm": 2.004686825288867, - "language_loss": 0.79088622, - "learning_rate": 7.160995944191627e-07, - "loss": 0.81206995, - "num_input_tokens_seen": 262125645, - "step": 12145, - "time_per_iteration": 2.609962224960327 - }, - { - "auxiliary_loss_clip": 0.01071068, - "auxiliary_loss_mlp": 0.01038463, - "balance_loss_clip": 1.03582478, - "balance_loss_mlp": 1.02542722, - "epoch": 0.7302570269051556, - "flos": 23507268433920.0, - "grad_norm": 2.189602190838667, - "language_loss": 0.91191077, - "learning_rate": 7.158010004482702e-07, - "loss": 0.93300605, - "num_input_tokens_seen": 262144075, - "step": 12146, - "time_per_iteration": 4.17360258102417 - }, - { - "auxiliary_loss_clip": 0.01107983, - "auxiliary_loss_mlp": 0.01027437, - "balance_loss_clip": 1.03820586, - "balance_loss_mlp": 1.01547432, - "epoch": 0.7303171501578235, - "flos": 20523056549760.0, - "grad_norm": 1.801228566583195, - "language_loss": 0.62361127, - "learning_rate": 7.155024551743316e-07, - "loss": 0.64496547, - "num_input_tokens_seen": 262165940, - "step": 12147, - "time_per_iteration": 2.7316384315490723 - }, - { - "auxiliary_loss_clip": 0.01113892, - "auxiliary_loss_mlp": 0.01039081, - "balance_loss_clip": 1.0402323, - "balance_loss_mlp": 1.02578294, - "epoch": 0.7303772734104915, - "flos": 18332613365760.0, - "grad_norm": 1.9466892860385239, - "language_loss": 0.75526571, - "learning_rate": 7.152039586086693e-07, - "loss": 0.77679539, - "num_input_tokens_seen": 262184520, - "step": 12148, - "time_per_iteration": 2.55757999420166 - }, - { - "auxiliary_loss_clip": 0.01010613, - "auxiliary_loss_mlp": 0.0075184, - "balance_loss_clip": 1.00818348, - "balance_loss_mlp": 0.99964029, - "epoch": 0.7304373966631594, - "flos": 60654776100480.0, - "grad_norm": 0.6918687189528673, - "language_loss": 0.56630087, - "learning_rate": 7.149055107626017e-07, - "loss": 0.58392537, - "num_input_tokens_seen": 262247070, - "step": 12149, - "time_per_iteration": 3.1780648231506348 - }, - { - "auxiliary_loss_clip": 0.01090981, - "auxiliary_loss_mlp": 0.01036437, - "balance_loss_clip": 1.03665161, - "balance_loss_mlp": 1.02352667, - "epoch": 0.7304975199158275, - "flos": 19828077229440.0, - "grad_norm": 1.6617515713368272, - "language_loss": 0.73949683, - "learning_rate": 7.146071116474451e-07, - "loss": 0.76077104, - "num_input_tokens_seen": 262266605, - "step": 12150, - "time_per_iteration": 2.6775600910186768 - }, - { - "auxiliary_loss_clip": 0.0111323, - "auxiliary_loss_mlp": 0.01034483, - "balance_loss_clip": 1.03854418, - "balance_loss_mlp": 1.02156699, - "epoch": 0.7305576431684954, - "flos": 13223997452160.0, - "grad_norm": 2.052406638174018, - "language_loss": 0.84060204, - "learning_rate": 7.143087612745158e-07, - "loss": 0.86207914, - "num_input_tokens_seen": 262283880, - "step": 12151, - "time_per_iteration": 2.589292049407959 - }, - { - "auxiliary_loss_clip": 0.01072466, - "auxiliary_loss_mlp": 0.01040374, - "balance_loss_clip": 1.03497267, - "balance_loss_mlp": 1.02686191, - "epoch": 0.7306177664211634, - "flos": 24060472773120.0, - "grad_norm": 1.844893248025129, - "language_loss": 0.78079808, - "learning_rate": 7.14010459655127e-07, - "loss": 0.80192649, - "num_input_tokens_seen": 262304155, - "step": 12152, - "time_per_iteration": 2.7783727645874023 - }, - { - "auxiliary_loss_clip": 0.01075382, - "auxiliary_loss_mlp": 0.01032051, - "balance_loss_clip": 1.03711772, - "balance_loss_mlp": 1.01889646, - "epoch": 0.7306778896738314, - "flos": 27089106802560.0, - "grad_norm": 2.295487047202377, - "language_loss": 0.79554176, - "learning_rate": 7.137122068005919e-07, - "loss": 0.81661606, - "num_input_tokens_seen": 262325660, - "step": 12153, - "time_per_iteration": 2.773252010345459 - }, - { - "auxiliary_loss_clip": 0.01100913, - "auxiliary_loss_mlp": 0.01037363, - "balance_loss_clip": 1.03726029, - "balance_loss_mlp": 1.02455413, - "epoch": 0.7307380129264993, - "flos": 16690669839360.0, - "grad_norm": 1.708854446027603, - "language_loss": 0.67438841, - "learning_rate": 7.134140027222173e-07, - "loss": 0.69577122, - "num_input_tokens_seen": 262344075, - "step": 12154, - "time_per_iteration": 2.657804489135742 - }, - { - "auxiliary_loss_clip": 0.01064569, - "auxiliary_loss_mlp": 0.01032574, - "balance_loss_clip": 1.03754902, - "balance_loss_mlp": 1.01900196, - "epoch": 0.7307981361791673, - "flos": 21725740656000.0, - "grad_norm": 1.7409892720521978, - "language_loss": 0.6598506, - "learning_rate": 7.131158474313128e-07, - "loss": 0.68082201, - "num_input_tokens_seen": 262363305, - "step": 12155, - "time_per_iteration": 2.727818012237549 - }, - { - "auxiliary_loss_clip": 0.01090955, - "auxiliary_loss_mlp": 0.01028633, - "balance_loss_clip": 1.03944302, - "balance_loss_mlp": 1.01606798, - "epoch": 0.7308582594318352, - "flos": 18040659621120.0, - "grad_norm": 2.059846064937341, - "language_loss": 0.81401372, - "learning_rate": 7.128177409391851e-07, - "loss": 0.83520961, - "num_input_tokens_seen": 262380730, - "step": 12156, - "time_per_iteration": 2.6713905334472656 - }, - { - "auxiliary_loss_clip": 0.01069178, - "auxiliary_loss_mlp": 0.01038604, - "balance_loss_clip": 1.03357935, - "balance_loss_mlp": 1.02677894, - "epoch": 0.7309183826845033, - "flos": 13844964798720.0, - "grad_norm": 2.368813587947745, - "language_loss": 0.7572211, - "learning_rate": 7.125196832571367e-07, - "loss": 0.77829891, - "num_input_tokens_seen": 262395480, - "step": 12157, - "time_per_iteration": 2.6478710174560547 - }, - { - "auxiliary_loss_clip": 0.01097661, - "auxiliary_loss_mlp": 0.01029375, - "balance_loss_clip": 1.03817534, - "balance_loss_mlp": 1.01818156, - "epoch": 0.7309785059371712, - "flos": 17019216564480.0, - "grad_norm": 2.20999557197409, - "language_loss": 0.72660947, - "learning_rate": 7.122216743964713e-07, - "loss": 0.74787986, - "num_input_tokens_seen": 262413340, - "step": 12158, - "time_per_iteration": 2.6752305030822754 - }, - { - "auxiliary_loss_clip": 0.01090002, - "auxiliary_loss_mlp": 0.01036269, - "balance_loss_clip": 1.03874135, - "balance_loss_mlp": 1.02343071, - "epoch": 0.7310386291898392, - "flos": 26502398052480.0, - "grad_norm": 1.5980086656224926, - "language_loss": 0.85433125, - "learning_rate": 7.119237143684896e-07, - "loss": 0.87559396, - "num_input_tokens_seen": 262433455, - "step": 12159, - "time_per_iteration": 2.722282886505127 - }, - { - "auxiliary_loss_clip": 0.01090808, - "auxiliary_loss_mlp": 0.01033782, - "balance_loss_clip": 1.0357151, - "balance_loss_mlp": 1.01996553, - "epoch": 0.7310987524425071, - "flos": 16945922862720.0, - "grad_norm": 2.240373926166887, - "language_loss": 0.73471999, - "learning_rate": 7.116258031844895e-07, - "loss": 0.75596595, - "num_input_tokens_seen": 262450335, - "step": 12160, - "time_per_iteration": 2.6522862911224365 - }, - { - "auxiliary_loss_clip": 0.01103069, - "auxiliary_loss_mlp": 0.01035667, - "balance_loss_clip": 1.0388577, - "balance_loss_mlp": 1.0220058, - "epoch": 0.7311588756951751, - "flos": 13845288021120.0, - "grad_norm": 1.9039689153632533, - "language_loss": 0.72493577, - "learning_rate": 7.113279408557675e-07, - "loss": 0.74632311, - "num_input_tokens_seen": 262468240, - "step": 12161, - "time_per_iteration": 2.5589683055877686 - }, - { - "auxiliary_loss_clip": 0.01083193, - "auxiliary_loss_mlp": 0.00772186, - "balance_loss_clip": 1.03667367, - "balance_loss_mlp": 1.00028253, - "epoch": 0.731218998947843, - "flos": 28767894704640.0, - "grad_norm": 1.765712961659712, - "language_loss": 0.69565916, - "learning_rate": 7.110301273936192e-07, - "loss": 0.71421289, - "num_input_tokens_seen": 262487045, - "step": 12162, - "time_per_iteration": 2.8083322048187256 - }, - { - "auxiliary_loss_clip": 0.01102238, - "auxiliary_loss_mlp": 0.01030969, - "balance_loss_clip": 1.03934407, - "balance_loss_mlp": 1.01765895, - "epoch": 0.7312791222005111, - "flos": 27088783580160.0, - "grad_norm": 1.79396916880486, - "language_loss": 0.66982478, - "learning_rate": 7.107323628093382e-07, - "loss": 0.69115686, - "num_input_tokens_seen": 262504855, - "step": 12163, - "time_per_iteration": 2.664005756378174 - }, - { - "auxiliary_loss_clip": 0.01088818, - "auxiliary_loss_mlp": 0.01029215, - "balance_loss_clip": 1.03657246, - "balance_loss_mlp": 1.01618505, - "epoch": 0.731339245453179, - "flos": 20924035050240.0, - "grad_norm": 1.4858782021210455, - "language_loss": 0.68422931, - "learning_rate": 7.104346471142153e-07, - "loss": 0.70540965, - "num_input_tokens_seen": 262524920, - "step": 12164, - "time_per_iteration": 2.730407953262329 - }, - { - "auxiliary_loss_clip": 0.01064444, - "auxiliary_loss_mlp": 0.01035496, - "balance_loss_clip": 1.03925169, - "balance_loss_mlp": 1.02344418, - "epoch": 0.731399368705847, - "flos": 23075694524160.0, - "grad_norm": 1.621904213104564, - "language_loss": 0.73121232, - "learning_rate": 7.101369803195391e-07, - "loss": 0.75221169, - "num_input_tokens_seen": 262545725, - "step": 12165, - "time_per_iteration": 2.745304584503174 - }, - { - "auxiliary_loss_clip": 0.01104061, - "auxiliary_loss_mlp": 0.0103506, - "balance_loss_clip": 1.03919411, - "balance_loss_mlp": 1.02191114, - "epoch": 0.731459491958515, - "flos": 23582681038080.0, - "grad_norm": 1.959130136013477, - "language_loss": 0.7631768, - "learning_rate": 7.098393624365988e-07, - "loss": 0.78456795, - "num_input_tokens_seen": 262565480, - "step": 12166, - "time_per_iteration": 2.655210256576538 - }, - { - "auxiliary_loss_clip": 0.01083193, - "auxiliary_loss_mlp": 0.01031215, - "balance_loss_clip": 1.03837287, - "balance_loss_mlp": 1.01877546, - "epoch": 0.7315196152111829, - "flos": 22379278659840.0, - "grad_norm": 1.7735251016583573, - "language_loss": 0.79791737, - "learning_rate": 7.095417934766781e-07, - "loss": 0.81906146, - "num_input_tokens_seen": 262584145, - "step": 12167, - "time_per_iteration": 2.686013698577881 - }, - { - "auxiliary_loss_clip": 0.01099781, - "auxiliary_loss_mlp": 0.01043597, - "balance_loss_clip": 1.03856659, - "balance_loss_mlp": 1.03108573, - "epoch": 0.7315797384638509, - "flos": 26177047637760.0, - "grad_norm": 1.6689116898679521, - "language_loss": 0.76710904, - "learning_rate": 7.092442734510622e-07, - "loss": 0.78854281, - "num_input_tokens_seen": 262604045, - "step": 12168, - "time_per_iteration": 2.6875557899475098 - }, - { - "auxiliary_loss_clip": 0.0109665, - "auxiliary_loss_mlp": 0.01043712, - "balance_loss_clip": 1.03574252, - "balance_loss_mlp": 1.02774954, - "epoch": 0.7316398617165188, - "flos": 21506326427520.0, - "grad_norm": 2.5442709815389684, - "language_loss": 0.81822222, - "learning_rate": 7.089468023710326e-07, - "loss": 0.83962584, - "num_input_tokens_seen": 262624540, - "step": 12169, - "time_per_iteration": 2.592453718185425 - }, - { - "auxiliary_loss_clip": 0.01097824, - "auxiliary_loss_mlp": 0.01039563, - "balance_loss_clip": 1.03882432, - "balance_loss_mlp": 1.0264802, - "epoch": 0.7316999849691869, - "flos": 30482557315200.0, - "grad_norm": 1.9915594425883627, - "language_loss": 0.69992799, - "learning_rate": 7.08649380247871e-07, - "loss": 0.72130191, - "num_input_tokens_seen": 262644545, - "step": 12170, - "time_per_iteration": 2.7040326595306396 - }, - { - "auxiliary_loss_clip": 0.01109905, - "auxiliary_loss_mlp": 0.01032057, - "balance_loss_clip": 1.03831005, - "balance_loss_mlp": 1.01799059, - "epoch": 0.7317601082218548, - "flos": 21543781334400.0, - "grad_norm": 15.0863481947429, - "language_loss": 0.69820881, - "learning_rate": 7.083520070928533e-07, - "loss": 0.71962845, - "num_input_tokens_seen": 262662570, - "step": 12171, - "time_per_iteration": 2.5760347843170166 - }, - { - "auxiliary_loss_clip": 0.01111903, - "auxiliary_loss_mlp": 0.0104052, - "balance_loss_clip": 1.03991163, - "balance_loss_mlp": 1.0280571, - "epoch": 0.7318202314745228, - "flos": 33251592775680.0, - "grad_norm": 4.139375107953077, - "language_loss": 0.65600061, - "learning_rate": 7.080546829172564e-07, - "loss": 0.67752481, - "num_input_tokens_seen": 262683245, - "step": 12172, - "time_per_iteration": 2.629512071609497 - }, - { - "auxiliary_loss_clip": 0.01112155, - "auxiliary_loss_mlp": 0.01027678, - "balance_loss_clip": 1.03968287, - "balance_loss_mlp": 1.01504803, - "epoch": 0.7318803547271907, - "flos": 20157054917760.0, - "grad_norm": 2.4544456450965577, - "language_loss": 0.6181004, - "learning_rate": 7.077574077323564e-07, - "loss": 0.63949871, - "num_input_tokens_seen": 262701585, - "step": 12173, - "time_per_iteration": 2.714617967605591 - }, - { - "auxiliary_loss_clip": 0.01056565, - "auxiliary_loss_mlp": 0.01030506, - "balance_loss_clip": 1.03468084, - "balance_loss_mlp": 1.01789331, - "epoch": 0.7319404779798587, - "flos": 20558536208640.0, - "grad_norm": 3.4474002403228714, - "language_loss": 0.74141943, - "learning_rate": 7.074601815494243e-07, - "loss": 0.76229018, - "num_input_tokens_seen": 262719295, - "step": 12174, - "time_per_iteration": 2.691361427307129 - }, - { - "auxiliary_loss_clip": 0.0110738, - "auxiliary_loss_mlp": 0.01029138, - "balance_loss_clip": 1.03786492, - "balance_loss_mlp": 1.01689529, - "epoch": 0.7320006012325266, - "flos": 28695391102080.0, - "grad_norm": 1.70169272855857, - "language_loss": 0.80771077, - "learning_rate": 7.071630043797317e-07, - "loss": 0.82907599, - "num_input_tokens_seen": 262739995, - "step": 12175, - "time_per_iteration": 2.6333701610565186 - }, - { - "auxiliary_loss_clip": 0.01091186, - "auxiliary_loss_mlp": 0.01029927, - "balance_loss_clip": 1.03785181, - "balance_loss_mlp": 1.01719511, - "epoch": 0.7320607244851947, - "flos": 16362697731840.0, - "grad_norm": 2.2994636661960777, - "language_loss": 0.76175666, - "learning_rate": 7.068658762345488e-07, - "loss": 0.78296781, - "num_input_tokens_seen": 262757680, - "step": 12176, - "time_per_iteration": 2.6684181690216064 - }, - { - "auxiliary_loss_clip": 0.01099222, - "auxiliary_loss_mlp": 0.01033517, - "balance_loss_clip": 1.03950393, - "balance_loss_mlp": 1.02143455, - "epoch": 0.7321208477378626, - "flos": 20955097336320.0, - "grad_norm": 1.7266339084119442, - "language_loss": 0.76393938, - "learning_rate": 7.065687971251399e-07, - "loss": 0.78526676, - "num_input_tokens_seen": 262776990, - "step": 12177, - "time_per_iteration": 2.5895602703094482 - }, - { - "auxiliary_loss_clip": 0.01076316, - "auxiliary_loss_mlp": 0.0103859, - "balance_loss_clip": 1.03529096, - "balance_loss_mlp": 1.02638888, - "epoch": 0.7321809709905306, - "flos": 13845072539520.0, - "grad_norm": 2.2196900974647003, - "language_loss": 0.74673522, - "learning_rate": 7.06271767062772e-07, - "loss": 0.76788431, - "num_input_tokens_seen": 262795440, - "step": 12178, - "time_per_iteration": 2.6741504669189453 - }, - { - "auxiliary_loss_clip": 0.01091987, - "auxiliary_loss_mlp": 0.01034757, - "balance_loss_clip": 1.03604901, - "balance_loss_mlp": 1.02187705, - "epoch": 0.7322410942431986, - "flos": 26979938392320.0, - "grad_norm": 2.2839200958654584, - "language_loss": 0.82424951, - "learning_rate": 7.059747860587084e-07, - "loss": 0.84551692, - "num_input_tokens_seen": 262816385, - "step": 12179, - "time_per_iteration": 4.333508253097534 - }, - { - "auxiliary_loss_clip": 0.01073556, - "auxiliary_loss_mlp": 0.01040091, - "balance_loss_clip": 1.03531742, - "balance_loss_mlp": 1.02663827, - "epoch": 0.7323012174958665, - "flos": 17639717034240.0, - "grad_norm": 4.252835567274656, - "language_loss": 0.74462938, - "learning_rate": 7.056778541242115e-07, - "loss": 0.76576585, - "num_input_tokens_seen": 262834955, - "step": 12180, - "time_per_iteration": 2.64694881439209 - }, - { - "auxiliary_loss_clip": 0.01100626, - "auxiliary_loss_mlp": 0.00770628, - "balance_loss_clip": 1.03525329, - "balance_loss_mlp": 1.00013947, - "epoch": 0.7323613407485345, - "flos": 32342765834880.0, - "grad_norm": 2.118039690946721, - "language_loss": 0.79425126, - "learning_rate": 7.053809712705396e-07, - "loss": 0.81296378, - "num_input_tokens_seen": 262853555, - "step": 12181, - "time_per_iteration": 5.950862407684326 - }, - { - "auxiliary_loss_clip": 0.01104749, - "auxiliary_loss_mlp": 0.00770994, - "balance_loss_clip": 1.0405333, - "balance_loss_mlp": 1.00015044, - "epoch": 0.7324214640012024, - "flos": 18362777811840.0, - "grad_norm": 3.5037562339731343, - "language_loss": 0.72006238, - "learning_rate": 7.050841375089506e-07, - "loss": 0.73881984, - "num_input_tokens_seen": 262870975, - "step": 12182, - "time_per_iteration": 2.60955810546875 - }, - { - "auxiliary_loss_clip": 0.01113664, - "auxiliary_loss_mlp": 0.01031542, - "balance_loss_clip": 1.04023218, - "balance_loss_mlp": 1.01922774, - "epoch": 0.7324815872538705, - "flos": 30812289189120.0, - "grad_norm": 1.455017822583619, - "language_loss": 0.7080251, - "learning_rate": 7.047873528507015e-07, - "loss": 0.72947717, - "num_input_tokens_seen": 262892635, - "step": 12183, - "time_per_iteration": 2.651121139526367 - }, - { - "auxiliary_loss_clip": 0.01100782, - "auxiliary_loss_mlp": 0.01035961, - "balance_loss_clip": 1.04088736, - "balance_loss_mlp": 1.02230549, - "epoch": 0.7325417105065384, - "flos": 21505069451520.0, - "grad_norm": 1.9960836350213491, - "language_loss": 0.73006004, - "learning_rate": 7.04490617307045e-07, - "loss": 0.75142741, - "num_input_tokens_seen": 262910725, - "step": 12184, - "time_per_iteration": 4.158590078353882 - }, - { - "auxiliary_loss_clip": 0.01011352, - "auxiliary_loss_mlp": 0.01007926, - "balance_loss_clip": 1.00717974, - "balance_loss_mlp": 1.0068059, - "epoch": 0.7326018337592064, - "flos": 67257742556160.0, - "grad_norm": 0.7629811613061157, - "language_loss": 0.65181279, - "learning_rate": 7.041939308892344e-07, - "loss": 0.67200553, - "num_input_tokens_seen": 262974150, - "step": 12185, - "time_per_iteration": 3.1753084659576416 - }, - { - "auxiliary_loss_clip": 0.01110902, - "auxiliary_loss_mlp": 0.01027525, - "balance_loss_clip": 1.03751791, - "balance_loss_mlp": 1.01419187, - "epoch": 0.7326619570118743, - "flos": 22857070394880.0, - "grad_norm": 1.8466605492768327, - "language_loss": 0.80407894, - "learning_rate": 7.038972936085197e-07, - "loss": 0.82546324, - "num_input_tokens_seen": 262993370, - "step": 12186, - "time_per_iteration": 2.7113280296325684 - }, - { - "auxiliary_loss_clip": 0.01095897, - "auxiliary_loss_mlp": 0.01035902, - "balance_loss_clip": 1.03822923, - "balance_loss_mlp": 1.02185869, - "epoch": 0.7327220802645423, - "flos": 23327499841920.0, - "grad_norm": 1.6891374777680592, - "language_loss": 0.73376352, - "learning_rate": 7.036007054761508e-07, - "loss": 0.75508153, - "num_input_tokens_seen": 263012665, - "step": 12187, - "time_per_iteration": 2.6341447830200195 - }, - { - "auxiliary_loss_clip": 0.01113144, - "auxiliary_loss_mlp": 0.01032976, - "balance_loss_clip": 1.03975987, - "balance_loss_mlp": 1.020298, - "epoch": 0.7327822035172102, - "flos": 23180661043200.0, - "grad_norm": 1.849813706667638, - "language_loss": 0.88717717, - "learning_rate": 7.033041665033716e-07, - "loss": 0.90863836, - "num_input_tokens_seen": 263031475, - "step": 12188, - "time_per_iteration": 2.5466268062591553 - }, - { - "auxiliary_loss_clip": 0.01068599, - "auxiliary_loss_mlp": 0.01036205, - "balance_loss_clip": 1.03427935, - "balance_loss_mlp": 1.02241302, - "epoch": 0.7328423267698783, - "flos": 21066600130560.0, - "grad_norm": 2.0499334322207856, - "language_loss": 0.74851215, - "learning_rate": 7.030076767014284e-07, - "loss": 0.76956022, - "num_input_tokens_seen": 263051445, - "step": 12189, - "time_per_iteration": 2.7621939182281494 - }, - { - "auxiliary_loss_clip": 0.01078663, - "auxiliary_loss_mlp": 0.0103229, - "balance_loss_clip": 1.03718972, - "balance_loss_mlp": 1.01898003, - "epoch": 0.7329024500225462, - "flos": 21689578638720.0, - "grad_norm": 1.96321719925377, - "language_loss": 0.82236755, - "learning_rate": 7.027112360815648e-07, - "loss": 0.84347707, - "num_input_tokens_seen": 263070835, - "step": 12190, - "time_per_iteration": 2.701537609100342 - }, - { - "auxiliary_loss_clip": 0.01073099, - "auxiliary_loss_mlp": 0.01036113, - "balance_loss_clip": 1.03755641, - "balance_loss_mlp": 1.02225447, - "epoch": 0.7329625732752142, - "flos": 24164038661760.0, - "grad_norm": 1.6849977085368404, - "language_loss": 0.71588874, - "learning_rate": 7.024148446550204e-07, - "loss": 0.73698092, - "num_input_tokens_seen": 263090070, - "step": 12191, - "time_per_iteration": 2.72813081741333 - }, - { - "auxiliary_loss_clip": 0.01112512, - "auxiliary_loss_mlp": 0.01035784, - "balance_loss_clip": 1.03892088, - "balance_loss_mlp": 1.02245009, - "epoch": 0.7330226965278822, - "flos": 30077915627520.0, - "grad_norm": 1.5354384218805013, - "language_loss": 0.69254857, - "learning_rate": 7.021185024330361e-07, - "loss": 0.71403152, - "num_input_tokens_seen": 263110030, - "step": 12192, - "time_per_iteration": 2.6177656650543213 - }, - { - "auxiliary_loss_clip": 0.01099104, - "auxiliary_loss_mlp": 0.01030904, - "balance_loss_clip": 1.0388236, - "balance_loss_mlp": 1.01836967, - "epoch": 0.7330828197805501, - "flos": 23368294713600.0, - "grad_norm": 1.627423362173816, - "language_loss": 0.73143125, - "learning_rate": 7.01822209426848e-07, - "loss": 0.75273132, - "num_input_tokens_seen": 263129735, - "step": 12193, - "time_per_iteration": 2.6829118728637695 - }, - { - "auxiliary_loss_clip": 0.01094199, - "auxiliary_loss_mlp": 0.01034877, - "balance_loss_clip": 1.03632629, - "balance_loss_mlp": 1.02171612, - "epoch": 0.7331429430332181, - "flos": 21032808410880.0, - "grad_norm": 2.400736232898333, - "language_loss": 0.76939815, - "learning_rate": 7.015259656476911e-07, - "loss": 0.79068899, - "num_input_tokens_seen": 263149100, - "step": 12194, - "time_per_iteration": 2.589165687561035 - }, - { - "auxiliary_loss_clip": 0.01100113, - "auxiliary_loss_mlp": 0.01029986, - "balance_loss_clip": 1.03972054, - "balance_loss_mlp": 1.01695681, - "epoch": 0.733203066285886, - "flos": 14647891466880.0, - "grad_norm": 1.9190061960430176, - "language_loss": 0.70403659, - "learning_rate": 7.012297711067998e-07, - "loss": 0.72533756, - "num_input_tokens_seen": 263166620, - "step": 12195, - "time_per_iteration": 2.550752639770508 - }, - { - "auxiliary_loss_clip": 0.01111325, - "auxiliary_loss_mlp": 0.01036105, - "balance_loss_clip": 1.03835511, - "balance_loss_mlp": 1.02386189, - "epoch": 0.7332631895385541, - "flos": 17165301177600.0, - "grad_norm": 1.958340476490106, - "language_loss": 0.72090805, - "learning_rate": 7.009336258154057e-07, - "loss": 0.74238235, - "num_input_tokens_seen": 263184780, - "step": 12196, - "time_per_iteration": 2.540836811065674 - }, - { - "auxiliary_loss_clip": 0.01111546, - "auxiliary_loss_mlp": 0.01030796, - "balance_loss_clip": 1.04016924, - "balance_loss_mlp": 1.01791, - "epoch": 0.733323312791222, - "flos": 28658151676800.0, - "grad_norm": 1.92503318264866, - "language_loss": 0.71952534, - "learning_rate": 7.006375297847394e-07, - "loss": 0.7409488, - "num_input_tokens_seen": 263204625, - "step": 12197, - "time_per_iteration": 2.6192398071289062 - }, - { - "auxiliary_loss_clip": 0.01058905, - "auxiliary_loss_mlp": 0.00771452, - "balance_loss_clip": 1.03431988, - "balance_loss_mlp": 1.00020027, - "epoch": 0.73338343604389, - "flos": 16618417632000.0, - "grad_norm": 3.2701178801425983, - "language_loss": 0.77824599, - "learning_rate": 7.003414830260282e-07, - "loss": 0.79654956, - "num_input_tokens_seen": 263221565, - "step": 12198, - "time_per_iteration": 2.751495599746704 - }, - { - "auxiliary_loss_clip": 0.0105527, - "auxiliary_loss_mlp": 0.01033337, - "balance_loss_clip": 1.0351963, - "balance_loss_mlp": 1.02071261, - "epoch": 0.7334435592965579, - "flos": 21142084561920.0, - "grad_norm": 1.9440363866172514, - "language_loss": 0.74263847, - "learning_rate": 7.000454855504974e-07, - "loss": 0.76352453, - "num_input_tokens_seen": 263240620, - "step": 12199, - "time_per_iteration": 2.767896890640259 - }, - { - "auxiliary_loss_clip": 0.01094013, - "auxiliary_loss_mlp": 0.01032416, - "balance_loss_clip": 1.03940797, - "balance_loss_mlp": 1.01919568, - "epoch": 0.7335036825492259, - "flos": 17125332318720.0, - "grad_norm": 2.5044351330443377, - "language_loss": 0.76926482, - "learning_rate": 6.997495373693729e-07, - "loss": 0.79052913, - "num_input_tokens_seen": 263254365, - "step": 12200, - "time_per_iteration": 2.6367027759552 - }, - { - "auxiliary_loss_clip": 0.01074082, - "auxiliary_loss_mlp": 0.01027226, - "balance_loss_clip": 1.03776014, - "balance_loss_mlp": 1.01524007, - "epoch": 0.7335638058018938, - "flos": 23731818307200.0, - "grad_norm": 2.389152390847936, - "language_loss": 0.61618876, - "learning_rate": 6.994536384938754e-07, - "loss": 0.63720185, - "num_input_tokens_seen": 263275880, - "step": 12201, - "time_per_iteration": 2.6798954010009766 - }, - { - "auxiliary_loss_clip": 0.0107342, - "auxiliary_loss_mlp": 0.00770019, - "balance_loss_clip": 1.03417397, - "balance_loss_mlp": 1.00014138, - "epoch": 0.7336239290545619, - "flos": 34933289679360.0, - "grad_norm": 2.0307356501592526, - "language_loss": 0.52253979, - "learning_rate": 6.991577889352264e-07, - "loss": 0.5409742, - "num_input_tokens_seen": 263298315, - "step": 12202, - "time_per_iteration": 2.8340702056884766 - }, - { - "auxiliary_loss_clip": 0.01087087, - "auxiliary_loss_mlp": 0.01030166, - "balance_loss_clip": 1.03677177, - "balance_loss_mlp": 1.017923, - "epoch": 0.7336840523072298, - "flos": 21103049456640.0, - "grad_norm": 1.7212231979753123, - "language_loss": 0.68485624, - "learning_rate": 6.98861988704645e-07, - "loss": 0.70602876, - "num_input_tokens_seen": 263318615, - "step": 12203, - "time_per_iteration": 2.642812967300415 - }, - { - "auxiliary_loss_clip": 0.01088423, - "auxiliary_loss_mlp": 0.01037493, - "balance_loss_clip": 1.03938603, - "balance_loss_mlp": 1.02476776, - "epoch": 0.7337441755598978, - "flos": 24024418496640.0, - "grad_norm": 2.034834601717817, - "language_loss": 0.6607222, - "learning_rate": 6.985662378133474e-07, - "loss": 0.68198133, - "num_input_tokens_seen": 263336705, - "step": 12204, - "time_per_iteration": 2.74241042137146 - }, - { - "auxiliary_loss_clip": 0.01089625, - "auxiliary_loss_mlp": 0.01034455, - "balance_loss_clip": 1.04081655, - "balance_loss_mlp": 1.02211094, - "epoch": 0.7338042988125658, - "flos": 22711309004160.0, - "grad_norm": 1.8580582529828333, - "language_loss": 0.77225935, - "learning_rate": 6.982705362725479e-07, - "loss": 0.79350007, - "num_input_tokens_seen": 263355065, - "step": 12205, - "time_per_iteration": 2.6422648429870605 - }, - { - "auxiliary_loss_clip": 0.01058875, - "auxiliary_loss_mlp": 0.01032513, - "balance_loss_clip": 1.03662992, - "balance_loss_mlp": 1.02064013, - "epoch": 0.7338644220652337, - "flos": 21360996000000.0, - "grad_norm": 2.159301504218906, - "language_loss": 0.79434526, - "learning_rate": 6.979748840934601e-07, - "loss": 0.8152591, - "num_input_tokens_seen": 263374460, - "step": 12206, - "time_per_iteration": 2.722921848297119 - }, - { - "auxiliary_loss_clip": 0.01071317, - "auxiliary_loss_mlp": 0.01031287, - "balance_loss_clip": 1.03451514, - "balance_loss_mlp": 1.01825154, - "epoch": 0.7339245453179017, - "flos": 30920236536960.0, - "grad_norm": 2.0535884600804817, - "language_loss": 0.71176481, - "learning_rate": 6.976792812872958e-07, - "loss": 0.73279089, - "num_input_tokens_seen": 263393610, - "step": 12207, - "time_per_iteration": 2.9302005767822266 - }, - { - "auxiliary_loss_clip": 0.01014266, - "auxiliary_loss_mlp": 0.01003684, - "balance_loss_clip": 1.01024389, - "balance_loss_mlp": 1.00252759, - "epoch": 0.7339846685705697, - "flos": 67899429072000.0, - "grad_norm": 0.7780632600453249, - "language_loss": 0.54746544, - "learning_rate": 6.97383727865263e-07, - "loss": 0.56764495, - "num_input_tokens_seen": 263450340, - "step": 12208, - "time_per_iteration": 3.267242431640625 - }, - { - "auxiliary_loss_clip": 0.01111313, - "auxiliary_loss_mlp": 0.01029971, - "balance_loss_clip": 1.03991294, - "balance_loss_mlp": 1.01901555, - "epoch": 0.7340447918232377, - "flos": 22236749493120.0, - "grad_norm": 1.4520136816915177, - "language_loss": 0.8051306, - "learning_rate": 6.970882238385703e-07, - "loss": 0.82654339, - "num_input_tokens_seen": 263471735, - "step": 12209, - "time_per_iteration": 2.6250216960906982 - }, - { - "auxiliary_loss_clip": 0.01108587, - "auxiliary_loss_mlp": 0.01033248, - "balance_loss_clip": 1.0370816, - "balance_loss_mlp": 1.02134514, - "epoch": 0.7341049150759056, - "flos": 23764784014080.0, - "grad_norm": 1.461722216284673, - "language_loss": 0.79026657, - "learning_rate": 6.96792769218423e-07, - "loss": 0.81168497, - "num_input_tokens_seen": 263493245, - "step": 12210, - "time_per_iteration": 2.5592970848083496 - }, - { - "auxiliary_loss_clip": 0.01108387, - "auxiliary_loss_mlp": 0.01029679, - "balance_loss_clip": 1.03799284, - "balance_loss_mlp": 1.01709008, - "epoch": 0.7341650383285736, - "flos": 17236547804160.0, - "grad_norm": 1.73695170749579, - "language_loss": 0.76122808, - "learning_rate": 6.964973640160236e-07, - "loss": 0.78260869, - "num_input_tokens_seen": 263511660, - "step": 12211, - "time_per_iteration": 2.571751117706299 - }, - { - "auxiliary_loss_clip": 0.01087498, - "auxiliary_loss_mlp": 0.01031413, - "balance_loss_clip": 1.03891158, - "balance_loss_mlp": 1.018592, - "epoch": 0.7342251615812415, - "flos": 23403953940480.0, - "grad_norm": 6.531715121329498, - "language_loss": 0.71997905, - "learning_rate": 6.962020082425748e-07, - "loss": 0.74116814, - "num_input_tokens_seen": 263530875, - "step": 12212, - "time_per_iteration": 2.6509475708007812 - }, - { - "auxiliary_loss_clip": 0.01112722, - "auxiliary_loss_mlp": 0.01033281, - "balance_loss_clip": 1.04100943, - "balance_loss_mlp": 1.02054381, - "epoch": 0.7342852848339095, - "flos": 22747183712640.0, - "grad_norm": 1.5833725401172443, - "language_loss": 0.68744397, - "learning_rate": 6.959067019092766e-07, - "loss": 0.70890403, - "num_input_tokens_seen": 263551585, - "step": 12213, - "time_per_iteration": 2.5494189262390137 - }, - { - "auxiliary_loss_clip": 0.010305, - "auxiliary_loss_mlp": 0.01005419, - "balance_loss_clip": 1.00768566, - "balance_loss_mlp": 1.004251, - "epoch": 0.7343454080865774, - "flos": 53942353925760.0, - "grad_norm": 0.7305513742092771, - "language_loss": 0.54231656, - "learning_rate": 6.956114450273276e-07, - "loss": 0.56267571, - "num_input_tokens_seen": 263609545, - "step": 12214, - "time_per_iteration": 3.0239064693450928 - }, - { - "auxiliary_loss_clip": 0.01112827, - "auxiliary_loss_mlp": 0.01031447, - "balance_loss_clip": 1.03797483, - "balance_loss_mlp": 1.01904964, - "epoch": 0.7344055313392455, - "flos": 12166859255040.0, - "grad_norm": 1.9946109817082227, - "language_loss": 0.70621991, - "learning_rate": 6.953162376079233e-07, - "loss": 0.72766268, - "num_input_tokens_seen": 263627880, - "step": 12215, - "time_per_iteration": 2.5570547580718994 - }, - { - "auxiliary_loss_clip": 0.01082063, - "auxiliary_loss_mlp": 0.01033308, - "balance_loss_clip": 1.03650701, - "balance_loss_mlp": 1.02130389, - "epoch": 0.7344656545919134, - "flos": 18550052346240.0, - "grad_norm": 1.5883175175393598, - "language_loss": 0.72867477, - "learning_rate": 6.950210796622573e-07, - "loss": 0.74982846, - "num_input_tokens_seen": 263645665, - "step": 12216, - "time_per_iteration": 2.621229887008667 - }, - { - "auxiliary_loss_clip": 0.0111704, - "auxiliary_loss_mlp": 0.01039831, - "balance_loss_clip": 1.0392859, - "balance_loss_mlp": 1.02483487, - "epoch": 0.7345257778445814, - "flos": 23661649088640.0, - "grad_norm": 1.6902289453280186, - "language_loss": 0.78386879, - "learning_rate": 6.947259712015236e-07, - "loss": 0.80543745, - "num_input_tokens_seen": 263668170, - "step": 12217, - "time_per_iteration": 2.594928503036499 - }, - { - "auxiliary_loss_clip": 0.01072057, - "auxiliary_loss_mlp": 0.01027279, - "balance_loss_clip": 1.03669691, - "balance_loss_mlp": 1.01602566, - "epoch": 0.7345859010972494, - "flos": 13808659127040.0, - "grad_norm": 1.9223508730662753, - "language_loss": 0.77991557, - "learning_rate": 6.94430912236911e-07, - "loss": 0.80090904, - "num_input_tokens_seen": 263684190, - "step": 12218, - "time_per_iteration": 4.173985958099365 - }, - { - "auxiliary_loss_clip": 0.01060122, - "auxiliary_loss_mlp": 0.01038245, - "balance_loss_clip": 1.03246057, - "balance_loss_mlp": 1.02410722, - "epoch": 0.7346460243499173, - "flos": 22272731942400.0, - "grad_norm": 1.7300149246142222, - "language_loss": 0.71998847, - "learning_rate": 6.941359027796092e-07, - "loss": 0.74097216, - "num_input_tokens_seen": 263702095, - "step": 12219, - "time_per_iteration": 2.7360141277313232 - }, - { - "auxiliary_loss_clip": 0.01084965, - "auxiliary_loss_mlp": 0.01031146, - "balance_loss_clip": 1.03496408, - "balance_loss_mlp": 1.01936817, - "epoch": 0.7347061476025853, - "flos": 23255247634560.0, - "grad_norm": 6.086208044404794, - "language_loss": 0.74677491, - "learning_rate": 6.938409428408061e-07, - "loss": 0.76793599, - "num_input_tokens_seen": 263721385, - "step": 12220, - "time_per_iteration": 4.237574577331543 - }, - { - "auxiliary_loss_clip": 0.01101059, - "auxiliary_loss_mlp": 0.01032621, - "balance_loss_clip": 1.03634357, - "balance_loss_mlp": 1.02002692, - "epoch": 0.7347662708552533, - "flos": 15267565923840.0, - "grad_norm": 1.7582091320116324, - "language_loss": 0.65720487, - "learning_rate": 6.93546032431684e-07, - "loss": 0.67854166, - "num_input_tokens_seen": 263737835, - "step": 12221, - "time_per_iteration": 4.174748182296753 - }, - { - "auxiliary_loss_clip": 0.0108489, - "auxiliary_loss_mlp": 0.01039185, - "balance_loss_clip": 1.0352186, - "balance_loss_mlp": 1.02567315, - "epoch": 0.7348263941079213, - "flos": 24859987649280.0, - "grad_norm": 1.907694939441604, - "language_loss": 0.69323444, - "learning_rate": 6.932511715634273e-07, - "loss": 0.71447521, - "num_input_tokens_seen": 263756480, - "step": 12222, - "time_per_iteration": 2.704784393310547 - }, - { - "auxiliary_loss_clip": 0.01063424, - "auxiliary_loss_mlp": 0.01030995, - "balance_loss_clip": 1.03514957, - "balance_loss_mlp": 1.01988506, - "epoch": 0.7348865173605892, - "flos": 24352103295360.0, - "grad_norm": 1.9184398882939155, - "language_loss": 0.66062474, - "learning_rate": 6.92956360247217e-07, - "loss": 0.68156886, - "num_input_tokens_seen": 263776440, - "step": 12223, - "time_per_iteration": 2.8198130130767822 - }, - { - "auxiliary_loss_clip": 0.01094086, - "auxiliary_loss_mlp": 0.01029505, - "balance_loss_clip": 1.03635502, - "balance_loss_mlp": 1.01708925, - "epoch": 0.7349466406132572, - "flos": 20004613597440.0, - "grad_norm": 1.6947927626477597, - "language_loss": 0.72573948, - "learning_rate": 6.926615984942332e-07, - "loss": 0.7469753, - "num_input_tokens_seen": 263793700, - "step": 12224, - "time_per_iteration": 4.08525276184082 - }, - { - "auxiliary_loss_clip": 0.01085057, - "auxiliary_loss_mlp": 0.01029564, - "balance_loss_clip": 1.04095888, - "balance_loss_mlp": 1.01713049, - "epoch": 0.7350067638659251, - "flos": 29825068815360.0, - "grad_norm": 1.830057292997908, - "language_loss": 0.72199714, - "learning_rate": 6.92366886315652e-07, - "loss": 0.74314332, - "num_input_tokens_seen": 263814620, - "step": 12225, - "time_per_iteration": 2.736055850982666 - }, - { - "auxiliary_loss_clip": 0.0111514, - "auxiliary_loss_mlp": 0.01035041, - "balance_loss_clip": 1.03917527, - "balance_loss_mlp": 1.02134943, - "epoch": 0.7350668871185931, - "flos": 21866150920320.0, - "grad_norm": 1.7365051701265057, - "language_loss": 0.76401973, - "learning_rate": 6.920722237226501e-07, - "loss": 0.78552151, - "num_input_tokens_seen": 263832725, - "step": 12226, - "time_per_iteration": 2.578805446624756 - }, - { - "auxiliary_loss_clip": 0.01085278, - "auxiliary_loss_mlp": 0.0103433, - "balance_loss_clip": 1.03646374, - "balance_loss_mlp": 1.01977456, - "epoch": 0.735127010371261, - "flos": 22566122231040.0, - "grad_norm": 1.442598448518307, - "language_loss": 0.6717149, - "learning_rate": 6.917776107264008e-07, - "loss": 0.69291103, - "num_input_tokens_seen": 263853850, - "step": 12227, - "time_per_iteration": 2.638720989227295 - }, - { - "auxiliary_loss_clip": 0.01101144, - "auxiliary_loss_mlp": 0.0103552, - "balance_loss_clip": 1.03755474, - "balance_loss_mlp": 1.02331293, - "epoch": 0.7351871336239291, - "flos": 25884339707520.0, - "grad_norm": 2.1955172179062536, - "language_loss": 0.63554502, - "learning_rate": 6.914830473380749e-07, - "loss": 0.65691161, - "num_input_tokens_seen": 263874760, - "step": 12228, - "time_per_iteration": 2.646679162979126 - }, - { - "auxiliary_loss_clip": 0.0109047, - "auxiliary_loss_mlp": 0.01036115, - "balance_loss_clip": 1.03838301, - "balance_loss_mlp": 1.02450967, - "epoch": 0.735247256876597, - "flos": 17932173569280.0, - "grad_norm": 1.6447533892101769, - "language_loss": 0.63384873, - "learning_rate": 6.911885335688427e-07, - "loss": 0.65511459, - "num_input_tokens_seen": 263893390, - "step": 12229, - "time_per_iteration": 2.626433849334717 - }, - { - "auxiliary_loss_clip": 0.01087319, - "auxiliary_loss_mlp": 0.01037821, - "balance_loss_clip": 1.03916466, - "balance_loss_mlp": 1.02470779, - "epoch": 0.735307380129265, - "flos": 28875159694080.0, - "grad_norm": 1.6569871387550634, - "language_loss": 0.73374206, - "learning_rate": 6.908940694298726e-07, - "loss": 0.75499344, - "num_input_tokens_seen": 263911180, - "step": 12230, - "time_per_iteration": 2.719008207321167 - }, - { - "auxiliary_loss_clip": 0.01058297, - "auxiliary_loss_mlp": 0.01032553, - "balance_loss_clip": 1.03558922, - "balance_loss_mlp": 1.0192132, - "epoch": 0.7353675033819329, - "flos": 13625658311040.0, - "grad_norm": 2.410798964065256, - "language_loss": 0.72446096, - "learning_rate": 6.90599654932332e-07, - "loss": 0.74536955, - "num_input_tokens_seen": 263928975, - "step": 12231, - "time_per_iteration": 2.7233800888061523 - }, - { - "auxiliary_loss_clip": 0.01102609, - "auxiliary_loss_mlp": 0.01037553, - "balance_loss_clip": 1.0392592, - "balance_loss_mlp": 1.0230689, - "epoch": 0.7354276266346009, - "flos": 19463081178240.0, - "grad_norm": 2.5985105749536332, - "language_loss": 0.63813508, - "learning_rate": 6.903052900873823e-07, - "loss": 0.65953672, - "num_input_tokens_seen": 263944495, - "step": 12232, - "time_per_iteration": 2.626089334487915 - }, - { - "auxiliary_loss_clip": 0.0109166, - "auxiliary_loss_mlp": 0.01032177, - "balance_loss_clip": 1.03764665, - "balance_loss_mlp": 1.01987481, - "epoch": 0.735487749887269, - "flos": 15771858917760.0, - "grad_norm": 1.7852756816189446, - "language_loss": 0.75511599, - "learning_rate": 6.900109749061874e-07, - "loss": 0.77635431, - "num_input_tokens_seen": 263961325, - "step": 12233, - "time_per_iteration": 2.614691972732544 - }, - { - "auxiliary_loss_clip": 0.01112187, - "auxiliary_loss_mlp": 0.01028949, - "balance_loss_clip": 1.03919733, - "balance_loss_mlp": 1.01619315, - "epoch": 0.7355478731399369, - "flos": 18260648467200.0, - "grad_norm": 4.244761548872676, - "language_loss": 0.73351365, - "learning_rate": 6.897167093999079e-07, - "loss": 0.75492501, - "num_input_tokens_seen": 263980445, - "step": 12234, - "time_per_iteration": 2.5742101669311523 - }, - { - "auxiliary_loss_clip": 0.01099473, - "auxiliary_loss_mlp": 0.0103086, - "balance_loss_clip": 1.03804564, - "balance_loss_mlp": 1.01721096, - "epoch": 0.7356079963926049, - "flos": 26542043688960.0, - "grad_norm": 2.1824026453384078, - "language_loss": 0.59852672, - "learning_rate": 6.894224935797017e-07, - "loss": 0.61983013, - "num_input_tokens_seen": 263999330, - "step": 12235, - "time_per_iteration": 2.661247730255127 - }, - { - "auxiliary_loss_clip": 0.01088694, - "auxiliary_loss_mlp": 0.01027233, - "balance_loss_clip": 1.03844726, - "balance_loss_mlp": 1.01487708, - "epoch": 0.7356681196452728, - "flos": 10778624467200.0, - "grad_norm": 2.763935396627176, - "language_loss": 0.85834122, - "learning_rate": 6.891283274567259e-07, - "loss": 0.87950051, - "num_input_tokens_seen": 264014150, - "step": 12236, - "time_per_iteration": 2.589035749435425 - }, - { - "auxiliary_loss_clip": 0.0110083, - "auxiliary_loss_mlp": 0.00769741, - "balance_loss_clip": 1.03816271, - "balance_loss_mlp": 1.00019503, - "epoch": 0.7357282428979408, - "flos": 19718693337600.0, - "grad_norm": 5.1654234015242215, - "language_loss": 0.69555867, - "learning_rate": 6.888342110421364e-07, - "loss": 0.71426433, - "num_input_tokens_seen": 264033140, - "step": 12237, - "time_per_iteration": 2.652851104736328 - }, - { - "auxiliary_loss_clip": 0.01022711, - "auxiliary_loss_mlp": 0.01031396, - "balance_loss_clip": 1.02870941, - "balance_loss_mlp": 1.01868236, - "epoch": 0.7357883661506087, - "flos": 19464014931840.0, - "grad_norm": 1.6842160267600648, - "language_loss": 0.72287041, - "learning_rate": 6.885401443470839e-07, - "loss": 0.74341154, - "num_input_tokens_seen": 264052105, - "step": 12238, - "time_per_iteration": 2.887967586517334 - }, - { - "auxiliary_loss_clip": 0.0108237, - "auxiliary_loss_mlp": 0.01030077, - "balance_loss_clip": 1.03519797, - "balance_loss_mlp": 1.01672542, - "epoch": 0.7358484894032767, - "flos": 27123006263040.0, - "grad_norm": 2.119394608491001, - "language_loss": 0.72818553, - "learning_rate": 6.882461273827205e-07, - "loss": 0.74930996, - "num_input_tokens_seen": 264070690, - "step": 12239, - "time_per_iteration": 3.308215618133545 - }, - { - "auxiliary_loss_clip": 0.01079481, - "auxiliary_loss_mlp": 0.0103047, - "balance_loss_clip": 1.03759682, - "balance_loss_mlp": 1.01827478, - "epoch": 0.7359086126559446, - "flos": 24502282058880.0, - "grad_norm": 1.656407411551667, - "language_loss": 0.78889048, - "learning_rate": 6.879521601601954e-07, - "loss": 0.80998993, - "num_input_tokens_seen": 264094225, - "step": 12240, - "time_per_iteration": 2.6716065406799316 - }, - { - "auxiliary_loss_clip": 0.01101629, - "auxiliary_loss_mlp": 0.01037535, - "balance_loss_clip": 1.03955805, - "balance_loss_mlp": 1.02480888, - "epoch": 0.7359687359086127, - "flos": 23331270769920.0, - "grad_norm": 1.888852774104125, - "language_loss": 0.82579136, - "learning_rate": 6.876582426906565e-07, - "loss": 0.84718299, - "num_input_tokens_seen": 264113190, - "step": 12241, - "time_per_iteration": 2.687603712081909 - }, - { - "auxiliary_loss_clip": 0.01097273, - "auxiliary_loss_mlp": 0.01025951, - "balance_loss_clip": 1.03536153, - "balance_loss_mlp": 1.01373816, - "epoch": 0.7360288591612806, - "flos": 20193396503040.0, - "grad_norm": 1.823724311239111, - "language_loss": 0.78747702, - "learning_rate": 6.873643749852484e-07, - "loss": 0.80870926, - "num_input_tokens_seen": 264132050, - "step": 12242, - "time_per_iteration": 2.6332826614379883 - }, - { - "auxiliary_loss_clip": 0.01062855, - "auxiliary_loss_mlp": 0.01032129, - "balance_loss_clip": 1.03485787, - "balance_loss_mlp": 1.01942182, - "epoch": 0.7360889824139486, - "flos": 24972783333120.0, - "grad_norm": 1.7248872165867588, - "language_loss": 0.79574555, - "learning_rate": 6.870705570551145e-07, - "loss": 0.81669545, - "num_input_tokens_seen": 264152800, - "step": 12243, - "time_per_iteration": 2.6513876914978027 - }, - { - "auxiliary_loss_clip": 0.01101249, - "auxiliary_loss_mlp": 0.01032749, - "balance_loss_clip": 1.03733206, - "balance_loss_mlp": 1.01998186, - "epoch": 0.7361491056666165, - "flos": 15012312900480.0, - "grad_norm": 2.291279589424139, - "language_loss": 0.74445826, - "learning_rate": 6.867767889113969e-07, - "loss": 0.76579821, - "num_input_tokens_seen": 264169650, - "step": 12244, - "time_per_iteration": 2.4683594703674316 - }, - { - "auxiliary_loss_clip": 0.01094664, - "auxiliary_loss_mlp": 0.01032807, - "balance_loss_clip": 1.03583598, - "balance_loss_mlp": 1.02007556, - "epoch": 0.7362092289192845, - "flos": 22930400010240.0, - "grad_norm": 1.867590406442262, - "language_loss": 0.69203222, - "learning_rate": 6.864830705652347e-07, - "loss": 0.7133069, - "num_input_tokens_seen": 264190530, - "step": 12245, - "time_per_iteration": 2.687621831893921 - }, - { - "auxiliary_loss_clip": 0.01072242, - "auxiliary_loss_mlp": 0.01034229, - "balance_loss_clip": 1.03500962, - "balance_loss_mlp": 1.02093101, - "epoch": 0.7362693521719526, - "flos": 20702681487360.0, - "grad_norm": 1.5504904420549481, - "language_loss": 0.73484623, - "learning_rate": 6.861894020277658e-07, - "loss": 0.75591099, - "num_input_tokens_seen": 264210820, - "step": 12246, - "time_per_iteration": 2.73628568649292 - }, - { - "auxiliary_loss_clip": 0.01084679, - "auxiliary_loss_mlp": 0.01025875, - "balance_loss_clip": 1.03512716, - "balance_loss_mlp": 1.01378119, - "epoch": 0.7363294754246205, - "flos": 13111381336320.0, - "grad_norm": 2.1569575321455163, - "language_loss": 0.73685145, - "learning_rate": 6.858957833101266e-07, - "loss": 0.75795692, - "num_input_tokens_seen": 264227430, - "step": 12247, - "time_per_iteration": 2.5930237770080566 - }, - { - "auxiliary_loss_clip": 0.01101325, - "auxiliary_loss_mlp": 0.01032427, - "balance_loss_clip": 1.04162931, - "balance_loss_mlp": 1.02031505, - "epoch": 0.7363895986772885, - "flos": 14027426910720.0, - "grad_norm": 1.6102027523975817, - "language_loss": 0.7423265, - "learning_rate": 6.856022144234526e-07, - "loss": 0.76366401, - "num_input_tokens_seen": 264245230, - "step": 12248, - "time_per_iteration": 2.5792789459228516 - }, - { - "auxiliary_loss_clip": 0.0109033, - "auxiliary_loss_mlp": 0.01033502, - "balance_loss_clip": 1.03816319, - "balance_loss_mlp": 1.02057934, - "epoch": 0.7364497219299564, - "flos": 19719986227200.0, - "grad_norm": 1.8750204418443517, - "language_loss": 0.72477007, - "learning_rate": 6.853086953788727e-07, - "loss": 0.7460084, - "num_input_tokens_seen": 264263945, - "step": 12249, - "time_per_iteration": 2.624386787414551 - }, - { - "auxiliary_loss_clip": 0.01089724, - "auxiliary_loss_mlp": 0.01033559, - "balance_loss_clip": 1.03801394, - "balance_loss_mlp": 1.02015996, - "epoch": 0.7365098451826244, - "flos": 21361391049600.0, - "grad_norm": 2.586847113789983, - "language_loss": 0.77382159, - "learning_rate": 6.850152261875189e-07, - "loss": 0.7950545, - "num_input_tokens_seen": 264281500, - "step": 12250, - "time_per_iteration": 2.6388142108917236 - }, - { - "auxiliary_loss_clip": 0.01066882, - "auxiliary_loss_mlp": 0.01031238, - "balance_loss_clip": 1.03667164, - "balance_loss_mlp": 1.01857233, - "epoch": 0.7365699684352923, - "flos": 23368222886400.0, - "grad_norm": 1.6519467305081468, - "language_loss": 0.71352232, - "learning_rate": 6.8472180686052e-07, - "loss": 0.73450345, - "num_input_tokens_seen": 264301625, - "step": 12251, - "time_per_iteration": 2.7391629219055176 - }, - { - "auxiliary_loss_clip": 0.01095208, - "auxiliary_loss_mlp": 0.01035371, - "balance_loss_clip": 1.03801441, - "balance_loss_mlp": 1.0229789, - "epoch": 0.7366300916879603, - "flos": 59524879927680.0, - "grad_norm": 1.575545988693255, - "language_loss": 0.65908438, - "learning_rate": 6.844284374090015e-07, - "loss": 0.68039018, - "num_input_tokens_seen": 264323975, - "step": 12252, - "time_per_iteration": 2.9795963764190674 - }, - { - "auxiliary_loss_clip": 0.0106263, - "auxiliary_loss_mlp": 0.01035896, - "balance_loss_clip": 1.03544736, - "balance_loss_mlp": 1.02261591, - "epoch": 0.7366902149406283, - "flos": 20923137210240.0, - "grad_norm": 1.669933486125426, - "language_loss": 0.79418141, - "learning_rate": 6.841351178440884e-07, - "loss": 0.81516671, - "num_input_tokens_seen": 264343785, - "step": 12253, - "time_per_iteration": 2.762692451477051 - }, - { - "auxiliary_loss_clip": 0.01107479, - "auxiliary_loss_mlp": 0.00769571, - "balance_loss_clip": 1.03836572, - "balance_loss_mlp": 1.00025702, - "epoch": 0.7367503381932963, - "flos": 17348158339200.0, - "grad_norm": 2.0410258772790604, - "language_loss": 0.76204622, - "learning_rate": 6.83841848176905e-07, - "loss": 0.78081673, - "num_input_tokens_seen": 264361130, - "step": 12254, - "time_per_iteration": 2.518159866333008 - }, - { - "auxiliary_loss_clip": 0.01085242, - "auxiliary_loss_mlp": 0.01042117, - "balance_loss_clip": 1.03690898, - "balance_loss_mlp": 1.02805638, - "epoch": 0.7368104614459642, - "flos": 17821317219840.0, - "grad_norm": 4.287032087933439, - "language_loss": 0.7025637, - "learning_rate": 6.835486284185692e-07, - "loss": 0.72383738, - "num_input_tokens_seen": 264376965, - "step": 12255, - "time_per_iteration": 2.589442729949951 - }, - { - "auxiliary_loss_clip": 0.0110157, - "auxiliary_loss_mlp": 0.01029844, - "balance_loss_clip": 1.03971469, - "balance_loss_mlp": 1.01649857, - "epoch": 0.7368705846986322, - "flos": 24606099342720.0, - "grad_norm": 1.8002690456311732, - "language_loss": 0.75496477, - "learning_rate": 6.832554585802012e-07, - "loss": 0.77627891, - "num_input_tokens_seen": 264396310, - "step": 12256, - "time_per_iteration": 2.6408097743988037 - }, - { - "auxiliary_loss_clip": 0.0110194, - "auxiliary_loss_mlp": 0.01031829, - "balance_loss_clip": 1.03902447, - "balance_loss_mlp": 1.01861525, - "epoch": 0.7369307079513001, - "flos": 34970169968640.0, - "grad_norm": 1.8159152177837306, - "language_loss": 0.73517919, - "learning_rate": 6.829623386729182e-07, - "loss": 0.75651693, - "num_input_tokens_seen": 264418085, - "step": 12257, - "time_per_iteration": 2.6984493732452393 - }, - { - "auxiliary_loss_clip": 0.01092873, - "auxiliary_loss_mlp": 0.01038875, - "balance_loss_clip": 1.03521228, - "balance_loss_mlp": 1.02668011, - "epoch": 0.7369908312039681, - "flos": 21214588164480.0, - "grad_norm": 1.793311215899037, - "language_loss": 0.78370535, - "learning_rate": 6.826692687078362e-07, - "loss": 0.80502284, - "num_input_tokens_seen": 264437595, - "step": 12258, - "time_per_iteration": 4.2666544914245605 - }, - { - "auxiliary_loss_clip": 0.01103154, - "auxiliary_loss_mlp": 0.0103457, - "balance_loss_clip": 1.03888559, - "balance_loss_mlp": 1.02195156, - "epoch": 0.7370509544566362, - "flos": 23623655477760.0, - "grad_norm": 1.4256743681063133, - "language_loss": 0.66447318, - "learning_rate": 6.823762486960674e-07, - "loss": 0.68585044, - "num_input_tokens_seen": 264457385, - "step": 12259, - "time_per_iteration": 2.6215436458587646 - }, - { - "auxiliary_loss_clip": 0.01101635, - "auxiliary_loss_mlp": 0.01036273, - "balance_loss_clip": 1.0403527, - "balance_loss_mlp": 1.02288604, - "epoch": 0.7371110777093041, - "flos": 24827704300800.0, - "grad_norm": 1.885600567170779, - "language_loss": 0.73500818, - "learning_rate": 6.820832786487225e-07, - "loss": 0.75638729, - "num_input_tokens_seen": 264477205, - "step": 12260, - "time_per_iteration": 5.883468866348267 - }, - { - "auxiliary_loss_clip": 0.01096844, - "auxiliary_loss_mlp": 0.010343, - "balance_loss_clip": 1.0374378, - "balance_loss_mlp": 1.02105618, - "epoch": 0.7371712009619721, - "flos": 23149491016320.0, - "grad_norm": 1.6200420783650578, - "language_loss": 0.73566377, - "learning_rate": 6.817903585769125e-07, - "loss": 0.75697523, - "num_input_tokens_seen": 264497195, - "step": 12261, - "time_per_iteration": 2.611388683319092 - }, - { - "auxiliary_loss_clip": 0.01091123, - "auxiliary_loss_mlp": 0.01034498, - "balance_loss_clip": 1.03705454, - "balance_loss_mlp": 1.02096152, - "epoch": 0.73723132421464, - "flos": 23112898035840.0, - "grad_norm": 1.9187106646052445, - "language_loss": 0.66943705, - "learning_rate": 6.814974884917438e-07, - "loss": 0.69069326, - "num_input_tokens_seen": 264516950, - "step": 12262, - "time_per_iteration": 2.605332374572754 - }, - { - "auxiliary_loss_clip": 0.01112628, - "auxiliary_loss_mlp": 0.01032891, - "balance_loss_clip": 1.03917944, - "balance_loss_mlp": 1.01943254, - "epoch": 0.737291447467308, - "flos": 19273328605440.0, - "grad_norm": 2.61578609371499, - "language_loss": 0.88660431, - "learning_rate": 6.81204668404322e-07, - "loss": 0.90805948, - "num_input_tokens_seen": 264532675, - "step": 12263, - "time_per_iteration": 4.228296279907227 - }, - { - "auxiliary_loss_clip": 0.01107513, - "auxiliary_loss_mlp": 0.01028636, - "balance_loss_clip": 1.03926718, - "balance_loss_mlp": 1.01731133, - "epoch": 0.7373515707199759, - "flos": 25118257415040.0, - "grad_norm": 1.6036669439356246, - "language_loss": 0.67279935, - "learning_rate": 6.809118983257522e-07, - "loss": 0.69416088, - "num_input_tokens_seen": 264555635, - "step": 12264, - "time_per_iteration": 2.6264944076538086 - }, - { - "auxiliary_loss_clip": 0.01107424, - "auxiliary_loss_mlp": 0.01032446, - "balance_loss_clip": 1.0380187, - "balance_loss_mlp": 1.02020919, - "epoch": 0.737411693972644, - "flos": 32408481767040.0, - "grad_norm": 5.628920745941572, - "language_loss": 0.80262679, - "learning_rate": 6.806191782671356e-07, - "loss": 0.82402551, - "num_input_tokens_seen": 264573140, - "step": 12265, - "time_per_iteration": 2.6175074577331543 - }, - { - "auxiliary_loss_clip": 0.01104877, - "auxiliary_loss_mlp": 0.01031656, - "balance_loss_clip": 1.03860068, - "balance_loss_mlp": 1.01912761, - "epoch": 0.7374718172253119, - "flos": 24315797623680.0, - "grad_norm": 2.6431361651655094, - "language_loss": 0.74271613, - "learning_rate": 6.803265082395711e-07, - "loss": 0.76408148, - "num_input_tokens_seen": 264591610, - "step": 12266, - "time_per_iteration": 2.6342427730560303 - }, - { - "auxiliary_loss_clip": 0.01102733, - "auxiliary_loss_mlp": 0.01039895, - "balance_loss_clip": 1.03989673, - "balance_loss_mlp": 1.02624547, - "epoch": 0.7375319404779799, - "flos": 27156115624320.0, - "grad_norm": 1.6143075154919249, - "language_loss": 0.72911859, - "learning_rate": 6.800338882541576e-07, - "loss": 0.75054485, - "num_input_tokens_seen": 264611170, - "step": 12267, - "time_per_iteration": 2.638545036315918 - }, - { - "auxiliary_loss_clip": 0.01075616, - "auxiliary_loss_mlp": 0.01036972, - "balance_loss_clip": 1.03733301, - "balance_loss_mlp": 1.02528942, - "epoch": 0.7375920637306478, - "flos": 18879999701760.0, - "grad_norm": 2.114502804369275, - "language_loss": 0.83173954, - "learning_rate": 6.797413183219923e-07, - "loss": 0.85286546, - "num_input_tokens_seen": 264629365, - "step": 12268, - "time_per_iteration": 2.6624231338500977 - }, - { - "auxiliary_loss_clip": 0.0111022, - "auxiliary_loss_mlp": 0.01043154, - "balance_loss_clip": 1.03934455, - "balance_loss_mlp": 1.03039253, - "epoch": 0.7376521869833158, - "flos": 15669765486720.0, - "grad_norm": 1.8306850804928718, - "language_loss": 0.73056579, - "learning_rate": 6.794487984541677e-07, - "loss": 0.75209951, - "num_input_tokens_seen": 264647915, - "step": 12269, - "time_per_iteration": 2.5542378425598145 - }, - { - "auxiliary_loss_clip": 0.01086703, - "auxiliary_loss_mlp": 0.01036517, - "balance_loss_clip": 1.03575897, - "balance_loss_mlp": 1.02278399, - "epoch": 0.7377123102359837, - "flos": 36971973901440.0, - "grad_norm": 2.033998429253707, - "language_loss": 0.70437771, - "learning_rate": 6.791563286617776e-07, - "loss": 0.72560984, - "num_input_tokens_seen": 264669620, - "step": 12270, - "time_per_iteration": 2.738266706466675 - }, - { - "auxiliary_loss_clip": 0.01096302, - "auxiliary_loss_mlp": 0.01032958, - "balance_loss_clip": 1.03592134, - "balance_loss_mlp": 1.02121568, - "epoch": 0.7377724334886517, - "flos": 24496284487680.0, - "grad_norm": 1.5966861797114758, - "language_loss": 0.69652647, - "learning_rate": 6.788639089559119e-07, - "loss": 0.71781904, - "num_input_tokens_seen": 264689345, - "step": 12271, - "time_per_iteration": 2.664652109146118 - }, - { - "auxiliary_loss_clip": 0.01080906, - "auxiliary_loss_mlp": 0.01034393, - "balance_loss_clip": 1.03928661, - "balance_loss_mlp": 1.02066565, - "epoch": 0.7378325567413198, - "flos": 24390025079040.0, - "grad_norm": 2.652639550639501, - "language_loss": 0.67802662, - "learning_rate": 6.785715393476586e-07, - "loss": 0.69917965, - "num_input_tokens_seen": 264707625, - "step": 12272, - "time_per_iteration": 2.6848604679107666 - }, - { - "auxiliary_loss_clip": 0.01086013, - "auxiliary_loss_mlp": 0.01030527, - "balance_loss_clip": 1.03750646, - "balance_loss_mlp": 1.01848674, - "epoch": 0.7378926799939877, - "flos": 17416388223360.0, - "grad_norm": 2.2309811346874655, - "language_loss": 0.780334, - "learning_rate": 6.782792198481049e-07, - "loss": 0.80149937, - "num_input_tokens_seen": 264725575, - "step": 12273, - "time_per_iteration": 2.635556936264038 - }, - { - "auxiliary_loss_clip": 0.01109904, - "auxiliary_loss_mlp": 0.01030975, - "balance_loss_clip": 1.03768742, - "balance_loss_mlp": 1.01857686, - "epoch": 0.7379528032466557, - "flos": 18474208778880.0, - "grad_norm": 1.8331912360811773, - "language_loss": 0.83564162, - "learning_rate": 6.779869504683355e-07, - "loss": 0.85705042, - "num_input_tokens_seen": 264742855, - "step": 12274, - "time_per_iteration": 2.5715138912200928 - }, - { - "auxiliary_loss_clip": 0.01091523, - "auxiliary_loss_mlp": 0.00771783, - "balance_loss_clip": 1.03963578, - "balance_loss_mlp": 1.00021505, - "epoch": 0.7380129264993236, - "flos": 17821999578240.0, - "grad_norm": 2.3015182106996237, - "language_loss": 0.73600042, - "learning_rate": 6.776947312194341e-07, - "loss": 0.75463349, - "num_input_tokens_seen": 264761155, - "step": 12275, - "time_per_iteration": 2.715363025665283 - }, - { - "auxiliary_loss_clip": 0.01078211, - "auxiliary_loss_mlp": 0.01054085, - "balance_loss_clip": 1.03664327, - "balance_loss_mlp": 1.03894567, - "epoch": 0.7380730497519916, - "flos": 22997372918400.0, - "grad_norm": 1.6539392854769155, - "language_loss": 0.73462373, - "learning_rate": 6.774025621124813e-07, - "loss": 0.75594664, - "num_input_tokens_seen": 264780660, - "step": 12276, - "time_per_iteration": 2.7231481075286865 - }, - { - "auxiliary_loss_clip": 0.01112925, - "auxiliary_loss_mlp": 0.01031524, - "balance_loss_clip": 1.03907084, - "balance_loss_mlp": 1.01874495, - "epoch": 0.7381331730046595, - "flos": 20266259241600.0, - "grad_norm": 1.9864441113033549, - "language_loss": 0.7796191, - "learning_rate": 6.771104431585551e-07, - "loss": 0.80106354, - "num_input_tokens_seen": 264798850, - "step": 12277, - "time_per_iteration": 2.5575850009918213 - }, - { - "auxiliary_loss_clip": 0.01110863, - "auxiliary_loss_mlp": 0.01038626, - "balance_loss_clip": 1.03995776, - "balance_loss_mlp": 1.0259068, - "epoch": 0.7381932962573275, - "flos": 19754532132480.0, - "grad_norm": 2.416998693757566, - "language_loss": 0.78511059, - "learning_rate": 6.768183743687338e-07, - "loss": 0.80660546, - "num_input_tokens_seen": 264816795, - "step": 12278, - "time_per_iteration": 2.542168617248535 - }, - { - "auxiliary_loss_clip": 0.01102779, - "auxiliary_loss_mlp": 0.00771237, - "balance_loss_clip": 1.03840756, - "balance_loss_mlp": 1.00024569, - "epoch": 0.7382534195099955, - "flos": 17305316392320.0, - "grad_norm": 2.0236332127409, - "language_loss": 0.72539043, - "learning_rate": 6.765263557540921e-07, - "loss": 0.74413061, - "num_input_tokens_seen": 264834105, - "step": 12279, - "time_per_iteration": 2.612534761428833 - }, - { - "auxiliary_loss_clip": 0.01103104, - "auxiliary_loss_mlp": 0.01035418, - "balance_loss_clip": 1.03738606, - "balance_loss_mlp": 1.02173257, - "epoch": 0.7383135427626635, - "flos": 18697358021760.0, - "grad_norm": 2.394018024730235, - "language_loss": 0.86069536, - "learning_rate": 6.762343873257034e-07, - "loss": 0.88208055, - "num_input_tokens_seen": 264850895, - "step": 12280, - "time_per_iteration": 2.611475944519043 - }, - { - "auxiliary_loss_clip": 0.01073789, - "auxiliary_loss_mlp": 0.01032032, - "balance_loss_clip": 1.03634775, - "balance_loss_mlp": 1.01885295, - "epoch": 0.7383736660153314, - "flos": 20881300844160.0, - "grad_norm": 1.8693617932134328, - "language_loss": 0.72391272, - "learning_rate": 6.759424690946408e-07, - "loss": 0.74497092, - "num_input_tokens_seen": 264869505, - "step": 12281, - "time_per_iteration": 2.718876361846924 - }, - { - "auxiliary_loss_clip": 0.0106943, - "auxiliary_loss_mlp": 0.01035051, - "balance_loss_clip": 1.0354619, - "balance_loss_mlp": 1.02190232, - "epoch": 0.7384337892679994, - "flos": 20663215418880.0, - "grad_norm": 1.705222549149129, - "language_loss": 0.60742152, - "learning_rate": 6.756506010719711e-07, - "loss": 0.62846637, - "num_input_tokens_seen": 264886915, - "step": 12282, - "time_per_iteration": 2.70023775100708 - }, - { - "auxiliary_loss_clip": 0.01077848, - "auxiliary_loss_mlp": 0.01030119, - "balance_loss_clip": 1.03686452, - "balance_loss_mlp": 1.01697028, - "epoch": 0.7384939125206673, - "flos": 29169627390720.0, - "grad_norm": 1.8611774916735326, - "language_loss": 0.6824851, - "learning_rate": 6.753587832687632e-07, - "loss": 0.70356476, - "num_input_tokens_seen": 264910350, - "step": 12283, - "time_per_iteration": 2.758152484893799 - }, - { - "auxiliary_loss_clip": 0.01112935, - "auxiliary_loss_mlp": 0.00771245, - "balance_loss_clip": 1.040452, - "balance_loss_mlp": 1.00015855, - "epoch": 0.7385540357733353, - "flos": 36312833376000.0, - "grad_norm": 1.7271477850401677, - "language_loss": 0.76260293, - "learning_rate": 6.750670156960832e-07, - "loss": 0.78144467, - "num_input_tokens_seen": 264930705, - "step": 12284, - "time_per_iteration": 2.7076218128204346 - }, - { - "auxiliary_loss_clip": 0.01094916, - "auxiliary_loss_mlp": 0.01035376, - "balance_loss_clip": 1.03557301, - "balance_loss_mlp": 1.02121985, - "epoch": 0.7386141590260034, - "flos": 20302600826880.0, - "grad_norm": 1.9358750531249929, - "language_loss": 0.68962932, - "learning_rate": 6.747752983649954e-07, - "loss": 0.7109322, - "num_input_tokens_seen": 264946975, - "step": 12285, - "time_per_iteration": 2.572366714477539 - }, - { - "auxiliary_loss_clip": 0.01095815, - "auxiliary_loss_mlp": 0.01038084, - "balance_loss_clip": 1.03904641, - "balance_loss_mlp": 1.02421951, - "epoch": 0.7386742822786713, - "flos": 25483792170240.0, - "grad_norm": 1.9975794318154387, - "language_loss": 0.79803824, - "learning_rate": 6.744836312865602e-07, - "loss": 0.81937724, - "num_input_tokens_seen": 264967665, - "step": 12286, - "time_per_iteration": 2.6924288272857666 - }, - { - "auxiliary_loss_clip": 0.01062201, - "auxiliary_loss_mlp": 0.01027877, - "balance_loss_clip": 1.03638017, - "balance_loss_mlp": 1.01515102, - "epoch": 0.7387344055313393, - "flos": 13771958405760.0, - "grad_norm": 2.075219582835579, - "language_loss": 0.65311086, - "learning_rate": 6.741920144718396e-07, - "loss": 0.67401159, - "num_input_tokens_seen": 264985480, - "step": 12287, - "time_per_iteration": 2.7654411792755127 - }, - { - "auxiliary_loss_clip": 0.010848, - "auxiliary_loss_mlp": 0.01026868, - "balance_loss_clip": 1.03562939, - "balance_loss_mlp": 1.01483417, - "epoch": 0.7387945287840072, - "flos": 27855189095040.0, - "grad_norm": 2.1085520874155046, - "language_loss": 0.76855958, - "learning_rate": 6.739004479318903e-07, - "loss": 0.78967619, - "num_input_tokens_seen": 265004790, - "step": 12288, - "time_per_iteration": 2.6597485542297363 - }, - { - "auxiliary_loss_clip": 0.01104274, - "auxiliary_loss_mlp": 0.00771655, - "balance_loss_clip": 1.04053795, - "balance_loss_mlp": 1.00024295, - "epoch": 0.7388546520366752, - "flos": 44233039388160.0, - "grad_norm": 1.5714095328418676, - "language_loss": 0.58359075, - "learning_rate": 6.736089316777684e-07, - "loss": 0.60235, - "num_input_tokens_seen": 265028790, - "step": 12289, - "time_per_iteration": 2.790731906890869 - }, - { - "auxiliary_loss_clip": 0.01031232, - "auxiliary_loss_mlp": 0.00751213, - "balance_loss_clip": 1.00846362, - "balance_loss_mlp": 0.99965459, - "epoch": 0.7389147752893431, - "flos": 70680890638080.0, - "grad_norm": 0.6357735365195177, - "language_loss": 0.49246126, - "learning_rate": 6.733174657205287e-07, - "loss": 0.51028574, - "num_input_tokens_seen": 265096660, - "step": 12290, - "time_per_iteration": 3.243767261505127 - }, - { - "auxiliary_loss_clip": 0.01096247, - "auxiliary_loss_mlp": 0.01035221, - "balance_loss_clip": 1.03841698, - "balance_loss_mlp": 1.02171409, - "epoch": 0.7389748985420111, - "flos": 25994980575360.0, - "grad_norm": 3.780514148170796, - "language_loss": 0.67435575, - "learning_rate": 6.730260500712237e-07, - "loss": 0.69567037, - "num_input_tokens_seen": 265116375, - "step": 12291, - "time_per_iteration": 2.605470895767212 - }, - { - "auxiliary_loss_clip": 0.0099264, - "auxiliary_loss_mlp": 0.01000802, - "balance_loss_clip": 1.00994468, - "balance_loss_mlp": 0.99969369, - "epoch": 0.7390350217946791, - "flos": 54403661318400.0, - "grad_norm": 0.9871071197765896, - "language_loss": 0.60852838, - "learning_rate": 6.727346847409052e-07, - "loss": 0.62846279, - "num_input_tokens_seen": 265161230, - "step": 12292, - "time_per_iteration": 2.888421058654785 - }, - { - "auxiliary_loss_clip": 0.0106381, - "auxiliary_loss_mlp": 0.01034194, - "balance_loss_clip": 1.03513324, - "balance_loss_mlp": 1.0222311, - "epoch": 0.7390951450473471, - "flos": 32196968530560.0, - "grad_norm": 2.192815626746647, - "language_loss": 0.66975296, - "learning_rate": 6.724433697406191e-07, - "loss": 0.69073296, - "num_input_tokens_seen": 265182515, - "step": 12293, - "time_per_iteration": 2.8275856971740723 - }, - { - "auxiliary_loss_clip": 0.01100034, - "auxiliary_loss_mlp": 0.0103067, - "balance_loss_clip": 1.03730226, - "balance_loss_mlp": 1.01779556, - "epoch": 0.739155268300015, - "flos": 16684241304960.0, - "grad_norm": 1.9827271257615733, - "language_loss": 0.83464789, - "learning_rate": 6.721521050814134e-07, - "loss": 0.85595489, - "num_input_tokens_seen": 265198160, - "step": 12294, - "time_per_iteration": 2.597766160964966 - }, - { - "auxiliary_loss_clip": 0.01077206, - "auxiliary_loss_mlp": 0.01033056, - "balance_loss_clip": 1.03740942, - "balance_loss_mlp": 1.0197401, - "epoch": 0.739215391552683, - "flos": 31649761762560.0, - "grad_norm": 1.5365825507794162, - "language_loss": 0.72879148, - "learning_rate": 6.718608907743337e-07, - "loss": 0.74989408, - "num_input_tokens_seen": 265218480, - "step": 12295, - "time_per_iteration": 2.7728140354156494 - }, - { - "auxiliary_loss_clip": 0.0109979, - "auxiliary_loss_mlp": 0.01037539, - "balance_loss_clip": 1.03960156, - "balance_loss_mlp": 1.02521241, - "epoch": 0.7392755148053509, - "flos": 29718522097920.0, - "grad_norm": 2.087551297048025, - "language_loss": 0.7901718, - "learning_rate": 6.715697268304215e-07, - "loss": 0.81154513, - "num_input_tokens_seen": 265240165, - "step": 12296, - "time_per_iteration": 2.7069132328033447 - }, - { - "auxiliary_loss_clip": 0.01112194, - "auxiliary_loss_mlp": 0.01031879, - "balance_loss_clip": 1.03957283, - "balance_loss_mlp": 1.01797891, - "epoch": 0.7393356380580189, - "flos": 37050475075200.0, - "grad_norm": 2.421267182668315, - "language_loss": 0.66443473, - "learning_rate": 6.712786132607182e-07, - "loss": 0.68587548, - "num_input_tokens_seen": 265263295, - "step": 12297, - "time_per_iteration": 4.15710186958313 - }, - { - "auxiliary_loss_clip": 0.01086243, - "auxiliary_loss_mlp": 0.01038586, - "balance_loss_clip": 1.03743219, - "balance_loss_mlp": 1.02521062, - "epoch": 0.739395761310687, - "flos": 19719627091200.0, - "grad_norm": 2.031169028874948, - "language_loss": 0.68639588, - "learning_rate": 6.709875500762645e-07, - "loss": 0.70764422, - "num_input_tokens_seen": 265282740, - "step": 12298, - "time_per_iteration": 2.6803133487701416 - }, - { - "auxiliary_loss_clip": 0.01083526, - "auxiliary_loss_mlp": 0.0103472, - "balance_loss_clip": 1.03630257, - "balance_loss_mlp": 1.02177382, - "epoch": 0.7394558845633549, - "flos": 11801504067840.0, - "grad_norm": 1.810073219689882, - "language_loss": 0.7460804, - "learning_rate": 6.706965372880946e-07, - "loss": 0.76726282, - "num_input_tokens_seen": 265300175, - "step": 12299, - "time_per_iteration": 4.1317057609558105 - }, - { - "auxiliary_loss_clip": 0.01013835, - "auxiliary_loss_mlp": 0.00999495, - "balance_loss_clip": 1.0160886, - "balance_loss_mlp": 0.99818373, - "epoch": 0.7395160078160229, - "flos": 66195827850240.0, - "grad_norm": 0.7191377980528004, - "language_loss": 0.60850734, - "learning_rate": 6.704055749072455e-07, - "loss": 0.62864065, - "num_input_tokens_seen": 265363275, - "step": 12300, - "time_per_iteration": 4.986863136291504 - }, - { - "auxiliary_loss_clip": 0.01084534, - "auxiliary_loss_mlp": 0.01031775, - "balance_loss_clip": 1.03953075, - "balance_loss_mlp": 1.01876962, - "epoch": 0.7395761310686908, - "flos": 21249708687360.0, - "grad_norm": 1.6608612377328966, - "language_loss": 0.80444926, - "learning_rate": 6.7011466294475e-07, - "loss": 0.82561237, - "num_input_tokens_seen": 265382935, - "step": 12301, - "time_per_iteration": 2.635004997253418 - }, - { - "auxiliary_loss_clip": 0.01109746, - "auxiliary_loss_mlp": 0.01029708, - "balance_loss_clip": 1.03857565, - "balance_loss_mlp": 1.01823974, - "epoch": 0.7396362543213588, - "flos": 25955299025280.0, - "grad_norm": 1.5135415761232773, - "language_loss": 0.73152131, - "learning_rate": 6.698238014116406e-07, - "loss": 0.75291586, - "num_input_tokens_seen": 265403245, - "step": 12302, - "time_per_iteration": 2.612121105194092 - }, - { - "auxiliary_loss_clip": 0.01113143, - "auxiliary_loss_mlp": 0.01041216, - "balance_loss_clip": 1.03972757, - "balance_loss_mlp": 1.02819264, - "epoch": 0.7396963775740267, - "flos": 27377936064000.0, - "grad_norm": 6.478228728649492, - "language_loss": 0.73720932, - "learning_rate": 6.695329903189451e-07, - "loss": 0.75875294, - "num_input_tokens_seen": 265423105, - "step": 12303, - "time_per_iteration": 4.152388334274292 - }, - { - "auxiliary_loss_clip": 0.01109918, - "auxiliary_loss_mlp": 0.01030651, - "balance_loss_clip": 1.03906059, - "balance_loss_mlp": 1.01861048, - "epoch": 0.7397565008266948, - "flos": 25520133755520.0, - "grad_norm": 1.665147368260365, - "language_loss": 0.53981858, - "learning_rate": 6.692422296776927e-07, - "loss": 0.56122428, - "num_input_tokens_seen": 265443445, - "step": 12304, - "time_per_iteration": 2.6007986068725586 - }, - { - "auxiliary_loss_clip": 0.01088478, - "auxiliary_loss_mlp": 0.01038752, - "balance_loss_clip": 1.03643012, - "balance_loss_mlp": 1.02587104, - "epoch": 0.7398166240793627, - "flos": 23727760070400.0, - "grad_norm": 4.218553993502621, - "language_loss": 0.84787995, - "learning_rate": 6.689515194989084e-07, - "loss": 0.86915219, - "num_input_tokens_seen": 265462085, - "step": 12305, - "time_per_iteration": 2.7033863067626953 - }, - { - "auxiliary_loss_clip": 0.01007992, - "auxiliary_loss_mlp": 0.01002097, - "balance_loss_clip": 1.00802636, - "balance_loss_mlp": 1.00075579, - "epoch": 0.7398767473320307, - "flos": 67267582882560.0, - "grad_norm": 0.8984474927660691, - "language_loss": 0.57649475, - "learning_rate": 6.68660859793615e-07, - "loss": 0.59659564, - "num_input_tokens_seen": 265521190, - "step": 12306, - "time_per_iteration": 3.190584421157837 - }, - { - "auxiliary_loss_clip": 0.01091647, - "auxiliary_loss_mlp": 0.01034585, - "balance_loss_clip": 1.03991795, - "balance_loss_mlp": 1.02137649, - "epoch": 0.7399368705846986, - "flos": 22018699981440.0, - "grad_norm": 1.9564303795331826, - "language_loss": 0.81826288, - "learning_rate": 6.683702505728355e-07, - "loss": 0.83952522, - "num_input_tokens_seen": 265539705, - "step": 12307, - "time_per_iteration": 2.760991096496582 - }, - { - "auxiliary_loss_clip": 0.01094355, - "auxiliary_loss_mlp": 0.01035489, - "balance_loss_clip": 1.04020476, - "balance_loss_mlp": 1.0237112, - "epoch": 0.7399969938373666, - "flos": 14173870659840.0, - "grad_norm": 1.7875471048417528, - "language_loss": 0.69662929, - "learning_rate": 6.680796918475893e-07, - "loss": 0.71792769, - "num_input_tokens_seen": 265555855, - "step": 12308, - "time_per_iteration": 2.786059617996216 - }, - { - "auxiliary_loss_clip": 0.01080019, - "auxiliary_loss_mlp": 0.01030655, - "balance_loss_clip": 1.03736496, - "balance_loss_mlp": 1.01869845, - "epoch": 0.7400571170900345, - "flos": 25301473712640.0, - "grad_norm": 1.9234846760439523, - "language_loss": 0.81795132, - "learning_rate": 6.67789183628896e-07, - "loss": 0.83905804, - "num_input_tokens_seen": 265575455, - "step": 12309, - "time_per_iteration": 2.6756904125213623 - }, - { - "auxiliary_loss_clip": 0.01100831, - "auxiliary_loss_mlp": 0.01034821, - "balance_loss_clip": 1.03873348, - "balance_loss_mlp": 1.02133226, - "epoch": 0.7401172403427025, - "flos": 22711344917760.0, - "grad_norm": 3.264420183038049, - "language_loss": 0.72705656, - "learning_rate": 6.674987259277692e-07, - "loss": 0.74841309, - "num_input_tokens_seen": 265595250, - "step": 12310, - "time_per_iteration": 2.7013933658599854 - }, - { - "auxiliary_loss_clip": 0.01075917, - "auxiliary_loss_mlp": 0.01042964, - "balance_loss_clip": 1.0368607, - "balance_loss_mlp": 1.02921319, - "epoch": 0.7401773635953706, - "flos": 18067448188800.0, - "grad_norm": 2.4013054691194915, - "language_loss": 0.88485903, - "learning_rate": 6.672083187552239e-07, - "loss": 0.90604782, - "num_input_tokens_seen": 265606945, - "step": 12311, - "time_per_iteration": 2.6424548625946045 - }, - { - "auxiliary_loss_clip": 0.01046645, - "auxiliary_loss_mlp": 0.01029353, - "balance_loss_clip": 1.0324477, - "balance_loss_mlp": 1.01692545, - "epoch": 0.7402374868480385, - "flos": 22712135016960.0, - "grad_norm": 1.58737852842035, - "language_loss": 0.80510384, - "learning_rate": 6.669179621222738e-07, - "loss": 0.82586384, - "num_input_tokens_seen": 265626115, - "step": 12312, - "time_per_iteration": 2.820053815841675 - }, - { - "auxiliary_loss_clip": 0.01060693, - "auxiliary_loss_mlp": 0.01035735, - "balance_loss_clip": 1.03197908, - "balance_loss_mlp": 1.02264023, - "epoch": 0.7402976101007065, - "flos": 22856675345280.0, - "grad_norm": 1.990612245665929, - "language_loss": 0.78425479, - "learning_rate": 6.666276560399273e-07, - "loss": 0.80521905, - "num_input_tokens_seen": 265646520, - "step": 12313, - "time_per_iteration": 2.756864547729492 - }, - { - "auxiliary_loss_clip": 0.01059901, - "auxiliary_loss_mlp": 0.01038311, - "balance_loss_clip": 1.03464198, - "balance_loss_mlp": 1.02487016, - "epoch": 0.7403577333533744, - "flos": 12345801834240.0, - "grad_norm": 2.1312329589300947, - "language_loss": 0.78784394, - "learning_rate": 6.663374005191937e-07, - "loss": 0.80882609, - "num_input_tokens_seen": 265661875, - "step": 12314, - "time_per_iteration": 2.7299044132232666 - }, - { - "auxiliary_loss_clip": 0.01020285, - "auxiliary_loss_mlp": 0.01000472, - "balance_loss_clip": 1.00777555, - "balance_loss_mlp": 0.99948281, - "epoch": 0.7404178566060424, - "flos": 60327270869760.0, - "grad_norm": 0.9319847439120421, - "language_loss": 0.55094397, - "learning_rate": 6.660471955710809e-07, - "loss": 0.57115149, - "num_input_tokens_seen": 265721255, - "step": 12315, - "time_per_iteration": 3.201897382736206 - }, - { - "auxiliary_loss_clip": 0.01093771, - "auxiliary_loss_mlp": 0.01036367, - "balance_loss_clip": 1.03759921, - "balance_loss_mlp": 1.02371287, - "epoch": 0.7404779798587103, - "flos": 32014650072960.0, - "grad_norm": 1.5030342819067668, - "language_loss": 0.79353088, - "learning_rate": 6.65757041206591e-07, - "loss": 0.81483227, - "num_input_tokens_seen": 265743970, - "step": 12316, - "time_per_iteration": 2.705349922180176 - }, - { - "auxiliary_loss_clip": 0.01098009, - "auxiliary_loss_mlp": 0.01031964, - "balance_loss_clip": 1.03624582, - "balance_loss_mlp": 1.01957273, - "epoch": 0.7405381031113784, - "flos": 12889704551040.0, - "grad_norm": 1.7371134770990158, - "language_loss": 0.7492671, - "learning_rate": 6.654669374367275e-07, - "loss": 0.77056682, - "num_input_tokens_seen": 265760890, - "step": 12317, - "time_per_iteration": 2.637202024459839 - }, - { - "auxiliary_loss_clip": 0.01078909, - "auxiliary_loss_mlp": 0.01035186, - "balance_loss_clip": 1.03754401, - "balance_loss_mlp": 1.02296102, - "epoch": 0.7405982263640463, - "flos": 20229127557120.0, - "grad_norm": 1.520938817583414, - "language_loss": 0.81343406, - "learning_rate": 6.651768842724917e-07, - "loss": 0.834575, - "num_input_tokens_seen": 265779600, - "step": 12318, - "time_per_iteration": 2.7076103687286377 - }, - { - "auxiliary_loss_clip": 0.01084776, - "auxiliary_loss_mlp": 0.0103153, - "balance_loss_clip": 1.03475654, - "balance_loss_mlp": 1.0187031, - "epoch": 0.7406583496167143, - "flos": 17567213431680.0, - "grad_norm": 1.9057934883865575, - "language_loss": 0.76502925, - "learning_rate": 6.648868817248827e-07, - "loss": 0.7861923, - "num_input_tokens_seen": 265797030, - "step": 12319, - "time_per_iteration": 2.6530611515045166 - }, - { - "auxiliary_loss_clip": 0.01080701, - "auxiliary_loss_mlp": 0.01032886, - "balance_loss_clip": 1.0368222, - "balance_loss_mlp": 1.0211314, - "epoch": 0.7407184728693822, - "flos": 18295733076480.0, - "grad_norm": 2.7907820586254064, - "language_loss": 0.64157581, - "learning_rate": 6.64596929804897e-07, - "loss": 0.66271174, - "num_input_tokens_seen": 265815055, - "step": 12320, - "time_per_iteration": 2.7634599208831787 - }, - { - "auxiliary_loss_clip": 0.0110264, - "auxiliary_loss_mlp": 0.01041469, - "balance_loss_clip": 1.03931427, - "balance_loss_mlp": 1.02880883, - "epoch": 0.7407785961220502, - "flos": 16690562098560.0, - "grad_norm": 2.6669296111663168, - "language_loss": 0.8214829, - "learning_rate": 6.643070285235288e-07, - "loss": 0.842924, - "num_input_tokens_seen": 265828480, - "step": 12321, - "time_per_iteration": 2.603889226913452 - }, - { - "auxiliary_loss_clip": 0.01091833, - "auxiliary_loss_mlp": 0.01048957, - "balance_loss_clip": 1.03682292, - "balance_loss_mlp": 1.03459191, - "epoch": 0.7408387193747181, - "flos": 22088330496000.0, - "grad_norm": 2.755383259535151, - "language_loss": 0.72079754, - "learning_rate": 6.640171778917727e-07, - "loss": 0.74220538, - "num_input_tokens_seen": 265845825, - "step": 12322, - "time_per_iteration": 2.5962164402008057 - }, - { - "auxiliary_loss_clip": 0.01100778, - "auxiliary_loss_mlp": 0.00770917, - "balance_loss_clip": 1.03753436, - "balance_loss_mlp": 1.0002656, - "epoch": 0.7408988426273861, - "flos": 24236721832320.0, - "grad_norm": 1.859375439746312, - "language_loss": 0.64215767, - "learning_rate": 6.637273779206183e-07, - "loss": 0.66087461, - "num_input_tokens_seen": 265866335, - "step": 12323, - "time_per_iteration": 2.650984525680542 - }, - { - "auxiliary_loss_clip": 0.01074935, - "auxiliary_loss_mlp": 0.01032883, - "balance_loss_clip": 1.03454328, - "balance_loss_mlp": 1.01984739, - "epoch": 0.7409589658800542, - "flos": 29023004073600.0, - "grad_norm": 1.364972718978451, - "language_loss": 0.75983679, - "learning_rate": 6.634376286210559e-07, - "loss": 0.78091496, - "num_input_tokens_seen": 265888945, - "step": 12324, - "time_per_iteration": 2.758053779602051 - }, - { - "auxiliary_loss_clip": 0.01079211, - "auxiliary_loss_mlp": 0.01027292, - "balance_loss_clip": 1.03694987, - "balance_loss_mlp": 1.01489401, - "epoch": 0.7410190891327221, - "flos": 19351362902400.0, - "grad_norm": 1.7409894929083622, - "language_loss": 0.74638963, - "learning_rate": 6.63147930004073e-07, - "loss": 0.76745468, - "num_input_tokens_seen": 265908030, - "step": 12325, - "time_per_iteration": 2.6512198448181152 - }, - { - "auxiliary_loss_clip": 0.01070767, - "auxiliary_loss_mlp": 0.01038532, - "balance_loss_clip": 1.03589809, - "balance_loss_mlp": 1.02524054, - "epoch": 0.7410792123853901, - "flos": 22747650589440.0, - "grad_norm": 1.8899213582685095, - "language_loss": 0.68341279, - "learning_rate": 6.628582820806545e-07, - "loss": 0.7045058, - "num_input_tokens_seen": 265927030, - "step": 12326, - "time_per_iteration": 2.760312557220459 - }, - { - "auxiliary_loss_clip": 0.01072406, - "auxiliary_loss_mlp": 0.01028918, - "balance_loss_clip": 1.03731251, - "balance_loss_mlp": 1.01672876, - "epoch": 0.741139335638058, - "flos": 25372433030400.0, - "grad_norm": 1.6031079526338634, - "language_loss": 0.89560592, - "learning_rate": 6.625686848617835e-07, - "loss": 0.91661912, - "num_input_tokens_seen": 265945490, - "step": 12327, - "time_per_iteration": 2.753051519393921 - }, - { - "auxiliary_loss_clip": 0.01110031, - "auxiliary_loss_mlp": 0.01032575, - "balance_loss_clip": 1.03885567, - "balance_loss_mlp": 1.0198555, - "epoch": 0.741199458890726, - "flos": 18585639745920.0, - "grad_norm": 1.7237905370438114, - "language_loss": 0.85383123, - "learning_rate": 6.62279138358442e-07, - "loss": 0.87525725, - "num_input_tokens_seen": 265963265, - "step": 12328, - "time_per_iteration": 2.5977120399475098 - }, - { - "auxiliary_loss_clip": 0.01098285, - "auxiliary_loss_mlp": 0.01032958, - "balance_loss_clip": 1.0383029, - "balance_loss_mlp": 1.01909983, - "epoch": 0.7412595821433939, - "flos": 22127078292480.0, - "grad_norm": 1.669888281499519, - "language_loss": 0.66867191, - "learning_rate": 6.619896425816103e-07, - "loss": 0.68998432, - "num_input_tokens_seen": 265982270, - "step": 12329, - "time_per_iteration": 2.63157057762146 - }, - { - "auxiliary_loss_clip": 0.01078104, - "auxiliary_loss_mlp": 0.01042687, - "balance_loss_clip": 1.03691041, - "balance_loss_mlp": 1.02878761, - "epoch": 0.741319705396062, - "flos": 29169699217920.0, - "grad_norm": 1.6151090072025307, - "language_loss": 0.66697407, - "learning_rate": 6.617001975422647e-07, - "loss": 0.688182, - "num_input_tokens_seen": 266003835, - "step": 12330, - "time_per_iteration": 2.8134610652923584 - }, - { - "auxiliary_loss_clip": 0.01078521, - "auxiliary_loss_mlp": 0.01036152, - "balance_loss_clip": 1.04112339, - "balance_loss_mlp": 1.02134609, - "epoch": 0.7413798286487299, - "flos": 20667489137280.0, - "grad_norm": 2.0428405490816837, - "language_loss": 0.85805637, - "learning_rate": 6.614108032513823e-07, - "loss": 0.87920308, - "num_input_tokens_seen": 266021595, - "step": 12331, - "time_per_iteration": 2.812793493270874 - }, - { - "auxiliary_loss_clip": 0.01048375, - "auxiliary_loss_mlp": 0.01034839, - "balance_loss_clip": 1.0381304, - "balance_loss_mlp": 1.02189922, - "epoch": 0.7414399519013979, - "flos": 16398895662720.0, - "grad_norm": 1.9478476477957887, - "language_loss": 0.6967262, - "learning_rate": 6.611214597199364e-07, - "loss": 0.71755838, - "num_input_tokens_seen": 266039860, - "step": 12332, - "time_per_iteration": 3.0447654724121094 - }, - { - "auxiliary_loss_clip": 0.01112852, - "auxiliary_loss_mlp": 0.01040645, - "balance_loss_clip": 1.03986526, - "balance_loss_mlp": 1.02710271, - "epoch": 0.7415000751540658, - "flos": 25630235919360.0, - "grad_norm": 1.894199070779257, - "language_loss": 0.63652647, - "learning_rate": 6.608321669588984e-07, - "loss": 0.65806139, - "num_input_tokens_seen": 266058050, - "step": 12333, - "time_per_iteration": 2.8000104427337646 - }, - { - "auxiliary_loss_clip": 0.010897, - "auxiliary_loss_mlp": 0.01035147, - "balance_loss_clip": 1.04135418, - "balance_loss_mlp": 1.02300525, - "epoch": 0.7415601984067338, - "flos": 24499732193280.0, - "grad_norm": 1.6946502841165116, - "language_loss": 0.71084702, - "learning_rate": 6.605429249792387e-07, - "loss": 0.73209548, - "num_input_tokens_seen": 266078060, - "step": 12334, - "time_per_iteration": 2.7801129817962646 - }, - { - "auxiliary_loss_clip": 0.01065371, - "auxiliary_loss_mlp": 0.01027933, - "balance_loss_clip": 1.0374248, - "balance_loss_mlp": 1.01558292, - "epoch": 0.7416203216594017, - "flos": 20887154760960.0, - "grad_norm": 1.6662744969405867, - "language_loss": 0.82556254, - "learning_rate": 6.602537337919257e-07, - "loss": 0.84649551, - "num_input_tokens_seen": 266097110, - "step": 12335, - "time_per_iteration": 2.7619669437408447 - }, - { - "auxiliary_loss_clip": 0.01111608, - "auxiliary_loss_mlp": 0.01031427, - "balance_loss_clip": 1.03896701, - "balance_loss_mlp": 1.01763475, - "epoch": 0.7416804449120697, - "flos": 15624265933440.0, - "grad_norm": 2.6708776221620134, - "language_loss": 0.74853325, - "learning_rate": 6.599645934079259e-07, - "loss": 0.76996362, - "num_input_tokens_seen": 266110870, - "step": 12336, - "time_per_iteration": 4.294764518737793 - }, - { - "auxiliary_loss_clip": 0.01068313, - "auxiliary_loss_mlp": 0.01036465, - "balance_loss_clip": 1.03603351, - "balance_loss_mlp": 1.02284563, - "epoch": 0.7417405681647377, - "flos": 17120483982720.0, - "grad_norm": 1.9180906175997412, - "language_loss": 0.73796511, - "learning_rate": 6.596755038382029e-07, - "loss": 0.75901294, - "num_input_tokens_seen": 266127845, - "step": 12337, - "time_per_iteration": 2.8595807552337646 - }, - { - "auxiliary_loss_clip": 0.01083057, - "auxiliary_loss_mlp": 0.01039105, - "balance_loss_clip": 1.03681028, - "balance_loss_mlp": 1.0262543, - "epoch": 0.7418006914174057, - "flos": 18880322924160.0, - "grad_norm": 1.6574383205520367, - "language_loss": 0.76809967, - "learning_rate": 6.593864650937186e-07, - "loss": 0.78932124, - "num_input_tokens_seen": 266145400, - "step": 12338, - "time_per_iteration": 4.203794240951538 - }, - { - "auxiliary_loss_clip": 0.01099752, - "auxiliary_loss_mlp": 0.01031615, - "balance_loss_clip": 1.03882122, - "balance_loss_mlp": 1.02033818, - "epoch": 0.7418608146700737, - "flos": 21580733450880.0, - "grad_norm": 1.7161166457507804, - "language_loss": 0.73070621, - "learning_rate": 6.590974771854345e-07, - "loss": 0.75201988, - "num_input_tokens_seen": 266164430, - "step": 12339, - "time_per_iteration": 4.210087776184082 - }, - { - "auxiliary_loss_clip": 0.01092405, - "auxiliary_loss_mlp": 0.01031981, - "balance_loss_clip": 1.0387336, - "balance_loss_mlp": 1.01890945, - "epoch": 0.7419209379227416, - "flos": 22340459036160.0, - "grad_norm": 2.0219989421818276, - "language_loss": 0.79605651, - "learning_rate": 6.588085401243077e-07, - "loss": 0.81730038, - "num_input_tokens_seen": 266183855, - "step": 12340, - "time_per_iteration": 2.670774221420288 - }, - { - "auxiliary_loss_clip": 0.01069023, - "auxiliary_loss_mlp": 0.01036356, - "balance_loss_clip": 1.03491449, - "balance_loss_mlp": 1.02310038, - "epoch": 0.7419810611754096, - "flos": 16762275601920.0, - "grad_norm": 2.432257860237773, - "language_loss": 0.75854677, - "learning_rate": 6.585196539212958e-07, - "loss": 0.77960056, - "num_input_tokens_seen": 266202085, - "step": 12341, - "time_per_iteration": 2.686434268951416 - }, - { - "auxiliary_loss_clip": 0.0107769, - "auxiliary_loss_mlp": 0.01041186, - "balance_loss_clip": 1.03510964, - "balance_loss_mlp": 1.02783489, - "epoch": 0.7420411844280775, - "flos": 26212958259840.0, - "grad_norm": 1.4473494294427032, - "language_loss": 0.8024286, - "learning_rate": 6.582308185873535e-07, - "loss": 0.8236174, - "num_input_tokens_seen": 266223445, - "step": 12342, - "time_per_iteration": 4.343433380126953 - }, - { - "auxiliary_loss_clip": 0.01075896, - "auxiliary_loss_mlp": 0.01027447, - "balance_loss_clip": 1.03609908, - "balance_loss_mlp": 1.01511443, - "epoch": 0.7421013076807456, - "flos": 68529371840640.0, - "grad_norm": 1.749760257309467, - "language_loss": 0.77626014, - "learning_rate": 6.57942034133433e-07, - "loss": 0.79729354, - "num_input_tokens_seen": 266246575, - "step": 12343, - "time_per_iteration": 3.107714891433716 - }, - { - "auxiliary_loss_clip": 0.01082874, - "auxiliary_loss_mlp": 0.01034526, - "balance_loss_clip": 1.03323293, - "balance_loss_mlp": 1.02221727, - "epoch": 0.7421614309334135, - "flos": 24425325169920.0, - "grad_norm": 1.6706510034937676, - "language_loss": 0.67636979, - "learning_rate": 6.576533005704843e-07, - "loss": 0.69754374, - "num_input_tokens_seen": 266266055, - "step": 12344, - "time_per_iteration": 2.7599802017211914 - }, - { - "auxiliary_loss_clip": 0.01065258, - "auxiliary_loss_mlp": 0.01037206, - "balance_loss_clip": 1.03660572, - "balance_loss_mlp": 1.02291846, - "epoch": 0.7422215541860815, - "flos": 12311076360960.0, - "grad_norm": 2.3156123925604692, - "language_loss": 0.81109858, - "learning_rate": 6.573646179094572e-07, - "loss": 0.83212328, - "num_input_tokens_seen": 266282240, - "step": 12345, - "time_per_iteration": 2.7414791584014893 - }, - { - "auxiliary_loss_clip": 0.01072147, - "auxiliary_loss_mlp": 0.0103856, - "balance_loss_clip": 1.03549957, - "balance_loss_mlp": 1.02523887, - "epoch": 0.7422816774387494, - "flos": 19645579203840.0, - "grad_norm": 1.9382183588535902, - "language_loss": 0.70441389, - "learning_rate": 6.570759861612988e-07, - "loss": 0.72552097, - "num_input_tokens_seen": 266300980, - "step": 12346, - "time_per_iteration": 2.728034734725952 - }, - { - "auxiliary_loss_clip": 0.01102385, - "auxiliary_loss_mlp": 0.0103363, - "balance_loss_clip": 1.03974307, - "balance_loss_mlp": 1.02126789, - "epoch": 0.7423418006914174, - "flos": 32015978876160.0, - "grad_norm": 2.081189833506492, - "language_loss": 0.73518687, - "learning_rate": 6.56787405336953e-07, - "loss": 0.75654697, - "num_input_tokens_seen": 266322215, - "step": 12347, - "time_per_iteration": 2.691364049911499 - }, - { - "auxiliary_loss_clip": 0.01090637, - "auxiliary_loss_mlp": 0.01034498, - "balance_loss_clip": 1.03648269, - "balance_loss_mlp": 1.02162337, - "epoch": 0.7424019239440853, - "flos": 18916951818240.0, - "grad_norm": 1.681708315108595, - "language_loss": 0.80881745, - "learning_rate": 6.564988754473642e-07, - "loss": 0.83006883, - "num_input_tokens_seen": 266341600, - "step": 12348, - "time_per_iteration": 2.719554901123047 - }, - { - "auxiliary_loss_clip": 0.01110126, - "auxiliary_loss_mlp": 0.01033153, - "balance_loss_clip": 1.03918421, - "balance_loss_mlp": 1.02082634, - "epoch": 0.7424620471967533, - "flos": 35876518871040.0, - "grad_norm": 1.8616740684019923, - "language_loss": 0.73023462, - "learning_rate": 6.562103965034724e-07, - "loss": 0.7516675, - "num_input_tokens_seen": 266362895, - "step": 12349, - "time_per_iteration": 2.762857437133789 - }, - { - "auxiliary_loss_clip": 0.01091582, - "auxiliary_loss_mlp": 0.01035038, - "balance_loss_clip": 1.03577137, - "balance_loss_mlp": 1.02081633, - "epoch": 0.7425221704494213, - "flos": 27016603200000.0, - "grad_norm": 2.2070987228261427, - "language_loss": 0.78727913, - "learning_rate": 6.559219685162165e-07, - "loss": 0.80854535, - "num_input_tokens_seen": 266384015, - "step": 12350, - "time_per_iteration": 2.67797589302063 - }, - { - "auxiliary_loss_clip": 0.01067839, - "auxiliary_loss_mlp": 0.01035914, - "balance_loss_clip": 1.03754306, - "balance_loss_mlp": 1.0233134, - "epoch": 0.7425822937020893, - "flos": 34167135559680.0, - "grad_norm": 1.5216618153297856, - "language_loss": 0.74963629, - "learning_rate": 6.556335914965343e-07, - "loss": 0.77067381, - "num_input_tokens_seen": 266405990, - "step": 12351, - "time_per_iteration": 2.8214800357818604 - }, - { - "auxiliary_loss_clip": 0.01055755, - "auxiliary_loss_mlp": 0.01030345, - "balance_loss_clip": 1.0381254, - "balance_loss_mlp": 1.01733303, - "epoch": 0.7426424169547573, - "flos": 21283572234240.0, - "grad_norm": 2.67642082180286, - "language_loss": 0.81345606, - "learning_rate": 6.553452654553611e-07, - "loss": 0.83431703, - "num_input_tokens_seen": 266424260, - "step": 12352, - "time_per_iteration": 2.8043935298919678 - }, - { - "auxiliary_loss_clip": 0.01103554, - "auxiliary_loss_mlp": 0.01039938, - "balance_loss_clip": 1.0413506, - "balance_loss_mlp": 1.02751637, - "epoch": 0.7427025402074252, - "flos": 22448442297600.0, - "grad_norm": 1.8427124307905225, - "language_loss": 0.72003049, - "learning_rate": 6.550569904036307e-07, - "loss": 0.74146539, - "num_input_tokens_seen": 266444580, - "step": 12353, - "time_per_iteration": 2.726813793182373 - }, - { - "auxiliary_loss_clip": 0.0110208, - "auxiliary_loss_mlp": 0.01030474, - "balance_loss_clip": 1.04067636, - "balance_loss_mlp": 1.01913714, - "epoch": 0.7427626634600932, - "flos": 22524609087360.0, - "grad_norm": 2.0628021124051275, - "language_loss": 0.72218555, - "learning_rate": 6.547687663522739e-07, - "loss": 0.74351114, - "num_input_tokens_seen": 266465640, - "step": 12354, - "time_per_iteration": 2.6648378372192383 - }, - { - "auxiliary_loss_clip": 0.01020848, - "auxiliary_loss_mlp": 0.01006019, - "balance_loss_clip": 1.00787544, - "balance_loss_mlp": 1.00489271, - "epoch": 0.7428227867127611, - "flos": 67209477655680.0, - "grad_norm": 0.694826107122343, - "language_loss": 0.59537125, - "learning_rate": 6.544805933122199e-07, - "loss": 0.61563993, - "num_input_tokens_seen": 266531950, - "step": 12355, - "time_per_iteration": 3.3000428676605225 - }, - { - "auxiliary_loss_clip": 0.01111904, - "auxiliary_loss_mlp": 0.01030428, - "balance_loss_clip": 1.03898406, - "balance_loss_mlp": 1.01765478, - "epoch": 0.7428829099654292, - "flos": 14721221082240.0, - "grad_norm": 1.7387842003260185, - "language_loss": 0.677315, - "learning_rate": 6.541924712943971e-07, - "loss": 0.69873834, - "num_input_tokens_seen": 266550665, - "step": 12356, - "time_per_iteration": 2.577047824859619 - }, - { - "auxiliary_loss_clip": 0.01100444, - "auxiliary_loss_mlp": 0.00771382, - "balance_loss_clip": 1.03524387, - "balance_loss_mlp": 1.00019741, - "epoch": 0.7429430332180971, - "flos": 48646496413440.0, - "grad_norm": 1.7685989280794623, - "language_loss": 0.72208947, - "learning_rate": 6.539044003097301e-07, - "loss": 0.74080771, - "num_input_tokens_seen": 266572455, - "step": 12357, - "time_per_iteration": 2.9096696376800537 - }, - { - "auxiliary_loss_clip": 0.01088209, - "auxiliary_loss_mlp": 0.01029654, - "balance_loss_clip": 1.03906703, - "balance_loss_mlp": 1.01782274, - "epoch": 0.7430031564707651, - "flos": 16764071281920.0, - "grad_norm": 1.8287713858548653, - "language_loss": 0.65631384, - "learning_rate": 6.53616380369143e-07, - "loss": 0.6774925, - "num_input_tokens_seen": 266590895, - "step": 12358, - "time_per_iteration": 2.668260097503662 - }, - { - "auxiliary_loss_clip": 0.01073582, - "auxiliary_loss_mlp": 0.0103526, - "balance_loss_clip": 1.0399549, - "balance_loss_mlp": 1.02100861, - "epoch": 0.743063279723433, - "flos": 23870576545920.0, - "grad_norm": 1.7940637938845212, - "language_loss": 0.81230819, - "learning_rate": 6.533284114835591e-07, - "loss": 0.83339661, - "num_input_tokens_seen": 266607660, - "step": 12359, - "time_per_iteration": 2.750425338745117 - }, - { - "auxiliary_loss_clip": 0.01100028, - "auxiliary_loss_mlp": 0.01032725, - "balance_loss_clip": 1.03793418, - "balance_loss_mlp": 1.01983833, - "epoch": 0.743123402976101, - "flos": 14391704689920.0, - "grad_norm": 2.122041383037816, - "language_loss": 0.67954987, - "learning_rate": 6.530404936638956e-07, - "loss": 0.70087737, - "num_input_tokens_seen": 266624260, - "step": 12360, - "time_per_iteration": 2.638991355895996 - }, - { - "auxiliary_loss_clip": 0.01099874, - "auxiliary_loss_mlp": 0.00770722, - "balance_loss_clip": 1.03788424, - "balance_loss_mlp": 1.00024271, - "epoch": 0.7431835262287689, - "flos": 27454318335360.0, - "grad_norm": 1.6135955801091852, - "language_loss": 0.72960168, - "learning_rate": 6.527526269210715e-07, - "loss": 0.74830765, - "num_input_tokens_seen": 266644210, - "step": 12361, - "time_per_iteration": 2.6851212978363037 - }, - { - "auxiliary_loss_clip": 0.01061783, - "auxiliary_loss_mlp": 0.01043643, - "balance_loss_clip": 1.03427052, - "balance_loss_mlp": 1.02964807, - "epoch": 0.743243649481437, - "flos": 20959514709120.0, - "grad_norm": 1.8538295437323902, - "language_loss": 0.55904317, - "learning_rate": 6.524648112660027e-07, - "loss": 0.58009744, - "num_input_tokens_seen": 266664230, - "step": 12362, - "time_per_iteration": 2.6957335472106934 - }, - { - "auxiliary_loss_clip": 0.01075259, - "auxiliary_loss_mlp": 0.01030795, - "balance_loss_clip": 1.03825688, - "balance_loss_mlp": 1.01771164, - "epoch": 0.7433037727341049, - "flos": 22783166161920.0, - "grad_norm": 1.5750012237947109, - "language_loss": 0.77069867, - "learning_rate": 6.521770467096039e-07, - "loss": 0.79175913, - "num_input_tokens_seen": 266683270, - "step": 12363, - "time_per_iteration": 2.7211437225341797 - }, - { - "auxiliary_loss_clip": 0.01082709, - "auxiliary_loss_mlp": 0.01036524, - "balance_loss_clip": 1.03588808, - "balance_loss_mlp": 1.02383995, - "epoch": 0.7433638959867729, - "flos": 22196708807040.0, - "grad_norm": 1.6083671142844838, - "language_loss": 0.78007239, - "learning_rate": 6.518893332627862e-07, - "loss": 0.8012647, - "num_input_tokens_seen": 266701235, - "step": 12364, - "time_per_iteration": 2.6894009113311768 - }, - { - "auxiliary_loss_clip": 0.01098885, - "auxiliary_loss_mlp": 0.01037373, - "balance_loss_clip": 1.03761303, - "balance_loss_mlp": 1.025172, - "epoch": 0.7434240192394409, - "flos": 23296760778240.0, - "grad_norm": 1.5760163793718025, - "language_loss": 0.78754139, - "learning_rate": 6.516016709364604e-07, - "loss": 0.80890405, - "num_input_tokens_seen": 266721495, - "step": 12365, - "time_per_iteration": 2.625281572341919 - }, - { - "auxiliary_loss_clip": 0.01087609, - "auxiliary_loss_mlp": 0.01033626, - "balance_loss_clip": 1.03624249, - "balance_loss_mlp": 1.02065635, - "epoch": 0.7434841424921088, - "flos": 54009575251200.0, - "grad_norm": 1.5814760031444242, - "language_loss": 0.76864719, - "learning_rate": 6.513140597415346e-07, - "loss": 0.78985953, - "num_input_tokens_seen": 266747400, - "step": 12366, - "time_per_iteration": 2.9688045978546143 - }, - { - "auxiliary_loss_clip": 0.01099866, - "auxiliary_loss_mlp": 0.01028311, - "balance_loss_clip": 1.04013896, - "balance_loss_mlp": 1.01761758, - "epoch": 0.7435442657447768, - "flos": 21433966479360.0, - "grad_norm": 1.3642058865548359, - "language_loss": 0.71373397, - "learning_rate": 6.510264996889141e-07, - "loss": 0.73501575, - "num_input_tokens_seen": 266767630, - "step": 12367, - "time_per_iteration": 2.661372184753418 - }, - { - "auxiliary_loss_clip": 0.01084148, - "auxiliary_loss_mlp": 0.01036563, - "balance_loss_clip": 1.0383482, - "balance_loss_mlp": 1.02371848, - "epoch": 0.7436043889974447, - "flos": 24499408970880.0, - "grad_norm": 1.5961683932504214, - "language_loss": 0.74215865, - "learning_rate": 6.507389907895038e-07, - "loss": 0.76336575, - "num_input_tokens_seen": 266788015, - "step": 12368, - "time_per_iteration": 2.712043285369873 - }, - { - "auxiliary_loss_clip": 0.01097444, - "auxiliary_loss_mlp": 0.01031837, - "balance_loss_clip": 1.03949308, - "balance_loss_mlp": 1.02042866, - "epoch": 0.7436645122501128, - "flos": 40698388512000.0, - "grad_norm": 2.9422959785728757, - "language_loss": 0.69383776, - "learning_rate": 6.50451533054207e-07, - "loss": 0.71513051, - "num_input_tokens_seen": 266809010, - "step": 12369, - "time_per_iteration": 2.7961301803588867 - }, - { - "auxiliary_loss_clip": 0.01088683, - "auxiliary_loss_mlp": 0.00770011, - "balance_loss_clip": 1.03793979, - "balance_loss_mlp": 1.00026274, - "epoch": 0.7437246355027807, - "flos": 18908835344640.0, - "grad_norm": 1.8064840643083067, - "language_loss": 0.75919938, - "learning_rate": 6.501641264939233e-07, - "loss": 0.77778637, - "num_input_tokens_seen": 266825390, - "step": 12370, - "time_per_iteration": 2.7155323028564453 - }, - { - "auxiliary_loss_clip": 0.01111903, - "auxiliary_loss_mlp": 0.01036048, - "balance_loss_clip": 1.04072666, - "balance_loss_mlp": 1.02287519, - "epoch": 0.7437847587554487, - "flos": 21543817248000.0, - "grad_norm": 2.0269448614883863, - "language_loss": 0.78193456, - "learning_rate": 6.498767711195503e-07, - "loss": 0.80341411, - "num_input_tokens_seen": 266844675, - "step": 12371, - "time_per_iteration": 2.6484358310699463 - }, - { - "auxiliary_loss_clip": 0.01091423, - "auxiliary_loss_mlp": 0.01029857, - "balance_loss_clip": 1.03848553, - "balance_loss_mlp": 1.01723862, - "epoch": 0.7438448820081166, - "flos": 27782470010880.0, - "grad_norm": 1.6126638712287897, - "language_loss": 0.69267446, - "learning_rate": 6.495894669419857e-07, - "loss": 0.71388721, - "num_input_tokens_seen": 266865160, - "step": 12372, - "time_per_iteration": 2.7042236328125 - }, - { - "auxiliary_loss_clip": 0.01079002, - "auxiliary_loss_mlp": 0.01037244, - "balance_loss_clip": 1.03700709, - "balance_loss_mlp": 1.02461922, - "epoch": 0.7439050052607846, - "flos": 17967832796160.0, - "grad_norm": 1.9384549362985082, - "language_loss": 0.75196183, - "learning_rate": 6.493022139721245e-07, - "loss": 0.77312428, - "num_input_tokens_seen": 266883285, - "step": 12373, - "time_per_iteration": 2.6364054679870605 - }, - { - "auxiliary_loss_clip": 0.01057413, - "auxiliary_loss_mlp": 0.01039492, - "balance_loss_clip": 1.03332591, - "balance_loss_mlp": 1.02528191, - "epoch": 0.7439651285134525, - "flos": 22958696949120.0, - "grad_norm": 1.7332911073866848, - "language_loss": 0.7709462, - "learning_rate": 6.49015012220858e-07, - "loss": 0.7919153, - "num_input_tokens_seen": 266900960, - "step": 12374, - "time_per_iteration": 2.7238872051239014 - }, - { - "auxiliary_loss_clip": 0.01048312, - "auxiliary_loss_mlp": 0.01037947, - "balance_loss_clip": 1.03520083, - "balance_loss_mlp": 1.02472675, - "epoch": 0.7440252517661206, - "flos": 18806777827200.0, - "grad_norm": 2.3876563861488496, - "language_loss": 0.76403177, - "learning_rate": 6.487278616990774e-07, - "loss": 0.78489435, - "num_input_tokens_seen": 266917710, - "step": 12375, - "time_per_iteration": 2.8014628887176514 - }, - { - "auxiliary_loss_clip": 0.01098112, - "auxiliary_loss_mlp": 0.01032005, - "balance_loss_clip": 1.03817892, - "balance_loss_mlp": 1.02062082, - "epoch": 0.7440853750187885, - "flos": 20266295155200.0, - "grad_norm": 1.9311839942562836, - "language_loss": 0.77011836, - "learning_rate": 6.484407624176733e-07, - "loss": 0.79141957, - "num_input_tokens_seen": 266934220, - "step": 12376, - "time_per_iteration": 4.12352442741394 - }, - { - "auxiliary_loss_clip": 0.01071601, - "auxiliary_loss_mlp": 0.01038876, - "balance_loss_clip": 1.03379536, - "balance_loss_mlp": 1.02320004, - "epoch": 0.7441454982714565, - "flos": 25337276593920.0, - "grad_norm": 1.692291173938847, - "language_loss": 0.79398865, - "learning_rate": 6.481537143875296e-07, - "loss": 0.8150934, - "num_input_tokens_seen": 266955210, - "step": 12377, - "time_per_iteration": 4.235915184020996 - }, - { - "auxiliary_loss_clip": 0.010991, - "auxiliary_loss_mlp": 0.01030466, - "balance_loss_clip": 1.03905261, - "balance_loss_mlp": 1.01754928, - "epoch": 0.7442056215241245, - "flos": 64480910866560.0, - "grad_norm": 1.9747138110607607, - "language_loss": 0.67284125, - "learning_rate": 6.478667176195322e-07, - "loss": 0.69413698, - "num_input_tokens_seen": 266976555, - "step": 12378, - "time_per_iteration": 4.622121572494507 - }, - { - "auxiliary_loss_clip": 0.010776, - "auxiliary_loss_mlp": 0.01037137, - "balance_loss_clip": 1.03861165, - "balance_loss_mlp": 1.02326727, - "epoch": 0.7442657447767924, - "flos": 31285376242560.0, - "grad_norm": 1.7913513654463005, - "language_loss": 0.71687776, - "learning_rate": 6.475797721245648e-07, - "loss": 0.73802519, - "num_input_tokens_seen": 266997640, - "step": 12379, - "time_per_iteration": 2.7747161388397217 - }, - { - "auxiliary_loss_clip": 0.01072089, - "auxiliary_loss_mlp": 0.00772364, - "balance_loss_clip": 1.0351454, - "balance_loss_mlp": 1.00025105, - "epoch": 0.7443258680294604, - "flos": 20807899401600.0, - "grad_norm": 2.0210704518096523, - "language_loss": 0.65216178, - "learning_rate": 6.472928779135085e-07, - "loss": 0.67060632, - "num_input_tokens_seen": 267016165, - "step": 12380, - "time_per_iteration": 2.7074787616729736 - }, - { - "auxiliary_loss_clip": 0.01101589, - "auxiliary_loss_mlp": 0.01034892, - "balance_loss_clip": 1.03957582, - "balance_loss_mlp": 1.0219394, - "epoch": 0.7443859912821283, - "flos": 22199833290240.0, - "grad_norm": 2.7838482793597388, - "language_loss": 0.78674221, - "learning_rate": 6.470060349972411e-07, - "loss": 0.80810702, - "num_input_tokens_seen": 267034075, - "step": 12381, - "time_per_iteration": 2.6567366123199463 - }, - { - "auxiliary_loss_clip": 0.01072016, - "auxiliary_loss_mlp": 0.01045243, - "balance_loss_clip": 1.03785646, - "balance_loss_mlp": 1.02981174, - "epoch": 0.7444461145347964, - "flos": 22017838055040.0, - "grad_norm": 2.878241445403415, - "language_loss": 0.72793961, - "learning_rate": 6.467192433866411e-07, - "loss": 0.74911219, - "num_input_tokens_seen": 267053645, - "step": 12382, - "time_per_iteration": 4.307409763336182 - }, - { - "auxiliary_loss_clip": 0.01005043, - "auxiliary_loss_mlp": 0.01004958, - "balance_loss_clip": 1.01257348, - "balance_loss_mlp": 1.00380802, - "epoch": 0.7445062377874643, - "flos": 70559047704960.0, - "grad_norm": 0.6531954820349142, - "language_loss": 0.54669428, - "learning_rate": 6.464325030925831e-07, - "loss": 0.56679428, - "num_input_tokens_seen": 267121830, - "step": 12383, - "time_per_iteration": 3.4219913482666016 - }, - { - "auxiliary_loss_clip": 0.01085875, - "auxiliary_loss_mlp": 0.01027749, - "balance_loss_clip": 1.03667879, - "balance_loss_mlp": 1.01498723, - "epoch": 0.7445663610401323, - "flos": 22164425458560.0, - "grad_norm": 5.0246719589759365, - "language_loss": 0.76023626, - "learning_rate": 6.461458141259395e-07, - "loss": 0.78137243, - "num_input_tokens_seen": 267141145, - "step": 12384, - "time_per_iteration": 2.6512553691864014 - }, - { - "auxiliary_loss_clip": 0.01098981, - "auxiliary_loss_mlp": 0.01029554, - "balance_loss_clip": 1.03816116, - "balance_loss_mlp": 1.01680422, - "epoch": 0.7446264842928002, - "flos": 24170251714560.0, - "grad_norm": 1.9156504433833408, - "language_loss": 0.78836381, - "learning_rate": 6.458591764975823e-07, - "loss": 0.80964911, - "num_input_tokens_seen": 267159280, - "step": 12385, - "time_per_iteration": 2.6723034381866455 - }, - { - "auxiliary_loss_clip": 0.01078718, - "auxiliary_loss_mlp": 0.01032784, - "balance_loss_clip": 1.03725076, - "balance_loss_mlp": 1.01855612, - "epoch": 0.7446866075454682, - "flos": 24134556574080.0, - "grad_norm": 1.6864726540587271, - "language_loss": 0.81386524, - "learning_rate": 6.455725902183813e-07, - "loss": 0.83498025, - "num_input_tokens_seen": 267179390, - "step": 12386, - "time_per_iteration": 2.724527359008789 - }, - { - "auxiliary_loss_clip": 0.01097105, - "auxiliary_loss_mlp": 0.01034795, - "balance_loss_clip": 1.03846228, - "balance_loss_mlp": 1.02235591, - "epoch": 0.7447467307981361, - "flos": 23548063305600.0, - "grad_norm": 1.6785981407963104, - "language_loss": 0.71043932, - "learning_rate": 6.452860552992037e-07, - "loss": 0.73175836, - "num_input_tokens_seen": 267198165, - "step": 12387, - "time_per_iteration": 2.7917346954345703 - }, - { - "auxiliary_loss_clip": 0.0107995, - "auxiliary_loss_mlp": 0.01031199, - "balance_loss_clip": 1.03891492, - "balance_loss_mlp": 1.01899815, - "epoch": 0.7448068540508042, - "flos": 19567832215680.0, - "grad_norm": 2.0106336394947597, - "language_loss": 0.70168763, - "learning_rate": 6.449995717509138e-07, - "loss": 0.72279912, - "num_input_tokens_seen": 267214520, - "step": 12388, - "time_per_iteration": 2.831563949584961 - }, - { - "auxiliary_loss_clip": 0.01099712, - "auxiliary_loss_mlp": 0.010311, - "balance_loss_clip": 1.03740311, - "balance_loss_mlp": 1.01846361, - "epoch": 0.7448669773034721, - "flos": 21839721488640.0, - "grad_norm": 2.075908043210206, - "language_loss": 0.84796858, - "learning_rate": 6.447131395843761e-07, - "loss": 0.86927676, - "num_input_tokens_seen": 267236555, - "step": 12389, - "time_per_iteration": 2.6563961505889893 - }, - { - "auxiliary_loss_clip": 0.01069109, - "auxiliary_loss_mlp": 0.01035071, - "balance_loss_clip": 1.03659904, - "balance_loss_mlp": 1.02245224, - "epoch": 0.7449271005561401, - "flos": 25155389099520.0, - "grad_norm": 2.0263392027511298, - "language_loss": 0.79228258, - "learning_rate": 6.444267588104526e-07, - "loss": 0.81332433, - "num_input_tokens_seen": 267254800, - "step": 12390, - "time_per_iteration": 2.756574869155884 - }, - { - "auxiliary_loss_clip": 0.01089478, - "auxiliary_loss_mlp": 0.0103054, - "balance_loss_clip": 1.03688502, - "balance_loss_mlp": 1.01727843, - "epoch": 0.7449872238088081, - "flos": 22273342473600.0, - "grad_norm": 1.8599579606909906, - "language_loss": 0.851529, - "learning_rate": 6.441404294400014e-07, - "loss": 0.87272918, - "num_input_tokens_seen": 267274610, - "step": 12391, - "time_per_iteration": 2.6953816413879395 - }, - { - "auxiliary_loss_clip": 0.01111566, - "auxiliary_loss_mlp": 0.01028434, - "balance_loss_clip": 1.03942573, - "balance_loss_mlp": 1.0161674, - "epoch": 0.745047347061476, - "flos": 20594805966720.0, - "grad_norm": 1.7091676728035188, - "language_loss": 0.73478818, - "learning_rate": 6.438541514838811e-07, - "loss": 0.75618815, - "num_input_tokens_seen": 267292600, - "step": 12392, - "time_per_iteration": 2.566464424133301 - }, - { - "auxiliary_loss_clip": 0.010973, - "auxiliary_loss_mlp": 0.01035758, - "balance_loss_clip": 1.03854799, - "balance_loss_mlp": 1.02366483, - "epoch": 0.745107470314144, - "flos": 22127545169280.0, - "grad_norm": 3.0948074405421617, - "language_loss": 0.76522237, - "learning_rate": 6.435679249529487e-07, - "loss": 0.78655297, - "num_input_tokens_seen": 267311295, - "step": 12393, - "time_per_iteration": 2.614400625228882 - }, - { - "auxiliary_loss_clip": 0.01100705, - "auxiliary_loss_mlp": 0.0103966, - "balance_loss_clip": 1.03918004, - "balance_loss_mlp": 1.02523553, - "epoch": 0.745167593566812, - "flos": 22236498097920.0, - "grad_norm": 1.8734262060070255, - "language_loss": 0.72774941, - "learning_rate": 6.432817498580552e-07, - "loss": 0.74915308, - "num_input_tokens_seen": 267328390, - "step": 12394, - "time_per_iteration": 2.6467761993408203 - }, - { - "auxiliary_loss_clip": 0.01058489, - "auxiliary_loss_mlp": 0.00770508, - "balance_loss_clip": 1.04145324, - "balance_loss_mlp": 1.0001545, - "epoch": 0.74522771681948, - "flos": 20666232161280.0, - "grad_norm": 1.9226493220785308, - "language_loss": 0.81523216, - "learning_rate": 6.429956262100535e-07, - "loss": 0.83352214, - "num_input_tokens_seen": 267348185, - "step": 12395, - "time_per_iteration": 2.772284984588623 - }, - { - "auxiliary_loss_clip": 0.0110524, - "auxiliary_loss_mlp": 0.01037708, - "balance_loss_clip": 1.03964758, - "balance_loss_mlp": 1.0240705, - "epoch": 0.7452878400721479, - "flos": 21106999952640.0, - "grad_norm": 1.9270177813162948, - "language_loss": 0.7149328, - "learning_rate": 6.427095540197937e-07, - "loss": 0.73636222, - "num_input_tokens_seen": 267367010, - "step": 12396, - "time_per_iteration": 2.6198830604553223 - }, - { - "auxiliary_loss_clip": 0.0107235, - "auxiliary_loss_mlp": 0.01033046, - "balance_loss_clip": 1.03889275, - "balance_loss_mlp": 1.02018356, - "epoch": 0.7453479633248159, - "flos": 26688056474880.0, - "grad_norm": 1.7432262055203618, - "language_loss": 0.68239546, - "learning_rate": 6.424235332981245e-07, - "loss": 0.70344937, - "num_input_tokens_seen": 267386605, - "step": 12397, - "time_per_iteration": 2.8147408962249756 - }, - { - "auxiliary_loss_clip": 0.01111263, - "auxiliary_loss_mlp": 0.01038637, - "balance_loss_clip": 1.03894281, - "balance_loss_mlp": 1.02567935, - "epoch": 0.7454080865774838, - "flos": 17016056167680.0, - "grad_norm": 1.7819734884556382, - "language_loss": 0.77117336, - "learning_rate": 6.421375640558908e-07, - "loss": 0.79267234, - "num_input_tokens_seen": 267404135, - "step": 12398, - "time_per_iteration": 2.561169385910034 - }, - { - "auxiliary_loss_clip": 0.01100902, - "auxiliary_loss_mlp": 0.01029647, - "balance_loss_clip": 1.04031086, - "balance_loss_mlp": 1.01657581, - "epoch": 0.7454682098301518, - "flos": 21323900229120.0, - "grad_norm": 1.713165335415303, - "language_loss": 0.779158, - "learning_rate": 6.418516463039363e-07, - "loss": 0.80046344, - "num_input_tokens_seen": 267423120, - "step": 12399, - "time_per_iteration": 2.6413347721099854 - }, - { - "auxiliary_loss_clip": 0.010824, - "auxiliary_loss_mlp": 0.01035168, - "balance_loss_clip": 1.03334904, - "balance_loss_mlp": 1.02338409, - "epoch": 0.7455283330828197, - "flos": 17858341163520.0, - "grad_norm": 2.1285775991405482, - "language_loss": 0.73999, - "learning_rate": 6.415657800531038e-07, - "loss": 0.76116568, - "num_input_tokens_seen": 267441250, - "step": 12400, - "time_per_iteration": 2.696606159210205 - }, - { - "auxiliary_loss_clip": 0.01096917, - "auxiliary_loss_mlp": 0.01030276, - "balance_loss_clip": 1.03786886, - "balance_loss_mlp": 1.01809835, - "epoch": 0.7455884563354878, - "flos": 30774259664640.0, - "grad_norm": 2.4044760151763174, - "language_loss": 0.82103872, - "learning_rate": 6.412799653142327e-07, - "loss": 0.84231067, - "num_input_tokens_seen": 267462820, - "step": 12401, - "time_per_iteration": 2.700671434402466 - }, - { - "auxiliary_loss_clip": 0.01078431, - "auxiliary_loss_mlp": 0.01035329, - "balance_loss_clip": 1.03934383, - "balance_loss_mlp": 1.02312756, - "epoch": 0.7456485795881557, - "flos": 23185545292800.0, - "grad_norm": 2.1019998768613326, - "language_loss": 0.64676833, - "learning_rate": 6.409942020981611e-07, - "loss": 0.66790593, - "num_input_tokens_seen": 267483065, - "step": 12402, - "time_per_iteration": 2.775984287261963 - }, - { - "auxiliary_loss_clip": 0.01077021, - "auxiliary_loss_mlp": 0.01033791, - "balance_loss_clip": 1.03509498, - "balance_loss_mlp": 1.02227569, - "epoch": 0.7457087028408237, - "flos": 38727144074880.0, - "grad_norm": 1.560080300868097, - "language_loss": 0.73373783, - "learning_rate": 6.407084904157265e-07, - "loss": 0.75484598, - "num_input_tokens_seen": 267504825, - "step": 12403, - "time_per_iteration": 2.8398375511169434 - }, - { - "auxiliary_loss_clip": 0.01002548, - "auxiliary_loss_mlp": 0.01008627, - "balance_loss_clip": 1.01085329, - "balance_loss_mlp": 1.00753641, - "epoch": 0.7457688260934917, - "flos": 56043737337600.0, - "grad_norm": 0.830633313503113, - "language_loss": 0.58735222, - "learning_rate": 6.404228302777621e-07, - "loss": 0.60746402, - "num_input_tokens_seen": 267559260, - "step": 12404, - "time_per_iteration": 3.018889904022217 - }, - { - "auxiliary_loss_clip": 0.01110759, - "auxiliary_loss_mlp": 0.01032429, - "balance_loss_clip": 1.03871632, - "balance_loss_mlp": 1.020383, - "epoch": 0.7458289493461596, - "flos": 20116152305280.0, - "grad_norm": 1.8575983002348149, - "language_loss": 0.77702922, - "learning_rate": 6.401372216950995e-07, - "loss": 0.79846108, - "num_input_tokens_seen": 267578720, - "step": 12405, - "time_per_iteration": 2.607694625854492 - }, - { - "auxiliary_loss_clip": 0.01083469, - "auxiliary_loss_mlp": 0.01036873, - "balance_loss_clip": 1.03548229, - "balance_loss_mlp": 1.02420723, - "epoch": 0.7458890725988276, - "flos": 20193073280640.0, - "grad_norm": 1.6476155913625474, - "language_loss": 0.69351685, - "learning_rate": 6.398516646785698e-07, - "loss": 0.71472031, - "num_input_tokens_seen": 267598250, - "step": 12406, - "time_per_iteration": 2.651949882507324 - }, - { - "auxiliary_loss_clip": 0.01047021, - "auxiliary_loss_mlp": 0.01036186, - "balance_loss_clip": 1.03744388, - "balance_loss_mlp": 1.02236354, - "epoch": 0.7459491958514956, - "flos": 17018749687680.0, - "grad_norm": 2.2152803431091685, - "language_loss": 0.65254861, - "learning_rate": 6.39566159239002e-07, - "loss": 0.67338073, - "num_input_tokens_seen": 267615430, - "step": 12407, - "time_per_iteration": 2.761862277984619 - }, - { - "auxiliary_loss_clip": 0.01070552, - "auxiliary_loss_mlp": 0.01034751, - "balance_loss_clip": 1.03763545, - "balance_loss_mlp": 1.02068424, - "epoch": 0.7460093191041636, - "flos": 25078719519360.0, - "grad_norm": 2.453425787686552, - "language_loss": 0.72200561, - "learning_rate": 6.392807053872212e-07, - "loss": 0.74305862, - "num_input_tokens_seen": 267635075, - "step": 12408, - "time_per_iteration": 2.7553751468658447 - }, - { - "auxiliary_loss_clip": 0.01105957, - "auxiliary_loss_mlp": 0.0103312, - "balance_loss_clip": 1.03999674, - "balance_loss_mlp": 1.01942849, - "epoch": 0.7460694423568315, - "flos": 21908525990400.0, - "grad_norm": 2.19086035143854, - "language_loss": 0.72995472, - "learning_rate": 6.38995303134053e-07, - "loss": 0.7513454, - "num_input_tokens_seen": 267654105, - "step": 12409, - "time_per_iteration": 2.6748335361480713 - }, - { - "auxiliary_loss_clip": 0.01097314, - "auxiliary_loss_mlp": 0.01031591, - "balance_loss_clip": 1.03749943, - "balance_loss_mlp": 1.02024233, - "epoch": 0.7461295656094995, - "flos": 21215737399680.0, - "grad_norm": 2.015553074030815, - "language_loss": 0.65646017, - "learning_rate": 6.38709952490319e-07, - "loss": 0.67774916, - "num_input_tokens_seen": 267673090, - "step": 12410, - "time_per_iteration": 2.599883794784546 - }, - { - "auxiliary_loss_clip": 0.01094288, - "auxiliary_loss_mlp": 0.00770134, - "balance_loss_clip": 1.0380162, - "balance_loss_mlp": 1.00011945, - "epoch": 0.7461896888621674, - "flos": 22346851656960.0, - "grad_norm": 1.8387948527336508, - "language_loss": 0.84203392, - "learning_rate": 6.384246534668396e-07, - "loss": 0.86067814, - "num_input_tokens_seen": 267690605, - "step": 12411, - "time_per_iteration": 2.7593939304351807 - }, - { - "auxiliary_loss_clip": 0.01076302, - "auxiliary_loss_mlp": 0.01030521, - "balance_loss_clip": 1.0369643, - "balance_loss_mlp": 1.01747966, - "epoch": 0.7462498121148354, - "flos": 25482930243840.0, - "grad_norm": 2.2444375236075578, - "language_loss": 0.77899462, - "learning_rate": 6.381394060744339e-07, - "loss": 0.80006284, - "num_input_tokens_seen": 267710540, - "step": 12412, - "time_per_iteration": 2.880466938018799 - }, - { - "auxiliary_loss_clip": 0.01069141, - "auxiliary_loss_mlp": 0.01041632, - "balance_loss_clip": 1.03378701, - "balance_loss_mlp": 1.02820313, - "epoch": 0.7463099353675033, - "flos": 33947936812800.0, - "grad_norm": 2.442333824498856, - "language_loss": 0.62740505, - "learning_rate": 6.378542103239188e-07, - "loss": 0.64851284, - "num_input_tokens_seen": 267730780, - "step": 12413, - "time_per_iteration": 2.8031466007232666 - }, - { - "auxiliary_loss_clip": 0.01023176, - "auxiliary_loss_mlp": 0.00751261, - "balance_loss_clip": 1.00943136, - "balance_loss_mlp": 0.99959147, - "epoch": 0.7463700586201714, - "flos": 62767723691520.0, - "grad_norm": 0.7172744197889728, - "language_loss": 0.54801792, - "learning_rate": 6.375690662261082e-07, - "loss": 0.56576228, - "num_input_tokens_seen": 267794240, - "step": 12414, - "time_per_iteration": 3.2076735496520996 - }, - { - "auxiliary_loss_clip": 0.01081911, - "auxiliary_loss_mlp": 0.01031661, - "balance_loss_clip": 1.03365874, - "balance_loss_mlp": 1.01806545, - "epoch": 0.7464301818728393, - "flos": 33432654257280.0, - "grad_norm": 1.4875618615685628, - "language_loss": 0.5517059, - "learning_rate": 6.372839737918154e-07, - "loss": 0.57284164, - "num_input_tokens_seen": 267817190, - "step": 12415, - "time_per_iteration": 4.414318084716797 - }, - { - "auxiliary_loss_clip": 0.0104777, - "auxiliary_loss_mlp": 0.01036648, - "balance_loss_clip": 1.03617668, - "balance_loss_mlp": 1.02174664, - "epoch": 0.7464903051255073, - "flos": 26869872142080.0, - "grad_norm": 1.6764979613333528, - "language_loss": 0.75015157, - "learning_rate": 6.369989330318506e-07, - "loss": 0.77099568, - "num_input_tokens_seen": 267836245, - "step": 12416, - "time_per_iteration": 2.831061840057373 - }, - { - "auxiliary_loss_clip": 0.01060971, - "auxiliary_loss_mlp": 0.01042536, - "balance_loss_clip": 1.03266478, - "balance_loss_mlp": 1.02845144, - "epoch": 0.7465504283781753, - "flos": 44086954775040.0, - "grad_norm": 5.110704099754697, - "language_loss": 0.69582009, - "learning_rate": 6.367139439570233e-07, - "loss": 0.71685511, - "num_input_tokens_seen": 267858310, - "step": 12417, - "time_per_iteration": 6.061137676239014 - }, - { - "auxiliary_loss_clip": 0.01087135, - "auxiliary_loss_mlp": 0.010359, - "balance_loss_clip": 1.04298329, - "balance_loss_mlp": 1.02211332, - "epoch": 0.7466105516308432, - "flos": 19676102785920.0, - "grad_norm": 1.7520602773189389, - "language_loss": 0.73654354, - "learning_rate": 6.364290065781392e-07, - "loss": 0.75777388, - "num_input_tokens_seen": 267876345, - "step": 12418, - "time_per_iteration": 2.719461441040039 - }, - { - "auxiliary_loss_clip": 0.01101371, - "auxiliary_loss_mlp": 0.01032166, - "balance_loss_clip": 1.03970969, - "balance_loss_mlp": 1.01958394, - "epoch": 0.7466706748835112, - "flos": 20520722165760.0, - "grad_norm": 1.5723677716415394, - "language_loss": 0.68733931, - "learning_rate": 6.361441209060039e-07, - "loss": 0.70867467, - "num_input_tokens_seen": 267896740, - "step": 12419, - "time_per_iteration": 2.658419370651245 - }, - { - "auxiliary_loss_clip": 0.01106886, - "auxiliary_loss_mlp": 0.01034487, - "balance_loss_clip": 1.03877735, - "balance_loss_mlp": 1.0225246, - "epoch": 0.7467307981361792, - "flos": 21690260997120.0, - "grad_norm": 2.325148718588452, - "language_loss": 0.74999017, - "learning_rate": 6.358592869514216e-07, - "loss": 0.77140391, - "num_input_tokens_seen": 267914765, - "step": 12420, - "time_per_iteration": 2.6232640743255615 - }, - { - "auxiliary_loss_clip": 0.01105813, - "auxiliary_loss_mlp": 0.0103159, - "balance_loss_clip": 1.04157043, - "balance_loss_mlp": 1.01868558, - "epoch": 0.7467909213888472, - "flos": 19573686132480.0, - "grad_norm": 1.5853276507887042, - "language_loss": 0.6715399, - "learning_rate": 6.355745047251904e-07, - "loss": 0.69291389, - "num_input_tokens_seen": 267934085, - "step": 12421, - "time_per_iteration": 4.228281021118164 - }, - { - "auxiliary_loss_clip": 0.01087742, - "auxiliary_loss_mlp": 0.01034402, - "balance_loss_clip": 1.03845739, - "balance_loss_mlp": 1.02044845, - "epoch": 0.7468510446415151, - "flos": 23695225326720.0, - "grad_norm": 1.7891201641771508, - "language_loss": 0.72700393, - "learning_rate": 6.352897742381107e-07, - "loss": 0.74822545, - "num_input_tokens_seen": 267955170, - "step": 12422, - "time_per_iteration": 2.678581953048706 - }, - { - "auxiliary_loss_clip": 0.0107257, - "auxiliary_loss_mlp": 0.01034325, - "balance_loss_clip": 1.03739822, - "balance_loss_mlp": 1.02140832, - "epoch": 0.7469111678941831, - "flos": 29315783831040.0, - "grad_norm": 1.7729815610764, - "language_loss": 0.7519784, - "learning_rate": 6.350050955009796e-07, - "loss": 0.77304733, - "num_input_tokens_seen": 267974980, - "step": 12423, - "time_per_iteration": 2.884932518005371 - }, - { - "auxiliary_loss_clip": 0.01097508, - "auxiliary_loss_mlp": 0.01026494, - "balance_loss_clip": 1.03815055, - "balance_loss_mlp": 1.01491261, - "epoch": 0.746971291146851, - "flos": 21798639308160.0, - "grad_norm": 1.3102627766091752, - "language_loss": 0.67454731, - "learning_rate": 6.347204685245929e-07, - "loss": 0.69578731, - "num_input_tokens_seen": 267994985, - "step": 12424, - "time_per_iteration": 2.665360927581787 - }, - { - "auxiliary_loss_clip": 0.01106731, - "auxiliary_loss_mlp": 0.0103674, - "balance_loss_clip": 1.04188585, - "balance_loss_mlp": 1.02385378, - "epoch": 0.747031414399519, - "flos": 36245070368640.0, - "grad_norm": 1.8168677421099624, - "language_loss": 0.7413224, - "learning_rate": 6.344358933197418e-07, - "loss": 0.76275706, - "num_input_tokens_seen": 268014985, - "step": 12425, - "time_per_iteration": 2.684622049331665 - }, - { - "auxiliary_loss_clip": 0.01071034, - "auxiliary_loss_mlp": 0.01034399, - "balance_loss_clip": 1.03520596, - "balance_loss_mlp": 1.0205828, - "epoch": 0.7470915376521869, - "flos": 19974916028160.0, - "grad_norm": 8.361913341794455, - "language_loss": 0.69433403, - "learning_rate": 6.341513698972194e-07, - "loss": 0.71538836, - "num_input_tokens_seen": 268034395, - "step": 12426, - "time_per_iteration": 2.686992645263672 - }, - { - "auxiliary_loss_clip": 0.01070297, - "auxiliary_loss_mlp": 0.01035278, - "balance_loss_clip": 1.03655338, - "balance_loss_mlp": 1.02329111, - "epoch": 0.747151660904855, - "flos": 20084299920000.0, - "grad_norm": 1.4050021872275102, - "language_loss": 0.65497875, - "learning_rate": 6.338668982678139e-07, - "loss": 0.67603451, - "num_input_tokens_seen": 268054485, - "step": 12427, - "time_per_iteration": 2.8737995624542236 - }, - { - "auxiliary_loss_clip": 0.0111177, - "auxiliary_loss_mlp": 0.01030082, - "balance_loss_clip": 1.03934562, - "balance_loss_mlp": 1.01686215, - "epoch": 0.7472117841575229, - "flos": 16290373697280.0, - "grad_norm": 1.6443370194470839, - "language_loss": 0.74700832, - "learning_rate": 6.335824784423118e-07, - "loss": 0.7684269, - "num_input_tokens_seen": 268072250, - "step": 12428, - "time_per_iteration": 2.5923843383789062 - }, - { - "auxiliary_loss_clip": 0.01105561, - "auxiliary_loss_mlp": 0.0103113, - "balance_loss_clip": 1.03948128, - "balance_loss_mlp": 1.01726604, - "epoch": 0.7472719074101909, - "flos": 21389939383680.0, - "grad_norm": 1.8997644217403626, - "language_loss": 0.5859766, - "learning_rate": 6.33298110431499e-07, - "loss": 0.60734349, - "num_input_tokens_seen": 268089840, - "step": 12429, - "time_per_iteration": 2.673205614089966 - }, - { - "auxiliary_loss_clip": 0.01100742, - "auxiliary_loss_mlp": 0.01035285, - "balance_loss_clip": 1.0397048, - "balance_loss_mlp": 1.02210021, - "epoch": 0.7473320306628589, - "flos": 29643289061760.0, - "grad_norm": 2.191924076091365, - "language_loss": 0.60676718, - "learning_rate": 6.330137942461595e-07, - "loss": 0.62812746, - "num_input_tokens_seen": 268109360, - "step": 12430, - "time_per_iteration": 2.695838212966919 - }, - { - "auxiliary_loss_clip": 0.01089402, - "auxiliary_loss_mlp": 0.01035646, - "balance_loss_clip": 1.0370059, - "balance_loss_mlp": 1.02266431, - "epoch": 0.7473921539155268, - "flos": 24136100858880.0, - "grad_norm": 1.60839761436318, - "language_loss": 0.75666201, - "learning_rate": 6.327295298970734e-07, - "loss": 0.7779125, - "num_input_tokens_seen": 268131840, - "step": 12431, - "time_per_iteration": 2.7131593227386475 - }, - { - "auxiliary_loss_clip": 0.01098694, - "auxiliary_loss_mlp": 0.01031285, - "balance_loss_clip": 1.03696167, - "balance_loss_mlp": 1.01853514, - "epoch": 0.7474522771681948, - "flos": 17487958072320.0, - "grad_norm": 1.8735643765316532, - "language_loss": 0.75119841, - "learning_rate": 6.32445317395021e-07, - "loss": 0.77249819, - "num_input_tokens_seen": 268148300, - "step": 12432, - "time_per_iteration": 2.596440315246582 - }, - { - "auxiliary_loss_clip": 0.01088473, - "auxiliary_loss_mlp": 0.01036339, - "balance_loss_clip": 1.03782606, - "balance_loss_mlp": 1.02223635, - "epoch": 0.7475124004208628, - "flos": 16727298733440.0, - "grad_norm": 2.50552734802935, - "language_loss": 0.69950736, - "learning_rate": 6.321611567507787e-07, - "loss": 0.72075546, - "num_input_tokens_seen": 268166450, - "step": 12433, - "time_per_iteration": 2.606110095977783 - }, - { - "auxiliary_loss_clip": 0.01063022, - "auxiliary_loss_mlp": 0.01031886, - "balance_loss_clip": 1.03389204, - "balance_loss_mlp": 1.01835036, - "epoch": 0.7475725236735308, - "flos": 19720237622400.0, - "grad_norm": 2.703159081845411, - "language_loss": 0.67130244, - "learning_rate": 6.318770479751232e-07, - "loss": 0.6922515, - "num_input_tokens_seen": 268186165, - "step": 12434, - "time_per_iteration": 2.751291513442993 - }, - { - "auxiliary_loss_clip": 0.01105439, - "auxiliary_loss_mlp": 0.01035376, - "balance_loss_clip": 1.03803849, - "balance_loss_mlp": 1.02368116, - "epoch": 0.7476326469261987, - "flos": 26286000566400.0, - "grad_norm": 3.4930601864100224, - "language_loss": 0.7979542, - "learning_rate": 6.315929910788263e-07, - "loss": 0.8193624, - "num_input_tokens_seen": 268208145, - "step": 12435, - "time_per_iteration": 2.6472816467285156 - }, - { - "auxiliary_loss_clip": 0.01083734, - "auxiliary_loss_mlp": 0.01027736, - "balance_loss_clip": 1.03813887, - "balance_loss_mlp": 1.01502252, - "epoch": 0.7476927701788667, - "flos": 31831828824960.0, - "grad_norm": 1.8861832027521432, - "language_loss": 0.68124855, - "learning_rate": 6.313089860726604e-07, - "loss": 0.70236325, - "num_input_tokens_seen": 268228345, - "step": 12436, - "time_per_iteration": 2.813854694366455 - }, - { - "auxiliary_loss_clip": 0.0108534, - "auxiliary_loss_mlp": 0.01034815, - "balance_loss_clip": 1.0374372, - "balance_loss_mlp": 1.02242923, - "epoch": 0.7477528934315346, - "flos": 31795487239680.0, - "grad_norm": 1.9570276627413858, - "language_loss": 0.70576406, - "learning_rate": 6.31025032967396e-07, - "loss": 0.72696555, - "num_input_tokens_seen": 268250260, - "step": 12437, - "time_per_iteration": 2.7825896739959717 - }, - { - "auxiliary_loss_clip": 0.01071415, - "auxiliary_loss_mlp": 0.01028505, - "balance_loss_clip": 1.03356171, - "balance_loss_mlp": 1.01697707, - "epoch": 0.7478130166842026, - "flos": 20371979946240.0, - "grad_norm": 2.395892152897482, - "language_loss": 0.67251343, - "learning_rate": 6.307411317737986e-07, - "loss": 0.69351262, - "num_input_tokens_seen": 268268440, - "step": 12438, - "time_per_iteration": 2.706458568572998 - }, - { - "auxiliary_loss_clip": 0.01087999, - "auxiliary_loss_mlp": 0.01035267, - "balance_loss_clip": 1.03646779, - "balance_loss_mlp": 1.0229404, - "epoch": 0.7478731399368705, - "flos": 18148930191360.0, - "grad_norm": 1.593097914021623, - "language_loss": 0.8085202, - "learning_rate": 6.304572825026344e-07, - "loss": 0.8297528, - "num_input_tokens_seen": 268285765, - "step": 12439, - "time_per_iteration": 2.665294647216797 - }, - { - "auxiliary_loss_clip": 0.01074236, - "auxiliary_loss_mlp": 0.01040046, - "balance_loss_clip": 1.03548503, - "balance_loss_mlp": 1.02805412, - "epoch": 0.7479332631895386, - "flos": 15267889146240.0, - "grad_norm": 2.6477676249196334, - "language_loss": 0.70738852, - "learning_rate": 6.301734851646674e-07, - "loss": 0.72853136, - "num_input_tokens_seen": 268304015, - "step": 12440, - "time_per_iteration": 2.7106735706329346 - }, - { - "auxiliary_loss_clip": 0.01088049, - "auxiliary_loss_mlp": 0.01026861, - "balance_loss_clip": 1.04011965, - "balance_loss_mlp": 1.01467144, - "epoch": 0.7479933864422065, - "flos": 21142515525120.0, - "grad_norm": 1.6270418049825819, - "language_loss": 0.74380887, - "learning_rate": 6.298897397706597e-07, - "loss": 0.7649579, - "num_input_tokens_seen": 268323290, - "step": 12441, - "time_per_iteration": 2.7022409439086914 - }, - { - "auxiliary_loss_clip": 0.01105099, - "auxiliary_loss_mlp": 0.00770813, - "balance_loss_clip": 1.04095459, - "balance_loss_mlp": 1.00020576, - "epoch": 0.7480535096948745, - "flos": 14392027912320.0, - "grad_norm": 2.187499472876037, - "language_loss": 0.82711899, - "learning_rate": 6.296060463313698e-07, - "loss": 0.84587812, - "num_input_tokens_seen": 268339490, - "step": 12442, - "time_per_iteration": 2.7588963508605957 - }, - { - "auxiliary_loss_clip": 0.0105579, - "auxiliary_loss_mlp": 0.01031459, - "balance_loss_clip": 1.03666043, - "balance_loss_mlp": 1.01823914, - "epoch": 0.7481136329475425, - "flos": 27344683048320.0, - "grad_norm": 2.073136454951009, - "language_loss": 0.63220263, - "learning_rate": 6.293224048575565e-07, - "loss": 0.65307516, - "num_input_tokens_seen": 268359865, - "step": 12443, - "time_per_iteration": 2.874648094177246 - }, - { - "auxiliary_loss_clip": 0.01067932, - "auxiliary_loss_mlp": 0.0102658, - "balance_loss_clip": 1.03455901, - "balance_loss_mlp": 1.01451015, - "epoch": 0.7481737562002104, - "flos": 19531454716800.0, - "grad_norm": 2.062388953360283, - "language_loss": 0.7137714, - "learning_rate": 6.29038815359975e-07, - "loss": 0.73471653, - "num_input_tokens_seen": 268377065, - "step": 12444, - "time_per_iteration": 2.703878402709961 - }, - { - "auxiliary_loss_clip": 0.01059747, - "auxiliary_loss_mlp": 0.01031938, - "balance_loss_clip": 1.03627777, - "balance_loss_mlp": 1.01890206, - "epoch": 0.7482338794528784, - "flos": 21760035166080.0, - "grad_norm": 1.378277825499583, - "language_loss": 0.69101679, - "learning_rate": 6.287552778493786e-07, - "loss": 0.71193373, - "num_input_tokens_seen": 268396935, - "step": 12445, - "time_per_iteration": 2.757577657699585 - }, - { - "auxiliary_loss_clip": 0.01098864, - "auxiliary_loss_mlp": 0.0102548, - "balance_loss_clip": 1.03871107, - "balance_loss_mlp": 1.01329112, - "epoch": 0.7482940027055464, - "flos": 18697358021760.0, - "grad_norm": 1.944144924482792, - "language_loss": 0.74288422, - "learning_rate": 6.28471792336519e-07, - "loss": 0.76412767, - "num_input_tokens_seen": 268414460, - "step": 12446, - "time_per_iteration": 2.69356107711792 - }, - { - "auxiliary_loss_clip": 0.01094765, - "auxiliary_loss_mlp": 0.00771514, - "balance_loss_clip": 1.04004169, - "balance_loss_mlp": 1.0002172, - "epoch": 0.7483541259582144, - "flos": 15998024903040.0, - "grad_norm": 2.4465560126403245, - "language_loss": 0.7326262, - "learning_rate": 6.281883588321475e-07, - "loss": 0.75128901, - "num_input_tokens_seen": 268432225, - "step": 12447, - "time_per_iteration": 2.662238597869873 - }, - { - "auxiliary_loss_clip": 0.01068097, - "auxiliary_loss_mlp": 0.01031231, - "balance_loss_clip": 1.03563976, - "balance_loss_mlp": 1.0193516, - "epoch": 0.7484142492108823, - "flos": 25556295772800.0, - "grad_norm": 2.4715537752348906, - "language_loss": 0.7231704, - "learning_rate": 6.279049773470109e-07, - "loss": 0.74416363, - "num_input_tokens_seen": 268449270, - "step": 12448, - "time_per_iteration": 2.7589666843414307 - }, - { - "auxiliary_loss_clip": 0.01113987, - "auxiliary_loss_mlp": 0.01037666, - "balance_loss_clip": 1.04052019, - "balance_loss_mlp": 1.02560151, - "epoch": 0.7484743724635503, - "flos": 22887737631360.0, - "grad_norm": 1.8427048424278483, - "language_loss": 0.73759341, - "learning_rate": 6.276216478918543e-07, - "loss": 0.75910997, - "num_input_tokens_seen": 268467250, - "step": 12449, - "time_per_iteration": 2.6071417331695557 - }, - { - "auxiliary_loss_clip": 0.01076255, - "auxiliary_loss_mlp": 0.01037131, - "balance_loss_clip": 1.03802109, - "balance_loss_mlp": 1.02391624, - "epoch": 0.7485344957162182, - "flos": 25300288563840.0, - "grad_norm": 2.0043420955718716, - "language_loss": 0.6146363, - "learning_rate": 6.273383704774225e-07, - "loss": 0.6357702, - "num_input_tokens_seen": 268487270, - "step": 12450, - "time_per_iteration": 2.7463302612304688 - }, - { - "auxiliary_loss_clip": 0.01106441, - "auxiliary_loss_mlp": 0.01026536, - "balance_loss_clip": 1.03821647, - "balance_loss_mlp": 1.01458502, - "epoch": 0.7485946189688862, - "flos": 27053016612480.0, - "grad_norm": 1.9632558902155064, - "language_loss": 0.70478344, - "learning_rate": 6.270551451144577e-07, - "loss": 0.7261132, - "num_input_tokens_seen": 268508020, - "step": 12451, - "time_per_iteration": 2.632495641708374 - }, - { - "auxiliary_loss_clip": 0.01103126, - "auxiliary_loss_mlp": 0.01029478, - "balance_loss_clip": 1.03716731, - "balance_loss_mlp": 1.0168184, - "epoch": 0.7486547422215541, - "flos": 26906752431360.0, - "grad_norm": 2.915106727246987, - "language_loss": 0.80665791, - "learning_rate": 6.267719718136988e-07, - "loss": 0.82798392, - "num_input_tokens_seen": 268527375, - "step": 12452, - "time_per_iteration": 2.6505486965179443 - }, - { - "auxiliary_loss_clip": 0.01119519, - "auxiliary_loss_mlp": 0.0103336, - "balance_loss_clip": 1.04324985, - "balance_loss_mlp": 1.02005577, - "epoch": 0.7487148654742222, - "flos": 22346277039360.0, - "grad_norm": 2.8493444529110215, - "language_loss": 0.71248496, - "learning_rate": 6.264888505858843e-07, - "loss": 0.73401374, - "num_input_tokens_seen": 268544870, - "step": 12453, - "time_per_iteration": 2.6861732006073 - }, - { - "auxiliary_loss_clip": 0.01091229, - "auxiliary_loss_mlp": 0.01034252, - "balance_loss_clip": 1.03970766, - "balance_loss_mlp": 1.02196777, - "epoch": 0.7487749887268901, - "flos": 23038814234880.0, - "grad_norm": 1.5893693791461498, - "language_loss": 0.73979241, - "learning_rate": 6.262057814417517e-07, - "loss": 0.76104718, - "num_input_tokens_seen": 268564580, - "step": 12454, - "time_per_iteration": 2.716642379760742 - }, - { - "auxiliary_loss_clip": 0.0100113, - "auxiliary_loss_mlp": 0.01001978, - "balance_loss_clip": 1.00717449, - "balance_loss_mlp": 1.00067317, - "epoch": 0.7488351119795581, - "flos": 71525294536320.0, - "grad_norm": 0.7358432419267441, - "language_loss": 0.59396183, - "learning_rate": 6.259227643920322e-07, - "loss": 0.61399293, - "num_input_tokens_seen": 268629550, - "step": 12455, - "time_per_iteration": 4.886117935180664 - }, - { - "auxiliary_loss_clip": 0.01072127, - "auxiliary_loss_mlp": 0.01029798, - "balance_loss_clip": 1.0343852, - "balance_loss_mlp": 1.01737666, - "epoch": 0.748895235232226, - "flos": 17196255722880.0, - "grad_norm": 2.489880729520255, - "language_loss": 0.79817784, - "learning_rate": 6.256397994474592e-07, - "loss": 0.81919706, - "num_input_tokens_seen": 268646645, - "step": 12456, - "time_per_iteration": 5.9515721797943115 - }, - { - "auxiliary_loss_clip": 0.01020316, - "auxiliary_loss_mlp": 0.01001663, - "balance_loss_clip": 1.00686383, - "balance_loss_mlp": 1.00054216, - "epoch": 0.748955358484894, - "flos": 58979256336000.0, - "grad_norm": 0.849157440562182, - "language_loss": 0.61421359, - "learning_rate": 6.25356886618763e-07, - "loss": 0.63443339, - "num_input_tokens_seen": 268702275, - "step": 12457, - "time_per_iteration": 3.1303980350494385 - }, - { - "auxiliary_loss_clip": 0.01098576, - "auxiliary_loss_mlp": 0.01035418, - "balance_loss_clip": 1.04226291, - "balance_loss_mlp": 1.02326477, - "epoch": 0.749015481737562, - "flos": 11360413054080.0, - "grad_norm": 1.9444047716710122, - "language_loss": 0.6761775, - "learning_rate": 6.250740259166711e-07, - "loss": 0.6975174, - "num_input_tokens_seen": 268716265, - "step": 12458, - "time_per_iteration": 2.665384292602539 - }, - { - "auxiliary_loss_clip": 0.0105583, - "auxiliary_loss_mlp": 0.01032729, - "balance_loss_clip": 1.03316355, - "balance_loss_mlp": 1.02080858, - "epoch": 0.74907560499023, - "flos": 21106497162240.0, - "grad_norm": 2.619057127646577, - "language_loss": 0.79952264, - "learning_rate": 6.247912173519106e-07, - "loss": 0.82040823, - "num_input_tokens_seen": 268734330, - "step": 12459, - "time_per_iteration": 2.754957675933838 - }, - { - "auxiliary_loss_clip": 0.01072944, - "auxiliary_loss_mlp": 0.01036735, - "balance_loss_clip": 1.0369221, - "balance_loss_mlp": 1.02394927, - "epoch": 0.749135728242898, - "flos": 22268027260800.0, - "grad_norm": 1.4984057584596764, - "language_loss": 0.80603898, - "learning_rate": 6.245084609352043e-07, - "loss": 0.82713568, - "num_input_tokens_seen": 268753500, - "step": 12460, - "time_per_iteration": 4.2594664096832275 - }, - { - "auxiliary_loss_clip": 0.01082271, - "auxiliary_loss_mlp": 0.01031927, - "balance_loss_clip": 1.03578806, - "balance_loss_mlp": 1.01876581, - "epoch": 0.7491958514955659, - "flos": 24057527857920.0, - "grad_norm": 1.80824785320189, - "language_loss": 0.85877681, - "learning_rate": 6.242257566772755e-07, - "loss": 0.87991881, - "num_input_tokens_seen": 268772055, - "step": 12461, - "time_per_iteration": 2.6852405071258545 - }, - { - "auxiliary_loss_clip": 0.01093212, - "auxiliary_loss_mlp": 0.01035506, - "balance_loss_clip": 1.03965092, - "balance_loss_mlp": 1.02309084, - "epoch": 0.7492559747482339, - "flos": 24492118510080.0, - "grad_norm": 1.8962735896690046, - "language_loss": 0.69416398, - "learning_rate": 6.239431045888435e-07, - "loss": 0.71545118, - "num_input_tokens_seen": 268792265, - "step": 12462, - "time_per_iteration": 2.768845319747925 - }, - { - "auxiliary_loss_clip": 0.01110765, - "auxiliary_loss_mlp": 0.01033779, - "balance_loss_clip": 1.03923655, - "balance_loss_mlp": 1.02101731, - "epoch": 0.7493160980009018, - "flos": 27745338326400.0, - "grad_norm": 2.365885457635203, - "language_loss": 0.7031799, - "learning_rate": 6.236605046806267e-07, - "loss": 0.72462535, - "num_input_tokens_seen": 268812735, - "step": 12463, - "time_per_iteration": 2.6340458393096924 - }, - { - "auxiliary_loss_clip": 0.01074204, - "auxiliary_loss_mlp": 0.01032497, - "balance_loss_clip": 1.03618455, - "balance_loss_mlp": 1.02071965, - "epoch": 0.7493762212535698, - "flos": 30226190970240.0, - "grad_norm": 2.141316726058728, - "language_loss": 0.77804828, - "learning_rate": 6.233779569633419e-07, - "loss": 0.7991153, - "num_input_tokens_seen": 268833090, - "step": 12464, - "time_per_iteration": 2.751758098602295 - }, - { - "auxiliary_loss_clip": 0.0108502, - "auxiliary_loss_mlp": 0.01026607, - "balance_loss_clip": 1.03515768, - "balance_loss_mlp": 1.01502621, - "epoch": 0.7494363445062378, - "flos": 21944472526080.0, - "grad_norm": 1.8114572432161449, - "language_loss": 0.78449178, - "learning_rate": 6.230954614477034e-07, - "loss": 0.80560803, - "num_input_tokens_seen": 268851880, - "step": 12465, - "time_per_iteration": 2.6739721298217773 - }, - { - "auxiliary_loss_clip": 0.0108024, - "auxiliary_loss_mlp": 0.01039302, - "balance_loss_clip": 1.03586817, - "balance_loss_mlp": 1.02480614, - "epoch": 0.7494964677589058, - "flos": 12490342162560.0, - "grad_norm": 2.3217514076277697, - "language_loss": 0.74179816, - "learning_rate": 6.22813018144422e-07, - "loss": 0.76299357, - "num_input_tokens_seen": 268867910, - "step": 12466, - "time_per_iteration": 2.63236665725708 - }, - { - "auxiliary_loss_clip": 0.01098476, - "auxiliary_loss_mlp": 0.01036506, - "balance_loss_clip": 1.03608537, - "balance_loss_mlp": 1.02381599, - "epoch": 0.7495565910115737, - "flos": 21653057485440.0, - "grad_norm": 2.1977964760321362, - "language_loss": 0.66625774, - "learning_rate": 6.22530627064209e-07, - "loss": 0.68760759, - "num_input_tokens_seen": 268887260, - "step": 12467, - "time_per_iteration": 2.6381313800811768 - }, - { - "auxiliary_loss_clip": 0.01062241, - "auxiliary_loss_mlp": 0.00773108, - "balance_loss_clip": 1.03538942, - "balance_loss_mlp": 1.00025678, - "epoch": 0.7496167142642417, - "flos": 15268535591040.0, - "grad_norm": 2.2660950859773425, - "language_loss": 0.76690638, - "learning_rate": 6.222482882177735e-07, - "loss": 0.7852599, - "num_input_tokens_seen": 268902520, - "step": 12468, - "time_per_iteration": 2.717893123626709 - }, - { - "auxiliary_loss_clip": 0.01071579, - "auxiliary_loss_mlp": 0.01029806, - "balance_loss_clip": 1.03752029, - "balance_loss_mlp": 1.0167706, - "epoch": 0.7496768375169096, - "flos": 22054933825920.0, - "grad_norm": 2.258197229303168, - "language_loss": 0.69274288, - "learning_rate": 6.219660016158201e-07, - "loss": 0.7137568, - "num_input_tokens_seen": 268920970, - "step": 12469, - "time_per_iteration": 2.7141220569610596 - }, - { - "auxiliary_loss_clip": 0.01089029, - "auxiliary_loss_mlp": 0.01032283, - "balance_loss_clip": 1.03684139, - "balance_loss_mlp": 1.01970625, - "epoch": 0.7497369607695776, - "flos": 19057038860160.0, - "grad_norm": 1.9749809581101754, - "language_loss": 0.69305575, - "learning_rate": 6.216837672690543e-07, - "loss": 0.71426892, - "num_input_tokens_seen": 268936600, - "step": 12470, - "time_per_iteration": 2.736288547515869 - }, - { - "auxiliary_loss_clip": 0.01082647, - "auxiliary_loss_mlp": 0.01033832, - "balance_loss_clip": 1.03593028, - "balance_loss_mlp": 1.01967597, - "epoch": 0.7497970840222457, - "flos": 21617434172160.0, - "grad_norm": 1.8937148851838135, - "language_loss": 0.75178516, - "learning_rate": 6.214015851881793e-07, - "loss": 0.77294993, - "num_input_tokens_seen": 268956560, - "step": 12471, - "time_per_iteration": 2.664313554763794 - }, - { - "auxiliary_loss_clip": 0.01084709, - "auxiliary_loss_mlp": 0.01035791, - "balance_loss_clip": 1.0353353, - "balance_loss_mlp": 1.02159286, - "epoch": 0.7498572072749136, - "flos": 13735580906880.0, - "grad_norm": 2.796464416827846, - "language_loss": 0.77233744, - "learning_rate": 6.211194553838929e-07, - "loss": 0.79354239, - "num_input_tokens_seen": 268973945, - "step": 12472, - "time_per_iteration": 2.657557487487793 - }, - { - "auxiliary_loss_clip": 0.01094543, - "auxiliary_loss_mlp": 0.00769819, - "balance_loss_clip": 1.03535211, - "balance_loss_mlp": 1.00018263, - "epoch": 0.7499173305275816, - "flos": 22966526113920.0, - "grad_norm": 1.5448300611730317, - "language_loss": 0.84419262, - "learning_rate": 6.208373778668951e-07, - "loss": 0.8628363, - "num_input_tokens_seen": 268993245, - "step": 12473, - "time_per_iteration": 2.7043027877807617 - }, - { - "auxiliary_loss_clip": 0.01079095, - "auxiliary_loss_mlp": 0.01031863, - "balance_loss_clip": 1.03500473, - "balance_loss_mlp": 1.01823711, - "epoch": 0.7499774537802495, - "flos": 22740467869440.0, - "grad_norm": 2.038219260751869, - "language_loss": 0.7402907, - "learning_rate": 6.205553526478829e-07, - "loss": 0.76140028, - "num_input_tokens_seen": 269012125, - "step": 12474, - "time_per_iteration": 2.74438214302063 - }, - { - "auxiliary_loss_clip": 0.01088373, - "auxiliary_loss_mlp": 0.01038948, - "balance_loss_clip": 1.03736258, - "balance_loss_mlp": 1.02587676, - "epoch": 0.7500375770329175, - "flos": 18296559089280.0, - "grad_norm": 2.2001386818620263, - "language_loss": 0.74208605, - "learning_rate": 6.202733797375492e-07, - "loss": 0.76335931, - "num_input_tokens_seen": 269030545, - "step": 12475, - "time_per_iteration": 2.6366353034973145 - }, - { - "auxiliary_loss_clip": 0.0110606, - "auxiliary_loss_mlp": 0.01035347, - "balance_loss_clip": 1.03846169, - "balance_loss_mlp": 1.02150083, - "epoch": 0.7500977002855854, - "flos": 19169978198400.0, - "grad_norm": 1.7274221168077015, - "language_loss": 0.80403024, - "learning_rate": 6.199914591465878e-07, - "loss": 0.82544434, - "num_input_tokens_seen": 269048180, - "step": 12476, - "time_per_iteration": 2.622103691101074 - }, - { - "auxiliary_loss_clip": 0.01076959, - "auxiliary_loss_mlp": 0.01034735, - "balance_loss_clip": 1.0369035, - "balance_loss_mlp": 1.02214074, - "epoch": 0.7501578235382534, - "flos": 22163886754560.0, - "grad_norm": 1.9425018569967707, - "language_loss": 0.77756828, - "learning_rate": 6.19709590885688e-07, - "loss": 0.79868519, - "num_input_tokens_seen": 269068600, - "step": 12477, - "time_per_iteration": 2.6923439502716064 - }, - { - "auxiliary_loss_clip": 0.01010213, - "auxiliary_loss_mlp": 0.01001269, - "balance_loss_clip": 1.00773573, - "balance_loss_mlp": 1.00022018, - "epoch": 0.7502179467909214, - "flos": 64465040033280.0, - "grad_norm": 0.8187484606770943, - "language_loss": 0.54458755, - "learning_rate": 6.194277749655394e-07, - "loss": 0.56470239, - "num_input_tokens_seen": 269119045, - "step": 12478, - "time_per_iteration": 3.204738140106201 - }, - { - "auxiliary_loss_clip": 0.0108167, - "auxiliary_loss_mlp": 0.01032285, - "balance_loss_clip": 1.03592229, - "balance_loss_mlp": 1.02035236, - "epoch": 0.7502780700435894, - "flos": 20478275268480.0, - "grad_norm": 1.6024244799309337, - "language_loss": 0.80039358, - "learning_rate": 6.191460113968272e-07, - "loss": 0.82153314, - "num_input_tokens_seen": 269136755, - "step": 12479, - "time_per_iteration": 2.690080165863037 - }, - { - "auxiliary_loss_clip": 0.01104663, - "auxiliary_loss_mlp": 0.01038768, - "balance_loss_clip": 1.03951621, - "balance_loss_mlp": 1.02505875, - "epoch": 0.7503381932962573, - "flos": 20445273648000.0, - "grad_norm": 2.9599564657820805, - "language_loss": 0.62753713, - "learning_rate": 6.188643001902369e-07, - "loss": 0.64897144, - "num_input_tokens_seen": 269156120, - "step": 12480, - "time_per_iteration": 2.6162097454071045 - }, - { - "auxiliary_loss_clip": 0.0108428, - "auxiliary_loss_mlp": 0.01034909, - "balance_loss_clip": 1.03689194, - "balance_loss_mlp": 1.02272034, - "epoch": 0.7503983165489253, - "flos": 22381936266240.0, - "grad_norm": 2.3943314671981955, - "language_loss": 0.78243744, - "learning_rate": 6.185826413564512e-07, - "loss": 0.80362934, - "num_input_tokens_seen": 269175650, - "step": 12481, - "time_per_iteration": 2.669548988342285 - }, - { - "auxiliary_loss_clip": 0.0106997, - "auxiliary_loss_mlp": 0.01038221, - "balance_loss_clip": 1.0354079, - "balance_loss_mlp": 1.02479172, - "epoch": 0.7504584398015932, - "flos": 24899453717760.0, - "grad_norm": 1.8872543755880817, - "language_loss": 0.71297598, - "learning_rate": 6.183010349061501e-07, - "loss": 0.73405796, - "num_input_tokens_seen": 269197080, - "step": 12482, - "time_per_iteration": 2.7567055225372314 - }, - { - "auxiliary_loss_clip": 0.01111149, - "auxiliary_loss_mlp": 0.01035657, - "balance_loss_clip": 1.03868306, - "balance_loss_mlp": 1.02335453, - "epoch": 0.7505185630542612, - "flos": 25885237547520.0, - "grad_norm": 1.839701731381712, - "language_loss": 0.6994698, - "learning_rate": 6.180194808500118e-07, - "loss": 0.72093785, - "num_input_tokens_seen": 269218600, - "step": 12483, - "time_per_iteration": 2.606757402420044 - }, - { - "auxiliary_loss_clip": 0.01110582, - "auxiliary_loss_mlp": 0.01027036, - "balance_loss_clip": 1.03916931, - "balance_loss_mlp": 1.01574111, - "epoch": 0.7505786863069293, - "flos": 23143852581120.0, - "grad_norm": 2.0560449071537943, - "language_loss": 0.74602097, - "learning_rate": 6.177379791987131e-07, - "loss": 0.76739717, - "num_input_tokens_seen": 269239245, - "step": 12484, - "time_per_iteration": 2.618504285812378 - }, - { - "auxiliary_loss_clip": 0.01087809, - "auxiliary_loss_mlp": 0.01029947, - "balance_loss_clip": 1.03753555, - "balance_loss_mlp": 1.01745975, - "epoch": 0.7506388095595972, - "flos": 16983377769600.0, - "grad_norm": 1.9415131613647365, - "language_loss": 0.84624791, - "learning_rate": 6.174565299629295e-07, - "loss": 0.86742544, - "num_input_tokens_seen": 269258520, - "step": 12485, - "time_per_iteration": 2.697805404663086 - }, - { - "auxiliary_loss_clip": 0.01072795, - "auxiliary_loss_mlp": 0.0103091, - "balance_loss_clip": 1.03648996, - "balance_loss_mlp": 1.01851201, - "epoch": 0.7506989328122652, - "flos": 22344984149760.0, - "grad_norm": 1.6745365119179365, - "language_loss": 0.78448224, - "learning_rate": 6.171751331533323e-07, - "loss": 0.80551928, - "num_input_tokens_seen": 269278320, - "step": 12486, - "time_per_iteration": 2.714510202407837 - }, - { - "auxiliary_loss_clip": 0.01099772, - "auxiliary_loss_mlp": 0.01033013, - "balance_loss_clip": 1.03659987, - "balance_loss_mlp": 1.01920259, - "epoch": 0.7507590560649331, - "flos": 25776069137280.0, - "grad_norm": 2.4012807743392477, - "language_loss": 0.72792411, - "learning_rate": 6.168937887805932e-07, - "loss": 0.74925202, - "num_input_tokens_seen": 269298025, - "step": 12487, - "time_per_iteration": 2.7071502208709717 - }, - { - "auxiliary_loss_clip": 0.01085256, - "auxiliary_loss_mlp": 0.0103171, - "balance_loss_clip": 1.0348568, - "balance_loss_mlp": 1.01866841, - "epoch": 0.7508191793176011, - "flos": 24279420124800.0, - "grad_norm": 4.846564325155201, - "language_loss": 0.67752981, - "learning_rate": 6.166124968553801e-07, - "loss": 0.69869953, - "num_input_tokens_seen": 269316770, - "step": 12488, - "time_per_iteration": 2.644109010696411 - }, - { - "auxiliary_loss_clip": 0.01045289, - "auxiliary_loss_mlp": 0.01033728, - "balance_loss_clip": 1.0341351, - "balance_loss_mlp": 1.02041197, - "epoch": 0.750879302570269, - "flos": 19899575251200.0, - "grad_norm": 1.8778545582321347, - "language_loss": 0.77185404, - "learning_rate": 6.163312573883592e-07, - "loss": 0.7926442, - "num_input_tokens_seen": 269334755, - "step": 12489, - "time_per_iteration": 2.73962664604187 - }, - { - "auxiliary_loss_clip": 0.01096988, - "auxiliary_loss_mlp": 0.01031266, - "balance_loss_clip": 1.03820062, - "balance_loss_mlp": 1.01943493, - "epoch": 0.750939425822937, - "flos": 29205681667200.0, - "grad_norm": 2.4146735284189393, - "language_loss": 0.75405651, - "learning_rate": 6.160500703901956e-07, - "loss": 0.77533901, - "num_input_tokens_seen": 269353810, - "step": 12490, - "time_per_iteration": 2.6824519634246826 - }, - { - "auxiliary_loss_clip": 0.01109505, - "auxiliary_loss_mlp": 0.0103058, - "balance_loss_clip": 1.03855062, - "balance_loss_mlp": 1.01803946, - "epoch": 0.750999549075605, - "flos": 21142300043520.0, - "grad_norm": 1.5627078953093116, - "language_loss": 0.78168178, - "learning_rate": 6.157689358715527e-07, - "loss": 0.80308264, - "num_input_tokens_seen": 269372910, - "step": 12491, - "time_per_iteration": 2.6018178462982178 - }, - { - "auxiliary_loss_clip": 0.01097672, - "auxiliary_loss_mlp": 0.01031858, - "balance_loss_clip": 1.03719735, - "balance_loss_mlp": 1.02034187, - "epoch": 0.751059672328273, - "flos": 23547740083200.0, - "grad_norm": 1.6628642916222176, - "language_loss": 0.76332009, - "learning_rate": 6.154878538430899e-07, - "loss": 0.7846154, - "num_input_tokens_seen": 269391545, - "step": 12492, - "time_per_iteration": 2.691298484802246 - }, - { - "auxiliary_loss_clip": 0.01078534, - "auxiliary_loss_mlp": 0.01032513, - "balance_loss_clip": 1.03569448, - "balance_loss_mlp": 1.02058053, - "epoch": 0.7511197955809409, - "flos": 18989742729600.0, - "grad_norm": 1.9903305425404954, - "language_loss": 0.71488953, - "learning_rate": 6.152068243154671e-07, - "loss": 0.736, - "num_input_tokens_seen": 269408530, - "step": 12493, - "time_per_iteration": 2.718707323074341 - }, - { - "auxiliary_loss_clip": 0.01099033, - "auxiliary_loss_mlp": 0.00770094, - "balance_loss_clip": 1.03731656, - "balance_loss_mlp": 1.00024784, - "epoch": 0.7511799188336089, - "flos": 22046961006720.0, - "grad_norm": 4.285406665827556, - "language_loss": 0.80753833, - "learning_rate": 6.149258472993395e-07, - "loss": 0.82622963, - "num_input_tokens_seen": 269425930, - "step": 12494, - "time_per_iteration": 4.076538562774658 - }, - { - "auxiliary_loss_clip": 0.01111429, - "auxiliary_loss_mlp": 0.01029891, - "balance_loss_clip": 1.03875446, - "balance_loss_mlp": 1.01716495, - "epoch": 0.7512400420862768, - "flos": 16467125546880.0, - "grad_norm": 3.1005011583642084, - "language_loss": 0.78857327, - "learning_rate": 6.146449228053634e-07, - "loss": 0.80998647, - "num_input_tokens_seen": 269443945, - "step": 12495, - "time_per_iteration": 2.608964204788208 - }, - { - "auxiliary_loss_clip": 0.01110172, - "auxiliary_loss_mlp": 0.00769806, - "balance_loss_clip": 1.03854084, - "balance_loss_mlp": 1.0001905, - "epoch": 0.7513001653389448, - "flos": 20448326304000.0, - "grad_norm": 2.1655519437431967, - "language_loss": 0.7114259, - "learning_rate": 6.143640508441898e-07, - "loss": 0.73022562, - "num_input_tokens_seen": 269463625, - "step": 12496, - "time_per_iteration": 5.996880769729614 - }, - { - "auxiliary_loss_clip": 0.01065225, - "auxiliary_loss_mlp": 0.01035626, - "balance_loss_clip": 1.03378069, - "balance_loss_mlp": 1.02353823, - "epoch": 0.7513602885916129, - "flos": 23476816679040.0, - "grad_norm": 1.61396701477023, - "language_loss": 0.78199899, - "learning_rate": 6.140832314264705e-07, - "loss": 0.80300748, - "num_input_tokens_seen": 269483415, - "step": 12497, - "time_per_iteration": 2.9391214847564697 - }, - { - "auxiliary_loss_clip": 0.01100389, - "auxiliary_loss_mlp": 0.0103612, - "balance_loss_clip": 1.03779829, - "balance_loss_mlp": 1.02334642, - "epoch": 0.7514204118442808, - "flos": 26797224885120.0, - "grad_norm": 1.6137991944491499, - "language_loss": 0.76816785, - "learning_rate": 6.13802464562855e-07, - "loss": 0.7895329, - "num_input_tokens_seen": 269504635, - "step": 12498, - "time_per_iteration": 2.6544244289398193 - }, - { - "auxiliary_loss_clip": 0.0108807, - "auxiliary_loss_mlp": 0.01033331, - "balance_loss_clip": 1.03969288, - "balance_loss_mlp": 1.02200651, - "epoch": 0.7514805350969488, - "flos": 19865639877120.0, - "grad_norm": 1.7444376873678542, - "language_loss": 0.74047679, - "learning_rate": 6.135217502639878e-07, - "loss": 0.7616908, - "num_input_tokens_seen": 269523955, - "step": 12499, - "time_per_iteration": 4.209566831588745 - }, - { - "auxiliary_loss_clip": 0.01096501, - "auxiliary_loss_mlp": 0.01028936, - "balance_loss_clip": 1.03525448, - "balance_loss_mlp": 1.01752162, - "epoch": 0.7515406583496167, - "flos": 24571553437440.0, - "grad_norm": 2.0366798192363698, - "language_loss": 0.79610258, - "learning_rate": 6.132410885405148e-07, - "loss": 0.81735694, - "num_input_tokens_seen": 269544410, - "step": 12500, - "time_per_iteration": 2.6563799381256104 - }, - { - "auxiliary_loss_clip": 0.01108205, - "auxiliary_loss_mlp": 0.01037277, - "balance_loss_clip": 1.03992486, - "balance_loss_mlp": 1.02259684, - "epoch": 0.7516007816022847, - "flos": 20120246455680.0, - "grad_norm": 3.0425120159741588, - "language_loss": 0.73648608, - "learning_rate": 6.129604794030794e-07, - "loss": 0.75794089, - "num_input_tokens_seen": 269563315, - "step": 12501, - "time_per_iteration": 2.744978666305542 - }, - { - "auxiliary_loss_clip": 0.01086633, - "auxiliary_loss_mlp": 0.01027091, - "balance_loss_clip": 1.03513741, - "balance_loss_mlp": 1.01484871, - "epoch": 0.7516609048549526, - "flos": 22784638619520.0, - "grad_norm": 1.7898399637161078, - "language_loss": 0.78497088, - "learning_rate": 6.126799228623207e-07, - "loss": 0.80610812, - "num_input_tokens_seen": 269583950, - "step": 12502, - "time_per_iteration": 2.738304615020752 - }, - { - "auxiliary_loss_clip": 0.01089762, - "auxiliary_loss_mlp": 0.01036729, - "balance_loss_clip": 1.03781581, - "balance_loss_mlp": 1.02402735, - "epoch": 0.7517210281076206, - "flos": 10634012311680.0, - "grad_norm": 2.4706577261656277, - "language_loss": 0.70263046, - "learning_rate": 6.123994189288786e-07, - "loss": 0.72389537, - "num_input_tokens_seen": 269600120, - "step": 12503, - "time_per_iteration": 2.647141695022583 - }, - { - "auxiliary_loss_clip": 0.0102855, - "auxiliary_loss_mlp": 0.00998893, - "balance_loss_clip": 1.00588393, - "balance_loss_mlp": 0.99776667, - "epoch": 0.7517811513602886, - "flos": 66052221275520.0, - "grad_norm": 0.9994462005888404, - "language_loss": 0.63930368, - "learning_rate": 6.121189676133903e-07, - "loss": 0.65957808, - "num_input_tokens_seen": 269659815, - "step": 12504, - "time_per_iteration": 3.0780868530273438 - }, - { - "auxiliary_loss_clip": 0.01067894, - "auxiliary_loss_mlp": 0.01035688, - "balance_loss_clip": 1.03288054, - "balance_loss_mlp": 1.02317679, - "epoch": 0.7518412746129566, - "flos": 37268345018880.0, - "grad_norm": 1.4012015647577118, - "language_loss": 0.68983722, - "learning_rate": 6.118385689264896e-07, - "loss": 0.71087301, - "num_input_tokens_seen": 269684565, - "step": 12505, - "time_per_iteration": 2.979429244995117 - }, - { - "auxiliary_loss_clip": 0.01018848, - "auxiliary_loss_mlp": 0.00750909, - "balance_loss_clip": 1.00648499, - "balance_loss_mlp": 0.9996025, - "epoch": 0.7519013978656245, - "flos": 60518567727360.0, - "grad_norm": 1.3160178950136667, - "language_loss": 0.55058348, - "learning_rate": 6.11558222878809e-07, - "loss": 0.56828105, - "num_input_tokens_seen": 269752325, - "step": 12506, - "time_per_iteration": 3.3165297508239746 - }, - { - "auxiliary_loss_clip": 0.01099755, - "auxiliary_loss_mlp": 0.0103953, - "balance_loss_clip": 1.03766441, - "balance_loss_mlp": 1.02648234, - "epoch": 0.7519615211182925, - "flos": 18806885568000.0, - "grad_norm": 2.1648531082865, - "language_loss": 0.78275704, - "learning_rate": 6.112779294809796e-07, - "loss": 0.80414987, - "num_input_tokens_seen": 269770630, - "step": 12507, - "time_per_iteration": 2.608264923095703 - }, - { - "auxiliary_loss_clip": 0.01083834, - "auxiliary_loss_mlp": 0.01032238, - "balance_loss_clip": 1.03923869, - "balance_loss_mlp": 1.02056146, - "epoch": 0.7520216443709604, - "flos": 14575244209920.0, - "grad_norm": 1.7931867783569413, - "language_loss": 0.71366513, - "learning_rate": 6.10997688743631e-07, - "loss": 0.73482585, - "num_input_tokens_seen": 269787280, - "step": 12508, - "time_per_iteration": 2.695327043533325 - }, - { - "auxiliary_loss_clip": 0.01095204, - "auxiliary_loss_mlp": 0.01031492, - "balance_loss_clip": 1.03605807, - "balance_loss_mlp": 1.01884961, - "epoch": 0.7520817676236284, - "flos": 17056599644160.0, - "grad_norm": 1.7777789026239683, - "language_loss": 0.71897995, - "learning_rate": 6.107175006773885e-07, - "loss": 0.74024695, - "num_input_tokens_seen": 269805205, - "step": 12509, - "time_per_iteration": 2.649292230606079 - }, - { - "auxiliary_loss_clip": 0.01116565, - "auxiliary_loss_mlp": 0.01036188, - "balance_loss_clip": 1.04018068, - "balance_loss_mlp": 1.02252054, - "epoch": 0.7521418908762965, - "flos": 25666397936640.0, - "grad_norm": 1.6131110543422247, - "language_loss": 0.62129647, - "learning_rate": 6.104373652928785e-07, - "loss": 0.64282399, - "num_input_tokens_seen": 269824820, - "step": 12510, - "time_per_iteration": 2.5948257446289062 - }, - { - "auxiliary_loss_clip": 0.01097666, - "auxiliary_loss_mlp": 0.01031956, - "balance_loss_clip": 1.03866065, - "balance_loss_mlp": 1.01975513, - "epoch": 0.7522020141289644, - "flos": 20886759711360.0, - "grad_norm": 3.9854305674745474, - "language_loss": 0.81469762, - "learning_rate": 6.10157282600722e-07, - "loss": 0.83599389, - "num_input_tokens_seen": 269842825, - "step": 12511, - "time_per_iteration": 2.6505610942840576 - }, - { - "auxiliary_loss_clip": 0.01087038, - "auxiliary_loss_mlp": 0.01038744, - "balance_loss_clip": 1.03666866, - "balance_loss_mlp": 1.02523184, - "epoch": 0.7522621373816324, - "flos": 12640305444480.0, - "grad_norm": 1.8827619698116422, - "language_loss": 0.75637031, - "learning_rate": 6.098772526115412e-07, - "loss": 0.77762812, - "num_input_tokens_seen": 269859000, - "step": 12512, - "time_per_iteration": 2.647817850112915 - }, - { - "auxiliary_loss_clip": 0.01093893, - "auxiliary_loss_mlp": 0.01030376, - "balance_loss_clip": 1.03682494, - "balance_loss_mlp": 1.01915812, - "epoch": 0.7523222606343003, - "flos": 25626141768960.0, - "grad_norm": 1.9631337780364373, - "language_loss": 0.82056046, - "learning_rate": 6.095972753359537e-07, - "loss": 0.84180307, - "num_input_tokens_seen": 269878895, - "step": 12513, - "time_per_iteration": 2.6645336151123047 - }, - { - "auxiliary_loss_clip": 0.01097529, - "auxiliary_loss_mlp": 0.01037828, - "balance_loss_clip": 1.03800118, - "balance_loss_mlp": 1.02469754, - "epoch": 0.7523823838869683, - "flos": 20448900921600.0, - "grad_norm": 2.0970776368608846, - "language_loss": 0.74827564, - "learning_rate": 6.093173507845771e-07, - "loss": 0.76962924, - "num_input_tokens_seen": 269897280, - "step": 12514, - "time_per_iteration": 2.617037057876587 - }, - { - "auxiliary_loss_clip": 0.01090674, - "auxiliary_loss_mlp": 0.01031924, - "balance_loss_clip": 1.03809762, - "balance_loss_mlp": 1.02052188, - "epoch": 0.7524425071396362, - "flos": 14720610551040.0, - "grad_norm": 3.0939358054724146, - "language_loss": 0.68892944, - "learning_rate": 6.090374789680271e-07, - "loss": 0.71015543, - "num_input_tokens_seen": 269914640, - "step": 12515, - "time_per_iteration": 2.59306001663208 - }, - { - "auxiliary_loss_clip": 0.01100231, - "auxiliary_loss_mlp": 0.0103411, - "balance_loss_clip": 1.03811383, - "balance_loss_mlp": 1.02207565, - "epoch": 0.7525026303923043, - "flos": 30592048947840.0, - "grad_norm": 2.040446314697398, - "language_loss": 0.69929761, - "learning_rate": 6.087576598969137e-07, - "loss": 0.72064102, - "num_input_tokens_seen": 269934960, - "step": 12516, - "time_per_iteration": 2.6960794925689697 - }, - { - "auxiliary_loss_clip": 0.0106292, - "auxiliary_loss_mlp": 0.01032821, - "balance_loss_clip": 1.03736663, - "balance_loss_mlp": 1.020751, - "epoch": 0.7525627536449722, - "flos": 24791757765120.0, - "grad_norm": 1.5503440947431564, - "language_loss": 0.89659667, - "learning_rate": 6.084778935818495e-07, - "loss": 0.91755402, - "num_input_tokens_seen": 269956655, - "step": 12517, - "time_per_iteration": 2.9062299728393555 - }, - { - "auxiliary_loss_clip": 0.0108799, - "auxiliary_loss_mlp": 0.01032916, - "balance_loss_clip": 1.036955, - "balance_loss_mlp": 1.02054834, - "epoch": 0.7526228768976402, - "flos": 20779782030720.0, - "grad_norm": 1.60812028776888, - "language_loss": 0.74420178, - "learning_rate": 6.081981800334437e-07, - "loss": 0.76541078, - "num_input_tokens_seen": 269976835, - "step": 12518, - "time_per_iteration": 2.9830613136291504 - }, - { - "auxiliary_loss_clip": 0.00997374, - "auxiliary_loss_mlp": 0.01010959, - "balance_loss_clip": 1.01711154, - "balance_loss_mlp": 1.0097965, - "epoch": 0.7526830001503081, - "flos": 66559243703040.0, - "grad_norm": 0.708684039109314, - "language_loss": 0.55700099, - "learning_rate": 6.079185192623017e-07, - "loss": 0.5770843, - "num_input_tokens_seen": 270040630, - "step": 12519, - "time_per_iteration": 3.3146941661834717 - }, - { - "auxiliary_loss_clip": 0.01093149, - "auxiliary_loss_mlp": 0.01034827, - "balance_loss_clip": 1.0377202, - "balance_loss_mlp": 1.0233829, - "epoch": 0.7527431234029761, - "flos": 23477894087040.0, - "grad_norm": 1.471289032229335, - "language_loss": 0.77771223, - "learning_rate": 6.07638911279029e-07, - "loss": 0.79899204, - "num_input_tokens_seen": 270059695, - "step": 12520, - "time_per_iteration": 2.6884288787841797 - }, - { - "auxiliary_loss_clip": 0.01092157, - "auxiliary_loss_mlp": 0.01040045, - "balance_loss_clip": 1.0348748, - "balance_loss_mlp": 1.02787995, - "epoch": 0.752803246655644, - "flos": 22049546785920.0, - "grad_norm": 1.9875940404305874, - "language_loss": 0.73850435, - "learning_rate": 6.07359356094229e-07, - "loss": 0.75982636, - "num_input_tokens_seen": 270078420, - "step": 12521, - "time_per_iteration": 2.6750216484069824 - }, - { - "auxiliary_loss_clip": 0.01088057, - "auxiliary_loss_mlp": 0.01037596, - "balance_loss_clip": 1.03908432, - "balance_loss_mlp": 1.02416718, - "epoch": 0.752863369908312, - "flos": 30153795108480.0, - "grad_norm": 2.5051463409980634, - "language_loss": 0.67080051, - "learning_rate": 6.070798537185016e-07, - "loss": 0.69205701, - "num_input_tokens_seen": 270097040, - "step": 12522, - "time_per_iteration": 2.772545576095581 - }, - { - "auxiliary_loss_clip": 0.01101858, - "auxiliary_loss_mlp": 0.01042231, - "balance_loss_clip": 1.0390172, - "balance_loss_mlp": 1.02954745, - "epoch": 0.7529234931609801, - "flos": 24567638855040.0, - "grad_norm": 1.9325900284520732, - "language_loss": 0.78271604, - "learning_rate": 6.068004041624453e-07, - "loss": 0.8041569, - "num_input_tokens_seen": 270116365, - "step": 12523, - "time_per_iteration": 2.5928404331207275 - }, - { - "auxiliary_loss_clip": 0.01107861, - "auxiliary_loss_mlp": 0.01033051, - "balance_loss_clip": 1.03753757, - "balance_loss_mlp": 1.02056384, - "epoch": 0.752983616413648, - "flos": 23112395245440.0, - "grad_norm": 1.9643840273203326, - "language_loss": 0.80548674, - "learning_rate": 6.065210074366571e-07, - "loss": 0.82689583, - "num_input_tokens_seen": 270135395, - "step": 12524, - "time_per_iteration": 2.5654656887054443 - }, - { - "auxiliary_loss_clip": 0.01100021, - "auxiliary_loss_mlp": 0.00769024, - "balance_loss_clip": 1.03862953, - "balance_loss_mlp": 1.00022125, - "epoch": 0.753043739666316, - "flos": 24316946858880.0, - "grad_norm": 1.7390985823733704, - "language_loss": 0.74004686, - "learning_rate": 6.062416635517326e-07, - "loss": 0.75873733, - "num_input_tokens_seen": 270156425, - "step": 12525, - "time_per_iteration": 2.629235029220581 - }, - { - "auxiliary_loss_clip": 0.01076975, - "auxiliary_loss_mlp": 0.01030405, - "balance_loss_clip": 1.03678906, - "balance_loss_mlp": 1.01844287, - "epoch": 0.7531038629189839, - "flos": 24243294021120.0, - "grad_norm": 1.8793498338294334, - "language_loss": 0.72428775, - "learning_rate": 6.059623725182641e-07, - "loss": 0.74536157, - "num_input_tokens_seen": 270176905, - "step": 12526, - "time_per_iteration": 2.7281389236450195 - }, - { - "auxiliary_loss_clip": 0.01088063, - "auxiliary_loss_mlp": 0.01028205, - "balance_loss_clip": 1.03752398, - "balance_loss_mlp": 1.01674342, - "epoch": 0.7531639861716519, - "flos": 30188807890560.0, - "grad_norm": 1.5986018665355572, - "language_loss": 0.72446048, - "learning_rate": 6.056831343468414e-07, - "loss": 0.74562311, - "num_input_tokens_seen": 270196640, - "step": 12527, - "time_per_iteration": 2.765742301940918 - }, - { - "auxiliary_loss_clip": 0.01077327, - "auxiliary_loss_mlp": 0.01027333, - "balance_loss_clip": 1.03892338, - "balance_loss_mlp": 1.01588261, - "epoch": 0.7532241094243198, - "flos": 18223193560320.0, - "grad_norm": 1.7490164070315937, - "language_loss": 0.81002724, - "learning_rate": 6.054039490480539e-07, - "loss": 0.83107388, - "num_input_tokens_seen": 270213905, - "step": 12528, - "time_per_iteration": 2.8258893489837646 - }, - { - "auxiliary_loss_clip": 0.01062731, - "auxiliary_loss_mlp": 0.01037431, - "balance_loss_clip": 1.04194808, - "balance_loss_mlp": 1.02391267, - "epoch": 0.7532842326769879, - "flos": 20881049448960.0, - "grad_norm": 2.0737257705998084, - "language_loss": 0.84989285, - "learning_rate": 6.051248166324892e-07, - "loss": 0.87089443, - "num_input_tokens_seen": 270231995, - "step": 12529, - "time_per_iteration": 2.8930623531341553 - }, - { - "auxiliary_loss_clip": 0.01084647, - "auxiliary_loss_mlp": 0.01037761, - "balance_loss_clip": 1.04127479, - "balance_loss_mlp": 1.02479124, - "epoch": 0.7533443559296558, - "flos": 18078689145600.0, - "grad_norm": 1.9050070504159882, - "language_loss": 0.73877907, - "learning_rate": 6.048457371107303e-07, - "loss": 0.76000321, - "num_input_tokens_seen": 270251480, - "step": 12530, - "time_per_iteration": 2.765109062194824 - }, - { - "auxiliary_loss_clip": 0.0098332, - "auxiliary_loss_mlp": 0.01008335, - "balance_loss_clip": 1.01471329, - "balance_loss_mlp": 1.00720811, - "epoch": 0.7534044791823238, - "flos": 50254830766080.0, - "grad_norm": 0.8264532859334547, - "language_loss": 0.63601089, - "learning_rate": 6.045667104933612e-07, - "loss": 0.65592742, - "num_input_tokens_seen": 270306480, - "step": 12531, - "time_per_iteration": 3.203054428100586 - }, - { - "auxiliary_loss_clip": 0.01090436, - "auxiliary_loss_mlp": 0.01030983, - "balance_loss_clip": 1.03936112, - "balance_loss_mlp": 1.01770902, - "epoch": 0.7534646024349917, - "flos": 20850274471680.0, - "grad_norm": 2.3022787240399087, - "language_loss": 0.69915926, - "learning_rate": 6.042877367909633e-07, - "loss": 0.72037345, - "num_input_tokens_seen": 270324595, - "step": 12532, - "time_per_iteration": 2.8513519763946533 - }, - { - "auxiliary_loss_clip": 0.01080734, - "auxiliary_loss_mlp": 0.010295, - "balance_loss_clip": 1.0378058, - "balance_loss_mlp": 1.01846147, - "epoch": 0.7535247256876597, - "flos": 23071779941760.0, - "grad_norm": 1.6087653356009437, - "language_loss": 0.77676719, - "learning_rate": 6.040088160141132e-07, - "loss": 0.79786956, - "num_input_tokens_seen": 270344375, - "step": 12533, - "time_per_iteration": 5.849594831466675 - }, - { - "auxiliary_loss_clip": 0.01019649, - "auxiliary_loss_mlp": 0.01000792, - "balance_loss_clip": 1.00604045, - "balance_loss_mlp": 0.99969578, - "epoch": 0.7535848489403276, - "flos": 58623418252800.0, - "grad_norm": 0.7831538235922403, - "language_loss": 0.57278597, - "learning_rate": 6.037299481733886e-07, - "loss": 0.5929904, - "num_input_tokens_seen": 270405235, - "step": 12534, - "time_per_iteration": 4.745975494384766 - }, - { - "auxiliary_loss_clip": 0.01087528, - "auxiliary_loss_mlp": 0.01028431, - "balance_loss_clip": 1.03641176, - "balance_loss_mlp": 1.01590824, - "epoch": 0.7536449721929956, - "flos": 26577882483840.0, - "grad_norm": 1.758420059943407, - "language_loss": 0.71251357, - "learning_rate": 6.03451133279365e-07, - "loss": 0.73367316, - "num_input_tokens_seen": 270425820, - "step": 12535, - "time_per_iteration": 4.465839862823486 - }, - { - "auxiliary_loss_clip": 0.01084954, - "auxiliary_loss_mlp": 0.01032093, - "balance_loss_clip": 1.0328145, - "balance_loss_mlp": 1.01900923, - "epoch": 0.7537050954456637, - "flos": 25735992537600.0, - "grad_norm": 1.6235192895129946, - "language_loss": 0.80976534, - "learning_rate": 6.031723713426135e-07, - "loss": 0.83093584, - "num_input_tokens_seen": 270447120, - "step": 12536, - "time_per_iteration": 2.8644282817840576 - }, - { - "auxiliary_loss_clip": 0.01075788, - "auxiliary_loss_mlp": 0.01032433, - "balance_loss_clip": 1.03380179, - "balance_loss_mlp": 1.02025628, - "epoch": 0.7537652186983316, - "flos": 30224431203840.0, - "grad_norm": 2.1219622248663095, - "language_loss": 0.74480766, - "learning_rate": 6.028936623737067e-07, - "loss": 0.76588988, - "num_input_tokens_seen": 270468680, - "step": 12537, - "time_per_iteration": 2.8825693130493164 - }, - { - "auxiliary_loss_clip": 0.01110837, - "auxiliary_loss_mlp": 0.01034517, - "balance_loss_clip": 1.03765774, - "balance_loss_mlp": 1.0218091, - "epoch": 0.7538253419509996, - "flos": 12641239198080.0, - "grad_norm": 1.7916470224859762, - "language_loss": 0.74127239, - "learning_rate": 6.026150063832111e-07, - "loss": 0.76272595, - "num_input_tokens_seen": 270486310, - "step": 12538, - "time_per_iteration": 2.6497671604156494 - }, - { - "auxiliary_loss_clip": 0.0107304, - "auxiliary_loss_mlp": 0.01037868, - "balance_loss_clip": 1.03775454, - "balance_loss_mlp": 1.02487969, - "epoch": 0.7538854652036675, - "flos": 23185976256000.0, - "grad_norm": 1.6497097895252697, - "language_loss": 0.67839807, - "learning_rate": 6.023364033816956e-07, - "loss": 0.69950712, - "num_input_tokens_seen": 270507210, - "step": 12539, - "time_per_iteration": 4.390820503234863 - }, - { - "auxiliary_loss_clip": 0.01109728, - "auxiliary_loss_mlp": 0.01030908, - "balance_loss_clip": 1.03869367, - "balance_loss_mlp": 1.01831353, - "epoch": 0.7539455884563355, - "flos": 23186227651200.0, - "grad_norm": 1.7887923247322153, - "language_loss": 0.74677789, - "learning_rate": 6.020578533797229e-07, - "loss": 0.76818419, - "num_input_tokens_seen": 270525250, - "step": 12540, - "time_per_iteration": 2.6644110679626465 - }, - { - "auxiliary_loss_clip": 0.01112068, - "auxiliary_loss_mlp": 0.01031348, - "balance_loss_clip": 1.03821325, - "balance_loss_mlp": 1.01833093, - "epoch": 0.7540057117090034, - "flos": 13181155505280.0, - "grad_norm": 2.2413467917064844, - "language_loss": 0.72496325, - "learning_rate": 6.017793563878566e-07, - "loss": 0.74639738, - "num_input_tokens_seen": 270539295, - "step": 12541, - "time_per_iteration": 2.6159961223602295 - }, - { - "auxiliary_loss_clip": 0.0110964, - "auxiliary_loss_mlp": 0.01031618, - "balance_loss_clip": 1.03806591, - "balance_loss_mlp": 1.0190115, - "epoch": 0.7540658349616715, - "flos": 45478134478080.0, - "grad_norm": 1.701926826906392, - "language_loss": 0.72403926, - "learning_rate": 6.015009124166576e-07, - "loss": 0.74545187, - "num_input_tokens_seen": 270562815, - "step": 12542, - "time_per_iteration": 2.8387362957000732 - }, - { - "auxiliary_loss_clip": 0.01085175, - "auxiliary_loss_mlp": 0.0102804, - "balance_loss_clip": 1.03588843, - "balance_loss_mlp": 1.01508224, - "epoch": 0.7541259582143394, - "flos": 19930817105280.0, - "grad_norm": 2.526006786337045, - "language_loss": 0.8460182, - "learning_rate": 6.012225214766844e-07, - "loss": 0.86715031, - "num_input_tokens_seen": 270579055, - "step": 12543, - "time_per_iteration": 2.6901259422302246 - }, - { - "auxiliary_loss_clip": 0.01077755, - "auxiliary_loss_mlp": 0.01034905, - "balance_loss_clip": 1.04070735, - "balance_loss_mlp": 1.02253056, - "epoch": 0.7541860814670074, - "flos": 27198239299200.0, - "grad_norm": 2.1653550809548587, - "language_loss": 0.73906153, - "learning_rate": 6.009441835784927e-07, - "loss": 0.7601881, - "num_input_tokens_seen": 270599080, - "step": 12544, - "time_per_iteration": 2.729667901992798 - }, - { - "auxiliary_loss_clip": 0.0109777, - "auxiliary_loss_mlp": 0.01030811, - "balance_loss_clip": 1.03749204, - "balance_loss_mlp": 1.01909888, - "epoch": 0.7542462047196753, - "flos": 21324151624320.0, - "grad_norm": 1.9325798259203488, - "language_loss": 0.6805954, - "learning_rate": 6.006658987326383e-07, - "loss": 0.70188129, - "num_input_tokens_seen": 270618715, - "step": 12545, - "time_per_iteration": 2.6119935512542725 - }, - { - "auxiliary_loss_clip": 0.01085784, - "auxiliary_loss_mlp": 0.01033147, - "balance_loss_clip": 1.03426456, - "balance_loss_mlp": 1.0204457, - "epoch": 0.7543063279723433, - "flos": 11940944664960.0, - "grad_norm": 1.8867589100270292, - "language_loss": 0.68448645, - "learning_rate": 6.003876669496728e-07, - "loss": 0.70567578, - "num_input_tokens_seen": 270635695, - "step": 12546, - "time_per_iteration": 2.644932270050049 - }, - { - "auxiliary_loss_clip": 0.01096622, - "auxiliary_loss_mlp": 0.01036368, - "balance_loss_clip": 1.03690362, - "balance_loss_mlp": 1.02293336, - "epoch": 0.7543664512250112, - "flos": 22819974624000.0, - "grad_norm": 2.226922026836887, - "language_loss": 0.73148012, - "learning_rate": 6.00109488240147e-07, - "loss": 0.75281006, - "num_input_tokens_seen": 270654325, - "step": 12547, - "time_per_iteration": 2.592843770980835 - }, - { - "auxiliary_loss_clip": 0.0110976, - "auxiliary_loss_mlp": 0.01029553, - "balance_loss_clip": 1.037709, - "balance_loss_mlp": 1.01641619, - "epoch": 0.7544265744776792, - "flos": 20923855482240.0, - "grad_norm": 2.152338960632508, - "language_loss": 0.67440069, - "learning_rate": 5.998313626146099e-07, - "loss": 0.69579387, - "num_input_tokens_seen": 270674260, - "step": 12548, - "time_per_iteration": 2.646831750869751 - }, - { - "auxiliary_loss_clip": 0.01090643, - "auxiliary_loss_mlp": 0.01034531, - "balance_loss_clip": 1.03753376, - "balance_loss_mlp": 1.02168059, - "epoch": 0.7544866977303473, - "flos": 15195493284480.0, - "grad_norm": 1.8439150079595696, - "language_loss": 0.87032682, - "learning_rate": 5.995532900836088e-07, - "loss": 0.89157856, - "num_input_tokens_seen": 270692200, - "step": 12549, - "time_per_iteration": 2.73703932762146 - }, - { - "auxiliary_loss_clip": 0.01062401, - "auxiliary_loss_mlp": 0.0103417, - "balance_loss_clip": 1.03635311, - "balance_loss_mlp": 1.02223134, - "epoch": 0.7545468209830152, - "flos": 27083683848960.0, - "grad_norm": 1.964347561010599, - "language_loss": 0.77038085, - "learning_rate": 5.992752706576865e-07, - "loss": 0.79134655, - "num_input_tokens_seen": 270709675, - "step": 12550, - "time_per_iteration": 2.7760634422302246 - }, - { - "auxiliary_loss_clip": 0.01110423, - "auxiliary_loss_mlp": 0.01024865, - "balance_loss_clip": 1.03772533, - "balance_loss_mlp": 1.01295626, - "epoch": 0.7546069442356832, - "flos": 26871703735680.0, - "grad_norm": 1.48969324659374, - "language_loss": 0.69521177, - "learning_rate": 5.98997304347386e-07, - "loss": 0.71656471, - "num_input_tokens_seen": 270733055, - "step": 12551, - "time_per_iteration": 2.612513303756714 - }, - { - "auxiliary_loss_clip": 0.0108872, - "auxiliary_loss_mlp": 0.01029008, - "balance_loss_clip": 1.03803182, - "balance_loss_mlp": 1.01590717, - "epoch": 0.7546670674883511, - "flos": 15743131015680.0, - "grad_norm": 1.9528134557769512, - "language_loss": 0.86114484, - "learning_rate": 5.987193911632487e-07, - "loss": 0.88232207, - "num_input_tokens_seen": 270749275, - "step": 12552, - "time_per_iteration": 2.7746293544769287 - }, - { - "auxiliary_loss_clip": 0.0110308, - "auxiliary_loss_mlp": 0.01032898, - "balance_loss_clip": 1.03883934, - "balance_loss_mlp": 1.02059603, - "epoch": 0.7547271907410191, - "flos": 23477714519040.0, - "grad_norm": 1.7307464295257877, - "language_loss": 0.78382206, - "learning_rate": 5.98441531115812e-07, - "loss": 0.8051818, - "num_input_tokens_seen": 270768230, - "step": 12553, - "time_per_iteration": 2.7080695629119873 - }, - { - "auxiliary_loss_clip": 0.01099832, - "auxiliary_loss_mlp": 0.01035652, - "balance_loss_clip": 1.0393219, - "balance_loss_mlp": 1.0227654, - "epoch": 0.754787313993687, - "flos": 31722804069120.0, - "grad_norm": 2.043637353991968, - "language_loss": 0.62419349, - "learning_rate": 5.981637242156135e-07, - "loss": 0.64554828, - "num_input_tokens_seen": 270786285, - "step": 12554, - "time_per_iteration": 2.6828603744506836 - }, - { - "auxiliary_loss_clip": 0.01087132, - "auxiliary_loss_mlp": 0.01036641, - "balance_loss_clip": 1.03482223, - "balance_loss_mlp": 1.02456522, - "epoch": 0.7548474372463551, - "flos": 27563055782400.0, - "grad_norm": 1.8726381797429124, - "language_loss": 0.73138636, - "learning_rate": 5.978859704731864e-07, - "loss": 0.75262409, - "num_input_tokens_seen": 270805505, - "step": 12555, - "time_per_iteration": 2.765606164932251 - }, - { - "auxiliary_loss_clip": 0.01089159, - "auxiliary_loss_mlp": 0.01033035, - "balance_loss_clip": 1.04132962, - "balance_loss_mlp": 1.0199343, - "epoch": 0.754907560499023, - "flos": 19318576763520.0, - "grad_norm": 1.687430506523416, - "language_loss": 0.78570682, - "learning_rate": 5.976082698990645e-07, - "loss": 0.80692875, - "num_input_tokens_seen": 270824610, - "step": 12556, - "time_per_iteration": 2.7887120246887207 - }, - { - "auxiliary_loss_clip": 0.0102254, - "auxiliary_loss_mlp": 0.01000624, - "balance_loss_clip": 1.00953579, - "balance_loss_mlp": 0.99957508, - "epoch": 0.754967683751691, - "flos": 69744628684800.0, - "grad_norm": 0.7056309097064257, - "language_loss": 0.50379604, - "learning_rate": 5.973306225037769e-07, - "loss": 0.52402771, - "num_input_tokens_seen": 270886155, - "step": 12557, - "time_per_iteration": 3.15433931350708 - }, - { - "auxiliary_loss_clip": 0.01101663, - "auxiliary_loss_mlp": 0.0103538, - "balance_loss_clip": 1.0402422, - "balance_loss_mlp": 1.02214742, - "epoch": 0.7550278070043589, - "flos": 24421913377920.0, - "grad_norm": 1.864770698097121, - "language_loss": 0.71454239, - "learning_rate": 5.970530282978525e-07, - "loss": 0.7359128, - "num_input_tokens_seen": 270905325, - "step": 12558, - "time_per_iteration": 2.6398966312408447 - }, - { - "auxiliary_loss_clip": 0.01086077, - "auxiliary_loss_mlp": 0.01039687, - "balance_loss_clip": 1.03605294, - "balance_loss_mlp": 1.02564383, - "epoch": 0.7550879302570269, - "flos": 32634611838720.0, - "grad_norm": 1.9214211385606932, - "language_loss": 0.80440283, - "learning_rate": 5.967754872918187e-07, - "loss": 0.82566047, - "num_input_tokens_seen": 270927535, - "step": 12559, - "time_per_iteration": 2.774087905883789 - }, - { - "auxiliary_loss_clip": 0.01064062, - "auxiliary_loss_mlp": 0.01030442, - "balance_loss_clip": 1.03735518, - "balance_loss_mlp": 1.01727533, - "epoch": 0.7551480535096948, - "flos": 21795550738560.0, - "grad_norm": 1.681888372687875, - "language_loss": 0.78732002, - "learning_rate": 5.96497999496199e-07, - "loss": 0.80826509, - "num_input_tokens_seen": 270946920, - "step": 12560, - "time_per_iteration": 2.773224115371704 - }, - { - "auxiliary_loss_clip": 0.01059602, - "auxiliary_loss_mlp": 0.01042887, - "balance_loss_clip": 1.03382421, - "balance_loss_mlp": 1.0288794, - "epoch": 0.7552081767623628, - "flos": 18515111391360.0, - "grad_norm": 2.5084045238772625, - "language_loss": 0.70966113, - "learning_rate": 5.96220564921515e-07, - "loss": 0.73068601, - "num_input_tokens_seen": 270965705, - "step": 12561, - "time_per_iteration": 2.7290430068969727 - }, - { - "auxiliary_loss_clip": 0.01084123, - "auxiliary_loss_mlp": 0.0077333, - "balance_loss_clip": 1.03486896, - "balance_loss_mlp": 1.00013804, - "epoch": 0.7552683000150308, - "flos": 27634805199360.0, - "grad_norm": 1.645858172778927, - "language_loss": 0.7574439, - "learning_rate": 5.959431835782889e-07, - "loss": 0.7760185, - "num_input_tokens_seen": 270986550, - "step": 12562, - "time_per_iteration": 2.766808032989502 - }, - { - "auxiliary_loss_clip": 0.01084916, - "auxiliary_loss_mlp": 0.01028344, - "balance_loss_clip": 1.03713727, - "balance_loss_mlp": 1.01534379, - "epoch": 0.7553284232676988, - "flos": 20302924049280.0, - "grad_norm": 1.8387284199108043, - "language_loss": 0.76086068, - "learning_rate": 5.956658554770371e-07, - "loss": 0.78199327, - "num_input_tokens_seen": 271006250, - "step": 12563, - "time_per_iteration": 2.6442339420318604 - }, - { - "auxiliary_loss_clip": 0.01082317, - "auxiliary_loss_mlp": 0.01032838, - "balance_loss_clip": 1.03697193, - "balance_loss_mlp": 1.01750755, - "epoch": 0.7553885465203668, - "flos": 33255471444480.0, - "grad_norm": 2.643775015065329, - "language_loss": 0.67393947, - "learning_rate": 5.953885806282768e-07, - "loss": 0.69509107, - "num_input_tokens_seen": 271025575, - "step": 12564, - "time_per_iteration": 2.780668020248413 - }, - { - "auxiliary_loss_clip": 0.01084523, - "auxiliary_loss_mlp": 0.01036081, - "balance_loss_clip": 1.03688002, - "balance_loss_mlp": 1.02188349, - "epoch": 0.7554486697730347, - "flos": 21616249023360.0, - "grad_norm": 2.407953823392175, - "language_loss": 0.69013596, - "learning_rate": 5.951113590425228e-07, - "loss": 0.71134198, - "num_input_tokens_seen": 271045805, - "step": 12565, - "time_per_iteration": 2.665789842605591 - }, - { - "auxiliary_loss_clip": 0.01091959, - "auxiliary_loss_mlp": 0.01032729, - "balance_loss_clip": 1.03703809, - "balance_loss_mlp": 1.01887071, - "epoch": 0.7555087930257027, - "flos": 27632973605760.0, - "grad_norm": 1.874045640971064, - "language_loss": 0.75261271, - "learning_rate": 5.94834190730287e-07, - "loss": 0.77385962, - "num_input_tokens_seen": 271066065, - "step": 12566, - "time_per_iteration": 2.6897921562194824 - }, - { - "auxiliary_loss_clip": 0.01105994, - "auxiliary_loss_mlp": 0.01036283, - "balance_loss_clip": 1.03961587, - "balance_loss_mlp": 1.02240658, - "epoch": 0.7555689162783706, - "flos": 23621644316160.0, - "grad_norm": 2.19029922676856, - "language_loss": 0.73804015, - "learning_rate": 5.945570757020789e-07, - "loss": 0.75946295, - "num_input_tokens_seen": 271085870, - "step": 12567, - "time_per_iteration": 2.681082248687744 - }, - { - "auxiliary_loss_clip": 0.01112381, - "auxiliary_loss_mlp": 0.01028784, - "balance_loss_clip": 1.03940594, - "balance_loss_mlp": 1.01680374, - "epoch": 0.7556290395310387, - "flos": 24863076218880.0, - "grad_norm": 2.047451974712634, - "language_loss": 0.62868547, - "learning_rate": 5.942800139684073e-07, - "loss": 0.65009713, - "num_input_tokens_seen": 271104260, - "step": 12568, - "time_per_iteration": 2.663501739501953 - }, - { - "auxiliary_loss_clip": 0.0102291, - "auxiliary_loss_mlp": 0.01041785, - "balance_loss_clip": 1.03343916, - "balance_loss_mlp": 1.02825463, - "epoch": 0.7556891627837066, - "flos": 43543770330240.0, - "grad_norm": 1.8712587826927434, - "language_loss": 0.66730088, - "learning_rate": 5.940030055397789e-07, - "loss": 0.68794787, - "num_input_tokens_seen": 271125745, - "step": 12569, - "time_per_iteration": 3.4009649753570557 - }, - { - "auxiliary_loss_clip": 0.01104803, - "auxiliary_loss_mlp": 0.01036972, - "balance_loss_clip": 1.03995872, - "balance_loss_mlp": 1.02256608, - "epoch": 0.7557492860363746, - "flos": 26650924790400.0, - "grad_norm": 1.7459864458479233, - "language_loss": 0.67298895, - "learning_rate": 5.93726050426697e-07, - "loss": 0.69440669, - "num_input_tokens_seen": 271147145, - "step": 12570, - "time_per_iteration": 3.006865978240967 - }, - { - "auxiliary_loss_clip": 0.01112467, - "auxiliary_loss_mlp": 0.01035865, - "balance_loss_clip": 1.03923225, - "balance_loss_mlp": 1.02238834, - "epoch": 0.7558094092890425, - "flos": 55182885010560.0, - "grad_norm": 1.8543133954373656, - "language_loss": 0.71857494, - "learning_rate": 5.934491486396647e-07, - "loss": 0.74005824, - "num_input_tokens_seen": 271170865, - "step": 12571, - "time_per_iteration": 2.9743287563323975 - }, - { - "auxiliary_loss_clip": 0.01066938, - "auxiliary_loss_mlp": 0.01037876, - "balance_loss_clip": 1.03525424, - "balance_loss_mlp": 1.02339244, - "epoch": 0.7558695325417105, - "flos": 23988292392960.0, - "grad_norm": 1.8208269811999866, - "language_loss": 0.73415917, - "learning_rate": 5.931723001891811e-07, - "loss": 0.7552073, - "num_input_tokens_seen": 271191450, - "step": 12572, - "time_per_iteration": 2.819380044937134 - }, - { - "auxiliary_loss_clip": 0.0109252, - "auxiliary_loss_mlp": 0.01033068, - "balance_loss_clip": 1.04051542, - "balance_loss_mlp": 1.02049112, - "epoch": 0.7559296557943784, - "flos": 14611262572800.0, - "grad_norm": 2.0177969949137577, - "language_loss": 0.76612377, - "learning_rate": 5.928955050857456e-07, - "loss": 0.78737968, - "num_input_tokens_seen": 271207335, - "step": 12573, - "time_per_iteration": 4.514675617218018 - }, - { - "auxiliary_loss_clip": 0.01087889, - "auxiliary_loss_mlp": 0.01036063, - "balance_loss_clip": 1.04069138, - "balance_loss_mlp": 1.02323067, - "epoch": 0.7559897790470465, - "flos": 18550483309440.0, - "grad_norm": 1.5618514080613375, - "language_loss": 0.69153476, - "learning_rate": 5.926187633398527e-07, - "loss": 0.71277434, - "num_input_tokens_seen": 271226895, - "step": 12574, - "time_per_iteration": 4.325180530548096 - }, - { - "auxiliary_loss_clip": 0.0107305, - "auxiliary_loss_mlp": 0.01033848, - "balance_loss_clip": 1.03176165, - "balance_loss_mlp": 1.01988304, - "epoch": 0.7560499022997144, - "flos": 17967868709760.0, - "grad_norm": 2.3174142994510065, - "language_loss": 0.71567178, - "learning_rate": 5.923420749619974e-07, - "loss": 0.73674083, - "num_input_tokens_seen": 271244375, - "step": 12575, - "time_per_iteration": 4.343465805053711 - }, - { - "auxiliary_loss_clip": 0.0110949, - "auxiliary_loss_mlp": 0.00770549, - "balance_loss_clip": 1.03735065, - "balance_loss_mlp": 1.00018251, - "epoch": 0.7561100255523824, - "flos": 15737815802880.0, - "grad_norm": 2.1045282969153125, - "language_loss": 0.71783686, - "learning_rate": 5.92065439962673e-07, - "loss": 0.73663723, - "num_input_tokens_seen": 271259530, - "step": 12576, - "time_per_iteration": 2.6967074871063232 - }, - { - "auxiliary_loss_clip": 0.01076617, - "auxiliary_loss_mlp": 0.01031643, - "balance_loss_clip": 1.03790188, - "balance_loss_mlp": 1.01866078, - "epoch": 0.7561701488050504, - "flos": 15888102307200.0, - "grad_norm": 2.0401468166974857, - "language_loss": 0.67187804, - "learning_rate": 5.917888583523669e-07, - "loss": 0.69296062, - "num_input_tokens_seen": 271276835, - "step": 12577, - "time_per_iteration": 2.6873996257781982 - }, - { - "auxiliary_loss_clip": 0.01088122, - "auxiliary_loss_mlp": 0.01037075, - "balance_loss_clip": 1.03602171, - "balance_loss_mlp": 1.02463531, - "epoch": 0.7562302720577183, - "flos": 20339157893760.0, - "grad_norm": 1.8873015547804852, - "language_loss": 0.78041267, - "learning_rate": 5.915123301415685e-07, - "loss": 0.80166459, - "num_input_tokens_seen": 271296275, - "step": 12578, - "time_per_iteration": 4.377631664276123 - }, - { - "auxiliary_loss_clip": 0.01100787, - "auxiliary_loss_mlp": 0.01032965, - "balance_loss_clip": 1.03763413, - "balance_loss_mlp": 1.02016759, - "epoch": 0.7562903953103863, - "flos": 20812209033600.0, - "grad_norm": 1.6803508279416246, - "language_loss": 0.75802839, - "learning_rate": 5.912358553407641e-07, - "loss": 0.7793659, - "num_input_tokens_seen": 271315685, - "step": 12579, - "time_per_iteration": 2.778723955154419 - }, - { - "auxiliary_loss_clip": 0.01070667, - "auxiliary_loss_mlp": 0.01036269, - "balance_loss_clip": 1.03752732, - "balance_loss_mlp": 1.02198792, - "epoch": 0.7563505185630542, - "flos": 37596999484800.0, - "grad_norm": 2.5693429830085397, - "language_loss": 0.627738, - "learning_rate": 5.90959433960437e-07, - "loss": 0.64880729, - "num_input_tokens_seen": 271336790, - "step": 12580, - "time_per_iteration": 2.996838331222534 - }, - { - "auxiliary_loss_clip": 0.01067496, - "auxiliary_loss_mlp": 0.01033758, - "balance_loss_clip": 1.03585196, - "balance_loss_mlp": 1.02117586, - "epoch": 0.7564106418157223, - "flos": 20230995064320.0, - "grad_norm": 1.6306554766999415, - "language_loss": 0.74993187, - "learning_rate": 5.906830660110691e-07, - "loss": 0.77094436, - "num_input_tokens_seen": 271355470, - "step": 12581, - "time_per_iteration": 2.8892579078674316 - }, - { - "auxiliary_loss_clip": 0.01071961, - "auxiliary_loss_mlp": 0.01033537, - "balance_loss_clip": 1.03673053, - "balance_loss_mlp": 1.02031684, - "epoch": 0.7564707650683902, - "flos": 24754877475840.0, - "grad_norm": 1.6534906098525708, - "language_loss": 0.62473053, - "learning_rate": 5.904067515031412e-07, - "loss": 0.64578557, - "num_input_tokens_seen": 271375810, - "step": 12582, - "time_per_iteration": 2.78520131111145 - }, - { - "auxiliary_loss_clip": 0.01031417, - "auxiliary_loss_mlp": 0.0099978, - "balance_loss_clip": 1.00870466, - "balance_loss_mlp": 0.99880236, - "epoch": 0.7565308883210582, - "flos": 48530076433920.0, - "grad_norm": 0.9397092341612294, - "language_loss": 0.6060046, - "learning_rate": 5.901304904471307e-07, - "loss": 0.62631655, - "num_input_tokens_seen": 271424775, - "step": 12583, - "time_per_iteration": 2.9951171875 - }, - { - "auxiliary_loss_clip": 0.01084102, - "auxiliary_loss_mlp": 0.01041483, - "balance_loss_clip": 1.03840542, - "balance_loss_mlp": 1.02859008, - "epoch": 0.7565910115737261, - "flos": 12495082757760.0, - "grad_norm": 2.173716211625989, - "language_loss": 0.7912221, - "learning_rate": 5.898542828535125e-07, - "loss": 0.81247795, - "num_input_tokens_seen": 271440500, - "step": 12584, - "time_per_iteration": 2.724681854248047 - }, - { - "auxiliary_loss_clip": 0.01079444, - "auxiliary_loss_mlp": 0.01038295, - "balance_loss_clip": 1.03406775, - "balance_loss_mlp": 1.02354908, - "epoch": 0.7566511348263941, - "flos": 21173003193600.0, - "grad_norm": 2.334504412939606, - "language_loss": 0.77645278, - "learning_rate": 5.895781287327612e-07, - "loss": 0.79763019, - "num_input_tokens_seen": 271458180, - "step": 12585, - "time_per_iteration": 2.7006850242614746 - }, - { - "auxiliary_loss_clip": 0.01116119, - "auxiliary_loss_mlp": 0.01038399, - "balance_loss_clip": 1.04165065, - "balance_loss_mlp": 1.0249517, - "epoch": 0.756711258079062, - "flos": 21754827694080.0, - "grad_norm": 1.6643260913798816, - "language_loss": 0.83146328, - "learning_rate": 5.893020280953493e-07, - "loss": 0.85300845, - "num_input_tokens_seen": 271475730, - "step": 12586, - "time_per_iteration": 2.7549026012420654 - }, - { - "auxiliary_loss_clip": 0.01115138, - "auxiliary_loss_mlp": 0.01030771, - "balance_loss_clip": 1.04039466, - "balance_loss_mlp": 1.01833797, - "epoch": 0.75677138133173, - "flos": 22382905933440.0, - "grad_norm": 2.0582325962784207, - "language_loss": 0.83617753, - "learning_rate": 5.890259809517459e-07, - "loss": 0.85763657, - "num_input_tokens_seen": 271495030, - "step": 12587, - "time_per_iteration": 2.6982500553131104 - }, - { - "auxiliary_loss_clip": 0.01076996, - "auxiliary_loss_mlp": 0.01027662, - "balance_loss_clip": 1.03665411, - "balance_loss_mlp": 1.01509786, - "epoch": 0.756831504584398, - "flos": 22708974620160.0, - "grad_norm": 1.4789161114631317, - "language_loss": 0.71016109, - "learning_rate": 5.88749987312418e-07, - "loss": 0.73120773, - "num_input_tokens_seen": 271515355, - "step": 12588, - "time_per_iteration": 2.811058282852173 - }, - { - "auxiliary_loss_clip": 0.01113651, - "auxiliary_loss_mlp": 0.00770901, - "balance_loss_clip": 1.03982472, - "balance_loss_mlp": 1.00019073, - "epoch": 0.756891627837066, - "flos": 24098358643200.0, - "grad_norm": 1.7170735982642948, - "language_loss": 0.68827093, - "learning_rate": 5.884740471878327e-07, - "loss": 0.70711648, - "num_input_tokens_seen": 271535090, - "step": 12589, - "time_per_iteration": 2.668159008026123 - }, - { - "auxiliary_loss_clip": 0.01100202, - "auxiliary_loss_mlp": 0.01029096, - "balance_loss_clip": 1.03817892, - "balance_loss_mlp": 1.01629877, - "epoch": 0.756951751089734, - "flos": 19749001438080.0, - "grad_norm": 1.693382160425306, - "language_loss": 0.92356181, - "learning_rate": 5.881981605884522e-07, - "loss": 0.9448548, - "num_input_tokens_seen": 271551075, - "step": 12590, - "time_per_iteration": 2.737993001937866 - }, - { - "auxiliary_loss_clip": 0.01081772, - "auxiliary_loss_mlp": 0.01030735, - "balance_loss_clip": 1.03454733, - "balance_loss_mlp": 1.01852822, - "epoch": 0.7570118743424019, - "flos": 35079266551680.0, - "grad_norm": 2.1087448505355364, - "language_loss": 0.6530177, - "learning_rate": 5.879223275247391e-07, - "loss": 0.67414272, - "num_input_tokens_seen": 271571035, - "step": 12591, - "time_per_iteration": 2.836533308029175 - }, - { - "auxiliary_loss_clip": 0.01099676, - "auxiliary_loss_mlp": 0.01029683, - "balance_loss_clip": 1.03951907, - "balance_loss_mlp": 1.01828074, - "epoch": 0.7570719975950699, - "flos": 25594540778880.0, - "grad_norm": 10.362773010711903, - "language_loss": 0.73889554, - "learning_rate": 5.876465480071528e-07, - "loss": 0.76018918, - "num_input_tokens_seen": 271592950, - "step": 12592, - "time_per_iteration": 2.729740619659424 - }, - { - "auxiliary_loss_clip": 0.01100337, - "auxiliary_loss_mlp": 0.01036287, - "balance_loss_clip": 1.03738928, - "balance_loss_mlp": 1.02323985, - "epoch": 0.7571321208477378, - "flos": 10816223028480.0, - "grad_norm": 2.217401018900874, - "language_loss": 0.71442747, - "learning_rate": 5.873708220461522e-07, - "loss": 0.73579371, - "num_input_tokens_seen": 271608835, - "step": 12593, - "time_per_iteration": 2.684826135635376 - }, - { - "auxiliary_loss_clip": 0.01112155, - "auxiliary_loss_mlp": 0.01031949, - "balance_loss_clip": 1.03883767, - "balance_loss_mlp": 1.01900887, - "epoch": 0.7571922441004059, - "flos": 18260109763200.0, - "grad_norm": 2.277271762211562, - "language_loss": 0.66408104, - "learning_rate": 5.870951496521903e-07, - "loss": 0.68552208, - "num_input_tokens_seen": 271627730, - "step": 12594, - "time_per_iteration": 2.66044545173645 - }, - { - "auxiliary_loss_clip": 0.01081064, - "auxiliary_loss_mlp": 0.01034067, - "balance_loss_clip": 1.03765464, - "balance_loss_mlp": 1.02116287, - "epoch": 0.7572523673530738, - "flos": 22890502978560.0, - "grad_norm": 1.5512103327237567, - "language_loss": 0.80722225, - "learning_rate": 5.86819530835722e-07, - "loss": 0.82837361, - "num_input_tokens_seen": 271646415, - "step": 12595, - "time_per_iteration": 2.75352144241333 - }, - { - "auxiliary_loss_clip": 0.01078291, - "auxiliary_loss_mlp": 0.01034396, - "balance_loss_clip": 1.03972101, - "balance_loss_mlp": 1.02266574, - "epoch": 0.7573124906057418, - "flos": 20996323171200.0, - "grad_norm": 1.9894880322297872, - "language_loss": 0.71883428, - "learning_rate": 5.865439656071993e-07, - "loss": 0.73996115, - "num_input_tokens_seen": 271666240, - "step": 12596, - "time_per_iteration": 2.830939531326294 - }, - { - "auxiliary_loss_clip": 0.01013568, - "auxiliary_loss_mlp": 0.01033181, - "balance_loss_clip": 1.03547406, - "balance_loss_mlp": 1.02174306, - "epoch": 0.7573726138584097, - "flos": 20886292834560.0, - "grad_norm": 1.6646538422679886, - "language_loss": 0.80251002, - "learning_rate": 5.862684539770706e-07, - "loss": 0.82297754, - "num_input_tokens_seen": 271686370, - "step": 12597, - "time_per_iteration": 3.2770867347717285 - }, - { - "auxiliary_loss_clip": 0.01084183, - "auxiliary_loss_mlp": 0.01030961, - "balance_loss_clip": 1.04002273, - "balance_loss_mlp": 1.01700711, - "epoch": 0.7574327371110777, - "flos": 24530507170560.0, - "grad_norm": 2.8794945787689477, - "language_loss": 0.83217478, - "learning_rate": 5.859929959557835e-07, - "loss": 0.8533262, - "num_input_tokens_seen": 271705050, - "step": 12598, - "time_per_iteration": 3.5696053504943848 - }, - { - "auxiliary_loss_clip": 0.01083032, - "auxiliary_loss_mlp": 0.0102703, - "balance_loss_clip": 1.03725743, - "balance_loss_mlp": 1.01568758, - "epoch": 0.7574928603637456, - "flos": 23364523785600.0, - "grad_norm": 1.9599324451937288, - "language_loss": 0.62513769, - "learning_rate": 5.857175915537845e-07, - "loss": 0.64623827, - "num_input_tokens_seen": 271724915, - "step": 12599, - "time_per_iteration": 2.9659054279327393 - }, - { - "auxiliary_loss_clip": 0.01087639, - "auxiliary_loss_mlp": 0.00772119, - "balance_loss_clip": 1.03743839, - "balance_loss_mlp": 1.00022399, - "epoch": 0.7575529836164137, - "flos": 13516274419200.0, - "grad_norm": 2.6514435576773767, - "language_loss": 0.63275111, - "learning_rate": 5.854422407815161e-07, - "loss": 0.65134871, - "num_input_tokens_seen": 271742410, - "step": 12600, - "time_per_iteration": 2.761671304702759 - }, - { - "auxiliary_loss_clip": 0.01081508, - "auxiliary_loss_mlp": 0.010341, - "balance_loss_clip": 1.03465056, - "balance_loss_mlp": 1.01968765, - "epoch": 0.7576131068690816, - "flos": 19646584784640.0, - "grad_norm": 1.9759732214873023, - "language_loss": 0.66604817, - "learning_rate": 5.851669436494191e-07, - "loss": 0.68720412, - "num_input_tokens_seen": 271761425, - "step": 12601, - "time_per_iteration": 2.8752126693725586 - }, - { - "auxiliary_loss_clip": 0.01081767, - "auxiliary_loss_mlp": 0.01030188, - "balance_loss_clip": 1.03683853, - "balance_loss_mlp": 1.01856518, - "epoch": 0.7576732301217496, - "flos": 20048245643520.0, - "grad_norm": 1.862908723746181, - "language_loss": 0.6777848, - "learning_rate": 5.848917001679335e-07, - "loss": 0.69890434, - "num_input_tokens_seen": 271780875, - "step": 12602, - "time_per_iteration": 2.7810614109039307 - }, - { - "auxiliary_loss_clip": 0.01102089, - "auxiliary_loss_mlp": 0.01035949, - "balance_loss_clip": 1.03889537, - "balance_loss_mlp": 1.02206695, - "epoch": 0.7577333533744176, - "flos": 15377093470080.0, - "grad_norm": 3.3133966859560138, - "language_loss": 0.67229289, - "learning_rate": 5.846165103474967e-07, - "loss": 0.69367325, - "num_input_tokens_seen": 271799490, - "step": 12603, - "time_per_iteration": 2.677644968032837 - }, - { - "auxiliary_loss_clip": 0.01086121, - "auxiliary_loss_mlp": 0.01033317, - "balance_loss_clip": 1.03466463, - "balance_loss_mlp": 1.02153969, - "epoch": 0.7577934766270855, - "flos": 17894862316800.0, - "grad_norm": 2.091164920728678, - "language_loss": 0.61460161, - "learning_rate": 5.843413741985439e-07, - "loss": 0.63579607, - "num_input_tokens_seen": 271817040, - "step": 12604, - "time_per_iteration": 2.690556287765503 - }, - { - "auxiliary_loss_clip": 0.01113132, - "auxiliary_loss_mlp": 0.01037248, - "balance_loss_clip": 1.04157591, - "balance_loss_mlp": 1.0240519, - "epoch": 0.7578535998797535, - "flos": 21613770984960.0, - "grad_norm": 1.860643993925951, - "language_loss": 0.79847634, - "learning_rate": 5.840662917315076e-07, - "loss": 0.81998014, - "num_input_tokens_seen": 271835480, - "step": 12605, - "time_per_iteration": 2.650987148284912 - }, - { - "auxiliary_loss_clip": 0.01114865, - "auxiliary_loss_mlp": 0.01030793, - "balance_loss_clip": 1.03956521, - "balance_loss_mlp": 1.01750159, - "epoch": 0.7579137231324214, - "flos": 18478374756480.0, - "grad_norm": 2.6305225547150286, - "language_loss": 0.79649675, - "learning_rate": 5.837912629568198e-07, - "loss": 0.81795335, - "num_input_tokens_seen": 271849835, - "step": 12606, - "time_per_iteration": 2.6179733276367188 - }, - { - "auxiliary_loss_clip": 0.01094397, - "auxiliary_loss_mlp": 0.01030813, - "balance_loss_clip": 1.03664708, - "balance_loss_mlp": 1.01947641, - "epoch": 0.7579738463850895, - "flos": 23255032152960.0, - "grad_norm": 1.422559911510894, - "language_loss": 0.73040879, - "learning_rate": 5.835162878849087e-07, - "loss": 0.75166082, - "num_input_tokens_seen": 271869560, - "step": 12607, - "time_per_iteration": 2.660883903503418 - }, - { - "auxiliary_loss_clip": 0.01085893, - "auxiliary_loss_mlp": 0.01032003, - "balance_loss_clip": 1.03795099, - "balance_loss_mlp": 1.01872361, - "epoch": 0.7580339696377574, - "flos": 14027031861120.0, - "grad_norm": 1.8402667548029668, - "language_loss": 0.75154114, - "learning_rate": 5.83241366526202e-07, - "loss": 0.7727201, - "num_input_tokens_seen": 271887950, - "step": 12608, - "time_per_iteration": 2.7164134979248047 - }, - { - "auxiliary_loss_clip": 0.01074571, - "auxiliary_loss_mlp": 0.00770045, - "balance_loss_clip": 1.0365268, - "balance_loss_mlp": 1.00018573, - "epoch": 0.7580940928904254, - "flos": 25082777756160.0, - "grad_norm": 1.7434049205366062, - "language_loss": 0.71609342, - "learning_rate": 5.829664988911245e-07, - "loss": 0.73453957, - "num_input_tokens_seen": 271907700, - "step": 12609, - "time_per_iteration": 2.788742780685425 - }, - { - "auxiliary_loss_clip": 0.0111318, - "auxiliary_loss_mlp": 0.01033401, - "balance_loss_clip": 1.0384202, - "balance_loss_mlp": 1.02005589, - "epoch": 0.7581542161430933, - "flos": 23836425690240.0, - "grad_norm": 1.6307456106692844, - "language_loss": 0.81648767, - "learning_rate": 5.826916849901007e-07, - "loss": 0.83795345, - "num_input_tokens_seen": 271926840, - "step": 12610, - "time_per_iteration": 2.6684138774871826 - }, - { - "auxiliary_loss_clip": 0.01096074, - "auxiliary_loss_mlp": 0.01034645, - "balance_loss_clip": 1.03990412, - "balance_loss_mlp": 1.0215261, - "epoch": 0.7582143393957613, - "flos": 22237000888320.0, - "grad_norm": 1.7108192328279062, - "language_loss": 0.70459145, - "learning_rate": 5.824169248335488e-07, - "loss": 0.72589862, - "num_input_tokens_seen": 271946465, - "step": 12611, - "time_per_iteration": 2.7615582942962646 - }, - { - "auxiliary_loss_clip": 0.01111911, - "auxiliary_loss_mlp": 0.01031837, - "balance_loss_clip": 1.03971386, - "balance_loss_mlp": 1.01939797, - "epoch": 0.7582744626484292, - "flos": 21106389421440.0, - "grad_norm": 1.4842490716025172, - "language_loss": 0.70994782, - "learning_rate": 5.821422184318893e-07, - "loss": 0.73138535, - "num_input_tokens_seen": 271967295, - "step": 12612, - "time_per_iteration": 4.388495445251465 - }, - { - "auxiliary_loss_clip": 0.01051139, - "auxiliary_loss_mlp": 0.01043129, - "balance_loss_clip": 1.03555894, - "balance_loss_mlp": 1.03022408, - "epoch": 0.7583345859010973, - "flos": 24604770539520.0, - "grad_norm": 1.3817563743236791, - "language_loss": 0.59341693, - "learning_rate": 5.818675657955397e-07, - "loss": 0.61435962, - "num_input_tokens_seen": 271987960, - "step": 12613, - "time_per_iteration": 4.679025411605835 - }, - { - "auxiliary_loss_clip": 0.01085628, - "auxiliary_loss_mlp": 0.01039806, - "balance_loss_clip": 1.0359726, - "balance_loss_mlp": 1.02548921, - "epoch": 0.7583947091537652, - "flos": 33546814657920.0, - "grad_norm": 1.5041496360989353, - "language_loss": 0.59715927, - "learning_rate": 5.815929669349135e-07, - "loss": 0.61841357, - "num_input_tokens_seen": 272011780, - "step": 12614, - "time_per_iteration": 4.3984222412109375 - }, - { - "auxiliary_loss_clip": 0.01075793, - "auxiliary_loss_mlp": 0.01029168, - "balance_loss_clip": 1.03387702, - "balance_loss_mlp": 1.01572776, - "epoch": 0.7584548324064332, - "flos": 20121000641280.0, - "grad_norm": 1.921615771870116, - "language_loss": 0.73268729, - "learning_rate": 5.813184218604246e-07, - "loss": 0.75373691, - "num_input_tokens_seen": 272030825, - "step": 12615, - "time_per_iteration": 2.8314290046691895 - }, - { - "auxiliary_loss_clip": 0.01011548, - "auxiliary_loss_mlp": 0.00999712, - "balance_loss_clip": 1.01067567, - "balance_loss_mlp": 0.99848443, - "epoch": 0.7585149556591012, - "flos": 70402584061440.0, - "grad_norm": 0.8045882645133534, - "language_loss": 0.67647672, - "learning_rate": 5.810439305824828e-07, - "loss": 0.69658935, - "num_input_tokens_seen": 272095825, - "step": 12616, - "time_per_iteration": 3.260563850402832 - }, - { - "auxiliary_loss_clip": 0.0108171, - "auxiliary_loss_mlp": 0.01039897, - "balance_loss_clip": 1.03870976, - "balance_loss_mlp": 1.02642608, - "epoch": 0.7585750789117691, - "flos": 16143786293760.0, - "grad_norm": 1.736965809635246, - "language_loss": 0.84524256, - "learning_rate": 5.807694931114979e-07, - "loss": 0.86645865, - "num_input_tokens_seen": 272113950, - "step": 12617, - "time_per_iteration": 2.8263378143310547 - }, - { - "auxiliary_loss_clip": 0.01078721, - "auxiliary_loss_mlp": 0.01039251, - "balance_loss_clip": 1.0390234, - "balance_loss_mlp": 1.02730036, - "epoch": 0.7586352021644371, - "flos": 17493165544320.0, - "grad_norm": 2.328657460169902, - "language_loss": 0.74700725, - "learning_rate": 5.804951094578757e-07, - "loss": 0.76818699, - "num_input_tokens_seen": 272130315, - "step": 12618, - "time_per_iteration": 4.2552900314331055 - }, - { - "auxiliary_loss_clip": 0.0109138, - "auxiliary_loss_mlp": 0.01032269, - "balance_loss_clip": 1.03749752, - "balance_loss_mlp": 1.01850069, - "epoch": 0.758695325417105, - "flos": 17275187859840.0, - "grad_norm": 1.9133972925292233, - "language_loss": 0.77211189, - "learning_rate": 5.802207796320209e-07, - "loss": 0.79334843, - "num_input_tokens_seen": 272149080, - "step": 12619, - "time_per_iteration": 2.7758803367614746 - }, - { - "auxiliary_loss_clip": 0.0107017, - "auxiliary_loss_mlp": 0.01037442, - "balance_loss_clip": 1.03423786, - "balance_loss_mlp": 1.02421534, - "epoch": 0.7587554486697731, - "flos": 29495660163840.0, - "grad_norm": 1.9790425844010804, - "language_loss": 0.82581639, - "learning_rate": 5.79946503644337e-07, - "loss": 0.84689248, - "num_input_tokens_seen": 272168285, - "step": 12620, - "time_per_iteration": 2.860680341720581 - }, - { - "auxiliary_loss_clip": 0.01086979, - "auxiliary_loss_mlp": 0.01039531, - "balance_loss_clip": 1.03506887, - "balance_loss_mlp": 1.02535069, - "epoch": 0.758815571922441, - "flos": 16100800692480.0, - "grad_norm": 2.7719237542052335, - "language_loss": 0.82916582, - "learning_rate": 5.796722815052242e-07, - "loss": 0.85043091, - "num_input_tokens_seen": 272184585, - "step": 12621, - "time_per_iteration": 2.6819448471069336 - }, - { - "auxiliary_loss_clip": 0.01090396, - "auxiliary_loss_mlp": 0.01032874, - "balance_loss_clip": 1.03831124, - "balance_loss_mlp": 1.02035689, - "epoch": 0.758875695175109, - "flos": 16143714466560.0, - "grad_norm": 2.331369198169253, - "language_loss": 0.73694873, - "learning_rate": 5.7939811322508e-07, - "loss": 0.75818145, - "num_input_tokens_seen": 272200205, - "step": 12622, - "time_per_iteration": 2.867482900619507 - }, - { - "auxiliary_loss_clip": 0.01020627, - "auxiliary_loss_mlp": 0.00999479, - "balance_loss_clip": 1.00808787, - "balance_loss_mlp": 0.99837667, - "epoch": 0.7589358184277769, - "flos": 68462006860800.0, - "grad_norm": 0.8939637430208361, - "language_loss": 0.60890412, - "learning_rate": 5.791239988143024e-07, - "loss": 0.62910521, - "num_input_tokens_seen": 272259670, - "step": 12623, - "time_per_iteration": 3.399125814437866 - }, - { - "auxiliary_loss_clip": 0.01108354, - "auxiliary_loss_mlp": 0.01032352, - "balance_loss_clip": 1.0389595, - "balance_loss_mlp": 1.0204668, - "epoch": 0.7589959416804449, - "flos": 20047311889920.0, - "grad_norm": 2.1862107817163374, - "language_loss": 0.67437398, - "learning_rate": 5.788499382832847e-07, - "loss": 0.69578105, - "num_input_tokens_seen": 272277925, - "step": 12624, - "time_per_iteration": 2.7711076736450195 - }, - { - "auxiliary_loss_clip": 0.01108684, - "auxiliary_loss_mlp": 0.01029908, - "balance_loss_clip": 1.03826535, - "balance_loss_mlp": 1.01691461, - "epoch": 0.7590560649331128, - "flos": 18771800958720.0, - "grad_norm": 11.908372705872578, - "language_loss": 0.76173502, - "learning_rate": 5.785759316424196e-07, - "loss": 0.78312099, - "num_input_tokens_seen": 272296010, - "step": 12625, - "time_per_iteration": 2.695136308670044 - }, - { - "auxiliary_loss_clip": 0.0108337, - "auxiliary_loss_mlp": 0.01043075, - "balance_loss_clip": 1.03519034, - "balance_loss_mlp": 1.02824545, - "epoch": 0.7591161881857809, - "flos": 29825284296960.0, - "grad_norm": 1.865247644851499, - "language_loss": 0.63104314, - "learning_rate": 5.783019789020977e-07, - "loss": 0.65230757, - "num_input_tokens_seen": 272318330, - "step": 12626, - "time_per_iteration": 2.815093517303467 - }, - { - "auxiliary_loss_clip": 0.01080043, - "auxiliary_loss_mlp": 0.00771292, - "balance_loss_clip": 1.04494154, - "balance_loss_mlp": 1.00028062, - "epoch": 0.7591763114384488, - "flos": 20302708567680.0, - "grad_norm": 2.0523273844402605, - "language_loss": 0.74221742, - "learning_rate": 5.780280800727084e-07, - "loss": 0.76073074, - "num_input_tokens_seen": 272335265, - "step": 12627, - "time_per_iteration": 3.018779993057251 - }, - { - "auxiliary_loss_clip": 0.01100814, - "auxiliary_loss_mlp": 0.0103172, - "balance_loss_clip": 1.03871465, - "balance_loss_mlp": 1.0191313, - "epoch": 0.7592364346911168, - "flos": 20813609664000.0, - "grad_norm": 2.9039370071826145, - "language_loss": 0.6930986, - "learning_rate": 5.777542351646356e-07, - "loss": 0.71442395, - "num_input_tokens_seen": 272354795, - "step": 12628, - "time_per_iteration": 2.717823028564453 - }, - { - "auxiliary_loss_clip": 0.01102671, - "auxiliary_loss_mlp": 0.01034671, - "balance_loss_clip": 1.04052353, - "balance_loss_mlp": 1.02078366, - "epoch": 0.7592965579437848, - "flos": 21251504367360.0, - "grad_norm": 1.7338759935871468, - "language_loss": 0.63148701, - "learning_rate": 5.774804441882648e-07, - "loss": 0.6528604, - "num_input_tokens_seen": 272372875, - "step": 12629, - "time_per_iteration": 2.6770267486572266 - }, - { - "auxiliary_loss_clip": 0.01084801, - "auxiliary_loss_mlp": 0.01032088, - "balance_loss_clip": 1.03561509, - "balance_loss_mlp": 1.02010107, - "epoch": 0.7593566811964527, - "flos": 26213604704640.0, - "grad_norm": 1.4746577606675504, - "language_loss": 0.77671874, - "learning_rate": 5.772067071539786e-07, - "loss": 0.79788756, - "num_input_tokens_seen": 272394715, - "step": 12630, - "time_per_iteration": 2.9122629165649414 - }, - { - "auxiliary_loss_clip": 0.01029746, - "auxiliary_loss_mlp": 0.01002357, - "balance_loss_clip": 1.00722373, - "balance_loss_mlp": 1.00131977, - "epoch": 0.7594168044491207, - "flos": 71237255374080.0, - "grad_norm": 0.8115073267704523, - "language_loss": 0.61498612, - "learning_rate": 5.769330240721562e-07, - "loss": 0.63530719, - "num_input_tokens_seen": 272458775, - "step": 12631, - "time_per_iteration": 3.267413377761841 - }, - { - "auxiliary_loss_clip": 0.01084169, - "auxiliary_loss_mlp": 0.00772349, - "balance_loss_clip": 1.03867972, - "balance_loss_mlp": 1.00034893, - "epoch": 0.7594769277017887, - "flos": 26613326229120.0, - "grad_norm": 1.5722858256300303, - "language_loss": 0.73812342, - "learning_rate": 5.766593949531767e-07, - "loss": 0.75668871, - "num_input_tokens_seen": 272479355, - "step": 12632, - "time_per_iteration": 2.9674253463745117 - }, - { - "auxiliary_loss_clip": 0.01089012, - "auxiliary_loss_mlp": 0.01030973, - "balance_loss_clip": 1.0375607, - "balance_loss_mlp": 1.01855755, - "epoch": 0.7595370509544567, - "flos": 17595941333760.0, - "grad_norm": 2.3123827403326005, - "language_loss": 0.75472778, - "learning_rate": 5.763858198074154e-07, - "loss": 0.77592766, - "num_input_tokens_seen": 272493555, - "step": 12633, - "time_per_iteration": 2.733344078063965 - }, - { - "auxiliary_loss_clip": 0.01087192, - "auxiliary_loss_mlp": 0.01028663, - "balance_loss_clip": 1.03815973, - "balance_loss_mlp": 1.017272, - "epoch": 0.7595971742071246, - "flos": 18002953319040.0, - "grad_norm": 2.016293205622038, - "language_loss": 0.73391056, - "learning_rate": 5.76112298645246e-07, - "loss": 0.75506908, - "num_input_tokens_seen": 272508925, - "step": 12634, - "time_per_iteration": 2.7296500205993652 - }, - { - "auxiliary_loss_clip": 0.01111487, - "auxiliary_loss_mlp": 0.01034168, - "balance_loss_clip": 1.03916657, - "balance_loss_mlp": 1.02143645, - "epoch": 0.7596572974597926, - "flos": 28840326480000.0, - "grad_norm": 2.834994861255327, - "language_loss": 0.64522898, - "learning_rate": 5.758388314770408e-07, - "loss": 0.66668558, - "num_input_tokens_seen": 272528805, - "step": 12635, - "time_per_iteration": 2.79398512840271 - }, - { - "auxiliary_loss_clip": 0.01054416, - "auxiliary_loss_mlp": 0.01048736, - "balance_loss_clip": 1.03525424, - "balance_loss_mlp": 1.03316736, - "epoch": 0.7597174207124605, - "flos": 14282823588480.0, - "grad_norm": 1.8350185732096174, - "language_loss": 0.69167364, - "learning_rate": 5.7556541831317e-07, - "loss": 0.71270514, - "num_input_tokens_seen": 272546655, - "step": 12636, - "time_per_iteration": 2.827582836151123 - }, - { - "auxiliary_loss_clip": 0.01094213, - "auxiliary_loss_mlp": 0.01034821, - "balance_loss_clip": 1.03955829, - "balance_loss_mlp": 1.02246487, - "epoch": 0.7597775439651285, - "flos": 21688932193920.0, - "grad_norm": 2.1812107272877665, - "language_loss": 0.81070203, - "learning_rate": 5.752920591640018e-07, - "loss": 0.83199233, - "num_input_tokens_seen": 272564010, - "step": 12637, - "time_per_iteration": 2.805816650390625 - }, - { - "auxiliary_loss_clip": 0.01098118, - "auxiliary_loss_mlp": 0.01032589, - "balance_loss_clip": 1.03679478, - "balance_loss_mlp": 1.02025676, - "epoch": 0.7598376672177964, - "flos": 36101248312320.0, - "grad_norm": 1.701654856542883, - "language_loss": 0.66547924, - "learning_rate": 5.750187540399017e-07, - "loss": 0.68678635, - "num_input_tokens_seen": 272585840, - "step": 12638, - "time_per_iteration": 2.8566620349884033 - }, - { - "auxiliary_loss_clip": 0.01114657, - "auxiliary_loss_mlp": 0.01040373, - "balance_loss_clip": 1.04063082, - "balance_loss_mlp": 1.02584124, - "epoch": 0.7598977904704645, - "flos": 18332326056960.0, - "grad_norm": 2.2747225954566193, - "language_loss": 0.6550855, - "learning_rate": 5.747455029512323e-07, - "loss": 0.6766358, - "num_input_tokens_seen": 272602300, - "step": 12639, - "time_per_iteration": 2.6449224948883057 - }, - { - "auxiliary_loss_clip": 0.01097983, - "auxiliary_loss_mlp": 0.01032447, - "balance_loss_clip": 1.03591299, - "balance_loss_mlp": 1.01951265, - "epoch": 0.7599579137231324, - "flos": 20192642317440.0, - "grad_norm": 2.3376509636382057, - "language_loss": 0.70271343, - "learning_rate": 5.744723059083572e-07, - "loss": 0.72401774, - "num_input_tokens_seen": 272619595, - "step": 12640, - "time_per_iteration": 2.813169240951538 - }, - { - "auxiliary_loss_clip": 0.01091857, - "auxiliary_loss_mlp": 0.01033957, - "balance_loss_clip": 1.03943181, - "balance_loss_mlp": 1.0203433, - "epoch": 0.7600180369758004, - "flos": 24024849459840.0, - "grad_norm": 2.141253081598676, - "language_loss": 0.66953784, - "learning_rate": 5.741991629216343e-07, - "loss": 0.69079602, - "num_input_tokens_seen": 272638825, - "step": 12641, - "time_per_iteration": 2.8210034370422363 - }, - { - "auxiliary_loss_clip": 0.01098494, - "auxiliary_loss_mlp": 0.01031655, - "balance_loss_clip": 1.03655171, - "balance_loss_mlp": 1.01808345, - "epoch": 0.7600781602284684, - "flos": 18989527248000.0, - "grad_norm": 3.210818856983626, - "language_loss": 0.66875279, - "learning_rate": 5.73926074001422e-07, - "loss": 0.6900543, - "num_input_tokens_seen": 272657240, - "step": 12642, - "time_per_iteration": 2.6761434078216553 - }, - { - "auxiliary_loss_clip": 0.01092897, - "auxiliary_loss_mlp": 0.01031582, - "balance_loss_clip": 1.04070377, - "balance_loss_mlp": 1.01937461, - "epoch": 0.7601382834811363, - "flos": 26067520091520.0, - "grad_norm": 1.951124783740963, - "language_loss": 0.75605899, - "learning_rate": 5.736530391580765e-07, - "loss": 0.77730376, - "num_input_tokens_seen": 272677520, - "step": 12643, - "time_per_iteration": 2.858407497406006 - }, - { - "auxiliary_loss_clip": 0.01076624, - "auxiliary_loss_mlp": 0.0103542, - "balance_loss_clip": 1.03911448, - "balance_loss_mlp": 1.02123976, - "epoch": 0.7601984067338043, - "flos": 18844232734080.0, - "grad_norm": 1.8455815779990985, - "language_loss": 0.78802508, - "learning_rate": 5.733800584019508e-07, - "loss": 0.80914557, - "num_input_tokens_seen": 272696770, - "step": 12644, - "time_per_iteration": 2.820368766784668 - }, - { - "auxiliary_loss_clip": 0.01084265, - "auxiliary_loss_mlp": 0.01032434, - "balance_loss_clip": 1.0353601, - "balance_loss_mlp": 1.01994061, - "epoch": 0.7602585299864723, - "flos": 24646391424000.0, - "grad_norm": 1.5239807064585273, - "language_loss": 0.80124938, - "learning_rate": 5.731071317433957e-07, - "loss": 0.82241637, - "num_input_tokens_seen": 272718340, - "step": 12645, - "time_per_iteration": 2.8698811531066895 - }, - { - "auxiliary_loss_clip": 0.01087859, - "auxiliary_loss_mlp": 0.01033405, - "balance_loss_clip": 1.0394851, - "balance_loss_mlp": 1.02041101, - "epoch": 0.7603186532391403, - "flos": 23842100039040.0, - "grad_norm": 1.4661810849316874, - "language_loss": 0.72646892, - "learning_rate": 5.728342591927611e-07, - "loss": 0.74768156, - "num_input_tokens_seen": 272739575, - "step": 12646, - "time_per_iteration": 2.8227429389953613 - }, - { - "auxiliary_loss_clip": 0.01098686, - "auxiliary_loss_mlp": 0.01035491, - "balance_loss_clip": 1.03704524, - "balance_loss_mlp": 1.02336717, - "epoch": 0.7603787764918082, - "flos": 22199905117440.0, - "grad_norm": 2.4220316312811025, - "language_loss": 0.67611617, - "learning_rate": 5.725614407603949e-07, - "loss": 0.69745797, - "num_input_tokens_seen": 272758710, - "step": 12647, - "time_per_iteration": 2.8083581924438477 - }, - { - "auxiliary_loss_clip": 0.01019336, - "auxiliary_loss_mlp": 0.01006453, - "balance_loss_clip": 1.00592494, - "balance_loss_mlp": 1.00521874, - "epoch": 0.7604388997444762, - "flos": 54086894254080.0, - "grad_norm": 0.6752663503199356, - "language_loss": 0.48949182, - "learning_rate": 5.722886764566415e-07, - "loss": 0.50974971, - "num_input_tokens_seen": 272814855, - "step": 12648, - "time_per_iteration": 3.211672782897949 - }, - { - "auxiliary_loss_clip": 0.0109722, - "auxiliary_loss_mlp": 0.01036106, - "balance_loss_clip": 1.03749037, - "balance_loss_mlp": 1.02400017, - "epoch": 0.7604990229971441, - "flos": 19681920789120.0, - "grad_norm": 2.4521174104078467, - "language_loss": 0.76747489, - "learning_rate": 5.720159662918451e-07, - "loss": 0.78880811, - "num_input_tokens_seen": 272834400, - "step": 12649, - "time_per_iteration": 2.72628116607666 - }, - { - "auxiliary_loss_clip": 0.0106851, - "auxiliary_loss_mlp": 0.01035355, - "balance_loss_clip": 1.03517592, - "balance_loss_mlp": 1.02242661, - "epoch": 0.7605591462498121, - "flos": 25228036356480.0, - "grad_norm": 1.7702335478327527, - "language_loss": 0.68660265, - "learning_rate": 5.717433102763462e-07, - "loss": 0.7076413, - "num_input_tokens_seen": 272854760, - "step": 12650, - "time_per_iteration": 2.8096909523010254 - }, - { - "auxiliary_loss_clip": 0.01020249, - "auxiliary_loss_mlp": 0.01003738, - "balance_loss_clip": 1.00758457, - "balance_loss_mlp": 1.00275445, - "epoch": 0.76061926950248, - "flos": 66783757662720.0, - "grad_norm": 0.8336646667507255, - "language_loss": 0.62671125, - "learning_rate": 5.714707084204838e-07, - "loss": 0.64695108, - "num_input_tokens_seen": 272919030, - "step": 12651, - "time_per_iteration": 4.8483662605285645 - }, - { - "auxiliary_loss_clip": 0.01076594, - "auxiliary_loss_mlp": 0.01036234, - "balance_loss_clip": 1.0368011, - "balance_loss_mlp": 1.02373505, - "epoch": 0.7606793927551481, - "flos": 25338354001920.0, - "grad_norm": 1.4829724837753475, - "language_loss": 0.71288872, - "learning_rate": 5.711981607345951e-07, - "loss": 0.73401701, - "num_input_tokens_seen": 272938925, - "step": 12652, - "time_per_iteration": 2.85551118850708 - }, - { - "auxiliary_loss_clip": 0.01059292, - "auxiliary_loss_mlp": 0.01037324, - "balance_loss_clip": 1.0363282, - "balance_loss_mlp": 1.02425838, - "epoch": 0.760739516007816, - "flos": 18223624523520.0, - "grad_norm": 2.085886887216812, - "language_loss": 0.80261797, - "learning_rate": 5.709256672290152e-07, - "loss": 0.82358414, - "num_input_tokens_seen": 272954945, - "step": 12653, - "time_per_iteration": 6.101754665374756 - }, - { - "auxiliary_loss_clip": 0.01116949, - "auxiliary_loss_mlp": 0.01032648, - "balance_loss_clip": 1.04151583, - "balance_loss_mlp": 1.01946926, - "epoch": 0.760799639260484, - "flos": 22559119079040.0, - "grad_norm": 1.7273806090867658, - "language_loss": 0.79977405, - "learning_rate": 5.706532279140785e-07, - "loss": 0.82127005, - "num_input_tokens_seen": 272972855, - "step": 12654, - "time_per_iteration": 2.7514119148254395 - }, - { - "auxiliary_loss_clip": 0.01074955, - "auxiliary_loss_mlp": 0.0103594, - "balance_loss_clip": 1.03562033, - "balance_loss_mlp": 1.02221942, - "epoch": 0.760859762513152, - "flos": 22309324922880.0, - "grad_norm": 2.0189360189402357, - "language_loss": 0.79458809, - "learning_rate": 5.703808428001136e-07, - "loss": 0.81569707, - "num_input_tokens_seen": 272989895, - "step": 12655, - "time_per_iteration": 2.78948712348938 - }, - { - "auxiliary_loss_clip": 0.01094485, - "auxiliary_loss_mlp": 0.01028246, - "balance_loss_clip": 1.03769636, - "balance_loss_mlp": 1.01743925, - "epoch": 0.7609198857658199, - "flos": 24863902231680.0, - "grad_norm": 1.6124233768982144, - "language_loss": 0.68051422, - "learning_rate": 5.701085118974505e-07, - "loss": 0.70174152, - "num_input_tokens_seen": 273011695, - "step": 12656, - "time_per_iteration": 2.795375347137451 - }, - { - "auxiliary_loss_clip": 0.01101665, - "auxiliary_loss_mlp": 0.01030913, - "balance_loss_clip": 1.03489399, - "balance_loss_mlp": 1.01786578, - "epoch": 0.760980009018488, - "flos": 16836790366080.0, - "grad_norm": 2.7645541741379347, - "language_loss": 0.73798579, - "learning_rate": 5.698362352164164e-07, - "loss": 0.75931156, - "num_input_tokens_seen": 273028815, - "step": 12657, - "time_per_iteration": 4.21469521522522 - }, - { - "auxiliary_loss_clip": 0.01012936, - "auxiliary_loss_mlp": 0.01000637, - "balance_loss_clip": 1.00884259, - "balance_loss_mlp": 0.99950486, - "epoch": 0.7610401322711559, - "flos": 61230603029760.0, - "grad_norm": 0.85360954009419, - "language_loss": 0.64932978, - "learning_rate": 5.695640127673347e-07, - "loss": 0.66946548, - "num_input_tokens_seen": 273084080, - "step": 12658, - "time_per_iteration": 3.2157864570617676 - }, - { - "auxiliary_loss_clip": 0.01092364, - "auxiliary_loss_mlp": 0.01034541, - "balance_loss_clip": 1.03773546, - "balance_loss_mlp": 1.02238202, - "epoch": 0.7611002555238239, - "flos": 19640730867840.0, - "grad_norm": 2.0304267981282353, - "language_loss": 0.79544449, - "learning_rate": 5.692918445605293e-07, - "loss": 0.81671351, - "num_input_tokens_seen": 273102295, - "step": 12659, - "time_per_iteration": 2.6572675704956055 - }, - { - "auxiliary_loss_clip": 0.01097791, - "auxiliary_loss_mlp": 0.01028001, - "balance_loss_clip": 1.03714883, - "balance_loss_mlp": 1.015746, - "epoch": 0.7611603787764918, - "flos": 26872206526080.0, - "grad_norm": 1.589308819258603, - "language_loss": 0.68846476, - "learning_rate": 5.690197306063209e-07, - "loss": 0.7097227, - "num_input_tokens_seen": 273123400, - "step": 12660, - "time_per_iteration": 2.815166473388672 - }, - { - "auxiliary_loss_clip": 0.01111243, - "auxiliary_loss_mlp": 0.01032754, - "balance_loss_clip": 1.03771544, - "balance_loss_mlp": 1.02017736, - "epoch": 0.7612205020291598, - "flos": 27344252085120.0, - "grad_norm": 2.023337576106856, - "language_loss": 0.70192313, - "learning_rate": 5.687476709150281e-07, - "loss": 0.7233631, - "num_input_tokens_seen": 273145150, - "step": 12661, - "time_per_iteration": 2.7765588760375977 - }, - { - "auxiliary_loss_clip": 0.01099752, - "auxiliary_loss_mlp": 0.01034729, - "balance_loss_clip": 1.03683341, - "balance_loss_mlp": 1.02217579, - "epoch": 0.7612806252818277, - "flos": 29314598682240.0, - "grad_norm": 1.6042830797514005, - "language_loss": 0.83190757, - "learning_rate": 5.68475665496966e-07, - "loss": 0.85325241, - "num_input_tokens_seen": 273165180, - "step": 12662, - "time_per_iteration": 2.7277190685272217 - }, - { - "auxiliary_loss_clip": 0.01088049, - "auxiliary_loss_mlp": 0.0104358, - "balance_loss_clip": 1.03722358, - "balance_loss_mlp": 1.03130126, - "epoch": 0.7613407485344957, - "flos": 19026048401280.0, - "grad_norm": 1.7304537436557308, - "language_loss": 0.68582624, - "learning_rate": 5.682037143624505e-07, - "loss": 0.70714259, - "num_input_tokens_seen": 273184005, - "step": 12663, - "time_per_iteration": 2.770902156829834 - }, - { - "auxiliary_loss_clip": 0.01100065, - "auxiliary_loss_mlp": 0.01026036, - "balance_loss_clip": 1.03998029, - "balance_loss_mlp": 1.0138464, - "epoch": 0.7614008717871636, - "flos": 23256037733760.0, - "grad_norm": 2.1194736525192357, - "language_loss": 0.70156157, - "learning_rate": 5.67931817521794e-07, - "loss": 0.72282255, - "num_input_tokens_seen": 273203565, - "step": 12664, - "time_per_iteration": 2.7074570655822754 - }, - { - "auxiliary_loss_clip": 0.01105735, - "auxiliary_loss_mlp": 0.01039411, - "balance_loss_clip": 1.04057264, - "balance_loss_mlp": 1.02536225, - "epoch": 0.7614609950398317, - "flos": 21579907438080.0, - "grad_norm": 1.8390360170720685, - "language_loss": 0.79482293, - "learning_rate": 5.676599749853066e-07, - "loss": 0.8162744, - "num_input_tokens_seen": 273221645, - "step": 12665, - "time_per_iteration": 2.7299532890319824 - }, - { - "auxiliary_loss_clip": 0.0111148, - "auxiliary_loss_mlp": 0.00769447, - "balance_loss_clip": 1.04143631, - "balance_loss_mlp": 1.00019884, - "epoch": 0.7615211182924996, - "flos": 29277897960960.0, - "grad_norm": 1.9892685132164005, - "language_loss": 0.87823522, - "learning_rate": 5.673881867632959e-07, - "loss": 0.89704448, - "num_input_tokens_seen": 273242040, - "step": 12666, - "time_per_iteration": 2.7689883708953857 - }, - { - "auxiliary_loss_clip": 0.01055194, - "auxiliary_loss_mlp": 0.01033948, - "balance_loss_clip": 1.03460038, - "balance_loss_mlp": 1.02016783, - "epoch": 0.7615812415451676, - "flos": 13261129136640.0, - "grad_norm": 2.3749707608693513, - "language_loss": 0.8353771, - "learning_rate": 5.671164528660693e-07, - "loss": 0.85626853, - "num_input_tokens_seen": 273257365, - "step": 12667, - "time_per_iteration": 2.920854091644287 - }, - { - "auxiliary_loss_clip": 0.01089109, - "auxiliary_loss_mlp": 0.01037332, - "balance_loss_clip": 1.03897429, - "balance_loss_mlp": 1.02510726, - "epoch": 0.7616413647978356, - "flos": 18584741905920.0, - "grad_norm": 1.7012297272780605, - "language_loss": 0.78357065, - "learning_rate": 5.668447733039296e-07, - "loss": 0.80483508, - "num_input_tokens_seen": 273274710, - "step": 12668, - "time_per_iteration": 2.694464683532715 - }, - { - "auxiliary_loss_clip": 0.01075984, - "auxiliary_loss_mlp": 0.01034063, - "balance_loss_clip": 1.03536892, - "balance_loss_mlp": 1.02142668, - "epoch": 0.7617014880505035, - "flos": 18516188799360.0, - "grad_norm": 1.900278924462426, - "language_loss": 0.64169192, - "learning_rate": 5.6657314808718e-07, - "loss": 0.66279244, - "num_input_tokens_seen": 273292870, - "step": 12669, - "time_per_iteration": 2.793607234954834 - }, - { - "auxiliary_loss_clip": 0.01086136, - "auxiliary_loss_mlp": 0.01037174, - "balance_loss_clip": 1.03618228, - "balance_loss_mlp": 1.02251148, - "epoch": 0.7617616113031715, - "flos": 24973178382720.0, - "grad_norm": 2.3594416048527886, - "language_loss": 0.66683328, - "learning_rate": 5.663015772261202e-07, - "loss": 0.68806642, - "num_input_tokens_seen": 273312375, - "step": 12670, - "time_per_iteration": 2.784454584121704 - }, - { - "auxiliary_loss_clip": 0.01101863, - "auxiliary_loss_mlp": 0.01036176, - "balance_loss_clip": 1.03805709, - "balance_loss_mlp": 1.02371264, - "epoch": 0.7618217345558395, - "flos": 23295036925440.0, - "grad_norm": 1.6675852646548754, - "language_loss": 0.73051012, - "learning_rate": 5.660300607310493e-07, - "loss": 0.75189054, - "num_input_tokens_seen": 273332590, - "step": 12671, - "time_per_iteration": 2.7376444339752197 - }, - { - "auxiliary_loss_clip": 0.01072018, - "auxiliary_loss_mlp": 0.01036198, - "balance_loss_clip": 1.03299487, - "balance_loss_mlp": 1.02336478, - "epoch": 0.7618818578085075, - "flos": 25482894330240.0, - "grad_norm": 1.6810616532176517, - "language_loss": 0.73379242, - "learning_rate": 5.657585986122613e-07, - "loss": 0.75487459, - "num_input_tokens_seen": 273352885, - "step": 12672, - "time_per_iteration": 2.839824914932251 - }, - { - "auxiliary_loss_clip": 0.01001779, - "auxiliary_loss_mlp": 0.01000945, - "balance_loss_clip": 1.00902843, - "balance_loss_mlp": 0.99994415, - "epoch": 0.7619419810611754, - "flos": 61151994115200.0, - "grad_norm": 0.7605802796728681, - "language_loss": 0.56720763, - "learning_rate": 5.654871908800506e-07, - "loss": 0.58723491, - "num_input_tokens_seen": 273411730, - "step": 12673, - "time_per_iteration": 3.201223850250244 - }, - { - "auxiliary_loss_clip": 0.01100506, - "auxiliary_loss_mlp": 0.010336, - "balance_loss_clip": 1.03872013, - "balance_loss_mlp": 1.02017713, - "epoch": 0.7620021043138434, - "flos": 23258659426560.0, - "grad_norm": 1.740985764323004, - "language_loss": 0.74985719, - "learning_rate": 5.652158375447102e-07, - "loss": 0.77119827, - "num_input_tokens_seen": 273430020, - "step": 12674, - "time_per_iteration": 2.7523674964904785 - }, - { - "auxiliary_loss_clip": 0.01078335, - "auxiliary_loss_mlp": 0.01034219, - "balance_loss_clip": 1.03280282, - "balance_loss_mlp": 1.02115917, - "epoch": 0.7620622275665113, - "flos": 25082490447360.0, - "grad_norm": 2.016968785159948, - "language_loss": 0.7202276, - "learning_rate": 5.649445386165286e-07, - "loss": 0.74135315, - "num_input_tokens_seen": 273448690, - "step": 12675, - "time_per_iteration": 2.796057939529419 - }, - { - "auxiliary_loss_clip": 0.01095004, - "auxiliary_loss_mlp": 0.01030807, - "balance_loss_clip": 1.03599072, - "balance_loss_mlp": 1.01886785, - "epoch": 0.7621223508191793, - "flos": 20155007842560.0, - "grad_norm": 2.355672138276969, - "language_loss": 0.73052359, - "learning_rate": 5.646732941057936e-07, - "loss": 0.7517817, - "num_input_tokens_seen": 273465190, - "step": 12676, - "time_per_iteration": 2.734591484069824 - }, - { - "auxiliary_loss_clip": 0.01081109, - "auxiliary_loss_mlp": 0.00771918, - "balance_loss_clip": 1.03906035, - "balance_loss_mlp": 1.00022256, - "epoch": 0.7621824740718472, - "flos": 18000187971840.0, - "grad_norm": 2.93709923203383, - "language_loss": 0.54046768, - "learning_rate": 5.644021040227927e-07, - "loss": 0.55899793, - "num_input_tokens_seen": 273478620, - "step": 12677, - "time_per_iteration": 2.8109676837921143 - }, - { - "auxiliary_loss_clip": 0.01052826, - "auxiliary_loss_mlp": 0.01035963, - "balance_loss_clip": 1.0335747, - "balance_loss_mlp": 1.02283812, - "epoch": 0.7622425973245153, - "flos": 21725668828800.0, - "grad_norm": 2.0762274911054184, - "language_loss": 0.78760284, - "learning_rate": 5.641309683778064e-07, - "loss": 0.80849069, - "num_input_tokens_seen": 273497635, - "step": 12678, - "time_per_iteration": 2.860340118408203 - }, - { - "auxiliary_loss_clip": 0.01073918, - "auxiliary_loss_mlp": 0.01036673, - "balance_loss_clip": 1.0344305, - "balance_loss_mlp": 1.02257085, - "epoch": 0.7623027205771832, - "flos": 19718549683200.0, - "grad_norm": 3.9645067236030114, - "language_loss": 0.77204514, - "learning_rate": 5.638598871811175e-07, - "loss": 0.79315102, - "num_input_tokens_seen": 273513955, - "step": 12679, - "time_per_iteration": 2.772916793823242 - }, - { - "auxiliary_loss_clip": 0.01100617, - "auxiliary_loss_mlp": 0.01027269, - "balance_loss_clip": 1.03917551, - "balance_loss_mlp": 1.01456678, - "epoch": 0.7623628438298512, - "flos": 23988831096960.0, - "grad_norm": 1.434526846534088, - "language_loss": 0.80099726, - "learning_rate": 5.635888604430059e-07, - "loss": 0.82227612, - "num_input_tokens_seen": 273533970, - "step": 12680, - "time_per_iteration": 2.7801437377929688 - }, - { - "auxiliary_loss_clip": 0.01089966, - "auxiliary_loss_mlp": 0.01032719, - "balance_loss_clip": 1.03768969, - "balance_loss_mlp": 1.01880169, - "epoch": 0.7624229670825191, - "flos": 22345702421760.0, - "grad_norm": 1.9046696360663191, - "language_loss": 0.62818468, - "learning_rate": 5.633178881737493e-07, - "loss": 0.64941156, - "num_input_tokens_seen": 273553090, - "step": 12681, - "time_per_iteration": 2.8114664554595947 - }, - { - "auxiliary_loss_clip": 0.01076613, - "auxiliary_loss_mlp": 0.01031955, - "balance_loss_clip": 1.03848743, - "balance_loss_mlp": 1.01964092, - "epoch": 0.7624830903351871, - "flos": 22711775880960.0, - "grad_norm": 2.2465025277457755, - "language_loss": 0.76199776, - "learning_rate": 5.63046970383622e-07, - "loss": 0.78308344, - "num_input_tokens_seen": 273572460, - "step": 12682, - "time_per_iteration": 2.8621296882629395 - }, - { - "auxiliary_loss_clip": 0.01085809, - "auxiliary_loss_mlp": 0.01032162, - "balance_loss_clip": 1.03555107, - "balance_loss_mlp": 1.02058053, - "epoch": 0.7625432135878552, - "flos": 25593714766080.0, - "grad_norm": 1.5266925893040741, - "language_loss": 0.68380392, - "learning_rate": 5.627761070828974e-07, - "loss": 0.70498371, - "num_input_tokens_seen": 273592815, - "step": 12683, - "time_per_iteration": 2.804927349090576 - }, - { - "auxiliary_loss_clip": 0.01067779, - "auxiliary_loss_mlp": 0.00772982, - "balance_loss_clip": 1.03292143, - "balance_loss_mlp": 1.00020671, - "epoch": 0.7626033368405231, - "flos": 23987645948160.0, - "grad_norm": 2.022263962104647, - "language_loss": 0.83156735, - "learning_rate": 5.625052982818472e-07, - "loss": 0.84997493, - "num_input_tokens_seen": 273611790, - "step": 12684, - "time_per_iteration": 2.7787985801696777 - }, - { - "auxiliary_loss_clip": 0.0108949, - "auxiliary_loss_mlp": 0.01041206, - "balance_loss_clip": 1.03807712, - "balance_loss_mlp": 1.02769983, - "epoch": 0.7626634600931911, - "flos": 12599115523200.0, - "grad_norm": 2.242764424782362, - "language_loss": 0.82618159, - "learning_rate": 5.622345439907396e-07, - "loss": 0.84748858, - "num_input_tokens_seen": 273628340, - "step": 12685, - "time_per_iteration": 2.735823631286621 - }, - { - "auxiliary_loss_clip": 0.0107975, - "auxiliary_loss_mlp": 0.00770301, - "balance_loss_clip": 1.03726244, - "balance_loss_mlp": 1.00022709, - "epoch": 0.762723583345859, - "flos": 26322593546880.0, - "grad_norm": 2.461504636054881, - "language_loss": 0.77635926, - "learning_rate": 5.619638442198422e-07, - "loss": 0.79485977, - "num_input_tokens_seen": 273646585, - "step": 12686, - "time_per_iteration": 2.906090021133423 - }, - { - "auxiliary_loss_clip": 0.01052651, - "auxiliary_loss_mlp": 0.01057311, - "balance_loss_clip": 1.03302336, - "balance_loss_mlp": 1.0405736, - "epoch": 0.762783706598527, - "flos": 21907053532800.0, - "grad_norm": 1.7909742891247455, - "language_loss": 0.72059739, - "learning_rate": 5.616931989794198e-07, - "loss": 0.74169701, - "num_input_tokens_seen": 273665410, - "step": 12687, - "time_per_iteration": 2.736345052719116 - }, - { - "auxiliary_loss_clip": 0.01084081, - "auxiliary_loss_mlp": 0.01042723, - "balance_loss_clip": 1.03387547, - "balance_loss_mlp": 1.02746391, - "epoch": 0.7628438298511949, - "flos": 15339782217600.0, - "grad_norm": 1.8904556177994511, - "language_loss": 0.65018427, - "learning_rate": 5.614226082797369e-07, - "loss": 0.67145234, - "num_input_tokens_seen": 273683035, - "step": 12688, - "time_per_iteration": 2.7697956562042236 - }, - { - "auxiliary_loss_clip": 0.01101479, - "auxiliary_loss_mlp": 0.01027744, - "balance_loss_clip": 1.03997755, - "balance_loss_mlp": 1.01574564, - "epoch": 0.7629039531038629, - "flos": 13006307076480.0, - "grad_norm": 3.084065426087135, - "language_loss": 0.70538044, - "learning_rate": 5.611520721310515e-07, - "loss": 0.72667265, - "num_input_tokens_seen": 273700130, - "step": 12689, - "time_per_iteration": 2.9508743286132812 - }, - { - "auxiliary_loss_clip": 0.01081126, - "auxiliary_loss_mlp": 0.01040898, - "balance_loss_clip": 1.03614342, - "balance_loss_mlp": 1.0274868, - "epoch": 0.7629640763565309, - "flos": 26171660597760.0, - "grad_norm": 1.827453823319608, - "language_loss": 0.69980061, - "learning_rate": 5.608815905436238e-07, - "loss": 0.72102082, - "num_input_tokens_seen": 273720310, - "step": 12690, - "time_per_iteration": 2.8916642665863037 - }, - { - "auxiliary_loss_clip": 0.01084164, - "auxiliary_loss_mlp": 0.01040929, - "balance_loss_clip": 1.03480482, - "balance_loss_mlp": 1.02747643, - "epoch": 0.7630241996091989, - "flos": 36793713680640.0, - "grad_norm": 1.455347798519734, - "language_loss": 0.69115114, - "learning_rate": 5.606111635277109e-07, - "loss": 0.71240205, - "num_input_tokens_seen": 273744475, - "step": 12691, - "time_per_iteration": 4.387454032897949 - }, - { - "auxiliary_loss_clip": 0.01093867, - "auxiliary_loss_mlp": 0.01037257, - "balance_loss_clip": 1.03709197, - "balance_loss_mlp": 1.02576542, - "epoch": 0.7630843228618668, - "flos": 21835160461440.0, - "grad_norm": 1.950930402576883, - "language_loss": 0.81791067, - "learning_rate": 5.603407910935662e-07, - "loss": 0.83922184, - "num_input_tokens_seen": 273764635, - "step": 12692, - "time_per_iteration": 5.863187551498413 - }, - { - "auxiliary_loss_clip": 0.01078564, - "auxiliary_loss_mlp": 0.010271, - "balance_loss_clip": 1.04068136, - "balance_loss_mlp": 1.01536989, - "epoch": 0.7631444461145348, - "flos": 12640520926080.0, - "grad_norm": 2.677454083590648, - "language_loss": 0.77390575, - "learning_rate": 5.600704732514438e-07, - "loss": 0.79496241, - "num_input_tokens_seen": 273780115, - "step": 12693, - "time_per_iteration": 2.8327314853668213 - }, - { - "auxiliary_loss_clip": 0.0107301, - "auxiliary_loss_mlp": 0.01034355, - "balance_loss_clip": 1.03885221, - "balance_loss_mlp": 1.02155745, - "epoch": 0.7632045693672027, - "flos": 16836610798080.0, - "grad_norm": 3.1941491202097523, - "language_loss": 0.72766727, - "learning_rate": 5.598002100115933e-07, - "loss": 0.74874091, - "num_input_tokens_seen": 273796605, - "step": 12694, - "time_per_iteration": 2.771289587020874 - }, - { - "auxiliary_loss_clip": 0.01096742, - "auxiliary_loss_mlp": 0.01029277, - "balance_loss_clip": 1.03683043, - "balance_loss_mlp": 1.01703393, - "epoch": 0.7632646926198707, - "flos": 22017335264640.0, - "grad_norm": 1.9917055644917767, - "language_loss": 0.70419681, - "learning_rate": 5.595300013842625e-07, - "loss": 0.72545701, - "num_input_tokens_seen": 273816515, - "step": 12695, - "time_per_iteration": 2.616629123687744 - }, - { - "auxiliary_loss_clip": 0.01109838, - "auxiliary_loss_mlp": 0.01031794, - "balance_loss_clip": 1.03797019, - "balance_loss_mlp": 1.0198853, - "epoch": 0.7633248158725388, - "flos": 23114011357440.0, - "grad_norm": 1.5503240571511046, - "language_loss": 0.72249472, - "learning_rate": 5.592598473796985e-07, - "loss": 0.74391103, - "num_input_tokens_seen": 273837060, - "step": 12696, - "time_per_iteration": 2.7050669193267822 - }, - { - "auxiliary_loss_clip": 0.01051627, - "auxiliary_loss_mlp": 0.01040707, - "balance_loss_clip": 1.03538561, - "balance_loss_mlp": 1.02642596, - "epoch": 0.7633849391252067, - "flos": 10889839952640.0, - "grad_norm": 2.077421826663572, - "language_loss": 0.71310508, - "learning_rate": 5.589897480081453e-07, - "loss": 0.73402846, - "num_input_tokens_seen": 273853365, - "step": 12697, - "time_per_iteration": 4.246352672576904 - }, - { - "auxiliary_loss_clip": 0.01077141, - "auxiliary_loss_mlp": 0.01034421, - "balance_loss_clip": 1.0388602, - "balance_loss_mlp": 1.02219009, - "epoch": 0.7634450623778747, - "flos": 20994168355200.0, - "grad_norm": 3.071082049112887, - "language_loss": 0.66922784, - "learning_rate": 5.587197032798461e-07, - "loss": 0.69034344, - "num_input_tokens_seen": 273870750, - "step": 12698, - "time_per_iteration": 2.7623679637908936 - }, - { - "auxiliary_loss_clip": 0.01097288, - "auxiliary_loss_mlp": 0.01029151, - "balance_loss_clip": 1.03538871, - "balance_loss_mlp": 1.01636612, - "epoch": 0.7635051856305426, - "flos": 18882046776960.0, - "grad_norm": 1.6894035015942557, - "language_loss": 0.72252488, - "learning_rate": 5.5844971320504e-07, - "loss": 0.74378926, - "num_input_tokens_seen": 273890890, - "step": 12699, - "time_per_iteration": 2.681185483932495 - }, - { - "auxiliary_loss_clip": 0.01088089, - "auxiliary_loss_mlp": 0.01032373, - "balance_loss_clip": 1.03612185, - "balance_loss_mlp": 1.02065527, - "epoch": 0.7635653088832106, - "flos": 34786989584640.0, - "grad_norm": 1.7379546952285325, - "language_loss": 0.73000193, - "learning_rate": 5.581797777939648e-07, - "loss": 0.75120658, - "num_input_tokens_seen": 273914015, - "step": 12700, - "time_per_iteration": 2.788801908493042 - }, - { - "auxiliary_loss_clip": 0.01109919, - "auxiliary_loss_mlp": 0.01030309, - "balance_loss_clip": 1.03708696, - "balance_loss_mlp": 1.01822746, - "epoch": 0.7636254321358785, - "flos": 23178434400000.0, - "grad_norm": 2.5171117546055717, - "language_loss": 0.69465768, - "learning_rate": 5.579098970568574e-07, - "loss": 0.71606004, - "num_input_tokens_seen": 273927415, - "step": 12701, - "time_per_iteration": 2.6201059818267822 - }, - { - "auxiliary_loss_clip": 0.01083521, - "auxiliary_loss_mlp": 0.01031087, - "balance_loss_clip": 1.03899217, - "balance_loss_mlp": 1.01891518, - "epoch": 0.7636855553885465, - "flos": 21325229032320.0, - "grad_norm": 2.215440552723354, - "language_loss": 0.64664185, - "learning_rate": 5.576400710039508e-07, - "loss": 0.66778791, - "num_input_tokens_seen": 273946690, - "step": 12702, - "time_per_iteration": 2.7970054149627686 - }, - { - "auxiliary_loss_clip": 0.01079185, - "auxiliary_loss_mlp": 0.01033415, - "balance_loss_clip": 1.03836131, - "balance_loss_mlp": 1.02095747, - "epoch": 0.7637456786412145, - "flos": 28658079849600.0, - "grad_norm": 1.9784000831539899, - "language_loss": 0.66083431, - "learning_rate": 5.57370299645477e-07, - "loss": 0.68196028, - "num_input_tokens_seen": 273966870, - "step": 12703, - "time_per_iteration": 2.822849750518799 - }, - { - "auxiliary_loss_clip": 0.01087834, - "auxiliary_loss_mlp": 0.01026937, - "balance_loss_clip": 1.03842688, - "balance_loss_mlp": 1.01438999, - "epoch": 0.7638058018938825, - "flos": 21907269014400.0, - "grad_norm": 2.027090239685764, - "language_loss": 0.83859146, - "learning_rate": 5.571005829916668e-07, - "loss": 0.85973918, - "num_input_tokens_seen": 273986360, - "step": 12704, - "time_per_iteration": 2.728527784347534 - }, - { - "auxiliary_loss_clip": 0.01088663, - "auxiliary_loss_mlp": 0.01032796, - "balance_loss_clip": 1.03736877, - "balance_loss_mlp": 1.02039814, - "epoch": 0.7638659251465504, - "flos": 29643899592960.0, - "grad_norm": 1.895547997363001, - "language_loss": 0.67812586, - "learning_rate": 5.568309210527469e-07, - "loss": 0.69934046, - "num_input_tokens_seen": 274009745, - "step": 12705, - "time_per_iteration": 2.818378448486328 - }, - { - "auxiliary_loss_clip": 0.01083042, - "auxiliary_loss_mlp": 0.01032131, - "balance_loss_clip": 1.03550816, - "balance_loss_mlp": 1.01972699, - "epoch": 0.7639260483992184, - "flos": 26141172929280.0, - "grad_norm": 1.7310921121136604, - "language_loss": 0.73945439, - "learning_rate": 5.565613138389427e-07, - "loss": 0.76060611, - "num_input_tokens_seen": 274028775, - "step": 12706, - "time_per_iteration": 2.7738003730773926 - }, - { - "auxiliary_loss_clip": 0.0109458, - "auxiliary_loss_mlp": 0.01037611, - "balance_loss_clip": 1.03670621, - "balance_loss_mlp": 1.02431297, - "epoch": 0.7639861716518863, - "flos": 20156695781760.0, - "grad_norm": 2.5805411754522396, - "language_loss": 0.78420258, - "learning_rate": 5.562917613604781e-07, - "loss": 0.80552453, - "num_input_tokens_seen": 274047520, - "step": 12707, - "time_per_iteration": 2.785919666290283 - }, - { - "auxiliary_loss_clip": 0.01083532, - "auxiliary_loss_mlp": 0.01028293, - "balance_loss_clip": 1.03674436, - "balance_loss_mlp": 1.01594281, - "epoch": 0.7640462949045543, - "flos": 18583125793920.0, - "grad_norm": 1.8763992467573365, - "language_loss": 0.79923272, - "learning_rate": 5.560222636275751e-07, - "loss": 0.82035094, - "num_input_tokens_seen": 274065350, - "step": 12708, - "time_per_iteration": 2.7112326622009277 - }, - { - "auxiliary_loss_clip": 0.0102089, - "auxiliary_loss_mlp": 0.00999756, - "balance_loss_clip": 1.0106082, - "balance_loss_mlp": 0.99848616, - "epoch": 0.7641064181572224, - "flos": 68321991646080.0, - "grad_norm": 0.8077298698173723, - "language_loss": 0.56427336, - "learning_rate": 5.557528206504521e-07, - "loss": 0.58447981, - "num_input_tokens_seen": 274122315, - "step": 12709, - "time_per_iteration": 3.2111401557922363 - }, - { - "auxiliary_loss_clip": 0.01098648, - "auxiliary_loss_mlp": 0.01040257, - "balance_loss_clip": 1.03582978, - "balance_loss_mlp": 1.02636278, - "epoch": 0.7641665414098903, - "flos": 17968982031360.0, - "grad_norm": 1.9630774322237245, - "language_loss": 0.63484347, - "learning_rate": 5.554834324393271e-07, - "loss": 0.65623254, - "num_input_tokens_seen": 274140555, - "step": 12710, - "time_per_iteration": 2.685795545578003 - }, - { - "auxiliary_loss_clip": 0.01062185, - "auxiliary_loss_mlp": 0.00771699, - "balance_loss_clip": 1.03377032, - "balance_loss_mlp": 1.00016749, - "epoch": 0.7642266646625583, - "flos": 21252078984960.0, - "grad_norm": 2.5143918151768867, - "language_loss": 0.64498585, - "learning_rate": 5.552140990044154e-07, - "loss": 0.66332471, - "num_input_tokens_seen": 274161125, - "step": 12711, - "time_per_iteration": 2.845017671585083 - }, - { - "auxiliary_loss_clip": 0.01088311, - "auxiliary_loss_mlp": 0.01037404, - "balance_loss_clip": 1.03707993, - "balance_loss_mlp": 1.02514362, - "epoch": 0.7642867879152262, - "flos": 22747794243840.0, - "grad_norm": 1.7149688745487186, - "language_loss": 0.72759664, - "learning_rate": 5.549448203559293e-07, - "loss": 0.7488538, - "num_input_tokens_seen": 274180835, - "step": 12712, - "time_per_iteration": 2.7211430072784424 - }, - { - "auxiliary_loss_clip": 0.01077131, - "auxiliary_loss_mlp": 0.01032428, - "balance_loss_clip": 1.03835392, - "balance_loss_mlp": 1.02084625, - "epoch": 0.7643469111678942, - "flos": 23332132696320.0, - "grad_norm": 2.218446959632987, - "language_loss": 0.80380988, - "learning_rate": 5.546755965040804e-07, - "loss": 0.82490551, - "num_input_tokens_seen": 274201190, - "step": 12713, - "time_per_iteration": 2.822138786315918 - }, - { - "auxiliary_loss_clip": 0.01102023, - "auxiliary_loss_mlp": 0.00771212, - "balance_loss_clip": 1.03739047, - "balance_loss_mlp": 1.00028956, - "epoch": 0.7644070344205621, - "flos": 19857092440320.0, - "grad_norm": 2.084525894573783, - "language_loss": 0.83132589, - "learning_rate": 5.544064274590776e-07, - "loss": 0.85005832, - "num_input_tokens_seen": 274217595, - "step": 12714, - "time_per_iteration": 2.67500638961792 - }, - { - "auxiliary_loss_clip": 0.01104132, - "auxiliary_loss_mlp": 0.01037809, - "balance_loss_clip": 1.0384692, - "balance_loss_mlp": 1.02498782, - "epoch": 0.7644671576732301, - "flos": 22090628966400.0, - "grad_norm": 1.7447994690858495, - "language_loss": 0.73020244, - "learning_rate": 5.541373132311287e-07, - "loss": 0.75162184, - "num_input_tokens_seen": 274237885, - "step": 12715, - "time_per_iteration": 2.705496072769165 - }, - { - "auxiliary_loss_clip": 0.0106908, - "auxiliary_loss_mlp": 0.01029403, - "balance_loss_clip": 1.03376102, - "balance_loss_mlp": 1.01651025, - "epoch": 0.7645272809258981, - "flos": 25481421872640.0, - "grad_norm": 1.9750549289299242, - "language_loss": 0.63063681, - "learning_rate": 5.538682538304376e-07, - "loss": 0.65162164, - "num_input_tokens_seen": 274258820, - "step": 12716, - "time_per_iteration": 2.7983617782592773 - }, - { - "auxiliary_loss_clip": 0.01115577, - "auxiliary_loss_mlp": 0.01037115, - "balance_loss_clip": 1.03981853, - "balance_loss_mlp": 1.02357841, - "epoch": 0.7645874041785661, - "flos": 21541877913600.0, - "grad_norm": 1.536427490036212, - "language_loss": 0.79740059, - "learning_rate": 5.535992492672068e-07, - "loss": 0.81892753, - "num_input_tokens_seen": 274278835, - "step": 12717, - "time_per_iteration": 2.595195770263672 - }, - { - "auxiliary_loss_clip": 0.01110878, - "auxiliary_loss_mlp": 0.01037171, - "balance_loss_clip": 1.03890347, - "balance_loss_mlp": 1.02481461, - "epoch": 0.764647527431234, - "flos": 20630896156800.0, - "grad_norm": 2.30472579589713, - "language_loss": 0.66033196, - "learning_rate": 5.53330299551638e-07, - "loss": 0.68181252, - "num_input_tokens_seen": 274297110, - "step": 12718, - "time_per_iteration": 2.673990488052368 - }, - { - "auxiliary_loss_clip": 0.01063441, - "auxiliary_loss_mlp": 0.01036496, - "balance_loss_clip": 1.03585815, - "balance_loss_mlp": 1.02499259, - "epoch": 0.764707650683902, - "flos": 21434074220160.0, - "grad_norm": 2.1613863310626287, - "language_loss": 0.77098262, - "learning_rate": 5.530614046939286e-07, - "loss": 0.791982, - "num_input_tokens_seen": 274315610, - "step": 12719, - "time_per_iteration": 2.6510918140411377 - }, - { - "auxiliary_loss_clip": 0.01112525, - "auxiliary_loss_mlp": 0.01029144, - "balance_loss_clip": 1.03881288, - "balance_loss_mlp": 1.01615012, - "epoch": 0.7647677739365699, - "flos": 22711201263360.0, - "grad_norm": 2.267731943336326, - "language_loss": 0.7029593, - "learning_rate": 5.527925647042754e-07, - "loss": 0.72437602, - "num_input_tokens_seen": 274333975, - "step": 12720, - "time_per_iteration": 2.5991692543029785 - }, - { - "auxiliary_loss_clip": 0.01079824, - "auxiliary_loss_mlp": 0.01040879, - "balance_loss_clip": 1.03855467, - "balance_loss_mlp": 1.02823687, - "epoch": 0.7648278971892379, - "flos": 21324115710720.0, - "grad_norm": 1.5967062450845435, - "language_loss": 0.73703921, - "learning_rate": 5.52523779592875e-07, - "loss": 0.7582463, - "num_input_tokens_seen": 274353695, - "step": 12721, - "time_per_iteration": 2.764606237411499 - }, - { - "auxiliary_loss_clip": 0.01070414, - "auxiliary_loss_mlp": 0.01030705, - "balance_loss_clip": 1.03494334, - "balance_loss_mlp": 1.01805067, - "epoch": 0.764888020441906, - "flos": 20667345482880.0, - "grad_norm": 1.6944622449827433, - "language_loss": 0.73529649, - "learning_rate": 5.522550493699163e-07, - "loss": 0.75630772, - "num_input_tokens_seen": 274371120, - "step": 12722, - "time_per_iteration": 2.7863218784332275 - }, - { - "auxiliary_loss_clip": 0.01099467, - "auxiliary_loss_mlp": 0.01038218, - "balance_loss_clip": 1.03691196, - "balance_loss_mlp": 1.02573085, - "epoch": 0.7649481436945739, - "flos": 25082526360960.0, - "grad_norm": 1.8664873966014532, - "language_loss": 0.74043649, - "learning_rate": 5.519863740455912e-07, - "loss": 0.76181328, - "num_input_tokens_seen": 274389665, - "step": 12723, - "time_per_iteration": 2.6984498500823975 - }, - { - "auxiliary_loss_clip": 0.01111926, - "auxiliary_loss_mlp": 0.0103197, - "balance_loss_clip": 1.03712893, - "balance_loss_mlp": 1.01897049, - "epoch": 0.7650082669472419, - "flos": 24900890261760.0, - "grad_norm": 1.9718177009092177, - "language_loss": 0.73098785, - "learning_rate": 5.517177536300881e-07, - "loss": 0.75242674, - "num_input_tokens_seen": 274408750, - "step": 12724, - "time_per_iteration": 2.723292112350464 - }, - { - "auxiliary_loss_clip": 0.0109622, - "auxiliary_loss_mlp": 0.01027237, - "balance_loss_clip": 1.03798413, - "balance_loss_mlp": 1.01521456, - "epoch": 0.7650683901999098, - "flos": 14647388676480.0, - "grad_norm": 1.8049167073820385, - "language_loss": 0.83982503, - "learning_rate": 5.514491881335935e-07, - "loss": 0.86105955, - "num_input_tokens_seen": 274424600, - "step": 12725, - "time_per_iteration": 2.6900579929351807 - }, - { - "auxiliary_loss_clip": 0.01071599, - "auxiliary_loss_mlp": 0.01033335, - "balance_loss_clip": 1.03815186, - "balance_loss_mlp": 1.01962614, - "epoch": 0.7651285134525778, - "flos": 26352434770560.0, - "grad_norm": 1.764771346840138, - "language_loss": 0.77535796, - "learning_rate": 5.511806775662901e-07, - "loss": 0.79640734, - "num_input_tokens_seen": 274443075, - "step": 12726, - "time_per_iteration": 2.7554757595062256 - }, - { - "auxiliary_loss_clip": 0.01098675, - "auxiliary_loss_mlp": 0.0103653, - "balance_loss_clip": 1.03659284, - "balance_loss_mlp": 1.0239116, - "epoch": 0.7651886367052457, - "flos": 26646866553600.0, - "grad_norm": 1.727505900288767, - "language_loss": 0.70817876, - "learning_rate": 5.509122219383615e-07, - "loss": 0.72953087, - "num_input_tokens_seen": 274463240, - "step": 12727, - "time_per_iteration": 2.679713249206543 - }, - { - "auxiliary_loss_clip": 0.0110535, - "auxiliary_loss_mlp": 0.01031096, - "balance_loss_clip": 1.03530371, - "balance_loss_mlp": 1.01887083, - "epoch": 0.7652487599579137, - "flos": 25702847262720.0, - "grad_norm": 1.645567589950576, - "language_loss": 0.79781538, - "learning_rate": 5.506438212599864e-07, - "loss": 0.81917983, - "num_input_tokens_seen": 274482750, - "step": 12728, - "time_per_iteration": 2.6556482315063477 - }, - { - "auxiliary_loss_clip": 0.01112141, - "auxiliary_loss_mlp": 0.01029917, - "balance_loss_clip": 1.03871763, - "balance_loss_mlp": 1.01615465, - "epoch": 0.7653088832105817, - "flos": 28585576247040.0, - "grad_norm": 2.018168225354916, - "language_loss": 0.55207121, - "learning_rate": 5.503754755413424e-07, - "loss": 0.57349181, - "num_input_tokens_seen": 274503545, - "step": 12729, - "time_per_iteration": 2.656604290008545 - }, - { - "auxiliary_loss_clip": 0.01087792, - "auxiliary_loss_mlp": 0.00770692, - "balance_loss_clip": 1.03700304, - "balance_loss_mlp": 1.00016689, - "epoch": 0.7653690064632497, - "flos": 23366750428800.0, - "grad_norm": 2.0285553204704914, - "language_loss": 0.78009534, - "learning_rate": 5.501071847926055e-07, - "loss": 0.79868019, - "num_input_tokens_seen": 274523825, - "step": 12730, - "time_per_iteration": 4.308157920837402 - }, - { - "auxiliary_loss_clip": 0.01104921, - "auxiliary_loss_mlp": 0.01038983, - "balance_loss_clip": 1.04124045, - "balance_loss_mlp": 1.02547646, - "epoch": 0.7654291297159176, - "flos": 15773905992960.0, - "grad_norm": 1.8100841028281673, - "language_loss": 0.69162709, - "learning_rate": 5.498389490239495e-07, - "loss": 0.7130661, - "num_input_tokens_seen": 274541625, - "step": 12731, - "time_per_iteration": 5.375198841094971 - }, - { - "auxiliary_loss_clip": 0.0111224, - "auxiliary_loss_mlp": 0.01032378, - "balance_loss_clip": 1.03824425, - "balance_loss_mlp": 1.0195576, - "epoch": 0.7654892529685856, - "flos": 18033800123520.0, - "grad_norm": 2.185341705177071, - "language_loss": 0.70105004, - "learning_rate": 5.495707682455471e-07, - "loss": 0.72249627, - "num_input_tokens_seen": 274557580, - "step": 12732, - "time_per_iteration": 4.1254401206970215 - }, - { - "auxiliary_loss_clip": 0.01092112, - "auxiliary_loss_mlp": 0.01027482, - "balance_loss_clip": 1.0373385, - "balance_loss_mlp": 1.01429737, - "epoch": 0.7655493762212535, - "flos": 27236017428480.0, - "grad_norm": 1.4842742362274353, - "language_loss": 0.78410125, - "learning_rate": 5.493026424675653e-07, - "loss": 0.8052972, - "num_input_tokens_seen": 274578135, - "step": 12733, - "time_per_iteration": 2.7428579330444336 - }, - { - "auxiliary_loss_clip": 0.0109795, - "auxiliary_loss_mlp": 0.01031014, - "balance_loss_clip": 1.03692389, - "balance_loss_mlp": 1.0184319, - "epoch": 0.7656094994739215, - "flos": 20773964027520.0, - "grad_norm": 1.7566510390792163, - "language_loss": 0.7753557, - "learning_rate": 5.490345717001726e-07, - "loss": 0.79664528, - "num_input_tokens_seen": 274595655, - "step": 12734, - "time_per_iteration": 2.7525999546051025 - }, - { - "auxiliary_loss_clip": 0.01085843, - "auxiliary_loss_mlp": 0.01034242, - "balance_loss_clip": 1.03541505, - "balance_loss_mlp": 1.01981783, - "epoch": 0.7656696227265896, - "flos": 23039245198080.0, - "grad_norm": 1.5677045475604683, - "language_loss": 0.73221684, - "learning_rate": 5.48766555953535e-07, - "loss": 0.75341773, - "num_input_tokens_seen": 274616305, - "step": 12735, - "time_per_iteration": 2.7425713539123535 - }, - { - "auxiliary_loss_clip": 0.01081818, - "auxiliary_loss_mlp": 0.01035075, - "balance_loss_clip": 1.03768682, - "balance_loss_mlp": 1.02273107, - "epoch": 0.7657297459792575, - "flos": 27525636789120.0, - "grad_norm": 1.7042118812882554, - "language_loss": 0.72533989, - "learning_rate": 5.484985952378145e-07, - "loss": 0.74650872, - "num_input_tokens_seen": 274638110, - "step": 12736, - "time_per_iteration": 4.268921852111816 - }, - { - "auxiliary_loss_clip": 0.01100818, - "auxiliary_loss_mlp": 0.00771184, - "balance_loss_clip": 1.0399543, - "balance_loss_mlp": 1.00027192, - "epoch": 0.7657898692319255, - "flos": 17128456801920.0, - "grad_norm": 2.232664044830526, - "language_loss": 0.77698004, - "learning_rate": 5.482306895631728e-07, - "loss": 0.79570007, - "num_input_tokens_seen": 274656565, - "step": 12737, - "time_per_iteration": 2.751887321472168 - }, - { - "auxiliary_loss_clip": 0.0108412, - "auxiliary_loss_mlp": 0.01034529, - "balance_loss_clip": 1.03502047, - "balance_loss_mlp": 1.02128458, - "epoch": 0.7658499924845934, - "flos": 21465747037440.0, - "grad_norm": 1.8163284528378292, - "language_loss": 0.76455462, - "learning_rate": 5.479628389397699e-07, - "loss": 0.78574109, - "num_input_tokens_seen": 274674215, - "step": 12738, - "time_per_iteration": 2.7251851558685303 - }, - { - "auxiliary_loss_clip": 0.01092339, - "auxiliary_loss_mlp": 0.01031134, - "balance_loss_clip": 1.03848684, - "balance_loss_mlp": 1.01825941, - "epoch": 0.7659101157372614, - "flos": 29496665744640.0, - "grad_norm": 1.9441100679422159, - "language_loss": 0.62250507, - "learning_rate": 5.476950433777603e-07, - "loss": 0.64373976, - "num_input_tokens_seen": 274693445, - "step": 12739, - "time_per_iteration": 2.858171224594116 - }, - { - "auxiliary_loss_clip": 0.01112469, - "auxiliary_loss_mlp": 0.01035738, - "balance_loss_clip": 1.03928363, - "balance_loss_mlp": 1.02203465, - "epoch": 0.7659702389899293, - "flos": 18551812112640.0, - "grad_norm": 2.47113097275276, - "language_loss": 0.79031968, - "learning_rate": 5.474273028873004e-07, - "loss": 0.81180167, - "num_input_tokens_seen": 274712815, - "step": 12740, - "time_per_iteration": 2.624732732772827 - }, - { - "auxiliary_loss_clip": 0.01100888, - "auxiliary_loss_mlp": 0.01032866, - "balance_loss_clip": 1.03686976, - "balance_loss_mlp": 1.01987791, - "epoch": 0.7660303622425974, - "flos": 23549176627200.0, - "grad_norm": 1.653199646827083, - "language_loss": 0.65173864, - "learning_rate": 5.471596174785429e-07, - "loss": 0.67307615, - "num_input_tokens_seen": 274732690, - "step": 12741, - "time_per_iteration": 2.716336488723755 - }, - { - "auxiliary_loss_clip": 0.01083513, - "auxiliary_loss_mlp": 0.01030711, - "balance_loss_clip": 1.03482628, - "balance_loss_mlp": 1.0174545, - "epoch": 0.7660904854952653, - "flos": 18916736336640.0, - "grad_norm": 1.544754015503659, - "language_loss": 0.75767601, - "learning_rate": 5.468919871616386e-07, - "loss": 0.77881825, - "num_input_tokens_seen": 274752460, - "step": 12742, - "time_per_iteration": 2.7747738361358643 - }, - { - "auxiliary_loss_clip": 0.01086511, - "auxiliary_loss_mlp": 0.01031712, - "balance_loss_clip": 1.03885317, - "balance_loss_mlp": 1.01983905, - "epoch": 0.7661506087479333, - "flos": 23147515768320.0, - "grad_norm": 1.4566796365103731, - "language_loss": 0.76655585, - "learning_rate": 5.46624411946736e-07, - "loss": 0.78773808, - "num_input_tokens_seen": 274773070, - "step": 12743, - "time_per_iteration": 2.780097484588623 - }, - { - "auxiliary_loss_clip": 0.01085441, - "auxiliary_loss_mlp": 0.01034742, - "balance_loss_clip": 1.03478014, - "balance_loss_mlp": 1.02236819, - "epoch": 0.7662107320006012, - "flos": 17565776887680.0, - "grad_norm": 1.917782267543357, - "language_loss": 0.74838865, - "learning_rate": 5.463568918439805e-07, - "loss": 0.76959044, - "num_input_tokens_seen": 274790220, - "step": 12744, - "time_per_iteration": 2.8596222400665283 - }, - { - "auxiliary_loss_clip": 0.01099606, - "auxiliary_loss_mlp": 0.01033696, - "balance_loss_clip": 1.03648257, - "balance_loss_mlp": 1.02051127, - "epoch": 0.7662708552532692, - "flos": 22303075956480.0, - "grad_norm": 2.0417086586666424, - "language_loss": 0.71049422, - "learning_rate": 5.460894268635181e-07, - "loss": 0.73182726, - "num_input_tokens_seen": 274805095, - "step": 12745, - "time_per_iteration": 2.7712717056274414 - }, - { - "auxiliary_loss_clip": 0.01095184, - "auxiliary_loss_mlp": 0.01038801, - "balance_loss_clip": 1.03534567, - "balance_loss_mlp": 1.0241797, - "epoch": 0.7663309785059371, - "flos": 15742053607680.0, - "grad_norm": 2.301646519557661, - "language_loss": 0.77083957, - "learning_rate": 5.458220170154896e-07, - "loss": 0.79217947, - "num_input_tokens_seen": 274821800, - "step": 12746, - "time_per_iteration": 2.6804726123809814 - }, - { - "auxiliary_loss_clip": 0.0100528, - "auxiliary_loss_mlp": 0.01001059, - "balance_loss_clip": 1.01132298, - "balance_loss_mlp": 0.99997419, - "epoch": 0.7663911017586051, - "flos": 62163312514560.0, - "grad_norm": 0.6620577659541201, - "language_loss": 0.56773937, - "learning_rate": 5.455546623100362e-07, - "loss": 0.58780277, - "num_input_tokens_seen": 274886970, - "step": 12747, - "time_per_iteration": 3.3290786743164062 - }, - { - "auxiliary_loss_clip": 0.01108005, - "auxiliary_loss_mlp": 0.01035791, - "balance_loss_clip": 1.03717351, - "balance_loss_mlp": 1.02456689, - "epoch": 0.7664512250112732, - "flos": 26506025326080.0, - "grad_norm": 1.9151583390314333, - "language_loss": 0.72503966, - "learning_rate": 5.452873627572956e-07, - "loss": 0.7464776, - "num_input_tokens_seen": 274907240, - "step": 12748, - "time_per_iteration": 2.730177640914917 - }, - { - "auxiliary_loss_clip": 0.01074476, - "auxiliary_loss_mlp": 0.01028824, - "balance_loss_clip": 1.03368735, - "balance_loss_mlp": 1.01592588, - "epoch": 0.7665113482639411, - "flos": 16249542912000.0, - "grad_norm": 1.8433426874848031, - "language_loss": 0.69247651, - "learning_rate": 5.450201183674052e-07, - "loss": 0.7135095, - "num_input_tokens_seen": 274924650, - "step": 12749, - "time_per_iteration": 2.755204439163208 - }, - { - "auxiliary_loss_clip": 0.01101353, - "auxiliary_loss_mlp": 0.01030402, - "balance_loss_clip": 1.03804362, - "balance_loss_mlp": 1.01727748, - "epoch": 0.7665714715166091, - "flos": 27197880163200.0, - "grad_norm": 1.535641047844791, - "language_loss": 0.73516762, - "learning_rate": 5.447529291504967e-07, - "loss": 0.75648522, - "num_input_tokens_seen": 274944550, - "step": 12750, - "time_per_iteration": 2.7742607593536377 - }, - { - "auxiliary_loss_clip": 0.01097021, - "auxiliary_loss_mlp": 0.01031967, - "balance_loss_clip": 1.0379684, - "balance_loss_mlp": 1.02008176, - "epoch": 0.766631594769277, - "flos": 21067785279360.0, - "grad_norm": 2.3156427112447147, - "language_loss": 0.76064527, - "learning_rate": 5.444857951167026e-07, - "loss": 0.78193521, - "num_input_tokens_seen": 274961330, - "step": 12751, - "time_per_iteration": 2.730836868286133 - }, - { - "auxiliary_loss_clip": 0.01077429, - "auxiliary_loss_mlp": 0.01037666, - "balance_loss_clip": 1.03694248, - "balance_loss_mlp": 1.02451706, - "epoch": 0.766691718021945, - "flos": 24097963593600.0, - "grad_norm": 1.9738925392982969, - "language_loss": 0.6149745, - "learning_rate": 5.442187162761537e-07, - "loss": 0.63612545, - "num_input_tokens_seen": 274981655, - "step": 12752, - "time_per_iteration": 2.869851589202881 - }, - { - "auxiliary_loss_clip": 0.01102451, - "auxiliary_loss_mlp": 0.01036291, - "balance_loss_clip": 1.03904963, - "balance_loss_mlp": 1.02302337, - "epoch": 0.7667518412746129, - "flos": 23440654661760.0, - "grad_norm": 1.931365168470797, - "language_loss": 0.69503748, - "learning_rate": 5.439516926389767e-07, - "loss": 0.71642488, - "num_input_tokens_seen": 274999970, - "step": 12753, - "time_per_iteration": 2.7476491928100586 - }, - { - "auxiliary_loss_clip": 0.01101717, - "auxiliary_loss_mlp": 0.01036587, - "balance_loss_clip": 1.03879189, - "balance_loss_mlp": 1.02405787, - "epoch": 0.766811964527281, - "flos": 18148786536960.0, - "grad_norm": 2.611222297039761, - "language_loss": 0.62583512, - "learning_rate": 5.436847242152971e-07, - "loss": 0.64721823, - "num_input_tokens_seen": 275015805, - "step": 12754, - "time_per_iteration": 2.7371304035186768 - }, - { - "auxiliary_loss_clip": 0.01110914, - "auxiliary_loss_mlp": 0.01030173, - "balance_loss_clip": 1.03996325, - "balance_loss_mlp": 1.01831782, - "epoch": 0.7668720877799489, - "flos": 19536051657600.0, - "grad_norm": 2.549051131454572, - "language_loss": 0.80213803, - "learning_rate": 5.434178110152401e-07, - "loss": 0.82354891, - "num_input_tokens_seen": 275031810, - "step": 12755, - "time_per_iteration": 2.643878936767578 - }, - { - "auxiliary_loss_clip": 0.01110814, - "auxiliary_loss_mlp": 0.01030285, - "balance_loss_clip": 1.03913355, - "balance_loss_mlp": 1.01825666, - "epoch": 0.7669322110326169, - "flos": 22674320974080.0, - "grad_norm": 2.28671666205893, - "language_loss": 0.70240182, - "learning_rate": 5.431509530489242e-07, - "loss": 0.72381282, - "num_input_tokens_seen": 275049325, - "step": 12756, - "time_per_iteration": 2.666398763656616 - }, - { - "auxiliary_loss_clip": 0.01101033, - "auxiliary_loss_mlp": 0.0103684, - "balance_loss_clip": 1.03897476, - "balance_loss_mlp": 1.02491951, - "epoch": 0.7669923342852848, - "flos": 26469396432000.0, - "grad_norm": 1.5126125205867516, - "language_loss": 0.70042777, - "learning_rate": 5.428841503264706e-07, - "loss": 0.72180653, - "num_input_tokens_seen": 275070865, - "step": 12757, - "time_per_iteration": 2.9036061763763428 - }, - { - "auxiliary_loss_clip": 0.01090769, - "auxiliary_loss_mlp": 0.01039374, - "balance_loss_clip": 1.03925812, - "balance_loss_mlp": 1.02609968, - "epoch": 0.7670524575379528, - "flos": 22856136641280.0, - "grad_norm": 1.9623271762553347, - "language_loss": 0.76281571, - "learning_rate": 5.426174028579955e-07, - "loss": 0.7841171, - "num_input_tokens_seen": 275088015, - "step": 12758, - "time_per_iteration": 2.7477500438690186 - }, - { - "auxiliary_loss_clip": 0.0109864, - "auxiliary_loss_mlp": 0.01041128, - "balance_loss_clip": 1.03716195, - "balance_loss_mlp": 1.0282712, - "epoch": 0.7671125807906207, - "flos": 22452141398400.0, - "grad_norm": 1.933344061408033, - "language_loss": 0.76319116, - "learning_rate": 5.423507106536156e-07, - "loss": 0.78458881, - "num_input_tokens_seen": 275106975, - "step": 12759, - "time_per_iteration": 2.714374303817749 - }, - { - "auxiliary_loss_clip": 0.01087695, - "auxiliary_loss_mlp": 0.01028999, - "balance_loss_clip": 1.03469515, - "balance_loss_mlp": 1.0170604, - "epoch": 0.7671727040432887, - "flos": 35371543518720.0, - "grad_norm": 4.630848621895134, - "language_loss": 0.67929637, - "learning_rate": 5.420840737234425e-07, - "loss": 0.70046335, - "num_input_tokens_seen": 275129560, - "step": 12760, - "time_per_iteration": 2.7753570079803467 - }, - { - "auxiliary_loss_clip": 0.01089951, - "auxiliary_loss_mlp": 0.01034392, - "balance_loss_clip": 1.03797793, - "balance_loss_mlp": 1.02147603, - "epoch": 0.7672328272959568, - "flos": 22494947431680.0, - "grad_norm": 1.455109874708046, - "language_loss": 0.79299426, - "learning_rate": 5.418174920775871e-07, - "loss": 0.81423771, - "num_input_tokens_seen": 275151180, - "step": 12761, - "time_per_iteration": 2.7769343852996826 - }, - { - "auxiliary_loss_clip": 0.01085141, - "auxiliary_loss_mlp": 0.01035009, - "balance_loss_clip": 1.03607702, - "balance_loss_mlp": 1.022295, - "epoch": 0.7672929505486247, - "flos": 22815557251200.0, - "grad_norm": 18.920071863703896, - "language_loss": 0.66145515, - "learning_rate": 5.415509657261589e-07, - "loss": 0.68265665, - "num_input_tokens_seen": 275170605, - "step": 12762, - "time_per_iteration": 2.8406293392181396 - }, - { - "auxiliary_loss_clip": 0.01101121, - "auxiliary_loss_mlp": 0.01034249, - "balance_loss_clip": 1.03835821, - "balance_loss_mlp": 1.02105296, - "epoch": 0.7673530738012927, - "flos": 20338834671360.0, - "grad_norm": 1.6976408594267334, - "language_loss": 0.74313831, - "learning_rate": 5.412844946792639e-07, - "loss": 0.76449203, - "num_input_tokens_seen": 275188750, - "step": 12763, - "time_per_iteration": 2.6841235160827637 - }, - { - "auxiliary_loss_clip": 0.01088871, - "auxiliary_loss_mlp": 0.01033223, - "balance_loss_clip": 1.03973687, - "balance_loss_mlp": 1.02024698, - "epoch": 0.7674131970539606, - "flos": 34933576988160.0, - "grad_norm": 1.693482308646493, - "language_loss": 0.70655918, - "learning_rate": 5.410180789470067e-07, - "loss": 0.7277801, - "num_input_tokens_seen": 275211365, - "step": 12764, - "time_per_iteration": 2.821410894393921 - }, - { - "auxiliary_loss_clip": 0.01101312, - "auxiliary_loss_mlp": 0.01031147, - "balance_loss_clip": 1.03925323, - "balance_loss_mlp": 1.01875496, - "epoch": 0.7674733203066286, - "flos": 28328850766080.0, - "grad_norm": 1.8643050168393442, - "language_loss": 0.69511282, - "learning_rate": 5.40751718539491e-07, - "loss": 0.7164374, - "num_input_tokens_seen": 275231670, - "step": 12765, - "time_per_iteration": 2.7457258701324463 - }, - { - "auxiliary_loss_clip": 0.01081052, - "auxiliary_loss_mlp": 0.01029756, - "balance_loss_clip": 1.03556418, - "balance_loss_mlp": 1.01865792, - "epoch": 0.7675334435592965, - "flos": 16289727252480.0, - "grad_norm": 3.667092334043392, - "language_loss": 0.60817224, - "learning_rate": 5.404854134668162e-07, - "loss": 0.62928033, - "num_input_tokens_seen": 275249425, - "step": 12766, - "time_per_iteration": 2.6500067710876465 - }, - { - "auxiliary_loss_clip": 0.01001024, - "auxiliary_loss_mlp": 0.01013385, - "balance_loss_clip": 1.01323843, - "balance_loss_mlp": 1.01216352, - "epoch": 0.7675935668119646, - "flos": 64826232220800.0, - "grad_norm": 0.7347382071618644, - "language_loss": 0.60767788, - "learning_rate": 5.402191637390803e-07, - "loss": 0.62782198, - "num_input_tokens_seen": 275312485, - "step": 12767, - "time_per_iteration": 3.39412260055542 - }, - { - "auxiliary_loss_clip": 0.01089304, - "auxiliary_loss_mlp": 0.01027185, - "balance_loss_clip": 1.04006386, - "balance_loss_mlp": 1.01521647, - "epoch": 0.7676536900646325, - "flos": 22675398382080.0, - "grad_norm": 1.6451651301272818, - "language_loss": 0.69793016, - "learning_rate": 5.399529693663801e-07, - "loss": 0.71909499, - "num_input_tokens_seen": 275331680, - "step": 12768, - "time_per_iteration": 2.730433464050293 - }, - { - "auxiliary_loss_clip": 0.01106486, - "auxiliary_loss_mlp": 0.01036773, - "balance_loss_clip": 1.0407027, - "balance_loss_mlp": 1.0239104, - "epoch": 0.7677138133173005, - "flos": 26939682224640.0, - "grad_norm": 1.8343046170579347, - "language_loss": 0.71094149, - "learning_rate": 5.3968683035881e-07, - "loss": 0.73237407, - "num_input_tokens_seen": 275351615, - "step": 12769, - "time_per_iteration": 4.170667409896851 - }, - { - "auxiliary_loss_clip": 0.01103072, - "auxiliary_loss_mlp": 0.01029544, - "balance_loss_clip": 1.04003, - "balance_loss_mlp": 1.01668179, - "epoch": 0.7677739365699684, - "flos": 23799545400960.0, - "grad_norm": 1.983209153557694, - "language_loss": 0.80168104, - "learning_rate": 5.394207467264611e-07, - "loss": 0.82300717, - "num_input_tokens_seen": 275368815, - "step": 12770, - "time_per_iteration": 5.3567235469818115 - }, - { - "auxiliary_loss_clip": 0.01073219, - "auxiliary_loss_mlp": 0.01038788, - "balance_loss_clip": 1.03567314, - "balance_loss_mlp": 1.02632451, - "epoch": 0.7678340598226364, - "flos": 34455497944320.0, - "grad_norm": 1.6213929898270116, - "language_loss": 0.78927696, - "learning_rate": 5.391547184794245e-07, - "loss": 0.81039715, - "num_input_tokens_seen": 275389345, - "step": 12771, - "time_per_iteration": 4.329530954360962 - }, - { - "auxiliary_loss_clip": 0.01110874, - "auxiliary_loss_mlp": 0.01033957, - "balance_loss_clip": 1.03865027, - "balance_loss_mlp": 1.02205408, - "epoch": 0.7678941830753043, - "flos": 23841740903040.0, - "grad_norm": 1.3882460901064075, - "language_loss": 0.68299866, - "learning_rate": 5.388887456277876e-07, - "loss": 0.70444703, - "num_input_tokens_seen": 275411240, - "step": 12772, - "time_per_iteration": 2.6789863109588623 - }, - { - "auxiliary_loss_clip": 0.01095405, - "auxiliary_loss_mlp": 0.01027019, - "balance_loss_clip": 1.03676343, - "balance_loss_mlp": 1.01512742, - "epoch": 0.7679543063279723, - "flos": 25410929431680.0, - "grad_norm": 1.5084750243321292, - "language_loss": 0.73452669, - "learning_rate": 5.386228281816349e-07, - "loss": 0.75575089, - "num_input_tokens_seen": 275432010, - "step": 12773, - "time_per_iteration": 2.6992523670196533 - }, - { - "auxiliary_loss_clip": 0.01069552, - "auxiliary_loss_mlp": 0.01031097, - "balance_loss_clip": 1.03272963, - "balance_loss_mlp": 1.0193727, - "epoch": 0.7680144295806404, - "flos": 27962382257280.0, - "grad_norm": 1.681002895076516, - "language_loss": 0.81144333, - "learning_rate": 5.383569661510512e-07, - "loss": 0.83244979, - "num_input_tokens_seen": 275453710, - "step": 12774, - "time_per_iteration": 2.8317103385925293 - }, - { - "auxiliary_loss_clip": 0.01102442, - "auxiliary_loss_mlp": 0.00769635, - "balance_loss_clip": 1.04086018, - "balance_loss_mlp": 1.00017095, - "epoch": 0.7680745528333083, - "flos": 20412810731520.0, - "grad_norm": 1.7406217670940616, - "language_loss": 0.69881612, - "learning_rate": 5.380911595461177e-07, - "loss": 0.71753687, - "num_input_tokens_seen": 275472915, - "step": 12775, - "time_per_iteration": 2.6908600330352783 - }, - { - "auxiliary_loss_clip": 0.00994458, - "auxiliary_loss_mlp": 0.01000081, - "balance_loss_clip": 1.01208818, - "balance_loss_mlp": 0.99908555, - "epoch": 0.7681346760859763, - "flos": 68401103351040.0, - "grad_norm": 0.7006055087346096, - "language_loss": 0.5683471, - "learning_rate": 5.378254083769147e-07, - "loss": 0.58829248, - "num_input_tokens_seen": 275534785, - "step": 12776, - "time_per_iteration": 4.903045415878296 - }, - { - "auxiliary_loss_clip": 0.01097484, - "auxiliary_loss_mlp": 0.01038787, - "balance_loss_clip": 1.03686929, - "balance_loss_mlp": 1.02621067, - "epoch": 0.7681947993386442, - "flos": 21251468453760.0, - "grad_norm": 1.9522911810284198, - "language_loss": 0.73814118, - "learning_rate": 5.375597126535188e-07, - "loss": 0.75950396, - "num_input_tokens_seen": 275553205, - "step": 12777, - "time_per_iteration": 2.6122212409973145 - }, - { - "auxiliary_loss_clip": 0.01086003, - "auxiliary_loss_mlp": 0.01032363, - "balance_loss_clip": 1.04298782, - "balance_loss_mlp": 1.02055573, - "epoch": 0.7682549225913122, - "flos": 21397696721280.0, - "grad_norm": 2.745693545308853, - "language_loss": 0.70324051, - "learning_rate": 5.372940723860043e-07, - "loss": 0.72442418, - "num_input_tokens_seen": 275571490, - "step": 12778, - "time_per_iteration": 2.67712664604187 - }, - { - "auxiliary_loss_clip": 0.01097946, - "auxiliary_loss_mlp": 0.01036667, - "balance_loss_clip": 1.0395422, - "balance_loss_mlp": 1.02473378, - "epoch": 0.7683150458439801, - "flos": 23038921975680.0, - "grad_norm": 1.741525859100896, - "language_loss": 0.70140779, - "learning_rate": 5.37028487584446e-07, - "loss": 0.72275388, - "num_input_tokens_seen": 275589665, - "step": 12779, - "time_per_iteration": 2.699604034423828 - }, - { - "auxiliary_loss_clip": 0.01086473, - "auxiliary_loss_mlp": 0.01031094, - "balance_loss_clip": 1.03794789, - "balance_loss_mlp": 1.01829696, - "epoch": 0.7683751690966482, - "flos": 67332397996800.0, - "grad_norm": 9.13576096667177, - "language_loss": 0.58861399, - "learning_rate": 5.367629582589133e-07, - "loss": 0.60978961, - "num_input_tokens_seen": 275615605, - "step": 12780, - "time_per_iteration": 3.0669844150543213 - }, - { - "auxiliary_loss_clip": 0.01104147, - "auxiliary_loss_mlp": 0.01037402, - "balance_loss_clip": 1.03906894, - "balance_loss_mlp": 1.02291799, - "epoch": 0.7684352923493161, - "flos": 21798890703360.0, - "grad_norm": 1.8034337516792285, - "language_loss": 0.67968678, - "learning_rate": 5.364974844194759e-07, - "loss": 0.70110226, - "num_input_tokens_seen": 275634965, - "step": 12781, - "time_per_iteration": 2.651834726333618 - }, - { - "auxiliary_loss_clip": 0.01060523, - "auxiliary_loss_mlp": 0.01036749, - "balance_loss_clip": 1.03551328, - "balance_loss_mlp": 1.02461362, - "epoch": 0.7684954156019841, - "flos": 25847603072640.0, - "grad_norm": 1.4376609198163834, - "language_loss": 0.79309833, - "learning_rate": 5.362320660762016e-07, - "loss": 0.81407106, - "num_input_tokens_seen": 275655785, - "step": 12782, - "time_per_iteration": 2.847486972808838 - }, - { - "auxiliary_loss_clip": 0.01082383, - "auxiliary_loss_mlp": 0.0103233, - "balance_loss_clip": 1.03683078, - "balance_loss_mlp": 1.01938355, - "epoch": 0.768555538854652, - "flos": 25447378757760.0, - "grad_norm": 1.7564402643439623, - "language_loss": 0.67005706, - "learning_rate": 5.35966703239153e-07, - "loss": 0.69120419, - "num_input_tokens_seen": 275676160, - "step": 12783, - "time_per_iteration": 2.703382730484009 - }, - { - "auxiliary_loss_clip": 0.01090024, - "auxiliary_loss_mlp": 0.01032797, - "balance_loss_clip": 1.03791714, - "balance_loss_mlp": 1.01942182, - "epoch": 0.76861566210732, - "flos": 19646369303040.0, - "grad_norm": 1.6469852773745217, - "language_loss": 0.69382596, - "learning_rate": 5.357013959183938e-07, - "loss": 0.71505415, - "num_input_tokens_seen": 275695660, - "step": 12784, - "time_per_iteration": 2.704110860824585 - }, - { - "auxiliary_loss_clip": 0.01069442, - "auxiliary_loss_mlp": 0.01027242, - "balance_loss_clip": 1.03885603, - "balance_loss_mlp": 1.01570261, - "epoch": 0.7686757853599879, - "flos": 22419032037120.0, - "grad_norm": 1.8804976619771494, - "language_loss": 0.80312717, - "learning_rate": 5.354361441239843e-07, - "loss": 0.824094, - "num_input_tokens_seen": 275714025, - "step": 12785, - "time_per_iteration": 2.7998046875 - }, - { - "auxiliary_loss_clip": 0.0109676, - "auxiliary_loss_mlp": 0.01038542, - "balance_loss_clip": 1.03655457, - "balance_loss_mlp": 1.02337885, - "epoch": 0.768735908612656, - "flos": 47774262453120.0, - "grad_norm": 1.5387616772885826, - "language_loss": 0.77432472, - "learning_rate": 5.351709478659836e-07, - "loss": 0.79567772, - "num_input_tokens_seen": 275737300, - "step": 12786, - "time_per_iteration": 2.8903398513793945 - }, - { - "auxiliary_loss_clip": 0.01110354, - "auxiliary_loss_mlp": 0.01035321, - "balance_loss_clip": 1.03830373, - "balance_loss_mlp": 1.02295876, - "epoch": 0.7687960318653239, - "flos": 30263179000320.0, - "grad_norm": 1.918052748759356, - "language_loss": 0.58398765, - "learning_rate": 5.349058071544468e-07, - "loss": 0.60544437, - "num_input_tokens_seen": 275757895, - "step": 12787, - "time_per_iteration": 2.699540376663208 - }, - { - "auxiliary_loss_clip": 0.01082553, - "auxiliary_loss_mlp": 0.01032316, - "balance_loss_clip": 1.03361166, - "balance_loss_mlp": 1.01962042, - "epoch": 0.7688561551179919, - "flos": 19573434737280.0, - "grad_norm": 1.5809067798231773, - "language_loss": 0.76156747, - "learning_rate": 5.346407219994292e-07, - "loss": 0.78271621, - "num_input_tokens_seen": 275776745, - "step": 12788, - "time_per_iteration": 2.81557559967041 - }, - { - "auxiliary_loss_clip": 0.01071579, - "auxiliary_loss_mlp": 0.00770364, - "balance_loss_clip": 1.03880525, - "balance_loss_mlp": 1.00020683, - "epoch": 0.7689162783706599, - "flos": 22783776693120.0, - "grad_norm": 1.957956891358716, - "language_loss": 0.66906554, - "learning_rate": 5.343756924109821e-07, - "loss": 0.68748498, - "num_input_tokens_seen": 275797205, - "step": 12789, - "time_per_iteration": 2.8146092891693115 - }, - { - "auxiliary_loss_clip": 0.01090409, - "auxiliary_loss_mlp": 0.0103625, - "balance_loss_clip": 1.03680754, - "balance_loss_mlp": 1.02214777, - "epoch": 0.7689764016233278, - "flos": 34204195416960.0, - "grad_norm": 1.6643565512884475, - "language_loss": 0.68623877, - "learning_rate": 5.341107183991553e-07, - "loss": 0.70750535, - "num_input_tokens_seen": 275817935, - "step": 12790, - "time_per_iteration": 2.812708854675293 - }, - { - "auxiliary_loss_clip": 0.0108634, - "auxiliary_loss_mlp": 0.01032838, - "balance_loss_clip": 1.03740978, - "balance_loss_mlp": 1.01972485, - "epoch": 0.7690365248759958, - "flos": 17274469587840.0, - "grad_norm": 1.474038307182623, - "language_loss": 0.68689752, - "learning_rate": 5.338457999739969e-07, - "loss": 0.70808923, - "num_input_tokens_seen": 275837145, - "step": 12791, - "time_per_iteration": 2.7558822631835938 - }, - { - "auxiliary_loss_clip": 0.01097751, - "auxiliary_loss_mlp": 0.01036178, - "balance_loss_clip": 1.038535, - "balance_loss_mlp": 1.0244422, - "epoch": 0.7690966481286637, - "flos": 18223157646720.0, - "grad_norm": 2.037350378754986, - "language_loss": 0.79861724, - "learning_rate": 5.335809371455526e-07, - "loss": 0.81995654, - "num_input_tokens_seen": 275855705, - "step": 12792, - "time_per_iteration": 2.6373798847198486 - }, - { - "auxiliary_loss_clip": 0.01086002, - "auxiliary_loss_mlp": 0.00771512, - "balance_loss_clip": 1.04310513, - "balance_loss_mlp": 1.0003171, - "epoch": 0.7691567713813318, - "flos": 21537568281600.0, - "grad_norm": 1.8617627243354054, - "language_loss": 0.72776759, - "learning_rate": 5.333161299238673e-07, - "loss": 0.74634272, - "num_input_tokens_seen": 275873930, - "step": 12793, - "time_per_iteration": 2.8017160892486572 - }, - { - "auxiliary_loss_clip": 0.01074333, - "auxiliary_loss_mlp": 0.01036724, - "balance_loss_clip": 1.03909159, - "balance_loss_mlp": 1.02368283, - "epoch": 0.7692168946339997, - "flos": 39379999720320.0, - "grad_norm": 1.9633300130492255, - "language_loss": 0.63842422, - "learning_rate": 5.330513783189803e-07, - "loss": 0.65953475, - "num_input_tokens_seen": 275895895, - "step": 12794, - "time_per_iteration": 2.8763763904571533 - }, - { - "auxiliary_loss_clip": 0.01088067, - "auxiliary_loss_mlp": 0.01038769, - "balance_loss_clip": 1.03724957, - "balance_loss_mlp": 1.02609682, - "epoch": 0.7692770178866677, - "flos": 25009950931200.0, - "grad_norm": 1.537212991597864, - "language_loss": 0.76528752, - "learning_rate": 5.327866823409319e-07, - "loss": 0.78655589, - "num_input_tokens_seen": 275917825, - "step": 12795, - "time_per_iteration": 2.7116506099700928 - }, - { - "auxiliary_loss_clip": 0.01075556, - "auxiliary_loss_mlp": 0.01025881, - "balance_loss_clip": 1.03575516, - "balance_loss_mlp": 1.01325679, - "epoch": 0.7693371411393356, - "flos": 24716273333760.0, - "grad_norm": 1.8098665948309556, - "language_loss": 0.71871811, - "learning_rate": 5.325220419997601e-07, - "loss": 0.7397325, - "num_input_tokens_seen": 275937890, - "step": 12796, - "time_per_iteration": 2.770573139190674 - }, - { - "auxiliary_loss_clip": 0.01110769, - "auxiliary_loss_mlp": 0.01030191, - "balance_loss_clip": 1.03795838, - "balance_loss_mlp": 1.01753139, - "epoch": 0.7693972643920036, - "flos": 15924803028480.0, - "grad_norm": 1.8945883944315456, - "language_loss": 0.64692825, - "learning_rate": 5.32257457305499e-07, - "loss": 0.66833782, - "num_input_tokens_seen": 275954495, - "step": 12797, - "time_per_iteration": 2.597770929336548 - }, - { - "auxiliary_loss_clip": 0.01074194, - "auxiliary_loss_mlp": 0.01036918, - "balance_loss_clip": 1.03441215, - "balance_loss_mlp": 1.02261305, - "epoch": 0.7694573876446715, - "flos": 25405901527680.0, - "grad_norm": 2.104503388065538, - "language_loss": 0.91503501, - "learning_rate": 5.319929282681823e-07, - "loss": 0.93614614, - "num_input_tokens_seen": 275972395, - "step": 12798, - "time_per_iteration": 2.7857353687286377 - }, - { - "auxiliary_loss_clip": 0.01061452, - "auxiliary_loss_mlp": 0.01027349, - "balance_loss_clip": 1.03667367, - "balance_loss_mlp": 1.01509404, - "epoch": 0.7695175108973396, - "flos": 16654220513280.0, - "grad_norm": 1.8305644604969793, - "language_loss": 0.82303166, - "learning_rate": 5.317284548978418e-07, - "loss": 0.84391975, - "num_input_tokens_seen": 275989020, - "step": 12799, - "time_per_iteration": 2.7627201080322266 - }, - { - "auxiliary_loss_clip": 0.01057867, - "auxiliary_loss_mlp": 0.0102915, - "balance_loss_clip": 1.03739285, - "balance_loss_mlp": 1.01601338, - "epoch": 0.7695776341500075, - "flos": 13626520237440.0, - "grad_norm": 1.9375837310730932, - "language_loss": 0.7841835, - "learning_rate": 5.314640372045045e-07, - "loss": 0.80505365, - "num_input_tokens_seen": 276006525, - "step": 12800, - "time_per_iteration": 2.860802173614502 - }, - { - "auxiliary_loss_clip": 0.01094192, - "auxiliary_loss_mlp": 0.01029605, - "balance_loss_clip": 1.03736687, - "balance_loss_mlp": 1.01572347, - "epoch": 0.7696377574026755, - "flos": 24276690691200.0, - "grad_norm": 1.6551183463192032, - "language_loss": 0.83884531, - "learning_rate": 5.31199675198198e-07, - "loss": 0.86008328, - "num_input_tokens_seen": 276027130, - "step": 12801, - "time_per_iteration": 2.8100953102111816 - }, - { - "auxiliary_loss_clip": 0.0108893, - "auxiliary_loss_mlp": 0.01030665, - "balance_loss_clip": 1.03665733, - "balance_loss_mlp": 1.01778448, - "epoch": 0.7696978806553435, - "flos": 20923137210240.0, - "grad_norm": 2.4183621963241357, - "language_loss": 0.72267437, - "learning_rate": 5.30935368888947e-07, - "loss": 0.74387032, - "num_input_tokens_seen": 276045715, - "step": 12802, - "time_per_iteration": 2.716482639312744 - }, - { - "auxiliary_loss_clip": 0.0108354, - "auxiliary_loss_mlp": 0.01034672, - "balance_loss_clip": 1.0340662, - "balance_loss_mlp": 1.022048, - "epoch": 0.7697580039080114, - "flos": 22929609911040.0, - "grad_norm": 1.7224396030439215, - "language_loss": 0.75905406, - "learning_rate": 5.306711182867747e-07, - "loss": 0.78023618, - "num_input_tokens_seen": 276065375, - "step": 12803, - "time_per_iteration": 2.7502260208129883 - }, - { - "auxiliary_loss_clip": 0.01018092, - "auxiliary_loss_mlp": 0.01000358, - "balance_loss_clip": 1.01451325, - "balance_loss_mlp": 0.99920207, - "epoch": 0.7698181271606794, - "flos": 68717654933760.0, - "grad_norm": 0.7330583208910887, - "language_loss": 0.55806667, - "learning_rate": 5.304069234017001e-07, - "loss": 0.57825118, - "num_input_tokens_seen": 276131405, - "step": 12804, - "time_per_iteration": 3.3005380630493164 - }, - { - "auxiliary_loss_clip": 0.0101265, - "auxiliary_loss_mlp": 0.01002009, - "balance_loss_clip": 1.00900471, - "balance_loss_mlp": 1.00096023, - "epoch": 0.7698782504133473, - "flos": 67409716999680.0, - "grad_norm": 0.7614116720269231, - "language_loss": 0.54004955, - "learning_rate": 5.301427842437429e-07, - "loss": 0.56019616, - "num_input_tokens_seen": 276200755, - "step": 12805, - "time_per_iteration": 3.3900198936462402 - }, - { - "auxiliary_loss_clip": 0.0108001, - "auxiliary_loss_mlp": 0.01033208, - "balance_loss_clip": 1.03882051, - "balance_loss_mlp": 1.02053022, - "epoch": 0.7699383736660154, - "flos": 22488842119680.0, - "grad_norm": 1.986233467865104, - "language_loss": 0.73035413, - "learning_rate": 5.298787008229187e-07, - "loss": 0.7514863, - "num_input_tokens_seen": 276217880, - "step": 12806, - "time_per_iteration": 2.7341980934143066 - }, - { - "auxiliary_loss_clip": 0.01086866, - "auxiliary_loss_mlp": 0.01035537, - "balance_loss_clip": 1.03594339, - "balance_loss_mlp": 1.02238786, - "epoch": 0.7699984969186833, - "flos": 21539723097600.0, - "grad_norm": 2.048367090429927, - "language_loss": 0.75222588, - "learning_rate": 5.296146731492408e-07, - "loss": 0.7734499, - "num_input_tokens_seen": 276234810, - "step": 12807, - "time_per_iteration": 2.724539041519165 - }, - { - "auxiliary_loss_clip": 0.01106456, - "auxiliary_loss_mlp": 0.01031388, - "balance_loss_clip": 1.04034483, - "balance_loss_mlp": 1.01792347, - "epoch": 0.7700586201713513, - "flos": 21719096640000.0, - "grad_norm": 2.054947719033548, - "language_loss": 0.80061448, - "learning_rate": 5.293507012327218e-07, - "loss": 0.82199287, - "num_input_tokens_seen": 276252850, - "step": 12808, - "time_per_iteration": 4.215209722518921 - }, - { - "auxiliary_loss_clip": 0.01105023, - "auxiliary_loss_mlp": 0.01039739, - "balance_loss_clip": 1.03983986, - "balance_loss_mlp": 1.02620244, - "epoch": 0.7701187434240192, - "flos": 27856015107840.0, - "grad_norm": 2.2828692902230743, - "language_loss": 0.79191184, - "learning_rate": 5.290867850833718e-07, - "loss": 0.8133595, - "num_input_tokens_seen": 276272525, - "step": 12809, - "time_per_iteration": 4.67883825302124 - }, - { - "auxiliary_loss_clip": 0.01075128, - "auxiliary_loss_mlp": 0.01026317, - "balance_loss_clip": 1.03558159, - "balance_loss_mlp": 1.014974, - "epoch": 0.7701788666766872, - "flos": 28621307301120.0, - "grad_norm": 1.7126957543660224, - "language_loss": 0.70423043, - "learning_rate": 5.288229247111993e-07, - "loss": 0.72524494, - "num_input_tokens_seen": 276294210, - "step": 12810, - "time_per_iteration": 4.299976110458374 - }, - { - "auxiliary_loss_clip": 0.0108663, - "auxiliary_loss_mlp": 0.01043871, - "balance_loss_clip": 1.03548312, - "balance_loss_mlp": 1.02746737, - "epoch": 0.7702389899293551, - "flos": 14246446089600.0, - "grad_norm": 2.84512278280032, - "language_loss": 0.77875537, - "learning_rate": 5.285591201262079e-07, - "loss": 0.80006033, - "num_input_tokens_seen": 276310290, - "step": 12811, - "time_per_iteration": 2.792184352874756 - }, - { - "auxiliary_loss_clip": 0.01001395, - "auxiliary_loss_mlp": 0.01001171, - "balance_loss_clip": 1.00706363, - "balance_loss_mlp": 0.99988317, - "epoch": 0.7702991131820232, - "flos": 70574128439040.0, - "grad_norm": 0.8151907995721069, - "language_loss": 0.56650817, - "learning_rate": 5.28295371338402e-07, - "loss": 0.5865339, - "num_input_tokens_seen": 276371715, - "step": 12812, - "time_per_iteration": 3.301762819290161 - }, - { - "auxiliary_loss_clip": 0.01073584, - "auxiliary_loss_mlp": 0.01035613, - "balance_loss_clip": 1.03664494, - "balance_loss_mlp": 1.02299511, - "epoch": 0.7703592364346911, - "flos": 25480021242240.0, - "grad_norm": 3.4768581734180453, - "language_loss": 0.72098076, - "learning_rate": 5.280316783577836e-07, - "loss": 0.74207264, - "num_input_tokens_seen": 276389895, - "step": 12813, - "time_per_iteration": 2.8251900672912598 - }, - { - "auxiliary_loss_clip": 0.0110181, - "auxiliary_loss_mlp": 0.01030481, - "balance_loss_clip": 1.03734303, - "balance_loss_mlp": 1.01664054, - "epoch": 0.7704193596873591, - "flos": 19280906375040.0, - "grad_norm": 2.0063403023078297, - "language_loss": 0.66324687, - "learning_rate": 5.27768041194351e-07, - "loss": 0.68456984, - "num_input_tokens_seen": 276408990, - "step": 12814, - "time_per_iteration": 2.7897889614105225 - }, - { - "auxiliary_loss_clip": 0.01089036, - "auxiliary_loss_mlp": 0.01038036, - "balance_loss_clip": 1.03707969, - "balance_loss_mlp": 1.02553058, - "epoch": 0.7704794829400271, - "flos": 23658452778240.0, - "grad_norm": 1.8618896845056536, - "language_loss": 0.65574408, - "learning_rate": 5.275044598581018e-07, - "loss": 0.67701477, - "num_input_tokens_seen": 276428190, - "step": 12815, - "time_per_iteration": 2.745948314666748 - }, - { - "auxiliary_loss_clip": 0.0109967, - "auxiliary_loss_mlp": 0.01034412, - "balance_loss_clip": 1.03795624, - "balance_loss_mlp": 1.02119207, - "epoch": 0.770539606192695, - "flos": 18989311766400.0, - "grad_norm": 3.9090080450756703, - "language_loss": 0.65051812, - "learning_rate": 5.272409343590322e-07, - "loss": 0.67185891, - "num_input_tokens_seen": 276446855, - "step": 12816, - "time_per_iteration": 4.193779230117798 - }, - { - "auxiliary_loss_clip": 0.01102885, - "auxiliary_loss_mlp": 0.01034999, - "balance_loss_clip": 1.03968191, - "balance_loss_mlp": 1.02194536, - "epoch": 0.770599729445363, - "flos": 11830160142720.0, - "grad_norm": 2.3027657135701496, - "language_loss": 0.71589029, - "learning_rate": 5.26977464707133e-07, - "loss": 0.73726916, - "num_input_tokens_seen": 276462000, - "step": 12817, - "time_per_iteration": 2.701976776123047 - }, - { - "auxiliary_loss_clip": 0.01067462, - "auxiliary_loss_mlp": 0.01031755, - "balance_loss_clip": 1.03671288, - "balance_loss_mlp": 1.01967907, - "epoch": 0.770659852698031, - "flos": 17822610109440.0, - "grad_norm": 2.117205920773346, - "language_loss": 0.61316186, - "learning_rate": 5.267140509123957e-07, - "loss": 0.63415402, - "num_input_tokens_seen": 276481190, - "step": 12818, - "time_per_iteration": 2.894584894180298 - }, - { - "auxiliary_loss_clip": 0.01098817, - "auxiliary_loss_mlp": 0.01029481, - "balance_loss_clip": 1.03884339, - "balance_loss_mlp": 1.01770937, - "epoch": 0.770719975950699, - "flos": 21871968923520.0, - "grad_norm": 1.8092629622591248, - "language_loss": 0.67272353, - "learning_rate": 5.264506929848093e-07, - "loss": 0.69400644, - "num_input_tokens_seen": 276499520, - "step": 12819, - "time_per_iteration": 2.6729207038879395 - }, - { - "auxiliary_loss_clip": 0.01114198, - "auxiliary_loss_mlp": 0.01031273, - "balance_loss_clip": 1.04036117, - "balance_loss_mlp": 1.0183568, - "epoch": 0.7707800992033669, - "flos": 21325049464320.0, - "grad_norm": 3.8495844407525786, - "language_loss": 0.57512546, - "learning_rate": 5.261873909343608e-07, - "loss": 0.59658015, - "num_input_tokens_seen": 276519110, - "step": 12820, - "time_per_iteration": 2.6065587997436523 - }, - { - "auxiliary_loss_clip": 0.01082909, - "auxiliary_loss_mlp": 0.01030006, - "balance_loss_clip": 1.037233, - "balance_loss_mlp": 1.01698244, - "epoch": 0.7708402224560349, - "flos": 28179426188160.0, - "grad_norm": 2.6946227990391742, - "language_loss": 0.80718732, - "learning_rate": 5.259241447710343e-07, - "loss": 0.82831645, - "num_input_tokens_seen": 276538805, - "step": 12821, - "time_per_iteration": 2.7545745372772217 - }, - { - "auxiliary_loss_clip": 0.01113447, - "auxiliary_loss_mlp": 0.01036131, - "balance_loss_clip": 1.04009652, - "balance_loss_mlp": 1.02311945, - "epoch": 0.7709003457087028, - "flos": 15377057556480.0, - "grad_norm": 3.179311365273749, - "language_loss": 0.68571889, - "learning_rate": 5.256609545048114e-07, - "loss": 0.70721459, - "num_input_tokens_seen": 276554770, - "step": 12822, - "time_per_iteration": 2.6314475536346436 - }, - { - "auxiliary_loss_clip": 0.0108847, - "auxiliary_loss_mlp": 0.01036733, - "balance_loss_clip": 1.03697228, - "balance_loss_mlp": 1.02384686, - "epoch": 0.7709604689613708, - "flos": 30621854257920.0, - "grad_norm": 1.8530631240007662, - "language_loss": 0.72300768, - "learning_rate": 5.253978201456733e-07, - "loss": 0.74425972, - "num_input_tokens_seen": 276574535, - "step": 12823, - "time_per_iteration": 2.7124979496002197 - }, - { - "auxiliary_loss_clip": 0.01107629, - "auxiliary_loss_mlp": 0.01039791, - "balance_loss_clip": 1.04024911, - "balance_loss_mlp": 1.02459168, - "epoch": 0.7710205922140387, - "flos": 20301272023680.0, - "grad_norm": 1.7759548619058283, - "language_loss": 0.76394266, - "learning_rate": 5.251347417035969e-07, - "loss": 0.78541684, - "num_input_tokens_seen": 276592925, - "step": 12824, - "time_per_iteration": 2.7012369632720947 - }, - { - "auxiliary_loss_clip": 0.0108641, - "auxiliary_loss_mlp": 0.01031747, - "balance_loss_clip": 1.0379014, - "balance_loss_mlp": 1.01897967, - "epoch": 0.7710807154667068, - "flos": 19644214487040.0, - "grad_norm": 2.5594814083345856, - "language_loss": 0.72377741, - "learning_rate": 5.248717191885592e-07, - "loss": 0.744959, - "num_input_tokens_seen": 276610540, - "step": 12825, - "time_per_iteration": 2.711148977279663 - }, - { - "auxiliary_loss_clip": 0.0110825, - "auxiliary_loss_mlp": 0.01037397, - "balance_loss_clip": 1.03889346, - "balance_loss_mlp": 1.02650094, - "epoch": 0.7711408387193747, - "flos": 20006337450240.0, - "grad_norm": 1.6443549229743277, - "language_loss": 0.73782164, - "learning_rate": 5.246087526105343e-07, - "loss": 0.75927812, - "num_input_tokens_seen": 276629200, - "step": 12826, - "time_per_iteration": 2.6268928050994873 - }, - { - "auxiliary_loss_clip": 0.01112855, - "auxiliary_loss_mlp": 0.01035754, - "balance_loss_clip": 1.03778219, - "balance_loss_mlp": 1.02234912, - "epoch": 0.7712009619720427, - "flos": 24971131307520.0, - "grad_norm": 1.6817186914054845, - "language_loss": 0.81052697, - "learning_rate": 5.243458419794933e-07, - "loss": 0.83201313, - "num_input_tokens_seen": 276648655, - "step": 12827, - "time_per_iteration": 2.6236133575439453 - }, - { - "auxiliary_loss_clip": 0.01030504, - "auxiliary_loss_mlp": 0.01001401, - "balance_loss_clip": 1.0079608, - "balance_loss_mlp": 1.0003643, - "epoch": 0.7712610852247107, - "flos": 63249681404160.0, - "grad_norm": 0.8667997379462846, - "language_loss": 0.55184829, - "learning_rate": 5.240829873054051e-07, - "loss": 0.57216728, - "num_input_tokens_seen": 276716500, - "step": 12828, - "time_per_iteration": 3.314025640487671 - }, - { - "auxiliary_loss_clip": 0.01062789, - "auxiliary_loss_mlp": 0.01034135, - "balance_loss_clip": 1.03295088, - "balance_loss_mlp": 1.02165389, - "epoch": 0.7713212084773786, - "flos": 18697860812160.0, - "grad_norm": 1.7251497465168657, - "language_loss": 0.6980052, - "learning_rate": 5.23820188598238e-07, - "loss": 0.71897441, - "num_input_tokens_seen": 276733535, - "step": 12829, - "time_per_iteration": 2.7099921703338623 - }, - { - "auxiliary_loss_clip": 0.01085241, - "auxiliary_loss_mlp": 0.01036187, - "balance_loss_clip": 1.04121757, - "balance_loss_mlp": 1.02271688, - "epoch": 0.7713813317300466, - "flos": 14173367869440.0, - "grad_norm": 2.8210703511982, - "language_loss": 0.79999912, - "learning_rate": 5.235574458679579e-07, - "loss": 0.82121342, - "num_input_tokens_seen": 276749575, - "step": 12830, - "time_per_iteration": 2.754983901977539 - }, - { - "auxiliary_loss_clip": 0.01104042, - "auxiliary_loss_mlp": 0.01037065, - "balance_loss_clip": 1.03856182, - "balance_loss_mlp": 1.02329099, - "epoch": 0.7714414549827145, - "flos": 25703960584320.0, - "grad_norm": 1.661801317561211, - "language_loss": 0.77825183, - "learning_rate": 5.232947591245269e-07, - "loss": 0.79966295, - "num_input_tokens_seen": 276769460, - "step": 12831, - "time_per_iteration": 2.7142996788024902 - }, - { - "auxiliary_loss_clip": 0.01078302, - "auxiliary_loss_mlp": 0.01036061, - "balance_loss_clip": 1.0332458, - "balance_loss_mlp": 1.02210712, - "epoch": 0.7715015782353826, - "flos": 30555312312960.0, - "grad_norm": 1.5151652331679557, - "language_loss": 0.6105473, - "learning_rate": 5.230321283779071e-07, - "loss": 0.63169092, - "num_input_tokens_seen": 276790820, - "step": 12832, - "time_per_iteration": 2.717639684677124 - }, - { - "auxiliary_loss_clip": 0.01085655, - "auxiliary_loss_mlp": 0.01039371, - "balance_loss_clip": 1.03684115, - "balance_loss_mlp": 1.02620983, - "epoch": 0.7715617014880505, - "flos": 20229343038720.0, - "grad_norm": 1.801135841177815, - "language_loss": 0.79230422, - "learning_rate": 5.227695536380572e-07, - "loss": 0.81355441, - "num_input_tokens_seen": 276811345, - "step": 12833, - "time_per_iteration": 2.7320380210876465 - }, - { - "auxiliary_loss_clip": 0.00988976, - "auxiliary_loss_mlp": 0.01003321, - "balance_loss_clip": 1.00962079, - "balance_loss_mlp": 1.00185442, - "epoch": 0.7716218247407185, - "flos": 63664770971520.0, - "grad_norm": 0.8509203865481852, - "language_loss": 0.55384171, - "learning_rate": 5.22507034914933e-07, - "loss": 0.57376468, - "num_input_tokens_seen": 276870950, - "step": 12834, - "time_per_iteration": 3.2906105518341064 - }, - { - "auxiliary_loss_clip": 0.01065317, - "auxiliary_loss_mlp": 0.01033528, - "balance_loss_clip": 1.03433681, - "balance_loss_mlp": 1.019449, - "epoch": 0.7716819479933864, - "flos": 19791807471360.0, - "grad_norm": 2.0007244746658905, - "language_loss": 0.72596645, - "learning_rate": 5.222445722184903e-07, - "loss": 0.74695486, - "num_input_tokens_seen": 276890760, - "step": 12835, - "time_per_iteration": 2.789001941680908 - }, - { - "auxiliary_loss_clip": 0.01078061, - "auxiliary_loss_mlp": 0.00771412, - "balance_loss_clip": 1.03562582, - "balance_loss_mlp": 1.00025511, - "epoch": 0.7717420712460544, - "flos": 18442176825600.0, - "grad_norm": 1.8060607168740586, - "language_loss": 0.70171171, - "learning_rate": 5.219821655586814e-07, - "loss": 0.72020638, - "num_input_tokens_seen": 276909625, - "step": 12836, - "time_per_iteration": 2.728555917739868 - }, - { - "auxiliary_loss_clip": 0.01087588, - "auxiliary_loss_mlp": 0.01031496, - "balance_loss_clip": 1.03710699, - "balance_loss_mlp": 1.01896143, - "epoch": 0.7718021944987223, - "flos": 35189476456320.0, - "grad_norm": 1.7672669175991982, - "language_loss": 0.59498906, - "learning_rate": 5.217198149454575e-07, - "loss": 0.61617988, - "num_input_tokens_seen": 276930760, - "step": 12837, - "time_per_iteration": 2.771662712097168 - }, - { - "auxiliary_loss_clip": 0.01019463, - "auxiliary_loss_mlp": 0.01007255, - "balance_loss_clip": 1.0126214, - "balance_loss_mlp": 1.00599122, - "epoch": 0.7718623177513904, - "flos": 67923167961600.0, - "grad_norm": 0.860607802199013, - "language_loss": 0.55781054, - "learning_rate": 5.214575203887666e-07, - "loss": 0.57807767, - "num_input_tokens_seen": 276989580, - "step": 12838, - "time_per_iteration": 3.17033052444458 - }, - { - "auxiliary_loss_clip": 0.0110038, - "auxiliary_loss_mlp": 0.01028077, - "balance_loss_clip": 1.03804731, - "balance_loss_mlp": 1.01625776, - "epoch": 0.7719224410040583, - "flos": 18581401941120.0, - "grad_norm": 2.316418806228274, - "language_loss": 0.69647658, - "learning_rate": 5.211952818985538e-07, - "loss": 0.71776116, - "num_input_tokens_seen": 277005450, - "step": 12839, - "time_per_iteration": 2.645826578140259 - }, - { - "auxiliary_loss_clip": 0.01099944, - "auxiliary_loss_mlp": 0.01027637, - "balance_loss_clip": 1.03894663, - "balance_loss_mlp": 1.01572192, - "epoch": 0.7719825642567263, - "flos": 23075802264960.0, - "grad_norm": 1.8115476435749553, - "language_loss": 0.79911268, - "learning_rate": 5.209330994847647e-07, - "loss": 0.8203885, - "num_input_tokens_seen": 277023055, - "step": 12840, - "time_per_iteration": 2.706791400909424 - }, - { - "auxiliary_loss_clip": 0.0110078, - "auxiliary_loss_mlp": 0.00770822, - "balance_loss_clip": 1.03851485, - "balance_loss_mlp": 1.00014949, - "epoch": 0.7720426875093943, - "flos": 20339086066560.0, - "grad_norm": 2.545853908868313, - "language_loss": 0.80008757, - "learning_rate": 5.206709731573402e-07, - "loss": 0.81880367, - "num_input_tokens_seen": 277041150, - "step": 12841, - "time_per_iteration": 2.7192368507385254 - }, - { - "auxiliary_loss_clip": 0.01075766, - "auxiliary_loss_mlp": 0.01028708, - "balance_loss_clip": 1.03847384, - "balance_loss_mlp": 1.01574421, - "epoch": 0.7721028107620622, - "flos": 23880704181120.0, - "grad_norm": 1.5305578365970447, - "language_loss": 0.76161742, - "learning_rate": 5.204089029262208e-07, - "loss": 0.78266215, - "num_input_tokens_seen": 277063895, - "step": 12842, - "time_per_iteration": 2.7325236797332764 - }, - { - "auxiliary_loss_clip": 0.01059079, - "auxiliary_loss_mlp": 0.00771703, - "balance_loss_clip": 1.03726017, - "balance_loss_mlp": 1.0002687, - "epoch": 0.7721629340147302, - "flos": 26651571235200.0, - "grad_norm": 4.828495725379175, - "language_loss": 0.68726575, - "learning_rate": 5.201468888013445e-07, - "loss": 0.70557356, - "num_input_tokens_seen": 277084045, - "step": 12843, - "time_per_iteration": 2.81326961517334 - }, - { - "auxiliary_loss_clip": 0.01088182, - "auxiliary_loss_mlp": 0.01032978, - "balance_loss_clip": 1.03403521, - "balance_loss_mlp": 1.02059186, - "epoch": 0.7722230572673981, - "flos": 21178857110400.0, - "grad_norm": 3.7944489397426286, - "language_loss": 0.73675692, - "learning_rate": 5.198849307926465e-07, - "loss": 0.75796854, - "num_input_tokens_seen": 277102625, - "step": 12844, - "time_per_iteration": 2.660747766494751 - }, - { - "auxiliary_loss_clip": 0.0109532, - "auxiliary_loss_mlp": 0.01041057, - "balance_loss_clip": 1.03639054, - "balance_loss_mlp": 1.02721667, - "epoch": 0.7722831805200662, - "flos": 27964644814080.0, - "grad_norm": 1.567829696052933, - "language_loss": 0.71341336, - "learning_rate": 5.196230289100596e-07, - "loss": 0.73477709, - "num_input_tokens_seen": 277123210, - "step": 12845, - "time_per_iteration": 2.720493793487549 - }, - { - "auxiliary_loss_clip": 0.01109647, - "auxiliary_loss_mlp": 0.01032633, - "balance_loss_clip": 1.03851032, - "balance_loss_mlp": 1.02038407, - "epoch": 0.7723433037727341, - "flos": 33875576864640.0, - "grad_norm": 1.7586648256902582, - "language_loss": 0.64064783, - "learning_rate": 5.193611831635159e-07, - "loss": 0.66207063, - "num_input_tokens_seen": 277144895, - "step": 12846, - "time_per_iteration": 2.7434511184692383 - }, - { - "auxiliary_loss_clip": 0.0102204, - "auxiliary_loss_mlp": 0.00751187, - "balance_loss_clip": 1.0084672, - "balance_loss_mlp": 0.99961835, - "epoch": 0.7724034270254021, - "flos": 62848271940480.0, - "grad_norm": 0.7939383469798397, - "language_loss": 0.61696756, - "learning_rate": 5.19099393562945e-07, - "loss": 0.63469982, - "num_input_tokens_seen": 277205160, - "step": 12847, - "time_per_iteration": 3.1408584117889404 - }, - { - "auxiliary_loss_clip": 0.01109979, - "auxiliary_loss_mlp": 0.01027701, - "balance_loss_clip": 1.0360781, - "balance_loss_mlp": 1.01481414, - "epoch": 0.77246355027807, - "flos": 23295467888640.0, - "grad_norm": 2.7620733076627255, - "language_loss": 0.7912066, - "learning_rate": 5.188376601182732e-07, - "loss": 0.81258333, - "num_input_tokens_seen": 277223005, - "step": 12848, - "time_per_iteration": 5.833041191101074 - }, - { - "auxiliary_loss_clip": 0.01073036, - "auxiliary_loss_mlp": 0.01041471, - "balance_loss_clip": 1.03511548, - "balance_loss_mlp": 1.02746367, - "epoch": 0.772523673530738, - "flos": 20121287950080.0, - "grad_norm": 1.5824412187396433, - "language_loss": 0.72673213, - "learning_rate": 5.185759828394261e-07, - "loss": 0.74787724, - "num_input_tokens_seen": 277241785, - "step": 12849, - "time_per_iteration": 2.7188072204589844 - }, - { - "auxiliary_loss_clip": 0.01110027, - "auxiliary_loss_mlp": 0.01031983, - "balance_loss_clip": 1.03745866, - "balance_loss_mlp": 1.01899564, - "epoch": 0.7725837967834059, - "flos": 17820096157440.0, - "grad_norm": 2.4780134177178166, - "language_loss": 0.78607786, - "learning_rate": 5.183143617363261e-07, - "loss": 0.80749798, - "num_input_tokens_seen": 277259050, - "step": 12850, - "time_per_iteration": 4.190839529037476 - }, - { - "auxiliary_loss_clip": 0.01054122, - "auxiliary_loss_mlp": 0.00771579, - "balance_loss_clip": 1.03170514, - "balance_loss_mlp": 1.00020933, - "epoch": 0.772643920036074, - "flos": 27198921657600.0, - "grad_norm": 1.5628406207285341, - "language_loss": 0.80081898, - "learning_rate": 5.180527968188935e-07, - "loss": 0.819076, - "num_input_tokens_seen": 277278235, - "step": 12851, - "time_per_iteration": 2.8007707595825195 - }, - { - "auxiliary_loss_clip": 0.01097831, - "auxiliary_loss_mlp": 0.01027911, - "balance_loss_clip": 1.03627992, - "balance_loss_mlp": 1.01439285, - "epoch": 0.7727040432887419, - "flos": 21579512388480.0, - "grad_norm": 1.50165866044674, - "language_loss": 0.73771137, - "learning_rate": 5.177912880970474e-07, - "loss": 0.75896883, - "num_input_tokens_seen": 277298355, - "step": 12852, - "time_per_iteration": 2.640066146850586 - }, - { - "auxiliary_loss_clip": 0.01108862, - "auxiliary_loss_mlp": 0.01036354, - "balance_loss_clip": 1.0370307, - "balance_loss_mlp": 1.02388501, - "epoch": 0.7727641665414099, - "flos": 22236641752320.0, - "grad_norm": 1.9047864889104873, - "language_loss": 0.82604998, - "learning_rate": 5.17529835580704e-07, - "loss": 0.84750211, - "num_input_tokens_seen": 277316095, - "step": 12853, - "time_per_iteration": 2.6782071590423584 - }, - { - "auxiliary_loss_clip": 0.01028971, - "auxiliary_loss_mlp": 0.01000563, - "balance_loss_clip": 1.00643969, - "balance_loss_mlp": 0.99953192, - "epoch": 0.7728242897940779, - "flos": 54832221463680.0, - "grad_norm": 0.7951405489665233, - "language_loss": 0.54508865, - "learning_rate": 5.172684392797786e-07, - "loss": 0.56538397, - "num_input_tokens_seen": 277380130, - "step": 12854, - "time_per_iteration": 3.2313177585601807 - }, - { - "auxiliary_loss_clip": 0.01102068, - "auxiliary_loss_mlp": 0.01032288, - "balance_loss_clip": 1.03806114, - "balance_loss_mlp": 1.01808441, - "epoch": 0.7728844130467458, - "flos": 34461962392320.0, - "grad_norm": 1.5042786507697257, - "language_loss": 0.71595842, - "learning_rate": 5.170070992041826e-07, - "loss": 0.73730195, - "num_input_tokens_seen": 277404015, - "step": 12855, - "time_per_iteration": 4.29422926902771 - }, - { - "auxiliary_loss_clip": 0.01111402, - "auxiliary_loss_mlp": 0.01031298, - "balance_loss_clip": 1.03859937, - "balance_loss_mlp": 1.01755357, - "epoch": 0.7729445362994138, - "flos": 18916341287040.0, - "grad_norm": 1.8322894078322527, - "language_loss": 0.68102384, - "learning_rate": 5.167458153638254e-07, - "loss": 0.70245087, - "num_input_tokens_seen": 277421375, - "step": 12856, - "time_per_iteration": 2.6372880935668945 - }, - { - "auxiliary_loss_clip": 0.010814, - "auxiliary_loss_mlp": 0.01035586, - "balance_loss_clip": 1.03660607, - "balance_loss_mlp": 1.02275896, - "epoch": 0.7730046595520818, - "flos": 22200048771840.0, - "grad_norm": 1.6258522353598035, - "language_loss": 0.79057026, - "learning_rate": 5.164845877686162e-07, - "loss": 0.81174016, - "num_input_tokens_seen": 277440170, - "step": 12857, - "time_per_iteration": 2.796715021133423 - }, - { - "auxiliary_loss_clip": 0.01063249, - "auxiliary_loss_mlp": 0.00770001, - "balance_loss_clip": 1.04108429, - "balance_loss_mlp": 1.00020409, - "epoch": 0.7730647828047498, - "flos": 13552328695680.0, - "grad_norm": 1.8401408925492355, - "language_loss": 0.78711581, - "learning_rate": 5.162234164284591e-07, - "loss": 0.80544829, - "num_input_tokens_seen": 277456880, - "step": 12858, - "time_per_iteration": 2.8125572204589844 - }, - { - "auxiliary_loss_clip": 0.01112062, - "auxiliary_loss_mlp": 0.01031889, - "balance_loss_clip": 1.03837538, - "balance_loss_mlp": 1.0190742, - "epoch": 0.7731249060574177, - "flos": 21976037602560.0, - "grad_norm": 1.938091007163787, - "language_loss": 0.77033961, - "learning_rate": 5.159623013532591e-07, - "loss": 0.7917791, - "num_input_tokens_seen": 277475365, - "step": 12859, - "time_per_iteration": 2.659550428390503 - }, - { - "auxiliary_loss_clip": 0.0109902, - "auxiliary_loss_mlp": 0.01030466, - "balance_loss_clip": 1.04030442, - "balance_loss_mlp": 1.01920676, - "epoch": 0.7731850293100857, - "flos": 22601817371520.0, - "grad_norm": 1.3916188047238045, - "language_loss": 0.67878425, - "learning_rate": 5.157012425529186e-07, - "loss": 0.7000792, - "num_input_tokens_seen": 277494975, - "step": 12860, - "time_per_iteration": 2.8458962440490723 - }, - { - "auxiliary_loss_clip": 0.01114237, - "auxiliary_loss_mlp": 0.01038751, - "balance_loss_clip": 1.03815317, - "balance_loss_mlp": 1.02510166, - "epoch": 0.7732451525627536, - "flos": 14098422142080.0, - "grad_norm": 2.3344978091609656, - "language_loss": 0.74838078, - "learning_rate": 5.154402400373343e-07, - "loss": 0.76991069, - "num_input_tokens_seen": 277510520, - "step": 12861, - "time_per_iteration": 2.5893940925598145 - }, - { - "auxiliary_loss_clip": 0.01105983, - "auxiliary_loss_mlp": 0.01031912, - "balance_loss_clip": 1.04054725, - "balance_loss_mlp": 1.01798797, - "epoch": 0.7733052758154216, - "flos": 21470020755840.0, - "grad_norm": 2.1487952861807558, - "language_loss": 0.74759662, - "learning_rate": 5.15179293816405e-07, - "loss": 0.7689755, - "num_input_tokens_seen": 277530505, - "step": 12862, - "time_per_iteration": 2.7624194622039795 - }, - { - "auxiliary_loss_clip": 0.01064299, - "auxiliary_loss_mlp": 0.01032266, - "balance_loss_clip": 1.03402948, - "balance_loss_mlp": 1.02048767, - "epoch": 0.7733653990680895, - "flos": 21394284929280.0, - "grad_norm": 1.5250392948978249, - "language_loss": 0.83059877, - "learning_rate": 5.149184039000256e-07, - "loss": 0.85156441, - "num_input_tokens_seen": 277550810, - "step": 12863, - "time_per_iteration": 2.771484851837158 - }, - { - "auxiliary_loss_clip": 0.01110135, - "auxiliary_loss_mlp": 0.01033251, - "balance_loss_clip": 1.03735471, - "balance_loss_mlp": 1.02050209, - "epoch": 0.7734255223207576, - "flos": 17676058619520.0, - "grad_norm": 1.7056890510124847, - "language_loss": 0.73495519, - "learning_rate": 5.146575702980898e-07, - "loss": 0.75638908, - "num_input_tokens_seen": 277567680, - "step": 12864, - "time_per_iteration": 2.6594743728637695 - }, - { - "auxiliary_loss_clip": 0.01089331, - "auxiliary_loss_mlp": 0.01031855, - "balance_loss_clip": 1.03545022, - "balance_loss_mlp": 1.0199455, - "epoch": 0.7734856455734255, - "flos": 25230837617280.0, - "grad_norm": 1.592544393546876, - "language_loss": 0.8264727, - "learning_rate": 5.143967930204871e-07, - "loss": 0.84768456, - "num_input_tokens_seen": 277588970, - "step": 12865, - "time_per_iteration": 2.7463982105255127 - }, - { - "auxiliary_loss_clip": 0.01116112, - "auxiliary_loss_mlp": 0.01033209, - "balance_loss_clip": 1.04054976, - "balance_loss_mlp": 1.01934528, - "epoch": 0.7735457688260935, - "flos": 23433112805760.0, - "grad_norm": 2.1106851031269365, - "language_loss": 0.72128093, - "learning_rate": 5.141360720771077e-07, - "loss": 0.74277413, - "num_input_tokens_seen": 277605450, - "step": 12866, - "time_per_iteration": 2.574566125869751 - }, - { - "auxiliary_loss_clip": 0.01069034, - "auxiliary_loss_mlp": 0.00770892, - "balance_loss_clip": 1.03813267, - "balance_loss_mlp": 1.00030208, - "epoch": 0.7736058920787615, - "flos": 18729246320640.0, - "grad_norm": 3.2060196397051444, - "language_loss": 0.64442635, - "learning_rate": 5.138754074778371e-07, - "loss": 0.66282552, - "num_input_tokens_seen": 277622530, - "step": 12867, - "time_per_iteration": 2.701490879058838 - }, - { - "auxiliary_loss_clip": 0.01098529, - "auxiliary_loss_mlp": 0.01037441, - "balance_loss_clip": 1.03714955, - "balance_loss_mlp": 1.02506101, - "epoch": 0.7736660153314294, - "flos": 22893304239360.0, - "grad_norm": 1.5193783690331675, - "language_loss": 0.71179724, - "learning_rate": 5.136147992325595e-07, - "loss": 0.73315698, - "num_input_tokens_seen": 277642700, - "step": 12868, - "time_per_iteration": 2.6771240234375 - }, - { - "auxiliary_loss_clip": 0.01105128, - "auxiliary_loss_mlp": 0.01031721, - "balance_loss_clip": 1.04049754, - "balance_loss_mlp": 1.01892424, - "epoch": 0.7737261385840974, - "flos": 13800901789440.0, - "grad_norm": 2.0821303548121284, - "language_loss": 0.77995592, - "learning_rate": 5.133542473511578e-07, - "loss": 0.80132443, - "num_input_tokens_seen": 277660005, - "step": 12869, - "time_per_iteration": 2.6456408500671387 - }, - { - "auxiliary_loss_clip": 0.01097602, - "auxiliary_loss_mlp": 0.01027939, - "balance_loss_clip": 1.03767705, - "balance_loss_mlp": 1.01517785, - "epoch": 0.7737862618367654, - "flos": 28730727106560.0, - "grad_norm": 1.7351593875890767, - "language_loss": 0.73740292, - "learning_rate": 5.130937518435124e-07, - "loss": 0.75865841, - "num_input_tokens_seen": 277682890, - "step": 12870, - "time_per_iteration": 2.670896530151367 - }, - { - "auxiliary_loss_clip": 0.01102985, - "auxiliary_loss_mlp": 0.01032417, - "balance_loss_clip": 1.03815126, - "balance_loss_mlp": 1.01947141, - "epoch": 0.7738463850894334, - "flos": 17018570119680.0, - "grad_norm": 1.9332947968793013, - "language_loss": 0.76220596, - "learning_rate": 5.12833312719501e-07, - "loss": 0.78355992, - "num_input_tokens_seen": 277699330, - "step": 12871, - "time_per_iteration": 2.5998897552490234 - }, - { - "auxiliary_loss_clip": 0.0108707, - "auxiliary_loss_mlp": 0.01035167, - "balance_loss_clip": 1.03574061, - "balance_loss_mlp": 1.02281117, - "epoch": 0.7739065083421013, - "flos": 20704010290560.0, - "grad_norm": 2.0007285261409407, - "language_loss": 0.69219184, - "learning_rate": 5.12572929988999e-07, - "loss": 0.71341425, - "num_input_tokens_seen": 277718750, - "step": 12872, - "time_per_iteration": 2.673105478286743 - }, - { - "auxiliary_loss_clip": 0.01111983, - "auxiliary_loss_mlp": 0.01031736, - "balance_loss_clip": 1.03831863, - "balance_loss_mlp": 1.01781273, - "epoch": 0.7739666315947693, - "flos": 20697222620160.0, - "grad_norm": 2.4536948806528502, - "language_loss": 0.85142237, - "learning_rate": 5.123126036618804e-07, - "loss": 0.8728596, - "num_input_tokens_seen": 277734645, - "step": 12873, - "time_per_iteration": 2.590299606323242 - }, - { - "auxiliary_loss_clip": 0.01115241, - "auxiliary_loss_mlp": 0.01037294, - "balance_loss_clip": 1.04048181, - "balance_loss_mlp": 1.02497935, - "epoch": 0.7740267548474372, - "flos": 29570677718400.0, - "grad_norm": 2.480997222503817, - "language_loss": 0.65266359, - "learning_rate": 5.120523337480174e-07, - "loss": 0.67418897, - "num_input_tokens_seen": 277755535, - "step": 12874, - "time_per_iteration": 2.6324357986450195 - }, - { - "auxiliary_loss_clip": 0.01072577, - "auxiliary_loss_mlp": 0.01031243, - "balance_loss_clip": 1.0420754, - "balance_loss_mlp": 1.01826084, - "epoch": 0.7740868781001052, - "flos": 23659099223040.0, - "grad_norm": 1.5630841905142332, - "language_loss": 0.62254053, - "learning_rate": 5.117921202572785e-07, - "loss": 0.64357871, - "num_input_tokens_seen": 277775585, - "step": 12875, - "time_per_iteration": 2.7664403915405273 - }, - { - "auxiliary_loss_clip": 0.0110217, - "auxiliary_loss_mlp": 0.0103118, - "balance_loss_clip": 1.0375613, - "balance_loss_mlp": 1.01843607, - "epoch": 0.7741470013527731, - "flos": 24717314828160.0, - "grad_norm": 2.655709255641646, - "language_loss": 0.65554249, - "learning_rate": 5.115319631995318e-07, - "loss": 0.67687607, - "num_input_tokens_seen": 277794795, - "step": 12876, - "time_per_iteration": 2.696556806564331 - }, - { - "auxiliary_loss_clip": 0.01082571, - "auxiliary_loss_mlp": 0.01036668, - "balance_loss_clip": 1.03536308, - "balance_loss_mlp": 1.02387714, - "epoch": 0.7742071246054412, - "flos": 21871645701120.0, - "grad_norm": 1.869396409074905, - "language_loss": 0.71216834, - "learning_rate": 5.112718625846433e-07, - "loss": 0.73336065, - "num_input_tokens_seen": 277813235, - "step": 12877, - "time_per_iteration": 2.692688465118408 - }, - { - "auxiliary_loss_clip": 0.01073259, - "auxiliary_loss_mlp": 0.01040102, - "balance_loss_clip": 1.03579319, - "balance_loss_mlp": 1.02468836, - "epoch": 0.7742672478581091, - "flos": 22674249146880.0, - "grad_norm": 1.8081756921528234, - "language_loss": 0.82974255, - "learning_rate": 5.110118184224736e-07, - "loss": 0.85087615, - "num_input_tokens_seen": 277832560, - "step": 12878, - "time_per_iteration": 2.7693746089935303 - }, - { - "auxiliary_loss_clip": 0.01091515, - "auxiliary_loss_mlp": 0.01034091, - "balance_loss_clip": 1.03874159, - "balance_loss_mlp": 1.0199523, - "epoch": 0.7743273711107771, - "flos": 18840892769280.0, - "grad_norm": 1.713941118960012, - "language_loss": 0.73144233, - "learning_rate": 5.10751830722885e-07, - "loss": 0.75269836, - "num_input_tokens_seen": 277850120, - "step": 12879, - "time_per_iteration": 2.6757094860076904 - }, - { - "auxiliary_loss_clip": 0.0108601, - "auxiliary_loss_mlp": 0.01027949, - "balance_loss_clip": 1.03621507, - "balance_loss_mlp": 1.01507425, - "epoch": 0.7743874943634451, - "flos": 28729326476160.0, - "grad_norm": 1.9120090925944704, - "language_loss": 0.79831159, - "learning_rate": 5.104918994957364e-07, - "loss": 0.81945121, - "num_input_tokens_seen": 277871020, - "step": 12880, - "time_per_iteration": 2.8304030895233154 - }, - { - "auxiliary_loss_clip": 0.01087192, - "auxiliary_loss_mlp": 0.01037799, - "balance_loss_clip": 1.03749204, - "balance_loss_mlp": 1.02506709, - "epoch": 0.774447617616113, - "flos": 21909639312000.0, - "grad_norm": 1.5834670208699275, - "language_loss": 0.70202577, - "learning_rate": 5.102320247508847e-07, - "loss": 0.72327566, - "num_input_tokens_seen": 277891525, - "step": 12881, - "time_per_iteration": 2.7064766883850098 - }, - { - "auxiliary_loss_clip": 0.01091686, - "auxiliary_loss_mlp": 0.01043391, - "balance_loss_clip": 1.03600717, - "balance_loss_mlp": 1.02921081, - "epoch": 0.774507740868781, - "flos": 19500643825920.0, - "grad_norm": 1.9376715027667266, - "language_loss": 0.84492528, - "learning_rate": 5.099722064981832e-07, - "loss": 0.86627603, - "num_input_tokens_seen": 277910425, - "step": 12882, - "time_per_iteration": 2.704357862472534 - }, - { - "auxiliary_loss_clip": 0.01002891, - "auxiliary_loss_mlp": 0.01007527, - "balance_loss_clip": 1.01538849, - "balance_loss_mlp": 1.00624514, - "epoch": 0.774567864121449, - "flos": 59426560402560.0, - "grad_norm": 0.7677682887225041, - "language_loss": 0.60380936, - "learning_rate": 5.097124447474858e-07, - "loss": 0.62391353, - "num_input_tokens_seen": 277972795, - "step": 12883, - "time_per_iteration": 3.2393903732299805 - }, - { - "auxiliary_loss_clip": 0.01064866, - "auxiliary_loss_mlp": 0.0103875, - "balance_loss_clip": 1.03618407, - "balance_loss_mlp": 1.023646, - "epoch": 0.774627987374117, - "flos": 13225326255360.0, - "grad_norm": 6.057542406739813, - "language_loss": 0.72638834, - "learning_rate": 5.094527395086416e-07, - "loss": 0.7474246, - "num_input_tokens_seen": 277990675, - "step": 12884, - "time_per_iteration": 2.798553705215454 - }, - { - "auxiliary_loss_clip": 0.01100426, - "auxiliary_loss_mlp": 0.01035391, - "balance_loss_clip": 1.03860021, - "balance_loss_mlp": 1.0236789, - "epoch": 0.7746881106267849, - "flos": 21394033534080.0, - "grad_norm": 1.4931379605931039, - "language_loss": 0.8105005, - "learning_rate": 5.091930907914986e-07, - "loss": 0.83185869, - "num_input_tokens_seen": 278010050, - "step": 12885, - "time_per_iteration": 2.638674736022949 - }, - { - "auxiliary_loss_clip": 0.01108511, - "auxiliary_loss_mlp": 0.01036987, - "balance_loss_clip": 1.03706241, - "balance_loss_mlp": 1.0250479, - "epoch": 0.7747482338794529, - "flos": 25629338079360.0, - "grad_norm": 1.712628084719396, - "language_loss": 0.63937521, - "learning_rate": 5.089334986059029e-07, - "loss": 0.6608302, - "num_input_tokens_seen": 278030660, - "step": 12886, - "time_per_iteration": 2.65639328956604 - }, - { - "auxiliary_loss_clip": 0.01072173, - "auxiliary_loss_mlp": 0.0103036, - "balance_loss_clip": 1.03371465, - "balance_loss_mlp": 1.01826668, - "epoch": 0.7748083571321208, - "flos": 11546933402880.0, - "grad_norm": 1.8883319339128437, - "language_loss": 0.69462442, - "learning_rate": 5.086739629616987e-07, - "loss": 0.71564978, - "num_input_tokens_seen": 278047645, - "step": 12887, - "time_per_iteration": 4.30758261680603 - }, - { - "auxiliary_loss_clip": 0.01100015, - "auxiliary_loss_mlp": 0.0103293, - "balance_loss_clip": 1.03749061, - "balance_loss_mlp": 1.02090144, - "epoch": 0.7748684803847888, - "flos": 19062425900160.0, - "grad_norm": 1.702042840830708, - "language_loss": 0.70615542, - "learning_rate": 5.084144838687275e-07, - "loss": 0.72748482, - "num_input_tokens_seen": 278066170, - "step": 12888, - "time_per_iteration": 2.681607246398926 - }, - { - "auxiliary_loss_clip": 0.01101783, - "auxiliary_loss_mlp": 0.01034032, - "balance_loss_clip": 1.03678536, - "balance_loss_mlp": 1.02094269, - "epoch": 0.7749286036374567, - "flos": 22273162905600.0, - "grad_norm": 1.6866747197007421, - "language_loss": 0.8189441, - "learning_rate": 5.081550613368279e-07, - "loss": 0.84030223, - "num_input_tokens_seen": 278085545, - "step": 12889, - "time_per_iteration": 4.1007890701293945 - }, - { - "auxiliary_loss_clip": 0.0107657, - "auxiliary_loss_mlp": 0.01028595, - "balance_loss_clip": 1.03708053, - "balance_loss_mlp": 1.01628113, - "epoch": 0.7749887268901248, - "flos": 20192462749440.0, - "grad_norm": 2.1944312112845057, - "language_loss": 0.79288089, - "learning_rate": 5.07895695375838e-07, - "loss": 0.81393254, - "num_input_tokens_seen": 278102995, - "step": 12890, - "time_per_iteration": 2.8066084384918213 - }, - { - "auxiliary_loss_clip": 0.01084496, - "auxiliary_loss_mlp": 0.01034255, - "balance_loss_clip": 1.03861511, - "balance_loss_mlp": 1.02098715, - "epoch": 0.7750488501427927, - "flos": 20337541781760.0, - "grad_norm": 1.9334241832657861, - "language_loss": 0.66675818, - "learning_rate": 5.076363859955932e-07, - "loss": 0.68794572, - "num_input_tokens_seen": 278121460, - "step": 12891, - "time_per_iteration": 2.7070491313934326 - }, - { - "auxiliary_loss_clip": 0.01100079, - "auxiliary_loss_mlp": 0.01033227, - "balance_loss_clip": 1.03662086, - "balance_loss_mlp": 1.02079916, - "epoch": 0.7751089733954607, - "flos": 28364043116160.0, - "grad_norm": 1.6084014662033723, - "language_loss": 0.78700238, - "learning_rate": 5.073771332059257e-07, - "loss": 0.80833542, - "num_input_tokens_seen": 278143905, - "step": 12892, - "time_per_iteration": 2.6891307830810547 - }, - { - "auxiliary_loss_clip": 0.01105106, - "auxiliary_loss_mlp": 0.01029124, - "balance_loss_clip": 1.04138756, - "balance_loss_mlp": 1.01607716, - "epoch": 0.7751690966481286, - "flos": 16943803960320.0, - "grad_norm": 2.167077484645157, - "language_loss": 0.67164677, - "learning_rate": 5.071179370166669e-07, - "loss": 0.69298911, - "num_input_tokens_seen": 278160850, - "step": 12893, - "time_per_iteration": 2.6599507331848145 - }, - { - "auxiliary_loss_clip": 0.01022351, - "auxiliary_loss_mlp": 0.01001788, - "balance_loss_clip": 1.00947237, - "balance_loss_mlp": 1.00071514, - "epoch": 0.7752292199007966, - "flos": 65668050339840.0, - "grad_norm": 0.8059900442079823, - "language_loss": 0.58441579, - "learning_rate": 5.068587974376468e-07, - "loss": 0.60465717, - "num_input_tokens_seen": 278219950, - "step": 12894, - "time_per_iteration": 4.591580629348755 - }, - { - "auxiliary_loss_clip": 0.01093145, - "auxiliary_loss_mlp": 0.01033525, - "balance_loss_clip": 1.03960991, - "balance_loss_mlp": 1.02001882, - "epoch": 0.7752893431534646, - "flos": 20594662312320.0, - "grad_norm": 2.026677697607631, - "language_loss": 0.77940953, - "learning_rate": 5.065997144786895e-07, - "loss": 0.80067623, - "num_input_tokens_seen": 278237805, - "step": 12895, - "time_per_iteration": 2.550419807434082 - }, - { - "auxiliary_loss_clip": 0.01070115, - "auxiliary_loss_mlp": 0.01035434, - "balance_loss_clip": 1.03553057, - "balance_loss_mlp": 1.02099133, - "epoch": 0.7753494664061326, - "flos": 20485350247680.0, - "grad_norm": 1.9545538067157624, - "language_loss": 0.67606688, - "learning_rate": 5.063406881496209e-07, - "loss": 0.69712234, - "num_input_tokens_seen": 278257660, - "step": 12896, - "time_per_iteration": 2.573294162750244 - }, - { - "auxiliary_loss_clip": 0.01086749, - "auxiliary_loss_mlp": 0.01040132, - "balance_loss_clip": 1.03621519, - "balance_loss_mlp": 1.02843189, - "epoch": 0.7754095896588006, - "flos": 20265900105600.0, - "grad_norm": 1.6654676924809417, - "language_loss": 0.6842171, - "learning_rate": 5.060817184602629e-07, - "loss": 0.70548594, - "num_input_tokens_seen": 278275110, - "step": 12897, - "time_per_iteration": 2.646030902862549 - }, - { - "auxiliary_loss_clip": 0.0111523, - "auxiliary_loss_mlp": 0.01041854, - "balance_loss_clip": 1.04096043, - "balance_loss_mlp": 1.02774525, - "epoch": 0.7754697129114685, - "flos": 23331091201920.0, - "grad_norm": 1.6795213586563635, - "language_loss": 0.75452977, - "learning_rate": 5.058228054204364e-07, - "loss": 0.77610064, - "num_input_tokens_seen": 278293035, - "step": 12898, - "time_per_iteration": 2.589974880218506 - }, - { - "auxiliary_loss_clip": 0.01101527, - "auxiliary_loss_mlp": 0.00771705, - "balance_loss_clip": 1.0381062, - "balance_loss_mlp": 1.00029922, - "epoch": 0.7755298361641365, - "flos": 17347619635200.0, - "grad_norm": 1.834394412240628, - "language_loss": 0.70020843, - "learning_rate": 5.055639490399588e-07, - "loss": 0.71894073, - "num_input_tokens_seen": 278311010, - "step": 12899, - "time_per_iteration": 2.569342851638794 - }, - { - "auxiliary_loss_clip": 0.01076575, - "auxiliary_loss_mlp": 0.01037603, - "balance_loss_clip": 1.03510606, - "balance_loss_mlp": 1.02406061, - "epoch": 0.7755899594168044, - "flos": 19645866512640.0, - "grad_norm": 2.136951327661946, - "language_loss": 0.7508406, - "learning_rate": 5.053051493286453e-07, - "loss": 0.77198243, - "num_input_tokens_seen": 278329900, - "step": 12900, - "time_per_iteration": 2.6928303241729736 - }, - { - "auxiliary_loss_clip": 0.01093277, - "auxiliary_loss_mlp": 0.01036499, - "balance_loss_clip": 1.03764784, - "balance_loss_mlp": 1.02486384, - "epoch": 0.7756500826694724, - "flos": 27414457217280.0, - "grad_norm": 2.3252412495258867, - "language_loss": 0.77514052, - "learning_rate": 5.050464062963113e-07, - "loss": 0.79643828, - "num_input_tokens_seen": 278349980, - "step": 12901, - "time_per_iteration": 2.7284209728240967 - }, - { - "auxiliary_loss_clip": 0.01102085, - "auxiliary_loss_mlp": 0.01032875, - "balance_loss_clip": 1.04122436, - "balance_loss_mlp": 1.01966059, - "epoch": 0.7757102059221404, - "flos": 28730511624960.0, - "grad_norm": 1.6090174147117244, - "language_loss": 0.7720294, - "learning_rate": 5.047877199527666e-07, - "loss": 0.79337895, - "num_input_tokens_seen": 278372485, - "step": 12902, - "time_per_iteration": 2.7194478511810303 - }, - { - "auxiliary_loss_clip": 0.01100702, - "auxiliary_loss_mlp": 0.01031411, - "balance_loss_clip": 1.03726745, - "balance_loss_mlp": 1.01915073, - "epoch": 0.7757703291748084, - "flos": 22486795044480.0, - "grad_norm": 1.743455027715563, - "language_loss": 0.73384994, - "learning_rate": 5.045290903078215e-07, - "loss": 0.75517106, - "num_input_tokens_seen": 278391660, - "step": 12903, - "time_per_iteration": 2.705784797668457 - }, - { - "auxiliary_loss_clip": 0.01089793, - "auxiliary_loss_mlp": 0.01030768, - "balance_loss_clip": 1.03994238, - "balance_loss_mlp": 1.01834655, - "epoch": 0.7758304524274763, - "flos": 21430159637760.0, - "grad_norm": 18.791554059780267, - "language_loss": 0.76102394, - "learning_rate": 5.042705173712835e-07, - "loss": 0.78222954, - "num_input_tokens_seen": 278409125, - "step": 12904, - "time_per_iteration": 2.6935760974884033 - }, - { - "auxiliary_loss_clip": 0.01109136, - "auxiliary_loss_mlp": 0.01027029, - "balance_loss_clip": 1.03901672, - "balance_loss_mlp": 1.01484025, - "epoch": 0.7758905756801443, - "flos": 23659242877440.0, - "grad_norm": 2.2307497011290462, - "language_loss": 0.68197864, - "learning_rate": 5.040120011529576e-07, - "loss": 0.70334029, - "num_input_tokens_seen": 278429450, - "step": 12905, - "time_per_iteration": 2.6777610778808594 - }, - { - "auxiliary_loss_clip": 0.01097117, - "auxiliary_loss_mlp": 0.00770393, - "balance_loss_clip": 1.03989148, - "balance_loss_mlp": 1.00023961, - "epoch": 0.7759506989328122, - "flos": 28365479660160.0, - "grad_norm": 1.6211065141580052, - "language_loss": 0.67231417, - "learning_rate": 5.037535416626459e-07, - "loss": 0.69098926, - "num_input_tokens_seen": 278449925, - "step": 12906, - "time_per_iteration": 2.7337546348571777 - }, - { - "auxiliary_loss_clip": 0.01072574, - "auxiliary_loss_mlp": 0.01034309, - "balance_loss_clip": 1.03331351, - "balance_loss_mlp": 1.02119029, - "epoch": 0.7760108221854802, - "flos": 14902785354240.0, - "grad_norm": 1.9856089108583717, - "language_loss": 0.81587309, - "learning_rate": 5.034951389101498e-07, - "loss": 0.83694196, - "num_input_tokens_seen": 278467255, - "step": 12907, - "time_per_iteration": 2.687721014022827 - }, - { - "auxiliary_loss_clip": 0.01096211, - "auxiliary_loss_mlp": 0.01035671, - "balance_loss_clip": 1.03709292, - "balance_loss_mlp": 1.02327871, - "epoch": 0.7760709454381483, - "flos": 14792503622400.0, - "grad_norm": 2.1316068769770213, - "language_loss": 0.6746856, - "learning_rate": 5.032367929052685e-07, - "loss": 0.69600445, - "num_input_tokens_seen": 278484250, - "step": 12908, - "time_per_iteration": 2.6765284538269043 - }, - { - "auxiliary_loss_clip": 0.01079432, - "auxiliary_loss_mlp": 0.01041924, - "balance_loss_clip": 1.03593588, - "balance_loss_mlp": 1.02890027, - "epoch": 0.7761310686908162, - "flos": 17379831156480.0, - "grad_norm": 1.487534860967946, - "language_loss": 0.70260543, - "learning_rate": 5.029785036577976e-07, - "loss": 0.72381896, - "num_input_tokens_seen": 278502740, - "step": 12909, - "time_per_iteration": 2.711395502090454 - }, - { - "auxiliary_loss_clip": 0.01100377, - "auxiliary_loss_mlp": 0.01035688, - "balance_loss_clip": 1.03995848, - "balance_loss_mlp": 1.02347469, - "epoch": 0.7761911919434842, - "flos": 25556547168000.0, - "grad_norm": 1.6219590580384207, - "language_loss": 0.6782195, - "learning_rate": 5.027202711775324e-07, - "loss": 0.69958019, - "num_input_tokens_seen": 278523890, - "step": 12910, - "time_per_iteration": 2.703979969024658 - }, - { - "auxiliary_loss_clip": 0.01064156, - "auxiliary_loss_mlp": 0.01032945, - "balance_loss_clip": 1.03646898, - "balance_loss_mlp": 1.02076757, - "epoch": 0.7762513151961521, - "flos": 23179763203200.0, - "grad_norm": 1.5806807809655474, - "language_loss": 0.71997929, - "learning_rate": 5.024620954742646e-07, - "loss": 0.74095035, - "num_input_tokens_seen": 278543185, - "step": 12911, - "time_per_iteration": 2.8058223724365234 - }, - { - "auxiliary_loss_clip": 0.01114991, - "auxiliary_loss_mlp": 0.00771737, - "balance_loss_clip": 1.04081869, - "balance_loss_mlp": 1.00030136, - "epoch": 0.7763114384488201, - "flos": 21689614552320.0, - "grad_norm": 3.3864854327362592, - "language_loss": 0.63468528, - "learning_rate": 5.022039765577836e-07, - "loss": 0.65355253, - "num_input_tokens_seen": 278559220, - "step": 12912, - "time_per_iteration": 2.641256809234619 - }, - { - "auxiliary_loss_clip": 0.01001929, - "auxiliary_loss_mlp": 0.01001295, - "balance_loss_clip": 1.00920105, - "balance_loss_mlp": 1.00030553, - "epoch": 0.776371561701488, - "flos": 69025554316800.0, - "grad_norm": 0.7664213178657265, - "language_loss": 0.53195411, - "learning_rate": 5.019459144378779e-07, - "loss": 0.55198634, - "num_input_tokens_seen": 278618185, - "step": 12913, - "time_per_iteration": 3.3077611923217773 - }, - { - "auxiliary_loss_clip": 0.01093414, - "auxiliary_loss_mlp": 0.01037157, - "balance_loss_clip": 1.04078877, - "balance_loss_mlp": 1.02395415, - "epoch": 0.776431684954156, - "flos": 22893914770560.0, - "grad_norm": 1.9204798335439963, - "language_loss": 0.62302238, - "learning_rate": 5.016879091243338e-07, - "loss": 0.644328, - "num_input_tokens_seen": 278636210, - "step": 12914, - "time_per_iteration": 2.7050273418426514 - }, - { - "auxiliary_loss_clip": 0.0108926, - "auxiliary_loss_mlp": 0.01032265, - "balance_loss_clip": 1.03807616, - "balance_loss_mlp": 1.01977742, - "epoch": 0.776491808206824, - "flos": 20261554560000.0, - "grad_norm": 1.7420543212332402, - "language_loss": 0.82108057, - "learning_rate": 5.014299606269339e-07, - "loss": 0.84229577, - "num_input_tokens_seen": 278653305, - "step": 12915, - "time_per_iteration": 2.7126035690307617 - }, - { - "auxiliary_loss_clip": 0.01099353, - "auxiliary_loss_mlp": 0.0103825, - "balance_loss_clip": 1.03824329, - "balance_loss_mlp": 1.02410579, - "epoch": 0.776551931459492, - "flos": 26759051706240.0, - "grad_norm": 1.7876763975048962, - "language_loss": 0.74624789, - "learning_rate": 5.011720689554603e-07, - "loss": 0.76762396, - "num_input_tokens_seen": 278671850, - "step": 12916, - "time_per_iteration": 2.6998839378356934 - }, - { - "auxiliary_loss_clip": 0.01056671, - "auxiliary_loss_mlp": 0.01036878, - "balance_loss_clip": 1.03597093, - "balance_loss_mlp": 1.02252531, - "epoch": 0.7766120547121599, - "flos": 52665080250240.0, - "grad_norm": 1.5017921162458647, - "language_loss": 0.65888739, - "learning_rate": 5.009142341196919e-07, - "loss": 0.67982292, - "num_input_tokens_seen": 278697860, - "step": 12917, - "time_per_iteration": 3.097477674484253 - }, - { - "auxiliary_loss_clip": 0.01099882, - "auxiliary_loss_mlp": 0.01033637, - "balance_loss_clip": 1.03583741, - "balance_loss_mlp": 1.02095342, - "epoch": 0.7766721779648279, - "flos": 25156215112320.0, - "grad_norm": 1.458879337938595, - "language_loss": 0.64478171, - "learning_rate": 5.006564561294065e-07, - "loss": 0.66611689, - "num_input_tokens_seen": 278720655, - "step": 12918, - "time_per_iteration": 2.7446439266204834 - }, - { - "auxiliary_loss_clip": 0.01111393, - "auxiliary_loss_mlp": 0.01037511, - "balance_loss_clip": 1.0397799, - "balance_loss_mlp": 1.02533412, - "epoch": 0.7767323012174958, - "flos": 23760761690880.0, - "grad_norm": 2.338899246619233, - "language_loss": 0.72807854, - "learning_rate": 5.003987349943777e-07, - "loss": 0.74956757, - "num_input_tokens_seen": 278737375, - "step": 12919, - "time_per_iteration": 2.631877899169922 - }, - { - "auxiliary_loss_clip": 0.01069782, - "auxiliary_loss_mlp": 0.01029892, - "balance_loss_clip": 1.03774428, - "balance_loss_mlp": 1.01674342, - "epoch": 0.7767924244701638, - "flos": 22086642556800.0, - "grad_norm": 2.3274821948551265, - "language_loss": 0.78924805, - "learning_rate": 5.001410707243792e-07, - "loss": 0.8102448, - "num_input_tokens_seen": 278756510, - "step": 12920, - "time_per_iteration": 2.8133649826049805 - }, - { - "auxiliary_loss_clip": 0.01102553, - "auxiliary_loss_mlp": 0.01033949, - "balance_loss_clip": 1.03963828, - "balance_loss_mlp": 1.0209614, - "epoch": 0.7768525477228319, - "flos": 21981640124160.0, - "grad_norm": 11.784624421403892, - "language_loss": 0.70922899, - "learning_rate": 4.998834633291829e-07, - "loss": 0.73059404, - "num_input_tokens_seen": 278775410, - "step": 12921, - "time_per_iteration": 2.6603341102600098 - }, - { - "auxiliary_loss_clip": 0.01105803, - "auxiliary_loss_mlp": 0.01034407, - "balance_loss_clip": 1.04023492, - "balance_loss_mlp": 1.02050102, - "epoch": 0.7769126709754998, - "flos": 21794581071360.0, - "grad_norm": 3.3431959549038885, - "language_loss": 0.76222974, - "learning_rate": 4.996259128185547e-07, - "loss": 0.7836318, - "num_input_tokens_seen": 278794260, - "step": 12922, - "time_per_iteration": 2.7015247344970703 - }, - { - "auxiliary_loss_clip": 0.01063506, - "auxiliary_loss_mlp": 0.0103979, - "balance_loss_clip": 1.03708482, - "balance_loss_mlp": 1.0270822, - "epoch": 0.7769727942281678, - "flos": 20047994248320.0, - "grad_norm": 1.6454971966787777, - "language_loss": 0.80262136, - "learning_rate": 4.993684192022625e-07, - "loss": 0.82365435, - "num_input_tokens_seen": 278813290, - "step": 12923, - "time_per_iteration": 2.7818875312805176 - }, - { - "auxiliary_loss_clip": 0.01076451, - "auxiliary_loss_mlp": 0.01040924, - "balance_loss_clip": 1.04072833, - "balance_loss_mlp": 1.02828157, - "epoch": 0.7770329174808357, - "flos": 21686777377920.0, - "grad_norm": 2.0408616917549067, - "language_loss": 0.92191219, - "learning_rate": 4.991109824900699e-07, - "loss": 0.94308597, - "num_input_tokens_seen": 278830610, - "step": 12924, - "time_per_iteration": 2.8274574279785156 - }, - { - "auxiliary_loss_clip": 0.01099709, - "auxiliary_loss_mlp": 0.01032924, - "balance_loss_clip": 1.03679144, - "balance_loss_mlp": 1.02001929, - "epoch": 0.7770930407335037, - "flos": 25849255098240.0, - "grad_norm": 1.8094451984441313, - "language_loss": 0.66132891, - "learning_rate": 4.988536026917401e-07, - "loss": 0.68265527, - "num_input_tokens_seen": 278849530, - "step": 12925, - "time_per_iteration": 2.69667649269104 - }, - { - "auxiliary_loss_clip": 0.01078276, - "auxiliary_loss_mlp": 0.01032472, - "balance_loss_clip": 1.03612852, - "balance_loss_mlp": 1.01974022, - "epoch": 0.7771531639861716, - "flos": 24347865490560.0, - "grad_norm": 2.0756313412895815, - "language_loss": 0.7192542, - "learning_rate": 4.985962798170314e-07, - "loss": 0.74036169, - "num_input_tokens_seen": 278869005, - "step": 12926, - "time_per_iteration": 4.349314451217651 - }, - { - "auxiliary_loss_clip": 0.01103533, - "auxiliary_loss_mlp": 0.01029731, - "balance_loss_clip": 1.0389967, - "balance_loss_mlp": 1.01636767, - "epoch": 0.7772132872388396, - "flos": 25629948610560.0, - "grad_norm": 1.6573712780681307, - "language_loss": 0.65608656, - "learning_rate": 4.983390138757027e-07, - "loss": 0.67741919, - "num_input_tokens_seen": 278888790, - "step": 12927, - "time_per_iteration": 4.16760778427124 - }, - { - "auxiliary_loss_clip": 0.01089675, - "auxiliary_loss_mlp": 0.01039623, - "balance_loss_clip": 1.03830886, - "balance_loss_mlp": 1.02623534, - "epoch": 0.7772734104915076, - "flos": 26067412350720.0, - "grad_norm": 1.7538632415038142, - "language_loss": 0.72743905, - "learning_rate": 4.980818048775093e-07, - "loss": 0.74873203, - "num_input_tokens_seen": 278908150, - "step": 12928, - "time_per_iteration": 2.755859851837158 - }, - { - "auxiliary_loss_clip": 0.01071134, - "auxiliary_loss_mlp": 0.01033028, - "balance_loss_clip": 1.03876746, - "balance_loss_mlp": 1.02003419, - "epoch": 0.7773335337441756, - "flos": 22925048883840.0, - "grad_norm": 1.8588967363228528, - "language_loss": 0.74152476, - "learning_rate": 4.978246528322036e-07, - "loss": 0.76256645, - "num_input_tokens_seen": 278927425, - "step": 12929, - "time_per_iteration": 4.2707133293151855 - }, - { - "auxiliary_loss_clip": 0.01074549, - "auxiliary_loss_mlp": 0.01031484, - "balance_loss_clip": 1.03665006, - "balance_loss_mlp": 1.01832283, - "epoch": 0.7773936569968435, - "flos": 20776765288320.0, - "grad_norm": 1.9039476143036729, - "language_loss": 0.7758745, - "learning_rate": 4.975675577495377e-07, - "loss": 0.79693484, - "num_input_tokens_seen": 278946475, - "step": 12930, - "time_per_iteration": 2.7537360191345215 - }, - { - "auxiliary_loss_clip": 0.01113583, - "auxiliary_loss_mlp": 0.01034326, - "balance_loss_clip": 1.04102445, - "balance_loss_mlp": 1.02152324, - "epoch": 0.7774537802495115, - "flos": 20372267255040.0, - "grad_norm": 1.8280345361242294, - "language_loss": 0.79341066, - "learning_rate": 4.973105196392613e-07, - "loss": 0.81488979, - "num_input_tokens_seen": 278964345, - "step": 12931, - "time_per_iteration": 2.608551502227783 - }, - { - "auxiliary_loss_clip": 0.01003397, - "auxiliary_loss_mlp": 0.01004694, - "balance_loss_clip": 1.02223182, - "balance_loss_mlp": 1.00322199, - "epoch": 0.7775139035021794, - "flos": 53912081738880.0, - "grad_norm": 0.8525982586440103, - "language_loss": 0.59734511, - "learning_rate": 4.970535385111199e-07, - "loss": 0.61742604, - "num_input_tokens_seen": 279022380, - "step": 12932, - "time_per_iteration": 3.19950270652771 - }, - { - "auxiliary_loss_clip": 0.01102586, - "auxiliary_loss_mlp": 0.01034922, - "balance_loss_clip": 1.03881812, - "balance_loss_mlp": 1.02250659, - "epoch": 0.7775740267548474, - "flos": 28842481296000.0, - "grad_norm": 1.5001192410807755, - "language_loss": 0.76264286, - "learning_rate": 4.967966143748595e-07, - "loss": 0.78401792, - "num_input_tokens_seen": 279044275, - "step": 12933, - "time_per_iteration": 2.838245391845703 - }, - { - "auxiliary_loss_clip": 0.01086722, - "auxiliary_loss_mlp": 0.01039418, - "balance_loss_clip": 1.03855717, - "balance_loss_mlp": 1.02625704, - "epoch": 0.7776341500075155, - "flos": 21872471713920.0, - "grad_norm": 1.9749580896078973, - "language_loss": 0.73223925, - "learning_rate": 4.965397472402215e-07, - "loss": 0.75350064, - "num_input_tokens_seen": 279063375, - "step": 12934, - "time_per_iteration": 4.214959621429443 - }, - { - "auxiliary_loss_clip": 0.01069437, - "auxiliary_loss_mlp": 0.01028937, - "balance_loss_clip": 1.03676343, - "balance_loss_mlp": 1.01571107, - "epoch": 0.7776942732601834, - "flos": 20229845829120.0, - "grad_norm": 1.8916304823351247, - "language_loss": 0.70821279, - "learning_rate": 4.962829371169475e-07, - "loss": 0.72919655, - "num_input_tokens_seen": 279082680, - "step": 12935, - "time_per_iteration": 2.8492965698242188 - }, - { - "auxiliary_loss_clip": 0.0108792, - "auxiliary_loss_mlp": 0.00771991, - "balance_loss_clip": 1.03933454, - "balance_loss_mlp": 1.0001905, - "epoch": 0.7777543965128514, - "flos": 22231829329920.0, - "grad_norm": 1.8453474181089096, - "language_loss": 0.83784235, - "learning_rate": 4.960261840147746e-07, - "loss": 0.85644144, - "num_input_tokens_seen": 279099805, - "step": 12936, - "time_per_iteration": 2.6989262104034424 - }, - { - "auxiliary_loss_clip": 0.01105595, - "auxiliary_loss_mlp": 0.01032215, - "balance_loss_clip": 1.03868532, - "balance_loss_mlp": 1.01979923, - "epoch": 0.7778145197655193, - "flos": 14501950508160.0, - "grad_norm": 2.021883178321684, - "language_loss": 0.6742574, - "learning_rate": 4.957694879434397e-07, - "loss": 0.69563556, - "num_input_tokens_seen": 279117975, - "step": 12937, - "time_per_iteration": 2.6387362480163574 - }, - { - "auxiliary_loss_clip": 0.01113841, - "auxiliary_loss_mlp": 0.01033827, - "balance_loss_clip": 1.03934264, - "balance_loss_mlp": 1.021245, - "epoch": 0.7778746430181873, - "flos": 21140288881920.0, - "grad_norm": 1.5206574462967066, - "language_loss": 0.87595057, - "learning_rate": 4.955128489126777e-07, - "loss": 0.89742726, - "num_input_tokens_seen": 279137255, - "step": 12938, - "time_per_iteration": 2.699613332748413 - }, - { - "auxiliary_loss_clip": 0.01101775, - "auxiliary_loss_mlp": 0.01034, - "balance_loss_clip": 1.03820324, - "balance_loss_mlp": 1.02050602, - "epoch": 0.7779347662708552, - "flos": 20266366982400.0, - "grad_norm": 2.05617872988158, - "language_loss": 0.8537035, - "learning_rate": 4.95256266932218e-07, - "loss": 0.87506127, - "num_input_tokens_seen": 279154500, - "step": 12939, - "time_per_iteration": 2.648550510406494 - }, - { - "auxiliary_loss_clip": 0.01108461, - "auxiliary_loss_mlp": 0.00770264, - "balance_loss_clip": 1.03820562, - "balance_loss_mlp": 1.00022864, - "epoch": 0.7779948895235232, - "flos": 19209013303680.0, - "grad_norm": 1.778278891076628, - "language_loss": 0.69293523, - "learning_rate": 4.949997420117915e-07, - "loss": 0.71172249, - "num_input_tokens_seen": 279173635, - "step": 12940, - "time_per_iteration": 2.5725789070129395 - }, - { - "auxiliary_loss_clip": 0.01077299, - "auxiliary_loss_mlp": 0.01026745, - "balance_loss_clip": 1.03700173, - "balance_loss_mlp": 1.01481247, - "epoch": 0.7780550127761912, - "flos": 23914711382400.0, - "grad_norm": 2.1657166687563887, - "language_loss": 0.77734792, - "learning_rate": 4.947432741611255e-07, - "loss": 0.7983883, - "num_input_tokens_seen": 279194430, - "step": 12941, - "time_per_iteration": 2.74072265625 - }, - { - "auxiliary_loss_clip": 0.01105122, - "auxiliary_loss_mlp": 0.01039107, - "balance_loss_clip": 1.03774464, - "balance_loss_mlp": 1.02505839, - "epoch": 0.7781151360288592, - "flos": 32415951795840.0, - "grad_norm": 2.6599455867272157, - "language_loss": 0.73127586, - "learning_rate": 4.944868633899462e-07, - "loss": 0.75271809, - "num_input_tokens_seen": 279212920, - "step": 12942, - "time_per_iteration": 2.717205047607422 - }, - { - "auxiliary_loss_clip": 0.0105644, - "auxiliary_loss_mlp": 0.01043958, - "balance_loss_clip": 1.03546214, - "balance_loss_mlp": 1.03034472, - "epoch": 0.7781752592815271, - "flos": 22346384780160.0, - "grad_norm": 2.908887240584156, - "language_loss": 0.67917764, - "learning_rate": 4.942305097079751e-07, - "loss": 0.7001816, - "num_input_tokens_seen": 279232310, - "step": 12943, - "time_per_iteration": 2.7333195209503174 - }, - { - "auxiliary_loss_clip": 0.01002881, - "auxiliary_loss_mlp": 0.01004649, - "balance_loss_clip": 1.00792861, - "balance_loss_mlp": 1.00336123, - "epoch": 0.7782353825341951, - "flos": 70460183520000.0, - "grad_norm": 0.7871784265530566, - "language_loss": 0.5845629, - "learning_rate": 4.939742131249347e-07, - "loss": 0.60463822, - "num_input_tokens_seen": 279295375, - "step": 12944, - "time_per_iteration": 3.390233039855957 - }, - { - "auxiliary_loss_clip": 0.01113922, - "auxiliary_loss_mlp": 0.01036077, - "balance_loss_clip": 1.03909469, - "balance_loss_mlp": 1.02220058, - "epoch": 0.778295505786863, - "flos": 19062569554560.0, - "grad_norm": 1.8086848755411578, - "language_loss": 0.67537427, - "learning_rate": 4.937179736505428e-07, - "loss": 0.69687426, - "num_input_tokens_seen": 279313660, - "step": 12945, - "time_per_iteration": 2.6378118991851807 - }, - { - "auxiliary_loss_clip": 0.01098229, - "auxiliary_loss_mlp": 0.0103623, - "balance_loss_clip": 1.03687143, - "balance_loss_mlp": 1.02295065, - "epoch": 0.778355629039531, - "flos": 20999734963200.0, - "grad_norm": 2.112440511554347, - "language_loss": 0.69157761, - "learning_rate": 4.93461791294516e-07, - "loss": 0.71292222, - "num_input_tokens_seen": 279334495, - "step": 12946, - "time_per_iteration": 2.7236101627349854 - }, - { - "auxiliary_loss_clip": 0.0111324, - "auxiliary_loss_mlp": 0.0102872, - "balance_loss_clip": 1.03970623, - "balance_loss_mlp": 1.01546407, - "epoch": 0.7784157522921991, - "flos": 21398091770880.0, - "grad_norm": 2.366818430498899, - "language_loss": 0.65404934, - "learning_rate": 4.932056660665689e-07, - "loss": 0.67546898, - "num_input_tokens_seen": 279352985, - "step": 12947, - "time_per_iteration": 2.6700103282928467 - }, - { - "auxiliary_loss_clip": 0.01049825, - "auxiliary_loss_mlp": 0.01043003, - "balance_loss_clip": 1.03298378, - "balance_loss_mlp": 1.02796459, - "epoch": 0.778475875544867, - "flos": 20813861059200.0, - "grad_norm": 1.8657083989144876, - "language_loss": 0.64925945, - "learning_rate": 4.929495979764147e-07, - "loss": 0.67018777, - "num_input_tokens_seen": 279371360, - "step": 12948, - "time_per_iteration": 2.8412203788757324 - }, - { - "auxiliary_loss_clip": 0.01112305, - "auxiliary_loss_mlp": 0.01035608, - "balance_loss_clip": 1.03932905, - "balance_loss_mlp": 1.02261424, - "epoch": 0.778535998797535, - "flos": 14355363104640.0, - "grad_norm": 1.8274723515678126, - "language_loss": 0.75157881, - "learning_rate": 4.926935870337625e-07, - "loss": 0.77305794, - "num_input_tokens_seen": 279389400, - "step": 12949, - "time_per_iteration": 2.641893148422241 - }, - { - "auxiliary_loss_clip": 0.01116388, - "auxiliary_loss_mlp": 0.01033844, - "balance_loss_clip": 1.04068756, - "balance_loss_mlp": 1.02045703, - "epoch": 0.7785961220502029, - "flos": 19209552007680.0, - "grad_norm": 2.2581725959312156, - "language_loss": 0.68925655, - "learning_rate": 4.924376332483202e-07, - "loss": 0.71075886, - "num_input_tokens_seen": 279409715, - "step": 12950, - "time_per_iteration": 2.7213573455810547 - }, - { - "auxiliary_loss_clip": 0.01096074, - "auxiliary_loss_mlp": 0.01034434, - "balance_loss_clip": 1.03823721, - "balance_loss_mlp": 1.02142787, - "epoch": 0.7786562453028709, - "flos": 25738757884800.0, - "grad_norm": 1.7372750277816864, - "language_loss": 0.71980989, - "learning_rate": 4.921817366297938e-07, - "loss": 0.74111497, - "num_input_tokens_seen": 279427705, - "step": 12951, - "time_per_iteration": 2.741422414779663 - }, - { - "auxiliary_loss_clip": 0.01087111, - "auxiliary_loss_mlp": 0.01035076, - "balance_loss_clip": 1.03640008, - "balance_loss_mlp": 1.02192152, - "epoch": 0.7787163685555388, - "flos": 25739440243200.0, - "grad_norm": 1.8924083949863153, - "language_loss": 0.65915614, - "learning_rate": 4.919258971878877e-07, - "loss": 0.68037808, - "num_input_tokens_seen": 279448215, - "step": 12952, - "time_per_iteration": 2.770171880722046 - }, - { - "auxiliary_loss_clip": 0.0108209, - "auxiliary_loss_mlp": 0.01031543, - "balance_loss_clip": 1.03549063, - "balance_loss_mlp": 1.01928258, - "epoch": 0.7787764918082068, - "flos": 22747722416640.0, - "grad_norm": 1.528475201753157, - "language_loss": 0.81114817, - "learning_rate": 4.916701149323022e-07, - "loss": 0.83228457, - "num_input_tokens_seen": 279466260, - "step": 12953, - "time_per_iteration": 2.708888530731201 - }, - { - "auxiliary_loss_clip": 0.01118162, - "auxiliary_loss_mlp": 0.01035624, - "balance_loss_clip": 1.04281271, - "balance_loss_mlp": 1.02266002, - "epoch": 0.7788366150608748, - "flos": 15190860430080.0, - "grad_norm": 2.122341354514922, - "language_loss": 0.76798481, - "learning_rate": 4.91414389872737e-07, - "loss": 0.78952265, - "num_input_tokens_seen": 279484520, - "step": 12954, - "time_per_iteration": 2.5349183082580566 - }, - { - "auxiliary_loss_clip": 0.01100423, - "auxiliary_loss_mlp": 0.01030129, - "balance_loss_clip": 1.0369153, - "balance_loss_mlp": 1.01788616, - "epoch": 0.7788967383135428, - "flos": 21210242618880.0, - "grad_norm": 1.5352459629047766, - "language_loss": 0.72880197, - "learning_rate": 4.911587220188905e-07, - "loss": 0.75010741, - "num_input_tokens_seen": 279503130, - "step": 12955, - "time_per_iteration": 2.7405974864959717 - }, - { - "auxiliary_loss_clip": 0.01079595, - "auxiliary_loss_mlp": 0.0104146, - "balance_loss_clip": 1.03563166, - "balance_loss_mlp": 1.02835917, - "epoch": 0.7789568615662107, - "flos": 21682970536320.0, - "grad_norm": 1.6339057488297875, - "language_loss": 0.68833733, - "learning_rate": 4.909031113804551e-07, - "loss": 0.70954788, - "num_input_tokens_seen": 279521930, - "step": 12956, - "time_per_iteration": 2.6949398517608643 - }, - { - "auxiliary_loss_clip": 0.01076197, - "auxiliary_loss_mlp": 0.01034972, - "balance_loss_clip": 1.0365479, - "balance_loss_mlp": 1.0227586, - "epoch": 0.7790169848188787, - "flos": 26360371676160.0, - "grad_norm": 1.6442430086846629, - "language_loss": 0.76081383, - "learning_rate": 4.906475579671252e-07, - "loss": 0.78192556, - "num_input_tokens_seen": 279542375, - "step": 12957, - "time_per_iteration": 2.7577781677246094 - }, - { - "auxiliary_loss_clip": 0.01041804, - "auxiliary_loss_mlp": 0.01027809, - "balance_loss_clip": 1.03647232, - "balance_loss_mlp": 1.01506531, - "epoch": 0.7790771080715466, - "flos": 25516183259520.0, - "grad_norm": 1.5510435056277987, - "language_loss": 0.77168477, - "learning_rate": 4.903920617885917e-07, - "loss": 0.79238093, - "num_input_tokens_seen": 279561885, - "step": 12958, - "time_per_iteration": 2.902573585510254 - }, - { - "auxiliary_loss_clip": 0.01099333, - "auxiliary_loss_mlp": 0.01042234, - "balance_loss_clip": 1.03847003, - "balance_loss_mlp": 1.02687943, - "epoch": 0.7791372313242146, - "flos": 16034186920320.0, - "grad_norm": 2.00916706928726, - "language_loss": 0.71559989, - "learning_rate": 4.901366228545418e-07, - "loss": 0.73701555, - "num_input_tokens_seen": 279579965, - "step": 12959, - "time_per_iteration": 2.6020190715789795 - }, - { - "auxiliary_loss_clip": 0.01094821, - "auxiliary_loss_mlp": 0.00771197, - "balance_loss_clip": 1.03832543, - "balance_loss_mlp": 1.00027037, - "epoch": 0.7791973545768827, - "flos": 23842207779840.0, - "grad_norm": 1.6491836005518046, - "language_loss": 0.78150439, - "learning_rate": 4.898812411746632e-07, - "loss": 0.80016458, - "num_input_tokens_seen": 279599030, - "step": 12960, - "time_per_iteration": 2.6712677478790283 - }, - { - "auxiliary_loss_clip": 0.01104299, - "auxiliary_loss_mlp": 0.01040928, - "balance_loss_clip": 1.03950214, - "balance_loss_mlp": 1.02792239, - "epoch": 0.7792574778295506, - "flos": 24168384207360.0, - "grad_norm": 2.171108267887673, - "language_loss": 0.75204015, - "learning_rate": 4.896259167586385e-07, - "loss": 0.77349246, - "num_input_tokens_seen": 279614400, - "step": 12961, - "time_per_iteration": 2.6742923259735107 - }, - { - "auxiliary_loss_clip": 0.01087433, - "auxiliary_loss_mlp": 0.01038038, - "balance_loss_clip": 1.03869224, - "balance_loss_mlp": 1.02624202, - "epoch": 0.7793176010822186, - "flos": 21464921024640.0, - "grad_norm": 1.7879944844879476, - "language_loss": 0.73984349, - "learning_rate": 4.893706496161511e-07, - "loss": 0.76109815, - "num_input_tokens_seen": 279633745, - "step": 12962, - "time_per_iteration": 2.6879115104675293 - }, - { - "auxiliary_loss_clip": 0.01101036, - "auxiliary_loss_mlp": 0.01030588, - "balance_loss_clip": 1.03875148, - "balance_loss_mlp": 1.01782036, - "epoch": 0.7793777243348865, - "flos": 20666699038080.0, - "grad_norm": 1.7723287922493858, - "language_loss": 0.69576943, - "learning_rate": 4.891154397568795e-07, - "loss": 0.71708572, - "num_input_tokens_seen": 279651165, - "step": 12963, - "time_per_iteration": 2.6385724544525146 - }, - { - "auxiliary_loss_clip": 0.01101416, - "auxiliary_loss_mlp": 0.00770165, - "balance_loss_clip": 1.04028928, - "balance_loss_mlp": 1.00022078, - "epoch": 0.7794378475875545, - "flos": 27125771610240.0, - "grad_norm": 1.6031620431196494, - "language_loss": 0.63797098, - "learning_rate": 4.888602871905019e-07, - "loss": 0.65668678, - "num_input_tokens_seen": 279671175, - "step": 12964, - "time_per_iteration": 2.6388909816741943 - }, - { - "auxiliary_loss_clip": 0.01092497, - "auxiliary_loss_mlp": 0.010352, - "balance_loss_clip": 1.03853321, - "balance_loss_mlp": 1.02259946, - "epoch": 0.7794979708402224, - "flos": 28074136446720.0, - "grad_norm": 1.8780726000065868, - "language_loss": 0.76702619, - "learning_rate": 4.88605191926694e-07, - "loss": 0.7883032, - "num_input_tokens_seen": 279688675, - "step": 12965, - "time_per_iteration": 4.301928758621216 - }, - { - "auxiliary_loss_clip": 0.01089139, - "auxiliary_loss_mlp": 0.01039643, - "balance_loss_clip": 1.03389204, - "balance_loss_mlp": 1.02626801, - "epoch": 0.7795580940928905, - "flos": 26869548919680.0, - "grad_norm": 1.824856626010527, - "language_loss": 0.73063076, - "learning_rate": 4.883501539751289e-07, - "loss": 0.75191855, - "num_input_tokens_seen": 279710245, - "step": 12966, - "time_per_iteration": 4.184988498687744 - }, - { - "auxiliary_loss_clip": 0.01088561, - "auxiliary_loss_mlp": 0.00769043, - "balance_loss_clip": 1.04008389, - "balance_loss_mlp": 1.00027704, - "epoch": 0.7796182173455584, - "flos": 23835384195840.0, - "grad_norm": 1.6189038671897886, - "language_loss": 0.7464664, - "learning_rate": 4.880951733454768e-07, - "loss": 0.76504242, - "num_input_tokens_seen": 279729045, - "step": 12967, - "time_per_iteration": 2.7788522243499756 - }, - { - "auxiliary_loss_clip": 0.0111227, - "auxiliary_loss_mlp": 0.01032605, - "balance_loss_clip": 1.03953099, - "balance_loss_mlp": 1.01915836, - "epoch": 0.7796783405982264, - "flos": 19792238434560.0, - "grad_norm": 3.3219826288937253, - "language_loss": 0.72220939, - "learning_rate": 4.878402500474073e-07, - "loss": 0.74365819, - "num_input_tokens_seen": 279748350, - "step": 12968, - "time_per_iteration": 4.058116436004639 - }, - { - "auxiliary_loss_clip": 0.01085681, - "auxiliary_loss_mlp": 0.01039035, - "balance_loss_clip": 1.03827214, - "balance_loss_mlp": 1.02664959, - "epoch": 0.7797384638508943, - "flos": 15450207603840.0, - "grad_norm": 1.9018596701865034, - "language_loss": 0.61007255, - "learning_rate": 4.875853840905874e-07, - "loss": 0.6313197, - "num_input_tokens_seen": 279765620, - "step": 12969, - "time_per_iteration": 2.6471657752990723 - }, - { - "auxiliary_loss_clip": 0.01090989, - "auxiliary_loss_mlp": 0.01034313, - "balance_loss_clip": 1.03693545, - "balance_loss_mlp": 1.0227617, - "epoch": 0.7797985871035623, - "flos": 20922742160640.0, - "grad_norm": 1.800586767958732, - "language_loss": 0.70180488, - "learning_rate": 4.873305754846811e-07, - "loss": 0.72305787, - "num_input_tokens_seen": 279782485, - "step": 12970, - "time_per_iteration": 2.6519546508789062 - }, - { - "auxiliary_loss_clip": 0.01075649, - "auxiliary_loss_mlp": 0.00770753, - "balance_loss_clip": 1.04074073, - "balance_loss_mlp": 1.00021172, - "epoch": 0.7798587103562302, - "flos": 36937212514560.0, - "grad_norm": 1.645308207198137, - "language_loss": 0.72213817, - "learning_rate": 4.870758242393507e-07, - "loss": 0.7406022, - "num_input_tokens_seen": 279804170, - "step": 12971, - "time_per_iteration": 2.818019390106201 - }, - { - "auxiliary_loss_clip": 0.0106953, - "auxiliary_loss_mlp": 0.01037101, - "balance_loss_clip": 1.03421068, - "balance_loss_mlp": 1.02360034, - "epoch": 0.7799188336088982, - "flos": 22419283432320.0, - "grad_norm": 3.5320834107901486, - "language_loss": 0.74761558, - "learning_rate": 4.868211303642578e-07, - "loss": 0.76868188, - "num_input_tokens_seen": 279823730, - "step": 12972, - "time_per_iteration": 2.7724294662475586 - }, - { - "auxiliary_loss_clip": 0.01111753, - "auxiliary_loss_mlp": 0.01025205, - "balance_loss_clip": 1.03887677, - "balance_loss_mlp": 1.01239038, - "epoch": 0.7799789568615663, - "flos": 18880466578560.0, - "grad_norm": 1.8982422603948057, - "language_loss": 0.71497881, - "learning_rate": 4.865664938690584e-07, - "loss": 0.73634839, - "num_input_tokens_seen": 279843035, - "step": 12973, - "time_per_iteration": 4.207505226135254 - }, - { - "auxiliary_loss_clip": 0.01099331, - "auxiliary_loss_mlp": 0.01032267, - "balance_loss_clip": 1.03967643, - "balance_loss_mlp": 1.0208292, - "epoch": 0.7800390801142342, - "flos": 20262272832000.0, - "grad_norm": 1.9924582249119662, - "language_loss": 0.77612895, - "learning_rate": 4.863119147634089e-07, - "loss": 0.79744494, - "num_input_tokens_seen": 279861450, - "step": 12974, - "time_per_iteration": 2.6812784671783447 - }, - { - "auxiliary_loss_clip": 0.01077043, - "auxiliary_loss_mlp": 0.01031844, - "balance_loss_clip": 1.0368197, - "balance_loss_mlp": 1.01885045, - "epoch": 0.7800992033669022, - "flos": 16690310703360.0, - "grad_norm": 1.5902544107071221, - "language_loss": 0.69343281, - "learning_rate": 4.86057393056964e-07, - "loss": 0.71452165, - "num_input_tokens_seen": 279878660, - "step": 12975, - "time_per_iteration": 2.668877124786377 - }, - { - "auxiliary_loss_clip": 0.01074216, - "auxiliary_loss_mlp": 0.01029987, - "balance_loss_clip": 1.03641438, - "balance_loss_mlp": 1.0174228, - "epoch": 0.7801593266195701, - "flos": 18585208782720.0, - "grad_norm": 1.9719657409906464, - "language_loss": 0.82066941, - "learning_rate": 4.858029287593739e-07, - "loss": 0.84171152, - "num_input_tokens_seen": 279895685, - "step": 12976, - "time_per_iteration": 2.760437488555908 - }, - { - "auxiliary_loss_clip": 0.01090901, - "auxiliary_loss_mlp": 0.00770609, - "balance_loss_clip": 1.03640187, - "balance_loss_mlp": 1.00019145, - "epoch": 0.7802194498722381, - "flos": 25484941405440.0, - "grad_norm": 1.5169608974947786, - "language_loss": 0.66052604, - "learning_rate": 4.85548521880289e-07, - "loss": 0.6791411, - "num_input_tokens_seen": 279917240, - "step": 12977, - "time_per_iteration": 2.7686586380004883 - }, - { - "auxiliary_loss_clip": 0.01087933, - "auxiliary_loss_mlp": 0.01028467, - "balance_loss_clip": 1.03792357, - "balance_loss_mlp": 1.01699352, - "epoch": 0.780279573124906, - "flos": 31176315573120.0, - "grad_norm": 1.5120129099478161, - "language_loss": 0.74935395, - "learning_rate": 4.852941724293554e-07, - "loss": 0.77051795, - "num_input_tokens_seen": 279938665, - "step": 12978, - "time_per_iteration": 2.775379180908203 - }, - { - "auxiliary_loss_clip": 0.01087009, - "auxiliary_loss_mlp": 0.01044028, - "balance_loss_clip": 1.03668034, - "balance_loss_mlp": 1.02886498, - "epoch": 0.780339696377574, - "flos": 26944027770240.0, - "grad_norm": 2.430538160229645, - "language_loss": 0.62134832, - "learning_rate": 4.85039880416219e-07, - "loss": 0.64265871, - "num_input_tokens_seen": 279957965, - "step": 12979, - "time_per_iteration": 2.715329170227051 - }, - { - "auxiliary_loss_clip": 0.01111779, - "auxiliary_loss_mlp": 0.01030782, - "balance_loss_clip": 1.03983402, - "balance_loss_mlp": 1.01825941, - "epoch": 0.780399819630242, - "flos": 27957426180480.0, - "grad_norm": 1.9760483655422685, - "language_loss": 0.77112854, - "learning_rate": 4.847856458505217e-07, - "loss": 0.79255414, - "num_input_tokens_seen": 279977490, - "step": 12980, - "time_per_iteration": 2.6605019569396973 - }, - { - "auxiliary_loss_clip": 0.0111412, - "auxiliary_loss_mlp": 0.01033295, - "balance_loss_clip": 1.03999209, - "balance_loss_mlp": 1.02089083, - "epoch": 0.78045994288291, - "flos": 22486795044480.0, - "grad_norm": 1.9592981721345673, - "language_loss": 0.77939653, - "learning_rate": 4.845314687419046e-07, - "loss": 0.80087066, - "num_input_tokens_seen": 279994220, - "step": 12981, - "time_per_iteration": 2.658205032348633 - }, - { - "auxiliary_loss_clip": 0.01069277, - "auxiliary_loss_mlp": 0.01036103, - "balance_loss_clip": 1.0379262, - "balance_loss_mlp": 1.02320492, - "epoch": 0.7805200661355779, - "flos": 20850849089280.0, - "grad_norm": 2.387436806844423, - "language_loss": 0.72364557, - "learning_rate": 4.842773491000067e-07, - "loss": 0.74469936, - "num_input_tokens_seen": 280012590, - "step": 12982, - "time_per_iteration": 2.6541051864624023 - }, - { - "auxiliary_loss_clip": 0.01084276, - "auxiliary_loss_mlp": 0.01030772, - "balance_loss_clip": 1.03608441, - "balance_loss_mlp": 1.01907182, - "epoch": 0.7805801893882459, - "flos": 25665966973440.0, - "grad_norm": 1.5019346121142914, - "language_loss": 0.73412144, - "learning_rate": 4.840232869344636e-07, - "loss": 0.75527191, - "num_input_tokens_seen": 280033700, - "step": 12983, - "time_per_iteration": 2.6957807540893555 - }, - { - "auxiliary_loss_clip": 0.01083908, - "auxiliary_loss_mlp": 0.01031264, - "balance_loss_clip": 1.03741837, - "balance_loss_mlp": 1.0185684, - "epoch": 0.7806403126409138, - "flos": 11327806483200.0, - "grad_norm": 1.8415039374254183, - "language_loss": 0.74685752, - "learning_rate": 4.837692822549086e-07, - "loss": 0.76800919, - "num_input_tokens_seen": 280052215, - "step": 12984, - "time_per_iteration": 2.6339285373687744 - }, - { - "auxiliary_loss_clip": 0.0108127, - "auxiliary_loss_mlp": 0.01032816, - "balance_loss_clip": 1.03251958, - "balance_loss_mlp": 1.02092516, - "epoch": 0.7807004358935818, - "flos": 19573362910080.0, - "grad_norm": 2.0272215357184646, - "language_loss": 0.81049699, - "learning_rate": 4.835153350709746e-07, - "loss": 0.83163786, - "num_input_tokens_seen": 280070525, - "step": 12985, - "time_per_iteration": 2.6104180812835693 - }, - { - "auxiliary_loss_clip": 0.01088889, - "auxiliary_loss_mlp": 0.01033801, - "balance_loss_clip": 1.03684783, - "balance_loss_mlp": 1.02074158, - "epoch": 0.7807605591462499, - "flos": 19135827342720.0, - "grad_norm": 2.394678033024852, - "language_loss": 0.76863611, - "learning_rate": 4.832614453922915e-07, - "loss": 0.78986299, - "num_input_tokens_seen": 280089855, - "step": 12986, - "time_per_iteration": 2.6664822101593018 - }, - { - "auxiliary_loss_clip": 0.01100426, - "auxiliary_loss_mlp": 0.01035599, - "balance_loss_clip": 1.037323, - "balance_loss_mlp": 1.02314782, - "epoch": 0.7808206823989178, - "flos": 32374654133760.0, - "grad_norm": 2.6632985579320128, - "language_loss": 0.73982435, - "learning_rate": 4.830076132284859e-07, - "loss": 0.76118457, - "num_input_tokens_seen": 280109960, - "step": 12987, - "time_per_iteration": 2.6844065189361572 - }, - { - "auxiliary_loss_clip": 0.01022794, - "auxiliary_loss_mlp": 0.00999717, - "balance_loss_clip": 1.01035285, - "balance_loss_mlp": 0.99873984, - "epoch": 0.7808808056515858, - "flos": 55050235061760.0, - "grad_norm": 0.7788144710273639, - "language_loss": 0.55080384, - "learning_rate": 4.82753838589184e-07, - "loss": 0.57102895, - "num_input_tokens_seen": 280169805, - "step": 12988, - "time_per_iteration": 3.1616508960723877 - }, - { - "auxiliary_loss_clip": 0.0107797, - "auxiliary_loss_mlp": 0.01031828, - "balance_loss_clip": 1.03549254, - "balance_loss_mlp": 1.01985967, - "epoch": 0.7809409289042537, - "flos": 12859468277760.0, - "grad_norm": 3.638882308233123, - "language_loss": 0.81044191, - "learning_rate": 4.82500121484009e-07, - "loss": 0.83153987, - "num_input_tokens_seen": 280184630, - "step": 12989, - "time_per_iteration": 2.660554885864258 - }, - { - "auxiliary_loss_clip": 0.01077669, - "auxiliary_loss_mlp": 0.01031778, - "balance_loss_clip": 1.03635395, - "balance_loss_mlp": 1.01853991, - "epoch": 0.7810010521569217, - "flos": 21687244254720.0, - "grad_norm": 1.7099876560000518, - "language_loss": 0.70650768, - "learning_rate": 4.822464619225806e-07, - "loss": 0.72760212, - "num_input_tokens_seen": 280203880, - "step": 12990, - "time_per_iteration": 2.7570815086364746 - }, - { - "auxiliary_loss_clip": 0.01087538, - "auxiliary_loss_mlp": 0.01034172, - "balance_loss_clip": 1.03705347, - "balance_loss_mlp": 1.02005148, - "epoch": 0.7810611754095896, - "flos": 16757068129920.0, - "grad_norm": 1.9898673429166094, - "language_loss": 0.77492607, - "learning_rate": 4.819928599145184e-07, - "loss": 0.79614317, - "num_input_tokens_seen": 280220460, - "step": 12991, - "time_per_iteration": 2.655853748321533 - }, - { - "auxiliary_loss_clip": 0.01071528, - "auxiliary_loss_mlp": 0.0103742, - "balance_loss_clip": 1.03491211, - "balance_loss_mlp": 1.02443242, - "epoch": 0.7811212986622577, - "flos": 43507464658560.0, - "grad_norm": 1.7999740041710885, - "language_loss": 0.6594398, - "learning_rate": 4.817393154694398e-07, - "loss": 0.68052924, - "num_input_tokens_seen": 280242680, - "step": 12992, - "time_per_iteration": 2.885798931121826 - }, - { - "auxiliary_loss_clip": 0.01114039, - "auxiliary_loss_mlp": 0.01030727, - "balance_loss_clip": 1.04082417, - "balance_loss_mlp": 1.01861548, - "epoch": 0.7811814219149256, - "flos": 21757700782080.0, - "grad_norm": 1.7673036757148999, - "language_loss": 0.61809367, - "learning_rate": 4.814858285969578e-07, - "loss": 0.63954139, - "num_input_tokens_seen": 280260655, - "step": 12993, - "time_per_iteration": 2.5982654094696045 - }, - { - "auxiliary_loss_clip": 0.01085768, - "auxiliary_loss_mlp": 0.01032115, - "balance_loss_clip": 1.03539443, - "balance_loss_mlp": 1.01902032, - "epoch": 0.7812415451675936, - "flos": 24061514267520.0, - "grad_norm": 1.4534277443177828, - "language_loss": 0.68547273, - "learning_rate": 4.812323993066862e-07, - "loss": 0.70665157, - "num_input_tokens_seen": 280281185, - "step": 12994, - "time_per_iteration": 2.7115039825439453 - }, - { - "auxiliary_loss_clip": 0.01109576, - "auxiliary_loss_mlp": 0.01027659, - "balance_loss_clip": 1.03841043, - "balance_loss_mlp": 1.01556516, - "epoch": 0.7813016684202615, - "flos": 18989706816000.0, - "grad_norm": 1.8869179456101774, - "language_loss": 0.68850774, - "learning_rate": 4.809790276082335e-07, - "loss": 0.70988011, - "num_input_tokens_seen": 280298255, - "step": 12995, - "time_per_iteration": 2.6276211738586426 - }, - { - "auxiliary_loss_clip": 0.01066626, - "auxiliary_loss_mlp": 0.01029556, - "balance_loss_clip": 1.03578615, - "balance_loss_mlp": 1.01815367, - "epoch": 0.7813617916729295, - "flos": 25260786581760.0, - "grad_norm": 1.6581465647867601, - "language_loss": 0.74758989, - "learning_rate": 4.807257135112088e-07, - "loss": 0.76855165, - "num_input_tokens_seen": 280319000, - "step": 12996, - "time_per_iteration": 2.7556345462799072 - }, - { - "auxiliary_loss_clip": 0.01115417, - "auxiliary_loss_mlp": 0.01034278, - "balance_loss_clip": 1.04004622, - "balance_loss_mlp": 1.02160597, - "epoch": 0.7814219149255974, - "flos": 17966037116160.0, - "grad_norm": 2.7982414236779385, - "language_loss": 0.68035823, - "learning_rate": 4.804724570252167e-07, - "loss": 0.70185518, - "num_input_tokens_seen": 280336375, - "step": 12997, - "time_per_iteration": 2.633403778076172 - }, - { - "auxiliary_loss_clip": 0.01115354, - "auxiliary_loss_mlp": 0.01035627, - "balance_loss_clip": 1.03941298, - "balance_loss_mlp": 1.02176905, - "epoch": 0.7814820381782654, - "flos": 25776176878080.0, - "grad_norm": 1.750414047475771, - "language_loss": 0.81803972, - "learning_rate": 4.802192581598614e-07, - "loss": 0.83954954, - "num_input_tokens_seen": 280358760, - "step": 12998, - "time_per_iteration": 2.6855201721191406 - }, - { - "auxiliary_loss_clip": 0.01083435, - "auxiliary_loss_mlp": 0.01038171, - "balance_loss_clip": 1.03414857, - "balance_loss_mlp": 1.02490866, - "epoch": 0.7815421614309335, - "flos": 20519572930560.0, - "grad_norm": 2.2116291760065523, - "language_loss": 0.74893302, - "learning_rate": 4.799661169247453e-07, - "loss": 0.77014905, - "num_input_tokens_seen": 280377085, - "step": 12999, - "time_per_iteration": 2.657938241958618 - }, - { - "auxiliary_loss_clip": 0.01098221, - "auxiliary_loss_mlp": 0.01042598, - "balance_loss_clip": 1.0372951, - "balance_loss_mlp": 1.02817392, - "epoch": 0.7816022846836014, - "flos": 21287666384640.0, - "grad_norm": 3.4549180565502957, - "language_loss": 0.84463656, - "learning_rate": 4.797130333294652e-07, - "loss": 0.8660447, - "num_input_tokens_seen": 280395465, - "step": 13000, - "time_per_iteration": 2.652675151824951 - }, - { - "auxiliary_loss_clip": 0.01102345, - "auxiliary_loss_mlp": 0.01033163, - "balance_loss_clip": 1.03934288, - "balance_loss_mlp": 1.02033567, - "epoch": 0.7816624079362694, - "flos": 19208402772480.0, - "grad_norm": 1.8050152528671168, - "language_loss": 0.66003239, - "learning_rate": 4.794600073836192e-07, - "loss": 0.68138748, - "num_input_tokens_seen": 280412775, - "step": 13001, - "time_per_iteration": 2.650995969772339 - }, - { - "auxiliary_loss_clip": 0.01073705, - "auxiliary_loss_mlp": 0.01037556, - "balance_loss_clip": 1.03569674, - "balance_loss_mlp": 1.02527714, - "epoch": 0.7817225311889373, - "flos": 26104687689600.0, - "grad_norm": 1.5795311212034024, - "language_loss": 0.67058933, - "learning_rate": 4.792070390968027e-07, - "loss": 0.69170189, - "num_input_tokens_seen": 280432905, - "step": 13002, - "time_per_iteration": 2.811582326889038 - }, - { - "auxiliary_loss_clip": 0.01105543, - "auxiliary_loss_mlp": 0.01035107, - "balance_loss_clip": 1.04086781, - "balance_loss_mlp": 1.02175558, - "epoch": 0.7817826544416053, - "flos": 21250929749760.0, - "grad_norm": 2.254654590765684, - "language_loss": 0.73237813, - "learning_rate": 4.78954128478607e-07, - "loss": 0.75378466, - "num_input_tokens_seen": 280450785, - "step": 13003, - "time_per_iteration": 2.6425418853759766 - }, - { - "auxiliary_loss_clip": 0.01101875, - "auxiliary_loss_mlp": 0.01035905, - "balance_loss_clip": 1.03900814, - "balance_loss_mlp": 1.02342987, - "epoch": 0.7818427776942732, - "flos": 19932181822080.0, - "grad_norm": 2.055818402329747, - "language_loss": 0.61984468, - "learning_rate": 4.787012755386233e-07, - "loss": 0.64122242, - "num_input_tokens_seen": 280468400, - "step": 13004, - "time_per_iteration": 2.6202731132507324 - }, - { - "auxiliary_loss_clip": 0.0110586, - "auxiliary_loss_mlp": 0.01032863, - "balance_loss_clip": 1.03744853, - "balance_loss_mlp": 1.02140069, - "epoch": 0.7819029009469413, - "flos": 11363753018880.0, - "grad_norm": 1.8629152700369227, - "language_loss": 0.82870841, - "learning_rate": 4.784484802864403e-07, - "loss": 0.85009563, - "num_input_tokens_seen": 280483930, - "step": 13005, - "time_per_iteration": 4.243497371673584 - }, - { - "auxiliary_loss_clip": 0.01070901, - "auxiliary_loss_mlp": 0.00771151, - "balance_loss_clip": 1.03450751, - "balance_loss_mlp": 1.00017846, - "epoch": 0.7819630241996092, - "flos": 24279276470400.0, - "grad_norm": 1.8867656052793076, - "language_loss": 0.72342831, - "learning_rate": 4.781957427316432e-07, - "loss": 0.74184883, - "num_input_tokens_seen": 280503465, - "step": 13006, - "time_per_iteration": 4.330881357192993 - }, - { - "auxiliary_loss_clip": 0.01101615, - "auxiliary_loss_mlp": 0.00771026, - "balance_loss_clip": 1.03858209, - "balance_loss_mlp": 1.00022697, - "epoch": 0.7820231474522772, - "flos": 22708902792960.0, - "grad_norm": 1.62503166301797, - "language_loss": 0.72343135, - "learning_rate": 4.779430628838157e-07, - "loss": 0.74215776, - "num_input_tokens_seen": 280523375, - "step": 13007, - "time_per_iteration": 4.214543581008911 - }, - { - "auxiliary_loss_clip": 0.01111637, - "auxiliary_loss_mlp": 0.01028444, - "balance_loss_clip": 1.03696549, - "balance_loss_mlp": 1.01505101, - "epoch": 0.7820832707049451, - "flos": 20047419630720.0, - "grad_norm": 1.901361826456498, - "language_loss": 0.68807894, - "learning_rate": 4.776904407525397e-07, - "loss": 0.70947969, - "num_input_tokens_seen": 280542920, - "step": 13008, - "time_per_iteration": 2.6050710678100586 - }, - { - "auxiliary_loss_clip": 0.01082934, - "auxiliary_loss_mlp": 0.01028936, - "balance_loss_clip": 1.03609729, - "balance_loss_mlp": 1.01611555, - "epoch": 0.7821433939576131, - "flos": 27162795553920.0, - "grad_norm": 2.2501417791775036, - "language_loss": 0.69864273, - "learning_rate": 4.774378763473954e-07, - "loss": 0.71976143, - "num_input_tokens_seen": 280561700, - "step": 13009, - "time_per_iteration": 2.7489216327667236 - }, - { - "auxiliary_loss_clip": 0.01069744, - "auxiliary_loss_mlp": 0.01029641, - "balance_loss_clip": 1.03394186, - "balance_loss_mlp": 1.01677287, - "epoch": 0.782203517210281, - "flos": 22602068766720.0, - "grad_norm": 2.6181121023195386, - "language_loss": 0.81756222, - "learning_rate": 4.771853696779586e-07, - "loss": 0.83855605, - "num_input_tokens_seen": 280580605, - "step": 13010, - "time_per_iteration": 2.754182815551758 - }, - { - "auxiliary_loss_clip": 0.01097326, - "auxiliary_loss_mlp": 0.01033843, - "balance_loss_clip": 1.03652465, - "balance_loss_mlp": 1.02199399, - "epoch": 0.782263640462949, - "flos": 29059812535680.0, - "grad_norm": 1.5058057514043965, - "language_loss": 0.61957836, - "learning_rate": 4.76932920753806e-07, - "loss": 0.64089006, - "num_input_tokens_seen": 280601495, - "step": 13011, - "time_per_iteration": 2.676269292831421 - }, - { - "auxiliary_loss_clip": 0.01098762, - "auxiliary_loss_mlp": 0.01028506, - "balance_loss_clip": 1.03798711, - "balance_loss_mlp": 1.01740146, - "epoch": 0.782323763715617, - "flos": 25299498464640.0, - "grad_norm": 1.7834780447298506, - "language_loss": 0.703578, - "learning_rate": 4.7668052958450913e-07, - "loss": 0.72485065, - "num_input_tokens_seen": 280622760, - "step": 13012, - "time_per_iteration": 4.222137451171875 - }, - { - "auxiliary_loss_clip": 0.01030861, - "auxiliary_loss_mlp": 0.00999997, - "balance_loss_clip": 1.00834417, - "balance_loss_mlp": 0.99901354, - "epoch": 0.782383886968285, - "flos": 65194388668800.0, - "grad_norm": 0.7024977302558347, - "language_loss": 0.55065835, - "learning_rate": 4.764281961796395e-07, - "loss": 0.57096696, - "num_input_tokens_seen": 280687115, - "step": 13013, - "time_per_iteration": 3.2604727745056152 - }, - { - "auxiliary_loss_clip": 0.01088673, - "auxiliary_loss_mlp": 0.01038861, - "balance_loss_clip": 1.0394305, - "balance_loss_mlp": 1.02612925, - "epoch": 0.782444010220953, - "flos": 18405440190720.0, - "grad_norm": 1.739605099015053, - "language_loss": 0.65488654, - "learning_rate": 4.76175920548765e-07, - "loss": 0.67616189, - "num_input_tokens_seen": 280705000, - "step": 13014, - "time_per_iteration": 2.702570915222168 - }, - { - "auxiliary_loss_clip": 0.01005007, - "auxiliary_loss_mlp": 0.01000676, - "balance_loss_clip": 1.0074594, - "balance_loss_mlp": 0.99947244, - "epoch": 0.7825041334736209, - "flos": 63955003841280.0, - "grad_norm": 1.5199496836725135, - "language_loss": 0.58456129, - "learning_rate": 4.759237027014524e-07, - "loss": 0.60461813, - "num_input_tokens_seen": 280773525, - "step": 13015, - "time_per_iteration": 3.2708168029785156 - }, - { - "auxiliary_loss_clip": 0.01082509, - "auxiliary_loss_mlp": 0.0103455, - "balance_loss_clip": 1.03745651, - "balance_loss_mlp": 1.02287316, - "epoch": 0.7825642567262889, - "flos": 20339373375360.0, - "grad_norm": 1.6009097708406814, - "language_loss": 0.74550229, - "learning_rate": 4.756715426472666e-07, - "loss": 0.76667285, - "num_input_tokens_seen": 280791915, - "step": 13016, - "time_per_iteration": 2.775660514831543 - }, - { - "auxiliary_loss_clip": 0.01111525, - "auxiliary_loss_mlp": 0.01032595, - "balance_loss_clip": 1.03842187, - "balance_loss_mlp": 1.01902854, - "epoch": 0.7826243799789568, - "flos": 20262955190400.0, - "grad_norm": 1.7770751531413016, - "language_loss": 0.75118351, - "learning_rate": 4.7541944039576766e-07, - "loss": 0.77262467, - "num_input_tokens_seen": 280811460, - "step": 13017, - "time_per_iteration": 2.6645398139953613 - }, - { - "auxiliary_loss_clip": 0.01085213, - "auxiliary_loss_mlp": 0.01034128, - "balance_loss_clip": 1.03540921, - "balance_loss_mlp": 1.0211221, - "epoch": 0.7826845032316249, - "flos": 21132926593920.0, - "grad_norm": 1.9823505334349008, - "language_loss": 0.75479347, - "learning_rate": 4.7516739595651636e-07, - "loss": 0.77598691, - "num_input_tokens_seen": 280825415, - "step": 13018, - "time_per_iteration": 2.6840744018554688 - }, - { - "auxiliary_loss_clip": 0.01108451, - "auxiliary_loss_mlp": 0.01029158, - "balance_loss_clip": 1.03651655, - "balance_loss_mlp": 1.0168916, - "epoch": 0.7827446264842928, - "flos": 22492253911680.0, - "grad_norm": 1.4306758819867016, - "language_loss": 0.77329135, - "learning_rate": 4.749154093390708e-07, - "loss": 0.79466748, - "num_input_tokens_seen": 280845335, - "step": 13019, - "time_per_iteration": 2.6806087493896484 - }, - { - "auxiliary_loss_clip": 0.01065952, - "auxiliary_loss_mlp": 0.01028104, - "balance_loss_clip": 1.03685999, - "balance_loss_mlp": 1.01612306, - "epoch": 0.7828047497369608, - "flos": 28840649702400.0, - "grad_norm": 1.4806512863046632, - "language_loss": 0.67511666, - "learning_rate": 4.746634805529852e-07, - "loss": 0.6960572, - "num_input_tokens_seen": 280867145, - "step": 13020, - "time_per_iteration": 2.9394872188568115 - }, - { - "auxiliary_loss_clip": 0.01099304, - "auxiliary_loss_mlp": 0.0103012, - "balance_loss_clip": 1.03999841, - "balance_loss_mlp": 1.01744223, - "epoch": 0.7828648729896287, - "flos": 23257689759360.0, - "grad_norm": 3.1494596140171787, - "language_loss": 0.62587798, - "learning_rate": 4.7441160960781325e-07, - "loss": 0.64717221, - "num_input_tokens_seen": 280886185, - "step": 13021, - "time_per_iteration": 2.6747660636901855 - }, - { - "auxiliary_loss_clip": 0.01107745, - "auxiliary_loss_mlp": 0.01035666, - "balance_loss_clip": 1.03731537, - "balance_loss_mlp": 1.02393007, - "epoch": 0.7829249962422967, - "flos": 25265670831360.0, - "grad_norm": 1.6912709426048431, - "language_loss": 0.69153851, - "learning_rate": 4.7415979651310636e-07, - "loss": 0.71297264, - "num_input_tokens_seen": 280907665, - "step": 13022, - "time_per_iteration": 2.698918342590332 - }, - { - "auxiliary_loss_clip": 0.00980906, - "auxiliary_loss_mlp": 0.0100189, - "balance_loss_clip": 1.01163054, - "balance_loss_mlp": 1.00038803, - "epoch": 0.7829851194949646, - "flos": 70722044645760.0, - "grad_norm": 0.6377469168354571, - "language_loss": 0.56205934, - "learning_rate": 4.739080412784131e-07, - "loss": 0.58188736, - "num_input_tokens_seen": 280971405, - "step": 13023, - "time_per_iteration": 3.4054768085479736 - }, - { - "auxiliary_loss_clip": 0.0107826, - "auxiliary_loss_mlp": 0.01032398, - "balance_loss_clip": 1.03205729, - "balance_loss_mlp": 1.01958895, - "epoch": 0.7830452427476327, - "flos": 25660795415040.0, - "grad_norm": 1.7775757007122028, - "language_loss": 0.67073244, - "learning_rate": 4.736563439132792e-07, - "loss": 0.69183898, - "num_input_tokens_seen": 280989615, - "step": 13024, - "time_per_iteration": 2.646439790725708 - }, - { - "auxiliary_loss_clip": 0.01112317, - "auxiliary_loss_mlp": 0.01028491, - "balance_loss_clip": 1.03878796, - "balance_loss_mlp": 1.01559806, - "epoch": 0.7831053660003006, - "flos": 22784315397120.0, - "grad_norm": 1.5682779156650977, - "language_loss": 0.77674961, - "learning_rate": 4.734047044272498e-07, - "loss": 0.79815769, - "num_input_tokens_seen": 281009450, - "step": 13025, - "time_per_iteration": 2.632951021194458 - }, - { - "auxiliary_loss_clip": 0.01084338, - "auxiliary_loss_mlp": 0.01036499, - "balance_loss_clip": 1.03556383, - "balance_loss_mlp": 1.02404797, - "epoch": 0.7831654892529686, - "flos": 25812267068160.0, - "grad_norm": 2.0934383648650194, - "language_loss": 0.78239512, - "learning_rate": 4.731531228298673e-07, - "loss": 0.80360353, - "num_input_tokens_seen": 281028120, - "step": 13026, - "time_per_iteration": 2.7387208938598633 - }, - { - "auxiliary_loss_clip": 0.01097191, - "auxiliary_loss_mlp": 0.01028382, - "balance_loss_clip": 1.03798652, - "balance_loss_mlp": 1.01656842, - "epoch": 0.7832256125056366, - "flos": 20771557816320.0, - "grad_norm": 2.1298369773301, - "language_loss": 0.75428832, - "learning_rate": 4.729015991306715e-07, - "loss": 0.77554405, - "num_input_tokens_seen": 281042130, - "step": 13027, - "time_per_iteration": 2.62705135345459 - }, - { - "auxiliary_loss_clip": 0.01102237, - "auxiliary_loss_mlp": 0.01031953, - "balance_loss_clip": 1.04018044, - "balance_loss_mlp": 1.01980579, - "epoch": 0.7832857357583045, - "flos": 21506541909120.0, - "grad_norm": 1.7296473073785772, - "language_loss": 0.70366251, - "learning_rate": 4.726501333391997e-07, - "loss": 0.72500432, - "num_input_tokens_seen": 281060945, - "step": 13028, - "time_per_iteration": 2.651749849319458 - }, - { - "auxiliary_loss_clip": 0.01063459, - "auxiliary_loss_mlp": 0.01038176, - "balance_loss_clip": 1.03720903, - "balance_loss_mlp": 1.02482486, - "epoch": 0.7833458590109725, - "flos": 18077791305600.0, - "grad_norm": 2.132518402701666, - "language_loss": 0.68704486, - "learning_rate": 4.7239872546498774e-07, - "loss": 0.70806122, - "num_input_tokens_seen": 281079270, - "step": 13029, - "time_per_iteration": 2.733846664428711 - }, - { - "auxiliary_loss_clip": 0.01085127, - "auxiliary_loss_mlp": 0.01033179, - "balance_loss_clip": 1.03914356, - "balance_loss_mlp": 1.01970291, - "epoch": 0.7834059822636404, - "flos": 28288738252800.0, - "grad_norm": 1.9111735577193074, - "language_loss": 0.81041169, - "learning_rate": 4.721473755175698e-07, - "loss": 0.83159471, - "num_input_tokens_seen": 281099500, - "step": 13030, - "time_per_iteration": 2.7992770671844482 - }, - { - "auxiliary_loss_clip": 0.01104778, - "auxiliary_loss_mlp": 0.01033096, - "balance_loss_clip": 1.03917181, - "balance_loss_mlp": 1.0206331, - "epoch": 0.7834661055163085, - "flos": 31686211088640.0, - "grad_norm": 2.9451675534531847, - "language_loss": 0.70219892, - "learning_rate": 4.71896083506476e-07, - "loss": 0.72357768, - "num_input_tokens_seen": 281121250, - "step": 13031, - "time_per_iteration": 2.9042952060699463 - }, - { - "auxiliary_loss_clip": 0.01072572, - "auxiliary_loss_mlp": 0.01034559, - "balance_loss_clip": 1.03533792, - "balance_loss_mlp": 1.02237034, - "epoch": 0.7835262287689764, - "flos": 12933192942720.0, - "grad_norm": 2.323625290086717, - "language_loss": 0.790016, - "learning_rate": 4.7164484944123574e-07, - "loss": 0.81108725, - "num_input_tokens_seen": 281138760, - "step": 13032, - "time_per_iteration": 2.750812292098999 - }, - { - "auxiliary_loss_clip": 0.0110433, - "auxiliary_loss_mlp": 0.01040225, - "balance_loss_clip": 1.03909242, - "balance_loss_mlp": 1.02739763, - "epoch": 0.7835863520216444, - "flos": 16143211676160.0, - "grad_norm": 2.0684316430418463, - "language_loss": 0.62812865, - "learning_rate": 4.7139367333137726e-07, - "loss": 0.64957428, - "num_input_tokens_seen": 281157420, - "step": 13033, - "time_per_iteration": 2.6421468257904053 - }, - { - "auxiliary_loss_clip": 0.01098998, - "auxiliary_loss_mlp": 0.01034317, - "balance_loss_clip": 1.03790116, - "balance_loss_mlp": 1.02132297, - "epoch": 0.7836464752743123, - "flos": 11509909459200.0, - "grad_norm": 1.58357633529001, - "language_loss": 0.71756327, - "learning_rate": 4.7114255518642255e-07, - "loss": 0.73889643, - "num_input_tokens_seen": 281174620, - "step": 13034, - "time_per_iteration": 2.7166271209716797 - }, - { - "auxiliary_loss_clip": 0.01113235, - "auxiliary_loss_mlp": 0.00771091, - "balance_loss_clip": 1.03961957, - "balance_loss_mlp": 1.00013566, - "epoch": 0.7837065985269803, - "flos": 18223696350720.0, - "grad_norm": 1.6566949403371967, - "language_loss": 0.72002685, - "learning_rate": 4.7089149501589555e-07, - "loss": 0.73887014, - "num_input_tokens_seen": 281193865, - "step": 13035, - "time_per_iteration": 2.5778520107269287 - }, - { - "auxiliary_loss_clip": 0.01112728, - "auxiliary_loss_mlp": 0.01035529, - "balance_loss_clip": 1.03951585, - "balance_loss_mlp": 1.02208817, - "epoch": 0.7837667217796482, - "flos": 24754410599040.0, - "grad_norm": 2.9879625935132372, - "language_loss": 0.66463214, - "learning_rate": 4.7064049282931664e-07, - "loss": 0.68611467, - "num_input_tokens_seen": 281212250, - "step": 13036, - "time_per_iteration": 2.6302857398986816 - }, - { - "auxiliary_loss_clip": 0.01104467, - "auxiliary_loss_mlp": 0.01039494, - "balance_loss_clip": 1.03855228, - "balance_loss_mlp": 1.02618408, - "epoch": 0.7838268450323163, - "flos": 22383121415040.0, - "grad_norm": 2.0949975987912848, - "language_loss": 0.73010111, - "learning_rate": 4.703895486362031e-07, - "loss": 0.75154078, - "num_input_tokens_seen": 281230850, - "step": 13037, - "time_per_iteration": 2.6575746536254883 - }, - { - "auxiliary_loss_clip": 0.01070616, - "auxiliary_loss_mlp": 0.01035879, - "balance_loss_clip": 1.03389454, - "balance_loss_mlp": 1.02229476, - "epoch": 0.7838869682849842, - "flos": 19500284689920.0, - "grad_norm": 5.499006833043144, - "language_loss": 0.59598082, - "learning_rate": 4.701386624460717e-07, - "loss": 0.61704576, - "num_input_tokens_seen": 281249810, - "step": 13038, - "time_per_iteration": 2.6556448936462402 - }, - { - "auxiliary_loss_clip": 0.01089544, - "auxiliary_loss_mlp": 0.01028906, - "balance_loss_clip": 1.0388062, - "balance_loss_mlp": 1.0172174, - "epoch": 0.7839470915376522, - "flos": 32892845690880.0, - "grad_norm": 1.750335160170137, - "language_loss": 0.68257546, - "learning_rate": 4.698878342684349e-07, - "loss": 0.70375991, - "num_input_tokens_seen": 281273730, - "step": 13039, - "time_per_iteration": 2.760946273803711 - }, - { - "auxiliary_loss_clip": 0.01072076, - "auxiliary_loss_mlp": 0.01024882, - "balance_loss_clip": 1.03432715, - "balance_loss_mlp": 1.01383781, - "epoch": 0.7840072147903202, - "flos": 29676003373440.0, - "grad_norm": 1.826043040750904, - "language_loss": 0.69417781, - "learning_rate": 4.6963706411280537e-07, - "loss": 0.71514744, - "num_input_tokens_seen": 281293670, - "step": 13040, - "time_per_iteration": 2.7545461654663086 - }, - { - "auxiliary_loss_clip": 0.0106802, - "auxiliary_loss_mlp": 0.01035678, - "balance_loss_clip": 1.03712749, - "balance_loss_mlp": 1.02223086, - "epoch": 0.7840673380429881, - "flos": 18186744234240.0, - "grad_norm": 1.5142145779529246, - "language_loss": 0.67758179, - "learning_rate": 4.6938635198869116e-07, - "loss": 0.69861877, - "num_input_tokens_seen": 281313070, - "step": 13041, - "time_per_iteration": 2.7630157470703125 - }, - { - "auxiliary_loss_clip": 0.01022608, - "auxiliary_loss_mlp": 0.00751599, - "balance_loss_clip": 1.0097084, - "balance_loss_mlp": 0.99966377, - "epoch": 0.7841274612956561, - "flos": 66346006613760.0, - "grad_norm": 0.6656190181226946, - "language_loss": 0.57380033, - "learning_rate": 4.691356979055998e-07, - "loss": 0.59154236, - "num_input_tokens_seen": 281374880, - "step": 13042, - "time_per_iteration": 3.1374757289886475 - }, - { - "auxiliary_loss_clip": 0.01087388, - "auxiliary_loss_mlp": 0.01033015, - "balance_loss_clip": 1.03713918, - "balance_loss_mlp": 1.02007461, - "epoch": 0.784187584548324, - "flos": 26648482665600.0, - "grad_norm": 2.1244828686221267, - "language_loss": 0.83795989, - "learning_rate": 4.688851018730369e-07, - "loss": 0.85916388, - "num_input_tokens_seen": 281392620, - "step": 13043, - "time_per_iteration": 2.793748378753662 - }, - { - "auxiliary_loss_clip": 0.01095712, - "auxiliary_loss_mlp": 0.01027767, - "balance_loss_clip": 1.03719783, - "balance_loss_mlp": 1.0161922, - "epoch": 0.7842477078009921, - "flos": 25740158515200.0, - "grad_norm": 1.3834924746992494, - "language_loss": 0.88441205, - "learning_rate": 4.6863456390050425e-07, - "loss": 0.90564686, - "num_input_tokens_seen": 281413140, - "step": 13044, - "time_per_iteration": 4.261160135269165 - }, - { - "auxiliary_loss_clip": 0.01093506, - "auxiliary_loss_mlp": 0.01034687, - "balance_loss_clip": 1.03787422, - "balance_loss_mlp": 1.02180016, - "epoch": 0.78430783105366, - "flos": 21980957765760.0, - "grad_norm": 1.825374480580212, - "language_loss": 0.78958154, - "learning_rate": 4.6838408399750195e-07, - "loss": 0.81086344, - "num_input_tokens_seen": 281430860, - "step": 13045, - "time_per_iteration": 2.7708632946014404 - }, - { - "auxiliary_loss_clip": 0.01084228, - "auxiliary_loss_mlp": 0.01031577, - "balance_loss_clip": 1.03655803, - "balance_loss_mlp": 1.0191431, - "epoch": 0.784367954306328, - "flos": 23842279607040.0, - "grad_norm": 1.484345043483713, - "language_loss": 0.72495216, - "learning_rate": 4.6813366217352925e-07, - "loss": 0.7461102, - "num_input_tokens_seen": 281451385, - "step": 13046, - "time_per_iteration": 4.295615196228027 - }, - { - "auxiliary_loss_clip": 0.01070358, - "auxiliary_loss_mlp": 0.01035911, - "balance_loss_clip": 1.04044282, - "balance_loss_mlp": 1.02267289, - "epoch": 0.7844280775589959, - "flos": 24826662806400.0, - "grad_norm": 1.5168340119310013, - "language_loss": 0.62780952, - "learning_rate": 4.678832984380809e-07, - "loss": 0.6488722, - "num_input_tokens_seen": 281472255, - "step": 13047, - "time_per_iteration": 4.33956503868103 - }, - { - "auxiliary_loss_clip": 0.01100709, - "auxiliary_loss_mlp": 0.01027981, - "balance_loss_clip": 1.03916669, - "balance_loss_mlp": 1.01601255, - "epoch": 0.7844882008116639, - "flos": 22455660931200.0, - "grad_norm": 1.6255681359432697, - "language_loss": 0.73295152, - "learning_rate": 4.676329928006515e-07, - "loss": 0.75423837, - "num_input_tokens_seen": 281492860, - "step": 13048, - "time_per_iteration": 2.764153003692627 - }, - { - "auxiliary_loss_clip": 0.01087112, - "auxiliary_loss_mlp": 0.0103201, - "balance_loss_clip": 1.03815794, - "balance_loss_mlp": 1.01965356, - "epoch": 0.7845483240643318, - "flos": 26104041244800.0, - "grad_norm": 3.259574846755966, - "language_loss": 0.74822855, - "learning_rate": 4.6738274527073243e-07, - "loss": 0.76941979, - "num_input_tokens_seen": 281511815, - "step": 13049, - "time_per_iteration": 2.702545642852783 - }, - { - "auxiliary_loss_clip": 0.01113727, - "auxiliary_loss_mlp": 0.01032718, - "balance_loss_clip": 1.03731608, - "balance_loss_mlp": 1.01894963, - "epoch": 0.7846084473169999, - "flos": 19354307817600.0, - "grad_norm": 1.71411914117224, - "language_loss": 0.72622865, - "learning_rate": 4.6713255585781454e-07, - "loss": 0.74769306, - "num_input_tokens_seen": 281530090, - "step": 13050, - "time_per_iteration": 2.6567511558532715 - }, - { - "auxiliary_loss_clip": 0.01098536, - "auxiliary_loss_mlp": 0.01034296, - "balance_loss_clip": 1.03764224, - "balance_loss_mlp": 1.02170706, - "epoch": 0.7846685705696678, - "flos": 23325811902720.0, - "grad_norm": 1.9970425884506249, - "language_loss": 0.73258287, - "learning_rate": 4.668824245713825e-07, - "loss": 0.75391114, - "num_input_tokens_seen": 281547075, - "step": 13051, - "time_per_iteration": 4.220673322677612 - }, - { - "auxiliary_loss_clip": 0.01112899, - "auxiliary_loss_mlp": 0.01034321, - "balance_loss_clip": 1.03919625, - "balance_loss_mlp": 1.02135718, - "epoch": 0.7847286938223358, - "flos": 35809545962880.0, - "grad_norm": 2.6887410249812578, - "language_loss": 0.72721338, - "learning_rate": 4.666323514209227e-07, - "loss": 0.7486856, - "num_input_tokens_seen": 281568080, - "step": 13052, - "time_per_iteration": 2.7622361183166504 - }, - { - "auxiliary_loss_clip": 0.0108619, - "auxiliary_loss_mlp": 0.01035577, - "balance_loss_clip": 1.03937328, - "balance_loss_mlp": 1.02357841, - "epoch": 0.7847888170750038, - "flos": 18478159274880.0, - "grad_norm": 1.82904296097524, - "language_loss": 0.69018829, - "learning_rate": 4.663823364159183e-07, - "loss": 0.71140599, - "num_input_tokens_seen": 281586925, - "step": 13053, - "time_per_iteration": 2.7101058959960938 - }, - { - "auxiliary_loss_clip": 0.0109323, - "auxiliary_loss_mlp": 0.01031564, - "balance_loss_clip": 1.03785491, - "balance_loss_mlp": 1.01989341, - "epoch": 0.7848489403276717, - "flos": 25119155255040.0, - "grad_norm": 2.155883968401707, - "language_loss": 0.69833845, - "learning_rate": 4.6613237956584893e-07, - "loss": 0.71958637, - "num_input_tokens_seen": 281603915, - "step": 13054, - "time_per_iteration": 2.6558749675750732 - }, - { - "auxiliary_loss_clip": 0.01102359, - "auxiliary_loss_mlp": 0.01035501, - "balance_loss_clip": 1.03816795, - "balance_loss_mlp": 1.02254295, - "epoch": 0.7849090635803397, - "flos": 26502433966080.0, - "grad_norm": 1.6743772106587247, - "language_loss": 0.76095474, - "learning_rate": 4.658824808801938e-07, - "loss": 0.78233331, - "num_input_tokens_seen": 281624220, - "step": 13055, - "time_per_iteration": 2.729825019836426 - }, - { - "auxiliary_loss_clip": 0.01115191, - "auxiliary_loss_mlp": 0.01034507, - "balance_loss_clip": 1.03995335, - "balance_loss_mlp": 1.02139974, - "epoch": 0.7849691868330076, - "flos": 20959658363520.0, - "grad_norm": 1.870278317520838, - "language_loss": 0.7499572, - "learning_rate": 4.656326403684283e-07, - "loss": 0.77145422, - "num_input_tokens_seen": 281642325, - "step": 13056, - "time_per_iteration": 2.6321020126342773 - }, - { - "auxiliary_loss_clip": 0.01048067, - "auxiliary_loss_mlp": 0.0103263, - "balance_loss_clip": 1.03739357, - "balance_loss_mlp": 1.01989865, - "epoch": 0.7850293100856757, - "flos": 26067484177920.0, - "grad_norm": 1.7420143195586486, - "language_loss": 0.70014071, - "learning_rate": 4.6538285804002744e-07, - "loss": 0.72094762, - "num_input_tokens_seen": 281663065, - "step": 13057, - "time_per_iteration": 2.8007147312164307 - }, - { - "auxiliary_loss_clip": 0.01064676, - "auxiliary_loss_mlp": 0.01033287, - "balance_loss_clip": 1.03794479, - "balance_loss_mlp": 1.02130675, - "epoch": 0.7850894333383436, - "flos": 22491894775680.0, - "grad_norm": 1.791422043134008, - "language_loss": 0.76534569, - "learning_rate": 4.6513313390446175e-07, - "loss": 0.78632534, - "num_input_tokens_seen": 281681005, - "step": 13058, - "time_per_iteration": 2.7110915184020996 - }, - { - "auxiliary_loss_clip": 0.01101284, - "auxiliary_loss_mlp": 0.01036049, - "balance_loss_clip": 1.03946376, - "balance_loss_mlp": 1.0238781, - "epoch": 0.7851495565910116, - "flos": 20558643949440.0, - "grad_norm": 1.5851127868658192, - "language_loss": 0.70834202, - "learning_rate": 4.6488346797120146e-07, - "loss": 0.72971535, - "num_input_tokens_seen": 281697965, - "step": 13059, - "time_per_iteration": 2.7031941413879395 - }, - { - "auxiliary_loss_clip": 0.01081291, - "auxiliary_loss_mlp": 0.01038886, - "balance_loss_clip": 1.03579831, - "balance_loss_mlp": 1.02460492, - "epoch": 0.7852096798436795, - "flos": 15924838942080.0, - "grad_norm": 2.081733102958074, - "language_loss": 0.76698899, - "learning_rate": 4.646338602497144e-07, - "loss": 0.78819072, - "num_input_tokens_seen": 281716035, - "step": 13060, - "time_per_iteration": 2.7939200401306152 - }, - { - "auxiliary_loss_clip": 0.01083148, - "auxiliary_loss_mlp": 0.01031719, - "balance_loss_clip": 1.03790545, - "balance_loss_mlp": 1.01883268, - "epoch": 0.7852698030963475, - "flos": 19062282245760.0, - "grad_norm": 2.323604844819863, - "language_loss": 0.77162534, - "learning_rate": 4.643843107494654e-07, - "loss": 0.79277396, - "num_input_tokens_seen": 281732815, - "step": 13061, - "time_per_iteration": 2.697397232055664 - }, - { - "auxiliary_loss_clip": 0.01074028, - "auxiliary_loss_mlp": 0.01034387, - "balance_loss_clip": 1.03479552, - "balance_loss_mlp": 1.02089262, - "epoch": 0.7853299263490154, - "flos": 24644380262400.0, - "grad_norm": 1.8894100648574905, - "language_loss": 0.74005646, - "learning_rate": 4.641348194799164e-07, - "loss": 0.76114058, - "num_input_tokens_seen": 281751980, - "step": 13062, - "time_per_iteration": 2.9962854385375977 - }, - { - "auxiliary_loss_clip": 0.01097852, - "auxiliary_loss_mlp": 0.01032338, - "balance_loss_clip": 1.03713512, - "balance_loss_mlp": 1.02026176, - "epoch": 0.7853900496016835, - "flos": 22017981709440.0, - "grad_norm": 1.6526268980906231, - "language_loss": 0.68907607, - "learning_rate": 4.638853864505297e-07, - "loss": 0.71037793, - "num_input_tokens_seen": 281772670, - "step": 13063, - "time_per_iteration": 2.7347474098205566 - }, - { - "auxiliary_loss_clip": 0.01099713, - "auxiliary_loss_mlp": 0.01036078, - "balance_loss_clip": 1.04048038, - "balance_loss_mlp": 1.02360916, - "epoch": 0.7854501728543514, - "flos": 30227412032640.0, - "grad_norm": 4.546851509745459, - "language_loss": 0.72635663, - "learning_rate": 4.636360116707625e-07, - "loss": 0.74771458, - "num_input_tokens_seen": 281792930, - "step": 13064, - "time_per_iteration": 2.7636148929595947 - }, - { - "auxiliary_loss_clip": 0.01082833, - "auxiliary_loss_mlp": 0.01033112, - "balance_loss_clip": 1.03790045, - "balance_loss_mlp": 1.02079129, - "epoch": 0.7855102961070194, - "flos": 18843694030080.0, - "grad_norm": 14.965350757481792, - "language_loss": 0.67957228, - "learning_rate": 4.633866951500718e-07, - "loss": 0.70073175, - "num_input_tokens_seen": 281811805, - "step": 13065, - "time_per_iteration": 2.7619290351867676 - }, - { - "auxiliary_loss_clip": 0.01097669, - "auxiliary_loss_mlp": 0.01037896, - "balance_loss_clip": 1.04063308, - "balance_loss_mlp": 1.02562392, - "epoch": 0.7855704193596874, - "flos": 22309971367680.0, - "grad_norm": 1.6867324299047715, - "language_loss": 0.75999427, - "learning_rate": 4.6313743689791196e-07, - "loss": 0.78134984, - "num_input_tokens_seen": 281831885, - "step": 13066, - "time_per_iteration": 2.647052764892578 - }, - { - "auxiliary_loss_clip": 0.0103061, - "auxiliary_loss_mlp": 0.01006066, - "balance_loss_clip": 1.00811362, - "balance_loss_mlp": 1.00509405, - "epoch": 0.7856305426123553, - "flos": 60004434407040.0, - "grad_norm": 0.7063334807152991, - "language_loss": 0.5335499, - "learning_rate": 4.628882369237346e-07, - "loss": 0.55391669, - "num_input_tokens_seen": 281900310, - "step": 13067, - "time_per_iteration": 3.2783384323120117 - }, - { - "auxiliary_loss_clip": 0.01065395, - "auxiliary_loss_mlp": 0.01032148, - "balance_loss_clip": 1.03609753, - "balance_loss_mlp": 1.01884413, - "epoch": 0.7856906658650233, - "flos": 21868593045120.0, - "grad_norm": 1.5153182614776801, - "language_loss": 0.67582923, - "learning_rate": 4.62639095236989e-07, - "loss": 0.69680464, - "num_input_tokens_seen": 281918870, - "step": 13068, - "time_per_iteration": 2.818237543106079 - }, - { - "auxiliary_loss_clip": 0.01076742, - "auxiliary_loss_mlp": 0.01030057, - "balance_loss_clip": 1.03852606, - "balance_loss_mlp": 1.01839852, - "epoch": 0.7857507891176913, - "flos": 23622937205760.0, - "grad_norm": 2.4110222950654325, - "language_loss": 0.68040943, - "learning_rate": 4.6239001184712267e-07, - "loss": 0.70147741, - "num_input_tokens_seen": 281936905, - "step": 13069, - "time_per_iteration": 2.7654619216918945 - }, - { - "auxiliary_loss_clip": 0.01103004, - "auxiliary_loss_mlp": 0.01035827, - "balance_loss_clip": 1.04032803, - "balance_loss_mlp": 1.02331567, - "epoch": 0.7858109123703593, - "flos": 25520061928320.0, - "grad_norm": 1.6503036246986864, - "language_loss": 0.76820791, - "learning_rate": 4.6214098676358195e-07, - "loss": 0.7895962, - "num_input_tokens_seen": 281955625, - "step": 13070, - "time_per_iteration": 2.7123591899871826 - }, - { - "auxiliary_loss_clip": 0.0105121, - "auxiliary_loss_mlp": 0.0105136, - "balance_loss_clip": 1.030967, - "balance_loss_mlp": 1.03746009, - "epoch": 0.7858710356230272, - "flos": 17457398576640.0, - "grad_norm": 1.7605883689591728, - "language_loss": 0.65229589, - "learning_rate": 4.618920199958083e-07, - "loss": 0.6733216, - "num_input_tokens_seen": 281973285, - "step": 13071, - "time_per_iteration": 2.727679491043091 - }, - { - "auxiliary_loss_clip": 0.01063123, - "auxiliary_loss_mlp": 0.0103513, - "balance_loss_clip": 1.03286123, - "balance_loss_mlp": 1.02270818, - "epoch": 0.7859311588756952, - "flos": 24679680353280.0, - "grad_norm": 1.7243596413538878, - "language_loss": 0.73917699, - "learning_rate": 4.616431115532442e-07, - "loss": 0.76015961, - "num_input_tokens_seen": 281991410, - "step": 13072, - "time_per_iteration": 2.8985819816589355 - }, - { - "auxiliary_loss_clip": 0.01097172, - "auxiliary_loss_mlp": 0.01032829, - "balance_loss_clip": 1.04014218, - "balance_loss_mlp": 1.0194838, - "epoch": 0.7859912821283631, - "flos": 21799142098560.0, - "grad_norm": 4.385793601952052, - "language_loss": 0.71439523, - "learning_rate": 4.613942614453268e-07, - "loss": 0.73569524, - "num_input_tokens_seen": 282010845, - "step": 13073, - "time_per_iteration": 2.670741558074951 - }, - { - "auxiliary_loss_clip": 0.01085075, - "auxiliary_loss_mlp": 0.01035125, - "balance_loss_clip": 1.0389545, - "balance_loss_mlp": 1.0218693, - "epoch": 0.7860514053810311, - "flos": 20847293642880.0, - "grad_norm": 1.6142243935129328, - "language_loss": 0.76601768, - "learning_rate": 4.611454696814938e-07, - "loss": 0.78721976, - "num_input_tokens_seen": 282029635, - "step": 13074, - "time_per_iteration": 2.715064287185669 - }, - { - "auxiliary_loss_clip": 0.01067309, - "auxiliary_loss_mlp": 0.01034423, - "balance_loss_clip": 1.03506911, - "balance_loss_mlp": 1.0224185, - "epoch": 0.786111528633699, - "flos": 24315689882880.0, - "grad_norm": 1.7966754252742998, - "language_loss": 0.75166345, - "learning_rate": 4.608967362711782e-07, - "loss": 0.77268076, - "num_input_tokens_seen": 282050285, - "step": 13075, - "time_per_iteration": 2.8381521701812744 - }, - { - "auxiliary_loss_clip": 0.01083185, - "auxiliary_loss_mlp": 0.01026692, - "balance_loss_clip": 1.04080176, - "balance_loss_mlp": 1.01497984, - "epoch": 0.7861716518863671, - "flos": 24353180703360.0, - "grad_norm": 1.743827758665396, - "language_loss": 0.69089484, - "learning_rate": 4.6064806122381283e-07, - "loss": 0.71199363, - "num_input_tokens_seen": 282071040, - "step": 13076, - "time_per_iteration": 2.812002658843994 - }, - { - "auxiliary_loss_clip": 0.01095604, - "auxiliary_loss_mlp": 0.01028537, - "balance_loss_clip": 1.03609765, - "balance_loss_mlp": 1.01606214, - "epoch": 0.786231775139035, - "flos": 14022399006720.0, - "grad_norm": 2.296864016069315, - "language_loss": 0.80343485, - "learning_rate": 4.603994445488282e-07, - "loss": 0.82467622, - "num_input_tokens_seen": 282086610, - "step": 13077, - "time_per_iteration": 2.690382480621338 - }, - { - "auxiliary_loss_clip": 0.0110006, - "auxiliary_loss_mlp": 0.0103229, - "balance_loss_clip": 1.039482, - "balance_loss_mlp": 1.01980269, - "epoch": 0.786291898391703, - "flos": 33724248865920.0, - "grad_norm": 1.6714014639715435, - "language_loss": 0.70845038, - "learning_rate": 4.6015088625564956e-07, - "loss": 0.72977388, - "num_input_tokens_seen": 282107440, - "step": 13078, - "time_per_iteration": 3.024754524230957 - }, - { - "auxiliary_loss_clip": 0.01096328, - "auxiliary_loss_mlp": 0.01035627, - "balance_loss_clip": 1.03739369, - "balance_loss_mlp": 1.02363431, - "epoch": 0.786352021644371, - "flos": 25811476968960.0, - "grad_norm": 1.523123466356383, - "language_loss": 0.81217003, - "learning_rate": 4.599023863537039e-07, - "loss": 0.83348954, - "num_input_tokens_seen": 282127290, - "step": 13079, - "time_per_iteration": 2.6527066230773926 - }, - { - "auxiliary_loss_clip": 0.01078236, - "auxiliary_loss_mlp": 0.01032627, - "balance_loss_clip": 1.03953731, - "balance_loss_mlp": 1.0202589, - "epoch": 0.7864121448970389, - "flos": 28910818920960.0, - "grad_norm": 1.8147971205749318, - "language_loss": 0.68534672, - "learning_rate": 4.596539448524146e-07, - "loss": 0.70645535, - "num_input_tokens_seen": 282147505, - "step": 13080, - "time_per_iteration": 2.7910823822021484 - }, - { - "auxiliary_loss_clip": 0.01099002, - "auxiliary_loss_mlp": 0.01034583, - "balance_loss_clip": 1.03815937, - "balance_loss_mlp": 1.02227473, - "epoch": 0.7864722681497069, - "flos": 19208833735680.0, - "grad_norm": 1.6728405689924877, - "language_loss": 0.69698668, - "learning_rate": 4.594055617612016e-07, - "loss": 0.71832252, - "num_input_tokens_seen": 282166450, - "step": 13081, - "time_per_iteration": 2.676067590713501 - }, - { - "auxiliary_loss_clip": 0.01086253, - "auxiliary_loss_mlp": 0.01035065, - "balance_loss_clip": 1.03589058, - "balance_loss_mlp": 1.0229888, - "epoch": 0.7865323914023749, - "flos": 21871573873920.0, - "grad_norm": 1.8288911392242622, - "language_loss": 0.68142998, - "learning_rate": 4.591572370894838e-07, - "loss": 0.70264316, - "num_input_tokens_seen": 282186465, - "step": 13082, - "time_per_iteration": 2.671044111251831 - }, - { - "auxiliary_loss_clip": 0.01081636, - "auxiliary_loss_mlp": 0.01036406, - "balance_loss_clip": 1.03695893, - "balance_loss_mlp": 1.02418661, - "epoch": 0.7865925146550429, - "flos": 25520313323520.0, - "grad_norm": 1.7965603666617915, - "language_loss": 0.66121304, - "learning_rate": 4.589089708466789e-07, - "loss": 0.68239349, - "num_input_tokens_seen": 282207180, - "step": 13083, - "time_per_iteration": 2.77585506439209 - }, - { - "auxiliary_loss_clip": 0.01089696, - "auxiliary_loss_mlp": 0.01030934, - "balance_loss_clip": 1.03773546, - "balance_loss_mlp": 1.01740384, - "epoch": 0.7866526379077108, - "flos": 19097366855040.0, - "grad_norm": 2.0746759351122614, - "language_loss": 0.74140465, - "learning_rate": 4.5866076304220015e-07, - "loss": 0.76261097, - "num_input_tokens_seen": 282225865, - "step": 13084, - "time_per_iteration": 5.905508518218994 - }, - { - "auxiliary_loss_clip": 0.01083182, - "auxiliary_loss_mlp": 0.01037679, - "balance_loss_clip": 1.03682792, - "balance_loss_mlp": 1.02519202, - "epoch": 0.7867127611603788, - "flos": 16173771171840.0, - "grad_norm": 3.4926036147980635, - "language_loss": 0.70331782, - "learning_rate": 4.584126136854591e-07, - "loss": 0.72452641, - "num_input_tokens_seen": 282242895, - "step": 13085, - "time_per_iteration": 2.689375162124634 - }, - { - "auxiliary_loss_clip": 0.01086151, - "auxiliary_loss_mlp": 0.01030206, - "balance_loss_clip": 1.03600478, - "balance_loss_mlp": 1.01758742, - "epoch": 0.7867728844130467, - "flos": 20773640805120.0, - "grad_norm": 2.163841211360238, - "language_loss": 0.7244603, - "learning_rate": 4.5816452278586617e-07, - "loss": 0.74562383, - "num_input_tokens_seen": 282260425, - "step": 13086, - "time_per_iteration": 4.172788381576538 - }, - { - "auxiliary_loss_clip": 0.01108157, - "auxiliary_loss_mlp": 0.01027186, - "balance_loss_clip": 1.03651989, - "balance_loss_mlp": 1.01503301, - "epoch": 0.7868330076657147, - "flos": 21760106993280.0, - "grad_norm": 1.9419848626775902, - "language_loss": 0.74776971, - "learning_rate": 4.5791649035282965e-07, - "loss": 0.7691232, - "num_input_tokens_seen": 282279335, - "step": 13087, - "time_per_iteration": 2.695462465286255 - }, - { - "auxiliary_loss_clip": 0.01085975, - "auxiliary_loss_mlp": 0.01031976, - "balance_loss_clip": 1.03558397, - "balance_loss_mlp": 1.02015603, - "epoch": 0.7868931309183826, - "flos": 25700692446720.0, - "grad_norm": 1.58603589617711, - "language_loss": 0.71365935, - "learning_rate": 4.5766851639575456e-07, - "loss": 0.73483884, - "num_input_tokens_seen": 282299905, - "step": 13088, - "time_per_iteration": 2.781475782394409 - }, - { - "auxiliary_loss_clip": 0.01029395, - "auxiliary_loss_mlp": 0.01003015, - "balance_loss_clip": 1.0068965, - "balance_loss_mlp": 1.0020256, - "epoch": 0.7869532541710507, - "flos": 64644883430400.0, - "grad_norm": 1.2260501594651212, - "language_loss": 0.55467439, - "learning_rate": 4.574206009240431e-07, - "loss": 0.5749985, - "num_input_tokens_seen": 282367620, - "step": 13089, - "time_per_iteration": 3.24120831489563 - }, - { - "auxiliary_loss_clip": 0.01017655, - "auxiliary_loss_mlp": 0.01001728, - "balance_loss_clip": 1.00651848, - "balance_loss_mlp": 1.0007323, - "epoch": 0.7870133774237186, - "flos": 67453600440960.0, - "grad_norm": 0.7641579994840295, - "language_loss": 0.49973857, - "learning_rate": 4.571727439470976e-07, - "loss": 0.51993239, - "num_input_tokens_seen": 282435695, - "step": 13090, - "time_per_iteration": 4.754423379898071 - }, - { - "auxiliary_loss_clip": 0.01099139, - "auxiliary_loss_mlp": 0.01031069, - "balance_loss_clip": 1.0383184, - "balance_loss_mlp": 1.01955974, - "epoch": 0.7870735006763866, - "flos": 26068310190720.0, - "grad_norm": 1.460212524446196, - "language_loss": 0.8408305, - "learning_rate": 4.5692494547431583e-07, - "loss": 0.86213255, - "num_input_tokens_seen": 282456025, - "step": 13091, - "time_per_iteration": 2.6467459201812744 - }, - { - "auxiliary_loss_clip": 0.01019902, - "auxiliary_loss_mlp": 0.01003454, - "balance_loss_clip": 1.00713682, - "balance_loss_mlp": 1.00247598, - "epoch": 0.7871336239290546, - "flos": 70289572896000.0, - "grad_norm": 0.7147506558128559, - "language_loss": 0.64014363, - "learning_rate": 4.566772055150947e-07, - "loss": 0.6603772, - "num_input_tokens_seen": 282520995, - "step": 13092, - "time_per_iteration": 3.2051150798797607 - }, - { - "auxiliary_loss_clip": 0.01088327, - "auxiliary_loss_mlp": 0.01035505, - "balance_loss_clip": 1.03942823, - "balance_loss_mlp": 1.0227139, - "epoch": 0.7871937471817225, - "flos": 15778574760960.0, - "grad_norm": 2.3884379503568076, - "language_loss": 0.79189074, - "learning_rate": 4.564295240788285e-07, - "loss": 0.81312907, - "num_input_tokens_seen": 282539355, - "step": 13093, - "time_per_iteration": 2.7134079933166504 - }, - { - "auxiliary_loss_clip": 0.01080576, - "auxiliary_loss_mlp": 0.01028467, - "balance_loss_clip": 1.03772855, - "balance_loss_mlp": 1.01671863, - "epoch": 0.7872538704343905, - "flos": 20485242506880.0, - "grad_norm": 1.8523965571373735, - "language_loss": 0.75549555, - "learning_rate": 4.561819011749106e-07, - "loss": 0.77658594, - "num_input_tokens_seen": 282555735, - "step": 13094, - "time_per_iteration": 2.7055883407592773 - }, - { - "auxiliary_loss_clip": 0.01061535, - "auxiliary_loss_mlp": 0.01044464, - "balance_loss_clip": 1.03247035, - "balance_loss_mlp": 1.030725, - "epoch": 0.7873139936870585, - "flos": 25082670015360.0, - "grad_norm": 1.6047845480222185, - "language_loss": 0.79805398, - "learning_rate": 4.5593433681272884e-07, - "loss": 0.81911397, - "num_input_tokens_seen": 282574550, - "step": 13095, - "time_per_iteration": 2.819106340408325 - }, - { - "auxiliary_loss_clip": 0.01098697, - "auxiliary_loss_mlp": 0.01032788, - "balance_loss_clip": 1.03697014, - "balance_loss_mlp": 1.02055073, - "epoch": 0.7873741169397265, - "flos": 30883176679680.0, - "grad_norm": 1.6143252232165546, - "language_loss": 0.67820108, - "learning_rate": 4.556868310016715e-07, - "loss": 0.69951594, - "num_input_tokens_seen": 282596520, - "step": 13096, - "time_per_iteration": 2.6944971084594727 - }, - { - "auxiliary_loss_clip": 0.01082196, - "auxiliary_loss_mlp": 0.01027058, - "balance_loss_clip": 1.0342679, - "balance_loss_mlp": 1.01628733, - "epoch": 0.7874342401923944, - "flos": 46791962242560.0, - "grad_norm": 1.8327451146164324, - "language_loss": 0.7056793, - "learning_rate": 4.55439383751125e-07, - "loss": 0.72677183, - "num_input_tokens_seen": 282620560, - "step": 13097, - "time_per_iteration": 2.969263792037964 - }, - { - "auxiliary_loss_clip": 0.01092033, - "auxiliary_loss_mlp": 0.01034929, - "balance_loss_clip": 1.04004323, - "balance_loss_mlp": 1.0221442, - "epoch": 0.7874943634450624, - "flos": 23584548545280.0, - "grad_norm": 1.6158173512871257, - "language_loss": 0.80720508, - "learning_rate": 4.5519199507047126e-07, - "loss": 0.82847476, - "num_input_tokens_seen": 282639830, - "step": 13098, - "time_per_iteration": 2.7234272956848145 - }, - { - "auxiliary_loss_clip": 0.01069091, - "auxiliary_loss_mlp": 0.01031845, - "balance_loss_clip": 1.03451467, - "balance_loss_mlp": 1.02053809, - "epoch": 0.7875544866977303, - "flos": 20191169859840.0, - "grad_norm": 2.07716673704352, - "language_loss": 0.73976696, - "learning_rate": 4.5494466496909177e-07, - "loss": 0.76077634, - "num_input_tokens_seen": 282660130, - "step": 13099, - "time_per_iteration": 2.7741127014160156 - }, - { - "auxiliary_loss_clip": 0.01087499, - "auxiliary_loss_mlp": 0.01024045, - "balance_loss_clip": 1.03627956, - "balance_loss_mlp": 1.01170659, - "epoch": 0.7876146099503983, - "flos": 22602571557120.0, - "grad_norm": 1.5896108161315186, - "language_loss": 0.78226274, - "learning_rate": 4.5469739345636603e-07, - "loss": 0.80337822, - "num_input_tokens_seen": 282681125, - "step": 13100, - "time_per_iteration": 2.7259294986724854 - }, - { - "auxiliary_loss_clip": 0.01101593, - "auxiliary_loss_mlp": 0.00771735, - "balance_loss_clip": 1.03714919, - "balance_loss_mlp": 1.00031686, - "epoch": 0.7876747332030662, - "flos": 10705833555840.0, - "grad_norm": 3.3947108231001772, - "language_loss": 0.66015649, - "learning_rate": 4.5445018054167007e-07, - "loss": 0.67888987, - "num_input_tokens_seen": 282696690, - "step": 13101, - "time_per_iteration": 2.6262006759643555 - }, - { - "auxiliary_loss_clip": 0.01086168, - "auxiliary_loss_mlp": 0.01030979, - "balance_loss_clip": 1.03619587, - "balance_loss_mlp": 1.01895058, - "epoch": 0.7877348564557343, - "flos": 38399315621760.0, - "grad_norm": 1.4292814509281728, - "language_loss": 0.77840889, - "learning_rate": 4.5420302623437745e-07, - "loss": 0.79958034, - "num_input_tokens_seen": 282721210, - "step": 13102, - "time_per_iteration": 3.016707420349121 - }, - { - "auxiliary_loss_clip": 0.01096566, - "auxiliary_loss_mlp": 0.01040471, - "balance_loss_clip": 1.0358392, - "balance_loss_mlp": 1.02863932, - "epoch": 0.7877949797084022, - "flos": 18329524796160.0, - "grad_norm": 1.7485518464366943, - "language_loss": 0.82362533, - "learning_rate": 4.5395593054386093e-07, - "loss": 0.84499568, - "num_input_tokens_seen": 282738505, - "step": 13103, - "time_per_iteration": 2.6577935218811035 - }, - { - "auxiliary_loss_clip": 0.01101133, - "auxiliary_loss_mlp": 0.01033792, - "balance_loss_clip": 1.03859389, - "balance_loss_mlp": 1.02039886, - "epoch": 0.7878551029610702, - "flos": 25806736373760.0, - "grad_norm": 3.304808366824196, - "language_loss": 0.8070327, - "learning_rate": 4.537088934794913e-07, - "loss": 0.8283819, - "num_input_tokens_seen": 282756895, - "step": 13104, - "time_per_iteration": 2.680666923522949 - }, - { - "auxiliary_loss_clip": 0.01111584, - "auxiliary_loss_mlp": 0.01034583, - "balance_loss_clip": 1.03829467, - "balance_loss_mlp": 1.02250695, - "epoch": 0.7879152262137382, - "flos": 22342685679360.0, - "grad_norm": 1.6276257181376157, - "language_loss": 0.74308252, - "learning_rate": 4.5346191505063515e-07, - "loss": 0.76454425, - "num_input_tokens_seen": 282774955, - "step": 13105, - "time_per_iteration": 2.5943186283111572 - }, - { - "auxiliary_loss_clip": 0.0105328, - "auxiliary_loss_mlp": 0.0104138, - "balance_loss_clip": 1.03382134, - "balance_loss_mlp": 1.02832067, - "epoch": 0.7879753494664061, - "flos": 24785329230720.0, - "grad_norm": 1.561193248297936, - "language_loss": 0.75636542, - "learning_rate": 4.5321499526665776e-07, - "loss": 0.77731198, - "num_input_tokens_seen": 282793165, - "step": 13106, - "time_per_iteration": 2.8052754402160645 - }, - { - "auxiliary_loss_clip": 0.01060642, - "auxiliary_loss_mlp": 0.01033724, - "balance_loss_clip": 1.0368247, - "balance_loss_mlp": 1.02129078, - "epoch": 0.7880354727190741, - "flos": 16909078487040.0, - "grad_norm": 2.2640209986182116, - "language_loss": 0.73844689, - "learning_rate": 4.5296813413692337e-07, - "loss": 0.75939053, - "num_input_tokens_seen": 282809820, - "step": 13107, - "time_per_iteration": 2.7168357372283936 - }, - { - "auxiliary_loss_clip": 0.01109075, - "auxiliary_loss_mlp": 0.01034958, - "balance_loss_clip": 1.03867579, - "balance_loss_mlp": 1.02291143, - "epoch": 0.7880955959717421, - "flos": 22230500526720.0, - "grad_norm": 1.5353613262891537, - "language_loss": 0.73295653, - "learning_rate": 4.5272133167079165e-07, - "loss": 0.7543968, - "num_input_tokens_seen": 282828600, - "step": 13108, - "time_per_iteration": 2.6911845207214355 - }, - { - "auxiliary_loss_clip": 0.01029486, - "auxiliary_loss_mlp": 0.00999387, - "balance_loss_clip": 1.00682902, - "balance_loss_mlp": 0.99848729, - "epoch": 0.7881557192244101, - "flos": 69183200131200.0, - "grad_norm": 0.890062717819184, - "language_loss": 0.60359526, - "learning_rate": 4.5247458787762216e-07, - "loss": 0.62388396, - "num_input_tokens_seen": 282882775, - "step": 13109, - "time_per_iteration": 3.113757610321045 - }, - { - "auxiliary_loss_clip": 0.01067084, - "auxiliary_loss_mlp": 0.010294, - "balance_loss_clip": 1.03637147, - "balance_loss_mlp": 1.01732993, - "epoch": 0.788215842477078, - "flos": 24935436167040.0, - "grad_norm": 1.6561185443626747, - "language_loss": 0.72235435, - "learning_rate": 4.5222790276677126e-07, - "loss": 0.74331915, - "num_input_tokens_seen": 282902680, - "step": 13110, - "time_per_iteration": 2.7759180068969727 - }, - { - "auxiliary_loss_clip": 0.01056492, - "auxiliary_loss_mlp": 0.01030136, - "balance_loss_clip": 1.03376198, - "balance_loss_mlp": 1.01843548, - "epoch": 0.788275965729746, - "flos": 26106483369600.0, - "grad_norm": 1.3819740231055346, - "language_loss": 0.75173604, - "learning_rate": 4.5198127634759455e-07, - "loss": 0.77260238, - "num_input_tokens_seen": 282923625, - "step": 13111, - "time_per_iteration": 2.840644121170044 - }, - { - "auxiliary_loss_clip": 0.01094246, - "auxiliary_loss_mlp": 0.01035923, - "balance_loss_clip": 1.03667474, - "balance_loss_mlp": 1.02317989, - "epoch": 0.7883360889824139, - "flos": 21214803646080.0, - "grad_norm": 2.288432261799451, - "language_loss": 0.61037534, - "learning_rate": 4.5173470862944206e-07, - "loss": 0.63167697, - "num_input_tokens_seen": 282941955, - "step": 13112, - "time_per_iteration": 2.673748016357422 - }, - { - "auxiliary_loss_clip": 0.01089796, - "auxiliary_loss_mlp": 0.0103157, - "balance_loss_clip": 1.03910899, - "balance_loss_mlp": 1.01814699, - "epoch": 0.7883962122350819, - "flos": 21142551438720.0, - "grad_norm": 1.825503520806994, - "language_loss": 0.67753619, - "learning_rate": 4.514881996216644e-07, - "loss": 0.69874984, - "num_input_tokens_seen": 282961280, - "step": 13113, - "time_per_iteration": 2.6813149452209473 - }, - { - "auxiliary_loss_clip": 0.01069296, - "auxiliary_loss_mlp": 0.01035093, - "balance_loss_clip": 1.0344131, - "balance_loss_mlp": 1.02270675, - "epoch": 0.7884563354877498, - "flos": 15302901928320.0, - "grad_norm": 3.4397675156813867, - "language_loss": 0.5793888, - "learning_rate": 4.5124174933361e-07, - "loss": 0.60043263, - "num_input_tokens_seen": 282978210, - "step": 13114, - "time_per_iteration": 2.7150933742523193 - }, - { - "auxiliary_loss_clip": 0.01062606, - "auxiliary_loss_mlp": 0.01031989, - "balance_loss_clip": 1.03754115, - "balance_loss_mlp": 1.01891208, - "epoch": 0.7885164587404179, - "flos": 24388301226240.0, - "grad_norm": 1.5845799743186602, - "language_loss": 0.67243695, - "learning_rate": 4.5099535777462306e-07, - "loss": 0.69338286, - "num_input_tokens_seen": 282998845, - "step": 13115, - "time_per_iteration": 2.80094575881958 - }, - { - "auxiliary_loss_clip": 0.01083933, - "auxiliary_loss_mlp": 0.01040208, - "balance_loss_clip": 1.03556573, - "balance_loss_mlp": 1.02654052, - "epoch": 0.7885765819930858, - "flos": 14385886686720.0, - "grad_norm": 2.573676201829806, - "language_loss": 0.88785017, - "learning_rate": 4.50749024954048e-07, - "loss": 0.90909165, - "num_input_tokens_seen": 283015200, - "step": 13116, - "time_per_iteration": 2.8118736743927 - }, - { - "auxiliary_loss_clip": 0.01093449, - "auxiliary_loss_mlp": 0.01033859, - "balance_loss_clip": 1.03728342, - "balance_loss_mlp": 1.02034712, - "epoch": 0.7886367052457538, - "flos": 18259930195200.0, - "grad_norm": 2.1380250897449384, - "language_loss": 0.72576118, - "learning_rate": 4.505027508812245e-07, - "loss": 0.74703431, - "num_input_tokens_seen": 283033680, - "step": 13117, - "time_per_iteration": 2.782005786895752 - }, - { - "auxiliary_loss_clip": 0.01096232, - "auxiliary_loss_mlp": 0.01027109, - "balance_loss_clip": 1.03812051, - "balance_loss_mlp": 1.01586211, - "epoch": 0.7886968284984217, - "flos": 15305092657920.0, - "grad_norm": 1.6421996108060435, - "language_loss": 0.79999858, - "learning_rate": 4.502565355654926e-07, - "loss": 0.82123202, - "num_input_tokens_seen": 283050620, - "step": 13118, - "time_per_iteration": 2.678349256515503 - }, - { - "auxiliary_loss_clip": 0.01097412, - "auxiliary_loss_mlp": 0.01028112, - "balance_loss_clip": 1.03808641, - "balance_loss_mlp": 1.01605964, - "epoch": 0.7887569517510897, - "flos": 21215450090880.0, - "grad_norm": 1.6890691063161838, - "language_loss": 0.72958535, - "learning_rate": 4.500103790161878e-07, - "loss": 0.75084054, - "num_input_tokens_seen": 283070215, - "step": 13119, - "time_per_iteration": 2.7472004890441895 - }, - { - "auxiliary_loss_clip": 0.01095693, - "auxiliary_loss_mlp": 0.01028074, - "balance_loss_clip": 1.03482223, - "balance_loss_mlp": 1.01517558, - "epoch": 0.7888170750037578, - "flos": 22711237176960.0, - "grad_norm": 3.3903571989834584, - "language_loss": 0.71983945, - "learning_rate": 4.4976428124264454e-07, - "loss": 0.74107713, - "num_input_tokens_seen": 283091485, - "step": 13120, - "time_per_iteration": 2.82316517829895 - }, - { - "auxiliary_loss_clip": 0.01081982, - "auxiliary_loss_mlp": 0.007726, - "balance_loss_clip": 1.03474998, - "balance_loss_mlp": 1.00026715, - "epoch": 0.7888771982564257, - "flos": 36429148592640.0, - "grad_norm": 1.5160777676600576, - "language_loss": 0.79098976, - "learning_rate": 4.4951824225419564e-07, - "loss": 0.80953562, - "num_input_tokens_seen": 283115040, - "step": 13121, - "time_per_iteration": 2.8498284816741943 - }, - { - "auxiliary_loss_clip": 0.01095183, - "auxiliary_loss_mlp": 0.01030106, - "balance_loss_clip": 1.0355587, - "balance_loss_mlp": 1.01765466, - "epoch": 0.7889373215090937, - "flos": 27309993488640.0, - "grad_norm": 1.3811288834626105, - "language_loss": 0.80475199, - "learning_rate": 4.4927226206017057e-07, - "loss": 0.82600486, - "num_input_tokens_seen": 283136925, - "step": 13122, - "time_per_iteration": 2.667525053024292 - }, - { - "auxiliary_loss_clip": 0.01081111, - "auxiliary_loss_mlp": 0.01026345, - "balance_loss_clip": 1.03613377, - "balance_loss_mlp": 1.01491308, - "epoch": 0.7889974447617616, - "flos": 19829010983040.0, - "grad_norm": 1.947347999480454, - "language_loss": 0.78504455, - "learning_rate": 4.4902634066989597e-07, - "loss": 0.8061192, - "num_input_tokens_seen": 283155725, - "step": 13123, - "time_per_iteration": 5.875938653945923 - }, - { - "auxiliary_loss_clip": 0.0109205, - "auxiliary_loss_mlp": 0.01034554, - "balance_loss_clip": 1.04389477, - "balance_loss_mlp": 1.02196002, - "epoch": 0.7890575680144296, - "flos": 17271201450240.0, - "grad_norm": 1.9573332964647796, - "language_loss": 0.67213017, - "learning_rate": 4.487804780926985e-07, - "loss": 0.69339627, - "num_input_tokens_seen": 283173845, - "step": 13124, - "time_per_iteration": 4.206716775894165 - }, - { - "auxiliary_loss_clip": 0.01087652, - "auxiliary_loss_mlp": 0.01025366, - "balance_loss_clip": 1.03578448, - "balance_loss_mlp": 1.01191306, - "epoch": 0.7891176912670975, - "flos": 27600151553280.0, - "grad_norm": 2.308329967659437, - "language_loss": 0.72559512, - "learning_rate": 4.4853467433790036e-07, - "loss": 0.74672532, - "num_input_tokens_seen": 283191985, - "step": 13125, - "time_per_iteration": 2.7699477672576904 - }, - { - "auxiliary_loss_clip": 0.01092333, - "auxiliary_loss_mlp": 0.01028843, - "balance_loss_clip": 1.03605413, - "balance_loss_mlp": 1.01586151, - "epoch": 0.7891778145197655, - "flos": 22711668140160.0, - "grad_norm": 1.8181427406883512, - "language_loss": 0.72330505, - "learning_rate": 4.4828892941482267e-07, - "loss": 0.74451685, - "num_input_tokens_seen": 283210855, - "step": 13126, - "time_per_iteration": 2.799743413925171 - }, - { - "auxiliary_loss_clip": 0.01091919, - "auxiliary_loss_mlp": 0.01031202, - "balance_loss_clip": 1.03676748, - "balance_loss_mlp": 1.01820195, - "epoch": 0.7892379377724335, - "flos": 17310775259520.0, - "grad_norm": 1.9171689494151543, - "language_loss": 0.76746297, - "learning_rate": 4.480432433327845e-07, - "loss": 0.78869414, - "num_input_tokens_seen": 283229665, - "step": 13127, - "time_per_iteration": 2.6769402027130127 - }, - { - "auxiliary_loss_clip": 0.0109264, - "auxiliary_loss_mlp": 0.01040923, - "balance_loss_clip": 1.03622723, - "balance_loss_mlp": 1.02709436, - "epoch": 0.7892980610251015, - "flos": 25775674087680.0, - "grad_norm": 1.6866494650381205, - "language_loss": 0.85712594, - "learning_rate": 4.47797616101103e-07, - "loss": 0.87846154, - "num_input_tokens_seen": 283248615, - "step": 13128, - "time_per_iteration": 2.6580183506011963 - }, - { - "auxiliary_loss_clip": 0.0109824, - "auxiliary_loss_mlp": 0.01037637, - "balance_loss_clip": 1.03702545, - "balance_loss_mlp": 1.02604949, - "epoch": 0.7893581842777694, - "flos": 21579943351680.0, - "grad_norm": 2.375306290130731, - "language_loss": 0.69267899, - "learning_rate": 4.475520477290904e-07, - "loss": 0.71403778, - "num_input_tokens_seen": 283267135, - "step": 13129, - "time_per_iteration": 2.736177682876587 - }, - { - "auxiliary_loss_clip": 0.01020095, - "auxiliary_loss_mlp": 0.01001956, - "balance_loss_clip": 1.00642443, - "balance_loss_mlp": 1.00062704, - "epoch": 0.7894183075304374, - "flos": 69016468176000.0, - "grad_norm": 0.7134870194246187, - "language_loss": 0.61555952, - "learning_rate": 4.473065382260597e-07, - "loss": 0.63578004, - "num_input_tokens_seen": 283328940, - "step": 13130, - "time_per_iteration": 4.797807216644287 - }, - { - "auxiliary_loss_clip": 0.0110005, - "auxiliary_loss_mlp": 0.01028381, - "balance_loss_clip": 1.03902447, - "balance_loss_mlp": 1.01690102, - "epoch": 0.7894784307831053, - "flos": 24243258107520.0, - "grad_norm": 1.9168458838285078, - "language_loss": 0.73797166, - "learning_rate": 4.4706108760132124e-07, - "loss": 0.75925595, - "num_input_tokens_seen": 283350000, - "step": 13131, - "time_per_iteration": 2.7573840618133545 - }, - { - "auxiliary_loss_clip": 0.01088103, - "auxiliary_loss_mlp": 0.01026242, - "balance_loss_clip": 1.0371995, - "balance_loss_mlp": 1.01223469, - "epoch": 0.7895385540357733, - "flos": 20266546550400.0, - "grad_norm": 2.4133377950586676, - "language_loss": 0.68751633, - "learning_rate": 4.4681569586418153e-07, - "loss": 0.70865989, - "num_input_tokens_seen": 283368020, - "step": 13132, - "time_per_iteration": 2.719820499420166 - }, - { - "auxiliary_loss_clip": 0.01101541, - "auxiliary_loss_mlp": 0.01040122, - "balance_loss_clip": 1.03842628, - "balance_loss_mlp": 1.02676463, - "epoch": 0.7895986772884414, - "flos": 20996574566400.0, - "grad_norm": 2.9264754072085104, - "language_loss": 0.62335461, - "learning_rate": 4.465703630239468e-07, - "loss": 0.64477122, - "num_input_tokens_seen": 283387030, - "step": 13133, - "time_per_iteration": 2.6314589977264404 - }, - { - "auxiliary_loss_clip": 0.01079478, - "auxiliary_loss_mlp": 0.01037851, - "balance_loss_clip": 1.03612971, - "balance_loss_mlp": 1.02386165, - "epoch": 0.7896588005411093, - "flos": 18657999694080.0, - "grad_norm": 3.367198830526819, - "language_loss": 0.7950719, - "learning_rate": 4.463250890899195e-07, - "loss": 0.8162452, - "num_input_tokens_seen": 283402090, - "step": 13134, - "time_per_iteration": 2.7504961490631104 - }, - { - "auxiliary_loss_clip": 0.0109746, - "auxiliary_loss_mlp": 0.01032763, - "balance_loss_clip": 1.03501463, - "balance_loss_mlp": 1.02011466, - "epoch": 0.7897189237937773, - "flos": 18405907067520.0, - "grad_norm": 1.8328144041845063, - "language_loss": 0.80414212, - "learning_rate": 4.460798740713998e-07, - "loss": 0.82544434, - "num_input_tokens_seen": 283421035, - "step": 13135, - "time_per_iteration": 2.666182518005371 - }, - { - "auxiliary_loss_clip": 0.01097147, - "auxiliary_loss_mlp": 0.0103152, - "balance_loss_clip": 1.0373044, - "balance_loss_mlp": 1.01890731, - "epoch": 0.7897790470464452, - "flos": 23731602825600.0, - "grad_norm": 1.9348385982052458, - "language_loss": 0.72716129, - "learning_rate": 4.4583471797768733e-07, - "loss": 0.7484479, - "num_input_tokens_seen": 283441830, - "step": 13136, - "time_per_iteration": 2.643087387084961 - }, - { - "auxiliary_loss_clip": 0.01115705, - "auxiliary_loss_mlp": 0.01034132, - "balance_loss_clip": 1.03773975, - "balance_loss_mlp": 1.02081013, - "epoch": 0.7898391702991132, - "flos": 15918949111680.0, - "grad_norm": 5.084496642242111, - "language_loss": 0.70505196, - "learning_rate": 4.455896208180778e-07, - "loss": 0.72655034, - "num_input_tokens_seen": 283459540, - "step": 13137, - "time_per_iteration": 2.584527015686035 - }, - { - "auxiliary_loss_clip": 0.01108112, - "auxiliary_loss_mlp": 0.01035486, - "balance_loss_clip": 1.03718948, - "balance_loss_mlp": 1.02206349, - "epoch": 0.7898992935517811, - "flos": 19829046896640.0, - "grad_norm": 1.7127744511556113, - "language_loss": 0.73933578, - "learning_rate": 4.4534458260186645e-07, - "loss": 0.76077175, - "num_input_tokens_seen": 283478790, - "step": 13138, - "time_per_iteration": 2.7276523113250732 - }, - { - "auxiliary_loss_clip": 0.01070823, - "auxiliary_loss_mlp": 0.01031914, - "balance_loss_clip": 1.03749275, - "balance_loss_mlp": 1.01971924, - "epoch": 0.7899594168044491, - "flos": 16216253982720.0, - "grad_norm": 1.9590714056368506, - "language_loss": 0.68501168, - "learning_rate": 4.4509960333834426e-07, - "loss": 0.70603907, - "num_input_tokens_seen": 283495720, - "step": 13139, - "time_per_iteration": 2.7639269828796387 - }, - { - "auxiliary_loss_clip": 0.01021477, - "auxiliary_loss_mlp": 0.01001215, - "balance_loss_clip": 1.00810361, - "balance_loss_mlp": 1.00014842, - "epoch": 0.790019540057117, - "flos": 68331005959680.0, - "grad_norm": 0.8505295432368817, - "language_loss": 0.60203749, - "learning_rate": 4.448546830368003e-07, - "loss": 0.62226439, - "num_input_tokens_seen": 283558795, - "step": 13140, - "time_per_iteration": 3.293804168701172 - }, - { - "auxiliary_loss_clip": 0.01111705, - "auxiliary_loss_mlp": 0.01036907, - "balance_loss_clip": 1.03908968, - "balance_loss_mlp": 1.02385402, - "epoch": 0.7900796633097851, - "flos": 30332773601280.0, - "grad_norm": 1.6223884699668718, - "language_loss": 0.76106548, - "learning_rate": 4.4460982170652304e-07, - "loss": 0.78255159, - "num_input_tokens_seen": 283579305, - "step": 13141, - "time_per_iteration": 2.753269672393799 - }, - { - "auxiliary_loss_clip": 0.01101932, - "auxiliary_loss_mlp": 0.01036808, - "balance_loss_clip": 1.03863978, - "balance_loss_mlp": 1.02401733, - "epoch": 0.790139786562453, - "flos": 22126790983680.0, - "grad_norm": 2.0698981191981978, - "language_loss": 0.68995577, - "learning_rate": 4.4436501935679694e-07, - "loss": 0.71134317, - "num_input_tokens_seen": 283597840, - "step": 13142, - "time_per_iteration": 2.682314872741699 - }, - { - "auxiliary_loss_clip": 0.00984677, - "auxiliary_loss_mlp": 0.01013212, - "balance_loss_clip": 1.01147008, - "balance_loss_mlp": 1.01161504, - "epoch": 0.790199909815121, - "flos": 58207284213120.0, - "grad_norm": 0.8339263340003221, - "language_loss": 0.59981745, - "learning_rate": 4.441202759969049e-07, - "loss": 0.61979634, - "num_input_tokens_seen": 283647950, - "step": 13143, - "time_per_iteration": 3.278980255126953 - }, - { - "auxiliary_loss_clip": 0.01082841, - "auxiliary_loss_mlp": 0.01035062, - "balance_loss_clip": 1.03883827, - "balance_loss_mlp": 1.02172852, - "epoch": 0.7902600330677889, - "flos": 34533316759680.0, - "grad_norm": 1.6349862086854898, - "language_loss": 0.74675769, - "learning_rate": 4.4387559163612875e-07, - "loss": 0.76793671, - "num_input_tokens_seen": 283670645, - "step": 13144, - "time_per_iteration": 3.294663429260254 - }, - { - "auxiliary_loss_clip": 0.01103742, - "auxiliary_loss_mlp": 0.01036273, - "balance_loss_clip": 1.03867149, - "balance_loss_mlp": 1.02252793, - "epoch": 0.7903201563204569, - "flos": 22346384780160.0, - "grad_norm": 2.139554645223281, - "language_loss": 0.82848895, - "learning_rate": 4.4363096628374605e-07, - "loss": 0.84988904, - "num_input_tokens_seen": 283688830, - "step": 13145, - "time_per_iteration": 2.7851741313934326 - }, - { - "auxiliary_loss_clip": 0.01095507, - "auxiliary_loss_mlp": 0.01030248, - "balance_loss_clip": 1.0367043, - "balance_loss_mlp": 1.01874435, - "epoch": 0.790380279573125, - "flos": 22053533195520.0, - "grad_norm": 1.5468904439953068, - "language_loss": 0.73388755, - "learning_rate": 4.4338639994903235e-07, - "loss": 0.75514507, - "num_input_tokens_seen": 283708625, - "step": 13146, - "time_per_iteration": 2.65710186958313 - }, - { - "auxiliary_loss_clip": 0.01111662, - "auxiliary_loss_mlp": 0.01028915, - "balance_loss_clip": 1.03781211, - "balance_loss_mlp": 1.01685667, - "epoch": 0.7904404028257929, - "flos": 20302600826880.0, - "grad_norm": 1.8467569642796249, - "language_loss": 0.75617737, - "learning_rate": 4.4314189264126246e-07, - "loss": 0.77758318, - "num_input_tokens_seen": 283725710, - "step": 13147, - "time_per_iteration": 2.7460520267486572 - }, - { - "auxiliary_loss_clip": 0.01091922, - "auxiliary_loss_mlp": 0.0103933, - "balance_loss_clip": 1.03564286, - "balance_loss_mlp": 1.02576411, - "epoch": 0.7905005260784609, - "flos": 20008923229440.0, - "grad_norm": 1.7581550780117867, - "language_loss": 0.72203916, - "learning_rate": 4.428974443697087e-07, - "loss": 0.7433517, - "num_input_tokens_seen": 283744150, - "step": 13148, - "time_per_iteration": 2.6912500858306885 - }, - { - "auxiliary_loss_clip": 0.01095913, - "auxiliary_loss_mlp": 0.01030776, - "balance_loss_clip": 1.03445816, - "balance_loss_mlp": 1.01777613, - "epoch": 0.7905606493311288, - "flos": 26905926418560.0, - "grad_norm": 1.814389925772028, - "language_loss": 0.71692038, - "learning_rate": 4.4265305514363913e-07, - "loss": 0.73818725, - "num_input_tokens_seen": 283764170, - "step": 13149, - "time_per_iteration": 2.800591230392456 - }, - { - "auxiliary_loss_clip": 0.01074802, - "auxiliary_loss_mlp": 0.01034207, - "balance_loss_clip": 1.03384662, - "balance_loss_mlp": 1.02023542, - "epoch": 0.7906207725837968, - "flos": 23696230907520.0, - "grad_norm": 2.263262344883557, - "language_loss": 0.65186799, - "learning_rate": 4.424087249723225e-07, - "loss": 0.67295814, - "num_input_tokens_seen": 283784305, - "step": 13150, - "time_per_iteration": 2.774513006210327 - }, - { - "auxiliary_loss_clip": 0.01108732, - "auxiliary_loss_mlp": 0.01033548, - "balance_loss_clip": 1.03688979, - "balance_loss_mlp": 1.02138877, - "epoch": 0.7906808958364647, - "flos": 20848837927680.0, - "grad_norm": 2.4892944447292065, - "language_loss": 0.70353788, - "learning_rate": 4.421644538650231e-07, - "loss": 0.72496063, - "num_input_tokens_seen": 283804040, - "step": 13151, - "time_per_iteration": 2.624737024307251 - }, - { - "auxiliary_loss_clip": 0.01091472, - "auxiliary_loss_mlp": 0.0103796, - "balance_loss_clip": 1.03773379, - "balance_loss_mlp": 1.02501988, - "epoch": 0.7907410190891327, - "flos": 40735196974080.0, - "grad_norm": 1.643411919564688, - "language_loss": 0.70038378, - "learning_rate": 4.4192024183100306e-07, - "loss": 0.72167814, - "num_input_tokens_seen": 283827120, - "step": 13152, - "time_per_iteration": 2.820726156234741 - }, - { - "auxiliary_loss_clip": 0.01076957, - "auxiliary_loss_mlp": 0.00770237, - "balance_loss_clip": 1.03583848, - "balance_loss_mlp": 1.00032854, - "epoch": 0.7908011423418007, - "flos": 13261165050240.0, - "grad_norm": 2.5235845787272972, - "language_loss": 0.72838122, - "learning_rate": 4.4167608887952367e-07, - "loss": 0.74685311, - "num_input_tokens_seen": 283844820, - "step": 13153, - "time_per_iteration": 2.782799005508423 - }, - { - "auxiliary_loss_clip": 0.01109362, - "auxiliary_loss_mlp": 0.01027556, - "balance_loss_clip": 1.0372107, - "balance_loss_mlp": 1.01542032, - "epoch": 0.7908612655944687, - "flos": 19754747614080.0, - "grad_norm": 1.5411567451067254, - "language_loss": 0.78878421, - "learning_rate": 4.4143199501984306e-07, - "loss": 0.81015342, - "num_input_tokens_seen": 283862870, - "step": 13154, - "time_per_iteration": 2.617465019226074 - }, - { - "auxiliary_loss_clip": 0.01106383, - "auxiliary_loss_mlp": 0.01030654, - "balance_loss_clip": 1.03864491, - "balance_loss_mlp": 1.0168618, - "epoch": 0.7909213888471366, - "flos": 21287738211840.0, - "grad_norm": 2.826426218857978, - "language_loss": 0.7024678, - "learning_rate": 4.411879602612185e-07, - "loss": 0.72383815, - "num_input_tokens_seen": 283882405, - "step": 13155, - "time_per_iteration": 2.60141658782959 - }, - { - "auxiliary_loss_clip": 0.01110954, - "auxiliary_loss_mlp": 0.01030043, - "balance_loss_clip": 1.03789937, - "balance_loss_mlp": 1.01748431, - "epoch": 0.7909815120998046, - "flos": 22528882805760.0, - "grad_norm": 1.6493957316701613, - "language_loss": 0.76920623, - "learning_rate": 4.4094398461290174e-07, - "loss": 0.79061615, - "num_input_tokens_seen": 283902070, - "step": 13156, - "time_per_iteration": 2.616990327835083 - }, - { - "auxiliary_loss_clip": 0.01077807, - "auxiliary_loss_mlp": 0.01032296, - "balance_loss_clip": 1.03416896, - "balance_loss_mlp": 1.02008295, - "epoch": 0.7910416353524725, - "flos": 26727702111360.0, - "grad_norm": 1.6152194898453356, - "language_loss": 0.65486753, - "learning_rate": 4.4070006808414526e-07, - "loss": 0.67596853, - "num_input_tokens_seen": 283924100, - "step": 13157, - "time_per_iteration": 2.7800040245056152 - }, - { - "auxiliary_loss_clip": 0.01098205, - "auxiliary_loss_mlp": 0.01038516, - "balance_loss_clip": 1.03559875, - "balance_loss_mlp": 1.02468824, - "epoch": 0.7911017586051405, - "flos": 24644847139200.0, - "grad_norm": 1.6816257658835039, - "language_loss": 0.74068034, - "learning_rate": 4.4045621068419894e-07, - "loss": 0.76204759, - "num_input_tokens_seen": 283944955, - "step": 13158, - "time_per_iteration": 2.6075475215911865 - }, - { - "auxiliary_loss_clip": 0.01095673, - "auxiliary_loss_mlp": 0.01033389, - "balance_loss_clip": 1.0357399, - "balance_loss_mlp": 1.02176023, - "epoch": 0.7911618818578086, - "flos": 17565489578880.0, - "grad_norm": 2.030035460018427, - "language_loss": 0.67612302, - "learning_rate": 4.40212412422309e-07, - "loss": 0.69741368, - "num_input_tokens_seen": 283963125, - "step": 13159, - "time_per_iteration": 2.6242077350616455 - }, - { - "auxiliary_loss_clip": 0.01098583, - "auxiliary_loss_mlp": 0.01035004, - "balance_loss_clip": 1.03775477, - "balance_loss_mlp": 1.02250552, - "epoch": 0.7912220051104765, - "flos": 16721660298240.0, - "grad_norm": 3.313195141465383, - "language_loss": 0.67271805, - "learning_rate": 4.399686733077206e-07, - "loss": 0.69405401, - "num_input_tokens_seen": 283982850, - "step": 13160, - "time_per_iteration": 2.75685715675354 - }, - { - "auxiliary_loss_clip": 0.0108344, - "auxiliary_loss_mlp": 0.01027351, - "balance_loss_clip": 1.03476191, - "balance_loss_mlp": 1.01664615, - "epoch": 0.7912821283631445, - "flos": 13698736531200.0, - "grad_norm": 2.063884011157957, - "language_loss": 0.72593331, - "learning_rate": 4.3972499334967694e-07, - "loss": 0.74704123, - "num_input_tokens_seen": 283998275, - "step": 13161, - "time_per_iteration": 2.6084799766540527 - }, - { - "auxiliary_loss_clip": 0.01080502, - "auxiliary_loss_mlp": 0.01033054, - "balance_loss_clip": 1.03568757, - "balance_loss_mlp": 1.02046573, - "epoch": 0.7913422516158124, - "flos": 23769021818880.0, - "grad_norm": 1.6120052582411066, - "language_loss": 0.73379862, - "learning_rate": 4.39481372557418e-07, - "loss": 0.75493419, - "num_input_tokens_seen": 284018750, - "step": 13162, - "time_per_iteration": 6.126726865768433 - }, - { - "auxiliary_loss_clip": 0.01089834, - "auxiliary_loss_mlp": 0.01032531, - "balance_loss_clip": 1.03729248, - "balance_loss_mlp": 1.01965666, - "epoch": 0.7914023748684804, - "flos": 19938251220480.0, - "grad_norm": 1.9889723389835698, - "language_loss": 0.71760178, - "learning_rate": 4.392378109401811e-07, - "loss": 0.73882544, - "num_input_tokens_seen": 284037850, - "step": 13163, - "time_per_iteration": 4.413632869720459 - }, - { - "auxiliary_loss_clip": 0.01075124, - "auxiliary_loss_mlp": 0.01031765, - "balance_loss_clip": 1.03465581, - "balance_loss_mlp": 1.01800179, - "epoch": 0.7914624981211483, - "flos": 20594805966720.0, - "grad_norm": 1.8803473960616024, - "language_loss": 0.70246696, - "learning_rate": 4.3899430850720296e-07, - "loss": 0.72353578, - "num_input_tokens_seen": 284056380, - "step": 13164, - "time_per_iteration": 2.698758840560913 - }, - { - "auxiliary_loss_clip": 0.01070741, - "auxiliary_loss_mlp": 0.01037319, - "balance_loss_clip": 1.0364182, - "balance_loss_mlp": 1.02521276, - "epoch": 0.7915226213738163, - "flos": 21799465320960.0, - "grad_norm": 1.885675562841956, - "language_loss": 0.67027831, - "learning_rate": 4.387508652677177e-07, - "loss": 0.69135886, - "num_input_tokens_seen": 284074945, - "step": 13165, - "time_per_iteration": 2.74423885345459 - }, - { - "auxiliary_loss_clip": 0.01062193, - "auxiliary_loss_mlp": 0.01027491, - "balance_loss_clip": 1.0360235, - "balance_loss_mlp": 1.0160346, - "epoch": 0.7915827446264843, - "flos": 16288362535680.0, - "grad_norm": 2.5652044967821563, - "language_loss": 0.72134489, - "learning_rate": 4.385074812309557e-07, - "loss": 0.74224174, - "num_input_tokens_seen": 284092070, - "step": 13166, - "time_per_iteration": 2.74450421333313 - }, - { - "auxiliary_loss_clip": 0.01107168, - "auxiliary_loss_mlp": 0.01033777, - "balance_loss_clip": 1.03592849, - "balance_loss_mlp": 1.02065766, - "epoch": 0.7916428678791523, - "flos": 25702595867520.0, - "grad_norm": 1.6924622649146908, - "language_loss": 0.77245665, - "learning_rate": 4.382641564061462e-07, - "loss": 0.79386616, - "num_input_tokens_seen": 284112255, - "step": 13167, - "time_per_iteration": 2.6304922103881836 - }, - { - "auxiliary_loss_clip": 0.01074373, - "auxiliary_loss_mlp": 0.01032393, - "balance_loss_clip": 1.03654242, - "balance_loss_mlp": 1.02080607, - "epoch": 0.7917029911318202, - "flos": 23878513451520.0, - "grad_norm": 1.5572430197509217, - "language_loss": 0.8423599, - "learning_rate": 4.3802089080251713e-07, - "loss": 0.86342752, - "num_input_tokens_seen": 284132330, - "step": 13168, - "time_per_iteration": 2.7429237365722656 - }, - { - "auxiliary_loss_clip": 0.011112, - "auxiliary_loss_mlp": 0.01031479, - "balance_loss_clip": 1.03944874, - "balance_loss_mlp": 1.01902127, - "epoch": 0.7917631143844882, - "flos": 21646593037440.0, - "grad_norm": 1.5464810747568485, - "language_loss": 0.72668618, - "learning_rate": 4.3777768442929155e-07, - "loss": 0.74811298, - "num_input_tokens_seen": 284150640, - "step": 13169, - "time_per_iteration": 2.6592273712158203 - }, - { - "auxiliary_loss_clip": 0.01112278, - "auxiliary_loss_mlp": 0.01034078, - "balance_loss_clip": 1.03776097, - "balance_loss_mlp": 1.02096534, - "epoch": 0.7918232376371561, - "flos": 38874198355200.0, - "grad_norm": 3.0164907954915856, - "language_loss": 0.67173648, - "learning_rate": 4.3753453729569287e-07, - "loss": 0.69320005, - "num_input_tokens_seen": 284171910, - "step": 13170, - "time_per_iteration": 4.270065546035767 - }, - { - "auxiliary_loss_clip": 0.01098461, - "auxiliary_loss_mlp": 0.01026319, - "balance_loss_clip": 1.03575373, - "balance_loss_mlp": 1.01436245, - "epoch": 0.7918833608898241, - "flos": 20775544225920.0, - "grad_norm": 1.6549225426524543, - "language_loss": 0.70591486, - "learning_rate": 4.372914494109412e-07, - "loss": 0.72716266, - "num_input_tokens_seen": 284191340, - "step": 13171, - "time_per_iteration": 2.6470091342926025 - }, - { - "auxiliary_loss_clip": 0.01097608, - "auxiliary_loss_mlp": 0.01030463, - "balance_loss_clip": 1.03621912, - "balance_loss_mlp": 1.01798749, - "epoch": 0.7919434841424922, - "flos": 33910122769920.0, - "grad_norm": 3.018313579930399, - "language_loss": 0.67142022, - "learning_rate": 4.370484207842553e-07, - "loss": 0.69270092, - "num_input_tokens_seen": 284212495, - "step": 13172, - "time_per_iteration": 2.7242603302001953 - }, - { - "auxiliary_loss_clip": 0.01083539, - "auxiliary_loss_mlp": 0.01033715, - "balance_loss_clip": 1.03492141, - "balance_loss_mlp": 1.02068591, - "epoch": 0.7920036073951601, - "flos": 21064660796160.0, - "grad_norm": 2.177677653156343, - "language_loss": 0.79725873, - "learning_rate": 4.3680545142484893e-07, - "loss": 0.81843126, - "num_input_tokens_seen": 284230825, - "step": 13173, - "time_per_iteration": 2.6997551918029785 - }, - { - "auxiliary_loss_clip": 0.01071714, - "auxiliary_loss_mlp": 0.01038162, - "balance_loss_clip": 1.03270435, - "balance_loss_mlp": 1.02527571, - "epoch": 0.7920637306478281, - "flos": 23655974739840.0, - "grad_norm": 2.0997194022490038, - "language_loss": 0.76738131, - "learning_rate": 4.365625413419365e-07, - "loss": 0.78848016, - "num_input_tokens_seen": 284250365, - "step": 13174, - "time_per_iteration": 2.8328940868377686 - }, - { - "auxiliary_loss_clip": 0.01083806, - "auxiliary_loss_mlp": 0.01034483, - "balance_loss_clip": 1.03376579, - "balance_loss_mlp": 1.02280629, - "epoch": 0.792123853900496, - "flos": 27195438038400.0, - "grad_norm": 2.433097475426471, - "language_loss": 0.71779603, - "learning_rate": 4.363196905447297e-07, - "loss": 0.73897892, - "num_input_tokens_seen": 284269635, - "step": 13175, - "time_per_iteration": 2.7348971366882324 - }, - { - "auxiliary_loss_clip": 0.01098061, - "auxiliary_loss_mlp": 0.010319, - "balance_loss_clip": 1.03613544, - "balance_loss_mlp": 1.01925838, - "epoch": 0.792183977153164, - "flos": 19098659744640.0, - "grad_norm": 1.8855424428426124, - "language_loss": 0.60150284, - "learning_rate": 4.360768990424364e-07, - "loss": 0.62280244, - "num_input_tokens_seen": 284288380, - "step": 13176, - "time_per_iteration": 2.645940065383911 - }, - { - "auxiliary_loss_clip": 0.01112239, - "auxiliary_loss_mlp": 0.01033063, - "balance_loss_clip": 1.04115438, - "balance_loss_mlp": 1.02052176, - "epoch": 0.7922441004058319, - "flos": 17128851851520.0, - "grad_norm": 1.8607925161268413, - "language_loss": 0.73708278, - "learning_rate": 4.3583416684426376e-07, - "loss": 0.75853586, - "num_input_tokens_seen": 284306920, - "step": 13177, - "time_per_iteration": 2.624305009841919 - }, - { - "auxiliary_loss_clip": 0.01092978, - "auxiliary_loss_mlp": 0.01035467, - "balance_loss_clip": 1.03717804, - "balance_loss_mlp": 1.02310514, - "epoch": 0.7923042236585, - "flos": 17821640442240.0, - "grad_norm": 1.880784618902091, - "language_loss": 0.64198965, - "learning_rate": 4.355914939594174e-07, - "loss": 0.66327411, - "num_input_tokens_seen": 284324700, - "step": 13178, - "time_per_iteration": 2.6623740196228027 - }, - { - "auxiliary_loss_clip": 0.01086006, - "auxiliary_loss_mlp": 0.01028552, - "balance_loss_clip": 1.03637266, - "balance_loss_mlp": 1.01807904, - "epoch": 0.7923643469111679, - "flos": 29935206892800.0, - "grad_norm": 1.4540811343422748, - "language_loss": 0.68699908, - "learning_rate": 4.3534888039709726e-07, - "loss": 0.70814466, - "num_input_tokens_seen": 284345985, - "step": 13179, - "time_per_iteration": 2.832632541656494 - }, - { - "auxiliary_loss_clip": 0.01106835, - "auxiliary_loss_mlp": 0.01030886, - "balance_loss_clip": 1.036268, - "balance_loss_mlp": 1.01872063, - "epoch": 0.7924244701638359, - "flos": 22674716023680.0, - "grad_norm": 2.2092827624793117, - "language_loss": 0.74018443, - "learning_rate": 4.3510632616650444e-07, - "loss": 0.76156163, - "num_input_tokens_seen": 284364475, - "step": 13180, - "time_per_iteration": 2.6299288272857666 - }, - { - "auxiliary_loss_clip": 0.01099012, - "auxiliary_loss_mlp": 0.01036443, - "balance_loss_clip": 1.03927088, - "balance_loss_mlp": 1.02306199, - "epoch": 0.7924845934165038, - "flos": 17968156018560.0, - "grad_norm": 2.065397931254967, - "language_loss": 0.8179431, - "learning_rate": 4.3486383127683646e-07, - "loss": 0.83929765, - "num_input_tokens_seen": 284382125, - "step": 13181, - "time_per_iteration": 2.6588377952575684 - }, - { - "auxiliary_loss_clip": 0.01079854, - "auxiliary_loss_mlp": 0.01038549, - "balance_loss_clip": 1.03439593, - "balance_loss_mlp": 1.02538192, - "epoch": 0.7925447166691718, - "flos": 23476960333440.0, - "grad_norm": 1.7700147531802202, - "language_loss": 0.77401638, - "learning_rate": 4.346213957372895e-07, - "loss": 0.79520041, - "num_input_tokens_seen": 284401585, - "step": 13182, - "time_per_iteration": 2.702794313430786 - }, - { - "auxiliary_loss_clip": 0.01097492, - "auxiliary_loss_mlp": 0.01041087, - "balance_loss_clip": 1.0389626, - "balance_loss_mlp": 1.02766991, - "epoch": 0.7926048399218397, - "flos": 20447572118400.0, - "grad_norm": 1.8061510819801756, - "language_loss": 0.74171931, - "learning_rate": 4.34379019557056e-07, - "loss": 0.76310509, - "num_input_tokens_seen": 284419125, - "step": 13183, - "time_per_iteration": 2.615912675857544 - }, - { - "auxiliary_loss_clip": 0.01078552, - "auxiliary_loss_mlp": 0.01032415, - "balance_loss_clip": 1.03608036, - "balance_loss_mlp": 1.0189023, - "epoch": 0.7926649631745077, - "flos": 37160038535040.0, - "grad_norm": 1.5412113664578542, - "language_loss": 0.68428183, - "learning_rate": 4.341367027453264e-07, - "loss": 0.70539147, - "num_input_tokens_seen": 284440445, - "step": 13184, - "time_per_iteration": 2.7763001918792725 - }, - { - "auxiliary_loss_clip": 0.01073218, - "auxiliary_loss_mlp": 0.01034358, - "balance_loss_clip": 1.03828871, - "balance_loss_mlp": 1.02169812, - "epoch": 0.7927250864271758, - "flos": 17018606033280.0, - "grad_norm": 1.8246032292732381, - "language_loss": 0.70783365, - "learning_rate": 4.338944453112907e-07, - "loss": 0.72890937, - "num_input_tokens_seen": 284459370, - "step": 13185, - "time_per_iteration": 2.7633087635040283 - }, - { - "auxiliary_loss_clip": 0.01096127, - "auxiliary_loss_mlp": 0.01032722, - "balance_loss_clip": 1.03772926, - "balance_loss_mlp": 1.02017522, - "epoch": 0.7927852096798437, - "flos": 17749208666880.0, - "grad_norm": 2.140666716995379, - "language_loss": 0.65258479, - "learning_rate": 4.3365224726413375e-07, - "loss": 0.67387331, - "num_input_tokens_seen": 284477525, - "step": 13186, - "time_per_iteration": 2.762816905975342 - }, - { - "auxiliary_loss_clip": 0.01094364, - "auxiliary_loss_mlp": 0.01037697, - "balance_loss_clip": 1.03739357, - "balance_loss_mlp": 1.02557981, - "epoch": 0.7928453329325117, - "flos": 23838436851840.0, - "grad_norm": 1.4957281455318547, - "language_loss": 0.76961684, - "learning_rate": 4.334101086130408e-07, - "loss": 0.79093742, - "num_input_tokens_seen": 284496590, - "step": 13187, - "time_per_iteration": 2.7023680210113525 - }, - { - "auxiliary_loss_clip": 0.01088541, - "auxiliary_loss_mlp": 0.01031178, - "balance_loss_clip": 1.03613758, - "balance_loss_mlp": 1.0191083, - "epoch": 0.7929054561851796, - "flos": 17454920538240.0, - "grad_norm": 2.090727269336269, - "language_loss": 0.7242974, - "learning_rate": 4.3316802936719334e-07, - "loss": 0.7454946, - "num_input_tokens_seen": 284511470, - "step": 13188, - "time_per_iteration": 2.6116061210632324 - }, - { - "auxiliary_loss_clip": 0.01110097, - "auxiliary_loss_mlp": 0.00771207, - "balance_loss_clip": 1.03619778, - "balance_loss_mlp": 1.0002346, - "epoch": 0.7929655794378476, - "flos": 21981280988160.0, - "grad_norm": 3.5192145755873043, - "language_loss": 0.63126463, - "learning_rate": 4.329260095357725e-07, - "loss": 0.65007764, - "num_input_tokens_seen": 284531125, - "step": 13189, - "time_per_iteration": 2.5492398738861084 - }, - { - "auxiliary_loss_clip": 0.01063574, - "auxiliary_loss_mlp": 0.01031813, - "balance_loss_clip": 1.03684545, - "balance_loss_mlp": 1.02014804, - "epoch": 0.7930257026905155, - "flos": 17273930883840.0, - "grad_norm": 1.84728181644231, - "language_loss": 0.73074591, - "learning_rate": 4.3268404912795307e-07, - "loss": 0.75169981, - "num_input_tokens_seen": 284549340, - "step": 13190, - "time_per_iteration": 2.7327284812927246 - }, - { - "auxiliary_loss_clip": 0.01094105, - "auxiliary_loss_mlp": 0.01030162, - "balance_loss_clip": 1.03697276, - "balance_loss_mlp": 1.01938009, - "epoch": 0.7930858259431836, - "flos": 27300584125440.0, - "grad_norm": 1.7378717321453667, - "language_loss": 0.73166823, - "learning_rate": 4.3244214815291166e-07, - "loss": 0.75291085, - "num_input_tokens_seen": 284567060, - "step": 13191, - "time_per_iteration": 2.761871337890625 - }, - { - "auxiliary_loss_clip": 0.01097055, - "auxiliary_loss_mlp": 0.01039867, - "balance_loss_clip": 1.03603792, - "balance_loss_mlp": 1.02686751, - "epoch": 0.7931459491958515, - "flos": 19863736456320.0, - "grad_norm": 1.7612896167092924, - "language_loss": 0.69279987, - "learning_rate": 4.322003066198219e-07, - "loss": 0.71416903, - "num_input_tokens_seen": 284586600, - "step": 13192, - "time_per_iteration": 2.6835954189300537 - }, - { - "auxiliary_loss_clip": 0.01074955, - "auxiliary_loss_mlp": 0.01035455, - "balance_loss_clip": 1.03394866, - "balance_loss_mlp": 1.0229497, - "epoch": 0.7932060724485195, - "flos": 23147120718720.0, - "grad_norm": 1.8840827690458661, - "language_loss": 0.75363815, - "learning_rate": 4.3195852453785274e-07, - "loss": 0.77474225, - "num_input_tokens_seen": 284605715, - "step": 13193, - "time_per_iteration": 2.723729372024536 - }, - { - "auxiliary_loss_clip": 0.01097101, - "auxiliary_loss_mlp": 0.01033041, - "balance_loss_clip": 1.03796721, - "balance_loss_mlp": 1.01971292, - "epoch": 0.7932661957011874, - "flos": 29934847756800.0, - "grad_norm": 2.301032967508139, - "language_loss": 0.71940517, - "learning_rate": 4.317168019161741e-07, - "loss": 0.74070656, - "num_input_tokens_seen": 284628540, - "step": 13194, - "time_per_iteration": 2.758888006210327 - }, - { - "auxiliary_loss_clip": 0.01113373, - "auxiliary_loss_mlp": 0.01036092, - "balance_loss_clip": 1.03853393, - "balance_loss_mlp": 1.02333045, - "epoch": 0.7933263189538554, - "flos": 22559119079040.0, - "grad_norm": 1.9174397116927768, - "language_loss": 0.70116889, - "learning_rate": 4.314751387639517e-07, - "loss": 0.72266352, - "num_input_tokens_seen": 284646040, - "step": 13195, - "time_per_iteration": 2.558119058609009 - }, - { - "auxiliary_loss_clip": 0.01060029, - "auxiliary_loss_mlp": 0.0102797, - "balance_loss_clip": 1.03700423, - "balance_loss_mlp": 1.0154351, - "epoch": 0.7933864422065233, - "flos": 25479051575040.0, - "grad_norm": 3.5361878755115286, - "language_loss": 0.77569836, - "learning_rate": 4.3123353509034844e-07, - "loss": 0.79657841, - "num_input_tokens_seen": 284665110, - "step": 13196, - "time_per_iteration": 2.7758255004882812 - }, - { - "auxiliary_loss_clip": 0.01079414, - "auxiliary_loss_mlp": 0.01037257, - "balance_loss_clip": 1.03883171, - "balance_loss_mlp": 1.02485287, - "epoch": 0.7934465654591913, - "flos": 33583156243200.0, - "grad_norm": 1.7631963808402482, - "language_loss": 0.68811917, - "learning_rate": 4.309919909045268e-07, - "loss": 0.70928586, - "num_input_tokens_seen": 284686515, - "step": 13197, - "time_per_iteration": 2.788442850112915 - }, - { - "auxiliary_loss_clip": 0.01097503, - "auxiliary_loss_mlp": 0.01029061, - "balance_loss_clip": 1.03770566, - "balance_loss_mlp": 1.01680613, - "epoch": 0.7935066887118594, - "flos": 31432538263680.0, - "grad_norm": 2.573420648877448, - "language_loss": 0.65293157, - "learning_rate": 4.30750506215646e-07, - "loss": 0.6741972, - "num_input_tokens_seen": 284707300, - "step": 13198, - "time_per_iteration": 2.785005807876587 - }, - { - "auxiliary_loss_clip": 0.010622, - "auxiliary_loss_mlp": 0.01040394, - "balance_loss_clip": 1.03600621, - "balance_loss_mlp": 1.02515936, - "epoch": 0.7935668119645273, - "flos": 14682616940160.0, - "grad_norm": 2.6924527077689113, - "language_loss": 0.72298622, - "learning_rate": 4.30509081032864e-07, - "loss": 0.74401212, - "num_input_tokens_seen": 284723545, - "step": 13199, - "time_per_iteration": 2.828518867492676 - }, - { - "auxiliary_loss_clip": 0.01083399, - "auxiliary_loss_mlp": 0.01032791, - "balance_loss_clip": 1.03479409, - "balance_loss_mlp": 1.02038765, - "epoch": 0.7936269352171953, - "flos": 18004246208640.0, - "grad_norm": 1.7805702438055635, - "language_loss": 0.80542034, - "learning_rate": 4.302677153653349e-07, - "loss": 0.82658225, - "num_input_tokens_seen": 284742650, - "step": 13200, - "time_per_iteration": 2.719022035598755 - }, - { - "auxiliary_loss_clip": 0.01096575, - "auxiliary_loss_mlp": 0.01034603, - "balance_loss_clip": 1.0383296, - "balance_loss_mlp": 1.02258706, - "epoch": 0.7936870584698632, - "flos": 18880215183360.0, - "grad_norm": 1.7717483221141246, - "language_loss": 0.77400053, - "learning_rate": 4.3002640922221077e-07, - "loss": 0.79531235, - "num_input_tokens_seen": 284760955, - "step": 13201, - "time_per_iteration": 4.26847243309021 - }, - { - "auxiliary_loss_clip": 0.01108331, - "auxiliary_loss_mlp": 0.01032866, - "balance_loss_clip": 1.03744578, - "balance_loss_mlp": 1.02092719, - "epoch": 0.7937471817225312, - "flos": 23367001824000.0, - "grad_norm": 1.5551997587526456, - "language_loss": 0.67323661, - "learning_rate": 4.2978516261264296e-07, - "loss": 0.69464856, - "num_input_tokens_seen": 284780745, - "step": 13202, - "time_per_iteration": 4.283862352371216 - }, - { - "auxiliary_loss_clip": 0.01099327, - "auxiliary_loss_mlp": 0.01034975, - "balance_loss_clip": 1.03811014, - "balance_loss_mlp": 1.02211785, - "epoch": 0.7938073049751991, - "flos": 22674428714880.0, - "grad_norm": 2.1258656424203464, - "language_loss": 0.75316, - "learning_rate": 4.2954397554577884e-07, - "loss": 0.77450299, - "num_input_tokens_seen": 284799000, - "step": 13203, - "time_per_iteration": 4.218053817749023 - }, - { - "auxiliary_loss_clip": 0.01057545, - "auxiliary_loss_mlp": 0.01032901, - "balance_loss_clip": 1.03676009, - "balance_loss_mlp": 1.02075946, - "epoch": 0.7938674282278672, - "flos": 22851431959680.0, - "grad_norm": 1.8069073081221512, - "language_loss": 0.66618353, - "learning_rate": 4.293028480307643e-07, - "loss": 0.68708801, - "num_input_tokens_seen": 284817450, - "step": 13204, - "time_per_iteration": 2.819964647293091 - }, - { - "auxiliary_loss_clip": 0.01049205, - "auxiliary_loss_mlp": 0.01028932, - "balance_loss_clip": 1.03277397, - "balance_loss_mlp": 1.01646256, - "epoch": 0.7939275514805351, - "flos": 27012509049600.0, - "grad_norm": 1.5710457021949253, - "language_loss": 0.7940079, - "learning_rate": 4.290617800767438e-07, - "loss": 0.8147893, - "num_input_tokens_seen": 284838865, - "step": 13205, - "time_per_iteration": 2.832738161087036 - }, - { - "auxiliary_loss_clip": 0.0107234, - "auxiliary_loss_mlp": 0.01030961, - "balance_loss_clip": 1.0324893, - "balance_loss_mlp": 1.01827097, - "epoch": 0.7939876747332031, - "flos": 21142838747520.0, - "grad_norm": 7.819538292121243, - "language_loss": 0.7771039, - "learning_rate": 4.28820771692858e-07, - "loss": 0.79813695, - "num_input_tokens_seen": 284857975, - "step": 13206, - "time_per_iteration": 2.7768259048461914 - }, - { - "auxiliary_loss_clip": 0.01086044, - "auxiliary_loss_mlp": 0.01035236, - "balance_loss_clip": 1.03653049, - "balance_loss_mlp": 1.02031064, - "epoch": 0.794047797985871, - "flos": 23289075267840.0, - "grad_norm": 2.0761554247876526, - "language_loss": 0.78858304, - "learning_rate": 4.285798228882456e-07, - "loss": 0.8097958, - "num_input_tokens_seen": 284877145, - "step": 13207, - "time_per_iteration": 2.78918719291687 - }, - { - "auxiliary_loss_clip": 0.01071641, - "auxiliary_loss_mlp": 0.01034494, - "balance_loss_clip": 1.03531897, - "balance_loss_mlp": 1.02225077, - "epoch": 0.794107921238539, - "flos": 24608074590720.0, - "grad_norm": 1.921000285111042, - "language_loss": 0.83848017, - "learning_rate": 4.2833893367204375e-07, - "loss": 0.85954154, - "num_input_tokens_seen": 284895560, - "step": 13208, - "time_per_iteration": 2.799513578414917 - }, - { - "auxiliary_loss_clip": 0.00994574, - "auxiliary_loss_mlp": 0.0101022, - "balance_loss_clip": 1.00948644, - "balance_loss_mlp": 1.0090878, - "epoch": 0.7941680444912069, - "flos": 64093690252800.0, - "grad_norm": 0.7333327804859686, - "language_loss": 0.58320063, - "learning_rate": 4.280981040533875e-07, - "loss": 0.60324866, - "num_input_tokens_seen": 284963135, - "step": 13209, - "time_per_iteration": 4.956205368041992 - }, - { - "auxiliary_loss_clip": 0.01076765, - "auxiliary_loss_mlp": 0.01034685, - "balance_loss_clip": 1.03624475, - "balance_loss_mlp": 1.02142262, - "epoch": 0.794228167743875, - "flos": 24388839930240.0, - "grad_norm": 2.256316924700655, - "language_loss": 0.62863505, - "learning_rate": 4.2785733404140825e-07, - "loss": 0.64974952, - "num_input_tokens_seen": 284981755, - "step": 13210, - "time_per_iteration": 2.7703917026519775 - }, - { - "auxiliary_loss_clip": 0.010938, - "auxiliary_loss_mlp": 0.01036077, - "balance_loss_clip": 1.03719687, - "balance_loss_mlp": 1.024073, - "epoch": 0.794288290996543, - "flos": 28512498026880.0, - "grad_norm": 1.9531340028994628, - "language_loss": 0.6936754, - "learning_rate": 4.2761662364523676e-07, - "loss": 0.71497422, - "num_input_tokens_seen": 285003060, - "step": 13211, - "time_per_iteration": 2.74078106880188 - }, - { - "auxiliary_loss_clip": 0.01102649, - "auxiliary_loss_mlp": 0.01039332, - "balance_loss_clip": 1.03825963, - "balance_loss_mlp": 1.02562237, - "epoch": 0.7943484142492109, - "flos": 25922117836800.0, - "grad_norm": 1.640321960898119, - "language_loss": 0.72502631, - "learning_rate": 4.2737597287400074e-07, - "loss": 0.74644607, - "num_input_tokens_seen": 285021640, - "step": 13212, - "time_per_iteration": 2.745793104171753 - }, - { - "auxiliary_loss_clip": 0.01095421, - "auxiliary_loss_mlp": 0.01029916, - "balance_loss_clip": 1.03563583, - "balance_loss_mlp": 1.01776266, - "epoch": 0.7944085375018789, - "flos": 23915286000000.0, - "grad_norm": 1.7707252579484445, - "language_loss": 0.80655056, - "learning_rate": 4.271353817368246e-07, - "loss": 0.82780391, - "num_input_tokens_seen": 285040490, - "step": 13213, - "time_per_iteration": 2.7571616172790527 - }, - { - "auxiliary_loss_clip": 0.01102684, - "auxiliary_loss_mlp": 0.01030906, - "balance_loss_clip": 1.03846729, - "balance_loss_mlp": 1.01816225, - "epoch": 0.7944686607545468, - "flos": 20229953569920.0, - "grad_norm": 2.0723417946435196, - "language_loss": 0.67524314, - "learning_rate": 4.268948502428327e-07, - "loss": 0.69657904, - "num_input_tokens_seen": 285059270, - "step": 13214, - "time_per_iteration": 2.7216098308563232 - }, - { - "auxiliary_loss_clip": 0.01107626, - "auxiliary_loss_mlp": 0.01031004, - "balance_loss_clip": 1.03777719, - "balance_loss_mlp": 1.01888001, - "epoch": 0.7945287840072148, - "flos": 21980993679360.0, - "grad_norm": 2.140296316096213, - "language_loss": 0.72678429, - "learning_rate": 4.2665437840114535e-07, - "loss": 0.74817061, - "num_input_tokens_seen": 285075390, - "step": 13215, - "time_per_iteration": 2.687727212905884 - }, - { - "auxiliary_loss_clip": 0.01058497, - "auxiliary_loss_mlp": 0.01037215, - "balance_loss_clip": 1.03636539, - "balance_loss_mlp": 1.02328491, - "epoch": 0.7945889072598827, - "flos": 26397718842240.0, - "grad_norm": 1.5145901921228262, - "language_loss": 0.79136622, - "learning_rate": 4.2641396622088253e-07, - "loss": 0.81232333, - "num_input_tokens_seen": 285096290, - "step": 13216, - "time_per_iteration": 2.7064990997314453 - }, - { - "auxiliary_loss_clip": 0.01096019, - "auxiliary_loss_mlp": 0.01034212, - "balance_loss_clip": 1.03587198, - "balance_loss_mlp": 1.02159381, - "epoch": 0.7946490305125508, - "flos": 25810255906560.0, - "grad_norm": 1.5522674129771217, - "language_loss": 0.73874998, - "learning_rate": 4.261736137111598e-07, - "loss": 0.7600522, - "num_input_tokens_seen": 285116020, - "step": 13217, - "time_per_iteration": 2.6791646480560303 - }, - { - "auxiliary_loss_clip": 0.01082895, - "auxiliary_loss_mlp": 0.01034325, - "balance_loss_clip": 1.03578281, - "balance_loss_mlp": 1.02138495, - "epoch": 0.7947091537652187, - "flos": 15960965045760.0, - "grad_norm": 1.8630939701915927, - "language_loss": 0.73956853, - "learning_rate": 4.259333208810907e-07, - "loss": 0.76074076, - "num_input_tokens_seen": 285133510, - "step": 13218, - "time_per_iteration": 2.681337594985962 - }, - { - "auxiliary_loss_clip": 0.01099657, - "auxiliary_loss_mlp": 0.01037837, - "balance_loss_clip": 1.0363996, - "balance_loss_mlp": 1.02428901, - "epoch": 0.7947692770178867, - "flos": 18587866389120.0, - "grad_norm": 1.8649212651108453, - "language_loss": 0.83193207, - "learning_rate": 4.2569308773978817e-07, - "loss": 0.85330701, - "num_input_tokens_seen": 285151690, - "step": 13219, - "time_per_iteration": 2.6580770015716553 - }, - { - "auxiliary_loss_clip": 0.01100239, - "auxiliary_loss_mlp": 0.01043205, - "balance_loss_clip": 1.03854525, - "balance_loss_mlp": 1.02832818, - "epoch": 0.7948294002705546, - "flos": 20442220992000.0, - "grad_norm": 2.3946957736467915, - "language_loss": 0.75677502, - "learning_rate": 4.2545291429636123e-07, - "loss": 0.77820945, - "num_input_tokens_seen": 285170485, - "step": 13220, - "time_per_iteration": 2.644994020462036 - }, - { - "auxiliary_loss_clip": 0.01084385, - "auxiliary_loss_mlp": 0.01035732, - "balance_loss_clip": 1.0356847, - "balance_loss_mlp": 1.0225656, - "epoch": 0.7948895235232226, - "flos": 38181194282880.0, - "grad_norm": 1.8822123036698593, - "language_loss": 0.72409242, - "learning_rate": 4.252128005599176e-07, - "loss": 0.74529362, - "num_input_tokens_seen": 285191050, - "step": 13221, - "time_per_iteration": 2.765852689743042 - }, - { - "auxiliary_loss_clip": 0.01099762, - "auxiliary_loss_mlp": 0.01030106, - "balance_loss_clip": 1.03885102, - "balance_loss_mlp": 1.01822662, - "epoch": 0.7949496467758905, - "flos": 15559806977280.0, - "grad_norm": 2.0084967919839527, - "language_loss": 0.74979097, - "learning_rate": 4.249727465395634e-07, - "loss": 0.77108967, - "num_input_tokens_seen": 285208750, - "step": 13222, - "time_per_iteration": 2.6160507202148438 - }, - { - "auxiliary_loss_clip": 0.01012175, - "auxiliary_loss_mlp": 0.01002836, - "balance_loss_clip": 1.00953972, - "balance_loss_mlp": 1.00179863, - "epoch": 0.7950097700285585, - "flos": 70897036728960.0, - "grad_norm": 0.7723294250131235, - "language_loss": 0.6706062, - "learning_rate": 4.247327522443993e-07, - "loss": 0.69075632, - "num_input_tokens_seen": 285264605, - "step": 13223, - "time_per_iteration": 3.087876319885254 - }, - { - "auxiliary_loss_clip": 0.010973, - "auxiliary_loss_mlp": 0.01033159, - "balance_loss_clip": 1.0365479, - "balance_loss_mlp": 1.01981974, - "epoch": 0.7950698932812266, - "flos": 23951627585280.0, - "grad_norm": 2.366420887555622, - "language_loss": 0.71044689, - "learning_rate": 4.2449281768352717e-07, - "loss": 0.73175144, - "num_input_tokens_seen": 285283940, - "step": 13224, - "time_per_iteration": 2.640591621398926 - }, - { - "auxiliary_loss_clip": 0.0103006, - "auxiliary_loss_mlp": 0.01002258, - "balance_loss_clip": 1.00757217, - "balance_loss_mlp": 1.00124514, - "epoch": 0.7951300165338945, - "flos": 60282561415680.0, - "grad_norm": 0.6682926442494496, - "language_loss": 0.54986, - "learning_rate": 4.2425294286604527e-07, - "loss": 0.57018316, - "num_input_tokens_seen": 285349525, - "step": 13225, - "time_per_iteration": 3.1831283569335938 - }, - { - "auxiliary_loss_clip": 0.01083968, - "auxiliary_loss_mlp": 0.01025518, - "balance_loss_clip": 1.03336477, - "balance_loss_mlp": 1.01375794, - "epoch": 0.7951901397865625, - "flos": 22819004956800.0, - "grad_norm": 2.1821061924274106, - "language_loss": 0.64788163, - "learning_rate": 4.2401312780105034e-07, - "loss": 0.66897643, - "num_input_tokens_seen": 285367355, - "step": 13226, - "time_per_iteration": 2.7201919555664062 - }, - { - "auxiliary_loss_clip": 0.01065742, - "auxiliary_loss_mlp": 0.01037995, - "balance_loss_clip": 1.03712797, - "balance_loss_mlp": 1.02581131, - "epoch": 0.7952502630392304, - "flos": 35695672871040.0, - "grad_norm": 2.811230996366362, - "language_loss": 0.69988328, - "learning_rate": 4.237733724976349e-07, - "loss": 0.72092068, - "num_input_tokens_seen": 285386190, - "step": 13227, - "time_per_iteration": 2.88179874420166 - }, - { - "auxiliary_loss_clip": 0.01065232, - "auxiliary_loss_mlp": 0.01029863, - "balance_loss_clip": 1.03389943, - "balance_loss_mlp": 1.01914048, - "epoch": 0.7953103862918984, - "flos": 25629840869760.0, - "grad_norm": 1.5862839127208228, - "language_loss": 0.69230592, - "learning_rate": 4.2353367696489184e-07, - "loss": 0.71325696, - "num_input_tokens_seen": 285406150, - "step": 13228, - "time_per_iteration": 2.9039552211761475 - }, - { - "auxiliary_loss_clip": 0.01062042, - "auxiliary_loss_mlp": 0.01046101, - "balance_loss_clip": 1.03289461, - "balance_loss_mlp": 1.03178382, - "epoch": 0.7953705095445663, - "flos": 40551980676480.0, - "grad_norm": 1.5155440892228063, - "language_loss": 0.70645332, - "learning_rate": 4.232940412119095e-07, - "loss": 0.72753471, - "num_input_tokens_seen": 285429900, - "step": 13229, - "time_per_iteration": 2.9804372787475586 - }, - { - "auxiliary_loss_clip": 0.01103757, - "auxiliary_loss_mlp": 0.01033426, - "balance_loss_clip": 1.03948689, - "balance_loss_mlp": 1.02086771, - "epoch": 0.7954306327972344, - "flos": 27636672706560.0, - "grad_norm": 2.202823116168555, - "language_loss": 0.71696305, - "learning_rate": 4.2305446524777457e-07, - "loss": 0.73833489, - "num_input_tokens_seen": 285452555, - "step": 13230, - "time_per_iteration": 2.8171424865722656 - }, - { - "auxiliary_loss_clip": 0.0101259, - "auxiliary_loss_mlp": 0.01002419, - "balance_loss_clip": 1.00992072, - "balance_loss_mlp": 1.00133443, - "epoch": 0.7954907560499023, - "flos": 59504055995520.0, - "grad_norm": 0.8970251417868289, - "language_loss": 0.63560265, - "learning_rate": 4.2281494908157247e-07, - "loss": 0.65575272, - "num_input_tokens_seen": 285515700, - "step": 13231, - "time_per_iteration": 3.281342029571533 - }, - { - "auxiliary_loss_clip": 0.01086059, - "auxiliary_loss_mlp": 0.01028708, - "balance_loss_clip": 1.03541231, - "balance_loss_mlp": 1.01657856, - "epoch": 0.7955508793025703, - "flos": 20120533764480.0, - "grad_norm": 4.745461781703955, - "language_loss": 0.69967991, - "learning_rate": 4.2257549272238566e-07, - "loss": 0.72082758, - "num_input_tokens_seen": 285533910, - "step": 13232, - "time_per_iteration": 2.6862258911132812 - }, - { - "auxiliary_loss_clip": 0.01098188, - "auxiliary_loss_mlp": 0.0103012, - "balance_loss_clip": 1.03567708, - "balance_loss_mlp": 1.01753175, - "epoch": 0.7956110025552382, - "flos": 26505378881280.0, - "grad_norm": 3.7128388079610075, - "language_loss": 0.77988273, - "learning_rate": 4.223360961792952e-07, - "loss": 0.80116582, - "num_input_tokens_seen": 285554080, - "step": 13233, - "time_per_iteration": 2.755737066268921 - }, - { - "auxiliary_loss_clip": 0.01099521, - "auxiliary_loss_mlp": 0.01032696, - "balance_loss_clip": 1.03679132, - "balance_loss_mlp": 1.02042377, - "epoch": 0.7956711258079062, - "flos": 22565475786240.0, - "grad_norm": 1.9443930320320317, - "language_loss": 0.79183459, - "learning_rate": 4.220967594613769e-07, - "loss": 0.81315672, - "num_input_tokens_seen": 285572325, - "step": 13234, - "time_per_iteration": 2.7519893646240234 - }, - { - "auxiliary_loss_clip": 0.01089518, - "auxiliary_loss_mlp": 0.00769637, - "balance_loss_clip": 1.03883278, - "balance_loss_mlp": 1.00016356, - "epoch": 0.7957312490605741, - "flos": 17379005143680.0, - "grad_norm": 1.963343394843674, - "language_loss": 0.69879019, - "learning_rate": 4.218574825777077e-07, - "loss": 0.71738172, - "num_input_tokens_seen": 285589770, - "step": 13235, - "time_per_iteration": 2.6992905139923096 - }, - { - "auxiliary_loss_clip": 0.01072089, - "auxiliary_loss_mlp": 0.01031418, - "balance_loss_clip": 1.03438449, - "balance_loss_mlp": 1.0185138, - "epoch": 0.7957913723132422, - "flos": 22491427898880.0, - "grad_norm": 1.4985866886242822, - "language_loss": 0.6796065, - "learning_rate": 4.2161826553736145e-07, - "loss": 0.70064157, - "num_input_tokens_seen": 285610065, - "step": 13236, - "time_per_iteration": 2.7930455207824707 - }, - { - "auxiliary_loss_clip": 0.01062113, - "auxiliary_loss_mlp": 0.01030284, - "balance_loss_clip": 1.03622985, - "balance_loss_mlp": 1.01748657, - "epoch": 0.7958514955659101, - "flos": 22638087129600.0, - "grad_norm": 1.6336623511601824, - "language_loss": 0.75105399, - "learning_rate": 4.2137910834940826e-07, - "loss": 0.7719779, - "num_input_tokens_seen": 285628480, - "step": 13237, - "time_per_iteration": 2.8149497509002686 - }, - { - "auxiliary_loss_clip": 0.01100352, - "auxiliary_loss_mlp": 0.0103538, - "balance_loss_clip": 1.03876448, - "balance_loss_mlp": 1.02211833, - "epoch": 0.7959116188185781, - "flos": 20704225772160.0, - "grad_norm": 2.4969872572253067, - "language_loss": 0.71244603, - "learning_rate": 4.211400110229175e-07, - "loss": 0.73380333, - "num_input_tokens_seen": 285647805, - "step": 13238, - "time_per_iteration": 2.650225877761841 - }, - { - "auxiliary_loss_clip": 0.01093003, - "auxiliary_loss_mlp": 0.01028042, - "balance_loss_clip": 1.03492129, - "balance_loss_mlp": 1.01565003, - "epoch": 0.7959717420712461, - "flos": 19024683684480.0, - "grad_norm": 2.0234207796888666, - "language_loss": 0.74033141, - "learning_rate": 4.2090097356695684e-07, - "loss": 0.7615419, - "num_input_tokens_seen": 285665505, - "step": 13239, - "time_per_iteration": 2.68799090385437 - }, - { - "auxiliary_loss_clip": 0.01113057, - "auxiliary_loss_mlp": 0.01034221, - "balance_loss_clip": 1.03780365, - "balance_loss_mlp": 1.02156138, - "epoch": 0.796031865323914, - "flos": 26356636661760.0, - "grad_norm": 2.1969833935070953, - "language_loss": 0.69364315, - "learning_rate": 4.2066199599058814e-07, - "loss": 0.7151159, - "num_input_tokens_seen": 285685855, - "step": 13240, - "time_per_iteration": 4.224658250808716 - }, - { - "auxiliary_loss_clip": 0.01024595, - "auxiliary_loss_mlp": 0.00998488, - "balance_loss_clip": 1.01116359, - "balance_loss_mlp": 0.99737942, - "epoch": 0.796091988576582, - "flos": 62069440320000.0, - "grad_norm": 0.887431308267293, - "language_loss": 0.58674192, - "learning_rate": 4.2042307830287526e-07, - "loss": 0.60697281, - "num_input_tokens_seen": 285735710, - "step": 13241, - "time_per_iteration": 4.535626649856567 - }, - { - "auxiliary_loss_clip": 0.01078843, - "auxiliary_loss_mlp": 0.01030817, - "balance_loss_clip": 1.03829169, - "balance_loss_mlp": 1.01925421, - "epoch": 0.7961521118292499, - "flos": 39020103400320.0, - "grad_norm": 1.593674462626725, - "language_loss": 0.64147931, - "learning_rate": 4.201842205128772e-07, - "loss": 0.66257584, - "num_input_tokens_seen": 285757045, - "step": 13242, - "time_per_iteration": 4.472386598587036 - }, - { - "auxiliary_loss_clip": 0.01110267, - "auxiliary_loss_mlp": 0.01034884, - "balance_loss_clip": 1.03763533, - "balance_loss_mlp": 1.02198589, - "epoch": 0.796212235081918, - "flos": 21762836426880.0, - "grad_norm": 1.8778627225113254, - "language_loss": 0.75913978, - "learning_rate": 4.199454226296526e-07, - "loss": 0.78059125, - "num_input_tokens_seen": 285776050, - "step": 13243, - "time_per_iteration": 2.590519666671753 - }, - { - "auxiliary_loss_clip": 0.01085583, - "auxiliary_loss_mlp": 0.01032008, - "balance_loss_clip": 1.04038298, - "balance_loss_mlp": 1.01900232, - "epoch": 0.7962723583345859, - "flos": 21178857110400.0, - "grad_norm": 1.6501275630789378, - "language_loss": 0.79442871, - "learning_rate": 4.1970668466225565e-07, - "loss": 0.81560457, - "num_input_tokens_seen": 285796830, - "step": 13244, - "time_per_iteration": 2.752902030944824 - }, - { - "auxiliary_loss_clip": 0.01102665, - "auxiliary_loss_mlp": 0.01029623, - "balance_loss_clip": 1.03597069, - "balance_loss_mlp": 1.01640284, - "epoch": 0.7963324815872539, - "flos": 17128636369920.0, - "grad_norm": 2.172628764552698, - "language_loss": 0.68508917, - "learning_rate": 4.1946800661973934e-07, - "loss": 0.70641208, - "num_input_tokens_seen": 285814755, - "step": 13245, - "time_per_iteration": 2.5828065872192383 - }, - { - "auxiliary_loss_clip": 0.01090189, - "auxiliary_loss_mlp": 0.01034231, - "balance_loss_clip": 1.03773546, - "balance_loss_mlp": 1.02139258, - "epoch": 0.7963926048399218, - "flos": 21397481239680.0, - "grad_norm": 1.806241454752508, - "language_loss": 0.79336578, - "learning_rate": 4.192293885111549e-07, - "loss": 0.81461, - "num_input_tokens_seen": 285834255, - "step": 13246, - "time_per_iteration": 2.6900124549865723 - }, - { - "auxiliary_loss_clip": 0.01090986, - "auxiliary_loss_mlp": 0.01031115, - "balance_loss_clip": 1.03666353, - "balance_loss_mlp": 1.01834822, - "epoch": 0.7964527280925898, - "flos": 25184188828800.0, - "grad_norm": 2.003867832970485, - "language_loss": 0.66143036, - "learning_rate": 4.1899083034555007e-07, - "loss": 0.6826514, - "num_input_tokens_seen": 285853540, - "step": 13247, - "time_per_iteration": 2.6503524780273438 - }, - { - "auxiliary_loss_clip": 0.01085363, - "auxiliary_loss_mlp": 0.01029081, - "balance_loss_clip": 1.0366044, - "balance_loss_mlp": 1.01764846, - "epoch": 0.7965128513452577, - "flos": 27015884928000.0, - "grad_norm": 2.040458459238989, - "language_loss": 0.71853489, - "learning_rate": 4.1875233213197123e-07, - "loss": 0.73967934, - "num_input_tokens_seen": 285872705, - "step": 13248, - "time_per_iteration": 4.260182857513428 - }, - { - "auxiliary_loss_clip": 0.0109327, - "auxiliary_loss_mlp": 0.01028611, - "balance_loss_clip": 1.03817999, - "balance_loss_mlp": 1.01565921, - "epoch": 0.7965729745979258, - "flos": 24419578993920.0, - "grad_norm": 2.2291806355034507, - "language_loss": 0.76553303, - "learning_rate": 4.1851389387946255e-07, - "loss": 0.78675187, - "num_input_tokens_seen": 285890290, - "step": 13249, - "time_per_iteration": 2.6802589893341064 - }, - { - "auxiliary_loss_clip": 0.01082795, - "auxiliary_loss_mlp": 0.01029897, - "balance_loss_clip": 1.03743911, - "balance_loss_mlp": 1.01721263, - "epoch": 0.7966330978505937, - "flos": 18840389978880.0, - "grad_norm": 2.0056248298720623, - "language_loss": 0.61770105, - "learning_rate": 4.1827551559706674e-07, - "loss": 0.63882804, - "num_input_tokens_seen": 285909190, - "step": 13250, - "time_per_iteration": 2.7855334281921387 - }, - { - "auxiliary_loss_clip": 0.01088491, - "auxiliary_loss_mlp": 0.01027346, - "balance_loss_clip": 1.03615296, - "balance_loss_mlp": 1.01445389, - "epoch": 0.7966932211032617, - "flos": 13152319862400.0, - "grad_norm": 2.1520508995588523, - "language_loss": 0.72124857, - "learning_rate": 4.180371972938206e-07, - "loss": 0.74240696, - "num_input_tokens_seen": 285927570, - "step": 13251, - "time_per_iteration": 2.7121150493621826 - }, - { - "auxiliary_loss_clip": 0.01116, - "auxiliary_loss_mlp": 0.01031729, - "balance_loss_clip": 1.0401994, - "balance_loss_mlp": 1.01820469, - "epoch": 0.7967533443559297, - "flos": 23949760078080.0, - "grad_norm": 2.6256177602060395, - "language_loss": 0.72742116, - "learning_rate": 4.177989389787624e-07, - "loss": 0.74889851, - "num_input_tokens_seen": 285945810, - "step": 13252, - "time_per_iteration": 2.582284927368164 - }, - { - "auxiliary_loss_clip": 0.01109038, - "auxiliary_loss_mlp": 0.01028059, - "balance_loss_clip": 1.03879833, - "balance_loss_mlp": 1.01554191, - "epoch": 0.7968134676085976, - "flos": 30368791964160.0, - "grad_norm": 1.5712453668855284, - "language_loss": 0.66325545, - "learning_rate": 4.175607406609278e-07, - "loss": 0.68462646, - "num_input_tokens_seen": 285964235, - "step": 13253, - "time_per_iteration": 2.6929616928100586 - }, - { - "auxiliary_loss_clip": 0.0108594, - "auxiliary_loss_mlp": 0.01036955, - "balance_loss_clip": 1.0418272, - "balance_loss_mlp": 1.0236156, - "epoch": 0.7968735908612656, - "flos": 23075048079360.0, - "grad_norm": 1.5829772200812473, - "language_loss": 0.67843878, - "learning_rate": 4.1732260234934767e-07, - "loss": 0.69966775, - "num_input_tokens_seen": 285983710, - "step": 13254, - "time_per_iteration": 2.7649550437927246 - }, - { - "auxiliary_loss_clip": 0.01098933, - "auxiliary_loss_mlp": 0.01034065, - "balance_loss_clip": 1.03641415, - "balance_loss_mlp": 1.02192962, - "epoch": 0.7969337141139335, - "flos": 23582250074880.0, - "grad_norm": 1.8731034083925706, - "language_loss": 0.70037842, - "learning_rate": 4.1708452405305314e-07, - "loss": 0.72170842, - "num_input_tokens_seen": 286003425, - "step": 13255, - "time_per_iteration": 2.6560351848602295 - }, - { - "auxiliary_loss_clip": 0.01108119, - "auxiliary_loss_mlp": 0.01031031, - "balance_loss_clip": 1.03694665, - "balance_loss_mlp": 1.01906836, - "epoch": 0.7969938373666016, - "flos": 19755860935680.0, - "grad_norm": 2.1084612697613268, - "language_loss": 0.79501426, - "learning_rate": 4.168465057810733e-07, - "loss": 0.81640577, - "num_input_tokens_seen": 286020130, - "step": 13256, - "time_per_iteration": 2.6129326820373535 - }, - { - "auxiliary_loss_clip": 0.01098682, - "auxiliary_loss_mlp": 0.01025793, - "balance_loss_clip": 1.03868675, - "balance_loss_mlp": 1.01325274, - "epoch": 0.7970539606192695, - "flos": 24134089697280.0, - "grad_norm": 1.6974660367757792, - "language_loss": 0.66300124, - "learning_rate": 4.166085475424315e-07, - "loss": 0.68424594, - "num_input_tokens_seen": 286040230, - "step": 13257, - "time_per_iteration": 2.6830172538757324 - }, - { - "auxiliary_loss_clip": 0.0109134, - "auxiliary_loss_mlp": 0.01034065, - "balance_loss_clip": 1.0377934, - "balance_loss_mlp": 1.02150643, - "epoch": 0.7971140838719375, - "flos": 17968622895360.0, - "grad_norm": 1.9146410977072226, - "language_loss": 0.72192776, - "learning_rate": 4.163706493461523e-07, - "loss": 0.74318182, - "num_input_tokens_seen": 286059475, - "step": 13258, - "time_per_iteration": 2.661726236343384 - }, - { - "auxiliary_loss_clip": 0.01100938, - "auxiliary_loss_mlp": 0.01034189, - "balance_loss_clip": 1.03692877, - "balance_loss_mlp": 1.0205518, - "epoch": 0.7971742071246054, - "flos": 19169547235200.0, - "grad_norm": 1.8087181306355609, - "language_loss": 0.68977499, - "learning_rate": 4.1613281120125655e-07, - "loss": 0.71112633, - "num_input_tokens_seen": 286077820, - "step": 13259, - "time_per_iteration": 2.611186981201172 - }, - { - "auxiliary_loss_clip": 0.01096475, - "auxiliary_loss_mlp": 0.01030487, - "balance_loss_clip": 1.03723931, - "balance_loss_mlp": 1.01854253, - "epoch": 0.7972343303772734, - "flos": 27125951178240.0, - "grad_norm": 2.1313633169820547, - "language_loss": 0.73609447, - "learning_rate": 4.158950331167641e-07, - "loss": 0.75736415, - "num_input_tokens_seen": 286097285, - "step": 13260, - "time_per_iteration": 2.699951648712158 - }, - { - "auxiliary_loss_clip": 0.010819, - "auxiliary_loss_mlp": 0.01032439, - "balance_loss_clip": 1.03271997, - "balance_loss_mlp": 1.02031517, - "epoch": 0.7972944536299413, - "flos": 20996646393600.0, - "grad_norm": 1.836032369081443, - "language_loss": 0.78399926, - "learning_rate": 4.1565731510169065e-07, - "loss": 0.80514264, - "num_input_tokens_seen": 286116000, - "step": 13261, - "time_per_iteration": 2.6140952110290527 - }, - { - "auxiliary_loss_clip": 0.01095642, - "auxiliary_loss_mlp": 0.01030129, - "balance_loss_clip": 1.03774393, - "balance_loss_mlp": 1.0191493, - "epoch": 0.7973545768826094, - "flos": 21580015178880.0, - "grad_norm": 1.439588217770827, - "language_loss": 0.76199102, - "learning_rate": 4.154196571650501e-07, - "loss": 0.78324872, - "num_input_tokens_seen": 286135110, - "step": 13262, - "time_per_iteration": 2.7024636268615723 - }, - { - "auxiliary_loss_clip": 0.01082139, - "auxiliary_loss_mlp": 0.01033411, - "balance_loss_clip": 1.03903556, - "balance_loss_mlp": 1.0191412, - "epoch": 0.7974147001352773, - "flos": 20558536208640.0, - "grad_norm": 2.631651732945755, - "language_loss": 0.70419514, - "learning_rate": 4.1518205931585524e-07, - "loss": 0.72535068, - "num_input_tokens_seen": 286152835, - "step": 13263, - "time_per_iteration": 2.72177791595459 - }, - { - "auxiliary_loss_clip": 0.01103923, - "auxiliary_loss_mlp": 0.01038655, - "balance_loss_clip": 1.03778529, - "balance_loss_mlp": 1.02499938, - "epoch": 0.7974748233879453, - "flos": 20996790048000.0, - "grad_norm": 2.0043756449172547, - "language_loss": 0.70802379, - "learning_rate": 4.149445215631153e-07, - "loss": 0.72944963, - "num_input_tokens_seen": 286171785, - "step": 13264, - "time_per_iteration": 2.706388473510742 - }, - { - "auxiliary_loss_clip": 0.01107469, - "auxiliary_loss_mlp": 0.01033786, - "balance_loss_clip": 1.03775704, - "balance_loss_mlp": 1.02225232, - "epoch": 0.7975349466406133, - "flos": 22565188477440.0, - "grad_norm": 1.891498852375028, - "language_loss": 0.76922268, - "learning_rate": 4.1470704391583776e-07, - "loss": 0.79063523, - "num_input_tokens_seen": 286190420, - "step": 13265, - "time_per_iteration": 2.6580817699432373 - }, - { - "auxiliary_loss_clip": 0.01080723, - "auxiliary_loss_mlp": 0.01027162, - "balance_loss_clip": 1.0393877, - "balance_loss_mlp": 1.0149374, - "epoch": 0.7975950698932812, - "flos": 21689542725120.0, - "grad_norm": 2.280360071674855, - "language_loss": 0.75571597, - "learning_rate": 4.144696263830285e-07, - "loss": 0.77679479, - "num_input_tokens_seen": 286210105, - "step": 13266, - "time_per_iteration": 2.707306146621704 - }, - { - "auxiliary_loss_clip": 0.01083885, - "auxiliary_loss_mlp": 0.01026943, - "balance_loss_clip": 1.03626752, - "balance_loss_mlp": 1.01505208, - "epoch": 0.7976551931459492, - "flos": 19604568850560.0, - "grad_norm": 7.354197908727964, - "language_loss": 0.84225118, - "learning_rate": 4.1423226897369015e-07, - "loss": 0.86335951, - "num_input_tokens_seen": 286228180, - "step": 13267, - "time_per_iteration": 2.6513888835906982 - }, - { - "auxiliary_loss_clip": 0.01095515, - "auxiliary_loss_mlp": 0.01031649, - "balance_loss_clip": 1.03541887, - "balance_loss_mlp": 1.01869643, - "epoch": 0.7977153163986171, - "flos": 21687603390720.0, - "grad_norm": 1.5920140883630767, - "language_loss": 0.76201731, - "learning_rate": 4.139949716968223e-07, - "loss": 0.7832889, - "num_input_tokens_seen": 286247305, - "step": 13268, - "time_per_iteration": 2.7020766735076904 - }, - { - "auxiliary_loss_clip": 0.01109132, - "auxiliary_loss_mlp": 0.01030449, - "balance_loss_clip": 1.03789496, - "balance_loss_mlp": 1.0182898, - "epoch": 0.7977754396512852, - "flos": 23476780765440.0, - "grad_norm": 1.5724932567080838, - "language_loss": 0.77637428, - "learning_rate": 4.1375773456142403e-07, - "loss": 0.79777002, - "num_input_tokens_seen": 286268145, - "step": 13269, - "time_per_iteration": 2.6634888648986816 - }, - { - "auxiliary_loss_clip": 0.01090369, - "auxiliary_loss_mlp": 0.01042032, - "balance_loss_clip": 1.03390729, - "balance_loss_mlp": 1.02950919, - "epoch": 0.7978355629039531, - "flos": 22382223575040.0, - "grad_norm": 1.6845375324844267, - "language_loss": 0.82535768, - "learning_rate": 4.135205575764922e-07, - "loss": 0.84668171, - "num_input_tokens_seen": 286286775, - "step": 13270, - "time_per_iteration": 2.684476613998413 - }, - { - "auxiliary_loss_clip": 0.01068469, - "auxiliary_loss_mlp": 0.01040513, - "balance_loss_clip": 1.03474867, - "balance_loss_mlp": 1.02632725, - "epoch": 0.7978956861566211, - "flos": 20266331068800.0, - "grad_norm": 1.5659305382034026, - "language_loss": 0.59210402, - "learning_rate": 4.1328344075101905e-07, - "loss": 0.61319387, - "num_input_tokens_seen": 286305590, - "step": 13271, - "time_per_iteration": 2.860095262527466 - }, - { - "auxiliary_loss_clip": 0.01090884, - "auxiliary_loss_mlp": 0.01031592, - "balance_loss_clip": 1.03714991, - "balance_loss_mlp": 1.01914704, - "epoch": 0.797955809409289, - "flos": 28112417366400.0, - "grad_norm": 1.4518492514226418, - "language_loss": 0.73159599, - "learning_rate": 4.130463840939975e-07, - "loss": 0.75282073, - "num_input_tokens_seen": 286328050, - "step": 13272, - "time_per_iteration": 2.770979881286621 - }, - { - "auxiliary_loss_clip": 0.01046384, - "auxiliary_loss_mlp": 0.01036557, - "balance_loss_clip": 1.03212595, - "balance_loss_mlp": 1.023736, - "epoch": 0.798015932661957, - "flos": 15559591495680.0, - "grad_norm": 2.073152053590518, - "language_loss": 0.71566808, - "learning_rate": 4.128093876144161e-07, - "loss": 0.73649746, - "num_input_tokens_seen": 286345265, - "step": 13273, - "time_per_iteration": 2.7531182765960693 - }, - { - "auxiliary_loss_clip": 0.0108926, - "auxiliary_loss_mlp": 0.01034875, - "balance_loss_clip": 1.03732777, - "balance_loss_mlp": 1.02203608, - "epoch": 0.7980760559146249, - "flos": 23951196622080.0, - "grad_norm": 1.7484854884585128, - "language_loss": 0.75765157, - "learning_rate": 4.1257245132126117e-07, - "loss": 0.77889293, - "num_input_tokens_seen": 286364465, - "step": 13274, - "time_per_iteration": 2.788862705230713 - }, - { - "auxiliary_loss_clip": 0.0105609, - "auxiliary_loss_mlp": 0.01027938, - "balance_loss_clip": 1.03353024, - "balance_loss_mlp": 1.01679242, - "epoch": 0.798136179167293, - "flos": 28038082170240.0, - "grad_norm": 1.3747811855715935, - "language_loss": 0.77784944, - "learning_rate": 4.12335575223518e-07, - "loss": 0.79868966, - "num_input_tokens_seen": 286385565, - "step": 13275, - "time_per_iteration": 2.823310375213623 - }, - { - "auxiliary_loss_clip": 0.01100598, - "auxiliary_loss_mlp": 0.01039391, - "balance_loss_clip": 1.03790784, - "balance_loss_mlp": 1.02598548, - "epoch": 0.7981963024199609, - "flos": 35984538046080.0, - "grad_norm": 1.8295595288590525, - "language_loss": 0.63964415, - "learning_rate": 4.1209875933016877e-07, - "loss": 0.66104394, - "num_input_tokens_seen": 286403950, - "step": 13276, - "time_per_iteration": 2.6914138793945312 - }, - { - "auxiliary_loss_clip": 0.01067297, - "auxiliary_loss_mlp": 0.01030718, - "balance_loss_clip": 1.03446054, - "balance_loss_mlp": 1.01858199, - "epoch": 0.7982564256726289, - "flos": 25884914325120.0, - "grad_norm": 1.804313446176304, - "language_loss": 0.61235017, - "learning_rate": 4.118620036501945e-07, - "loss": 0.63333035, - "num_input_tokens_seen": 286426160, - "step": 13277, - "time_per_iteration": 2.7913875579833984 - }, - { - "auxiliary_loss_clip": 0.0108732, - "auxiliary_loss_mlp": 0.01033667, - "balance_loss_clip": 1.03842235, - "balance_loss_mlp": 1.0209415, - "epoch": 0.7983165489252969, - "flos": 25739152934400.0, - "grad_norm": 1.9796578843322905, - "language_loss": 0.79335415, - "learning_rate": 4.1162530819257227e-07, - "loss": 0.81456405, - "num_input_tokens_seen": 286446610, - "step": 13278, - "time_per_iteration": 2.69783353805542 - }, - { - "auxiliary_loss_clip": 0.01089196, - "auxiliary_loss_mlp": 0.01040766, - "balance_loss_clip": 1.03595579, - "balance_loss_mlp": 1.0271821, - "epoch": 0.7983766721779648, - "flos": 21908202768000.0, - "grad_norm": 1.9939125008903142, - "language_loss": 0.62796175, - "learning_rate": 4.113886729662768e-07, - "loss": 0.64926136, - "num_input_tokens_seen": 286465460, - "step": 13279, - "time_per_iteration": 2.6455893516540527 - }, - { - "auxiliary_loss_clip": 0.01093985, - "auxiliary_loss_mlp": 0.01029527, - "balance_loss_clip": 1.03608727, - "balance_loss_mlp": 1.01845241, - "epoch": 0.7984367954306328, - "flos": 29347420734720.0, - "grad_norm": 2.0437348521937633, - "language_loss": 0.71019673, - "learning_rate": 4.111520979802825e-07, - "loss": 0.73143184, - "num_input_tokens_seen": 286485720, - "step": 13280, - "time_per_iteration": 5.853861093521118 - }, - { - "auxiliary_loss_clip": 0.01071418, - "auxiliary_loss_mlp": 0.01042718, - "balance_loss_clip": 1.03524828, - "balance_loss_mlp": 1.02807951, - "epoch": 0.7984969186833007, - "flos": 31357772104320.0, - "grad_norm": 1.7977133455003094, - "language_loss": 0.62786448, - "learning_rate": 4.1091558324355955e-07, - "loss": 0.64900589, - "num_input_tokens_seen": 286507465, - "step": 13281, - "time_per_iteration": 2.8363935947418213 - }, - { - "auxiliary_loss_clip": 0.01098858, - "auxiliary_loss_mlp": 0.01032968, - "balance_loss_clip": 1.03470564, - "balance_loss_mlp": 1.0203495, - "epoch": 0.7985570419359688, - "flos": 24312924535680.0, - "grad_norm": 2.135522103798107, - "language_loss": 0.80706322, - "learning_rate": 4.1067912876507683e-07, - "loss": 0.82838148, - "num_input_tokens_seen": 286526345, - "step": 13282, - "time_per_iteration": 4.3146162033081055 - }, - { - "auxiliary_loss_clip": 0.01075396, - "auxiliary_loss_mlp": 0.00770211, - "balance_loss_clip": 1.03265977, - "balance_loss_mlp": 1.00023508, - "epoch": 0.7986171651886367, - "flos": 15742233175680.0, - "grad_norm": 1.7496465983827643, - "language_loss": 0.71291137, - "learning_rate": 4.10442734553802e-07, - "loss": 0.73136741, - "num_input_tokens_seen": 286544095, - "step": 13283, - "time_per_iteration": 2.7113521099090576 - }, - { - "auxiliary_loss_clip": 0.01094572, - "auxiliary_loss_mlp": 0.01026527, - "balance_loss_clip": 1.03506041, - "balance_loss_mlp": 1.01502371, - "epoch": 0.7986772884413047, - "flos": 11619401091840.0, - "grad_norm": 1.8142883767804951, - "language_loss": 0.73932701, - "learning_rate": 4.102064006186967e-07, - "loss": 0.76053798, - "num_input_tokens_seen": 286560960, - "step": 13284, - "time_per_iteration": 2.690788984298706 - }, - { - "auxiliary_loss_clip": 0.01081168, - "auxiliary_loss_mlp": 0.01036951, - "balance_loss_clip": 1.03430653, - "balance_loss_mlp": 1.02556038, - "epoch": 0.7987374116939726, - "flos": 22091059929600.0, - "grad_norm": 2.8983502428316252, - "language_loss": 0.70378709, - "learning_rate": 4.0997012696872415e-07, - "loss": 0.72496831, - "num_input_tokens_seen": 286579865, - "step": 13285, - "time_per_iteration": 2.6703269481658936 - }, - { - "auxiliary_loss_clip": 0.01080639, - "auxiliary_loss_mlp": 0.01033039, - "balance_loss_clip": 1.03381705, - "balance_loss_mlp": 1.02097476, - "epoch": 0.7987975349466406, - "flos": 17890696339200.0, - "grad_norm": 1.6695326991809423, - "language_loss": 0.7404871, - "learning_rate": 4.097339136128437e-07, - "loss": 0.76162386, - "num_input_tokens_seen": 286597295, - "step": 13286, - "time_per_iteration": 2.663839817047119 - }, - { - "auxiliary_loss_clip": 0.01087446, - "auxiliary_loss_mlp": 0.01030474, - "balance_loss_clip": 1.03605843, - "balance_loss_mlp": 1.01811767, - "epoch": 0.7988576581993085, - "flos": 19719232041600.0, - "grad_norm": 1.9331179632037672, - "language_loss": 0.75270319, - "learning_rate": 4.0949776056001296e-07, - "loss": 0.77388239, - "num_input_tokens_seen": 286616270, - "step": 13287, - "time_per_iteration": 2.6603620052337646 - }, - { - "auxiliary_loss_clip": 0.01086627, - "auxiliary_loss_mlp": 0.01029452, - "balance_loss_clip": 1.03799939, - "balance_loss_mlp": 1.01714361, - "epoch": 0.7989177814519766, - "flos": 28036358317440.0, - "grad_norm": 1.5251443213363312, - "language_loss": 0.61793303, - "learning_rate": 4.092616678191863e-07, - "loss": 0.63909382, - "num_input_tokens_seen": 286638315, - "step": 13288, - "time_per_iteration": 4.3285603523254395 - }, - { - "auxiliary_loss_clip": 0.01098321, - "auxiliary_loss_mlp": 0.01032216, - "balance_loss_clip": 1.03874183, - "balance_loss_mlp": 1.02039015, - "epoch": 0.7989779047046445, - "flos": 28871029630080.0, - "grad_norm": 2.003655756829568, - "language_loss": 0.70842254, - "learning_rate": 4.090256353993169e-07, - "loss": 0.72972792, - "num_input_tokens_seen": 286658630, - "step": 13289, - "time_per_iteration": 2.754244089126587 - }, - { - "auxiliary_loss_clip": 0.01077989, - "auxiliary_loss_mlp": 0.01036331, - "balance_loss_clip": 1.0401969, - "balance_loss_mlp": 1.02322364, - "epoch": 0.7990380279573125, - "flos": 18186887888640.0, - "grad_norm": 2.067060536008121, - "language_loss": 0.62479776, - "learning_rate": 4.0878966330935506e-07, - "loss": 0.64594096, - "num_input_tokens_seen": 286676870, - "step": 13290, - "time_per_iteration": 2.7182984352111816 - }, - { - "auxiliary_loss_clip": 0.01102224, - "auxiliary_loss_mlp": 0.0103293, - "balance_loss_clip": 1.03841472, - "balance_loss_mlp": 1.01973963, - "epoch": 0.7990981512099805, - "flos": 20879936127360.0, - "grad_norm": 2.07432932467733, - "language_loss": 0.71562916, - "learning_rate": 4.08553751558248e-07, - "loss": 0.73698068, - "num_input_tokens_seen": 286694300, - "step": 13291, - "time_per_iteration": 2.679877281188965 - }, - { - "auxiliary_loss_clip": 0.01071725, - "auxiliary_loss_mlp": 0.01028448, - "balance_loss_clip": 1.03726125, - "balance_loss_mlp": 1.01692605, - "epoch": 0.7991582744626484, - "flos": 26099911180800.0, - "grad_norm": 1.4271226582537684, - "language_loss": 0.63687944, - "learning_rate": 4.083179001549422e-07, - "loss": 0.65788114, - "num_input_tokens_seen": 286714545, - "step": 13292, - "time_per_iteration": 2.7268645763397217 - }, - { - "auxiliary_loss_clip": 0.01097914, - "auxiliary_loss_mlp": 0.0103158, - "balance_loss_clip": 1.03674936, - "balance_loss_mlp": 1.0198257, - "epoch": 0.7992183977153164, - "flos": 35295843605760.0, - "grad_norm": 1.6084532273776246, - "language_loss": 0.56303227, - "learning_rate": 4.0808210910838105e-07, - "loss": 0.58432722, - "num_input_tokens_seen": 286734525, - "step": 13293, - "time_per_iteration": 2.7652106285095215 - }, - { - "auxiliary_loss_clip": 0.0108332, - "auxiliary_loss_mlp": 0.0103505, - "balance_loss_clip": 1.03898919, - "balance_loss_mlp": 1.02236032, - "epoch": 0.7992785209679844, - "flos": 51853426577280.0, - "grad_norm": 6.931153518532829, - "language_loss": 0.71501821, - "learning_rate": 4.0784637842750704e-07, - "loss": 0.73620194, - "num_input_tokens_seen": 286753430, - "step": 13294, - "time_per_iteration": 2.9734227657318115 - }, - { - "auxiliary_loss_clip": 0.01071635, - "auxiliary_loss_mlp": 0.01033492, - "balance_loss_clip": 1.03379464, - "balance_loss_mlp": 1.02083826, - "epoch": 0.7993386442206524, - "flos": 22565116650240.0, - "grad_norm": 1.9346589994202708, - "language_loss": 0.72097647, - "learning_rate": 4.0761070812125675e-07, - "loss": 0.74202782, - "num_input_tokens_seen": 286771915, - "step": 13295, - "time_per_iteration": 2.8697874546051025 - }, - { - "auxiliary_loss_clip": 0.0107528, - "auxiliary_loss_mlp": 0.01033352, - "balance_loss_clip": 1.03569388, - "balance_loss_mlp": 1.02194977, - "epoch": 0.7993987674733203, - "flos": 18800277465600.0, - "grad_norm": 1.7062921905810151, - "language_loss": 0.75847328, - "learning_rate": 4.0737509819856797e-07, - "loss": 0.77955961, - "num_input_tokens_seen": 286789835, - "step": 13296, - "time_per_iteration": 2.851438522338867 - }, - { - "auxiliary_loss_clip": 0.00998558, - "auxiliary_loss_mlp": 0.01004815, - "balance_loss_clip": 1.00716496, - "balance_loss_mlp": 1.00364101, - "epoch": 0.7994588907259883, - "flos": 69421720394880.0, - "grad_norm": 0.714455868109846, - "language_loss": 0.60823548, - "learning_rate": 4.0713954866837573e-07, - "loss": 0.6282692, - "num_input_tokens_seen": 286855580, - "step": 13297, - "time_per_iteration": 3.307276725769043 - }, - { - "auxiliary_loss_clip": 0.01086945, - "auxiliary_loss_mlp": 0.01034042, - "balance_loss_clip": 1.03667986, - "balance_loss_mlp": 1.02218103, - "epoch": 0.7995190139786562, - "flos": 13480327883520.0, - "grad_norm": 2.2332538895333482, - "language_loss": 0.70562863, - "learning_rate": 4.0690405953961073e-07, - "loss": 0.72683859, - "num_input_tokens_seen": 286874360, - "step": 13298, - "time_per_iteration": 2.764620542526245 - }, - { - "auxiliary_loss_clip": 0.01073541, - "auxiliary_loss_mlp": 0.01036123, - "balance_loss_clip": 1.03546071, - "balance_loss_mlp": 1.0225215, - "epoch": 0.7995791372313242, - "flos": 21652842003840.0, - "grad_norm": 2.144443690498565, - "language_loss": 0.75778526, - "learning_rate": 4.066686308212037e-07, - "loss": 0.77888191, - "num_input_tokens_seen": 286891950, - "step": 13299, - "time_per_iteration": 2.7200376987457275 - }, - { - "auxiliary_loss_clip": 0.0108171, - "auxiliary_loss_mlp": 0.01035616, - "balance_loss_clip": 1.03365636, - "balance_loss_mlp": 1.02388, - "epoch": 0.7996392604839921, - "flos": 26068130622720.0, - "grad_norm": 1.914646005951808, - "language_loss": 0.77740645, - "learning_rate": 4.064332625220828e-07, - "loss": 0.79857981, - "num_input_tokens_seen": 286911725, - "step": 13300, - "time_per_iteration": 3.0327885150909424 - }, - { - "auxiliary_loss_clip": 0.01066534, - "auxiliary_loss_mlp": 0.01041633, - "balance_loss_clip": 1.03083372, - "balance_loss_mlp": 1.02648187, - "epoch": 0.7996993837366602, - "flos": 24606889441920.0, - "grad_norm": 1.7486826819933081, - "language_loss": 0.6372295, - "learning_rate": 4.0619795465117115e-07, - "loss": 0.65831113, - "num_input_tokens_seen": 286931400, - "step": 13301, - "time_per_iteration": 2.797971725463867 - }, - { - "auxiliary_loss_clip": 0.01096682, - "auxiliary_loss_mlp": 0.01034454, - "balance_loss_clip": 1.03674114, - "balance_loss_mlp": 1.02209187, - "epoch": 0.7997595069893281, - "flos": 20992049452800.0, - "grad_norm": 2.21348387588423, - "language_loss": 0.71967971, - "learning_rate": 4.059627072173928e-07, - "loss": 0.74099112, - "num_input_tokens_seen": 286949795, - "step": 13302, - "time_per_iteration": 2.874833822250366 - }, - { - "auxiliary_loss_clip": 0.01111886, - "auxiliary_loss_mlp": 0.00770697, - "balance_loss_clip": 1.03792214, - "balance_loss_mlp": 1.0001955, - "epoch": 0.7998196302419961, - "flos": 24426510318720.0, - "grad_norm": 2.0232516764799953, - "language_loss": 0.83735251, - "learning_rate": 4.057275202296684e-07, - "loss": 0.8561784, - "num_input_tokens_seen": 286968805, - "step": 13303, - "time_per_iteration": 2.73748779296875 - }, - { - "auxiliary_loss_clip": 0.01106654, - "auxiliary_loss_mlp": 0.01032892, - "balance_loss_clip": 1.03686202, - "balance_loss_mlp": 1.02197862, - "epoch": 0.7998797534946641, - "flos": 30264651457920.0, - "grad_norm": 1.7050821885070455, - "language_loss": 0.58436215, - "learning_rate": 4.054923936969166e-07, - "loss": 0.60575771, - "num_input_tokens_seen": 286990235, - "step": 13304, - "time_per_iteration": 2.6886215209960938 - }, - { - "auxiliary_loss_clip": 0.01111166, - "auxiliary_loss_mlp": 0.01031185, - "balance_loss_clip": 1.03615296, - "balance_loss_mlp": 1.01842976, - "epoch": 0.799939876747332, - "flos": 23513984277120.0, - "grad_norm": 1.709353821854052, - "language_loss": 0.6893419, - "learning_rate": 4.0525732762805265e-07, - "loss": 0.71076536, - "num_input_tokens_seen": 287011060, - "step": 13305, - "time_per_iteration": 2.6649460792541504 - }, - { - "auxiliary_loss_clip": 0.01072914, - "auxiliary_loss_mlp": 0.01027062, - "balance_loss_clip": 1.0366323, - "balance_loss_mlp": 1.01584458, - "epoch": 0.8, - "flos": 19318109886720.0, - "grad_norm": 1.5790890199142242, - "language_loss": 0.69499552, - "learning_rate": 4.0502232203199107e-07, - "loss": 0.71599531, - "num_input_tokens_seen": 287029215, - "step": 13306, - "time_per_iteration": 2.7563791275024414 - }, - { - "auxiliary_loss_clip": 0.01101067, - "auxiliary_loss_mlp": 0.0103442, - "balance_loss_clip": 1.03880584, - "balance_loss_mlp": 1.02221918, - "epoch": 0.800060123252668, - "flos": 32412432263040.0, - "grad_norm": 1.5764355485932124, - "language_loss": 0.69476044, - "learning_rate": 4.0478737691764286e-07, - "loss": 0.71611536, - "num_input_tokens_seen": 287050855, - "step": 13307, - "time_per_iteration": 2.732285737991333 - }, - { - "auxiliary_loss_clip": 0.01085939, - "auxiliary_loss_mlp": 0.01036111, - "balance_loss_clip": 1.0350318, - "balance_loss_mlp": 1.02393389, - "epoch": 0.800120246505336, - "flos": 20010611168640.0, - "grad_norm": 1.8640379762112131, - "language_loss": 0.76623571, - "learning_rate": 4.0455249229391677e-07, - "loss": 0.78745627, - "num_input_tokens_seen": 287069915, - "step": 13308, - "time_per_iteration": 2.642228603363037 - }, - { - "auxiliary_loss_clip": 0.01063897, - "auxiliary_loss_mlp": 0.01031546, - "balance_loss_clip": 1.03632379, - "balance_loss_mlp": 1.01817083, - "epoch": 0.8001803697580039, - "flos": 31868278151040.0, - "grad_norm": 1.4469096851593135, - "language_loss": 0.78943181, - "learning_rate": 4.0431766816972e-07, - "loss": 0.8103863, - "num_input_tokens_seen": 287091450, - "step": 13309, - "time_per_iteration": 2.864769697189331 - }, - { - "auxiliary_loss_clip": 0.01030417, - "auxiliary_loss_mlp": 0.01001696, - "balance_loss_clip": 1.00792837, - "balance_loss_mlp": 1.00063515, - "epoch": 0.8002404930106719, - "flos": 63392066916480.0, - "grad_norm": 0.9323209922385806, - "language_loss": 0.64716959, - "learning_rate": 4.040829045539571e-07, - "loss": 0.66749072, - "num_input_tokens_seen": 287148365, - "step": 13310, - "time_per_iteration": 3.1092755794525146 - }, - { - "auxiliary_loss_clip": 0.01098583, - "auxiliary_loss_mlp": 0.01033194, - "balance_loss_clip": 1.03659534, - "balance_loss_mlp": 1.02035546, - "epoch": 0.8003006162633398, - "flos": 27855476403840.0, - "grad_norm": 2.1579786023849445, - "language_loss": 0.82891053, - "learning_rate": 4.0384820145553156e-07, - "loss": 0.85022825, - "num_input_tokens_seen": 287168280, - "step": 13311, - "time_per_iteration": 2.7086493968963623 - }, - { - "auxiliary_loss_clip": 0.0109936, - "auxiliary_loss_mlp": 0.01033907, - "balance_loss_clip": 1.03775251, - "balance_loss_mlp": 1.0216639, - "epoch": 0.8003607395160078, - "flos": 18223337214720.0, - "grad_norm": 1.9933201272328842, - "language_loss": 0.66162074, - "learning_rate": 4.0361355888334116e-07, - "loss": 0.68295336, - "num_input_tokens_seen": 287185980, - "step": 13312, - "time_per_iteration": 2.680204153060913 - }, - { - "auxiliary_loss_clip": 0.01114636, - "auxiliary_loss_mlp": 0.01031944, - "balance_loss_clip": 1.04067063, - "balance_loss_mlp": 1.01846743, - "epoch": 0.8004208627686757, - "flos": 20886975192960.0, - "grad_norm": 2.3011865249501264, - "language_loss": 0.75151718, - "learning_rate": 4.033789768462843e-07, - "loss": 0.77298295, - "num_input_tokens_seen": 287203875, - "step": 13313, - "time_per_iteration": 2.606222629547119 - }, - { - "auxiliary_loss_clip": 0.0109515, - "auxiliary_loss_mlp": 0.01030962, - "balance_loss_clip": 1.03412461, - "balance_loss_mlp": 1.01851058, - "epoch": 0.8004809860213438, - "flos": 26436143416320.0, - "grad_norm": 1.3567607017939294, - "language_loss": 0.75564599, - "learning_rate": 4.031444553532575e-07, - "loss": 0.77690709, - "num_input_tokens_seen": 287226445, - "step": 13314, - "time_per_iteration": 2.6715898513793945 - }, - { - "auxiliary_loss_clip": 0.00988299, - "auxiliary_loss_mlp": 0.01000387, - "balance_loss_clip": 1.00804853, - "balance_loss_mlp": 0.99932635, - "epoch": 0.8005411092740117, - "flos": 63648612829440.0, - "grad_norm": 0.8122679233845669, - "language_loss": 0.53769958, - "learning_rate": 4.029099944131522e-07, - "loss": 0.55758643, - "num_input_tokens_seen": 287286240, - "step": 13315, - "time_per_iteration": 3.1782495975494385 - }, - { - "auxiliary_loss_clip": 0.01086886, - "auxiliary_loss_mlp": 0.01029216, - "balance_loss_clip": 1.03729582, - "balance_loss_mlp": 1.0172112, - "epoch": 0.8006012325266797, - "flos": 36138056774400.0, - "grad_norm": 1.6928178696023135, - "language_loss": 0.71341288, - "learning_rate": 4.026755940348603e-07, - "loss": 0.7345739, - "num_input_tokens_seen": 287310265, - "step": 13316, - "time_per_iteration": 2.7924816608428955 - }, - { - "auxiliary_loss_clip": 0.01091573, - "auxiliary_loss_mlp": 0.01030799, - "balance_loss_clip": 1.03969979, - "balance_loss_mlp": 1.0183655, - "epoch": 0.8006613557793477, - "flos": 33838947970560.0, - "grad_norm": 1.868325107289893, - "language_loss": 0.64874738, - "learning_rate": 4.024412542272706e-07, - "loss": 0.66997111, - "num_input_tokens_seen": 287331610, - "step": 13317, - "time_per_iteration": 2.7774088382720947 - }, - { - "auxiliary_loss_clip": 0.01029734, - "auxiliary_loss_mlp": 0.01001074, - "balance_loss_clip": 1.00732291, - "balance_loss_mlp": 1.00008476, - "epoch": 0.8007214790320156, - "flos": 67348310699520.0, - "grad_norm": 0.7791846864300481, - "language_loss": 0.59069222, - "learning_rate": 4.0220697499926783e-07, - "loss": 0.61100036, - "num_input_tokens_seen": 287394795, - "step": 13318, - "time_per_iteration": 4.755454778671265 - }, - { - "auxiliary_loss_clip": 0.01074086, - "auxiliary_loss_mlp": 0.01027358, - "balance_loss_clip": 1.03581715, - "balance_loss_mlp": 1.01549029, - "epoch": 0.8007816022846836, - "flos": 23185653033600.0, - "grad_norm": 1.8075855078848244, - "language_loss": 0.66764301, - "learning_rate": 4.019727563597366e-07, - "loss": 0.68865746, - "num_input_tokens_seen": 287414595, - "step": 13319, - "time_per_iteration": 4.444296360015869 - }, - { - "auxiliary_loss_clip": 0.0111121, - "auxiliary_loss_mlp": 0.00771312, - "balance_loss_clip": 1.03728712, - "balance_loss_mlp": 1.00022757, - "epoch": 0.8008417255373516, - "flos": 21981388728960.0, - "grad_norm": 1.8607859210030597, - "language_loss": 0.74157208, - "learning_rate": 4.0173859831755873e-07, - "loss": 0.76039732, - "num_input_tokens_seen": 287434395, - "step": 13320, - "time_per_iteration": 2.628570079803467 - }, - { - "auxiliary_loss_clip": 0.01097073, - "auxiliary_loss_mlp": 0.01026936, - "balance_loss_clip": 1.0365932, - "balance_loss_mlp": 1.01422882, - "epoch": 0.8009018487900196, - "flos": 16727334647040.0, - "grad_norm": 1.9175300817586667, - "language_loss": 0.80223489, - "learning_rate": 4.015045008816138e-07, - "loss": 0.823475, - "num_input_tokens_seen": 287450590, - "step": 13321, - "time_per_iteration": 4.052290201187134 - }, - { - "auxiliary_loss_clip": 0.01033155, - "auxiliary_loss_mlp": 0.01036586, - "balance_loss_clip": 1.02668345, - "balance_loss_mlp": 1.02364588, - "epoch": 0.8009619720426875, - "flos": 20813609664000.0, - "grad_norm": 1.8862095260452836, - "language_loss": 0.66014248, - "learning_rate": 4.0127046406077825e-07, - "loss": 0.6808399, - "num_input_tokens_seen": 287468455, - "step": 13322, - "time_per_iteration": 2.7416417598724365 - }, - { - "auxiliary_loss_clip": 0.01099704, - "auxiliary_loss_mlp": 0.01028354, - "balance_loss_clip": 1.03734875, - "balance_loss_mlp": 1.01642156, - "epoch": 0.8010220952953555, - "flos": 17931096161280.0, - "grad_norm": 1.9049008418549798, - "language_loss": 0.77709258, - "learning_rate": 4.010364878639265e-07, - "loss": 0.79837316, - "num_input_tokens_seen": 287486485, - "step": 13323, - "time_per_iteration": 2.6071035861968994 - }, - { - "auxiliary_loss_clip": 0.01110946, - "auxiliary_loss_mlp": 0.01029696, - "balance_loss_clip": 1.03769231, - "balance_loss_mlp": 1.01716661, - "epoch": 0.8010822185480234, - "flos": 24572235795840.0, - "grad_norm": 2.445337752212116, - "language_loss": 0.71122754, - "learning_rate": 4.00802572299932e-07, - "loss": 0.73263395, - "num_input_tokens_seen": 287503940, - "step": 13324, - "time_per_iteration": 2.6217870712280273 - }, - { - "auxiliary_loss_clip": 0.01068071, - "auxiliary_loss_mlp": 0.0103353, - "balance_loss_clip": 1.03280735, - "balance_loss_mlp": 1.02047682, - "epoch": 0.8011423418006914, - "flos": 21829988903040.0, - "grad_norm": 1.814435796416432, - "language_loss": 0.76471907, - "learning_rate": 4.005687173776635e-07, - "loss": 0.78573507, - "num_input_tokens_seen": 287521660, - "step": 13325, - "time_per_iteration": 2.6970367431640625 - }, - { - "auxiliary_loss_clip": 0.01084618, - "auxiliary_loss_mlp": 0.01027508, - "balance_loss_clip": 1.03447258, - "balance_loss_mlp": 1.01634359, - "epoch": 0.8012024650533593, - "flos": 23915178259200.0, - "grad_norm": 1.5321170582331973, - "language_loss": 0.7980848, - "learning_rate": 4.003349231059898e-07, - "loss": 0.81920606, - "num_input_tokens_seen": 287541505, - "step": 13326, - "time_per_iteration": 2.6341090202331543 - }, - { - "auxiliary_loss_clip": 0.0109705, - "auxiliary_loss_mlp": 0.01032819, - "balance_loss_clip": 1.03666115, - "balance_loss_mlp": 1.02096391, - "epoch": 0.8012625883060274, - "flos": 23587062497280.0, - "grad_norm": 1.9170928763125719, - "language_loss": 0.65865368, - "learning_rate": 4.001011894937765e-07, - "loss": 0.67995238, - "num_input_tokens_seen": 287560015, - "step": 13327, - "time_per_iteration": 4.200170278549194 - }, - { - "auxiliary_loss_clip": 0.01094832, - "auxiliary_loss_mlp": 0.01031408, - "balance_loss_clip": 1.03746152, - "balance_loss_mlp": 1.02033961, - "epoch": 0.8013227115586953, - "flos": 20813932886400.0, - "grad_norm": 1.5945061628863433, - "language_loss": 0.73482913, - "learning_rate": 3.9986751654988636e-07, - "loss": 0.75609159, - "num_input_tokens_seen": 287579150, - "step": 13328, - "time_per_iteration": 2.598289966583252 - }, - { - "auxiliary_loss_clip": 0.01050876, - "auxiliary_loss_mlp": 0.01034952, - "balance_loss_clip": 1.03355122, - "balance_loss_mlp": 1.02166045, - "epoch": 0.8013828348113633, - "flos": 15888317788800.0, - "grad_norm": 1.9762883167731011, - "language_loss": 0.73578757, - "learning_rate": 3.996339042831798e-07, - "loss": 0.7566458, - "num_input_tokens_seen": 287597420, - "step": 13329, - "time_per_iteration": 2.738548994064331 - }, - { - "auxiliary_loss_clip": 0.0102058, - "auxiliary_loss_mlp": 0.00999735, - "balance_loss_clip": 1.0074687, - "balance_loss_mlp": 0.99866766, - "epoch": 0.8014429580640313, - "flos": 71062981562880.0, - "grad_norm": 0.6934041027763224, - "language_loss": 0.52926564, - "learning_rate": 3.9940035270251605e-07, - "loss": 0.54946882, - "num_input_tokens_seen": 287667280, - "step": 13330, - "time_per_iteration": 3.3172037601470947 - }, - { - "auxiliary_loss_clip": 0.01083958, - "auxiliary_loss_mlp": 0.01037459, - "balance_loss_clip": 1.03489339, - "balance_loss_mlp": 1.02364862, - "epoch": 0.8015030813166992, - "flos": 23076340968960.0, - "grad_norm": 1.7329849942476805, - "language_loss": 0.7308808, - "learning_rate": 3.991668618167519e-07, - "loss": 0.75209498, - "num_input_tokens_seen": 287687375, - "step": 13331, - "time_per_iteration": 2.7093939781188965 - }, - { - "auxiliary_loss_clip": 0.01091699, - "auxiliary_loss_mlp": 0.01029361, - "balance_loss_clip": 1.03614366, - "balance_loss_mlp": 1.01829839, - "epoch": 0.8015632045693672, - "flos": 21872328059520.0, - "grad_norm": 1.8780665842935151, - "language_loss": 0.77335048, - "learning_rate": 3.989334316347401e-07, - "loss": 0.79456115, - "num_input_tokens_seen": 287707895, - "step": 13332, - "time_per_iteration": 2.708766460418701 - }, - { - "auxiliary_loss_clip": 0.01110082, - "auxiliary_loss_mlp": 0.01032572, - "balance_loss_clip": 1.03853345, - "balance_loss_mlp": 1.02041256, - "epoch": 0.8016233278220352, - "flos": 23656728925440.0, - "grad_norm": 1.9285581629240347, - "language_loss": 0.83625793, - "learning_rate": 3.987000621653338e-07, - "loss": 0.85768449, - "num_input_tokens_seen": 287723990, - "step": 13333, - "time_per_iteration": 2.6203196048736572 - }, - { - "auxiliary_loss_clip": 0.01088802, - "auxiliary_loss_mlp": 0.01032312, - "balance_loss_clip": 1.03681588, - "balance_loss_mlp": 1.02005112, - "epoch": 0.8016834510747032, - "flos": 16253170185600.0, - "grad_norm": 2.0639926273292115, - "language_loss": 0.73560673, - "learning_rate": 3.9846675341738133e-07, - "loss": 0.75681788, - "num_input_tokens_seen": 287742380, - "step": 13334, - "time_per_iteration": 2.674370765686035 - }, - { - "auxiliary_loss_clip": 0.01068855, - "auxiliary_loss_mlp": 0.01038369, - "balance_loss_clip": 1.03341401, - "balance_loss_mlp": 1.02465343, - "epoch": 0.8017435743273711, - "flos": 12276027665280.0, - "grad_norm": 3.712822925491278, - "language_loss": 0.7483573, - "learning_rate": 3.9823350539972967e-07, - "loss": 0.7694295, - "num_input_tokens_seen": 287760130, - "step": 13335, - "time_per_iteration": 2.661638021469116 - }, - { - "auxiliary_loss_clip": 0.01067475, - "auxiliary_loss_mlp": 0.01033285, - "balance_loss_clip": 1.03284895, - "balance_loss_mlp": 1.02039289, - "epoch": 0.8018036975800391, - "flos": 17196112068480.0, - "grad_norm": 1.8858612114976723, - "language_loss": 0.75267804, - "learning_rate": 3.9800031812122416e-07, - "loss": 0.77368569, - "num_input_tokens_seen": 287777565, - "step": 13336, - "time_per_iteration": 2.716108560562134 - }, - { - "auxiliary_loss_clip": 0.01077828, - "auxiliary_loss_mlp": 0.01037534, - "balance_loss_clip": 1.03872991, - "balance_loss_mlp": 1.02433777, - "epoch": 0.801863820832707, - "flos": 20631865824000.0, - "grad_norm": 2.2915329222004153, - "language_loss": 0.75145626, - "learning_rate": 3.977671915907068e-07, - "loss": 0.77260983, - "num_input_tokens_seen": 287796310, - "step": 13337, - "time_per_iteration": 2.714571237564087 - }, - { - "auxiliary_loss_clip": 0.0105226, - "auxiliary_loss_mlp": 0.00771062, - "balance_loss_clip": 1.03701448, - "balance_loss_mlp": 1.00021958, - "epoch": 0.801923944085375, - "flos": 30445569285120.0, - "grad_norm": 1.6114282506426694, - "language_loss": 0.80135483, - "learning_rate": 3.9753412581701883e-07, - "loss": 0.81958807, - "num_input_tokens_seen": 287817330, - "step": 13338, - "time_per_iteration": 2.8196728229522705 - }, - { - "auxiliary_loss_clip": 0.01073348, - "auxiliary_loss_mlp": 0.01033358, - "balance_loss_clip": 1.03255105, - "balance_loss_mlp": 1.01937521, - "epoch": 0.801984067338043, - "flos": 20010575255040.0, - "grad_norm": 1.8585955829202727, - "language_loss": 0.74602437, - "learning_rate": 3.9730112080899733e-07, - "loss": 0.76709145, - "num_input_tokens_seen": 287835095, - "step": 13339, - "time_per_iteration": 2.6212968826293945 - }, - { - "auxiliary_loss_clip": 0.01096453, - "auxiliary_loss_mlp": 0.01029361, - "balance_loss_clip": 1.03771079, - "balance_loss_mlp": 1.01769042, - "epoch": 0.802044190590711, - "flos": 22784028088320.0, - "grad_norm": 1.7386931657461442, - "language_loss": 0.79321545, - "learning_rate": 3.970681765754775e-07, - "loss": 0.81447363, - "num_input_tokens_seen": 287854595, - "step": 13340, - "time_per_iteration": 2.6530919075012207 - }, - { - "auxiliary_loss_clip": 0.01083163, - "auxiliary_loss_mlp": 0.01032731, - "balance_loss_clip": 1.04112291, - "balance_loss_mlp": 1.02116799, - "epoch": 0.8021043138433789, - "flos": 27600115639680.0, - "grad_norm": 1.956756887496364, - "language_loss": 0.68165088, - "learning_rate": 3.968352931252936e-07, - "loss": 0.70280981, - "num_input_tokens_seen": 287876960, - "step": 13341, - "time_per_iteration": 2.75055193901062 - }, - { - "auxiliary_loss_clip": 0.01012323, - "auxiliary_loss_mlp": 0.01007998, - "balance_loss_clip": 1.00806713, - "balance_loss_mlp": 1.00693703, - "epoch": 0.8021644370960469, - "flos": 62063730057600.0, - "grad_norm": 0.8136387701822201, - "language_loss": 0.61581981, - "learning_rate": 3.9660247046727547e-07, - "loss": 0.63602304, - "num_input_tokens_seen": 287936530, - "step": 13342, - "time_per_iteration": 3.1247668266296387 - }, - { - "auxiliary_loss_clip": 0.01092566, - "auxiliary_loss_mlp": 0.01037048, - "balance_loss_clip": 1.03939772, - "balance_loss_mlp": 1.02370882, - "epoch": 0.8022245603487148, - "flos": 23361794352000.0, - "grad_norm": 1.856395424623049, - "language_loss": 0.63709104, - "learning_rate": 3.963697086102522e-07, - "loss": 0.65838718, - "num_input_tokens_seen": 287954285, - "step": 13343, - "time_per_iteration": 2.7734808921813965 - }, - { - "auxiliary_loss_clip": 0.01081526, - "auxiliary_loss_mlp": 0.01029916, - "balance_loss_clip": 1.03520012, - "balance_loss_mlp": 1.01859128, - "epoch": 0.8022846836013828, - "flos": 10853354712960.0, - "grad_norm": 2.8925242692111124, - "language_loss": 0.68967628, - "learning_rate": 3.96137007563051e-07, - "loss": 0.71079069, - "num_input_tokens_seen": 287971595, - "step": 13344, - "time_per_iteration": 2.7123825550079346 - }, - { - "auxiliary_loss_clip": 0.01099765, - "auxiliary_loss_mlp": 0.0102957, - "balance_loss_clip": 1.03843033, - "balance_loss_mlp": 1.01712489, - "epoch": 0.8023448068540509, - "flos": 29240443054080.0, - "grad_norm": 1.7802127623764623, - "language_loss": 0.7023524, - "learning_rate": 3.9590436733449506e-07, - "loss": 0.72364575, - "num_input_tokens_seen": 287992540, - "step": 13345, - "time_per_iteration": 2.7695276737213135 - }, - { - "auxiliary_loss_clip": 0.01013378, - "auxiliary_loss_mlp": 0.01005433, - "balance_loss_clip": 1.00990939, - "balance_loss_mlp": 1.00426471, - "epoch": 0.8024049301067188, - "flos": 64153588181760.0, - "grad_norm": 0.8891261669037472, - "language_loss": 0.62973511, - "learning_rate": 3.956717879334059e-07, - "loss": 0.64992326, - "num_input_tokens_seen": 288052810, - "step": 13346, - "time_per_iteration": 3.28011417388916 - }, - { - "auxiliary_loss_clip": 0.01084414, - "auxiliary_loss_mlp": 0.01031015, - "balance_loss_clip": 1.03860998, - "balance_loss_mlp": 1.01868272, - "epoch": 0.8024650533593868, - "flos": 28585360765440.0, - "grad_norm": 2.315219650527018, - "language_loss": 0.72604311, - "learning_rate": 3.9543926936860327e-07, - "loss": 0.74719733, - "num_input_tokens_seen": 288073045, - "step": 13347, - "time_per_iteration": 2.7291135787963867 - }, - { - "auxiliary_loss_clip": 0.01098598, - "auxiliary_loss_mlp": 0.01032216, - "balance_loss_clip": 1.0363127, - "balance_loss_mlp": 1.01959181, - "epoch": 0.8025251766120547, - "flos": 16982264448000.0, - "grad_norm": 1.7959998434769961, - "language_loss": 0.72794473, - "learning_rate": 3.9520681164890493e-07, - "loss": 0.74925292, - "num_input_tokens_seen": 288091165, - "step": 13348, - "time_per_iteration": 2.623680353164673 - }, - { - "auxiliary_loss_clip": 0.01083208, - "auxiliary_loss_mlp": 0.01030443, - "balance_loss_clip": 1.03903532, - "balance_loss_mlp": 1.0179677, - "epoch": 0.8025852998647227, - "flos": 22163671272960.0, - "grad_norm": 3.547597089549619, - "language_loss": 0.75893748, - "learning_rate": 3.9497441478312444e-07, - "loss": 0.780074, - "num_input_tokens_seen": 288110595, - "step": 13349, - "time_per_iteration": 2.658114433288574 - }, - { - "auxiliary_loss_clip": 0.01110466, - "auxiliary_loss_mlp": 0.01034774, - "balance_loss_clip": 1.03946209, - "balance_loss_mlp": 1.02321064, - "epoch": 0.8026454231173906, - "flos": 22017012042240.0, - "grad_norm": 2.407592259971092, - "language_loss": 0.83429128, - "learning_rate": 3.947420787800755e-07, - "loss": 0.85574365, - "num_input_tokens_seen": 288128995, - "step": 13350, - "time_per_iteration": 2.6693131923675537 - }, - { - "auxiliary_loss_clip": 0.01100877, - "auxiliary_loss_mlp": 0.01036159, - "balance_loss_clip": 1.03946972, - "balance_loss_mlp": 1.02371919, - "epoch": 0.8027055463700586, - "flos": 22491320158080.0, - "grad_norm": 2.406570051160922, - "language_loss": 0.71667969, - "learning_rate": 3.945098036485679e-07, - "loss": 0.7380501, - "num_input_tokens_seen": 288149265, - "step": 13351, - "time_per_iteration": 2.6675031185150146 - }, - { - "auxiliary_loss_clip": 0.01069791, - "auxiliary_loss_mlp": 0.01034939, - "balance_loss_clip": 1.03470254, - "balance_loss_mlp": 1.02237439, - "epoch": 0.8027656696227266, - "flos": 28912901909760.0, - "grad_norm": 1.586066811433664, - "language_loss": 0.61656845, - "learning_rate": 3.9427758939740885e-07, - "loss": 0.63761568, - "num_input_tokens_seen": 288170745, - "step": 13352, - "time_per_iteration": 2.8598105907440186 - }, - { - "auxiliary_loss_clip": 0.01096816, - "auxiliary_loss_mlp": 0.0103672, - "balance_loss_clip": 1.03765738, - "balance_loss_mlp": 1.02495408, - "epoch": 0.8028257928753946, - "flos": 18589374760320.0, - "grad_norm": 2.4706643643447346, - "language_loss": 0.76973253, - "learning_rate": 3.940454360354046e-07, - "loss": 0.79106784, - "num_input_tokens_seen": 288189415, - "step": 13353, - "time_per_iteration": 2.6434624195098877 - }, - { - "auxiliary_loss_clip": 0.01052438, - "auxiliary_loss_mlp": 0.01029892, - "balance_loss_clip": 1.03585696, - "balance_loss_mlp": 1.01597464, - "epoch": 0.8028859161280625, - "flos": 19130009339520.0, - "grad_norm": 2.077119473640625, - "language_loss": 0.73317617, - "learning_rate": 3.938133435713582e-07, - "loss": 0.75399947, - "num_input_tokens_seen": 288206900, - "step": 13354, - "time_per_iteration": 2.980314254760742 - }, - { - "auxiliary_loss_clip": 0.01069099, - "auxiliary_loss_mlp": 0.01040669, - "balance_loss_clip": 1.03414679, - "balance_loss_mlp": 1.02725756, - "epoch": 0.8029460393807305, - "flos": 20229881742720.0, - "grad_norm": 2.1378474316121463, - "language_loss": 0.65846258, - "learning_rate": 3.935813120140714e-07, - "loss": 0.6795603, - "num_input_tokens_seen": 288224800, - "step": 13355, - "time_per_iteration": 2.7468628883361816 - }, - { - "auxiliary_loss_clip": 0.01074013, - "auxiliary_loss_mlp": 0.01033759, - "balance_loss_clip": 1.03249073, - "balance_loss_mlp": 1.0199666, - "epoch": 0.8030061626333984, - "flos": 49783320933120.0, - "grad_norm": 2.375725357962007, - "language_loss": 0.68678093, - "learning_rate": 3.9334934137234235e-07, - "loss": 0.70785862, - "num_input_tokens_seen": 288249400, - "step": 13356, - "time_per_iteration": 2.9967265129089355 - }, - { - "auxiliary_loss_clip": 0.01069606, - "auxiliary_loss_mlp": 0.01029553, - "balance_loss_clip": 1.04181337, - "balance_loss_mlp": 1.01715517, - "epoch": 0.8030662858860664, - "flos": 21615243442560.0, - "grad_norm": 1.6420809304717021, - "language_loss": 0.77664089, - "learning_rate": 3.931174316549666e-07, - "loss": 0.79763246, - "num_input_tokens_seen": 288268780, - "step": 13357, - "time_per_iteration": 4.406202077865601 - }, - { - "auxiliary_loss_clip": 0.01074511, - "auxiliary_loss_mlp": 0.01032493, - "balance_loss_clip": 1.03333926, - "balance_loss_mlp": 1.01853395, - "epoch": 0.8031264091387345, - "flos": 25630056351360.0, - "grad_norm": 1.4188016653576663, - "language_loss": 0.77055764, - "learning_rate": 3.9288558287073937e-07, - "loss": 0.79162776, - "num_input_tokens_seen": 288290830, - "step": 13358, - "time_per_iteration": 4.418318033218384 - }, - { - "auxiliary_loss_clip": 0.01097306, - "auxiliary_loss_mlp": 0.01028987, - "balance_loss_clip": 1.03661919, - "balance_loss_mlp": 1.01740575, - "epoch": 0.8031865323914024, - "flos": 19646225648640.0, - "grad_norm": 1.5140849812100452, - "language_loss": 0.84604448, - "learning_rate": 3.9265379502845143e-07, - "loss": 0.86730748, - "num_input_tokens_seen": 288308865, - "step": 13359, - "time_per_iteration": 2.6452579498291016 - }, - { - "auxiliary_loss_clip": 0.01081667, - "auxiliary_loss_mlp": 0.01025679, - "balance_loss_clip": 1.03710377, - "balance_loss_mlp": 1.01406813, - "epoch": 0.8032466556440704, - "flos": 26169110732160.0, - "grad_norm": 1.8313021321254959, - "language_loss": 0.73854876, - "learning_rate": 3.924220681368928e-07, - "loss": 0.75962222, - "num_input_tokens_seen": 288327325, - "step": 13360, - "time_per_iteration": 2.7636659145355225 - }, - { - "auxiliary_loss_clip": 0.01110485, - "auxiliary_loss_mlp": 0.01027583, - "balance_loss_clip": 1.03801131, - "balance_loss_mlp": 1.01598358, - "epoch": 0.8033067788967383, - "flos": 25520026014720.0, - "grad_norm": 2.137959732287125, - "language_loss": 0.69831038, - "learning_rate": 3.921904022048512e-07, - "loss": 0.71969098, - "num_input_tokens_seen": 288347285, - "step": 13361, - "time_per_iteration": 4.267240524291992 - }, - { - "auxiliary_loss_clip": 0.01112515, - "auxiliary_loss_mlp": 0.01035596, - "balance_loss_clip": 1.03754067, - "balance_loss_mlp": 1.02316272, - "epoch": 0.8033669021494063, - "flos": 24024274842240.0, - "grad_norm": 1.8009112987643567, - "language_loss": 0.70254129, - "learning_rate": 3.919587972411098e-07, - "loss": 0.72402239, - "num_input_tokens_seen": 288367785, - "step": 13362, - "time_per_iteration": 2.6688599586486816 - }, - { - "auxiliary_loss_clip": 0.01116592, - "auxiliary_loss_mlp": 0.0103705, - "balance_loss_clip": 1.03921294, - "balance_loss_mlp": 1.02289987, - "epoch": 0.8034270254020742, - "flos": 13588059749760.0, - "grad_norm": 2.3399431246123132, - "language_loss": 0.78629005, - "learning_rate": 3.91727253254452e-07, - "loss": 0.80782652, - "num_input_tokens_seen": 288384135, - "step": 13363, - "time_per_iteration": 2.597430944442749 - }, - { - "auxiliary_loss_clip": 0.01097254, - "auxiliary_loss_mlp": 0.01028939, - "balance_loss_clip": 1.03588295, - "balance_loss_mlp": 1.01675546, - "epoch": 0.8034871486547422, - "flos": 27412661537280.0, - "grad_norm": 6.139321315724576, - "language_loss": 0.74428964, - "learning_rate": 3.9149577025365787e-07, - "loss": 0.76555157, - "num_input_tokens_seen": 288403805, - "step": 13364, - "time_per_iteration": 2.688309669494629 - }, - { - "auxiliary_loss_clip": 0.01096585, - "auxiliary_loss_mlp": 0.01030723, - "balance_loss_clip": 1.03990126, - "balance_loss_mlp": 1.0187124, - "epoch": 0.8035472719074102, - "flos": 32598593475840.0, - "grad_norm": 2.077806609776792, - "language_loss": 0.61057466, - "learning_rate": 3.9126434824750596e-07, - "loss": 0.63184774, - "num_input_tokens_seen": 288424895, - "step": 13365, - "time_per_iteration": 2.9324018955230713 - }, - { - "auxiliary_loss_clip": 0.01089765, - "auxiliary_loss_mlp": 0.01034058, - "balance_loss_clip": 1.03685248, - "balance_loss_mlp": 1.02096307, - "epoch": 0.8036073951600782, - "flos": 21287989607040.0, - "grad_norm": 1.971281433653639, - "language_loss": 0.66274738, - "learning_rate": 3.910329872447706e-07, - "loss": 0.68398559, - "num_input_tokens_seen": 288443865, - "step": 13366, - "time_per_iteration": 4.221456050872803 - }, - { - "auxiliary_loss_clip": 0.01106605, - "auxiliary_loss_mlp": 0.01031218, - "balance_loss_clip": 1.03672922, - "balance_loss_mlp": 1.01938665, - "epoch": 0.8036675184127461, - "flos": 18113845582080.0, - "grad_norm": 1.997291212558988, - "language_loss": 0.74654198, - "learning_rate": 3.908016872542259e-07, - "loss": 0.7679202, - "num_input_tokens_seen": 288461065, - "step": 13367, - "time_per_iteration": 2.6199634075164795 - }, - { - "auxiliary_loss_clip": 0.01108205, - "auxiliary_loss_mlp": 0.01027463, - "balance_loss_clip": 1.03705049, - "balance_loss_mlp": 1.01538706, - "epoch": 0.8037276416654141, - "flos": 26030280666240.0, - "grad_norm": 1.603723731254455, - "language_loss": 0.73827767, - "learning_rate": 3.905704482846428e-07, - "loss": 0.75963438, - "num_input_tokens_seen": 288481865, - "step": 13368, - "time_per_iteration": 2.6368210315704346 - }, - { - "auxiliary_loss_clip": 0.01110551, - "auxiliary_loss_mlp": 0.01031277, - "balance_loss_clip": 1.03718567, - "balance_loss_mlp": 1.01879573, - "epoch": 0.803787764918082, - "flos": 18802180886400.0, - "grad_norm": 1.9805811504758806, - "language_loss": 0.70231676, - "learning_rate": 3.90339270344789e-07, - "loss": 0.72373503, - "num_input_tokens_seen": 288499345, - "step": 13369, - "time_per_iteration": 2.5763745307922363 - }, - { - "auxiliary_loss_clip": 0.01088545, - "auxiliary_loss_mlp": 0.01033097, - "balance_loss_clip": 1.03672147, - "balance_loss_mlp": 1.02170682, - "epoch": 0.80384788817075, - "flos": 20225787592320.0, - "grad_norm": 2.6910439854166466, - "language_loss": 0.73273438, - "learning_rate": 3.901081534434312e-07, - "loss": 0.75395083, - "num_input_tokens_seen": 288517660, - "step": 13370, - "time_per_iteration": 2.748764753341675 - }, - { - "auxiliary_loss_clip": 0.01087131, - "auxiliary_loss_mlp": 0.01032078, - "balance_loss_clip": 1.03560901, - "balance_loss_mlp": 1.01849425, - "epoch": 0.8039080114234181, - "flos": 18515290959360.0, - "grad_norm": 2.9168856587883987, - "language_loss": 0.86785686, - "learning_rate": 3.898770975893342e-07, - "loss": 0.88904893, - "num_input_tokens_seen": 288534180, - "step": 13371, - "time_per_iteration": 2.7640862464904785 - }, - { - "auxiliary_loss_clip": 0.01100956, - "auxiliary_loss_mlp": 0.01032312, - "balance_loss_clip": 1.03582239, - "balance_loss_mlp": 1.0192045, - "epoch": 0.803968134676086, - "flos": 22382510883840.0, - "grad_norm": 1.8421468200068354, - "language_loss": 0.75026673, - "learning_rate": 3.89646102791259e-07, - "loss": 0.77159941, - "num_input_tokens_seen": 288553350, - "step": 13372, - "time_per_iteration": 2.724491596221924 - }, - { - "auxiliary_loss_clip": 0.01068816, - "auxiliary_loss_mlp": 0.01031309, - "balance_loss_clip": 1.03654325, - "balance_loss_mlp": 1.01796961, - "epoch": 0.804028257928754, - "flos": 23842566915840.0, - "grad_norm": 2.31065628339188, - "language_loss": 0.79036891, - "learning_rate": 3.894151690579646e-07, - "loss": 0.81137019, - "num_input_tokens_seen": 288571325, - "step": 13373, - "time_per_iteration": 2.910059928894043 - }, - { - "auxiliary_loss_clip": 0.01081798, - "auxiliary_loss_mlp": 0.01035447, - "balance_loss_clip": 1.03376925, - "balance_loss_mlp": 1.02387166, - "epoch": 0.8040883811814219, - "flos": 23550720912000.0, - "grad_norm": 1.7053581559181326, - "language_loss": 0.74311471, - "learning_rate": 3.8918429639820815e-07, - "loss": 0.76428711, - "num_input_tokens_seen": 288592100, - "step": 13374, - "time_per_iteration": 2.698894500732422 - }, - { - "auxiliary_loss_clip": 0.01059369, - "auxiliary_loss_mlp": 0.01037575, - "balance_loss_clip": 1.03106141, - "balance_loss_mlp": 1.02297187, - "epoch": 0.8041485044340899, - "flos": 19026263882880.0, - "grad_norm": 1.889259029929228, - "language_loss": 0.6848501, - "learning_rate": 3.889534848207452e-07, - "loss": 0.70581961, - "num_input_tokens_seen": 288612305, - "step": 13375, - "time_per_iteration": 2.781163215637207 - }, - { - "auxiliary_loss_clip": 0.01008954, - "auxiliary_loss_mlp": 0.01001942, - "balance_loss_clip": 1.01513779, - "balance_loss_mlp": 1.00076795, - "epoch": 0.8042086276867578, - "flos": 70005663797760.0, - "grad_norm": 0.7220073692688251, - "language_loss": 0.55658954, - "learning_rate": 3.887227343343271e-07, - "loss": 0.57669854, - "num_input_tokens_seen": 288676015, - "step": 13376, - "time_per_iteration": 3.373588800430298 - }, - { - "auxiliary_loss_clip": 0.01056178, - "auxiliary_loss_mlp": 0.01035137, - "balance_loss_clip": 1.03220654, - "balance_loss_mlp": 1.02096248, - "epoch": 0.8042687509394258, - "flos": 21872435800320.0, - "grad_norm": 1.964983939907185, - "language_loss": 0.72909731, - "learning_rate": 3.8849204494770425e-07, - "loss": 0.75001043, - "num_input_tokens_seen": 288696455, - "step": 13377, - "time_per_iteration": 2.8420963287353516 - }, - { - "auxiliary_loss_clip": 0.01095422, - "auxiliary_loss_mlp": 0.01029914, - "balance_loss_clip": 1.03510308, - "balance_loss_mlp": 1.01737309, - "epoch": 0.8043288741920938, - "flos": 26614870513920.0, - "grad_norm": 1.843970634280457, - "language_loss": 0.70282233, - "learning_rate": 3.8826141666962567e-07, - "loss": 0.72407568, - "num_input_tokens_seen": 288715560, - "step": 13378, - "time_per_iteration": 2.656498670578003 - }, - { - "auxiliary_loss_clip": 0.0110247, - "auxiliary_loss_mlp": 0.01027065, - "balance_loss_clip": 1.03830576, - "balance_loss_mlp": 1.01435089, - "epoch": 0.8043889974447618, - "flos": 33403387651200.0, - "grad_norm": 1.3626557970625712, - "language_loss": 0.69352663, - "learning_rate": 3.880308495088347e-07, - "loss": 0.71482199, - "num_input_tokens_seen": 288739485, - "step": 13379, - "time_per_iteration": 2.725536584854126 - }, - { - "auxiliary_loss_clip": 0.01115659, - "auxiliary_loss_mlp": 0.01034653, - "balance_loss_clip": 1.04020107, - "balance_loss_mlp": 1.02027059, - "epoch": 0.8044491206974297, - "flos": 20375966355840.0, - "grad_norm": 1.7895941643177822, - "language_loss": 0.76386261, - "learning_rate": 3.8780034347407533e-07, - "loss": 0.7853657, - "num_input_tokens_seen": 288757420, - "step": 13380, - "time_per_iteration": 2.560413360595703 - }, - { - "auxiliary_loss_clip": 0.01062218, - "auxiliary_loss_mlp": 0.01029167, - "balance_loss_clip": 1.03264856, - "balance_loss_mlp": 1.01679909, - "epoch": 0.8045092439500977, - "flos": 23403810286080.0, - "grad_norm": 2.4010662161686813, - "language_loss": 0.69055688, - "learning_rate": 3.875698985740887e-07, - "loss": 0.71147072, - "num_input_tokens_seen": 288775535, - "step": 13381, - "time_per_iteration": 2.7233426570892334 - }, - { - "auxiliary_loss_clip": 0.01102054, - "auxiliary_loss_mlp": 0.01033657, - "balance_loss_clip": 1.03834701, - "balance_loss_mlp": 1.02112257, - "epoch": 0.8045693672027656, - "flos": 24097245321600.0, - "grad_norm": 1.7871560626135898, - "language_loss": 0.63795519, - "learning_rate": 3.873395148176135e-07, - "loss": 0.65931231, - "num_input_tokens_seen": 288795035, - "step": 13382, - "time_per_iteration": 2.62091326713562 - }, - { - "auxiliary_loss_clip": 0.01086707, - "auxiliary_loss_mlp": 0.01036462, - "balance_loss_clip": 1.03742146, - "balance_loss_mlp": 1.02481508, - "epoch": 0.8046294904554336, - "flos": 27707165147520.0, - "grad_norm": 4.567282213112271, - "language_loss": 0.7625041, - "learning_rate": 3.8710919221338487e-07, - "loss": 0.78373575, - "num_input_tokens_seen": 288816270, - "step": 13383, - "time_per_iteration": 2.7304000854492188 - }, - { - "auxiliary_loss_clip": 0.01093751, - "auxiliary_loss_mlp": 0.01041645, - "balance_loss_clip": 1.03544414, - "balance_loss_mlp": 1.02812028, - "epoch": 0.8046896137081017, - "flos": 24972998814720.0, - "grad_norm": 1.8100283052553972, - "language_loss": 0.69704837, - "learning_rate": 3.868789307701381e-07, - "loss": 0.71840227, - "num_input_tokens_seen": 288836050, - "step": 13384, - "time_per_iteration": 2.623194932937622 - }, - { - "auxiliary_loss_clip": 0.0109844, - "auxiliary_loss_mlp": 0.01036542, - "balance_loss_clip": 1.03508019, - "balance_loss_mlp": 1.02301192, - "epoch": 0.8047497369607696, - "flos": 17675484001920.0, - "grad_norm": 8.534865412307397, - "language_loss": 0.79628527, - "learning_rate": 3.8664873049660375e-07, - "loss": 0.81763506, - "num_input_tokens_seen": 288852900, - "step": 13385, - "time_per_iteration": 2.640493869781494 - }, - { - "auxiliary_loss_clip": 0.01109031, - "auxiliary_loss_mlp": 0.01031665, - "balance_loss_clip": 1.03663421, - "balance_loss_mlp": 1.01859391, - "epoch": 0.8048098602134376, - "flos": 22382079920640.0, - "grad_norm": 1.7222276785014166, - "language_loss": 0.72210598, - "learning_rate": 3.864185914015108e-07, - "loss": 0.74351293, - "num_input_tokens_seen": 288872625, - "step": 13386, - "time_per_iteration": 2.620424747467041 - }, - { - "auxiliary_loss_clip": 0.01000165, - "auxiliary_loss_mlp": 0.01002697, - "balance_loss_clip": 1.0073638, - "balance_loss_mlp": 1.00164747, - "epoch": 0.8048699834661055, - "flos": 71200949702400.0, - "grad_norm": 0.6627374322558968, - "language_loss": 0.51254958, - "learning_rate": 3.861885134935865e-07, - "loss": 0.53257823, - "num_input_tokens_seen": 288939180, - "step": 13387, - "time_per_iteration": 3.249873399734497 - }, - { - "auxiliary_loss_clip": 0.01108941, - "auxiliary_loss_mlp": 0.01033944, - "balance_loss_clip": 1.03665853, - "balance_loss_mlp": 1.02005613, - "epoch": 0.8049301067187735, - "flos": 23660320285440.0, - "grad_norm": 1.7754749617398262, - "language_loss": 0.73770982, - "learning_rate": 3.859584967815559e-07, - "loss": 0.75913864, - "num_input_tokens_seen": 288958925, - "step": 13388, - "time_per_iteration": 2.63905930519104 - }, - { - "auxiliary_loss_clip": 0.0108125, - "auxiliary_loss_mlp": 0.01028741, - "balance_loss_clip": 1.04047871, - "balance_loss_mlp": 1.01668882, - "epoch": 0.8049902299714414, - "flos": 24426330750720.0, - "grad_norm": 1.3693007974519653, - "language_loss": 0.71537852, - "learning_rate": 3.857285412741411e-07, - "loss": 0.73647845, - "num_input_tokens_seen": 288980935, - "step": 13389, - "time_per_iteration": 2.8490209579467773 - }, - { - "auxiliary_loss_clip": 0.01085356, - "auxiliary_loss_mlp": 0.01032905, - "balance_loss_clip": 1.03994167, - "balance_loss_mlp": 1.02047765, - "epoch": 0.8050503532241094, - "flos": 17492626840320.0, - "grad_norm": 2.1746579852789565, - "language_loss": 0.82934594, - "learning_rate": 3.8549864698006097e-07, - "loss": 0.8505286, - "num_input_tokens_seen": 288996780, - "step": 13390, - "time_per_iteration": 2.695349931716919 - }, - { - "auxiliary_loss_clip": 0.01021163, - "auxiliary_loss_mlp": 0.01001808, - "balance_loss_clip": 1.00786567, - "balance_loss_mlp": 1.00077081, - "epoch": 0.8051104764767774, - "flos": 57658030369920.0, - "grad_norm": 0.7760583028840095, - "language_loss": 0.55514753, - "learning_rate": 3.8526881390803424e-07, - "loss": 0.57537723, - "num_input_tokens_seen": 289057590, - "step": 13391, - "time_per_iteration": 3.1499392986297607 - }, - { - "auxiliary_loss_clip": 0.01096246, - "auxiliary_loss_mlp": 0.01032181, - "balance_loss_clip": 1.0376209, - "balance_loss_mlp": 1.02046287, - "epoch": 0.8051705997294454, - "flos": 18003456109440.0, - "grad_norm": 1.5156025498114776, - "language_loss": 0.84548998, - "learning_rate": 3.850390420667762e-07, - "loss": 0.86677432, - "num_input_tokens_seen": 289076285, - "step": 13392, - "time_per_iteration": 2.7121686935424805 - }, - { - "auxiliary_loss_clip": 0.01075704, - "auxiliary_loss_mlp": 0.01031392, - "balance_loss_clip": 1.03425109, - "balance_loss_mlp": 1.01953077, - "epoch": 0.8052307229821133, - "flos": 26397754755840.0, - "grad_norm": 1.5752957975366317, - "language_loss": 0.70452738, - "learning_rate": 3.8480933146499914e-07, - "loss": 0.72559834, - "num_input_tokens_seen": 289097585, - "step": 13393, - "time_per_iteration": 2.857966899871826 - }, - { - "auxiliary_loss_clip": 0.01100081, - "auxiliary_loss_mlp": 0.01033968, - "balance_loss_clip": 1.03709984, - "balance_loss_mlp": 1.02045584, - "epoch": 0.8052908462347813, - "flos": 21757018423680.0, - "grad_norm": 2.1123482954588733, - "language_loss": 0.76134676, - "learning_rate": 3.84579682111414e-07, - "loss": 0.78268725, - "num_input_tokens_seen": 289116890, - "step": 13394, - "time_per_iteration": 2.6536917686462402 - }, - { - "auxiliary_loss_clip": 0.0111249, - "auxiliary_loss_mlp": 0.01030653, - "balance_loss_clip": 1.03986442, - "balance_loss_mlp": 1.0186125, - "epoch": 0.8053509694874492, - "flos": 25442279026560.0, - "grad_norm": 1.6448341122906027, - "language_loss": 0.64934421, - "learning_rate": 3.843500940147304e-07, - "loss": 0.67077565, - "num_input_tokens_seen": 289136670, - "step": 13395, - "time_per_iteration": 2.6280672550201416 - }, - { - "auxiliary_loss_clip": 0.01019533, - "auxiliary_loss_mlp": 0.00999955, - "balance_loss_clip": 1.00609279, - "balance_loss_mlp": 0.99902552, - "epoch": 0.8054110927401172, - "flos": 57668122091520.0, - "grad_norm": 0.7500398234084821, - "language_loss": 0.57342923, - "learning_rate": 3.8412056718365206e-07, - "loss": 0.59362411, - "num_input_tokens_seen": 289200150, - "step": 13396, - "time_per_iteration": 3.278367519378662 - }, - { - "auxiliary_loss_clip": 0.01099939, - "auxiliary_loss_mlp": 0.01035451, - "balance_loss_clip": 1.0372299, - "balance_loss_mlp": 1.02181315, - "epoch": 0.8054712159927853, - "flos": 19276201693440.0, - "grad_norm": 1.6362380088306854, - "language_loss": 0.77317524, - "learning_rate": 3.8389110162688353e-07, - "loss": 0.79452914, - "num_input_tokens_seen": 289218125, - "step": 13397, - "time_per_iteration": 5.758723258972168 - }, - { - "auxiliary_loss_clip": 0.01095341, - "auxiliary_loss_mlp": 0.01029553, - "balance_loss_clip": 1.04027462, - "balance_loss_mlp": 1.01784718, - "epoch": 0.8055313392454532, - "flos": 17967617314560.0, - "grad_norm": 1.6834134519930992, - "language_loss": 0.70419687, - "learning_rate": 3.836616973531266e-07, - "loss": 0.72544581, - "num_input_tokens_seen": 289237115, - "step": 13398, - "time_per_iteration": 2.6618268489837646 - }, - { - "auxiliary_loss_clip": 0.01086822, - "auxiliary_loss_mlp": 0.01031551, - "balance_loss_clip": 1.03505898, - "balance_loss_mlp": 1.01981521, - "epoch": 0.8055914624981212, - "flos": 13478352635520.0, - "grad_norm": 4.3745696767144056, - "language_loss": 0.69005787, - "learning_rate": 3.834323543710805e-07, - "loss": 0.71124166, - "num_input_tokens_seen": 289253635, - "step": 13399, - "time_per_iteration": 2.682286024093628 - }, - { - "auxiliary_loss_clip": 0.01109953, - "auxiliary_loss_mlp": 0.0103448, - "balance_loss_clip": 1.03786373, - "balance_loss_mlp": 1.02234411, - "epoch": 0.8056515857507891, - "flos": 13224787551360.0, - "grad_norm": 2.3581489443950043, - "language_loss": 0.71867836, - "learning_rate": 3.8320307268944153e-07, - "loss": 0.74012268, - "num_input_tokens_seen": 289270085, - "step": 13400, - "time_per_iteration": 4.049706935882568 - }, - { - "auxiliary_loss_clip": 0.0109504, - "auxiliary_loss_mlp": 0.01032952, - "balance_loss_clip": 1.03316319, - "balance_loss_mlp": 1.0205605, - "epoch": 0.8057117090034571, - "flos": 23878190229120.0, - "grad_norm": 1.8105260022834564, - "language_loss": 0.64344472, - "learning_rate": 3.829738523169037e-07, - "loss": 0.66472465, - "num_input_tokens_seen": 289289645, - "step": 13401, - "time_per_iteration": 2.6664888858795166 - }, - { - "auxiliary_loss_clip": 0.01097912, - "auxiliary_loss_mlp": 0.01033141, - "balance_loss_clip": 1.03556728, - "balance_loss_mlp": 1.0208919, - "epoch": 0.805771832256125, - "flos": 21214300855680.0, - "grad_norm": 2.280323005246413, - "language_loss": 0.83644533, - "learning_rate": 3.8274469326215985e-07, - "loss": 0.8577559, - "num_input_tokens_seen": 289306630, - "step": 13402, - "time_per_iteration": 2.6944761276245117 - }, - { - "auxiliary_loss_clip": 0.01058036, - "auxiliary_loss_mlp": 0.01032366, - "balance_loss_clip": 1.03603578, - "balance_loss_mlp": 1.01981318, - "epoch": 0.805831955508793, - "flos": 17566818382080.0, - "grad_norm": 6.24262023613013, - "language_loss": 0.68056262, - "learning_rate": 3.8251559553389876e-07, - "loss": 0.70146668, - "num_input_tokens_seen": 289324960, - "step": 13403, - "time_per_iteration": 2.763301372528076 - }, - { - "auxiliary_loss_clip": 0.01069641, - "auxiliary_loss_mlp": 0.00769597, - "balance_loss_clip": 1.0345124, - "balance_loss_mlp": 1.00027502, - "epoch": 0.805892078761461, - "flos": 26907542530560.0, - "grad_norm": 3.472008939762663, - "language_loss": 0.84777313, - "learning_rate": 3.822865591408084e-07, - "loss": 0.86616552, - "num_input_tokens_seen": 289344980, - "step": 13404, - "time_per_iteration": 2.7284321784973145 - }, - { - "auxiliary_loss_clip": 0.01066717, - "auxiliary_loss_mlp": 0.01032046, - "balance_loss_clip": 1.03557312, - "balance_loss_mlp": 1.02060783, - "epoch": 0.805952202014129, - "flos": 31506442496640.0, - "grad_norm": 1.5427853582818836, - "language_loss": 0.70597529, - "learning_rate": 3.820575840915743e-07, - "loss": 0.72696286, - "num_input_tokens_seen": 289367500, - "step": 13405, - "time_per_iteration": 4.542062520980835 - }, - { - "auxiliary_loss_clip": 0.01098712, - "auxiliary_loss_mlp": 0.01025986, - "balance_loss_clip": 1.03720641, - "balance_loss_mlp": 1.01441038, - "epoch": 0.8060123252667969, - "flos": 24389953251840.0, - "grad_norm": 2.8341192767465957, - "language_loss": 0.7541554, - "learning_rate": 3.818286703948788e-07, - "loss": 0.77540243, - "num_input_tokens_seen": 289385930, - "step": 13406, - "time_per_iteration": 2.68805193901062 - }, - { - "auxiliary_loss_clip": 0.01100072, - "auxiliary_loss_mlp": 0.01035029, - "balance_loss_clip": 1.03811562, - "balance_loss_mlp": 1.02201676, - "epoch": 0.8060724485194649, - "flos": 23479941162240.0, - "grad_norm": 1.5246012967976152, - "language_loss": 0.76201332, - "learning_rate": 3.815998180594018e-07, - "loss": 0.7833643, - "num_input_tokens_seen": 289408025, - "step": 13407, - "time_per_iteration": 2.666938066482544 - }, - { - "auxiliary_loss_clip": 0.01080345, - "auxiliary_loss_mlp": 0.00770991, - "balance_loss_clip": 1.03358412, - "balance_loss_mlp": 1.00019884, - "epoch": 0.8061325717721328, - "flos": 18624495283200.0, - "grad_norm": 1.5796081620527238, - "language_loss": 0.73616993, - "learning_rate": 3.81371027093822e-07, - "loss": 0.75468338, - "num_input_tokens_seen": 289426575, - "step": 13408, - "time_per_iteration": 2.662716865539551 - }, - { - "auxiliary_loss_clip": 0.01079538, - "auxiliary_loss_mlp": 0.01039164, - "balance_loss_clip": 1.03391623, - "balance_loss_mlp": 1.02488232, - "epoch": 0.8061926950248008, - "flos": 23582752865280.0, - "grad_norm": 1.8848269452946171, - "language_loss": 0.7084735, - "learning_rate": 3.8114229750681523e-07, - "loss": 0.72966051, - "num_input_tokens_seen": 289447760, - "step": 13409, - "time_per_iteration": 2.6887590885162354 - }, - { - "auxiliary_loss_clip": 0.01108, - "auxiliary_loss_mlp": 0.01029607, - "balance_loss_clip": 1.03585696, - "balance_loss_mlp": 1.0173347, - "epoch": 0.8062528182774689, - "flos": 11143333209600.0, - "grad_norm": 2.076248478414053, - "language_loss": 0.76634085, - "learning_rate": 3.809136293070545e-07, - "loss": 0.78771693, - "num_input_tokens_seen": 289463920, - "step": 13410, - "time_per_iteration": 2.5652787685394287 - }, - { - "auxiliary_loss_clip": 0.01099064, - "auxiliary_loss_mlp": 0.01037232, - "balance_loss_clip": 1.03812885, - "balance_loss_mlp": 1.02414274, - "epoch": 0.8063129415301368, - "flos": 22346815743360.0, - "grad_norm": 2.5070949588041653, - "language_loss": 0.68454826, - "learning_rate": 3.806850225032117e-07, - "loss": 0.70591122, - "num_input_tokens_seen": 289482635, - "step": 13411, - "time_per_iteration": 2.627668857574463 - }, - { - "auxiliary_loss_clip": 0.01076065, - "auxiliary_loss_mlp": 0.01032676, - "balance_loss_clip": 1.03557301, - "balance_loss_mlp": 1.02042735, - "epoch": 0.8063730647828048, - "flos": 23988400133760.0, - "grad_norm": 1.6635819299590309, - "language_loss": 0.68043619, - "learning_rate": 3.804564771039551e-07, - "loss": 0.70152354, - "num_input_tokens_seen": 289502040, - "step": 13412, - "time_per_iteration": 2.8055179119110107 - }, - { - "auxiliary_loss_clip": 0.01099792, - "auxiliary_loss_mlp": 0.01036335, - "balance_loss_clip": 1.03846812, - "balance_loss_mlp": 1.02239335, - "epoch": 0.8064331880354727, - "flos": 21321494017920.0, - "grad_norm": 1.701960301408402, - "language_loss": 0.81657159, - "learning_rate": 3.8022799311795064e-07, - "loss": 0.83793283, - "num_input_tokens_seen": 289520740, - "step": 13413, - "time_per_iteration": 2.700803279876709 - }, - { - "auxiliary_loss_clip": 0.01092458, - "auxiliary_loss_mlp": 0.01042312, - "balance_loss_clip": 1.03472614, - "balance_loss_mlp": 1.02902031, - "epoch": 0.8064933112881407, - "flos": 19682890456320.0, - "grad_norm": 1.8513481069997626, - "language_loss": 0.85172534, - "learning_rate": 3.7999957055386303e-07, - "loss": 0.8730731, - "num_input_tokens_seen": 289535840, - "step": 13414, - "time_per_iteration": 2.563521385192871 - }, - { - "auxiliary_loss_clip": 0.01083885, - "auxiliary_loss_mlp": 0.01033091, - "balance_loss_clip": 1.03454745, - "balance_loss_mlp": 1.02088439, - "epoch": 0.8065534345408086, - "flos": 19279721226240.0, - "grad_norm": 1.9940068342487725, - "language_loss": 0.67127073, - "learning_rate": 3.7977120942035467e-07, - "loss": 0.69244045, - "num_input_tokens_seen": 289555205, - "step": 13415, - "time_per_iteration": 2.816197633743286 - }, - { - "auxiliary_loss_clip": 0.01072851, - "auxiliary_loss_mlp": 0.01025923, - "balance_loss_clip": 1.0345068, - "balance_loss_mlp": 1.01406205, - "epoch": 0.8066135577934767, - "flos": 19677718897920.0, - "grad_norm": 1.6185283067641691, - "language_loss": 0.76407629, - "learning_rate": 3.7954290972608383e-07, - "loss": 0.78506404, - "num_input_tokens_seen": 289573000, - "step": 13416, - "time_per_iteration": 2.7045845985412598 - }, - { - "auxiliary_loss_clip": 0.01095005, - "auxiliary_loss_mlp": 0.01034611, - "balance_loss_clip": 1.03473926, - "balance_loss_mlp": 1.02240372, - "epoch": 0.8066736810461446, - "flos": 21143592933120.0, - "grad_norm": 1.4143763344150053, - "language_loss": 0.65079415, - "learning_rate": 3.793146714797086e-07, - "loss": 0.67209029, - "num_input_tokens_seen": 289592625, - "step": 13417, - "time_per_iteration": 2.6034398078918457 - }, - { - "auxiliary_loss_clip": 0.01075095, - "auxiliary_loss_mlp": 0.01055795, - "balance_loss_clip": 1.0338254, - "balance_loss_mlp": 1.0419488, - "epoch": 0.8067338042988126, - "flos": 22598261925120.0, - "grad_norm": 1.7315768879693472, - "language_loss": 0.8098107, - "learning_rate": 3.7908649468988306e-07, - "loss": 0.8311196, - "num_input_tokens_seen": 289610780, - "step": 13418, - "time_per_iteration": 2.721973180770874 - }, - { - "auxiliary_loss_clip": 0.01090483, - "auxiliary_loss_mlp": 0.01032366, - "balance_loss_clip": 1.03820384, - "balance_loss_mlp": 1.01939058, - "epoch": 0.8067939275514805, - "flos": 16508423208960.0, - "grad_norm": 1.9096821915848545, - "language_loss": 0.84676445, - "learning_rate": 3.7885837936526066e-07, - "loss": 0.86799294, - "num_input_tokens_seen": 289628890, - "step": 13419, - "time_per_iteration": 2.6393797397613525 - }, - { - "auxiliary_loss_clip": 0.01071529, - "auxiliary_loss_mlp": 0.00770579, - "balance_loss_clip": 1.03478575, - "balance_loss_mlp": 1.00021386, - "epoch": 0.8068540508041485, - "flos": 28541836460160.0, - "grad_norm": 1.770068441657345, - "language_loss": 0.76010084, - "learning_rate": 3.7863032551449047e-07, - "loss": 0.7785219, - "num_input_tokens_seen": 289647220, - "step": 13420, - "time_per_iteration": 2.8084943294525146 - }, - { - "auxiliary_loss_clip": 0.01090718, - "auxiliary_loss_mlp": 0.00769899, - "balance_loss_clip": 1.03447235, - "balance_loss_mlp": 1.00020134, - "epoch": 0.8069141740568164, - "flos": 21652482867840.0, - "grad_norm": 1.8346765895775454, - "language_loss": 0.78423268, - "learning_rate": 3.784023331462207e-07, - "loss": 0.8028388, - "num_input_tokens_seen": 289665800, - "step": 13421, - "time_per_iteration": 2.6397383213043213 - }, - { - "auxiliary_loss_clip": 0.01078405, - "auxiliary_loss_mlp": 0.01025501, - "balance_loss_clip": 1.0375917, - "balance_loss_mlp": 1.01340711, - "epoch": 0.8069742973094844, - "flos": 17529327561600.0, - "grad_norm": 1.6792370158266972, - "language_loss": 0.80156964, - "learning_rate": 3.78174402269098e-07, - "loss": 0.82260871, - "num_input_tokens_seen": 289682705, - "step": 13422, - "time_per_iteration": 2.7309072017669678 - }, - { - "auxiliary_loss_clip": 0.01108091, - "auxiliary_loss_mlp": 0.01032058, - "balance_loss_clip": 1.03667307, - "balance_loss_mlp": 1.02025604, - "epoch": 0.8070344205621525, - "flos": 23367037737600.0, - "grad_norm": 1.6418430759860865, - "language_loss": 0.67767537, - "learning_rate": 3.7794653289176347e-07, - "loss": 0.69907683, - "num_input_tokens_seen": 289702920, - "step": 13423, - "time_per_iteration": 2.6276538372039795 - }, - { - "auxiliary_loss_clip": 0.01087102, - "auxiliary_loss_mlp": 0.01036916, - "balance_loss_clip": 1.03931355, - "balance_loss_mlp": 1.02424431, - "epoch": 0.8070945438148204, - "flos": 22930184528640.0, - "grad_norm": 1.8059898203268332, - "language_loss": 0.80249333, - "learning_rate": 3.7771872502285904e-07, - "loss": 0.82373351, - "num_input_tokens_seen": 289723280, - "step": 13424, - "time_per_iteration": 2.7442784309387207 - }, - { - "auxiliary_loss_clip": 0.01098964, - "auxiliary_loss_mlp": 0.01028478, - "balance_loss_clip": 1.03548968, - "balance_loss_mlp": 1.01652098, - "epoch": 0.8071546670674884, - "flos": 25300683613440.0, - "grad_norm": 1.410290973233428, - "language_loss": 0.78814334, - "learning_rate": 3.774909786710232e-07, - "loss": 0.80941772, - "num_input_tokens_seen": 289743475, - "step": 13425, - "time_per_iteration": 2.666613817214966 - }, - { - "auxiliary_loss_clip": 0.0107896, - "auxiliary_loss_mlp": 0.01032247, - "balance_loss_clip": 1.03484488, - "balance_loss_mlp": 1.0198555, - "epoch": 0.8072147903201563, - "flos": 18113701927680.0, - "grad_norm": 2.5661654398107814, - "language_loss": 0.75609297, - "learning_rate": 3.772632938448923e-07, - "loss": 0.77720505, - "num_input_tokens_seen": 289761400, - "step": 13426, - "time_per_iteration": 2.6524770259857178 - }, - { - "auxiliary_loss_clip": 0.01098302, - "auxiliary_loss_mlp": 0.0102617, - "balance_loss_clip": 1.03656507, - "balance_loss_mlp": 1.01461828, - "epoch": 0.8072749135728243, - "flos": 26688164215680.0, - "grad_norm": 1.8886628255914524, - "language_loss": 0.72703242, - "learning_rate": 3.770356705530997e-07, - "loss": 0.74827707, - "num_input_tokens_seen": 289781025, - "step": 13427, - "time_per_iteration": 2.6662089824676514 - }, - { - "auxiliary_loss_clip": 0.01060667, - "auxiliary_loss_mlp": 0.01038927, - "balance_loss_clip": 1.03814864, - "balance_loss_mlp": 1.02555811, - "epoch": 0.8073350368254922, - "flos": 19240291071360.0, - "grad_norm": 1.7110395570969614, - "language_loss": 0.70348513, - "learning_rate": 3.768081088042774e-07, - "loss": 0.72448105, - "num_input_tokens_seen": 289798380, - "step": 13428, - "time_per_iteration": 2.7715890407562256 - }, - { - "auxiliary_loss_clip": 0.01089538, - "auxiliary_loss_mlp": 0.0102989, - "balance_loss_clip": 1.03667974, - "balance_loss_mlp": 1.0185827, - "epoch": 0.8073951600781603, - "flos": 13334530579200.0, - "grad_norm": 2.360374881733329, - "language_loss": 0.74510443, - "learning_rate": 3.765806086070544e-07, - "loss": 0.76629871, - "num_input_tokens_seen": 289814515, - "step": 13429, - "time_per_iteration": 2.6052982807159424 - }, - { - "auxiliary_loss_clip": 0.01096224, - "auxiliary_loss_mlp": 0.01032238, - "balance_loss_clip": 1.03724742, - "balance_loss_mlp": 1.02020407, - "epoch": 0.8074552833308282, - "flos": 22853191726080.0, - "grad_norm": 2.1805515525099466, - "language_loss": 0.66939056, - "learning_rate": 3.763531699700568e-07, - "loss": 0.6906752, - "num_input_tokens_seen": 289834315, - "step": 13430, - "time_per_iteration": 2.6713409423828125 - }, - { - "auxiliary_loss_clip": 0.01068167, - "auxiliary_loss_mlp": 0.01029352, - "balance_loss_clip": 1.03282046, - "balance_loss_mlp": 1.0171392, - "epoch": 0.8075154065834962, - "flos": 20339409288960.0, - "grad_norm": 1.7027899268363083, - "language_loss": 0.80057859, - "learning_rate": 3.7612579290190994e-07, - "loss": 0.82155377, - "num_input_tokens_seen": 289853770, - "step": 13431, - "time_per_iteration": 2.648855447769165 - }, - { - "auxiliary_loss_clip": 0.01084241, - "auxiliary_loss_mlp": 0.0102858, - "balance_loss_clip": 1.03625894, - "balance_loss_mlp": 1.01611698, - "epoch": 0.8075755298361641, - "flos": 21908059113600.0, - "grad_norm": 1.7686498749229664, - "language_loss": 0.80383635, - "learning_rate": 3.7589847741123593e-07, - "loss": 0.82496452, - "num_input_tokens_seen": 289870480, - "step": 13432, - "time_per_iteration": 2.644226551055908 - }, - { - "auxiliary_loss_clip": 0.01083614, - "auxiliary_loss_mlp": 0.01032011, - "balance_loss_clip": 1.03852034, - "balance_loss_mlp": 1.01924944, - "epoch": 0.8076356530888321, - "flos": 15669298609920.0, - "grad_norm": 7.437398375727633, - "language_loss": 0.70418423, - "learning_rate": 3.7567122350665415e-07, - "loss": 0.72534049, - "num_input_tokens_seen": 289888275, - "step": 13433, - "time_per_iteration": 2.657998561859131 - }, - { - "auxiliary_loss_clip": 0.01083097, - "auxiliary_loss_mlp": 0.01028189, - "balance_loss_clip": 1.03641129, - "balance_loss_mlp": 1.01629746, - "epoch": 0.8076957763415, - "flos": 37777414521600.0, - "grad_norm": 1.6613480416430744, - "language_loss": 0.7224468, - "learning_rate": 3.754440311967828e-07, - "loss": 0.7435596, - "num_input_tokens_seen": 289911495, - "step": 13434, - "time_per_iteration": 2.787569046020508 - }, - { - "auxiliary_loss_clip": 0.01071783, - "auxiliary_loss_mlp": 0.01027721, - "balance_loss_clip": 1.03727186, - "balance_loss_mlp": 1.01534724, - "epoch": 0.807755899594168, - "flos": 19610781903360.0, - "grad_norm": 1.8325775965674183, - "language_loss": 0.67859507, - "learning_rate": 3.752169004902361e-07, - "loss": 0.69959009, - "num_input_tokens_seen": 289930045, - "step": 13435, - "time_per_iteration": 2.719987154006958 - }, - { - "auxiliary_loss_clip": 0.01065411, - "auxiliary_loss_mlp": 0.01033721, - "balance_loss_clip": 1.03534269, - "balance_loss_mlp": 1.01921952, - "epoch": 0.8078160228468361, - "flos": 23294893271040.0, - "grad_norm": 1.9641244359702266, - "language_loss": 0.75152278, - "learning_rate": 3.749898313956279e-07, - "loss": 0.7725141, - "num_input_tokens_seen": 289950815, - "step": 13436, - "time_per_iteration": 4.378523826599121 - }, - { - "auxiliary_loss_clip": 0.01104889, - "auxiliary_loss_mlp": 0.01033319, - "balance_loss_clip": 1.03509259, - "balance_loss_mlp": 1.02078414, - "epoch": 0.807876146099504, - "flos": 27162651899520.0, - "grad_norm": 2.08988998980751, - "language_loss": 0.70339876, - "learning_rate": 3.747628239215674e-07, - "loss": 0.7247808, - "num_input_tokens_seen": 289971730, - "step": 13437, - "time_per_iteration": 2.7287251949310303 - }, - { - "auxiliary_loss_clip": 0.01081874, - "auxiliary_loss_mlp": 0.01034019, - "balance_loss_clip": 1.03837299, - "balance_loss_mlp": 1.02234817, - "epoch": 0.807936269352172, - "flos": 27160030206720.0, - "grad_norm": 1.7193467545995484, - "language_loss": 0.73327583, - "learning_rate": 3.745358780766636e-07, - "loss": 0.75443482, - "num_input_tokens_seen": 289992995, - "step": 13438, - "time_per_iteration": 2.73563289642334 - }, - { - "auxiliary_loss_clip": 0.01084164, - "auxiliary_loss_mlp": 0.01031364, - "balance_loss_clip": 1.0364821, - "balance_loss_mlp": 1.01958609, - "epoch": 0.8079963926048399, - "flos": 20740423703040.0, - "grad_norm": 1.9218850358156638, - "language_loss": 0.77182925, - "learning_rate": 3.7430899386952344e-07, - "loss": 0.79298449, - "num_input_tokens_seen": 290009405, - "step": 13439, - "time_per_iteration": 4.257704257965088 - }, - { - "auxiliary_loss_clip": 0.01108447, - "auxiliary_loss_mlp": 0.01030482, - "balance_loss_clip": 1.0371114, - "balance_loss_mlp": 1.01817346, - "epoch": 0.8080565158575079, - "flos": 25009663622400.0, - "grad_norm": 1.5573047662808601, - "language_loss": 0.78926432, - "learning_rate": 3.7408217130874786e-07, - "loss": 0.81065357, - "num_input_tokens_seen": 290031085, - "step": 13440, - "time_per_iteration": 2.61833119392395 - }, - { - "auxiliary_loss_clip": 0.01088716, - "auxiliary_loss_mlp": 0.00770828, - "balance_loss_clip": 1.03697038, - "balance_loss_mlp": 1.00019264, - "epoch": 0.8081166391101758, - "flos": 18698076293760.0, - "grad_norm": 1.6195395418805565, - "language_loss": 0.59136355, - "learning_rate": 3.7385541040293946e-07, - "loss": 0.60995901, - "num_input_tokens_seen": 290048670, - "step": 13441, - "time_per_iteration": 2.6545674800872803 - }, - { - "auxiliary_loss_clip": 0.01097558, - "auxiliary_loss_mlp": 0.01033695, - "balance_loss_clip": 1.03679454, - "balance_loss_mlp": 1.02092791, - "epoch": 0.8081767623628439, - "flos": 19828651847040.0, - "grad_norm": 2.045109695288156, - "language_loss": 0.76209891, - "learning_rate": 3.7362871116069684e-07, - "loss": 0.78341144, - "num_input_tokens_seen": 290064085, - "step": 13442, - "time_per_iteration": 2.579463005065918 - }, - { - "auxiliary_loss_clip": 0.0108635, - "auxiliary_loss_mlp": 0.01031047, - "balance_loss_clip": 1.03652704, - "balance_loss_mlp": 1.01932859, - "epoch": 0.8082368856155118, - "flos": 35772952982400.0, - "grad_norm": 1.9468682551853589, - "language_loss": 0.70523083, - "learning_rate": 3.734020735906169e-07, - "loss": 0.72640479, - "num_input_tokens_seen": 290086255, - "step": 13443, - "time_per_iteration": 2.768657922744751 - }, - { - "auxiliary_loss_clip": 0.010672, - "auxiliary_loss_mlp": 0.01040475, - "balance_loss_clip": 1.03662682, - "balance_loss_mlp": 1.02816081, - "epoch": 0.8082970088681798, - "flos": 17198015489280.0, - "grad_norm": 2.2011960209089128, - "language_loss": 0.82247496, - "learning_rate": 3.7317549770129286e-07, - "loss": 0.8435517, - "num_input_tokens_seen": 290103995, - "step": 13444, - "time_per_iteration": 4.2101311683654785 - }, - { - "auxiliary_loss_clip": 0.00996531, - "auxiliary_loss_mlp": 0.00751439, - "balance_loss_clip": 1.01225722, - "balance_loss_mlp": 0.99960417, - "epoch": 0.8083571321208477, - "flos": 63555207511680.0, - "grad_norm": 0.8321689368239322, - "language_loss": 0.53573275, - "learning_rate": 3.7294898350131754e-07, - "loss": 0.5532124, - "num_input_tokens_seen": 290157245, - "step": 13445, - "time_per_iteration": 3.0625863075256348 - }, - { - "auxiliary_loss_clip": 0.01071369, - "auxiliary_loss_mlp": 0.01031423, - "balance_loss_clip": 1.03452253, - "balance_loss_mlp": 1.01799452, - "epoch": 0.8084172553735157, - "flos": 17930701111680.0, - "grad_norm": 2.525792385078195, - "language_loss": 0.72017092, - "learning_rate": 3.7272253099927964e-07, - "loss": 0.7411989, - "num_input_tokens_seen": 290174970, - "step": 13446, - "time_per_iteration": 2.7008473873138428 - }, - { - "auxiliary_loss_clip": 0.01084211, - "auxiliary_loss_mlp": 0.01031802, - "balance_loss_clip": 1.03550613, - "balance_loss_mlp": 1.01871324, - "epoch": 0.8084773786261836, - "flos": 24097999507200.0, - "grad_norm": 1.7496181509927413, - "language_loss": 0.71567613, - "learning_rate": 3.7249614020376606e-07, - "loss": 0.73683619, - "num_input_tokens_seen": 290194395, - "step": 13447, - "time_per_iteration": 2.6628973484039307 - }, - { - "auxiliary_loss_clip": 0.01047169, - "auxiliary_loss_mlp": 0.01036303, - "balance_loss_clip": 1.03517139, - "balance_loss_mlp": 1.02175951, - "epoch": 0.8085375018788516, - "flos": 15588211656960.0, - "grad_norm": 3.8730614264516787, - "language_loss": 0.74754572, - "learning_rate": 3.7226981112336197e-07, - "loss": 0.7683804, - "num_input_tokens_seen": 290209200, - "step": 13448, - "time_per_iteration": 2.8440589904785156 - }, - { - "auxiliary_loss_clip": 0.01028882, - "auxiliary_loss_mlp": 0.01000792, - "balance_loss_clip": 1.00652528, - "balance_loss_mlp": 0.99984467, - "epoch": 0.8085976251315197, - "flos": 67561296393600.0, - "grad_norm": 0.7365379441466359, - "language_loss": 0.63871992, - "learning_rate": 3.7204354376665024e-07, - "loss": 0.65901667, - "num_input_tokens_seen": 290274565, - "step": 13449, - "time_per_iteration": 3.194010019302368 - }, - { - "auxiliary_loss_clip": 0.0110053, - "auxiliary_loss_mlp": 0.01027665, - "balance_loss_clip": 1.03765333, - "balance_loss_mlp": 1.01495767, - "epoch": 0.8086577483841876, - "flos": 22561453463040.0, - "grad_norm": 1.8200797231923929, - "language_loss": 0.73743933, - "learning_rate": 3.718173381422105e-07, - "loss": 0.75872129, - "num_input_tokens_seen": 290293630, - "step": 13450, - "time_per_iteration": 2.6638128757476807 - }, - { - "auxiliary_loss_clip": 0.0108587, - "auxiliary_loss_mlp": 0.00770156, - "balance_loss_clip": 1.03534913, - "balance_loss_mlp": 1.00021505, - "epoch": 0.8087178716368556, - "flos": 17968084191360.0, - "grad_norm": 1.7745684945736697, - "language_loss": 0.74215508, - "learning_rate": 3.7159119425861986e-07, - "loss": 0.76071537, - "num_input_tokens_seen": 290311450, - "step": 13451, - "time_per_iteration": 2.6632473468780518 - }, - { - "auxiliary_loss_clip": 0.0108524, - "auxiliary_loss_mlp": 0.0103415, - "balance_loss_clip": 1.03462768, - "balance_loss_mlp": 1.02030349, - "epoch": 0.8087779948895235, - "flos": 21719527603200.0, - "grad_norm": 1.7259402772912478, - "language_loss": 0.80131054, - "learning_rate": 3.713651121244543e-07, - "loss": 0.82250446, - "num_input_tokens_seen": 290330165, - "step": 13452, - "time_per_iteration": 2.7077267169952393 - }, - { - "auxiliary_loss_clip": 0.01100267, - "auxiliary_loss_mlp": 0.01037977, - "balance_loss_clip": 1.03795743, - "balance_loss_mlp": 1.02595496, - "epoch": 0.8088381181421915, - "flos": 29092885983360.0, - "grad_norm": 1.7921268226395743, - "language_loss": 0.78446937, - "learning_rate": 3.711390917482875e-07, - "loss": 0.80585182, - "num_input_tokens_seen": 290350815, - "step": 13453, - "time_per_iteration": 2.655306339263916 - }, - { - "auxiliary_loss_clip": 0.01056304, - "auxiliary_loss_mlp": 0.01031896, - "balance_loss_clip": 1.0308342, - "balance_loss_mlp": 1.01884866, - "epoch": 0.8088982413948594, - "flos": 22198432659840.0, - "grad_norm": 2.2494726223817882, - "language_loss": 0.77063608, - "learning_rate": 3.709131331386892e-07, - "loss": 0.79151809, - "num_input_tokens_seen": 290367380, - "step": 13454, - "time_per_iteration": 2.711794376373291 - }, - { - "auxiliary_loss_clip": 0.01074594, - "auxiliary_loss_mlp": 0.01029586, - "balance_loss_clip": 1.0350008, - "balance_loss_mlp": 1.0173552, - "epoch": 0.8089583646475275, - "flos": 28036717453440.0, - "grad_norm": 1.8232411108878894, - "language_loss": 0.76701343, - "learning_rate": 3.7068723630422795e-07, - "loss": 0.78805518, - "num_input_tokens_seen": 290387965, - "step": 13455, - "time_per_iteration": 2.7459137439727783 - }, - { - "auxiliary_loss_clip": 0.01082819, - "auxiliary_loss_mlp": 0.01035059, - "balance_loss_clip": 1.03439069, - "balance_loss_mlp": 1.02137375, - "epoch": 0.8090184879001954, - "flos": 16617735273600.0, - "grad_norm": 1.8300657936441902, - "language_loss": 0.79052675, - "learning_rate": 3.70461401253471e-07, - "loss": 0.81170559, - "num_input_tokens_seen": 290404150, - "step": 13456, - "time_per_iteration": 2.629514455795288 - }, - { - "auxiliary_loss_clip": 0.01108824, - "auxiliary_loss_mlp": 0.0103641, - "balance_loss_clip": 1.03869963, - "balance_loss_mlp": 1.02431631, - "epoch": 0.8090786111528634, - "flos": 27340804379520.0, - "grad_norm": 1.9435022674849403, - "language_loss": 0.71875274, - "learning_rate": 3.702356279949801e-07, - "loss": 0.74020511, - "num_input_tokens_seen": 290422370, - "step": 13457, - "time_per_iteration": 2.6066160202026367 - }, - { - "auxiliary_loss_clip": 0.01088316, - "auxiliary_loss_mlp": 0.01027847, - "balance_loss_clip": 1.03695107, - "balance_loss_mlp": 1.01670051, - "epoch": 0.8091387344055313, - "flos": 21105742976640.0, - "grad_norm": 2.4771443577175196, - "language_loss": 0.72800726, - "learning_rate": 3.700099165373176e-07, - "loss": 0.74916887, - "num_input_tokens_seen": 290442645, - "step": 13458, - "time_per_iteration": 2.670316696166992 - }, - { - "auxiliary_loss_clip": 0.01097692, - "auxiliary_loss_mlp": 0.01035982, - "balance_loss_clip": 1.03728068, - "balance_loss_mlp": 1.02393568, - "epoch": 0.8091988576581993, - "flos": 11655060318720.0, - "grad_norm": 3.9528236134114736, - "language_loss": 0.78632605, - "learning_rate": 3.6978426688904275e-07, - "loss": 0.80766273, - "num_input_tokens_seen": 290458520, - "step": 13459, - "time_per_iteration": 2.6871142387390137 - }, - { - "auxiliary_loss_clip": 0.0108428, - "auxiliary_loss_mlp": 0.01028178, - "balance_loss_clip": 1.03804088, - "balance_loss_mlp": 1.01529706, - "epoch": 0.8092589809108672, - "flos": 22963329803520.0, - "grad_norm": 2.0963523926810073, - "language_loss": 0.79731387, - "learning_rate": 3.695586790587113e-07, - "loss": 0.81843841, - "num_input_tokens_seen": 290474465, - "step": 13460, - "time_per_iteration": 2.7210707664489746 - }, - { - "auxiliary_loss_clip": 0.01085117, - "auxiliary_loss_mlp": 0.01033465, - "balance_loss_clip": 1.033885, - "balance_loss_mlp": 1.02023244, - "epoch": 0.8093191041635353, - "flos": 13260985482240.0, - "grad_norm": 1.8549028601882585, - "language_loss": 0.84519565, - "learning_rate": 3.693331530548789e-07, - "loss": 0.86638141, - "num_input_tokens_seen": 290492060, - "step": 13461, - "time_per_iteration": 2.7760846614837646 - }, - { - "auxiliary_loss_clip": 0.01100089, - "auxiliary_loss_mlp": 0.01039455, - "balance_loss_clip": 1.03782284, - "balance_loss_mlp": 1.02653313, - "epoch": 0.8093792274162032, - "flos": 25516003691520.0, - "grad_norm": 1.8598987285745991, - "language_loss": 0.76461577, - "learning_rate": 3.69107688886096e-07, - "loss": 0.78601122, - "num_input_tokens_seen": 290511510, - "step": 13462, - "time_per_iteration": 2.845400094985962 - }, - { - "auxiliary_loss_clip": 0.01088384, - "auxiliary_loss_mlp": 0.01034949, - "balance_loss_clip": 1.03809071, - "balance_loss_mlp": 1.02182388, - "epoch": 0.8094393506688712, - "flos": 23546483107200.0, - "grad_norm": 4.497352318555959, - "language_loss": 0.83011431, - "learning_rate": 3.6888228656091357e-07, - "loss": 0.85134763, - "num_input_tokens_seen": 290530035, - "step": 13463, - "time_per_iteration": 2.801821708679199 - }, - { - "auxiliary_loss_clip": 0.01107291, - "auxiliary_loss_mlp": 0.01031928, - "balance_loss_clip": 1.03720284, - "balance_loss_mlp": 1.02069807, - "epoch": 0.8094994739215392, - "flos": 17055917285760.0, - "grad_norm": 2.01920176880003, - "language_loss": 0.62397665, - "learning_rate": 3.686569460878779e-07, - "loss": 0.64536881, - "num_input_tokens_seen": 290548245, - "step": 13464, - "time_per_iteration": 2.7305564880371094 - }, - { - "auxiliary_loss_clip": 0.01106405, - "auxiliary_loss_mlp": 0.01028723, - "balance_loss_clip": 1.03645182, - "balance_loss_mlp": 1.01739168, - "epoch": 0.8095595971742071, - "flos": 23551223702400.0, - "grad_norm": 1.547589805180524, - "language_loss": 0.61729312, - "learning_rate": 3.684316674755341e-07, - "loss": 0.6386444, - "num_input_tokens_seen": 290568625, - "step": 13465, - "time_per_iteration": 2.61460018157959 - }, - { - "auxiliary_loss_clip": 0.01098999, - "auxiliary_loss_mlp": 0.01035736, - "balance_loss_clip": 1.03847957, - "balance_loss_mlp": 1.02339816, - "epoch": 0.8096197204268751, - "flos": 20373201008640.0, - "grad_norm": 2.0112821173289035, - "language_loss": 0.82087392, - "learning_rate": 3.682064507324256e-07, - "loss": 0.84222126, - "num_input_tokens_seen": 290586575, - "step": 13466, - "time_per_iteration": 2.65686297416687 - }, - { - "auxiliary_loss_clip": 0.01094893, - "auxiliary_loss_mlp": 0.0077026, - "balance_loss_clip": 1.03960299, - "balance_loss_mlp": 1.00025487, - "epoch": 0.809679843679543, - "flos": 27818775682560.0, - "grad_norm": 1.8295052098367424, - "language_loss": 0.75791496, - "learning_rate": 3.6798129586709204e-07, - "loss": 0.77656651, - "num_input_tokens_seen": 290606790, - "step": 13467, - "time_per_iteration": 2.7522189617156982 - }, - { - "auxiliary_loss_clip": 0.01073167, - "auxiliary_loss_mlp": 0.01031375, - "balance_loss_clip": 1.03119135, - "balance_loss_mlp": 1.0187211, - "epoch": 0.8097399669322111, - "flos": 22014103040640.0, - "grad_norm": 2.4131060616703484, - "language_loss": 0.78938639, - "learning_rate": 3.6775620288807073e-07, - "loss": 0.81043178, - "num_input_tokens_seen": 290625525, - "step": 13468, - "time_per_iteration": 2.7481191158294678 - }, - { - "auxiliary_loss_clip": 0.01095827, - "auxiliary_loss_mlp": 0.01025893, - "balance_loss_clip": 1.03550076, - "balance_loss_mlp": 1.0147531, - "epoch": 0.809800090184879, - "flos": 18988988544000.0, - "grad_norm": 1.863036262504337, - "language_loss": 0.67737466, - "learning_rate": 3.675311718038978e-07, - "loss": 0.69859189, - "num_input_tokens_seen": 290644935, - "step": 13469, - "time_per_iteration": 2.6411309242248535 - }, - { - "auxiliary_loss_clip": 0.01000462, - "auxiliary_loss_mlp": 0.01006561, - "balance_loss_clip": 1.00773001, - "balance_loss_mlp": 1.00516653, - "epoch": 0.809860213437547, - "flos": 66099516508800.0, - "grad_norm": 0.6947384431465805, - "language_loss": 0.54638267, - "learning_rate": 3.6730620262310683e-07, - "loss": 0.56645286, - "num_input_tokens_seen": 290710735, - "step": 13470, - "time_per_iteration": 3.368800401687622 - }, - { - "auxiliary_loss_clip": 0.01106442, - "auxiliary_loss_mlp": 0.01028612, - "balance_loss_clip": 1.03582871, - "balance_loss_mlp": 1.01704264, - "epoch": 0.8099203366902149, - "flos": 20882485992960.0, - "grad_norm": 2.380568963099226, - "language_loss": 0.69673979, - "learning_rate": 3.670812953542279e-07, - "loss": 0.7180903, - "num_input_tokens_seen": 290729565, - "step": 13471, - "time_per_iteration": 2.6116278171539307 - }, - { - "auxiliary_loss_clip": 0.01099408, - "auxiliary_loss_mlp": 0.01028634, - "balance_loss_clip": 1.03811002, - "balance_loss_mlp": 1.01664793, - "epoch": 0.8099804599428829, - "flos": 26030927111040.0, - "grad_norm": 1.7876254964812721, - "language_loss": 0.79963589, - "learning_rate": 3.6685645000579003e-07, - "loss": 0.82091635, - "num_input_tokens_seen": 290749360, - "step": 13472, - "time_per_iteration": 2.656299114227295 - }, - { - "auxiliary_loss_clip": 0.01020676, - "auxiliary_loss_mlp": 0.01001704, - "balance_loss_clip": 1.00737977, - "balance_loss_mlp": 1.00073814, - "epoch": 0.8100405831955508, - "flos": 69303573584640.0, - "grad_norm": 0.7639440927647733, - "language_loss": 0.57809198, - "learning_rate": 3.666316665863201e-07, - "loss": 0.59831583, - "num_input_tokens_seen": 290812145, - "step": 13473, - "time_per_iteration": 3.1810851097106934 - }, - { - "auxiliary_loss_clip": 0.01058837, - "auxiliary_loss_mlp": 0.01030171, - "balance_loss_clip": 1.03626239, - "balance_loss_mlp": 1.01749909, - "epoch": 0.8101007064482189, - "flos": 15012492468480.0, - "grad_norm": 1.6585074003521199, - "language_loss": 0.73932016, - "learning_rate": 3.664069451043399e-07, - "loss": 0.76021028, - "num_input_tokens_seen": 290829845, - "step": 13474, - "time_per_iteration": 2.7276382446289062 - }, - { - "auxiliary_loss_clip": 0.01096806, - "auxiliary_loss_mlp": 0.01037708, - "balance_loss_clip": 1.0382781, - "balance_loss_mlp": 1.02569187, - "epoch": 0.8101608297008868, - "flos": 21067210661760.0, - "grad_norm": 1.8626875728515482, - "language_loss": 0.78847283, - "learning_rate": 3.661822855683723e-07, - "loss": 0.80981803, - "num_input_tokens_seen": 290848815, - "step": 13475, - "time_per_iteration": 4.543492078781128 - }, - { - "auxiliary_loss_clip": 0.01096436, - "auxiliary_loss_mlp": 0.01035936, - "balance_loss_clip": 1.03686523, - "balance_loss_mlp": 1.02425337, - "epoch": 0.8102209529535548, - "flos": 23731279603200.0, - "grad_norm": 1.6073691325832198, - "language_loss": 0.75316137, - "learning_rate": 3.659576879869364e-07, - "loss": 0.77448511, - "num_input_tokens_seen": 290868580, - "step": 13476, - "time_per_iteration": 4.139512062072754 - }, - { - "auxiliary_loss_clip": 0.01089782, - "auxiliary_loss_mlp": 0.01036786, - "balance_loss_clip": 1.03533959, - "balance_loss_mlp": 1.02327955, - "epoch": 0.8102810762062228, - "flos": 10955879107200.0, - "grad_norm": 2.306148776958402, - "language_loss": 0.73640966, - "learning_rate": 3.657331523685485e-07, - "loss": 0.75767529, - "num_input_tokens_seen": 290883540, - "step": 13477, - "time_per_iteration": 2.632864236831665 - }, - { - "auxiliary_loss_clip": 0.01083746, - "auxiliary_loss_mlp": 0.01035491, - "balance_loss_clip": 1.03832769, - "balance_loss_mlp": 1.02404094, - "epoch": 0.8103411994588907, - "flos": 14648825220480.0, - "grad_norm": 2.218286037812516, - "language_loss": 0.69816357, - "learning_rate": 3.6550867872172365e-07, - "loss": 0.71935594, - "num_input_tokens_seen": 290901560, - "step": 13478, - "time_per_iteration": 4.151829242706299 - }, - { - "auxiliary_loss_clip": 0.01028235, - "auxiliary_loss_mlp": 0.01001319, - "balance_loss_clip": 1.00567842, - "balance_loss_mlp": 1.00037754, - "epoch": 0.8104013227115587, - "flos": 59153314665600.0, - "grad_norm": 0.6814078364409648, - "language_loss": 0.52150369, - "learning_rate": 3.6528426705497293e-07, - "loss": 0.54179931, - "num_input_tokens_seen": 290959185, - "step": 13479, - "time_per_iteration": 3.0655500888824463 - }, - { - "auxiliary_loss_clip": 0.01055198, - "auxiliary_loss_mlp": 0.01032847, - "balance_loss_clip": 1.03287351, - "balance_loss_mlp": 1.02027011, - "epoch": 0.8104614459642266, - "flos": 19828687760640.0, - "grad_norm": 1.696157853255504, - "language_loss": 0.7152276, - "learning_rate": 3.650599173768072e-07, - "loss": 0.73610806, - "num_input_tokens_seen": 290979585, - "step": 13480, - "time_per_iteration": 2.7024738788604736 - }, - { - "auxiliary_loss_clip": 0.01108515, - "auxiliary_loss_mlp": 0.01030096, - "balance_loss_clip": 1.03706646, - "balance_loss_mlp": 1.01809144, - "epoch": 0.8105215692168947, - "flos": 25374264624000.0, - "grad_norm": 1.7323825430226805, - "language_loss": 0.7977457, - "learning_rate": 3.648356296957327e-07, - "loss": 0.81913185, - "num_input_tokens_seen": 291000865, - "step": 13481, - "time_per_iteration": 2.5942976474761963 - }, - { - "auxiliary_loss_clip": 0.01085323, - "auxiliary_loss_mlp": 0.01030766, - "balance_loss_clip": 1.03626788, - "balance_loss_mlp": 1.01913691, - "epoch": 0.8105816924695626, - "flos": 20481722974080.0, - "grad_norm": 2.2690231246252686, - "language_loss": 0.72909606, - "learning_rate": 3.646114040202548e-07, - "loss": 0.75025702, - "num_input_tokens_seen": 291018285, - "step": 13482, - "time_per_iteration": 2.6350491046905518 - }, - { - "auxiliary_loss_clip": 0.01044563, - "auxiliary_loss_mlp": 0.01026969, - "balance_loss_clip": 1.03307259, - "balance_loss_mlp": 1.01405859, - "epoch": 0.8106418157222306, - "flos": 14538687143040.0, - "grad_norm": 2.648947443807841, - "language_loss": 0.65993869, - "learning_rate": 3.6438724035887705e-07, - "loss": 0.68065393, - "num_input_tokens_seen": 291035745, - "step": 13483, - "time_per_iteration": 4.212749719619751 - }, - { - "auxiliary_loss_clip": 0.01080725, - "auxiliary_loss_mlp": 0.01028092, - "balance_loss_clip": 1.03347242, - "balance_loss_mlp": 1.01528955, - "epoch": 0.8107019389748985, - "flos": 22564470205440.0, - "grad_norm": 1.846296028434146, - "language_loss": 0.76504505, - "learning_rate": 3.641631387200992e-07, - "loss": 0.78613329, - "num_input_tokens_seen": 291053280, - "step": 13484, - "time_per_iteration": 2.6466033458709717 - }, - { - "auxiliary_loss_clip": 0.01091182, - "auxiliary_loss_mlp": 0.01033252, - "balance_loss_clip": 1.03665829, - "balance_loss_mlp": 1.01950169, - "epoch": 0.8107620622275665, - "flos": 19609560840960.0, - "grad_norm": 1.4911204923404016, - "language_loss": 0.72589421, - "learning_rate": 3.639390991124183e-07, - "loss": 0.74713862, - "num_input_tokens_seen": 291072855, - "step": 13485, - "time_per_iteration": 2.7968504428863525 - }, - { - "auxiliary_loss_clip": 0.01060159, - "auxiliary_loss_mlp": 0.01037186, - "balance_loss_clip": 1.02962208, - "balance_loss_mlp": 1.02368569, - "epoch": 0.8108221854802344, - "flos": 16143498984960.0, - "grad_norm": 1.7176036308983489, - "language_loss": 0.75729263, - "learning_rate": 3.637151215443308e-07, - "loss": 0.77826607, - "num_input_tokens_seen": 291090285, - "step": 13486, - "time_per_iteration": 2.652395725250244 - }, - { - "auxiliary_loss_clip": 0.01089867, - "auxiliary_loss_mlp": 0.01031482, - "balance_loss_clip": 1.03695536, - "balance_loss_mlp": 1.01949561, - "epoch": 0.8108823087329025, - "flos": 21106209853440.0, - "grad_norm": 1.9697831053353734, - "language_loss": 0.72577608, - "learning_rate": 3.6349120602433045e-07, - "loss": 0.74698949, - "num_input_tokens_seen": 291107675, - "step": 13487, - "time_per_iteration": 2.5947606563568115 - }, - { - "auxiliary_loss_clip": 0.01046594, - "auxiliary_loss_mlp": 0.0103374, - "balance_loss_clip": 1.03592014, - "balance_loss_mlp": 1.02142572, - "epoch": 0.8109424319855704, - "flos": 29199648182400.0, - "grad_norm": 2.098798308260877, - "language_loss": 0.84576857, - "learning_rate": 3.6326735256090715e-07, - "loss": 0.8665719, - "num_input_tokens_seen": 291126900, - "step": 13488, - "time_per_iteration": 2.793503761291504 - }, - { - "auxiliary_loss_clip": 0.01111048, - "auxiliary_loss_mlp": 0.01032699, - "balance_loss_clip": 1.03847623, - "balance_loss_mlp": 1.02028883, - "epoch": 0.8110025552382384, - "flos": 23111856541440.0, - "grad_norm": 1.9295150325791675, - "language_loss": 0.73623288, - "learning_rate": 3.630435611625502e-07, - "loss": 0.75767034, - "num_input_tokens_seen": 291145285, - "step": 13489, - "time_per_iteration": 2.599238395690918 - }, - { - "auxiliary_loss_clip": 0.01065923, - "auxiliary_loss_mlp": 0.00769841, - "balance_loss_clip": 1.03703368, - "balance_loss_mlp": 1.00027084, - "epoch": 0.8110626784909064, - "flos": 22379961018240.0, - "grad_norm": 1.5399542352120712, - "language_loss": 0.71757072, - "learning_rate": 3.628198318377453e-07, - "loss": 0.73592842, - "num_input_tokens_seen": 291163485, - "step": 13490, - "time_per_iteration": 2.830582857131958 - }, - { - "auxiliary_loss_clip": 0.01077295, - "auxiliary_loss_mlp": 0.01050998, - "balance_loss_clip": 1.03662229, - "balance_loss_mlp": 1.03582263, - "epoch": 0.8111228017435743, - "flos": 23368043318400.0, - "grad_norm": 2.1733695936937103, - "language_loss": 0.71880186, - "learning_rate": 3.625961645949762e-07, - "loss": 0.74008483, - "num_input_tokens_seen": 291182215, - "step": 13491, - "time_per_iteration": 2.788850784301758 - }, - { - "auxiliary_loss_clip": 0.01107942, - "auxiliary_loss_mlp": 0.01029919, - "balance_loss_clip": 1.03627849, - "balance_loss_mlp": 1.01822448, - "epoch": 0.8111829249962423, - "flos": 21286553063040.0, - "grad_norm": 1.3514463639541505, - "language_loss": 0.67817056, - "learning_rate": 3.623725594427245e-07, - "loss": 0.6995492, - "num_input_tokens_seen": 291203145, - "step": 13492, - "time_per_iteration": 2.6831281185150146 - }, - { - "auxiliary_loss_clip": 0.01064465, - "auxiliary_loss_mlp": 0.01029335, - "balance_loss_clip": 1.03531671, - "balance_loss_mlp": 1.01716399, - "epoch": 0.8112430482489102, - "flos": 22345558767360.0, - "grad_norm": 1.6581742417712237, - "language_loss": 0.71983981, - "learning_rate": 3.6214901638947006e-07, - "loss": 0.74077779, - "num_input_tokens_seen": 291220600, - "step": 13493, - "time_per_iteration": 2.7153713703155518 - }, - { - "auxiliary_loss_clip": 0.01091343, - "auxiliary_loss_mlp": 0.01038969, - "balance_loss_clip": 1.03388119, - "balance_loss_mlp": 1.02628565, - "epoch": 0.8113031715015783, - "flos": 31138321962240.0, - "grad_norm": 1.76439624492188, - "language_loss": 0.70763975, - "learning_rate": 3.619255354436885e-07, - "loss": 0.72894287, - "num_input_tokens_seen": 291241195, - "step": 13494, - "time_per_iteration": 2.6391232013702393 - }, - { - "auxiliary_loss_clip": 0.01100106, - "auxiliary_loss_mlp": 0.01033523, - "balance_loss_clip": 1.03767419, - "balance_loss_mlp": 1.02014816, - "epoch": 0.8113632947542462, - "flos": 25335445000320.0, - "grad_norm": 1.998515349302589, - "language_loss": 0.76569247, - "learning_rate": 3.6170211661385543e-07, - "loss": 0.78702873, - "num_input_tokens_seen": 291258715, - "step": 13495, - "time_per_iteration": 2.588968515396118 - }, - { - "auxiliary_loss_clip": 0.01089895, - "auxiliary_loss_mlp": 0.01036707, - "balance_loss_clip": 1.03693032, - "balance_loss_mlp": 1.02406478, - "epoch": 0.8114234180069142, - "flos": 28439168411520.0, - "grad_norm": 1.859703318738602, - "language_loss": 0.80103755, - "learning_rate": 3.614787599084417e-07, - "loss": 0.82230359, - "num_input_tokens_seen": 291278030, - "step": 13496, - "time_per_iteration": 2.612717390060425 - }, - { - "auxiliary_loss_clip": 0.01098421, - "auxiliary_loss_mlp": 0.01031509, - "balance_loss_clip": 1.0357132, - "balance_loss_mlp": 1.01813412, - "epoch": 0.8114835412595821, - "flos": 20338870584960.0, - "grad_norm": 1.7561157530405371, - "language_loss": 0.71104527, - "learning_rate": 3.6125546533591787e-07, - "loss": 0.73234457, - "num_input_tokens_seen": 291296740, - "step": 13497, - "time_per_iteration": 2.505080461502075 - }, - { - "auxiliary_loss_clip": 0.01073865, - "auxiliary_loss_mlp": 0.01031093, - "balance_loss_clip": 1.03375506, - "balance_loss_mlp": 1.01949358, - "epoch": 0.8115436645122501, - "flos": 22490889194880.0, - "grad_norm": 1.5327513158552182, - "language_loss": 0.76614642, - "learning_rate": 3.610322329047508e-07, - "loss": 0.78719592, - "num_input_tokens_seen": 291318730, - "step": 13498, - "time_per_iteration": 2.6442582607269287 - }, - { - "auxiliary_loss_clip": 0.01109819, - "auxiliary_loss_mlp": 0.01034546, - "balance_loss_clip": 1.03731918, - "balance_loss_mlp": 1.02193928, - "epoch": 0.811603787764918, - "flos": 13845288021120.0, - "grad_norm": 3.6462574824728313, - "language_loss": 0.84119499, - "learning_rate": 3.608090626234055e-07, - "loss": 0.86263865, - "num_input_tokens_seen": 291336755, - "step": 13499, - "time_per_iteration": 2.483522653579712 - }, - { - "auxiliary_loss_clip": 0.01075443, - "auxiliary_loss_mlp": 0.01031753, - "balance_loss_clip": 1.03560185, - "balance_loss_mlp": 1.01798427, - "epoch": 0.8116639110175861, - "flos": 21614632911360.0, - "grad_norm": 1.449342518398089, - "language_loss": 0.76081306, - "learning_rate": 3.6058595450034603e-07, - "loss": 0.78188503, - "num_input_tokens_seen": 291356795, - "step": 13500, - "time_per_iteration": 2.605076313018799 - }, - { - "auxiliary_loss_clip": 0.01008001, - "auxiliary_loss_mlp": 0.00999684, - "balance_loss_clip": 1.00579894, - "balance_loss_mlp": 0.9987666, - "epoch": 0.811724034270254, - "flos": 64459799625600.0, - "grad_norm": 0.8052955879746776, - "language_loss": 0.59879011, - "learning_rate": 3.603629085440303e-07, - "loss": 0.61886698, - "num_input_tokens_seen": 291416005, - "step": 13501, - "time_per_iteration": 3.1991348266601562 - }, - { - "auxiliary_loss_clip": 0.01094365, - "auxiliary_loss_mlp": 0.01025632, - "balance_loss_clip": 1.03644705, - "balance_loss_mlp": 1.01366997, - "epoch": 0.811784157522922, - "flos": 24754123290240.0, - "grad_norm": 1.57620399349307, - "language_loss": 0.79127729, - "learning_rate": 3.6013992476291753e-07, - "loss": 0.81247723, - "num_input_tokens_seen": 291434870, - "step": 13502, - "time_per_iteration": 2.612614154815674 - }, - { - "auxiliary_loss_clip": 0.01081743, - "auxiliary_loss_mlp": 0.01037896, - "balance_loss_clip": 1.03356886, - "balance_loss_mlp": 1.02457452, - "epoch": 0.81184428077559, - "flos": 12167146563840.0, - "grad_norm": 1.8233893425242464, - "language_loss": 0.71166331, - "learning_rate": 3.599170031654635e-07, - "loss": 0.73285973, - "num_input_tokens_seen": 291452230, - "step": 13503, - "time_per_iteration": 2.61946964263916 - }, - { - "auxiliary_loss_clip": 0.01079859, - "auxiliary_loss_mlp": 0.01030554, - "balance_loss_clip": 1.03437757, - "balance_loss_mlp": 1.01704192, - "epoch": 0.8119044040282579, - "flos": 44422037775360.0, - "grad_norm": 1.451264981868303, - "language_loss": 0.67748487, - "learning_rate": 3.5969414376012065e-07, - "loss": 0.69858897, - "num_input_tokens_seen": 291477425, - "step": 13504, - "time_per_iteration": 2.850944995880127 - }, - { - "auxiliary_loss_clip": 0.01081144, - "auxiliary_loss_mlp": 0.01032065, - "balance_loss_clip": 1.03502822, - "balance_loss_mlp": 1.01892805, - "epoch": 0.8119645272809259, - "flos": 52155507957120.0, - "grad_norm": 6.247382834720127, - "language_loss": 0.74465597, - "learning_rate": 3.594713465553403e-07, - "loss": 0.76578808, - "num_input_tokens_seen": 291501070, - "step": 13505, - "time_per_iteration": 2.9765865802764893 - }, - { - "auxiliary_loss_clip": 0.01085863, - "auxiliary_loss_mlp": 0.01031618, - "balance_loss_clip": 1.03640378, - "balance_loss_mlp": 1.01850462, - "epoch": 0.8120246505335939, - "flos": 30232978640640.0, - "grad_norm": 1.9161202158648818, - "language_loss": 0.73033476, - "learning_rate": 3.5924861155957123e-07, - "loss": 0.75150955, - "num_input_tokens_seen": 291524945, - "step": 13506, - "time_per_iteration": 2.7573962211608887 - }, - { - "auxiliary_loss_clip": 0.01114046, - "auxiliary_loss_mlp": 0.01031523, - "balance_loss_clip": 1.03798163, - "balance_loss_mlp": 1.01900649, - "epoch": 0.8120847737862619, - "flos": 22127652910080.0, - "grad_norm": 2.1835357751588664, - "language_loss": 0.75858426, - "learning_rate": 3.590259387812593e-07, - "loss": 0.78003991, - "num_input_tokens_seen": 291544605, - "step": 13507, - "time_per_iteration": 2.5932223796844482 - }, - { - "auxiliary_loss_clip": 0.01110179, - "auxiliary_loss_mlp": 0.01028613, - "balance_loss_clip": 1.03553689, - "balance_loss_mlp": 1.01663268, - "epoch": 0.8121448970389298, - "flos": 23295180579840.0, - "grad_norm": 1.789952904121179, - "language_loss": 0.70542687, - "learning_rate": 3.5880332822884783e-07, - "loss": 0.72681475, - "num_input_tokens_seen": 291563850, - "step": 13508, - "time_per_iteration": 2.6186447143554688 - }, - { - "auxiliary_loss_clip": 0.01096661, - "auxiliary_loss_mlp": 0.01033361, - "balance_loss_clip": 1.03715634, - "balance_loss_mlp": 1.02163088, - "epoch": 0.8122050202915978, - "flos": 22164138149760.0, - "grad_norm": 2.344890772054223, - "language_loss": 0.75989026, - "learning_rate": 3.585807799107785e-07, - "loss": 0.78119051, - "num_input_tokens_seen": 291581730, - "step": 13509, - "time_per_iteration": 2.589594841003418 - }, - { - "auxiliary_loss_clip": 0.01110373, - "auxiliary_loss_mlp": 0.01030617, - "balance_loss_clip": 1.03736436, - "balance_loss_mlp": 1.01834416, - "epoch": 0.8122651435442657, - "flos": 23258946735360.0, - "grad_norm": 1.705010281678355, - "language_loss": 0.76900029, - "learning_rate": 3.58358293835491e-07, - "loss": 0.79041028, - "num_input_tokens_seen": 291601225, - "step": 13510, - "time_per_iteration": 2.6146199703216553 - }, - { - "auxiliary_loss_clip": 0.01099011, - "auxiliary_loss_mlp": 0.01035544, - "balance_loss_clip": 1.0352056, - "balance_loss_mlp": 1.02240086, - "epoch": 0.8123252667969337, - "flos": 16140015365760.0, - "grad_norm": 2.0243900954468446, - "language_loss": 0.69868124, - "learning_rate": 3.581358700114212e-07, - "loss": 0.72002673, - "num_input_tokens_seen": 291616995, - "step": 13511, - "time_per_iteration": 2.5841333866119385 - }, - { - "auxiliary_loss_clip": 0.0108991, - "auxiliary_loss_mlp": 0.01035297, - "balance_loss_clip": 1.03681922, - "balance_loss_mlp": 1.0227139, - "epoch": 0.8123853900496016, - "flos": 21245399055360.0, - "grad_norm": 1.8851833910284228, - "language_loss": 0.79274458, - "learning_rate": 3.57913508447004e-07, - "loss": 0.81399667, - "num_input_tokens_seen": 291636145, - "step": 13512, - "time_per_iteration": 2.6589250564575195 - }, - { - "auxiliary_loss_clip": 0.01096941, - "auxiliary_loss_mlp": 0.01028829, - "balance_loss_clip": 1.0360527, - "balance_loss_mlp": 1.01692605, - "epoch": 0.8124455133022697, - "flos": 64377596373120.0, - "grad_norm": 1.5979406334713135, - "language_loss": 0.63230824, - "learning_rate": 3.5769120915067076e-07, - "loss": 0.65356594, - "num_input_tokens_seen": 291662440, - "step": 13513, - "time_per_iteration": 2.990612030029297 - }, - { - "auxiliary_loss_clip": 0.01058491, - "auxiliary_loss_mlp": 0.01033418, - "balance_loss_clip": 1.03237724, - "balance_loss_mlp": 1.02057886, - "epoch": 0.8125056365549376, - "flos": 23842207779840.0, - "grad_norm": 1.8613542328195332, - "language_loss": 0.71270061, - "learning_rate": 3.5746897213085194e-07, - "loss": 0.73361969, - "num_input_tokens_seen": 291680950, - "step": 13514, - "time_per_iteration": 4.445888519287109 - }, - { - "auxiliary_loss_clip": 0.01073863, - "auxiliary_loss_mlp": 0.01030266, - "balance_loss_clip": 1.03600311, - "balance_loss_mlp": 1.01780891, - "epoch": 0.8125657598076056, - "flos": 23550325862400.0, - "grad_norm": 1.5528453792894483, - "language_loss": 0.62748504, - "learning_rate": 3.5724679739597364e-07, - "loss": 0.64852631, - "num_input_tokens_seen": 291702395, - "step": 13515, - "time_per_iteration": 4.218576192855835 - }, - { - "auxiliary_loss_clip": 0.01102975, - "auxiliary_loss_mlp": 0.00769685, - "balance_loss_clip": 1.03534853, - "balance_loss_mlp": 1.00016952, - "epoch": 0.8126258830602736, - "flos": 20704225772160.0, - "grad_norm": 5.114192306914355, - "language_loss": 0.75868255, - "learning_rate": 3.570246849544616e-07, - "loss": 0.7774092, - "num_input_tokens_seen": 291721135, - "step": 13516, - "time_per_iteration": 2.6113100051879883 - }, - { - "auxiliary_loss_clip": 0.0105995, - "auxiliary_loss_mlp": 0.01028758, - "balance_loss_clip": 1.03563333, - "balance_loss_mlp": 1.01692092, - "epoch": 0.8126860063129415, - "flos": 23618160696960.0, - "grad_norm": 1.4792626024719475, - "language_loss": 0.91386318, - "learning_rate": 3.5680263481473907e-07, - "loss": 0.9347502, - "num_input_tokens_seen": 291741235, - "step": 13517, - "time_per_iteration": 2.730743408203125 - }, - { - "auxiliary_loss_clip": 0.01101276, - "auxiliary_loss_mlp": 0.00770067, - "balance_loss_clip": 1.03974628, - "balance_loss_mlp": 1.00018048, - "epoch": 0.8127461295656095, - "flos": 25007149670400.0, - "grad_norm": 1.60312479075504, - "language_loss": 0.78797936, - "learning_rate": 3.565806469852244e-07, - "loss": 0.80669272, - "num_input_tokens_seen": 291761430, - "step": 13518, - "time_per_iteration": 4.1632232666015625 - }, - { - "auxiliary_loss_clip": 0.01096668, - "auxiliary_loss_mlp": 0.01029335, - "balance_loss_clip": 1.03643668, - "balance_loss_mlp": 1.01815319, - "epoch": 0.8128062528182775, - "flos": 27342169096320.0, - "grad_norm": 1.5642213115065133, - "language_loss": 0.78870726, - "learning_rate": 3.56358721474336e-07, - "loss": 0.80996728, - "num_input_tokens_seen": 291781755, - "step": 13519, - "time_per_iteration": 2.681206226348877 - }, - { - "auxiliary_loss_clip": 0.01109503, - "auxiliary_loss_mlp": 0.01033337, - "balance_loss_clip": 1.03672457, - "balance_loss_mlp": 1.02139187, - "epoch": 0.8128663760709455, - "flos": 26506312634880.0, - "grad_norm": 1.5860289304426558, - "language_loss": 0.70635796, - "learning_rate": 3.561368582904905e-07, - "loss": 0.72778636, - "num_input_tokens_seen": 291804410, - "step": 13520, - "time_per_iteration": 2.6522674560546875 - }, - { - "auxiliary_loss_clip": 0.01093185, - "auxiliary_loss_mlp": 0.01033091, - "balance_loss_clip": 1.03861439, - "balance_loss_mlp": 1.02049041, - "epoch": 0.8129264993236134, - "flos": 17931239815680.0, - "grad_norm": 1.3829440346213167, - "language_loss": 0.7262553, - "learning_rate": 3.5591505744209925e-07, - "loss": 0.74751806, - "num_input_tokens_seen": 291823285, - "step": 13521, - "time_per_iteration": 2.7141287326812744 - }, - { - "auxiliary_loss_clip": 0.01101075, - "auxiliary_loss_mlp": 0.01030446, - "balance_loss_clip": 1.03675306, - "balance_loss_mlp": 1.01783347, - "epoch": 0.8129866225762814, - "flos": 26177694082560.0, - "grad_norm": 1.8750216603820857, - "language_loss": 0.70207542, - "learning_rate": 3.5569331893757394e-07, - "loss": 0.7233907, - "num_input_tokens_seen": 291845305, - "step": 13522, - "time_per_iteration": 2.7125802040100098 - }, - { - "auxiliary_loss_clip": 0.01093707, - "auxiliary_loss_mlp": 0.01032133, - "balance_loss_clip": 1.03634274, - "balance_loss_mlp": 1.02078998, - "epoch": 0.8130467458289493, - "flos": 21032197879680.0, - "grad_norm": 1.6227036980267577, - "language_loss": 0.7064119, - "learning_rate": 3.554716427853233e-07, - "loss": 0.72767031, - "num_input_tokens_seen": 291863715, - "step": 13523, - "time_per_iteration": 4.0815582275390625 - }, - { - "auxiliary_loss_clip": 0.01096974, - "auxiliary_loss_mlp": 0.01031155, - "balance_loss_clip": 1.03545976, - "balance_loss_mlp": 1.01855457, - "epoch": 0.8131068690816173, - "flos": 15487051979520.0, - "grad_norm": 2.0371214657099435, - "language_loss": 0.70833939, - "learning_rate": 3.5525002899375256e-07, - "loss": 0.7296207, - "num_input_tokens_seen": 291880735, - "step": 13524, - "time_per_iteration": 2.6003952026367188 - }, - { - "auxiliary_loss_clip": 0.01095723, - "auxiliary_loss_mlp": 0.01030696, - "balance_loss_clip": 1.03494859, - "balance_loss_mlp": 1.01905477, - "epoch": 0.8131669923342852, - "flos": 29351227576320.0, - "grad_norm": 1.8442373927503088, - "language_loss": 0.62811154, - "learning_rate": 3.550284775712653e-07, - "loss": 0.64937574, - "num_input_tokens_seen": 291900535, - "step": 13525, - "time_per_iteration": 2.657466411590576 - }, - { - "auxiliary_loss_clip": 0.01079403, - "auxiliary_loss_mlp": 0.01036654, - "balance_loss_clip": 1.03601646, - "balance_loss_mlp": 1.02482271, - "epoch": 0.8132271155869533, - "flos": 35256162055680.0, - "grad_norm": 1.777429638442415, - "language_loss": 0.65313601, - "learning_rate": 3.548069885262628e-07, - "loss": 0.67429662, - "num_input_tokens_seen": 291919760, - "step": 13526, - "time_per_iteration": 2.7304532527923584 - }, - { - "auxiliary_loss_clip": 0.01083448, - "auxiliary_loss_mlp": 0.0102643, - "balance_loss_clip": 1.03576994, - "balance_loss_mlp": 1.01569486, - "epoch": 0.8132872388396212, - "flos": 27781895393280.0, - "grad_norm": 1.6432032258331546, - "language_loss": 0.75642002, - "learning_rate": 3.5458556186714473e-07, - "loss": 0.77751887, - "num_input_tokens_seen": 291938915, - "step": 13527, - "time_per_iteration": 2.667377471923828 - }, - { - "auxiliary_loss_clip": 0.01107517, - "auxiliary_loss_mlp": 0.01026255, - "balance_loss_clip": 1.03658116, - "balance_loss_mlp": 1.01454246, - "epoch": 0.8133473620922892, - "flos": 27819601695360.0, - "grad_norm": 2.105634119207497, - "language_loss": 0.70704925, - "learning_rate": 3.5436419760230706e-07, - "loss": 0.728387, - "num_input_tokens_seen": 291958145, - "step": 13528, - "time_per_iteration": 2.6163675785064697 - }, - { - "auxiliary_loss_clip": 0.01108822, - "auxiliary_loss_mlp": 0.01030423, - "balance_loss_clip": 1.03638566, - "balance_loss_mlp": 1.01875806, - "epoch": 0.8134074853449572, - "flos": 18989527248000.0, - "grad_norm": 1.9964746744268802, - "language_loss": 0.69046456, - "learning_rate": 3.5414289574014357e-07, - "loss": 0.71185702, - "num_input_tokens_seen": 291976860, - "step": 13529, - "time_per_iteration": 2.5404155254364014 - }, - { - "auxiliary_loss_clip": 0.01089372, - "auxiliary_loss_mlp": 0.01031807, - "balance_loss_clip": 1.03602207, - "balance_loss_mlp": 1.02028584, - "epoch": 0.8134676085976251, - "flos": 24242863057920.0, - "grad_norm": 2.1717910963600615, - "language_loss": 0.77427143, - "learning_rate": 3.5392165628904635e-07, - "loss": 0.79548317, - "num_input_tokens_seen": 291998085, - "step": 13530, - "time_per_iteration": 2.617090940475464 - }, - { - "auxiliary_loss_clip": 0.01097307, - "auxiliary_loss_mlp": 0.01035822, - "balance_loss_clip": 1.03674924, - "balance_loss_mlp": 1.02292991, - "epoch": 0.8135277318502931, - "flos": 19062389986560.0, - "grad_norm": 4.052594441167417, - "language_loss": 0.81679058, - "learning_rate": 3.537004792574052e-07, - "loss": 0.83812189, - "num_input_tokens_seen": 292016585, - "step": 13531, - "time_per_iteration": 2.6205062866210938 - }, - { - "auxiliary_loss_clip": 0.01084413, - "auxiliary_loss_mlp": 0.01034781, - "balance_loss_clip": 1.03383708, - "balance_loss_mlp": 1.02089322, - "epoch": 0.813587855102961, - "flos": 17269728992640.0, - "grad_norm": 1.9466339664768382, - "language_loss": 0.72048044, - "learning_rate": 3.534793646536065e-07, - "loss": 0.7416724, - "num_input_tokens_seen": 292033255, - "step": 13532, - "time_per_iteration": 2.6235249042510986 - }, - { - "auxiliary_loss_clip": 0.01076826, - "auxiliary_loss_mlp": 0.01028928, - "balance_loss_clip": 1.03568232, - "balance_loss_mlp": 1.01717389, - "epoch": 0.8136479783556291, - "flos": 20157593621760.0, - "grad_norm": 1.944089941040769, - "language_loss": 0.7643801, - "learning_rate": 3.5325831248603533e-07, - "loss": 0.78543758, - "num_input_tokens_seen": 292051800, - "step": 13533, - "time_per_iteration": 2.686540126800537 - }, - { - "auxiliary_loss_clip": 0.01112795, - "auxiliary_loss_mlp": 0.0077037, - "balance_loss_clip": 1.03745687, - "balance_loss_mlp": 1.00021124, - "epoch": 0.813708101608297, - "flos": 22052348046720.0, - "grad_norm": 1.645533384240896, - "language_loss": 0.76579952, - "learning_rate": 3.5303732276307495e-07, - "loss": 0.78463125, - "num_input_tokens_seen": 292072215, - "step": 13534, - "time_per_iteration": 2.6405410766601562 - }, - { - "auxiliary_loss_clip": 0.01090662, - "auxiliary_loss_mlp": 0.01028024, - "balance_loss_clip": 1.03678405, - "balance_loss_mlp": 1.01722336, - "epoch": 0.813768224860965, - "flos": 16173412035840.0, - "grad_norm": 2.1008954563080153, - "language_loss": 0.93045878, - "learning_rate": 3.5281639549310336e-07, - "loss": 0.95164573, - "num_input_tokens_seen": 292088830, - "step": 13535, - "time_per_iteration": 2.64209246635437 - }, - { - "auxiliary_loss_clip": 0.01071147, - "auxiliary_loss_mlp": 0.01027139, - "balance_loss_clip": 1.0390265, - "balance_loss_mlp": 1.01590967, - "epoch": 0.8138283481136329, - "flos": 24352318776960.0, - "grad_norm": 1.5385602481593355, - "language_loss": 0.70752996, - "learning_rate": 3.52595530684499e-07, - "loss": 0.72851282, - "num_input_tokens_seen": 292109225, - "step": 13536, - "time_per_iteration": 2.80938720703125 - }, - { - "auxiliary_loss_clip": 0.01072251, - "auxiliary_loss_mlp": 0.01029565, - "balance_loss_clip": 1.034621, - "balance_loss_mlp": 1.01691151, - "epoch": 0.8138884713663009, - "flos": 25516362827520.0, - "grad_norm": 1.7915852283109845, - "language_loss": 0.75374007, - "learning_rate": 3.5237472834563775e-07, - "loss": 0.77475834, - "num_input_tokens_seen": 292129660, - "step": 13537, - "time_per_iteration": 2.709963798522949 - }, - { - "auxiliary_loss_clip": 0.01083975, - "auxiliary_loss_mlp": 0.01039156, - "balance_loss_clip": 1.03624582, - "balance_loss_mlp": 1.02596569, - "epoch": 0.8139485946189688, - "flos": 22454368041600.0, - "grad_norm": 1.531988851802544, - "language_loss": 0.76327688, - "learning_rate": 3.5215398848489163e-07, - "loss": 0.78450817, - "num_input_tokens_seen": 292149090, - "step": 13538, - "time_per_iteration": 2.659142255783081 - }, - { - "auxiliary_loss_clip": 0.01090459, - "auxiliary_loss_mlp": 0.01029562, - "balance_loss_clip": 1.03432798, - "balance_loss_mlp": 1.01791525, - "epoch": 0.8140087178716369, - "flos": 21250391045760.0, - "grad_norm": 1.5412733274323733, - "language_loss": 0.78075993, - "learning_rate": 3.5193331111063176e-07, - "loss": 0.80196011, - "num_input_tokens_seen": 292169260, - "step": 13539, - "time_per_iteration": 2.637423515319824 - }, - { - "auxiliary_loss_clip": 0.01073968, - "auxiliary_loss_mlp": 0.01029706, - "balance_loss_clip": 1.04544592, - "balance_loss_mlp": 1.01841712, - "epoch": 0.8140688411243048, - "flos": 39415730774400.0, - "grad_norm": 2.1381487111045145, - "language_loss": 0.66290975, - "learning_rate": 3.5171269623122533e-07, - "loss": 0.68394649, - "num_input_tokens_seen": 292188145, - "step": 13540, - "time_per_iteration": 2.8771181106567383 - }, - { - "auxiliary_loss_clip": 0.01101069, - "auxiliary_loss_mlp": 0.01033274, - "balance_loss_clip": 1.03914928, - "balance_loss_mlp": 1.02268267, - "epoch": 0.8141289643769728, - "flos": 25415885508480.0, - "grad_norm": 1.6197380880165504, - "language_loss": 0.67438757, - "learning_rate": 3.5149214385503913e-07, - "loss": 0.69573104, - "num_input_tokens_seen": 292212135, - "step": 13541, - "time_per_iteration": 2.769536018371582 - }, - { - "auxiliary_loss_clip": 0.01106222, - "auxiliary_loss_mlp": 0.01034432, - "balance_loss_clip": 1.03566313, - "balance_loss_mlp": 1.02187276, - "epoch": 0.8141890876296408, - "flos": 12568053237120.0, - "grad_norm": 1.8846151947016416, - "language_loss": 0.69230938, - "learning_rate": 3.512716539904355e-07, - "loss": 0.71371591, - "num_input_tokens_seen": 292230645, - "step": 13542, - "time_per_iteration": 2.6285057067871094 - }, - { - "auxiliary_loss_clip": 0.0111203, - "auxiliary_loss_mlp": 0.01033744, - "balance_loss_clip": 1.03642273, - "balance_loss_mlp": 1.02083373, - "epoch": 0.8142492108823087, - "flos": 14967172483200.0, - "grad_norm": 2.925687794386818, - "language_loss": 0.79454219, - "learning_rate": 3.5105122664577613e-07, - "loss": 0.81599998, - "num_input_tokens_seen": 292243540, - "step": 13543, - "time_per_iteration": 2.5403339862823486 - }, - { - "auxiliary_loss_clip": 0.01081798, - "auxiliary_loss_mlp": 0.01035405, - "balance_loss_clip": 1.03946197, - "balance_loss_mlp": 1.02264941, - "epoch": 0.8143093341349767, - "flos": 12422004537600.0, - "grad_norm": 2.0517480511317774, - "language_loss": 0.78091002, - "learning_rate": 3.5083086182942003e-07, - "loss": 0.80208206, - "num_input_tokens_seen": 292261715, - "step": 13544, - "time_per_iteration": 2.782600164413452 - }, - { - "auxiliary_loss_clip": 0.01116058, - "auxiliary_loss_mlp": 0.01031813, - "balance_loss_clip": 1.03913999, - "balance_loss_mlp": 1.01734674, - "epoch": 0.8143694573876447, - "flos": 11910564737280.0, - "grad_norm": 3.0853831344468707, - "language_loss": 0.7382375, - "learning_rate": 3.5061055954972264e-07, - "loss": 0.75971621, - "num_input_tokens_seen": 292275080, - "step": 13545, - "time_per_iteration": 2.631141185760498 - }, - { - "auxiliary_loss_clip": 0.01096875, - "auxiliary_loss_mlp": 0.01029281, - "balance_loss_clip": 1.03640938, - "balance_loss_mlp": 1.0174439, - "epoch": 0.8144295806403127, - "flos": 21212900225280.0, - "grad_norm": 1.6342395606373197, - "language_loss": 0.76933265, - "learning_rate": 3.5039031981503776e-07, - "loss": 0.79059422, - "num_input_tokens_seen": 292294635, - "step": 13546, - "time_per_iteration": 2.6105756759643555 - }, - { - "auxiliary_loss_clip": 0.0110063, - "auxiliary_loss_mlp": 0.01030809, - "balance_loss_clip": 1.0386976, - "balance_loss_mlp": 1.01948416, - "epoch": 0.8144897038929806, - "flos": 19865280741120.0, - "grad_norm": 2.057693835457072, - "language_loss": 0.70437783, - "learning_rate": 3.501701426337178e-07, - "loss": 0.72569221, - "num_input_tokens_seen": 292312695, - "step": 13547, - "time_per_iteration": 2.6459848880767822 - }, - { - "auxiliary_loss_clip": 0.01112435, - "auxiliary_loss_mlp": 0.01036703, - "balance_loss_clip": 1.03837729, - "balance_loss_mlp": 1.02320886, - "epoch": 0.8145498271456486, - "flos": 24571733005440.0, - "grad_norm": 1.7911803251126166, - "language_loss": 0.70297545, - "learning_rate": 3.49950028014111e-07, - "loss": 0.7244668, - "num_input_tokens_seen": 292332005, - "step": 13548, - "time_per_iteration": 2.7214651107788086 - }, - { - "auxiliary_loss_clip": 0.01099863, - "auxiliary_loss_mlp": 0.01033065, - "balance_loss_clip": 1.03860509, - "balance_loss_mlp": 1.01963055, - "epoch": 0.8146099503983165, - "flos": 20193037367040.0, - "grad_norm": 4.113506280616557, - "language_loss": 0.77017093, - "learning_rate": 3.4972997596456444e-07, - "loss": 0.79150021, - "num_input_tokens_seen": 292348365, - "step": 13549, - "time_per_iteration": 2.6624276638031006 - }, - { - "auxiliary_loss_clip": 0.011122, - "auxiliary_loss_mlp": 0.01030353, - "balance_loss_clip": 1.03999424, - "balance_loss_mlp": 1.01782978, - "epoch": 0.8146700736509845, - "flos": 19536949497600.0, - "grad_norm": 1.9226967396977621, - "language_loss": 0.71076775, - "learning_rate": 3.4950998649342233e-07, - "loss": 0.73219323, - "num_input_tokens_seen": 292368050, - "step": 13550, - "time_per_iteration": 2.7254621982574463 - }, - { - "auxiliary_loss_clip": 0.01094556, - "auxiliary_loss_mlp": 0.01025932, - "balance_loss_clip": 1.0368104, - "balance_loss_mlp": 1.01444018, - "epoch": 0.8147301969036524, - "flos": 18041341979520.0, - "grad_norm": 1.9888509797715757, - "language_loss": 0.71529424, - "learning_rate": 3.4929005960902826e-07, - "loss": 0.73649913, - "num_input_tokens_seen": 292385315, - "step": 13551, - "time_per_iteration": 2.704594850540161 - }, - { - "auxiliary_loss_clip": 0.01072466, - "auxiliary_loss_mlp": 0.01036963, - "balance_loss_clip": 1.03925037, - "balance_loss_mlp": 1.02343869, - "epoch": 0.8147903201563205, - "flos": 18004713085440.0, - "grad_norm": 1.9897080161612837, - "language_loss": 0.68656695, - "learning_rate": 3.4907019531971926e-07, - "loss": 0.70766115, - "num_input_tokens_seen": 292403375, - "step": 13552, - "time_per_iteration": 2.7425405979156494 - }, - { - "auxiliary_loss_clip": 0.01107317, - "auxiliary_loss_mlp": 0.01043397, - "balance_loss_clip": 1.03570342, - "balance_loss_mlp": 1.03133857, - "epoch": 0.8148504434089884, - "flos": 20259327916800.0, - "grad_norm": 1.7008594179120202, - "language_loss": 0.82082725, - "learning_rate": 3.4885039363383407e-07, - "loss": 0.84233445, - "num_input_tokens_seen": 292419260, - "step": 13553, - "time_per_iteration": 2.5453405380249023 - }, - { - "auxiliary_loss_clip": 0.01097272, - "auxiliary_loss_mlp": 0.01030289, - "balance_loss_clip": 1.035079, - "balance_loss_mlp": 1.01831412, - "epoch": 0.8149105666616564, - "flos": 12494723621760.0, - "grad_norm": 1.6418052636171558, - "language_loss": 0.67904902, - "learning_rate": 3.4863065455970795e-07, - "loss": 0.70032459, - "num_input_tokens_seen": 292436095, - "step": 13554, - "time_per_iteration": 4.209248781204224 - }, - { - "auxiliary_loss_clip": 0.01082623, - "auxiliary_loss_mlp": 0.01041493, - "balance_loss_clip": 1.035748, - "balance_loss_mlp": 1.02727127, - "epoch": 0.8149706899143244, - "flos": 32523683662080.0, - "grad_norm": 1.9729231386540171, - "language_loss": 0.66057062, - "learning_rate": 3.484109781056723e-07, - "loss": 0.68181175, - "num_input_tokens_seen": 292457190, - "step": 13555, - "time_per_iteration": 4.274117708206177 - }, - { - "auxiliary_loss_clip": 0.01102138, - "auxiliary_loss_mlp": 0.01034591, - "balance_loss_clip": 1.03688693, - "balance_loss_mlp": 1.02167439, - "epoch": 0.8150308131669923, - "flos": 19386088375680.0, - "grad_norm": 2.108444825647498, - "language_loss": 0.7319755, - "learning_rate": 3.4819136428005844e-07, - "loss": 0.75334281, - "num_input_tokens_seen": 292474300, - "step": 13556, - "time_per_iteration": 2.5886549949645996 - }, - { - "auxiliary_loss_clip": 0.01099496, - "auxiliary_loss_mlp": 0.01027956, - "balance_loss_clip": 1.03907287, - "balance_loss_mlp": 1.01664877, - "epoch": 0.8150909364196604, - "flos": 17421380213760.0, - "grad_norm": 1.654846698931865, - "language_loss": 0.80619091, - "learning_rate": 3.4797181309119307e-07, - "loss": 0.82746542, - "num_input_tokens_seen": 292492420, - "step": 13557, - "time_per_iteration": 4.058533430099487 - }, - { - "auxiliary_loss_clip": 0.01089108, - "auxiliary_loss_mlp": 0.01034069, - "balance_loss_clip": 1.03591609, - "balance_loss_mlp": 1.02201128, - "epoch": 0.8151510596723283, - "flos": 27162795553920.0, - "grad_norm": 1.7508168660237897, - "language_loss": 0.6597842, - "learning_rate": 3.4775232454740255e-07, - "loss": 0.68101597, - "num_input_tokens_seen": 292512895, - "step": 13558, - "time_per_iteration": 2.7690083980560303 - }, - { - "auxiliary_loss_clip": 0.01029498, - "auxiliary_loss_mlp": 0.01004693, - "balance_loss_clip": 1.00695944, - "balance_loss_mlp": 1.00384712, - "epoch": 0.8152111829249963, - "flos": 64219052718720.0, - "grad_norm": 0.8394726411943846, - "language_loss": 0.56896985, - "learning_rate": 3.4753289865700896e-07, - "loss": 0.58931184, - "num_input_tokens_seen": 292566580, - "step": 13559, - "time_per_iteration": 3.114321231842041 - }, - { - "auxiliary_loss_clip": 0.01012079, - "auxiliary_loss_mlp": 0.01011711, - "balance_loss_clip": 1.00770724, - "balance_loss_mlp": 1.0104531, - "epoch": 0.8152713061776642, - "flos": 67072012306560.0, - "grad_norm": 0.6789957550904517, - "language_loss": 0.55196381, - "learning_rate": 3.473135354283334e-07, - "loss": 0.57220173, - "num_input_tokens_seen": 292621490, - "step": 13560, - "time_per_iteration": 3.059293746948242 - }, - { - "auxiliary_loss_clip": 0.0108755, - "auxiliary_loss_mlp": 0.01029779, - "balance_loss_clip": 1.03620529, - "balance_loss_mlp": 1.01832318, - "epoch": 0.8153314294303322, - "flos": 14391130072320.0, - "grad_norm": 1.7604364526960343, - "language_loss": 0.67580026, - "learning_rate": 3.470942348696948e-07, - "loss": 0.6969735, - "num_input_tokens_seen": 292638660, - "step": 13561, - "time_per_iteration": 2.659605026245117 - }, - { - "auxiliary_loss_clip": 0.01103139, - "auxiliary_loss_mlp": 0.01035516, - "balance_loss_clip": 1.03822076, - "balance_loss_mlp": 1.02304101, - "epoch": 0.8153915526830001, - "flos": 25623520076160.0, - "grad_norm": 1.5670796727664797, - "language_loss": 0.81579733, - "learning_rate": 3.468749969894085e-07, - "loss": 0.83718389, - "num_input_tokens_seen": 292658545, - "step": 13562, - "time_per_iteration": 4.182463884353638 - }, - { - "auxiliary_loss_clip": 0.01085183, - "auxiliary_loss_mlp": 0.01033103, - "balance_loss_clip": 1.03773975, - "balance_loss_mlp": 1.02135468, - "epoch": 0.8154516759356681, - "flos": 23369156640000.0, - "grad_norm": 1.459474907859823, - "language_loss": 0.71938479, - "learning_rate": 3.4665582179578734e-07, - "loss": 0.74056768, - "num_input_tokens_seen": 292678460, - "step": 13563, - "time_per_iteration": 2.695099353790283 - }, - { - "auxiliary_loss_clip": 0.01025068, - "auxiliary_loss_mlp": 0.01029122, - "balance_loss_clip": 1.03488255, - "balance_loss_mlp": 1.01562715, - "epoch": 0.815511799188336, - "flos": 28149189914880.0, - "grad_norm": 1.6109076046410835, - "language_loss": 0.702739, - "learning_rate": 3.4643670929714387e-07, - "loss": 0.72328091, - "num_input_tokens_seen": 292699815, - "step": 13564, - "time_per_iteration": 3.0163979530334473 - }, - { - "auxiliary_loss_clip": 0.0108271, - "auxiliary_loss_mlp": 0.0102893, - "balance_loss_clip": 1.03672302, - "balance_loss_mlp": 1.01679492, - "epoch": 0.8155719224410041, - "flos": 16983413683200.0, - "grad_norm": 2.0873578348376745, - "language_loss": 0.70476174, - "learning_rate": 3.462176595017854e-07, - "loss": 0.72587812, - "num_input_tokens_seen": 292717370, - "step": 13565, - "time_per_iteration": 2.8652422428131104 - }, - { - "auxiliary_loss_clip": 0.01097982, - "auxiliary_loss_mlp": 0.01031925, - "balance_loss_clip": 1.03627336, - "balance_loss_mlp": 1.01994491, - "epoch": 0.815632045693672, - "flos": 24681727428480.0, - "grad_norm": 1.7436798411842787, - "language_loss": 0.78950644, - "learning_rate": 3.459986724180188e-07, - "loss": 0.8108055, - "num_input_tokens_seen": 292737110, - "step": 13566, - "time_per_iteration": 2.6846365928649902 - }, - { - "auxiliary_loss_clip": 0.01087086, - "auxiliary_loss_mlp": 0.01029651, - "balance_loss_clip": 1.03798318, - "balance_loss_mlp": 1.01873779, - "epoch": 0.81569216894634, - "flos": 19938323047680.0, - "grad_norm": 1.5991898000196176, - "language_loss": 0.82388943, - "learning_rate": 3.457797480541491e-07, - "loss": 0.84505683, - "num_input_tokens_seen": 292756510, - "step": 13567, - "time_per_iteration": 2.6953818798065186 - }, - { - "auxiliary_loss_clip": 0.01105808, - "auxiliary_loss_mlp": 0.01028496, - "balance_loss_clip": 1.03625703, - "balance_loss_mlp": 1.01798785, - "epoch": 0.8157522921990079, - "flos": 21799393493760.0, - "grad_norm": 2.2084802673905592, - "language_loss": 0.79599839, - "learning_rate": 3.455608864184771e-07, - "loss": 0.81734145, - "num_input_tokens_seen": 292776710, - "step": 13568, - "time_per_iteration": 2.6095540523529053 - }, - { - "auxiliary_loss_clip": 0.01088313, - "auxiliary_loss_mlp": 0.01030015, - "balance_loss_clip": 1.03864861, - "balance_loss_mlp": 1.01857734, - "epoch": 0.8158124154516759, - "flos": 18508323720960.0, - "grad_norm": 1.8748620768134565, - "language_loss": 0.77194703, - "learning_rate": 3.453420875193016e-07, - "loss": 0.79313028, - "num_input_tokens_seen": 292794350, - "step": 13569, - "time_per_iteration": 2.7158358097076416 - }, - { - "auxiliary_loss_clip": 0.01107012, - "auxiliary_loss_mlp": 0.01039138, - "balance_loss_clip": 1.03705049, - "balance_loss_mlp": 1.02786636, - "epoch": 0.815872538704344, - "flos": 26830801123200.0, - "grad_norm": 2.207936196273456, - "language_loss": 0.59039974, - "learning_rate": 3.451233513649199e-07, - "loss": 0.61186123, - "num_input_tokens_seen": 292814005, - "step": 13570, - "time_per_iteration": 2.6274027824401855 - }, - { - "auxiliary_loss_clip": 0.01099743, - "auxiliary_loss_mlp": 0.01037351, - "balance_loss_clip": 1.03609109, - "balance_loss_mlp": 1.02433372, - "epoch": 0.8159326619570119, - "flos": 21725704742400.0, - "grad_norm": 1.7456808566314872, - "language_loss": 0.8209976, - "learning_rate": 3.4490467796362687e-07, - "loss": 0.84236854, - "num_input_tokens_seen": 292833485, - "step": 13571, - "time_per_iteration": 2.607311725616455 - }, - { - "auxiliary_loss_clip": 0.01082011, - "auxiliary_loss_mlp": 0.01040519, - "balance_loss_clip": 1.03530788, - "balance_loss_mlp": 1.02747178, - "epoch": 0.8159927852096799, - "flos": 13840726993920.0, - "grad_norm": 2.3386046142984966, - "language_loss": 0.7775113, - "learning_rate": 3.446860673237142e-07, - "loss": 0.79873657, - "num_input_tokens_seen": 292848045, - "step": 13572, - "time_per_iteration": 2.615434169769287 - }, - { - "auxiliary_loss_clip": 0.01110553, - "auxiliary_loss_mlp": 0.01033153, - "balance_loss_clip": 1.0374527, - "balance_loss_mlp": 1.0209341, - "epoch": 0.8160529084623478, - "flos": 24499516711680.0, - "grad_norm": 1.477093078405139, - "language_loss": 0.65240854, - "learning_rate": 3.4446751945347186e-07, - "loss": 0.67384559, - "num_input_tokens_seen": 292869965, - "step": 13573, - "time_per_iteration": 2.6414575576782227 - }, - { - "auxiliary_loss_clip": 0.01075717, - "auxiliary_loss_mlp": 0.01028632, - "balance_loss_clip": 1.03710008, - "balance_loss_mlp": 1.0173732, - "epoch": 0.8161130317150158, - "flos": 24826339584000.0, - "grad_norm": 1.642166809234801, - "language_loss": 0.75473046, - "learning_rate": 3.442490343611868e-07, - "loss": 0.77577394, - "num_input_tokens_seen": 292889680, - "step": 13574, - "time_per_iteration": 2.85577392578125 - }, - { - "auxiliary_loss_clip": 0.01101144, - "auxiliary_loss_mlp": 0.0103536, - "balance_loss_clip": 1.03803658, - "balance_loss_mlp": 1.02264595, - "epoch": 0.8161731549676837, - "flos": 30956542208640.0, - "grad_norm": 2.5612570404533157, - "language_loss": 0.60302323, - "learning_rate": 3.4403061205514485e-07, - "loss": 0.62438828, - "num_input_tokens_seen": 292912360, - "step": 13575, - "time_per_iteration": 2.725813627243042 - }, - { - "auxiliary_loss_clip": 0.01030079, - "auxiliary_loss_mlp": 0.01039791, - "balance_loss_clip": 1.03109765, - "balance_loss_mlp": 1.02550936, - "epoch": 0.8162332782203517, - "flos": 18551991680640.0, - "grad_norm": 7.314687575537146, - "language_loss": 0.74446952, - "learning_rate": 3.4381225254362736e-07, - "loss": 0.76516831, - "num_input_tokens_seen": 292928325, - "step": 13576, - "time_per_iteration": 2.8337759971618652 - }, - { - "auxiliary_loss_clip": 0.01010195, - "auxiliary_loss_mlp": 0.01001162, - "balance_loss_clip": 1.00829458, - "balance_loss_mlp": 1.00028598, - "epoch": 0.8162934014730197, - "flos": 70386853904640.0, - "grad_norm": 0.8299990748373413, - "language_loss": 0.58698022, - "learning_rate": 3.435939558349155e-07, - "loss": 0.60709381, - "num_input_tokens_seen": 292992795, - "step": 13577, - "time_per_iteration": 3.217165470123291 - }, - { - "auxiliary_loss_clip": 0.01050236, - "auxiliary_loss_mlp": 0.01031701, - "balance_loss_clip": 1.03245091, - "balance_loss_mlp": 1.01977444, - "epoch": 0.8163535247256877, - "flos": 21214839559680.0, - "grad_norm": 1.6040267571253908, - "language_loss": 0.70921433, - "learning_rate": 3.4337572193728747e-07, - "loss": 0.73003376, - "num_input_tokens_seen": 293011950, - "step": 13578, - "time_per_iteration": 2.840709686279297 - }, - { - "auxiliary_loss_clip": 0.01068471, - "auxiliary_loss_mlp": 0.01030739, - "balance_loss_clip": 1.03506184, - "balance_loss_mlp": 1.0190444, - "epoch": 0.8164136479783556, - "flos": 21098847565440.0, - "grad_norm": 2.7752862595751977, - "language_loss": 0.73731124, - "learning_rate": 3.431575508590172e-07, - "loss": 0.7583034, - "num_input_tokens_seen": 293030175, - "step": 13579, - "time_per_iteration": 2.812387704849243 - }, - { - "auxiliary_loss_clip": 0.01110978, - "auxiliary_loss_mlp": 0.01028039, - "balance_loss_clip": 1.03761864, - "balance_loss_mlp": 1.01615429, - "epoch": 0.8164737712310236, - "flos": 21720640924800.0, - "grad_norm": 5.498991527014378, - "language_loss": 0.79516351, - "learning_rate": 3.4293944260837873e-07, - "loss": 0.81655371, - "num_input_tokens_seen": 293047980, - "step": 13580, - "time_per_iteration": 2.8092665672302246 - }, - { - "auxiliary_loss_clip": 0.01071948, - "auxiliary_loss_mlp": 0.01034847, - "balance_loss_clip": 1.03299272, - "balance_loss_mlp": 1.02182913, - "epoch": 0.8165338944836915, - "flos": 19536805843200.0, - "grad_norm": 1.723429137426299, - "language_loss": 0.69085348, - "learning_rate": 3.4272139719364314e-07, - "loss": 0.71192145, - "num_input_tokens_seen": 293067030, - "step": 13581, - "time_per_iteration": 2.7907984256744385 - }, - { - "auxiliary_loss_clip": 0.01107871, - "auxiliary_loss_mlp": 0.01032555, - "balance_loss_clip": 1.03613353, - "balance_loss_mlp": 1.02049136, - "epoch": 0.8165940177363595, - "flos": 22928568416640.0, - "grad_norm": 1.8992496957974652, - "language_loss": 0.59582806, - "learning_rate": 3.4250341462307786e-07, - "loss": 0.61723232, - "num_input_tokens_seen": 293085575, - "step": 13582, - "time_per_iteration": 2.72542405128479 - }, - { - "auxiliary_loss_clip": 0.0107424, - "auxiliary_loss_mlp": 0.00769809, - "balance_loss_clip": 1.03585207, - "balance_loss_mlp": 1.00015545, - "epoch": 0.8166541409890276, - "flos": 23370377702400.0, - "grad_norm": 1.954054796899383, - "language_loss": 0.82329261, - "learning_rate": 3.4228549490494897e-07, - "loss": 0.84173316, - "num_input_tokens_seen": 293108200, - "step": 13583, - "time_per_iteration": 2.749908685684204 - }, - { - "auxiliary_loss_clip": 0.01088673, - "auxiliary_loss_mlp": 0.01025238, - "balance_loss_clip": 1.03623259, - "balance_loss_mlp": 1.01392472, - "epoch": 0.8167142642416955, - "flos": 18441997257600.0, - "grad_norm": 1.802555874623744, - "language_loss": 0.74573183, - "learning_rate": 3.4206763804752093e-07, - "loss": 0.76687098, - "num_input_tokens_seen": 293126020, - "step": 13584, - "time_per_iteration": 2.8091073036193848 - }, - { - "auxiliary_loss_clip": 0.01098996, - "auxiliary_loss_mlp": 0.0102858, - "balance_loss_clip": 1.03830242, - "balance_loss_mlp": 1.01618278, - "epoch": 0.8167743874943635, - "flos": 21214983214080.0, - "grad_norm": 1.689121999421373, - "language_loss": 0.74577987, - "learning_rate": 3.4184984405905405e-07, - "loss": 0.76705563, - "num_input_tokens_seen": 293144620, - "step": 13585, - "time_per_iteration": 2.6251516342163086 - }, - { - "auxiliary_loss_clip": 0.01083034, - "auxiliary_loss_mlp": 0.01035775, - "balance_loss_clip": 1.03814149, - "balance_loss_mlp": 1.02334797, - "epoch": 0.8168345107470314, - "flos": 18697681244160.0, - "grad_norm": 2.110704900607274, - "language_loss": 0.6954788, - "learning_rate": 3.416321129478068e-07, - "loss": 0.71666694, - "num_input_tokens_seen": 293162850, - "step": 13586, - "time_per_iteration": 2.6488070487976074 - }, - { - "auxiliary_loss_clip": 0.01049954, - "auxiliary_loss_mlp": 0.01038255, - "balance_loss_clip": 1.03342056, - "balance_loss_mlp": 1.02592838, - "epoch": 0.8168946339996994, - "flos": 16253098358400.0, - "grad_norm": 1.5273465759672988, - "language_loss": 0.60744089, - "learning_rate": 3.4141444472203594e-07, - "loss": 0.62832302, - "num_input_tokens_seen": 293181620, - "step": 13587, - "time_per_iteration": 2.7878332138061523 - }, - { - "auxiliary_loss_clip": 0.01100484, - "auxiliary_loss_mlp": 0.0103421, - "balance_loss_clip": 1.03639674, - "balance_loss_mlp": 1.02172291, - "epoch": 0.8169547572523673, - "flos": 26941585645440.0, - "grad_norm": 2.223800946247814, - "language_loss": 0.6970458, - "learning_rate": 3.4119683938999624e-07, - "loss": 0.71839273, - "num_input_tokens_seen": 293200270, - "step": 13588, - "time_per_iteration": 2.692920207977295 - }, - { - "auxiliary_loss_clip": 0.01085855, - "auxiliary_loss_mlp": 0.01043402, - "balance_loss_clip": 1.03655553, - "balance_loss_mlp": 1.02848303, - "epoch": 0.8170148805050353, - "flos": 18952323736320.0, - "grad_norm": 1.5303676433154123, - "language_loss": 0.73124111, - "learning_rate": 3.4097929695993854e-07, - "loss": 0.75253367, - "num_input_tokens_seen": 293218960, - "step": 13589, - "time_per_iteration": 2.679173469543457 - }, - { - "auxiliary_loss_clip": 0.01094872, - "auxiliary_loss_mlp": 0.01032909, - "balance_loss_clip": 1.03692865, - "balance_loss_mlp": 1.02016521, - "epoch": 0.8170750037577033, - "flos": 21834909066240.0, - "grad_norm": 2.2699236258793456, - "language_loss": 0.73170865, - "learning_rate": 3.4076181744011166e-07, - "loss": 0.75298643, - "num_input_tokens_seen": 293236450, - "step": 13590, - "time_per_iteration": 2.661827802658081 - }, - { - "auxiliary_loss_clip": 0.01112691, - "auxiliary_loss_mlp": 0.01033597, - "balance_loss_clip": 1.03789759, - "balance_loss_mlp": 1.01964402, - "epoch": 0.8171351270103713, - "flos": 33507169021440.0, - "grad_norm": 2.228487135956597, - "language_loss": 0.65462661, - "learning_rate": 3.4054440083876345e-07, - "loss": 0.67608947, - "num_input_tokens_seen": 293256480, - "step": 13591, - "time_per_iteration": 2.713564872741699 - }, - { - "auxiliary_loss_clip": 0.01110837, - "auxiliary_loss_mlp": 0.01036337, - "balance_loss_clip": 1.03630888, - "balance_loss_mlp": 1.02364123, - "epoch": 0.8171952502630392, - "flos": 22708184520960.0, - "grad_norm": 2.2790144502571366, - "language_loss": 0.68108523, - "learning_rate": 3.403270471641373e-07, - "loss": 0.70255697, - "num_input_tokens_seen": 293274960, - "step": 13592, - "time_per_iteration": 2.673107862472534 - }, - { - "auxiliary_loss_clip": 0.01086566, - "auxiliary_loss_mlp": 0.01029843, - "balance_loss_clip": 1.03531361, - "balance_loss_mlp": 1.01699781, - "epoch": 0.8172553735157072, - "flos": 26723715701760.0, - "grad_norm": 1.5485466533329424, - "language_loss": 0.6656639, - "learning_rate": 3.401097564244759e-07, - "loss": 0.68682802, - "num_input_tokens_seen": 293295945, - "step": 13593, - "time_per_iteration": 2.738813877105713 - }, - { - "auxiliary_loss_clip": 0.01098161, - "auxiliary_loss_mlp": 0.01032061, - "balance_loss_clip": 1.03540421, - "balance_loss_mlp": 1.02022982, - "epoch": 0.8173154967683751, - "flos": 15961072786560.0, - "grad_norm": 1.90048610301986, - "language_loss": 0.69598675, - "learning_rate": 3.398925286280188e-07, - "loss": 0.71728897, - "num_input_tokens_seen": 293313300, - "step": 13594, - "time_per_iteration": 5.800758361816406 - }, - { - "auxiliary_loss_clip": 0.01110285, - "auxiliary_loss_mlp": 0.01033044, - "balance_loss_clip": 1.0364691, - "balance_loss_mlp": 1.02115333, - "epoch": 0.8173756200210431, - "flos": 25986720447360.0, - "grad_norm": 1.8053968351175154, - "language_loss": 0.65974349, - "learning_rate": 3.3967536378300456e-07, - "loss": 0.68117678, - "num_input_tokens_seen": 293333085, - "step": 13595, - "time_per_iteration": 2.6032371520996094 - }, - { - "auxiliary_loss_clip": 0.01068247, - "auxiliary_loss_mlp": 0.01028339, - "balance_loss_clip": 1.03591299, - "balance_loss_mlp": 1.01576889, - "epoch": 0.8174357432737112, - "flos": 25664422688640.0, - "grad_norm": 1.659795934344192, - "language_loss": 0.78425729, - "learning_rate": 3.394582618976658e-07, - "loss": 0.80522317, - "num_input_tokens_seen": 293351895, - "step": 13596, - "time_per_iteration": 4.231920003890991 - }, - { - "auxiliary_loss_clip": 0.01081938, - "auxiliary_loss_mlp": 0.01028873, - "balance_loss_clip": 1.03306651, - "balance_loss_mlp": 1.01600397, - "epoch": 0.8174958665263791, - "flos": 21835088634240.0, - "grad_norm": 2.5636613927912775, - "language_loss": 0.58887529, - "learning_rate": 3.392412229802362e-07, - "loss": 0.60998344, - "num_input_tokens_seen": 293371165, - "step": 13597, - "time_per_iteration": 2.699782133102417 - }, - { - "auxiliary_loss_clip": 0.0107094, - "auxiliary_loss_mlp": 0.01033853, - "balance_loss_clip": 1.03980625, - "balance_loss_mlp": 1.02193189, - "epoch": 0.8175559897790471, - "flos": 22455517276800.0, - "grad_norm": 1.534270538423627, - "language_loss": 0.82330656, - "learning_rate": 3.390242470389462e-07, - "loss": 0.84435457, - "num_input_tokens_seen": 293391150, - "step": 13598, - "time_per_iteration": 2.7620291709899902 - }, - { - "auxiliary_loss_clip": 0.01052171, - "auxiliary_loss_mlp": 0.01031716, - "balance_loss_clip": 1.03996241, - "balance_loss_mlp": 1.01993775, - "epoch": 0.817616113031715, - "flos": 23615790399360.0, - "grad_norm": 1.8636627263308922, - "language_loss": 0.82549691, - "learning_rate": 3.3880733408202277e-07, - "loss": 0.84633583, - "num_input_tokens_seen": 293409440, - "step": 13599, - "time_per_iteration": 2.8193368911743164 - }, - { - "auxiliary_loss_clip": 0.01057864, - "auxiliary_loss_mlp": 0.0104518, - "balance_loss_clip": 1.03178751, - "balance_loss_mlp": 1.03132749, - "epoch": 0.817676236284383, - "flos": 27672260106240.0, - "grad_norm": 2.111179301114437, - "language_loss": 0.83922112, - "learning_rate": 3.3859048411769186e-07, - "loss": 0.86025155, - "num_input_tokens_seen": 293428995, - "step": 13600, - "time_per_iteration": 2.7920475006103516 - }, - { - "auxiliary_loss_clip": 0.01074994, - "auxiliary_loss_mlp": 0.01031351, - "balance_loss_clip": 1.03580821, - "balance_loss_mlp": 1.01914954, - "epoch": 0.8177363595370509, - "flos": 24681009156480.0, - "grad_norm": 1.862299702432468, - "language_loss": 0.74226046, - "learning_rate": 3.383736971541766e-07, - "loss": 0.76332384, - "num_input_tokens_seen": 293449155, - "step": 13601, - "time_per_iteration": 4.308535575866699 - }, - { - "auxiliary_loss_clip": 0.01078366, - "auxiliary_loss_mlp": 0.01036015, - "balance_loss_clip": 1.03641343, - "balance_loss_mlp": 1.02314591, - "epoch": 0.817796482789719, - "flos": 17346326745600.0, - "grad_norm": 2.078289918028392, - "language_loss": 0.68360138, - "learning_rate": 3.3815697319969737e-07, - "loss": 0.70474523, - "num_input_tokens_seen": 293466125, - "step": 13602, - "time_per_iteration": 2.8116466999053955 - }, - { - "auxiliary_loss_clip": 0.01068639, - "auxiliary_loss_mlp": 0.01038644, - "balance_loss_clip": 1.03409863, - "balance_loss_mlp": 1.02547121, - "epoch": 0.8178566060423869, - "flos": 17778475272960.0, - "grad_norm": 2.118367882744336, - "language_loss": 0.83765864, - "learning_rate": 3.379403122624718e-07, - "loss": 0.85873151, - "num_input_tokens_seen": 293481345, - "step": 13603, - "time_per_iteration": 2.7411158084869385 - }, - { - "auxiliary_loss_clip": 0.0106116, - "auxiliary_loss_mlp": 0.01028949, - "balance_loss_clip": 1.03759289, - "balance_loss_mlp": 1.0176841, - "epoch": 0.8179167292950549, - "flos": 24973250209920.0, - "grad_norm": 1.7705965391975051, - "language_loss": 0.69410896, - "learning_rate": 3.377237143507159e-07, - "loss": 0.71501005, - "num_input_tokens_seen": 293502330, - "step": 13604, - "time_per_iteration": 2.7547354698181152 - }, - { - "auxiliary_loss_clip": 0.01081221, - "auxiliary_loss_mlp": 0.01034665, - "balance_loss_clip": 1.03777099, - "balance_loss_mlp": 1.02226162, - "epoch": 0.8179768525477228, - "flos": 22856783086080.0, - "grad_norm": 1.8951606880095677, - "language_loss": 0.74119198, - "learning_rate": 3.3750717947264406e-07, - "loss": 0.76235086, - "num_input_tokens_seen": 293521415, - "step": 13605, - "time_per_iteration": 2.7130730152130127 - }, - { - "auxiliary_loss_clip": 0.01071497, - "auxiliary_loss_mlp": 0.01038946, - "balance_loss_clip": 1.03906167, - "balance_loss_mlp": 1.02588034, - "epoch": 0.8180369758003908, - "flos": 18515147304960.0, - "grad_norm": 1.7730120877057978, - "language_loss": 0.73990393, - "learning_rate": 3.372907076364666e-07, - "loss": 0.76100838, - "num_input_tokens_seen": 293539245, - "step": 13606, - "time_per_iteration": 2.705872058868408 - }, - { - "auxiliary_loss_clip": 0.01108658, - "auxiliary_loss_mlp": 0.01032239, - "balance_loss_clip": 1.03782868, - "balance_loss_mlp": 1.02010965, - "epoch": 0.8180970990530587, - "flos": 33182105915520.0, - "grad_norm": 1.7215775601325016, - "language_loss": 0.65496033, - "learning_rate": 3.370742988503916e-07, - "loss": 0.67636931, - "num_input_tokens_seen": 293560640, - "step": 13607, - "time_per_iteration": 2.695094347000122 - }, - { - "auxiliary_loss_clip": 0.01087636, - "auxiliary_loss_mlp": 0.01030931, - "balance_loss_clip": 1.0383904, - "balance_loss_mlp": 1.0186528, - "epoch": 0.8181572223057267, - "flos": 25010022758400.0, - "grad_norm": 1.9180357233704657, - "language_loss": 0.70527983, - "learning_rate": 3.3685795312262634e-07, - "loss": 0.72646552, - "num_input_tokens_seen": 293579465, - "step": 13608, - "time_per_iteration": 2.7109787464141846 - }, - { - "auxiliary_loss_clip": 0.01094237, - "auxiliary_loss_mlp": 0.01033072, - "balance_loss_clip": 1.03419423, - "balance_loss_mlp": 1.02114487, - "epoch": 0.8182173455583948, - "flos": 28548731871360.0, - "grad_norm": 1.94764090555504, - "language_loss": 0.79518479, - "learning_rate": 3.366416704613735e-07, - "loss": 0.81645787, - "num_input_tokens_seen": 293600540, - "step": 13609, - "time_per_iteration": 2.678457736968994 - }, - { - "auxiliary_loss_clip": 0.01006167, - "auxiliary_loss_mlp": 0.01001094, - "balance_loss_clip": 1.01206219, - "balance_loss_mlp": 0.99999696, - "epoch": 0.8182774688110627, - "flos": 72028043245440.0, - "grad_norm": 0.745693768883286, - "language_loss": 0.55858743, - "learning_rate": 3.3642545087483544e-07, - "loss": 0.57866001, - "num_input_tokens_seen": 293665160, - "step": 13610, - "time_per_iteration": 3.287687063217163 - }, - { - "auxiliary_loss_clip": 0.01043521, - "auxiliary_loss_mlp": 0.00770311, - "balance_loss_clip": 1.02925563, - "balance_loss_mlp": 1.00016284, - "epoch": 0.8183375920637307, - "flos": 19755358145280.0, - "grad_norm": 1.923535272295543, - "language_loss": 0.77933627, - "learning_rate": 3.362092943712107e-07, - "loss": 0.79747456, - "num_input_tokens_seen": 293683995, - "step": 13611, - "time_per_iteration": 2.757842540740967 - }, - { - "auxiliary_loss_clip": 0.01074897, - "auxiliary_loss_mlp": 0.01033531, - "balance_loss_clip": 1.03499138, - "balance_loss_mlp": 1.01989329, - "epoch": 0.8183977153163986, - "flos": 22341895580160.0, - "grad_norm": 1.792092336455415, - "language_loss": 0.77061421, - "learning_rate": 3.3599320095869745e-07, - "loss": 0.79169852, - "num_input_tokens_seen": 293704115, - "step": 13612, - "time_per_iteration": 2.7527639865875244 - }, - { - "auxiliary_loss_clip": 0.01070156, - "auxiliary_loss_mlp": 0.01026228, - "balance_loss_clip": 1.03287673, - "balance_loss_mlp": 1.01489091, - "epoch": 0.8184578385690666, - "flos": 17712472032000.0, - "grad_norm": 2.2843501898761205, - "language_loss": 0.86122215, - "learning_rate": 3.3577717064548793e-07, - "loss": 0.88218594, - "num_input_tokens_seen": 293722225, - "step": 13613, - "time_per_iteration": 2.7401769161224365 - }, - { - "auxiliary_loss_clip": 0.01098117, - "auxiliary_loss_mlp": 0.01045961, - "balance_loss_clip": 1.03796077, - "balance_loss_mlp": 1.03408742, - "epoch": 0.8185179618217345, - "flos": 25701159323520.0, - "grad_norm": 2.6943480518584906, - "language_loss": 0.72842276, - "learning_rate": 3.355612034397746e-07, - "loss": 0.74986356, - "num_input_tokens_seen": 293743995, - "step": 13614, - "time_per_iteration": 2.680565118789673 - }, - { - "auxiliary_loss_clip": 0.01085324, - "auxiliary_loss_mlp": 0.01040844, - "balance_loss_clip": 1.03373837, - "balance_loss_mlp": 1.02824354, - "epoch": 0.8185780850744026, - "flos": 25960326929280.0, - "grad_norm": 1.7330678379647075, - "language_loss": 0.81346858, - "learning_rate": 3.353452993497479e-07, - "loss": 0.83473027, - "num_input_tokens_seen": 293764935, - "step": 13615, - "time_per_iteration": 2.715773105621338 - }, - { - "auxiliary_loss_clip": 0.01093975, - "auxiliary_loss_mlp": 0.01032579, - "balance_loss_clip": 1.03279996, - "balance_loss_mlp": 1.01989484, - "epoch": 0.8186382083270705, - "flos": 25228431406080.0, - "grad_norm": 3.391470400733545, - "language_loss": 0.75472414, - "learning_rate": 3.3512945838359375e-07, - "loss": 0.77598965, - "num_input_tokens_seen": 293784035, - "step": 13616, - "time_per_iteration": 2.6478960514068604 - }, - { - "auxiliary_loss_clip": 0.0106733, - "auxiliary_loss_mlp": 0.01043672, - "balance_loss_clip": 1.03091192, - "balance_loss_mlp": 1.02980757, - "epoch": 0.8186983315797385, - "flos": 22415009713920.0, - "grad_norm": 1.7062309242094946, - "language_loss": 0.75500989, - "learning_rate": 3.349136805494979e-07, - "loss": 0.77611995, - "num_input_tokens_seen": 293803360, - "step": 13617, - "time_per_iteration": 2.7314293384552 - }, - { - "auxiliary_loss_clip": 0.01080104, - "auxiliary_loss_mlp": 0.01032113, - "balance_loss_clip": 1.03370297, - "balance_loss_mlp": 1.02053142, - "epoch": 0.8187584548324064, - "flos": 22018017623040.0, - "grad_norm": 1.9943005109582315, - "language_loss": 0.68466866, - "learning_rate": 3.346979658556415e-07, - "loss": 0.70579082, - "num_input_tokens_seen": 293821325, - "step": 13618, - "time_per_iteration": 2.7118663787841797 - }, - { - "auxiliary_loss_clip": 0.01086635, - "auxiliary_loss_mlp": 0.01032733, - "balance_loss_clip": 1.03645062, - "balance_loss_mlp": 1.01954257, - "epoch": 0.8188185780850744, - "flos": 29241664116480.0, - "grad_norm": 1.958275526623864, - "language_loss": 0.69876873, - "learning_rate": 3.344823143102058e-07, - "loss": 0.71996242, - "num_input_tokens_seen": 293840315, - "step": 13619, - "time_per_iteration": 2.7601280212402344 - }, - { - "auxiliary_loss_clip": 0.01051452, - "auxiliary_loss_mlp": 0.01029042, - "balance_loss_clip": 1.03892016, - "balance_loss_mlp": 1.01647735, - "epoch": 0.8188787013377423, - "flos": 20696504348160.0, - "grad_norm": 1.8298313202907792, - "language_loss": 0.73982012, - "learning_rate": 3.3426672592136694e-07, - "loss": 0.760625, - "num_input_tokens_seen": 293855685, - "step": 13620, - "time_per_iteration": 2.782697916030884 - }, - { - "auxiliary_loss_clip": 0.01079658, - "auxiliary_loss_mlp": 0.00772167, - "balance_loss_clip": 1.03250647, - "balance_loss_mlp": 1.00025058, - "epoch": 0.8189388245904103, - "flos": 23732967542400.0, - "grad_norm": 1.515288767485074, - "language_loss": 0.76337874, - "learning_rate": 3.340512006973011e-07, - "loss": 0.78189701, - "num_input_tokens_seen": 293875540, - "step": 13621, - "time_per_iteration": 2.681579828262329 - }, - { - "auxiliary_loss_clip": 0.01082197, - "auxiliary_loss_mlp": 0.01030264, - "balance_loss_clip": 1.03172946, - "balance_loss_mlp": 1.01746082, - "epoch": 0.8189989478430784, - "flos": 28255090187520.0, - "grad_norm": 2.436105215072431, - "language_loss": 0.66058964, - "learning_rate": 3.3383573864618076e-07, - "loss": 0.68171418, - "num_input_tokens_seen": 293896570, - "step": 13622, - "time_per_iteration": 2.753495216369629 - }, - { - "auxiliary_loss_clip": 0.01111281, - "auxiliary_loss_mlp": 0.01030003, - "balance_loss_clip": 1.03886437, - "balance_loss_mlp": 1.01628244, - "epoch": 0.8190590710957463, - "flos": 21397696721280.0, - "grad_norm": 1.992471820034199, - "language_loss": 0.74813384, - "learning_rate": 3.3362033977617653e-07, - "loss": 0.76954669, - "num_input_tokens_seen": 293914680, - "step": 13623, - "time_per_iteration": 2.6488537788391113 - }, - { - "auxiliary_loss_clip": 0.01085531, - "auxiliary_loss_mlp": 0.01039034, - "balance_loss_clip": 1.03339553, - "balance_loss_mlp": 1.02606368, - "epoch": 0.8191191943484143, - "flos": 38796451367040.0, - "grad_norm": 1.888675270274182, - "language_loss": 0.63241279, - "learning_rate": 3.3340500409545527e-07, - "loss": 0.65365839, - "num_input_tokens_seen": 293936480, - "step": 13624, - "time_per_iteration": 2.9440207481384277 - }, - { - "auxiliary_loss_clip": 0.01106162, - "auxiliary_loss_mlp": 0.01034735, - "balance_loss_clip": 1.03641939, - "balance_loss_mlp": 1.02273679, - "epoch": 0.8191793176010822, - "flos": 25446516831360.0, - "grad_norm": 1.6219303590574095, - "language_loss": 0.78032911, - "learning_rate": 3.3318973161218386e-07, - "loss": 0.80173808, - "num_input_tokens_seen": 293957815, - "step": 13625, - "time_per_iteration": 2.685042381286621 - }, - { - "auxiliary_loss_clip": 0.01101604, - "auxiliary_loss_mlp": 0.00771173, - "balance_loss_clip": 1.0347513, - "balance_loss_mlp": 1.0001961, - "epoch": 0.8192394408537502, - "flos": 25083029151360.0, - "grad_norm": 2.016240511414733, - "language_loss": 0.75687516, - "learning_rate": 3.329745223345244e-07, - "loss": 0.77560294, - "num_input_tokens_seen": 293975440, - "step": 13626, - "time_per_iteration": 2.637768507003784 - }, - { - "auxiliary_loss_clip": 0.01098049, - "auxiliary_loss_mlp": 0.0103675, - "balance_loss_clip": 1.0376209, - "balance_loss_mlp": 1.02519846, - "epoch": 0.8192995641064181, - "flos": 27673732563840.0, - "grad_norm": 1.5972949724439707, - "language_loss": 0.73228663, - "learning_rate": 3.3275937627063823e-07, - "loss": 0.75363463, - "num_input_tokens_seen": 293997540, - "step": 13627, - "time_per_iteration": 2.7295448780059814 - }, - { - "auxiliary_loss_clip": 0.01109571, - "auxiliary_loss_mlp": 0.01033076, - "balance_loss_clip": 1.03797257, - "balance_loss_mlp": 1.02066636, - "epoch": 0.8193596873590862, - "flos": 21288492397440.0, - "grad_norm": 1.651474068364024, - "language_loss": 0.69222027, - "learning_rate": 3.3254429342868353e-07, - "loss": 0.71364677, - "num_input_tokens_seen": 294017030, - "step": 13628, - "time_per_iteration": 2.6305129528045654 - }, - { - "auxiliary_loss_clip": 0.01087095, - "auxiliary_loss_mlp": 0.01045505, - "balance_loss_clip": 1.0360409, - "balance_loss_mlp": 1.03141403, - "epoch": 0.8194198106117541, - "flos": 17492626840320.0, - "grad_norm": 2.3448084033624115, - "language_loss": 0.85264301, - "learning_rate": 3.323292738168171e-07, - "loss": 0.87396896, - "num_input_tokens_seen": 294035700, - "step": 13629, - "time_per_iteration": 2.6781747341156006 - }, - { - "auxiliary_loss_clip": 0.01106506, - "auxiliary_loss_mlp": 0.01026288, - "balance_loss_clip": 1.03619409, - "balance_loss_mlp": 1.01409864, - "epoch": 0.8194799338644221, - "flos": 15267925059840.0, - "grad_norm": 2.0184519411378345, - "language_loss": 0.73626029, - "learning_rate": 3.3211431744319084e-07, - "loss": 0.75758827, - "num_input_tokens_seen": 294049730, - "step": 13630, - "time_per_iteration": 2.6452038288116455 - }, - { - "auxiliary_loss_clip": 0.01096556, - "auxiliary_loss_mlp": 0.01039124, - "balance_loss_clip": 1.03655708, - "balance_loss_mlp": 1.02556396, - "epoch": 0.81954005711709, - "flos": 14718814871040.0, - "grad_norm": 1.8847266375290428, - "language_loss": 0.72261512, - "learning_rate": 3.31899424315957e-07, - "loss": 0.74397194, - "num_input_tokens_seen": 294066545, - "step": 13631, - "time_per_iteration": 2.595489025115967 - }, - { - "auxiliary_loss_clip": 0.01108623, - "auxiliary_loss_mlp": 0.01030377, - "balance_loss_clip": 1.03625333, - "balance_loss_mlp": 1.018224, - "epoch": 0.819600180369758, - "flos": 23074042498560.0, - "grad_norm": 1.78491625477661, - "language_loss": 0.76710784, - "learning_rate": 3.3168459444326447e-07, - "loss": 0.78849781, - "num_input_tokens_seen": 294087455, - "step": 13632, - "time_per_iteration": 2.639312267303467 - }, - { - "auxiliary_loss_clip": 0.01081621, - "auxiliary_loss_mlp": 0.01031706, - "balance_loss_clip": 1.03269899, - "balance_loss_mlp": 1.01979756, - "epoch": 0.8196603036224259, - "flos": 27599792417280.0, - "grad_norm": 3.6495669730083455, - "language_loss": 0.65916097, - "learning_rate": 3.314698278332588e-07, - "loss": 0.68029428, - "num_input_tokens_seen": 294107480, - "step": 13633, - "time_per_iteration": 4.429157733917236 - }, - { - "auxiliary_loss_clip": 0.01090266, - "auxiliary_loss_mlp": 0.01037639, - "balance_loss_clip": 1.03390145, - "balance_loss_mlp": 1.02634966, - "epoch": 0.8197204268750939, - "flos": 28582020800640.0, - "grad_norm": 1.4436935437112157, - "language_loss": 0.75417399, - "learning_rate": 3.3125512449408513e-07, - "loss": 0.77545297, - "num_input_tokens_seen": 294130115, - "step": 13634, - "time_per_iteration": 4.236420392990112 - }, - { - "auxiliary_loss_clip": 0.01049415, - "auxiliary_loss_mlp": 0.00769002, - "balance_loss_clip": 1.03555465, - "balance_loss_mlp": 1.00017309, - "epoch": 0.819780550127762, - "flos": 23258300290560.0, - "grad_norm": 1.863716594786732, - "language_loss": 0.82285905, - "learning_rate": 3.310404844338841e-07, - "loss": 0.84104323, - "num_input_tokens_seen": 294148495, - "step": 13635, - "time_per_iteration": 4.350587606430054 - }, - { - "auxiliary_loss_clip": 0.01094136, - "auxiliary_loss_mlp": 0.01031626, - "balance_loss_clip": 1.03306413, - "balance_loss_mlp": 1.01876307, - "epoch": 0.8198406733804299, - "flos": 26685255214080.0, - "grad_norm": 1.580556826967959, - "language_loss": 0.7557019, - "learning_rate": 3.308259076607949e-07, - "loss": 0.77695948, - "num_input_tokens_seen": 294169595, - "step": 13636, - "time_per_iteration": 2.694965362548828 - }, - { - "auxiliary_loss_clip": 0.01085829, - "auxiliary_loss_mlp": 0.01034289, - "balance_loss_clip": 1.04320598, - "balance_loss_mlp": 1.02125335, - "epoch": 0.8199007966330979, - "flos": 20084084438400.0, - "grad_norm": 2.291328351334751, - "language_loss": 0.81272769, - "learning_rate": 3.3061139418295445e-07, - "loss": 0.83392888, - "num_input_tokens_seen": 294183885, - "step": 13637, - "time_per_iteration": 2.730604410171509 - }, - { - "auxiliary_loss_clip": 0.01097936, - "auxiliary_loss_mlp": 0.01031089, - "balance_loss_clip": 1.03770888, - "balance_loss_mlp": 1.01860201, - "epoch": 0.8199609198857658, - "flos": 31902788142720.0, - "grad_norm": 2.2206002932791566, - "language_loss": 0.710298, - "learning_rate": 3.3039694400849725e-07, - "loss": 0.73158824, - "num_input_tokens_seen": 294200150, - "step": 13638, - "time_per_iteration": 2.683467149734497 - }, - { - "auxiliary_loss_clip": 0.01061969, - "auxiliary_loss_mlp": 0.0103106, - "balance_loss_clip": 1.0327965, - "balance_loss_mlp": 1.01680839, - "epoch": 0.8200210431384338, - "flos": 26470150617600.0, - "grad_norm": 1.942665681540599, - "language_loss": 0.79615062, - "learning_rate": 3.3018255714555564e-07, - "loss": 0.81708086, - "num_input_tokens_seen": 294220385, - "step": 13639, - "time_per_iteration": 2.7710959911346436 - }, - { - "auxiliary_loss_clip": 0.01062834, - "auxiliary_loss_mlp": 0.01033155, - "balance_loss_clip": 1.03322732, - "balance_loss_mlp": 1.02089465, - "epoch": 0.8200811663911017, - "flos": 22091454979200.0, - "grad_norm": 1.6392982589425356, - "language_loss": 0.79226673, - "learning_rate": 3.299682336022589e-07, - "loss": 0.81322664, - "num_input_tokens_seen": 294239355, - "step": 13640, - "time_per_iteration": 4.275204658508301 - }, - { - "auxiliary_loss_clip": 0.01076176, - "auxiliary_loss_mlp": 0.01035588, - "balance_loss_clip": 1.03405476, - "balance_loss_mlp": 1.0229578, - "epoch": 0.8201412896437698, - "flos": 37593659520000.0, - "grad_norm": 1.7308217218168405, - "language_loss": 0.63248229, - "learning_rate": 3.297539733867336e-07, - "loss": 0.65359992, - "num_input_tokens_seen": 294259395, - "step": 13641, - "time_per_iteration": 2.795254945755005 - }, - { - "auxiliary_loss_clip": 0.01056206, - "auxiliary_loss_mlp": 0.01028153, - "balance_loss_clip": 1.03538704, - "balance_loss_mlp": 1.01539707, - "epoch": 0.8202014128964377, - "flos": 19646333389440.0, - "grad_norm": 1.8557472198282705, - "language_loss": 0.73365706, - "learning_rate": 3.295397765071055e-07, - "loss": 0.75450063, - "num_input_tokens_seen": 294277365, - "step": 13642, - "time_per_iteration": 2.7157320976257324 - }, - { - "auxiliary_loss_clip": 0.01086181, - "auxiliary_loss_mlp": 0.01031537, - "balance_loss_clip": 1.03858817, - "balance_loss_mlp": 1.01963401, - "epoch": 0.8202615361491057, - "flos": 31467335564160.0, - "grad_norm": 2.095752785900936, - "language_loss": 0.70286655, - "learning_rate": 3.2932564297149615e-07, - "loss": 0.72404379, - "num_input_tokens_seen": 294297555, - "step": 13643, - "time_per_iteration": 2.7395925521850586 - }, - { - "auxiliary_loss_clip": 0.01097598, - "auxiliary_loss_mlp": 0.01031628, - "balance_loss_clip": 1.03775418, - "balance_loss_mlp": 1.01995778, - "epoch": 0.8203216594017736, - "flos": 24715555061760.0, - "grad_norm": 1.69681118758784, - "language_loss": 0.65516806, - "learning_rate": 3.291115727880256e-07, - "loss": 0.67646027, - "num_input_tokens_seen": 294317600, - "step": 13644, - "time_per_iteration": 2.6443233489990234 - }, - { - "auxiliary_loss_clip": 0.01069905, - "auxiliary_loss_mlp": 0.01034884, - "balance_loss_clip": 1.0356884, - "balance_loss_mlp": 1.02291584, - "epoch": 0.8203817826544416, - "flos": 26031824951040.0, - "grad_norm": 1.4101189561247485, - "language_loss": 0.70740688, - "learning_rate": 3.2889756596481234e-07, - "loss": 0.72845483, - "num_input_tokens_seen": 294340215, - "step": 13645, - "time_per_iteration": 2.7722573280334473 - }, - { - "auxiliary_loss_clip": 0.01083381, - "auxiliary_loss_mlp": 0.01027664, - "balance_loss_clip": 1.03680301, - "balance_loss_mlp": 1.01596987, - "epoch": 0.8204419059071095, - "flos": 25954544839680.0, - "grad_norm": 2.371583298507033, - "language_loss": 0.7132858, - "learning_rate": 3.286836225099707e-07, - "loss": 0.73439622, - "num_input_tokens_seen": 294358590, - "step": 13646, - "time_per_iteration": 2.713864803314209 - }, - { - "auxiliary_loss_clip": 0.01089571, - "auxiliary_loss_mlp": 0.01030919, - "balance_loss_clip": 1.036955, - "balance_loss_mlp": 1.01831234, - "epoch": 0.8205020291597775, - "flos": 23580059345280.0, - "grad_norm": 2.245036233958922, - "language_loss": 0.78633201, - "learning_rate": 3.284697424316132e-07, - "loss": 0.80753696, - "num_input_tokens_seen": 294375825, - "step": 13647, - "time_per_iteration": 2.659745693206787 - }, - { - "auxiliary_loss_clip": 0.01105517, - "auxiliary_loss_mlp": 0.01033284, - "balance_loss_clip": 1.03771901, - "balance_loss_mlp": 1.02169704, - "epoch": 0.8205621524124456, - "flos": 26799164219520.0, - "grad_norm": 1.7369474065732662, - "language_loss": 0.67728269, - "learning_rate": 3.2825592573785034e-07, - "loss": 0.69867074, - "num_input_tokens_seen": 294398500, - "step": 13648, - "time_per_iteration": 2.642002582550049 - }, - { - "auxiliary_loss_clip": 0.01080292, - "auxiliary_loss_mlp": 0.01028555, - "balance_loss_clip": 1.03181791, - "balance_loss_mlp": 1.0157932, - "epoch": 0.8206222756651135, - "flos": 27527863432320.0, - "grad_norm": 1.7471547354733792, - "language_loss": 0.80010235, - "learning_rate": 3.28042172436791e-07, - "loss": 0.82119077, - "num_input_tokens_seen": 294418840, - "step": 13649, - "time_per_iteration": 2.704329252243042 - }, - { - "auxiliary_loss_clip": 0.01092884, - "auxiliary_loss_mlp": 0.01034827, - "balance_loss_clip": 1.03850818, - "balance_loss_mlp": 1.0212965, - "epoch": 0.8206823989177815, - "flos": 21178605715200.0, - "grad_norm": 1.9987063648882384, - "language_loss": 0.69307315, - "learning_rate": 3.278284825365396e-07, - "loss": 0.71435022, - "num_input_tokens_seen": 294438215, - "step": 13650, - "time_per_iteration": 2.59381365776062 - }, - { - "auxiliary_loss_clip": 0.01090201, - "auxiliary_loss_mlp": 0.01031521, - "balance_loss_clip": 1.03758073, - "balance_loss_mlp": 1.01843143, - "epoch": 0.8207425221704494, - "flos": 11509622150400.0, - "grad_norm": 1.942606809988791, - "language_loss": 0.60333896, - "learning_rate": 3.276148560452001e-07, - "loss": 0.62455606, - "num_input_tokens_seen": 294455260, - "step": 13651, - "time_per_iteration": 2.620542287826538 - }, - { - "auxiliary_loss_clip": 0.01069774, - "auxiliary_loss_mlp": 0.00773358, - "balance_loss_clip": 1.03502905, - "balance_loss_mlp": 1.00031233, - "epoch": 0.8208026454231174, - "flos": 19791987039360.0, - "grad_norm": 3.123048822731667, - "language_loss": 0.72240758, - "learning_rate": 3.2740129297087293e-07, - "loss": 0.74083889, - "num_input_tokens_seen": 294473205, - "step": 13652, - "time_per_iteration": 2.7204532623291016 - }, - { - "auxiliary_loss_clip": 0.01081839, - "auxiliary_loss_mlp": 0.01030063, - "balance_loss_clip": 1.03512836, - "balance_loss_mlp": 1.01936436, - "epoch": 0.8208627686757853, - "flos": 15667538843520.0, - "grad_norm": 1.909630535987182, - "language_loss": 0.73210537, - "learning_rate": 3.271877933216558e-07, - "loss": 0.75322437, - "num_input_tokens_seen": 294490645, - "step": 13653, - "time_per_iteration": 2.6469080448150635 - }, - { - "auxiliary_loss_clip": 0.0107235, - "auxiliary_loss_mlp": 0.01036685, - "balance_loss_clip": 1.03659797, - "balance_loss_mlp": 1.02340472, - "epoch": 0.8209228919284534, - "flos": 37482659516160.0, - "grad_norm": 1.930498918584404, - "language_loss": 0.63319474, - "learning_rate": 3.269743571056451e-07, - "loss": 0.65428507, - "num_input_tokens_seen": 294513500, - "step": 13654, - "time_per_iteration": 2.9437685012817383 - }, - { - "auxiliary_loss_clip": 0.0108459, - "auxiliary_loss_mlp": 0.01029817, - "balance_loss_clip": 1.03793693, - "balance_loss_mlp": 1.01780069, - "epoch": 0.8209830151811213, - "flos": 23112969863040.0, - "grad_norm": 1.5668397368199467, - "language_loss": 0.70084441, - "learning_rate": 3.2676098433093447e-07, - "loss": 0.72198856, - "num_input_tokens_seen": 294535710, - "step": 13655, - "time_per_iteration": 2.7804574966430664 - }, - { - "auxiliary_loss_clip": 0.01084392, - "auxiliary_loss_mlp": 0.01036883, - "balance_loss_clip": 1.03608942, - "balance_loss_mlp": 1.0246346, - "epoch": 0.8210431384337893, - "flos": 21288169175040.0, - "grad_norm": 2.0172748125132283, - "language_loss": 0.82037187, - "learning_rate": 3.265476750056162e-07, - "loss": 0.84158462, - "num_input_tokens_seen": 294554055, - "step": 13656, - "time_per_iteration": 2.721017599105835 - }, - { - "auxiliary_loss_clip": 0.01080199, - "auxiliary_loss_mlp": 0.01030184, - "balance_loss_clip": 1.03631461, - "balance_loss_mlp": 1.01812029, - "epoch": 0.8211032616864572, - "flos": 11502403516800.0, - "grad_norm": 2.1429350332327335, - "language_loss": 0.74038959, - "learning_rate": 3.2633442913777654e-07, - "loss": 0.76149338, - "num_input_tokens_seen": 294570390, - "step": 13657, - "time_per_iteration": 2.6449975967407227 - }, - { - "auxiliary_loss_clip": 0.01076624, - "auxiliary_loss_mlp": 0.01033144, - "balance_loss_clip": 1.03495431, - "balance_loss_mlp": 1.02119923, - "epoch": 0.8211633849391252, - "flos": 29821477455360.0, - "grad_norm": 1.677076204685542, - "language_loss": 0.55757195, - "learning_rate": 3.2612124673550325e-07, - "loss": 0.57866967, - "num_input_tokens_seen": 294593050, - "step": 13658, - "time_per_iteration": 2.7866504192352295 - }, - { - "auxiliary_loss_clip": 0.01046948, - "auxiliary_loss_mlp": 0.01032121, - "balance_loss_clip": 1.03354919, - "balance_loss_mlp": 1.01984835, - "epoch": 0.8212235081917931, - "flos": 13115439573120.0, - "grad_norm": 2.054958093178623, - "language_loss": 0.78911436, - "learning_rate": 3.259081278068805e-07, - "loss": 0.80990505, - "num_input_tokens_seen": 294608550, - "step": 13659, - "time_per_iteration": 2.7733964920043945 - }, - { - "auxiliary_loss_clip": 0.01090521, - "auxiliary_loss_mlp": 0.01028594, - "balance_loss_clip": 1.03315973, - "balance_loss_mlp": 1.01845503, - "epoch": 0.8212836314444611, - "flos": 40515351782400.0, - "grad_norm": 1.7003866148099478, - "language_loss": 0.59908175, - "learning_rate": 3.256950723599887e-07, - "loss": 0.62027293, - "num_input_tokens_seen": 294630380, - "step": 13660, - "time_per_iteration": 2.7818117141723633 - }, - { - "auxiliary_loss_clip": 0.01096127, - "auxiliary_loss_mlp": 0.01034813, - "balance_loss_clip": 1.03519523, - "balance_loss_mlp": 1.0208652, - "epoch": 0.8213437546971292, - "flos": 18770543982720.0, - "grad_norm": 2.120880379867683, - "language_loss": 0.73009235, - "learning_rate": 3.254820804029075e-07, - "loss": 0.75140172, - "num_input_tokens_seen": 294648655, - "step": 13661, - "time_per_iteration": 2.5873122215270996 - }, - { - "auxiliary_loss_clip": 0.01093175, - "auxiliary_loss_mlp": 0.01030555, - "balance_loss_clip": 1.03569698, - "balance_loss_mlp": 1.01827097, - "epoch": 0.8214038779497971, - "flos": 19682279925120.0, - "grad_norm": 2.1908603009914707, - "language_loss": 0.74912691, - "learning_rate": 3.252691519437143e-07, - "loss": 0.77036428, - "num_input_tokens_seen": 294666915, - "step": 13662, - "time_per_iteration": 2.70076322555542 - }, - { - "auxiliary_loss_clip": 0.01029455, - "auxiliary_loss_mlp": 0.01001299, - "balance_loss_clip": 1.00707769, - "balance_loss_mlp": 1.00035727, - "epoch": 0.8214640012024651, - "flos": 71602969697280.0, - "grad_norm": 0.7436789430956001, - "language_loss": 0.54036576, - "learning_rate": 3.250562869904825e-07, - "loss": 0.5606733, - "num_input_tokens_seen": 294731545, - "step": 13663, - "time_per_iteration": 3.2524144649505615 - }, - { - "auxiliary_loss_clip": 0.0106094, - "auxiliary_loss_mlp": 0.01032266, - "balance_loss_clip": 1.03105712, - "balance_loss_mlp": 1.02002287, - "epoch": 0.821524124455133, - "flos": 14757203531520.0, - "grad_norm": 2.109135364690857, - "language_loss": 0.65783775, - "learning_rate": 3.248434855512838e-07, - "loss": 0.67876983, - "num_input_tokens_seen": 294748745, - "step": 13664, - "time_per_iteration": 2.7579057216644287 - }, - { - "auxiliary_loss_clip": 0.01081895, - "auxiliary_loss_mlp": 0.0103047, - "balance_loss_clip": 1.03475428, - "balance_loss_mlp": 1.01932395, - "epoch": 0.821584247707801, - "flos": 25082274965760.0, - "grad_norm": 1.5036493569794076, - "language_loss": 0.75327474, - "learning_rate": 3.246307476341881e-07, - "loss": 0.77439839, - "num_input_tokens_seen": 294768955, - "step": 13665, - "time_per_iteration": 2.7124111652374268 - }, - { - "auxiliary_loss_clip": 0.01093989, - "auxiliary_loss_mlp": 0.00769563, - "balance_loss_clip": 1.03792393, - "balance_loss_mlp": 1.00023198, - "epoch": 0.8216443709604689, - "flos": 36830701710720.0, - "grad_norm": 2.32376999717277, - "language_loss": 0.65432054, - "learning_rate": 3.2441807324726256e-07, - "loss": 0.67295599, - "num_input_tokens_seen": 294789250, - "step": 13666, - "time_per_iteration": 2.7520713806152344 - }, - { - "auxiliary_loss_clip": 0.01059201, - "auxiliary_loss_mlp": 0.01030678, - "balance_loss_clip": 1.03574967, - "balance_loss_mlp": 1.01929307, - "epoch": 0.821704494213137, - "flos": 25081808088960.0, - "grad_norm": 1.6586859973993004, - "language_loss": 0.76773095, - "learning_rate": 3.2420546239857174e-07, - "loss": 0.78862977, - "num_input_tokens_seen": 294809760, - "step": 13667, - "time_per_iteration": 2.8164875507354736 - }, - { - "auxiliary_loss_clip": 0.01077218, - "auxiliary_loss_mlp": 0.01032735, - "balance_loss_clip": 1.03665185, - "balance_loss_mlp": 1.02043223, - "epoch": 0.8217646174658049, - "flos": 14356117290240.0, - "grad_norm": 1.9214564024977732, - "language_loss": 0.77153236, - "learning_rate": 3.239929150961773e-07, - "loss": 0.79263186, - "num_input_tokens_seen": 294826495, - "step": 13668, - "time_per_iteration": 2.795309066772461 - }, - { - "auxiliary_loss_clip": 0.0106108, - "auxiliary_loss_mlp": 0.01032359, - "balance_loss_clip": 1.03410029, - "balance_loss_mlp": 1.02047384, - "epoch": 0.8218247407184729, - "flos": 22090557139200.0, - "grad_norm": 2.232101782459693, - "language_loss": 0.7333163, - "learning_rate": 3.2378043134813984e-07, - "loss": 0.75425071, - "num_input_tokens_seen": 294845370, - "step": 13669, - "time_per_iteration": 2.733705520629883 - }, - { - "auxiliary_loss_clip": 0.01096991, - "auxiliary_loss_mlp": 0.01026791, - "balance_loss_clip": 1.03674257, - "balance_loss_mlp": 1.01509678, - "epoch": 0.8218848639711408, - "flos": 16764035368320.0, - "grad_norm": 1.5914876728736391, - "language_loss": 0.78921843, - "learning_rate": 3.235680111625161e-07, - "loss": 0.81045628, - "num_input_tokens_seen": 294863740, - "step": 13670, - "time_per_iteration": 2.632380723953247 - }, - { - "auxiliary_loss_clip": 0.01101033, - "auxiliary_loss_mlp": 0.01037163, - "balance_loss_clip": 1.03839719, - "balance_loss_mlp": 1.02437234, - "epoch": 0.8219449872238088, - "flos": 25994801007360.0, - "grad_norm": 1.7358038060221426, - "language_loss": 0.74638772, - "learning_rate": 3.2335565454736123e-07, - "loss": 0.76776969, - "num_input_tokens_seen": 294882815, - "step": 13671, - "time_per_iteration": 2.6536366939544678 - }, - { - "auxiliary_loss_clip": 0.01102103, - "auxiliary_loss_mlp": 0.0103, - "balance_loss_clip": 1.03765309, - "balance_loss_mlp": 1.0173583, - "epoch": 0.8220051104764767, - "flos": 20778094091520.0, - "grad_norm": 1.8480200060327416, - "language_loss": 0.76200128, - "learning_rate": 3.23143361510728e-07, - "loss": 0.78332233, - "num_input_tokens_seen": 294901985, - "step": 13672, - "time_per_iteration": 2.6287293434143066 - }, - { - "auxiliary_loss_clip": 0.0105776, - "auxiliary_loss_mlp": 0.01037446, - "balance_loss_clip": 1.03279448, - "balance_loss_mlp": 1.02387452, - "epoch": 0.8220652337291448, - "flos": 14574849160320.0, - "grad_norm": 2.155588749623656, - "language_loss": 0.74635303, - "learning_rate": 3.2293113206066733e-07, - "loss": 0.76730502, - "num_input_tokens_seen": 294919705, - "step": 13673, - "time_per_iteration": 5.964927911758423 - }, - { - "auxiliary_loss_clip": 0.01091542, - "auxiliary_loss_mlp": 0.01034095, - "balance_loss_clip": 1.03949618, - "balance_loss_mlp": 1.02133989, - "epoch": 0.8221253569818128, - "flos": 23805866194560.0, - "grad_norm": 1.8576069667953699, - "language_loss": 0.79360175, - "learning_rate": 3.227189662052254e-07, - "loss": 0.8148582, - "num_input_tokens_seen": 294939900, - "step": 13674, - "time_per_iteration": 2.711923599243164 - }, - { - "auxiliary_loss_clip": 0.01082091, - "auxiliary_loss_mlp": 0.01037891, - "balance_loss_clip": 1.03274429, - "balance_loss_mlp": 1.0257858, - "epoch": 0.8221854802344807, - "flos": 21288241002240.0, - "grad_norm": 2.0257881823597508, - "language_loss": 0.69993466, - "learning_rate": 3.225068639524484e-07, - "loss": 0.72113442, - "num_input_tokens_seen": 294959110, - "step": 13675, - "time_per_iteration": 4.205335378646851 - }, - { - "auxiliary_loss_clip": 0.01089922, - "auxiliary_loss_mlp": 0.01037385, - "balance_loss_clip": 1.03466141, - "balance_loss_mlp": 1.02468348, - "epoch": 0.8222456034871487, - "flos": 20956785275520.0, - "grad_norm": 1.6271888022504428, - "language_loss": 0.74166471, - "learning_rate": 3.2229482531037965e-07, - "loss": 0.76293778, - "num_input_tokens_seen": 294978660, - "step": 13676, - "time_per_iteration": 2.633631944656372 - }, - { - "auxiliary_loss_clip": 0.01081581, - "auxiliary_loss_mlp": 0.01032411, - "balance_loss_clip": 1.03602481, - "balance_loss_mlp": 1.02066302, - "epoch": 0.8223057267398166, - "flos": 21397517153280.0, - "grad_norm": 1.8848040435519355, - "language_loss": 0.80344379, - "learning_rate": 3.2208285028705893e-07, - "loss": 0.82458377, - "num_input_tokens_seen": 294998075, - "step": 13677, - "time_per_iteration": 2.715427875518799 - }, - { - "auxiliary_loss_clip": 0.01093784, - "auxiliary_loss_mlp": 0.01037139, - "balance_loss_clip": 1.03556919, - "balance_loss_mlp": 1.02450824, - "epoch": 0.8223658499924846, - "flos": 15268212368640.0, - "grad_norm": 2.296503126138382, - "language_loss": 0.70510441, - "learning_rate": 3.218709388905245e-07, - "loss": 0.72641361, - "num_input_tokens_seen": 295015950, - "step": 13678, - "time_per_iteration": 2.662177085876465 - }, - { - "auxiliary_loss_clip": 0.01107791, - "auxiliary_loss_mlp": 0.01034832, - "balance_loss_clip": 1.03623056, - "balance_loss_mlp": 1.02258909, - "epoch": 0.8224259732451525, - "flos": 31249537447680.0, - "grad_norm": 1.4830532789333675, - "language_loss": 0.71389025, - "learning_rate": 3.216590911288133e-07, - "loss": 0.73531646, - "num_input_tokens_seen": 295036800, - "step": 13679, - "time_per_iteration": 4.202351808547974 - }, - { - "auxiliary_loss_clip": 0.01079212, - "auxiliary_loss_mlp": 0.01033441, - "balance_loss_clip": 1.03329039, - "balance_loss_mlp": 1.02008915, - "epoch": 0.8224860964978206, - "flos": 21574628138880.0, - "grad_norm": 1.9740769073564464, - "language_loss": 0.70159578, - "learning_rate": 3.214473070099564e-07, - "loss": 0.72272229, - "num_input_tokens_seen": 295055300, - "step": 13680, - "time_per_iteration": 2.644590139389038 - }, - { - "auxiliary_loss_clip": 0.01075147, - "auxiliary_loss_mlp": 0.01029985, - "balance_loss_clip": 1.03547573, - "balance_loss_mlp": 1.01875556, - "epoch": 0.8225462197504885, - "flos": 25483217552640.0, - "grad_norm": 3.3850190908064164, - "language_loss": 0.59734452, - "learning_rate": 3.21235586541986e-07, - "loss": 0.61839581, - "num_input_tokens_seen": 295076420, - "step": 13681, - "time_per_iteration": 2.693240165710449 - }, - { - "auxiliary_loss_clip": 0.01084056, - "auxiliary_loss_mlp": 0.01038117, - "balance_loss_clip": 1.0347333, - "balance_loss_mlp": 1.02480125, - "epoch": 0.8226063430031565, - "flos": 39385458587520.0, - "grad_norm": 2.647199220979941, - "language_loss": 0.68972695, - "learning_rate": 3.2102392973293047e-07, - "loss": 0.71094871, - "num_input_tokens_seen": 295100540, - "step": 13682, - "time_per_iteration": 2.793362855911255 - }, - { - "auxiliary_loss_clip": 0.01109468, - "auxiliary_loss_mlp": 0.01030898, - "balance_loss_clip": 1.0367074, - "balance_loss_mlp": 1.01775503, - "epoch": 0.8226664662558244, - "flos": 22815269942400.0, - "grad_norm": 1.8560800713238335, - "language_loss": 0.79419553, - "learning_rate": 3.20812336590816e-07, - "loss": 0.81559926, - "num_input_tokens_seen": 295120180, - "step": 13683, - "time_per_iteration": 2.663804292678833 - }, - { - "auxiliary_loss_clip": 0.01104253, - "auxiliary_loss_mlp": 0.01029993, - "balance_loss_clip": 1.03593254, - "balance_loss_mlp": 1.01891863, - "epoch": 0.8227265895084924, - "flos": 25665607837440.0, - "grad_norm": 1.9656579535514493, - "language_loss": 0.86604738, - "learning_rate": 3.206008071236661e-07, - "loss": 0.88738984, - "num_input_tokens_seen": 295138530, - "step": 13684, - "time_per_iteration": 2.6015169620513916 - }, - { - "auxiliary_loss_clip": 0.01104335, - "auxiliary_loss_mlp": 0.01029249, - "balance_loss_clip": 1.03555918, - "balance_loss_mlp": 1.01763213, - "epoch": 0.8227867127611603, - "flos": 26179274280960.0, - "grad_norm": 1.536812487486819, - "language_loss": 0.79920459, - "learning_rate": 3.2038934133950157e-07, - "loss": 0.82054043, - "num_input_tokens_seen": 295160260, - "step": 13685, - "time_per_iteration": 2.7008142471313477 - }, - { - "auxiliary_loss_clip": 0.0107249, - "auxiliary_loss_mlp": 0.01030486, - "balance_loss_clip": 1.03493214, - "balance_loss_mlp": 1.01813579, - "epoch": 0.8228468360138284, - "flos": 22018053536640.0, - "grad_norm": 1.6748443436475502, - "language_loss": 0.68744385, - "learning_rate": 3.2017793924634194e-07, - "loss": 0.70847368, - "num_input_tokens_seen": 295177055, - "step": 13686, - "time_per_iteration": 2.7271742820739746 - }, - { - "auxiliary_loss_clip": 0.01076871, - "auxiliary_loss_mlp": 0.01032996, - "balance_loss_clip": 1.03525162, - "balance_loss_mlp": 1.02082491, - "epoch": 0.8229069592664963, - "flos": 14903359971840.0, - "grad_norm": 1.8146731165016403, - "language_loss": 0.77963513, - "learning_rate": 3.1996660085220263e-07, - "loss": 0.80073375, - "num_input_tokens_seen": 295193870, - "step": 13687, - "time_per_iteration": 2.6741888523101807 - }, - { - "auxiliary_loss_clip": 0.01097929, - "auxiliary_loss_mlp": 0.01030475, - "balance_loss_clip": 1.03655159, - "balance_loss_mlp": 1.01794028, - "epoch": 0.8229670825191643, - "flos": 15669478177920.0, - "grad_norm": 1.7147177179883277, - "language_loss": 0.72279108, - "learning_rate": 3.1975532616509825e-07, - "loss": 0.74407512, - "num_input_tokens_seen": 295211040, - "step": 13688, - "time_per_iteration": 2.583867311477661 - }, - { - "auxiliary_loss_clip": 0.01108409, - "auxiliary_loss_mlp": 0.00769781, - "balance_loss_clip": 1.03682184, - "balance_loss_mlp": 1.0001483, - "epoch": 0.8230272057718323, - "flos": 23183498217600.0, - "grad_norm": 2.234271170897282, - "language_loss": 0.73181629, - "learning_rate": 3.1954411519304025e-07, - "loss": 0.75059819, - "num_input_tokens_seen": 295231300, - "step": 13689, - "time_per_iteration": 2.718895673751831 - }, - { - "auxiliary_loss_clip": 0.01098539, - "auxiliary_loss_mlp": 0.01031163, - "balance_loss_clip": 1.0351994, - "balance_loss_mlp": 1.0188967, - "epoch": 0.8230873290245002, - "flos": 21032413361280.0, - "grad_norm": 3.545814626026256, - "language_loss": 0.69253677, - "learning_rate": 3.1933296794403887e-07, - "loss": 0.71383381, - "num_input_tokens_seen": 295251045, - "step": 13690, - "time_per_iteration": 2.6642231941223145 - }, - { - "auxiliary_loss_clip": 0.01062899, - "auxiliary_loss_mlp": 0.01041098, - "balance_loss_clip": 1.03263807, - "balance_loss_mlp": 1.02722192, - "epoch": 0.8231474522771682, - "flos": 21250139650560.0, - "grad_norm": 1.8299845733517255, - "language_loss": 0.85268778, - "learning_rate": 3.191218844260988e-07, - "loss": 0.87372774, - "num_input_tokens_seen": 295270225, - "step": 13691, - "time_per_iteration": 2.7507143020629883 - }, - { - "auxiliary_loss_clip": 0.01101229, - "auxiliary_loss_mlp": 0.01034591, - "balance_loss_clip": 1.03781307, - "balance_loss_mlp": 1.02287257, - "epoch": 0.8232075755298361, - "flos": 23842028211840.0, - "grad_norm": 1.8079890688492317, - "language_loss": 0.77103651, - "learning_rate": 3.189108646472252e-07, - "loss": 0.79239464, - "num_input_tokens_seen": 295288950, - "step": 13692, - "time_per_iteration": 2.67478084564209 - }, - { - "auxiliary_loss_clip": 0.01096284, - "auxiliary_loss_mlp": 0.01027159, - "balance_loss_clip": 1.03692162, - "balance_loss_mlp": 1.0151006, - "epoch": 0.8232676987825042, - "flos": 21653955325440.0, - "grad_norm": 1.722595052749625, - "language_loss": 0.71423566, - "learning_rate": 3.186999086154205e-07, - "loss": 0.73547006, - "num_input_tokens_seen": 295309405, - "step": 13693, - "time_per_iteration": 2.718867301940918 - }, - { - "auxiliary_loss_clip": 0.01070842, - "auxiliary_loss_mlp": 0.0102981, - "balance_loss_clip": 1.03349066, - "balance_loss_mlp": 1.01865232, - "epoch": 0.8233278220351721, - "flos": 26322701287680.0, - "grad_norm": 1.3395802259030574, - "language_loss": 0.83745861, - "learning_rate": 3.1848901633868355e-07, - "loss": 0.85846514, - "num_input_tokens_seen": 295331115, - "step": 13694, - "time_per_iteration": 2.7664167881011963 - }, - { - "auxiliary_loss_clip": 0.0104721, - "auxiliary_loss_mlp": 0.01032456, - "balance_loss_clip": 1.03542459, - "balance_loss_mlp": 1.0194205, - "epoch": 0.8233879452878401, - "flos": 21725812483200.0, - "grad_norm": 1.774536934152641, - "language_loss": 0.76836276, - "learning_rate": 3.182781878250118e-07, - "loss": 0.78915936, - "num_input_tokens_seen": 295350495, - "step": 13695, - "time_per_iteration": 2.750267744064331 - }, - { - "auxiliary_loss_clip": 0.01087721, - "auxiliary_loss_mlp": 0.01034204, - "balance_loss_clip": 1.03655171, - "balance_loss_mlp": 1.02215171, - "epoch": 0.823448068540508, - "flos": 20557746109440.0, - "grad_norm": 1.7975071163239338, - "language_loss": 0.80965418, - "learning_rate": 3.1806742308239985e-07, - "loss": 0.83087343, - "num_input_tokens_seen": 295368225, - "step": 13696, - "time_per_iteration": 2.6955337524414062 - }, - { - "auxiliary_loss_clip": 0.01020282, - "auxiliary_loss_mlp": 0.0100384, - "balance_loss_clip": 1.00769222, - "balance_loss_mlp": 1.00285649, - "epoch": 0.823508191793176, - "flos": 67273688194560.0, - "grad_norm": 0.7350797292935349, - "language_loss": 0.63867533, - "learning_rate": 3.178567221188393e-07, - "loss": 0.65891653, - "num_input_tokens_seen": 295430035, - "step": 13697, - "time_per_iteration": 3.2243242263793945 - }, - { - "auxiliary_loss_clip": 0.01070899, - "auxiliary_loss_mlp": 0.01025732, - "balance_loss_clip": 1.03476644, - "balance_loss_mlp": 1.01477075, - "epoch": 0.8235683150458439, - "flos": 17928402641280.0, - "grad_norm": 1.6913547769566408, - "language_loss": 0.72991723, - "learning_rate": 3.1764608494232037e-07, - "loss": 0.75088358, - "num_input_tokens_seen": 295447765, - "step": 13698, - "time_per_iteration": 2.670644998550415 - }, - { - "auxiliary_loss_clip": 0.01063119, - "auxiliary_loss_mlp": 0.01047662, - "balance_loss_clip": 1.03002477, - "balance_loss_mlp": 1.03214049, - "epoch": 0.823628438298512, - "flos": 18916089891840.0, - "grad_norm": 1.861601543372515, - "language_loss": 0.71800578, - "learning_rate": 3.174355115608305e-07, - "loss": 0.73911357, - "num_input_tokens_seen": 295464810, - "step": 13699, - "time_per_iteration": 2.7969279289245605 - }, - { - "auxiliary_loss_clip": 0.01086761, - "auxiliary_loss_mlp": 0.01028632, - "balance_loss_clip": 1.03632307, - "balance_loss_mlp": 1.01650214, - "epoch": 0.8236885615511799, - "flos": 18696460181760.0, - "grad_norm": 1.9855299133733353, - "language_loss": 0.8196975, - "learning_rate": 3.1722500198235526e-07, - "loss": 0.84085149, - "num_input_tokens_seen": 295482605, - "step": 13700, - "time_per_iteration": 2.6503469944000244 - }, - { - "auxiliary_loss_clip": 0.01086133, - "auxiliary_loss_mlp": 0.01035546, - "balance_loss_clip": 1.03498542, - "balance_loss_mlp": 1.02366662, - "epoch": 0.8237486848038479, - "flos": 23695009845120.0, - "grad_norm": 1.6635741154144412, - "language_loss": 0.73422629, - "learning_rate": 3.170145562148763e-07, - "loss": 0.7554431, - "num_input_tokens_seen": 295503780, - "step": 13701, - "time_per_iteration": 2.6358823776245117 - }, - { - "auxiliary_loss_clip": 0.01097849, - "auxiliary_loss_mlp": 0.01036965, - "balance_loss_clip": 1.03509569, - "balance_loss_mlp": 1.02432895, - "epoch": 0.8238088080565159, - "flos": 23441301106560.0, - "grad_norm": 1.9462768217086985, - "language_loss": 0.69265807, - "learning_rate": 3.1680417426637384e-07, - "loss": 0.71400625, - "num_input_tokens_seen": 295522035, - "step": 13702, - "time_per_iteration": 2.60188627243042 - }, - { - "auxiliary_loss_clip": 0.01063324, - "auxiliary_loss_mlp": 0.01034265, - "balance_loss_clip": 1.03598332, - "balance_loss_mlp": 1.02128875, - "epoch": 0.8238689313091838, - "flos": 22746537267840.0, - "grad_norm": 1.9923897807991633, - "language_loss": 0.75280106, - "learning_rate": 3.1659385614482603e-07, - "loss": 0.77377695, - "num_input_tokens_seen": 295541190, - "step": 13703, - "time_per_iteration": 2.7468554973602295 - }, - { - "auxiliary_loss_clip": 0.01113854, - "auxiliary_loss_mlp": 0.01037893, - "balance_loss_clip": 1.03847456, - "balance_loss_mlp": 1.02467299, - "epoch": 0.8239290545618518, - "flos": 25630092264960.0, - "grad_norm": 1.7182748421567742, - "language_loss": 0.69657588, - "learning_rate": 3.1638360185820755e-07, - "loss": 0.71809334, - "num_input_tokens_seen": 295558860, - "step": 13704, - "time_per_iteration": 2.5931785106658936 - }, - { - "auxiliary_loss_clip": 0.01105612, - "auxiliary_loss_mlp": 0.01030565, - "balance_loss_clip": 1.03566051, - "balance_loss_mlp": 1.01859665, - "epoch": 0.8239891778145197, - "flos": 26026473824640.0, - "grad_norm": 1.8447020844215793, - "language_loss": 0.64444757, - "learning_rate": 3.161734114144916e-07, - "loss": 0.66580933, - "num_input_tokens_seen": 295578155, - "step": 13705, - "time_per_iteration": 2.5968048572540283 - }, - { - "auxiliary_loss_clip": 0.01110492, - "auxiliary_loss_mlp": 0.01031144, - "balance_loss_clip": 1.03668666, - "balance_loss_mlp": 1.01796532, - "epoch": 0.8240493010671878, - "flos": 21833257040640.0, - "grad_norm": 1.541851656815521, - "language_loss": 0.69572484, - "learning_rate": 3.1596328482164915e-07, - "loss": 0.71714121, - "num_input_tokens_seen": 295599170, - "step": 13706, - "time_per_iteration": 2.5887668132781982 - }, - { - "auxiliary_loss_clip": 0.0108328, - "auxiliary_loss_mlp": 0.01039333, - "balance_loss_clip": 1.03719616, - "balance_loss_mlp": 1.02601147, - "epoch": 0.8241094243198557, - "flos": 18551919853440.0, - "grad_norm": 1.661218457463816, - "language_loss": 0.69479191, - "learning_rate": 3.157532220876475e-07, - "loss": 0.71601802, - "num_input_tokens_seen": 295617465, - "step": 13707, - "time_per_iteration": 2.6411385536193848 - }, - { - "auxiliary_loss_clip": 0.01072958, - "auxiliary_loss_mlp": 0.01038335, - "balance_loss_clip": 1.03431034, - "balance_loss_mlp": 1.0244596, - "epoch": 0.8241695475725237, - "flos": 25447163276160.0, - "grad_norm": 1.8467879085994943, - "language_loss": 0.79235733, - "learning_rate": 3.1554322322045226e-07, - "loss": 0.81347024, - "num_input_tokens_seen": 295634960, - "step": 13708, - "time_per_iteration": 2.700183153152466 - }, - { - "auxiliary_loss_clip": 0.01092221, - "auxiliary_loss_mlp": 0.0103148, - "balance_loss_clip": 1.0341289, - "balance_loss_mlp": 1.01864731, - "epoch": 0.8242296708251916, - "flos": 18989670902400.0, - "grad_norm": 3.0430641807268954, - "language_loss": 0.68361056, - "learning_rate": 3.1533328822802664e-07, - "loss": 0.70484757, - "num_input_tokens_seen": 295652725, - "step": 13709, - "time_per_iteration": 2.5937395095825195 - }, - { - "auxiliary_loss_clip": 0.01065101, - "auxiliary_loss_mlp": 0.01032868, - "balance_loss_clip": 1.03181398, - "balance_loss_mlp": 1.02109027, - "epoch": 0.8242897940778596, - "flos": 22600883617920.0, - "grad_norm": 1.766284405655816, - "language_loss": 0.82331645, - "learning_rate": 3.151234171183319e-07, - "loss": 0.84429616, - "num_input_tokens_seen": 295671195, - "step": 13710, - "time_per_iteration": 2.749650239944458 - }, - { - "auxiliary_loss_clip": 0.01096973, - "auxiliary_loss_mlp": 0.01034028, - "balance_loss_clip": 1.03629923, - "balance_loss_mlp": 1.02127314, - "epoch": 0.8243499173305275, - "flos": 21468153248640.0, - "grad_norm": 13.701839105359984, - "language_loss": 0.78112018, - "learning_rate": 3.149136098993257e-07, - "loss": 0.80243027, - "num_input_tokens_seen": 295689130, - "step": 13711, - "time_per_iteration": 2.7447783946990967 - }, - { - "auxiliary_loss_clip": 0.0107344, - "auxiliary_loss_mlp": 0.01029912, - "balance_loss_clip": 1.03311896, - "balance_loss_mlp": 1.01736498, - "epoch": 0.8244100405831956, - "flos": 20010359773440.0, - "grad_norm": 3.3468765947444568, - "language_loss": 0.65435582, - "learning_rate": 3.1470386657896473e-07, - "loss": 0.67538929, - "num_input_tokens_seen": 295706385, - "step": 13712, - "time_per_iteration": 4.317276477813721 - }, - { - "auxiliary_loss_clip": 0.01091569, - "auxiliary_loss_mlp": 0.01029045, - "balance_loss_clip": 1.03673708, - "balance_loss_mlp": 1.0174818, - "epoch": 0.8244701638358635, - "flos": 26430684549120.0, - "grad_norm": 1.8364742562696106, - "language_loss": 0.74371034, - "learning_rate": 3.14494187165202e-07, - "loss": 0.76491648, - "num_input_tokens_seen": 295727925, - "step": 13713, - "time_per_iteration": 4.166277647018433 - }, - { - "auxiliary_loss_clip": 0.01096875, - "auxiliary_loss_mlp": 0.01027842, - "balance_loss_clip": 1.03534007, - "balance_loss_mlp": 1.01558685, - "epoch": 0.8245302870885315, - "flos": 17640004343040.0, - "grad_norm": 6.838551643078677, - "language_loss": 0.80911207, - "learning_rate": 3.1428457166598833e-07, - "loss": 0.83035922, - "num_input_tokens_seen": 295744420, - "step": 13714, - "time_per_iteration": 2.624154806137085 - }, - { - "auxiliary_loss_clip": 0.01099074, - "auxiliary_loss_mlp": 0.01034976, - "balance_loss_clip": 1.03917253, - "balance_loss_mlp": 1.02173758, - "epoch": 0.8245904103411995, - "flos": 26209510554240.0, - "grad_norm": 1.9766045334359852, - "language_loss": 0.66371924, - "learning_rate": 3.1407502008927235e-07, - "loss": 0.68505979, - "num_input_tokens_seen": 295765105, - "step": 13715, - "time_per_iteration": 4.212578296661377 - }, - { - "auxiliary_loss_clip": 0.01081096, - "auxiliary_loss_mlp": 0.01029311, - "balance_loss_clip": 1.03784251, - "balance_loss_mlp": 1.01657939, - "epoch": 0.8246505335938674, - "flos": 24205084928640.0, - "grad_norm": 2.0424767412149567, - "language_loss": 0.74730164, - "learning_rate": 3.1386553244300086e-07, - "loss": 0.76840568, - "num_input_tokens_seen": 295784200, - "step": 13716, - "time_per_iteration": 2.7325594425201416 - }, - { - "auxiliary_loss_clip": 0.00991112, - "auxiliary_loss_mlp": 0.0100064, - "balance_loss_clip": 1.00916934, - "balance_loss_mlp": 0.99952489, - "epoch": 0.8247106568465354, - "flos": 67092195749760.0, - "grad_norm": 0.7138774267720784, - "language_loss": 0.58973479, - "learning_rate": 3.136561087351175e-07, - "loss": 0.60965228, - "num_input_tokens_seen": 295846555, - "step": 13717, - "time_per_iteration": 4.931637763977051 - }, - { - "auxiliary_loss_clip": 0.01094759, - "auxiliary_loss_mlp": 0.00770088, - "balance_loss_clip": 1.03633809, - "balance_loss_mlp": 1.00021517, - "epoch": 0.8247707800992033, - "flos": 12568232805120.0, - "grad_norm": 1.8911400591103953, - "language_loss": 0.79565227, - "learning_rate": 3.1344674897356373e-07, - "loss": 0.81430078, - "num_input_tokens_seen": 295863425, - "step": 13718, - "time_per_iteration": 2.6436800956726074 - }, - { - "auxiliary_loss_clip": 0.01088621, - "auxiliary_loss_mlp": 0.01033203, - "balance_loss_clip": 1.03615391, - "balance_loss_mlp": 1.02111554, - "epoch": 0.8248309033518714, - "flos": 15923617879680.0, - "grad_norm": 1.5520316842938593, - "language_loss": 0.68703258, - "learning_rate": 3.132374531662778e-07, - "loss": 0.70825082, - "num_input_tokens_seen": 295880925, - "step": 13719, - "time_per_iteration": 2.716325044631958 - }, - { - "auxiliary_loss_clip": 0.01079067, - "auxiliary_loss_mlp": 0.01034824, - "balance_loss_clip": 1.03340077, - "balance_loss_mlp": 1.0202682, - "epoch": 0.8248910266045393, - "flos": 17564735393280.0, - "grad_norm": 2.6589956640079038, - "language_loss": 0.70158517, - "learning_rate": 3.13028221321197e-07, - "loss": 0.72272408, - "num_input_tokens_seen": 295898205, - "step": 13720, - "time_per_iteration": 2.5896477699279785 - }, - { - "auxiliary_loss_clip": 0.01033476, - "auxiliary_loss_mlp": 0.01024097, - "balance_loss_clip": 1.03508949, - "balance_loss_mlp": 1.01189578, - "epoch": 0.8249511498572073, - "flos": 28619655275520.0, - "grad_norm": 1.5927327922033778, - "language_loss": 0.75676763, - "learning_rate": 3.1281905344625467e-07, - "loss": 0.77734333, - "num_input_tokens_seen": 295918130, - "step": 13721, - "time_per_iteration": 3.003366470336914 - }, - { - "auxiliary_loss_clip": 0.01064431, - "auxiliary_loss_mlp": 0.0102768, - "balance_loss_clip": 1.04172993, - "balance_loss_mlp": 1.01569343, - "epoch": 0.8250112731098752, - "flos": 25556583081600.0, - "grad_norm": 1.9277434065767896, - "language_loss": 0.7792846, - "learning_rate": 3.1260994954938305e-07, - "loss": 0.80020571, - "num_input_tokens_seen": 295937760, - "step": 13722, - "time_per_iteration": 2.993467092514038 - }, - { - "auxiliary_loss_clip": 0.01107933, - "auxiliary_loss_mlp": 0.01030614, - "balance_loss_clip": 1.03832984, - "balance_loss_mlp": 1.01868153, - "epoch": 0.8250713963625432, - "flos": 27746164339200.0, - "grad_norm": 1.9336689467836483, - "language_loss": 0.63077027, - "learning_rate": 3.1240090963851205e-07, - "loss": 0.6521557, - "num_input_tokens_seen": 295957585, - "step": 13723, - "time_per_iteration": 2.65627384185791 - }, - { - "auxiliary_loss_clip": 0.01109221, - "auxiliary_loss_mlp": 0.0103494, - "balance_loss_clip": 1.03650689, - "balance_loss_mlp": 1.0223273, - "epoch": 0.8251315196152111, - "flos": 21610610588160.0, - "grad_norm": 1.425967776015011, - "language_loss": 0.74256718, - "learning_rate": 3.121919337215666e-07, - "loss": 0.76400876, - "num_input_tokens_seen": 295977135, - "step": 13724, - "time_per_iteration": 2.6450181007385254 - }, - { - "auxiliary_loss_clip": 0.01076005, - "auxiliary_loss_mlp": 0.01035593, - "balance_loss_clip": 1.03590727, - "balance_loss_mlp": 1.02253342, - "epoch": 0.8251916428678792, - "flos": 28579363194240.0, - "grad_norm": 1.8109135586659708, - "language_loss": 0.6419245, - "learning_rate": 3.1198302180647253e-07, - "loss": 0.66304046, - "num_input_tokens_seen": 295996265, - "step": 13725, - "time_per_iteration": 2.747354507446289 - }, - { - "auxiliary_loss_clip": 0.01081699, - "auxiliary_loss_mlp": 0.01029734, - "balance_loss_clip": 1.03467178, - "balance_loss_mlp": 1.01717496, - "epoch": 0.8252517661205471, - "flos": 23075191733760.0, - "grad_norm": 1.5423551170824084, - "language_loss": 0.81953287, - "learning_rate": 3.1177417390115125e-07, - "loss": 0.84064722, - "num_input_tokens_seen": 296014745, - "step": 13726, - "time_per_iteration": 2.677957057952881 - }, - { - "auxiliary_loss_clip": 0.01090181, - "auxiliary_loss_mlp": 0.01033897, - "balance_loss_clip": 1.03259659, - "balance_loss_mlp": 1.02245855, - "epoch": 0.8253118893732151, - "flos": 31759576617600.0, - "grad_norm": 1.6832694134847563, - "language_loss": 0.70317417, - "learning_rate": 3.1156539001352286e-07, - "loss": 0.72441494, - "num_input_tokens_seen": 296036960, - "step": 13727, - "time_per_iteration": 2.6928937435150146 - }, - { - "auxiliary_loss_clip": 0.01102136, - "auxiliary_loss_mlp": 0.01028405, - "balance_loss_clip": 1.03817558, - "balance_loss_mlp": 1.01547694, - "epoch": 0.8253720126258831, - "flos": 18296415434880.0, - "grad_norm": 1.667834208410725, - "language_loss": 0.62520349, - "learning_rate": 3.113566701515036e-07, - "loss": 0.64650893, - "num_input_tokens_seen": 296056540, - "step": 13728, - "time_per_iteration": 2.6370222568511963 - }, - { - "auxiliary_loss_clip": 0.01092032, - "auxiliary_loss_mlp": 0.01029928, - "balance_loss_clip": 1.03751087, - "balance_loss_mlp": 1.0174228, - "epoch": 0.825432135878551, - "flos": 26797332625920.0, - "grad_norm": 1.9482709923382855, - "language_loss": 0.71667683, - "learning_rate": 3.111480143230092e-07, - "loss": 0.73789644, - "num_input_tokens_seen": 296077950, - "step": 13729, - "time_per_iteration": 2.6492090225219727 - }, - { - "auxiliary_loss_clip": 0.01014426, - "auxiliary_loss_mlp": 0.0100436, - "balance_loss_clip": 1.01090586, - "balance_loss_mlp": 1.00330532, - "epoch": 0.825492259131219, - "flos": 54219116217600.0, - "grad_norm": 0.8488116722729574, - "language_loss": 0.6264025, - "learning_rate": 3.109394225359514e-07, - "loss": 0.64659035, - "num_input_tokens_seen": 296127060, - "step": 13730, - "time_per_iteration": 3.0054545402526855 - }, - { - "auxiliary_loss_clip": 0.01058894, - "auxiliary_loss_mlp": 0.01034161, - "balance_loss_clip": 1.03521633, - "balance_loss_mlp": 1.02156639, - "epoch": 0.825552382383887, - "flos": 43756145493120.0, - "grad_norm": 7.4365130225127505, - "language_loss": 0.6353327, - "learning_rate": 3.1073089479823945e-07, - "loss": 0.65626323, - "num_input_tokens_seen": 296147775, - "step": 13731, - "time_per_iteration": 2.9331674575805664 - }, - { - "auxiliary_loss_clip": 0.0107139, - "auxiliary_loss_mlp": 0.00773278, - "balance_loss_clip": 1.03046966, - "balance_loss_mlp": 1.0002327, - "epoch": 0.825612505636555, - "flos": 12602814624000.0, - "grad_norm": 2.180240651143821, - "language_loss": 0.70295024, - "learning_rate": 3.105224311177812e-07, - "loss": 0.72139692, - "num_input_tokens_seen": 296163560, - "step": 13732, - "time_per_iteration": 2.765413761138916 - }, - { - "auxiliary_loss_clip": 0.01100354, - "auxiliary_loss_mlp": 0.01038249, - "balance_loss_clip": 1.03632462, - "balance_loss_mlp": 1.02532113, - "epoch": 0.8256726288892229, - "flos": 17595618111360.0, - "grad_norm": 2.287080193464761, - "language_loss": 0.71307957, - "learning_rate": 3.103140315024817e-07, - "loss": 0.7344656, - "num_input_tokens_seen": 296178730, - "step": 13733, - "time_per_iteration": 2.663184642791748 - }, - { - "auxiliary_loss_clip": 0.01106421, - "auxiliary_loss_mlp": 0.01033457, - "balance_loss_clip": 1.03536689, - "balance_loss_mlp": 1.02092218, - "epoch": 0.8257327521418909, - "flos": 23805794367360.0, - "grad_norm": 1.5370953364737692, - "language_loss": 0.82361829, - "learning_rate": 3.1010569596024437e-07, - "loss": 0.84501708, - "num_input_tokens_seen": 296200175, - "step": 13734, - "time_per_iteration": 2.5860283374786377 - }, - { - "auxiliary_loss_clip": 0.01078022, - "auxiliary_loss_mlp": 0.01033879, - "balance_loss_clip": 1.03394449, - "balance_loss_mlp": 1.02108788, - "epoch": 0.8257928753945588, - "flos": 19281121856640.0, - "grad_norm": 1.767732379268741, - "language_loss": 0.8304292, - "learning_rate": 3.098974244989676e-07, - "loss": 0.85154831, - "num_input_tokens_seen": 296219305, - "step": 13735, - "time_per_iteration": 2.6341776847839355 - }, - { - "auxiliary_loss_clip": 0.01103224, - "auxiliary_loss_mlp": 0.01029169, - "balance_loss_clip": 1.03989172, - "balance_loss_mlp": 1.01795721, - "epoch": 0.8258529986472268, - "flos": 18478841633280.0, - "grad_norm": 1.736444707629355, - "language_loss": 0.70653635, - "learning_rate": 3.096892171265497e-07, - "loss": 0.72786027, - "num_input_tokens_seen": 296236945, - "step": 13736, - "time_per_iteration": 2.5950427055358887 - }, - { - "auxiliary_loss_clip": 0.01021603, - "auxiliary_loss_mlp": 0.01002911, - "balance_loss_clip": 1.00879157, - "balance_loss_mlp": 1.00194514, - "epoch": 0.8259131218998947, - "flos": 62137957512960.0, - "grad_norm": 0.8987273381116809, - "language_loss": 0.6798467, - "learning_rate": 3.0948107385088665e-07, - "loss": 0.70009184, - "num_input_tokens_seen": 296294685, - "step": 13737, - "time_per_iteration": 3.1607825756073 - }, - { - "auxiliary_loss_clip": 0.01084099, - "auxiliary_loss_mlp": 0.010326, - "balance_loss_clip": 1.0343399, - "balance_loss_mlp": 1.02032113, - "epoch": 0.8259732451525628, - "flos": 22159038418560.0, - "grad_norm": 1.7543830364671171, - "language_loss": 0.69818115, - "learning_rate": 3.0927299467987e-07, - "loss": 0.71934807, - "num_input_tokens_seen": 296314790, - "step": 13738, - "time_per_iteration": 2.715946912765503 - }, - { - "auxiliary_loss_clip": 0.01092604, - "auxiliary_loss_mlp": 0.01029077, - "balance_loss_clip": 1.03914821, - "balance_loss_mlp": 1.01492107, - "epoch": 0.8260333684052307, - "flos": 38361645233280.0, - "grad_norm": 1.9809949104241253, - "language_loss": 0.63092321, - "learning_rate": 3.090649796213911e-07, - "loss": 0.65214008, - "num_input_tokens_seen": 296335355, - "step": 13739, - "time_per_iteration": 2.8820793628692627 - }, - { - "auxiliary_loss_clip": 0.01011074, - "auxiliary_loss_mlp": 0.01000594, - "balance_loss_clip": 1.00743914, - "balance_loss_mlp": 0.99961609, - "epoch": 0.8260934916578987, - "flos": 62185611882240.0, - "grad_norm": 0.8059815006501098, - "language_loss": 0.59246588, - "learning_rate": 3.0885702868333853e-07, - "loss": 0.61258256, - "num_input_tokens_seen": 296399885, - "step": 13740, - "time_per_iteration": 3.2520594596862793 - }, - { - "auxiliary_loss_clip": 0.01114893, - "auxiliary_loss_mlp": 0.01034041, - "balance_loss_clip": 1.03906655, - "balance_loss_mlp": 1.02052891, - "epoch": 0.8261536149105667, - "flos": 22565475786240.0, - "grad_norm": 2.0240971997235317, - "language_loss": 0.75221682, - "learning_rate": 3.086491418735959e-07, - "loss": 0.7737062, - "num_input_tokens_seen": 296417660, - "step": 13741, - "time_per_iteration": 2.543391704559326 - }, - { - "auxiliary_loss_clip": 0.01096486, - "auxiliary_loss_mlp": 0.01034655, - "balance_loss_clip": 1.03584099, - "balance_loss_mlp": 1.02222109, - "epoch": 0.8262137381632346, - "flos": 32525479342080.0, - "grad_norm": 1.8715316592875402, - "language_loss": 0.62344342, - "learning_rate": 3.0844131920004726e-07, - "loss": 0.64475489, - "num_input_tokens_seen": 296438255, - "step": 13742, - "time_per_iteration": 2.7066636085510254 - }, - { - "auxiliary_loss_clip": 0.01066607, - "auxiliary_loss_mlp": 0.01036357, - "balance_loss_clip": 1.03625488, - "balance_loss_mlp": 1.02224827, - "epoch": 0.8262738614159026, - "flos": 14136451666560.0, - "grad_norm": 2.739309614101712, - "language_loss": 0.65881348, - "learning_rate": 3.0823356067057327e-07, - "loss": 0.67984313, - "num_input_tokens_seen": 296454485, - "step": 13743, - "time_per_iteration": 2.722188949584961 - }, - { - "auxiliary_loss_clip": 0.01089117, - "auxiliary_loss_mlp": 0.01035575, - "balance_loss_clip": 1.0356648, - "balance_loss_mlp": 1.02275968, - "epoch": 0.8263339846685706, - "flos": 19825347795840.0, - "grad_norm": 1.7892755960798923, - "language_loss": 0.66778719, - "learning_rate": 3.0802586629305283e-07, - "loss": 0.6890341, - "num_input_tokens_seen": 296473740, - "step": 13744, - "time_per_iteration": 2.632858991622925 - }, - { - "auxiliary_loss_clip": 0.01077178, - "auxiliary_loss_mlp": 0.01032278, - "balance_loss_clip": 1.03721189, - "balance_loss_mlp": 1.02044034, - "epoch": 0.8263941079212386, - "flos": 22745962650240.0, - "grad_norm": 1.8023826175749642, - "language_loss": 0.75316632, - "learning_rate": 3.078182360753612e-07, - "loss": 0.77426088, - "num_input_tokens_seen": 296493355, - "step": 13745, - "time_per_iteration": 2.7315781116485596 - }, - { - "auxiliary_loss_clip": 0.01077899, - "auxiliary_loss_mlp": 0.0077187, - "balance_loss_clip": 1.03393078, - "balance_loss_mlp": 1.00011253, - "epoch": 0.8264542311739065, - "flos": 20120641505280.0, - "grad_norm": 1.8014211048676299, - "language_loss": 0.79279208, - "learning_rate": 3.076106700253709e-07, - "loss": 0.81128979, - "num_input_tokens_seen": 296510520, - "step": 13746, - "time_per_iteration": 2.6316795349121094 - }, - { - "auxiliary_loss_clip": 0.01103647, - "auxiliary_loss_mlp": 0.01036543, - "balance_loss_clip": 1.03922772, - "balance_loss_mlp": 1.02318525, - "epoch": 0.8265143544265745, - "flos": 16837149502080.0, - "grad_norm": 1.8721646210593863, - "language_loss": 0.68316424, - "learning_rate": 3.0740316815095415e-07, - "loss": 0.70456612, - "num_input_tokens_seen": 296528265, - "step": 13747, - "time_per_iteration": 2.586827039718628 - }, - { - "auxiliary_loss_clip": 0.0109475, - "auxiliary_loss_mlp": 0.01037445, - "balance_loss_clip": 1.03468108, - "balance_loss_mlp": 1.02315235, - "epoch": 0.8265744776792424, - "flos": 22018592240640.0, - "grad_norm": 1.994737585930927, - "language_loss": 0.75182354, - "learning_rate": 3.0719573045997835e-07, - "loss": 0.77314556, - "num_input_tokens_seen": 296547810, - "step": 13748, - "time_per_iteration": 2.650148868560791 - }, - { - "auxiliary_loss_clip": 0.01071464, - "auxiliary_loss_mlp": 0.01033138, - "balance_loss_clip": 1.03686166, - "balance_loss_mlp": 1.02170539, - "epoch": 0.8266346009319104, - "flos": 19244852098560.0, - "grad_norm": 1.689203762569125, - "language_loss": 0.64030862, - "learning_rate": 3.069883569603102e-07, - "loss": 0.6613546, - "num_input_tokens_seen": 296565940, - "step": 13749, - "time_per_iteration": 2.757077217102051 - }, - { - "auxiliary_loss_clip": 0.01082519, - "auxiliary_loss_mlp": 0.01028885, - "balance_loss_clip": 1.03196669, - "balance_loss_mlp": 1.01680279, - "epoch": 0.8266947241845783, - "flos": 24166768095360.0, - "grad_norm": 1.5570975728465015, - "language_loss": 0.73744154, - "learning_rate": 3.067810476598132e-07, - "loss": 0.75855553, - "num_input_tokens_seen": 296585090, - "step": 13750, - "time_per_iteration": 2.714416742324829 - }, - { - "auxiliary_loss_clip": 0.01099886, - "auxiliary_loss_mlp": 0.01035041, - "balance_loss_clip": 1.03848624, - "balance_loss_mlp": 1.02245283, - "epoch": 0.8267548474372464, - "flos": 21105814803840.0, - "grad_norm": 1.7884167706589897, - "language_loss": 0.65513742, - "learning_rate": 3.065738025663496e-07, - "loss": 0.67648673, - "num_input_tokens_seen": 296604950, - "step": 13751, - "time_per_iteration": 5.785562753677368 - }, - { - "auxiliary_loss_clip": 0.01081731, - "auxiliary_loss_mlp": 0.01027835, - "balance_loss_clip": 1.03284156, - "balance_loss_mlp": 1.01637304, - "epoch": 0.8268149706899143, - "flos": 39968288668800.0, - "grad_norm": 1.5963517669581677, - "language_loss": 0.60753131, - "learning_rate": 3.0636662168777607e-07, - "loss": 0.628627, - "num_input_tokens_seen": 296627780, - "step": 13752, - "time_per_iteration": 2.755326747894287 - }, - { - "auxiliary_loss_clip": 0.01018872, - "auxiliary_loss_mlp": 0.0100062, - "balance_loss_clip": 1.00675297, - "balance_loss_mlp": 0.99959439, - "epoch": 0.8268750939425823, - "flos": 65782423244160.0, - "grad_norm": 0.7684012049495671, - "language_loss": 0.57412326, - "learning_rate": 3.0615950503194986e-07, - "loss": 0.59431815, - "num_input_tokens_seen": 296683850, - "step": 13753, - "time_per_iteration": 3.1750407218933105 - }, - { - "auxiliary_loss_clip": 0.0099067, - "auxiliary_loss_mlp": 0.00751461, - "balance_loss_clip": 1.01540029, - "balance_loss_mlp": 0.99955767, - "epoch": 0.8269352171952503, - "flos": 52981455242880.0, - "grad_norm": 0.6979413175863002, - "language_loss": 0.54908955, - "learning_rate": 3.0595245260672563e-07, - "loss": 0.56651086, - "num_input_tokens_seen": 296741420, - "step": 13754, - "time_per_iteration": 4.901344299316406 - }, - { - "auxiliary_loss_clip": 0.0106662, - "auxiliary_loss_mlp": 0.01033194, - "balance_loss_clip": 1.03270113, - "balance_loss_mlp": 1.02221489, - "epoch": 0.8269953404479182, - "flos": 23076125487360.0, - "grad_norm": 1.746367517796231, - "language_loss": 0.69104445, - "learning_rate": 3.0574546441995354e-07, - "loss": 0.71204263, - "num_input_tokens_seen": 296759620, - "step": 13755, - "time_per_iteration": 3.0003440380096436 - }, - { - "auxiliary_loss_clip": 0.01062261, - "auxiliary_loss_mlp": 0.01029699, - "balance_loss_clip": 1.03354418, - "balance_loss_mlp": 1.01864886, - "epoch": 0.8270554637005862, - "flos": 14209996763520.0, - "grad_norm": 1.955736447357461, - "language_loss": 0.70088506, - "learning_rate": 3.0553854047948324e-07, - "loss": 0.72180462, - "num_input_tokens_seen": 296777275, - "step": 13756, - "time_per_iteration": 4.257762432098389 - }, - { - "auxiliary_loss_clip": 0.01102671, - "auxiliary_loss_mlp": 0.01033469, - "balance_loss_clip": 1.04094052, - "balance_loss_mlp": 1.02107131, - "epoch": 0.8271155869532542, - "flos": 21762046327680.0, - "grad_norm": 1.737700331339717, - "language_loss": 0.72146094, - "learning_rate": 3.053316807931623e-07, - "loss": 0.74282235, - "num_input_tokens_seen": 296796655, - "step": 13757, - "time_per_iteration": 2.6405348777770996 - }, - { - "auxiliary_loss_clip": 0.01101277, - "auxiliary_loss_mlp": 0.01035132, - "balance_loss_clip": 1.03689456, - "balance_loss_mlp": 1.02129722, - "epoch": 0.8271757102059222, - "flos": 15120475729920.0, - "grad_norm": 2.690067923112346, - "language_loss": 0.6930806, - "learning_rate": 3.0512488536883283e-07, - "loss": 0.7144447, - "num_input_tokens_seen": 296813705, - "step": 13758, - "time_per_iteration": 2.6009304523468018 - }, - { - "auxiliary_loss_clip": 0.01083685, - "auxiliary_loss_mlp": 0.01028318, - "balance_loss_clip": 1.03558612, - "balance_loss_mlp": 1.01677251, - "epoch": 0.8272358334585901, - "flos": 24133730561280.0, - "grad_norm": 1.5602292181836888, - "language_loss": 0.69900572, - "learning_rate": 3.0491815421433775e-07, - "loss": 0.72012579, - "num_input_tokens_seen": 296833985, - "step": 13759, - "time_per_iteration": 2.6815199851989746 - }, - { - "auxiliary_loss_clip": 0.01087619, - "auxiliary_loss_mlp": 0.0103009, - "balance_loss_clip": 1.03719068, - "balance_loss_mlp": 1.01779902, - "epoch": 0.8272959567112581, - "flos": 18990712396800.0, - "grad_norm": 1.7630032179390376, - "language_loss": 0.70951492, - "learning_rate": 3.047114873375161e-07, - "loss": 0.73069203, - "num_input_tokens_seen": 296850150, - "step": 13760, - "time_per_iteration": 2.6415457725524902 - }, - { - "auxiliary_loss_clip": 0.0106558, - "auxiliary_loss_mlp": 0.01031388, - "balance_loss_clip": 1.03384495, - "balance_loss_mlp": 1.01930583, - "epoch": 0.827356079963926, - "flos": 20631614428800.0, - "grad_norm": 44.683792034890395, - "language_loss": 0.77058452, - "learning_rate": 3.0450488474620505e-07, - "loss": 0.79155421, - "num_input_tokens_seen": 296869585, - "step": 13761, - "time_per_iteration": 2.658909320831299 - }, - { - "auxiliary_loss_clip": 0.01075197, - "auxiliary_loss_mlp": 0.01033022, - "balance_loss_clip": 1.03739285, - "balance_loss_mlp": 1.0216558, - "epoch": 0.827416203216594, - "flos": 22416625825920.0, - "grad_norm": 1.6365377494190674, - "language_loss": 0.70046437, - "learning_rate": 3.042983464482387e-07, - "loss": 0.72154659, - "num_input_tokens_seen": 296887710, - "step": 13762, - "time_per_iteration": 2.6890883445739746 - }, - { - "auxiliary_loss_clip": 0.01056694, - "auxiliary_loss_mlp": 0.01032, - "balance_loss_clip": 1.03542447, - "balance_loss_mlp": 1.0196439, - "epoch": 0.827476326469262, - "flos": 19026192055680.0, - "grad_norm": 2.3529843833311297, - "language_loss": 0.70278549, - "learning_rate": 3.0409187245144853e-07, - "loss": 0.72367239, - "num_input_tokens_seen": 296906265, - "step": 13763, - "time_per_iteration": 2.7008395195007324 - }, - { - "auxiliary_loss_clip": 0.01013794, - "auxiliary_loss_mlp": 0.00999838, - "balance_loss_clip": 1.01678598, - "balance_loss_mlp": 0.99868739, - "epoch": 0.82753644972193, - "flos": 68500575089280.0, - "grad_norm": 0.8946836161965805, - "language_loss": 0.65109873, - "learning_rate": 3.038854627636651e-07, - "loss": 0.67123502, - "num_input_tokens_seen": 296971290, - "step": 13764, - "time_per_iteration": 3.350186586380005 - }, - { - "auxiliary_loss_clip": 0.01100213, - "auxiliary_loss_mlp": 0.01033972, - "balance_loss_clip": 1.03844428, - "balance_loss_mlp": 1.02069247, - "epoch": 0.8275965729745979, - "flos": 18405404277120.0, - "grad_norm": 1.9854785426124901, - "language_loss": 0.77840686, - "learning_rate": 3.0367911739271423e-07, - "loss": 0.79974878, - "num_input_tokens_seen": 296989060, - "step": 13765, - "time_per_iteration": 2.6723389625549316 - }, - { - "auxiliary_loss_clip": 0.01056381, - "auxiliary_loss_mlp": 0.01029974, - "balance_loss_clip": 1.03462076, - "balance_loss_mlp": 1.01668835, - "epoch": 0.8276566962272659, - "flos": 28512067063680.0, - "grad_norm": 1.6645003745934188, - "language_loss": 0.62420988, - "learning_rate": 3.034728363464214e-07, - "loss": 0.64507341, - "num_input_tokens_seen": 297011300, - "step": 13766, - "time_per_iteration": 2.811694383621216 - }, - { - "auxiliary_loss_clip": 0.01073861, - "auxiliary_loss_mlp": 0.01030979, - "balance_loss_clip": 1.03385091, - "balance_loss_mlp": 1.01828325, - "epoch": 0.8277168194799339, - "flos": 20230240878720.0, - "grad_norm": 1.6477178817747764, - "language_loss": 0.82427168, - "learning_rate": 3.03266619632609e-07, - "loss": 0.84532011, - "num_input_tokens_seen": 297030350, - "step": 13767, - "time_per_iteration": 2.716275453567505 - }, - { - "auxiliary_loss_clip": 0.01082823, - "auxiliary_loss_mlp": 0.0102858, - "balance_loss_clip": 1.03814888, - "balance_loss_mlp": 1.01580667, - "epoch": 0.8277769427326018, - "flos": 28476623318400.0, - "grad_norm": 1.6672913040668584, - "language_loss": 0.6903677, - "learning_rate": 3.030604672590964e-07, - "loss": 0.71148169, - "num_input_tokens_seen": 297049710, - "step": 13768, - "time_per_iteration": 2.688441753387451 - }, - { - "auxiliary_loss_clip": 0.0103987, - "auxiliary_loss_mlp": 0.0103503, - "balance_loss_clip": 1.02953947, - "balance_loss_mlp": 1.02242327, - "epoch": 0.8278370659852698, - "flos": 27197628768000.0, - "grad_norm": 2.022593721700604, - "language_loss": 0.74887329, - "learning_rate": 3.028543792337006e-07, - "loss": 0.76962233, - "num_input_tokens_seen": 297070510, - "step": 13769, - "time_per_iteration": 2.765038251876831 - }, - { - "auxiliary_loss_clip": 0.01084819, - "auxiliary_loss_mlp": 0.01030015, - "balance_loss_clip": 1.03507888, - "balance_loss_mlp": 1.01786184, - "epoch": 0.8278971892379378, - "flos": 37816126404480.0, - "grad_norm": 1.6742460818696816, - "language_loss": 0.74587786, - "learning_rate": 3.0264835556423675e-07, - "loss": 0.76702625, - "num_input_tokens_seen": 297092585, - "step": 13770, - "time_per_iteration": 2.78021502494812 - }, - { - "auxiliary_loss_clip": 0.010808, - "auxiliary_loss_mlp": 0.01033104, - "balance_loss_clip": 1.03744841, - "balance_loss_mlp": 1.0202477, - "epoch": 0.8279573124906058, - "flos": 22560160573440.0, - "grad_norm": 2.613906237758894, - "language_loss": 0.75822175, - "learning_rate": 3.0244239625851785e-07, - "loss": 0.77936077, - "num_input_tokens_seen": 297110055, - "step": 13771, - "time_per_iteration": 2.6900837421417236 - }, - { - "auxiliary_loss_clip": 0.01109049, - "auxiliary_loss_mlp": 0.01035265, - "balance_loss_clip": 1.03709054, - "balance_loss_mlp": 1.02323627, - "epoch": 0.8280174357432737, - "flos": 36064619418240.0, - "grad_norm": 1.6606339233038099, - "language_loss": 0.72508442, - "learning_rate": 3.0223650132435284e-07, - "loss": 0.74652761, - "num_input_tokens_seen": 297132170, - "step": 13772, - "time_per_iteration": 2.7568705081939697 - }, - { - "auxiliary_loss_clip": 0.01087016, - "auxiliary_loss_mlp": 0.01030121, - "balance_loss_clip": 1.03733611, - "balance_loss_mlp": 1.01710296, - "epoch": 0.8280775589959417, - "flos": 22961067246720.0, - "grad_norm": 2.2592902165659154, - "language_loss": 0.75143635, - "learning_rate": 3.0203067076955035e-07, - "loss": 0.77260774, - "num_input_tokens_seen": 297149515, - "step": 13773, - "time_per_iteration": 2.683868646621704 - }, - { - "auxiliary_loss_clip": 0.01062538, - "auxiliary_loss_mlp": 0.01034452, - "balance_loss_clip": 1.03560376, - "balance_loss_mlp": 1.02264452, - "epoch": 0.8281376822486096, - "flos": 26063282286720.0, - "grad_norm": 1.872449151264808, - "language_loss": 0.75778252, - "learning_rate": 3.01824904601915e-07, - "loss": 0.77875245, - "num_input_tokens_seen": 297170320, - "step": 13774, - "time_per_iteration": 2.7567591667175293 - }, - { - "auxiliary_loss_clip": 0.01081331, - "auxiliary_loss_mlp": 0.00770591, - "balance_loss_clip": 1.03898907, - "balance_loss_mlp": 1.00031459, - "epoch": 0.8281978055012776, - "flos": 20667776446080.0, - "grad_norm": 1.8056323038896689, - "language_loss": 0.74878412, - "learning_rate": 3.01619202829249e-07, - "loss": 0.76730335, - "num_input_tokens_seen": 297189935, - "step": 13775, - "time_per_iteration": 2.74230694770813 - }, - { - "auxiliary_loss_clip": 0.01112679, - "auxiliary_loss_mlp": 0.01030499, - "balance_loss_clip": 1.0371238, - "balance_loss_mlp": 1.01723146, - "epoch": 0.8282579287539455, - "flos": 29315281040640.0, - "grad_norm": 2.0814301454392994, - "language_loss": 0.73856264, - "learning_rate": 3.01413565459353e-07, - "loss": 0.75999445, - "num_input_tokens_seen": 297210885, - "step": 13776, - "time_per_iteration": 2.684095621109009 - }, - { - "auxiliary_loss_clip": 0.01053766, - "auxiliary_loss_mlp": 0.01037151, - "balance_loss_clip": 1.0285629, - "balance_loss_mlp": 1.02321506, - "epoch": 0.8283180520066136, - "flos": 15706178899200.0, - "grad_norm": 1.9371446657055744, - "language_loss": 0.77532077, - "learning_rate": 3.0120799250002483e-07, - "loss": 0.79622996, - "num_input_tokens_seen": 297228500, - "step": 13777, - "time_per_iteration": 2.7644686698913574 - }, - { - "auxiliary_loss_clip": 0.01096655, - "auxiliary_loss_mlp": 0.01029876, - "balance_loss_clip": 1.03806889, - "balance_loss_mlp": 1.01883733, - "epoch": 0.8283781752592815, - "flos": 24791470456320.0, - "grad_norm": 1.6926504608706043, - "language_loss": 0.82732141, - "learning_rate": 3.010024839590604e-07, - "loss": 0.8485868, - "num_input_tokens_seen": 297249470, - "step": 13778, - "time_per_iteration": 2.7171225547790527 - }, - { - "auxiliary_loss_clip": 0.01092306, - "auxiliary_loss_mlp": 0.01025464, - "balance_loss_clip": 1.03395522, - "balance_loss_mlp": 1.01303005, - "epoch": 0.8284382985119495, - "flos": 18982811404800.0, - "grad_norm": 1.8413358549591246, - "language_loss": 0.74458718, - "learning_rate": 3.0079703984425187e-07, - "loss": 0.76576483, - "num_input_tokens_seen": 297265970, - "step": 13779, - "time_per_iteration": 2.626110553741455 - }, - { - "auxiliary_loss_clip": 0.0100526, - "auxiliary_loss_mlp": 0.01000579, - "balance_loss_clip": 1.01090991, - "balance_loss_mlp": 0.99951804, - "epoch": 0.8284984217646175, - "flos": 61034460814080.0, - "grad_norm": 0.7702655263685751, - "language_loss": 0.56672931, - "learning_rate": 3.0059166016338954e-07, - "loss": 0.5867877, - "num_input_tokens_seen": 297325525, - "step": 13780, - "time_per_iteration": 3.212908983230591 - }, - { - "auxiliary_loss_clip": 0.01067858, - "auxiliary_loss_mlp": 0.01029582, - "balance_loss_clip": 1.03421974, - "balance_loss_mlp": 1.01657593, - "epoch": 0.8285585450172854, - "flos": 19714635100800.0, - "grad_norm": 1.699800901130364, - "language_loss": 0.79739404, - "learning_rate": 3.0038634492426205e-07, - "loss": 0.81836849, - "num_input_tokens_seen": 297345025, - "step": 13781, - "time_per_iteration": 2.655301809310913 - }, - { - "auxiliary_loss_clip": 0.01065725, - "auxiliary_loss_mlp": 0.01033354, - "balance_loss_clip": 1.03596509, - "balance_loss_mlp": 1.01966882, - "epoch": 0.8286186682699535, - "flos": 21688896280320.0, - "grad_norm": 1.8730371598803492, - "language_loss": 0.75640142, - "learning_rate": 3.001810941346543e-07, - "loss": 0.77739221, - "num_input_tokens_seen": 297363570, - "step": 13782, - "time_per_iteration": 2.6944918632507324 - }, - { - "auxiliary_loss_clip": 0.01095829, - "auxiliary_loss_mlp": 0.01033027, - "balance_loss_clip": 1.03388703, - "balance_loss_mlp": 1.02099264, - "epoch": 0.8286787915226214, - "flos": 25775566346880.0, - "grad_norm": 1.6561193474083664, - "language_loss": 0.76484203, - "learning_rate": 2.9997590780234983e-07, - "loss": 0.78613055, - "num_input_tokens_seen": 297385385, - "step": 13783, - "time_per_iteration": 2.6690874099731445 - }, - { - "auxiliary_loss_clip": 0.01107918, - "auxiliary_loss_mlp": 0.01028274, - "balance_loss_clip": 1.03614211, - "balance_loss_mlp": 1.01590598, - "epoch": 0.8287389147752894, - "flos": 21288348743040.0, - "grad_norm": 1.6914982205488613, - "language_loss": 0.73518729, - "learning_rate": 2.997707859351304e-07, - "loss": 0.75654924, - "num_input_tokens_seen": 297403950, - "step": 13784, - "time_per_iteration": 2.6368956565856934 - }, - { - "auxiliary_loss_clip": 0.01100253, - "auxiliary_loss_mlp": 0.01035986, - "balance_loss_clip": 1.03504157, - "balance_loss_mlp": 1.02221763, - "epoch": 0.8287990380279573, - "flos": 33544875323520.0, - "grad_norm": 6.002474127634083, - "language_loss": 0.69880319, - "learning_rate": 2.99565728540772e-07, - "loss": 0.72016555, - "num_input_tokens_seen": 297424565, - "step": 13785, - "time_per_iteration": 2.7842202186584473 - }, - { - "auxiliary_loss_clip": 0.01085403, - "auxiliary_loss_mlp": 0.01035636, - "balance_loss_clip": 1.03928435, - "balance_loss_mlp": 1.02327418, - "epoch": 0.8288591612806253, - "flos": 22966346545920.0, - "grad_norm": 1.401854742726992, - "language_loss": 0.68165773, - "learning_rate": 2.993607356270516e-07, - "loss": 0.7028681, - "num_input_tokens_seen": 297445180, - "step": 13786, - "time_per_iteration": 2.6792120933532715 - }, - { - "auxiliary_loss_clip": 0.01069299, - "auxiliary_loss_mlp": 0.01035995, - "balance_loss_clip": 1.03638959, - "balance_loss_mlp": 1.02368629, - "epoch": 0.8289192845332932, - "flos": 18588979710720.0, - "grad_norm": 1.8600312404195347, - "language_loss": 0.77116591, - "learning_rate": 2.991558072017426e-07, - "loss": 0.7922188, - "num_input_tokens_seen": 297463790, - "step": 13787, - "time_per_iteration": 2.7485241889953613 - }, - { - "auxiliary_loss_clip": 0.01090466, - "auxiliary_loss_mlp": 0.01033116, - "balance_loss_clip": 1.03657961, - "balance_loss_mlp": 1.02168417, - "epoch": 0.8289794077859612, - "flos": 15450423085440.0, - "grad_norm": 1.668975764455463, - "language_loss": 0.80241442, - "learning_rate": 2.989509432726163e-07, - "loss": 0.82365024, - "num_input_tokens_seen": 297480100, - "step": 13788, - "time_per_iteration": 2.646430730819702 - }, - { - "auxiliary_loss_clip": 0.01083639, - "auxiliary_loss_mlp": 0.01033973, - "balance_loss_clip": 1.03628707, - "balance_loss_mlp": 1.02209973, - "epoch": 0.8290395310386292, - "flos": 28877853214080.0, - "grad_norm": 1.718363547417138, - "language_loss": 0.71454132, - "learning_rate": 2.9874614384744014e-07, - "loss": 0.73571742, - "num_input_tokens_seen": 297499890, - "step": 13789, - "time_per_iteration": 2.6843364238739014 - }, - { - "auxiliary_loss_clip": 0.01076455, - "auxiliary_loss_mlp": 0.01028674, - "balance_loss_clip": 1.032372, - "balance_loss_mlp": 1.01604366, - "epoch": 0.8290996542912972, - "flos": 36576274700160.0, - "grad_norm": 2.586850358316563, - "language_loss": 0.68054211, - "learning_rate": 2.985414089339813e-07, - "loss": 0.7015934, - "num_input_tokens_seen": 297521440, - "step": 13790, - "time_per_iteration": 4.365084171295166 - }, - { - "auxiliary_loss_clip": 0.01099215, - "auxiliary_loss_mlp": 0.01030184, - "balance_loss_clip": 1.03627872, - "balance_loss_mlp": 1.01633167, - "epoch": 0.8291597775439651, - "flos": 23623009032960.0, - "grad_norm": 1.629598209312908, - "language_loss": 0.77366352, - "learning_rate": 2.9833673854000265e-07, - "loss": 0.7949574, - "num_input_tokens_seen": 297539920, - "step": 13791, - "time_per_iteration": 4.515652894973755 - }, - { - "auxiliary_loss_clip": 0.01083692, - "auxiliary_loss_mlp": 0.01031149, - "balance_loss_clip": 1.03688049, - "balance_loss_mlp": 1.01825666, - "epoch": 0.8292199007966331, - "flos": 21397481239680.0, - "grad_norm": 1.4251720115143436, - "language_loss": 0.70067787, - "learning_rate": 2.981321326732651e-07, - "loss": 0.72182631, - "num_input_tokens_seen": 297560000, - "step": 13792, - "time_per_iteration": 2.7335619926452637 - }, - { - "auxiliary_loss_clip": 0.0108758, - "auxiliary_loss_mlp": 0.01032436, - "balance_loss_clip": 1.03578472, - "balance_loss_mlp": 1.01971602, - "epoch": 0.829280024049301, - "flos": 28767607395840.0, - "grad_norm": 1.529482170283821, - "language_loss": 0.64886749, - "learning_rate": 2.9792759134152736e-07, - "loss": 0.67006767, - "num_input_tokens_seen": 297579300, - "step": 13793, - "time_per_iteration": 4.254675388336182 - }, - { - "auxiliary_loss_clip": 0.01052518, - "auxiliary_loss_mlp": 0.01038478, - "balance_loss_clip": 1.03231871, - "balance_loss_mlp": 1.02323079, - "epoch": 0.829340147301969, - "flos": 19938071652480.0, - "grad_norm": 1.91807865555319, - "language_loss": 0.66570354, - "learning_rate": 2.977231145525461e-07, - "loss": 0.6866135, - "num_input_tokens_seen": 297598095, - "step": 13794, - "time_per_iteration": 2.6897053718566895 - }, - { - "auxiliary_loss_clip": 0.01108178, - "auxiliary_loss_mlp": 0.01036453, - "balance_loss_clip": 1.03576493, - "balance_loss_mlp": 1.0234766, - "epoch": 0.829400270554637, - "flos": 25228575060480.0, - "grad_norm": 2.1693553604990132, - "language_loss": 0.66396624, - "learning_rate": 2.975187023140757e-07, - "loss": 0.68541253, - "num_input_tokens_seen": 297615955, - "step": 13795, - "time_per_iteration": 2.609815835952759 - }, - { - "auxiliary_loss_clip": 0.0101707, - "auxiliary_loss_mlp": 0.01041895, - "balance_loss_clip": 1.031497, - "balance_loss_mlp": 1.02748859, - "epoch": 0.829460393807305, - "flos": 24463570176000.0, - "grad_norm": 1.7274097985625807, - "language_loss": 0.66617584, - "learning_rate": 2.973143546338661e-07, - "loss": 0.68676549, - "num_input_tokens_seen": 297636285, - "step": 13796, - "time_per_iteration": 4.47485876083374 - }, - { - "auxiliary_loss_clip": 0.01060431, - "auxiliary_loss_mlp": 0.0104346, - "balance_loss_clip": 1.03264594, - "balance_loss_mlp": 1.02998924, - "epoch": 0.829520517059973, - "flos": 15122486891520.0, - "grad_norm": 1.7688571213307858, - "language_loss": 0.7208361, - "learning_rate": 2.971100715196666e-07, - "loss": 0.74187499, - "num_input_tokens_seen": 297653315, - "step": 13797, - "time_per_iteration": 2.996868133544922 - }, - { - "auxiliary_loss_clip": 0.01042783, - "auxiliary_loss_mlp": 0.01032554, - "balance_loss_clip": 1.03644705, - "balance_loss_mlp": 1.02056766, - "epoch": 0.8295806403126409, - "flos": 21579979265280.0, - "grad_norm": 2.64934630097921, - "language_loss": 0.72061169, - "learning_rate": 2.969058529792243e-07, - "loss": 0.74136508, - "num_input_tokens_seen": 297673480, - "step": 13798, - "time_per_iteration": 2.8359265327453613 - }, - { - "auxiliary_loss_clip": 0.01069075, - "auxiliary_loss_mlp": 0.01032261, - "balance_loss_clip": 1.03152323, - "balance_loss_mlp": 1.01987529, - "epoch": 0.8296407635653089, - "flos": 21726566668800.0, - "grad_norm": 1.5432798793202427, - "language_loss": 0.76292628, - "learning_rate": 2.967016990202822e-07, - "loss": 0.78393966, - "num_input_tokens_seen": 297693250, - "step": 13799, - "time_per_iteration": 2.693103790283203 - }, - { - "auxiliary_loss_clip": 0.01108566, - "auxiliary_loss_mlp": 0.01033314, - "balance_loss_clip": 1.03785658, - "balance_loss_mlp": 1.02094579, - "epoch": 0.8297008868179768, - "flos": 11181147252480.0, - "grad_norm": 1.9112775618394213, - "language_loss": 0.67614651, - "learning_rate": 2.9649760965058245e-07, - "loss": 0.69756532, - "num_input_tokens_seen": 297710975, - "step": 13800, - "time_per_iteration": 2.6247994899749756 - }, - { - "auxiliary_loss_clip": 0.01074439, - "auxiliary_loss_mlp": 0.01033186, - "balance_loss_clip": 1.03878558, - "balance_loss_mlp": 1.01930976, - "epoch": 0.8297610100706448, - "flos": 20664041431680.0, - "grad_norm": 2.709008705792723, - "language_loss": 0.74460614, - "learning_rate": 2.9629358487786515e-07, - "loss": 0.76568246, - "num_input_tokens_seen": 297730860, - "step": 13801, - "time_per_iteration": 2.7638845443725586 - }, - { - "auxiliary_loss_clip": 0.01063708, - "auxiliary_loss_mlp": 0.0102829, - "balance_loss_clip": 1.03407621, - "balance_loss_mlp": 1.01658368, - "epoch": 0.8298211333233128, - "flos": 20376325491840.0, - "grad_norm": 1.5797415663470742, - "language_loss": 0.73625791, - "learning_rate": 2.9608962470986476e-07, - "loss": 0.75717783, - "num_input_tokens_seen": 297749765, - "step": 13802, - "time_per_iteration": 2.7499916553497314 - }, - { - "auxiliary_loss_clip": 0.01088515, - "auxiliary_loss_mlp": 0.01029669, - "balance_loss_clip": 1.03459883, - "balance_loss_mlp": 1.01764071, - "epoch": 0.8298812565759808, - "flos": 21508696725120.0, - "grad_norm": 1.4712858328123304, - "language_loss": 0.74977744, - "learning_rate": 2.9588572915431644e-07, - "loss": 0.77095926, - "num_input_tokens_seen": 297770380, - "step": 13803, - "time_per_iteration": 2.7700328826904297 - }, - { - "auxiliary_loss_clip": 0.01099479, - "auxiliary_loss_mlp": 0.01034049, - "balance_loss_clip": 1.03803515, - "balance_loss_mlp": 1.02196717, - "epoch": 0.8299413798286487, - "flos": 22818681734400.0, - "grad_norm": 1.629212800491102, - "language_loss": 0.79214036, - "learning_rate": 2.9568189821895215e-07, - "loss": 0.81347561, - "num_input_tokens_seen": 297789440, - "step": 13804, - "time_per_iteration": 2.668266773223877 - }, - { - "auxiliary_loss_clip": 0.01109225, - "auxiliary_loss_mlp": 0.01032372, - "balance_loss_clip": 1.03797591, - "balance_loss_mlp": 1.0205344, - "epoch": 0.8300015030813167, - "flos": 29679199683840.0, - "grad_norm": 2.3697694156081157, - "language_loss": 0.72845703, - "learning_rate": 2.954781319115016e-07, - "loss": 0.74987304, - "num_input_tokens_seen": 297810425, - "step": 13805, - "time_per_iteration": 2.68404221534729 - }, - { - "auxiliary_loss_clip": 0.01102118, - "auxiliary_loss_mlp": 0.00771001, - "balance_loss_clip": 1.03904784, - "balance_loss_mlp": 1.00029325, - "epoch": 0.8300616263339846, - "flos": 19719483436800.0, - "grad_norm": 2.0648930657274462, - "language_loss": 0.77626657, - "learning_rate": 2.952744302396906e-07, - "loss": 0.79499781, - "num_input_tokens_seen": 297827680, - "step": 13806, - "time_per_iteration": 2.6478402614593506 - }, - { - "auxiliary_loss_clip": 0.0110212, - "auxiliary_loss_mlp": 0.01033184, - "balance_loss_clip": 1.03748512, - "balance_loss_mlp": 1.01954055, - "epoch": 0.8301217495866526, - "flos": 19901945548800.0, - "grad_norm": 1.8834407676447842, - "language_loss": 0.63916278, - "learning_rate": 2.950707932112444e-07, - "loss": 0.66051579, - "num_input_tokens_seen": 297848005, - "step": 13807, - "time_per_iteration": 2.6306519508361816 - }, - { - "auxiliary_loss_clip": 0.01097082, - "auxiliary_loss_mlp": 0.01029652, - "balance_loss_clip": 1.0383265, - "balance_loss_mlp": 1.01728976, - "epoch": 0.8301818728393207, - "flos": 19715784336000.0, - "grad_norm": 1.9692323669369614, - "language_loss": 0.72846484, - "learning_rate": 2.948672208338847e-07, - "loss": 0.74973214, - "num_input_tokens_seen": 297866730, - "step": 13808, - "time_per_iteration": 2.640733480453491 - }, - { - "auxiliary_loss_clip": 0.0109338, - "auxiliary_loss_mlp": 0.01046632, - "balance_loss_clip": 1.03866029, - "balance_loss_mlp": 1.03264272, - "epoch": 0.8302419960919886, - "flos": 28293658416000.0, - "grad_norm": 1.7739722668753906, - "language_loss": 0.66351604, - "learning_rate": 2.9466371311533046e-07, - "loss": 0.6849162, - "num_input_tokens_seen": 297886390, - "step": 13809, - "time_per_iteration": 2.751115322113037 - }, - { - "auxiliary_loss_clip": 0.011108, - "auxiliary_loss_mlp": 0.01024776, - "balance_loss_clip": 1.03813148, - "balance_loss_mlp": 1.01287341, - "epoch": 0.8303021193446566, - "flos": 18223444955520.0, - "grad_norm": 1.8449229056789198, - "language_loss": 0.74058008, - "learning_rate": 2.9446027006329896e-07, - "loss": 0.76193583, - "num_input_tokens_seen": 297905110, - "step": 13810, - "time_per_iteration": 2.506547451019287 - }, - { - "auxiliary_loss_clip": 0.01076467, - "auxiliary_loss_mlp": 0.01036006, - "balance_loss_clip": 1.03609502, - "balance_loss_mlp": 1.02471638, - "epoch": 0.8303622425973245, - "flos": 23111425578240.0, - "grad_norm": 1.5651865455038416, - "language_loss": 0.81083822, - "learning_rate": 2.94256891685505e-07, - "loss": 0.83196294, - "num_input_tokens_seen": 297925460, - "step": 13811, - "time_per_iteration": 2.7325217723846436 - }, - { - "auxiliary_loss_clip": 0.01076005, - "auxiliary_loss_mlp": 0.01045296, - "balance_loss_clip": 1.0357846, - "balance_loss_mlp": 1.03202796, - "epoch": 0.8304223658499925, - "flos": 19572860119680.0, - "grad_norm": 2.992999936529954, - "language_loss": 0.73513645, - "learning_rate": 2.9405357798966156e-07, - "loss": 0.75634944, - "num_input_tokens_seen": 297941760, - "step": 13812, - "time_per_iteration": 2.7724623680114746 - }, - { - "auxiliary_loss_clip": 0.01081692, - "auxiliary_loss_mlp": 0.01028975, - "balance_loss_clip": 1.03739822, - "balance_loss_mlp": 1.01693439, - "epoch": 0.8304824891026604, - "flos": 24426115269120.0, - "grad_norm": 1.8080417533170523, - "language_loss": 0.78173685, - "learning_rate": 2.9385032898347664e-07, - "loss": 0.80284357, - "num_input_tokens_seen": 297959745, - "step": 13813, - "time_per_iteration": 2.7371325492858887 - }, - { - "auxiliary_loss_clip": 0.01054685, - "auxiliary_loss_mlp": 0.00771129, - "balance_loss_clip": 1.03353238, - "balance_loss_mlp": 1.00019467, - "epoch": 0.8305426123553284, - "flos": 22381792611840.0, - "grad_norm": 1.8015570621783799, - "language_loss": 0.71141535, - "learning_rate": 2.93647144674658e-07, - "loss": 0.7296735, - "num_input_tokens_seen": 297977665, - "step": 13814, - "time_per_iteration": 2.8410873413085938 - }, - { - "auxiliary_loss_clip": 0.01117986, - "auxiliary_loss_mlp": 0.01044096, - "balance_loss_clip": 1.03891778, - "balance_loss_mlp": 1.02902818, - "epoch": 0.8306027356079964, - "flos": 14903575453440.0, - "grad_norm": 2.331626844380792, - "language_loss": 0.67776018, - "learning_rate": 2.9344402507091116e-07, - "loss": 0.69938099, - "num_input_tokens_seen": 297993525, - "step": 13815, - "time_per_iteration": 2.607855796813965 - }, - { - "auxiliary_loss_clip": 0.01097003, - "auxiliary_loss_mlp": 0.01033309, - "balance_loss_clip": 1.03770578, - "balance_loss_mlp": 1.02078068, - "epoch": 0.8306628588606644, - "flos": 19644573623040.0, - "grad_norm": 1.9297971278174, - "language_loss": 0.75907093, - "learning_rate": 2.9324097017993745e-07, - "loss": 0.78037405, - "num_input_tokens_seen": 298012920, - "step": 13816, - "time_per_iteration": 2.632202625274658 - }, - { - "auxiliary_loss_clip": 0.01074394, - "auxiliary_loss_mlp": 0.01036444, - "balance_loss_clip": 1.03376317, - "balance_loss_mlp": 1.02446318, - "epoch": 0.8307229821133323, - "flos": 24389737770240.0, - "grad_norm": 1.9005747270922144, - "language_loss": 0.81343293, - "learning_rate": 2.930379800094371e-07, - "loss": 0.83454132, - "num_input_tokens_seen": 298033310, - "step": 13817, - "time_per_iteration": 2.8131661415100098 - }, - { - "auxiliary_loss_clip": 0.01101146, - "auxiliary_loss_mlp": 0.01040878, - "balance_loss_clip": 1.03882217, - "balance_loss_mlp": 1.02748489, - "epoch": 0.8307831053660003, - "flos": 20996933702400.0, - "grad_norm": 1.505220062902958, - "language_loss": 0.78014338, - "learning_rate": 2.9283505456710875e-07, - "loss": 0.80156362, - "num_input_tokens_seen": 298053530, - "step": 13818, - "time_per_iteration": 2.6576030254364014 - }, - { - "auxiliary_loss_clip": 0.01093761, - "auxiliary_loss_mlp": 0.01035748, - "balance_loss_clip": 1.03938222, - "balance_loss_mlp": 1.02312958, - "epoch": 0.8308432286186682, - "flos": 21397301671680.0, - "grad_norm": 1.8020045024766413, - "language_loss": 0.819812, - "learning_rate": 2.926321938606453e-07, - "loss": 0.84110707, - "num_input_tokens_seen": 298069305, - "step": 13819, - "time_per_iteration": 2.6772990226745605 - }, - { - "auxiliary_loss_clip": 0.01020743, - "auxiliary_loss_mlp": 0.0100494, - "balance_loss_clip": 1.00830984, - "balance_loss_mlp": 1.00400436, - "epoch": 0.8309033518713362, - "flos": 62533656714240.0, - "grad_norm": 0.7602438257539984, - "language_loss": 0.56127542, - "learning_rate": 2.924293978977399e-07, - "loss": 0.58153224, - "num_input_tokens_seen": 298125830, - "step": 13820, - "time_per_iteration": 3.193361520767212 - }, - { - "auxiliary_loss_clip": 0.01095529, - "auxiliary_loss_mlp": 0.01025816, - "balance_loss_clip": 1.0352664, - "balance_loss_mlp": 1.01369286, - "epoch": 0.8309634751240043, - "flos": 16979104051200.0, - "grad_norm": 1.789990737596213, - "language_loss": 0.67907584, - "learning_rate": 2.922266666860831e-07, - "loss": 0.70028925, - "num_input_tokens_seen": 298142320, - "step": 13821, - "time_per_iteration": 2.661176919937134 - }, - { - "auxiliary_loss_clip": 0.01043861, - "auxiliary_loss_mlp": 0.01036485, - "balance_loss_clip": 1.029109, - "balance_loss_mlp": 1.02242458, - "epoch": 0.8310235983766722, - "flos": 22674464628480.0, - "grad_norm": 1.7649942540467223, - "language_loss": 0.69191265, - "learning_rate": 2.920240002333625e-07, - "loss": 0.7127161, - "num_input_tokens_seen": 298161845, - "step": 13822, - "time_per_iteration": 2.9704768657684326 - }, - { - "auxiliary_loss_clip": 0.01059895, - "auxiliary_loss_mlp": 0.01035644, - "balance_loss_clip": 1.03586471, - "balance_loss_mlp": 1.02335334, - "epoch": 0.8310837216293402, - "flos": 30811463176320.0, - "grad_norm": 1.6650310845720533, - "language_loss": 0.62025028, - "learning_rate": 2.918213985472631e-07, - "loss": 0.64120567, - "num_input_tokens_seen": 298184165, - "step": 13823, - "time_per_iteration": 2.8505992889404297 - }, - { - "auxiliary_loss_clip": 0.01009787, - "auxiliary_loss_mlp": 0.00999982, - "balance_loss_clip": 1.00688207, - "balance_loss_mlp": 0.9989447, - "epoch": 0.8311438448820081, - "flos": 71276074997760.0, - "grad_norm": 0.9240644196901294, - "language_loss": 0.61982203, - "learning_rate": 2.916188616354669e-07, - "loss": 0.63991976, - "num_input_tokens_seen": 298251720, - "step": 13824, - "time_per_iteration": 3.28657603263855 - }, - { - "auxiliary_loss_clip": 0.01110797, - "auxiliary_loss_mlp": 0.01030392, - "balance_loss_clip": 1.03885794, - "balance_loss_mlp": 1.01815486, - "epoch": 0.8312039681346761, - "flos": 20887082933760.0, - "grad_norm": 1.7761437257032648, - "language_loss": 0.73975819, - "learning_rate": 2.914163895056552e-07, - "loss": 0.76117009, - "num_input_tokens_seen": 298271910, - "step": 13825, - "time_per_iteration": 2.6168012619018555 - }, - { - "auxiliary_loss_clip": 0.01060453, - "auxiliary_loss_mlp": 0.0077103, - "balance_loss_clip": 1.03461838, - "balance_loss_mlp": 1.00020123, - "epoch": 0.831264091387344, - "flos": 17017528625280.0, - "grad_norm": 1.9546255724089596, - "language_loss": 0.80497503, - "learning_rate": 2.9121398216550486e-07, - "loss": 0.82328987, - "num_input_tokens_seen": 298288105, - "step": 13826, - "time_per_iteration": 2.6546146869659424 - }, - { - "auxiliary_loss_clip": 0.01110653, - "auxiliary_loss_mlp": 0.01033321, - "balance_loss_clip": 1.03793025, - "balance_loss_mlp": 1.02049446, - "epoch": 0.831324214640012, - "flos": 24419578993920.0, - "grad_norm": 1.5280583431221222, - "language_loss": 0.67963809, - "learning_rate": 2.910116396226914e-07, - "loss": 0.70107782, - "num_input_tokens_seen": 298307600, - "step": 13827, - "time_per_iteration": 2.5905277729034424 - }, - { - "auxiliary_loss_clip": 0.01098107, - "auxiliary_loss_mlp": 0.01030102, - "balance_loss_clip": 1.03539395, - "balance_loss_mlp": 1.01871204, - "epoch": 0.83138433789268, - "flos": 13545576938880.0, - "grad_norm": 1.973600288976441, - "language_loss": 0.74098945, - "learning_rate": 2.9080936188488834e-07, - "loss": 0.76227152, - "num_input_tokens_seen": 298323055, - "step": 13828, - "time_per_iteration": 2.6251087188720703 - }, - { - "auxiliary_loss_clip": 0.01073913, - "auxiliary_loss_mlp": 0.01033531, - "balance_loss_clip": 1.03275013, - "balance_loss_mlp": 1.0203644, - "epoch": 0.831444461145348, - "flos": 44492386561920.0, - "grad_norm": 2.63988910993405, - "language_loss": 0.67159581, - "learning_rate": 2.906071489597657e-07, - "loss": 0.69267023, - "num_input_tokens_seen": 298346950, - "step": 13829, - "time_per_iteration": 3.220686435699463 - }, - { - "auxiliary_loss_clip": 0.01085933, - "auxiliary_loss_mlp": 0.01031507, - "balance_loss_clip": 1.03704345, - "balance_loss_mlp": 1.01854897, - "epoch": 0.8315045843980159, - "flos": 22705024124160.0, - "grad_norm": 1.6177335267963915, - "language_loss": 0.82913047, - "learning_rate": 2.9040500085499054e-07, - "loss": 0.8503049, - "num_input_tokens_seen": 298366315, - "step": 13830, - "time_per_iteration": 6.03197717666626 - }, - { - "auxiliary_loss_clip": 0.01097952, - "auxiliary_loss_mlp": 0.01034258, - "balance_loss_clip": 1.03697491, - "balance_loss_mlp": 1.02208698, - "epoch": 0.8315647076506839, - "flos": 16873491087360.0, - "grad_norm": 2.1932847563543247, - "language_loss": 0.73822612, - "learning_rate": 2.9020291757822925e-07, - "loss": 0.75954819, - "num_input_tokens_seen": 298385185, - "step": 13831, - "time_per_iteration": 2.665022611618042 - }, - { - "auxiliary_loss_clip": 0.01111975, - "auxiliary_loss_mlp": 0.01033792, - "balance_loss_clip": 1.03963041, - "balance_loss_mlp": 1.02083445, - "epoch": 0.8316248309033518, - "flos": 13808730954240.0, - "grad_norm": 1.6071595034037367, - "language_loss": 0.7129162, - "learning_rate": 2.9000089913714523e-07, - "loss": 0.73437387, - "num_input_tokens_seen": 298402335, - "step": 13832, - "time_per_iteration": 2.647451400756836 - }, - { - "auxiliary_loss_clip": 0.0108072, - "auxiliary_loss_mlp": 0.0103351, - "balance_loss_clip": 1.03389788, - "balance_loss_mlp": 1.02102256, - "epoch": 0.8316849541560198, - "flos": 23512511819520.0, - "grad_norm": 1.6195807094532317, - "language_loss": 0.84269989, - "learning_rate": 2.897989455393979e-07, - "loss": 0.86384219, - "num_input_tokens_seen": 298423370, - "step": 13833, - "time_per_iteration": 4.226484298706055 - }, - { - "auxiliary_loss_clip": 0.010921, - "auxiliary_loss_mlp": 0.01036257, - "balance_loss_clip": 1.03806973, - "balance_loss_mlp": 1.02329278, - "epoch": 0.8317450774086879, - "flos": 23771356202880.0, - "grad_norm": 2.0307476796649917, - "language_loss": 0.76316315, - "learning_rate": 2.8959705679264625e-07, - "loss": 0.78444666, - "num_input_tokens_seen": 298444835, - "step": 13834, - "time_per_iteration": 2.814335584640503 - }, - { - "auxiliary_loss_clip": 0.01105662, - "auxiliary_loss_mlp": 0.00769799, - "balance_loss_clip": 1.03617358, - "balance_loss_mlp": 1.00016499, - "epoch": 0.8318052006613558, - "flos": 16215535710720.0, - "grad_norm": 1.877967943064554, - "language_loss": 0.79689634, - "learning_rate": 2.893952329045459e-07, - "loss": 0.81565094, - "num_input_tokens_seen": 298461845, - "step": 13835, - "time_per_iteration": 4.108726978302002 - }, - { - "auxiliary_loss_clip": 0.01103663, - "auxiliary_loss_mlp": 0.01037899, - "balance_loss_clip": 1.03955829, - "balance_loss_mlp": 1.02354026, - "epoch": 0.8318653239140238, - "flos": 19974556892160.0, - "grad_norm": 1.8066423967351954, - "language_loss": 0.80604517, - "learning_rate": 2.8919347388274905e-07, - "loss": 0.82746077, - "num_input_tokens_seen": 298479095, - "step": 13836, - "time_per_iteration": 2.624318838119507 - }, - { - "auxiliary_loss_clip": 0.01088523, - "auxiliary_loss_mlp": 0.01031373, - "balance_loss_clip": 1.03795898, - "balance_loss_mlp": 1.01932681, - "epoch": 0.8319254471666917, - "flos": 17704714694400.0, - "grad_norm": 1.9385404559381145, - "language_loss": 0.77292264, - "learning_rate": 2.8899177973490727e-07, - "loss": 0.79412162, - "num_input_tokens_seen": 298494475, - "step": 13837, - "time_per_iteration": 2.662458896636963 - }, - { - "auxiliary_loss_clip": 0.01114063, - "auxiliary_loss_mlp": 0.01030366, - "balance_loss_clip": 1.03759873, - "balance_loss_mlp": 1.01654339, - "epoch": 0.8319855704193597, - "flos": 19536554448000.0, - "grad_norm": 1.6836751176353142, - "language_loss": 0.83425492, - "learning_rate": 2.887901504686685e-07, - "loss": 0.85569924, - "num_input_tokens_seen": 298513185, - "step": 13838, - "time_per_iteration": 2.533836603164673 - }, - { - "auxiliary_loss_clip": 0.01081066, - "auxiliary_loss_mlp": 0.01035142, - "balance_loss_clip": 1.03288436, - "balance_loss_mlp": 1.02131331, - "epoch": 0.8320456936720276, - "flos": 21178067011200.0, - "grad_norm": 2.4451044719374613, - "language_loss": 0.74250424, - "learning_rate": 2.885885860916795e-07, - "loss": 0.76366633, - "num_input_tokens_seen": 298531885, - "step": 13839, - "time_per_iteration": 2.6616058349609375 - }, - { - "auxiliary_loss_clip": 0.01096452, - "auxiliary_loss_mlp": 0.01033098, - "balance_loss_clip": 1.03666425, - "balance_loss_mlp": 1.02004433, - "epoch": 0.8321058169246957, - "flos": 33250874503680.0, - "grad_norm": 1.4805288033952766, - "language_loss": 0.67812371, - "learning_rate": 2.8838708661158253e-07, - "loss": 0.69941914, - "num_input_tokens_seen": 298554905, - "step": 13840, - "time_per_iteration": 2.735732078552246 - }, - { - "auxiliary_loss_clip": 0.01054107, - "auxiliary_loss_mlp": 0.01039263, - "balance_loss_clip": 1.03295565, - "balance_loss_mlp": 1.02507687, - "epoch": 0.8321659401773636, - "flos": 14208129256320.0, - "grad_norm": 1.9499790502126348, - "language_loss": 0.79567152, - "learning_rate": 2.8818565203601843e-07, - "loss": 0.81660521, - "num_input_tokens_seen": 298571185, - "step": 13841, - "time_per_iteration": 2.6811771392822266 - }, - { - "auxiliary_loss_clip": 0.0106104, - "auxiliary_loss_mlp": 0.01030475, - "balance_loss_clip": 1.03763831, - "balance_loss_mlp": 1.0179522, - "epoch": 0.8322260634300316, - "flos": 15158253859200.0, - "grad_norm": 1.7496340078851804, - "language_loss": 0.68060827, - "learning_rate": 2.879842823726262e-07, - "loss": 0.70152342, - "num_input_tokens_seen": 298588505, - "step": 13842, - "time_per_iteration": 2.8322203159332275 - }, - { - "auxiliary_loss_clip": 0.0108993, - "auxiliary_loss_mlp": 0.0103225, - "balance_loss_clip": 1.03790903, - "balance_loss_mlp": 1.01888657, - "epoch": 0.8322861866826995, - "flos": 25300827267840.0, - "grad_norm": 1.5488311429576032, - "language_loss": 0.73103952, - "learning_rate": 2.8778297762904124e-07, - "loss": 0.75226128, - "num_input_tokens_seen": 298609295, - "step": 13843, - "time_per_iteration": 2.886599540710449 - }, - { - "auxiliary_loss_clip": 0.01077287, - "auxiliary_loss_mlp": 0.01027519, - "balance_loss_clip": 1.03611994, - "balance_loss_mlp": 1.01505589, - "epoch": 0.8323463099353675, - "flos": 17019360218880.0, - "grad_norm": 1.8098235185512692, - "language_loss": 0.77365232, - "learning_rate": 2.875817378128975e-07, - "loss": 0.79470038, - "num_input_tokens_seen": 298625765, - "step": 13844, - "time_per_iteration": 2.7069430351257324 - }, - { - "auxiliary_loss_clip": 0.01007928, - "auxiliary_loss_mlp": 0.01001333, - "balance_loss_clip": 1.00663698, - "balance_loss_mlp": 1.00036097, - "epoch": 0.8324064331880354, - "flos": 55607889709440.0, - "grad_norm": 0.7847120872391591, - "language_loss": 0.55208087, - "learning_rate": 2.8738056293182624e-07, - "loss": 0.57217348, - "num_input_tokens_seen": 298683005, - "step": 13845, - "time_per_iteration": 3.0783231258392334 - }, - { - "auxiliary_loss_clip": 0.01102275, - "auxiliary_loss_mlp": 0.01044708, - "balance_loss_clip": 1.03761721, - "balance_loss_mlp": 1.0314219, - "epoch": 0.8324665564407034, - "flos": 26138623063680.0, - "grad_norm": 1.6009211364700722, - "language_loss": 0.75140607, - "learning_rate": 2.871794529934555e-07, - "loss": 0.77287591, - "num_input_tokens_seen": 298703060, - "step": 13846, - "time_per_iteration": 2.676182508468628 - }, - { - "auxiliary_loss_clip": 0.01056649, - "auxiliary_loss_mlp": 0.01033367, - "balance_loss_clip": 1.03160548, - "balance_loss_mlp": 1.01809657, - "epoch": 0.8325266796933715, - "flos": 22049187649920.0, - "grad_norm": 1.6388328541738983, - "language_loss": 0.78896999, - "learning_rate": 2.8697840800541115e-07, - "loss": 0.80987018, - "num_input_tokens_seen": 298721765, - "step": 13847, - "time_per_iteration": 2.7297866344451904 - }, - { - "auxiliary_loss_clip": 0.01052928, - "auxiliary_loss_mlp": 0.01030708, - "balance_loss_clip": 1.0369612, - "balance_loss_mlp": 1.01901376, - "epoch": 0.8325868029460394, - "flos": 22816634659200.0, - "grad_norm": 2.65968337371303, - "language_loss": 0.74193573, - "learning_rate": 2.867774279753175e-07, - "loss": 0.76277208, - "num_input_tokens_seen": 298740825, - "step": 13848, - "time_per_iteration": 2.740797758102417 - }, - { - "auxiliary_loss_clip": 0.0110005, - "auxiliary_loss_mlp": 0.01027688, - "balance_loss_clip": 1.03858709, - "balance_loss_mlp": 1.0153321, - "epoch": 0.8326469261987074, - "flos": 14757454926720.0, - "grad_norm": 1.7578930460196398, - "language_loss": 0.63396668, - "learning_rate": 2.8657651291079554e-07, - "loss": 0.65524411, - "num_input_tokens_seen": 298758515, - "step": 13849, - "time_per_iteration": 2.5713930130004883 - }, - { - "auxiliary_loss_clip": 0.01084755, - "auxiliary_loss_mlp": 0.01033475, - "balance_loss_clip": 1.0322125, - "balance_loss_mlp": 1.0203917, - "epoch": 0.8327070494513753, - "flos": 22926126291840.0, - "grad_norm": 2.0835174192024533, - "language_loss": 0.79707754, - "learning_rate": 2.863756628194638e-07, - "loss": 0.81825984, - "num_input_tokens_seen": 298776375, - "step": 13850, - "time_per_iteration": 2.6037027835845947 - }, - { - "auxiliary_loss_clip": 0.0106844, - "auxiliary_loss_mlp": 0.01032788, - "balance_loss_clip": 1.03266466, - "balance_loss_mlp": 1.02161264, - "epoch": 0.8327671727040433, - "flos": 20665334321280.0, - "grad_norm": 1.589654785001457, - "language_loss": 0.7825923, - "learning_rate": 2.8617487770893877e-07, - "loss": 0.80360448, - "num_input_tokens_seen": 298795135, - "step": 13851, - "time_per_iteration": 2.669689416885376 - }, - { - "auxiliary_loss_clip": 0.01021321, - "auxiliary_loss_mlp": 0.01003693, - "balance_loss_clip": 1.00839996, - "balance_loss_mlp": 1.00260222, - "epoch": 0.8328272959567112, - "flos": 56060760384000.0, - "grad_norm": 0.7603079247900993, - "language_loss": 0.55759335, - "learning_rate": 2.859741575868344e-07, - "loss": 0.57784349, - "num_input_tokens_seen": 298855475, - "step": 13852, - "time_per_iteration": 3.171971321105957 - }, - { - "auxiliary_loss_clip": 0.01096762, - "auxiliary_loss_mlp": 0.01030025, - "balance_loss_clip": 1.03631687, - "balance_loss_mlp": 1.01785994, - "epoch": 0.8328874192093793, - "flos": 32303084284800.0, - "grad_norm": 1.490710672408854, - "language_loss": 0.67185426, - "learning_rate": 2.8577350246076125e-07, - "loss": 0.69312215, - "num_input_tokens_seen": 298875875, - "step": 13853, - "time_per_iteration": 2.705221176147461 - }, - { - "auxiliary_loss_clip": 0.01082363, - "auxiliary_loss_mlp": 0.01035053, - "balance_loss_clip": 1.03677762, - "balance_loss_mlp": 1.02310205, - "epoch": 0.8329475424620472, - "flos": 23512691387520.0, - "grad_norm": 1.8131340833394713, - "language_loss": 0.7809993, - "learning_rate": 2.855729123383286e-07, - "loss": 0.80217344, - "num_input_tokens_seen": 298895950, - "step": 13854, - "time_per_iteration": 2.6784071922302246 - }, - { - "auxiliary_loss_clip": 0.01029094, - "auxiliary_loss_mlp": 0.00999528, - "balance_loss_clip": 1.00678289, - "balance_loss_mlp": 0.99855083, - "epoch": 0.8330076657147152, - "flos": 67840680378240.0, - "grad_norm": 0.7605812264158395, - "language_loss": 0.58664268, - "learning_rate": 2.8537238722714295e-07, - "loss": 0.60692888, - "num_input_tokens_seen": 298955770, - "step": 13855, - "time_per_iteration": 3.0027222633361816 - }, - { - "auxiliary_loss_clip": 0.01098543, - "auxiliary_loss_mlp": 0.01027235, - "balance_loss_clip": 1.03760314, - "balance_loss_mlp": 1.01511717, - "epoch": 0.8330677889673831, - "flos": 22892801448960.0, - "grad_norm": 1.6486782153606043, - "language_loss": 0.71799862, - "learning_rate": 2.8517192713480853e-07, - "loss": 0.73925638, - "num_input_tokens_seen": 298976545, - "step": 13856, - "time_per_iteration": 2.6572425365448 - }, - { - "auxiliary_loss_clip": 0.01098496, - "auxiliary_loss_mlp": 0.01031016, - "balance_loss_clip": 1.03601694, - "balance_loss_mlp": 1.01861823, - "epoch": 0.8331279122200511, - "flos": 27345042184320.0, - "grad_norm": 1.530155897456529, - "language_loss": 0.75503182, - "learning_rate": 2.8497153206892677e-07, - "loss": 0.77632695, - "num_input_tokens_seen": 298996750, - "step": 13857, - "time_per_iteration": 2.709289073944092 - }, - { - "auxiliary_loss_clip": 0.0106038, - "auxiliary_loss_mlp": 0.01024239, - "balance_loss_clip": 1.0359571, - "balance_loss_mlp": 1.01319456, - "epoch": 0.833188035472719, - "flos": 19938179393280.0, - "grad_norm": 1.5089034219469146, - "language_loss": 0.7372514, - "learning_rate": 2.847712020370958e-07, - "loss": 0.75809759, - "num_input_tokens_seen": 299014895, - "step": 13858, - "time_per_iteration": 2.771655321121216 - }, - { - "auxiliary_loss_clip": 0.01112772, - "auxiliary_loss_mlp": 0.01033358, - "balance_loss_clip": 1.03744984, - "balance_loss_mlp": 1.01981604, - "epoch": 0.833248158725387, - "flos": 15232624968960.0, - "grad_norm": 1.9106405712672399, - "language_loss": 0.73376054, - "learning_rate": 2.8457093704691316e-07, - "loss": 0.75522184, - "num_input_tokens_seen": 299032855, - "step": 13859, - "time_per_iteration": 2.5690972805023193 - }, - { - "auxiliary_loss_clip": 0.01093273, - "auxiliary_loss_mlp": 0.01025326, - "balance_loss_clip": 1.03597152, - "balance_loss_mlp": 1.01405454, - "epoch": 0.8333082819780551, - "flos": 24535535074560.0, - "grad_norm": 1.588476401883647, - "language_loss": 0.79069161, - "learning_rate": 2.8437073710597205e-07, - "loss": 0.81187761, - "num_input_tokens_seen": 299052055, - "step": 13860, - "time_per_iteration": 2.687077283859253 - }, - { - "auxiliary_loss_clip": 0.0103731, - "auxiliary_loss_mlp": 0.01031939, - "balance_loss_clip": 1.03546524, - "balance_loss_mlp": 1.01915944, - "epoch": 0.833368405230723, - "flos": 31467407391360.0, - "grad_norm": 1.5993206787679535, - "language_loss": 0.8204006, - "learning_rate": 2.841706022218644e-07, - "loss": 0.84109306, - "num_input_tokens_seen": 299075285, - "step": 13861, - "time_per_iteration": 3.007451295852661 - }, - { - "auxiliary_loss_clip": 0.01112118, - "auxiliary_loss_mlp": 0.0103305, - "balance_loss_clip": 1.03988099, - "balance_loss_mlp": 1.02040219, - "epoch": 0.833428528483391, - "flos": 14902713527040.0, - "grad_norm": 1.7332412626678735, - "language_loss": 0.78811872, - "learning_rate": 2.839705324021806e-07, - "loss": 0.80957043, - "num_input_tokens_seen": 299092520, - "step": 13862, - "time_per_iteration": 2.7910513877868652 - }, - { - "auxiliary_loss_clip": 0.01099183, - "auxiliary_loss_mlp": 0.01035326, - "balance_loss_clip": 1.03552341, - "balance_loss_mlp": 1.02280307, - "epoch": 0.8334886517360589, - "flos": 22199833290240.0, - "grad_norm": 1.8555893155682146, - "language_loss": 0.75250399, - "learning_rate": 2.83770527654505e-07, - "loss": 0.77384913, - "num_input_tokens_seen": 299109450, - "step": 13863, - "time_per_iteration": 2.623645782470703 - }, - { - "auxiliary_loss_clip": 0.01049642, - "auxiliary_loss_mlp": 0.00771776, - "balance_loss_clip": 1.03239465, - "balance_loss_mlp": 1.00020719, - "epoch": 0.8335487749887269, - "flos": 30372562892160.0, - "grad_norm": 1.9984651067343642, - "language_loss": 0.75399351, - "learning_rate": 2.835705879864232e-07, - "loss": 0.77220774, - "num_input_tokens_seen": 299129540, - "step": 13864, - "time_per_iteration": 2.8347368240356445 - }, - { - "auxiliary_loss_clip": 0.01086549, - "auxiliary_loss_mlp": 0.01034445, - "balance_loss_clip": 1.03666651, - "balance_loss_mlp": 1.02171326, - "epoch": 0.8336088982413948, - "flos": 24681152810880.0, - "grad_norm": 1.9805001513042173, - "language_loss": 0.69349921, - "learning_rate": 2.833707134055168e-07, - "loss": 0.71470916, - "num_input_tokens_seen": 299148670, - "step": 13865, - "time_per_iteration": 2.7639873027801514 - }, - { - "auxiliary_loss_clip": 0.01099811, - "auxiliary_loss_mlp": 0.01032523, - "balance_loss_clip": 1.03819227, - "balance_loss_mlp": 1.01979089, - "epoch": 0.8336690214940629, - "flos": 38177207873280.0, - "grad_norm": 5.1442614378118625, - "language_loss": 0.75333238, - "learning_rate": 2.831709039193653e-07, - "loss": 0.7746557, - "num_input_tokens_seen": 299169330, - "step": 13866, - "time_per_iteration": 2.777001142501831 - }, - { - "auxiliary_loss_clip": 0.01008617, - "auxiliary_loss_mlp": 0.01009028, - "balance_loss_clip": 1.00722134, - "balance_loss_mlp": 1.00765133, - "epoch": 0.8337291447467308, - "flos": 55565119589760.0, - "grad_norm": 0.870724565539336, - "language_loss": 0.63078576, - "learning_rate": 2.8297115953554465e-07, - "loss": 0.65096223, - "num_input_tokens_seen": 299220980, - "step": 13867, - "time_per_iteration": 3.081568956375122 - }, - { - "auxiliary_loss_clip": 0.01083895, - "auxiliary_loss_mlp": 0.01028872, - "balance_loss_clip": 1.03740549, - "balance_loss_mlp": 1.01767826, - "epoch": 0.8337892679993988, - "flos": 24133550993280.0, - "grad_norm": 1.7649595884410185, - "language_loss": 0.71936655, - "learning_rate": 2.827714802616301e-07, - "loss": 0.74049425, - "num_input_tokens_seen": 299240130, - "step": 13868, - "time_per_iteration": 2.652420997619629 - }, - { - "auxiliary_loss_clip": 0.0108564, - "auxiliary_loss_mlp": 0.01032875, - "balance_loss_clip": 1.03956676, - "balance_loss_mlp": 1.02057862, - "epoch": 0.8338493912520667, - "flos": 28183915388160.0, - "grad_norm": 1.3896296545352977, - "language_loss": 0.80381906, - "learning_rate": 2.8257186610519325e-07, - "loss": 0.82500416, - "num_input_tokens_seen": 299260705, - "step": 13869, - "time_per_iteration": 4.254533529281616 - }, - { - "auxiliary_loss_clip": 0.01100488, - "auxiliary_loss_mlp": 0.01032723, - "balance_loss_clip": 1.03849924, - "balance_loss_mlp": 1.02023017, - "epoch": 0.8339095145047347, - "flos": 22158356060160.0, - "grad_norm": 1.5584172404732568, - "language_loss": 0.82560688, - "learning_rate": 2.823723170738028e-07, - "loss": 0.84693897, - "num_input_tokens_seen": 299278925, - "step": 13870, - "time_per_iteration": 4.364636421203613 - }, - { - "auxiliary_loss_clip": 0.01078884, - "auxiliary_loss_mlp": 0.0102765, - "balance_loss_clip": 1.03682613, - "balance_loss_mlp": 1.01443601, - "epoch": 0.8339696377574026, - "flos": 17307112072320.0, - "grad_norm": 2.7050320038401146, - "language_loss": 0.7043367, - "learning_rate": 2.821728331750264e-07, - "loss": 0.72540206, - "num_input_tokens_seen": 299291580, - "step": 13871, - "time_per_iteration": 2.650563955307007 - }, - { - "auxiliary_loss_clip": 0.01097514, - "auxiliary_loss_mlp": 0.0103365, - "balance_loss_clip": 1.03766418, - "balance_loss_mlp": 1.02192545, - "epoch": 0.8340297610100706, - "flos": 20668351063680.0, - "grad_norm": 1.6604394599481103, - "language_loss": 0.6898998, - "learning_rate": 2.8197341441642853e-07, - "loss": 0.7112115, - "num_input_tokens_seen": 299310385, - "step": 13872, - "time_per_iteration": 4.172610759735107 - }, - { - "auxiliary_loss_clip": 0.01086882, - "auxiliary_loss_mlp": 0.01026823, - "balance_loss_clip": 1.03666329, - "balance_loss_mlp": 1.01506281, - "epoch": 0.8340898842627387, - "flos": 20515442866560.0, - "grad_norm": 1.969935634979257, - "language_loss": 0.73773992, - "learning_rate": 2.817740608055712e-07, - "loss": 0.75887698, - "num_input_tokens_seen": 299327660, - "step": 13873, - "time_per_iteration": 2.7069506645202637 - }, - { - "auxiliary_loss_clip": 0.01087674, - "auxiliary_loss_mlp": 0.01035501, - "balance_loss_clip": 1.03668487, - "balance_loss_mlp": 1.02100515, - "epoch": 0.8341500075154066, - "flos": 21425850005760.0, - "grad_norm": 2.086638333931779, - "language_loss": 0.75528133, - "learning_rate": 2.81574772350013e-07, - "loss": 0.7765131, - "num_input_tokens_seen": 299343685, - "step": 13874, - "time_per_iteration": 4.401844263076782 - }, - { - "auxiliary_loss_clip": 0.0108051, - "auxiliary_loss_mlp": 0.01028815, - "balance_loss_clip": 1.0355829, - "balance_loss_mlp": 1.01691747, - "epoch": 0.8342101307680746, - "flos": 22090988102400.0, - "grad_norm": 2.2988326749129766, - "language_loss": 0.66232169, - "learning_rate": 2.813755490573118e-07, - "loss": 0.68341494, - "num_input_tokens_seen": 299363305, - "step": 13875, - "time_per_iteration": 2.7391769886016846 - }, - { - "auxiliary_loss_clip": 0.010648, - "auxiliary_loss_mlp": 0.01036714, - "balance_loss_clip": 1.03338897, - "balance_loss_mlp": 1.02434039, - "epoch": 0.8342702540207425, - "flos": 21871466133120.0, - "grad_norm": 1.700714112258655, - "language_loss": 0.79729408, - "learning_rate": 2.8117639093502243e-07, - "loss": 0.81830925, - "num_input_tokens_seen": 299382630, - "step": 13876, - "time_per_iteration": 2.8024299144744873 - }, - { - "auxiliary_loss_clip": 0.01093328, - "auxiliary_loss_mlp": 0.01038156, - "balance_loss_clip": 1.03557479, - "balance_loss_mlp": 1.02462614, - "epoch": 0.8343303772734105, - "flos": 22528487756160.0, - "grad_norm": 1.934148297226032, - "language_loss": 0.87182283, - "learning_rate": 2.8097729799069615e-07, - "loss": 0.89313757, - "num_input_tokens_seen": 299402385, - "step": 13877, - "time_per_iteration": 2.652780055999756 - }, - { - "auxiliary_loss_clip": 0.01064054, - "auxiliary_loss_mlp": 0.01029724, - "balance_loss_clip": 1.03309846, - "balance_loss_mlp": 1.01811349, - "epoch": 0.8343905005260784, - "flos": 14939773384320.0, - "grad_norm": 2.0462356502158445, - "language_loss": 0.69456965, - "learning_rate": 2.807782702318828e-07, - "loss": 0.71550739, - "num_input_tokens_seen": 299419820, - "step": 13878, - "time_per_iteration": 2.642768144607544 - }, - { - "auxiliary_loss_clip": 0.01084966, - "auxiliary_loss_mlp": 0.01028475, - "balance_loss_clip": 1.03594303, - "balance_loss_mlp": 1.01660752, - "epoch": 0.8344506237787465, - "flos": 15012456554880.0, - "grad_norm": 2.2290576221184537, - "language_loss": 0.790878, - "learning_rate": 2.805793076661309e-07, - "loss": 0.81201237, - "num_input_tokens_seen": 299436265, - "step": 13879, - "time_per_iteration": 2.6227519512176514 - }, - { - "auxiliary_loss_clip": 0.01061568, - "auxiliary_loss_mlp": 0.01031482, - "balance_loss_clip": 1.03857195, - "balance_loss_mlp": 1.02046072, - "epoch": 0.8345107470314144, - "flos": 17560389847680.0, - "grad_norm": 2.053274894813819, - "language_loss": 0.8324911, - "learning_rate": 2.803804103009828e-07, - "loss": 0.85342157, - "num_input_tokens_seen": 299451660, - "step": 13880, - "time_per_iteration": 2.7081100940704346 - }, - { - "auxiliary_loss_clip": 0.01089609, - "auxiliary_loss_mlp": 0.0102994, - "balance_loss_clip": 1.035254, - "balance_loss_mlp": 1.01767302, - "epoch": 0.8345708702840824, - "flos": 25187277398400.0, - "grad_norm": 1.577577271354365, - "language_loss": 0.78032011, - "learning_rate": 2.80181578143982e-07, - "loss": 0.80151558, - "num_input_tokens_seen": 299472070, - "step": 13881, - "time_per_iteration": 2.672635793685913 - }, - { - "auxiliary_loss_clip": 0.010645, - "auxiliary_loss_mlp": 0.01024591, - "balance_loss_clip": 1.03461313, - "balance_loss_mlp": 1.01385057, - "epoch": 0.8346309935367503, - "flos": 15083559527040.0, - "grad_norm": 2.2926708400629137, - "language_loss": 0.78564227, - "learning_rate": 2.7998281120266807e-07, - "loss": 0.80653316, - "num_input_tokens_seen": 299486725, - "step": 13882, - "time_per_iteration": 2.6480295658111572 - }, - { - "auxiliary_loss_clip": 0.01070114, - "auxiliary_loss_mlp": 0.01053336, - "balance_loss_clip": 1.03278971, - "balance_loss_mlp": 1.03948951, - "epoch": 0.8346911167894183, - "flos": 22930615491840.0, - "grad_norm": 1.6147247688158133, - "language_loss": 0.80761689, - "learning_rate": 2.79784109484579e-07, - "loss": 0.82885134, - "num_input_tokens_seen": 299505435, - "step": 13883, - "time_per_iteration": 2.6793839931488037 - }, - { - "auxiliary_loss_clip": 0.01096684, - "auxiliary_loss_mlp": 0.01036312, - "balance_loss_clip": 1.03590465, - "balance_loss_mlp": 1.02373528, - "epoch": 0.8347512400420862, - "flos": 20193037367040.0, - "grad_norm": 4.685073844907577, - "language_loss": 0.74089235, - "learning_rate": 2.795854729972482e-07, - "loss": 0.76222229, - "num_input_tokens_seen": 299523555, - "step": 13884, - "time_per_iteration": 2.604556083679199 - }, - { - "auxiliary_loss_clip": 0.01095519, - "auxiliary_loss_mlp": 0.01034904, - "balance_loss_clip": 1.03934622, - "balance_loss_mlp": 1.02086711, - "epoch": 0.8348113632947542, - "flos": 25954832148480.0, - "grad_norm": 1.6937955935304687, - "language_loss": 0.7016691, - "learning_rate": 2.7938690174820913e-07, - "loss": 0.72297329, - "num_input_tokens_seen": 299541660, - "step": 13885, - "time_per_iteration": 2.6773464679718018 - }, - { - "auxiliary_loss_clip": 0.01077954, - "auxiliary_loss_mlp": 0.01033472, - "balance_loss_clip": 1.03690195, - "balance_loss_mlp": 1.02092576, - "epoch": 0.8348714865474223, - "flos": 34204554552960.0, - "grad_norm": 1.8731982804534555, - "language_loss": 0.6992318, - "learning_rate": 2.791883957449912e-07, - "loss": 0.72034615, - "num_input_tokens_seen": 299562465, - "step": 13886, - "time_per_iteration": 2.8285069465637207 - }, - { - "auxiliary_loss_clip": 0.01073957, - "auxiliary_loss_mlp": 0.01033794, - "balance_loss_clip": 1.03586638, - "balance_loss_mlp": 1.01972771, - "epoch": 0.8349316098000902, - "flos": 24390132819840.0, - "grad_norm": 2.5697448718102414, - "language_loss": 0.79508579, - "learning_rate": 2.7898995499512134e-07, - "loss": 0.81616336, - "num_input_tokens_seen": 299582700, - "step": 13887, - "time_per_iteration": 2.7178754806518555 - }, - { - "auxiliary_loss_clip": 0.01092328, - "auxiliary_loss_mlp": 0.00771043, - "balance_loss_clip": 1.03849149, - "balance_loss_mlp": 1.00030017, - "epoch": 0.8349917330527582, - "flos": 23032744836480.0, - "grad_norm": 2.693894693314375, - "language_loss": 0.64530712, - "learning_rate": 2.7879157950612467e-07, - "loss": 0.66394079, - "num_input_tokens_seen": 299600310, - "step": 13888, - "time_per_iteration": 2.6735687255859375 - }, - { - "auxiliary_loss_clip": 0.01088596, - "auxiliary_loss_mlp": 0.01028671, - "balance_loss_clip": 1.03663945, - "balance_loss_mlp": 1.01663125, - "epoch": 0.8350518563054261, - "flos": 13625873792640.0, - "grad_norm": 2.0620816016550436, - "language_loss": 0.66669202, - "learning_rate": 2.785932692855244e-07, - "loss": 0.68786466, - "num_input_tokens_seen": 299617025, - "step": 13889, - "time_per_iteration": 2.680638551712036 - }, - { - "auxiliary_loss_clip": 0.01090008, - "auxiliary_loss_mlp": 0.01028757, - "balance_loss_clip": 1.03369141, - "balance_loss_mlp": 1.01666367, - "epoch": 0.8351119795580941, - "flos": 21579799697280.0, - "grad_norm": 2.20971990347348, - "language_loss": 0.6832096, - "learning_rate": 2.783950243408399e-07, - "loss": 0.70439726, - "num_input_tokens_seen": 299633050, - "step": 13890, - "time_per_iteration": 2.627889394760132 - }, - { - "auxiliary_loss_clip": 0.01088958, - "auxiliary_loss_mlp": 0.01036104, - "balance_loss_clip": 1.03766465, - "balance_loss_mlp": 1.02320004, - "epoch": 0.835172102810762, - "flos": 20038297576320.0, - "grad_norm": 2.518146173573676, - "language_loss": 0.59095812, - "learning_rate": 2.7819684467958817e-07, - "loss": 0.61220872, - "num_input_tokens_seen": 299646445, - "step": 13891, - "time_per_iteration": 2.7173044681549072 - }, - { - "auxiliary_loss_clip": 0.01099806, - "auxiliary_loss_mlp": 0.01029991, - "balance_loss_clip": 1.03822279, - "balance_loss_mlp": 1.01823068, - "epoch": 0.8352322260634301, - "flos": 25111577485440.0, - "grad_norm": 1.6267311876614727, - "language_loss": 0.71812761, - "learning_rate": 2.779987303092846e-07, - "loss": 0.7394256, - "num_input_tokens_seen": 299662665, - "step": 13892, - "time_per_iteration": 2.662322998046875 - }, - { - "auxiliary_loss_clip": 0.01106347, - "auxiliary_loss_mlp": 0.01034997, - "balance_loss_clip": 1.03654015, - "balance_loss_mlp": 1.02241993, - "epoch": 0.835292349316098, - "flos": 24863758577280.0, - "grad_norm": 1.986327047613025, - "language_loss": 0.65929645, - "learning_rate": 2.7780068123744207e-07, - "loss": 0.68070984, - "num_input_tokens_seen": 299683585, - "step": 13893, - "time_per_iteration": 2.666810989379883 - }, - { - "auxiliary_loss_clip": 0.01079282, - "auxiliary_loss_mlp": 0.01024568, - "balance_loss_clip": 1.03549695, - "balance_loss_mlp": 1.01279628, - "epoch": 0.835352472568766, - "flos": 19865568049920.0, - "grad_norm": 2.066965169500514, - "language_loss": 0.78525186, - "learning_rate": 2.7760269747156996e-07, - "loss": 0.80629033, - "num_input_tokens_seen": 299702680, - "step": 13894, - "time_per_iteration": 2.656261920928955 - }, - { - "auxiliary_loss_clip": 0.01089446, - "auxiliary_loss_mlp": 0.01029648, - "balance_loss_clip": 1.03555644, - "balance_loss_mlp": 1.01722014, - "epoch": 0.8354125958214339, - "flos": 22054754257920.0, - "grad_norm": 1.734895870039155, - "language_loss": 0.72428441, - "learning_rate": 2.7740477901917625e-07, - "loss": 0.74547529, - "num_input_tokens_seen": 299721050, - "step": 13895, - "time_per_iteration": 2.5912013053894043 - }, - { - "auxiliary_loss_clip": 0.01098522, - "auxiliary_loss_mlp": 0.01043176, - "balance_loss_clip": 1.0375011, - "balance_loss_mlp": 1.02959836, - "epoch": 0.8354727190741019, - "flos": 21397804462080.0, - "grad_norm": 2.180746282209792, - "language_loss": 0.72239274, - "learning_rate": 2.772069258877667e-07, - "loss": 0.7438097, - "num_input_tokens_seen": 299738255, - "step": 13896, - "time_per_iteration": 2.816459894180298 - }, - { - "auxiliary_loss_clip": 0.01096666, - "auxiliary_loss_mlp": 0.01033423, - "balance_loss_clip": 1.03551006, - "balance_loss_mlp": 1.02084064, - "epoch": 0.8355328423267698, - "flos": 50840997834240.0, - "grad_norm": 2.4314822364196456, - "language_loss": 0.5891223, - "learning_rate": 2.770091380848423e-07, - "loss": 0.61042321, - "num_input_tokens_seen": 299761315, - "step": 13897, - "time_per_iteration": 2.854132652282715 - }, - { - "auxiliary_loss_clip": 0.01029051, - "auxiliary_loss_mlp": 0.00750932, - "balance_loss_clip": 1.00681758, - "balance_loss_mlp": 0.99963111, - "epoch": 0.8355929655794379, - "flos": 65551052764800.0, - "grad_norm": 0.6926411996792173, - "language_loss": 0.57645589, - "learning_rate": 2.7681141561790423e-07, - "loss": 0.59425569, - "num_input_tokens_seen": 299828735, - "step": 13898, - "time_per_iteration": 3.189154624938965 - }, - { - "auxiliary_loss_clip": 0.01095352, - "auxiliary_loss_mlp": 0.01038588, - "balance_loss_clip": 1.03804767, - "balance_loss_mlp": 1.02465272, - "epoch": 0.8356530888321058, - "flos": 19170516902400.0, - "grad_norm": 1.982321802271085, - "language_loss": 0.79983473, - "learning_rate": 2.7661375849444967e-07, - "loss": 0.8211742, - "num_input_tokens_seen": 299848395, - "step": 13899, - "time_per_iteration": 2.6372761726379395 - }, - { - "auxiliary_loss_clip": 0.01110341, - "auxiliary_loss_mlp": 0.01035111, - "balance_loss_clip": 1.03744435, - "balance_loss_mlp": 1.0235889, - "epoch": 0.8357132120847738, - "flos": 44126672238720.0, - "grad_norm": 3.0475154129794473, - "language_loss": 0.69246173, - "learning_rate": 2.764161667219749e-07, - "loss": 0.7139163, - "num_input_tokens_seen": 299871665, - "step": 13900, - "time_per_iteration": 2.7809805870056152 - }, - { - "auxiliary_loss_clip": 0.01086706, - "auxiliary_loss_mlp": 0.01030844, - "balance_loss_clip": 1.03770447, - "balance_loss_mlp": 1.01880407, - "epoch": 0.8357733353374418, - "flos": 24389701856640.0, - "grad_norm": 1.5605306335935556, - "language_loss": 0.71076608, - "learning_rate": 2.762186403079716e-07, - "loss": 0.73194158, - "num_input_tokens_seen": 299891960, - "step": 13901, - "time_per_iteration": 2.6282958984375 - }, - { - "auxiliary_loss_clip": 0.01065898, - "auxiliary_loss_mlp": 0.01039549, - "balance_loss_clip": 1.03284073, - "balance_loss_mlp": 1.02650762, - "epoch": 0.8358334585901097, - "flos": 20916313626240.0, - "grad_norm": 2.0190709128744686, - "language_loss": 0.79701173, - "learning_rate": 2.7602117925992963e-07, - "loss": 0.81806624, - "num_input_tokens_seen": 299905070, - "step": 13902, - "time_per_iteration": 2.690213441848755 - }, - { - "auxiliary_loss_clip": 0.01096183, - "auxiliary_loss_mlp": 0.01031756, - "balance_loss_clip": 1.03576422, - "balance_loss_mlp": 1.01979947, - "epoch": 0.8358935818427777, - "flos": 19244169740160.0, - "grad_norm": 1.8244739444872173, - "language_loss": 0.62556911, - "learning_rate": 2.758237835853379e-07, - "loss": 0.6468485, - "num_input_tokens_seen": 299925130, - "step": 13903, - "time_per_iteration": 2.6231348514556885 - }, - { - "auxiliary_loss_clip": 0.01084825, - "auxiliary_loss_mlp": 0.01036663, - "balance_loss_clip": 1.03508985, - "balance_loss_mlp": 1.02401519, - "epoch": 0.8359537050954456, - "flos": 24134053783680.0, - "grad_norm": 4.416778142718545, - "language_loss": 0.7411294, - "learning_rate": 2.7562645329168054e-07, - "loss": 0.7623443, - "num_input_tokens_seen": 299943845, - "step": 13904, - "time_per_iteration": 2.7428109645843506 - }, - { - "auxiliary_loss_clip": 0.01082834, - "auxiliary_loss_mlp": 0.01030754, - "balance_loss_clip": 1.03411317, - "balance_loss_mlp": 1.017802, - "epoch": 0.8360138283481137, - "flos": 16180415187840.0, - "grad_norm": 1.704840597436559, - "language_loss": 0.72898692, - "learning_rate": 2.7542918838644104e-07, - "loss": 0.75012279, - "num_input_tokens_seen": 299961620, - "step": 13905, - "time_per_iteration": 2.7568368911743164 - }, - { - "auxiliary_loss_clip": 0.01096191, - "auxiliary_loss_mlp": 0.01036518, - "balance_loss_clip": 1.03658271, - "balance_loss_mlp": 1.02507973, - "epoch": 0.8360739516007816, - "flos": 22198899536640.0, - "grad_norm": 1.8034628053468047, - "language_loss": 0.66455811, - "learning_rate": 2.752319888771e-07, - "loss": 0.68588519, - "num_input_tokens_seen": 299982170, - "step": 13906, - "time_per_iteration": 2.6989848613739014 - }, - { - "auxiliary_loss_clip": 0.01096481, - "auxiliary_loss_mlp": 0.01028681, - "balance_loss_clip": 1.03535354, - "balance_loss_mlp": 1.01639068, - "epoch": 0.8361340748534496, - "flos": 20923137210240.0, - "grad_norm": 2.553726632874823, - "language_loss": 0.74047542, - "learning_rate": 2.7503485477113475e-07, - "loss": 0.76172698, - "num_input_tokens_seen": 300001330, - "step": 13907, - "time_per_iteration": 2.5955686569213867 - }, - { - "auxiliary_loss_clip": 0.0107652, - "auxiliary_loss_mlp": 0.0103429, - "balance_loss_clip": 1.03481364, - "balance_loss_mlp": 1.02162361, - "epoch": 0.8361941981061175, - "flos": 26173599932160.0, - "grad_norm": 11.039148102509154, - "language_loss": 0.75409931, - "learning_rate": 2.7483778607602005e-07, - "loss": 0.7752074, - "num_input_tokens_seen": 300020645, - "step": 13908, - "time_per_iteration": 2.696906566619873 - }, - { - "auxiliary_loss_clip": 0.01097882, - "auxiliary_loss_mlp": 0.01031312, - "balance_loss_clip": 1.03590965, - "balance_loss_mlp": 1.01825249, - "epoch": 0.8362543213587855, - "flos": 24419363512320.0, - "grad_norm": 1.9388338218951495, - "language_loss": 0.71320546, - "learning_rate": 2.7464078279922964e-07, - "loss": 0.73449743, - "num_input_tokens_seen": 300039945, - "step": 13909, - "time_per_iteration": 5.753490686416626 - }, - { - "auxiliary_loss_clip": 0.01112711, - "auxiliary_loss_mlp": 0.00771249, - "balance_loss_clip": 1.03798199, - "balance_loss_mlp": 1.00026917, - "epoch": 0.8363144446114534, - "flos": 17202396948480.0, - "grad_norm": 1.7414813953695425, - "language_loss": 0.73090255, - "learning_rate": 2.744438449482338e-07, - "loss": 0.74974209, - "num_input_tokens_seen": 300058260, - "step": 13910, - "time_per_iteration": 2.6283226013183594 - }, - { - "auxiliary_loss_clip": 0.01095006, - "auxiliary_loss_mlp": 0.00772614, - "balance_loss_clip": 1.0360173, - "balance_loss_mlp": 1.00014329, - "epoch": 0.8363745678641215, - "flos": 19279398003840.0, - "grad_norm": 2.750713587824779, - "language_loss": 0.73741031, - "learning_rate": 2.742469725305001e-07, - "loss": 0.75608653, - "num_input_tokens_seen": 300076720, - "step": 13911, - "time_per_iteration": 4.149497985839844 - }, - { - "auxiliary_loss_clip": 0.01090915, - "auxiliary_loss_mlp": 0.01037296, - "balance_loss_clip": 1.03743172, - "balance_loss_mlp": 1.02490461, - "epoch": 0.8364346911167894, - "flos": 11874869596800.0, - "grad_norm": 2.172823280625671, - "language_loss": 0.78602064, - "learning_rate": 2.740501655534946e-07, - "loss": 0.80730277, - "num_input_tokens_seen": 300092950, - "step": 13912, - "time_per_iteration": 2.7247042655944824 - }, - { - "auxiliary_loss_clip": 0.01099282, - "auxiliary_loss_mlp": 0.01030051, - "balance_loss_clip": 1.03660643, - "balance_loss_mlp": 1.01849961, - "epoch": 0.8364948143694574, - "flos": 20225212974720.0, - "grad_norm": 1.9881992667595267, - "language_loss": 0.7914685, - "learning_rate": 2.738534240246797e-07, - "loss": 0.81276178, - "num_input_tokens_seen": 300110950, - "step": 13913, - "time_per_iteration": 4.134316682815552 - }, - { - "auxiliary_loss_clip": 0.01097532, - "auxiliary_loss_mlp": 0.01030734, - "balance_loss_clip": 1.03647411, - "balance_loss_mlp": 1.0179075, - "epoch": 0.8365549376221254, - "flos": 21612909058560.0, - "grad_norm": 2.642445797747624, - "language_loss": 0.73418862, - "learning_rate": 2.736567479515153e-07, - "loss": 0.75547129, - "num_input_tokens_seen": 300128705, - "step": 13914, - "time_per_iteration": 2.571171760559082 - }, - { - "auxiliary_loss_clip": 0.01062932, - "auxiliary_loss_mlp": 0.0103246, - "balance_loss_clip": 1.03763103, - "balance_loss_mlp": 1.01987803, - "epoch": 0.8366150608747933, - "flos": 23294210912640.0, - "grad_norm": 1.590677583762713, - "language_loss": 0.71320194, - "learning_rate": 2.7346013734146025e-07, - "loss": 0.73415583, - "num_input_tokens_seen": 300148635, - "step": 13915, - "time_per_iteration": 2.751453161239624 - }, - { - "auxiliary_loss_clip": 0.01080426, - "auxiliary_loss_mlp": 0.01030234, - "balance_loss_clip": 1.03791106, - "balance_loss_mlp": 1.01852822, - "epoch": 0.8366751841274613, - "flos": 15267673664640.0, - "grad_norm": 1.8965589808135064, - "language_loss": 0.71970236, - "learning_rate": 2.7326359220197035e-07, - "loss": 0.74080902, - "num_input_tokens_seen": 300165490, - "step": 13916, - "time_per_iteration": 2.6807093620300293 - }, - { - "auxiliary_loss_clip": 0.01077533, - "auxiliary_loss_mlp": 0.00770081, - "balance_loss_clip": 1.03652239, - "balance_loss_mlp": 1.00017905, - "epoch": 0.8367353073801292, - "flos": 13224931205760.0, - "grad_norm": 2.314558822643351, - "language_loss": 0.74767375, - "learning_rate": 2.7306711254049755e-07, - "loss": 0.76614988, - "num_input_tokens_seen": 300182130, - "step": 13917, - "time_per_iteration": 2.6898746490478516 - }, - { - "auxiliary_loss_clip": 0.01107617, - "auxiliary_loss_mlp": 0.01034237, - "balance_loss_clip": 1.03919959, - "balance_loss_mlp": 1.02238786, - "epoch": 0.8367954306327973, - "flos": 24205084928640.0, - "grad_norm": 1.7520480918143468, - "language_loss": 0.79073501, - "learning_rate": 2.728706983644933e-07, - "loss": 0.81215358, - "num_input_tokens_seen": 300203050, - "step": 13918, - "time_per_iteration": 2.585444450378418 - }, - { - "auxiliary_loss_clip": 0.01069111, - "auxiliary_loss_mlp": 0.01035129, - "balance_loss_clip": 1.03858578, - "balance_loss_mlp": 1.02256465, - "epoch": 0.8368555538854652, - "flos": 24534744975360.0, - "grad_norm": 1.689646886321145, - "language_loss": 0.67851698, - "learning_rate": 2.7267434968140457e-07, - "loss": 0.69955939, - "num_input_tokens_seen": 300224380, - "step": 13919, - "time_per_iteration": 2.7965781688690186 - }, - { - "auxiliary_loss_clip": 0.0109041, - "auxiliary_loss_mlp": 0.01036947, - "balance_loss_clip": 1.03292394, - "balance_loss_mlp": 1.02389407, - "epoch": 0.8369156771381332, - "flos": 20259363830400.0, - "grad_norm": 1.776956438502091, - "language_loss": 0.73908985, - "learning_rate": 2.7247806649867835e-07, - "loss": 0.76036346, - "num_input_tokens_seen": 300242915, - "step": 13920, - "time_per_iteration": 2.636904716491699 - }, - { - "auxiliary_loss_clip": 0.01088456, - "auxiliary_loss_mlp": 0.01029889, - "balance_loss_clip": 1.03454947, - "balance_loss_mlp": 1.0174973, - "epoch": 0.8369758003908011, - "flos": 21835555511040.0, - "grad_norm": 1.8125965247419975, - "language_loss": 0.68985099, - "learning_rate": 2.722818488237566e-07, - "loss": 0.71103442, - "num_input_tokens_seen": 300261905, - "step": 13921, - "time_per_iteration": 2.649538278579712 - }, - { - "auxiliary_loss_clip": 0.01101931, - "auxiliary_loss_mlp": 0.01031907, - "balance_loss_clip": 1.03782213, - "balance_loss_mlp": 1.01993847, - "epoch": 0.8370359236434691, - "flos": 21719312121600.0, - "grad_norm": 1.96204567174708, - "language_loss": 0.85527766, - "learning_rate": 2.720856966640801e-07, - "loss": 0.876616, - "num_input_tokens_seen": 300281145, - "step": 13922, - "time_per_iteration": 2.6043357849121094 - }, - { - "auxiliary_loss_clip": 0.01068346, - "auxiliary_loss_mlp": 0.00769545, - "balance_loss_clip": 1.0355072, - "balance_loss_mlp": 1.00012457, - "epoch": 0.837096046896137, - "flos": 23148880485120.0, - "grad_norm": 1.4962135590682923, - "language_loss": 0.71717429, - "learning_rate": 2.71889610027088e-07, - "loss": 0.73555321, - "num_input_tokens_seen": 300301610, - "step": 13923, - "time_per_iteration": 2.6874313354492188 - }, - { - "auxiliary_loss_clip": 0.01082449, - "auxiliary_loss_mlp": 0.01029737, - "balance_loss_clip": 1.03654337, - "balance_loss_mlp": 1.01662445, - "epoch": 0.8371561701488051, - "flos": 24492872695680.0, - "grad_norm": 1.885906995135632, - "language_loss": 0.759628, - "learning_rate": 2.7169358892021433e-07, - "loss": 0.78074992, - "num_input_tokens_seen": 300319420, - "step": 13924, - "time_per_iteration": 2.671105146408081 - }, - { - "auxiliary_loss_clip": 0.01084333, - "auxiliary_loss_mlp": 0.01027444, - "balance_loss_clip": 1.03405309, - "balance_loss_mlp": 1.01530862, - "epoch": 0.837216293401473, - "flos": 29206723161600.0, - "grad_norm": 1.5298544720059444, - "language_loss": 0.64247084, - "learning_rate": 2.7149763335089293e-07, - "loss": 0.66358864, - "num_input_tokens_seen": 300341325, - "step": 13925, - "time_per_iteration": 2.6903226375579834 - }, - { - "auxiliary_loss_clip": 0.01086129, - "auxiliary_loss_mlp": 0.01032025, - "balance_loss_clip": 1.03692162, - "balance_loss_mlp": 1.01949048, - "epoch": 0.837276416654141, - "flos": 25265275781760.0, - "grad_norm": 2.05791983020966, - "language_loss": 0.74643993, - "learning_rate": 2.713017433265543e-07, - "loss": 0.7676214, - "num_input_tokens_seen": 300361620, - "step": 13926, - "time_per_iteration": 2.7113802433013916 - }, - { - "auxiliary_loss_clip": 0.01099802, - "auxiliary_loss_mlp": 0.01036252, - "balance_loss_clip": 1.03915524, - "balance_loss_mlp": 1.02321601, - "epoch": 0.837336539906809, - "flos": 13882024656000.0, - "grad_norm": 5.046746125844788, - "language_loss": 0.71061361, - "learning_rate": 2.711059188546274e-07, - "loss": 0.73197412, - "num_input_tokens_seen": 300378675, - "step": 13927, - "time_per_iteration": 2.5931549072265625 - }, - { - "auxiliary_loss_clip": 0.01001985, - "auxiliary_loss_mlp": 0.01002351, - "balance_loss_clip": 1.00909257, - "balance_loss_mlp": 1.00123012, - "epoch": 0.8373966631594769, - "flos": 68870599044480.0, - "grad_norm": 0.7031143541961051, - "language_loss": 0.58787715, - "learning_rate": 2.7091015994253695e-07, - "loss": 0.60792047, - "num_input_tokens_seen": 300449740, - "step": 13928, - "time_per_iteration": 3.2960739135742188 - }, - { - "auxiliary_loss_clip": 0.01071961, - "auxiliary_loss_mlp": 0.01042566, - "balance_loss_clip": 1.03641248, - "balance_loss_mlp": 1.02829027, - "epoch": 0.8374567864121449, - "flos": 20448972748800.0, - "grad_norm": 1.77650313447611, - "language_loss": 0.69560969, - "learning_rate": 2.707144665977068e-07, - "loss": 0.71675503, - "num_input_tokens_seen": 300470000, - "step": 13929, - "time_per_iteration": 2.6807215213775635 - }, - { - "auxiliary_loss_clip": 0.01098571, - "auxiliary_loss_mlp": 0.01027466, - "balance_loss_clip": 1.03639913, - "balance_loss_mlp": 1.01444805, - "epoch": 0.8375169096648128, - "flos": 41904197101440.0, - "grad_norm": 1.5497858215784133, - "language_loss": 0.66676092, - "learning_rate": 2.705188388275574e-07, - "loss": 0.68802124, - "num_input_tokens_seen": 300494975, - "step": 13930, - "time_per_iteration": 2.861119031906128 - }, - { - "auxiliary_loss_clip": 0.01066352, - "auxiliary_loss_mlp": 0.01027411, - "balance_loss_clip": 1.03803921, - "balance_loss_mlp": 1.01527548, - "epoch": 0.8375770329174809, - "flos": 20009354192640.0, - "grad_norm": 1.6161268751514244, - "language_loss": 0.71101642, - "learning_rate": 2.703232766395067e-07, - "loss": 0.7319541, - "num_input_tokens_seen": 300513175, - "step": 13931, - "time_per_iteration": 2.8232531547546387 - }, - { - "auxiliary_loss_clip": 0.01072605, - "auxiliary_loss_mlp": 0.01032905, - "balance_loss_clip": 1.03191161, - "balance_loss_mlp": 1.02047718, - "epoch": 0.8376371561701488, - "flos": 22783597125120.0, - "grad_norm": 2.2209749295259913, - "language_loss": 0.71790922, - "learning_rate": 2.701277800409705e-07, - "loss": 0.73896432, - "num_input_tokens_seen": 300533770, - "step": 13932, - "time_per_iteration": 2.7237002849578857 - }, - { - "auxiliary_loss_clip": 0.0104491, - "auxiliary_loss_mlp": 0.01034703, - "balance_loss_clip": 1.03452373, - "balance_loss_mlp": 1.02334225, - "epoch": 0.8376972794228168, - "flos": 23914459987200.0, - "grad_norm": 2.3684193578736785, - "language_loss": 0.66962039, - "learning_rate": 2.699323490393628e-07, - "loss": 0.69041657, - "num_input_tokens_seen": 300552995, - "step": 13933, - "time_per_iteration": 2.926781415939331 - }, - { - "auxiliary_loss_clip": 0.01079254, - "auxiliary_loss_mlp": 0.01043444, - "balance_loss_clip": 1.03600967, - "balance_loss_mlp": 1.0309329, - "epoch": 0.8377574026754847, - "flos": 13734718980480.0, - "grad_norm": 1.9196886282794297, - "language_loss": 0.76411772, - "learning_rate": 2.697369836420933e-07, - "loss": 0.78534472, - "num_input_tokens_seen": 300570275, - "step": 13934, - "time_per_iteration": 2.8570826053619385 - }, - { - "auxiliary_loss_clip": 0.01100527, - "auxiliary_loss_mlp": 0.01029597, - "balance_loss_clip": 1.04098976, - "balance_loss_mlp": 1.01738369, - "epoch": 0.8378175259281527, - "flos": 21651333632640.0, - "grad_norm": 1.505075221616912, - "language_loss": 0.77353156, - "learning_rate": 2.6954168385657115e-07, - "loss": 0.79483283, - "num_input_tokens_seen": 300590875, - "step": 13935, - "time_per_iteration": 2.6582868099212646 - }, - { - "auxiliary_loss_clip": 0.01070099, - "auxiliary_loss_mlp": 0.01027318, - "balance_loss_clip": 1.03629911, - "balance_loss_mlp": 1.01469421, - "epoch": 0.8378776491808206, - "flos": 15448806973440.0, - "grad_norm": 5.23588368234973, - "language_loss": 0.56080019, - "learning_rate": 2.6934644969020135e-07, - "loss": 0.58177441, - "num_input_tokens_seen": 300607490, - "step": 13936, - "time_per_iteration": 2.684828042984009 - }, - { - "auxiliary_loss_clip": 0.01090807, - "auxiliary_loss_mlp": 0.0103317, - "balance_loss_clip": 1.03235912, - "balance_loss_mlp": 1.02025938, - "epoch": 0.8379377724334887, - "flos": 14720395069440.0, - "grad_norm": 2.797405460790855, - "language_loss": 0.89294749, - "learning_rate": 2.691512811503882e-07, - "loss": 0.91418725, - "num_input_tokens_seen": 300623635, - "step": 13937, - "time_per_iteration": 2.5899250507354736 - }, - { - "auxiliary_loss_clip": 0.01099019, - "auxiliary_loss_mlp": 0.01026723, - "balance_loss_clip": 1.03668594, - "balance_loss_mlp": 1.01458192, - "epoch": 0.8379978956861566, - "flos": 24535247765760.0, - "grad_norm": 2.44657354170433, - "language_loss": 0.81838822, - "learning_rate": 2.689561782445313e-07, - "loss": 0.83964562, - "num_input_tokens_seen": 300643835, - "step": 13938, - "time_per_iteration": 2.634232521057129 - }, - { - "auxiliary_loss_clip": 0.01101448, - "auxiliary_loss_mlp": 0.01033084, - "balance_loss_clip": 1.03746319, - "balance_loss_mlp": 1.01988757, - "epoch": 0.8380580189388246, - "flos": 18952611045120.0, - "grad_norm": 1.6841429045668255, - "language_loss": 0.7044903, - "learning_rate": 2.6876114098002965e-07, - "loss": 0.72583556, - "num_input_tokens_seen": 300662500, - "step": 13939, - "time_per_iteration": 2.61344575881958 - }, - { - "auxiliary_loss_clip": 0.01078321, - "auxiliary_loss_mlp": 0.01039086, - "balance_loss_clip": 1.03616691, - "balance_loss_mlp": 1.02573454, - "epoch": 0.8381181421914926, - "flos": 26540283922560.0, - "grad_norm": 6.593424997719047, - "language_loss": 0.76224637, - "learning_rate": 2.6856616936428e-07, - "loss": 0.78342044, - "num_input_tokens_seen": 300681480, - "step": 13940, - "time_per_iteration": 2.6947879791259766 - }, - { - "auxiliary_loss_clip": 0.01093556, - "auxiliary_loss_mlp": 0.01034451, - "balance_loss_clip": 1.0350914, - "balance_loss_mlp": 1.02206564, - "epoch": 0.8381782654441605, - "flos": 23291481479040.0, - "grad_norm": 1.6698207376849759, - "language_loss": 0.76370448, - "learning_rate": 2.6837126340467374e-07, - "loss": 0.78498459, - "num_input_tokens_seen": 300699165, - "step": 13941, - "time_per_iteration": 2.629971742630005 - }, - { - "auxiliary_loss_clip": 0.01068862, - "auxiliary_loss_mlp": 0.01030093, - "balance_loss_clip": 1.03516936, - "balance_loss_mlp": 1.01728964, - "epoch": 0.8382383886968285, - "flos": 26758800311040.0, - "grad_norm": 2.4302037617265793, - "language_loss": 0.73204666, - "learning_rate": 2.6817642310860276e-07, - "loss": 0.75303626, - "num_input_tokens_seen": 300714615, - "step": 13942, - "time_per_iteration": 2.741283893585205 - }, - { - "auxiliary_loss_clip": 0.01067172, - "auxiliary_loss_mlp": 0.01039608, - "balance_loss_clip": 1.03562307, - "balance_loss_mlp": 1.02545786, - "epoch": 0.8382985119494964, - "flos": 26104544035200.0, - "grad_norm": 1.6102800053781703, - "language_loss": 0.79528558, - "learning_rate": 2.679816484834554e-07, - "loss": 0.81635338, - "num_input_tokens_seen": 300734860, - "step": 13943, - "time_per_iteration": 2.7648844718933105 - }, - { - "auxiliary_loss_clip": 0.01057583, - "auxiliary_loss_mlp": 0.01030261, - "balance_loss_clip": 1.03292835, - "balance_loss_mlp": 1.01832187, - "epoch": 0.8383586352021645, - "flos": 16435129507200.0, - "grad_norm": 1.9936529414882505, - "language_loss": 0.85062182, - "learning_rate": 2.6778693953661766e-07, - "loss": 0.87150025, - "num_input_tokens_seen": 300752735, - "step": 13944, - "time_per_iteration": 2.702016592025757 - }, - { - "auxiliary_loss_clip": 0.01009407, - "auxiliary_loss_mlp": 0.00750919, - "balance_loss_clip": 1.00603545, - "balance_loss_mlp": 0.99966449, - "epoch": 0.8384187584548324, - "flos": 64195532288640.0, - "grad_norm": 0.6194539078330007, - "language_loss": 0.50268608, - "learning_rate": 2.6759229627547263e-07, - "loss": 0.5202893, - "num_input_tokens_seen": 300820760, - "step": 13945, - "time_per_iteration": 3.2719228267669678 - }, - { - "auxiliary_loss_clip": 0.01067358, - "auxiliary_loss_mlp": 0.01031818, - "balance_loss_clip": 1.03898573, - "balance_loss_mlp": 1.01964068, - "epoch": 0.8384788817075004, - "flos": 22382905933440.0, - "grad_norm": 2.0041104630114726, - "language_loss": 0.64992464, - "learning_rate": 2.673977187074017e-07, - "loss": 0.67091638, - "num_input_tokens_seen": 300840025, - "step": 13946, - "time_per_iteration": 2.7332231998443604 - }, - { - "auxiliary_loss_clip": 0.01060162, - "auxiliary_loss_mlp": 0.01032335, - "balance_loss_clip": 1.03414798, - "balance_loss_mlp": 1.01927578, - "epoch": 0.8385390049601683, - "flos": 29496845312640.0, - "grad_norm": 1.5282176707936672, - "language_loss": 0.67431152, - "learning_rate": 2.672032068397829e-07, - "loss": 0.69523644, - "num_input_tokens_seen": 300860380, - "step": 13947, - "time_per_iteration": 2.8148739337921143 - }, - { - "auxiliary_loss_clip": 0.01084671, - "auxiliary_loss_mlp": 0.01034421, - "balance_loss_clip": 1.03683496, - "balance_loss_mlp": 1.02082467, - "epoch": 0.8385991282128363, - "flos": 32707797799680.0, - "grad_norm": 2.1167215710156566, - "language_loss": 0.70042205, - "learning_rate": 2.6700876067999176e-07, - "loss": 0.72161293, - "num_input_tokens_seen": 300881895, - "step": 13948, - "time_per_iteration": 4.3659327030181885 - }, - { - "auxiliary_loss_clip": 0.01084202, - "auxiliary_loss_mlp": 0.01032949, - "balance_loss_clip": 1.03514576, - "balance_loss_mlp": 1.02195239, - "epoch": 0.8386592514655042, - "flos": 25441022050560.0, - "grad_norm": 2.5602152463146033, - "language_loss": 0.85150999, - "learning_rate": 2.6681438023540194e-07, - "loss": 0.8726815, - "num_input_tokens_seen": 300901575, - "step": 13949, - "time_per_iteration": 4.24399995803833 - }, - { - "auxiliary_loss_clip": 0.01081801, - "auxiliary_loss_mlp": 0.01029076, - "balance_loss_clip": 1.03833914, - "balance_loss_mlp": 1.01670778, - "epoch": 0.8387193747181723, - "flos": 22015898720640.0, - "grad_norm": 4.303340454207266, - "language_loss": 0.69926894, - "learning_rate": 2.66620065513385e-07, - "loss": 0.72037774, - "num_input_tokens_seen": 300919735, - "step": 13950, - "time_per_iteration": 4.277710914611816 - }, - { - "auxiliary_loss_clip": 0.01091242, - "auxiliary_loss_mlp": 0.01029375, - "balance_loss_clip": 1.03648567, - "balance_loss_mlp": 1.01687598, - "epoch": 0.8387794979708402, - "flos": 18150223080960.0, - "grad_norm": 1.697645904301953, - "language_loss": 0.6442672, - "learning_rate": 2.6642581652130913e-07, - "loss": 0.6654734, - "num_input_tokens_seen": 300939150, - "step": 13951, - "time_per_iteration": 2.564544439315796 - }, - { - "auxiliary_loss_clip": 0.01100469, - "auxiliary_loss_mlp": 0.01030012, - "balance_loss_clip": 1.03913283, - "balance_loss_mlp": 1.01795959, - "epoch": 0.8388396212235082, - "flos": 25411216740480.0, - "grad_norm": 1.428691785600865, - "language_loss": 0.69986898, - "learning_rate": 2.662316332665393e-07, - "loss": 0.72117376, - "num_input_tokens_seen": 300959730, - "step": 13952, - "time_per_iteration": 4.1969122886657715 - }, - { - "auxiliary_loss_clip": 0.01096336, - "auxiliary_loss_mlp": 0.01032886, - "balance_loss_clip": 1.03714609, - "balance_loss_mlp": 1.02128077, - "epoch": 0.8388997444761762, - "flos": 22273055164800.0, - "grad_norm": 1.8744192088426839, - "language_loss": 0.72788477, - "learning_rate": 2.6603751575643987e-07, - "loss": 0.74917698, - "num_input_tokens_seen": 300976120, - "step": 13953, - "time_per_iteration": 2.6013376712799072 - }, - { - "auxiliary_loss_clip": 0.01036141, - "auxiliary_loss_mlp": 0.01033482, - "balance_loss_clip": 1.03169441, - "balance_loss_mlp": 1.01992166, - "epoch": 0.8389598677288441, - "flos": 19573219255680.0, - "grad_norm": 1.9148205474215427, - "language_loss": 0.68345833, - "learning_rate": 2.6584346399837176e-07, - "loss": 0.70415455, - "num_input_tokens_seen": 300995080, - "step": 13954, - "time_per_iteration": 2.7236297130584717 - }, - { - "auxiliary_loss_clip": 0.0108771, - "auxiliary_loss_mlp": 0.01035111, - "balance_loss_clip": 1.03767776, - "balance_loss_mlp": 1.0240128, - "epoch": 0.8390199909815121, - "flos": 17384715406080.0, - "grad_norm": 1.7636414088599872, - "language_loss": 0.7324779, - "learning_rate": 2.656494779996932e-07, - "loss": 0.7537061, - "num_input_tokens_seen": 301012920, - "step": 13955, - "time_per_iteration": 2.661045551300049 - }, - { - "auxiliary_loss_clip": 0.01043432, - "auxiliary_loss_mlp": 0.01032322, - "balance_loss_clip": 1.03135204, - "balance_loss_mlp": 1.019346, - "epoch": 0.83908011423418, - "flos": 24639639667200.0, - "grad_norm": 8.047952352869046, - "language_loss": 0.66471386, - "learning_rate": 2.6545555776775995e-07, - "loss": 0.68547142, - "num_input_tokens_seen": 301028875, - "step": 13956, - "time_per_iteration": 2.7914817333221436 - }, - { - "auxiliary_loss_clip": 0.01099865, - "auxiliary_loss_mlp": 0.01036313, - "balance_loss_clip": 1.03744364, - "balance_loss_mlp": 1.02332473, - "epoch": 0.8391402374868481, - "flos": 24718356322560.0, - "grad_norm": 2.4631411130881995, - "language_loss": 0.79544741, - "learning_rate": 2.6526170330992667e-07, - "loss": 0.81680918, - "num_input_tokens_seen": 301050115, - "step": 13957, - "time_per_iteration": 2.7476260662078857 - }, - { - "auxiliary_loss_clip": 0.00983967, - "auxiliary_loss_mlp": 0.01019247, - "balance_loss_clip": 1.01336145, - "balance_loss_mlp": 1.01760185, - "epoch": 0.839200360739516, - "flos": 56871695784960.0, - "grad_norm": 0.7593984964089096, - "language_loss": 0.53379953, - "learning_rate": 2.6506791463354283e-07, - "loss": 0.5538317, - "num_input_tokens_seen": 301114155, - "step": 13958, - "time_per_iteration": 3.488459825515747 - }, - { - "auxiliary_loss_clip": 0.01098132, - "auxiliary_loss_mlp": 0.01032752, - "balance_loss_clip": 1.03722978, - "balance_loss_mlp": 1.01981759, - "epoch": 0.839260483992184, - "flos": 18332792933760.0, - "grad_norm": 1.8527919164572209, - "language_loss": 0.72979414, - "learning_rate": 2.648741917459574e-07, - "loss": 0.75110304, - "num_input_tokens_seen": 301133150, - "step": 13959, - "time_per_iteration": 3.048078775405884 - }, - { - "auxiliary_loss_clip": 0.01075035, - "auxiliary_loss_mlp": 0.01024754, - "balance_loss_clip": 1.037763, - "balance_loss_mlp": 1.01298177, - "epoch": 0.8393206072448519, - "flos": 27087921653760.0, - "grad_norm": 2.0364412666865843, - "language_loss": 0.55541557, - "learning_rate": 2.646805346545169e-07, - "loss": 0.57641345, - "num_input_tokens_seen": 301153600, - "step": 13960, - "time_per_iteration": 2.835035800933838 - }, - { - "auxiliary_loss_clip": 0.01002229, - "auxiliary_loss_mlp": 0.01000998, - "balance_loss_clip": 1.0077697, - "balance_loss_mlp": 1.00003195, - "epoch": 0.8393807304975199, - "flos": 61521192057600.0, - "grad_norm": 0.7763315867784596, - "language_loss": 0.60705209, - "learning_rate": 2.6448694336656397e-07, - "loss": 0.62708437, - "num_input_tokens_seen": 301214335, - "step": 13961, - "time_per_iteration": 3.3663535118103027 - }, - { - "auxiliary_loss_clip": 0.01052805, - "auxiliary_loss_mlp": 0.010396, - "balance_loss_clip": 1.02986741, - "balance_loss_mlp": 1.02657616, - "epoch": 0.8394408537501878, - "flos": 14894848448640.0, - "grad_norm": 2.557584362268972, - "language_loss": 0.68461823, - "learning_rate": 2.642934178894405e-07, - "loss": 0.70554227, - "num_input_tokens_seen": 301228960, - "step": 13962, - "time_per_iteration": 2.6838927268981934 - }, - { - "auxiliary_loss_clip": 0.01077301, - "auxiliary_loss_mlp": 0.01027695, - "balance_loss_clip": 1.03520894, - "balance_loss_mlp": 1.01575601, - "epoch": 0.8395009770028559, - "flos": 17412186332160.0, - "grad_norm": 1.9087314512083013, - "language_loss": 0.72709483, - "learning_rate": 2.640999582304841e-07, - "loss": 0.74814475, - "num_input_tokens_seen": 301245875, - "step": 13963, - "time_per_iteration": 2.7063026428222656 - }, - { - "auxiliary_loss_clip": 0.01085945, - "auxiliary_loss_mlp": 0.01034113, - "balance_loss_clip": 1.035007, - "balance_loss_mlp": 1.02194226, - "epoch": 0.8395611002555238, - "flos": 27924747782400.0, - "grad_norm": 1.5783482209537385, - "language_loss": 0.76520944, - "learning_rate": 2.6390656439703173e-07, - "loss": 0.78640997, - "num_input_tokens_seen": 301265550, - "step": 13964, - "time_per_iteration": 2.7841615676879883 - }, - { - "auxiliary_loss_clip": 0.01089552, - "auxiliary_loss_mlp": 0.01036408, - "balance_loss_clip": 1.03722572, - "balance_loss_mlp": 1.02287793, - "epoch": 0.8396212235081918, - "flos": 11100922225920.0, - "grad_norm": 2.0757447568639633, - "language_loss": 0.78032225, - "learning_rate": 2.637132363964161e-07, - "loss": 0.80158186, - "num_input_tokens_seen": 301282035, - "step": 13965, - "time_per_iteration": 2.67738938331604 - }, - { - "auxiliary_loss_clip": 0.01092348, - "auxiliary_loss_mlp": 0.01032536, - "balance_loss_clip": 1.03630924, - "balance_loss_mlp": 1.02068114, - "epoch": 0.8396813467608598, - "flos": 35735641729920.0, - "grad_norm": 1.499677295305954, - "language_loss": 0.65898132, - "learning_rate": 2.635199742359684e-07, - "loss": 0.68023014, - "num_input_tokens_seen": 301305210, - "step": 13966, - "time_per_iteration": 2.7493228912353516 - }, - { - "auxiliary_loss_clip": 0.01086107, - "auxiliary_loss_mlp": 0.01032854, - "balance_loss_clip": 1.03722155, - "balance_loss_mlp": 1.02049196, - "epoch": 0.8397414700135277, - "flos": 26176724415360.0, - "grad_norm": 1.9100754434512948, - "language_loss": 0.74755192, - "learning_rate": 2.633267779230177e-07, - "loss": 0.76874149, - "num_input_tokens_seen": 301324885, - "step": 13967, - "time_per_iteration": 2.6640665531158447 - }, - { - "auxiliary_loss_clip": 0.01081249, - "auxiliary_loss_mlp": 0.01029478, - "balance_loss_clip": 1.0370276, - "balance_loss_mlp": 1.01756883, - "epoch": 0.8398015932661957, - "flos": 18333116156160.0, - "grad_norm": 1.8492234177580402, - "language_loss": 0.82993788, - "learning_rate": 2.6313364746488974e-07, - "loss": 0.85104513, - "num_input_tokens_seen": 301343070, - "step": 13968, - "time_per_iteration": 2.5900182723999023 - }, - { - "auxiliary_loss_clip": 0.01083656, - "auxiliary_loss_mlp": 0.01032124, - "balance_loss_clip": 1.03804672, - "balance_loss_mlp": 1.01986384, - "epoch": 0.8398617165188637, - "flos": 17379507934080.0, - "grad_norm": 2.094652231343387, - "language_loss": 0.7729916, - "learning_rate": 2.629405828689075e-07, - "loss": 0.7941494, - "num_input_tokens_seen": 301359280, - "step": 13969, - "time_per_iteration": 2.611394166946411 - }, - { - "auxiliary_loss_clip": 0.01090762, - "auxiliary_loss_mlp": 0.01030453, - "balance_loss_clip": 1.03660858, - "balance_loss_mlp": 1.01741195, - "epoch": 0.8399218397715317, - "flos": 22929681738240.0, - "grad_norm": 2.0003618837908244, - "language_loss": 0.77181804, - "learning_rate": 2.627475841423923e-07, - "loss": 0.79303014, - "num_input_tokens_seen": 301376465, - "step": 13970, - "time_per_iteration": 2.6121816635131836 - }, - { - "auxiliary_loss_clip": 0.01087144, - "auxiliary_loss_mlp": 0.01038704, - "balance_loss_clip": 1.03651595, - "balance_loss_mlp": 1.02689075, - "epoch": 0.8399819630241996, - "flos": 23149562843520.0, - "grad_norm": 2.097520356637354, - "language_loss": 0.71949625, - "learning_rate": 2.625546512926633e-07, - "loss": 0.74075466, - "num_input_tokens_seen": 301396000, - "step": 13971, - "time_per_iteration": 2.6382222175598145 - }, - { - "auxiliary_loss_clip": 0.01085619, - "auxiliary_loss_mlp": 0.01031304, - "balance_loss_clip": 1.03411746, - "balance_loss_mlp": 1.01840544, - "epoch": 0.8400420862768676, - "flos": 16397423205120.0, - "grad_norm": 1.7445644136224228, - "language_loss": 0.77706194, - "learning_rate": 2.623617843270358e-07, - "loss": 0.79823112, - "num_input_tokens_seen": 301413160, - "step": 13972, - "time_per_iteration": 2.637141227722168 - }, - { - "auxiliary_loss_clip": 0.01041674, - "auxiliary_loss_mlp": 0.01037854, - "balance_loss_clip": 1.03100634, - "balance_loss_mlp": 1.02458596, - "epoch": 0.8401022095295355, - "flos": 21287486816640.0, - "grad_norm": 1.304807190993185, - "language_loss": 0.68481863, - "learning_rate": 2.6216898325282333e-07, - "loss": 0.70561385, - "num_input_tokens_seen": 301433325, - "step": 13973, - "time_per_iteration": 2.7618348598480225 - }, - { - "auxiliary_loss_clip": 0.01088741, - "auxiliary_loss_mlp": 0.01030742, - "balance_loss_clip": 1.03717828, - "balance_loss_mlp": 1.01786125, - "epoch": 0.8401623327822035, - "flos": 17311313963520.0, - "grad_norm": 2.2621035315363858, - "language_loss": 0.78135633, - "learning_rate": 2.619762480773382e-07, - "loss": 0.80255115, - "num_input_tokens_seen": 301450265, - "step": 13974, - "time_per_iteration": 2.6674814224243164 - }, - { - "auxiliary_loss_clip": 0.01095006, - "auxiliary_loss_mlp": 0.01030636, - "balance_loss_clip": 1.03827214, - "balance_loss_mlp": 1.01826859, - "epoch": 0.8402224560348714, - "flos": 22236677665920.0, - "grad_norm": 1.513610095867281, - "language_loss": 0.7256, - "learning_rate": 2.617835788078868e-07, - "loss": 0.74685645, - "num_input_tokens_seen": 301470760, - "step": 13975, - "time_per_iteration": 2.838907241821289 - }, - { - "auxiliary_loss_clip": 0.01089044, - "auxiliary_loss_mlp": 0.01025952, - "balance_loss_clip": 1.03686631, - "balance_loss_mlp": 1.01353598, - "epoch": 0.8402825792875395, - "flos": 20229953569920.0, - "grad_norm": 7.753004351668279, - "language_loss": 0.72390342, - "learning_rate": 2.6159097545177645e-07, - "loss": 0.74505341, - "num_input_tokens_seen": 301489425, - "step": 13976, - "time_per_iteration": 2.726900100708008 - }, - { - "auxiliary_loss_clip": 0.01107341, - "auxiliary_loss_mlp": 0.00769496, - "balance_loss_clip": 1.03678119, - "balance_loss_mlp": 1.00013971, - "epoch": 0.8403427025402074, - "flos": 23289973107840.0, - "grad_norm": 1.8413083341315597, - "language_loss": 0.71979779, - "learning_rate": 2.61398438016311e-07, - "loss": 0.7385661, - "num_input_tokens_seen": 301508885, - "step": 13977, - "time_per_iteration": 2.630323886871338 - }, - { - "auxiliary_loss_clip": 0.01096098, - "auxiliary_loss_mlp": 0.01032339, - "balance_loss_clip": 1.03397727, - "balance_loss_mlp": 1.02011466, - "epoch": 0.8404028257928754, - "flos": 32675586278400.0, - "grad_norm": 1.7930366312861392, - "language_loss": 0.68852651, - "learning_rate": 2.6120596650879043e-07, - "loss": 0.70981085, - "num_input_tokens_seen": 301533780, - "step": 13978, - "time_per_iteration": 2.7467479705810547 - }, - { - "auxiliary_loss_clip": 0.01071792, - "auxiliary_loss_mlp": 0.01031819, - "balance_loss_clip": 1.03347301, - "balance_loss_mlp": 1.01965952, - "epoch": 0.8404629490455434, - "flos": 16180522928640.0, - "grad_norm": 1.8105667854844871, - "language_loss": 0.77938527, - "learning_rate": 2.610135609365145e-07, - "loss": 0.80042142, - "num_input_tokens_seen": 301551775, - "step": 13979, - "time_per_iteration": 2.6985831260681152 - }, - { - "auxiliary_loss_clip": 0.0109651, - "auxiliary_loss_mlp": 0.01030498, - "balance_loss_clip": 1.04012775, - "balance_loss_mlp": 1.01822543, - "epoch": 0.8405230722982113, - "flos": 15194451790080.0, - "grad_norm": 2.0614080045574714, - "language_loss": 0.77732342, - "learning_rate": 2.60821221306778e-07, - "loss": 0.79859352, - "num_input_tokens_seen": 301570495, - "step": 13980, - "time_per_iteration": 2.5943267345428467 - }, - { - "auxiliary_loss_clip": 0.01073604, - "auxiliary_loss_mlp": 0.01029935, - "balance_loss_clip": 1.03548491, - "balance_loss_mlp": 1.01782358, - "epoch": 0.8405831955508793, - "flos": 27812418975360.0, - "grad_norm": 5.854551943090863, - "language_loss": 0.86627793, - "learning_rate": 2.606289476268757e-07, - "loss": 0.88731331, - "num_input_tokens_seen": 301591705, - "step": 13981, - "time_per_iteration": 2.742199182510376 - }, - { - "auxiliary_loss_clip": 0.01097126, - "auxiliary_loss_mlp": 0.01033594, - "balance_loss_clip": 1.03606057, - "balance_loss_mlp": 1.02131581, - "epoch": 0.8406433188035473, - "flos": 23769452782080.0, - "grad_norm": 2.935607999333321, - "language_loss": 0.67501163, - "learning_rate": 2.6043673990409745e-07, - "loss": 0.69631881, - "num_input_tokens_seen": 301611670, - "step": 13982, - "time_per_iteration": 2.6252353191375732 - }, - { - "auxiliary_loss_clip": 0.01061743, - "auxiliary_loss_mlp": 0.01041459, - "balance_loss_clip": 1.03561366, - "balance_loss_mlp": 1.02742803, - "epoch": 0.8407034420562153, - "flos": 29205681667200.0, - "grad_norm": 3.073966172290324, - "language_loss": 0.67936915, - "learning_rate": 2.602445981457324e-07, - "loss": 0.70040119, - "num_input_tokens_seen": 301632540, - "step": 13983, - "time_per_iteration": 2.7724905014038086 - }, - { - "auxiliary_loss_clip": 0.01069644, - "auxiliary_loss_mlp": 0.01032871, - "balance_loss_clip": 1.03083551, - "balance_loss_mlp": 1.01959062, - "epoch": 0.8407635653088832, - "flos": 26360084367360.0, - "grad_norm": 1.7674861859482776, - "language_loss": 0.79221404, - "learning_rate": 2.6005252235906684e-07, - "loss": 0.8132391, - "num_input_tokens_seen": 301651480, - "step": 13984, - "time_per_iteration": 2.7457640171051025 - }, - { - "auxiliary_loss_clip": 0.01094285, - "auxiliary_loss_mlp": 0.01033616, - "balance_loss_clip": 1.03387666, - "balance_loss_mlp": 1.02156985, - "epoch": 0.8408236885615512, - "flos": 21468799693440.0, - "grad_norm": 2.394750373647798, - "language_loss": 0.59764493, - "learning_rate": 2.598605125513842e-07, - "loss": 0.6189239, - "num_input_tokens_seen": 301670010, - "step": 13985, - "time_per_iteration": 2.6200110912323 - }, - { - "auxiliary_loss_clip": 0.01067816, - "auxiliary_loss_mlp": 0.01029355, - "balance_loss_clip": 1.03496993, - "balance_loss_mlp": 1.01653397, - "epoch": 0.8408838118142191, - "flos": 22963724853120.0, - "grad_norm": 1.5708649671091988, - "language_loss": 0.82083929, - "learning_rate": 2.5966856872996467e-07, - "loss": 0.84181106, - "num_input_tokens_seen": 301689785, - "step": 13986, - "time_per_iteration": 2.728940725326538 - }, - { - "auxiliary_loss_clip": 0.01088746, - "auxiliary_loss_mlp": 0.0077024, - "balance_loss_clip": 1.03921962, - "balance_loss_mlp": 1.00023127, - "epoch": 0.8409439350668871, - "flos": 26800026145920.0, - "grad_norm": 1.4303842163720517, - "language_loss": 0.6583513, - "learning_rate": 2.5947669090208755e-07, - "loss": 0.67694116, - "num_input_tokens_seen": 301712225, - "step": 13987, - "time_per_iteration": 4.393038988113403 - }, - { - "auxiliary_loss_clip": 0.01109413, - "auxiliary_loss_mlp": 0.00770439, - "balance_loss_clip": 1.03814602, - "balance_loss_mlp": 1.00023389, - "epoch": 0.841004058319555, - "flos": 26578672583040.0, - "grad_norm": 2.100312722202425, - "language_loss": 0.67510009, - "learning_rate": 2.5928487907502906e-07, - "loss": 0.69389856, - "num_input_tokens_seen": 301730955, - "step": 13988, - "time_per_iteration": 4.25507664680481 - }, - { - "auxiliary_loss_clip": 0.01099532, - "auxiliary_loss_mlp": 0.01036728, - "balance_loss_clip": 1.04084682, - "balance_loss_mlp": 1.02341866, - "epoch": 0.8410641815722231, - "flos": 14501878680960.0, - "grad_norm": 2.2740432778318143, - "language_loss": 0.81379843, - "learning_rate": 2.590931332560622e-07, - "loss": 0.83516109, - "num_input_tokens_seen": 301746930, - "step": 13989, - "time_per_iteration": 2.584982395172119 - }, - { - "auxiliary_loss_clip": 0.01096831, - "auxiliary_loss_mlp": 0.0103085, - "balance_loss_clip": 1.03519654, - "balance_loss_mlp": 1.01829755, - "epoch": 0.841124304824891, - "flos": 29166682475520.0, - "grad_norm": 1.6804070387823404, - "language_loss": 0.75063282, - "learning_rate": 2.5890145345245826e-07, - "loss": 0.77190965, - "num_input_tokens_seen": 301766945, - "step": 13990, - "time_per_iteration": 4.359278440475464 - }, - { - "auxiliary_loss_clip": 0.01093958, - "auxiliary_loss_mlp": 0.01031281, - "balance_loss_clip": 1.03545666, - "balance_loss_mlp": 1.01897252, - "epoch": 0.841184428077559, - "flos": 22412028885120.0, - "grad_norm": 1.7221123856133072, - "language_loss": 0.80666637, - "learning_rate": 2.5870983967148597e-07, - "loss": 0.82791877, - "num_input_tokens_seen": 301785460, - "step": 13991, - "time_per_iteration": 4.206341743469238 - }, - { - "auxiliary_loss_clip": 0.01070481, - "auxiliary_loss_mlp": 0.01033414, - "balance_loss_clip": 1.0353756, - "balance_loss_mlp": 1.0215528, - "epoch": 0.841244551330227, - "flos": 22962791099520.0, - "grad_norm": 2.0828857174593263, - "language_loss": 0.70396578, - "learning_rate": 2.585182919204105e-07, - "loss": 0.72500479, - "num_input_tokens_seen": 301804180, - "step": 13992, - "time_per_iteration": 2.692427158355713 - }, - { - "auxiliary_loss_clip": 0.01075291, - "auxiliary_loss_mlp": 0.01027217, - "balance_loss_clip": 1.03414965, - "balance_loss_mlp": 1.01490271, - "epoch": 0.8413046745828949, - "flos": 21032736583680.0, - "grad_norm": 3.8503455300269427, - "language_loss": 0.76960343, - "learning_rate": 2.583268102064959e-07, - "loss": 0.79062855, - "num_input_tokens_seen": 301823670, - "step": 13993, - "time_per_iteration": 2.704113006591797 - }, - { - "auxiliary_loss_clip": 0.01102579, - "auxiliary_loss_mlp": 0.01036453, - "balance_loss_clip": 1.0354774, - "balance_loss_mlp": 1.02206421, - "epoch": 0.841364797835563, - "flos": 27052082858880.0, - "grad_norm": 6.277502737271607, - "language_loss": 0.74242276, - "learning_rate": 2.5813539453700393e-07, - "loss": 0.76381308, - "num_input_tokens_seen": 301845890, - "step": 13994, - "time_per_iteration": 2.6997077465057373 - }, - { - "auxiliary_loss_clip": 0.01094097, - "auxiliary_loss_mlp": 0.01029334, - "balance_loss_clip": 1.0345273, - "balance_loss_mlp": 1.01830745, - "epoch": 0.8414249210882309, - "flos": 17895688329600.0, - "grad_norm": 1.5616526510476096, - "language_loss": 0.5941689, - "learning_rate": 2.5794404491919163e-07, - "loss": 0.61540318, - "num_input_tokens_seen": 301863985, - "step": 13995, - "time_per_iteration": 2.6176936626434326 - }, - { - "auxiliary_loss_clip": 0.01095561, - "auxiliary_loss_mlp": 0.01031608, - "balance_loss_clip": 1.035285, - "balance_loss_mlp": 1.01885819, - "epoch": 0.8414850443408989, - "flos": 25441201618560.0, - "grad_norm": 2.764700779392295, - "language_loss": 0.71651798, - "learning_rate": 2.577527613603163e-07, - "loss": 0.73778963, - "num_input_tokens_seen": 301882765, - "step": 13996, - "time_per_iteration": 2.596438407897949 - }, - { - "auxiliary_loss_clip": 0.0108265, - "auxiliary_loss_mlp": 0.01030717, - "balance_loss_clip": 1.03388953, - "balance_loss_mlp": 1.01880229, - "epoch": 0.8415451675935668, - "flos": 23220055284480.0, - "grad_norm": 1.7462285917475873, - "language_loss": 0.64240086, - "learning_rate": 2.5756154386763017e-07, - "loss": 0.66353452, - "num_input_tokens_seen": 301902720, - "step": 13997, - "time_per_iteration": 2.7167398929595947 - }, - { - "auxiliary_loss_clip": 0.01087567, - "auxiliary_loss_mlp": 0.01035998, - "balance_loss_clip": 1.03863931, - "balance_loss_mlp": 1.02296853, - "epoch": 0.8416052908462348, - "flos": 18546496899840.0, - "grad_norm": 1.8459858361991137, - "language_loss": 0.82516265, - "learning_rate": 2.5737039244838565e-07, - "loss": 0.84639835, - "num_input_tokens_seen": 301921245, - "step": 13998, - "time_per_iteration": 2.6906321048736572 - }, - { - "auxiliary_loss_clip": 0.0110001, - "auxiliary_loss_mlp": 0.00769946, - "balance_loss_clip": 1.03833914, - "balance_loss_mlp": 1.00016832, - "epoch": 0.8416654140989027, - "flos": 26105190480000.0, - "grad_norm": 2.037627492824348, - "language_loss": 0.80260479, - "learning_rate": 2.5717930710982984e-07, - "loss": 0.82130432, - "num_input_tokens_seen": 301942320, - "step": 13999, - "time_per_iteration": 2.679971218109131 - }, - { - "auxiliary_loss_clip": 0.01098013, - "auxiliary_loss_mlp": 0.01033879, - "balance_loss_clip": 1.03585649, - "balance_loss_mlp": 1.02033651, - "epoch": 0.8417255373515707, - "flos": 26433270328320.0, - "grad_norm": 2.9164172994343946, - "language_loss": 0.66541272, - "learning_rate": 2.569882878592096e-07, - "loss": 0.68673158, - "num_input_tokens_seen": 301963110, - "step": 14000, - "time_per_iteration": 2.6393961906433105 - }, - { - "auxiliary_loss_clip": 0.011048, - "auxiliary_loss_mlp": 0.01028492, - "balance_loss_clip": 1.03878963, - "balance_loss_mlp": 1.01545656, - "epoch": 0.8417856606042387, - "flos": 24717745791360.0, - "grad_norm": 1.439326835594235, - "language_loss": 0.79285717, - "learning_rate": 2.5679733470376885e-07, - "loss": 0.81419003, - "num_input_tokens_seen": 301984915, - "step": 14001, - "time_per_iteration": 2.6358094215393066 - }, - { - "auxiliary_loss_clip": 0.01045692, - "auxiliary_loss_mlp": 0.01031649, - "balance_loss_clip": 1.0337944, - "balance_loss_mlp": 1.01975203, - "epoch": 0.8418457838569067, - "flos": 20850849089280.0, - "grad_norm": 1.7593384488852517, - "language_loss": 0.78821921, - "learning_rate": 2.5660644765074703e-07, - "loss": 0.80899262, - "num_input_tokens_seen": 302004095, - "step": 14002, - "time_per_iteration": 2.7560184001922607 - }, - { - "auxiliary_loss_clip": 0.01062189, - "auxiliary_loss_mlp": 0.00769355, - "balance_loss_clip": 1.03507459, - "balance_loss_mlp": 1.00019288, - "epoch": 0.8419059071095746, - "flos": 28660629715200.0, - "grad_norm": 1.490278429458478, - "language_loss": 0.78022176, - "learning_rate": 2.5641562670738334e-07, - "loss": 0.79853719, - "num_input_tokens_seen": 302027250, - "step": 14003, - "time_per_iteration": 2.792100429534912 - }, - { - "auxiliary_loss_clip": 0.01083114, - "auxiliary_loss_mlp": 0.01028819, - "balance_loss_clip": 1.03756177, - "balance_loss_mlp": 1.01619506, - "epoch": 0.8419660303622426, - "flos": 21653596189440.0, - "grad_norm": 4.275398079582637, - "language_loss": 0.65523028, - "learning_rate": 2.5622487188091436e-07, - "loss": 0.67634964, - "num_input_tokens_seen": 302046950, - "step": 14004, - "time_per_iteration": 2.676882028579712 - }, - { - "auxiliary_loss_clip": 0.01098301, - "auxiliary_loss_mlp": 0.01032889, - "balance_loss_clip": 1.03571546, - "balance_loss_mlp": 1.01909709, - "epoch": 0.8420261536149106, - "flos": 25301114576640.0, - "grad_norm": 2.012358102157947, - "language_loss": 0.76216292, - "learning_rate": 2.560341831785724e-07, - "loss": 0.7834748, - "num_input_tokens_seen": 302065470, - "step": 14005, - "time_per_iteration": 2.6246840953826904 - }, - { - "auxiliary_loss_clip": 0.01072567, - "auxiliary_loss_mlp": 0.00770849, - "balance_loss_clip": 1.03307796, - "balance_loss_mlp": 1.00026453, - "epoch": 0.8420862768675785, - "flos": 18763397176320.0, - "grad_norm": 1.64958735251114, - "language_loss": 0.77457279, - "learning_rate": 2.5584356060758906e-07, - "loss": 0.7930069, - "num_input_tokens_seen": 302083190, - "step": 14006, - "time_per_iteration": 2.686828136444092 - }, - { - "auxiliary_loss_clip": 0.01098645, - "auxiliary_loss_mlp": 0.01036893, - "balance_loss_clip": 1.03723645, - "balance_loss_mlp": 1.02451313, - "epoch": 0.8421464001202466, - "flos": 18328052338560.0, - "grad_norm": 2.595732898924613, - "language_loss": 0.76791775, - "learning_rate": 2.556530041751932e-07, - "loss": 0.78927308, - "num_input_tokens_seen": 302098820, - "step": 14007, - "time_per_iteration": 2.5972254276275635 - }, - { - "auxiliary_loss_clip": 0.01081698, - "auxiliary_loss_mlp": 0.01034223, - "balance_loss_clip": 1.03605211, - "balance_loss_mlp": 1.02137184, - "epoch": 0.8422065233729145, - "flos": 31537181560320.0, - "grad_norm": 2.375931901998386, - "language_loss": 0.65710688, - "learning_rate": 2.554625138886102e-07, - "loss": 0.67826605, - "num_input_tokens_seen": 302117075, - "step": 14008, - "time_per_iteration": 2.700505256652832 - }, - { - "auxiliary_loss_clip": 0.01019521, - "auxiliary_loss_mlp": 0.01001888, - "balance_loss_clip": 1.00692487, - "balance_loss_mlp": 1.00089824, - "epoch": 0.8422666466255825, - "flos": 64298128510080.0, - "grad_norm": 0.7086546699989251, - "language_loss": 0.5692749, - "learning_rate": 2.552720897550631e-07, - "loss": 0.58948898, - "num_input_tokens_seen": 302179735, - "step": 14009, - "time_per_iteration": 3.2387187480926514 - }, - { - "auxiliary_loss_clip": 0.01039857, - "auxiliary_loss_mlp": 0.01034967, - "balance_loss_clip": 1.03280532, - "balance_loss_mlp": 1.02329016, - "epoch": 0.8423267698782504, - "flos": 24316731377280.0, - "grad_norm": 1.394156026072437, - "language_loss": 0.77893424, - "learning_rate": 2.5508173178177304e-07, - "loss": 0.79968244, - "num_input_tokens_seen": 302202055, - "step": 14010, - "time_per_iteration": 2.8507986068725586 - }, - { - "auxiliary_loss_clip": 0.01113646, - "auxiliary_loss_mlp": 0.01037844, - "balance_loss_clip": 1.03962326, - "balance_loss_mlp": 1.0242126, - "epoch": 0.8423868931309184, - "flos": 18296092212480.0, - "grad_norm": 1.6155120229975741, - "language_loss": 0.72607601, - "learning_rate": 2.548914399759592e-07, - "loss": 0.7475909, - "num_input_tokens_seen": 302221360, - "step": 14011, - "time_per_iteration": 2.614745855331421 - }, - { - "auxiliary_loss_clip": 0.01093355, - "auxiliary_loss_mlp": 0.01039684, - "balance_loss_clip": 1.0365963, - "balance_loss_mlp": 1.02718472, - "epoch": 0.8424470163835863, - "flos": 23550218121600.0, - "grad_norm": 1.762716946245802, - "language_loss": 0.84175313, - "learning_rate": 2.5470121434483636e-07, - "loss": 0.86308348, - "num_input_tokens_seen": 302240715, - "step": 14012, - "time_per_iteration": 2.872255325317383 - }, - { - "auxiliary_loss_clip": 0.01100527, - "auxiliary_loss_mlp": 0.01030747, - "balance_loss_clip": 1.03485525, - "balance_loss_mlp": 1.02031064, - "epoch": 0.8425071396362543, - "flos": 23769488695680.0, - "grad_norm": 1.7021120391885685, - "language_loss": 0.67887056, - "learning_rate": 2.5451105489561884e-07, - "loss": 0.70018327, - "num_input_tokens_seen": 302260950, - "step": 14013, - "time_per_iteration": 2.603848457336426 - }, - { - "auxiliary_loss_clip": 0.01115809, - "auxiliary_loss_mlp": 0.01036207, - "balance_loss_clip": 1.03945398, - "balance_loss_mlp": 1.02304602, - "epoch": 0.8425672628889223, - "flos": 16178906816640.0, - "grad_norm": 3.415074767080469, - "language_loss": 0.78946209, - "learning_rate": 2.5432096163551644e-07, - "loss": 0.81098223, - "num_input_tokens_seen": 302277500, - "step": 14014, - "time_per_iteration": 2.555556297302246 - }, - { - "auxiliary_loss_clip": 0.01077492, - "auxiliary_loss_mlp": 0.00770145, - "balance_loss_clip": 1.03449714, - "balance_loss_mlp": 1.00027716, - "epoch": 0.8426273861415903, - "flos": 23149131880320.0, - "grad_norm": 1.667905320409494, - "language_loss": 0.67027128, - "learning_rate": 2.5413093457173884e-07, - "loss": 0.68874758, - "num_input_tokens_seen": 302297930, - "step": 14015, - "time_per_iteration": 2.7183566093444824 - }, - { - "auxiliary_loss_clip": 0.011092, - "auxiliary_loss_mlp": 0.01031339, - "balance_loss_clip": 1.03803563, - "balance_loss_mlp": 1.0183686, - "epoch": 0.8426875093942582, - "flos": 17457757712640.0, - "grad_norm": 5.511316765631844, - "language_loss": 0.76168728, - "learning_rate": 2.5394097371149036e-07, - "loss": 0.78309268, - "num_input_tokens_seen": 302315735, - "step": 14016, - "time_per_iteration": 2.5260772705078125 - }, - { - "auxiliary_loss_clip": 0.01086806, - "auxiliary_loss_mlp": 0.01032087, - "balance_loss_clip": 1.03610539, - "balance_loss_mlp": 1.01919413, - "epoch": 0.8427476326469262, - "flos": 19640551299840.0, - "grad_norm": 1.8433329789592472, - "language_loss": 0.79657745, - "learning_rate": 2.5375107906197544e-07, - "loss": 0.81776643, - "num_input_tokens_seen": 302332790, - "step": 14017, - "time_per_iteration": 2.630877733230591 - }, - { - "auxiliary_loss_clip": 0.01087127, - "auxiliary_loss_mlp": 0.0103184, - "balance_loss_clip": 1.03714514, - "balance_loss_mlp": 1.02009821, - "epoch": 0.8428077558995941, - "flos": 11941160146560.0, - "grad_norm": 2.433761002198627, - "language_loss": 0.63508832, - "learning_rate": 2.5356125063039525e-07, - "loss": 0.65627795, - "num_input_tokens_seen": 302346490, - "step": 14018, - "time_per_iteration": 2.600435256958008 - }, - { - "auxiliary_loss_clip": 0.0109746, - "auxiliary_loss_mlp": 0.01036009, - "balance_loss_clip": 1.03713536, - "balance_loss_mlp": 1.02413559, - "epoch": 0.8428678791522621, - "flos": 10451729767680.0, - "grad_norm": 2.058952264097869, - "language_loss": 0.79526985, - "learning_rate": 2.5337148842394687e-07, - "loss": 0.81660461, - "num_input_tokens_seen": 302363235, - "step": 14019, - "time_per_iteration": 2.606147289276123 - }, - { - "auxiliary_loss_clip": 0.01066617, - "auxiliary_loss_mlp": 0.01042966, - "balance_loss_clip": 1.0320183, - "balance_loss_mlp": 1.02731419, - "epoch": 0.8429280024049302, - "flos": 28767248259840.0, - "grad_norm": 1.8880951217635216, - "language_loss": 0.78381932, - "learning_rate": 2.531817924498265e-07, - "loss": 0.80491519, - "num_input_tokens_seen": 302383270, - "step": 14020, - "time_per_iteration": 2.761439561843872 - }, - { - "auxiliary_loss_clip": 0.01094532, - "auxiliary_loss_mlp": 0.01027321, - "balance_loss_clip": 1.03691518, - "balance_loss_mlp": 1.01528084, - "epoch": 0.8429881256575981, - "flos": 19537093152000.0, - "grad_norm": 1.619318951916878, - "language_loss": 0.71194899, - "learning_rate": 2.5299216271522805e-07, - "loss": 0.73316747, - "num_input_tokens_seen": 302401355, - "step": 14021, - "time_per_iteration": 2.5756282806396484 - }, - { - "auxiliary_loss_clip": 0.01082102, - "auxiliary_loss_mlp": 0.01039813, - "balance_loss_clip": 1.03787649, - "balance_loss_mlp": 1.02695012, - "epoch": 0.8430482489102661, - "flos": 24790931752320.0, - "grad_norm": 1.6917414821142582, - "language_loss": 0.69565594, - "learning_rate": 2.5280259922734125e-07, - "loss": 0.71687508, - "num_input_tokens_seen": 302419515, - "step": 14022, - "time_per_iteration": 2.654576301574707 - }, - { - "auxiliary_loss_clip": 0.01053571, - "auxiliary_loss_mlp": 0.01034251, - "balance_loss_clip": 1.03549337, - "balance_loss_mlp": 1.02110815, - "epoch": 0.843108372162934, - "flos": 21544248211200.0, - "grad_norm": 2.0704880658750264, - "language_loss": 0.72135806, - "learning_rate": 2.526131019933553e-07, - "loss": 0.74223632, - "num_input_tokens_seen": 302438280, - "step": 14023, - "time_per_iteration": 2.763561248779297 - }, - { - "auxiliary_loss_clip": 0.01097817, - "auxiliary_loss_mlp": 0.01036537, - "balance_loss_clip": 1.03748226, - "balance_loss_mlp": 1.02365077, - "epoch": 0.843168495415602, - "flos": 24608792862720.0, - "grad_norm": 3.1279379432314496, - "language_loss": 0.66840017, - "learning_rate": 2.524236710204559e-07, - "loss": 0.68974364, - "num_input_tokens_seen": 302460860, - "step": 14024, - "time_per_iteration": 2.6798737049102783 - }, - { - "auxiliary_loss_clip": 0.01094098, - "auxiliary_loss_mlp": 0.01033274, - "balance_loss_clip": 1.0358882, - "balance_loss_mlp": 1.02064943, - "epoch": 0.8432286186682699, - "flos": 15122738286720.0, - "grad_norm": 1.7785534436425128, - "language_loss": 0.80463433, - "learning_rate": 2.522343063158261e-07, - "loss": 0.82590806, - "num_input_tokens_seen": 302476980, - "step": 14025, - "time_per_iteration": 2.5957210063934326 - }, - { - "auxiliary_loss_clip": 0.01094269, - "auxiliary_loss_mlp": 0.01032342, - "balance_loss_clip": 1.03669548, - "balance_loss_mlp": 1.02171469, - "epoch": 0.843288741920938, - "flos": 20301882554880.0, - "grad_norm": 1.7599057641282252, - "language_loss": 0.77854842, - "learning_rate": 2.5204500788664606e-07, - "loss": 0.79981452, - "num_input_tokens_seen": 302496380, - "step": 14026, - "time_per_iteration": 4.200474500656128 - }, - { - "auxiliary_loss_clip": 0.01082991, - "auxiliary_loss_mlp": 0.01035909, - "balance_loss_clip": 1.03349411, - "balance_loss_mlp": 1.02325487, - "epoch": 0.8433488651736059, - "flos": 23332096782720.0, - "grad_norm": 1.3875448337644876, - "language_loss": 0.8256402, - "learning_rate": 2.518557757400945e-07, - "loss": 0.84682918, - "num_input_tokens_seen": 302516845, - "step": 14027, - "time_per_iteration": 2.649754524230957 - }, - { - "auxiliary_loss_clip": 0.01083401, - "auxiliary_loss_mlp": 0.01029357, - "balance_loss_clip": 1.03570163, - "balance_loss_mlp": 1.01768661, - "epoch": 0.8434089884262739, - "flos": 39458105844480.0, - "grad_norm": 1.6608095267116312, - "language_loss": 0.56683648, - "learning_rate": 2.5166660988334754e-07, - "loss": 0.58796406, - "num_input_tokens_seen": 302538865, - "step": 14028, - "time_per_iteration": 4.3750526905059814 - }, - { - "auxiliary_loss_clip": 0.01082684, - "auxiliary_loss_mlp": 0.0102599, - "balance_loss_clip": 1.03466272, - "balance_loss_mlp": 1.01414621, - "epoch": 0.8434691116789418, - "flos": 23768842250880.0, - "grad_norm": 2.5757916535354304, - "language_loss": 0.64079869, - "learning_rate": 2.51477510323578e-07, - "loss": 0.66188538, - "num_input_tokens_seen": 302557970, - "step": 14029, - "time_per_iteration": 4.223181962966919 - }, - { - "auxiliary_loss_clip": 0.01105336, - "auxiliary_loss_mlp": 0.0103257, - "balance_loss_clip": 1.037485, - "balance_loss_mlp": 1.02116728, - "epoch": 0.8435292349316098, - "flos": 22671411972480.0, - "grad_norm": 2.654906642720587, - "language_loss": 0.7511518, - "learning_rate": 2.51288477067956e-07, - "loss": 0.77253079, - "num_input_tokens_seen": 302578915, - "step": 14030, - "time_per_iteration": 4.182165145874023 - }, - { - "auxiliary_loss_clip": 0.01087432, - "auxiliary_loss_mlp": 0.01035541, - "balance_loss_clip": 1.03771615, - "balance_loss_mlp": 1.02353668, - "epoch": 0.8435893581842777, - "flos": 18843622202880.0, - "grad_norm": 1.6649016404625991, - "language_loss": 0.83075505, - "learning_rate": 2.510995101236502e-07, - "loss": 0.85198474, - "num_input_tokens_seen": 302596300, - "step": 14031, - "time_per_iteration": 2.6392641067504883 - }, - { - "auxiliary_loss_clip": 0.01084779, - "auxiliary_loss_mlp": 0.01029478, - "balance_loss_clip": 1.03526592, - "balance_loss_mlp": 1.01829624, - "epoch": 0.8436494814369457, - "flos": 20704225772160.0, - "grad_norm": 1.8193190780443504, - "language_loss": 0.80412525, - "learning_rate": 2.509106094978266e-07, - "loss": 0.82526779, - "num_input_tokens_seen": 302614975, - "step": 14032, - "time_per_iteration": 2.640856981277466 - }, - { - "auxiliary_loss_clip": 0.01072594, - "auxiliary_loss_mlp": 0.01035806, - "balance_loss_clip": 1.03194261, - "balance_loss_mlp": 1.02130389, - "epoch": 0.8437096046896138, - "flos": 22674177319680.0, - "grad_norm": 1.5175948868756235, - "language_loss": 0.75642312, - "learning_rate": 2.507217751976478e-07, - "loss": 0.77750713, - "num_input_tokens_seen": 302636415, - "step": 14033, - "time_per_iteration": 2.6690027713775635 - }, - { - "auxiliary_loss_clip": 0.01070256, - "auxiliary_loss_mlp": 0.01037556, - "balance_loss_clip": 1.03320062, - "balance_loss_mlp": 1.02597451, - "epoch": 0.8437697279422817, - "flos": 16180127879040.0, - "grad_norm": 1.777468155857912, - "language_loss": 0.83613944, - "learning_rate": 2.505330072302743e-07, - "loss": 0.85721743, - "num_input_tokens_seen": 302653605, - "step": 14034, - "time_per_iteration": 2.765951156616211 - }, - { - "auxiliary_loss_clip": 0.01074581, - "auxiliary_loss_mlp": 0.01032138, - "balance_loss_clip": 1.03461361, - "balance_loss_mlp": 1.01791012, - "epoch": 0.8438298511949497, - "flos": 28765847629440.0, - "grad_norm": 1.4932135863758922, - "language_loss": 0.78466785, - "learning_rate": 2.503443056028656e-07, - "loss": 0.80573499, - "num_input_tokens_seen": 302673965, - "step": 14035, - "time_per_iteration": 2.76178240776062 - }, - { - "auxiliary_loss_clip": 0.01093632, - "auxiliary_loss_mlp": 0.01036882, - "balance_loss_clip": 1.03451896, - "balance_loss_mlp": 1.02403092, - "epoch": 0.8438899744476176, - "flos": 33724284779520.0, - "grad_norm": 1.361908156116711, - "language_loss": 0.72181344, - "learning_rate": 2.501556703225751e-07, - "loss": 0.74311858, - "num_input_tokens_seen": 302695560, - "step": 14036, - "time_per_iteration": 2.719937562942505 - }, - { - "auxiliary_loss_clip": 0.01103676, - "auxiliary_loss_mlp": 0.01025959, - "balance_loss_clip": 1.03616214, - "balance_loss_mlp": 1.01573718, - "epoch": 0.8439500977002856, - "flos": 25110787386240.0, - "grad_norm": 1.7256131227226181, - "language_loss": 0.69647789, - "learning_rate": 2.49967101396557e-07, - "loss": 0.71777427, - "num_input_tokens_seen": 302713480, - "step": 14037, - "time_per_iteration": 2.581303596496582 - }, - { - "auxiliary_loss_clip": 0.01107935, - "auxiliary_loss_mlp": 0.01026894, - "balance_loss_clip": 1.03714299, - "balance_loss_mlp": 1.01509237, - "epoch": 0.8440102209529535, - "flos": 32850362880000.0, - "grad_norm": 1.8136551952010338, - "language_loss": 0.69107378, - "learning_rate": 2.4977859883196227e-07, - "loss": 0.71242201, - "num_input_tokens_seen": 302736860, - "step": 14038, - "time_per_iteration": 2.6723809242248535 - }, - { - "auxiliary_loss_clip": 0.01051869, - "auxiliary_loss_mlp": 0.01039696, - "balance_loss_clip": 1.03102171, - "balance_loss_mlp": 1.02648771, - "epoch": 0.8440703442056215, - "flos": 23730202195200.0, - "grad_norm": 1.5801528390801436, - "language_loss": 0.76572794, - "learning_rate": 2.49590162635938e-07, - "loss": 0.78664356, - "num_input_tokens_seen": 302757745, - "step": 14039, - "time_per_iteration": 2.721997022628784 - }, - { - "auxiliary_loss_clip": 0.0111525, - "auxiliary_loss_mlp": 0.01028155, - "balance_loss_clip": 1.03972554, - "balance_loss_mlp": 1.01560223, - "epoch": 0.8441304674582895, - "flos": 20193719725440.0, - "grad_norm": 1.8472982875687889, - "language_loss": 0.79579616, - "learning_rate": 2.4940179281563046e-07, - "loss": 0.81723017, - "num_input_tokens_seen": 302774885, - "step": 14040, - "time_per_iteration": 2.531126022338867 - }, - { - "auxiliary_loss_clip": 0.01077191, - "auxiliary_loss_mlp": 0.01039289, - "balance_loss_clip": 1.03676081, - "balance_loss_mlp": 1.02646196, - "epoch": 0.8441905907109575, - "flos": 20219897761920.0, - "grad_norm": 1.9576992195932046, - "language_loss": 0.69267452, - "learning_rate": 2.492134893781821e-07, - "loss": 0.71383929, - "num_input_tokens_seen": 302791035, - "step": 14041, - "time_per_iteration": 2.749387741088867 - }, - { - "auxiliary_loss_clip": 0.01087824, - "auxiliary_loss_mlp": 0.01037084, - "balance_loss_clip": 1.03482831, - "balance_loss_mlp": 1.02474546, - "epoch": 0.8442507139636254, - "flos": 13516453987200.0, - "grad_norm": 1.879973628715058, - "language_loss": 0.68978488, - "learning_rate": 2.490252523307341e-07, - "loss": 0.71103394, - "num_input_tokens_seen": 302808650, - "step": 14042, - "time_per_iteration": 2.656613826751709 - }, - { - "auxiliary_loss_clip": 0.01085316, - "auxiliary_loss_mlp": 0.01033386, - "balance_loss_clip": 1.03706896, - "balance_loss_mlp": 1.02187014, - "epoch": 0.8443108372162934, - "flos": 18220212731520.0, - "grad_norm": 3.089358843777989, - "language_loss": 0.74717695, - "learning_rate": 2.4883708168042373e-07, - "loss": 0.76836395, - "num_input_tokens_seen": 302824605, - "step": 14043, - "time_per_iteration": 2.633385181427002 - }, - { - "auxiliary_loss_clip": 0.01107453, - "auxiliary_loss_mlp": 0.00769638, - "balance_loss_clip": 1.03682041, - "balance_loss_mlp": 1.00010276, - "epoch": 0.8443709604689613, - "flos": 16105110324480.0, - "grad_norm": 2.7829513982165306, - "language_loss": 0.7167477, - "learning_rate": 2.486489774343865e-07, - "loss": 0.73551863, - "num_input_tokens_seen": 302840170, - "step": 14044, - "time_per_iteration": 2.5848615169525146 - }, - { - "auxiliary_loss_clip": 0.01085792, - "auxiliary_loss_mlp": 0.01029904, - "balance_loss_clip": 1.03579986, - "balance_loss_mlp": 1.01772702, - "epoch": 0.8444310837216293, - "flos": 18512130562560.0, - "grad_norm": 1.5511958815264777, - "language_loss": 0.74899876, - "learning_rate": 2.484609395997559e-07, - "loss": 0.77015567, - "num_input_tokens_seen": 302858320, - "step": 14045, - "time_per_iteration": 2.6302268505096436 - }, - { - "auxiliary_loss_clip": 0.01086761, - "auxiliary_loss_mlp": 0.00769733, - "balance_loss_clip": 1.03393674, - "balance_loss_mlp": 1.00021636, - "epoch": 0.8444912069742974, - "flos": 14939845211520.0, - "grad_norm": 1.9839329932661167, - "language_loss": 0.78436804, - "learning_rate": 2.4827296818366216e-07, - "loss": 0.80293298, - "num_input_tokens_seen": 302875255, - "step": 14046, - "time_per_iteration": 2.6413092613220215 - }, - { - "auxiliary_loss_clip": 0.01081685, - "auxiliary_loss_mlp": 0.01035179, - "balance_loss_clip": 1.03393447, - "balance_loss_mlp": 1.02033782, - "epoch": 0.8445513302269653, - "flos": 20120318282880.0, - "grad_norm": 2.2863023721610842, - "language_loss": 0.7816503, - "learning_rate": 2.4808506319323255e-07, - "loss": 0.80281889, - "num_input_tokens_seen": 302894690, - "step": 14047, - "time_per_iteration": 2.6660380363464355 - }, - { - "auxiliary_loss_clip": 0.01086084, - "auxiliary_loss_mlp": 0.0103182, - "balance_loss_clip": 1.03934455, - "balance_loss_mlp": 1.01988101, - "epoch": 0.8446114534796333, - "flos": 31170928533120.0, - "grad_norm": 1.8722756124069524, - "language_loss": 0.72262931, - "learning_rate": 2.478972246355935e-07, - "loss": 0.74380839, - "num_input_tokens_seen": 302912405, - "step": 14048, - "time_per_iteration": 2.750633716583252 - }, - { - "auxiliary_loss_clip": 0.01029086, - "auxiliary_loss_mlp": 0.01033735, - "balance_loss_clip": 1.03568673, - "balance_loss_mlp": 1.02149785, - "epoch": 0.8446715767323012, - "flos": 23948323534080.0, - "grad_norm": 1.6779262102032728, - "language_loss": 0.73663235, - "learning_rate": 2.477094525178667e-07, - "loss": 0.75726056, - "num_input_tokens_seen": 302932525, - "step": 14049, - "time_per_iteration": 3.1203606128692627 - }, - { - "auxiliary_loss_clip": 0.01019667, - "auxiliary_loss_mlp": 0.00751068, - "balance_loss_clip": 1.00710368, - "balance_loss_mlp": 0.99964279, - "epoch": 0.8447316999849692, - "flos": 67984897484160.0, - "grad_norm": 0.890275680771782, - "language_loss": 0.60581625, - "learning_rate": 2.475217468471729e-07, - "loss": 0.62352359, - "num_input_tokens_seen": 302991285, - "step": 14050, - "time_per_iteration": 3.2392361164093018 - }, - { - "auxiliary_loss_clip": 0.01082426, - "auxiliary_loss_mlp": 0.00772259, - "balance_loss_clip": 1.03367877, - "balance_loss_mlp": 1.00022018, - "epoch": 0.8447918232376371, - "flos": 22418924296320.0, - "grad_norm": 2.8956487608781036, - "language_loss": 0.72659081, - "learning_rate": 2.473341076306303e-07, - "loss": 0.74513769, - "num_input_tokens_seen": 303009515, - "step": 14051, - "time_per_iteration": 2.6661341190338135 - }, - { - "auxiliary_loss_clip": 0.01095777, - "auxiliary_loss_mlp": 0.01027878, - "balance_loss_clip": 1.03622103, - "balance_loss_mlp": 1.01606417, - "epoch": 0.8448519464903052, - "flos": 23694147918720.0, - "grad_norm": 1.811318817116214, - "language_loss": 0.74613708, - "learning_rate": 2.471465348753547e-07, - "loss": 0.76737368, - "num_input_tokens_seen": 303026905, - "step": 14052, - "time_per_iteration": 2.7032968997955322 - }, - { - "auxiliary_loss_clip": 0.0107808, - "auxiliary_loss_mlp": 0.01028694, - "balance_loss_clip": 1.03693604, - "balance_loss_mlp": 1.01800132, - "epoch": 0.8449120697429731, - "flos": 13735904129280.0, - "grad_norm": 2.027055068247027, - "language_loss": 0.73807508, - "learning_rate": 2.469590285884575e-07, - "loss": 0.75914282, - "num_input_tokens_seen": 303045245, - "step": 14053, - "time_per_iteration": 2.6814658641815186 - }, - { - "auxiliary_loss_clip": 0.01092814, - "auxiliary_loss_mlp": 0.01029071, - "balance_loss_clip": 1.03634143, - "balance_loss_mlp": 1.01660776, - "epoch": 0.8449721929956411, - "flos": 20886795624960.0, - "grad_norm": 1.652927903849228, - "language_loss": 0.73763537, - "learning_rate": 2.467715887770494e-07, - "loss": 0.75885427, - "num_input_tokens_seen": 303065205, - "step": 14054, - "time_per_iteration": 2.7116918563842773 - }, - { - "auxiliary_loss_clip": 0.01101862, - "auxiliary_loss_mlp": 0.0103134, - "balance_loss_clip": 1.03724992, - "balance_loss_mlp": 1.01904297, - "epoch": 0.845032316248309, - "flos": 33216939129600.0, - "grad_norm": 1.366097555200395, - "language_loss": 0.77969533, - "learning_rate": 2.4658421544823895e-07, - "loss": 0.8010273, - "num_input_tokens_seen": 303088250, - "step": 14055, - "time_per_iteration": 2.73816180229187 - }, - { - "auxiliary_loss_clip": 0.01096569, - "auxiliary_loss_mlp": 0.01036149, - "balance_loss_clip": 1.03655601, - "balance_loss_mlp": 1.02406144, - "epoch": 0.845092439500977, - "flos": 23585230903680.0, - "grad_norm": 1.670368654954579, - "language_loss": 0.72893804, - "learning_rate": 2.463969086091302e-07, - "loss": 0.75026524, - "num_input_tokens_seen": 303109280, - "step": 14056, - "time_per_iteration": 2.70538592338562 - }, - { - "auxiliary_loss_clip": 0.01102046, - "auxiliary_loss_mlp": 0.01036064, - "balance_loss_clip": 1.03741002, - "balance_loss_mlp": 1.02301073, - "epoch": 0.8451525627536449, - "flos": 13333920048000.0, - "grad_norm": 2.25806090872676, - "language_loss": 0.67116416, - "learning_rate": 2.4620966826682686e-07, - "loss": 0.69254524, - "num_input_tokens_seen": 303126075, - "step": 14057, - "time_per_iteration": 2.6500983238220215 - }, - { - "auxiliary_loss_clip": 0.01061896, - "auxiliary_loss_mlp": 0.01031163, - "balance_loss_clip": 1.03401434, - "balance_loss_mlp": 1.01830649, - "epoch": 0.8452126860063129, - "flos": 27817985583360.0, - "grad_norm": 14.445297752057405, - "language_loss": 0.77819413, - "learning_rate": 2.460224944284284e-07, - "loss": 0.79912472, - "num_input_tokens_seen": 303146920, - "step": 14058, - "time_per_iteration": 2.7543530464172363 - }, - { - "auxiliary_loss_clip": 0.01110927, - "auxiliary_loss_mlp": 0.0103341, - "balance_loss_clip": 1.03813863, - "balance_loss_mlp": 1.02150726, - "epoch": 0.845272809258981, - "flos": 27124694202240.0, - "grad_norm": 1.550470920076943, - "language_loss": 0.69772273, - "learning_rate": 2.45835387101033e-07, - "loss": 0.7191661, - "num_input_tokens_seen": 303167885, - "step": 14059, - "time_per_iteration": 2.6287412643432617 - }, - { - "auxiliary_loss_clip": 0.01112261, - "auxiliary_loss_mlp": 0.01036279, - "balance_loss_clip": 1.03764248, - "balance_loss_mlp": 1.02282071, - "epoch": 0.8453329325116489, - "flos": 18332577452160.0, - "grad_norm": 2.0803365731251278, - "language_loss": 0.57748783, - "learning_rate": 2.4564834629173516e-07, - "loss": 0.59897316, - "num_input_tokens_seen": 303185000, - "step": 14060, - "time_per_iteration": 2.5504209995269775 - }, - { - "auxiliary_loss_clip": 0.0108835, - "auxiliary_loss_mlp": 0.01037716, - "balance_loss_clip": 1.0332464, - "balance_loss_mlp": 1.02378607, - "epoch": 0.8453930557643169, - "flos": 22675254727680.0, - "grad_norm": 1.5991086812981428, - "language_loss": 0.756387, - "learning_rate": 2.454613720076277e-07, - "loss": 0.77764767, - "num_input_tokens_seen": 303205210, - "step": 14061, - "time_per_iteration": 2.6448512077331543 - }, - { - "auxiliary_loss_clip": 0.010831, - "auxiliary_loss_mlp": 0.0102804, - "balance_loss_clip": 1.0347625, - "balance_loss_mlp": 1.01493907, - "epoch": 0.8454531790169848, - "flos": 22487261921280.0, - "grad_norm": 2.9339159034227134, - "language_loss": 0.71316195, - "learning_rate": 2.452744642558013e-07, - "loss": 0.73427337, - "num_input_tokens_seen": 303224655, - "step": 14062, - "time_per_iteration": 2.6758151054382324 - }, - { - "auxiliary_loss_clip": 0.00988143, - "auxiliary_loss_mlp": 0.00999448, - "balance_loss_clip": 1.01111102, - "balance_loss_mlp": 0.99836904, - "epoch": 0.8455133022696528, - "flos": 58277848481280.0, - "grad_norm": 0.6380346194484136, - "language_loss": 0.52619612, - "learning_rate": 2.450876230433432e-07, - "loss": 0.54607201, - "num_input_tokens_seen": 303289645, - "step": 14063, - "time_per_iteration": 3.2946317195892334 - }, - { - "auxiliary_loss_clip": 0.01065561, - "auxiliary_loss_mlp": 0.01025729, - "balance_loss_clip": 1.03565407, - "balance_loss_mlp": 1.01469028, - "epoch": 0.8455734255223207, - "flos": 21361283308800.0, - "grad_norm": 1.9822663620489593, - "language_loss": 0.82145214, - "learning_rate": 2.449008483773378e-07, - "loss": 0.84236503, - "num_input_tokens_seen": 303308350, - "step": 14064, - "time_per_iteration": 2.6607656478881836 - }, - { - "auxiliary_loss_clip": 0.01101966, - "auxiliary_loss_mlp": 0.01033412, - "balance_loss_clip": 1.03909707, - "balance_loss_mlp": 1.02036476, - "epoch": 0.8456335487749888, - "flos": 20449260057600.0, - "grad_norm": 1.7113113897930685, - "language_loss": 0.72365153, - "learning_rate": 2.447141402648685e-07, - "loss": 0.74500531, - "num_input_tokens_seen": 303325230, - "step": 14065, - "time_per_iteration": 4.209578037261963 - }, - { - "auxiliary_loss_clip": 0.0107366, - "auxiliary_loss_mlp": 0.01027599, - "balance_loss_clip": 1.03522468, - "balance_loss_mlp": 1.01634598, - "epoch": 0.8456936720276567, - "flos": 28840901097600.0, - "grad_norm": 1.7659052654385863, - "language_loss": 0.77673888, - "learning_rate": 2.445274987130146e-07, - "loss": 0.79775143, - "num_input_tokens_seen": 303345810, - "step": 14066, - "time_per_iteration": 2.7587270736694336 - }, - { - "auxiliary_loss_clip": 0.01072656, - "auxiliary_loss_mlp": 0.01030619, - "balance_loss_clip": 1.03802919, - "balance_loss_mlp": 1.01832891, - "epoch": 0.8457537952803247, - "flos": 22672884430080.0, - "grad_norm": 1.438108024739344, - "language_loss": 0.69719791, - "learning_rate": 2.4434092372885363e-07, - "loss": 0.71823066, - "num_input_tokens_seen": 303365140, - "step": 14067, - "time_per_iteration": 4.787655353546143 - }, - { - "auxiliary_loss_clip": 0.01071366, - "auxiliary_loss_mlp": 0.01028654, - "balance_loss_clip": 1.03298759, - "balance_loss_mlp": 1.01651883, - "epoch": 0.8458139185329926, - "flos": 33802929607680.0, - "grad_norm": 2.205817987023731, - "language_loss": 0.71166551, - "learning_rate": 2.4415441531946144e-07, - "loss": 0.73266566, - "num_input_tokens_seen": 303386150, - "step": 14068, - "time_per_iteration": 4.351239204406738 - }, - { - "auxiliary_loss_clip": 0.00992733, - "auxiliary_loss_mlp": 0.01001464, - "balance_loss_clip": 1.00806594, - "balance_loss_mlp": 1.00047481, - "epoch": 0.8458740417856606, - "flos": 70295929603200.0, - "grad_norm": 0.6926239489661511, - "language_loss": 0.604882, - "learning_rate": 2.4396797349190976e-07, - "loss": 0.62482405, - "num_input_tokens_seen": 303453770, - "step": 14069, - "time_per_iteration": 5.011239290237427 - }, - { - "auxiliary_loss_clip": 0.01085111, - "auxiliary_loss_mlp": 0.01030137, - "balance_loss_clip": 1.0371033, - "balance_loss_mlp": 1.01844859, - "epoch": 0.8459341650383285, - "flos": 24170862245760.0, - "grad_norm": 1.60147052022308, - "language_loss": 0.74564326, - "learning_rate": 2.4378159825326804e-07, - "loss": 0.76679569, - "num_input_tokens_seen": 303474520, - "step": 14070, - "time_per_iteration": 2.651233196258545 - }, - { - "auxiliary_loss_clip": 0.01061032, - "auxiliary_loss_mlp": 0.01030467, - "balance_loss_clip": 1.03419256, - "balance_loss_mlp": 1.01784265, - "epoch": 0.8459942882909965, - "flos": 38181158369280.0, - "grad_norm": 3.793310499972189, - "language_loss": 0.66902626, - "learning_rate": 2.435952896106039e-07, - "loss": 0.68994129, - "num_input_tokens_seen": 303497345, - "step": 14071, - "time_per_iteration": 2.863940954208374 - }, - { - "auxiliary_loss_clip": 0.01019962, - "auxiliary_loss_mlp": 0.00751058, - "balance_loss_clip": 1.00760365, - "balance_loss_mlp": 0.99957699, - "epoch": 0.8460544115436646, - "flos": 64118252177280.0, - "grad_norm": 0.7316338741504227, - "language_loss": 0.61046565, - "learning_rate": 2.4340904757098313e-07, - "loss": 0.62817585, - "num_input_tokens_seen": 303554890, - "step": 14072, - "time_per_iteration": 3.041468858718872 - }, - { - "auxiliary_loss_clip": 0.01069973, - "auxiliary_loss_mlp": 0.0103143, - "balance_loss_clip": 1.0375272, - "balance_loss_mlp": 1.01801336, - "epoch": 0.8461145347963325, - "flos": 24170826332160.0, - "grad_norm": 2.698687896203199, - "language_loss": 0.72609383, - "learning_rate": 2.4322287214146664e-07, - "loss": 0.74710786, - "num_input_tokens_seen": 303574380, - "step": 14073, - "time_per_iteration": 2.7544729709625244 - }, - { - "auxiliary_loss_clip": 0.01091999, - "auxiliary_loss_mlp": 0.01034187, - "balance_loss_clip": 1.0379231, - "balance_loss_mlp": 1.02047241, - "epoch": 0.8461746580490005, - "flos": 34893787697280.0, - "grad_norm": 2.1945547327589976, - "language_loss": 0.78341836, - "learning_rate": 2.430367633291155e-07, - "loss": 0.80468023, - "num_input_tokens_seen": 303594910, - "step": 14074, - "time_per_iteration": 2.8085241317749023 - }, - { - "auxiliary_loss_clip": 0.01099175, - "auxiliary_loss_mlp": 0.01030891, - "balance_loss_clip": 1.03856003, - "balance_loss_mlp": 1.01867247, - "epoch": 0.8462347813016684, - "flos": 25557014044800.0, - "grad_norm": 2.0589509569637143, - "language_loss": 0.75481176, - "learning_rate": 2.4285072114098583e-07, - "loss": 0.77611244, - "num_input_tokens_seen": 303613520, - "step": 14075, - "time_per_iteration": 2.6737287044525146 - }, - { - "auxiliary_loss_clip": 0.01084327, - "auxiliary_loss_mlp": 0.01026954, - "balance_loss_clip": 1.03593063, - "balance_loss_mlp": 1.0144732, - "epoch": 0.8462949045543364, - "flos": 21325336773120.0, - "grad_norm": 2.312181037526072, - "language_loss": 0.73324478, - "learning_rate": 2.4266474558413355e-07, - "loss": 0.75435758, - "num_input_tokens_seen": 303631225, - "step": 14076, - "time_per_iteration": 2.6550984382629395 - }, - { - "auxiliary_loss_clip": 0.01091988, - "auxiliary_loss_mlp": 0.01032969, - "balance_loss_clip": 1.03691387, - "balance_loss_mlp": 1.02119696, - "epoch": 0.8463550278070043, - "flos": 22637440684800.0, - "grad_norm": 2.1680045577224543, - "language_loss": 0.78016102, - "learning_rate": 2.4247883666560945e-07, - "loss": 0.80141062, - "num_input_tokens_seen": 303649175, - "step": 14077, - "time_per_iteration": 2.7090940475463867 - }, - { - "auxiliary_loss_clip": 0.01075749, - "auxiliary_loss_mlp": 0.0103478, - "balance_loss_clip": 1.03529286, - "balance_loss_mlp": 1.022102, - "epoch": 0.8464151510596724, - "flos": 13005588804480.0, - "grad_norm": 3.514395307267888, - "language_loss": 0.75203717, - "learning_rate": 2.422929943924643e-07, - "loss": 0.77314246, - "num_input_tokens_seen": 303665915, - "step": 14078, - "time_per_iteration": 2.668720245361328 - }, - { - "auxiliary_loss_clip": 0.01069196, - "auxiliary_loss_mlp": 0.01025183, - "balance_loss_clip": 1.0366447, - "balance_loss_mlp": 1.01232052, - "epoch": 0.8464752743123403, - "flos": 15704921923200.0, - "grad_norm": 3.2183911644001237, - "language_loss": 0.85171533, - "learning_rate": 2.4210721877174565e-07, - "loss": 0.87265909, - "num_input_tokens_seen": 303679985, - "step": 14079, - "time_per_iteration": 2.7119951248168945 - }, - { - "auxiliary_loss_clip": 0.01084378, - "auxiliary_loss_mlp": 0.01037369, - "balance_loss_clip": 1.03693473, - "balance_loss_mlp": 1.02382135, - "epoch": 0.8465353975650083, - "flos": 21653955325440.0, - "grad_norm": 1.8047778580763019, - "language_loss": 0.5904004, - "learning_rate": 2.419215098104965e-07, - "loss": 0.61161786, - "num_input_tokens_seen": 303698470, - "step": 14080, - "time_per_iteration": 2.6963582038879395 - }, - { - "auxiliary_loss_clip": 0.01084298, - "auxiliary_loss_mlp": 0.01029709, - "balance_loss_clip": 1.03614235, - "balance_loss_mlp": 1.01678658, - "epoch": 0.8465955208176762, - "flos": 18515650095360.0, - "grad_norm": 2.4057964944506174, - "language_loss": 0.65874493, - "learning_rate": 2.4173586751576014e-07, - "loss": 0.67988491, - "num_input_tokens_seen": 303716415, - "step": 14081, - "time_per_iteration": 2.68113112449646 - }, - { - "auxiliary_loss_clip": 0.01096638, - "auxiliary_loss_mlp": 0.01035444, - "balance_loss_clip": 1.03579867, - "balance_loss_mlp": 1.0226047, - "epoch": 0.8466556440703442, - "flos": 24200559815040.0, - "grad_norm": 1.8302638990848867, - "language_loss": 0.72922373, - "learning_rate": 2.41550291894576e-07, - "loss": 0.75054455, - "num_input_tokens_seen": 303734490, - "step": 14082, - "time_per_iteration": 2.6815195083618164 - }, - { - "auxiliary_loss_clip": 0.01055673, - "auxiliary_loss_mlp": 0.01036867, - "balance_loss_clip": 1.03194714, - "balance_loss_mlp": 1.02327132, - "epoch": 0.8467157673230121, - "flos": 20375894528640.0, - "grad_norm": 2.0055254774422666, - "language_loss": 0.76295221, - "learning_rate": 2.413647829539809e-07, - "loss": 0.78387761, - "num_input_tokens_seen": 303752310, - "step": 14083, - "time_per_iteration": 2.7438621520996094 - }, - { - "auxiliary_loss_clip": 0.01061542, - "auxiliary_loss_mlp": 0.01034056, - "balance_loss_clip": 1.03273213, - "balance_loss_mlp": 1.01982224, - "epoch": 0.8467758905756801, - "flos": 28473642489600.0, - "grad_norm": 1.7272953079134175, - "language_loss": 0.65860844, - "learning_rate": 2.411793407010092e-07, - "loss": 0.67956436, - "num_input_tokens_seen": 303776065, - "step": 14084, - "time_per_iteration": 2.7735166549682617 - }, - { - "auxiliary_loss_clip": 0.01067401, - "auxiliary_loss_mlp": 0.01030615, - "balance_loss_clip": 1.03815413, - "balance_loss_mlp": 1.01846731, - "epoch": 0.8468360138283482, - "flos": 11692551139200.0, - "grad_norm": 2.7040264249801766, - "language_loss": 0.69605839, - "learning_rate": 2.409939651426938e-07, - "loss": 0.71703851, - "num_input_tokens_seen": 303793500, - "step": 14085, - "time_per_iteration": 2.773153781890869 - }, - { - "auxiliary_loss_clip": 0.01066275, - "auxiliary_loss_mlp": 0.0102846, - "balance_loss_clip": 1.03402877, - "balance_loss_mlp": 1.01666391, - "epoch": 0.8468961370810161, - "flos": 24607859109120.0, - "grad_norm": 1.5869517214902362, - "language_loss": 0.71034825, - "learning_rate": 2.408086562860634e-07, - "loss": 0.73129559, - "num_input_tokens_seen": 303814835, - "step": 14086, - "time_per_iteration": 2.778090476989746 - }, - { - "auxiliary_loss_clip": 0.01091608, - "auxiliary_loss_mlp": 0.01032202, - "balance_loss_clip": 1.03516686, - "balance_loss_mlp": 1.01981008, - "epoch": 0.8469562603336841, - "flos": 19609812236160.0, - "grad_norm": 1.9986258796414704, - "language_loss": 0.74891198, - "learning_rate": 2.4062341413814445e-07, - "loss": 0.77015007, - "num_input_tokens_seen": 303834505, - "step": 14087, - "time_per_iteration": 2.659958600997925 - }, - { - "auxiliary_loss_clip": 0.01080191, - "auxiliary_loss_mlp": 0.01026574, - "balance_loss_clip": 1.03761494, - "balance_loss_mlp": 1.01394975, - "epoch": 0.847016383586352, - "flos": 22638949056000.0, - "grad_norm": 1.3437593766156344, - "language_loss": 0.74087977, - "learning_rate": 2.4043823870596227e-07, - "loss": 0.76194739, - "num_input_tokens_seen": 303855050, - "step": 14088, - "time_per_iteration": 2.820697784423828 - }, - { - "auxiliary_loss_clip": 0.01099232, - "auxiliary_loss_mlp": 0.01032056, - "balance_loss_clip": 1.03691757, - "balance_loss_mlp": 1.01979518, - "epoch": 0.84707650683902, - "flos": 20960161153920.0, - "grad_norm": 2.494250359435125, - "language_loss": 0.7231648, - "learning_rate": 2.402531299965387e-07, - "loss": 0.74447769, - "num_input_tokens_seen": 303875635, - "step": 14089, - "time_per_iteration": 2.6343815326690674 - }, - { - "auxiliary_loss_clip": 0.01108775, - "auxiliary_loss_mlp": 0.01028953, - "balance_loss_clip": 1.03952324, - "balance_loss_mlp": 1.01720452, - "epoch": 0.8471366300916879, - "flos": 24093007516800.0, - "grad_norm": 1.3722946087239658, - "language_loss": 0.79204518, - "learning_rate": 2.400680880168928e-07, - "loss": 0.81342244, - "num_input_tokens_seen": 303896750, - "step": 14090, - "time_per_iteration": 2.6099236011505127 - }, - { - "auxiliary_loss_clip": 0.01053519, - "auxiliary_loss_mlp": 0.01040145, - "balance_loss_clip": 1.03225899, - "balance_loss_mlp": 1.02603018, - "epoch": 0.847196753344356, - "flos": 18332900674560.0, - "grad_norm": 2.9954684546587553, - "language_loss": 0.76710737, - "learning_rate": 2.3988311277404085e-07, - "loss": 0.78804398, - "num_input_tokens_seen": 303915435, - "step": 14091, - "time_per_iteration": 2.780735492706299 - }, - { - "auxiliary_loss_clip": 0.01028625, - "auxiliary_loss_mlp": 0.01002892, - "balance_loss_clip": 1.00622869, - "balance_loss_mlp": 1.00184846, - "epoch": 0.8472568765970239, - "flos": 49567536956160.0, - "grad_norm": 0.8179269563899582, - "language_loss": 0.59413207, - "learning_rate": 2.396982042749982e-07, - "loss": 0.61444724, - "num_input_tokens_seen": 303977245, - "step": 14092, - "time_per_iteration": 3.1960132122039795 - }, - { - "auxiliary_loss_clip": 0.01081941, - "auxiliary_loss_mlp": 0.01036569, - "balance_loss_clip": 1.03254557, - "balance_loss_mlp": 1.02275276, - "epoch": 0.8473169998496919, - "flos": 19279074781440.0, - "grad_norm": 1.7883321120809321, - "language_loss": 0.70245391, - "learning_rate": 2.395133625267756e-07, - "loss": 0.72363901, - "num_input_tokens_seen": 303996055, - "step": 14093, - "time_per_iteration": 2.6437125205993652 - }, - { - "auxiliary_loss_clip": 0.01105171, - "auxiliary_loss_mlp": 0.01025923, - "balance_loss_clip": 1.03583193, - "balance_loss_mlp": 1.01443684, - "epoch": 0.8473771231023598, - "flos": 17675555829120.0, - "grad_norm": 2.1012559182302866, - "language_loss": 0.83147365, - "learning_rate": 2.3932858753638263e-07, - "loss": 0.85278457, - "num_input_tokens_seen": 304012205, - "step": 14094, - "time_per_iteration": 2.5802862644195557 - }, - { - "auxiliary_loss_clip": 0.01089017, - "auxiliary_loss_mlp": 0.01031094, - "balance_loss_clip": 1.03741288, - "balance_loss_mlp": 1.01977515, - "epoch": 0.8474372463550278, - "flos": 26359761144960.0, - "grad_norm": 2.04303085122746, - "language_loss": 0.71497333, - "learning_rate": 2.3914387931082626e-07, - "loss": 0.7361744, - "num_input_tokens_seen": 304033475, - "step": 14095, - "time_per_iteration": 2.6501832008361816 - }, - { - "auxiliary_loss_clip": 0.01094545, - "auxiliary_loss_mlp": 0.00769791, - "balance_loss_clip": 1.03552461, - "balance_loss_mlp": 1.00019312, - "epoch": 0.8474973696076957, - "flos": 23402050519680.0, - "grad_norm": 1.904609327228077, - "language_loss": 0.80488968, - "learning_rate": 2.3895923785711105e-07, - "loss": 0.82353306, - "num_input_tokens_seen": 304051845, - "step": 14096, - "time_per_iteration": 2.644343376159668 - }, - { - "auxiliary_loss_clip": 0.0110016, - "auxiliary_loss_mlp": 0.01030989, - "balance_loss_clip": 1.03743386, - "balance_loss_mlp": 1.01790023, - "epoch": 0.8475574928603637, - "flos": 25075666863360.0, - "grad_norm": 8.034016369371804, - "language_loss": 0.77681863, - "learning_rate": 2.387746631822374e-07, - "loss": 0.79813015, - "num_input_tokens_seen": 304069965, - "step": 14097, - "time_per_iteration": 2.793025255203247 - }, - { - "auxiliary_loss_clip": 0.0107883, - "auxiliary_loss_mlp": 0.0102726, - "balance_loss_clip": 1.03687394, - "balance_loss_mlp": 1.01560712, - "epoch": 0.8476176161130318, - "flos": 19966691813760.0, - "grad_norm": 1.7033024624087645, - "language_loss": 0.802845, - "learning_rate": 2.385901552932048e-07, - "loss": 0.82390594, - "num_input_tokens_seen": 304086805, - "step": 14098, - "time_per_iteration": 2.675039052963257 - }, - { - "auxiliary_loss_clip": 0.01092536, - "auxiliary_loss_mlp": 0.00770177, - "balance_loss_clip": 1.03604007, - "balance_loss_mlp": 1.00013864, - "epoch": 0.8476777393656997, - "flos": 21285834791040.0, - "grad_norm": 1.8975178373976451, - "language_loss": 0.71665621, - "learning_rate": 2.3840571419701062e-07, - "loss": 0.73528326, - "num_input_tokens_seen": 304105865, - "step": 14099, - "time_per_iteration": 2.5827932357788086 - }, - { - "auxiliary_loss_clip": 0.01094872, - "auxiliary_loss_mlp": 0.01032372, - "balance_loss_clip": 1.03322709, - "balance_loss_mlp": 1.01883566, - "epoch": 0.8477378626183677, - "flos": 29971476650880.0, - "grad_norm": 2.118472556405624, - "language_loss": 0.63617903, - "learning_rate": 2.3822133990064787e-07, - "loss": 0.65745145, - "num_input_tokens_seen": 304128300, - "step": 14100, - "time_per_iteration": 2.723047971725464 - }, - { - "auxiliary_loss_clip": 0.01099377, - "auxiliary_loss_mlp": 0.01033502, - "balance_loss_clip": 1.03651261, - "balance_loss_mlp": 1.02066314, - "epoch": 0.8477979858710356, - "flos": 24237727413120.0, - "grad_norm": 2.0266984363046876, - "language_loss": 0.73806208, - "learning_rate": 2.380370324111085e-07, - "loss": 0.75939089, - "num_input_tokens_seen": 304143695, - "step": 14101, - "time_per_iteration": 2.6568257808685303 - }, - { - "auxiliary_loss_clip": 0.01098555, - "auxiliary_loss_mlp": 0.01028521, - "balance_loss_clip": 1.03505516, - "balance_loss_mlp": 1.01662445, - "epoch": 0.8478581091237036, - "flos": 25593678852480.0, - "grad_norm": 1.6724420871950227, - "language_loss": 0.71237093, - "learning_rate": 2.3785279173538163e-07, - "loss": 0.73364168, - "num_input_tokens_seen": 304165800, - "step": 14102, - "time_per_iteration": 2.72493052482605 - }, - { - "auxiliary_loss_clip": 0.01084921, - "auxiliary_loss_mlp": 0.01033048, - "balance_loss_clip": 1.03555894, - "balance_loss_mlp": 1.01940477, - "epoch": 0.8479182323763715, - "flos": 12057116227200.0, - "grad_norm": 2.4752629302772426, - "language_loss": 0.81727469, - "learning_rate": 2.3766861788045366e-07, - "loss": 0.83845437, - "num_input_tokens_seen": 304182910, - "step": 14103, - "time_per_iteration": 2.723888874053955 - }, - { - "auxiliary_loss_clip": 0.01109645, - "auxiliary_loss_mlp": 0.01030985, - "balance_loss_clip": 1.03859901, - "balance_loss_mlp": 1.01881981, - "epoch": 0.8479783556290396, - "flos": 21433391861760.0, - "grad_norm": 1.9133517562586435, - "language_loss": 0.78571969, - "learning_rate": 2.374845108533079e-07, - "loss": 0.80712605, - "num_input_tokens_seen": 304200175, - "step": 14104, - "time_per_iteration": 4.045240879058838 - }, - { - "auxiliary_loss_clip": 0.01101779, - "auxiliary_loss_mlp": 0.01037373, - "balance_loss_clip": 1.03928828, - "balance_loss_mlp": 1.02440929, - "epoch": 0.8480384788817075, - "flos": 19642634288640.0, - "grad_norm": 1.8032304310085965, - "language_loss": 0.78830254, - "learning_rate": 2.3730047066092607e-07, - "loss": 0.80969405, - "num_input_tokens_seen": 304217775, - "step": 14105, - "time_per_iteration": 2.5720746517181396 - }, - { - "auxiliary_loss_clip": 0.01083671, - "auxiliary_loss_mlp": 0.01037196, - "balance_loss_clip": 1.03580463, - "balance_loss_mlp": 1.02209163, - "epoch": 0.8480986021343755, - "flos": 22489201255680.0, - "grad_norm": 1.7624192448776133, - "language_loss": 0.50159001, - "learning_rate": 2.3711649731028749e-07, - "loss": 0.52279866, - "num_input_tokens_seen": 304235760, - "step": 14106, - "time_per_iteration": 4.288937330245972 - }, - { - "auxiliary_loss_clip": 0.01077376, - "auxiliary_loss_mlp": 0.01035024, - "balance_loss_clip": 1.03691649, - "balance_loss_mlp": 1.0228827, - "epoch": 0.8481587253870434, - "flos": 22090557139200.0, - "grad_norm": 2.145828005559372, - "language_loss": 0.75445443, - "learning_rate": 2.3693259080836792e-07, - "loss": 0.7755785, - "num_input_tokens_seen": 304253985, - "step": 14107, - "time_per_iteration": 2.6221656799316406 - }, - { - "auxiliary_loss_clip": 0.01076318, - "auxiliary_loss_mlp": 0.0102835, - "balance_loss_clip": 1.03518283, - "balance_loss_mlp": 1.01601171, - "epoch": 0.8482188486397114, - "flos": 33582689366400.0, - "grad_norm": 1.5246182504502446, - "language_loss": 0.73586017, - "learning_rate": 2.3674875116214087e-07, - "loss": 0.75690687, - "num_input_tokens_seen": 304276785, - "step": 14108, - "time_per_iteration": 5.8729071617126465 - }, - { - "auxiliary_loss_clip": 0.01106391, - "auxiliary_loss_mlp": 0.01029487, - "balance_loss_clip": 1.03722811, - "balance_loss_mlp": 1.01592076, - "epoch": 0.8482789718923793, - "flos": 20919402195840.0, - "grad_norm": 1.650722642214462, - "language_loss": 0.72323227, - "learning_rate": 2.3656497837857836e-07, - "loss": 0.74459112, - "num_input_tokens_seen": 304296310, - "step": 14109, - "time_per_iteration": 2.633683443069458 - }, - { - "auxiliary_loss_clip": 0.01039152, - "auxiliary_loss_mlp": 0.01036217, - "balance_loss_clip": 1.03288758, - "balance_loss_mlp": 1.02361703, - "epoch": 0.8483390951450474, - "flos": 12896204912640.0, - "grad_norm": 2.505097141687178, - "language_loss": 0.74121177, - "learning_rate": 2.3638127246464811e-07, - "loss": 0.76196551, - "num_input_tokens_seen": 304311715, - "step": 14110, - "time_per_iteration": 2.7661683559417725 - }, - { - "auxiliary_loss_clip": 0.0105041, - "auxiliary_loss_mlp": 0.01031441, - "balance_loss_clip": 1.03519773, - "balance_loss_mlp": 1.01922786, - "epoch": 0.8483992183977154, - "flos": 25081628520960.0, - "grad_norm": 1.695497905568318, - "language_loss": 0.75963587, - "learning_rate": 2.3619763342731658e-07, - "loss": 0.7804544, - "num_input_tokens_seen": 304331910, - "step": 14111, - "time_per_iteration": 2.809145450592041 - }, - { - "auxiliary_loss_clip": 0.01107437, - "auxiliary_loss_mlp": 0.01029784, - "balance_loss_clip": 1.03754044, - "balance_loss_mlp": 1.018435, - "epoch": 0.8484593416503833, - "flos": 25557445008000.0, - "grad_norm": 3.4199751822671955, - "language_loss": 0.67615312, - "learning_rate": 2.3601406127354772e-07, - "loss": 0.69752538, - "num_input_tokens_seen": 304351405, - "step": 14112, - "time_per_iteration": 2.576991081237793 - }, - { - "auxiliary_loss_clip": 0.01093257, - "auxiliary_loss_mlp": 0.01032326, - "balance_loss_clip": 1.03299069, - "balance_loss_mlp": 1.0202558, - "epoch": 0.8485194649030513, - "flos": 27198454780800.0, - "grad_norm": 1.4428256767877636, - "language_loss": 0.73642004, - "learning_rate": 2.3583055601030312e-07, - "loss": 0.75767583, - "num_input_tokens_seen": 304372935, - "step": 14113, - "time_per_iteration": 2.6638875007629395 - }, - { - "auxiliary_loss_clip": 0.01071779, - "auxiliary_loss_mlp": 0.01031894, - "balance_loss_clip": 1.03808439, - "balance_loss_mlp": 1.01990139, - "epoch": 0.8485795881557192, - "flos": 24205910941440.0, - "grad_norm": 16.765212760184376, - "language_loss": 0.66891378, - "learning_rate": 2.3564711764454003e-07, - "loss": 0.68995047, - "num_input_tokens_seen": 304393070, - "step": 14114, - "time_per_iteration": 2.71993088722229 - }, - { - "auxiliary_loss_clip": 0.01111702, - "auxiliary_loss_mlp": 0.01031552, - "balance_loss_clip": 1.03860688, - "balance_loss_mlp": 1.01900554, - "epoch": 0.8486397114083872, - "flos": 21141653598720.0, - "grad_norm": 1.6329871649970922, - "language_loss": 0.78943914, - "learning_rate": 2.3546374618321495e-07, - "loss": 0.81087166, - "num_input_tokens_seen": 304411195, - "step": 14115, - "time_per_iteration": 2.5624794960021973 - }, - { - "auxiliary_loss_clip": 0.01110202, - "auxiliary_loss_mlp": 0.01033883, - "balance_loss_clip": 1.03798008, - "balance_loss_mlp": 1.02150321, - "epoch": 0.8486998346610551, - "flos": 19974772373760.0, - "grad_norm": 1.8401751033694997, - "language_loss": 0.78926462, - "learning_rate": 2.3528044163328187e-07, - "loss": 0.81070548, - "num_input_tokens_seen": 304429425, - "step": 14116, - "time_per_iteration": 2.5436830520629883 - }, - { - "auxiliary_loss_clip": 0.01101053, - "auxiliary_loss_mlp": 0.0102868, - "balance_loss_clip": 1.037081, - "balance_loss_mlp": 1.01596665, - "epoch": 0.8487599579137232, - "flos": 19792310261760.0, - "grad_norm": 1.863909931261485, - "language_loss": 0.68563157, - "learning_rate": 2.3509720400169076e-07, - "loss": 0.70692891, - "num_input_tokens_seen": 304447460, - "step": 14117, - "time_per_iteration": 2.580505609512329 - }, - { - "auxiliary_loss_clip": 0.01089877, - "auxiliary_loss_mlp": 0.01028249, - "balance_loss_clip": 1.0346086, - "balance_loss_mlp": 1.01571453, - "epoch": 0.8488200811663911, - "flos": 26396030903040.0, - "grad_norm": 1.9028577306172345, - "language_loss": 0.65127873, - "learning_rate": 2.3491403329539096e-07, - "loss": 0.67246002, - "num_input_tokens_seen": 304468230, - "step": 14118, - "time_per_iteration": 2.670452356338501 - }, - { - "auxiliary_loss_clip": 0.01066258, - "auxiliary_loss_mlp": 0.01029959, - "balance_loss_clip": 1.0340513, - "balance_loss_mlp": 1.01815736, - "epoch": 0.8488802044190591, - "flos": 16359285939840.0, - "grad_norm": 1.7651382162143987, - "language_loss": 0.7343511, - "learning_rate": 2.3473092952132757e-07, - "loss": 0.75531328, - "num_input_tokens_seen": 304484860, - "step": 14119, - "time_per_iteration": 2.681450128555298 - }, - { - "auxiliary_loss_clip": 0.01076463, - "auxiliary_loss_mlp": 0.01030361, - "balance_loss_clip": 1.03407943, - "balance_loss_mlp": 1.01667559, - "epoch": 0.848940327671727, - "flos": 19208869649280.0, - "grad_norm": 2.0775692345074486, - "language_loss": 0.77856582, - "learning_rate": 2.345478926864446e-07, - "loss": 0.7996341, - "num_input_tokens_seen": 304503575, - "step": 14120, - "time_per_iteration": 2.6914706230163574 - }, - { - "auxiliary_loss_clip": 0.01094944, - "auxiliary_loss_mlp": 0.01029569, - "balance_loss_clip": 1.03799891, - "balance_loss_mlp": 1.01668835, - "epoch": 0.849000450924395, - "flos": 21871178824320.0, - "grad_norm": 1.7424170846480949, - "language_loss": 0.75571072, - "learning_rate": 2.3436492279768227e-07, - "loss": 0.7769559, - "num_input_tokens_seen": 304525005, - "step": 14121, - "time_per_iteration": 2.6244821548461914 - }, - { - "auxiliary_loss_clip": 0.00992252, - "auxiliary_loss_mlp": 0.00999952, - "balance_loss_clip": 1.00927687, - "balance_loss_mlp": 0.9989683, - "epoch": 0.8490605741770629, - "flos": 71166475624320.0, - "grad_norm": 1.3338181745049902, - "language_loss": 0.60148805, - "learning_rate": 2.3418201986197883e-07, - "loss": 0.62141007, - "num_input_tokens_seen": 304585220, - "step": 14122, - "time_per_iteration": 3.219271659851074 - }, - { - "auxiliary_loss_clip": 0.01098712, - "auxiliary_loss_mlp": 0.01031259, - "balance_loss_clip": 1.03732467, - "balance_loss_mlp": 1.01909947, - "epoch": 0.849120697429731, - "flos": 24973357950720.0, - "grad_norm": 1.7666847271822834, - "language_loss": 0.79593515, - "learning_rate": 2.3399918388627048e-07, - "loss": 0.81723487, - "num_input_tokens_seen": 304604665, - "step": 14123, - "time_per_iteration": 2.696174144744873 - }, - { - "auxiliary_loss_clip": 0.01095036, - "auxiliary_loss_mlp": 0.01030234, - "balance_loss_clip": 1.03751028, - "balance_loss_mlp": 1.01858711, - "epoch": 0.8491808206823989, - "flos": 23032277959680.0, - "grad_norm": 1.9536438787496402, - "language_loss": 0.82910216, - "learning_rate": 2.3381641487749016e-07, - "loss": 0.85035485, - "num_input_tokens_seen": 304620600, - "step": 14124, - "time_per_iteration": 2.7340493202209473 - }, - { - "auxiliary_loss_clip": 0.01064676, - "auxiliary_loss_mlp": 0.01034224, - "balance_loss_clip": 1.03637028, - "balance_loss_mlp": 1.0209558, - "epoch": 0.8492409439350669, - "flos": 23878549365120.0, - "grad_norm": 1.89858398727176, - "language_loss": 0.72199571, - "learning_rate": 2.3363371284256805e-07, - "loss": 0.74298477, - "num_input_tokens_seen": 304639540, - "step": 14125, - "time_per_iteration": 2.736920118331909 - }, - { - "auxiliary_loss_clip": 0.01114158, - "auxiliary_loss_mlp": 0.0103526, - "balance_loss_clip": 1.03876936, - "balance_loss_mlp": 1.0216043, - "epoch": 0.8493010671877349, - "flos": 22419893963520.0, - "grad_norm": 1.649969149423374, - "language_loss": 0.73402816, - "learning_rate": 2.3345107778843288e-07, - "loss": 0.75552237, - "num_input_tokens_seen": 304660595, - "step": 14126, - "time_per_iteration": 2.5707271099090576 - }, - { - "auxiliary_loss_clip": 0.01061518, - "auxiliary_loss_mlp": 0.01039889, - "balance_loss_clip": 1.03374052, - "balance_loss_mlp": 1.0265317, - "epoch": 0.8493611904404028, - "flos": 17529435302400.0, - "grad_norm": 1.4324709138124028, - "language_loss": 0.67603076, - "learning_rate": 2.3326850972200928e-07, - "loss": 0.69704485, - "num_input_tokens_seen": 304679580, - "step": 14127, - "time_per_iteration": 2.7047815322875977 - }, - { - "auxiliary_loss_clip": 0.01075849, - "auxiliary_loss_mlp": 0.00772172, - "balance_loss_clip": 1.03386354, - "balance_loss_mlp": 1.00027514, - "epoch": 0.8494213136930708, - "flos": 19462937523840.0, - "grad_norm": 2.2394682768750727, - "language_loss": 0.68882221, - "learning_rate": 2.330860086502211e-07, - "loss": 0.70730239, - "num_input_tokens_seen": 304698385, - "step": 14128, - "time_per_iteration": 2.714137077331543 - }, - { - "auxiliary_loss_clip": 0.01082408, - "auxiliary_loss_mlp": 0.01032133, - "balance_loss_clip": 1.03493214, - "balance_loss_mlp": 1.01906157, - "epoch": 0.8494814369457387, - "flos": 18770292587520.0, - "grad_norm": 1.7314045833982252, - "language_loss": 0.77983749, - "learning_rate": 2.3290357457998855e-07, - "loss": 0.80098283, - "num_input_tokens_seen": 304715430, - "step": 14129, - "time_per_iteration": 2.6516494750976562 - }, - { - "auxiliary_loss_clip": 0.01044399, - "auxiliary_loss_mlp": 0.01036739, - "balance_loss_clip": 1.03597188, - "balance_loss_mlp": 1.02454424, - "epoch": 0.8495415601984068, - "flos": 23331486251520.0, - "grad_norm": 1.784130753830601, - "language_loss": 0.67886949, - "learning_rate": 2.3272120751823031e-07, - "loss": 0.69968086, - "num_input_tokens_seen": 304734345, - "step": 14130, - "time_per_iteration": 2.8086585998535156 - }, - { - "auxiliary_loss_clip": 0.01099002, - "auxiliary_loss_mlp": 0.01034825, - "balance_loss_clip": 1.03699052, - "balance_loss_mlp": 1.02243376, - "epoch": 0.8496016834510747, - "flos": 26612859352320.0, - "grad_norm": 1.7932919190030374, - "language_loss": 0.71109772, - "learning_rate": 2.3253890747186e-07, - "loss": 0.732436, - "num_input_tokens_seen": 304755030, - "step": 14131, - "time_per_iteration": 2.704787254333496 - }, - { - "auxiliary_loss_clip": 0.01079775, - "auxiliary_loss_mlp": 0.01028208, - "balance_loss_clip": 1.03795338, - "balance_loss_mlp": 1.0159409, - "epoch": 0.8496618067037427, - "flos": 25480380378240.0, - "grad_norm": 1.8086796883087504, - "language_loss": 0.68577588, - "learning_rate": 2.3235667444779162e-07, - "loss": 0.70685571, - "num_input_tokens_seen": 304774320, - "step": 14132, - "time_per_iteration": 2.7110090255737305 - }, - { - "auxiliary_loss_clip": 0.01105556, - "auxiliary_loss_mlp": 0.01035286, - "balance_loss_clip": 1.03522205, - "balance_loss_mlp": 1.0235796, - "epoch": 0.8497219299564106, - "flos": 25374587846400.0, - "grad_norm": 1.7573733285315933, - "language_loss": 0.70354646, - "learning_rate": 2.3217450845293564e-07, - "loss": 0.7249549, - "num_input_tokens_seen": 304795355, - "step": 14133, - "time_per_iteration": 2.567920684814453 - }, - { - "auxiliary_loss_clip": 0.00997066, - "auxiliary_loss_mlp": 0.00751378, - "balance_loss_clip": 1.01175642, - "balance_loss_mlp": 0.99961358, - "epoch": 0.8497820532090786, - "flos": 67780279658880.0, - "grad_norm": 0.7450619720676375, - "language_loss": 0.57556748, - "learning_rate": 2.3199240949419918e-07, - "loss": 0.59305191, - "num_input_tokens_seen": 304863915, - "step": 14134, - "time_per_iteration": 3.3689846992492676 - }, - { - "auxiliary_loss_clip": 0.0107422, - "auxiliary_loss_mlp": 0.01027994, - "balance_loss_clip": 1.03716052, - "balance_loss_mlp": 1.01549459, - "epoch": 0.8498421764617465, - "flos": 23440546920960.0, - "grad_norm": 2.466409087633597, - "language_loss": 0.78983986, - "learning_rate": 2.3181037757848787e-07, - "loss": 0.81086206, - "num_input_tokens_seen": 304881555, - "step": 14135, - "time_per_iteration": 2.7446372509002686 - }, - { - "auxiliary_loss_clip": 0.01097445, - "auxiliary_loss_mlp": 0.01030777, - "balance_loss_clip": 1.03782988, - "balance_loss_mlp": 1.01817632, - "epoch": 0.8499022997144146, - "flos": 17712615686400.0, - "grad_norm": 2.7995527616505966, - "language_loss": 0.63055122, - "learning_rate": 2.316284127127044e-07, - "loss": 0.65183342, - "num_input_tokens_seen": 304898760, - "step": 14136, - "time_per_iteration": 2.5907950401306152 - }, - { - "auxiliary_loss_clip": 0.01101166, - "auxiliary_loss_mlp": 0.01031023, - "balance_loss_clip": 1.03812134, - "balance_loss_mlp": 1.01783872, - "epoch": 0.8499624229670825, - "flos": 18588512833920.0, - "grad_norm": 1.700183635273407, - "language_loss": 0.84176117, - "learning_rate": 2.3144651490374835e-07, - "loss": 0.86308306, - "num_input_tokens_seen": 304915465, - "step": 14137, - "time_per_iteration": 2.605083703994751 - }, - { - "auxiliary_loss_clip": 0.010792, - "auxiliary_loss_mlp": 0.01027843, - "balance_loss_clip": 1.03870046, - "balance_loss_mlp": 1.01687622, - "epoch": 0.8500225462197505, - "flos": 24345854328960.0, - "grad_norm": 2.180201293156008, - "language_loss": 0.78512466, - "learning_rate": 2.3126468415851773e-07, - "loss": 0.80619514, - "num_input_tokens_seen": 304933190, - "step": 14138, - "time_per_iteration": 2.70701003074646 - }, - { - "auxiliary_loss_clip": 0.01098762, - "auxiliary_loss_mlp": 0.01028021, - "balance_loss_clip": 1.03806686, - "balance_loss_mlp": 1.0162487, - "epoch": 0.8500826694724185, - "flos": 16545518979840.0, - "grad_norm": 1.5977485951908699, - "language_loss": 0.64826471, - "learning_rate": 2.310829204839073e-07, - "loss": 0.66953254, - "num_input_tokens_seen": 304951110, - "step": 14139, - "time_per_iteration": 2.5747222900390625 - }, - { - "auxiliary_loss_clip": 0.01067444, - "auxiliary_loss_mlp": 0.01031881, - "balance_loss_clip": 1.03539836, - "balance_loss_mlp": 1.02024066, - "epoch": 0.8501427927250864, - "flos": 16289404030080.0, - "grad_norm": 1.8080135201880956, - "language_loss": 0.7064625, - "learning_rate": 2.3090122388681043e-07, - "loss": 0.72745574, - "num_input_tokens_seen": 304969095, - "step": 14140, - "time_per_iteration": 2.7031800746917725 - }, - { - "auxiliary_loss_clip": 0.01073027, - "auxiliary_loss_mlp": 0.01034541, - "balance_loss_clip": 1.03628802, - "balance_loss_mlp": 1.02165496, - "epoch": 0.8502029159777544, - "flos": 26687912820480.0, - "grad_norm": 2.024190780090597, - "language_loss": 0.64177513, - "learning_rate": 2.3071959437411648e-07, - "loss": 0.6628508, - "num_input_tokens_seen": 304989315, - "step": 14141, - "time_per_iteration": 2.780942916870117 - }, - { - "auxiliary_loss_clip": 0.01079122, - "auxiliary_loss_mlp": 0.01034336, - "balance_loss_clip": 1.03825319, - "balance_loss_mlp": 1.02206933, - "epoch": 0.8502630392304223, - "flos": 35590778179200.0, - "grad_norm": 1.598166482791058, - "language_loss": 0.70859313, - "learning_rate": 2.3053803195271214e-07, - "loss": 0.72972775, - "num_input_tokens_seen": 305011020, - "step": 14142, - "time_per_iteration": 2.8212552070617676 - }, - { - "auxiliary_loss_clip": 0.01061273, - "auxiliary_loss_mlp": 0.0103314, - "balance_loss_clip": 1.03280842, - "balance_loss_mlp": 1.0207963, - "epoch": 0.8503231624830904, - "flos": 21649466125440.0, - "grad_norm": 1.747417790949646, - "language_loss": 0.6528132, - "learning_rate": 2.3035653662948375e-07, - "loss": 0.67375731, - "num_input_tokens_seen": 305033550, - "step": 14143, - "time_per_iteration": 2.785883665084839 - }, - { - "auxiliary_loss_clip": 0.01081279, - "auxiliary_loss_mlp": 0.00770514, - "balance_loss_clip": 1.03600597, - "balance_loss_mlp": 1.00017881, - "epoch": 0.8503832857357583, - "flos": 22417451838720.0, - "grad_norm": 2.048866472556172, - "language_loss": 0.68279046, - "learning_rate": 2.3017510841131216e-07, - "loss": 0.70130837, - "num_input_tokens_seen": 305052885, - "step": 14144, - "time_per_iteration": 4.240123748779297 - }, - { - "auxiliary_loss_clip": 0.01042348, - "auxiliary_loss_mlp": 0.01033949, - "balance_loss_clip": 1.03315759, - "balance_loss_mlp": 1.02033496, - "epoch": 0.8504434089884263, - "flos": 18697968552960.0, - "grad_norm": 2.1174262858689628, - "language_loss": 0.6438145, - "learning_rate": 2.299937473050777e-07, - "loss": 0.66457748, - "num_input_tokens_seen": 305071995, - "step": 14145, - "time_per_iteration": 4.4199535846710205 - }, - { - "auxiliary_loss_clip": 0.01087485, - "auxiliary_loss_mlp": 0.01031784, - "balance_loss_clip": 1.03486562, - "balance_loss_mlp": 1.01891518, - "epoch": 0.8505035322410942, - "flos": 20007989475840.0, - "grad_norm": 1.8443246841114695, - "language_loss": 0.8561762, - "learning_rate": 2.2981245331765842e-07, - "loss": 0.87736893, - "num_input_tokens_seen": 305090190, - "step": 14146, - "time_per_iteration": 2.6533970832824707 - }, - { - "auxiliary_loss_clip": 0.0110625, - "auxiliary_loss_mlp": 0.01027868, - "balance_loss_clip": 1.03480434, - "balance_loss_mlp": 1.01580358, - "epoch": 0.8505636554937622, - "flos": 20812173120000.0, - "grad_norm": 1.6073228623135094, - "language_loss": 0.84023243, - "learning_rate": 2.2963122645592814e-07, - "loss": 0.86157364, - "num_input_tokens_seen": 305109355, - "step": 14147, - "time_per_iteration": 4.045815706253052 - }, - { - "auxiliary_loss_clip": 0.01099865, - "auxiliary_loss_mlp": 0.01031152, - "balance_loss_clip": 1.0365082, - "balance_loss_mlp": 1.01814604, - "epoch": 0.8506237787464301, - "flos": 14174445277440.0, - "grad_norm": 3.171596156329527, - "language_loss": 0.85552716, - "learning_rate": 2.2945006672675894e-07, - "loss": 0.87683737, - "num_input_tokens_seen": 305124165, - "step": 14148, - "time_per_iteration": 4.178382635116577 - }, - { - "auxiliary_loss_clip": 0.01086687, - "auxiliary_loss_mlp": 0.01033466, - "balance_loss_clip": 1.03529978, - "balance_loss_mlp": 1.02051425, - "epoch": 0.8506839019990982, - "flos": 23258372117760.0, - "grad_norm": 1.5915072699945274, - "language_loss": 0.71948111, - "learning_rate": 2.292689741370204e-07, - "loss": 0.7406826, - "num_input_tokens_seen": 305143940, - "step": 14149, - "time_per_iteration": 2.7413246631622314 - }, - { - "auxiliary_loss_clip": 0.01087525, - "auxiliary_loss_mlp": 0.01029431, - "balance_loss_clip": 1.03729534, - "balance_loss_mlp": 1.0173254, - "epoch": 0.8507440252517661, - "flos": 23659206963840.0, - "grad_norm": 1.895927290429812, - "language_loss": 0.76037747, - "learning_rate": 2.290879486935804e-07, - "loss": 0.78154701, - "num_input_tokens_seen": 305163505, - "step": 14150, - "time_per_iteration": 2.8601326942443848 - }, - { - "auxiliary_loss_clip": 0.01068558, - "auxiliary_loss_mlp": 0.01033535, - "balance_loss_clip": 1.03508079, - "balance_loss_mlp": 1.02081537, - "epoch": 0.8508041485044341, - "flos": 18661339658880.0, - "grad_norm": 1.9437397223028954, - "language_loss": 0.72261739, - "learning_rate": 2.2890699040330231e-07, - "loss": 0.74363828, - "num_input_tokens_seen": 305182325, - "step": 14151, - "time_per_iteration": 2.7191174030303955 - }, - { - "auxiliary_loss_clip": 0.00989017, - "auxiliary_loss_mlp": 0.01001335, - "balance_loss_clip": 1.01485205, - "balance_loss_mlp": 1.00013149, - "epoch": 0.8508642717571021, - "flos": 52510918055040.0, - "grad_norm": 0.8877797296007555, - "language_loss": 0.5956288, - "learning_rate": 2.2872609927304909e-07, - "loss": 0.61553234, - "num_input_tokens_seen": 305230775, - "step": 14152, - "time_per_iteration": 3.0959417819976807 - }, - { - "auxiliary_loss_clip": 0.01012053, - "auxiliary_loss_mlp": 0.01000683, - "balance_loss_clip": 1.00869, - "balance_loss_mlp": 0.99963391, - "epoch": 0.85092439500977, - "flos": 69297145050240.0, - "grad_norm": 0.6913266470704932, - "language_loss": 0.61156118, - "learning_rate": 2.285452753096797e-07, - "loss": 0.63168854, - "num_input_tokens_seen": 305296000, - "step": 14153, - "time_per_iteration": 3.1953656673431396 - }, - { - "auxiliary_loss_clip": 0.01099193, - "auxiliary_loss_mlp": 0.01032395, - "balance_loss_clip": 1.03656745, - "balance_loss_mlp": 1.0191927, - "epoch": 0.850984518262438, - "flos": 24389737770240.0, - "grad_norm": 1.8650933862340224, - "language_loss": 0.80833215, - "learning_rate": 2.2836451852005067e-07, - "loss": 0.82964802, - "num_input_tokens_seen": 305314705, - "step": 14154, - "time_per_iteration": 2.6398138999938965 - }, - { - "auxiliary_loss_clip": 0.01070524, - "auxiliary_loss_mlp": 0.01031227, - "balance_loss_clip": 1.0340178, - "balance_loss_mlp": 1.02010489, - "epoch": 0.851044641515106, - "flos": 23294821443840.0, - "grad_norm": 3.9078640909935753, - "language_loss": 0.79612941, - "learning_rate": 2.281838289110165e-07, - "loss": 0.8171469, - "num_input_tokens_seen": 305333870, - "step": 14155, - "time_per_iteration": 2.7668473720550537 - }, - { - "auxiliary_loss_clip": 0.01075246, - "auxiliary_loss_mlp": 0.0103132, - "balance_loss_clip": 1.03424454, - "balance_loss_mlp": 1.01889825, - "epoch": 0.851104764767774, - "flos": 22050085489920.0, - "grad_norm": 2.1664550070392172, - "language_loss": 0.70601416, - "learning_rate": 2.2800320648942904e-07, - "loss": 0.72707975, - "num_input_tokens_seen": 305352780, - "step": 14156, - "time_per_iteration": 2.712688684463501 - }, - { - "auxiliary_loss_clip": 0.01067563, - "auxiliary_loss_mlp": 0.01030898, - "balance_loss_clip": 1.03651905, - "balance_loss_mlp": 1.01922178, - "epoch": 0.8511648880204419, - "flos": 20704728562560.0, - "grad_norm": 2.295507051871608, - "language_loss": 0.73186374, - "learning_rate": 2.278226512621386e-07, - "loss": 0.75284839, - "num_input_tokens_seen": 305371370, - "step": 14157, - "time_per_iteration": 2.702608108520508 - }, - { - "auxiliary_loss_clip": 0.01040081, - "auxiliary_loss_mlp": 0.010238, - "balance_loss_clip": 1.03516209, - "balance_loss_mlp": 1.01280284, - "epoch": 0.8512250112731099, - "flos": 24024669891840.0, - "grad_norm": 2.053663216507987, - "language_loss": 0.800686, - "learning_rate": 2.2764216323598995e-07, - "loss": 0.82132483, - "num_input_tokens_seen": 305387955, - "step": 14158, - "time_per_iteration": 2.8139398097991943 - }, - { - "auxiliary_loss_clip": 0.01094324, - "auxiliary_loss_mlp": 0.01036878, - "balance_loss_clip": 1.03557563, - "balance_loss_mlp": 1.02236986, - "epoch": 0.8512851345257778, - "flos": 22015467757440.0, - "grad_norm": 2.1721071422061446, - "language_loss": 0.79100662, - "learning_rate": 2.27461742417828e-07, - "loss": 0.81231868, - "num_input_tokens_seen": 305406285, - "step": 14159, - "time_per_iteration": 2.5417728424072266 - }, - { - "auxiliary_loss_clip": 0.01089601, - "auxiliary_loss_mlp": 0.0103497, - "balance_loss_clip": 1.0372653, - "balance_loss_mlp": 1.02239358, - "epoch": 0.8513452577784458, - "flos": 14830209924480.0, - "grad_norm": 2.049292713518449, - "language_loss": 0.71023905, - "learning_rate": 2.2728138881449488e-07, - "loss": 0.73148477, - "num_input_tokens_seen": 305424500, - "step": 14160, - "time_per_iteration": 2.5549099445343018 - }, - { - "auxiliary_loss_clip": 0.01104724, - "auxiliary_loss_mlp": 0.01029216, - "balance_loss_clip": 1.03866696, - "balance_loss_mlp": 1.01627553, - "epoch": 0.8514053810311137, - "flos": 33035662166400.0, - "grad_norm": 2.7738458222833637, - "language_loss": 0.7019136, - "learning_rate": 2.2710110243282866e-07, - "loss": 0.72325301, - "num_input_tokens_seen": 305442990, - "step": 14161, - "time_per_iteration": 2.5866782665252686 - }, - { - "auxiliary_loss_clip": 0.01097425, - "auxiliary_loss_mlp": 0.01030584, - "balance_loss_clip": 1.03306913, - "balance_loss_mlp": 1.01881218, - "epoch": 0.8514655042837818, - "flos": 27564456412800.0, - "grad_norm": 2.41119817546413, - "language_loss": 0.77940011, - "learning_rate": 2.2692088327966653e-07, - "loss": 0.80068016, - "num_input_tokens_seen": 305463065, - "step": 14162, - "time_per_iteration": 2.7035062313079834 - }, - { - "auxiliary_loss_clip": 0.01099345, - "auxiliary_loss_mlp": 0.01033323, - "balance_loss_clip": 1.03699732, - "balance_loss_mlp": 1.02044845, - "epoch": 0.8515256275364497, - "flos": 35556052705920.0, - "grad_norm": 1.8591590026423754, - "language_loss": 0.77019423, - "learning_rate": 2.2674073136184235e-07, - "loss": 0.79152089, - "num_input_tokens_seen": 305489070, - "step": 14163, - "time_per_iteration": 2.750953435897827 - }, - { - "auxiliary_loss_clip": 0.01013801, - "auxiliary_loss_mlp": 0.01003898, - "balance_loss_clip": 1.01090002, - "balance_loss_mlp": 1.00288486, - "epoch": 0.8515857507891177, - "flos": 70207372621440.0, - "grad_norm": 0.6897705551367352, - "language_loss": 0.54935861, - "learning_rate": 2.2656064668618735e-07, - "loss": 0.56953561, - "num_input_tokens_seen": 305551490, - "step": 14164, - "time_per_iteration": 3.223865509033203 - }, - { - "auxiliary_loss_clip": 0.01099487, - "auxiliary_loss_mlp": 0.01033823, - "balance_loss_clip": 1.03638053, - "balance_loss_mlp": 1.02158666, - "epoch": 0.8516458740417857, - "flos": 22675290641280.0, - "grad_norm": 1.9828346759864348, - "language_loss": 0.7308625, - "learning_rate": 2.2638062925953005e-07, - "loss": 0.7521956, - "num_input_tokens_seen": 305570535, - "step": 14165, - "time_per_iteration": 2.683063268661499 - }, - { - "auxiliary_loss_clip": 0.01070656, - "auxiliary_loss_mlp": 0.01030913, - "balance_loss_clip": 1.03600621, - "balance_loss_mlp": 1.01849699, - "epoch": 0.8517059972944536, - "flos": 22747435107840.0, - "grad_norm": 1.5369280358332664, - "language_loss": 0.6716156, - "learning_rate": 2.26200679088697e-07, - "loss": 0.6926313, - "num_input_tokens_seen": 305590800, - "step": 14166, - "time_per_iteration": 2.7411108016967773 - }, - { - "auxiliary_loss_clip": 0.01084994, - "auxiliary_loss_mlp": 0.01034282, - "balance_loss_clip": 1.03303361, - "balance_loss_mlp": 1.02188396, - "epoch": 0.8517661205471216, - "flos": 21689147675520.0, - "grad_norm": 1.785592393708889, - "language_loss": 0.73291379, - "learning_rate": 2.260207961805125e-07, - "loss": 0.75410652, - "num_input_tokens_seen": 305609495, - "step": 14167, - "time_per_iteration": 2.6664106845855713 - }, - { - "auxiliary_loss_clip": 0.01109416, - "auxiliary_loss_mlp": 0.0103133, - "balance_loss_clip": 1.03773403, - "balance_loss_mlp": 1.01968884, - "epoch": 0.8518262437997896, - "flos": 25374839241600.0, - "grad_norm": 1.6176439709713288, - "language_loss": 0.80560851, - "learning_rate": 2.258409805417969e-07, - "loss": 0.827016, - "num_input_tokens_seen": 305629420, - "step": 14168, - "time_per_iteration": 2.59899640083313 - }, - { - "auxiliary_loss_clip": 0.01106516, - "auxiliary_loss_mlp": 0.01027237, - "balance_loss_clip": 1.03524876, - "balance_loss_mlp": 1.01554823, - "epoch": 0.8518863670524576, - "flos": 27235406897280.0, - "grad_norm": 1.781183741177256, - "language_loss": 0.76068074, - "learning_rate": 2.2566123217936893e-07, - "loss": 0.7820183, - "num_input_tokens_seen": 305649835, - "step": 14169, - "time_per_iteration": 2.634589672088623 - }, - { - "auxiliary_loss_clip": 0.01112356, - "auxiliary_loss_mlp": 0.01030417, - "balance_loss_clip": 1.03858984, - "balance_loss_mlp": 1.01746488, - "epoch": 0.8519464903051255, - "flos": 20959514709120.0, - "grad_norm": 1.6296067224566693, - "language_loss": 0.63455546, - "learning_rate": 2.254815511000452e-07, - "loss": 0.65598321, - "num_input_tokens_seen": 305668840, - "step": 14170, - "time_per_iteration": 2.556849718093872 - }, - { - "auxiliary_loss_clip": 0.0109011, - "auxiliary_loss_mlp": 0.01029143, - "balance_loss_clip": 1.03445804, - "balance_loss_mlp": 1.0168829, - "epoch": 0.8520066135577935, - "flos": 18441745862400.0, - "grad_norm": 2.158149023964769, - "language_loss": 0.8638401, - "learning_rate": 2.253019373106384e-07, - "loss": 0.88503265, - "num_input_tokens_seen": 305686955, - "step": 14171, - "time_per_iteration": 2.6308727264404297 - }, - { - "auxiliary_loss_clip": 0.01094344, - "auxiliary_loss_mlp": 0.01038693, - "balance_loss_clip": 1.0366205, - "balance_loss_mlp": 1.02613449, - "epoch": 0.8520667368104614, - "flos": 29130233149440.0, - "grad_norm": 1.7172812217189943, - "language_loss": 0.54943144, - "learning_rate": 2.2512239081796003e-07, - "loss": 0.5707618, - "num_input_tokens_seen": 305706290, - "step": 14172, - "time_per_iteration": 2.6792733669281006 - }, - { - "auxiliary_loss_clip": 0.01082291, - "auxiliary_loss_mlp": 0.01028716, - "balance_loss_clip": 1.0339576, - "balance_loss_mlp": 1.01860142, - "epoch": 0.8521268600631294, - "flos": 16034366488320.0, - "grad_norm": 2.2874699102047824, - "language_loss": 0.6964975, - "learning_rate": 2.2494291162881862e-07, - "loss": 0.71760762, - "num_input_tokens_seen": 305723835, - "step": 14173, - "time_per_iteration": 2.656660795211792 - }, - { - "auxiliary_loss_clip": 0.0108799, - "auxiliary_loss_mlp": 0.0077035, - "balance_loss_clip": 1.03576326, - "balance_loss_mlp": 1.0002656, - "epoch": 0.8521869833157973, - "flos": 22454870832000.0, - "grad_norm": 2.469794290307129, - "language_loss": 0.77085257, - "learning_rate": 2.247634997500205e-07, - "loss": 0.78943598, - "num_input_tokens_seen": 305741655, - "step": 14174, - "time_per_iteration": 2.6629743576049805 - }, - { - "auxiliary_loss_clip": 0.01074547, - "auxiliary_loss_mlp": 0.00771408, - "balance_loss_clip": 1.03330672, - "balance_loss_mlp": 1.00036669, - "epoch": 0.8522471065684654, - "flos": 24972029147520.0, - "grad_norm": 3.681847019850499, - "language_loss": 0.8197754, - "learning_rate": 2.245841551883676e-07, - "loss": 0.83823496, - "num_input_tokens_seen": 305761890, - "step": 14175, - "time_per_iteration": 2.6883835792541504 - }, - { - "auxiliary_loss_clip": 0.01112836, - "auxiliary_loss_mlp": 0.01035663, - "balance_loss_clip": 1.03869867, - "balance_loss_mlp": 1.02256155, - "epoch": 0.8523072298211333, - "flos": 17710604524800.0, - "grad_norm": 7.221526535208017, - "language_loss": 0.65591013, - "learning_rate": 2.2440487795066153e-07, - "loss": 0.67739511, - "num_input_tokens_seen": 305779190, - "step": 14176, - "time_per_iteration": 2.513249397277832 - }, - { - "auxiliary_loss_clip": 0.01083655, - "auxiliary_loss_mlp": 0.00769903, - "balance_loss_clip": 1.03461874, - "balance_loss_mlp": 1.00019979, - "epoch": 0.8523673530738013, - "flos": 25446193608960.0, - "grad_norm": 1.6790468369786946, - "language_loss": 0.7851091, - "learning_rate": 2.2422566804370068e-07, - "loss": 0.80364466, - "num_input_tokens_seen": 305799870, - "step": 14177, - "time_per_iteration": 2.6671228408813477 - }, - { - "auxiliary_loss_clip": 0.0108583, - "auxiliary_loss_mlp": 0.01029951, - "balance_loss_clip": 1.03573, - "balance_loss_mlp": 1.01741612, - "epoch": 0.8524274763264693, - "flos": 31429593348480.0, - "grad_norm": 1.9646253723972047, - "language_loss": 0.73313767, - "learning_rate": 2.2404652547428026e-07, - "loss": 0.75429547, - "num_input_tokens_seen": 305819695, - "step": 14178, - "time_per_iteration": 2.713926315307617 - }, - { - "auxiliary_loss_clip": 0.01074008, - "auxiliary_loss_mlp": 0.01037664, - "balance_loss_clip": 1.03707623, - "balance_loss_mlp": 1.02537966, - "epoch": 0.8524875995791372, - "flos": 17712651600000.0, - "grad_norm": 1.8623872713684044, - "language_loss": 0.74955928, - "learning_rate": 2.238674502491935e-07, - "loss": 0.77067608, - "num_input_tokens_seen": 305837270, - "step": 14179, - "time_per_iteration": 2.6611170768737793 - }, - { - "auxiliary_loss_clip": 0.01109256, - "auxiliary_loss_mlp": 0.01029994, - "balance_loss_clip": 1.03910112, - "balance_loss_mlp": 1.01806116, - "epoch": 0.8525477228318052, - "flos": 21687316081920.0, - "grad_norm": 2.060347527701932, - "language_loss": 0.816504, - "learning_rate": 2.2368844237523165e-07, - "loss": 0.83789647, - "num_input_tokens_seen": 305855250, - "step": 14180, - "time_per_iteration": 2.6562328338623047 - }, - { - "auxiliary_loss_clip": 0.01051532, - "auxiliary_loss_mlp": 0.01034699, - "balance_loss_clip": 1.03316164, - "balance_loss_mlp": 1.02265859, - "epoch": 0.8526078460844732, - "flos": 24827057856000.0, - "grad_norm": 6.706307974363978, - "language_loss": 0.60821462, - "learning_rate": 2.235095018591815e-07, - "loss": 0.62907696, - "num_input_tokens_seen": 305875660, - "step": 14181, - "time_per_iteration": 2.7725861072540283 - }, - { - "auxiliary_loss_clip": 0.011084, - "auxiliary_loss_mlp": 0.01033406, - "balance_loss_clip": 1.03824615, - "balance_loss_mlp": 1.02208114, - "epoch": 0.8526679693371412, - "flos": 13516418073600.0, - "grad_norm": 2.1617391285888314, - "language_loss": 0.72616804, - "learning_rate": 2.2333062870782894e-07, - "loss": 0.74758613, - "num_input_tokens_seen": 305892415, - "step": 14182, - "time_per_iteration": 2.5392303466796875 - }, - { - "auxiliary_loss_clip": 0.01056951, - "auxiliary_loss_mlp": 0.01033126, - "balance_loss_clip": 1.03387702, - "balance_loss_mlp": 1.0208416, - "epoch": 0.8527280925898091, - "flos": 23514092017920.0, - "grad_norm": 1.4656692633945465, - "language_loss": 0.70735776, - "learning_rate": 2.2315182292795697e-07, - "loss": 0.72825855, - "num_input_tokens_seen": 305912665, - "step": 14183, - "time_per_iteration": 4.254406213760376 - }, - { - "auxiliary_loss_clip": 0.01081461, - "auxiliary_loss_mlp": 0.01031438, - "balance_loss_clip": 1.03771853, - "balance_loss_mlp": 1.01956463, - "epoch": 0.8527882158424771, - "flos": 20303031790080.0, - "grad_norm": 1.7895488338576169, - "language_loss": 0.72972029, - "learning_rate": 2.2297308452634644e-07, - "loss": 0.75084925, - "num_input_tokens_seen": 305931515, - "step": 14184, - "time_per_iteration": 4.304826974868774 - }, - { - "auxiliary_loss_clip": 0.01109825, - "auxiliary_loss_mlp": 0.01033313, - "balance_loss_clip": 1.0379746, - "balance_loss_mlp": 1.02064705, - "epoch": 0.852848339095145, - "flos": 17202504689280.0, - "grad_norm": 1.7597843900192167, - "language_loss": 0.7711637, - "learning_rate": 2.2279441350977457e-07, - "loss": 0.79259503, - "num_input_tokens_seen": 305949965, - "step": 14185, - "time_per_iteration": 2.5977091789245605 - }, - { - "auxiliary_loss_clip": 0.0106691, - "auxiliary_loss_mlp": 0.0102992, - "balance_loss_clip": 1.03286219, - "balance_loss_mlp": 1.01596713, - "epoch": 0.852908462347813, - "flos": 18368990864640.0, - "grad_norm": 2.425914836353015, - "language_loss": 0.79841149, - "learning_rate": 2.2261580988501637e-07, - "loss": 0.81937975, - "num_input_tokens_seen": 305967820, - "step": 14186, - "time_per_iteration": 4.160691738128662 - }, - { - "auxiliary_loss_clip": 0.01085946, - "auxiliary_loss_mlp": 0.01029366, - "balance_loss_clip": 1.03557575, - "balance_loss_mlp": 1.01655054, - "epoch": 0.8529685856004809, - "flos": 18624890332800.0, - "grad_norm": 1.6292428132802597, - "language_loss": 0.62476075, - "learning_rate": 2.224372736588449e-07, - "loss": 0.64591384, - "num_input_tokens_seen": 305985505, - "step": 14187, - "time_per_iteration": 4.218466758728027 - }, - { - "auxiliary_loss_clip": 0.01056813, - "auxiliary_loss_mlp": 0.01030511, - "balance_loss_clip": 1.03186965, - "balance_loss_mlp": 1.01697493, - "epoch": 0.853028708853149, - "flos": 29607665748480.0, - "grad_norm": 1.8450425087178943, - "language_loss": 0.76632512, - "learning_rate": 2.2225880483803005e-07, - "loss": 0.78719831, - "num_input_tokens_seen": 306005220, - "step": 14188, - "time_per_iteration": 2.756181240081787 - }, - { - "auxiliary_loss_clip": 0.01098789, - "auxiliary_loss_mlp": 0.01029553, - "balance_loss_clip": 1.03644693, - "balance_loss_mlp": 1.01655281, - "epoch": 0.8530888321058169, - "flos": 26353153042560.0, - "grad_norm": 1.4873919798576203, - "language_loss": 0.7809422, - "learning_rate": 2.2208040342933932e-07, - "loss": 0.80222559, - "num_input_tokens_seen": 306023785, - "step": 14189, - "time_per_iteration": 2.6410326957702637 - }, - { - "auxiliary_loss_clip": 0.01086145, - "auxiliary_loss_mlp": 0.01032823, - "balance_loss_clip": 1.03470349, - "balance_loss_mlp": 1.01997268, - "epoch": 0.8531489553584849, - "flos": 20521979141760.0, - "grad_norm": 2.5651447957757005, - "language_loss": 0.7962172, - "learning_rate": 2.2190206943953793e-07, - "loss": 0.81740683, - "num_input_tokens_seen": 306041600, - "step": 14190, - "time_per_iteration": 2.6400444507598877 - }, - { - "auxiliary_loss_clip": 0.01059317, - "auxiliary_loss_mlp": 0.01029769, - "balance_loss_clip": 1.03576827, - "balance_loss_mlp": 1.01700187, - "epoch": 0.8532090786111529, - "flos": 20704297599360.0, - "grad_norm": 2.162987125122954, - "language_loss": 0.75559723, - "learning_rate": 2.2172380287538894e-07, - "loss": 0.77648813, - "num_input_tokens_seen": 306060345, - "step": 14191, - "time_per_iteration": 2.691556692123413 - }, - { - "auxiliary_loss_clip": 0.01098409, - "auxiliary_loss_mlp": 0.01030091, - "balance_loss_clip": 1.0377655, - "balance_loss_mlp": 1.01723993, - "epoch": 0.8532692018638208, - "flos": 19828903242240.0, - "grad_norm": 1.957341316580574, - "language_loss": 0.69267607, - "learning_rate": 2.2154560374365073e-07, - "loss": 0.71396106, - "num_input_tokens_seen": 306078285, - "step": 14192, - "time_per_iteration": 2.631347894668579 - }, - { - "auxiliary_loss_clip": 0.01101694, - "auxiliary_loss_mlp": 0.01035412, - "balance_loss_clip": 1.03725314, - "balance_loss_mlp": 1.02091622, - "epoch": 0.8533293251164888, - "flos": 20996790048000.0, - "grad_norm": 2.1501600362317643, - "language_loss": 0.63451266, - "learning_rate": 2.2136747205108164e-07, - "loss": 0.65588367, - "num_input_tokens_seen": 306093760, - "step": 14193, - "time_per_iteration": 2.626577377319336 - }, - { - "auxiliary_loss_clip": 0.01081646, - "auxiliary_loss_mlp": 0.01028086, - "balance_loss_clip": 1.03549838, - "balance_loss_mlp": 1.01570606, - "epoch": 0.8533894483691568, - "flos": 22419606654720.0, - "grad_norm": 2.0851012965746905, - "language_loss": 0.76840144, - "learning_rate": 2.211894078044365e-07, - "loss": 0.78949881, - "num_input_tokens_seen": 306112595, - "step": 14194, - "time_per_iteration": 2.6441872119903564 - }, - { - "auxiliary_loss_clip": 0.01110242, - "auxiliary_loss_mlp": 0.01028551, - "balance_loss_clip": 1.03740048, - "balance_loss_mlp": 1.01674914, - "epoch": 0.8534495716218248, - "flos": 21616536332160.0, - "grad_norm": 2.217628380709863, - "language_loss": 0.69469094, - "learning_rate": 2.2101141101046705e-07, - "loss": 0.71607888, - "num_input_tokens_seen": 306131800, - "step": 14195, - "time_per_iteration": 2.679945707321167 - }, - { - "auxiliary_loss_clip": 0.01082724, - "auxiliary_loss_mlp": 0.01032645, - "balance_loss_clip": 1.03602624, - "balance_loss_mlp": 1.01968741, - "epoch": 0.8535096948744927, - "flos": 22346277039360.0, - "grad_norm": 1.838474831161169, - "language_loss": 0.85432625, - "learning_rate": 2.2083348167592343e-07, - "loss": 0.87547994, - "num_input_tokens_seen": 306150590, - "step": 14196, - "time_per_iteration": 2.6546883583068848 - }, - { - "auxiliary_loss_clip": 0.01011396, - "auxiliary_loss_mlp": 0.01001419, - "balance_loss_clip": 1.00853372, - "balance_loss_mlp": 1.00029302, - "epoch": 0.8535698181271607, - "flos": 52762507891200.0, - "grad_norm": 0.7576235506473017, - "language_loss": 0.55055857, - "learning_rate": 2.2065561980755243e-07, - "loss": 0.5706867, - "num_input_tokens_seen": 306205850, - "step": 14197, - "time_per_iteration": 3.1265292167663574 - }, - { - "auxiliary_loss_clip": 0.01072451, - "auxiliary_loss_mlp": 0.00769866, - "balance_loss_clip": 1.03453422, - "balance_loss_mlp": 1.0002501, - "epoch": 0.8536299413798286, - "flos": 19062892776960.0, - "grad_norm": 1.7349390720233626, - "language_loss": 0.81448388, - "learning_rate": 2.2047782541209826e-07, - "loss": 0.83290708, - "num_input_tokens_seen": 306225220, - "step": 14198, - "time_per_iteration": 2.709376573562622 - }, - { - "auxiliary_loss_clip": 0.01107145, - "auxiliary_loss_mlp": 0.01028426, - "balance_loss_clip": 1.03658509, - "balance_loss_mlp": 1.01760149, - "epoch": 0.8536900646324966, - "flos": 49344743871360.0, - "grad_norm": 3.58688569610331, - "language_loss": 0.6833868, - "learning_rate": 2.203000984963035e-07, - "loss": 0.70474249, - "num_input_tokens_seen": 306249865, - "step": 14199, - "time_per_iteration": 2.8150370121002197 - }, - { - "auxiliary_loss_clip": 0.01070955, - "auxiliary_loss_mlp": 0.01028243, - "balance_loss_clip": 1.03376341, - "balance_loss_mlp": 1.01713872, - "epoch": 0.8537501878851645, - "flos": 21762333636480.0, - "grad_norm": 1.5671357707792795, - "language_loss": 0.86500955, - "learning_rate": 2.201224390669072e-07, - "loss": 0.88600153, - "num_input_tokens_seen": 306270215, - "step": 14200, - "time_per_iteration": 2.6922430992126465 - }, - { - "auxiliary_loss_clip": 0.01079411, - "auxiliary_loss_mlp": 0.01028505, - "balance_loss_clip": 1.03768003, - "balance_loss_mlp": 1.01712668, - "epoch": 0.8538103111378326, - "flos": 22269176496000.0, - "grad_norm": 1.8836819449837667, - "language_loss": 0.77679044, - "learning_rate": 2.1994484713064666e-07, - "loss": 0.79786962, - "num_input_tokens_seen": 306288960, - "step": 14201, - "time_per_iteration": 2.686408758163452 - }, - { - "auxiliary_loss_clip": 0.01080739, - "auxiliary_loss_mlp": 0.01029211, - "balance_loss_clip": 1.03589165, - "balance_loss_mlp": 1.01757574, - "epoch": 0.8538704343905005, - "flos": 20303929630080.0, - "grad_norm": 2.780473134009725, - "language_loss": 0.6885708, - "learning_rate": 2.19767322694256e-07, - "loss": 0.70967031, - "num_input_tokens_seen": 306308735, - "step": 14202, - "time_per_iteration": 2.6336662769317627 - }, - { - "auxiliary_loss_clip": 0.01099521, - "auxiliary_loss_mlp": 0.01036207, - "balance_loss_clip": 1.037709, - "balance_loss_mlp": 1.02389884, - "epoch": 0.8539305576431685, - "flos": 24755164784640.0, - "grad_norm": 1.9187950545950658, - "language_loss": 0.80178666, - "learning_rate": 2.195898657644666e-07, - "loss": 0.82314396, - "num_input_tokens_seen": 306329015, - "step": 14203, - "time_per_iteration": 2.6216869354248047 - }, - { - "auxiliary_loss_clip": 0.01090886, - "auxiliary_loss_mlp": 0.01032011, - "balance_loss_clip": 1.03592849, - "balance_loss_mlp": 1.01897538, - "epoch": 0.8539906808958365, - "flos": 26687625511680.0, - "grad_norm": 2.0827727006300543, - "language_loss": 0.66570961, - "learning_rate": 2.1941247634800808e-07, - "loss": 0.68693864, - "num_input_tokens_seen": 306349085, - "step": 14204, - "time_per_iteration": 2.7057762145996094 - }, - { - "auxiliary_loss_clip": 0.01111148, - "auxiliary_loss_mlp": 0.01032529, - "balance_loss_clip": 1.03801191, - "balance_loss_mlp": 1.01958251, - "epoch": 0.8540508041485044, - "flos": 13365521038080.0, - "grad_norm": 2.906237255185196, - "language_loss": 0.59810114, - "learning_rate": 2.1923515445160667e-07, - "loss": 0.61953795, - "num_input_tokens_seen": 306365385, - "step": 14205, - "time_per_iteration": 2.573305368423462 - }, - { - "auxiliary_loss_clip": 0.0108658, - "auxiliary_loss_mlp": 0.0102928, - "balance_loss_clip": 1.03708744, - "balance_loss_mlp": 1.01709652, - "epoch": 0.8541109274011724, - "flos": 32780876019840.0, - "grad_norm": 3.4708591258451116, - "language_loss": 0.72213638, - "learning_rate": 2.1905790008198655e-07, - "loss": 0.74329495, - "num_input_tokens_seen": 306384585, - "step": 14206, - "time_per_iteration": 2.7664809226989746 - }, - { - "auxiliary_loss_clip": 0.01100148, - "auxiliary_loss_mlp": 0.01027694, - "balance_loss_clip": 1.03798437, - "balance_loss_mlp": 1.01563621, - "epoch": 0.8541710506538404, - "flos": 17639286071040.0, - "grad_norm": 2.7591002381259617, - "language_loss": 0.76277685, - "learning_rate": 2.1888071324586987e-07, - "loss": 0.78405529, - "num_input_tokens_seen": 306401565, - "step": 14207, - "time_per_iteration": 2.5857670307159424 - }, - { - "auxiliary_loss_clip": 0.01110866, - "auxiliary_loss_mlp": 0.01031023, - "balance_loss_clip": 1.03753805, - "balance_loss_mlp": 1.01777935, - "epoch": 0.8542311739065084, - "flos": 20263062931200.0, - "grad_norm": 1.7437874977291616, - "language_loss": 0.85243803, - "learning_rate": 2.1870359394997485e-07, - "loss": 0.8738569, - "num_input_tokens_seen": 306419995, - "step": 14208, - "time_per_iteration": 2.5491318702697754 - }, - { - "auxiliary_loss_clip": 0.01090714, - "auxiliary_loss_mlp": 0.0102915, - "balance_loss_clip": 1.03670692, - "balance_loss_mlp": 1.01759243, - "epoch": 0.8542912971591763, - "flos": 17785657992960.0, - "grad_norm": 1.579396751637571, - "language_loss": 0.66011345, - "learning_rate": 2.1852654220101785e-07, - "loss": 0.68131208, - "num_input_tokens_seen": 306439240, - "step": 14209, - "time_per_iteration": 2.619147539138794 - }, - { - "auxiliary_loss_clip": 0.01062026, - "auxiliary_loss_mlp": 0.01025767, - "balance_loss_clip": 1.03395295, - "balance_loss_mlp": 1.01391149, - "epoch": 0.8543514204118443, - "flos": 26979507429120.0, - "grad_norm": 2.0420847855392297, - "language_loss": 0.70425576, - "learning_rate": 2.1834955800571287e-07, - "loss": 0.72513366, - "num_input_tokens_seen": 306458425, - "step": 14210, - "time_per_iteration": 2.7978549003601074 - }, - { - "auxiliary_loss_clip": 0.01085485, - "auxiliary_loss_mlp": 0.01031679, - "balance_loss_clip": 1.03576684, - "balance_loss_mlp": 1.0193646, - "epoch": 0.8544115436645122, - "flos": 24024598064640.0, - "grad_norm": 1.6548708543912152, - "language_loss": 0.70239341, - "learning_rate": 2.1817264137077141e-07, - "loss": 0.7235651, - "num_input_tokens_seen": 306477210, - "step": 14211, - "time_per_iteration": 2.766183614730835 - }, - { - "auxiliary_loss_clip": 0.01090016, - "auxiliary_loss_mlp": 0.0103377, - "balance_loss_clip": 1.03690755, - "balance_loss_mlp": 1.02137232, - "epoch": 0.8544716669171802, - "flos": 16617986668800.0, - "grad_norm": 2.2883331624161687, - "language_loss": 0.81601977, - "learning_rate": 2.1799579230290166e-07, - "loss": 0.83725762, - "num_input_tokens_seen": 306495820, - "step": 14212, - "time_per_iteration": 2.6845991611480713 - }, - { - "auxiliary_loss_clip": 0.01073343, - "auxiliary_loss_mlp": 0.01033296, - "balance_loss_clip": 1.03170538, - "balance_loss_mlp": 1.01963472, - "epoch": 0.8545317901698481, - "flos": 40005779489280.0, - "grad_norm": 1.7913118059444788, - "language_loss": 0.66273463, - "learning_rate": 2.178190108088105e-07, - "loss": 0.68380105, - "num_input_tokens_seen": 306516420, - "step": 14213, - "time_per_iteration": 2.8582568168640137 - }, - { - "auxiliary_loss_clip": 0.01107415, - "auxiliary_loss_mlp": 0.01029384, - "balance_loss_clip": 1.03667092, - "balance_loss_mlp": 1.01733816, - "epoch": 0.8545919134225162, - "flos": 19902520166400.0, - "grad_norm": 1.7812973298458348, - "language_loss": 0.78218639, - "learning_rate": 2.1764229689520098e-07, - "loss": 0.80355442, - "num_input_tokens_seen": 306534785, - "step": 14214, - "time_per_iteration": 2.5741806030273438 - }, - { - "auxiliary_loss_clip": 0.01090515, - "auxiliary_loss_mlp": 0.01030143, - "balance_loss_clip": 1.03572309, - "balance_loss_mlp": 1.01646936, - "epoch": 0.8546520366751841, - "flos": 18952970181120.0, - "grad_norm": 2.3620228976169013, - "language_loss": 0.66771472, - "learning_rate": 2.1746565056877397e-07, - "loss": 0.68892121, - "num_input_tokens_seen": 306552440, - "step": 14215, - "time_per_iteration": 2.682720422744751 - }, - { - "auxiliary_loss_clip": 0.01108233, - "auxiliary_loss_mlp": 0.01027691, - "balance_loss_clip": 1.03707683, - "balance_loss_mlp": 1.01554906, - "epoch": 0.8547121599278521, - "flos": 35621445415680.0, - "grad_norm": 1.6345629270986273, - "language_loss": 0.62375963, - "learning_rate": 2.172890718362279e-07, - "loss": 0.64511889, - "num_input_tokens_seen": 306573600, - "step": 14216, - "time_per_iteration": 2.675818681716919 - }, - { - "auxiliary_loss_clip": 0.01073815, - "auxiliary_loss_mlp": 0.01034717, - "balance_loss_clip": 1.03433871, - "balance_loss_mlp": 1.0223552, - "epoch": 0.8547722831805201, - "flos": 16910048154240.0, - "grad_norm": 2.187084459340775, - "language_loss": 0.6559574, - "learning_rate": 2.17112560704259e-07, - "loss": 0.67704272, - "num_input_tokens_seen": 306592840, - "step": 14217, - "time_per_iteration": 2.6645264625549316 - }, - { - "auxiliary_loss_clip": 0.01095964, - "auxiliary_loss_mlp": 0.0103166, - "balance_loss_clip": 1.03822827, - "balance_loss_mlp": 1.01984668, - "epoch": 0.854832406433188, - "flos": 23002616304000.0, - "grad_norm": 1.691658565151652, - "language_loss": 0.64885128, - "learning_rate": 2.1693611717956072e-07, - "loss": 0.67012751, - "num_input_tokens_seen": 306613210, - "step": 14218, - "time_per_iteration": 2.659118890762329 - }, - { - "auxiliary_loss_clip": 0.01094891, - "auxiliary_loss_mlp": 0.01035498, - "balance_loss_clip": 1.03430879, - "balance_loss_mlp": 1.02195024, - "epoch": 0.854892529685856, - "flos": 20412595249920.0, - "grad_norm": 1.722487926122784, - "language_loss": 0.70405877, - "learning_rate": 2.167597412688238e-07, - "loss": 0.72536266, - "num_input_tokens_seen": 306631620, - "step": 14219, - "time_per_iteration": 2.6162991523742676 - }, - { - "auxiliary_loss_clip": 0.01085887, - "auxiliary_loss_mlp": 0.01039141, - "balance_loss_clip": 1.03332317, - "balance_loss_mlp": 1.02628446, - "epoch": 0.854952652938524, - "flos": 16398716094720.0, - "grad_norm": 2.7265350217211397, - "language_loss": 0.67212754, - "learning_rate": 2.1658343297873549e-07, - "loss": 0.69337785, - "num_input_tokens_seen": 306646695, - "step": 14220, - "time_per_iteration": 2.618908166885376 - }, - { - "auxiliary_loss_clip": 0.01105252, - "auxiliary_loss_mlp": 0.01030801, - "balance_loss_clip": 1.03653455, - "balance_loss_mlp": 1.01895165, - "epoch": 0.855012776191192, - "flos": 21178677542400.0, - "grad_norm": 1.9488426413623547, - "language_loss": 0.71819413, - "learning_rate": 2.164071923159827e-07, - "loss": 0.73955464, - "num_input_tokens_seen": 306665465, - "step": 14221, - "time_per_iteration": 2.547293186187744 - }, - { - "auxiliary_loss_clip": 0.01077738, - "auxiliary_loss_mlp": 0.01041646, - "balance_loss_clip": 1.03548348, - "balance_loss_mlp": 1.02897441, - "epoch": 0.8550728994438599, - "flos": 26140993361280.0, - "grad_norm": 1.7974681861069632, - "language_loss": 0.59693348, - "learning_rate": 2.1623101928724763e-07, - "loss": 0.61812735, - "num_input_tokens_seen": 306685950, - "step": 14222, - "time_per_iteration": 4.256742477416992 - }, - { - "auxiliary_loss_clip": 0.01079753, - "auxiliary_loss_mlp": 0.01032489, - "balance_loss_clip": 1.03260887, - "balance_loss_mlp": 1.01989484, - "epoch": 0.8551330226965279, - "flos": 22786793435520.0, - "grad_norm": 1.5392521458494535, - "language_loss": 0.84364492, - "learning_rate": 2.1605491389921093e-07, - "loss": 0.86476731, - "num_input_tokens_seen": 306705740, - "step": 14223, - "time_per_iteration": 2.6583445072174072 - }, - { - "auxiliary_loss_clip": 0.01097669, - "auxiliary_loss_mlp": 0.01031894, - "balance_loss_clip": 1.03776193, - "balance_loss_mlp": 1.01984763, - "epoch": 0.8551931459491958, - "flos": 22419032037120.0, - "grad_norm": 1.7057034680905072, - "language_loss": 0.74193132, - "learning_rate": 2.158788761585515e-07, - "loss": 0.76322699, - "num_input_tokens_seen": 306725065, - "step": 14224, - "time_per_iteration": 4.2042076587677 - }, - { - "auxiliary_loss_clip": 0.01081831, - "auxiliary_loss_mlp": 0.00772053, - "balance_loss_clip": 1.03394115, - "balance_loss_mlp": 1.00025678, - "epoch": 0.8552532692018638, - "flos": 19573183342080.0, - "grad_norm": 1.8055208702511056, - "language_loss": 0.75255108, - "learning_rate": 2.1570290607194307e-07, - "loss": 0.77108991, - "num_input_tokens_seen": 306743630, - "step": 14225, - "time_per_iteration": 4.162761449813843 - }, - { - "auxiliary_loss_clip": 0.01047716, - "auxiliary_loss_mlp": 0.01039572, - "balance_loss_clip": 1.0343529, - "balance_loss_mlp": 1.02750206, - "epoch": 0.8553133924545318, - "flos": 26432767537920.0, - "grad_norm": 1.8461972921962662, - "language_loss": 0.77405238, - "learning_rate": 2.1552700364605925e-07, - "loss": 0.79492527, - "num_input_tokens_seen": 306763105, - "step": 14226, - "time_per_iteration": 2.7609846591949463 - }, - { - "auxiliary_loss_clip": 0.01112703, - "auxiliary_loss_mlp": 0.01038329, - "balance_loss_clip": 1.03843546, - "balance_loss_mlp": 1.02525818, - "epoch": 0.8553735157071998, - "flos": 16362446336640.0, - "grad_norm": 18.71502714000466, - "language_loss": 0.54893303, - "learning_rate": 2.153511688875702e-07, - "loss": 0.57044339, - "num_input_tokens_seen": 306779875, - "step": 14227, - "time_per_iteration": 4.112335443496704 - }, - { - "auxiliary_loss_clip": 0.01077046, - "auxiliary_loss_mlp": 0.0077063, - "balance_loss_clip": 1.03572583, - "balance_loss_mlp": 1.00020015, - "epoch": 0.8554336389598677, - "flos": 20887334328960.0, - "grad_norm": 1.839893156700162, - "language_loss": 0.6559819, - "learning_rate": 2.151754018031442e-07, - "loss": 0.67445874, - "num_input_tokens_seen": 306800015, - "step": 14228, - "time_per_iteration": 2.6349892616271973 - }, - { - "auxiliary_loss_clip": 0.01076617, - "auxiliary_loss_mlp": 0.01032816, - "balance_loss_clip": 1.03681397, - "balance_loss_mlp": 1.02012038, - "epoch": 0.8554937622125357, - "flos": 21284721469440.0, - "grad_norm": 2.007233284435357, - "language_loss": 0.73960888, - "learning_rate": 2.1499970239944542e-07, - "loss": 0.76070321, - "num_input_tokens_seen": 306814160, - "step": 14229, - "time_per_iteration": 2.653921365737915 - }, - { - "auxiliary_loss_clip": 0.01096335, - "auxiliary_loss_mlp": 0.01031192, - "balance_loss_clip": 1.03618884, - "balance_loss_mlp": 1.01952744, - "epoch": 0.8555538854652037, - "flos": 22413178120320.0, - "grad_norm": 2.129951857800807, - "language_loss": 0.72556508, - "learning_rate": 2.1482407068313724e-07, - "loss": 0.74684036, - "num_input_tokens_seen": 306833310, - "step": 14230, - "time_per_iteration": 2.611541509628296 - }, - { - "auxiliary_loss_clip": 0.01094442, - "auxiliary_loss_mlp": 0.01030708, - "balance_loss_clip": 1.03460538, - "balance_loss_mlp": 1.01829863, - "epoch": 0.8556140087178716, - "flos": 20193719725440.0, - "grad_norm": 2.23514067772812, - "language_loss": 0.82251632, - "learning_rate": 2.1464850666087897e-07, - "loss": 0.84376776, - "num_input_tokens_seen": 306851345, - "step": 14231, - "time_per_iteration": 2.6085596084594727 - }, - { - "auxiliary_loss_clip": 0.01100487, - "auxiliary_loss_mlp": 0.01033167, - "balance_loss_clip": 1.03759503, - "balance_loss_mlp": 1.01945221, - "epoch": 0.8556741319705397, - "flos": 22638123043200.0, - "grad_norm": 2.1730018430212175, - "language_loss": 0.67839086, - "learning_rate": 2.1447301033932796e-07, - "loss": 0.69972742, - "num_input_tokens_seen": 306871040, - "step": 14232, - "time_per_iteration": 2.619722843170166 - }, - { - "auxiliary_loss_clip": 0.01088548, - "auxiliary_loss_mlp": 0.01032023, - "balance_loss_clip": 1.03769374, - "balance_loss_mlp": 1.01942301, - "epoch": 0.8557342552232076, - "flos": 23549320281600.0, - "grad_norm": 1.4620803714373924, - "language_loss": 0.66840327, - "learning_rate": 2.1429758172513955e-07, - "loss": 0.68960893, - "num_input_tokens_seen": 306891625, - "step": 14233, - "time_per_iteration": 2.645831346511841 - }, - { - "auxiliary_loss_clip": 0.01096889, - "auxiliary_loss_mlp": 0.01034394, - "balance_loss_clip": 1.03637278, - "balance_loss_mlp": 1.02236605, - "epoch": 0.8557943784758756, - "flos": 19609884063360.0, - "grad_norm": 2.026925189610044, - "language_loss": 0.76869869, - "learning_rate": 2.1412222082496556e-07, - "loss": 0.79001153, - "num_input_tokens_seen": 306910020, - "step": 14234, - "time_per_iteration": 2.58845853805542 - }, - { - "auxiliary_loss_clip": 0.01001494, - "auxiliary_loss_mlp": 0.01021829, - "balance_loss_clip": 1.00670254, - "balance_loss_mlp": 1.02035093, - "epoch": 0.8558545017285435, - "flos": 70641891446400.0, - "grad_norm": 0.7646124593211208, - "language_loss": 0.57967913, - "learning_rate": 2.1394692764545684e-07, - "loss": 0.59991229, - "num_input_tokens_seen": 306969505, - "step": 14235, - "time_per_iteration": 3.2275688648223877 - }, - { - "auxiliary_loss_clip": 0.0101382, - "auxiliary_loss_mlp": 0.01002617, - "balance_loss_clip": 1.01051199, - "balance_loss_mlp": 1.00143075, - "epoch": 0.8559146249812115, - "flos": 56649983086080.0, - "grad_norm": 0.8551315667817418, - "language_loss": 0.56688058, - "learning_rate": 2.1377170219325858e-07, - "loss": 0.58704495, - "num_input_tokens_seen": 307027710, - "step": 14236, - "time_per_iteration": 3.086979866027832 - }, - { - "auxiliary_loss_clip": 0.01086537, - "auxiliary_loss_mlp": 0.01035642, - "balance_loss_clip": 1.035743, - "balance_loss_mlp": 1.02300572, - "epoch": 0.8559747482338794, - "flos": 22888240421760.0, - "grad_norm": 1.785861454279873, - "language_loss": 0.70469606, - "learning_rate": 2.1359654447501673e-07, - "loss": 0.72591788, - "num_input_tokens_seen": 307045515, - "step": 14237, - "time_per_iteration": 2.615514039993286 - }, - { - "auxiliary_loss_clip": 0.01085764, - "auxiliary_loss_mlp": 0.01029104, - "balance_loss_clip": 1.03411865, - "balance_loss_mlp": 1.01737368, - "epoch": 0.8560348714865474, - "flos": 22601925112320.0, - "grad_norm": 2.6092090917428465, - "language_loss": 0.63390237, - "learning_rate": 2.1342145449737314e-07, - "loss": 0.65505099, - "num_input_tokens_seen": 307064470, - "step": 14238, - "time_per_iteration": 2.8091626167297363 - }, - { - "auxiliary_loss_clip": 0.01104641, - "auxiliary_loss_mlp": 0.01032794, - "balance_loss_clip": 1.03615522, - "balance_loss_mlp": 1.02233911, - "epoch": 0.8560949947392154, - "flos": 17931455297280.0, - "grad_norm": 1.7164911082437782, - "language_loss": 0.69517374, - "learning_rate": 2.1324643226696648e-07, - "loss": 0.71654809, - "num_input_tokens_seen": 307083900, - "step": 14239, - "time_per_iteration": 2.57605242729187 - }, - { - "auxiliary_loss_clip": 0.01111794, - "auxiliary_loss_mlp": 0.0103739, - "balance_loss_clip": 1.0377574, - "balance_loss_mlp": 1.02455664, - "epoch": 0.8561551179918834, - "flos": 31026208636800.0, - "grad_norm": 2.169346981343539, - "language_loss": 0.66365606, - "learning_rate": 2.1307147779043455e-07, - "loss": 0.68514788, - "num_input_tokens_seen": 307104590, - "step": 14240, - "time_per_iteration": 2.6193511486053467 - }, - { - "auxiliary_loss_clip": 0.01068263, - "auxiliary_loss_mlp": 0.01040061, - "balance_loss_clip": 1.0336616, - "balance_loss_mlp": 1.02518964, - "epoch": 0.8562152412445513, - "flos": 30665198995200.0, - "grad_norm": 1.6476205784607059, - "language_loss": 0.62131298, - "learning_rate": 2.1289659107441182e-07, - "loss": 0.64239621, - "num_input_tokens_seen": 307125580, - "step": 14241, - "time_per_iteration": 2.7614312171936035 - }, - { - "auxiliary_loss_clip": 0.01112623, - "auxiliary_loss_mlp": 0.01036862, - "balance_loss_clip": 1.03619862, - "balance_loss_mlp": 1.02343321, - "epoch": 0.8562753644972193, - "flos": 31576144838400.0, - "grad_norm": 2.266500331980379, - "language_loss": 0.74537355, - "learning_rate": 2.1272177212552855e-07, - "loss": 0.76686835, - "num_input_tokens_seen": 307147625, - "step": 14242, - "time_per_iteration": 2.6258413791656494 - }, - { - "auxiliary_loss_clip": 0.01043356, - "auxiliary_loss_mlp": 0.01049301, - "balance_loss_clip": 1.0376476, - "balance_loss_mlp": 1.03507984, - "epoch": 0.8563354877498872, - "flos": 26213640618240.0, - "grad_norm": 2.248077645392886, - "language_loss": 0.7636081, - "learning_rate": 2.1254702095041498e-07, - "loss": 0.78453457, - "num_input_tokens_seen": 307164665, - "step": 14243, - "time_per_iteration": 2.819819927215576 - }, - { - "auxiliary_loss_clip": 0.01088321, - "auxiliary_loss_mlp": 0.00769311, - "balance_loss_clip": 1.03758311, - "balance_loss_mlp": 1.00028658, - "epoch": 0.8563956110025552, - "flos": 24134341092480.0, - "grad_norm": 2.314406650865767, - "language_loss": 0.68075836, - "learning_rate": 2.123723375556974e-07, - "loss": 0.69933462, - "num_input_tokens_seen": 307182530, - "step": 14244, - "time_per_iteration": 2.668156147003174 - }, - { - "auxiliary_loss_clip": 0.01020209, - "auxiliary_loss_mlp": 0.01006142, - "balance_loss_clip": 1.0066725, - "balance_loss_mlp": 1.00496769, - "epoch": 0.8564557342552233, - "flos": 56271986311680.0, - "grad_norm": 0.7568226385522613, - "language_loss": 0.58461487, - "learning_rate": 2.1219772194800046e-07, - "loss": 0.60487843, - "num_input_tokens_seen": 307241240, - "step": 14245, - "time_per_iteration": 3.0361111164093018 - }, - { - "auxiliary_loss_clip": 0.01102848, - "auxiliary_loss_mlp": 0.01031456, - "balance_loss_clip": 1.03873086, - "balance_loss_mlp": 1.01862907, - "epoch": 0.8565158575078912, - "flos": 23440618748160.0, - "grad_norm": 1.7549005151263664, - "language_loss": 0.77337581, - "learning_rate": 2.1202317413394488e-07, - "loss": 0.79471886, - "num_input_tokens_seen": 307261485, - "step": 14246, - "time_per_iteration": 2.630526542663574 - }, - { - "auxiliary_loss_clip": 0.01082478, - "auxiliary_loss_mlp": 0.01027673, - "balance_loss_clip": 1.03102589, - "balance_loss_mlp": 1.01518607, - "epoch": 0.8565759807605592, - "flos": 20375930442240.0, - "grad_norm": 1.8941484357847163, - "language_loss": 0.81755006, - "learning_rate": 2.1184869412014938e-07, - "loss": 0.83865154, - "num_input_tokens_seen": 307279160, - "step": 14247, - "time_per_iteration": 2.637540578842163 - }, - { - "auxiliary_loss_clip": 0.01088373, - "auxiliary_loss_mlp": 0.01031625, - "balance_loss_clip": 1.03624964, - "balance_loss_mlp": 1.01832116, - "epoch": 0.8566361040132271, - "flos": 18807101049600.0, - "grad_norm": 1.8985078396153772, - "language_loss": 0.77648062, - "learning_rate": 2.1167428191323112e-07, - "loss": 0.79768062, - "num_input_tokens_seen": 307297920, - "step": 14248, - "time_per_iteration": 2.637140989303589 - }, - { - "auxiliary_loss_clip": 0.01059574, - "auxiliary_loss_mlp": 0.01038673, - "balance_loss_clip": 1.03150558, - "balance_loss_mlp": 1.02398682, - "epoch": 0.8566962272658951, - "flos": 24535355506560.0, - "grad_norm": 1.8205967022668303, - "language_loss": 0.78117526, - "learning_rate": 2.1149993751980278e-07, - "loss": 0.8021577, - "num_input_tokens_seen": 307318320, - "step": 14249, - "time_per_iteration": 2.747084856033325 - }, - { - "auxiliary_loss_clip": 0.01082913, - "auxiliary_loss_mlp": 0.01033057, - "balance_loss_clip": 1.03500676, - "balance_loss_mlp": 1.02062345, - "epoch": 0.856756350518563, - "flos": 23178506227200.0, - "grad_norm": 1.834584951570381, - "language_loss": 0.78369069, - "learning_rate": 2.1132566094647597e-07, - "loss": 0.80485034, - "num_input_tokens_seen": 307336720, - "step": 14250, - "time_per_iteration": 2.6694507598876953 - }, - { - "auxiliary_loss_clip": 0.01085775, - "auxiliary_loss_mlp": 0.01030606, - "balance_loss_clip": 1.03689909, - "balance_loss_mlp": 1.01948988, - "epoch": 0.856816473771231, - "flos": 20808581760000.0, - "grad_norm": 1.7702839302991833, - "language_loss": 0.79165637, - "learning_rate": 2.1115145219985942e-07, - "loss": 0.81282026, - "num_input_tokens_seen": 307354120, - "step": 14251, - "time_per_iteration": 2.61769962310791 - }, - { - "auxiliary_loss_clip": 0.01071172, - "auxiliary_loss_mlp": 0.01031805, - "balance_loss_clip": 1.03706813, - "balance_loss_mlp": 1.01999116, - "epoch": 0.856876597023899, - "flos": 20228157889920.0, - "grad_norm": 2.063789660652868, - "language_loss": 0.61335462, - "learning_rate": 2.1097731128656005e-07, - "loss": 0.63438439, - "num_input_tokens_seen": 307373165, - "step": 14252, - "time_per_iteration": 2.730942964553833 - }, - { - "auxiliary_loss_clip": 0.01088715, - "auxiliary_loss_mlp": 0.01037397, - "balance_loss_clip": 1.04091692, - "balance_loss_mlp": 1.02395606, - "epoch": 0.856936720276567, - "flos": 18296128126080.0, - "grad_norm": 1.8690578228710872, - "language_loss": 0.69612849, - "learning_rate": 2.1080323821317924e-07, - "loss": 0.71738964, - "num_input_tokens_seen": 307391000, - "step": 14253, - "time_per_iteration": 2.6573426723480225 - }, - { - "auxiliary_loss_clip": 0.01013485, - "auxiliary_loss_mlp": 0.01001116, - "balance_loss_clip": 1.00999308, - "balance_loss_mlp": 1.0000428, - "epoch": 0.8569968435292349, - "flos": 69878394933120.0, - "grad_norm": 0.7842094362693159, - "language_loss": 0.59178007, - "learning_rate": 2.1062923298631907e-07, - "loss": 0.61192608, - "num_input_tokens_seen": 307452865, - "step": 14254, - "time_per_iteration": 3.2271313667297363 - }, - { - "auxiliary_loss_clip": 0.0108384, - "auxiliary_loss_mlp": 0.01034196, - "balance_loss_clip": 1.0343616, - "balance_loss_mlp": 1.02042699, - "epoch": 0.8570569667819029, - "flos": 25848572739840.0, - "grad_norm": 1.7290830197798204, - "language_loss": 0.80958641, - "learning_rate": 2.1045529561257825e-07, - "loss": 0.8307668, - "num_input_tokens_seen": 307471940, - "step": 14255, - "time_per_iteration": 2.6941943168640137 - }, - { - "auxiliary_loss_clip": 0.011065, - "auxiliary_loss_mlp": 0.01024921, - "balance_loss_clip": 1.03668928, - "balance_loss_mlp": 1.01289284, - "epoch": 0.8571170900345708, - "flos": 23257115141760.0, - "grad_norm": 1.9710027507831065, - "language_loss": 0.67309523, - "learning_rate": 2.1028142609855126e-07, - "loss": 0.69440937, - "num_input_tokens_seen": 307488745, - "step": 14256, - "time_per_iteration": 2.719081163406372 - }, - { - "auxiliary_loss_clip": 0.01099477, - "auxiliary_loss_mlp": 0.01031183, - "balance_loss_clip": 1.037992, - "balance_loss_mlp": 1.01950645, - "epoch": 0.8571772132872388, - "flos": 18917670090240.0, - "grad_norm": 2.031884958657008, - "language_loss": 0.70139217, - "learning_rate": 2.1010762445083218e-07, - "loss": 0.72269881, - "num_input_tokens_seen": 307506855, - "step": 14257, - "time_per_iteration": 2.600598096847534 - }, - { - "auxiliary_loss_clip": 0.01073361, - "auxiliary_loss_mlp": 0.01032458, - "balance_loss_clip": 1.03339398, - "balance_loss_mlp": 1.01963735, - "epoch": 0.8572373365399069, - "flos": 33250120318080.0, - "grad_norm": 2.468248667135471, - "language_loss": 0.77000117, - "learning_rate": 2.0993389067601197e-07, - "loss": 0.79105937, - "num_input_tokens_seen": 307526115, - "step": 14258, - "time_per_iteration": 2.757704973220825 - }, - { - "auxiliary_loss_clip": 0.01096583, - "auxiliary_loss_mlp": 0.00769575, - "balance_loss_clip": 1.0357585, - "balance_loss_mlp": 1.00029516, - "epoch": 0.8572974597925748, - "flos": 23327535755520.0, - "grad_norm": 1.474412147771869, - "language_loss": 0.6799866, - "learning_rate": 2.0976022478067735e-07, - "loss": 0.69864815, - "num_input_tokens_seen": 307545230, - "step": 14259, - "time_per_iteration": 2.6122398376464844 - }, - { - "auxiliary_loss_clip": 0.010953, - "auxiliary_loss_mlp": 0.0103545, - "balance_loss_clip": 1.03352249, - "balance_loss_mlp": 1.02250957, - "epoch": 0.8573575830452428, - "flos": 24535858296960.0, - "grad_norm": 1.6836228896931322, - "language_loss": 0.77251399, - "learning_rate": 2.0958662677141437e-07, - "loss": 0.79382151, - "num_input_tokens_seen": 307564900, - "step": 14260, - "time_per_iteration": 2.6170718669891357 - }, - { - "auxiliary_loss_clip": 0.01083087, - "auxiliary_loss_mlp": 0.01031864, - "balance_loss_clip": 1.03345168, - "balance_loss_mlp": 1.0186913, - "epoch": 0.8574177062979107, - "flos": 24165403378560.0, - "grad_norm": 1.694275563361149, - "language_loss": 0.74151957, - "learning_rate": 2.09413096654806e-07, - "loss": 0.76266909, - "num_input_tokens_seen": 307583500, - "step": 14261, - "time_per_iteration": 4.178469181060791 - }, - { - "auxiliary_loss_clip": 0.0109609, - "auxiliary_loss_mlp": 0.01032733, - "balance_loss_clip": 1.03748691, - "balance_loss_mlp": 1.01923871, - "epoch": 0.8574778295505787, - "flos": 17930737025280.0, - "grad_norm": 1.9745240066766159, - "language_loss": 0.78983176, - "learning_rate": 2.0923963443743276e-07, - "loss": 0.81111997, - "num_input_tokens_seen": 307601430, - "step": 14262, - "time_per_iteration": 2.646378993988037 - }, - { - "auxiliary_loss_clip": 0.0107326, - "auxiliary_loss_mlp": 0.01032783, - "balance_loss_clip": 1.03582883, - "balance_loss_mlp": 1.02097511, - "epoch": 0.8575379528032466, - "flos": 21580697537280.0, - "grad_norm": 1.674172506907798, - "language_loss": 0.67816055, - "learning_rate": 2.0906624012587203e-07, - "loss": 0.69922101, - "num_input_tokens_seen": 307621495, - "step": 14263, - "time_per_iteration": 4.332361698150635 - }, - { - "auxiliary_loss_clip": 0.01072214, - "auxiliary_loss_mlp": 0.00770907, - "balance_loss_clip": 1.03429055, - "balance_loss_mlp": 1.00025988, - "epoch": 0.8575980760559146, - "flos": 21761579450880.0, - "grad_norm": 1.4408705629721363, - "language_loss": 0.79718733, - "learning_rate": 2.088929137266986e-07, - "loss": 0.81561852, - "num_input_tokens_seen": 307640840, - "step": 14264, - "time_per_iteration": 2.753828287124634 - }, - { - "auxiliary_loss_clip": 0.01071482, - "auxiliary_loss_mlp": 0.01039645, - "balance_loss_clip": 1.03247488, - "balance_loss_mlp": 1.02618599, - "epoch": 0.8576581993085826, - "flos": 34386442047360.0, - "grad_norm": 1.2896850911673399, - "language_loss": 0.69861013, - "learning_rate": 2.0871965524648582e-07, - "loss": 0.71972132, - "num_input_tokens_seen": 307663820, - "step": 14265, - "time_per_iteration": 4.417909145355225 - }, - { - "auxiliary_loss_clip": 0.01105479, - "auxiliary_loss_mlp": 0.01028348, - "balance_loss_clip": 1.03650212, - "balance_loss_mlp": 1.01695776, - "epoch": 0.8577183225612506, - "flos": 23222497409280.0, - "grad_norm": 1.6592250093825642, - "language_loss": 0.66188025, - "learning_rate": 2.085464646918027e-07, - "loss": 0.68321854, - "num_input_tokens_seen": 307682385, - "step": 14266, - "time_per_iteration": 2.6130142211914062 - }, - { - "auxiliary_loss_clip": 0.01087662, - "auxiliary_loss_mlp": 0.01032473, - "balance_loss_clip": 1.03722739, - "balance_loss_mlp": 1.02009344, - "epoch": 0.8577784458139185, - "flos": 28804164462720.0, - "grad_norm": 1.6281862094757322, - "language_loss": 0.75571585, - "learning_rate": 2.0837334206921731e-07, - "loss": 0.77691722, - "num_input_tokens_seen": 307704680, - "step": 14267, - "time_per_iteration": 4.48302960395813 - }, - { - "auxiliary_loss_clip": 0.01095891, - "auxiliary_loss_mlp": 0.01032819, - "balance_loss_clip": 1.03645444, - "balance_loss_mlp": 1.02119589, - "epoch": 0.8578385690665865, - "flos": 19755573626880.0, - "grad_norm": 1.7702696425064848, - "language_loss": 0.87967706, - "learning_rate": 2.082002873852946e-07, - "loss": 0.9009642, - "num_input_tokens_seen": 307723245, - "step": 14268, - "time_per_iteration": 2.7304728031158447 - }, - { - "auxiliary_loss_clip": 0.01098203, - "auxiliary_loss_mlp": 0.01036419, - "balance_loss_clip": 1.03701484, - "balance_loss_mlp": 1.02400303, - "epoch": 0.8578986923192544, - "flos": 20704082117760.0, - "grad_norm": 2.207459116191671, - "language_loss": 0.72899628, - "learning_rate": 2.0802730064659667e-07, - "loss": 0.75034249, - "num_input_tokens_seen": 307742510, - "step": 14269, - "time_per_iteration": 2.686720848083496 - }, - { - "auxiliary_loss_clip": 0.01099494, - "auxiliary_loss_mlp": 0.01032209, - "balance_loss_clip": 1.03617907, - "balance_loss_mlp": 1.01991236, - "epoch": 0.8579588155719224, - "flos": 36101715189120.0, - "grad_norm": 1.7486556391574948, - "language_loss": 0.66497004, - "learning_rate": 2.0785438185968252e-07, - "loss": 0.68628705, - "num_input_tokens_seen": 307766030, - "step": 14270, - "time_per_iteration": 2.759577751159668 - }, - { - "auxiliary_loss_clip": 0.01082271, - "auxiliary_loss_mlp": 0.0103104, - "balance_loss_clip": 1.0320828, - "balance_loss_mlp": 1.01854658, - "epoch": 0.8580189388245905, - "flos": 22853479034880.0, - "grad_norm": 1.9964784224395893, - "language_loss": 0.73861098, - "learning_rate": 2.0768153103110997e-07, - "loss": 0.75974405, - "num_input_tokens_seen": 307785800, - "step": 14271, - "time_per_iteration": 2.6652464866638184 - }, - { - "auxiliary_loss_clip": 0.00990812, - "auxiliary_loss_mlp": 0.00751033, - "balance_loss_clip": 1.00730669, - "balance_loss_mlp": 0.99962157, - "epoch": 0.8580790620772584, - "flos": 69642104290560.0, - "grad_norm": 0.808728293182982, - "language_loss": 0.595052, - "learning_rate": 2.0750874816743358e-07, - "loss": 0.61247051, - "num_input_tokens_seen": 307850995, - "step": 14272, - "time_per_iteration": 3.3493616580963135 - }, - { - "auxiliary_loss_clip": 0.0108737, - "auxiliary_loss_mlp": 0.01037635, - "balance_loss_clip": 1.03556502, - "balance_loss_mlp": 1.02342474, - "epoch": 0.8581391853299264, - "flos": 13334243270400.0, - "grad_norm": 1.7519497448745491, - "language_loss": 0.75282109, - "learning_rate": 2.0733603327520499e-07, - "loss": 0.7740711, - "num_input_tokens_seen": 307868585, - "step": 14273, - "time_per_iteration": 2.6791751384735107 - }, - { - "auxiliary_loss_clip": 0.01097542, - "auxiliary_loss_mlp": 0.01029287, - "balance_loss_clip": 1.03653765, - "balance_loss_mlp": 1.01684737, - "epoch": 0.8581993085825943, - "flos": 19645651031040.0, - "grad_norm": 1.8670463657155183, - "language_loss": 0.82038534, - "learning_rate": 2.0716338636097385e-07, - "loss": 0.84165359, - "num_input_tokens_seen": 307886820, - "step": 14274, - "time_per_iteration": 2.617358446121216 - }, - { - "auxiliary_loss_clip": 0.0101945, - "auxiliary_loss_mlp": 0.01002494, - "balance_loss_clip": 1.00673366, - "balance_loss_mlp": 1.00137389, - "epoch": 0.8582594318352623, - "flos": 55825077294720.0, - "grad_norm": 0.7943422785901219, - "language_loss": 0.60750306, - "learning_rate": 2.0699080743128672e-07, - "loss": 0.6277225, - "num_input_tokens_seen": 307944020, - "step": 14275, - "time_per_iteration": 3.2472341060638428 - }, - { - "auxiliary_loss_clip": 0.01096248, - "auxiliary_loss_mlp": 0.01028909, - "balance_loss_clip": 1.03805137, - "balance_loss_mlp": 1.01562345, - "epoch": 0.8583195550879302, - "flos": 24279563779200.0, - "grad_norm": 2.0431646133306764, - "language_loss": 0.59516066, - "learning_rate": 2.0681829649268768e-07, - "loss": 0.61641222, - "num_input_tokens_seen": 307961055, - "step": 14276, - "time_per_iteration": 2.7009382247924805 - }, - { - "auxiliary_loss_clip": 0.0108586, - "auxiliary_loss_mlp": 0.01034021, - "balance_loss_clip": 1.03556418, - "balance_loss_mlp": 1.02205861, - "epoch": 0.8583796783405983, - "flos": 13444129952640.0, - "grad_norm": 2.25300331444078, - "language_loss": 0.76484519, - "learning_rate": 2.0664585355171838e-07, - "loss": 0.786044, - "num_input_tokens_seen": 307978690, - "step": 14277, - "time_per_iteration": 2.6383044719696045 - }, - { - "auxiliary_loss_clip": 0.01085815, - "auxiliary_loss_mlp": 0.0102939, - "balance_loss_clip": 1.0350821, - "balance_loss_mlp": 1.01708126, - "epoch": 0.8584398015932662, - "flos": 16180271533440.0, - "grad_norm": 1.614064459915635, - "language_loss": 0.83699441, - "learning_rate": 2.0647347861491803e-07, - "loss": 0.85814643, - "num_input_tokens_seen": 307995870, - "step": 14278, - "time_per_iteration": 2.690840721130371 - }, - { - "auxiliary_loss_clip": 0.01087706, - "auxiliary_loss_mlp": 0.01030174, - "balance_loss_clip": 1.03669083, - "balance_loss_mlp": 1.01709092, - "epoch": 0.8584999248459342, - "flos": 17450431338240.0, - "grad_norm": 2.022696664220824, - "language_loss": 0.74557948, - "learning_rate": 2.0630117168882366e-07, - "loss": 0.76675826, - "num_input_tokens_seen": 308013645, - "step": 14279, - "time_per_iteration": 2.6342451572418213 - }, - { - "auxiliary_loss_clip": 0.01107856, - "auxiliary_loss_mlp": 0.0103262, - "balance_loss_clip": 1.03726792, - "balance_loss_mlp": 1.0206275, - "epoch": 0.8585600480986021, - "flos": 23441013797760.0, - "grad_norm": 2.2412241965372095, - "language_loss": 0.66438127, - "learning_rate": 2.0612893277996845e-07, - "loss": 0.68578601, - "num_input_tokens_seen": 308032490, - "step": 14280, - "time_per_iteration": 2.586599349975586 - }, - { - "auxiliary_loss_clip": 0.01095719, - "auxiliary_loss_mlp": 0.01028274, - "balance_loss_clip": 1.03592777, - "balance_loss_mlp": 1.01651978, - "epoch": 0.8586201713512701, - "flos": 19937927998080.0, - "grad_norm": 1.86716453090562, - "language_loss": 0.62667966, - "learning_rate": 2.0595676189488343e-07, - "loss": 0.64791965, - "num_input_tokens_seen": 308052110, - "step": 14281, - "time_per_iteration": 2.6187994480133057 - }, - { - "auxiliary_loss_clip": 0.01084456, - "auxiliary_loss_mlp": 0.00770032, - "balance_loss_clip": 1.03628945, - "balance_loss_mlp": 1.00014341, - "epoch": 0.858680294603938, - "flos": 15304769435520.0, - "grad_norm": 3.7299826958950493, - "language_loss": 0.73169029, - "learning_rate": 2.0578465904009845e-07, - "loss": 0.7502352, - "num_input_tokens_seen": 308070660, - "step": 14282, - "time_per_iteration": 2.7070963382720947 - }, - { - "auxiliary_loss_clip": 0.01080016, - "auxiliary_loss_mlp": 0.01030436, - "balance_loss_clip": 1.0322001, - "balance_loss_mlp": 1.01892662, - "epoch": 0.858740417856606, - "flos": 22711237176960.0, - "grad_norm": 1.8508205946022054, - "language_loss": 0.75599825, - "learning_rate": 2.0561262422213832e-07, - "loss": 0.77710283, - "num_input_tokens_seen": 308089520, - "step": 14283, - "time_per_iteration": 2.70784854888916 - }, - { - "auxiliary_loss_clip": 0.01093289, - "auxiliary_loss_mlp": 0.01032374, - "balance_loss_clip": 1.03351057, - "balance_loss_mlp": 1.01973772, - "epoch": 0.8588005411092741, - "flos": 34054303962240.0, - "grad_norm": 1.810517869683259, - "language_loss": 0.60200775, - "learning_rate": 2.0544065744752736e-07, - "loss": 0.62326431, - "num_input_tokens_seen": 308111545, - "step": 14284, - "time_per_iteration": 2.804454803466797 - }, - { - "auxiliary_loss_clip": 0.01080997, - "auxiliary_loss_mlp": 0.01030838, - "balance_loss_clip": 1.03671587, - "balance_loss_mlp": 1.01877391, - "epoch": 0.858860664361942, - "flos": 28913584268160.0, - "grad_norm": 1.9759393563383274, - "language_loss": 0.75834155, - "learning_rate": 2.0526875872278749e-07, - "loss": 0.77945989, - "num_input_tokens_seen": 308129690, - "step": 14285, - "time_per_iteration": 2.717355489730835 - }, - { - "auxiliary_loss_clip": 0.01096428, - "auxiliary_loss_mlp": 0.01034968, - "balance_loss_clip": 1.03976953, - "balance_loss_mlp": 1.0222249, - "epoch": 0.85892078761461, - "flos": 19792525743360.0, - "grad_norm": 2.2818993237689242, - "language_loss": 0.7433964, - "learning_rate": 2.0509692805443524e-07, - "loss": 0.76471031, - "num_input_tokens_seen": 308147410, - "step": 14286, - "time_per_iteration": 2.60193133354187 - }, - { - "auxiliary_loss_clip": 0.01009396, - "auxiliary_loss_mlp": 0.00750956, - "balance_loss_clip": 1.0070982, - "balance_loss_mlp": 0.99964851, - "epoch": 0.8589809108672779, - "flos": 67106630039040.0, - "grad_norm": 0.7818074542698659, - "language_loss": 0.4943513, - "learning_rate": 2.0492516544898718e-07, - "loss": 0.51195478, - "num_input_tokens_seen": 308204875, - "step": 14287, - "time_per_iteration": 3.223233461380005 - }, - { - "auxiliary_loss_clip": 0.01099243, - "auxiliary_loss_mlp": 0.01030496, - "balance_loss_clip": 1.03820276, - "balance_loss_mlp": 1.01868248, - "epoch": 0.8590410341199459, - "flos": 29716259541120.0, - "grad_norm": 2.0753846040431574, - "language_loss": 0.79119551, - "learning_rate": 2.0475347091295704e-07, - "loss": 0.81249291, - "num_input_tokens_seen": 308225690, - "step": 14288, - "time_per_iteration": 2.8012468814849854 - }, - { - "auxiliary_loss_clip": 0.01070856, - "auxiliary_loss_mlp": 0.01034723, - "balance_loss_clip": 1.03844345, - "balance_loss_mlp": 1.02160382, - "epoch": 0.8591011573726138, - "flos": 23987430466560.0, - "grad_norm": 2.333742079437343, - "language_loss": 0.80807364, - "learning_rate": 2.045818444528553e-07, - "loss": 0.82912946, - "num_input_tokens_seen": 308245255, - "step": 14289, - "time_per_iteration": 2.677363634109497 - }, - { - "auxiliary_loss_clip": 0.01101023, - "auxiliary_loss_mlp": 0.01032169, - "balance_loss_clip": 1.03798854, - "balance_loss_mlp": 1.01974702, - "epoch": 0.8591612806252819, - "flos": 14428656806400.0, - "grad_norm": 1.8411584307927096, - "language_loss": 0.65171552, - "learning_rate": 2.0441028607518973e-07, - "loss": 0.67304742, - "num_input_tokens_seen": 308261755, - "step": 14290, - "time_per_iteration": 2.6130077838897705 - }, - { - "auxiliary_loss_clip": 0.01088699, - "auxiliary_loss_mlp": 0.01029909, - "balance_loss_clip": 1.0362072, - "balance_loss_mlp": 1.01692736, - "epoch": 0.8592214038779498, - "flos": 31577150419200.0, - "grad_norm": 1.9868152248145707, - "language_loss": 0.55034781, - "learning_rate": 2.0423879578646642e-07, - "loss": 0.57153386, - "num_input_tokens_seen": 308285145, - "step": 14291, - "time_per_iteration": 2.7079780101776123 - }, - { - "auxiliary_loss_clip": 0.0110119, - "auxiliary_loss_mlp": 0.01031083, - "balance_loss_clip": 1.03754354, - "balance_loss_mlp": 1.01885247, - "epoch": 0.8592815271306178, - "flos": 17457290835840.0, - "grad_norm": 2.056468770778706, - "language_loss": 0.71314991, - "learning_rate": 2.0406737359318792e-07, - "loss": 0.73447263, - "num_input_tokens_seen": 308304130, - "step": 14292, - "time_per_iteration": 2.595897674560547 - }, - { - "auxiliary_loss_clip": 0.01098184, - "auxiliary_loss_mlp": 0.01034158, - "balance_loss_clip": 1.03526211, - "balance_loss_mlp": 1.02187383, - "epoch": 0.8593416503832857, - "flos": 25411360394880.0, - "grad_norm": 1.470631901330716, - "language_loss": 0.71314609, - "learning_rate": 2.038960195018542e-07, - "loss": 0.73446953, - "num_input_tokens_seen": 308324670, - "step": 14293, - "time_per_iteration": 2.652717351913452 - }, - { - "auxiliary_loss_clip": 0.01080648, - "auxiliary_loss_mlp": 0.01034358, - "balance_loss_clip": 1.03720033, - "balance_loss_mlp": 1.02217507, - "epoch": 0.8594017736359537, - "flos": 20996646393600.0, - "grad_norm": 1.543293476083091, - "language_loss": 0.6855827, - "learning_rate": 2.0372473351896358e-07, - "loss": 0.70673275, - "num_input_tokens_seen": 308344215, - "step": 14294, - "time_per_iteration": 2.6766042709350586 - }, - { - "auxiliary_loss_clip": 0.01104946, - "auxiliary_loss_mlp": 0.01031001, - "balance_loss_clip": 1.03467357, - "balance_loss_mlp": 1.01901507, - "epoch": 0.8594618968886216, - "flos": 22091059929600.0, - "grad_norm": 1.9038081617192384, - "language_loss": 0.77887809, - "learning_rate": 2.0355351565101087e-07, - "loss": 0.80023754, - "num_input_tokens_seen": 308360520, - "step": 14295, - "time_per_iteration": 2.6753733158111572 - }, - { - "auxiliary_loss_clip": 0.01085392, - "auxiliary_loss_mlp": 0.01037375, - "balance_loss_clip": 1.03575659, - "balance_loss_mlp": 1.02281332, - "epoch": 0.8595220201412896, - "flos": 11656245467520.0, - "grad_norm": 2.8815633850100095, - "language_loss": 0.69029182, - "learning_rate": 2.0338236590448975e-07, - "loss": 0.71151948, - "num_input_tokens_seen": 308376865, - "step": 14296, - "time_per_iteration": 2.6722471714019775 - }, - { - "auxiliary_loss_clip": 0.01081568, - "auxiliary_loss_mlp": 0.01033296, - "balance_loss_clip": 1.03467476, - "balance_loss_mlp": 1.02070773, - "epoch": 0.8595821433939577, - "flos": 25040366772480.0, - "grad_norm": 2.176741931564133, - "language_loss": 0.78606057, - "learning_rate": 2.0321128428588842e-07, - "loss": 0.80720925, - "num_input_tokens_seen": 308395870, - "step": 14297, - "time_per_iteration": 2.6577630043029785 - }, - { - "auxiliary_loss_clip": 0.01091905, - "auxiliary_loss_mlp": 0.01032076, - "balance_loss_clip": 1.03271341, - "balance_loss_mlp": 1.02086425, - "epoch": 0.8596422666466256, - "flos": 28511528359680.0, - "grad_norm": 2.673036998280705, - "language_loss": 0.67951548, - "learning_rate": 2.030402708016954e-07, - "loss": 0.7007553, - "num_input_tokens_seen": 308417250, - "step": 14298, - "time_per_iteration": 2.7069945335388184 - }, - { - "auxiliary_loss_clip": 0.01083251, - "auxiliary_loss_mlp": 0.01035882, - "balance_loss_clip": 1.0348295, - "balance_loss_mlp": 1.02360308, - "epoch": 0.8597023898992936, - "flos": 13589137157760.0, - "grad_norm": 2.2714540225430775, - "language_loss": 0.68807364, - "learning_rate": 2.0286932545839576e-07, - "loss": 0.70926499, - "num_input_tokens_seen": 308434565, - "step": 14299, - "time_per_iteration": 2.637234687805176 - }, - { - "auxiliary_loss_clip": 0.01080144, - "auxiliary_loss_mlp": 0.01036834, - "balance_loss_clip": 1.03766489, - "balance_loss_mlp": 1.02434683, - "epoch": 0.8597625131519615, - "flos": 32300821728000.0, - "grad_norm": 2.455453131727374, - "language_loss": 0.71315849, - "learning_rate": 2.0269844826247096e-07, - "loss": 0.73432827, - "num_input_tokens_seen": 308450040, - "step": 14300, - "time_per_iteration": 4.307279109954834 - }, - { - "auxiliary_loss_clip": 0.01080749, - "auxiliary_loss_mlp": 0.01035226, - "balance_loss_clip": 1.03184569, - "balance_loss_mlp": 1.02227378, - "epoch": 0.8598226364046295, - "flos": 28730367970560.0, - "grad_norm": 2.4178089215843377, - "language_loss": 0.69498658, - "learning_rate": 2.0252763922040116e-07, - "loss": 0.71614629, - "num_input_tokens_seen": 308470545, - "step": 14301, - "time_per_iteration": 2.7081966400146484 - }, - { - "auxiliary_loss_clip": 0.01056383, - "auxiliary_loss_mlp": 0.01033857, - "balance_loss_clip": 1.03381944, - "balance_loss_mlp": 1.02151251, - "epoch": 0.8598827596572974, - "flos": 21871825269120.0, - "grad_norm": 1.627550751133936, - "language_loss": 0.74207568, - "learning_rate": 2.023568983386641e-07, - "loss": 0.76297808, - "num_input_tokens_seen": 308490020, - "step": 14302, - "time_per_iteration": 2.711632251739502 - }, - { - "auxiliary_loss_clip": 0.01092554, - "auxiliary_loss_mlp": 0.01030522, - "balance_loss_clip": 1.03438914, - "balance_loss_mlp": 1.01904821, - "epoch": 0.8599428829099655, - "flos": 23767297966080.0, - "grad_norm": 1.6910498368057518, - "language_loss": 0.83883357, - "learning_rate": 2.02186225623733e-07, - "loss": 0.86006427, - "num_input_tokens_seen": 308509065, - "step": 14303, - "time_per_iteration": 4.2169249057769775 - }, - { - "auxiliary_loss_clip": 0.01096255, - "auxiliary_loss_mlp": 0.01036748, - "balance_loss_clip": 1.03428876, - "balance_loss_mlp": 1.02355707, - "epoch": 0.8600030061626334, - "flos": 16212770363520.0, - "grad_norm": 2.148560231945797, - "language_loss": 0.7746321, - "learning_rate": 2.0201562108208025e-07, - "loss": 0.7959621, - "num_input_tokens_seen": 308524725, - "step": 14304, - "time_per_iteration": 4.171972990036011 - }, - { - "auxiliary_loss_clip": 0.01110849, - "auxiliary_loss_mlp": 0.01035361, - "balance_loss_clip": 1.03822732, - "balance_loss_mlp": 1.02181315, - "epoch": 0.8600631294153014, - "flos": 15669370437120.0, - "grad_norm": 1.95456339418458, - "language_loss": 0.54470098, - "learning_rate": 2.0184508472017537e-07, - "loss": 0.56616312, - "num_input_tokens_seen": 308543525, - "step": 14305, - "time_per_iteration": 2.594041585922241 - }, - { - "auxiliary_loss_clip": 0.01108772, - "auxiliary_loss_mlp": 0.01029221, - "balance_loss_clip": 1.03798604, - "balance_loss_mlp": 1.01622105, - "epoch": 0.8601232526679693, - "flos": 17493093717120.0, - "grad_norm": 1.9212568904782885, - "language_loss": 0.84086001, - "learning_rate": 2.0167461654448558e-07, - "loss": 0.86223984, - "num_input_tokens_seen": 308557995, - "step": 14306, - "time_per_iteration": 4.086545467376709 - }, - { - "auxiliary_loss_clip": 0.01097083, - "auxiliary_loss_mlp": 0.00769534, - "balance_loss_clip": 1.03672814, - "balance_loss_mlp": 1.00017905, - "epoch": 0.8601833759206373, - "flos": 26985935963520.0, - "grad_norm": 1.3981944993349464, - "language_loss": 0.71432567, - "learning_rate": 2.01504216561474e-07, - "loss": 0.73299187, - "num_input_tokens_seen": 308582750, - "step": 14307, - "time_per_iteration": 2.7123961448669434 - }, - { - "auxiliary_loss_clip": 0.01096964, - "auxiliary_loss_mlp": 0.00771884, - "balance_loss_clip": 1.03435898, - "balance_loss_mlp": 1.00030386, - "epoch": 0.8602434991733052, - "flos": 25229760209280.0, - "grad_norm": 1.8399000779871275, - "language_loss": 0.636989, - "learning_rate": 2.0133388477760316e-07, - "loss": 0.6556775, - "num_input_tokens_seen": 308603770, - "step": 14308, - "time_per_iteration": 2.6409523487091064 - }, - { - "auxiliary_loss_clip": 0.01010709, - "auxiliary_loss_mlp": 0.01001248, - "balance_loss_clip": 1.00715673, - "balance_loss_mlp": 1.00013912, - "epoch": 0.8603036224259732, - "flos": 71015363107200.0, - "grad_norm": 0.6173812153983712, - "language_loss": 0.48415971, - "learning_rate": 2.0116362119933172e-07, - "loss": 0.50427926, - "num_input_tokens_seen": 308667735, - "step": 14309, - "time_per_iteration": 3.2728710174560547 - }, - { - "auxiliary_loss_clip": 0.01054401, - "auxiliary_loss_mlp": 0.01034995, - "balance_loss_clip": 1.03519821, - "balance_loss_mlp": 1.02176285, - "epoch": 0.8603637456786413, - "flos": 20300625578880.0, - "grad_norm": 1.8566830795066585, - "language_loss": 0.67076862, - "learning_rate": 2.0099342583311563e-07, - "loss": 0.69166255, - "num_input_tokens_seen": 308686300, - "step": 14310, - "time_per_iteration": 2.7875287532806396 - }, - { - "auxiliary_loss_clip": 0.01040328, - "auxiliary_loss_mlp": 0.01035443, - "balance_loss_clip": 1.02937603, - "balance_loss_mlp": 1.02352858, - "epoch": 0.8604238689313092, - "flos": 21835842819840.0, - "grad_norm": 1.7678336453099173, - "language_loss": 0.7815913, - "learning_rate": 2.0082329868540905e-07, - "loss": 0.80234909, - "num_input_tokens_seen": 308705825, - "step": 14311, - "time_per_iteration": 2.779208183288574 - }, - { - "auxiliary_loss_clip": 0.01096237, - "auxiliary_loss_mlp": 0.01031381, - "balance_loss_clip": 1.03626657, - "balance_loss_mlp": 1.0191319, - "epoch": 0.8604839921839772, - "flos": 18004210295040.0, - "grad_norm": 2.0823633926297087, - "language_loss": 0.72099596, - "learning_rate": 2.006532397626639e-07, - "loss": 0.74227214, - "num_input_tokens_seen": 308723340, - "step": 14312, - "time_per_iteration": 2.572300672531128 - }, - { - "auxiliary_loss_clip": 0.01079744, - "auxiliary_loss_mlp": 0.01033351, - "balance_loss_clip": 1.03377199, - "balance_loss_mlp": 1.02101254, - "epoch": 0.8605441154366451, - "flos": 16252164604800.0, - "grad_norm": 4.48770964436052, - "language_loss": 0.77972746, - "learning_rate": 2.0048324907132797e-07, - "loss": 0.80085838, - "num_input_tokens_seen": 308741280, - "step": 14313, - "time_per_iteration": 2.6455512046813965 - }, - { - "auxiliary_loss_clip": 0.01084267, - "auxiliary_loss_mlp": 0.0103529, - "balance_loss_clip": 1.03463316, - "balance_loss_mlp": 1.02147377, - "epoch": 0.8606042386893131, - "flos": 32267065921920.0, - "grad_norm": 1.4772487181933294, - "language_loss": 0.7305848, - "learning_rate": 2.003133266178474e-07, - "loss": 0.75178033, - "num_input_tokens_seen": 308762875, - "step": 14314, - "time_per_iteration": 2.760899782180786 - }, - { - "auxiliary_loss_clip": 0.01085045, - "auxiliary_loss_mlp": 0.01033377, - "balance_loss_clip": 1.03471231, - "balance_loss_mlp": 1.02096725, - "epoch": 0.860664361941981, - "flos": 20229774001920.0, - "grad_norm": 1.8071847940662549, - "language_loss": 0.68796486, - "learning_rate": 2.001434724086657e-07, - "loss": 0.70914906, - "num_input_tokens_seen": 308780315, - "step": 14315, - "time_per_iteration": 2.649801254272461 - }, - { - "auxiliary_loss_clip": 0.01096879, - "auxiliary_loss_mlp": 0.01032695, - "balance_loss_clip": 1.03672695, - "balance_loss_mlp": 1.02085114, - "epoch": 0.8607244851946491, - "flos": 25191622944000.0, - "grad_norm": 1.885182281921848, - "language_loss": 0.71844518, - "learning_rate": 1.9997368645022418e-07, - "loss": 0.73974097, - "num_input_tokens_seen": 308799435, - "step": 14316, - "time_per_iteration": 2.7529983520507812 - }, - { - "auxiliary_loss_clip": 0.01090676, - "auxiliary_loss_mlp": 0.01030269, - "balance_loss_clip": 1.04007196, - "balance_loss_mlp": 1.0183723, - "epoch": 0.860784608447317, - "flos": 20482082110080.0, - "grad_norm": 1.9094680545566136, - "language_loss": 0.82880986, - "learning_rate": 1.9980396874896056e-07, - "loss": 0.85001934, - "num_input_tokens_seen": 308817730, - "step": 14317, - "time_per_iteration": 2.6640453338623047 - }, - { - "auxiliary_loss_clip": 0.01090255, - "auxiliary_loss_mlp": 0.01030325, - "balance_loss_clip": 1.03797185, - "balance_loss_mlp": 1.01819539, - "epoch": 0.860844731699985, - "flos": 50476037696640.0, - "grad_norm": 1.6214847591514705, - "language_loss": 0.67348385, - "learning_rate": 1.996343193113108e-07, - "loss": 0.69468963, - "num_input_tokens_seen": 308841735, - "step": 14318, - "time_per_iteration": 2.869259834289551 - }, - { - "auxiliary_loss_clip": 0.01094097, - "auxiliary_loss_mlp": 0.01027928, - "balance_loss_clip": 1.0362227, - "balance_loss_mlp": 1.01671052, - "epoch": 0.8609048549526529, - "flos": 41172768455040.0, - "grad_norm": 1.558793225555784, - "language_loss": 0.71354842, - "learning_rate": 1.9946473814370911e-07, - "loss": 0.73476869, - "num_input_tokens_seen": 308865050, - "step": 14319, - "time_per_iteration": 2.844249963760376 - }, - { - "auxiliary_loss_clip": 0.0109006, - "auxiliary_loss_mlp": 0.00769912, - "balance_loss_clip": 1.03683519, - "balance_loss_mlp": 1.00023806, - "epoch": 0.8609649782053209, - "flos": 23951196622080.0, - "grad_norm": 1.8769046861773884, - "language_loss": 0.67780548, - "learning_rate": 1.992952252525839e-07, - "loss": 0.69640523, - "num_input_tokens_seen": 308885375, - "step": 14320, - "time_per_iteration": 2.6762452125549316 - }, - { - "auxiliary_loss_clip": 0.01080757, - "auxiliary_loss_mlp": 0.01037726, - "balance_loss_clip": 1.03380013, - "balance_loss_mlp": 1.02343893, - "epoch": 0.8610251014579888, - "flos": 23112574813440.0, - "grad_norm": 5.062268799214488, - "language_loss": 0.79499638, - "learning_rate": 1.9912578064436446e-07, - "loss": 0.81618118, - "num_input_tokens_seen": 308904700, - "step": 14321, - "time_per_iteration": 2.7844552993774414 - }, - { - "auxiliary_loss_clip": 0.01092256, - "auxiliary_loss_mlp": 0.00770223, - "balance_loss_clip": 1.03433347, - "balance_loss_mlp": 1.00014472, - "epoch": 0.8610852247106568, - "flos": 19426811420160.0, - "grad_norm": 1.8063677075547142, - "language_loss": 0.7084378, - "learning_rate": 1.9895640432547567e-07, - "loss": 0.72706258, - "num_input_tokens_seen": 308922985, - "step": 14322, - "time_per_iteration": 2.6614699363708496 - }, - { - "auxiliary_loss_clip": 0.01087983, - "auxiliary_loss_mlp": 0.01039264, - "balance_loss_clip": 1.03474808, - "balance_loss_mlp": 1.02575755, - "epoch": 0.8611453479633249, - "flos": 19312076401920.0, - "grad_norm": 1.9402477905188305, - "language_loss": 0.56338006, - "learning_rate": 1.9878709630234102e-07, - "loss": 0.58465254, - "num_input_tokens_seen": 308940765, - "step": 14323, - "time_per_iteration": 2.639302968978882 - }, - { - "auxiliary_loss_clip": 0.01071823, - "auxiliary_loss_mlp": 0.01026562, - "balance_loss_clip": 1.03276682, - "balance_loss_mlp": 1.01427782, - "epoch": 0.8612054712159928, - "flos": 23253667436160.0, - "grad_norm": 2.0929228827532413, - "language_loss": 0.75493181, - "learning_rate": 1.986178565813801e-07, - "loss": 0.77591568, - "num_input_tokens_seen": 308960110, - "step": 14324, - "time_per_iteration": 2.6960513591766357 - }, - { - "auxiliary_loss_clip": 0.01063342, - "auxiliary_loss_mlp": 0.01035964, - "balance_loss_clip": 1.03498292, - "balance_loss_mlp": 1.02134275, - "epoch": 0.8612655944686608, - "flos": 16028440744320.0, - "grad_norm": 2.114341094605167, - "language_loss": 0.66620868, - "learning_rate": 1.9844868516901036e-07, - "loss": 0.68720174, - "num_input_tokens_seen": 308976665, - "step": 14325, - "time_per_iteration": 2.704503297805786 - }, - { - "auxiliary_loss_clip": 0.01099873, - "auxiliary_loss_mlp": 0.01030402, - "balance_loss_clip": 1.03732955, - "balance_loss_mlp": 1.01800442, - "epoch": 0.8613257177213287, - "flos": 22492720788480.0, - "grad_norm": 1.7540053494594063, - "language_loss": 0.64823282, - "learning_rate": 1.982795820716472e-07, - "loss": 0.66953552, - "num_input_tokens_seen": 308997015, - "step": 14326, - "time_per_iteration": 2.634575843811035 - }, - { - "auxiliary_loss_clip": 0.01085647, - "auxiliary_loss_mlp": 0.01033577, - "balance_loss_clip": 1.03369999, - "balance_loss_mlp": 1.02078009, - "epoch": 0.8613858409739967, - "flos": 17238056175360.0, - "grad_norm": 1.9850234614136824, - "language_loss": 0.84380805, - "learning_rate": 1.9811054729570253e-07, - "loss": 0.86500031, - "num_input_tokens_seen": 309015250, - "step": 14327, - "time_per_iteration": 2.653275728225708 - }, - { - "auxiliary_loss_clip": 0.01098118, - "auxiliary_loss_mlp": 0.01031945, - "balance_loss_clip": 1.0356977, - "balance_loss_mlp": 1.01924908, - "epoch": 0.8614459642266646, - "flos": 22821123859200.0, - "grad_norm": 2.1125726227452186, - "language_loss": 0.7496419, - "learning_rate": 1.9794158084758661e-07, - "loss": 0.77094257, - "num_input_tokens_seen": 309034140, - "step": 14328, - "time_per_iteration": 2.644585132598877 - }, - { - "auxiliary_loss_clip": 0.01096938, - "auxiliary_loss_mlp": 0.01027149, - "balance_loss_clip": 1.03526139, - "balance_loss_mlp": 1.01539493, - "epoch": 0.8615060874793327, - "flos": 26504301473280.0, - "grad_norm": 1.8484016306146063, - "language_loss": 0.80306005, - "learning_rate": 1.9777268273370673e-07, - "loss": 0.82430089, - "num_input_tokens_seen": 309055075, - "step": 14329, - "time_per_iteration": 2.723478078842163 - }, - { - "auxiliary_loss_clip": 0.01083147, - "auxiliary_loss_mlp": 0.01031796, - "balance_loss_clip": 1.03760588, - "balance_loss_mlp": 1.01930857, - "epoch": 0.8615662107320006, - "flos": 24061011477120.0, - "grad_norm": 2.0863615030267937, - "language_loss": 0.76824546, - "learning_rate": 1.9760385296046757e-07, - "loss": 0.78939486, - "num_input_tokens_seen": 309074650, - "step": 14330, - "time_per_iteration": 2.812311887741089 - }, - { - "auxiliary_loss_clip": 0.01096755, - "auxiliary_loss_mlp": 0.01030872, - "balance_loss_clip": 1.03553391, - "balance_loss_mlp": 1.01855159, - "epoch": 0.8616263339846686, - "flos": 24165044242560.0, - "grad_norm": 1.8215281342853327, - "language_loss": 0.64920008, - "learning_rate": 1.974350915342702e-07, - "loss": 0.67047632, - "num_input_tokens_seen": 309094385, - "step": 14331, - "time_per_iteration": 2.6918468475341797 - }, - { - "auxiliary_loss_clip": 0.01086033, - "auxiliary_loss_mlp": 0.01032455, - "balance_loss_clip": 1.03811228, - "balance_loss_mlp": 1.02118349, - "epoch": 0.8616864572373365, - "flos": 21724340025600.0, - "grad_norm": 1.6141703486069339, - "language_loss": 0.760149, - "learning_rate": 1.9726639846151506e-07, - "loss": 0.7813338, - "num_input_tokens_seen": 309111815, - "step": 14332, - "time_per_iteration": 2.7376909255981445 - }, - { - "auxiliary_loss_clip": 0.01096761, - "auxiliary_loss_mlp": 0.01031207, - "balance_loss_clip": 1.03702247, - "balance_loss_mlp": 1.01777267, - "epoch": 0.8617465804900045, - "flos": 23766651521280.0, - "grad_norm": 1.7306536007075406, - "language_loss": 0.67241013, - "learning_rate": 1.9709777374859904e-07, - "loss": 0.69368982, - "num_input_tokens_seen": 309131385, - "step": 14333, - "time_per_iteration": 2.6434760093688965 - }, - { - "auxiliary_loss_clip": 0.01086243, - "auxiliary_loss_mlp": 0.0103825, - "balance_loss_clip": 1.03663921, - "balance_loss_mlp": 1.02411211, - "epoch": 0.8618067037426724, - "flos": 37703941251840.0, - "grad_norm": 1.6353598696173437, - "language_loss": 0.62017745, - "learning_rate": 1.969292174019157e-07, - "loss": 0.64142239, - "num_input_tokens_seen": 309155020, - "step": 14334, - "time_per_iteration": 2.758512258529663 - }, - { - "auxiliary_loss_clip": 0.01080188, - "auxiliary_loss_mlp": 0.01048728, - "balance_loss_clip": 1.0376997, - "balance_loss_mlp": 1.03463769, - "epoch": 0.8618668269953405, - "flos": 21471026336640.0, - "grad_norm": 4.004935288615531, - "language_loss": 0.69439906, - "learning_rate": 1.967607294278577e-07, - "loss": 0.71568823, - "num_input_tokens_seen": 309172865, - "step": 14335, - "time_per_iteration": 2.69771671295166 - }, - { - "auxiliary_loss_clip": 0.01100982, - "auxiliary_loss_mlp": 0.01035912, - "balance_loss_clip": 1.03802538, - "balance_loss_mlp": 1.02374029, - "epoch": 0.8619269502480085, - "flos": 22232691256320.0, - "grad_norm": 3.0287384377889297, - "language_loss": 0.82912672, - "learning_rate": 1.965923098328135e-07, - "loss": 0.85049564, - "num_input_tokens_seen": 309193575, - "step": 14336, - "time_per_iteration": 2.6209864616394043 - }, - { - "auxiliary_loss_clip": 0.01112224, - "auxiliary_loss_mlp": 0.010339, - "balance_loss_clip": 1.03766823, - "balance_loss_mlp": 1.02133584, - "epoch": 0.8619870735006764, - "flos": 22710626645760.0, - "grad_norm": 2.099074168500333, - "language_loss": 0.67489713, - "learning_rate": 1.9642395862316907e-07, - "loss": 0.69635832, - "num_input_tokens_seen": 309212680, - "step": 14337, - "time_per_iteration": 2.6033341884613037 - }, - { - "auxiliary_loss_clip": 0.01069511, - "auxiliary_loss_mlp": 0.01033217, - "balance_loss_clip": 1.03057778, - "balance_loss_mlp": 1.02058089, - "epoch": 0.8620471967533444, - "flos": 37520293991040.0, - "grad_norm": 1.5608583142668484, - "language_loss": 0.6694777, - "learning_rate": 1.962556758053089e-07, - "loss": 0.69050497, - "num_input_tokens_seen": 309234485, - "step": 14338, - "time_per_iteration": 2.775123119354248 - }, - { - "auxiliary_loss_clip": 0.01086678, - "auxiliary_loss_mlp": 0.01031842, - "balance_loss_clip": 1.03658581, - "balance_loss_mlp": 1.02030885, - "epoch": 0.8621073200060123, - "flos": 19682459493120.0, - "grad_norm": 1.9189965100666158, - "language_loss": 0.62008345, - "learning_rate": 1.9608746138561448e-07, - "loss": 0.64126867, - "num_input_tokens_seen": 309253630, - "step": 14339, - "time_per_iteration": 2.696450710296631 - }, - { - "auxiliary_loss_clip": 0.01086707, - "auxiliary_loss_mlp": 0.00770344, - "balance_loss_clip": 1.03489327, - "balance_loss_mlp": 1.00020528, - "epoch": 0.8621674432586803, - "flos": 14536855549440.0, - "grad_norm": 1.8496565464342125, - "language_loss": 0.62634254, - "learning_rate": 1.9591931537046458e-07, - "loss": 0.64491296, - "num_input_tokens_seen": 309270950, - "step": 14340, - "time_per_iteration": 4.219670295715332 - }, - { - "auxiliary_loss_clip": 0.01060496, - "auxiliary_loss_mlp": 0.0102529, - "balance_loss_clip": 1.03393662, - "balance_loss_mlp": 1.01384556, - "epoch": 0.8622275665113482, - "flos": 20740100480640.0, - "grad_norm": 1.5540537291722216, - "language_loss": 0.79882658, - "learning_rate": 1.9575123776623493e-07, - "loss": 0.81968445, - "num_input_tokens_seen": 309288780, - "step": 14341, - "time_per_iteration": 2.7992727756500244 - }, - { - "auxiliary_loss_clip": 0.01092904, - "auxiliary_loss_mlp": 0.01032082, - "balance_loss_clip": 1.0364188, - "balance_loss_mlp": 1.02028048, - "epoch": 0.8622876897640163, - "flos": 24715914197760.0, - "grad_norm": 1.6849671618732158, - "language_loss": 0.74542058, - "learning_rate": 1.9558322857929887e-07, - "loss": 0.76667047, - "num_input_tokens_seen": 309310875, - "step": 14342, - "time_per_iteration": 4.3738038539886475 - }, - { - "auxiliary_loss_clip": 0.01069834, - "auxiliary_loss_mlp": 0.01028747, - "balance_loss_clip": 1.03554666, - "balance_loss_mlp": 1.01579463, - "epoch": 0.8623478130166842, - "flos": 17457362663040.0, - "grad_norm": 1.6056166986401446, - "language_loss": 0.68522966, - "learning_rate": 1.95415287816028e-07, - "loss": 0.7062155, - "num_input_tokens_seen": 309329900, - "step": 14343, - "time_per_iteration": 4.237400770187378 - }, - { - "auxiliary_loss_clip": 0.01096424, - "auxiliary_loss_mlp": 0.01042074, - "balance_loss_clip": 1.03559923, - "balance_loss_mlp": 1.02879965, - "epoch": 0.8624079362693522, - "flos": 18109176814080.0, - "grad_norm": 1.6148942161800302, - "language_loss": 0.6802907, - "learning_rate": 1.9524741548278967e-07, - "loss": 0.70167565, - "num_input_tokens_seen": 309347870, - "step": 14344, - "time_per_iteration": 4.1997270584106445 - }, - { - "auxiliary_loss_clip": 0.01067509, - "auxiliary_loss_mlp": 0.01046204, - "balance_loss_clip": 1.0338335, - "balance_loss_mlp": 1.03233421, - "epoch": 0.8624680595220201, - "flos": 30666455971200.0, - "grad_norm": 1.5830249885479915, - "language_loss": 0.81282222, - "learning_rate": 1.9507961158595054e-07, - "loss": 0.83395934, - "num_input_tokens_seen": 309371695, - "step": 14345, - "time_per_iteration": 2.7645456790924072 - }, - { - "auxiliary_loss_clip": 0.01103951, - "auxiliary_loss_mlp": 0.01034874, - "balance_loss_clip": 1.03953946, - "balance_loss_mlp": 1.02208841, - "epoch": 0.8625281827746881, - "flos": 37998588516480.0, - "grad_norm": 1.9141588154194698, - "language_loss": 0.50585526, - "learning_rate": 1.9491187613187355e-07, - "loss": 0.52724349, - "num_input_tokens_seen": 309394645, - "step": 14346, - "time_per_iteration": 2.7219948768615723 - }, - { - "auxiliary_loss_clip": 0.01029718, - "auxiliary_loss_mlp": 0.01032771, - "balance_loss_clip": 1.03116322, - "balance_loss_mlp": 1.01971757, - "epoch": 0.862588306027356, - "flos": 26249730808320.0, - "grad_norm": 1.6259000305173057, - "language_loss": 0.75161147, - "learning_rate": 1.9474420912691913e-07, - "loss": 0.77223635, - "num_input_tokens_seen": 309413170, - "step": 14347, - "time_per_iteration": 2.8139262199401855 - }, - { - "auxiliary_loss_clip": 0.01082643, - "auxiliary_loss_mlp": 0.01030127, - "balance_loss_clip": 1.03561497, - "balance_loss_mlp": 1.01690078, - "epoch": 0.862648429280024, - "flos": 25878809013120.0, - "grad_norm": 2.1840928220647684, - "language_loss": 0.80749428, - "learning_rate": 1.945766105774449e-07, - "loss": 0.82862198, - "num_input_tokens_seen": 309431315, - "step": 14348, - "time_per_iteration": 2.656729221343994 - }, - { - "auxiliary_loss_clip": 0.01091404, - "auxiliary_loss_mlp": 0.01029077, - "balance_loss_clip": 1.03467631, - "balance_loss_mlp": 1.01720428, - "epoch": 0.862708552532692, - "flos": 37816413713280.0, - "grad_norm": 1.8503371551245635, - "language_loss": 0.66269898, - "learning_rate": 1.9440908048980665e-07, - "loss": 0.68390381, - "num_input_tokens_seen": 309453020, - "step": 14349, - "time_per_iteration": 2.799384832382202 - }, - { - "auxiliary_loss_clip": 0.0109691, - "auxiliary_loss_mlp": 0.01036094, - "balance_loss_clip": 1.03515387, - "balance_loss_mlp": 1.02395201, - "epoch": 0.86276867578536, - "flos": 19091800247040.0, - "grad_norm": 2.6246269667941906, - "language_loss": 0.7027539, - "learning_rate": 1.942416188703573e-07, - "loss": 0.7240839, - "num_input_tokens_seen": 309469780, - "step": 14350, - "time_per_iteration": 2.5943920612335205 - }, - { - "auxiliary_loss_clip": 0.0108035, - "auxiliary_loss_mlp": 0.01033768, - "balance_loss_clip": 1.03473318, - "balance_loss_mlp": 1.02111983, - "epoch": 0.862828799038028, - "flos": 22164281804160.0, - "grad_norm": 1.8551444377087964, - "language_loss": 0.76769114, - "learning_rate": 1.9407422572544618e-07, - "loss": 0.78883231, - "num_input_tokens_seen": 309489610, - "step": 14351, - "time_per_iteration": 2.6581666469573975 - }, - { - "auxiliary_loss_clip": 0.01096886, - "auxiliary_loss_mlp": 0.0103006, - "balance_loss_clip": 1.0370357, - "balance_loss_mlp": 1.01837111, - "epoch": 0.8628889222906959, - "flos": 23145576433920.0, - "grad_norm": 3.863289439771493, - "language_loss": 0.85162789, - "learning_rate": 1.9390690106142204e-07, - "loss": 0.87289739, - "num_input_tokens_seen": 309508295, - "step": 14352, - "time_per_iteration": 2.6280806064605713 - }, - { - "auxiliary_loss_clip": 0.01022246, - "auxiliary_loss_mlp": 0.0100272, - "balance_loss_clip": 1.0090481, - "balance_loss_mlp": 1.00167739, - "epoch": 0.8629490455433639, - "flos": 57817762151040.0, - "grad_norm": 0.7895499485816829, - "language_loss": 0.61935335, - "learning_rate": 1.9373964488462913e-07, - "loss": 0.63960302, - "num_input_tokens_seen": 309567960, - "step": 14353, - "time_per_iteration": 3.146935224533081 - }, - { - "auxiliary_loss_clip": 0.01107884, - "auxiliary_loss_mlp": 0.01030033, - "balance_loss_clip": 1.038095, - "balance_loss_mlp": 1.0188508, - "epoch": 0.8630091687960318, - "flos": 15919667383680.0, - "grad_norm": 1.6638505981636493, - "language_loss": 0.81754172, - "learning_rate": 1.9357245720140948e-07, - "loss": 0.83892089, - "num_input_tokens_seen": 309586050, - "step": 14354, - "time_per_iteration": 2.566462993621826 - }, - { - "auxiliary_loss_clip": 0.01086608, - "auxiliary_loss_mlp": 0.01027335, - "balance_loss_clip": 1.03349864, - "balance_loss_mlp": 1.01475871, - "epoch": 0.8630692920486999, - "flos": 17961691570560.0, - "grad_norm": 2.0513019933105827, - "language_loss": 0.85992026, - "learning_rate": 1.934053380181031e-07, - "loss": 0.88105971, - "num_input_tokens_seen": 309602910, - "step": 14355, - "time_per_iteration": 2.5831828117370605 - }, - { - "auxiliary_loss_clip": 0.01069864, - "auxiliary_loss_mlp": 0.01030488, - "balance_loss_clip": 1.03425539, - "balance_loss_mlp": 1.0177269, - "epoch": 0.8631294153013678, - "flos": 22455158140800.0, - "grad_norm": 4.854829851946411, - "language_loss": 0.58569849, - "learning_rate": 1.9323828734104763e-07, - "loss": 0.60670203, - "num_input_tokens_seen": 309621175, - "step": 14356, - "time_per_iteration": 2.65341854095459 - }, - { - "auxiliary_loss_clip": 0.01064009, - "auxiliary_loss_mlp": 0.01035736, - "balance_loss_clip": 1.03384709, - "balance_loss_mlp": 1.02203846, - "epoch": 0.8631895385540358, - "flos": 16837005847680.0, - "grad_norm": 1.8090879268972078, - "language_loss": 0.77420521, - "learning_rate": 1.9307130517657756e-07, - "loss": 0.79520273, - "num_input_tokens_seen": 309639395, - "step": 14357, - "time_per_iteration": 2.710195302963257 - }, - { - "auxiliary_loss_clip": 0.01098594, - "auxiliary_loss_mlp": 0.01033653, - "balance_loss_clip": 1.03671813, - "balance_loss_mlp": 1.02122521, - "epoch": 0.8632496618067037, - "flos": 18697214367360.0, - "grad_norm": 12.099648120671757, - "language_loss": 0.77500695, - "learning_rate": 1.9290439153102468e-07, - "loss": 0.79632944, - "num_input_tokens_seen": 309657265, - "step": 14358, - "time_per_iteration": 2.6657116413116455 - }, - { - "auxiliary_loss_clip": 0.01071096, - "auxiliary_loss_mlp": 0.0103447, - "balance_loss_clip": 1.03174829, - "balance_loss_mlp": 1.02037358, - "epoch": 0.8633097850593717, - "flos": 24279922915200.0, - "grad_norm": 1.4851174588982734, - "language_loss": 0.75020039, - "learning_rate": 1.9273754641071816e-07, - "loss": 0.77125597, - "num_input_tokens_seen": 309678610, - "step": 14359, - "time_per_iteration": 2.6872808933258057 - }, - { - "auxiliary_loss_clip": 0.01045653, - "auxiliary_loss_mlp": 0.01028999, - "balance_loss_clip": 1.03073585, - "balance_loss_mlp": 1.01629102, - "epoch": 0.8633699083120396, - "flos": 21178569801600.0, - "grad_norm": 1.864228118741394, - "language_loss": 0.70209599, - "learning_rate": 1.9257076982198517e-07, - "loss": 0.72284251, - "num_input_tokens_seen": 309697710, - "step": 14360, - "time_per_iteration": 2.8204243183135986 - }, - { - "auxiliary_loss_clip": 0.01079991, - "auxiliary_loss_mlp": 0.01034146, - "balance_loss_clip": 1.03886342, - "balance_loss_mlp": 1.02069938, - "epoch": 0.8634300315647077, - "flos": 19244888012160.0, - "grad_norm": 1.7674774133909552, - "language_loss": 0.7663061, - "learning_rate": 1.9240406177114953e-07, - "loss": 0.78744745, - "num_input_tokens_seen": 309715985, - "step": 14361, - "time_per_iteration": 2.7079758644104004 - }, - { - "auxiliary_loss_clip": 0.01028441, - "auxiliary_loss_mlp": 0.01002241, - "balance_loss_clip": 1.00602102, - "balance_loss_mlp": 1.00118601, - "epoch": 0.8634901548173756, - "flos": 66195648282240.0, - "grad_norm": 0.9560869661193441, - "language_loss": 0.58801341, - "learning_rate": 1.922374222645329e-07, - "loss": 0.60832024, - "num_input_tokens_seen": 309779930, - "step": 14362, - "time_per_iteration": 3.145829677581787 - }, - { - "auxiliary_loss_clip": 0.01042985, - "auxiliary_loss_mlp": 0.01031427, - "balance_loss_clip": 1.03692436, - "balance_loss_mlp": 1.01842105, - "epoch": 0.8635502780700436, - "flos": 24789531121920.0, - "grad_norm": 1.852310617760456, - "language_loss": 0.80515075, - "learning_rate": 1.9207085130845524e-07, - "loss": 0.82589483, - "num_input_tokens_seen": 309800580, - "step": 14363, - "time_per_iteration": 2.862398147583008 - }, - { - "auxiliary_loss_clip": 0.01082251, - "auxiliary_loss_mlp": 0.0104491, - "balance_loss_clip": 1.03282666, - "balance_loss_mlp": 1.02994919, - "epoch": 0.8636104013227116, - "flos": 25189970918400.0, - "grad_norm": 2.305599711448788, - "language_loss": 0.72819698, - "learning_rate": 1.9190434890923112e-07, - "loss": 0.74946856, - "num_input_tokens_seen": 309821725, - "step": 14364, - "time_per_iteration": 2.7694895267486572 - }, - { - "auxiliary_loss_clip": 0.01084893, - "auxiliary_loss_mlp": 0.01037595, - "balance_loss_clip": 1.033113, - "balance_loss_mlp": 1.02479792, - "epoch": 0.8636705245753795, - "flos": 23878441624320.0, - "grad_norm": 3.709270849116724, - "language_loss": 0.71231377, - "learning_rate": 1.917379150731755e-07, - "loss": 0.73353863, - "num_input_tokens_seen": 309841565, - "step": 14365, - "time_per_iteration": 2.6591691970825195 - }, - { - "auxiliary_loss_clip": 0.01084976, - "auxiliary_loss_mlp": 0.01048634, - "balance_loss_clip": 1.03588641, - "balance_loss_mlp": 1.03338158, - "epoch": 0.8637306478280475, - "flos": 23110455911040.0, - "grad_norm": 2.5553133795092853, - "language_loss": 0.7095083, - "learning_rate": 1.915715498065993e-07, - "loss": 0.73084438, - "num_input_tokens_seen": 309858635, - "step": 14366, - "time_per_iteration": 2.654860019683838 - }, - { - "auxiliary_loss_clip": 0.01080294, - "auxiliary_loss_mlp": 0.01025551, - "balance_loss_clip": 1.03619814, - "balance_loss_mlp": 1.01414287, - "epoch": 0.8637907710807154, - "flos": 21906802137600.0, - "grad_norm": 1.7096438755629864, - "language_loss": 0.81546772, - "learning_rate": 1.9140525311581146e-07, - "loss": 0.83652616, - "num_input_tokens_seen": 309877885, - "step": 14367, - "time_per_iteration": 2.658378839492798 - }, - { - "auxiliary_loss_clip": 0.01084703, - "auxiliary_loss_mlp": 0.01029547, - "balance_loss_clip": 1.03672993, - "balance_loss_mlp": 1.01633847, - "epoch": 0.8638508943333835, - "flos": 23580526222080.0, - "grad_norm": 1.893928917899102, - "language_loss": 0.61735493, - "learning_rate": 1.9123902500711743e-07, - "loss": 0.63849741, - "num_input_tokens_seen": 309893140, - "step": 14368, - "time_per_iteration": 2.7563858032226562 - }, - { - "auxiliary_loss_clip": 0.01100198, - "auxiliary_loss_mlp": 0.01032496, - "balance_loss_clip": 1.03874695, - "balance_loss_mlp": 1.02039003, - "epoch": 0.8639110175860514, - "flos": 25775853655680.0, - "grad_norm": 1.8793002534030256, - "language_loss": 0.76034266, - "learning_rate": 1.91072865486821e-07, - "loss": 0.78166956, - "num_input_tokens_seen": 309914175, - "step": 14369, - "time_per_iteration": 2.720898389816284 - }, - { - "auxiliary_loss_clip": 0.01084672, - "auxiliary_loss_mlp": 0.01036631, - "balance_loss_clip": 1.03559625, - "balance_loss_mlp": 1.02341676, - "epoch": 0.8639711408387194, - "flos": 23369443948800.0, - "grad_norm": 1.7853455922645574, - "language_loss": 0.64685416, - "learning_rate": 1.9090677456122294e-07, - "loss": 0.66806722, - "num_input_tokens_seen": 309932395, - "step": 14370, - "time_per_iteration": 2.7746939659118652 - }, - { - "auxiliary_loss_clip": 0.01051431, - "auxiliary_loss_mlp": 0.01034289, - "balance_loss_clip": 1.03813696, - "balance_loss_mlp": 1.02186131, - "epoch": 0.8640312640913873, - "flos": 22127221946880.0, - "grad_norm": 1.6691251892121577, - "language_loss": 0.66381669, - "learning_rate": 1.907407522366209e-07, - "loss": 0.68467391, - "num_input_tokens_seen": 309951720, - "step": 14371, - "time_per_iteration": 2.7832515239715576 - }, - { - "auxiliary_loss_clip": 0.01010679, - "auxiliary_loss_mlp": 0.0100181, - "balance_loss_clip": 1.00754333, - "balance_loss_mlp": 1.00070095, - "epoch": 0.8640913873440553, - "flos": 57571735944960.0, - "grad_norm": 0.8715418299752374, - "language_loss": 0.56873655, - "learning_rate": 1.905747985193107e-07, - "loss": 0.58886147, - "num_input_tokens_seen": 310006120, - "step": 14372, - "time_per_iteration": 3.080965042114258 - }, - { - "auxiliary_loss_clip": 0.01107085, - "auxiliary_loss_mlp": 0.01031651, - "balance_loss_clip": 1.03817725, - "balance_loss_mlp": 1.01909208, - "epoch": 0.8641515105967232, - "flos": 23987430466560.0, - "grad_norm": 1.722968636798083, - "language_loss": 0.79519123, - "learning_rate": 1.9040891341558597e-07, - "loss": 0.81657857, - "num_input_tokens_seen": 310026740, - "step": 14373, - "time_per_iteration": 2.635335683822632 - }, - { - "auxiliary_loss_clip": 0.01110837, - "auxiliary_loss_mlp": 0.01028883, - "balance_loss_clip": 1.03787744, - "balance_loss_mlp": 1.01607943, - "epoch": 0.8642116338493913, - "flos": 19062749122560.0, - "grad_norm": 1.6653536401221238, - "language_loss": 0.63377726, - "learning_rate": 1.9024309693173656e-07, - "loss": 0.65517449, - "num_input_tokens_seen": 310044135, - "step": 14374, - "time_per_iteration": 2.5494918823242188 - }, - { - "auxiliary_loss_clip": 0.01077851, - "auxiliary_loss_mlp": 0.01034525, - "balance_loss_clip": 1.03636634, - "balance_loss_mlp": 1.02211523, - "epoch": 0.8642717571020592, - "flos": 18254148105600.0, - "grad_norm": 1.7168843124571862, - "language_loss": 0.77189004, - "learning_rate": 1.9007734907404993e-07, - "loss": 0.79301381, - "num_input_tokens_seen": 310061560, - "step": 14375, - "time_per_iteration": 2.677976131439209 - }, - { - "auxiliary_loss_clip": 0.01064524, - "auxiliary_loss_mlp": 0.00770411, - "balance_loss_clip": 1.03405952, - "balance_loss_mlp": 1.00014496, - "epoch": 0.8643318803547272, - "flos": 57663270777600.0, - "grad_norm": 1.8714174217127035, - "language_loss": 0.60663325, - "learning_rate": 1.899116698488117e-07, - "loss": 0.6249826, - "num_input_tokens_seen": 310087310, - "step": 14376, - "time_per_iteration": 3.0315792560577393 - }, - { - "auxiliary_loss_clip": 0.01065318, - "auxiliary_loss_mlp": 0.01037856, - "balance_loss_clip": 1.0328449, - "balance_loss_mlp": 1.02571476, - "epoch": 0.8643920036073952, - "flos": 19609524927360.0, - "grad_norm": 1.4665083491596096, - "language_loss": 0.66321123, - "learning_rate": 1.8974605926230457e-07, - "loss": 0.68424296, - "num_input_tokens_seen": 310106260, - "step": 14377, - "time_per_iteration": 2.661478042602539 - }, - { - "auxiliary_loss_clip": 0.01082246, - "auxiliary_loss_mlp": 0.0104227, - "balance_loss_clip": 1.03249764, - "balance_loss_mlp": 1.02842414, - "epoch": 0.8644521268600631, - "flos": 20850346298880.0, - "grad_norm": 1.6699792562126987, - "language_loss": 0.70700777, - "learning_rate": 1.8958051732080804e-07, - "loss": 0.72825295, - "num_input_tokens_seen": 310125305, - "step": 14378, - "time_per_iteration": 2.6440517902374268 - }, - { - "auxiliary_loss_clip": 0.0101912, - "auxiliary_loss_mlp": 0.01001905, - "balance_loss_clip": 1.0065546, - "balance_loss_mlp": 1.00082636, - "epoch": 0.8645122501127311, - "flos": 66719550101760.0, - "grad_norm": 0.8082600022248976, - "language_loss": 0.60236883, - "learning_rate": 1.894150440305995e-07, - "loss": 0.6225791, - "num_input_tokens_seen": 310189270, - "step": 14379, - "time_per_iteration": 3.1792728900909424 - }, - { - "auxiliary_loss_clip": 0.01077548, - "auxiliary_loss_mlp": 0.01032454, - "balance_loss_clip": 1.03373933, - "balance_loss_mlp": 1.02031279, - "epoch": 0.864572373365399, - "flos": 21690009601920.0, - "grad_norm": 1.8837339678348841, - "language_loss": 0.74800771, - "learning_rate": 1.8924963939795478e-07, - "loss": 0.76910776, - "num_input_tokens_seen": 310208395, - "step": 14380, - "time_per_iteration": 4.324819803237915 - }, - { - "auxiliary_loss_clip": 0.01080903, - "auxiliary_loss_mlp": 0.01036416, - "balance_loss_clip": 1.03346038, - "balance_loss_mlp": 1.02307606, - "epoch": 0.8646324966180671, - "flos": 20266402896000.0, - "grad_norm": 1.9558839364360057, - "language_loss": 0.75436544, - "learning_rate": 1.8908430342914473e-07, - "loss": 0.77553868, - "num_input_tokens_seen": 310227415, - "step": 14381, - "time_per_iteration": 4.3003315925598145 - }, - { - "auxiliary_loss_clip": 0.01085169, - "auxiliary_loss_mlp": 0.01035093, - "balance_loss_clip": 1.03721309, - "balance_loss_mlp": 1.02337468, - "epoch": 0.864692619870735, - "flos": 11946188050560.0, - "grad_norm": 2.531870478420468, - "language_loss": 0.84684384, - "learning_rate": 1.8891903613043892e-07, - "loss": 0.86804652, - "num_input_tokens_seen": 310242625, - "step": 14382, - "time_per_iteration": 4.235616683959961 - }, - { - "auxiliary_loss_clip": 0.01101073, - "auxiliary_loss_mlp": 0.01035119, - "balance_loss_clip": 1.03812909, - "balance_loss_mlp": 1.02230954, - "epoch": 0.864752743123403, - "flos": 21470703114240.0, - "grad_norm": 2.020788387095791, - "language_loss": 0.75921559, - "learning_rate": 1.8875383750810504e-07, - "loss": 0.78057754, - "num_input_tokens_seen": 310260585, - "step": 14383, - "time_per_iteration": 2.743340015411377 - }, - { - "auxiliary_loss_clip": 0.01089565, - "auxiliary_loss_mlp": 0.0103368, - "balance_loss_clip": 1.03891516, - "balance_loss_mlp": 1.02116311, - "epoch": 0.8648128663760709, - "flos": 19530018172800.0, - "grad_norm": 1.8560596447894047, - "language_loss": 0.85428023, - "learning_rate": 1.8858870756840738e-07, - "loss": 0.87551272, - "num_input_tokens_seen": 310277210, - "step": 14384, - "time_per_iteration": 4.140477418899536 - }, - { - "auxiliary_loss_clip": 0.01093344, - "auxiliary_loss_mlp": 0.01030627, - "balance_loss_clip": 1.0340718, - "balance_loss_mlp": 1.01875997, - "epoch": 0.8648729896287389, - "flos": 21287953693440.0, - "grad_norm": 1.6613358165771401, - "language_loss": 0.8117463, - "learning_rate": 1.884236463176072e-07, - "loss": 0.832986, - "num_input_tokens_seen": 310296610, - "step": 14385, - "time_per_iteration": 2.563424825668335 - }, - { - "auxiliary_loss_clip": 0.01094427, - "auxiliary_loss_mlp": 0.01035484, - "balance_loss_clip": 1.040411, - "balance_loss_mlp": 1.02252555, - "epoch": 0.8649331128814068, - "flos": 24604483230720.0, - "grad_norm": 2.3388483586087303, - "language_loss": 0.72581172, - "learning_rate": 1.8825865376196437e-07, - "loss": 0.74711078, - "num_input_tokens_seen": 310316830, - "step": 14386, - "time_per_iteration": 2.667926549911499 - }, - { - "auxiliary_loss_clip": 0.01093992, - "auxiliary_loss_mlp": 0.01041396, - "balance_loss_clip": 1.03530121, - "balance_loss_mlp": 1.02797318, - "epoch": 0.8649932361340749, - "flos": 15377811742080.0, - "grad_norm": 4.510791763694996, - "language_loss": 0.81868196, - "learning_rate": 1.8809372990773476e-07, - "loss": 0.8400358, - "num_input_tokens_seen": 310334355, - "step": 14387, - "time_per_iteration": 2.660701036453247 - }, - { - "auxiliary_loss_clip": 0.01106932, - "auxiliary_loss_mlp": 0.01028302, - "balance_loss_clip": 1.0378803, - "balance_loss_mlp": 1.01641643, - "epoch": 0.8650533593867428, - "flos": 19901227276800.0, - "grad_norm": 2.104447554520212, - "language_loss": 0.68797326, - "learning_rate": 1.8792887476117224e-07, - "loss": 0.70932555, - "num_input_tokens_seen": 310352900, - "step": 14388, - "time_per_iteration": 2.5773561000823975 - }, - { - "auxiliary_loss_clip": 0.01073211, - "auxiliary_loss_mlp": 0.01036245, - "balance_loss_clip": 1.03666544, - "balance_loss_mlp": 1.02510452, - "epoch": 0.8651134826394108, - "flos": 25626931868160.0, - "grad_norm": 2.8952711176553043, - "language_loss": 0.90358889, - "learning_rate": 1.877640883285283e-07, - "loss": 0.92468345, - "num_input_tokens_seen": 310372855, - "step": 14389, - "time_per_iteration": 2.712479591369629 - }, - { - "auxiliary_loss_clip": 0.01065736, - "auxiliary_loss_mlp": 0.00769129, - "balance_loss_clip": 1.03819394, - "balance_loss_mlp": 1.0002389, - "epoch": 0.8651736058920788, - "flos": 18734525619840.0, - "grad_norm": 1.556328693404614, - "language_loss": 0.70784509, - "learning_rate": 1.8759937061605212e-07, - "loss": 0.72619373, - "num_input_tokens_seen": 310391595, - "step": 14390, - "time_per_iteration": 2.7250664234161377 - }, - { - "auxiliary_loss_clip": 0.0110984, - "auxiliary_loss_mlp": 0.01034761, - "balance_loss_clip": 1.03667974, - "balance_loss_mlp": 1.02206492, - "epoch": 0.8652337291447467, - "flos": 20776765288320.0, - "grad_norm": 3.0242900770440158, - "language_loss": 0.82031155, - "learning_rate": 1.8743472162998941e-07, - "loss": 0.84175754, - "num_input_tokens_seen": 310410090, - "step": 14391, - "time_per_iteration": 2.5874016284942627 - }, - { - "auxiliary_loss_clip": 0.00999016, - "auxiliary_loss_mlp": 0.00998272, - "balance_loss_clip": 1.00931406, - "balance_loss_mlp": 0.99692518, - "epoch": 0.8652938523974147, - "flos": 64227887464320.0, - "grad_norm": 0.800129032908664, - "language_loss": 0.67961007, - "learning_rate": 1.8727014137658337e-07, - "loss": 0.69958293, - "num_input_tokens_seen": 310470055, - "step": 14392, - "time_per_iteration": 3.141786813735962 - }, - { - "auxiliary_loss_clip": 0.01102797, - "auxiliary_loss_mlp": 0.01032477, - "balance_loss_clip": 1.03694808, - "balance_loss_mlp": 1.01924479, - "epoch": 0.8653539756500827, - "flos": 18040587793920.0, - "grad_norm": 1.8856474230308053, - "language_loss": 0.75999135, - "learning_rate": 1.8710562986207523e-07, - "loss": 0.78134412, - "num_input_tokens_seen": 310487665, - "step": 14393, - "time_per_iteration": 2.6403071880340576 - }, - { - "auxiliary_loss_clip": 0.01085265, - "auxiliary_loss_mlp": 0.01035656, - "balance_loss_clip": 1.03292179, - "balance_loss_mlp": 1.02319884, - "epoch": 0.8654140989027507, - "flos": 17382416935680.0, - "grad_norm": 1.8766276101061499, - "language_loss": 0.73443645, - "learning_rate": 1.8694118709270357e-07, - "loss": 0.75564563, - "num_input_tokens_seen": 310506130, - "step": 14394, - "time_per_iteration": 2.589737892150879 - }, - { - "auxiliary_loss_clip": 0.01098893, - "auxiliary_loss_mlp": 0.01029702, - "balance_loss_clip": 1.03559685, - "balance_loss_mlp": 1.01642823, - "epoch": 0.8654742221554186, - "flos": 53284862448000.0, - "grad_norm": 25.445187757651638, - "language_loss": 0.65340948, - "learning_rate": 1.867768130747036e-07, - "loss": 0.67469549, - "num_input_tokens_seen": 310532445, - "step": 14395, - "time_per_iteration": 2.8686017990112305 - }, - { - "auxiliary_loss_clip": 0.01091975, - "auxiliary_loss_mlp": 0.0103619, - "balance_loss_clip": 1.03594851, - "balance_loss_mlp": 1.02362514, - "epoch": 0.8655343454080866, - "flos": 23914711382400.0, - "grad_norm": 3.648513206821013, - "language_loss": 0.68270028, - "learning_rate": 1.8661250781430838e-07, - "loss": 0.70398188, - "num_input_tokens_seen": 310552300, - "step": 14396, - "time_per_iteration": 2.691372871398926 - }, - { - "auxiliary_loss_clip": 0.01102693, - "auxiliary_loss_mlp": 0.01036564, - "balance_loss_clip": 1.03977966, - "balance_loss_mlp": 1.02393413, - "epoch": 0.8655944686607545, - "flos": 24097209408000.0, - "grad_norm": 2.1296548078090067, - "language_loss": 0.6985743, - "learning_rate": 1.8644827131774954e-07, - "loss": 0.71996689, - "num_input_tokens_seen": 310572710, - "step": 14397, - "time_per_iteration": 2.6537063121795654 - }, - { - "auxiliary_loss_clip": 0.01092627, - "auxiliary_loss_mlp": 0.01029547, - "balance_loss_clip": 1.03830481, - "balance_loss_mlp": 1.01773953, - "epoch": 0.8656545919134225, - "flos": 23112718467840.0, - "grad_norm": 1.7708020135557936, - "language_loss": 0.63645488, - "learning_rate": 1.86284103591253e-07, - "loss": 0.65767658, - "num_input_tokens_seen": 310592460, - "step": 14398, - "time_per_iteration": 2.721609592437744 - }, - { - "auxiliary_loss_clip": 0.01072146, - "auxiliary_loss_mlp": 0.01040273, - "balance_loss_clip": 1.03550839, - "balance_loss_mlp": 1.02659369, - "epoch": 0.8657147151660904, - "flos": 21141761339520.0, - "grad_norm": 2.410679040433659, - "language_loss": 0.76115006, - "learning_rate": 1.8612000464104517e-07, - "loss": 0.78227425, - "num_input_tokens_seen": 310609375, - "step": 14399, - "time_per_iteration": 2.6792304515838623 - }, - { - "auxiliary_loss_clip": 0.01091264, - "auxiliary_loss_mlp": 0.0102886, - "balance_loss_clip": 1.03629327, - "balance_loss_mlp": 1.0173502, - "epoch": 0.8657748384187585, - "flos": 16289439943680.0, - "grad_norm": 2.1842250603302906, - "language_loss": 0.93539166, - "learning_rate": 1.8595597447334855e-07, - "loss": 0.95659292, - "num_input_tokens_seen": 310627405, - "step": 14400, - "time_per_iteration": 2.557438850402832 - }, - { - "auxiliary_loss_clip": 0.01044413, - "auxiliary_loss_mlp": 0.01038088, - "balance_loss_clip": 1.0341754, - "balance_loss_mlp": 1.02537465, - "epoch": 0.8658349616714264, - "flos": 30843890179200.0, - "grad_norm": 1.8571085521140969, - "language_loss": 0.67723525, - "learning_rate": 1.8579201309438353e-07, - "loss": 0.69806027, - "num_input_tokens_seen": 310649945, - "step": 14401, - "time_per_iteration": 2.8091368675231934 - }, - { - "auxiliary_loss_clip": 0.01099417, - "auxiliary_loss_mlp": 0.01031502, - "balance_loss_clip": 1.03662825, - "balance_loss_mlp": 1.01880038, - "epoch": 0.8658950849240944, - "flos": 18952862440320.0, - "grad_norm": 2.157466322300169, - "language_loss": 0.73613071, - "learning_rate": 1.8562812051036714e-07, - "loss": 0.75743997, - "num_input_tokens_seen": 310668285, - "step": 14402, - "time_per_iteration": 2.570737838745117 - }, - { - "auxiliary_loss_clip": 0.01036456, - "auxiliary_loss_mlp": 0.01033626, - "balance_loss_clip": 1.03347492, - "balance_loss_mlp": 1.02177048, - "epoch": 0.8659552081767624, - "flos": 23364344217600.0, - "grad_norm": 1.7804756809265996, - "language_loss": 0.74911118, - "learning_rate": 1.8546429672751397e-07, - "loss": 0.76981199, - "num_input_tokens_seen": 310687015, - "step": 14403, - "time_per_iteration": 2.8824269771575928 - }, - { - "auxiliary_loss_clip": 0.0108389, - "auxiliary_loss_mlp": 0.01034677, - "balance_loss_clip": 1.03559339, - "balance_loss_mlp": 1.02145052, - "epoch": 0.8660153314294303, - "flos": 23841992298240.0, - "grad_norm": 1.9785439757020915, - "language_loss": 0.73294771, - "learning_rate": 1.853005417520368e-07, - "loss": 0.75413334, - "num_input_tokens_seen": 310707580, - "step": 14404, - "time_per_iteration": 2.691854238510132 - }, - { - "auxiliary_loss_clip": 0.01070251, - "auxiliary_loss_mlp": 0.01036432, - "balance_loss_clip": 1.03529263, - "balance_loss_mlp": 1.02364087, - "epoch": 0.8660754546820983, - "flos": 23112467072640.0, - "grad_norm": 1.6230968808599193, - "language_loss": 0.70621324, - "learning_rate": 1.851368555901447e-07, - "loss": 0.72728002, - "num_input_tokens_seen": 310727300, - "step": 14405, - "time_per_iteration": 2.6545495986938477 - }, - { - "auxiliary_loss_clip": 0.01099979, - "auxiliary_loss_mlp": 0.0077033, - "balance_loss_clip": 1.03619599, - "balance_loss_mlp": 1.00023413, - "epoch": 0.8661355779347663, - "flos": 14391991998720.0, - "grad_norm": 1.8683678221955426, - "language_loss": 0.66598046, - "learning_rate": 1.8497323824804467e-07, - "loss": 0.68468356, - "num_input_tokens_seen": 310744935, - "step": 14406, - "time_per_iteration": 2.6244313716888428 - }, - { - "auxiliary_loss_clip": 0.01087721, - "auxiliary_loss_mlp": 0.01027006, - "balance_loss_clip": 1.0369488, - "balance_loss_mlp": 1.01565766, - "epoch": 0.8661957011874343, - "flos": 21870137329920.0, - "grad_norm": 1.713289909017667, - "language_loss": 0.82678503, - "learning_rate": 1.8480968973194177e-07, - "loss": 0.84793234, - "num_input_tokens_seen": 310765085, - "step": 14407, - "time_per_iteration": 2.7246527671813965 - }, - { - "auxiliary_loss_clip": 0.01097432, - "auxiliary_loss_mlp": 0.01038578, - "balance_loss_clip": 1.03706372, - "balance_loss_mlp": 1.02623403, - "epoch": 0.8662558244401022, - "flos": 21835160461440.0, - "grad_norm": 1.640288408492858, - "language_loss": 0.70144266, - "learning_rate": 1.8464621004803748e-07, - "loss": 0.72280276, - "num_input_tokens_seen": 310783260, - "step": 14408, - "time_per_iteration": 2.688714027404785 - }, - { - "auxiliary_loss_clip": 0.01088368, - "auxiliary_loss_mlp": 0.01034051, - "balance_loss_clip": 1.036026, - "balance_loss_mlp": 1.02254152, - "epoch": 0.8663159476927702, - "flos": 17384104874880.0, - "grad_norm": 1.9035272419543303, - "language_loss": 0.7693873, - "learning_rate": 1.844827992025304e-07, - "loss": 0.79061151, - "num_input_tokens_seen": 310801970, - "step": 14409, - "time_per_iteration": 2.668154239654541 - }, - { - "auxiliary_loss_clip": 0.01101925, - "auxiliary_loss_mlp": 0.01034118, - "balance_loss_clip": 1.03869689, - "balance_loss_mlp": 1.02009869, - "epoch": 0.8663760709454381, - "flos": 22747722416640.0, - "grad_norm": 1.696612134520476, - "language_loss": 0.77045894, - "learning_rate": 1.8431945720161757e-07, - "loss": 0.79181939, - "num_input_tokens_seen": 310822070, - "step": 14410, - "time_per_iteration": 2.6069350242614746 - }, - { - "auxiliary_loss_clip": 0.0106574, - "auxiliary_loss_mlp": 0.0103477, - "balance_loss_clip": 1.03402448, - "balance_loss_mlp": 1.02225292, - "epoch": 0.8664361941981061, - "flos": 17376850327680.0, - "grad_norm": 1.9481665792177514, - "language_loss": 0.77590597, - "learning_rate": 1.8415618405149315e-07, - "loss": 0.79691112, - "num_input_tokens_seen": 310838355, - "step": 14411, - "time_per_iteration": 2.6132922172546387 - }, - { - "auxiliary_loss_clip": 0.01078109, - "auxiliary_loss_mlp": 0.01035885, - "balance_loss_clip": 1.03366232, - "balance_loss_mlp": 1.02461982, - "epoch": 0.866496317450774, - "flos": 16034438315520.0, - "grad_norm": 1.750688188601547, - "language_loss": 0.74020624, - "learning_rate": 1.8399297975834794e-07, - "loss": 0.76134622, - "num_input_tokens_seen": 310856055, - "step": 14412, - "time_per_iteration": 2.6058592796325684 - }, - { - "auxiliary_loss_clip": 0.01090356, - "auxiliary_loss_mlp": 0.00771287, - "balance_loss_clip": 1.03415728, - "balance_loss_mlp": 1.0002377, - "epoch": 0.8665564407034421, - "flos": 20814830726400.0, - "grad_norm": 1.7730290452974458, - "language_loss": 0.6952216, - "learning_rate": 1.83829844328371e-07, - "loss": 0.71383798, - "num_input_tokens_seen": 310876695, - "step": 14413, - "time_per_iteration": 2.614438056945801 - }, - { - "auxiliary_loss_clip": 0.01098326, - "auxiliary_loss_mlp": 0.01035601, - "balance_loss_clip": 1.03807211, - "balance_loss_mlp": 1.02280378, - "epoch": 0.86661656395611, - "flos": 15815167741440.0, - "grad_norm": 2.2624919572268603, - "language_loss": 0.62299776, - "learning_rate": 1.8366677776774874e-07, - "loss": 0.64433706, - "num_input_tokens_seen": 310893880, - "step": 14414, - "time_per_iteration": 2.5781359672546387 - }, - { - "auxiliary_loss_clip": 0.01078873, - "auxiliary_loss_mlp": 0.00769848, - "balance_loss_clip": 1.03693521, - "balance_loss_mlp": 1.00018334, - "epoch": 0.866676687208778, - "flos": 23036910814080.0, - "grad_norm": 1.633402194861805, - "language_loss": 0.6382761, - "learning_rate": 1.8350378008266377e-07, - "loss": 0.65676332, - "num_input_tokens_seen": 310914145, - "step": 14415, - "time_per_iteration": 2.718871831893921 - }, - { - "auxiliary_loss_clip": 0.01001561, - "auxiliary_loss_mlp": 0.01003608, - "balance_loss_clip": 1.00817573, - "balance_loss_mlp": 1.00249326, - "epoch": 0.866736810461446, - "flos": 63802275212160.0, - "grad_norm": 0.7984060732990605, - "language_loss": 0.60386515, - "learning_rate": 1.8334085127929754e-07, - "loss": 0.62391675, - "num_input_tokens_seen": 310972825, - "step": 14416, - "time_per_iteration": 3.32995343208313 - }, - { - "auxiliary_loss_clip": 0.01101132, - "auxiliary_loss_mlp": 0.00771613, - "balance_loss_clip": 1.03657961, - "balance_loss_mlp": 1.00021935, - "epoch": 0.8667969337141139, - "flos": 20449367798400.0, - "grad_norm": 1.8418559136989974, - "language_loss": 0.74591923, - "learning_rate": 1.831779913638285e-07, - "loss": 0.76464671, - "num_input_tokens_seen": 310992050, - "step": 14417, - "time_per_iteration": 2.6240720748901367 - }, - { - "auxiliary_loss_clip": 0.0108446, - "auxiliary_loss_mlp": 0.01035619, - "balance_loss_clip": 1.03623867, - "balance_loss_mlp": 1.02401364, - "epoch": 0.866857056966782, - "flos": 21653703930240.0, - "grad_norm": 1.6010496631035476, - "language_loss": 0.75304806, - "learning_rate": 1.830152003424319e-07, - "loss": 0.77424884, - "num_input_tokens_seen": 311011105, - "step": 14418, - "time_per_iteration": 2.6442039012908936 - }, - { - "auxiliary_loss_clip": 0.01096633, - "auxiliary_loss_mlp": 0.01034851, - "balance_loss_clip": 1.0357796, - "balance_loss_mlp": 1.02292967, - "epoch": 0.8669171802194499, - "flos": 22852832590080.0, - "grad_norm": 1.669621966476557, - "language_loss": 0.68341649, - "learning_rate": 1.8285247822128126e-07, - "loss": 0.70473135, - "num_input_tokens_seen": 311032080, - "step": 14419, - "time_per_iteration": 2.623978853225708 - }, - { - "auxiliary_loss_clip": 0.01099318, - "auxiliary_loss_mlp": 0.01031681, - "balance_loss_clip": 1.03616405, - "balance_loss_mlp": 1.02020669, - "epoch": 0.8669773034721179, - "flos": 18734166483840.0, - "grad_norm": 1.6685720418473156, - "language_loss": 0.78522211, - "learning_rate": 1.826898250065465e-07, - "loss": 0.80653214, - "num_input_tokens_seen": 311049735, - "step": 14420, - "time_per_iteration": 4.198700189590454 - }, - { - "auxiliary_loss_clip": 0.01093862, - "auxiliary_loss_mlp": 0.01032049, - "balance_loss_clip": 1.03496552, - "balance_loss_mlp": 1.01974106, - "epoch": 0.8670374267247858, - "flos": 18916018064640.0, - "grad_norm": 1.5087342244931736, - "language_loss": 0.83599997, - "learning_rate": 1.8252724070439586e-07, - "loss": 0.85725909, - "num_input_tokens_seen": 311067675, - "step": 14421, - "time_per_iteration": 4.208746910095215 - }, - { - "auxiliary_loss_clip": 0.01006687, - "auxiliary_loss_mlp": 0.00999775, - "balance_loss_clip": 1.00802314, - "balance_loss_mlp": 0.99845761, - "epoch": 0.8670975499774538, - "flos": 48814527214080.0, - "grad_norm": 0.7509779369384021, - "language_loss": 0.49057785, - "learning_rate": 1.823647253209941e-07, - "loss": 0.51064241, - "num_input_tokens_seen": 311126605, - "step": 14422, - "time_per_iteration": 4.777186870574951 - }, - { - "auxiliary_loss_clip": 0.01087105, - "auxiliary_loss_mlp": 0.00769697, - "balance_loss_clip": 1.03720963, - "balance_loss_mlp": 1.00028849, - "epoch": 0.8671576732301217, - "flos": 26136145025280.0, - "grad_norm": 1.670233296430545, - "language_loss": 0.73442525, - "learning_rate": 1.8220227886250417e-07, - "loss": 0.75299329, - "num_input_tokens_seen": 311147325, - "step": 14423, - "time_per_iteration": 4.283585786819458 - }, - { - "auxiliary_loss_clip": 0.01061427, - "auxiliary_loss_mlp": 0.0103548, - "balance_loss_clip": 1.03110516, - "balance_loss_mlp": 1.02256989, - "epoch": 0.8672177964827897, - "flos": 18367446579840.0, - "grad_norm": 1.5662705117653968, - "language_loss": 0.76781297, - "learning_rate": 1.8203990133508684e-07, - "loss": 0.78878212, - "num_input_tokens_seen": 311165385, - "step": 14424, - "time_per_iteration": 2.645517110824585 - }, - { - "auxiliary_loss_clip": 0.01066724, - "auxiliary_loss_mlp": 0.01040643, - "balance_loss_clip": 1.03161621, - "balance_loss_mlp": 1.02800703, - "epoch": 0.8672779197354576, - "flos": 28545355992960.0, - "grad_norm": 1.9458194171790135, - "language_loss": 0.71327066, - "learning_rate": 1.8187759274489767e-07, - "loss": 0.73434436, - "num_input_tokens_seen": 311185860, - "step": 14425, - "time_per_iteration": 2.7444801330566406 - }, - { - "auxiliary_loss_clip": 0.01100034, - "auxiliary_loss_mlp": 0.01033916, - "balance_loss_clip": 1.03743434, - "balance_loss_mlp": 1.02065444, - "epoch": 0.8673380429881257, - "flos": 22382474970240.0, - "grad_norm": 1.755018176625315, - "language_loss": 0.6806134, - "learning_rate": 1.817153530980926e-07, - "loss": 0.70195293, - "num_input_tokens_seen": 311205810, - "step": 14426, - "time_per_iteration": 2.5805845260620117 - }, - { - "auxiliary_loss_clip": 0.01065339, - "auxiliary_loss_mlp": 0.01027987, - "balance_loss_clip": 1.03625464, - "balance_loss_mlp": 1.01546359, - "epoch": 0.8673981662407936, - "flos": 20996430912000.0, - "grad_norm": 1.8587393637126561, - "language_loss": 0.70647991, - "learning_rate": 1.815531824008234e-07, - "loss": 0.72741318, - "num_input_tokens_seen": 311226080, - "step": 14427, - "time_per_iteration": 2.685107469558716 - }, - { - "auxiliary_loss_clip": 0.01080277, - "auxiliary_loss_mlp": 0.0103208, - "balance_loss_clip": 1.03615725, - "balance_loss_mlp": 1.02000976, - "epoch": 0.8674582894934616, - "flos": 24426797627520.0, - "grad_norm": 1.894860167096284, - "language_loss": 0.68146193, - "learning_rate": 1.8139108065924004e-07, - "loss": 0.70258546, - "num_input_tokens_seen": 311246380, - "step": 14428, - "time_per_iteration": 2.7677488327026367 - }, - { - "auxiliary_loss_clip": 0.01080543, - "auxiliary_loss_mlp": 0.01029001, - "balance_loss_clip": 1.0359962, - "balance_loss_mlp": 1.01683569, - "epoch": 0.8675184127461296, - "flos": 20737514701440.0, - "grad_norm": 2.892495609398215, - "language_loss": 0.70616251, - "learning_rate": 1.812290478794889e-07, - "loss": 0.72725797, - "num_input_tokens_seen": 311266465, - "step": 14429, - "time_per_iteration": 2.624802827835083 - }, - { - "auxiliary_loss_clip": 0.010878, - "auxiliary_loss_mlp": 0.01030213, - "balance_loss_clip": 1.03670454, - "balance_loss_mlp": 1.01785088, - "epoch": 0.8675785359987975, - "flos": 19135647774720.0, - "grad_norm": 1.8760175406026705, - "language_loss": 0.66803014, - "learning_rate": 1.810670840677151e-07, - "loss": 0.6892103, - "num_input_tokens_seen": 311285075, - "step": 14430, - "time_per_iteration": 2.6141793727874756 - }, - { - "auxiliary_loss_clip": 0.01064719, - "auxiliary_loss_mlp": 0.01037459, - "balance_loss_clip": 1.03474712, - "balance_loss_mlp": 1.02360034, - "epoch": 0.8676386592514655, - "flos": 22710662559360.0, - "grad_norm": 1.8772851475850807, - "language_loss": 0.69439894, - "learning_rate": 1.8090518923005948e-07, - "loss": 0.71542072, - "num_input_tokens_seen": 311303230, - "step": 14431, - "time_per_iteration": 2.760996103286743 - }, - { - "auxiliary_loss_clip": 0.01097351, - "auxiliary_loss_mlp": 0.01040167, - "balance_loss_clip": 1.03582358, - "balance_loss_mlp": 1.02768576, - "epoch": 0.8676987825041335, - "flos": 14209853109120.0, - "grad_norm": 2.630424540057507, - "language_loss": 0.63210046, - "learning_rate": 1.8074336337266116e-07, - "loss": 0.65347564, - "num_input_tokens_seen": 311318070, - "step": 14432, - "time_per_iteration": 2.5565524101257324 - }, - { - "auxiliary_loss_clip": 0.0109965, - "auxiliary_loss_mlp": 0.01039814, - "balance_loss_clip": 1.03807235, - "balance_loss_mlp": 1.02821505, - "epoch": 0.8677589057568015, - "flos": 13589927256960.0, - "grad_norm": 1.9324335266361277, - "language_loss": 0.78167832, - "learning_rate": 1.8058160650165656e-07, - "loss": 0.80307293, - "num_input_tokens_seen": 311334885, - "step": 14433, - "time_per_iteration": 2.603163242340088 - }, - { - "auxiliary_loss_clip": 0.01010943, - "auxiliary_loss_mlp": 0.01002541, - "balance_loss_clip": 1.00770855, - "balance_loss_mlp": 1.00159311, - "epoch": 0.8678190290094694, - "flos": 68933657370240.0, - "grad_norm": 0.7061148841104811, - "language_loss": 0.5846473, - "learning_rate": 1.804199186231805e-07, - "loss": 0.6047821, - "num_input_tokens_seen": 311399780, - "step": 14434, - "time_per_iteration": 3.2711222171783447 - }, - { - "auxiliary_loss_clip": 0.01084546, - "auxiliary_loss_mlp": 0.01034123, - "balance_loss_clip": 1.03522635, - "balance_loss_mlp": 1.02258372, - "epoch": 0.8678791522621374, - "flos": 32557726776960.0, - "grad_norm": 1.9678570849349808, - "language_loss": 0.80160731, - "learning_rate": 1.802582997433628e-07, - "loss": 0.82279408, - "num_input_tokens_seen": 311419610, - "step": 14435, - "time_per_iteration": 2.729384660720825 - }, - { - "auxiliary_loss_clip": 0.0108652, - "auxiliary_loss_mlp": 0.00771159, - "balance_loss_clip": 1.03368807, - "balance_loss_mlp": 1.00019312, - "epoch": 0.8679392755148053, - "flos": 35042637657600.0, - "grad_norm": 2.323598256693539, - "language_loss": 0.62088466, - "learning_rate": 1.8009674986833322e-07, - "loss": 0.63946146, - "num_input_tokens_seen": 311440045, - "step": 14436, - "time_per_iteration": 2.7514889240264893 - }, - { - "auxiliary_loss_clip": 0.01084626, - "auxiliary_loss_mlp": 0.01030406, - "balance_loss_clip": 1.03650117, - "balance_loss_mlp": 1.01762128, - "epoch": 0.8679993987674733, - "flos": 18552494471040.0, - "grad_norm": 2.2152793164861477, - "language_loss": 0.70417553, - "learning_rate": 1.7993526900421706e-07, - "loss": 0.72532582, - "num_input_tokens_seen": 311456660, - "step": 14437, - "time_per_iteration": 2.682568311691284 - }, - { - "auxiliary_loss_clip": 0.01073964, - "auxiliary_loss_mlp": 0.01026899, - "balance_loss_clip": 1.03458905, - "balance_loss_mlp": 1.01451957, - "epoch": 0.8680595220201412, - "flos": 27454390162560.0, - "grad_norm": 1.9672371609341477, - "language_loss": 0.80644393, - "learning_rate": 1.797738571571381e-07, - "loss": 0.8274526, - "num_input_tokens_seen": 311475460, - "step": 14438, - "time_per_iteration": 2.7269651889801025 - }, - { - "auxiliary_loss_clip": 0.01089468, - "auxiliary_loss_mlp": 0.01024249, - "balance_loss_clip": 1.035025, - "balance_loss_mlp": 1.01237011, - "epoch": 0.8681196452728093, - "flos": 19208797822080.0, - "grad_norm": 1.7527538645260887, - "language_loss": 0.67584556, - "learning_rate": 1.7961251433321656e-07, - "loss": 0.69698274, - "num_input_tokens_seen": 311494575, - "step": 14439, - "time_per_iteration": 2.581627130508423 - }, - { - "auxiliary_loss_clip": 0.01096234, - "auxiliary_loss_mlp": 0.01034212, - "balance_loss_clip": 1.03661394, - "balance_loss_mlp": 1.02268469, - "epoch": 0.8681797685254772, - "flos": 37560442417920.0, - "grad_norm": 1.484819058237711, - "language_loss": 0.63649923, - "learning_rate": 1.7945124053857085e-07, - "loss": 0.65780365, - "num_input_tokens_seen": 311515805, - "step": 14440, - "time_per_iteration": 2.761298656463623 - }, - { - "auxiliary_loss_clip": 0.01095909, - "auxiliary_loss_mlp": 0.01034623, - "balance_loss_clip": 1.03644252, - "balance_loss_mlp": 1.02241611, - "epoch": 0.8682398917781452, - "flos": 23289937194240.0, - "grad_norm": 1.7310260750928266, - "language_loss": 0.66075879, - "learning_rate": 1.7929003577931722e-07, - "loss": 0.68206406, - "num_input_tokens_seen": 311536000, - "step": 14441, - "time_per_iteration": 2.5800838470458984 - }, - { - "auxiliary_loss_clip": 0.01091494, - "auxiliary_loss_mlp": 0.01025353, - "balance_loss_clip": 1.0385139, - "balance_loss_mlp": 1.0138557, - "epoch": 0.8683000150308132, - "flos": 21872794936320.0, - "grad_norm": 1.681496330113871, - "language_loss": 0.66083562, - "learning_rate": 1.7912890006156722e-07, - "loss": 0.68200409, - "num_input_tokens_seen": 311556220, - "step": 14442, - "time_per_iteration": 2.642595052719116 - }, - { - "auxiliary_loss_clip": 0.01084435, - "auxiliary_loss_mlp": 0.01033988, - "balance_loss_clip": 1.03615665, - "balance_loss_mlp": 1.02031493, - "epoch": 0.8683601382834811, - "flos": 14647209108480.0, - "grad_norm": 1.780014180776551, - "language_loss": 0.72400081, - "learning_rate": 1.7896783339143195e-07, - "loss": 0.74518502, - "num_input_tokens_seen": 311572530, - "step": 14443, - "time_per_iteration": 2.621661901473999 - }, - { - "auxiliary_loss_clip": 0.01109856, - "auxiliary_loss_mlp": 0.01028336, - "balance_loss_clip": 1.0374794, - "balance_loss_mlp": 1.01575971, - "epoch": 0.8684202615361492, - "flos": 26359904799360.0, - "grad_norm": 1.7034879908488254, - "language_loss": 0.83455396, - "learning_rate": 1.7880683577501877e-07, - "loss": 0.85593581, - "num_input_tokens_seen": 311591105, - "step": 14444, - "time_per_iteration": 2.5682990550994873 - }, - { - "auxiliary_loss_clip": 0.01071317, - "auxiliary_loss_mlp": 0.01030828, - "balance_loss_clip": 1.03839469, - "balance_loss_mlp": 1.0183413, - "epoch": 0.8684803847888171, - "flos": 20704010290560.0, - "grad_norm": 1.882585411960033, - "language_loss": 0.77276009, - "learning_rate": 1.7864590721843342e-07, - "loss": 0.79378152, - "num_input_tokens_seen": 311608350, - "step": 14445, - "time_per_iteration": 2.6933975219726562 - }, - { - "auxiliary_loss_clip": 0.01097793, - "auxiliary_loss_mlp": 0.01031892, - "balance_loss_clip": 1.0368073, - "balance_loss_mlp": 1.01954842, - "epoch": 0.8685405080414851, - "flos": 22638123043200.0, - "grad_norm": 1.8974232570725826, - "language_loss": 0.68224823, - "learning_rate": 1.7848504772777728e-07, - "loss": 0.70354509, - "num_input_tokens_seen": 311626380, - "step": 14446, - "time_per_iteration": 2.6505656242370605 - }, - { - "auxiliary_loss_clip": 0.01093238, - "auxiliary_loss_mlp": 0.01034173, - "balance_loss_clip": 1.03448546, - "balance_loss_mlp": 1.0214951, - "epoch": 0.868600631294153, - "flos": 24822065865600.0, - "grad_norm": 1.831558393609818, - "language_loss": 0.83143735, - "learning_rate": 1.7832425730915102e-07, - "loss": 0.85271144, - "num_input_tokens_seen": 311644345, - "step": 14447, - "time_per_iteration": 2.5855720043182373 - }, - { - "auxiliary_loss_clip": 0.01028885, - "auxiliary_loss_mlp": 0.01028809, - "balance_loss_clip": 1.03098965, - "balance_loss_mlp": 1.01697183, - "epoch": 0.868660754546821, - "flos": 25113983696640.0, - "grad_norm": 1.612042145706922, - "language_loss": 0.74218094, - "learning_rate": 1.781635359686515e-07, - "loss": 0.76275784, - "num_input_tokens_seen": 311663340, - "step": 14448, - "time_per_iteration": 2.75423002243042 - }, - { - "auxiliary_loss_clip": 0.01081834, - "auxiliary_loss_mlp": 0.01032637, - "balance_loss_clip": 1.03381288, - "balance_loss_mlp": 1.01907682, - "epoch": 0.8687208777994889, - "flos": 12677832178560.0, - "grad_norm": 1.9294306155040917, - "language_loss": 0.79997855, - "learning_rate": 1.7800288371237303e-07, - "loss": 0.82112324, - "num_input_tokens_seen": 311679860, - "step": 14449, - "time_per_iteration": 2.6481199264526367 - }, - { - "auxiliary_loss_clip": 0.0100162, - "auxiliary_loss_mlp": 0.01004017, - "balance_loss_clip": 1.00803828, - "balance_loss_mlp": 1.0030396, - "epoch": 0.8687810010521569, - "flos": 65617235573760.0, - "grad_norm": 0.811742362179789, - "language_loss": 0.60572553, - "learning_rate": 1.7784230054640758e-07, - "loss": 0.62578189, - "num_input_tokens_seen": 311738135, - "step": 14450, - "time_per_iteration": 3.1744225025177 - }, - { - "auxiliary_loss_clip": 0.01084674, - "auxiliary_loss_mlp": 0.01030905, - "balance_loss_clip": 1.03782833, - "balance_loss_mlp": 1.01882339, - "epoch": 0.8688411243048249, - "flos": 24244012293120.0, - "grad_norm": 1.7384604154685417, - "language_loss": 0.76132762, - "learning_rate": 1.7768178647684517e-07, - "loss": 0.78248346, - "num_input_tokens_seen": 311756975, - "step": 14451, - "time_per_iteration": 2.71647310256958 - }, - { - "auxiliary_loss_clip": 0.01093999, - "auxiliary_loss_mlp": 0.01027005, - "balance_loss_clip": 1.03542089, - "balance_loss_mlp": 1.01485705, - "epoch": 0.8689012475574929, - "flos": 18221828843520.0, - "grad_norm": 3.077369236554663, - "language_loss": 0.71929884, - "learning_rate": 1.7752134150977205e-07, - "loss": 0.74050885, - "num_input_tokens_seen": 311771830, - "step": 14452, - "time_per_iteration": 2.6421010494232178 - }, - { - "auxiliary_loss_clip": 0.01086249, - "auxiliary_loss_mlp": 0.00770837, - "balance_loss_clip": 1.03687978, - "balance_loss_mlp": 1.00033617, - "epoch": 0.8689613708101608, - "flos": 19646728439040.0, - "grad_norm": 1.4971300186991454, - "language_loss": 0.72101021, - "learning_rate": 1.7736096565127201e-07, - "loss": 0.73958105, - "num_input_tokens_seen": 311790130, - "step": 14453, - "time_per_iteration": 2.6629247665405273 - }, - { - "auxiliary_loss_clip": 0.01096295, - "auxiliary_loss_mlp": 0.01035101, - "balance_loss_clip": 1.03675365, - "balance_loss_mlp": 1.02261996, - "epoch": 0.8690214940628288, - "flos": 11728749070080.0, - "grad_norm": 3.182912447217293, - "language_loss": 0.73198676, - "learning_rate": 1.7720065890742664e-07, - "loss": 0.75330073, - "num_input_tokens_seen": 311808360, - "step": 14454, - "time_per_iteration": 2.6625709533691406 - }, - { - "auxiliary_loss_clip": 0.01109645, - "auxiliary_loss_mlp": 0.01031456, - "balance_loss_clip": 1.03889263, - "balance_loss_mlp": 1.01947582, - "epoch": 0.8690816173154968, - "flos": 34936450076160.0, - "grad_norm": 2.5283080783573615, - "language_loss": 0.59421092, - "learning_rate": 1.7704042128431552e-07, - "loss": 0.61562192, - "num_input_tokens_seen": 311831325, - "step": 14455, - "time_per_iteration": 2.716947078704834 - }, - { - "auxiliary_loss_clip": 0.01088564, - "auxiliary_loss_mlp": 0.01031079, - "balance_loss_clip": 1.03601408, - "balance_loss_mlp": 1.0188961, - "epoch": 0.8691417405681647, - "flos": 11614804151040.0, - "grad_norm": 2.476455717843228, - "language_loss": 0.80191058, - "learning_rate": 1.7688025278801378e-07, - "loss": 0.823107, - "num_input_tokens_seen": 311848090, - "step": 14456, - "time_per_iteration": 2.608692169189453 - }, - { - "auxiliary_loss_clip": 0.01050256, - "auxiliary_loss_mlp": 0.01043748, - "balance_loss_clip": 1.03250086, - "balance_loss_mlp": 1.02862024, - "epoch": 0.8692018638208328, - "flos": 24608038677120.0, - "grad_norm": 3.350924717538294, - "language_loss": 0.74652326, - "learning_rate": 1.7672015342459568e-07, - "loss": 0.76746327, - "num_input_tokens_seen": 311867855, - "step": 14457, - "time_per_iteration": 2.8124382495880127 - }, - { - "auxiliary_loss_clip": 0.0104746, - "auxiliary_loss_mlp": 0.0103056, - "balance_loss_clip": 1.03249383, - "balance_loss_mlp": 1.01879406, - "epoch": 0.8692619870735007, - "flos": 25995124229760.0, - "grad_norm": 1.6659706537885548, - "language_loss": 0.78279102, - "learning_rate": 1.765601232001328e-07, - "loss": 0.80357122, - "num_input_tokens_seen": 311888675, - "step": 14458, - "time_per_iteration": 2.7865068912506104 - }, - { - "auxiliary_loss_clip": 0.0109921, - "auxiliary_loss_mlp": 0.01034656, - "balance_loss_clip": 1.0370791, - "balance_loss_mlp": 1.02152491, - "epoch": 0.8693221103261687, - "flos": 18041808856320.0, - "grad_norm": 1.8944843149653803, - "language_loss": 0.70788461, - "learning_rate": 1.7640016212069187e-07, - "loss": 0.72922325, - "num_input_tokens_seen": 311907310, - "step": 14459, - "time_per_iteration": 4.2408952713012695 - }, - { - "auxiliary_loss_clip": 0.01082625, - "auxiliary_loss_mlp": 0.01030636, - "balance_loss_clip": 1.03549707, - "balance_loss_mlp": 1.01960313, - "epoch": 0.8693822335788366, - "flos": 27492347859840.0, - "grad_norm": 1.4467054831762125, - "language_loss": 0.73848921, - "learning_rate": 1.762402701923398e-07, - "loss": 0.75962174, - "num_input_tokens_seen": 311929635, - "step": 14460, - "time_per_iteration": 4.442849636077881 - }, - { - "auxiliary_loss_clip": 0.01092251, - "auxiliary_loss_mlp": 0.01033418, - "balance_loss_clip": 1.03765035, - "balance_loss_mlp": 1.02094245, - "epoch": 0.8694423568315046, - "flos": 24097712198400.0, - "grad_norm": 1.8288235329592715, - "language_loss": 0.64751619, - "learning_rate": 1.7608044742113947e-07, - "loss": 0.66877288, - "num_input_tokens_seen": 311948800, - "step": 14461, - "time_per_iteration": 2.68937087059021 - }, - { - "auxiliary_loss_clip": 0.01093111, - "auxiliary_loss_mlp": 0.01033043, - "balance_loss_clip": 1.03253245, - "balance_loss_mlp": 1.0203414, - "epoch": 0.8695024800841725, - "flos": 18362131367040.0, - "grad_norm": 2.5518242110711933, - "language_loss": 0.82737637, - "learning_rate": 1.7592069381315123e-07, - "loss": 0.84863782, - "num_input_tokens_seen": 311964090, - "step": 14462, - "time_per_iteration": 5.744420289993286 - }, - { - "auxiliary_loss_clip": 0.01096615, - "auxiliary_loss_mlp": 0.01033503, - "balance_loss_clip": 1.03401327, - "balance_loss_mlp": 1.02065229, - "epoch": 0.8695626033368405, - "flos": 14027750133120.0, - "grad_norm": 1.890249833404203, - "language_loss": 0.65323138, - "learning_rate": 1.757610093744335e-07, - "loss": 0.67453253, - "num_input_tokens_seen": 311981460, - "step": 14463, - "time_per_iteration": 2.601334810256958 - }, - { - "auxiliary_loss_clip": 0.01091864, - "auxiliary_loss_mlp": 0.01035596, - "balance_loss_clip": 1.03908527, - "balance_loss_mlp": 1.02291179, - "epoch": 0.8696227265895085, - "flos": 16836862193280.0, - "grad_norm": 2.1647226205532206, - "language_loss": 0.66890931, - "learning_rate": 1.7560139411104058e-07, - "loss": 0.690184, - "num_input_tokens_seen": 312000115, - "step": 14464, - "time_per_iteration": 2.6851119995117188 - }, - { - "auxiliary_loss_clip": 0.01090151, - "auxiliary_loss_mlp": 0.01034839, - "balance_loss_clip": 1.03739452, - "balance_loss_mlp": 1.02226293, - "epoch": 0.8696828498421765, - "flos": 21799070271360.0, - "grad_norm": 2.2457253344226245, - "language_loss": 0.62439811, - "learning_rate": 1.7544184802902607e-07, - "loss": 0.64564812, - "num_input_tokens_seen": 312020770, - "step": 14465, - "time_per_iteration": 2.79040265083313 - }, - { - "auxiliary_loss_clip": 0.01091695, - "auxiliary_loss_mlp": 0.01041479, - "balance_loss_clip": 1.03505969, - "balance_loss_mlp": 1.03027892, - "epoch": 0.8697429730948444, - "flos": 22894812610560.0, - "grad_norm": 1.5293603652202958, - "language_loss": 0.84881204, - "learning_rate": 1.7528237113443934e-07, - "loss": 0.87014377, - "num_input_tokens_seen": 312041870, - "step": 14466, - "time_per_iteration": 2.636146306991577 - }, - { - "auxiliary_loss_clip": 0.0108122, - "auxiliary_loss_mlp": 0.01044083, - "balance_loss_clip": 1.03755033, - "balance_loss_mlp": 1.02939653, - "epoch": 0.8698030963475124, - "flos": 24717458482560.0, - "grad_norm": 2.8453884595631846, - "language_loss": 0.61869633, - "learning_rate": 1.7512296343332779e-07, - "loss": 0.63994938, - "num_input_tokens_seen": 312058210, - "step": 14467, - "time_per_iteration": 2.6638076305389404 - }, - { - "auxiliary_loss_clip": 0.01103261, - "auxiliary_loss_mlp": 0.01028354, - "balance_loss_clip": 1.03525686, - "balance_loss_mlp": 1.01705909, - "epoch": 0.8698632196001803, - "flos": 28442221067520.0, - "grad_norm": 1.4153650067531596, - "language_loss": 0.68961638, - "learning_rate": 1.7496362493173655e-07, - "loss": 0.71093249, - "num_input_tokens_seen": 312082665, - "step": 14468, - "time_per_iteration": 2.6570017337799072 - }, - { - "auxiliary_loss_clip": 0.01083749, - "auxiliary_loss_mlp": 0.01030565, - "balance_loss_clip": 1.03446794, - "balance_loss_mlp": 1.01894248, - "epoch": 0.8699233428528483, - "flos": 27636457224960.0, - "grad_norm": 1.5754041648724575, - "language_loss": 0.71199894, - "learning_rate": 1.7480435563570773e-07, - "loss": 0.73314214, - "num_input_tokens_seen": 312101960, - "step": 14469, - "time_per_iteration": 2.6813437938690186 - }, - { - "auxiliary_loss_clip": 0.01091595, - "auxiliary_loss_mlp": 0.01032263, - "balance_loss_clip": 1.03561163, - "balance_loss_mlp": 1.0210638, - "epoch": 0.8699834661055164, - "flos": 20045659864320.0, - "grad_norm": 1.885135452961054, - "language_loss": 0.84151506, - "learning_rate": 1.7464515555128024e-07, - "loss": 0.86275363, - "num_input_tokens_seen": 312117125, - "step": 14470, - "time_per_iteration": 2.6702113151550293 - }, - { - "auxiliary_loss_clip": 0.01081371, - "auxiliary_loss_mlp": 0.01034725, - "balance_loss_clip": 1.03428483, - "balance_loss_mlp": 1.02214813, - "epoch": 0.8700435893581843, - "flos": 23732787974400.0, - "grad_norm": 1.7089523138026592, - "language_loss": 0.72859287, - "learning_rate": 1.7448602468449148e-07, - "loss": 0.74975377, - "num_input_tokens_seen": 312135775, - "step": 14471, - "time_per_iteration": 2.695295572280884 - }, - { - "auxiliary_loss_clip": 0.01107843, - "auxiliary_loss_mlp": 0.01025754, - "balance_loss_clip": 1.03751683, - "balance_loss_mlp": 1.01464319, - "epoch": 0.8701037126108523, - "flos": 23548422441600.0, - "grad_norm": 1.3968254989831368, - "language_loss": 0.78822994, - "learning_rate": 1.7432696304137573e-07, - "loss": 0.80956596, - "num_input_tokens_seen": 312156070, - "step": 14472, - "time_per_iteration": 2.570103883743286 - }, - { - "auxiliary_loss_clip": 0.01091602, - "auxiliary_loss_mlp": 0.00771146, - "balance_loss_clip": 1.03555846, - "balance_loss_mlp": 1.00026262, - "epoch": 0.8701638358635202, - "flos": 18843442634880.0, - "grad_norm": 2.053518578575987, - "language_loss": 0.72808838, - "learning_rate": 1.741679706279644e-07, - "loss": 0.74671578, - "num_input_tokens_seen": 312174380, - "step": 14473, - "time_per_iteration": 2.5629189014434814 - }, - { - "auxiliary_loss_clip": 0.01111529, - "auxiliary_loss_mlp": 0.01033322, - "balance_loss_clip": 1.03829002, - "balance_loss_mlp": 1.02074575, - "epoch": 0.8702239591161882, - "flos": 27928339142400.0, - "grad_norm": 1.5975251132862047, - "language_loss": 0.72459877, - "learning_rate": 1.7400904745028644e-07, - "loss": 0.74604738, - "num_input_tokens_seen": 312195130, - "step": 14474, - "time_per_iteration": 2.629110097885132 - }, - { - "auxiliary_loss_clip": 0.01084584, - "auxiliary_loss_mlp": 0.01037278, - "balance_loss_clip": 1.03387856, - "balance_loss_mlp": 1.02389669, - "epoch": 0.8702840823688561, - "flos": 17233997938560.0, - "grad_norm": 1.7683975899654203, - "language_loss": 0.67307568, - "learning_rate": 1.7385019351436925e-07, - "loss": 0.69429433, - "num_input_tokens_seen": 312212300, - "step": 14475, - "time_per_iteration": 2.7122128009796143 - }, - { - "auxiliary_loss_clip": 0.01107714, - "auxiliary_loss_mlp": 0.01025637, - "balance_loss_clip": 1.03506005, - "balance_loss_mlp": 1.01282167, - "epoch": 0.8703442056215241, - "flos": 19427565605760.0, - "grad_norm": 1.7492617051008474, - "language_loss": 0.77730834, - "learning_rate": 1.736914088262349e-07, - "loss": 0.79864192, - "num_input_tokens_seen": 312231735, - "step": 14476, - "time_per_iteration": 2.6359400749206543 - }, - { - "auxiliary_loss_clip": 0.01090317, - "auxiliary_loss_mlp": 0.01034999, - "balance_loss_clip": 1.03377438, - "balance_loss_mlp": 1.02168965, - "epoch": 0.8704043288741921, - "flos": 22273845264000.0, - "grad_norm": 1.9949328659253254, - "language_loss": 0.72224838, - "learning_rate": 1.7353269339190525e-07, - "loss": 0.74350154, - "num_input_tokens_seen": 312253060, - "step": 14477, - "time_per_iteration": 2.7253026962280273 - }, - { - "auxiliary_loss_clip": 0.01100703, - "auxiliary_loss_mlp": 0.01029682, - "balance_loss_clip": 1.03792751, - "balance_loss_mlp": 1.01752841, - "epoch": 0.8704644521268601, - "flos": 16648725732480.0, - "grad_norm": 1.8285670196603703, - "language_loss": 0.59689963, - "learning_rate": 1.7337404721739946e-07, - "loss": 0.61820352, - "num_input_tokens_seen": 312269460, - "step": 14478, - "time_per_iteration": 2.6406443119049072 - }, - { - "auxiliary_loss_clip": 0.01099279, - "auxiliary_loss_mlp": 0.01028306, - "balance_loss_clip": 1.04014349, - "balance_loss_mlp": 1.01780367, - "epoch": 0.870524575379528, - "flos": 24280210224000.0, - "grad_norm": 1.716825353140286, - "language_loss": 0.71369159, - "learning_rate": 1.732154703087323e-07, - "loss": 0.73496747, - "num_input_tokens_seen": 312289830, - "step": 14479, - "time_per_iteration": 2.6733837127685547 - }, - { - "auxiliary_loss_clip": 0.01084359, - "auxiliary_loss_mlp": 0.01031083, - "balance_loss_clip": 1.03538418, - "balance_loss_mlp": 1.01857221, - "epoch": 0.870584698632196, - "flos": 28768684803840.0, - "grad_norm": 1.4964038489812062, - "language_loss": 0.70916605, - "learning_rate": 1.7305696267191805e-07, - "loss": 0.73032045, - "num_input_tokens_seen": 312311320, - "step": 14480, - "time_per_iteration": 2.724393367767334 - }, - { - "auxiliary_loss_clip": 0.01056493, - "auxiliary_loss_mlp": 0.01033786, - "balance_loss_clip": 1.03123474, - "balance_loss_mlp": 1.0217936, - "epoch": 0.8706448218848639, - "flos": 32449635774720.0, - "grad_norm": 1.7419679363065612, - "language_loss": 0.70210093, - "learning_rate": 1.728985243129666e-07, - "loss": 0.72300369, - "num_input_tokens_seen": 312332095, - "step": 14481, - "time_per_iteration": 2.9082820415496826 - }, - { - "auxiliary_loss_clip": 0.01096033, - "auxiliary_loss_mlp": 0.0103073, - "balance_loss_clip": 1.03603554, - "balance_loss_mlp": 1.01895249, - "epoch": 0.8707049451375319, - "flos": 22748009725440.0, - "grad_norm": 1.9715155189450182, - "language_loss": 0.76938367, - "learning_rate": 1.7274015523788643e-07, - "loss": 0.79065132, - "num_input_tokens_seen": 312351225, - "step": 14482, - "time_per_iteration": 2.663579225540161 - }, - { - "auxiliary_loss_clip": 0.0108459, - "auxiliary_loss_mlp": 0.01033132, - "balance_loss_clip": 1.03461742, - "balance_loss_mlp": 1.02019787, - "epoch": 0.8707650683902, - "flos": 15851976203520.0, - "grad_norm": 1.9099743094346329, - "language_loss": 0.76708519, - "learning_rate": 1.7258185545268234e-07, - "loss": 0.78826237, - "num_input_tokens_seen": 312369730, - "step": 14483, - "time_per_iteration": 2.6323695182800293 - }, - { - "auxiliary_loss_clip": 0.01102699, - "auxiliary_loss_mlp": 0.01038639, - "balance_loss_clip": 1.03712118, - "balance_loss_mlp": 1.02540636, - "epoch": 0.8708251916428679, - "flos": 16468131127680.0, - "grad_norm": 2.2142588001680856, - "language_loss": 0.61881113, - "learning_rate": 1.7242362496335749e-07, - "loss": 0.64022452, - "num_input_tokens_seen": 312386780, - "step": 14484, - "time_per_iteration": 2.710033893585205 - }, - { - "auxiliary_loss_clip": 0.01108847, - "auxiliary_loss_mlp": 0.01031795, - "balance_loss_clip": 1.03816152, - "balance_loss_mlp": 1.01980281, - "epoch": 0.8708853148955359, - "flos": 15377847655680.0, - "grad_norm": 2.077574729557336, - "language_loss": 0.68238926, - "learning_rate": 1.7226546377591222e-07, - "loss": 0.70379567, - "num_input_tokens_seen": 312404875, - "step": 14485, - "time_per_iteration": 2.5754683017730713 - }, - { - "auxiliary_loss_clip": 0.01050138, - "auxiliary_loss_mlp": 0.00770399, - "balance_loss_clip": 1.03129363, - "balance_loss_mlp": 1.00021982, - "epoch": 0.8709454381482038, - "flos": 30551325903360.0, - "grad_norm": 1.7252030737684174, - "language_loss": 0.62990439, - "learning_rate": 1.7210737189634373e-07, - "loss": 0.64810973, - "num_input_tokens_seen": 312425280, - "step": 14486, - "time_per_iteration": 2.9066638946533203 - }, - { - "auxiliary_loss_clip": 0.01111488, - "auxiliary_loss_mlp": 0.01033225, - "balance_loss_clip": 1.03683174, - "balance_loss_mlp": 1.02015388, - "epoch": 0.8710055614008718, - "flos": 22601422321920.0, - "grad_norm": 1.8160916488481187, - "language_loss": 0.61385965, - "learning_rate": 1.7194934933064653e-07, - "loss": 0.63530672, - "num_input_tokens_seen": 312443835, - "step": 14487, - "time_per_iteration": 2.5739262104034424 - }, - { - "auxiliary_loss_clip": 0.01081023, - "auxiliary_loss_mlp": 0.00768637, - "balance_loss_clip": 1.03572392, - "balance_loss_mlp": 1.00022483, - "epoch": 0.8710656846535397, - "flos": 18443146492800.0, - "grad_norm": 2.0123613366122126, - "language_loss": 0.67942166, - "learning_rate": 1.7179139608481318e-07, - "loss": 0.6979183, - "num_input_tokens_seen": 312460830, - "step": 14488, - "time_per_iteration": 2.7428195476531982 - }, - { - "auxiliary_loss_clip": 0.01092486, - "auxiliary_loss_mlp": 0.007699, - "balance_loss_clip": 1.03904903, - "balance_loss_mlp": 1.00028038, - "epoch": 0.8711258079062077, - "flos": 16503862181760.0, - "grad_norm": 1.8864520858010565, - "language_loss": 0.85530466, - "learning_rate": 1.716335121648338e-07, - "loss": 0.87392855, - "num_input_tokens_seen": 312477575, - "step": 14489, - "time_per_iteration": 2.647411346435547 - }, - { - "auxiliary_loss_clip": 0.01102857, - "auxiliary_loss_mlp": 0.01030869, - "balance_loss_clip": 1.03787231, - "balance_loss_mlp": 1.01791716, - "epoch": 0.8711859311588757, - "flos": 15663336952320.0, - "grad_norm": 11.279745936995974, - "language_loss": 0.75571835, - "learning_rate": 1.7147569757669445e-07, - "loss": 0.77705562, - "num_input_tokens_seen": 312492140, - "step": 14490, - "time_per_iteration": 2.602102041244507 - }, - { - "auxiliary_loss_clip": 0.01100977, - "auxiliary_loss_mlp": 0.01029449, - "balance_loss_clip": 1.03637326, - "balance_loss_mlp": 1.01625216, - "epoch": 0.8712460544115437, - "flos": 15557544420480.0, - "grad_norm": 2.2840810833035157, - "language_loss": 0.7581045, - "learning_rate": 1.7131795232638012e-07, - "loss": 0.77940881, - "num_input_tokens_seen": 312508400, - "step": 14491, - "time_per_iteration": 2.600862503051758 - }, - { - "auxiliary_loss_clip": 0.01080925, - "auxiliary_loss_mlp": 0.01026616, - "balance_loss_clip": 1.04117799, - "balance_loss_mlp": 1.01437354, - "epoch": 0.8713061776642116, - "flos": 16763568491520.0, - "grad_norm": 1.774399528748011, - "language_loss": 0.67152178, - "learning_rate": 1.711602764198723e-07, - "loss": 0.69259721, - "num_input_tokens_seen": 312525915, - "step": 14492, - "time_per_iteration": 2.666191577911377 - }, - { - "auxiliary_loss_clip": 0.01095889, - "auxiliary_loss_mlp": 0.01032256, - "balance_loss_clip": 1.03753376, - "balance_loss_mlp": 1.02096081, - "epoch": 0.8713663009168796, - "flos": 24279887001600.0, - "grad_norm": 1.7247817112541417, - "language_loss": 0.6931386, - "learning_rate": 1.7100266986314992e-07, - "loss": 0.71442008, - "num_input_tokens_seen": 312544735, - "step": 14493, - "time_per_iteration": 2.6735992431640625 - }, - { - "auxiliary_loss_clip": 0.01112164, - "auxiliary_loss_mlp": 0.01033694, - "balance_loss_clip": 1.03958261, - "balance_loss_mlp": 1.02021742, - "epoch": 0.8714264241695475, - "flos": 23795594904960.0, - "grad_norm": 2.938022699932479, - "language_loss": 0.8914628, - "learning_rate": 1.7084513266218936e-07, - "loss": 0.91292143, - "num_input_tokens_seen": 312557910, - "step": 14494, - "time_per_iteration": 2.5774879455566406 - }, - { - "auxiliary_loss_clip": 0.01074718, - "auxiliary_loss_mlp": 0.01032805, - "balance_loss_clip": 1.03785324, - "balance_loss_mlp": 1.02117586, - "epoch": 0.8714865474222155, - "flos": 37997942071680.0, - "grad_norm": 1.9797291272398052, - "language_loss": 0.59116101, - "learning_rate": 1.7068766482296514e-07, - "loss": 0.61223626, - "num_input_tokens_seen": 312580360, - "step": 14495, - "time_per_iteration": 2.8289716243743896 - }, - { - "auxiliary_loss_clip": 0.01076759, - "auxiliary_loss_mlp": 0.01037611, - "balance_loss_clip": 1.03488982, - "balance_loss_mlp": 1.02474201, - "epoch": 0.8715466706748836, - "flos": 22455696844800.0, - "grad_norm": 2.176188158663058, - "language_loss": 0.80262101, - "learning_rate": 1.7053026635144762e-07, - "loss": 0.82376468, - "num_input_tokens_seen": 312597550, - "step": 14496, - "time_per_iteration": 2.6638436317443848 - }, - { - "auxiliary_loss_clip": 0.01083126, - "auxiliary_loss_mlp": 0.01037102, - "balance_loss_clip": 1.03796446, - "balance_loss_mlp": 1.02335715, - "epoch": 0.8716067939275515, - "flos": 21215126868480.0, - "grad_norm": 2.0021272743800536, - "language_loss": 0.78574479, - "learning_rate": 1.7037293725360624e-07, - "loss": 0.80694699, - "num_input_tokens_seen": 312616435, - "step": 14497, - "time_per_iteration": 2.6190896034240723 - }, - { - "auxiliary_loss_clip": 0.01111391, - "auxiliary_loss_mlp": 0.01030843, - "balance_loss_clip": 1.03765655, - "balance_loss_mlp": 1.01795101, - "epoch": 0.8716669171802195, - "flos": 22997732054400.0, - "grad_norm": 1.9670976270372313, - "language_loss": 0.67136586, - "learning_rate": 1.70215677535406e-07, - "loss": 0.69278824, - "num_input_tokens_seen": 312632770, - "step": 14498, - "time_per_iteration": 4.060052394866943 - }, - { - "auxiliary_loss_clip": 0.01070213, - "auxiliary_loss_mlp": 0.01031582, - "balance_loss_clip": 1.03320992, - "balance_loss_mlp": 1.01950634, - "epoch": 0.8717270404328874, - "flos": 29784058462080.0, - "grad_norm": 1.6975392941334262, - "language_loss": 0.57051951, - "learning_rate": 1.700584872028108e-07, - "loss": 0.59153748, - "num_input_tokens_seen": 312651900, - "step": 14499, - "time_per_iteration": 4.371240615844727 - }, - { - "auxiliary_loss_clip": 0.01067535, - "auxiliary_loss_mlp": 0.01035634, - "balance_loss_clip": 1.03329492, - "balance_loss_mlp": 1.02273571, - "epoch": 0.8717871636855554, - "flos": 22018125363840.0, - "grad_norm": 2.018070377597452, - "language_loss": 0.79869312, - "learning_rate": 1.6990136626178097e-07, - "loss": 0.8197248, - "num_input_tokens_seen": 312671380, - "step": 14500, - "time_per_iteration": 2.641244888305664 - }, - { - "auxiliary_loss_clip": 0.01093156, - "auxiliary_loss_mlp": 0.0103193, - "balance_loss_clip": 1.03767002, - "balance_loss_mlp": 1.01997352, - "epoch": 0.8718472869382233, - "flos": 16654256426880.0, - "grad_norm": 1.9117037031727822, - "language_loss": 0.72699761, - "learning_rate": 1.6974431471827466e-07, - "loss": 0.74824846, - "num_input_tokens_seen": 312689215, - "step": 14501, - "time_per_iteration": 5.817331552505493 - }, - { - "auxiliary_loss_clip": 0.01072933, - "auxiliary_loss_mlp": 0.0102922, - "balance_loss_clip": 1.03339136, - "balance_loss_mlp": 1.01612496, - "epoch": 0.8719074101908914, - "flos": 19495328613120.0, - "grad_norm": 2.7665364794887934, - "language_loss": 0.64852804, - "learning_rate": 1.695873325782482e-07, - "loss": 0.66954952, - "num_input_tokens_seen": 312706400, - "step": 14502, - "time_per_iteration": 2.730670690536499 - }, - { - "auxiliary_loss_clip": 0.01083793, - "auxiliary_loss_mlp": 0.01040001, - "balance_loss_clip": 1.03453636, - "balance_loss_mlp": 1.02594066, - "epoch": 0.8719675334435593, - "flos": 33070890430080.0, - "grad_norm": 1.7549915055892822, - "language_loss": 0.68897182, - "learning_rate": 1.6943041984765262e-07, - "loss": 0.71020973, - "num_input_tokens_seen": 312727985, - "step": 14503, - "time_per_iteration": 2.7599282264709473 - }, - { - "auxiliary_loss_clip": 0.01085187, - "auxiliary_loss_mlp": 0.01028495, - "balance_loss_clip": 1.03614664, - "balance_loss_mlp": 1.01606762, - "epoch": 0.8720276566962273, - "flos": 13626268842240.0, - "grad_norm": 2.4452833389757833, - "language_loss": 0.69641596, - "learning_rate": 1.6927357653243912e-07, - "loss": 0.71755278, - "num_input_tokens_seen": 312745025, - "step": 14504, - "time_per_iteration": 2.651085376739502 - }, - { - "auxiliary_loss_clip": 0.01095546, - "auxiliary_loss_mlp": 0.00770191, - "balance_loss_clip": 1.03598738, - "balance_loss_mlp": 1.00016737, - "epoch": 0.8720877799488952, - "flos": 23514163845120.0, - "grad_norm": 2.77338091149224, - "language_loss": 0.7014603, - "learning_rate": 1.691168026385552e-07, - "loss": 0.72011769, - "num_input_tokens_seen": 312764170, - "step": 14505, - "time_per_iteration": 2.6669936180114746 - }, - { - "auxiliary_loss_clip": 0.010867, - "auxiliary_loss_mlp": 0.01028087, - "balance_loss_clip": 1.03689265, - "balance_loss_mlp": 1.01638639, - "epoch": 0.8721479032015632, - "flos": 20814148368000.0, - "grad_norm": 2.005999921971975, - "language_loss": 0.78253883, - "learning_rate": 1.6896009817194545e-07, - "loss": 0.80368668, - "num_input_tokens_seen": 312783830, - "step": 14506, - "time_per_iteration": 2.657680034637451 - }, - { - "auxiliary_loss_clip": 0.01088712, - "auxiliary_loss_mlp": 0.01028485, - "balance_loss_clip": 1.03430939, - "balance_loss_mlp": 1.01588416, - "epoch": 0.8722080264542311, - "flos": 19463655795840.0, - "grad_norm": 2.6356366496590775, - "language_loss": 0.73982906, - "learning_rate": 1.6880346313855221e-07, - "loss": 0.76100105, - "num_input_tokens_seen": 312802015, - "step": 14507, - "time_per_iteration": 2.6549437046051025 - }, - { - "auxiliary_loss_clip": 0.01050345, - "auxiliary_loss_mlp": 0.01041227, - "balance_loss_clip": 1.03153408, - "balance_loss_mlp": 1.02601552, - "epoch": 0.8722681497068991, - "flos": 21761866759680.0, - "grad_norm": 2.186590491088002, - "language_loss": 0.72111464, - "learning_rate": 1.686468975443156e-07, - "loss": 0.74203038, - "num_input_tokens_seen": 312820650, - "step": 14508, - "time_per_iteration": 2.7782466411590576 - }, - { - "auxiliary_loss_clip": 0.01091843, - "auxiliary_loss_mlp": 0.01035384, - "balance_loss_clip": 1.03782344, - "balance_loss_mlp": 1.02198446, - "epoch": 0.8723282729595672, - "flos": 28877134942080.0, - "grad_norm": 9.271419619391889, - "language_loss": 0.68848205, - "learning_rate": 1.6849040139517202e-07, - "loss": 0.70975429, - "num_input_tokens_seen": 312841310, - "step": 14509, - "time_per_iteration": 2.729306221008301 - }, - { - "auxiliary_loss_clip": 0.01084143, - "auxiliary_loss_mlp": 0.01032677, - "balance_loss_clip": 1.03603458, - "balance_loss_mlp": 1.02049422, - "epoch": 0.8723883962122351, - "flos": 26469145036800.0, - "grad_norm": 1.83494283279599, - "language_loss": 0.58361018, - "learning_rate": 1.683339746970558e-07, - "loss": 0.60477841, - "num_input_tokens_seen": 312862100, - "step": 14510, - "time_per_iteration": 2.712592363357544 - }, - { - "auxiliary_loss_clip": 0.01115632, - "auxiliary_loss_mlp": 0.01032731, - "balance_loss_clip": 1.03837419, - "balance_loss_mlp": 1.01929033, - "epoch": 0.8724485194649031, - "flos": 20521476351360.0, - "grad_norm": 2.9455639532360003, - "language_loss": 0.67271483, - "learning_rate": 1.6817761745589865e-07, - "loss": 0.69419849, - "num_input_tokens_seen": 312880220, - "step": 14511, - "time_per_iteration": 2.6101818084716797 - }, - { - "auxiliary_loss_clip": 0.01066568, - "auxiliary_loss_mlp": 0.01035139, - "balance_loss_clip": 1.03755903, - "balance_loss_mlp": 1.02190125, - "epoch": 0.872508642717571, - "flos": 24353360271360.0, - "grad_norm": 1.5822238245751863, - "language_loss": 0.81579173, - "learning_rate": 1.6802132967763027e-07, - "loss": 0.8368088, - "num_input_tokens_seen": 312900765, - "step": 14512, - "time_per_iteration": 2.8737993240356445 - }, - { - "auxiliary_loss_clip": 0.01013613, - "auxiliary_loss_mlp": 0.01001982, - "balance_loss_clip": 1.01023149, - "balance_loss_mlp": 1.00103402, - "epoch": 0.872568765970239, - "flos": 61410012485760.0, - "grad_norm": 0.7938781120275261, - "language_loss": 0.58586168, - "learning_rate": 1.6786511136817617e-07, - "loss": 0.60601765, - "num_input_tokens_seen": 312955840, - "step": 14513, - "time_per_iteration": 3.0974059104919434 - }, - { - "auxiliary_loss_clip": 0.01099507, - "auxiliary_loss_mlp": 0.01033855, - "balance_loss_clip": 1.03738713, - "balance_loss_mlp": 1.02111769, - "epoch": 0.8726288892229069, - "flos": 22598046443520.0, - "grad_norm": 1.742471393477679, - "language_loss": 0.76562905, - "learning_rate": 1.6770896253346112e-07, - "loss": 0.78696269, - "num_input_tokens_seen": 312973565, - "step": 14514, - "time_per_iteration": 2.6420650482177734 - }, - { - "auxiliary_loss_clip": 0.01103565, - "auxiliary_loss_mlp": 0.0102728, - "balance_loss_clip": 1.03866398, - "balance_loss_mlp": 1.01560926, - "epoch": 0.872689012475575, - "flos": 25885201633920.0, - "grad_norm": 1.9498734403168592, - "language_loss": 0.6555599, - "learning_rate": 1.675528831794055e-07, - "loss": 0.67686838, - "num_input_tokens_seen": 312994660, - "step": 14515, - "time_per_iteration": 2.6264796257019043 - }, - { - "auxiliary_loss_clip": 0.01097256, - "auxiliary_loss_mlp": 0.01035195, - "balance_loss_clip": 1.03490353, - "balance_loss_mlp": 1.02188492, - "epoch": 0.8727491357282429, - "flos": 21506721477120.0, - "grad_norm": 2.001096470926363, - "language_loss": 0.79334152, - "learning_rate": 1.6739687331192842e-07, - "loss": 0.81466603, - "num_input_tokens_seen": 313009860, - "step": 14516, - "time_per_iteration": 2.620288133621216 - }, - { - "auxiliary_loss_clip": 0.01112304, - "auxiliary_loss_mlp": 0.01034072, - "balance_loss_clip": 1.0381372, - "balance_loss_mlp": 1.02127457, - "epoch": 0.8728092589809109, - "flos": 19207504932480.0, - "grad_norm": 2.0299342762070927, - "language_loss": 0.72229123, - "learning_rate": 1.672409329369453e-07, - "loss": 0.74375498, - "num_input_tokens_seen": 313027025, - "step": 14517, - "time_per_iteration": 2.5668914318084717 - }, - { - "auxiliary_loss_clip": 0.0106993, - "auxiliary_loss_mlp": 0.01024167, - "balance_loss_clip": 1.03314495, - "balance_loss_mlp": 1.01283014, - "epoch": 0.8728693822335788, - "flos": 20595308757120.0, - "grad_norm": 2.054216166652221, - "language_loss": 0.72725064, - "learning_rate": 1.6708506206036966e-07, - "loss": 0.74819165, - "num_input_tokens_seen": 313046830, - "step": 14518, - "time_per_iteration": 2.6475393772125244 - }, - { - "auxiliary_loss_clip": 0.01081214, - "auxiliary_loss_mlp": 0.01038057, - "balance_loss_clip": 1.03350496, - "balance_loss_mlp": 1.02506304, - "epoch": 0.8729295054862468, - "flos": 21728613744000.0, - "grad_norm": 1.3596830366410917, - "language_loss": 0.743572, - "learning_rate": 1.6692926068811275e-07, - "loss": 0.76476473, - "num_input_tokens_seen": 313067715, - "step": 14519, - "time_per_iteration": 2.6572721004486084 - }, - { - "auxiliary_loss_clip": 0.01099689, - "auxiliary_loss_mlp": 0.01031709, - "balance_loss_clip": 1.03680825, - "balance_loss_mlp": 1.0181669, - "epoch": 0.8729896287389147, - "flos": 17673436926720.0, - "grad_norm": 2.5396553116313205, - "language_loss": 0.76397449, - "learning_rate": 1.6677352882608142e-07, - "loss": 0.78528845, - "num_input_tokens_seen": 313082305, - "step": 14520, - "time_per_iteration": 2.5867063999176025 - }, - { - "auxiliary_loss_clip": 0.01086668, - "auxiliary_loss_mlp": 0.01036169, - "balance_loss_clip": 1.03518891, - "balance_loss_mlp": 1.02296638, - "epoch": 0.8730497519915827, - "flos": 24571804832640.0, - "grad_norm": 1.6038658913961292, - "language_loss": 0.82005751, - "learning_rate": 1.666178664801816e-07, - "loss": 0.84128582, - "num_input_tokens_seen": 313101190, - "step": 14521, - "time_per_iteration": 2.7092795372009277 - }, - { - "auxiliary_loss_clip": 0.01097676, - "auxiliary_loss_mlp": 0.01032217, - "balance_loss_clip": 1.03878248, - "balance_loss_mlp": 1.01914012, - "epoch": 0.8731098752442508, - "flos": 13443734903040.0, - "grad_norm": 1.8658353480537415, - "language_loss": 0.76242197, - "learning_rate": 1.6646227365631616e-07, - "loss": 0.78372091, - "num_input_tokens_seen": 313118965, - "step": 14522, - "time_per_iteration": 2.5802886486053467 - }, - { - "auxiliary_loss_clip": 0.01094482, - "auxiliary_loss_mlp": 0.00769289, - "balance_loss_clip": 1.03429079, - "balance_loss_mlp": 1.0001862, - "epoch": 0.8731699984969187, - "flos": 23474446381440.0, - "grad_norm": 3.16869295355315, - "language_loss": 0.75775874, - "learning_rate": 1.66306750360385e-07, - "loss": 0.77639639, - "num_input_tokens_seen": 313139280, - "step": 14523, - "time_per_iteration": 2.684039831161499 - }, - { - "auxiliary_loss_clip": 0.01097173, - "auxiliary_loss_mlp": 0.01032086, - "balance_loss_clip": 1.03595114, - "balance_loss_mlp": 1.01999831, - "epoch": 0.8732301217495867, - "flos": 17712651600000.0, - "grad_norm": 2.782713247138861, - "language_loss": 0.78118378, - "learning_rate": 1.6615129659828542e-07, - "loss": 0.80247641, - "num_input_tokens_seen": 313156655, - "step": 14524, - "time_per_iteration": 2.5906875133514404 - }, - { - "auxiliary_loss_clip": 0.01089545, - "auxiliary_loss_mlp": 0.01031376, - "balance_loss_clip": 1.03745615, - "balance_loss_mlp": 1.02009869, - "epoch": 0.8732902450022546, - "flos": 22054359208320.0, - "grad_norm": 4.924039303176845, - "language_loss": 0.77730787, - "learning_rate": 1.6599591237591272e-07, - "loss": 0.79851705, - "num_input_tokens_seen": 313174050, - "step": 14525, - "time_per_iteration": 2.6270298957824707 - }, - { - "auxiliary_loss_clip": 0.01020522, - "auxiliary_loss_mlp": 0.01034516, - "balance_loss_clip": 1.03363109, - "balance_loss_mlp": 1.02209401, - "epoch": 0.8733503682549226, - "flos": 22272983337600.0, - "grad_norm": 2.157402662097444, - "language_loss": 0.6920954, - "learning_rate": 1.6584059769915902e-07, - "loss": 0.71264577, - "num_input_tokens_seen": 313192765, - "step": 14526, - "time_per_iteration": 3.1794915199279785 - }, - { - "auxiliary_loss_clip": 0.01059512, - "auxiliary_loss_mlp": 0.01041504, - "balance_loss_clip": 1.03597927, - "balance_loss_mlp": 1.02804565, - "epoch": 0.8734104915075905, - "flos": 23364344217600.0, - "grad_norm": 2.126615018801638, - "language_loss": 0.6124419, - "learning_rate": 1.6568535257391326e-07, - "loss": 0.63345206, - "num_input_tokens_seen": 313210925, - "step": 14527, - "time_per_iteration": 2.93717098236084 - }, - { - "auxiliary_loss_clip": 0.01102101, - "auxiliary_loss_mlp": 0.01036741, - "balance_loss_clip": 1.04113436, - "balance_loss_mlp": 1.02263236, - "epoch": 0.8734706147602586, - "flos": 17712292464000.0, - "grad_norm": 1.9327506841110211, - "language_loss": 0.65617096, - "learning_rate": 1.6553017700606265e-07, - "loss": 0.67755938, - "num_input_tokens_seen": 313228250, - "step": 14528, - "time_per_iteration": 2.5247788429260254 - }, - { - "auxiliary_loss_clip": 0.01080324, - "auxiliary_loss_mlp": 0.01027224, - "balance_loss_clip": 1.03828454, - "balance_loss_mlp": 1.01499307, - "epoch": 0.8735307380129265, - "flos": 22049367217920.0, - "grad_norm": 2.128650528943947, - "language_loss": 0.89494413, - "learning_rate": 1.6537507100149205e-07, - "loss": 0.91601956, - "num_input_tokens_seen": 313247880, - "step": 14529, - "time_per_iteration": 2.800915241241455 - }, - { - "auxiliary_loss_clip": 0.01085933, - "auxiliary_loss_mlp": 0.01031322, - "balance_loss_clip": 1.03527832, - "balance_loss_mlp": 1.01898432, - "epoch": 0.8735908612655945, - "flos": 25338425829120.0, - "grad_norm": 1.740049553302022, - "language_loss": 0.84358543, - "learning_rate": 1.6522003456608258e-07, - "loss": 0.8647579, - "num_input_tokens_seen": 313266790, - "step": 14530, - "time_per_iteration": 2.7246882915496826 - }, - { - "auxiliary_loss_clip": 0.01086126, - "auxiliary_loss_mlp": 0.01038129, - "balance_loss_clip": 1.03533483, - "balance_loss_mlp": 1.02629161, - "epoch": 0.8736509845182624, - "flos": 21540908246400.0, - "grad_norm": 2.065068159593715, - "language_loss": 0.74541724, - "learning_rate": 1.650650677057128e-07, - "loss": 0.7666598, - "num_input_tokens_seen": 313286805, - "step": 14531, - "time_per_iteration": 2.7866251468658447 - }, - { - "auxiliary_loss_clip": 0.01094848, - "auxiliary_loss_mlp": 0.0103322, - "balance_loss_clip": 1.0341115, - "balance_loss_mlp": 1.02093542, - "epoch": 0.8737111077709304, - "flos": 22017227523840.0, - "grad_norm": 2.6296616466434655, - "language_loss": 0.6131202, - "learning_rate": 1.6491017042625966e-07, - "loss": 0.6344009, - "num_input_tokens_seen": 313305415, - "step": 14532, - "time_per_iteration": 2.677741289138794 - }, - { - "auxiliary_loss_clip": 0.01018177, - "auxiliary_loss_mlp": 0.01004849, - "balance_loss_clip": 1.005548, - "balance_loss_mlp": 1.00377011, - "epoch": 0.8737712310235983, - "flos": 70066315912320.0, - "grad_norm": 0.9206045969458919, - "language_loss": 0.58650947, - "learning_rate": 1.6475534273359704e-07, - "loss": 0.60673976, - "num_input_tokens_seen": 313369940, - "step": 14533, - "time_per_iteration": 4.089330434799194 - }, - { - "auxiliary_loss_clip": 0.01079874, - "auxiliary_loss_mlp": 0.01032746, - "balance_loss_clip": 1.03403592, - "balance_loss_mlp": 1.02048564, - "epoch": 0.8738313542762663, - "flos": 28658331244800.0, - "grad_norm": 1.49408783242758, - "language_loss": 0.76831782, - "learning_rate": 1.646005846335954e-07, - "loss": 0.78944403, - "num_input_tokens_seen": 313390965, - "step": 14534, - "time_per_iteration": 2.702711582183838 - }, - { - "auxiliary_loss_clip": 0.0108079, - "auxiliary_loss_mlp": 0.01033546, - "balance_loss_clip": 1.03330386, - "balance_loss_mlp": 1.02107036, - "epoch": 0.8738914775289344, - "flos": 22346384780160.0, - "grad_norm": 1.7135543711038013, - "language_loss": 0.75193512, - "learning_rate": 1.6444589613212357e-07, - "loss": 0.77307844, - "num_input_tokens_seen": 313409680, - "step": 14535, - "time_per_iteration": 2.6537675857543945 - }, - { - "auxiliary_loss_clip": 0.01107851, - "auxiliary_loss_mlp": 0.01033358, - "balance_loss_clip": 1.03563666, - "balance_loss_mlp": 1.02093053, - "epoch": 0.8739516007816023, - "flos": 31759648444800.0, - "grad_norm": 2.0846644532444625, - "language_loss": 0.74546909, - "learning_rate": 1.64291277235048e-07, - "loss": 0.76688123, - "num_input_tokens_seen": 313431335, - "step": 14536, - "time_per_iteration": 2.6706697940826416 - }, - { - "auxiliary_loss_clip": 0.01087464, - "auxiliary_loss_mlp": 0.01031248, - "balance_loss_clip": 1.03460896, - "balance_loss_mlp": 1.01939237, - "epoch": 0.8740117240342703, - "flos": 21211715076480.0, - "grad_norm": 1.8068501761157092, - "language_loss": 0.63835013, - "learning_rate": 1.641367279482304e-07, - "loss": 0.65953726, - "num_input_tokens_seen": 313449225, - "step": 14537, - "time_per_iteration": 4.280652761459351 - }, - { - "auxiliary_loss_clip": 0.01094433, - "auxiliary_loss_mlp": 0.01028086, - "balance_loss_clip": 1.03392243, - "balance_loss_mlp": 1.01478267, - "epoch": 0.8740718472869382, - "flos": 25186666867200.0, - "grad_norm": 1.8076510907949124, - "language_loss": 0.57990402, - "learning_rate": 1.6398224827753216e-07, - "loss": 0.60112923, - "num_input_tokens_seen": 313467715, - "step": 14538, - "time_per_iteration": 4.291844844818115 - }, - { - "auxiliary_loss_clip": 0.01096418, - "auxiliary_loss_mlp": 0.01025884, - "balance_loss_clip": 1.03719354, - "balance_loss_mlp": 1.0136714, - "epoch": 0.8741319705396062, - "flos": 19500931134720.0, - "grad_norm": 1.7388451814310184, - "language_loss": 0.68716401, - "learning_rate": 1.6382783822881142e-07, - "loss": 0.70838702, - "num_input_tokens_seen": 313486805, - "step": 14539, - "time_per_iteration": 2.5990817546844482 - }, - { - "auxiliary_loss_clip": 0.01101524, - "auxiliary_loss_mlp": 0.01030794, - "balance_loss_clip": 1.03593516, - "balance_loss_mlp": 1.01815796, - "epoch": 0.8741920937922741, - "flos": 14100900180480.0, - "grad_norm": 2.0449241273671355, - "language_loss": 0.74361241, - "learning_rate": 1.6367349780792262e-07, - "loss": 0.76493561, - "num_input_tokens_seen": 313504880, - "step": 14540, - "time_per_iteration": 2.6135077476501465 - }, - { - "auxiliary_loss_clip": 0.01082066, - "auxiliary_loss_mlp": 0.0103892, - "balance_loss_clip": 1.03429246, - "balance_loss_mlp": 1.02535379, - "epoch": 0.8742522170449422, - "flos": 27709858667520.0, - "grad_norm": 2.2042306692212947, - "language_loss": 0.78727126, - "learning_rate": 1.635192270207193e-07, - "loss": 0.8084811, - "num_input_tokens_seen": 313524995, - "step": 14541, - "time_per_iteration": 5.828189849853516 - }, - { - "auxiliary_loss_clip": 0.01068115, - "auxiliary_loss_mlp": 0.01034781, - "balance_loss_clip": 1.03299069, - "balance_loss_mlp": 1.02049947, - "epoch": 0.8743123402976101, - "flos": 21142587352320.0, - "grad_norm": 2.5163397271017724, - "language_loss": 0.66620183, - "learning_rate": 1.6336502587305035e-07, - "loss": 0.68723083, - "num_input_tokens_seen": 313541740, - "step": 14542, - "time_per_iteration": 2.7577908039093018 - }, - { - "auxiliary_loss_clip": 0.01027438, - "auxiliary_loss_mlp": 0.0100168, - "balance_loss_clip": 1.00493681, - "balance_loss_mlp": 1.00071454, - "epoch": 0.8743724635502781, - "flos": 60870024351360.0, - "grad_norm": 0.7818261146678972, - "language_loss": 0.54485422, - "learning_rate": 1.632108943707642e-07, - "loss": 0.56514537, - "num_input_tokens_seen": 313593445, - "step": 14543, - "time_per_iteration": 2.908863067626953 - }, - { - "auxiliary_loss_clip": 0.01084752, - "auxiliary_loss_mlp": 0.01035429, - "balance_loss_clip": 1.0375371, - "balance_loss_mlp": 1.02258444, - "epoch": 0.874432586802946, - "flos": 28109292883200.0, - "grad_norm": 2.3839087640585457, - "language_loss": 0.69428027, - "learning_rate": 1.6305683251970458e-07, - "loss": 0.71548212, - "num_input_tokens_seen": 313615640, - "step": 14544, - "time_per_iteration": 2.6920766830444336 - }, - { - "auxiliary_loss_clip": 0.01064253, - "auxiliary_loss_mlp": 0.01028954, - "balance_loss_clip": 1.03769612, - "balance_loss_mlp": 1.01798081, - "epoch": 0.874492710055614, - "flos": 23550289948800.0, - "grad_norm": 1.7246009574285497, - "language_loss": 0.75945365, - "learning_rate": 1.62902840325714e-07, - "loss": 0.78038573, - "num_input_tokens_seen": 313635550, - "step": 14545, - "time_per_iteration": 2.7786312103271484 - }, - { - "auxiliary_loss_clip": 0.01097234, - "auxiliary_loss_mlp": 0.00771469, - "balance_loss_clip": 1.03498626, - "balance_loss_mlp": 1.00026131, - "epoch": 0.8745528333082819, - "flos": 40915647924480.0, - "grad_norm": 10.499099096665093, - "language_loss": 0.66618592, - "learning_rate": 1.6274891779463217e-07, - "loss": 0.68487293, - "num_input_tokens_seen": 313659275, - "step": 14546, - "time_per_iteration": 2.8346989154815674 - }, - { - "auxiliary_loss_clip": 0.01109602, - "auxiliary_loss_mlp": 0.01030191, - "balance_loss_clip": 1.03745484, - "balance_loss_mlp": 1.01785886, - "epoch": 0.87461295656095, - "flos": 23622901292160.0, - "grad_norm": 1.5789135583569807, - "language_loss": 0.7296229, - "learning_rate": 1.6259506493229536e-07, - "loss": 0.75102079, - "num_input_tokens_seen": 313680595, - "step": 14547, - "time_per_iteration": 2.659517526626587 - }, - { - "auxiliary_loss_clip": 0.01115124, - "auxiliary_loss_mlp": 0.01040105, - "balance_loss_clip": 1.03795385, - "balance_loss_mlp": 1.02661061, - "epoch": 0.874673079813618, - "flos": 38794116983040.0, - "grad_norm": 3.3678360175538087, - "language_loss": 0.69317234, - "learning_rate": 1.6244128174453752e-07, - "loss": 0.71472466, - "num_input_tokens_seen": 313699730, - "step": 14548, - "time_per_iteration": 2.754931926727295 - }, - { - "auxiliary_loss_clip": 0.01090989, - "auxiliary_loss_mlp": 0.01033861, - "balance_loss_clip": 1.03693557, - "balance_loss_mlp": 1.02118921, - "epoch": 0.8747332030662859, - "flos": 23696159080320.0, - "grad_norm": 2.005045026903121, - "language_loss": 0.70676434, - "learning_rate": 1.6228756823719093e-07, - "loss": 0.72801286, - "num_input_tokens_seen": 313720090, - "step": 14549, - "time_per_iteration": 2.8153107166290283 - }, - { - "auxiliary_loss_clip": 0.01101259, - "auxiliary_loss_mlp": 0.00772545, - "balance_loss_clip": 1.0357511, - "balance_loss_mlp": 1.00031376, - "epoch": 0.8747933263189539, - "flos": 24462456854400.0, - "grad_norm": 2.512472286488796, - "language_loss": 0.84052968, - "learning_rate": 1.6213392441608352e-07, - "loss": 0.85926771, - "num_input_tokens_seen": 313736795, - "step": 14550, - "time_per_iteration": 2.6691277027130127 - }, - { - "auxiliary_loss_clip": 0.01100072, - "auxiliary_loss_mlp": 0.01041079, - "balance_loss_clip": 1.03686762, - "balance_loss_mlp": 1.02883005, - "epoch": 0.8748534495716218, - "flos": 13809161917440.0, - "grad_norm": 1.6392278362685582, - "language_loss": 0.71681327, - "learning_rate": 1.6198035028704183e-07, - "loss": 0.7382248, - "num_input_tokens_seen": 313754820, - "step": 14551, - "time_per_iteration": 2.6196999549865723 - }, - { - "auxiliary_loss_clip": 0.01098688, - "auxiliary_loss_mlp": 0.00770542, - "balance_loss_clip": 1.03751254, - "balance_loss_mlp": 1.00018144, - "epoch": 0.8749135728242898, - "flos": 29862092759040.0, - "grad_norm": 5.521178940955395, - "language_loss": 0.64576298, - "learning_rate": 1.6182684585588934e-07, - "loss": 0.66445529, - "num_input_tokens_seen": 313775830, - "step": 14552, - "time_per_iteration": 2.7710392475128174 - }, - { - "auxiliary_loss_clip": 0.01078604, - "auxiliary_loss_mlp": 0.01028395, - "balance_loss_clip": 1.03420365, - "balance_loss_mlp": 1.01439333, - "epoch": 0.8749736960769577, - "flos": 24133479166080.0, - "grad_norm": 5.011357337141667, - "language_loss": 0.79550266, - "learning_rate": 1.616734111284479e-07, - "loss": 0.81657255, - "num_input_tokens_seen": 313795745, - "step": 14553, - "time_per_iteration": 2.7544870376586914 - }, - { - "auxiliary_loss_clip": 0.01093009, - "auxiliary_loss_mlp": 0.01033697, - "balance_loss_clip": 1.03364944, - "balance_loss_mlp": 1.02119756, - "epoch": 0.8750338193296258, - "flos": 17202540602880.0, - "grad_norm": 2.1871328119231337, - "language_loss": 0.70039916, - "learning_rate": 1.6152004611053416e-07, - "loss": 0.72166622, - "num_input_tokens_seen": 313813895, - "step": 14554, - "time_per_iteration": 2.5449023246765137 - }, - { - "auxiliary_loss_clip": 0.01091308, - "auxiliary_loss_mlp": 0.00770366, - "balance_loss_clip": 1.03953791, - "balance_loss_mlp": 1.00012708, - "epoch": 0.8750939425822937, - "flos": 23733218937600.0, - "grad_norm": 1.5371757112217883, - "language_loss": 0.83528662, - "learning_rate": 1.6136675080796457e-07, - "loss": 0.85390329, - "num_input_tokens_seen": 313834225, - "step": 14555, - "time_per_iteration": 2.712270498275757 - }, - { - "auxiliary_loss_clip": 0.01097341, - "auxiliary_loss_mlp": 0.01034286, - "balance_loss_clip": 1.03663278, - "balance_loss_mlp": 1.02133369, - "epoch": 0.8751540658349617, - "flos": 26541684552960.0, - "grad_norm": 1.5869522480564469, - "language_loss": 0.71009433, - "learning_rate": 1.6121352522655252e-07, - "loss": 0.73141062, - "num_input_tokens_seen": 313854430, - "step": 14556, - "time_per_iteration": 2.626359462738037 - }, - { - "auxiliary_loss_clip": 0.01093494, - "auxiliary_loss_mlp": 0.01036101, - "balance_loss_clip": 1.03601527, - "balance_loss_mlp": 1.02195692, - "epoch": 0.8752141890876296, - "flos": 19386806647680.0, - "grad_norm": 1.8472844895882763, - "language_loss": 0.76663041, - "learning_rate": 1.6106036937210732e-07, - "loss": 0.78792638, - "num_input_tokens_seen": 313871600, - "step": 14557, - "time_per_iteration": 2.7687621116638184 - }, - { - "auxiliary_loss_clip": 0.01072231, - "auxiliary_loss_mlp": 0.01039476, - "balance_loss_clip": 1.03658962, - "balance_loss_mlp": 1.02650011, - "epoch": 0.8752743123402976, - "flos": 25374408278400.0, - "grad_norm": 1.8980752716365015, - "language_loss": 0.83232927, - "learning_rate": 1.6090728325043767e-07, - "loss": 0.85344636, - "num_input_tokens_seen": 313891570, - "step": 14558, - "time_per_iteration": 2.7216644287109375 - }, - { - "auxiliary_loss_clip": 0.01027546, - "auxiliary_loss_mlp": 0.01003435, - "balance_loss_clip": 1.00482631, - "balance_loss_mlp": 1.00239205, - "epoch": 0.8753344355929655, - "flos": 59952398578560.0, - "grad_norm": 0.8156616177259552, - "language_loss": 0.56093448, - "learning_rate": 1.6075426686734784e-07, - "loss": 0.58124429, - "num_input_tokens_seen": 313951290, - "step": 14559, - "time_per_iteration": 3.1608095169067383 - }, - { - "auxiliary_loss_clip": 0.01099027, - "auxiliary_loss_mlp": 0.01035007, - "balance_loss_clip": 1.03775668, - "balance_loss_mlp": 1.02299678, - "epoch": 0.8753945588456336, - "flos": 17894646835200.0, - "grad_norm": 2.769429121490499, - "language_loss": 0.66112006, - "learning_rate": 1.606013202286407e-07, - "loss": 0.68246031, - "num_input_tokens_seen": 313968645, - "step": 14560, - "time_per_iteration": 2.62923526763916 - }, - { - "auxiliary_loss_clip": 0.011089, - "auxiliary_loss_mlp": 0.01030691, - "balance_loss_clip": 1.03712916, - "balance_loss_mlp": 1.0187583, - "epoch": 0.8754546820983016, - "flos": 30914885410560.0, - "grad_norm": 3.865819478454591, - "language_loss": 0.78949714, - "learning_rate": 1.6044844334011541e-07, - "loss": 0.810893, - "num_input_tokens_seen": 313987580, - "step": 14561, - "time_per_iteration": 2.6706154346466064 - }, - { - "auxiliary_loss_clip": 0.01109674, - "auxiliary_loss_mlp": 0.01033224, - "balance_loss_clip": 1.03582835, - "balance_loss_mlp": 1.01984262, - "epoch": 0.8755148053509695, - "flos": 20631075724800.0, - "grad_norm": 1.9781083362240712, - "language_loss": 0.77276206, - "learning_rate": 1.6029563620756982e-07, - "loss": 0.79419112, - "num_input_tokens_seen": 314004460, - "step": 14562, - "time_per_iteration": 2.5154237747192383 - }, - { - "auxiliary_loss_clip": 0.01103173, - "auxiliary_loss_mlp": 0.01027848, - "balance_loss_clip": 1.03530455, - "balance_loss_mlp": 1.0163027, - "epoch": 0.8755749286036375, - "flos": 34969739005440.0, - "grad_norm": 1.5352533116826146, - "language_loss": 0.71789098, - "learning_rate": 1.601428988367981e-07, - "loss": 0.73920125, - "num_input_tokens_seen": 314026855, - "step": 14563, - "time_per_iteration": 2.743906021118164 - }, - { - "auxiliary_loss_clip": 0.01114581, - "auxiliary_loss_mlp": 0.01034001, - "balance_loss_clip": 1.04004955, - "balance_loss_mlp": 1.0215075, - "epoch": 0.8756350518563054, - "flos": 18186456925440.0, - "grad_norm": 2.1781284121642304, - "language_loss": 0.65630162, - "learning_rate": 1.5999023123359235e-07, - "loss": 0.67778742, - "num_input_tokens_seen": 314042830, - "step": 14564, - "time_per_iteration": 2.601315498352051 - }, - { - "auxiliary_loss_clip": 0.01095159, - "auxiliary_loss_mlp": 0.01036435, - "balance_loss_clip": 1.03489327, - "balance_loss_mlp": 1.02443063, - "epoch": 0.8756951751089734, - "flos": 20084012611200.0, - "grad_norm": 1.7268939160144312, - "language_loss": 0.7091375, - "learning_rate": 1.598376334037408e-07, - "loss": 0.73045349, - "num_input_tokens_seen": 314062225, - "step": 14565, - "time_per_iteration": 2.67029070854187 - }, - { - "auxiliary_loss_clip": 0.01092949, - "auxiliary_loss_mlp": 0.01036021, - "balance_loss_clip": 1.03708506, - "balance_loss_mlp": 1.02246666, - "epoch": 0.8757552983616413, - "flos": 27525241739520.0, - "grad_norm": 1.5872462776777525, - "language_loss": 0.77823293, - "learning_rate": 1.5968510535303102e-07, - "loss": 0.79952264, - "num_input_tokens_seen": 314082325, - "step": 14566, - "time_per_iteration": 2.728349447250366 - }, - { - "auxiliary_loss_clip": 0.01087655, - "auxiliary_loss_mlp": 0.01031782, - "balance_loss_clip": 1.03929698, - "balance_loss_mlp": 1.01946163, - "epoch": 0.8758154216143094, - "flos": 18073014796800.0, - "grad_norm": 1.606606930203952, - "language_loss": 0.71347201, - "learning_rate": 1.5953264708724624e-07, - "loss": 0.73466635, - "num_input_tokens_seen": 314100310, - "step": 14567, - "time_per_iteration": 2.6560468673706055 - }, - { - "auxiliary_loss_clip": 0.01089483, - "auxiliary_loss_mlp": 0.00770872, - "balance_loss_clip": 1.0368377, - "balance_loss_mlp": 1.00015092, - "epoch": 0.8758755448669773, - "flos": 25045681985280.0, - "grad_norm": 1.924193327132232, - "language_loss": 0.74096954, - "learning_rate": 1.5938025861216776e-07, - "loss": 0.7595731, - "num_input_tokens_seen": 314121330, - "step": 14568, - "time_per_iteration": 2.669600248336792 - }, - { - "auxiliary_loss_clip": 0.0106924, - "auxiliary_loss_mlp": 0.01031283, - "balance_loss_clip": 1.03213978, - "balance_loss_mlp": 1.01898623, - "epoch": 0.8759356681196453, - "flos": 22856818999680.0, - "grad_norm": 2.753044994093851, - "language_loss": 0.86606205, - "learning_rate": 1.5922793993357475e-07, - "loss": 0.88706732, - "num_input_tokens_seen": 314139875, - "step": 14569, - "time_per_iteration": 2.7353930473327637 - }, - { - "auxiliary_loss_clip": 0.01069957, - "auxiliary_loss_mlp": 0.01032444, - "balance_loss_clip": 1.0342617, - "balance_loss_mlp": 1.02065396, - "epoch": 0.8759957913723132, - "flos": 21032521102080.0, - "grad_norm": 1.818630760471602, - "language_loss": 0.74142909, - "learning_rate": 1.5907569105724284e-07, - "loss": 0.76245314, - "num_input_tokens_seen": 314157850, - "step": 14570, - "time_per_iteration": 2.699028253555298 - }, - { - "auxiliary_loss_clip": 0.01100775, - "auxiliary_loss_mlp": 0.00770915, - "balance_loss_clip": 1.03732276, - "balance_loss_mlp": 1.00026119, - "epoch": 0.8760559146249812, - "flos": 20010467514240.0, - "grad_norm": 1.5893457614137378, - "language_loss": 0.67510492, - "learning_rate": 1.5892351198894472e-07, - "loss": 0.69382179, - "num_input_tokens_seen": 314176720, - "step": 14571, - "time_per_iteration": 2.617493152618408 - }, - { - "auxiliary_loss_clip": 0.01069948, - "auxiliary_loss_mlp": 0.01027892, - "balance_loss_clip": 1.03497171, - "balance_loss_mlp": 1.01635253, - "epoch": 0.8761160378776491, - "flos": 19974161842560.0, - "grad_norm": 2.1723550236606486, - "language_loss": 0.62609088, - "learning_rate": 1.5877140273445156e-07, - "loss": 0.64706922, - "num_input_tokens_seen": 314196645, - "step": 14572, - "time_per_iteration": 2.80468487739563 - }, - { - "auxiliary_loss_clip": 0.01095539, - "auxiliary_loss_mlp": 0.01029157, - "balance_loss_clip": 1.03618896, - "balance_loss_mlp": 1.01790953, - "epoch": 0.8761761611303172, - "flos": 28804415857920.0, - "grad_norm": 1.6603874349444352, - "language_loss": 0.73751938, - "learning_rate": 1.5861936329953162e-07, - "loss": 0.75876629, - "num_input_tokens_seen": 314217430, - "step": 14573, - "time_per_iteration": 2.8996636867523193 - }, - { - "auxiliary_loss_clip": 0.01058502, - "auxiliary_loss_mlp": 0.0076881, - "balance_loss_clip": 1.03545105, - "balance_loss_mlp": 1.00015557, - "epoch": 0.8762362843829851, - "flos": 18332505624960.0, - "grad_norm": 1.9024608944750214, - "language_loss": 0.72550857, - "learning_rate": 1.5846739368994966e-07, - "loss": 0.74378169, - "num_input_tokens_seen": 314235310, - "step": 14574, - "time_per_iteration": 2.7545413970947266 - }, - { - "auxiliary_loss_clip": 0.01095926, - "auxiliary_loss_mlp": 0.01036229, - "balance_loss_clip": 1.03621411, - "balance_loss_mlp": 1.02418888, - "epoch": 0.8762964076356531, - "flos": 15779149378560.0, - "grad_norm": 1.8502793872644558, - "language_loss": 0.76065028, - "learning_rate": 1.5831549391146903e-07, - "loss": 0.78197181, - "num_input_tokens_seen": 314252355, - "step": 14575, - "time_per_iteration": 2.5257208347320557 - }, - { - "auxiliary_loss_clip": 0.01081299, - "auxiliary_loss_mlp": 0.01038665, - "balance_loss_clip": 1.03579473, - "balance_loss_mlp": 1.02677417, - "epoch": 0.8763565308883211, - "flos": 33176754789120.0, - "grad_norm": 1.9305081146362895, - "language_loss": 0.66477948, - "learning_rate": 1.5816366396984916e-07, - "loss": 0.68597913, - "num_input_tokens_seen": 314272755, - "step": 14576, - "time_per_iteration": 2.7134413719177246 - }, - { - "auxiliary_loss_clip": 0.01078146, - "auxiliary_loss_mlp": 0.01034504, - "balance_loss_clip": 1.03182101, - "balance_loss_mlp": 1.02249372, - "epoch": 0.876416654140989, - "flos": 15888102307200.0, - "grad_norm": 2.865595040791599, - "language_loss": 0.668244, - "learning_rate": 1.5801190387084806e-07, - "loss": 0.68937051, - "num_input_tokens_seen": 314291365, - "step": 14577, - "time_per_iteration": 5.730209589004517 - }, - { - "auxiliary_loss_clip": 0.01099421, - "auxiliary_loss_mlp": 0.01031703, - "balance_loss_clip": 1.03849435, - "balance_loss_mlp": 1.01906085, - "epoch": 0.876476777393657, - "flos": 25885237547520.0, - "grad_norm": 2.277451139554719, - "language_loss": 0.71319246, - "learning_rate": 1.5786021362021962e-07, - "loss": 0.73450363, - "num_input_tokens_seen": 314310075, - "step": 14578, - "time_per_iteration": 2.6785285472869873 - }, - { - "auxiliary_loss_clip": 0.01110348, - "auxiliary_loss_mlp": 0.01034332, - "balance_loss_clip": 1.03671813, - "balance_loss_mlp": 1.02167737, - "epoch": 0.876536900646325, - "flos": 13589675861760.0, - "grad_norm": 2.477066541201799, - "language_loss": 0.7168777, - "learning_rate": 1.5770859322371676e-07, - "loss": 0.73832452, - "num_input_tokens_seen": 314325695, - "step": 14579, - "time_per_iteration": 4.083740472793579 - }, - { - "auxiliary_loss_clip": 0.01075998, - "auxiliary_loss_mlp": 0.01036896, - "balance_loss_clip": 1.0316453, - "balance_loss_mlp": 1.02358592, - "epoch": 0.876597023898993, - "flos": 12203344494720.0, - "grad_norm": 1.7087182635635378, - "language_loss": 0.70119214, - "learning_rate": 1.5755704268708912e-07, - "loss": 0.72232103, - "num_input_tokens_seen": 314343605, - "step": 14580, - "time_per_iteration": 4.30855393409729 - }, - { - "auxiliary_loss_clip": 0.01105953, - "auxiliary_loss_mlp": 0.00769599, - "balance_loss_clip": 1.03692436, - "balance_loss_mlp": 1.00017405, - "epoch": 0.8766571471516609, - "flos": 25336773803520.0, - "grad_norm": 1.6590992493321417, - "language_loss": 0.65825737, - "learning_rate": 1.5740556201608256e-07, - "loss": 0.67701292, - "num_input_tokens_seen": 314364275, - "step": 14581, - "time_per_iteration": 2.6293153762817383 - }, - { - "auxiliary_loss_clip": 0.0108123, - "auxiliary_loss_mlp": 0.01033696, - "balance_loss_clip": 1.03592646, - "balance_loss_mlp": 1.0222156, - "epoch": 0.8767172704043289, - "flos": 30113287545600.0, - "grad_norm": 1.6719712227937835, - "language_loss": 0.7391513, - "learning_rate": 1.572541512164416e-07, - "loss": 0.76030058, - "num_input_tokens_seen": 314385140, - "step": 14582, - "time_per_iteration": 2.8180127143859863 - }, - { - "auxiliary_loss_clip": 0.01106807, - "auxiliary_loss_mlp": 0.00770216, - "balance_loss_clip": 1.03510261, - "balance_loss_mlp": 1.00013459, - "epoch": 0.8767773936569968, - "flos": 19281157770240.0, - "grad_norm": 1.8898145602887721, - "language_loss": 0.66737789, - "learning_rate": 1.5710281029390826e-07, - "loss": 0.68614811, - "num_input_tokens_seen": 314403715, - "step": 14583, - "time_per_iteration": 2.68875789642334 - }, - { - "auxiliary_loss_clip": 0.011013, - "auxiliary_loss_mlp": 0.00770347, - "balance_loss_clip": 1.03735173, - "balance_loss_mlp": 1.00024498, - "epoch": 0.8768375169096648, - "flos": 21247230648960.0, - "grad_norm": 1.7289325254896564, - "language_loss": 0.7945081, - "learning_rate": 1.5695153925422067e-07, - "loss": 0.81322455, - "num_input_tokens_seen": 314421880, - "step": 14584, - "time_per_iteration": 2.6574294567108154 - }, - { - "auxiliary_loss_clip": 0.01078304, - "auxiliary_loss_mlp": 0.010306, - "balance_loss_clip": 1.03573895, - "balance_loss_mlp": 1.01824355, - "epoch": 0.8768976401623327, - "flos": 23295539715840.0, - "grad_norm": 2.4589506169652147, - "language_loss": 0.72250307, - "learning_rate": 1.5680033810311555e-07, - "loss": 0.74359208, - "num_input_tokens_seen": 314441585, - "step": 14585, - "time_per_iteration": 2.755363702774048 - }, - { - "auxiliary_loss_clip": 0.01087385, - "auxiliary_loss_mlp": 0.01030085, - "balance_loss_clip": 1.03488159, - "balance_loss_mlp": 1.01720476, - "epoch": 0.8769577634150008, - "flos": 21361247395200.0, - "grad_norm": 1.8198293013572575, - "language_loss": 0.74285269, - "learning_rate": 1.5664920684632654e-07, - "loss": 0.76402736, - "num_input_tokens_seen": 314459020, - "step": 14586, - "time_per_iteration": 2.7154970169067383 - }, - { - "auxiliary_loss_clip": 0.01107064, - "auxiliary_loss_mlp": 0.01030626, - "balance_loss_clip": 1.0354507, - "balance_loss_mlp": 1.0183115, - "epoch": 0.8770178866676687, - "flos": 23514056104320.0, - "grad_norm": 1.7048029370407318, - "language_loss": 0.78917754, - "learning_rate": 1.564981454895844e-07, - "loss": 0.81055439, - "num_input_tokens_seen": 314478935, - "step": 14587, - "time_per_iteration": 2.659623384475708 - }, - { - "auxiliary_loss_clip": 0.010977, - "auxiliary_loss_mlp": 0.01033091, - "balance_loss_clip": 1.0367806, - "balance_loss_mlp": 1.01905441, - "epoch": 0.8770780099203367, - "flos": 19719052473600.0, - "grad_norm": 1.5723021986517474, - "language_loss": 0.73480511, - "learning_rate": 1.5634715403861697e-07, - "loss": 0.75611293, - "num_input_tokens_seen": 314497635, - "step": 14588, - "time_per_iteration": 2.6490304470062256 - }, - { - "auxiliary_loss_clip": 0.01042159, - "auxiliary_loss_mlp": 0.0077056, - "balance_loss_clip": 1.03166127, - "balance_loss_mlp": 1.00015152, - "epoch": 0.8771381331730047, - "flos": 21395901041280.0, - "grad_norm": 1.8014093436247518, - "language_loss": 0.66976607, - "learning_rate": 1.5619623249915016e-07, - "loss": 0.68789327, - "num_input_tokens_seen": 314515445, - "step": 14589, - "time_per_iteration": 2.7724153995513916 - }, - { - "auxiliary_loss_clip": 0.01098134, - "auxiliary_loss_mlp": 0.0103223, - "balance_loss_clip": 1.03780675, - "balance_loss_mlp": 1.01989174, - "epoch": 0.8771982564256726, - "flos": 20261770041600.0, - "grad_norm": 2.6258083956029776, - "language_loss": 0.70362616, - "learning_rate": 1.5604538087690732e-07, - "loss": 0.72492981, - "num_input_tokens_seen": 314533040, - "step": 14590, - "time_per_iteration": 2.6688060760498047 - }, - { - "auxiliary_loss_clip": 0.01085853, - "auxiliary_loss_mlp": 0.01041073, - "balance_loss_clip": 1.03592718, - "balance_loss_mlp": 1.0268271, - "epoch": 0.8772583796783406, - "flos": 12489372495360.0, - "grad_norm": 2.102125056445036, - "language_loss": 0.74291348, - "learning_rate": 1.558945991776086e-07, - "loss": 0.76418269, - "num_input_tokens_seen": 314548280, - "step": 14591, - "time_per_iteration": 2.644625425338745 - }, - { - "auxiliary_loss_clip": 0.01104605, - "auxiliary_loss_mlp": 0.01027114, - "balance_loss_clip": 1.03682637, - "balance_loss_mlp": 1.01522839, - "epoch": 0.8773185029310085, - "flos": 15921103927680.0, - "grad_norm": 1.6170050772781672, - "language_loss": 0.79845113, - "learning_rate": 1.5574388740697096e-07, - "loss": 0.81976831, - "num_input_tokens_seen": 314565345, - "step": 14592, - "time_per_iteration": 2.604241132736206 - }, - { - "auxiliary_loss_clip": 0.01106487, - "auxiliary_loss_mlp": 0.0103183, - "balance_loss_clip": 1.03708172, - "balance_loss_mlp": 1.02006376, - "epoch": 0.8773786261836766, - "flos": 21504530747520.0, - "grad_norm": 1.5930198030485112, - "language_loss": 0.82747221, - "learning_rate": 1.5559324557071052e-07, - "loss": 0.84885532, - "num_input_tokens_seen": 314584190, - "step": 14593, - "time_per_iteration": 2.5794694423675537 - }, - { - "auxiliary_loss_clip": 0.0109194, - "auxiliary_loss_mlp": 0.01028175, - "balance_loss_clip": 1.03585052, - "balance_loss_mlp": 1.0158962, - "epoch": 0.8774387494363445, - "flos": 26761493831040.0, - "grad_norm": 1.3623421288990831, - "language_loss": 0.76057625, - "learning_rate": 1.5544267367453845e-07, - "loss": 0.78177738, - "num_input_tokens_seen": 314605625, - "step": 14594, - "time_per_iteration": 2.66890025138855 - }, - { - "auxiliary_loss_clip": 0.01057614, - "auxiliary_loss_mlp": 0.01040841, - "balance_loss_clip": 1.03001809, - "balance_loss_mlp": 1.02620232, - "epoch": 0.8774988726890125, - "flos": 18478841633280.0, - "grad_norm": 2.047444074315711, - "language_loss": 0.77807617, - "learning_rate": 1.552921717241651e-07, - "loss": 0.7990607, - "num_input_tokens_seen": 314622630, - "step": 14595, - "time_per_iteration": 2.8318984508514404 - }, - { - "auxiliary_loss_clip": 0.01075529, - "auxiliary_loss_mlp": 0.01033164, - "balance_loss_clip": 1.03548956, - "balance_loss_mlp": 1.02087939, - "epoch": 0.8775589959416804, - "flos": 24426366664320.0, - "grad_norm": 1.536994143649266, - "language_loss": 0.70930111, - "learning_rate": 1.5514173972529743e-07, - "loss": 0.7303881, - "num_input_tokens_seen": 314642460, - "step": 14596, - "time_per_iteration": 2.7869088649749756 - }, - { - "auxiliary_loss_clip": 0.01074468, - "auxiliary_loss_mlp": 0.01024808, - "balance_loss_clip": 1.03594506, - "balance_loss_mlp": 1.01340532, - "epoch": 0.8776191191943484, - "flos": 23440151871360.0, - "grad_norm": 1.7123074266942537, - "language_loss": 0.85920203, - "learning_rate": 1.5499137768364067e-07, - "loss": 0.88019478, - "num_input_tokens_seen": 314659875, - "step": 14597, - "time_per_iteration": 2.741469383239746 - }, - { - "auxiliary_loss_clip": 0.01095944, - "auxiliary_loss_mlp": 0.01030022, - "balance_loss_clip": 1.0365026, - "balance_loss_mlp": 1.01824403, - "epoch": 0.8776792424470163, - "flos": 26830872950400.0, - "grad_norm": 1.6418502548107807, - "language_loss": 0.72893673, - "learning_rate": 1.5484108560489494e-07, - "loss": 0.7501964, - "num_input_tokens_seen": 314680260, - "step": 14598, - "time_per_iteration": 2.679743766784668 - }, - { - "auxiliary_loss_clip": 0.01093166, - "auxiliary_loss_mlp": 0.00771018, - "balance_loss_clip": 1.03571749, - "balance_loss_mlp": 1.00025177, - "epoch": 0.8777393656996844, - "flos": 15626169354240.0, - "grad_norm": 2.1344366739736915, - "language_loss": 0.77418303, - "learning_rate": 1.5469086349476036e-07, - "loss": 0.79282486, - "num_input_tokens_seen": 314696260, - "step": 14599, - "time_per_iteration": 2.645653486251831 - }, - { - "auxiliary_loss_clip": 0.01077971, - "auxiliary_loss_mlp": 0.01030468, - "balance_loss_clip": 1.03576493, - "balance_loss_mlp": 1.0187732, - "epoch": 0.8777994889523523, - "flos": 18879999701760.0, - "grad_norm": 2.045708434317711, - "language_loss": 0.67680991, - "learning_rate": 1.545407113589332e-07, - "loss": 0.69789433, - "num_input_tokens_seen": 314714215, - "step": 14600, - "time_per_iteration": 2.67521333694458 - }, - { - "auxiliary_loss_clip": 0.01098236, - "auxiliary_loss_mlp": 0.01040238, - "balance_loss_clip": 1.03573418, - "balance_loss_mlp": 1.02782202, - "epoch": 0.8778596122050203, - "flos": 48826516400640.0, - "grad_norm": 1.696137650912348, - "language_loss": 0.69482052, - "learning_rate": 1.543906292031072e-07, - "loss": 0.71620524, - "num_input_tokens_seen": 314735700, - "step": 14601, - "time_per_iteration": 2.852067708969116 - }, - { - "auxiliary_loss_clip": 0.01102467, - "auxiliary_loss_mlp": 0.01033846, - "balance_loss_clip": 1.03806257, - "balance_loss_mlp": 1.0211978, - "epoch": 0.8779197354576883, - "flos": 25660184883840.0, - "grad_norm": 1.8150997518302137, - "language_loss": 0.72907132, - "learning_rate": 1.542406170329733e-07, - "loss": 0.75043446, - "num_input_tokens_seen": 314753335, - "step": 14602, - "time_per_iteration": 2.666530132293701 - }, - { - "auxiliary_loss_clip": 0.01106598, - "auxiliary_loss_mlp": 0.01033593, - "balance_loss_clip": 1.03583145, - "balance_loss_mlp": 1.02214909, - "epoch": 0.8779798587103562, - "flos": 18843227153280.0, - "grad_norm": 2.0286896900141103, - "language_loss": 0.70824677, - "learning_rate": 1.5409067485422056e-07, - "loss": 0.72964865, - "num_input_tokens_seen": 314770800, - "step": 14603, - "time_per_iteration": 2.6004815101623535 - }, - { - "auxiliary_loss_clip": 0.01011292, - "auxiliary_loss_mlp": 0.01001925, - "balance_loss_clip": 1.00817752, - "balance_loss_mlp": 1.00094128, - "epoch": 0.8780399819630242, - "flos": 68613119377920.0, - "grad_norm": 0.7394752492120941, - "language_loss": 0.54153609, - "learning_rate": 1.539408026725344e-07, - "loss": 0.56166828, - "num_input_tokens_seen": 314837275, - "step": 14604, - "time_per_iteration": 3.240145683288574 - }, - { - "auxiliary_loss_clip": 0.01001285, - "auxiliary_loss_mlp": 0.01016546, - "balance_loss_clip": 1.00654078, - "balance_loss_mlp": 1.01528251, - "epoch": 0.8781001052156922, - "flos": 65734807766400.0, - "grad_norm": 0.7095982216693757, - "language_loss": 0.59140944, - "learning_rate": 1.537910004935976e-07, - "loss": 0.61158776, - "num_input_tokens_seen": 314902220, - "step": 14605, - "time_per_iteration": 3.193176507949829 - }, - { - "auxiliary_loss_clip": 0.01068364, - "auxiliary_loss_mlp": 0.01034464, - "balance_loss_clip": 1.03649735, - "balance_loss_mlp": 1.02195311, - "epoch": 0.8781602284683602, - "flos": 22049654526720.0, - "grad_norm": 1.640207436482767, - "language_loss": 0.85104489, - "learning_rate": 1.536412683230912e-07, - "loss": 0.87207323, - "num_input_tokens_seen": 314921645, - "step": 14606, - "time_per_iteration": 2.7456490993499756 - }, - { - "auxiliary_loss_clip": 0.01111634, - "auxiliary_loss_mlp": 0.01031457, - "balance_loss_clip": 1.03835487, - "balance_loss_mlp": 1.01814675, - "epoch": 0.8782203517210281, - "flos": 17562939713280.0, - "grad_norm": 2.106006869176157, - "language_loss": 0.70568335, - "learning_rate": 1.534916061666931e-07, - "loss": 0.72711432, - "num_input_tokens_seen": 314939390, - "step": 14607, - "time_per_iteration": 2.5896804332733154 - }, - { - "auxiliary_loss_clip": 0.01086458, - "auxiliary_loss_mlp": 0.01041468, - "balance_loss_clip": 1.03582692, - "balance_loss_mlp": 1.03008974, - "epoch": 0.8782804749736961, - "flos": 25520421064320.0, - "grad_norm": 1.8237368142749963, - "language_loss": 0.72306776, - "learning_rate": 1.533420140300785e-07, - "loss": 0.74434698, - "num_input_tokens_seen": 314959205, - "step": 14608, - "time_per_iteration": 2.741672992706299 - }, - { - "auxiliary_loss_clip": 0.01099239, - "auxiliary_loss_mlp": 0.01037651, - "balance_loss_clip": 1.03510618, - "balance_loss_mlp": 1.0248003, - "epoch": 0.878340598226364, - "flos": 21798747048960.0, - "grad_norm": 1.955140517106729, - "language_loss": 0.87650675, - "learning_rate": 1.5319249191891936e-07, - "loss": 0.89787567, - "num_input_tokens_seen": 314977485, - "step": 14609, - "time_per_iteration": 2.7044589519500732 - }, - { - "auxiliary_loss_clip": 0.01064019, - "auxiliary_loss_mlp": 0.01031489, - "balance_loss_clip": 1.03733373, - "balance_loss_mlp": 1.01938868, - "epoch": 0.878400721479032, - "flos": 21102403011840.0, - "grad_norm": 1.6056637569887062, - "language_loss": 0.70521188, - "learning_rate": 1.5304303983888643e-07, - "loss": 0.72616696, - "num_input_tokens_seen": 314997830, - "step": 14610, - "time_per_iteration": 2.803408145904541 - }, - { - "auxiliary_loss_clip": 0.01090443, - "auxiliary_loss_mlp": 0.00770344, - "balance_loss_clip": 1.03708553, - "balance_loss_mlp": 1.0002346, - "epoch": 0.8784608447316999, - "flos": 20923532259840.0, - "grad_norm": 5.657745869325684, - "language_loss": 0.80772901, - "learning_rate": 1.5289365779564612e-07, - "loss": 0.82633686, - "num_input_tokens_seen": 315016480, - "step": 14611, - "time_per_iteration": 2.660065174102783 - }, - { - "auxiliary_loss_clip": 0.01108968, - "auxiliary_loss_mlp": 0.01032438, - "balance_loss_clip": 1.03722143, - "balance_loss_mlp": 1.01999247, - "epoch": 0.878520967984368, - "flos": 23330660238720.0, - "grad_norm": 1.533059433053689, - "language_loss": 0.76187742, - "learning_rate": 1.5274434579486338e-07, - "loss": 0.78329152, - "num_input_tokens_seen": 315036135, - "step": 14612, - "time_per_iteration": 2.6014697551727295 - }, - { - "auxiliary_loss_clip": 0.01056207, - "auxiliary_loss_mlp": 0.010328, - "balance_loss_clip": 1.03447223, - "balance_loss_mlp": 1.02104592, - "epoch": 0.8785810912370359, - "flos": 25518984520320.0, - "grad_norm": 1.467098748610033, - "language_loss": 0.72364843, - "learning_rate": 1.525951038422002e-07, - "loss": 0.74453855, - "num_input_tokens_seen": 315057995, - "step": 14613, - "time_per_iteration": 2.865140676498413 - }, - { - "auxiliary_loss_clip": 0.0100752, - "auxiliary_loss_mlp": 0.01000964, - "balance_loss_clip": 1.01305175, - "balance_loss_mlp": 0.9998787, - "epoch": 0.8786412144897039, - "flos": 61841047691520.0, - "grad_norm": 1.0274738596365884, - "language_loss": 0.64512694, - "learning_rate": 1.5244593194331667e-07, - "loss": 0.6652118, - "num_input_tokens_seen": 315104010, - "step": 14614, - "time_per_iteration": 3.0442123413085938 - }, - { - "auxiliary_loss_clip": 0.01027601, - "auxiliary_loss_mlp": 0.01004471, - "balance_loss_clip": 1.00515628, - "balance_loss_mlp": 1.00352311, - "epoch": 0.8787013377423719, - "flos": 70989364638720.0, - "grad_norm": 0.6570239019291962, - "language_loss": 0.58545709, - "learning_rate": 1.5229683010386762e-07, - "loss": 0.6057778, - "num_input_tokens_seen": 315174550, - "step": 14615, - "time_per_iteration": 3.2077083587646484 - }, - { - "auxiliary_loss_clip": 0.01059951, - "auxiliary_loss_mlp": 0.01028954, - "balance_loss_clip": 1.03297782, - "balance_loss_mlp": 1.01675916, - "epoch": 0.8787614609950398, - "flos": 17347404153600.0, - "grad_norm": 3.0128650645072503, - "language_loss": 0.72307193, - "learning_rate": 1.5214779832950807e-07, - "loss": 0.74396092, - "num_input_tokens_seen": 315191825, - "step": 14616, - "time_per_iteration": 4.491776704788208 - }, - { - "auxiliary_loss_clip": 0.01028184, - "auxiliary_loss_mlp": 0.01002892, - "balance_loss_clip": 1.00566876, - "balance_loss_mlp": 1.00189614, - "epoch": 0.8788215842477078, - "flos": 72511401588480.0, - "grad_norm": 0.8039335760257915, - "language_loss": 0.5797807, - "learning_rate": 1.5199883662588953e-07, - "loss": 0.60009146, - "num_input_tokens_seen": 315255075, - "step": 14617, - "time_per_iteration": 3.238430976867676 - }, - { - "auxiliary_loss_clip": 0.01081125, - "auxiliary_loss_mlp": 0.01037319, - "balance_loss_clip": 1.03397489, - "balance_loss_mlp": 1.02404451, - "epoch": 0.8788817075003758, - "flos": 24827452905600.0, - "grad_norm": 1.7430695626814152, - "language_loss": 0.83371663, - "learning_rate": 1.5184994499865987e-07, - "loss": 0.85490113, - "num_input_tokens_seen": 315273995, - "step": 14618, - "time_per_iteration": 2.6718552112579346 - }, - { - "auxiliary_loss_clip": 0.01081904, - "auxiliary_loss_mlp": 0.01028474, - "balance_loss_clip": 1.03612018, - "balance_loss_mlp": 1.0165534, - "epoch": 0.8789418307530438, - "flos": 22638769488000.0, - "grad_norm": 1.5514378708700263, - "language_loss": 0.69016528, - "learning_rate": 1.5170112345346598e-07, - "loss": 0.71126908, - "num_input_tokens_seen": 315294485, - "step": 14619, - "time_per_iteration": 5.7080559730529785 - }, - { - "auxiliary_loss_clip": 0.01067003, - "auxiliary_loss_mlp": 0.01037225, - "balance_loss_clip": 1.03445745, - "balance_loss_mlp": 1.02513099, - "epoch": 0.8790019540057117, - "flos": 19785738072960.0, - "grad_norm": 3.788287535500063, - "language_loss": 0.77142107, - "learning_rate": 1.5155237199595016e-07, - "loss": 0.79246336, - "num_input_tokens_seen": 315310420, - "step": 14620, - "time_per_iteration": 2.7002434730529785 - }, - { - "auxiliary_loss_clip": 0.0108692, - "auxiliary_loss_mlp": 0.01031499, - "balance_loss_clip": 1.03867853, - "balance_loss_mlp": 1.01823735, - "epoch": 0.8790620772583797, - "flos": 20229774001920.0, - "grad_norm": 1.6265722719383797, - "language_loss": 0.79121077, - "learning_rate": 1.514036906317542e-07, - "loss": 0.81239492, - "num_input_tokens_seen": 315330110, - "step": 14621, - "time_per_iteration": 2.706190824508667 - }, - { - "auxiliary_loss_clip": 0.01088315, - "auxiliary_loss_mlp": 0.01033697, - "balance_loss_clip": 1.03491962, - "balance_loss_mlp": 1.02115011, - "epoch": 0.8791222005110476, - "flos": 24130785646080.0, - "grad_norm": 1.9310015183922709, - "language_loss": 0.66529369, - "learning_rate": 1.5125507936651506e-07, - "loss": 0.68651378, - "num_input_tokens_seen": 315350080, - "step": 14622, - "time_per_iteration": 2.7165491580963135 - }, - { - "auxiliary_loss_clip": 0.01082524, - "auxiliary_loss_mlp": 0.01036818, - "balance_loss_clip": 1.03749692, - "balance_loss_mlp": 1.02424717, - "epoch": 0.8791823237637156, - "flos": 21614201948160.0, - "grad_norm": 1.9313985868431403, - "language_loss": 0.72802383, - "learning_rate": 1.511065382058687e-07, - "loss": 0.74921727, - "num_input_tokens_seen": 315366360, - "step": 14623, - "time_per_iteration": 2.747246026992798 - }, - { - "auxiliary_loss_clip": 0.01055452, - "auxiliary_loss_mlp": 0.01032326, - "balance_loss_clip": 1.03043795, - "balance_loss_mlp": 1.02029753, - "epoch": 0.8792424470163835, - "flos": 24243401761920.0, - "grad_norm": 1.9152762624748565, - "language_loss": 0.78623891, - "learning_rate": 1.5095806715544801e-07, - "loss": 0.80711675, - "num_input_tokens_seen": 315385890, - "step": 14624, - "time_per_iteration": 2.8343048095703125 - }, - { - "auxiliary_loss_clip": 0.01097982, - "auxiliary_loss_mlp": 0.01037468, - "balance_loss_clip": 1.03468323, - "balance_loss_mlp": 1.02431333, - "epoch": 0.8793025702690516, - "flos": 24893204751360.0, - "grad_norm": 1.7619469810650616, - "language_loss": 0.79745495, - "learning_rate": 1.5080966622088265e-07, - "loss": 0.81880945, - "num_input_tokens_seen": 315403400, - "step": 14625, - "time_per_iteration": 2.660099983215332 - }, - { - "auxiliary_loss_clip": 0.01083648, - "auxiliary_loss_mlp": 0.01039128, - "balance_loss_clip": 1.03540492, - "balance_loss_mlp": 1.02714157, - "epoch": 0.8793626935217195, - "flos": 25373115388800.0, - "grad_norm": 1.6785159518142898, - "language_loss": 0.74372435, - "learning_rate": 1.5066133540779967e-07, - "loss": 0.76495212, - "num_input_tokens_seen": 315423670, - "step": 14626, - "time_per_iteration": 2.676588535308838 - }, - { - "auxiliary_loss_clip": 0.01098546, - "auxiliary_loss_mlp": 0.01032578, - "balance_loss_clip": 1.03614759, - "balance_loss_mlp": 1.019876, - "epoch": 0.8794228167743875, - "flos": 34678000742400.0, - "grad_norm": 3.563179851520993, - "language_loss": 0.71319157, - "learning_rate": 1.505130747218246e-07, - "loss": 0.73450279, - "num_input_tokens_seen": 315446265, - "step": 14627, - "time_per_iteration": 2.7192656993865967 - }, - { - "auxiliary_loss_clip": 0.01081037, - "auxiliary_loss_mlp": 0.01032231, - "balance_loss_clip": 1.04066432, - "balance_loss_mlp": 1.01920116, - "epoch": 0.8794829400270555, - "flos": 19464014931840.0, - "grad_norm": 1.8203006608438008, - "language_loss": 0.72041732, - "learning_rate": 1.5036488416857873e-07, - "loss": 0.74155003, - "num_input_tokens_seen": 315464655, - "step": 14628, - "time_per_iteration": 2.6673803329467773 - }, - { - "auxiliary_loss_clip": 0.01077339, - "auxiliary_loss_mlp": 0.01034101, - "balance_loss_clip": 1.03383875, - "balance_loss_mlp": 1.02086902, - "epoch": 0.8795430632797234, - "flos": 15231403906560.0, - "grad_norm": 2.5577360809446312, - "language_loss": 0.69041932, - "learning_rate": 1.5021676375368175e-07, - "loss": 0.71153378, - "num_input_tokens_seen": 315481090, - "step": 14629, - "time_per_iteration": 2.6587491035461426 - }, - { - "auxiliary_loss_clip": 0.01082842, - "auxiliary_loss_mlp": 0.0103309, - "balance_loss_clip": 1.03334451, - "balance_loss_mlp": 1.02162218, - "epoch": 0.8796031865323914, - "flos": 27744727795200.0, - "grad_norm": 1.5244181147754692, - "language_loss": 0.68586159, - "learning_rate": 1.5006871348275053e-07, - "loss": 0.70702088, - "num_input_tokens_seen": 315502010, - "step": 14630, - "time_per_iteration": 2.6706295013427734 - }, - { - "auxiliary_loss_clip": 0.01081928, - "auxiliary_loss_mlp": 0.01033864, - "balance_loss_clip": 1.03443193, - "balance_loss_mlp": 1.02096558, - "epoch": 0.8796633097850594, - "flos": 31285412156160.0, - "grad_norm": 1.7384017818460198, - "language_loss": 0.74517637, - "learning_rate": 1.499207333613999e-07, - "loss": 0.7663343, - "num_input_tokens_seen": 315523040, - "step": 14631, - "time_per_iteration": 2.7020559310913086 - }, - { - "auxiliary_loss_clip": 0.01085004, - "auxiliary_loss_mlp": 0.00769583, - "balance_loss_clip": 1.03570437, - "balance_loss_mlp": 1.00020719, - "epoch": 0.8797234330377274, - "flos": 24243150366720.0, - "grad_norm": 2.2960657953969434, - "language_loss": 0.69393373, - "learning_rate": 1.4977282339523954e-07, - "loss": 0.71247965, - "num_input_tokens_seen": 315541865, - "step": 14632, - "time_per_iteration": 2.75093674659729 - }, - { - "auxiliary_loss_clip": 0.01087331, - "auxiliary_loss_mlp": 0.01027739, - "balance_loss_clip": 1.03704596, - "balance_loss_mlp": 1.01637244, - "epoch": 0.8797835562903953, - "flos": 24167414540160.0, - "grad_norm": 1.8690741277115708, - "language_loss": 0.65338004, - "learning_rate": 1.4962498358987929e-07, - "loss": 0.67453068, - "num_input_tokens_seen": 315561470, - "step": 14633, - "time_per_iteration": 2.6868348121643066 - }, - { - "auxiliary_loss_clip": 0.01075776, - "auxiliary_loss_mlp": 0.01034988, - "balance_loss_clip": 1.03406906, - "balance_loss_mlp": 1.0226382, - "epoch": 0.8798436795430633, - "flos": 19284677303040.0, - "grad_norm": 1.4189442310597726, - "language_loss": 0.84372133, - "learning_rate": 1.4947721395092528e-07, - "loss": 0.864829, - "num_input_tokens_seen": 315583140, - "step": 14634, - "time_per_iteration": 2.711578845977783 - }, - { - "auxiliary_loss_clip": 0.01085532, - "auxiliary_loss_mlp": 0.00770557, - "balance_loss_clip": 1.03607786, - "balance_loss_mlp": 1.00022292, - "epoch": 0.8799038027957312, - "flos": 28179390274560.0, - "grad_norm": 1.6380725692975024, - "language_loss": 0.79907227, - "learning_rate": 1.4932951448398056e-07, - "loss": 0.81763315, - "num_input_tokens_seen": 315601935, - "step": 14635, - "time_per_iteration": 2.7726967334747314 - }, - { - "auxiliary_loss_clip": 0.01081031, - "auxiliary_loss_mlp": 0.01025598, - "balance_loss_clip": 1.03709126, - "balance_loss_mlp": 1.01331937, - "epoch": 0.8799639260483992, - "flos": 24644703484800.0, - "grad_norm": 1.9658310023555117, - "language_loss": 0.65064734, - "learning_rate": 1.4918188519464648e-07, - "loss": 0.67171359, - "num_input_tokens_seen": 315619995, - "step": 14636, - "time_per_iteration": 2.686582565307617 - }, - { - "auxiliary_loss_clip": 0.01082702, - "auxiliary_loss_mlp": 0.01038782, - "balance_loss_clip": 1.03411579, - "balance_loss_mlp": 1.02539492, - "epoch": 0.8800240493010671, - "flos": 22200479735040.0, - "grad_norm": 1.4477537955972881, - "language_loss": 0.70313036, - "learning_rate": 1.4903432608852074e-07, - "loss": 0.72434527, - "num_input_tokens_seen": 315637895, - "step": 14637, - "time_per_iteration": 2.6938488483428955 - }, - { - "auxiliary_loss_clip": 0.01087054, - "auxiliary_loss_mlp": 0.01029647, - "balance_loss_clip": 1.03786731, - "balance_loss_mlp": 1.01791048, - "epoch": 0.8800841725537352, - "flos": 14246086953600.0, - "grad_norm": 2.359329981837555, - "language_loss": 0.66048372, - "learning_rate": 1.4888683717119843e-07, - "loss": 0.6816507, - "num_input_tokens_seen": 315655520, - "step": 14638, - "time_per_iteration": 2.633389472961426 - }, - { - "auxiliary_loss_clip": 0.01097569, - "auxiliary_loss_mlp": 0.01029703, - "balance_loss_clip": 1.03652537, - "balance_loss_mlp": 1.01738858, - "epoch": 0.8801442958064031, - "flos": 37415794348800.0, - "grad_norm": 2.0860441545932247, - "language_loss": 0.57805324, - "learning_rate": 1.4873941844827286e-07, - "loss": 0.59932595, - "num_input_tokens_seen": 315678955, - "step": 14639, - "time_per_iteration": 2.762080669403076 - }, - { - "auxiliary_loss_clip": 0.01081797, - "auxiliary_loss_mlp": 0.0103561, - "balance_loss_clip": 1.03559208, - "balance_loss_mlp": 1.0227828, - "epoch": 0.8802044190590711, - "flos": 25047334010880.0, - "grad_norm": 1.6274947606267138, - "language_loss": 0.74253106, - "learning_rate": 1.4859206992533402e-07, - "loss": 0.76370513, - "num_input_tokens_seen": 315700360, - "step": 14640, - "time_per_iteration": 2.6815481185913086 - }, - { - "auxiliary_loss_clip": 0.010844, - "auxiliary_loss_mlp": 0.01043439, - "balance_loss_clip": 1.03346467, - "balance_loss_mlp": 1.03030789, - "epoch": 0.8802645423117391, - "flos": 24133874215680.0, - "grad_norm": 2.333395940952266, - "language_loss": 0.69967985, - "learning_rate": 1.4844479160796985e-07, - "loss": 0.72095823, - "num_input_tokens_seen": 315719270, - "step": 14641, - "time_per_iteration": 2.749075174331665 - }, - { - "auxiliary_loss_clip": 0.01095024, - "auxiliary_loss_mlp": 0.01030792, - "balance_loss_clip": 1.03647685, - "balance_loss_mlp": 1.01772094, - "epoch": 0.880324665564407, - "flos": 17931203902080.0, - "grad_norm": 2.1703882572052837, - "language_loss": 0.84749234, - "learning_rate": 1.4829758350176457e-07, - "loss": 0.86875057, - "num_input_tokens_seen": 315737425, - "step": 14642, - "time_per_iteration": 2.5922858715057373 - }, - { - "auxiliary_loss_clip": 0.0107106, - "auxiliary_loss_mlp": 0.01034269, - "balance_loss_clip": 1.04185271, - "balance_loss_mlp": 1.02141285, - "epoch": 0.880384788817075, - "flos": 21287630471040.0, - "grad_norm": 1.7146284056948287, - "language_loss": 0.78968871, - "learning_rate": 1.4815044561230038e-07, - "loss": 0.81074202, - "num_input_tokens_seen": 315755725, - "step": 14643, - "time_per_iteration": 2.7133426666259766 - }, - { - "auxiliary_loss_clip": 0.01091961, - "auxiliary_loss_mlp": 0.010299, - "balance_loss_clip": 1.03380251, - "balance_loss_mlp": 1.01829529, - "epoch": 0.880444912069743, - "flos": 12458489777280.0, - "grad_norm": 1.637601444546806, - "language_loss": 0.72898597, - "learning_rate": 1.4800337794515705e-07, - "loss": 0.75020456, - "num_input_tokens_seen": 315773835, - "step": 14644, - "time_per_iteration": 2.644477367401123 - }, - { - "auxiliary_loss_clip": 0.01111824, - "auxiliary_loss_mlp": 0.00770434, - "balance_loss_clip": 1.03767347, - "balance_loss_mlp": 1.00029421, - "epoch": 0.880505035322411, - "flos": 13625945619840.0, - "grad_norm": 1.899004626318215, - "language_loss": 0.79560626, - "learning_rate": 1.47856380505911e-07, - "loss": 0.81442887, - "num_input_tokens_seen": 315790615, - "step": 14645, - "time_per_iteration": 2.5354764461517334 - }, - { - "auxiliary_loss_clip": 0.01092346, - "auxiliary_loss_mlp": 0.01037836, - "balance_loss_clip": 1.03347158, - "balance_loss_mlp": 1.02530098, - "epoch": 0.8805651585750789, - "flos": 23183067254400.0, - "grad_norm": 1.7052158673760782, - "language_loss": 0.64392948, - "learning_rate": 1.477094533001364e-07, - "loss": 0.66523129, - "num_input_tokens_seen": 315811010, - "step": 14646, - "time_per_iteration": 2.579423427581787 - }, - { - "auxiliary_loss_clip": 0.01080209, - "auxiliary_loss_mlp": 0.01036788, - "balance_loss_clip": 1.03776836, - "balance_loss_mlp": 1.02298915, - "epoch": 0.8806252818277469, - "flos": 14903000835840.0, - "grad_norm": 2.7067451127953874, - "language_loss": 0.77432781, - "learning_rate": 1.475625963334055e-07, - "loss": 0.79549778, - "num_input_tokens_seen": 315828130, - "step": 14647, - "time_per_iteration": 2.6500446796417236 - }, - { - "auxiliary_loss_clip": 0.01106216, - "auxiliary_loss_mlp": 0.01031301, - "balance_loss_clip": 1.03662324, - "balance_loss_mlp": 1.01965976, - "epoch": 0.8806854050804148, - "flos": 17639178330240.0, - "grad_norm": 2.4139145058404976, - "language_loss": 0.75048065, - "learning_rate": 1.4741580961128652e-07, - "loss": 0.77185583, - "num_input_tokens_seen": 315844900, - "step": 14648, - "time_per_iteration": 2.5998997688293457 - }, - { - "auxiliary_loss_clip": 0.01087799, - "auxiliary_loss_mlp": 0.01031948, - "balance_loss_clip": 1.03425181, - "balance_loss_mlp": 1.01994991, - "epoch": 0.8807455283330828, - "flos": 25332392344320.0, - "grad_norm": 1.6348853786721524, - "language_loss": 0.65398651, - "learning_rate": 1.4726909313934522e-07, - "loss": 0.67518401, - "num_input_tokens_seen": 315863745, - "step": 14649, - "time_per_iteration": 2.7652242183685303 - }, - { - "auxiliary_loss_clip": 0.010729, - "auxiliary_loss_mlp": 0.01033475, - "balance_loss_clip": 1.03727496, - "balance_loss_mlp": 1.02036798, - "epoch": 0.8808056515857507, - "flos": 25265168040960.0, - "grad_norm": 1.3476131678952612, - "language_loss": 0.62504375, - "learning_rate": 1.4712244692314578e-07, - "loss": 0.64610744, - "num_input_tokens_seen": 315885765, - "step": 14650, - "time_per_iteration": 2.77528715133667 - }, - { - "auxiliary_loss_clip": 0.01081061, - "auxiliary_loss_mlp": 0.01032731, - "balance_loss_clip": 1.03366303, - "balance_loss_mlp": 1.02105451, - "epoch": 0.8808657748384188, - "flos": 26578852151040.0, - "grad_norm": 1.497639019636266, - "language_loss": 0.72776234, - "learning_rate": 1.4697587096824914e-07, - "loss": 0.74890018, - "num_input_tokens_seen": 315907340, - "step": 14651, - "time_per_iteration": 2.755974769592285 - }, - { - "auxiliary_loss_clip": 0.01102624, - "auxiliary_loss_mlp": 0.01034556, - "balance_loss_clip": 1.03813457, - "balance_loss_mlp": 1.0211333, - "epoch": 0.8809258980910867, - "flos": 18661231918080.0, - "grad_norm": 1.7947734047574024, - "language_loss": 0.71671438, - "learning_rate": 1.4682936528021284e-07, - "loss": 0.73808622, - "num_input_tokens_seen": 315924935, - "step": 14652, - "time_per_iteration": 2.6478350162506104 - }, - { - "auxiliary_loss_clip": 0.01088537, - "auxiliary_loss_mlp": 0.01029586, - "balance_loss_clip": 1.03456485, - "balance_loss_mlp": 1.01757014, - "epoch": 0.8809860213437547, - "flos": 19792274348160.0, - "grad_norm": 2.741119431069501, - "language_loss": 0.74593818, - "learning_rate": 1.4668292986459286e-07, - "loss": 0.76711941, - "num_input_tokens_seen": 315943165, - "step": 14653, - "time_per_iteration": 2.657860517501831 - }, - { - "auxiliary_loss_clip": 0.01111355, - "auxiliary_loss_mlp": 0.01031409, - "balance_loss_clip": 1.03685915, - "balance_loss_mlp": 1.01822495, - "epoch": 0.8810461445964227, - "flos": 17894467267200.0, - "grad_norm": 1.7692800722324005, - "language_loss": 0.71231246, - "learning_rate": 1.465365647269421e-07, - "loss": 0.73374015, - "num_input_tokens_seen": 315961340, - "step": 14654, - "time_per_iteration": 2.6377742290496826 - }, - { - "auxiliary_loss_clip": 0.01062842, - "auxiliary_loss_mlp": 0.01038906, - "balance_loss_clip": 1.03567505, - "balance_loss_mlp": 1.02497637, - "epoch": 0.8811062678490906, - "flos": 29163917128320.0, - "grad_norm": 1.6194615705289337, - "language_loss": 0.71497536, - "learning_rate": 1.4639026987281012e-07, - "loss": 0.73599279, - "num_input_tokens_seen": 315981335, - "step": 14655, - "time_per_iteration": 4.449506044387817 - }, - { - "auxiliary_loss_clip": 0.01059688, - "auxiliary_loss_mlp": 0.01035269, - "balance_loss_clip": 1.03264832, - "balance_loss_mlp": 1.02179229, - "epoch": 0.8811663911017587, - "flos": 20338834671360.0, - "grad_norm": 2.1016384343696246, - "language_loss": 0.81381142, - "learning_rate": 1.462440453077449e-07, - "loss": 0.83476096, - "num_input_tokens_seen": 316001325, - "step": 14656, - "time_per_iteration": 4.342563629150391 - }, - { - "auxiliary_loss_clip": 0.01084679, - "auxiliary_loss_mlp": 0.01034904, - "balance_loss_clip": 1.03799617, - "balance_loss_mlp": 1.02292371, - "epoch": 0.8812265143544266, - "flos": 25885704424320.0, - "grad_norm": 1.9594168695096041, - "language_loss": 0.68740302, - "learning_rate": 1.460978910372914e-07, - "loss": 0.70859885, - "num_input_tokens_seen": 316022540, - "step": 14657, - "time_per_iteration": 2.75775408744812 - }, - { - "auxiliary_loss_clip": 0.01086792, - "auxiliary_loss_mlp": 0.01036523, - "balance_loss_clip": 1.03888392, - "balance_loss_mlp": 1.02426791, - "epoch": 0.8812866376070946, - "flos": 27195509865600.0, - "grad_norm": 2.309045431146604, - "language_loss": 0.84054673, - "learning_rate": 1.4595180706699207e-07, - "loss": 0.86177993, - "num_input_tokens_seen": 316037735, - "step": 14658, - "time_per_iteration": 4.1529762744903564 - }, - { - "auxiliary_loss_clip": 0.01094436, - "auxiliary_loss_mlp": 0.01035928, - "balance_loss_clip": 1.03857708, - "balance_loss_mlp": 1.02275574, - "epoch": 0.8813467608597625, - "flos": 23807194997760.0, - "grad_norm": 1.9486638108186574, - "language_loss": 0.77363259, - "learning_rate": 1.4580579340238554e-07, - "loss": 0.79493624, - "num_input_tokens_seen": 316058105, - "step": 14659, - "time_per_iteration": 4.211735010147095 - }, - { - "auxiliary_loss_clip": 0.01085864, - "auxiliary_loss_mlp": 0.01034435, - "balance_loss_clip": 1.03627634, - "balance_loss_mlp": 1.0214169, - "epoch": 0.8814068841124305, - "flos": 21105455667840.0, - "grad_norm": 2.1180822282078235, - "language_loss": 0.60540521, - "learning_rate": 1.4565985004900894e-07, - "loss": 0.62660819, - "num_input_tokens_seen": 316074415, - "step": 14660, - "time_per_iteration": 2.6319613456726074 - }, - { - "auxiliary_loss_clip": 0.01094829, - "auxiliary_loss_mlp": 0.01038386, - "balance_loss_clip": 1.04060745, - "balance_loss_mlp": 1.02493942, - "epoch": 0.8814670073650984, - "flos": 24716991605760.0, - "grad_norm": 1.6496161205890179, - "language_loss": 0.77789259, - "learning_rate": 1.455139770123972e-07, - "loss": 0.79922473, - "num_input_tokens_seen": 316094405, - "step": 14661, - "time_per_iteration": 2.633333444595337 - }, - { - "auxiliary_loss_clip": 0.01068997, - "auxiliary_loss_mlp": 0.01045562, - "balance_loss_clip": 1.03819084, - "balance_loss_mlp": 1.03196073, - "epoch": 0.8815271306177664, - "flos": 22966274718720.0, - "grad_norm": 2.4670359855209374, - "language_loss": 0.76707077, - "learning_rate": 1.45368174298081e-07, - "loss": 0.78821635, - "num_input_tokens_seen": 316113390, - "step": 14662, - "time_per_iteration": 2.645803451538086 - }, - { - "auxiliary_loss_clip": 0.01059478, - "auxiliary_loss_mlp": 0.01029767, - "balance_loss_clip": 1.03322673, - "balance_loss_mlp": 1.01856136, - "epoch": 0.8815872538704344, - "flos": 19460064435840.0, - "grad_norm": 2.046618055728614, - "language_loss": 0.73941565, - "learning_rate": 1.4522244191158929e-07, - "loss": 0.76030809, - "num_input_tokens_seen": 316131085, - "step": 14663, - "time_per_iteration": 2.7289090156555176 - }, - { - "auxiliary_loss_clip": 0.01099377, - "auxiliary_loss_mlp": 0.00769769, - "balance_loss_clip": 1.03778672, - "balance_loss_mlp": 1.00022185, - "epoch": 0.8816473771231024, - "flos": 32156604622080.0, - "grad_norm": 2.211377651108035, - "language_loss": 0.69977838, - "learning_rate": 1.450767798584489e-07, - "loss": 0.71846986, - "num_input_tokens_seen": 316151440, - "step": 14664, - "time_per_iteration": 2.679704427719116 - }, - { - "auxiliary_loss_clip": 0.01028116, - "auxiliary_loss_mlp": 0.01040404, - "balance_loss_clip": 1.03083181, - "balance_loss_mlp": 1.02833962, - "epoch": 0.8817075003757703, - "flos": 19682279925120.0, - "grad_norm": 1.499474682944125, - "language_loss": 0.80967414, - "learning_rate": 1.449311881441828e-07, - "loss": 0.83035928, - "num_input_tokens_seen": 316170750, - "step": 14665, - "time_per_iteration": 2.818871021270752 - }, - { - "auxiliary_loss_clip": 0.01085891, - "auxiliary_loss_mlp": 0.01035568, - "balance_loss_clip": 1.03590584, - "balance_loss_mlp": 1.0237484, - "epoch": 0.8817676236284383, - "flos": 15668616251520.0, - "grad_norm": 2.192576285565641, - "language_loss": 0.5833683, - "learning_rate": 1.447856667743117e-07, - "loss": 0.60458285, - "num_input_tokens_seen": 316187265, - "step": 14666, - "time_per_iteration": 2.6670124530792236 - }, - { - "auxiliary_loss_clip": 0.01101515, - "auxiliary_loss_mlp": 0.01031699, - "balance_loss_clip": 1.03911185, - "balance_loss_mlp": 1.01791823, - "epoch": 0.8818277468811063, - "flos": 17895185539200.0, - "grad_norm": 2.486999206259205, - "language_loss": 0.83586216, - "learning_rate": 1.4464021575435403e-07, - "loss": 0.8571943, - "num_input_tokens_seen": 316206555, - "step": 14667, - "time_per_iteration": 2.6268537044525146 - }, - { - "auxiliary_loss_clip": 0.01109075, - "auxiliary_loss_mlp": 0.01032153, - "balance_loss_clip": 1.03729033, - "balance_loss_mlp": 1.01920688, - "epoch": 0.8818878701337742, - "flos": 18770508069120.0, - "grad_norm": 1.817207647136482, - "language_loss": 0.62429118, - "learning_rate": 1.4449483508982563e-07, - "loss": 0.64570343, - "num_input_tokens_seen": 316225210, - "step": 14668, - "time_per_iteration": 2.552854061126709 - }, - { - "auxiliary_loss_clip": 0.01095167, - "auxiliary_loss_mlp": 0.01031398, - "balance_loss_clip": 1.03637564, - "balance_loss_mlp": 1.02023387, - "epoch": 0.8819479933864423, - "flos": 17712292464000.0, - "grad_norm": 2.79196460175423, - "language_loss": 0.57027191, - "learning_rate": 1.4434952478623918e-07, - "loss": 0.59153754, - "num_input_tokens_seen": 316242685, - "step": 14669, - "time_per_iteration": 2.565288782119751 - }, - { - "auxiliary_loss_clip": 0.0110705, - "auxiliary_loss_mlp": 0.01032118, - "balance_loss_clip": 1.03566611, - "balance_loss_mlp": 1.01975608, - "epoch": 0.8820081166391102, - "flos": 11728749070080.0, - "grad_norm": 1.8986730900413675, - "language_loss": 0.71354139, - "learning_rate": 1.442042848491043e-07, - "loss": 0.73493308, - "num_input_tokens_seen": 316260935, - "step": 14670, - "time_per_iteration": 2.563056707382202 - }, - { - "auxiliary_loss_clip": 0.01090236, - "auxiliary_loss_mlp": 0.01032669, - "balance_loss_clip": 1.03279638, - "balance_loss_mlp": 1.02009296, - "epoch": 0.8820682398917782, - "flos": 27490372611840.0, - "grad_norm": 1.913343870820924, - "language_loss": 0.73558605, - "learning_rate": 1.44059115283929e-07, - "loss": 0.75681508, - "num_input_tokens_seen": 316281190, - "step": 14671, - "time_per_iteration": 2.648991346359253 - }, - { - "auxiliary_loss_clip": 0.0108854, - "auxiliary_loss_mlp": 0.01032215, - "balance_loss_clip": 1.03446448, - "balance_loss_mlp": 1.01891685, - "epoch": 0.8821283631444461, - "flos": 16873850223360.0, - "grad_norm": 2.5015746427003878, - "language_loss": 0.84854722, - "learning_rate": 1.43914016096218e-07, - "loss": 0.86975479, - "num_input_tokens_seen": 316297115, - "step": 14672, - "time_per_iteration": 2.582524061203003 - }, - { - "auxiliary_loss_clip": 0.01071178, - "auxiliary_loss_mlp": 0.01030273, - "balance_loss_clip": 1.03337216, - "balance_loss_mlp": 1.01805353, - "epoch": 0.8821884863971141, - "flos": 24280964409600.0, - "grad_norm": 1.630028849005291, - "language_loss": 0.7247709, - "learning_rate": 1.4376898729147336e-07, - "loss": 0.74578547, - "num_input_tokens_seen": 316318235, - "step": 14673, - "time_per_iteration": 2.7013115882873535 - }, - { - "auxiliary_loss_clip": 0.01008529, - "auxiliary_loss_mlp": 0.01000562, - "balance_loss_clip": 1.00525308, - "balance_loss_mlp": 0.99949533, - "epoch": 0.882248609649782, - "flos": 59432342492160.0, - "grad_norm": 0.8079833493209833, - "language_loss": 0.49358672, - "learning_rate": 1.4362402887519487e-07, - "loss": 0.5136776, - "num_input_tokens_seen": 316384705, - "step": 14674, - "time_per_iteration": 3.268969774246216 - }, - { - "auxiliary_loss_clip": 0.01083711, - "auxiliary_loss_mlp": 0.00770966, - "balance_loss_clip": 1.03282237, - "balance_loss_mlp": 1.00024939, - "epoch": 0.88230873290245, - "flos": 19937784343680.0, - "grad_norm": 2.0037273036642578, - "language_loss": 0.76279628, - "learning_rate": 1.4347914085287971e-07, - "loss": 0.78134304, - "num_input_tokens_seen": 316401165, - "step": 14675, - "time_per_iteration": 2.6139438152313232 - }, - { - "auxiliary_loss_clip": 0.01083195, - "auxiliary_loss_mlp": 0.01035716, - "balance_loss_clip": 1.03536808, - "balance_loss_mlp": 1.02411079, - "epoch": 0.882368856155118, - "flos": 16362769559040.0, - "grad_norm": 1.8400865500932602, - "language_loss": 0.79260898, - "learning_rate": 1.4333432323002105e-07, - "loss": 0.81379807, - "num_input_tokens_seen": 316418780, - "step": 14676, - "time_per_iteration": 2.6346635818481445 - }, - { - "auxiliary_loss_clip": 0.00997838, - "auxiliary_loss_mlp": 0.01005545, - "balance_loss_clip": 1.01021266, - "balance_loss_mlp": 1.00431693, - "epoch": 0.882428979407786, - "flos": 70594563277440.0, - "grad_norm": 0.7902692138186003, - "language_loss": 0.54692107, - "learning_rate": 1.431895760121109e-07, - "loss": 0.56695491, - "num_input_tokens_seen": 316482030, - "step": 14677, - "time_per_iteration": 3.293663501739502 - }, - { - "auxiliary_loss_clip": 0.01105406, - "auxiliary_loss_mlp": 0.0103004, - "balance_loss_clip": 1.03478503, - "balance_loss_mlp": 1.01775551, - "epoch": 0.8824891026604539, - "flos": 18150294908160.0, - "grad_norm": 2.2487393421780673, - "language_loss": 0.64326406, - "learning_rate": 1.4304489920463847e-07, - "loss": 0.66461849, - "num_input_tokens_seen": 316499175, - "step": 14678, - "time_per_iteration": 2.5656399726867676 - }, - { - "auxiliary_loss_clip": 0.01087368, - "auxiliary_loss_mlp": 0.01031922, - "balance_loss_clip": 1.03426218, - "balance_loss_mlp": 1.01929188, - "epoch": 0.8825492259131219, - "flos": 27232713377280.0, - "grad_norm": 1.973421047113739, - "language_loss": 0.71194983, - "learning_rate": 1.4290029281308936e-07, - "loss": 0.73314273, - "num_input_tokens_seen": 316519495, - "step": 14679, - "time_per_iteration": 2.717034339904785 - }, - { - "auxiliary_loss_clip": 0.01084094, - "auxiliary_loss_mlp": 0.01031717, - "balance_loss_clip": 1.03604484, - "balance_loss_mlp": 1.02098179, - "epoch": 0.8826093491657898, - "flos": 22274419881600.0, - "grad_norm": 1.9561596241675088, - "language_loss": 0.63978046, - "learning_rate": 1.4275575684294694e-07, - "loss": 0.66093856, - "num_input_tokens_seen": 316538180, - "step": 14680, - "time_per_iteration": 2.6951301097869873 - }, - { - "auxiliary_loss_clip": 0.01107228, - "auxiliary_loss_mlp": 0.01033016, - "balance_loss_clip": 1.03680277, - "balance_loss_mlp": 1.0208087, - "epoch": 0.8826694724184578, - "flos": 14204753377920.0, - "grad_norm": 2.3967020475044767, - "language_loss": 0.77099824, - "learning_rate": 1.4261129129969328e-07, - "loss": 0.79240072, - "num_input_tokens_seen": 316551750, - "step": 14681, - "time_per_iteration": 2.5262744426727295 - }, - { - "auxiliary_loss_clip": 0.01087034, - "auxiliary_loss_mlp": 0.01034558, - "balance_loss_clip": 1.03454781, - "balance_loss_mlp": 1.02127814, - "epoch": 0.8827295956711259, - "flos": 20631686256000.0, - "grad_norm": 1.7532857738520948, - "language_loss": 0.72604549, - "learning_rate": 1.424668961888047e-07, - "loss": 0.74726152, - "num_input_tokens_seen": 316570680, - "step": 14682, - "time_per_iteration": 2.632432699203491 - }, - { - "auxiliary_loss_clip": 0.01069185, - "auxiliary_loss_mlp": 0.0103165, - "balance_loss_clip": 1.03995907, - "balance_loss_mlp": 1.01723146, - "epoch": 0.8827897189237938, - "flos": 18513064316160.0, - "grad_norm": 1.9501054227353172, - "language_loss": 0.74376327, - "learning_rate": 1.4232257151575765e-07, - "loss": 0.76477158, - "num_input_tokens_seen": 316588635, - "step": 14683, - "time_per_iteration": 2.7173256874084473 - }, - { - "auxiliary_loss_clip": 0.01074481, - "auxiliary_loss_mlp": 0.01032656, - "balance_loss_clip": 1.03458118, - "balance_loss_mlp": 1.01993001, - "epoch": 0.8828498421764618, - "flos": 22747399194240.0, - "grad_norm": 1.85393754134711, - "language_loss": 0.65667385, - "learning_rate": 1.4217831728602492e-07, - "loss": 0.67774516, - "num_input_tokens_seen": 316607550, - "step": 14684, - "time_per_iteration": 2.7330434322357178 - }, - { - "auxiliary_loss_clip": 0.0109236, - "auxiliary_loss_mlp": 0.01029425, - "balance_loss_clip": 1.03487706, - "balance_loss_mlp": 1.01779604, - "epoch": 0.8829099654291297, - "flos": 15012384727680.0, - "grad_norm": 1.9479646224303804, - "language_loss": 0.69623429, - "learning_rate": 1.4203413350507677e-07, - "loss": 0.71745217, - "num_input_tokens_seen": 316624460, - "step": 14685, - "time_per_iteration": 2.5940215587615967 - }, - { - "auxiliary_loss_clip": 0.01057757, - "auxiliary_loss_mlp": 0.0103887, - "balance_loss_clip": 1.03562188, - "balance_loss_mlp": 1.02445173, - "epoch": 0.8829700886817977, - "flos": 16720546976640.0, - "grad_norm": 1.9726343446445405, - "language_loss": 0.74293447, - "learning_rate": 1.418900201783806e-07, - "loss": 0.76390076, - "num_input_tokens_seen": 316640765, - "step": 14686, - "time_per_iteration": 2.724073886871338 - }, - { - "auxiliary_loss_clip": 0.01055836, - "auxiliary_loss_mlp": 0.01028261, - "balance_loss_clip": 1.03210068, - "balance_loss_mlp": 1.01602983, - "epoch": 0.8830302119344656, - "flos": 15263256291840.0, - "grad_norm": 1.8765198803907357, - "language_loss": 0.63015836, - "learning_rate": 1.417459773114007e-07, - "loss": 0.65099931, - "num_input_tokens_seen": 316656120, - "step": 14687, - "time_per_iteration": 2.707498550415039 - }, - { - "auxiliary_loss_clip": 0.01100271, - "auxiliary_loss_mlp": 0.01038773, - "balance_loss_clip": 1.03685296, - "balance_loss_mlp": 1.02611268, - "epoch": 0.8830903351871336, - "flos": 28617751854720.0, - "grad_norm": 1.78215533171273, - "language_loss": 0.69295615, - "learning_rate": 1.4160200490959984e-07, - "loss": 0.71434665, - "num_input_tokens_seen": 316676095, - "step": 14688, - "time_per_iteration": 2.6418840885162354 - }, - { - "auxiliary_loss_clip": 0.0109326, - "auxiliary_loss_mlp": 0.01027423, - "balance_loss_clip": 1.03498924, - "balance_loss_mlp": 1.01532912, - "epoch": 0.8831504584398016, - "flos": 28001632844160.0, - "grad_norm": 1.920117351658533, - "language_loss": 0.66948056, - "learning_rate": 1.4145810297843697e-07, - "loss": 0.69068736, - "num_input_tokens_seen": 316696235, - "step": 14689, - "time_per_iteration": 2.572154998779297 - }, - { - "auxiliary_loss_clip": 0.01082065, - "auxiliary_loss_mlp": 0.01027763, - "balance_loss_clip": 1.03897274, - "balance_loss_mlp": 1.01591396, - "epoch": 0.8832105816924696, - "flos": 26579642250240.0, - "grad_norm": 1.390214083666347, - "language_loss": 0.74641317, - "learning_rate": 1.4131427152336905e-07, - "loss": 0.76751149, - "num_input_tokens_seen": 316719680, - "step": 14690, - "time_per_iteration": 2.7160091400146484 - }, - { - "auxiliary_loss_clip": 0.0108565, - "auxiliary_loss_mlp": 0.01037391, - "balance_loss_clip": 1.0344497, - "balance_loss_mlp": 1.02380705, - "epoch": 0.8832707049451375, - "flos": 24898771359360.0, - "grad_norm": 1.4286133557095182, - "language_loss": 0.72746867, - "learning_rate": 1.4117051054985018e-07, - "loss": 0.74869907, - "num_input_tokens_seen": 316739830, - "step": 14691, - "time_per_iteration": 2.650376558303833 - }, - { - "auxiliary_loss_clip": 0.01076966, - "auxiliary_loss_mlp": 0.0102843, - "balance_loss_clip": 1.03778577, - "balance_loss_mlp": 1.01508439, - "epoch": 0.8833308281978055, - "flos": 15451141357440.0, - "grad_norm": 2.0965604291100277, - "language_loss": 0.51753283, - "learning_rate": 1.4102682006333243e-07, - "loss": 0.53858674, - "num_input_tokens_seen": 316758105, - "step": 14692, - "time_per_iteration": 2.656104564666748 - }, - { - "auxiliary_loss_clip": 0.01072794, - "auxiliary_loss_mlp": 0.01033738, - "balance_loss_clip": 1.03685403, - "balance_loss_mlp": 1.02114379, - "epoch": 0.8833909514504734, - "flos": 20301523418880.0, - "grad_norm": 2.5366757871087264, - "language_loss": 0.60396338, - "learning_rate": 1.4088320006926346e-07, - "loss": 0.62502873, - "num_input_tokens_seen": 316777455, - "step": 14693, - "time_per_iteration": 2.6937055587768555 - }, - { - "auxiliary_loss_clip": 0.01104793, - "auxiliary_loss_mlp": 0.01027886, - "balance_loss_clip": 1.03680062, - "balance_loss_mlp": 1.01657307, - "epoch": 0.8834510747031414, - "flos": 20374027021440.0, - "grad_norm": 1.6196331469074723, - "language_loss": 0.75283146, - "learning_rate": 1.407396505730898e-07, - "loss": 0.77415824, - "num_input_tokens_seen": 316796300, - "step": 14694, - "time_per_iteration": 2.577456474304199 - }, - { - "auxiliary_loss_clip": 0.01092067, - "auxiliary_loss_mlp": 0.01031516, - "balance_loss_clip": 1.03433728, - "balance_loss_mlp": 1.02011991, - "epoch": 0.8835111979558095, - "flos": 29752026508800.0, - "grad_norm": 1.8847177158288673, - "language_loss": 0.72582275, - "learning_rate": 1.4059617158025527e-07, - "loss": 0.74705863, - "num_input_tokens_seen": 316819090, - "step": 14695, - "time_per_iteration": 5.806610822677612 - }, - { - "auxiliary_loss_clip": 0.01092613, - "auxiliary_loss_mlp": 0.01026382, - "balance_loss_clip": 1.03546548, - "balance_loss_mlp": 1.01503897, - "epoch": 0.8835713212084774, - "flos": 24134556574080.0, - "grad_norm": 1.7805903977771496, - "language_loss": 0.80249125, - "learning_rate": 1.404527630961998e-07, - "loss": 0.82368124, - "num_input_tokens_seen": 316839250, - "step": 14696, - "time_per_iteration": 2.6262238025665283 - }, - { - "auxiliary_loss_clip": 0.01070594, - "auxiliary_loss_mlp": 0.01033129, - "balance_loss_clip": 1.03721249, - "balance_loss_mlp": 1.02114844, - "epoch": 0.8836314444611454, - "flos": 27672331933440.0, - "grad_norm": 1.3961231216590477, - "language_loss": 0.74813706, - "learning_rate": 1.4030942512636236e-07, - "loss": 0.76917428, - "num_input_tokens_seen": 316861315, - "step": 14697, - "time_per_iteration": 4.375631809234619 - }, - { - "auxiliary_loss_clip": 0.01087263, - "auxiliary_loss_mlp": 0.01030892, - "balance_loss_clip": 1.03708208, - "balance_loss_mlp": 1.01885819, - "epoch": 0.8836915677138133, - "flos": 16836969934080.0, - "grad_norm": 2.288430876034272, - "language_loss": 0.72242546, - "learning_rate": 1.401661576761779e-07, - "loss": 0.74360704, - "num_input_tokens_seen": 316879325, - "step": 14698, - "time_per_iteration": 4.223493576049805 - }, - { - "auxiliary_loss_clip": 0.01018409, - "auxiliary_loss_mlp": 0.00999712, - "balance_loss_clip": 1.00626993, - "balance_loss_mlp": 0.99860901, - "epoch": 0.8837516909664813, - "flos": 69310540823040.0, - "grad_norm": 0.8057459170171036, - "language_loss": 0.53683382, - "learning_rate": 1.4002296075107856e-07, - "loss": 0.55701506, - "num_input_tokens_seen": 316936425, - "step": 14699, - "time_per_iteration": 3.2273147106170654 - }, - { - "auxiliary_loss_clip": 0.01087948, - "auxiliary_loss_mlp": 0.01031005, - "balance_loss_clip": 1.03542256, - "balance_loss_mlp": 1.01808274, - "epoch": 0.8838118142191492, - "flos": 21324726241920.0, - "grad_norm": 1.773127577183959, - "language_loss": 0.76996839, - "learning_rate": 1.3987983435649508e-07, - "loss": 0.79115796, - "num_input_tokens_seen": 316956360, - "step": 14700, - "time_per_iteration": 2.7143490314483643 - }, - { - "auxiliary_loss_clip": 0.01074827, - "auxiliary_loss_mlp": 0.01031532, - "balance_loss_clip": 1.03586185, - "balance_loss_mlp": 1.01926565, - "epoch": 0.8838719374718172, - "flos": 21470559459840.0, - "grad_norm": 1.7340083630316034, - "language_loss": 0.72865736, - "learning_rate": 1.3973677849785494e-07, - "loss": 0.74972093, - "num_input_tokens_seen": 316975295, - "step": 14701, - "time_per_iteration": 2.6882786750793457 - }, - { - "auxiliary_loss_clip": 0.01086251, - "auxiliary_loss_mlp": 0.01036661, - "balance_loss_clip": 1.03455663, - "balance_loss_mlp": 1.02270126, - "epoch": 0.8839320607244852, - "flos": 26468929555200.0, - "grad_norm": 1.8625463465240368, - "language_loss": 0.71503305, - "learning_rate": 1.3959379318058262e-07, - "loss": 0.73626214, - "num_input_tokens_seen": 316994520, - "step": 14702, - "time_per_iteration": 2.72592830657959 - }, - { - "auxiliary_loss_clip": 0.01071764, - "auxiliary_loss_mlp": 0.01044197, - "balance_loss_clip": 1.03413224, - "balance_loss_mlp": 1.02983212, - "epoch": 0.8839921839771532, - "flos": 45222270923520.0, - "grad_norm": 1.7559603641053307, - "language_loss": 0.71454448, - "learning_rate": 1.3945087841010006e-07, - "loss": 0.73570406, - "num_input_tokens_seen": 317018095, - "step": 14703, - "time_per_iteration": 2.9277431964874268 - }, - { - "auxiliary_loss_clip": 0.01065783, - "auxiliary_loss_mlp": 0.01031109, - "balance_loss_clip": 1.03791165, - "balance_loss_mlp": 1.01922941, - "epoch": 0.8840523072298211, - "flos": 20006876154240.0, - "grad_norm": 2.1820279654831474, - "language_loss": 0.6694417, - "learning_rate": 1.3930803419182645e-07, - "loss": 0.69041061, - "num_input_tokens_seen": 317035755, - "step": 14704, - "time_per_iteration": 2.8294484615325928 - }, - { - "auxiliary_loss_clip": 0.01087087, - "auxiliary_loss_mlp": 0.0102583, - "balance_loss_clip": 1.03454638, - "balance_loss_mlp": 1.0141418, - "epoch": 0.8841124304824891, - "flos": 24426007528320.0, - "grad_norm": 1.887740201159673, - "language_loss": 0.70546407, - "learning_rate": 1.3916526053117905e-07, - "loss": 0.72659326, - "num_input_tokens_seen": 317055765, - "step": 14705, - "time_per_iteration": 2.7231884002685547 - }, - { - "auxiliary_loss_clip": 0.01086994, - "auxiliary_loss_mlp": 0.01032189, - "balance_loss_clip": 1.03693652, - "balance_loss_mlp": 1.02126944, - "epoch": 0.884172553735157, - "flos": 31284622056960.0, - "grad_norm": 1.4798383584085324, - "language_loss": 0.70781028, - "learning_rate": 1.3902255743357104e-07, - "loss": 0.72900212, - "num_input_tokens_seen": 317077955, - "step": 14706, - "time_per_iteration": 2.817166805267334 - }, - { - "auxiliary_loss_clip": 0.0109745, - "auxiliary_loss_mlp": 0.01031235, - "balance_loss_clip": 1.03596139, - "balance_loss_mlp": 1.0189985, - "epoch": 0.884232676987825, - "flos": 21391160446080.0, - "grad_norm": 1.9611948964387604, - "language_loss": 0.74633074, - "learning_rate": 1.3887992490441413e-07, - "loss": 0.76761764, - "num_input_tokens_seen": 317095825, - "step": 14707, - "time_per_iteration": 2.692667007446289 - }, - { - "auxiliary_loss_clip": 0.01001598, - "auxiliary_loss_mlp": 0.01000676, - "balance_loss_clip": 1.00856423, - "balance_loss_mlp": 0.99968618, - "epoch": 0.8842928002404931, - "flos": 57911451799680.0, - "grad_norm": 0.8872244282469403, - "language_loss": 0.60417277, - "learning_rate": 1.387373629491173e-07, - "loss": 0.62419552, - "num_input_tokens_seen": 317152875, - "step": 14708, - "time_per_iteration": 3.083991765975952 - }, - { - "auxiliary_loss_clip": 0.01077587, - "auxiliary_loss_mlp": 0.01032084, - "balance_loss_clip": 1.03236675, - "balance_loss_mlp": 1.02057397, - "epoch": 0.884352923493161, - "flos": 41463896186880.0, - "grad_norm": 4.7609896272216305, - "language_loss": 0.67469186, - "learning_rate": 1.3859487157308625e-07, - "loss": 0.6957885, - "num_input_tokens_seen": 317176725, - "step": 14709, - "time_per_iteration": 2.8194525241851807 - }, - { - "auxiliary_loss_clip": 0.01091628, - "auxiliary_loss_mlp": 0.01036852, - "balance_loss_clip": 1.03700888, - "balance_loss_mlp": 1.02251136, - "epoch": 0.884413046745829, - "flos": 46541234332800.0, - "grad_norm": 1.664665419544956, - "language_loss": 0.62438279, - "learning_rate": 1.3845245078172373e-07, - "loss": 0.64566755, - "num_input_tokens_seen": 317206880, - "step": 14710, - "time_per_iteration": 2.9080650806427 - }, - { - "auxiliary_loss_clip": 0.01074046, - "auxiliary_loss_mlp": 0.01025213, - "balance_loss_clip": 1.03497434, - "balance_loss_mlp": 1.01367342, - "epoch": 0.8844731699984969, - "flos": 19135324552320.0, - "grad_norm": 5.507655560622358, - "language_loss": 0.63936687, - "learning_rate": 1.38310100580431e-07, - "loss": 0.66035938, - "num_input_tokens_seen": 317224135, - "step": 14711, - "time_per_iteration": 2.7565457820892334 - }, - { - "auxiliary_loss_clip": 0.0107192, - "auxiliary_loss_mlp": 0.01033004, - "balance_loss_clip": 1.03220356, - "balance_loss_mlp": 1.01972961, - "epoch": 0.8845332932511649, - "flos": 23260634674560.0, - "grad_norm": 2.576105371894639, - "language_loss": 0.76215911, - "learning_rate": 1.38167820974606e-07, - "loss": 0.78320837, - "num_input_tokens_seen": 317244505, - "step": 14712, - "time_per_iteration": 2.7664034366607666 - }, - { - "auxiliary_loss_clip": 0.01048291, - "auxiliary_loss_mlp": 0.01029784, - "balance_loss_clip": 1.02974892, - "balance_loss_mlp": 1.01695108, - "epoch": 0.8845934165038328, - "flos": 17564591738880.0, - "grad_norm": 2.26538239437818, - "language_loss": 0.80963331, - "learning_rate": 1.3802561196964368e-07, - "loss": 0.83041406, - "num_input_tokens_seen": 317257830, - "step": 14713, - "time_per_iteration": 2.7584569454193115 - }, - { - "auxiliary_loss_clip": 0.01084824, - "auxiliary_loss_mlp": 0.01028599, - "balance_loss_clip": 1.03427589, - "balance_loss_mlp": 1.01581335, - "epoch": 0.8846535397565009, - "flos": 27485739757440.0, - "grad_norm": 1.3779261727690353, - "language_loss": 0.55363518, - "learning_rate": 1.3788347357093688e-07, - "loss": 0.57476938, - "num_input_tokens_seen": 317278430, - "step": 14714, - "time_per_iteration": 2.733762502670288 - }, - { - "auxiliary_loss_clip": 0.01053317, - "auxiliary_loss_mlp": 0.01038776, - "balance_loss_clip": 1.03666592, - "balance_loss_mlp": 1.02476311, - "epoch": 0.8847136630091688, - "flos": 28761430256640.0, - "grad_norm": 1.7611265846696629, - "language_loss": 0.74193525, - "learning_rate": 1.377414057838755e-07, - "loss": 0.76285625, - "num_input_tokens_seen": 317295970, - "step": 14715, - "time_per_iteration": 2.841095447540283 - }, - { - "auxiliary_loss_clip": 0.01098367, - "auxiliary_loss_mlp": 0.01030945, - "balance_loss_clip": 1.03592944, - "balance_loss_mlp": 1.0190537, - "epoch": 0.8847737862618368, - "flos": 23476924419840.0, - "grad_norm": 2.015334930490414, - "language_loss": 0.75194365, - "learning_rate": 1.375994086138461e-07, - "loss": 0.77323675, - "num_input_tokens_seen": 317316185, - "step": 14716, - "time_per_iteration": 2.661020517349243 - }, - { - "auxiliary_loss_clip": 0.01075664, - "auxiliary_loss_mlp": 0.0103605, - "balance_loss_clip": 1.03667819, - "balance_loss_mlp": 1.02395606, - "epoch": 0.8848339095145047, - "flos": 18660872782080.0, - "grad_norm": 1.993706294910503, - "language_loss": 0.71433997, - "learning_rate": 1.3745748206623397e-07, - "loss": 0.73545712, - "num_input_tokens_seen": 317333275, - "step": 14717, - "time_per_iteration": 2.7001688480377197 - }, - { - "auxiliary_loss_clip": 0.01093455, - "auxiliary_loss_mlp": 0.01033322, - "balance_loss_clip": 1.0350275, - "balance_loss_mlp": 1.02174115, - "epoch": 0.8848940327671727, - "flos": 32270298145920.0, - "grad_norm": 2.3327665948166643, - "language_loss": 0.73770732, - "learning_rate": 1.373156261464208e-07, - "loss": 0.75897503, - "num_input_tokens_seen": 317351245, - "step": 14718, - "time_per_iteration": 2.677098274230957 - }, - { - "auxiliary_loss_clip": 0.01058475, - "auxiliary_loss_mlp": 0.01029814, - "balance_loss_clip": 1.03630209, - "balance_loss_mlp": 1.01655793, - "epoch": 0.8849541560198406, - "flos": 24021832717440.0, - "grad_norm": 2.0713842614778755, - "language_loss": 0.78531897, - "learning_rate": 1.3717384085978602e-07, - "loss": 0.80620188, - "num_input_tokens_seen": 317370740, - "step": 14719, - "time_per_iteration": 2.8046772480010986 - }, - { - "auxiliary_loss_clip": 0.01108831, - "auxiliary_loss_mlp": 0.01026154, - "balance_loss_clip": 1.03627968, - "balance_loss_mlp": 1.01376843, - "epoch": 0.8850142792725086, - "flos": 16873060124160.0, - "grad_norm": 1.562689851566494, - "language_loss": 0.71582258, - "learning_rate": 1.3703212621170579e-07, - "loss": 0.73717248, - "num_input_tokens_seen": 317388370, - "step": 14720, - "time_per_iteration": 2.6795947551727295 - }, - { - "auxiliary_loss_clip": 0.01087567, - "auxiliary_loss_mlp": 0.01032607, - "balance_loss_clip": 1.03469348, - "balance_loss_mlp": 1.02011943, - "epoch": 0.8850744025251767, - "flos": 24024059360640.0, - "grad_norm": 1.9018606695741462, - "language_loss": 0.82328093, - "learning_rate": 1.3689048220755383e-07, - "loss": 0.84448266, - "num_input_tokens_seen": 317407390, - "step": 14721, - "time_per_iteration": 2.7234106063842773 - }, - { - "auxiliary_loss_clip": 0.01087774, - "auxiliary_loss_mlp": 0.01030218, - "balance_loss_clip": 1.03554928, - "balance_loss_mlp": 1.01725388, - "epoch": 0.8851345257778446, - "flos": 47955575329920.0, - "grad_norm": 2.0019899609402994, - "language_loss": 0.6242708, - "learning_rate": 1.3674890885270186e-07, - "loss": 0.64545077, - "num_input_tokens_seen": 317430825, - "step": 14722, - "time_per_iteration": 2.94286847114563 - }, - { - "auxiliary_loss_clip": 0.01098996, - "auxiliary_loss_mlp": 0.01030621, - "balance_loss_clip": 1.03618419, - "balance_loss_mlp": 1.01827097, - "epoch": 0.8851946490305126, - "flos": 36611000173440.0, - "grad_norm": 2.1418673941566815, - "language_loss": 0.68605435, - "learning_rate": 1.3660740615251754e-07, - "loss": 0.70735055, - "num_input_tokens_seen": 317451905, - "step": 14723, - "time_per_iteration": 2.733093023300171 - }, - { - "auxiliary_loss_clip": 0.01073469, - "auxiliary_loss_mlp": 0.01037418, - "balance_loss_clip": 1.03269005, - "balance_loss_mlp": 1.02493691, - "epoch": 0.8852547722831805, - "flos": 21544248211200.0, - "grad_norm": 1.6603204159034268, - "language_loss": 0.77997786, - "learning_rate": 1.3646597411236703e-07, - "loss": 0.80108678, - "num_input_tokens_seen": 317470030, - "step": 14724, - "time_per_iteration": 2.667952299118042 - }, - { - "auxiliary_loss_clip": 0.01018949, - "auxiliary_loss_mlp": 0.01000573, - "balance_loss_clip": 1.00656819, - "balance_loss_mlp": 0.9996435, - "epoch": 0.8853148955358485, - "flos": 63059246472960.0, - "grad_norm": 0.79872504573919, - "language_loss": 0.58856809, - "learning_rate": 1.363246127376143e-07, - "loss": 0.60876334, - "num_input_tokens_seen": 317527460, - "step": 14725, - "time_per_iteration": 3.0969929695129395 - }, - { - "auxiliary_loss_clip": 0.010877, - "auxiliary_loss_mlp": 0.00772122, - "balance_loss_clip": 1.0332129, - "balance_loss_mlp": 1.00029242, - "epoch": 0.8853750187885164, - "flos": 18149828031360.0, - "grad_norm": 1.9516180183005214, - "language_loss": 0.69201702, - "learning_rate": 1.3618332203361837e-07, - "loss": 0.71061528, - "num_input_tokens_seen": 317544070, - "step": 14726, - "time_per_iteration": 2.6915600299835205 - }, - { - "auxiliary_loss_clip": 0.01095197, - "auxiliary_loss_mlp": 0.00770245, - "balance_loss_clip": 1.03544807, - "balance_loss_mlp": 1.00021529, - "epoch": 0.8854351420411845, - "flos": 39570542392320.0, - "grad_norm": 1.2107511197334673, - "language_loss": 0.69623214, - "learning_rate": 1.3604210200573785e-07, - "loss": 0.71488655, - "num_input_tokens_seen": 317570275, - "step": 14727, - "time_per_iteration": 2.7665956020355225 - }, - { - "auxiliary_loss_clip": 0.01088033, - "auxiliary_loss_mlp": 0.01032809, - "balance_loss_clip": 1.03910947, - "balance_loss_mlp": 1.02020836, - "epoch": 0.8854952652938524, - "flos": 23769309127680.0, - "grad_norm": 1.5740836195645216, - "language_loss": 0.69980741, - "learning_rate": 1.3590095265932733e-07, - "loss": 0.72101581, - "num_input_tokens_seen": 317590160, - "step": 14728, - "time_per_iteration": 2.765291929244995 - }, - { - "auxiliary_loss_clip": 0.0107448, - "auxiliary_loss_mlp": 0.01027952, - "balance_loss_clip": 1.03473592, - "balance_loss_mlp": 1.01644814, - "epoch": 0.8855553885465204, - "flos": 18290310122880.0, - "grad_norm": 2.332652743923133, - "language_loss": 0.66558629, - "learning_rate": 1.3575987399973987e-07, - "loss": 0.68661064, - "num_input_tokens_seen": 317608340, - "step": 14729, - "time_per_iteration": 2.7198948860168457 - }, - { - "auxiliary_loss_clip": 0.01079258, - "auxiliary_loss_mlp": 0.01037721, - "balance_loss_clip": 1.03742743, - "balance_loss_mlp": 1.02642, - "epoch": 0.8856155117991883, - "flos": 36867402432000.0, - "grad_norm": 1.6722891950677918, - "language_loss": 0.62810826, - "learning_rate": 1.3561886603232453e-07, - "loss": 0.64927804, - "num_input_tokens_seen": 317629910, - "step": 14730, - "time_per_iteration": 2.8442556858062744 - }, - { - "auxiliary_loss_clip": 0.01071976, - "auxiliary_loss_mlp": 0.01031648, - "balance_loss_clip": 1.03443062, - "balance_loss_mlp": 1.01946437, - "epoch": 0.8856756350518563, - "flos": 22163886754560.0, - "grad_norm": 1.401332014115865, - "language_loss": 0.79437548, - "learning_rate": 1.3547792876242904e-07, - "loss": 0.81541169, - "num_input_tokens_seen": 317650265, - "step": 14731, - "time_per_iteration": 2.762430429458618 - }, - { - "auxiliary_loss_clip": 0.01072107, - "auxiliary_loss_mlp": 0.01033343, - "balance_loss_clip": 1.0311588, - "balance_loss_mlp": 1.02106476, - "epoch": 0.8857357583045242, - "flos": 20740962407040.0, - "grad_norm": 1.5976657601317488, - "language_loss": 0.82999492, - "learning_rate": 1.3533706219539708e-07, - "loss": 0.85104942, - "num_input_tokens_seen": 317669045, - "step": 14732, - "time_per_iteration": 2.7181379795074463 - }, - { - "auxiliary_loss_clip": 0.01009214, - "auxiliary_loss_mlp": 0.01003697, - "balance_loss_clip": 1.00654268, - "balance_loss_mlp": 1.00273728, - "epoch": 0.8857958815571922, - "flos": 69892329409920.0, - "grad_norm": 0.9009578672979854, - "language_loss": 0.5992915, - "learning_rate": 1.3519626633657045e-07, - "loss": 0.61942059, - "num_input_tokens_seen": 317728065, - "step": 14733, - "time_per_iteration": 4.828664064407349 - }, - { - "auxiliary_loss_clip": 0.01109749, - "auxiliary_loss_mlp": 0.0077073, - "balance_loss_clip": 1.03790414, - "balance_loss_mlp": 1.00016737, - "epoch": 0.8858560048098603, - "flos": 15121948187520.0, - "grad_norm": 4.085770877577171, - "language_loss": 0.66732299, - "learning_rate": 1.3505554119128838e-07, - "loss": 0.68612778, - "num_input_tokens_seen": 317746120, - "step": 14734, - "time_per_iteration": 4.0870819091796875 - }, - { - "auxiliary_loss_clip": 0.01081595, - "auxiliary_loss_mlp": 0.01037825, - "balance_loss_clip": 1.03617239, - "balance_loss_mlp": 1.02644062, - "epoch": 0.8859161280625282, - "flos": 16611019430400.0, - "grad_norm": 1.9769334143757535, - "language_loss": 0.75267172, - "learning_rate": 1.3491488676488682e-07, - "loss": 0.77386594, - "num_input_tokens_seen": 317762280, - "step": 14735, - "time_per_iteration": 2.596672534942627 - }, - { - "auxiliary_loss_clip": 0.01070336, - "auxiliary_loss_mlp": 0.0103395, - "balance_loss_clip": 1.03347635, - "balance_loss_mlp": 1.02087295, - "epoch": 0.8859762513151962, - "flos": 18694484933760.0, - "grad_norm": 1.9172644356964386, - "language_loss": 0.70264298, - "learning_rate": 1.3477430306270066e-07, - "loss": 0.72368586, - "num_input_tokens_seen": 317780615, - "step": 14736, - "time_per_iteration": 4.219033479690552 - }, - { - "auxiliary_loss_clip": 0.01077332, - "auxiliary_loss_mlp": 0.01031715, - "balance_loss_clip": 1.03754532, - "balance_loss_mlp": 1.01955533, - "epoch": 0.8860363745678641, - "flos": 19536877670400.0, - "grad_norm": 5.918141742658791, - "language_loss": 0.84637642, - "learning_rate": 1.3463379009005892e-07, - "loss": 0.86746687, - "num_input_tokens_seen": 317798830, - "step": 14737, - "time_per_iteration": 4.119691848754883 - }, - { - "auxiliary_loss_clip": 0.01084938, - "auxiliary_loss_mlp": 0.01034567, - "balance_loss_clip": 1.03715491, - "balance_loss_mlp": 1.02060747, - "epoch": 0.8860964978205321, - "flos": 35954912304000.0, - "grad_norm": 2.9176785944040087, - "language_loss": 0.67942357, - "learning_rate": 1.3449334785229093e-07, - "loss": 0.70061862, - "num_input_tokens_seen": 317819235, - "step": 14738, - "time_per_iteration": 2.865959882736206 - }, - { - "auxiliary_loss_clip": 0.01101518, - "auxiliary_loss_mlp": 0.01030491, - "balance_loss_clip": 1.03650808, - "balance_loss_mlp": 1.0173068, - "epoch": 0.8861566210732, - "flos": 21212577002880.0, - "grad_norm": 1.8188122899172712, - "language_loss": 0.75242293, - "learning_rate": 1.343529763547222e-07, - "loss": 0.77374303, - "num_input_tokens_seen": 317836785, - "step": 14739, - "time_per_iteration": 2.6084749698638916 - }, - { - "auxiliary_loss_clip": 0.01096641, - "auxiliary_loss_mlp": 0.01033392, - "balance_loss_clip": 1.03646207, - "balance_loss_mlp": 1.02176273, - "epoch": 0.886216744325868, - "flos": 14609071843200.0, - "grad_norm": 2.373250938513307, - "language_loss": 0.87370729, - "learning_rate": 1.3421267560267559e-07, - "loss": 0.89500761, - "num_input_tokens_seen": 317854225, - "step": 14740, - "time_per_iteration": 2.6357059478759766 - }, - { - "auxiliary_loss_clip": 0.01058963, - "auxiliary_loss_mlp": 0.01034429, - "balance_loss_clip": 1.03261304, - "balance_loss_mlp": 1.02202511, - "epoch": 0.886276867578536, - "flos": 26651643062400.0, - "grad_norm": 1.7903686918676003, - "language_loss": 0.63587701, - "learning_rate": 1.34072445601471e-07, - "loss": 0.656811, - "num_input_tokens_seen": 317874865, - "step": 14741, - "time_per_iteration": 2.7529678344726562 - }, - { - "auxiliary_loss_clip": 0.01108743, - "auxiliary_loss_mlp": 0.01029702, - "balance_loss_clip": 1.03720188, - "balance_loss_mlp": 1.01766753, - "epoch": 0.886336990831204, - "flos": 16764071281920.0, - "grad_norm": 1.7833239303064403, - "language_loss": 0.72917497, - "learning_rate": 1.3393228635642717e-07, - "loss": 0.75055945, - "num_input_tokens_seen": 317892830, - "step": 14742, - "time_per_iteration": 2.5617966651916504 - }, - { - "auxiliary_loss_clip": 0.01097185, - "auxiliary_loss_mlp": 0.00770206, - "balance_loss_clip": 1.0359509, - "balance_loss_mlp": 1.00016761, - "epoch": 0.8863971140838719, - "flos": 25265275781760.0, - "grad_norm": 1.894504945703206, - "language_loss": 0.59785163, - "learning_rate": 1.3379219787285733e-07, - "loss": 0.61652559, - "num_input_tokens_seen": 317911780, - "step": 14743, - "time_per_iteration": 2.7500805854797363 - }, - { - "auxiliary_loss_clip": 0.01079179, - "auxiliary_loss_mlp": 0.01033862, - "balance_loss_clip": 1.03562689, - "balance_loss_mlp": 1.02005744, - "epoch": 0.8864572373365399, - "flos": 23404313076480.0, - "grad_norm": 1.5564571259362694, - "language_loss": 0.60083222, - "learning_rate": 1.3365218015607437e-07, - "loss": 0.62196267, - "num_input_tokens_seen": 317932855, - "step": 14744, - "time_per_iteration": 2.770298957824707 - }, - { - "auxiliary_loss_clip": 0.01092438, - "auxiliary_loss_mlp": 0.0077049, - "balance_loss_clip": 1.03708875, - "balance_loss_mlp": 1.00017428, - "epoch": 0.8865173605892078, - "flos": 18548759456640.0, - "grad_norm": 1.674319681826978, - "language_loss": 0.76905382, - "learning_rate": 1.3351223321138762e-07, - "loss": 0.78768307, - "num_input_tokens_seen": 317952090, - "step": 14745, - "time_per_iteration": 2.5852930545806885 - }, - { - "auxiliary_loss_clip": 0.01107283, - "auxiliary_loss_mlp": 0.00770198, - "balance_loss_clip": 1.03665972, - "balance_loss_mlp": 1.00020969, - "epoch": 0.8865774838418758, - "flos": 19025868833280.0, - "grad_norm": 2.096197494867565, - "language_loss": 0.77457786, - "learning_rate": 1.3337235704410454e-07, - "loss": 0.79335266, - "num_input_tokens_seen": 317970370, - "step": 14746, - "time_per_iteration": 2.573580026626587 - }, - { - "auxiliary_loss_clip": 0.01086009, - "auxiliary_loss_mlp": 0.01035282, - "balance_loss_clip": 1.0389545, - "balance_loss_mlp": 1.02199042, - "epoch": 0.8866376070945439, - "flos": 22163168482560.0, - "grad_norm": 2.0671047150936674, - "language_loss": 0.76368475, - "learning_rate": 1.3323255165952873e-07, - "loss": 0.78489769, - "num_input_tokens_seen": 317989125, - "step": 14747, - "time_per_iteration": 2.624581813812256 - }, - { - "auxiliary_loss_clip": 0.01082631, - "auxiliary_loss_mlp": 0.007697, - "balance_loss_clip": 1.03356695, - "balance_loss_mlp": 1.00016332, - "epoch": 0.8866977303472118, - "flos": 20704261685760.0, - "grad_norm": 1.7191098225964694, - "language_loss": 0.82627869, - "learning_rate": 1.3309281706296127e-07, - "loss": 0.84480202, - "num_input_tokens_seen": 318007820, - "step": 14748, - "time_per_iteration": 2.67641282081604 - }, - { - "auxiliary_loss_clip": 0.01099108, - "auxiliary_loss_mlp": 0.01035329, - "balance_loss_clip": 1.03823555, - "balance_loss_mlp": 1.02254343, - "epoch": 0.8867578535998798, - "flos": 48794448533760.0, - "grad_norm": 1.734559291294961, - "language_loss": 0.77452302, - "learning_rate": 1.3295315325970148e-07, - "loss": 0.79586738, - "num_input_tokens_seen": 318030435, - "step": 14749, - "time_per_iteration": 2.84780216217041 - }, - { - "auxiliary_loss_clip": 0.01044507, - "auxiliary_loss_mlp": 0.00770609, - "balance_loss_clip": 1.0361824, - "balance_loss_mlp": 1.0002166, - "epoch": 0.8868179768525477, - "flos": 21105312013440.0, - "grad_norm": 1.9998873550656093, - "language_loss": 0.69549012, - "learning_rate": 1.328135602550451e-07, - "loss": 0.71364129, - "num_input_tokens_seen": 318049465, - "step": 14750, - "time_per_iteration": 2.714163064956665 - }, - { - "auxiliary_loss_clip": 0.01097015, - "auxiliary_loss_mlp": 0.01037086, - "balance_loss_clip": 1.03601015, - "balance_loss_mlp": 1.02531457, - "epoch": 0.8868781001052157, - "flos": 21830922656640.0, - "grad_norm": 1.7739396110359793, - "language_loss": 0.59205437, - "learning_rate": 1.3267403805428546e-07, - "loss": 0.61339533, - "num_input_tokens_seen": 318067760, - "step": 14751, - "time_per_iteration": 2.627380609512329 - }, - { - "auxiliary_loss_clip": 0.01109091, - "auxiliary_loss_mlp": 0.01032178, - "balance_loss_clip": 1.03742659, - "balance_loss_mlp": 1.01954198, - "epoch": 0.8869382233578836, - "flos": 13516418073600.0, - "grad_norm": 2.24908964745291, - "language_loss": 0.81063259, - "learning_rate": 1.3253458666271344e-07, - "loss": 0.83204532, - "num_input_tokens_seen": 318082785, - "step": 14752, - "time_per_iteration": 2.548123836517334 - }, - { - "auxiliary_loss_clip": 0.01090623, - "auxiliary_loss_mlp": 0.01030836, - "balance_loss_clip": 1.0372467, - "balance_loss_mlp": 1.01752663, - "epoch": 0.8869983466105517, - "flos": 22704988210560.0, - "grad_norm": 2.2048651718571963, - "language_loss": 0.80242121, - "learning_rate": 1.3239520608561793e-07, - "loss": 0.82363582, - "num_input_tokens_seen": 318101925, - "step": 14753, - "time_per_iteration": 2.634328842163086 - }, - { - "auxiliary_loss_clip": 0.01106619, - "auxiliary_loss_mlp": 0.01033372, - "balance_loss_clip": 1.03586936, - "balance_loss_mlp": 1.02094483, - "epoch": 0.8870584698632196, - "flos": 15340751884800.0, - "grad_norm": 1.7988782645876313, - "language_loss": 0.65307128, - "learning_rate": 1.3225589632828248e-07, - "loss": 0.67447126, - "num_input_tokens_seen": 318119945, - "step": 14754, - "time_per_iteration": 2.5431594848632812 - }, - { - "auxiliary_loss_clip": 0.01110421, - "auxiliary_loss_mlp": 0.01031473, - "balance_loss_clip": 1.03804612, - "balance_loss_mlp": 1.01891458, - "epoch": 0.8871185931158876, - "flos": 26615624699520.0, - "grad_norm": 2.066769305763262, - "language_loss": 0.7433095, - "learning_rate": 1.3211665739599065e-07, - "loss": 0.76472843, - "num_input_tokens_seen": 318139685, - "step": 14755, - "time_per_iteration": 2.5941274166107178 - }, - { - "auxiliary_loss_clip": 0.01084027, - "auxiliary_loss_mlp": 0.01033231, - "balance_loss_clip": 1.03191829, - "balance_loss_mlp": 1.01927161, - "epoch": 0.8871787163685555, - "flos": 21799034357760.0, - "grad_norm": 1.4611791846416269, - "language_loss": 0.77831644, - "learning_rate": 1.3197748929402262e-07, - "loss": 0.79948902, - "num_input_tokens_seen": 318160375, - "step": 14756, - "time_per_iteration": 2.7859320640563965 - }, - { - "auxiliary_loss_clip": 0.01089134, - "auxiliary_loss_mlp": 0.01034258, - "balance_loss_clip": 1.0377419, - "balance_loss_mlp": 1.02150822, - "epoch": 0.8872388396212235, - "flos": 14902964922240.0, - "grad_norm": 2.1242136336639414, - "language_loss": 0.76514637, - "learning_rate": 1.3183839202765535e-07, - "loss": 0.78638029, - "num_input_tokens_seen": 318177995, - "step": 14757, - "time_per_iteration": 2.637052059173584 - }, - { - "auxiliary_loss_clip": 0.01048992, - "auxiliary_loss_mlp": 0.01036807, - "balance_loss_clip": 1.03180897, - "balance_loss_mlp": 1.02424812, - "epoch": 0.8872989628738914, - "flos": 26432157006720.0, - "grad_norm": 1.8638847565120873, - "language_loss": 0.68011022, - "learning_rate": 1.316993656021632e-07, - "loss": 0.70096827, - "num_input_tokens_seen": 318197030, - "step": 14758, - "time_per_iteration": 2.852785348892212 - }, - { - "auxiliary_loss_clip": 0.01108807, - "auxiliary_loss_mlp": 0.01035484, - "balance_loss_clip": 1.03694987, - "balance_loss_mlp": 1.02170336, - "epoch": 0.8873590861265594, - "flos": 48142562555520.0, - "grad_norm": 3.8430422864269356, - "language_loss": 0.69252694, - "learning_rate": 1.3156041002281915e-07, - "loss": 0.71396983, - "num_input_tokens_seen": 318221780, - "step": 14759, - "time_per_iteration": 2.795743942260742 - }, - { - "auxiliary_loss_clip": 0.01106874, - "auxiliary_loss_mlp": 0.01033127, - "balance_loss_clip": 1.03578842, - "balance_loss_mlp": 1.0204078, - "epoch": 0.8874192093792275, - "flos": 18332972501760.0, - "grad_norm": 1.7718328909299519, - "language_loss": 0.74552894, - "learning_rate": 1.3142152529489092e-07, - "loss": 0.76692903, - "num_input_tokens_seen": 318239710, - "step": 14760, - "time_per_iteration": 2.5467581748962402 - }, - { - "auxiliary_loss_clip": 0.01090454, - "auxiliary_loss_mlp": 0.01034099, - "balance_loss_clip": 1.03724909, - "balance_loss_mlp": 1.02152801, - "epoch": 0.8874793326318954, - "flos": 17894215872000.0, - "grad_norm": 2.98069622772717, - "language_loss": 0.76240933, - "learning_rate": 1.3128271142364565e-07, - "loss": 0.78365493, - "num_input_tokens_seen": 318257425, - "step": 14761, - "time_per_iteration": 2.641578197479248 - }, - { - "auxiliary_loss_clip": 0.01110247, - "auxiliary_loss_mlp": 0.01036705, - "balance_loss_clip": 1.03677571, - "balance_loss_mlp": 1.02415276, - "epoch": 0.8875394558845634, - "flos": 31102231772160.0, - "grad_norm": 1.7387210735055314, - "language_loss": 0.61797994, - "learning_rate": 1.3114396841434717e-07, - "loss": 0.63944948, - "num_input_tokens_seen": 318278485, - "step": 14762, - "time_per_iteration": 2.6031076908111572 - }, - { - "auxiliary_loss_clip": 0.0109514, - "auxiliary_loss_mlp": 0.01034017, - "balance_loss_clip": 1.03448653, - "balance_loss_mlp": 1.02041471, - "epoch": 0.8875995791372313, - "flos": 21142048648320.0, - "grad_norm": 1.7775042808478863, - "language_loss": 0.63881463, - "learning_rate": 1.3100529627225697e-07, - "loss": 0.66010618, - "num_input_tokens_seen": 318297560, - "step": 14763, - "time_per_iteration": 2.5757639408111572 - }, - { - "auxiliary_loss_clip": 0.01082921, - "auxiliary_loss_mlp": 0.00770724, - "balance_loss_clip": 1.03658664, - "balance_loss_mlp": 1.00031114, - "epoch": 0.8876597023898993, - "flos": 17455136019840.0, - "grad_norm": 2.009886034280031, - "language_loss": 0.71068102, - "learning_rate": 1.3086669500263335e-07, - "loss": 0.72921747, - "num_input_tokens_seen": 318313060, - "step": 14764, - "time_per_iteration": 2.6084272861480713 - }, - { - "auxiliary_loss_clip": 0.01113096, - "auxiliary_loss_mlp": 0.01036581, - "balance_loss_clip": 1.03770447, - "balance_loss_mlp": 1.02350986, - "epoch": 0.8877198256425672, - "flos": 22707933125760.0, - "grad_norm": 2.026362447668411, - "language_loss": 0.66558039, - "learning_rate": 1.3072816461073166e-07, - "loss": 0.68707716, - "num_input_tokens_seen": 318332030, - "step": 14765, - "time_per_iteration": 2.547609806060791 - }, - { - "auxiliary_loss_clip": 0.01068364, - "auxiliary_loss_mlp": 0.01027425, - "balance_loss_clip": 1.0361414, - "balance_loss_mlp": 1.01615393, - "epoch": 0.8877799488952353, - "flos": 24535104111360.0, - "grad_norm": 1.776562659783939, - "language_loss": 0.7677201, - "learning_rate": 1.3058970510180568e-07, - "loss": 0.78867799, - "num_input_tokens_seen": 318351090, - "step": 14766, - "time_per_iteration": 2.6800858974456787 - }, - { - "auxiliary_loss_clip": 0.01076267, - "auxiliary_loss_mlp": 0.01031608, - "balance_loss_clip": 1.03301024, - "balance_loss_mlp": 1.01951444, - "epoch": 0.8878400721479032, - "flos": 20959191486720.0, - "grad_norm": 1.9295075111293745, - "language_loss": 0.73348194, - "learning_rate": 1.3045131648110496e-07, - "loss": 0.75456071, - "num_input_tokens_seen": 318372000, - "step": 14767, - "time_per_iteration": 2.605175256729126 - }, - { - "auxiliary_loss_clip": 0.0110506, - "auxiliary_loss_mlp": 0.0103329, - "balance_loss_clip": 1.03636575, - "balance_loss_mlp": 1.02166677, - "epoch": 0.8879001954005712, - "flos": 25295260659840.0, - "grad_norm": 1.7081054740283463, - "language_loss": 0.70993221, - "learning_rate": 1.303129987538778e-07, - "loss": 0.73131573, - "num_input_tokens_seen": 318391530, - "step": 14768, - "time_per_iteration": 2.5900521278381348 - }, - { - "auxiliary_loss_clip": 0.01093069, - "auxiliary_loss_mlp": 0.01031049, - "balance_loss_clip": 1.03431153, - "balance_loss_mlp": 1.01872909, - "epoch": 0.8879603186532391, - "flos": 23185329811200.0, - "grad_norm": 1.9932230097119548, - "language_loss": 0.70054102, - "learning_rate": 1.3017475192536932e-07, - "loss": 0.72178221, - "num_input_tokens_seen": 318410690, - "step": 14769, - "time_per_iteration": 2.5676157474517822 - }, - { - "auxiliary_loss_clip": 0.01080083, - "auxiliary_loss_mlp": 0.01031718, - "balance_loss_clip": 1.03361869, - "balance_loss_mlp": 1.01996374, - "epoch": 0.8880204419059071, - "flos": 13655427707520.0, - "grad_norm": 2.022851777751632, - "language_loss": 0.67168438, - "learning_rate": 1.3003657600082174e-07, - "loss": 0.69280243, - "num_input_tokens_seen": 318427380, - "step": 14770, - "time_per_iteration": 2.6081535816192627 - }, - { - "auxiliary_loss_clip": 0.01094329, - "auxiliary_loss_mlp": 0.0103224, - "balance_loss_clip": 1.03698134, - "balance_loss_mlp": 1.01974094, - "epoch": 0.888080565158575, - "flos": 20631865824000.0, - "grad_norm": 1.758420734888644, - "language_loss": 0.65032512, - "learning_rate": 1.2989847098547424e-07, - "loss": 0.67159081, - "num_input_tokens_seen": 318448530, - "step": 14771, - "time_per_iteration": 2.6046431064605713 - }, - { - "auxiliary_loss_clip": 0.01084735, - "auxiliary_loss_mlp": 0.01028682, - "balance_loss_clip": 1.03336012, - "balance_loss_mlp": 1.01646304, - "epoch": 0.888140688411243, - "flos": 28620014411520.0, - "grad_norm": 1.5574826798475248, - "language_loss": 0.82247543, - "learning_rate": 1.2976043688456396e-07, - "loss": 0.84360957, - "num_input_tokens_seen": 318468655, - "step": 14772, - "time_per_iteration": 2.7616019248962402 - }, - { - "auxiliary_loss_clip": 0.01079313, - "auxiliary_loss_mlp": 0.01024388, - "balance_loss_clip": 1.03151107, - "balance_loss_mlp": 1.01318812, - "epoch": 0.8882008116639111, - "flos": 25520241496320.0, - "grad_norm": 1.4234953861106903, - "language_loss": 0.76511365, - "learning_rate": 1.296224737033258e-07, - "loss": 0.78615069, - "num_input_tokens_seen": 318488740, - "step": 14773, - "time_per_iteration": 6.201860427856445 - }, - { - "auxiliary_loss_clip": 0.01083069, - "auxiliary_loss_mlp": 0.01026892, - "balance_loss_clip": 1.03498697, - "balance_loss_mlp": 1.01539993, - "epoch": 0.888260934916579, - "flos": 27673696650240.0, - "grad_norm": 1.9867965850384985, - "language_loss": 0.75016356, - "learning_rate": 1.294845814469907e-07, - "loss": 0.77126318, - "num_input_tokens_seen": 318508810, - "step": 14774, - "time_per_iteration": 2.675410270690918 - }, - { - "auxiliary_loss_clip": 0.0106342, - "auxiliary_loss_mlp": 0.00770109, - "balance_loss_clip": 1.03600156, - "balance_loss_mlp": 1.0002929, - "epoch": 0.888321058169247, - "flos": 21611077464960.0, - "grad_norm": 2.763852995715363, - "language_loss": 0.72647572, - "learning_rate": 1.2934676012078783e-07, - "loss": 0.74481106, - "num_input_tokens_seen": 318526860, - "step": 14775, - "time_per_iteration": 2.768602132797241 - }, - { - "auxiliary_loss_clip": 0.01106903, - "auxiliary_loss_mlp": 0.01032645, - "balance_loss_clip": 1.03619862, - "balance_loss_mlp": 1.02074754, - "epoch": 0.8883811814219149, - "flos": 18149109759360.0, - "grad_norm": 1.6801831073555262, - "language_loss": 0.79828447, - "learning_rate": 1.292090097299432e-07, - "loss": 0.81967992, - "num_input_tokens_seen": 318545180, - "step": 14776, - "time_per_iteration": 5.694887399673462 - }, - { - "auxiliary_loss_clip": 0.01103137, - "auxiliary_loss_mlp": 0.01037272, - "balance_loss_clip": 1.03596711, - "balance_loss_mlp": 1.02439141, - "epoch": 0.8884413046745829, - "flos": 28324648874880.0, - "grad_norm": 2.2946403260680746, - "language_loss": 0.69125223, - "learning_rate": 1.290713302796802e-07, - "loss": 0.71265632, - "num_input_tokens_seen": 318564350, - "step": 14777, - "time_per_iteration": 2.6711583137512207 - }, - { - "auxiliary_loss_clip": 0.01091804, - "auxiliary_loss_mlp": 0.01033294, - "balance_loss_clip": 1.0316937, - "balance_loss_mlp": 1.0206517, - "epoch": 0.8885014279272508, - "flos": 15158756649600.0, - "grad_norm": 1.744756226696034, - "language_loss": 0.71044743, - "learning_rate": 1.2893372177522e-07, - "loss": 0.73169839, - "num_input_tokens_seen": 318582275, - "step": 14778, - "time_per_iteration": 2.5861656665802 - }, - { - "auxiliary_loss_clip": 0.01107976, - "auxiliary_loss_mlp": 0.01029742, - "balance_loss_clip": 1.035954, - "balance_loss_mlp": 1.01773167, - "epoch": 0.8885615511799189, - "flos": 19099593498240.0, - "grad_norm": 3.336773779105202, - "language_loss": 0.77618229, - "learning_rate": 1.287961842217804e-07, - "loss": 0.79755944, - "num_input_tokens_seen": 318601230, - "step": 14779, - "time_per_iteration": 2.5533976554870605 - }, - { - "auxiliary_loss_clip": 0.01005115, - "auxiliary_loss_mlp": 0.00999201, - "balance_loss_clip": 1.00668931, - "balance_loss_mlp": 0.99814647, - "epoch": 0.8886216744325868, - "flos": 51186567605760.0, - "grad_norm": 0.8737021693090686, - "language_loss": 0.56777793, - "learning_rate": 1.2865871762457747e-07, - "loss": 0.58782107, - "num_input_tokens_seen": 318645595, - "step": 14780, - "time_per_iteration": 2.964052438735962 - }, - { - "auxiliary_loss_clip": 0.01028008, - "auxiliary_loss_mlp": 0.01000581, - "balance_loss_clip": 1.00549233, - "balance_loss_mlp": 0.99967527, - "epoch": 0.8886817976852548, - "flos": 61612981263360.0, - "grad_norm": 0.7941416089367529, - "language_loss": 0.62353128, - "learning_rate": 1.2852132198882326e-07, - "loss": 0.64381719, - "num_input_tokens_seen": 318707850, - "step": 14781, - "time_per_iteration": 3.181043863296509 - }, - { - "auxiliary_loss_clip": 0.00963643, - "auxiliary_loss_mlp": 0.01006454, - "balance_loss_clip": 1.01169443, - "balance_loss_mlp": 1.00542259, - "epoch": 0.8887419209379227, - "flos": 60646946935680.0, - "grad_norm": 0.7977280372936163, - "language_loss": 0.58126575, - "learning_rate": 1.2838399731972805e-07, - "loss": 0.60096675, - "num_input_tokens_seen": 318764915, - "step": 14782, - "time_per_iteration": 3.2847399711608887 - }, - { - "auxiliary_loss_clip": 0.01106535, - "auxiliary_loss_mlp": 0.01029737, - "balance_loss_clip": 1.03703415, - "balance_loss_mlp": 1.01808405, - "epoch": 0.8888020441905907, - "flos": 29205861235200.0, - "grad_norm": 1.567088080659984, - "language_loss": 0.65746784, - "learning_rate": 1.2824674362249922e-07, - "loss": 0.67883062, - "num_input_tokens_seen": 318785660, - "step": 14783, - "time_per_iteration": 2.841909646987915 - }, - { - "auxiliary_loss_clip": 0.0111198, - "auxiliary_loss_mlp": 0.01035797, - "balance_loss_clip": 1.03731346, - "balance_loss_mlp": 1.02278554, - "epoch": 0.8888621674432586, - "flos": 22162701605760.0, - "grad_norm": 1.5542908685815622, - "language_loss": 0.77494425, - "learning_rate": 1.281095609023415e-07, - "loss": 0.796422, - "num_input_tokens_seen": 318806080, - "step": 14784, - "time_per_iteration": 2.597027540206909 - }, - { - "auxiliary_loss_clip": 0.01083474, - "auxiliary_loss_mlp": 0.01034386, - "balance_loss_clip": 1.03568983, - "balance_loss_mlp": 1.02146983, - "epoch": 0.8889222906959267, - "flos": 27672834723840.0, - "grad_norm": 15.751964718050344, - "language_loss": 0.6070298, - "learning_rate": 1.279724491644565e-07, - "loss": 0.6282084, - "num_input_tokens_seen": 318826445, - "step": 14785, - "time_per_iteration": 2.7380104064941406 - }, - { - "auxiliary_loss_clip": 0.01073801, - "auxiliary_loss_mlp": 0.01035055, - "balance_loss_clip": 1.03463125, - "balance_loss_mlp": 1.02198935, - "epoch": 0.8889824139485947, - "flos": 14168627274240.0, - "grad_norm": 1.8296614273320466, - "language_loss": 0.65093189, - "learning_rate": 1.278354084140445e-07, - "loss": 0.67202044, - "num_input_tokens_seen": 318843915, - "step": 14786, - "time_per_iteration": 2.774667978286743 - }, - { - "auxiliary_loss_clip": 0.01076771, - "auxiliary_loss_mlp": 0.00771472, - "balance_loss_clip": 1.03597903, - "balance_loss_mlp": 1.00018907, - "epoch": 0.8890425372012626, - "flos": 12853003829760.0, - "grad_norm": 2.7089037879672624, - "language_loss": 0.85490113, - "learning_rate": 1.276984386563009e-07, - "loss": 0.87338352, - "num_input_tokens_seen": 318859670, - "step": 14787, - "time_per_iteration": 2.6649672985076904 - }, - { - "auxiliary_loss_clip": 0.01084573, - "auxiliary_loss_mlp": 0.01030017, - "balance_loss_clip": 1.03664386, - "balance_loss_mlp": 1.01775646, - "epoch": 0.8891026604539306, - "flos": 21689291329920.0, - "grad_norm": 2.1717922675442094, - "language_loss": 0.70967633, - "learning_rate": 1.2756153989642027e-07, - "loss": 0.73082221, - "num_input_tokens_seen": 318877855, - "step": 14788, - "time_per_iteration": 2.832113027572632 - }, - { - "auxiliary_loss_clip": 0.01105551, - "auxiliary_loss_mlp": 0.01029542, - "balance_loss_clip": 1.03675187, - "balance_loss_mlp": 1.01768684, - "epoch": 0.8891627837065985, - "flos": 21871430219520.0, - "grad_norm": 1.719821366869133, - "language_loss": 0.69946039, - "learning_rate": 1.274247121395935e-07, - "loss": 0.72081137, - "num_input_tokens_seen": 318896045, - "step": 14789, - "time_per_iteration": 2.6089062690734863 - }, - { - "auxiliary_loss_clip": 0.01100862, - "auxiliary_loss_mlp": 0.01030753, - "balance_loss_clip": 1.03966713, - "balance_loss_mlp": 1.01853967, - "epoch": 0.8892229069592665, - "flos": 21580230660480.0, - "grad_norm": 1.4843336736816757, - "language_loss": 0.70594078, - "learning_rate": 1.2728795539100956e-07, - "loss": 0.72725689, - "num_input_tokens_seen": 318915515, - "step": 14790, - "time_per_iteration": 2.6216959953308105 - }, - { - "auxiliary_loss_clip": 0.01088486, - "auxiliary_loss_mlp": 0.01027627, - "balance_loss_clip": 1.03701544, - "balance_loss_mlp": 1.01623666, - "epoch": 0.8892830302119344, - "flos": 23075981832960.0, - "grad_norm": 1.8356781695474516, - "language_loss": 0.72947121, - "learning_rate": 1.2715126965585387e-07, - "loss": 0.75063235, - "num_input_tokens_seen": 318934305, - "step": 14791, - "time_per_iteration": 2.7145907878875732 - }, - { - "auxiliary_loss_clip": 0.01078142, - "auxiliary_loss_mlp": 0.01033106, - "balance_loss_clip": 1.03699768, - "balance_loss_mlp": 1.02080894, - "epoch": 0.8893431534646025, - "flos": 23072139077760.0, - "grad_norm": 1.7972192952628998, - "language_loss": 0.74159795, - "learning_rate": 1.2701465493931008e-07, - "loss": 0.76271045, - "num_input_tokens_seen": 318953880, - "step": 14792, - "time_per_iteration": 2.689258575439453 - }, - { - "auxiliary_loss_clip": 0.01041593, - "auxiliary_loss_mlp": 0.01037472, - "balance_loss_clip": 1.03244281, - "balance_loss_mlp": 1.02338743, - "epoch": 0.8894032767172704, - "flos": 22454978572800.0, - "grad_norm": 1.9651444821716726, - "language_loss": 0.66043746, - "learning_rate": 1.2687811124655801e-07, - "loss": 0.68122816, - "num_input_tokens_seen": 318971395, - "step": 14793, - "time_per_iteration": 2.73183012008667 - }, - { - "auxiliary_loss_clip": 0.01079264, - "auxiliary_loss_mlp": 0.01031003, - "balance_loss_clip": 1.03588605, - "balance_loss_mlp": 1.01774693, - "epoch": 0.8894633999699384, - "flos": 25338246261120.0, - "grad_norm": 1.671450826366533, - "language_loss": 0.71594059, - "learning_rate": 1.2674163858277552e-07, - "loss": 0.73704326, - "num_input_tokens_seen": 318990580, - "step": 14794, - "time_per_iteration": 2.7042224407196045 - }, - { - "auxiliary_loss_clip": 0.01099154, - "auxiliary_loss_mlp": 0.01034078, - "balance_loss_clip": 1.03871417, - "balance_loss_mlp": 1.02107775, - "epoch": 0.8895235232226063, - "flos": 20994096528000.0, - "grad_norm": 1.7160866842792333, - "language_loss": 0.75350553, - "learning_rate": 1.2660523695313785e-07, - "loss": 0.77483785, - "num_input_tokens_seen": 319010040, - "step": 14795, - "time_per_iteration": 2.5956714153289795 - }, - { - "auxiliary_loss_clip": 0.01003077, - "auxiliary_loss_mlp": 0.00999947, - "balance_loss_clip": 1.00997567, - "balance_loss_mlp": 0.99892819, - "epoch": 0.8895836464752743, - "flos": 69732956764800.0, - "grad_norm": 0.7671992564200865, - "language_loss": 0.56051087, - "learning_rate": 1.2646890636281727e-07, - "loss": 0.58054101, - "num_input_tokens_seen": 319063860, - "step": 14796, - "time_per_iteration": 3.078346014022827 - }, - { - "auxiliary_loss_clip": 0.01111208, - "auxiliary_loss_mlp": 0.01030403, - "balance_loss_clip": 1.03733194, - "balance_loss_mlp": 1.01666403, - "epoch": 0.8896437697279422, - "flos": 23221815050880.0, - "grad_norm": 1.7603233439489925, - "language_loss": 0.70576537, - "learning_rate": 1.263326468169843e-07, - "loss": 0.72718143, - "num_input_tokens_seen": 319082335, - "step": 14797, - "time_per_iteration": 2.576277017593384 - }, - { - "auxiliary_loss_clip": 0.01017004, - "auxiliary_loss_mlp": 0.01002028, - "balance_loss_clip": 1.01229072, - "balance_loss_mlp": 1.00102699, - "epoch": 0.8897038929806103, - "flos": 70752711882240.0, - "grad_norm": 0.7794431422590221, - "language_loss": 0.5794524, - "learning_rate": 1.2619645832080417e-07, - "loss": 0.59964275, - "num_input_tokens_seen": 319147075, - "step": 14798, - "time_per_iteration": 3.218555212020874 - }, - { - "auxiliary_loss_clip": 0.01097846, - "auxiliary_loss_mlp": 0.01029628, - "balance_loss_clip": 1.03578901, - "balance_loss_mlp": 1.01621103, - "epoch": 0.8897640162332782, - "flos": 19245103493760.0, - "grad_norm": 1.822201303291812, - "language_loss": 0.7947073, - "learning_rate": 1.2606034087944251e-07, - "loss": 0.81598198, - "num_input_tokens_seen": 319166630, - "step": 14799, - "time_per_iteration": 2.6169159412384033 - }, - { - "auxiliary_loss_clip": 0.01018703, - "auxiliary_loss_mlp": 0.01003426, - "balance_loss_clip": 1.0060674, - "balance_loss_mlp": 1.00247824, - "epoch": 0.8898241394859462, - "flos": 41356275039360.0, - "grad_norm": 0.8879772067683966, - "language_loss": 0.58123994, - "learning_rate": 1.2592429449806053e-07, - "loss": 0.60146117, - "num_input_tokens_seen": 319221865, - "step": 14800, - "time_per_iteration": 3.090841054916382 - }, - { - "auxiliary_loss_clip": 0.01099994, - "auxiliary_loss_mlp": 0.01034134, - "balance_loss_clip": 1.03885424, - "balance_loss_mlp": 1.02245724, - "epoch": 0.8898842627386142, - "flos": 18986295024000.0, - "grad_norm": 1.5949008184751121, - "language_loss": 0.66234601, - "learning_rate": 1.2578831918181698e-07, - "loss": 0.68368721, - "num_input_tokens_seen": 319240710, - "step": 14801, - "time_per_iteration": 2.5842556953430176 - }, - { - "auxiliary_loss_clip": 0.01073781, - "auxiliary_loss_mlp": 0.01036561, - "balance_loss_clip": 1.03613853, - "balance_loss_mlp": 1.02248251, - "epoch": 0.8899443859912821, - "flos": 13217173868160.0, - "grad_norm": 2.7903408199323496, - "language_loss": 0.7563743, - "learning_rate": 1.256524149358682e-07, - "loss": 0.77747774, - "num_input_tokens_seen": 319256495, - "step": 14802, - "time_per_iteration": 2.6613779067993164 - }, - { - "auxiliary_loss_clip": 0.01091905, - "auxiliary_loss_mlp": 0.01030893, - "balance_loss_clip": 1.03725505, - "balance_loss_mlp": 1.01900768, - "epoch": 0.8900045092439501, - "flos": 22674680110080.0, - "grad_norm": 1.8379867635089826, - "language_loss": 0.73482311, - "learning_rate": 1.2551658176536805e-07, - "loss": 0.75605106, - "num_input_tokens_seen": 319273620, - "step": 14803, - "time_per_iteration": 2.675278425216675 - }, - { - "auxiliary_loss_clip": 0.01081084, - "auxiliary_loss_mlp": 0.01036017, - "balance_loss_clip": 1.03560674, - "balance_loss_mlp": 1.02347028, - "epoch": 0.890064632496618, - "flos": 21141617685120.0, - "grad_norm": 1.8905881524985035, - "language_loss": 0.71867836, - "learning_rate": 1.2538081967546664e-07, - "loss": 0.73984939, - "num_input_tokens_seen": 319291720, - "step": 14804, - "time_per_iteration": 2.637640953063965 - }, - { - "auxiliary_loss_clip": 0.01093595, - "auxiliary_loss_mlp": 0.0103033, - "balance_loss_clip": 1.03525758, - "balance_loss_mlp": 1.01756275, - "epoch": 0.8901247557492861, - "flos": 23397058529280.0, - "grad_norm": 1.8040298487747064, - "language_loss": 0.81148362, - "learning_rate": 1.252451286713123e-07, - "loss": 0.8327229, - "num_input_tokens_seen": 319310380, - "step": 14805, - "time_per_iteration": 2.6288270950317383 - }, - { - "auxiliary_loss_clip": 0.01100196, - "auxiliary_loss_mlp": 0.01029911, - "balance_loss_clip": 1.03652012, - "balance_loss_mlp": 1.01704848, - "epoch": 0.890184879001954, - "flos": 29169591477120.0, - "grad_norm": 2.314720607634321, - "language_loss": 0.67655379, - "learning_rate": 1.251095087580505e-07, - "loss": 0.69785488, - "num_input_tokens_seen": 319331765, - "step": 14806, - "time_per_iteration": 2.701447010040283 - }, - { - "auxiliary_loss_clip": 0.01082875, - "auxiliary_loss_mlp": 0.0103147, - "balance_loss_clip": 1.03327703, - "balance_loss_mlp": 1.0191853, - "epoch": 0.890245002254622, - "flos": 14427830793600.0, - "grad_norm": 1.860806449184193, - "language_loss": 0.6715759, - "learning_rate": 1.2497395994082438e-07, - "loss": 0.6927194, - "num_input_tokens_seen": 319349135, - "step": 14807, - "time_per_iteration": 2.6722195148468018 - }, - { - "auxiliary_loss_clip": 0.01082528, - "auxiliary_loss_mlp": 0.01030238, - "balance_loss_clip": 1.03432024, - "balance_loss_mlp": 1.01869881, - "epoch": 0.8903051255072899, - "flos": 22382187661440.0, - "grad_norm": 1.7718809355965226, - "language_loss": 0.75224829, - "learning_rate": 1.248384822247732e-07, - "loss": 0.77337593, - "num_input_tokens_seen": 319368410, - "step": 14808, - "time_per_iteration": 2.640336036682129 - }, - { - "auxiliary_loss_clip": 0.0107632, - "auxiliary_loss_mlp": 0.01030363, - "balance_loss_clip": 1.0358547, - "balance_loss_mlp": 1.01811981, - "epoch": 0.8903652487599579, - "flos": 20777375819520.0, - "grad_norm": 12.27562140526227, - "language_loss": 0.81525707, - "learning_rate": 1.2470307561503513e-07, - "loss": 0.83632386, - "num_input_tokens_seen": 319387535, - "step": 14809, - "time_per_iteration": 2.6590049266815186 - }, - { - "auxiliary_loss_clip": 0.01099147, - "auxiliary_loss_mlp": 0.0103237, - "balance_loss_clip": 1.03634763, - "balance_loss_mlp": 1.02048481, - "epoch": 0.8904253720126258, - "flos": 24424499157120.0, - "grad_norm": 2.1293350080998747, - "language_loss": 0.68579054, - "learning_rate": 1.2456774011674442e-07, - "loss": 0.70710576, - "num_input_tokens_seen": 319407210, - "step": 14810, - "time_per_iteration": 2.601858139038086 - }, - { - "auxiliary_loss_clip": 0.01074787, - "auxiliary_loss_mlp": 0.01028878, - "balance_loss_clip": 1.03349328, - "balance_loss_mlp": 1.01603925, - "epoch": 0.8904854952652939, - "flos": 19463871277440.0, - "grad_norm": 2.1159045694256124, - "language_loss": 0.70389724, - "learning_rate": 1.2443247573503257e-07, - "loss": 0.72493392, - "num_input_tokens_seen": 319425340, - "step": 14811, - "time_per_iteration": 2.652963876724243 - }, - { - "auxiliary_loss_clip": 0.01077147, - "auxiliary_loss_mlp": 0.00770711, - "balance_loss_clip": 1.03590763, - "balance_loss_mlp": 1.00018835, - "epoch": 0.8905456185179618, - "flos": 50800741666560.0, - "grad_norm": 2.4983735528249182, - "language_loss": 0.66081208, - "learning_rate": 1.2429728247502924e-07, - "loss": 0.67929065, - "num_input_tokens_seen": 319448150, - "step": 14812, - "time_per_iteration": 4.636792182922363 - }, - { - "auxiliary_loss_clip": 0.01060766, - "auxiliary_loss_mlp": 0.01031191, - "balance_loss_clip": 1.03516841, - "balance_loss_mlp": 1.01957428, - "epoch": 0.8906057417706298, - "flos": 17784867893760.0, - "grad_norm": 1.7995676770850613, - "language_loss": 0.68747163, - "learning_rate": 1.24162160341861e-07, - "loss": 0.70839119, - "num_input_tokens_seen": 319466115, - "step": 14813, - "time_per_iteration": 4.193687200546265 - }, - { - "auxiliary_loss_clip": 0.01084515, - "auxiliary_loss_mlp": 0.01040982, - "balance_loss_clip": 1.03238082, - "balance_loss_mlp": 1.02447116, - "epoch": 0.8906658650232978, - "flos": 21944867575680.0, - "grad_norm": 3.876084846753058, - "language_loss": 0.75659066, - "learning_rate": 1.2402710934065198e-07, - "loss": 0.77784562, - "num_input_tokens_seen": 319485255, - "step": 14814, - "time_per_iteration": 2.6463520526885986 - }, - { - "auxiliary_loss_clip": 0.01100125, - "auxiliary_loss_mlp": 0.01030982, - "balance_loss_clip": 1.03604758, - "balance_loss_mlp": 1.01783299, - "epoch": 0.8907259882759657, - "flos": 21287810039040.0, - "grad_norm": 2.0688857636131734, - "language_loss": 0.74374747, - "learning_rate": 1.2389212947652229e-07, - "loss": 0.76505852, - "num_input_tokens_seen": 319501800, - "step": 14815, - "time_per_iteration": 4.110440492630005 - }, - { - "auxiliary_loss_clip": 0.01068212, - "auxiliary_loss_mlp": 0.01031366, - "balance_loss_clip": 1.03145206, - "balance_loss_mlp": 1.01870036, - "epoch": 0.8907861115286337, - "flos": 20120426023680.0, - "grad_norm": 2.123609537525354, - "language_loss": 0.75087738, - "learning_rate": 1.237572207545914e-07, - "loss": 0.77187324, - "num_input_tokens_seen": 319520415, - "step": 14816, - "time_per_iteration": 4.275893926620483 - }, - { - "auxiliary_loss_clip": 0.01086936, - "auxiliary_loss_mlp": 0.01031641, - "balance_loss_clip": 1.03456652, - "balance_loss_mlp": 1.01913631, - "epoch": 0.8908462347813016, - "flos": 20084156265600.0, - "grad_norm": 1.7646380277651805, - "language_loss": 0.77968502, - "learning_rate": 1.2362238317997476e-07, - "loss": 0.80087078, - "num_input_tokens_seen": 319538410, - "step": 14817, - "time_per_iteration": 2.694972515106201 - }, - { - "auxiliary_loss_clip": 0.01001525, - "auxiliary_loss_mlp": 0.01001251, - "balance_loss_clip": 1.00726986, - "balance_loss_mlp": 1.00008297, - "epoch": 0.8909063580339697, - "flos": 65503649790720.0, - "grad_norm": 0.7456782467309502, - "language_loss": 0.56431699, - "learning_rate": 1.2348761675778517e-07, - "loss": 0.58434474, - "num_input_tokens_seen": 319602565, - "step": 14818, - "time_per_iteration": 3.234703540802002 - }, - { - "auxiliary_loss_clip": 0.01059509, - "auxiliary_loss_mlp": 0.01034162, - "balance_loss_clip": 1.0355022, - "balance_loss_mlp": 1.02152014, - "epoch": 0.8909664812866376, - "flos": 29863062426240.0, - "grad_norm": 1.7646144877343908, - "language_loss": 0.64705229, - "learning_rate": 1.2335292149313325e-07, - "loss": 0.66798902, - "num_input_tokens_seen": 319624645, - "step": 14819, - "time_per_iteration": 2.7950870990753174 - }, - { - "auxiliary_loss_clip": 0.01097653, - "auxiliary_loss_mlp": 0.01030768, - "balance_loss_clip": 1.03588057, - "balance_loss_mlp": 1.01794112, - "epoch": 0.8910266045393056, - "flos": 25447127362560.0, - "grad_norm": 2.2154062344071312, - "language_loss": 0.78340304, - "learning_rate": 1.2321829739112731e-07, - "loss": 0.80468726, - "num_input_tokens_seen": 319644040, - "step": 14820, - "time_per_iteration": 2.6286323070526123 - }, - { - "auxiliary_loss_clip": 0.01070015, - "auxiliary_loss_mlp": 0.00769687, - "balance_loss_clip": 1.03580856, - "balance_loss_mlp": 1.00026464, - "epoch": 0.8910867277919735, - "flos": 24499121662080.0, - "grad_norm": 1.856207333364825, - "language_loss": 0.76575708, - "learning_rate": 1.2308374445687087e-07, - "loss": 0.78415406, - "num_input_tokens_seen": 319663930, - "step": 14821, - "time_per_iteration": 2.710040330886841 - }, - { - "auxiliary_loss_clip": 0.01014485, - "auxiliary_loss_mlp": 0.00751361, - "balance_loss_clip": 1.00564671, - "balance_loss_mlp": 0.99960148, - "epoch": 0.8911468510446415, - "flos": 60688136856960.0, - "grad_norm": 0.7925502121917717, - "language_loss": 0.59283942, - "learning_rate": 1.2294926269546712e-07, - "loss": 0.61049783, - "num_input_tokens_seen": 319721245, - "step": 14822, - "time_per_iteration": 3.042881727218628 - }, - { - "auxiliary_loss_clip": 0.0109278, - "auxiliary_loss_mlp": 0.0103595, - "balance_loss_clip": 1.03620601, - "balance_loss_mlp": 1.02346885, - "epoch": 0.8912069742973094, - "flos": 25337492075520.0, - "grad_norm": 2.0091476458751845, - "language_loss": 0.69135273, - "learning_rate": 1.2281485211201515e-07, - "loss": 0.71263999, - "num_input_tokens_seen": 319741200, - "step": 14823, - "time_per_iteration": 2.6208603382110596 - }, - { - "auxiliary_loss_clip": 0.01089302, - "auxiliary_loss_mlp": 0.01035298, - "balance_loss_clip": 1.03342748, - "balance_loss_mlp": 1.02241755, - "epoch": 0.8912670975499775, - "flos": 18223516782720.0, - "grad_norm": 1.5978850394355568, - "language_loss": 0.69198072, - "learning_rate": 1.2268051271161262e-07, - "loss": 0.71322668, - "num_input_tokens_seen": 319759265, - "step": 14824, - "time_per_iteration": 2.508863687515259 - }, - { - "auxiliary_loss_clip": 0.01058099, - "auxiliary_loss_mlp": 0.0103706, - "balance_loss_clip": 1.03319716, - "balance_loss_mlp": 1.02307105, - "epoch": 0.8913272208026454, - "flos": 26504481041280.0, - "grad_norm": 1.9932021748393736, - "language_loss": 0.70705098, - "learning_rate": 1.2254624449935303e-07, - "loss": 0.72800255, - "num_input_tokens_seen": 319777560, - "step": 14825, - "time_per_iteration": 2.654224157333374 - }, - { - "auxiliary_loss_clip": 0.01085791, - "auxiliary_loss_mlp": 0.01032779, - "balance_loss_clip": 1.03422439, - "balance_loss_mlp": 1.01951671, - "epoch": 0.8913873440553134, - "flos": 18802324540800.0, - "grad_norm": 1.827676363511503, - "language_loss": 0.71464586, - "learning_rate": 1.2241204748032786e-07, - "loss": 0.7358315, - "num_input_tokens_seen": 319794125, - "step": 14826, - "time_per_iteration": 2.5119738578796387 - }, - { - "auxiliary_loss_clip": 0.0109572, - "auxiliary_loss_mlp": 0.01028162, - "balance_loss_clip": 1.03623497, - "balance_loss_mlp": 1.01646793, - "epoch": 0.8914474673079814, - "flos": 20884892204160.0, - "grad_norm": 2.0101591277509243, - "language_loss": 0.75315851, - "learning_rate": 1.2227792165962615e-07, - "loss": 0.77439737, - "num_input_tokens_seen": 319810310, - "step": 14827, - "time_per_iteration": 2.4767954349517822 - }, - { - "auxiliary_loss_clip": 0.01100376, - "auxiliary_loss_mlp": 0.0103277, - "balance_loss_clip": 1.03736746, - "balance_loss_mlp": 1.02037859, - "epoch": 0.8915075905606493, - "flos": 20952439729920.0, - "grad_norm": 2.25546419546836, - "language_loss": 0.78480828, - "learning_rate": 1.221438670423336e-07, - "loss": 0.80613977, - "num_input_tokens_seen": 319828505, - "step": 14828, - "time_per_iteration": 2.4681639671325684 - }, - { - "auxiliary_loss_clip": 0.01068483, - "auxiliary_loss_mlp": 0.01032448, - "balance_loss_clip": 1.0356158, - "balance_loss_mlp": 1.01987755, - "epoch": 0.8915677138133173, - "flos": 23076305055360.0, - "grad_norm": 1.7213243632049227, - "language_loss": 0.75276792, - "learning_rate": 1.2200988363353392e-07, - "loss": 0.77377725, - "num_input_tokens_seen": 319848680, - "step": 14829, - "time_per_iteration": 2.6100480556488037 - }, - { - "auxiliary_loss_clip": 0.01108879, - "auxiliary_loss_mlp": 0.01034466, - "balance_loss_clip": 1.03637552, - "balance_loss_mlp": 1.02299213, - "epoch": 0.8916278370659853, - "flos": 23440259612160.0, - "grad_norm": 1.5513516933839315, - "language_loss": 0.84576946, - "learning_rate": 1.2187597143830773e-07, - "loss": 0.867203, - "num_input_tokens_seen": 319868835, - "step": 14830, - "time_per_iteration": 2.5831005573272705 - }, - { - "auxiliary_loss_clip": 0.01093236, - "auxiliary_loss_mlp": 0.01030146, - "balance_loss_clip": 1.03435206, - "balance_loss_mlp": 1.01864195, - "epoch": 0.8916879603186533, - "flos": 25160488830720.0, - "grad_norm": 1.3477965843038384, - "language_loss": 0.74875772, - "learning_rate": 1.2174213046173299e-07, - "loss": 0.76999158, - "num_input_tokens_seen": 319891585, - "step": 14831, - "time_per_iteration": 2.7232887744903564 - }, - { - "auxiliary_loss_clip": 0.01100471, - "auxiliary_loss_mlp": 0.01029141, - "balance_loss_clip": 1.03624547, - "balance_loss_mlp": 1.01663041, - "epoch": 0.8917480835713212, - "flos": 20229845829120.0, - "grad_norm": 1.8265908258617016, - "language_loss": 0.72934276, - "learning_rate": 1.216083607088847e-07, - "loss": 0.75063884, - "num_input_tokens_seen": 319910315, - "step": 14832, - "time_per_iteration": 2.616689443588257 - }, - { - "auxiliary_loss_clip": 0.01045927, - "auxiliary_loss_mlp": 0.00770458, - "balance_loss_clip": 1.03222537, - "balance_loss_mlp": 1.00019884, - "epoch": 0.8918082068239892, - "flos": 26101922342400.0, - "grad_norm": 3.1162015685797972, - "language_loss": 0.66912735, - "learning_rate": 1.214746621848355e-07, - "loss": 0.68729126, - "num_input_tokens_seen": 319932275, - "step": 14833, - "time_per_iteration": 2.8316352367401123 - }, - { - "auxiliary_loss_clip": 0.01106023, - "auxiliary_loss_mlp": 0.01034649, - "balance_loss_clip": 1.03974128, - "balance_loss_mlp": 1.02139854, - "epoch": 0.8918683300766571, - "flos": 24831439315200.0, - "grad_norm": 1.9997597617659202, - "language_loss": 0.73976004, - "learning_rate": 1.2134103489465575e-07, - "loss": 0.76116675, - "num_input_tokens_seen": 319955335, - "step": 14834, - "time_per_iteration": 2.7026243209838867 - }, - { - "auxiliary_loss_clip": 0.01065475, - "auxiliary_loss_mlp": 0.0103389, - "balance_loss_clip": 1.03502977, - "balance_loss_mlp": 1.02165282, - "epoch": 0.8919284533293251, - "flos": 22305158945280.0, - "grad_norm": 1.9340437806838273, - "language_loss": 0.78773081, - "learning_rate": 1.2120747884341188e-07, - "loss": 0.80872452, - "num_input_tokens_seen": 319973990, - "step": 14835, - "time_per_iteration": 2.64371395111084 - }, - { - "auxiliary_loss_clip": 0.01103945, - "auxiliary_loss_mlp": 0.0103108, - "balance_loss_clip": 1.03464007, - "balance_loss_mlp": 1.01960659, - "epoch": 0.891988576581993, - "flos": 30373532559360.0, - "grad_norm": 1.6176322749361627, - "language_loss": 0.74194962, - "learning_rate": 1.210739940361689e-07, - "loss": 0.76329982, - "num_input_tokens_seen": 319995555, - "step": 14836, - "time_per_iteration": 2.6271843910217285 - }, - { - "auxiliary_loss_clip": 0.01087557, - "auxiliary_loss_mlp": 0.01032238, - "balance_loss_clip": 1.03471708, - "balance_loss_mlp": 1.01970363, - "epoch": 0.8920486998346611, - "flos": 15552947479680.0, - "grad_norm": 3.2025172292231625, - "language_loss": 0.68644428, - "learning_rate": 1.2094058047798838e-07, - "loss": 0.7076422, - "num_input_tokens_seen": 320012385, - "step": 14837, - "time_per_iteration": 2.612969160079956 - }, - { - "auxiliary_loss_clip": 0.01050841, - "auxiliary_loss_mlp": 0.0103232, - "balance_loss_clip": 1.03323007, - "balance_loss_mlp": 1.01922536, - "epoch": 0.892108823087329, - "flos": 21214983214080.0, - "grad_norm": 1.653711357861068, - "language_loss": 0.67707741, - "learning_rate": 1.2080723817392913e-07, - "loss": 0.697909, - "num_input_tokens_seen": 320032390, - "step": 14838, - "time_per_iteration": 2.7335948944091797 - }, - { - "auxiliary_loss_clip": 0.01096545, - "auxiliary_loss_mlp": 0.01031606, - "balance_loss_clip": 1.03442597, - "balance_loss_mlp": 1.0184747, - "epoch": 0.892168946339997, - "flos": 21978982517760.0, - "grad_norm": 2.2756639024172722, - "language_loss": 0.76234394, - "learning_rate": 1.2067396712904777e-07, - "loss": 0.78362542, - "num_input_tokens_seen": 320052885, - "step": 14839, - "time_per_iteration": 2.6222732067108154 - }, - { - "auxiliary_loss_clip": 0.00999654, - "auxiliary_loss_mlp": 0.00751271, - "balance_loss_clip": 1.00644863, - "balance_loss_mlp": 0.99961644, - "epoch": 0.892229069592665, - "flos": 67475289277440.0, - "grad_norm": 0.6958789552427521, - "language_loss": 0.49386242, - "learning_rate": 1.205407673483978e-07, - "loss": 0.51137161, - "num_input_tokens_seen": 320113685, - "step": 14840, - "time_per_iteration": 3.1971607208251953 - }, - { - "auxiliary_loss_clip": 0.0111346, - "auxiliary_loss_mlp": 0.01031899, - "balance_loss_clip": 1.03685474, - "balance_loss_mlp": 1.01813662, - "epoch": 0.8922891928453329, - "flos": 19459561645440.0, - "grad_norm": 2.2275620590123575, - "language_loss": 0.64040601, - "learning_rate": 1.2040763883703074e-07, - "loss": 0.66185963, - "num_input_tokens_seen": 320130810, - "step": 14841, - "time_per_iteration": 2.5630903244018555 - }, - { - "auxiliary_loss_clip": 0.01073374, - "auxiliary_loss_mlp": 0.00768866, - "balance_loss_clip": 1.03585565, - "balance_loss_mlp": 1.00014949, - "epoch": 0.8923493160980009, - "flos": 23367396873600.0, - "grad_norm": 1.4260666189370539, - "language_loss": 0.68198895, - "learning_rate": 1.2027458159999438e-07, - "loss": 0.70041138, - "num_input_tokens_seen": 320152170, - "step": 14842, - "time_per_iteration": 2.7487709522247314 - }, - { - "auxiliary_loss_clip": 0.01107456, - "auxiliary_loss_mlp": 0.01036165, - "balance_loss_clip": 1.03805566, - "balance_loss_mlp": 1.02464318, - "epoch": 0.8924094393506689, - "flos": 26177047637760.0, - "grad_norm": 2.0828434512728387, - "language_loss": 0.80424309, - "learning_rate": 1.2014159564233373e-07, - "loss": 0.8256793, - "num_input_tokens_seen": 320172360, - "step": 14843, - "time_per_iteration": 2.6367337703704834 - }, - { - "auxiliary_loss_clip": 0.01084909, - "auxiliary_loss_mlp": 0.01033483, - "balance_loss_clip": 1.03361225, - "balance_loss_mlp": 1.01991701, - "epoch": 0.8924695626033369, - "flos": 22018520413440.0, - "grad_norm": 2.382089830168308, - "language_loss": 0.68838096, - "learning_rate": 1.2000868096909257e-07, - "loss": 0.70956492, - "num_input_tokens_seen": 320192130, - "step": 14844, - "time_per_iteration": 2.6400132179260254 - }, - { - "auxiliary_loss_clip": 0.01064131, - "auxiliary_loss_mlp": 0.0103004, - "balance_loss_clip": 1.03404808, - "balance_loss_mlp": 1.01779175, - "epoch": 0.8925296858560048, - "flos": 14793940166400.0, - "grad_norm": 2.2436852387053134, - "language_loss": 0.91622436, - "learning_rate": 1.1987583758531038e-07, - "loss": 0.93716609, - "num_input_tokens_seen": 320207760, - "step": 14845, - "time_per_iteration": 2.74336314201355 - }, - { - "auxiliary_loss_clip": 0.01089634, - "auxiliary_loss_mlp": 0.01031578, - "balance_loss_clip": 1.03469348, - "balance_loss_mlp": 1.01985955, - "epoch": 0.8925898091086728, - "flos": 22346636175360.0, - "grad_norm": 1.8155448981855211, - "language_loss": 0.72219133, - "learning_rate": 1.1974306549602476e-07, - "loss": 0.74340343, - "num_input_tokens_seen": 320225325, - "step": 14846, - "time_per_iteration": 2.628924608230591 - }, - { - "auxiliary_loss_clip": 0.01084746, - "auxiliary_loss_mlp": 0.01033411, - "balance_loss_clip": 1.03907979, - "balance_loss_mlp": 1.02118051, - "epoch": 0.8926499323613407, - "flos": 45806322067200.0, - "grad_norm": 2.129136173165777, - "language_loss": 0.56949878, - "learning_rate": 1.1961036470627094e-07, - "loss": 0.5906803, - "num_input_tokens_seen": 320247645, - "step": 14847, - "time_per_iteration": 2.8942604064941406 - }, - { - "auxiliary_loss_clip": 0.01071094, - "auxiliary_loss_mlp": 0.01034217, - "balance_loss_clip": 1.03545833, - "balance_loss_mlp": 1.0223918, - "epoch": 0.8927100556140087, - "flos": 22127042378880.0, - "grad_norm": 2.417347097790333, - "language_loss": 0.76218295, - "learning_rate": 1.1947773522108052e-07, - "loss": 0.78323603, - "num_input_tokens_seen": 320266005, - "step": 14848, - "time_per_iteration": 2.703596830368042 - }, - { - "auxiliary_loss_clip": 0.01043101, - "auxiliary_loss_mlp": 0.01046178, - "balance_loss_clip": 1.0295552, - "balance_loss_mlp": 1.03208137, - "epoch": 0.8927701788666766, - "flos": 28330143655680.0, - "grad_norm": 2.5994238973384554, - "language_loss": 0.69254899, - "learning_rate": 1.1934517704548251e-07, - "loss": 0.71344179, - "num_input_tokens_seen": 320285555, - "step": 14849, - "time_per_iteration": 2.7903876304626465 - }, - { - "auxiliary_loss_clip": 0.01099654, - "auxiliary_loss_mlp": 0.01032823, - "balance_loss_clip": 1.03864908, - "balance_loss_mlp": 1.02075911, - "epoch": 0.8928303021193447, - "flos": 25294973351040.0, - "grad_norm": 1.9684228103737367, - "language_loss": 0.80747259, - "learning_rate": 1.1921269018450364e-07, - "loss": 0.8287974, - "num_input_tokens_seen": 320305395, - "step": 14850, - "time_per_iteration": 2.6187615394592285 - }, - { - "auxiliary_loss_clip": 0.01087788, - "auxiliary_loss_mlp": 0.01037651, - "balance_loss_clip": 1.03636372, - "balance_loss_mlp": 1.02547944, - "epoch": 0.8928904253720126, - "flos": 22236713579520.0, - "grad_norm": 1.6645229603446685, - "language_loss": 0.74605459, - "learning_rate": 1.1908027464316872e-07, - "loss": 0.76730895, - "num_input_tokens_seen": 320324220, - "step": 14851, - "time_per_iteration": 2.6631858348846436 - }, - { - "auxiliary_loss_clip": 0.0108452, - "auxiliary_loss_mlp": 0.01029205, - "balance_loss_clip": 1.03504527, - "balance_loss_mlp": 1.01692009, - "epoch": 0.8929505486246806, - "flos": 27092374940160.0, - "grad_norm": 1.5560164927466833, - "language_loss": 0.78718781, - "learning_rate": 1.1894793042649775e-07, - "loss": 0.80832505, - "num_input_tokens_seen": 320347195, - "step": 14852, - "time_per_iteration": 5.973539113998413 - }, - { - "auxiliary_loss_clip": 0.01091326, - "auxiliary_loss_mlp": 0.01033169, - "balance_loss_clip": 1.03806448, - "balance_loss_mlp": 1.0212301, - "epoch": 0.8930106718773486, - "flos": 23039352938880.0, - "grad_norm": 2.4577931840380596, - "language_loss": 0.69120765, - "learning_rate": 1.1881565753951006e-07, - "loss": 0.71245253, - "num_input_tokens_seen": 320366850, - "step": 14853, - "time_per_iteration": 2.6630473136901855 - }, - { - "auxiliary_loss_clip": 0.01060947, - "auxiliary_loss_mlp": 0.01032204, - "balance_loss_clip": 1.03697348, - "balance_loss_mlp": 1.01997924, - "epoch": 0.8930707951300165, - "flos": 35626652887680.0, - "grad_norm": 1.537977130569083, - "language_loss": 0.67207319, - "learning_rate": 1.1868345598722118e-07, - "loss": 0.69300473, - "num_input_tokens_seen": 320388895, - "step": 14854, - "time_per_iteration": 4.400064945220947 - }, - { - "auxiliary_loss_clip": 0.01081067, - "auxiliary_loss_mlp": 0.01040836, - "balance_loss_clip": 1.03309155, - "balance_loss_mlp": 1.02784824, - "epoch": 0.8931309183826845, - "flos": 23039891642880.0, - "grad_norm": 1.6742794068707105, - "language_loss": 0.74868983, - "learning_rate": 1.1855132577464399e-07, - "loss": 0.76990891, - "num_input_tokens_seen": 320408520, - "step": 14855, - "time_per_iteration": 4.200139284133911 - }, - { - "auxiliary_loss_clip": 0.01086542, - "auxiliary_loss_mlp": 0.01032762, - "balance_loss_clip": 1.03601122, - "balance_loss_mlp": 1.02056086, - "epoch": 0.8931910416353525, - "flos": 26504624695680.0, - "grad_norm": 2.5885445984861613, - "language_loss": 0.64431441, - "learning_rate": 1.1841926690678893e-07, - "loss": 0.6655075, - "num_input_tokens_seen": 320427400, - "step": 14856, - "time_per_iteration": 2.657810926437378 - }, - { - "auxiliary_loss_clip": 0.0110682, - "auxiliary_loss_mlp": 0.0102884, - "balance_loss_clip": 1.03531027, - "balance_loss_mlp": 1.01715207, - "epoch": 0.8932511648880205, - "flos": 24973609345920.0, - "grad_norm": 1.6750308846874502, - "language_loss": 0.66575366, - "learning_rate": 1.1828727938866378e-07, - "loss": 0.68711025, - "num_input_tokens_seen": 320447570, - "step": 14857, - "time_per_iteration": 2.644740343093872 - }, - { - "auxiliary_loss_clip": 0.01068637, - "auxiliary_loss_mlp": 0.01038826, - "balance_loss_clip": 1.04051542, - "balance_loss_mlp": 1.02599871, - "epoch": 0.8933112881406884, - "flos": 24460733001600.0, - "grad_norm": 2.232767512472365, - "language_loss": 0.75065112, - "learning_rate": 1.1815536322527408e-07, - "loss": 0.77172571, - "num_input_tokens_seen": 320464405, - "step": 14858, - "time_per_iteration": 2.7609682083129883 - }, - { - "auxiliary_loss_clip": 0.01096177, - "auxiliary_loss_mlp": 0.0102937, - "balance_loss_clip": 1.03594685, - "balance_loss_mlp": 1.01651311, - "epoch": 0.8933714113933564, - "flos": 28293083798400.0, - "grad_norm": 1.825199606882533, - "language_loss": 0.69551903, - "learning_rate": 1.1802351842162139e-07, - "loss": 0.71677446, - "num_input_tokens_seen": 320485525, - "step": 14859, - "time_per_iteration": 2.6836822032928467 - }, - { - "auxiliary_loss_clip": 0.01056346, - "auxiliary_loss_mlp": 0.01028474, - "balance_loss_clip": 1.03371429, - "balance_loss_mlp": 1.0170536, - "epoch": 0.8934315346460243, - "flos": 21434864319360.0, - "grad_norm": 1.6309895409207762, - "language_loss": 0.75540131, - "learning_rate": 1.1789174498270526e-07, - "loss": 0.77624959, - "num_input_tokens_seen": 320506725, - "step": 14860, - "time_per_iteration": 2.76859450340271 - }, - { - "auxiliary_loss_clip": 0.01086873, - "auxiliary_loss_mlp": 0.01033583, - "balance_loss_clip": 1.03512859, - "balance_loss_mlp": 1.02008855, - "epoch": 0.8934916578986923, - "flos": 23769596436480.0, - "grad_norm": 4.01916529302481, - "language_loss": 0.57677805, - "learning_rate": 1.1776004291352303e-07, - "loss": 0.59798259, - "num_input_tokens_seen": 320525425, - "step": 14861, - "time_per_iteration": 2.661344289779663 - }, - { - "auxiliary_loss_clip": 0.01078056, - "auxiliary_loss_mlp": 0.01033158, - "balance_loss_clip": 1.03267503, - "balance_loss_mlp": 1.02077198, - "epoch": 0.8935517811513602, - "flos": 18916161719040.0, - "grad_norm": 1.9140763695424603, - "language_loss": 0.63545376, - "learning_rate": 1.176284122190685e-07, - "loss": 0.6565659, - "num_input_tokens_seen": 320543010, - "step": 14862, - "time_per_iteration": 2.5856823921203613 - }, - { - "auxiliary_loss_clip": 0.01092562, - "auxiliary_loss_mlp": 0.01026666, - "balance_loss_clip": 1.03338671, - "balance_loss_mlp": 1.01455998, - "epoch": 0.8936119044040283, - "flos": 24061370613120.0, - "grad_norm": 2.1334167708433323, - "language_loss": 0.78088272, - "learning_rate": 1.1749685290433298e-07, - "loss": 0.80207497, - "num_input_tokens_seen": 320562180, - "step": 14863, - "time_per_iteration": 2.611900806427002 - }, - { - "auxiliary_loss_clip": 0.01080768, - "auxiliary_loss_mlp": 0.01037163, - "balance_loss_clip": 1.03352034, - "balance_loss_mlp": 1.02448487, - "epoch": 0.8936720276566962, - "flos": 21324079797120.0, - "grad_norm": 1.7735911629661039, - "language_loss": 0.71075487, - "learning_rate": 1.1736536497430627e-07, - "loss": 0.73193425, - "num_input_tokens_seen": 320580395, - "step": 14864, - "time_per_iteration": 2.691619873046875 - }, - { - "auxiliary_loss_clip": 0.01101616, - "auxiliary_loss_mlp": 0.01037124, - "balance_loss_clip": 1.03658938, - "balance_loss_mlp": 1.02402878, - "epoch": 0.8937321509093642, - "flos": 18406122549120.0, - "grad_norm": 2.399528351176047, - "language_loss": 0.76093769, - "learning_rate": 1.1723394843397283e-07, - "loss": 0.78232509, - "num_input_tokens_seen": 320599505, - "step": 14865, - "time_per_iteration": 2.6147727966308594 - }, - { - "auxiliary_loss_clip": 0.01069542, - "auxiliary_loss_mlp": 0.01032163, - "balance_loss_clip": 1.03533304, - "balance_loss_mlp": 1.02058804, - "epoch": 0.8937922741620322, - "flos": 22054754257920.0, - "grad_norm": 1.8011765216812077, - "language_loss": 0.72078204, - "learning_rate": 1.1710260328831668e-07, - "loss": 0.74179912, - "num_input_tokens_seen": 320619825, - "step": 14866, - "time_per_iteration": 2.7329297065734863 - }, - { - "auxiliary_loss_clip": 0.01100829, - "auxiliary_loss_mlp": 0.01029076, - "balance_loss_clip": 1.0382688, - "balance_loss_mlp": 1.01533771, - "epoch": 0.8938523974147001, - "flos": 25664386775040.0, - "grad_norm": 1.830929850281708, - "language_loss": 0.83762133, - "learning_rate": 1.1697132954231869e-07, - "loss": 0.8589204, - "num_input_tokens_seen": 320638515, - "step": 14867, - "time_per_iteration": 2.668128728866577 - }, - { - "auxiliary_loss_clip": 0.0109843, - "auxiliary_loss_mlp": 0.01029062, - "balance_loss_clip": 1.03669333, - "balance_loss_mlp": 1.01795816, - "epoch": 0.8939125206673681, - "flos": 25742852035200.0, - "grad_norm": 1.586495908389307, - "language_loss": 0.80449593, - "learning_rate": 1.168401272009567e-07, - "loss": 0.82577085, - "num_input_tokens_seen": 320659430, - "step": 14868, - "time_per_iteration": 2.680034637451172 - }, - { - "auxiliary_loss_clip": 0.01083053, - "auxiliary_loss_mlp": 0.01033575, - "balance_loss_clip": 1.03728485, - "balance_loss_mlp": 1.0209384, - "epoch": 0.8939726439200361, - "flos": 27344503480320.0, - "grad_norm": 1.8649016797962312, - "language_loss": 0.7731384, - "learning_rate": 1.167089962692056e-07, - "loss": 0.79430467, - "num_input_tokens_seen": 320679295, - "step": 14869, - "time_per_iteration": 2.745805263519287 - }, - { - "auxiliary_loss_clip": 0.01097268, - "auxiliary_loss_mlp": 0.00769412, - "balance_loss_clip": 1.03609347, - "balance_loss_mlp": 1.00023556, - "epoch": 0.8940327671727041, - "flos": 20338834671360.0, - "grad_norm": 1.4278023080407176, - "language_loss": 0.65314829, - "learning_rate": 1.1657793675203853e-07, - "loss": 0.67181504, - "num_input_tokens_seen": 320697535, - "step": 14870, - "time_per_iteration": 2.6284589767456055 - }, - { - "auxiliary_loss_clip": 0.00993024, - "auxiliary_loss_mlp": 0.0102124, - "balance_loss_clip": 1.00702477, - "balance_loss_mlp": 1.01970196, - "epoch": 0.894092890425372, - "flos": 58410573235200.0, - "grad_norm": 0.7966327834428544, - "language_loss": 0.55929744, - "learning_rate": 1.1644694865442461e-07, - "loss": 0.57944012, - "num_input_tokens_seen": 320758635, - "step": 14871, - "time_per_iteration": 3.3122901916503906 - }, - { - "auxiliary_loss_clip": 0.01091917, - "auxiliary_loss_mlp": 0.0103181, - "balance_loss_clip": 1.03682566, - "balance_loss_mlp": 1.02005625, - "epoch": 0.89415301367804, - "flos": 19829657427840.0, - "grad_norm": 1.9266754384359623, - "language_loss": 0.76406336, - "learning_rate": 1.16316031981331e-07, - "loss": 0.78530067, - "num_input_tokens_seen": 320777175, - "step": 14872, - "time_per_iteration": 2.6247551441192627 - }, - { - "auxiliary_loss_clip": 0.01094372, - "auxiliary_loss_mlp": 0.01031313, - "balance_loss_clip": 1.03704429, - "balance_loss_mlp": 1.02015495, - "epoch": 0.8942131369307079, - "flos": 25775781828480.0, - "grad_norm": 1.648018727425323, - "language_loss": 0.67068255, - "learning_rate": 1.1618518673772215e-07, - "loss": 0.69193947, - "num_input_tokens_seen": 320797670, - "step": 14873, - "time_per_iteration": 2.6552417278289795 - }, - { - "auxiliary_loss_clip": 0.01105979, - "auxiliary_loss_mlp": 0.01034727, - "balance_loss_clip": 1.03645134, - "balance_loss_mlp": 1.02241898, - "epoch": 0.8942732601833759, - "flos": 23149024139520.0, - "grad_norm": 1.5958829385063367, - "language_loss": 0.59345031, - "learning_rate": 1.1605441292856033e-07, - "loss": 0.61485744, - "num_input_tokens_seen": 320817410, - "step": 14874, - "time_per_iteration": 2.5860843658447266 - }, - { - "auxiliary_loss_clip": 0.01078313, - "auxiliary_loss_mlp": 0.01032167, - "balance_loss_clip": 1.03743887, - "balance_loss_mlp": 1.01903629, - "epoch": 0.8943333834360438, - "flos": 27855548231040.0, - "grad_norm": 1.8290237697003595, - "language_loss": 0.75576758, - "learning_rate": 1.1592371055880356e-07, - "loss": 0.7768724, - "num_input_tokens_seen": 320836745, - "step": 14875, - "time_per_iteration": 2.7420151233673096 - }, - { - "auxiliary_loss_clip": 0.01079183, - "auxiliary_loss_mlp": 0.0103557, - "balance_loss_clip": 1.03446269, - "balance_loss_mlp": 1.0205195, - "epoch": 0.8943935066887119, - "flos": 22163958581760.0, - "grad_norm": 2.3429928427333926, - "language_loss": 0.77405798, - "learning_rate": 1.1579307963340857e-07, - "loss": 0.79520553, - "num_input_tokens_seen": 320853305, - "step": 14876, - "time_per_iteration": 2.816397190093994 - }, - { - "auxiliary_loss_clip": 0.01096244, - "auxiliary_loss_mlp": 0.01025808, - "balance_loss_clip": 1.0358882, - "balance_loss_mlp": 1.01482916, - "epoch": 0.8944536299413798, - "flos": 21470056669440.0, - "grad_norm": 1.6703549010755179, - "language_loss": 0.78432184, - "learning_rate": 1.156625201573287e-07, - "loss": 0.80554235, - "num_input_tokens_seen": 320872885, - "step": 14877, - "time_per_iteration": 2.7098886966705322 - }, - { - "auxiliary_loss_clip": 0.01059905, - "auxiliary_loss_mlp": 0.01039763, - "balance_loss_clip": 1.03192687, - "balance_loss_mlp": 1.02515423, - "epoch": 0.8945137531940478, - "flos": 17748777703680.0, - "grad_norm": 2.0737748491478465, - "language_loss": 0.7512145, - "learning_rate": 1.155320321355151e-07, - "loss": 0.77221119, - "num_input_tokens_seen": 320889755, - "step": 14878, - "time_per_iteration": 2.6619186401367188 - }, - { - "auxiliary_loss_clip": 0.01094053, - "auxiliary_loss_mlp": 0.01030234, - "balance_loss_clip": 1.03389883, - "balance_loss_mlp": 1.01564312, - "epoch": 0.8945738764467158, - "flos": 21142264129920.0, - "grad_norm": 1.682076176326582, - "language_loss": 0.76145089, - "learning_rate": 1.1540161557291539e-07, - "loss": 0.78269374, - "num_input_tokens_seen": 320907860, - "step": 14879, - "time_per_iteration": 2.5775701999664307 - }, - { - "auxiliary_loss_clip": 0.01078076, - "auxiliary_loss_mlp": 0.0103149, - "balance_loss_clip": 1.03829026, - "balance_loss_mlp": 1.01952147, - "epoch": 0.8946339996993837, - "flos": 14903000835840.0, - "grad_norm": 1.842392268931871, - "language_loss": 0.74446988, - "learning_rate": 1.1527127047447538e-07, - "loss": 0.76556557, - "num_input_tokens_seen": 320925825, - "step": 14880, - "time_per_iteration": 2.665179967880249 - }, - { - "auxiliary_loss_clip": 0.0109132, - "auxiliary_loss_mlp": 0.01029879, - "balance_loss_clip": 1.03410816, - "balance_loss_mlp": 1.01687312, - "epoch": 0.8946941229520518, - "flos": 27382173868800.0, - "grad_norm": 1.5269173028094163, - "language_loss": 0.82799721, - "learning_rate": 1.1514099684513822e-07, - "loss": 0.84920919, - "num_input_tokens_seen": 320946165, - "step": 14881, - "time_per_iteration": 2.6503562927246094 - }, - { - "auxiliary_loss_clip": 0.01067605, - "auxiliary_loss_mlp": 0.00770988, - "balance_loss_clip": 1.03390133, - "balance_loss_mlp": 1.00022626, - "epoch": 0.8947542462047197, - "flos": 31796277338880.0, - "grad_norm": 1.614884288144251, - "language_loss": 0.67639142, - "learning_rate": 1.1501079468984287e-07, - "loss": 0.69477737, - "num_input_tokens_seen": 320969330, - "step": 14882, - "time_per_iteration": 2.7693512439727783 - }, - { - "auxiliary_loss_clip": 0.01085159, - "auxiliary_loss_mlp": 0.0103542, - "balance_loss_clip": 1.03287458, - "balance_loss_mlp": 1.0205431, - "epoch": 0.8948143694573877, - "flos": 20883599314560.0, - "grad_norm": 2.045453824962206, - "language_loss": 0.74976206, - "learning_rate": 1.1488066401352691e-07, - "loss": 0.77096784, - "num_input_tokens_seen": 320985055, - "step": 14883, - "time_per_iteration": 2.6624233722686768 - }, - { - "auxiliary_loss_clip": 0.01080827, - "auxiliary_loss_mlp": 0.01033735, - "balance_loss_clip": 1.03383732, - "balance_loss_mlp": 1.02153993, - "epoch": 0.8948744927100556, - "flos": 28215552291840.0, - "grad_norm": 1.5810148244458424, - "language_loss": 0.72292316, - "learning_rate": 1.147506048211253e-07, - "loss": 0.74406874, - "num_input_tokens_seen": 321004720, - "step": 14884, - "time_per_iteration": 2.6995975971221924 - }, - { - "auxiliary_loss_clip": 0.01076203, - "auxiliary_loss_mlp": 0.01030683, - "balance_loss_clip": 1.03102303, - "balance_loss_mlp": 1.0188036, - "epoch": 0.8949346159627236, - "flos": 21902672073600.0, - "grad_norm": 1.6922147897555293, - "language_loss": 0.75564313, - "learning_rate": 1.1462061711756987e-07, - "loss": 0.77671194, - "num_input_tokens_seen": 321022350, - "step": 14885, - "time_per_iteration": 2.628843069076538 - }, - { - "auxiliary_loss_clip": 0.01081812, - "auxiliary_loss_mlp": 0.01031169, - "balance_loss_clip": 1.03561521, - "balance_loss_mlp": 1.01841331, - "epoch": 0.8949947392153915, - "flos": 21359128492800.0, - "grad_norm": 1.9911058650536606, - "language_loss": 0.81962872, - "learning_rate": 1.1449070090778911e-07, - "loss": 0.84075844, - "num_input_tokens_seen": 321040450, - "step": 14886, - "time_per_iteration": 2.6610560417175293 - }, - { - "auxiliary_loss_clip": 0.01047486, - "auxiliary_loss_mlp": 0.01027777, - "balance_loss_clip": 1.03327608, - "balance_loss_mlp": 1.01596951, - "epoch": 0.8950548624680595, - "flos": 52445342799360.0, - "grad_norm": 1.5558434759275688, - "language_loss": 0.63781691, - "learning_rate": 1.1436085619671043e-07, - "loss": 0.65856951, - "num_input_tokens_seen": 321063970, - "step": 14887, - "time_per_iteration": 3.0324647426605225 - }, - { - "auxiliary_loss_clip": 0.01088528, - "auxiliary_loss_mlp": 0.01035221, - "balance_loss_clip": 1.03487492, - "balance_loss_mlp": 1.02251327, - "epoch": 0.8951149857207275, - "flos": 20121323863680.0, - "grad_norm": 1.8589921868531927, - "language_loss": 0.60964525, - "learning_rate": 1.1423108298925698e-07, - "loss": 0.63088268, - "num_input_tokens_seen": 321083840, - "step": 14888, - "time_per_iteration": 2.745520830154419 - }, - { - "auxiliary_loss_clip": 0.01110592, - "auxiliary_loss_mlp": 0.0103061, - "balance_loss_clip": 1.0367682, - "balance_loss_mlp": 1.01834917, - "epoch": 0.8951751089733955, - "flos": 29862631463040.0, - "grad_norm": 2.002662666178723, - "language_loss": 0.70275199, - "learning_rate": 1.1410138129034952e-07, - "loss": 0.72416401, - "num_input_tokens_seen": 321104165, - "step": 14889, - "time_per_iteration": 2.6176459789276123 - }, - { - "auxiliary_loss_clip": 0.01096532, - "auxiliary_loss_mlp": 0.00770989, - "balance_loss_clip": 1.03800106, - "balance_loss_mlp": 1.00024951, - "epoch": 0.8952352322260634, - "flos": 15262789415040.0, - "grad_norm": 2.7797851150305615, - "language_loss": 0.71586537, - "learning_rate": 1.1397175110490676e-07, - "loss": 0.73454058, - "num_input_tokens_seen": 321117290, - "step": 14890, - "time_per_iteration": 2.5783839225769043 - }, - { - "auxiliary_loss_clip": 0.0102349, - "auxiliary_loss_mlp": 0.00773622, - "balance_loss_clip": 1.02805948, - "balance_loss_mlp": 1.00013435, - "epoch": 0.8952953554787314, - "flos": 26798338206720.0, - "grad_norm": 1.483485143798382, - "language_loss": 0.75744319, - "learning_rate": 1.1384219243784454e-07, - "loss": 0.77541423, - "num_input_tokens_seen": 321137115, - "step": 14891, - "time_per_iteration": 6.244478225708008 - }, - { - "auxiliary_loss_clip": 0.01051483, - "auxiliary_loss_mlp": 0.01035049, - "balance_loss_clip": 1.03069568, - "balance_loss_mlp": 1.02226329, - "epoch": 0.8953554787313994, - "flos": 14137205852160.0, - "grad_norm": 2.0123273105882586, - "language_loss": 0.76453358, - "learning_rate": 1.1371270529407517e-07, - "loss": 0.7853989, - "num_input_tokens_seen": 321154490, - "step": 14892, - "time_per_iteration": 3.087535858154297 - }, - { - "auxiliary_loss_clip": 0.01093667, - "auxiliary_loss_mlp": 0.01032529, - "balance_loss_clip": 1.03796649, - "balance_loss_mlp": 1.02048337, - "epoch": 0.8954156019840673, - "flos": 25703314139520.0, - "grad_norm": 3.335988726881917, - "language_loss": 0.81619698, - "learning_rate": 1.1358328967850895e-07, - "loss": 0.83745897, - "num_input_tokens_seen": 321175625, - "step": 14893, - "time_per_iteration": 4.313986778259277 - }, - { - "auxiliary_loss_clip": 0.01061423, - "auxiliary_loss_mlp": 0.01032555, - "balance_loss_clip": 1.03349638, - "balance_loss_mlp": 1.02072382, - "epoch": 0.8954757252367354, - "flos": 21907987286400.0, - "grad_norm": 1.880542691848622, - "language_loss": 0.74994141, - "learning_rate": 1.1345394559605348e-07, - "loss": 0.77088118, - "num_input_tokens_seen": 321193895, - "step": 14894, - "time_per_iteration": 2.9463634490966797 - }, - { - "auxiliary_loss_clip": 0.01097897, - "auxiliary_loss_mlp": 0.0103303, - "balance_loss_clip": 1.03915453, - "balance_loss_mlp": 1.01995826, - "epoch": 0.8955358484894033, - "flos": 12970396454400.0, - "grad_norm": 1.9665552489767175, - "language_loss": 0.66606176, - "learning_rate": 1.1332467305161352e-07, - "loss": 0.68737108, - "num_input_tokens_seen": 321211610, - "step": 14895, - "time_per_iteration": 4.159812927246094 - }, - { - "auxiliary_loss_clip": 0.01099951, - "auxiliary_loss_mlp": 0.01029624, - "balance_loss_clip": 1.03752875, - "balance_loss_mlp": 1.01608145, - "epoch": 0.8955959717420713, - "flos": 17273966797440.0, - "grad_norm": 1.671045590451987, - "language_loss": 0.67131901, - "learning_rate": 1.1319547205009094e-07, - "loss": 0.69261479, - "num_input_tokens_seen": 321229805, - "step": 14896, - "time_per_iteration": 2.5856170654296875 - }, - { - "auxiliary_loss_clip": 0.01099928, - "auxiliary_loss_mlp": 0.01033038, - "balance_loss_clip": 1.03831029, - "balance_loss_mlp": 1.0207119, - "epoch": 0.8956560949947392, - "flos": 14793868339200.0, - "grad_norm": 1.8809584975485838, - "language_loss": 0.75465834, - "learning_rate": 1.1306634259638492e-07, - "loss": 0.77598798, - "num_input_tokens_seen": 321247165, - "step": 14897, - "time_per_iteration": 2.657931089401245 - }, - { - "auxiliary_loss_clip": 0.00994794, - "auxiliary_loss_mlp": 0.00751908, - "balance_loss_clip": 1.00807071, - "balance_loss_mlp": 0.99958485, - "epoch": 0.8957162182474072, - "flos": 63607817957760.0, - "grad_norm": 0.7439840356253357, - "language_loss": 0.55338937, - "learning_rate": 1.129372846953931e-07, - "loss": 0.57085639, - "num_input_tokens_seen": 321308425, - "step": 14898, - "time_per_iteration": 3.3162429332733154 - }, - { - "auxiliary_loss_clip": 0.01109726, - "auxiliary_loss_mlp": 0.00771113, - "balance_loss_clip": 1.03748989, - "balance_loss_mlp": 1.00012457, - "epoch": 0.8957763415000751, - "flos": 25009843190400.0, - "grad_norm": 1.500591280857772, - "language_loss": 0.70237386, - "learning_rate": 1.12808298352008e-07, - "loss": 0.72118223, - "num_input_tokens_seen": 321329295, - "step": 14899, - "time_per_iteration": 2.6604552268981934 - }, - { - "auxiliary_loss_clip": 0.01054108, - "auxiliary_loss_mlp": 0.0103587, - "balance_loss_clip": 1.03760815, - "balance_loss_mlp": 1.02217865, - "epoch": 0.8958364647527431, - "flos": 19828615933440.0, - "grad_norm": 1.672513533456995, - "language_loss": 0.73965251, - "learning_rate": 1.1267938357112106e-07, - "loss": 0.76055229, - "num_input_tokens_seen": 321347580, - "step": 14900, - "time_per_iteration": 2.7858917713165283 - }, - { - "auxiliary_loss_clip": 0.00999101, - "auxiliary_loss_mlp": 0.01000333, - "balance_loss_clip": 1.01374125, - "balance_loss_mlp": 0.99923056, - "epoch": 0.895896588005411, - "flos": 65537190115200.0, - "grad_norm": 0.793037706766976, - "language_loss": 0.61771894, - "learning_rate": 1.1255054035762124e-07, - "loss": 0.63771325, - "num_input_tokens_seen": 321407820, - "step": 14901, - "time_per_iteration": 3.225350856781006 - }, - { - "auxiliary_loss_clip": 0.01099179, - "auxiliary_loss_mlp": 0.01029055, - "balance_loss_clip": 1.03669286, - "balance_loss_mlp": 1.01680589, - "epoch": 0.8959567112580791, - "flos": 25591021246080.0, - "grad_norm": 1.6768583776386496, - "language_loss": 0.70434642, - "learning_rate": 1.1242176871639441e-07, - "loss": 0.72562879, - "num_input_tokens_seen": 321426745, - "step": 14902, - "time_per_iteration": 2.629722833633423 - }, - { - "auxiliary_loss_clip": 0.01080163, - "auxiliary_loss_mlp": 0.01030377, - "balance_loss_clip": 1.03510499, - "balance_loss_mlp": 1.01877761, - "epoch": 0.896016834510747, - "flos": 24201780877440.0, - "grad_norm": 1.8587409033889455, - "language_loss": 0.78276879, - "learning_rate": 1.1229306865232313e-07, - "loss": 0.80387414, - "num_input_tokens_seen": 321446165, - "step": 14903, - "time_per_iteration": 2.6630077362060547 - }, - { - "auxiliary_loss_clip": 0.01085975, - "auxiliary_loss_mlp": 0.01034269, - "balance_loss_clip": 1.03611159, - "balance_loss_mlp": 1.02067935, - "epoch": 0.896076957763415, - "flos": 23075945919360.0, - "grad_norm": 1.7273682997495312, - "language_loss": 0.73095953, - "learning_rate": 1.121644401702877e-07, - "loss": 0.75216204, - "num_input_tokens_seen": 321465285, - "step": 14904, - "time_per_iteration": 2.656641721725464 - }, - { - "auxiliary_loss_clip": 0.01097461, - "auxiliary_loss_mlp": 0.01028056, - "balance_loss_clip": 1.03512216, - "balance_loss_mlp": 1.0144484, - "epoch": 0.8961370810160829, - "flos": 22236605838720.0, - "grad_norm": 1.972644186412881, - "language_loss": 0.74508619, - "learning_rate": 1.12035883275166e-07, - "loss": 0.76634133, - "num_input_tokens_seen": 321483670, - "step": 14905, - "time_per_iteration": 2.5795624256134033 - }, - { - "auxiliary_loss_clip": 0.01096538, - "auxiliary_loss_mlp": 0.01032548, - "balance_loss_clip": 1.03503621, - "balance_loss_mlp": 1.02032363, - "epoch": 0.8961972042687509, - "flos": 23072318645760.0, - "grad_norm": 2.276578769172911, - "language_loss": 0.76414752, - "learning_rate": 1.1190739797183279e-07, - "loss": 0.78543842, - "num_input_tokens_seen": 321501190, - "step": 14906, - "time_per_iteration": 2.608065605163574 - }, - { - "auxiliary_loss_clip": 0.0109916, - "auxiliary_loss_mlp": 0.01034258, - "balance_loss_clip": 1.03820026, - "balance_loss_mlp": 1.02151513, - "epoch": 0.896257327521419, - "flos": 18185882307840.0, - "grad_norm": 1.6230699036233884, - "language_loss": 0.7410239, - "learning_rate": 1.1177898426515996e-07, - "loss": 0.76235807, - "num_input_tokens_seen": 321518540, - "step": 14907, - "time_per_iteration": 2.5740091800689697 - }, - { - "auxiliary_loss_clip": 0.01098288, - "auxiliary_loss_mlp": 0.0103583, - "balance_loss_clip": 1.03720152, - "balance_loss_mlp": 1.0237062, - "epoch": 0.8963174507740869, - "flos": 17895472848000.0, - "grad_norm": 3.741927314180935, - "language_loss": 0.82670319, - "learning_rate": 1.1165064216001785e-07, - "loss": 0.84804434, - "num_input_tokens_seen": 321536555, - "step": 14908, - "time_per_iteration": 2.5786521434783936 - }, - { - "auxiliary_loss_clip": 0.01086384, - "auxiliary_loss_mlp": 0.01031261, - "balance_loss_clip": 1.0361675, - "balance_loss_mlp": 1.01765347, - "epoch": 0.8963775740267549, - "flos": 21032269706880.0, - "grad_norm": 2.161346134185943, - "language_loss": 0.70245093, - "learning_rate": 1.1152237166127232e-07, - "loss": 0.72362739, - "num_input_tokens_seen": 321557655, - "step": 14909, - "time_per_iteration": 2.652540445327759 - }, - { - "auxiliary_loss_clip": 0.01076255, - "auxiliary_loss_mlp": 0.01036306, - "balance_loss_clip": 1.03869569, - "balance_loss_mlp": 1.02353907, - "epoch": 0.8964376972794228, - "flos": 23179619548800.0, - "grad_norm": 16.444537313084084, - "language_loss": 0.7209096, - "learning_rate": 1.113941727737877e-07, - "loss": 0.74203527, - "num_input_tokens_seen": 321576160, - "step": 14910, - "time_per_iteration": 2.6874682903289795 - }, - { - "auxiliary_loss_clip": 0.01095164, - "auxiliary_loss_mlp": 0.01028278, - "balance_loss_clip": 1.03482211, - "balance_loss_mlp": 1.01633346, - "epoch": 0.8964978205320908, - "flos": 24972998814720.0, - "grad_norm": 2.519588986898142, - "language_loss": 0.6361804, - "learning_rate": 1.1126604550242502e-07, - "loss": 0.65741479, - "num_input_tokens_seen": 321596205, - "step": 14911, - "time_per_iteration": 2.594196081161499 - }, - { - "auxiliary_loss_clip": 0.01082355, - "auxiliary_loss_mlp": 0.00770688, - "balance_loss_clip": 1.03677964, - "balance_loss_mlp": 1.00020111, - "epoch": 0.8965579437847587, - "flos": 19172025273600.0, - "grad_norm": 1.6361676394072804, - "language_loss": 0.74929178, - "learning_rate": 1.111379898520437e-07, - "loss": 0.76782227, - "num_input_tokens_seen": 321614800, - "step": 14912, - "time_per_iteration": 2.620948076248169 - }, - { - "auxiliary_loss_clip": 0.01083336, - "auxiliary_loss_mlp": 0.01037867, - "balance_loss_clip": 1.03508806, - "balance_loss_mlp": 1.02545714, - "epoch": 0.8966180670374267, - "flos": 24276690691200.0, - "grad_norm": 1.791048209942099, - "language_loss": 0.81890047, - "learning_rate": 1.1101000582749876e-07, - "loss": 0.84011245, - "num_input_tokens_seen": 321633445, - "step": 14913, - "time_per_iteration": 2.6343531608581543 - }, - { - "auxiliary_loss_clip": 0.01101255, - "auxiliary_loss_mlp": 0.01035811, - "balance_loss_clip": 1.03797901, - "balance_loss_mlp": 1.02259076, - "epoch": 0.8966781902900947, - "flos": 13553190622080.0, - "grad_norm": 3.5493075869596176, - "language_loss": 0.61391163, - "learning_rate": 1.1088209343364407e-07, - "loss": 0.63528228, - "num_input_tokens_seen": 321650890, - "step": 14914, - "time_per_iteration": 2.611363649368286 - }, - { - "auxiliary_loss_clip": 0.01005981, - "auxiliary_loss_mlp": 0.00999937, - "balance_loss_clip": 1.00627279, - "balance_loss_mlp": 0.99880487, - "epoch": 0.8967383135427627, - "flos": 65066114223360.0, - "grad_norm": 2.6237376103475905, - "language_loss": 0.5505228, - "learning_rate": 1.1075425267532956e-07, - "loss": 0.57058197, - "num_input_tokens_seen": 321710960, - "step": 14915, - "time_per_iteration": 3.191149950027466 - }, - { - "auxiliary_loss_clip": 0.01068433, - "auxiliary_loss_mlp": 0.01032694, - "balance_loss_clip": 1.03356564, - "balance_loss_mlp": 1.02107704, - "epoch": 0.8967984367954306, - "flos": 29713027317120.0, - "grad_norm": 1.506591711885427, - "language_loss": 0.71458489, - "learning_rate": 1.1062648355740289e-07, - "loss": 0.73559618, - "num_input_tokens_seen": 321733290, - "step": 14916, - "time_per_iteration": 2.7623350620269775 - }, - { - "auxiliary_loss_clip": 0.01087907, - "auxiliary_loss_mlp": 0.01030723, - "balance_loss_clip": 1.03692842, - "balance_loss_mlp": 1.01904035, - "epoch": 0.8968585600480986, - "flos": 25702488126720.0, - "grad_norm": 1.8194206716370875, - "language_loss": 0.77866107, - "learning_rate": 1.1049878608470931e-07, - "loss": 0.79984742, - "num_input_tokens_seen": 321753120, - "step": 14917, - "time_per_iteration": 2.6854681968688965 - }, - { - "auxiliary_loss_clip": 0.01102374, - "auxiliary_loss_mlp": 0.01041532, - "balance_loss_clip": 1.03805685, - "balance_loss_mlp": 1.02815735, - "epoch": 0.8969186833007665, - "flos": 30044698525440.0, - "grad_norm": 1.9817396257364666, - "language_loss": 0.6853829, - "learning_rate": 1.1037116026209137e-07, - "loss": 0.70682192, - "num_input_tokens_seen": 321772840, - "step": 14918, - "time_per_iteration": 2.6850335597991943 - }, - { - "auxiliary_loss_clip": 0.01059733, - "auxiliary_loss_mlp": 0.010324, - "balance_loss_clip": 1.0353421, - "balance_loss_mlp": 1.02087831, - "epoch": 0.8969788065534345, - "flos": 22818143030400.0, - "grad_norm": 1.8968334913567422, - "language_loss": 0.83584672, - "learning_rate": 1.102436060943881e-07, - "loss": 0.85676813, - "num_input_tokens_seen": 321791020, - "step": 14919, - "time_per_iteration": 2.7944953441619873 - }, - { - "auxiliary_loss_clip": 0.0110904, - "auxiliary_loss_mlp": 0.00771505, - "balance_loss_clip": 1.03592348, - "balance_loss_mlp": 1.00021255, - "epoch": 0.8970389298061026, - "flos": 13261488272640.0, - "grad_norm": 6.002098471284828, - "language_loss": 0.72274148, - "learning_rate": 1.1011612358643696e-07, - "loss": 0.74154693, - "num_input_tokens_seen": 321810075, - "step": 14920, - "time_per_iteration": 2.641122579574585 - }, - { - "auxiliary_loss_clip": 0.01096514, - "auxiliary_loss_mlp": 0.01031999, - "balance_loss_clip": 1.03508401, - "balance_loss_mlp": 1.01865947, - "epoch": 0.8970990530587705, - "flos": 10266071345280.0, - "grad_norm": 2.2258437639369753, - "language_loss": 0.90893173, - "learning_rate": 1.0998871274307164e-07, - "loss": 0.93021685, - "num_input_tokens_seen": 321822635, - "step": 14921, - "time_per_iteration": 2.5753695964813232 - }, - { - "auxiliary_loss_clip": 0.0105106, - "auxiliary_loss_mlp": 0.0103667, - "balance_loss_clip": 1.03290153, - "balance_loss_mlp": 1.02218616, - "epoch": 0.8971591763114385, - "flos": 20302708567680.0, - "grad_norm": 1.7221269856692987, - "language_loss": 0.73712015, - "learning_rate": 1.0986137356912384e-07, - "loss": 0.75799739, - "num_input_tokens_seen": 321841130, - "step": 14922, - "time_per_iteration": 2.796809673309326 - }, - { - "auxiliary_loss_clip": 0.01059125, - "auxiliary_loss_mlp": 0.01039549, - "balance_loss_clip": 1.0326159, - "balance_loss_mlp": 1.02563119, - "epoch": 0.8972192995641064, - "flos": 23257043314560.0, - "grad_norm": 1.7526261778537016, - "language_loss": 0.70386976, - "learning_rate": 1.097341060694219e-07, - "loss": 0.7248565, - "num_input_tokens_seen": 321859855, - "step": 14923, - "time_per_iteration": 2.716149091720581 - }, - { - "auxiliary_loss_clip": 0.01087701, - "auxiliary_loss_mlp": 0.01029185, - "balance_loss_clip": 1.03695786, - "balance_loss_mlp": 1.01585746, - "epoch": 0.8972794228167744, - "flos": 18369601395840.0, - "grad_norm": 2.5800290587382606, - "language_loss": 0.7121672, - "learning_rate": 1.0960691024879221e-07, - "loss": 0.73333609, - "num_input_tokens_seen": 321877990, - "step": 14924, - "time_per_iteration": 2.6310861110687256 - }, - { - "auxiliary_loss_clip": 0.01094366, - "auxiliary_loss_mlp": 0.01035713, - "balance_loss_clip": 1.03357565, - "balance_loss_mlp": 1.02411425, - "epoch": 0.8973395460694423, - "flos": 23952058548480.0, - "grad_norm": 1.524405249104344, - "language_loss": 0.720016, - "learning_rate": 1.0947978611205844e-07, - "loss": 0.7413168, - "num_input_tokens_seen": 321898120, - "step": 14925, - "time_per_iteration": 2.665548324584961 - }, - { - "auxiliary_loss_clip": 0.01087294, - "auxiliary_loss_mlp": 0.00773098, - "balance_loss_clip": 1.03590477, - "balance_loss_mlp": 1.0001936, - "epoch": 0.8973996693221103, - "flos": 24970843998720.0, - "grad_norm": 1.8138493402848186, - "language_loss": 0.82518828, - "learning_rate": 1.0935273366404008e-07, - "loss": 0.84379226, - "num_input_tokens_seen": 321918140, - "step": 14926, - "time_per_iteration": 2.6425201892852783 - }, - { - "auxiliary_loss_clip": 0.01054597, - "auxiliary_loss_mlp": 0.01030973, - "balance_loss_clip": 1.03193653, - "balance_loss_mlp": 1.0189445, - "epoch": 0.8974597925747783, - "flos": 25738937452800.0, - "grad_norm": 1.4975243359696364, - "language_loss": 0.7919172, - "learning_rate": 1.092257529095555e-07, - "loss": 0.81277287, - "num_input_tokens_seen": 321938580, - "step": 14927, - "time_per_iteration": 2.760615825653076 - }, - { - "auxiliary_loss_clip": 0.01081394, - "auxiliary_loss_mlp": 0.01029342, - "balance_loss_clip": 1.03361082, - "balance_loss_mlp": 1.01776123, - "epoch": 0.8975199158274463, - "flos": 38071918131840.0, - "grad_norm": 1.6317289116194253, - "language_loss": 0.66483474, - "learning_rate": 1.0909884385341994e-07, - "loss": 0.68594205, - "num_input_tokens_seen": 321961135, - "step": 14928, - "time_per_iteration": 2.778822898864746 - }, - { - "auxiliary_loss_clip": 0.01087431, - "auxiliary_loss_mlp": 0.01043461, - "balance_loss_clip": 1.03568482, - "balance_loss_mlp": 1.02808905, - "epoch": 0.8975800390801142, - "flos": 25411683617280.0, - "grad_norm": 5.313736387639944, - "language_loss": 0.70643723, - "learning_rate": 1.0897200650044602e-07, - "loss": 0.72774613, - "num_input_tokens_seen": 321980945, - "step": 14929, - "time_per_iteration": 2.7232232093811035 - }, - { - "auxiliary_loss_clip": 0.01089831, - "auxiliary_loss_mlp": 0.01031432, - "balance_loss_clip": 1.03910744, - "balance_loss_mlp": 1.01977956, - "epoch": 0.8976401623327822, - "flos": 21759604202880.0, - "grad_norm": 1.7936229016193426, - "language_loss": 0.68214059, - "learning_rate": 1.0884524085544256e-07, - "loss": 0.70335329, - "num_input_tokens_seen": 322000350, - "step": 14930, - "time_per_iteration": 4.204017162322998 - }, - { - "auxiliary_loss_clip": 0.01078251, - "auxiliary_loss_mlp": 0.01028551, - "balance_loss_clip": 1.03327012, - "balance_loss_mlp": 1.01617682, - "epoch": 0.8977002855854501, - "flos": 13845323934720.0, - "grad_norm": 3.3144307660697994, - "language_loss": 0.74537098, - "learning_rate": 1.0871854692321769e-07, - "loss": 0.76643896, - "num_input_tokens_seen": 322018980, - "step": 14931, - "time_per_iteration": 4.21280837059021 - }, - { - "auxiliary_loss_clip": 0.01098516, - "auxiliary_loss_mlp": 0.0102899, - "balance_loss_clip": 1.0380764, - "balance_loss_mlp": 1.01730156, - "epoch": 0.8977604088381181, - "flos": 19427529692160.0, - "grad_norm": 1.8135529971721605, - "language_loss": 0.62872756, - "learning_rate": 1.0859192470857492e-07, - "loss": 0.6500026, - "num_input_tokens_seen": 322037675, - "step": 14932, - "time_per_iteration": 2.5633347034454346 - }, - { - "auxiliary_loss_clip": 0.01091207, - "auxiliary_loss_mlp": 0.01028215, - "balance_loss_clip": 1.03397417, - "balance_loss_mlp": 1.01719403, - "epoch": 0.8978205320907862, - "flos": 22742083981440.0, - "grad_norm": 1.6627829242988799, - "language_loss": 0.7173481, - "learning_rate": 1.0846537421631552e-07, - "loss": 0.73854238, - "num_input_tokens_seen": 322055130, - "step": 14933, - "time_per_iteration": 5.648598909378052 - }, - { - "auxiliary_loss_clip": 0.01061803, - "auxiliary_loss_mlp": 0.01036521, - "balance_loss_clip": 1.03099751, - "balance_loss_mlp": 1.02254987, - "epoch": 0.8978806553434541, - "flos": 21360529123200.0, - "grad_norm": 1.8892940748793305, - "language_loss": 0.74708331, - "learning_rate": 1.0833889545123898e-07, - "loss": 0.76806653, - "num_input_tokens_seen": 322074850, - "step": 14934, - "time_per_iteration": 2.7452452182769775 - }, - { - "auxiliary_loss_clip": 0.01063828, - "auxiliary_loss_mlp": 0.01038833, - "balance_loss_clip": 1.03115392, - "balance_loss_mlp": 1.02425992, - "epoch": 0.8979407785961221, - "flos": 20924178704640.0, - "grad_norm": 1.7395229013410125, - "language_loss": 0.60459125, - "learning_rate": 1.0821248841814123e-07, - "loss": 0.62561786, - "num_input_tokens_seen": 322093315, - "step": 14935, - "time_per_iteration": 2.6802937984466553 - }, - { - "auxiliary_loss_clip": 0.01067049, - "auxiliary_loss_mlp": 0.0102824, - "balance_loss_clip": 1.03403175, - "balance_loss_mlp": 1.01568127, - "epoch": 0.89800090184879, - "flos": 25228934196480.0, - "grad_norm": 2.3833073137139773, - "language_loss": 0.76938522, - "learning_rate": 1.0808615312181512e-07, - "loss": 0.79033804, - "num_input_tokens_seen": 322112555, - "step": 14936, - "time_per_iteration": 2.6882402896881104 - }, - { - "auxiliary_loss_clip": 0.01084705, - "auxiliary_loss_mlp": 0.01033293, - "balance_loss_clip": 1.0342505, - "balance_loss_mlp": 1.02111554, - "epoch": 0.898061025101458, - "flos": 22562674525440.0, - "grad_norm": 1.7261485222993433, - "language_loss": 0.74040693, - "learning_rate": 1.0795988956705193e-07, - "loss": 0.7615869, - "num_input_tokens_seen": 322130440, - "step": 14937, - "time_per_iteration": 2.6710762977600098 - }, - { - "auxiliary_loss_clip": 0.01000999, - "auxiliary_loss_mlp": 0.00999075, - "balance_loss_clip": 1.00671172, - "balance_loss_mlp": 0.9980852, - "epoch": 0.8981211483541259, - "flos": 56192551384320.0, - "grad_norm": 0.843865572085313, - "language_loss": 0.63512671, - "learning_rate": 1.0783369775863915e-07, - "loss": 0.65512741, - "num_input_tokens_seen": 322187295, - "step": 14938, - "time_per_iteration": 3.0942494869232178 - }, - { - "auxiliary_loss_clip": 0.01085887, - "auxiliary_loss_mlp": 0.01026506, - "balance_loss_clip": 1.03574538, - "balance_loss_mlp": 1.0140903, - "epoch": 0.898181271606794, - "flos": 16392718523520.0, - "grad_norm": 2.1860479541490268, - "language_loss": 0.79759568, - "learning_rate": 1.0770757770136251e-07, - "loss": 0.81871951, - "num_input_tokens_seen": 322202965, - "step": 14939, - "time_per_iteration": 2.663742780685425 - }, - { - "auxiliary_loss_clip": 0.01000054, - "auxiliary_loss_mlp": 0.01000102, - "balance_loss_clip": 1.00716364, - "balance_loss_mlp": 0.99917239, - "epoch": 0.8982413948594619, - "flos": 63440259989760.0, - "grad_norm": 0.7229819252676494, - "language_loss": 0.52847624, - "learning_rate": 1.0758152940000375e-07, - "loss": 0.54847777, - "num_input_tokens_seen": 322269490, - "step": 14940, - "time_per_iteration": 3.3590850830078125 - }, - { - "auxiliary_loss_clip": 0.01109001, - "auxiliary_loss_mlp": 0.01032379, - "balance_loss_clip": 1.03646505, - "balance_loss_mlp": 1.01890242, - "epoch": 0.8983015181121299, - "flos": 21835340029440.0, - "grad_norm": 1.950102596930943, - "language_loss": 0.77829498, - "learning_rate": 1.0745555285934327e-07, - "loss": 0.79970872, - "num_input_tokens_seen": 322288060, - "step": 14941, - "time_per_iteration": 2.744305372238159 - }, - { - "auxiliary_loss_clip": 0.01098003, - "auxiliary_loss_mlp": 0.01036176, - "balance_loss_clip": 1.0353359, - "balance_loss_mlp": 1.02308095, - "epoch": 0.8983616413647978, - "flos": 28949961767040.0, - "grad_norm": 2.416025895078288, - "language_loss": 0.73365378, - "learning_rate": 1.0732964808415834e-07, - "loss": 0.75499552, - "num_input_tokens_seen": 322307930, - "step": 14942, - "time_per_iteration": 2.754950523376465 - }, - { - "auxiliary_loss_clip": 0.01087926, - "auxiliary_loss_mlp": 0.01039089, - "balance_loss_clip": 1.03435743, - "balance_loss_mlp": 1.0259459, - "epoch": 0.8984217646174658, - "flos": 17785083375360.0, - "grad_norm": 3.391273382759864, - "language_loss": 0.79918504, - "learning_rate": 1.0720381507922205e-07, - "loss": 0.82045519, - "num_input_tokens_seen": 322326155, - "step": 14943, - "time_per_iteration": 2.7248191833496094 - }, - { - "auxiliary_loss_clip": 0.01085525, - "auxiliary_loss_mlp": 0.01032768, - "balance_loss_clip": 1.0354557, - "balance_loss_mlp": 1.01938701, - "epoch": 0.8984818878701337, - "flos": 23404528558080.0, - "grad_norm": 1.5192187964233135, - "language_loss": 0.71140742, - "learning_rate": 1.0707805384930701e-07, - "loss": 0.73259044, - "num_input_tokens_seen": 322345850, - "step": 14944, - "time_per_iteration": 2.6967763900756836 - }, - { - "auxiliary_loss_clip": 0.01069595, - "auxiliary_loss_mlp": 0.01033426, - "balance_loss_clip": 1.03214753, - "balance_loss_mlp": 1.01949, - "epoch": 0.8985420111228017, - "flos": 22346061557760.0, - "grad_norm": 2.216725804590017, - "language_loss": 0.76311302, - "learning_rate": 1.0695236439918187e-07, - "loss": 0.78414327, - "num_input_tokens_seen": 322364715, - "step": 14945, - "time_per_iteration": 2.6679043769836426 - }, - { - "auxiliary_loss_clip": 0.01114813, - "auxiliary_loss_mlp": 0.01031111, - "balance_loss_clip": 1.03778219, - "balance_loss_mlp": 1.01778316, - "epoch": 0.8986021343754698, - "flos": 21392776558080.0, - "grad_norm": 2.148577693611771, - "language_loss": 0.73464406, - "learning_rate": 1.0682674673361302e-07, - "loss": 0.75610334, - "num_input_tokens_seen": 322383570, - "step": 14946, - "time_per_iteration": 2.5922229290008545 - }, - { - "auxiliary_loss_clip": 0.0105656, - "auxiliary_loss_mlp": 0.01032137, - "balance_loss_clip": 1.03178859, - "balance_loss_mlp": 1.01898205, - "epoch": 0.8986622576281377, - "flos": 21325372686720.0, - "grad_norm": 1.8845669239623069, - "language_loss": 0.64757031, - "learning_rate": 1.0670120085736334e-07, - "loss": 0.66845727, - "num_input_tokens_seen": 322401375, - "step": 14947, - "time_per_iteration": 2.7270290851593018 - }, - { - "auxiliary_loss_clip": 0.01087566, - "auxiliary_loss_mlp": 0.01034127, - "balance_loss_clip": 1.03707767, - "balance_loss_mlp": 1.02179492, - "epoch": 0.8987223808808057, - "flos": 23988292392960.0, - "grad_norm": 2.018840894039702, - "language_loss": 0.70409435, - "learning_rate": 1.0657572677519411e-07, - "loss": 0.72531128, - "num_input_tokens_seen": 322421890, - "step": 14948, - "time_per_iteration": 2.712301254272461 - }, - { - "auxiliary_loss_clip": 0.01076508, - "auxiliary_loss_mlp": 0.01028981, - "balance_loss_clip": 1.03520036, - "balance_loss_mlp": 1.01646447, - "epoch": 0.8987825041334736, - "flos": 41500956044160.0, - "grad_norm": 1.7555603952219132, - "language_loss": 0.7477864, - "learning_rate": 1.0645032449186309e-07, - "loss": 0.76884139, - "num_input_tokens_seen": 322445730, - "step": 14949, - "time_per_iteration": 2.8739330768585205 - }, - { - "auxiliary_loss_clip": 0.01067975, - "auxiliary_loss_mlp": 0.01032509, - "balance_loss_clip": 1.03525853, - "balance_loss_mlp": 1.01840615, - "epoch": 0.8988426273861416, - "flos": 27564276844800.0, - "grad_norm": 1.6443346508458696, - "language_loss": 0.75822496, - "learning_rate": 1.0632499401212513e-07, - "loss": 0.77922982, - "num_input_tokens_seen": 322464595, - "step": 14950, - "time_per_iteration": 2.801135301589966 - }, - { - "auxiliary_loss_clip": 0.01082227, - "auxiliary_loss_mlp": 0.01031565, - "balance_loss_clip": 1.03504586, - "balance_loss_mlp": 1.01929891, - "epoch": 0.8989027506388095, - "flos": 17092653920640.0, - "grad_norm": 1.910819021087814, - "language_loss": 0.66423386, - "learning_rate": 1.0619973534073334e-07, - "loss": 0.68537182, - "num_input_tokens_seen": 322483305, - "step": 14951, - "time_per_iteration": 2.722646951675415 - }, - { - "auxiliary_loss_clip": 0.01110481, - "auxiliary_loss_mlp": 0.01030196, - "balance_loss_clip": 1.03482461, - "balance_loss_mlp": 1.01835823, - "epoch": 0.8989628738914776, - "flos": 20555124416640.0, - "grad_norm": 1.9405005215432696, - "language_loss": 0.73878247, - "learning_rate": 1.0607454848243769e-07, - "loss": 0.76018929, - "num_input_tokens_seen": 322501905, - "step": 14952, - "time_per_iteration": 2.638542413711548 - }, - { - "auxiliary_loss_clip": 0.01108749, - "auxiliary_loss_mlp": 0.01033676, - "balance_loss_clip": 1.03708589, - "balance_loss_mlp": 1.02110505, - "epoch": 0.8990229971441455, - "flos": 16251087196800.0, - "grad_norm": 2.2604855154768595, - "language_loss": 0.56825626, - "learning_rate": 1.0594943344198481e-07, - "loss": 0.58968055, - "num_input_tokens_seen": 322518135, - "step": 14953, - "time_per_iteration": 2.674570083618164 - }, - { - "auxiliary_loss_clip": 0.01083928, - "auxiliary_loss_mlp": 0.01033798, - "balance_loss_clip": 1.03378558, - "balance_loss_mlp": 1.021281, - "epoch": 0.8990831203968135, - "flos": 21981316901760.0, - "grad_norm": 2.264909455658383, - "language_loss": 0.82036901, - "learning_rate": 1.0582439022411915e-07, - "loss": 0.8415463, - "num_input_tokens_seen": 322537905, - "step": 14954, - "time_per_iteration": 2.6860923767089844 - }, - { - "auxiliary_loss_clip": 0.01107036, - "auxiliary_loss_mlp": 0.01032255, - "balance_loss_clip": 1.0373497, - "balance_loss_mlp": 1.01929116, - "epoch": 0.8991432436494814, - "flos": 27447171528960.0, - "grad_norm": 1.822158313950773, - "language_loss": 0.59985012, - "learning_rate": 1.0569941883358224e-07, - "loss": 0.621243, - "num_input_tokens_seen": 322557945, - "step": 14955, - "time_per_iteration": 2.645461082458496 - }, - { - "auxiliary_loss_clip": 0.01097918, - "auxiliary_loss_mlp": 0.01032904, - "balance_loss_clip": 1.03783774, - "balance_loss_mlp": 1.02125764, - "epoch": 0.8992033669021494, - "flos": 21579835610880.0, - "grad_norm": 2.0084560499241486, - "language_loss": 0.54700983, - "learning_rate": 1.0557451927511341e-07, - "loss": 0.56831801, - "num_input_tokens_seen": 322575765, - "step": 14956, - "time_per_iteration": 2.6565489768981934 - }, - { - "auxiliary_loss_clip": 0.01063944, - "auxiliary_loss_mlp": 0.01036733, - "balance_loss_clip": 1.03451157, - "balance_loss_mlp": 1.02390623, - "epoch": 0.8992634901548173, - "flos": 28584211530240.0, - "grad_norm": 1.8401685244545993, - "language_loss": 0.79821646, - "learning_rate": 1.0544969155344863e-07, - "loss": 0.81922328, - "num_input_tokens_seen": 322595665, - "step": 14957, - "time_per_iteration": 2.797804117202759 - }, - { - "auxiliary_loss_clip": 0.0111253, - "auxiliary_loss_mlp": 0.01031569, - "balance_loss_clip": 1.03749204, - "balance_loss_mlp": 1.01827729, - "epoch": 0.8993236134074853, - "flos": 19867435557120.0, - "grad_norm": 2.4531476671988663, - "language_loss": 0.78357041, - "learning_rate": 1.0532493567332123e-07, - "loss": 0.80501139, - "num_input_tokens_seen": 322614755, - "step": 14958, - "time_per_iteration": 2.6688661575317383 - }, - { - "auxiliary_loss_clip": 0.0104078, - "auxiliary_loss_mlp": 0.01030342, - "balance_loss_clip": 1.03928471, - "balance_loss_mlp": 1.01852262, - "epoch": 0.8993837366601534, - "flos": 19390649402880.0, - "grad_norm": 2.9878038930671362, - "language_loss": 0.74742228, - "learning_rate": 1.0520025163946277e-07, - "loss": 0.76813352, - "num_input_tokens_seen": 322633425, - "step": 14959, - "time_per_iteration": 2.8125593662261963 - }, - { - "auxiliary_loss_clip": 0.01103112, - "auxiliary_loss_mlp": 0.0103209, - "balance_loss_clip": 1.03359628, - "balance_loss_mlp": 1.01983559, - "epoch": 0.8994438599128213, - "flos": 18551740285440.0, - "grad_norm": 2.0035831193239684, - "language_loss": 0.68275356, - "learning_rate": 1.0507563945660015e-07, - "loss": 0.70410562, - "num_input_tokens_seen": 322652065, - "step": 14960, - "time_per_iteration": 2.5540730953216553 - }, - { - "auxiliary_loss_clip": 0.01084725, - "auxiliary_loss_mlp": 0.010279, - "balance_loss_clip": 1.03622437, - "balance_loss_mlp": 1.01650357, - "epoch": 0.8995039831654893, - "flos": 24427587726720.0, - "grad_norm": 1.431119232973545, - "language_loss": 0.65543896, - "learning_rate": 1.049510991294591e-07, - "loss": 0.67656523, - "num_input_tokens_seen": 322673275, - "step": 14961, - "time_per_iteration": 2.7903378009796143 - }, - { - "auxiliary_loss_clip": 0.01084623, - "auxiliary_loss_mlp": 0.01027938, - "balance_loss_clip": 1.03403842, - "balance_loss_mlp": 1.01648808, - "epoch": 0.8995641064181572, - "flos": 21251324799360.0, - "grad_norm": 1.6157800679699814, - "language_loss": 0.83261824, - "learning_rate": 1.0482663066276254e-07, - "loss": 0.85374379, - "num_input_tokens_seen": 322693375, - "step": 14962, - "time_per_iteration": 2.640796661376953 - }, - { - "auxiliary_loss_clip": 0.01090281, - "auxiliary_loss_mlp": 0.01030459, - "balance_loss_clip": 1.0377152, - "balance_loss_mlp": 1.01667809, - "epoch": 0.8996242296708252, - "flos": 23513661054720.0, - "grad_norm": 2.0695727885892095, - "language_loss": 0.76674181, - "learning_rate": 1.047022340612298e-07, - "loss": 0.7879492, - "num_input_tokens_seen": 322712615, - "step": 14963, - "time_per_iteration": 2.6461212635040283 - }, - { - "auxiliary_loss_clip": 0.00991703, - "auxiliary_loss_mlp": 0.01005224, - "balance_loss_clip": 1.01595902, - "balance_loss_mlp": 1.00418079, - "epoch": 0.8996843529234931, - "flos": 62403230430720.0, - "grad_norm": 0.7797202356654998, - "language_loss": 0.57483667, - "learning_rate": 1.0457790932957867e-07, - "loss": 0.59480596, - "num_input_tokens_seen": 322766855, - "step": 14964, - "time_per_iteration": 3.1848866939544678 - }, - { - "auxiliary_loss_clip": 0.0110498, - "auxiliary_loss_mlp": 0.01033929, - "balance_loss_clip": 1.03950953, - "balance_loss_mlp": 1.02064347, - "epoch": 0.8997444761761612, - "flos": 24236829573120.0, - "grad_norm": 3.314723962162985, - "language_loss": 0.6772269, - "learning_rate": 1.0445365647252269e-07, - "loss": 0.69861603, - "num_input_tokens_seen": 322781130, - "step": 14965, - "time_per_iteration": 2.6162235736846924 - }, - { - "auxiliary_loss_clip": 0.01110984, - "auxiliary_loss_mlp": 0.01030286, - "balance_loss_clip": 1.03775227, - "balance_loss_mlp": 1.01822209, - "epoch": 0.8998045994288291, - "flos": 21361103740800.0, - "grad_norm": 2.9087273995519136, - "language_loss": 0.71626663, - "learning_rate": 1.0432947549477433e-07, - "loss": 0.73767936, - "num_input_tokens_seen": 322800310, - "step": 14966, - "time_per_iteration": 2.5625483989715576 - }, - { - "auxiliary_loss_clip": 0.01076915, - "auxiliary_loss_mlp": 0.0103174, - "balance_loss_clip": 1.0351249, - "balance_loss_mlp": 1.01843047, - "epoch": 0.8998647226814971, - "flos": 28986159697920.0, - "grad_norm": 1.8489899153137084, - "language_loss": 0.73536384, - "learning_rate": 1.0420536640104205e-07, - "loss": 0.75645041, - "num_input_tokens_seen": 322820955, - "step": 14967, - "time_per_iteration": 2.785755157470703 - }, - { - "auxiliary_loss_clip": 0.01064386, - "auxiliary_loss_mlp": 0.00770622, - "balance_loss_clip": 1.03535485, - "balance_loss_mlp": 1.00016153, - "epoch": 0.899924845934165, - "flos": 13625909706240.0, - "grad_norm": 1.9571169995533768, - "language_loss": 0.72163457, - "learning_rate": 1.040813291960323e-07, - "loss": 0.73998475, - "num_input_tokens_seen": 322838780, - "step": 14968, - "time_per_iteration": 2.7936058044433594 - }, - { - "auxiliary_loss_clip": 0.01093703, - "auxiliary_loss_mlp": 0.01032931, - "balance_loss_clip": 1.03627658, - "balance_loss_mlp": 1.02080774, - "epoch": 0.899984969186833, - "flos": 20882629647360.0, - "grad_norm": 1.942509479538182, - "language_loss": 0.71323812, - "learning_rate": 1.0395736388444864e-07, - "loss": 0.73450446, - "num_input_tokens_seen": 322856710, - "step": 14969, - "time_per_iteration": 4.1407389640808105 - }, - { - "auxiliary_loss_clip": 0.01111967, - "auxiliary_loss_mlp": 0.01031075, - "balance_loss_clip": 1.039024, - "balance_loss_mlp": 1.01810515, - "epoch": 0.9000450924395009, - "flos": 20921808407040.0, - "grad_norm": 1.978725901368175, - "language_loss": 0.75983673, - "learning_rate": 1.0383347047099201e-07, - "loss": 0.78126717, - "num_input_tokens_seen": 322876070, - "step": 14970, - "time_per_iteration": 4.195037603378296 - }, - { - "auxiliary_loss_clip": 0.01101891, - "auxiliary_loss_mlp": 0.01032608, - "balance_loss_clip": 1.036654, - "balance_loss_mlp": 1.0206151, - "epoch": 0.900105215692169, - "flos": 17165049782400.0, - "grad_norm": 1.6764087084105503, - "language_loss": 0.73020303, - "learning_rate": 1.0370964896035972e-07, - "loss": 0.75154805, - "num_input_tokens_seen": 322895095, - "step": 14971, - "time_per_iteration": 2.5875184535980225 - }, - { - "auxiliary_loss_clip": 0.0107201, - "auxiliary_loss_mlp": 0.01031462, - "balance_loss_clip": 1.03537893, - "balance_loss_mlp": 1.01815248, - "epoch": 0.900165338944837, - "flos": 19931930426880.0, - "grad_norm": 2.0581551062194703, - "language_loss": 0.8157441, - "learning_rate": 1.035858993572476e-07, - "loss": 0.83677888, - "num_input_tokens_seen": 322911845, - "step": 14972, - "time_per_iteration": 4.170926094055176 - }, - { - "auxiliary_loss_clip": 0.01080845, - "auxiliary_loss_mlp": 0.01030543, - "balance_loss_clip": 1.03386259, - "balance_loss_mlp": 1.01756763, - "epoch": 0.9002254621975049, - "flos": 16107085572480.0, - "grad_norm": 5.44647111318727, - "language_loss": 0.8157503, - "learning_rate": 1.0346222166634855e-07, - "loss": 0.83686423, - "num_input_tokens_seen": 322928170, - "step": 14973, - "time_per_iteration": 4.245764493942261 - }, - { - "auxiliary_loss_clip": 0.01108859, - "auxiliary_loss_mlp": 0.01033665, - "balance_loss_clip": 1.03745437, - "balance_loss_mlp": 1.02064097, - "epoch": 0.9002855854501729, - "flos": 28476120528000.0, - "grad_norm": 1.8931986793937958, - "language_loss": 0.58183479, - "learning_rate": 1.0333861589235193e-07, - "loss": 0.60326004, - "num_input_tokens_seen": 322948165, - "step": 14974, - "time_per_iteration": 2.6841914653778076 - }, - { - "auxiliary_loss_clip": 0.01112242, - "auxiliary_loss_mlp": 0.01034905, - "balance_loss_clip": 1.04007757, - "balance_loss_mlp": 1.02229297, - "epoch": 0.9003457087028408, - "flos": 25630307746560.0, - "grad_norm": 1.7431363980937327, - "language_loss": 0.63522345, - "learning_rate": 1.0321508203994489e-07, - "loss": 0.65669495, - "num_input_tokens_seen": 322968880, - "step": 14975, - "time_per_iteration": 2.620419979095459 - }, - { - "auxiliary_loss_clip": 0.01098045, - "auxiliary_loss_mlp": 0.01032816, - "balance_loss_clip": 1.03662229, - "balance_loss_mlp": 1.02011395, - "epoch": 0.9004058319555088, - "flos": 24389414547840.0, - "grad_norm": 1.7931871131687724, - "language_loss": 0.73011506, - "learning_rate": 1.0309162011381257e-07, - "loss": 0.75142372, - "num_input_tokens_seen": 322989395, - "step": 14976, - "time_per_iteration": 2.6519412994384766 - }, - { - "auxiliary_loss_clip": 0.01092647, - "auxiliary_loss_mlp": 0.01031748, - "balance_loss_clip": 1.03749645, - "balance_loss_mlp": 1.01954126, - "epoch": 0.9004659552081767, - "flos": 29059345658880.0, - "grad_norm": 1.7534579820157172, - "language_loss": 0.69598532, - "learning_rate": 1.0296823011863565e-07, - "loss": 0.71722925, - "num_input_tokens_seen": 323009060, - "step": 14977, - "time_per_iteration": 2.6647446155548096 - }, - { - "auxiliary_loss_clip": 0.01082206, - "auxiliary_loss_mlp": 0.00771481, - "balance_loss_clip": 1.03483725, - "balance_loss_mlp": 1.00023878, - "epoch": 0.9005260784608448, - "flos": 16763855800320.0, - "grad_norm": 2.4242637443808603, - "language_loss": 0.65483779, - "learning_rate": 1.0284491205909351e-07, - "loss": 0.67337465, - "num_input_tokens_seen": 323027530, - "step": 14978, - "time_per_iteration": 2.6061410903930664 - }, - { - "auxiliary_loss_clip": 0.01078235, - "auxiliary_loss_mlp": 0.01038613, - "balance_loss_clip": 1.03480434, - "balance_loss_mlp": 1.02505875, - "epoch": 0.9005862017135127, - "flos": 20376002269440.0, - "grad_norm": 1.675464257364332, - "language_loss": 0.78981739, - "learning_rate": 1.0272166593986286e-07, - "loss": 0.81098592, - "num_input_tokens_seen": 323045370, - "step": 14979, - "time_per_iteration": 2.6818509101867676 - }, - { - "auxiliary_loss_clip": 0.01008335, - "auxiliary_loss_mlp": 0.01001784, - "balance_loss_clip": 1.00541806, - "balance_loss_mlp": 1.00071073, - "epoch": 0.9006463249661807, - "flos": 67580255796480.0, - "grad_norm": 0.7182102286721572, - "language_loss": 0.535707, - "learning_rate": 1.0259849176561642e-07, - "loss": 0.55580819, - "num_input_tokens_seen": 323105660, - "step": 14980, - "time_per_iteration": 3.2093987464904785 - }, - { - "auxiliary_loss_clip": 0.01103101, - "auxiliary_loss_mlp": 0.01041025, - "balance_loss_clip": 1.03967285, - "balance_loss_mlp": 1.0275898, - "epoch": 0.9007064482188486, - "flos": 28293335193600.0, - "grad_norm": 1.66637606590706, - "language_loss": 0.82372773, - "learning_rate": 1.0247538954102553e-07, - "loss": 0.84516907, - "num_input_tokens_seen": 323126365, - "step": 14981, - "time_per_iteration": 2.650113582611084 - }, - { - "auxiliary_loss_clip": 0.01066706, - "auxiliary_loss_mlp": 0.01032966, - "balance_loss_clip": 1.03680325, - "balance_loss_mlp": 1.02073503, - "epoch": 0.9007665714715166, - "flos": 21616320850560.0, - "grad_norm": 2.004462394579591, - "language_loss": 0.81781876, - "learning_rate": 1.0235235927075758e-07, - "loss": 0.83881551, - "num_input_tokens_seen": 323145655, - "step": 14982, - "time_per_iteration": 2.7423040866851807 - }, - { - "auxiliary_loss_clip": 0.01075107, - "auxiliary_loss_mlp": 0.01040244, - "balance_loss_clip": 1.03167033, - "balance_loss_mlp": 1.0271908, - "epoch": 0.9008266947241845, - "flos": 26541864120960.0, - "grad_norm": 1.9584785414964334, - "language_loss": 0.71540499, - "learning_rate": 1.0222940095947885e-07, - "loss": 0.73655844, - "num_input_tokens_seen": 323164540, - "step": 14983, - "time_per_iteration": 2.7024779319763184 - }, - { - "auxiliary_loss_clip": 0.01097308, - "auxiliary_loss_mlp": 0.0102736, - "balance_loss_clip": 1.03790069, - "balance_loss_mlp": 1.01611233, - "epoch": 0.9008868179768525, - "flos": 23110527738240.0, - "grad_norm": 1.3086245920828656, - "language_loss": 0.74951446, - "learning_rate": 1.0210651461185115e-07, - "loss": 0.77076113, - "num_input_tokens_seen": 323186960, - "step": 14984, - "time_per_iteration": 2.813418388366699 - }, - { - "auxiliary_loss_clip": 0.01104396, - "auxiliary_loss_mlp": 0.01032793, - "balance_loss_clip": 1.03461576, - "balance_loss_mlp": 1.02068746, - "epoch": 0.9009469412295206, - "flos": 19060809788160.0, - "grad_norm": 1.7050072282727156, - "language_loss": 0.70293552, - "learning_rate": 1.0198370023253456e-07, - "loss": 0.72430742, - "num_input_tokens_seen": 323206135, - "step": 14985, - "time_per_iteration": 2.767937183380127 - }, - { - "auxiliary_loss_clip": 0.01087695, - "auxiliary_loss_mlp": 0.01033923, - "balance_loss_clip": 1.03466606, - "balance_loss_mlp": 1.02110791, - "epoch": 0.9010070644821885, - "flos": 23222281927680.0, - "grad_norm": 2.1137974022402575, - "language_loss": 0.70276654, - "learning_rate": 1.0186095782618643e-07, - "loss": 0.72398281, - "num_input_tokens_seen": 323225980, - "step": 14986, - "time_per_iteration": 2.7246689796447754 - }, - { - "auxiliary_loss_clip": 0.01096893, - "auxiliary_loss_mlp": 0.01032434, - "balance_loss_clip": 1.03352499, - "balance_loss_mlp": 1.01991105, - "epoch": 0.9010671877348565, - "flos": 17384823146880.0, - "grad_norm": 1.658677803041605, - "language_loss": 0.76861989, - "learning_rate": 1.0173828739746104e-07, - "loss": 0.78991318, - "num_input_tokens_seen": 323243700, - "step": 14987, - "time_per_iteration": 2.5764570236206055 - }, - { - "auxiliary_loss_clip": 0.01092941, - "auxiliary_loss_mlp": 0.0103351, - "balance_loss_clip": 1.03674459, - "balance_loss_mlp": 1.02103519, - "epoch": 0.9011273109875244, - "flos": 21908166854400.0, - "grad_norm": 1.956004475384015, - "language_loss": 0.73540664, - "learning_rate": 1.0161568895100981e-07, - "loss": 0.75667119, - "num_input_tokens_seen": 323261535, - "step": 14988, - "time_per_iteration": 2.558128595352173 - }, - { - "auxiliary_loss_clip": 0.01086646, - "auxiliary_loss_mlp": 0.01032746, - "balance_loss_clip": 1.03845191, - "balance_loss_mlp": 1.01903129, - "epoch": 0.9011874342401924, - "flos": 24060831909120.0, - "grad_norm": 1.6803307482650078, - "language_loss": 0.69135392, - "learning_rate": 1.0149316249148188e-07, - "loss": 0.7125479, - "num_input_tokens_seen": 323281855, - "step": 14989, - "time_per_iteration": 2.650520086288452 - }, - { - "auxiliary_loss_clip": 0.01109667, - "auxiliary_loss_mlp": 0.0102853, - "balance_loss_clip": 1.03716099, - "balance_loss_mlp": 1.01638818, - "epoch": 0.9012475574928603, - "flos": 16758791982720.0, - "grad_norm": 1.8703364751087568, - "language_loss": 0.79935807, - "learning_rate": 1.0137070802352376e-07, - "loss": 0.8207401, - "num_input_tokens_seen": 323299505, - "step": 14990, - "time_per_iteration": 2.5482540130615234 - }, - { - "auxiliary_loss_clip": 0.0107379, - "auxiliary_loss_mlp": 0.01031402, - "balance_loss_clip": 1.03743267, - "balance_loss_mlp": 1.01825941, - "epoch": 0.9013076807455284, - "flos": 19971109186560.0, - "grad_norm": 1.999297573895168, - "language_loss": 0.78150022, - "learning_rate": 1.0124832555177842e-07, - "loss": 0.8025521, - "num_input_tokens_seen": 323318365, - "step": 14991, - "time_per_iteration": 2.7104129791259766 - }, - { - "auxiliary_loss_clip": 0.00995246, - "auxiliary_loss_mlp": 0.00751703, - "balance_loss_clip": 1.00523067, - "balance_loss_mlp": 0.9995659, - "epoch": 0.9013678039981963, - "flos": 65180274624000.0, - "grad_norm": 0.7792478224468473, - "language_loss": 0.60261661, - "learning_rate": 1.0112601508088726e-07, - "loss": 0.62008613, - "num_input_tokens_seen": 323371835, - "step": 14992, - "time_per_iteration": 3.123297691345215 - }, - { - "auxiliary_loss_clip": 0.0109359, - "auxiliary_loss_mlp": 0.0102822, - "balance_loss_clip": 1.03499448, - "balance_loss_mlp": 1.01605463, - "epoch": 0.9014279272508643, - "flos": 20521224956160.0, - "grad_norm": 2.279288260696507, - "language_loss": 0.82825989, - "learning_rate": 1.0100377661548764e-07, - "loss": 0.84947795, - "num_input_tokens_seen": 323388495, - "step": 14993, - "time_per_iteration": 2.574572801589966 - }, - { - "auxiliary_loss_clip": 0.01107431, - "auxiliary_loss_mlp": 0.01034277, - "balance_loss_clip": 1.03556728, - "balance_loss_mlp": 1.02142596, - "epoch": 0.9014880505035322, - "flos": 17309051406720.0, - "grad_norm": 2.2716926772447286, - "language_loss": 0.73481464, - "learning_rate": 1.0088161016021502e-07, - "loss": 0.75623167, - "num_input_tokens_seen": 323405280, - "step": 14994, - "time_per_iteration": 2.538275957107544 - }, - { - "auxiliary_loss_clip": 0.01093439, - "auxiliary_loss_mlp": 0.01026115, - "balance_loss_clip": 1.03458691, - "balance_loss_mlp": 1.01476073, - "epoch": 0.9015481737562002, - "flos": 28402862739840.0, - "grad_norm": 1.830419215860498, - "language_loss": 0.64486498, - "learning_rate": 1.0075951571970187e-07, - "loss": 0.66606051, - "num_input_tokens_seen": 323425310, - "step": 14995, - "time_per_iteration": 2.623666286468506 - }, - { - "auxiliary_loss_clip": 0.01069201, - "auxiliary_loss_mlp": 0.01033584, - "balance_loss_clip": 1.03037524, - "balance_loss_mlp": 1.01956463, - "epoch": 0.9016082970088681, - "flos": 29752672953600.0, - "grad_norm": 1.8771395079815063, - "language_loss": 0.66334212, - "learning_rate": 1.0063749329857873e-07, - "loss": 0.68436992, - "num_input_tokens_seen": 323447805, - "step": 14996, - "time_per_iteration": 2.781064510345459 - }, - { - "auxiliary_loss_clip": 0.01095585, - "auxiliary_loss_mlp": 0.01028339, - "balance_loss_clip": 1.03407955, - "balance_loss_mlp": 1.01678181, - "epoch": 0.9016684202615362, - "flos": 23513230091520.0, - "grad_norm": 1.8246548425287972, - "language_loss": 0.66247928, - "learning_rate": 1.0051554290147168e-07, - "loss": 0.68371856, - "num_input_tokens_seen": 323467150, - "step": 14997, - "time_per_iteration": 2.71907114982605 - }, - { - "auxiliary_loss_clip": 0.01080625, - "auxiliary_loss_mlp": 0.01037629, - "balance_loss_clip": 1.03254914, - "balance_loss_mlp": 1.02483201, - "epoch": 0.9017285435142042, - "flos": 16979247705600.0, - "grad_norm": 1.8261489433850353, - "language_loss": 0.77650619, - "learning_rate": 1.0039366453300613e-07, - "loss": 0.79768866, - "num_input_tokens_seen": 323484250, - "step": 14998, - "time_per_iteration": 2.6528589725494385 - }, - { - "auxiliary_loss_clip": 0.0110937, - "auxiliary_loss_mlp": 0.01029624, - "balance_loss_clip": 1.03644896, - "balance_loss_mlp": 1.0174706, - "epoch": 0.9017886667668721, - "flos": 21393351175680.0, - "grad_norm": 1.6740643670552307, - "language_loss": 0.75225437, - "learning_rate": 1.0027185819780281e-07, - "loss": 0.77364427, - "num_input_tokens_seen": 323502910, - "step": 14999, - "time_per_iteration": 2.5951831340789795 - }, - { - "auxiliary_loss_clip": 0.01045283, - "auxiliary_loss_mlp": 0.01030998, - "balance_loss_clip": 1.03599596, - "balance_loss_mlp": 1.01742625, - "epoch": 0.9018487900195401, - "flos": 20996574566400.0, - "grad_norm": 8.557112820549731, - "language_loss": 0.75833976, - "learning_rate": 1.0015012390048117e-07, - "loss": 0.77910256, - "num_input_tokens_seen": 323521820, - "step": 15000, - "time_per_iteration": 2.7700390815734863 - }, - { - "auxiliary_loss_clip": 0.01090367, - "auxiliary_loss_mlp": 0.01028007, - "balance_loss_clip": 1.03579473, - "balance_loss_mlp": 1.01666379, - "epoch": 0.901908913272208, - "flos": 53358443458560.0, - "grad_norm": 2.1350900173970153, - "language_loss": 0.80694187, - "learning_rate": 1.0002846164565704e-07, - "loss": 0.8281256, - "num_input_tokens_seen": 323543200, - "step": 15001, - "time_per_iteration": 2.914686918258667 - }, - { - "auxiliary_loss_clip": 0.01076218, - "auxiliary_loss_mlp": 0.01028009, - "balance_loss_clip": 1.03696334, - "balance_loss_mlp": 1.01640391, - "epoch": 0.901969036524876, - "flos": 22089838867200.0, - "grad_norm": 1.603585496644282, - "language_loss": 0.78372264, - "learning_rate": 9.990687143794407e-08, - "loss": 0.80476493, - "num_input_tokens_seen": 323563075, - "step": 15002, - "time_per_iteration": 2.7641050815582275 - }, - { - "auxiliary_loss_clip": 0.01082464, - "auxiliary_loss_mlp": 0.01045075, - "balance_loss_clip": 1.03580308, - "balance_loss_mlp": 1.03049612, - "epoch": 0.9020291597775439, - "flos": 23835025059840.0, - "grad_norm": 2.0404977120917147, - "language_loss": 0.68748105, - "learning_rate": 9.978535328195347e-08, - "loss": 0.70875645, - "num_input_tokens_seen": 323579065, - "step": 15003, - "time_per_iteration": 2.771782875061035 - }, - { - "auxiliary_loss_clip": 0.01085032, - "auxiliary_loss_mlp": 0.0103817, - "balance_loss_clip": 1.03330088, - "balance_loss_mlp": 1.02506328, - "epoch": 0.902089283030212, - "flos": 18326005263360.0, - "grad_norm": 1.8107770462670902, - "language_loss": 0.85949785, - "learning_rate": 9.9663907182292e-08, - "loss": 0.88072991, - "num_input_tokens_seen": 323594835, - "step": 15004, - "time_per_iteration": 2.666977882385254 - }, - { - "auxiliary_loss_clip": 0.0107511, - "auxiliary_loss_mlp": 0.01035534, - "balance_loss_clip": 1.03480256, - "balance_loss_mlp": 1.02221882, - "epoch": 0.9021494062828799, - "flos": 24170359455360.0, - "grad_norm": 2.562101889878063, - "language_loss": 0.71954483, - "learning_rate": 9.954253314356575e-08, - "loss": 0.74065125, - "num_input_tokens_seen": 323611475, - "step": 15005, - "time_per_iteration": 2.759964942932129 - }, - { - "auxiliary_loss_clip": 0.01100393, - "auxiliary_loss_mlp": 0.01030904, - "balance_loss_clip": 1.03423667, - "balance_loss_mlp": 1.01778543, - "epoch": 0.9022095295355479, - "flos": 21616859554560.0, - "grad_norm": 1.9300274914184496, - "language_loss": 0.70556152, - "learning_rate": 9.942123117037748e-08, - "loss": 0.72687459, - "num_input_tokens_seen": 323629730, - "step": 15006, - "time_per_iteration": 2.6384735107421875 - }, - { - "auxiliary_loss_clip": 0.01086555, - "auxiliary_loss_mlp": 0.0102873, - "balance_loss_clip": 1.03485382, - "balance_loss_mlp": 1.01679754, - "epoch": 0.9022696527882158, - "flos": 18726229578240.0, - "grad_norm": 3.0054319156686264, - "language_loss": 0.84866273, - "learning_rate": 9.930000126732618e-08, - "loss": 0.86981565, - "num_input_tokens_seen": 323646000, - "step": 15007, - "time_per_iteration": 2.648921489715576 - }, - { - "auxiliary_loss_clip": 0.01079211, - "auxiliary_loss_mlp": 0.01030937, - "balance_loss_clip": 1.03299296, - "balance_loss_mlp": 1.01809239, - "epoch": 0.9023297760408838, - "flos": 26761206522240.0, - "grad_norm": 1.9317011784213973, - "language_loss": 0.7883476, - "learning_rate": 9.917884343900928e-08, - "loss": 0.80944914, - "num_input_tokens_seen": 323667250, - "step": 15008, - "time_per_iteration": 4.242715120315552 - }, - { - "auxiliary_loss_clip": 0.01063806, - "auxiliary_loss_mlp": 0.01033642, - "balance_loss_clip": 1.03497434, - "balance_loss_mlp": 1.0214889, - "epoch": 0.9023898992935517, - "flos": 20522553759360.0, - "grad_norm": 1.866627762329264, - "language_loss": 0.73153245, - "learning_rate": 9.905775769002156e-08, - "loss": 0.75250691, - "num_input_tokens_seen": 323687150, - "step": 15009, - "time_per_iteration": 2.691822052001953 - }, - { - "auxiliary_loss_clip": 0.01107314, - "auxiliary_loss_mlp": 0.01035667, - "balance_loss_clip": 1.0361793, - "balance_loss_mlp": 1.02314413, - "epoch": 0.9024500225462198, - "flos": 17456644391040.0, - "grad_norm": 1.76387616724559, - "language_loss": 0.73348868, - "learning_rate": 9.893674402495399e-08, - "loss": 0.75491852, - "num_input_tokens_seen": 323703660, - "step": 15010, - "time_per_iteration": 4.291422128677368 - }, - { - "auxiliary_loss_clip": 0.0108209, - "auxiliary_loss_mlp": 0.01035862, - "balance_loss_clip": 1.03634191, - "balance_loss_mlp": 1.02284431, - "epoch": 0.9025101457988878, - "flos": 20813609664000.0, - "grad_norm": 2.097335794667479, - "language_loss": 0.74242449, - "learning_rate": 9.881580244839538e-08, - "loss": 0.76360393, - "num_input_tokens_seen": 323722060, - "step": 15011, - "time_per_iteration": 4.15416693687439 - }, - { - "auxiliary_loss_clip": 0.01101836, - "auxiliary_loss_mlp": 0.01031616, - "balance_loss_clip": 1.03616571, - "balance_loss_mlp": 1.01824617, - "epoch": 0.9025702690515557, - "flos": 19026371623680.0, - "grad_norm": 10.830412851776218, - "language_loss": 0.72975504, - "learning_rate": 9.869493296493204e-08, - "loss": 0.75108945, - "num_input_tokens_seen": 323740645, - "step": 15012, - "time_per_iteration": 4.172262668609619 - }, - { - "auxiliary_loss_clip": 0.01073966, - "auxiliary_loss_mlp": 0.01038584, - "balance_loss_clip": 1.03479862, - "balance_loss_mlp": 1.02719402, - "epoch": 0.9026303923042237, - "flos": 19682818629120.0, - "grad_norm": 1.6805885971222159, - "language_loss": 0.69541949, - "learning_rate": 9.857413557914763e-08, - "loss": 0.71654499, - "num_input_tokens_seen": 323758905, - "step": 15013, - "time_per_iteration": 2.6801204681396484 - }, - { - "auxiliary_loss_clip": 0.01092922, - "auxiliary_loss_mlp": 0.01031842, - "balance_loss_clip": 1.03412437, - "balance_loss_mlp": 1.01987374, - "epoch": 0.9026905155568916, - "flos": 24608110504320.0, - "grad_norm": 1.451081928504829, - "language_loss": 0.73157448, - "learning_rate": 9.845341029562249e-08, - "loss": 0.75282216, - "num_input_tokens_seen": 323780595, - "step": 15014, - "time_per_iteration": 2.6699087619781494 - }, - { - "auxiliary_loss_clip": 0.01107905, - "auxiliary_loss_mlp": 0.01032528, - "balance_loss_clip": 1.03593612, - "balance_loss_mlp": 1.01995111, - "epoch": 0.9027506388095596, - "flos": 20521799573760.0, - "grad_norm": 1.9727005096909034, - "language_loss": 0.72401255, - "learning_rate": 9.833275711893474e-08, - "loss": 0.74541688, - "num_input_tokens_seen": 323798160, - "step": 15015, - "time_per_iteration": 2.536134958267212 - }, - { - "auxiliary_loss_clip": 0.01083409, - "auxiliary_loss_mlp": 0.01034606, - "balance_loss_clip": 1.03356743, - "balance_loss_mlp": 1.02245855, - "epoch": 0.9028107620622275, - "flos": 22784494965120.0, - "grad_norm": 2.2967609307485213, - "language_loss": 0.6894691, - "learning_rate": 9.821217605365895e-08, - "loss": 0.71064925, - "num_input_tokens_seen": 323816810, - "step": 15016, - "time_per_iteration": 2.696544647216797 - }, - { - "auxiliary_loss_clip": 0.01105993, - "auxiliary_loss_mlp": 0.01027953, - "balance_loss_clip": 1.03578448, - "balance_loss_mlp": 1.0165025, - "epoch": 0.9028708853148956, - "flos": 25410534382080.0, - "grad_norm": 1.8623697779544957, - "language_loss": 0.7037698, - "learning_rate": 9.809166710436855e-08, - "loss": 0.72510922, - "num_input_tokens_seen": 323836900, - "step": 15017, - "time_per_iteration": 2.595538377761841 - }, - { - "auxiliary_loss_clip": 0.01086858, - "auxiliary_loss_mlp": 0.01033508, - "balance_loss_clip": 1.03965449, - "balance_loss_mlp": 1.02197492, - "epoch": 0.9029310085675635, - "flos": 21871322478720.0, - "grad_norm": 1.936832914018773, - "language_loss": 0.69448954, - "learning_rate": 9.797123027563237e-08, - "loss": 0.71569324, - "num_input_tokens_seen": 323855325, - "step": 15018, - "time_per_iteration": 2.6294448375701904 - }, - { - "auxiliary_loss_clip": 0.01097184, - "auxiliary_loss_mlp": 0.01030964, - "balance_loss_clip": 1.0363183, - "balance_loss_mlp": 1.01848841, - "epoch": 0.9029911318202315, - "flos": 26214394803840.0, - "grad_norm": 2.670057075172495, - "language_loss": 0.68977821, - "learning_rate": 9.785086557201782e-08, - "loss": 0.71105969, - "num_input_tokens_seen": 323875650, - "step": 15019, - "time_per_iteration": 2.7993857860565186 - }, - { - "auxiliary_loss_clip": 0.01105429, - "auxiliary_loss_mlp": 0.0103295, - "balance_loss_clip": 1.03574717, - "balance_loss_mlp": 1.02123153, - "epoch": 0.9030512550728994, - "flos": 15961360095360.0, - "grad_norm": 1.9111353110117102, - "language_loss": 0.72140992, - "learning_rate": 9.773057299808951e-08, - "loss": 0.74279368, - "num_input_tokens_seen": 323892920, - "step": 15020, - "time_per_iteration": 2.588925361633301 - }, - { - "auxiliary_loss_clip": 0.01094641, - "auxiliary_loss_mlp": 0.01030177, - "balance_loss_clip": 1.03369665, - "balance_loss_mlp": 1.01739788, - "epoch": 0.9031113783255674, - "flos": 23987610034560.0, - "grad_norm": 1.5881960753658597, - "language_loss": 0.74275625, - "learning_rate": 9.7610352558408e-08, - "loss": 0.76400447, - "num_input_tokens_seen": 323913835, - "step": 15021, - "time_per_iteration": 2.588358163833618 - }, - { - "auxiliary_loss_clip": 0.01112744, - "auxiliary_loss_mlp": 0.01031351, - "balance_loss_clip": 1.03767323, - "balance_loss_mlp": 1.01803565, - "epoch": 0.9031715015782353, - "flos": 22237216369920.0, - "grad_norm": 2.206963784071178, - "language_loss": 0.7280935, - "learning_rate": 9.749020425753251e-08, - "loss": 0.74953449, - "num_input_tokens_seen": 323933440, - "step": 15022, - "time_per_iteration": 2.536369562149048 - }, - { - "auxiliary_loss_clip": 0.01068128, - "auxiliary_loss_mlp": 0.01027903, - "balance_loss_clip": 1.03312647, - "balance_loss_mlp": 1.01603556, - "epoch": 0.9032316248309034, - "flos": 26323168164480.0, - "grad_norm": 2.626283652398094, - "language_loss": 0.72459871, - "learning_rate": 9.737012810001943e-08, - "loss": 0.74555898, - "num_input_tokens_seen": 323954090, - "step": 15023, - "time_per_iteration": 2.7086663246154785 - }, - { - "auxiliary_loss_clip": 0.01095012, - "auxiliary_loss_mlp": 0.01033688, - "balance_loss_clip": 1.03661966, - "balance_loss_mlp": 1.02148056, - "epoch": 0.9032917480835713, - "flos": 22636686499200.0, - "grad_norm": 1.615390594189699, - "language_loss": 0.82334167, - "learning_rate": 9.725012409042155e-08, - "loss": 0.84462869, - "num_input_tokens_seen": 323974040, - "step": 15024, - "time_per_iteration": 2.6185879707336426 - }, - { - "auxiliary_loss_clip": 0.01099161, - "auxiliary_loss_mlp": 0.01028549, - "balance_loss_clip": 1.03624964, - "balance_loss_mlp": 1.01650262, - "epoch": 0.9033518713362393, - "flos": 23878764846720.0, - "grad_norm": 1.6458847486672181, - "language_loss": 0.69518673, - "learning_rate": 9.713019223328966e-08, - "loss": 0.7164638, - "num_input_tokens_seen": 323996125, - "step": 15025, - "time_per_iteration": 2.6076362133026123 - }, - { - "auxiliary_loss_clip": 0.01073996, - "auxiliary_loss_mlp": 0.01035637, - "balance_loss_clip": 1.03491449, - "balance_loss_mlp": 1.02332294, - "epoch": 0.9034119945889073, - "flos": 26905279973760.0, - "grad_norm": 1.5601899591487556, - "language_loss": 0.76521379, - "learning_rate": 9.70103325331717e-08, - "loss": 0.78631014, - "num_input_tokens_seen": 324017645, - "step": 15026, - "time_per_iteration": 2.7674145698547363 - }, - { - "auxiliary_loss_clip": 0.01098222, - "auxiliary_loss_mlp": 0.01030884, - "balance_loss_clip": 1.03840423, - "balance_loss_mlp": 1.01899886, - "epoch": 0.9034721178415752, - "flos": 20850166730880.0, - "grad_norm": 2.0222752747400192, - "language_loss": 0.68377501, - "learning_rate": 9.68905449946129e-08, - "loss": 0.70506608, - "num_input_tokens_seen": 324036875, - "step": 15027, - "time_per_iteration": 2.6653904914855957 - }, - { - "auxiliary_loss_clip": 0.01052551, - "auxiliary_loss_mlp": 0.01041128, - "balance_loss_clip": 1.03196084, - "balance_loss_mlp": 1.02769923, - "epoch": 0.9035322410942432, - "flos": 22234307368320.0, - "grad_norm": 1.6548540409634305, - "language_loss": 0.75698447, - "learning_rate": 9.677082962215477e-08, - "loss": 0.7779212, - "num_input_tokens_seen": 324057045, - "step": 15028, - "time_per_iteration": 2.7179388999938965 - }, - { - "auxiliary_loss_clip": 0.01052919, - "auxiliary_loss_mlp": 0.0103746, - "balance_loss_clip": 1.03367233, - "balance_loss_mlp": 1.02507436, - "epoch": 0.9035923643469111, - "flos": 25923410726400.0, - "grad_norm": 1.805593039358967, - "language_loss": 0.69399357, - "learning_rate": 9.665118642033765e-08, - "loss": 0.71489739, - "num_input_tokens_seen": 324079735, - "step": 15029, - "time_per_iteration": 2.813114643096924 - }, - { - "auxiliary_loss_clip": 0.01096672, - "auxiliary_loss_mlp": 0.01034607, - "balance_loss_clip": 1.03852797, - "balance_loss_mlp": 1.02123141, - "epoch": 0.9036524875995792, - "flos": 20339804338560.0, - "grad_norm": 1.8345501751502649, - "language_loss": 0.7369951, - "learning_rate": 9.653161539369858e-08, - "loss": 0.75830793, - "num_input_tokens_seen": 324097785, - "step": 15030, - "time_per_iteration": 2.696516990661621 - }, - { - "auxiliary_loss_clip": 0.01101797, - "auxiliary_loss_mlp": 0.01030355, - "balance_loss_clip": 1.03739715, - "balance_loss_mlp": 1.01790965, - "epoch": 0.9037126108522471, - "flos": 40114624677120.0, - "grad_norm": 2.451430150859209, - "language_loss": 0.6831615, - "learning_rate": 9.641211654677151e-08, - "loss": 0.70448303, - "num_input_tokens_seen": 324121625, - "step": 15031, - "time_per_iteration": 2.776313543319702 - }, - { - "auxiliary_loss_clip": 0.01085756, - "auxiliary_loss_mlp": 0.01028456, - "balance_loss_clip": 1.03706944, - "balance_loss_mlp": 1.01662993, - "epoch": 0.9037727341049151, - "flos": 23332024955520.0, - "grad_norm": 1.492349301530935, - "language_loss": 0.76186407, - "learning_rate": 9.629268988408723e-08, - "loss": 0.78300619, - "num_input_tokens_seen": 324142535, - "step": 15032, - "time_per_iteration": 2.722729206085205 - }, - { - "auxiliary_loss_clip": 0.01110023, - "auxiliary_loss_mlp": 0.01033692, - "balance_loss_clip": 1.03756511, - "balance_loss_mlp": 1.02144957, - "epoch": 0.903832857357583, - "flos": 12822659815680.0, - "grad_norm": 1.761761043861274, - "language_loss": 0.75420368, - "learning_rate": 9.617333541017502e-08, - "loss": 0.77564085, - "num_input_tokens_seen": 324159610, - "step": 15033, - "time_per_iteration": 2.6790883541107178 - }, - { - "auxiliary_loss_clip": 0.01074569, - "auxiliary_loss_mlp": 0.01038477, - "balance_loss_clip": 1.03108501, - "balance_loss_mlp": 1.02516127, - "epoch": 0.903892980610251, - "flos": 25703026830720.0, - "grad_norm": 1.9648590511752269, - "language_loss": 0.73957044, - "learning_rate": 9.605405312956105e-08, - "loss": 0.76070094, - "num_input_tokens_seen": 324182510, - "step": 15034, - "time_per_iteration": 2.7564845085144043 - }, - { - "auxiliary_loss_clip": 0.01076984, - "auxiliary_loss_mlp": 0.01032868, - "balance_loss_clip": 1.03676867, - "balance_loss_mlp": 1.02031517, - "epoch": 0.9039531038629189, - "flos": 14684089397760.0, - "grad_norm": 2.177722949634339, - "language_loss": 0.6356231, - "learning_rate": 9.593484304676791e-08, - "loss": 0.65672159, - "num_input_tokens_seen": 324200555, - "step": 15035, - "time_per_iteration": 2.714242935180664 - }, - { - "auxiliary_loss_clip": 0.01109298, - "auxiliary_loss_mlp": 0.01032355, - "balance_loss_clip": 1.0378021, - "balance_loss_mlp": 1.01890254, - "epoch": 0.904013227115587, - "flos": 24024921287040.0, - "grad_norm": 2.5713675612269897, - "language_loss": 0.61697221, - "learning_rate": 9.581570516631643e-08, - "loss": 0.63838875, - "num_input_tokens_seen": 324220255, - "step": 15036, - "time_per_iteration": 2.6531126499176025 - }, - { - "auxiliary_loss_clip": 0.01057116, - "auxiliary_loss_mlp": 0.01033081, - "balance_loss_clip": 1.03590751, - "balance_loss_mlp": 1.02079058, - "epoch": 0.9040733503682549, - "flos": 22856459863680.0, - "grad_norm": 1.6688110224130346, - "language_loss": 0.82059491, - "learning_rate": 9.569663949272455e-08, - "loss": 0.84149683, - "num_input_tokens_seen": 324237855, - "step": 15037, - "time_per_iteration": 2.667306661605835 - }, - { - "auxiliary_loss_clip": 0.01111291, - "auxiliary_loss_mlp": 0.01029139, - "balance_loss_clip": 1.03720188, - "balance_loss_mlp": 1.01677668, - "epoch": 0.9041334736209229, - "flos": 19974951941760.0, - "grad_norm": 1.9034264024294631, - "language_loss": 0.67595971, - "learning_rate": 9.557764603050667e-08, - "loss": 0.69736397, - "num_input_tokens_seen": 324257050, - "step": 15038, - "time_per_iteration": 2.546713352203369 - }, - { - "auxiliary_loss_clip": 0.01085126, - "auxiliary_loss_mlp": 0.0103871, - "balance_loss_clip": 1.03417087, - "balance_loss_mlp": 1.02606213, - "epoch": 0.9041935968735909, - "flos": 17530548624000.0, - "grad_norm": 2.007069946827801, - "language_loss": 0.7516647, - "learning_rate": 9.545872478417494e-08, - "loss": 0.77290308, - "num_input_tokens_seen": 324275510, - "step": 15039, - "time_per_iteration": 2.6198740005493164 - }, - { - "auxiliary_loss_clip": 0.01082867, - "auxiliary_loss_mlp": 0.01030216, - "balance_loss_clip": 1.03606772, - "balance_loss_mlp": 1.01828885, - "epoch": 0.9042537201262588, - "flos": 22780149419520.0, - "grad_norm": 1.4865254834014996, - "language_loss": 0.70274264, - "learning_rate": 9.533987575823977e-08, - "loss": 0.7238735, - "num_input_tokens_seen": 324295150, - "step": 15040, - "time_per_iteration": 2.6253907680511475 - }, - { - "auxiliary_loss_clip": 0.01073575, - "auxiliary_loss_mlp": 0.01031021, - "balance_loss_clip": 1.03373194, - "balance_loss_mlp": 1.01905835, - "epoch": 0.9043138433789268, - "flos": 20595416497920.0, - "grad_norm": 1.5884049488424423, - "language_loss": 0.67547166, - "learning_rate": 9.522109895720709e-08, - "loss": 0.69651759, - "num_input_tokens_seen": 324313855, - "step": 15041, - "time_per_iteration": 2.6538193225860596 - }, - { - "auxiliary_loss_clip": 0.01096511, - "auxiliary_loss_mlp": 0.01033073, - "balance_loss_clip": 1.0354538, - "balance_loss_mlp": 1.02016878, - "epoch": 0.9043739666315948, - "flos": 32962978995840.0, - "grad_norm": 1.757404597325935, - "language_loss": 0.57556689, - "learning_rate": 9.510239438558155e-08, - "loss": 0.59686273, - "num_input_tokens_seen": 324338465, - "step": 15042, - "time_per_iteration": 2.7718114852905273 - }, - { - "auxiliary_loss_clip": 0.01010523, - "auxiliary_loss_mlp": 0.00751383, - "balance_loss_clip": 1.00739527, - "balance_loss_mlp": 0.99962682, - "epoch": 0.9044340898842628, - "flos": 67296418525440.0, - "grad_norm": 0.79646583953312, - "language_loss": 0.56897914, - "learning_rate": 9.498376204786351e-08, - "loss": 0.58659816, - "num_input_tokens_seen": 324398740, - "step": 15043, - "time_per_iteration": 3.1866395473480225 - }, - { - "auxiliary_loss_clip": 0.01086756, - "auxiliary_loss_mlp": 0.01031593, - "balance_loss_clip": 1.03518081, - "balance_loss_mlp": 1.01791954, - "epoch": 0.9044942131369307, - "flos": 17713154390400.0, - "grad_norm": 2.037927105640118, - "language_loss": 0.69802731, - "learning_rate": 9.486520194855274e-08, - "loss": 0.71921074, - "num_input_tokens_seen": 324417335, - "step": 15044, - "time_per_iteration": 2.6936917304992676 - }, - { - "auxiliary_loss_clip": 0.01089873, - "auxiliary_loss_mlp": 0.01040675, - "balance_loss_clip": 1.03643358, - "balance_loss_mlp": 1.02699018, - "epoch": 0.9045543363895987, - "flos": 17820563034240.0, - "grad_norm": 2.361452153722679, - "language_loss": 0.69954962, - "learning_rate": 9.474671409214407e-08, - "loss": 0.72085512, - "num_input_tokens_seen": 324433240, - "step": 15045, - "time_per_iteration": 2.655958414077759 - }, - { - "auxiliary_loss_clip": 0.01077221, - "auxiliary_loss_mlp": 0.01037261, - "balance_loss_clip": 1.0350486, - "balance_loss_mlp": 1.02417183, - "epoch": 0.9046144596422666, - "flos": 21872723109120.0, - "grad_norm": 1.816781294987572, - "language_loss": 0.65513825, - "learning_rate": 9.462829848313081e-08, - "loss": 0.67628312, - "num_input_tokens_seen": 324452675, - "step": 15046, - "time_per_iteration": 2.704993963241577 - }, - { - "auxiliary_loss_clip": 0.01077406, - "auxiliary_loss_mlp": 0.01039336, - "balance_loss_clip": 1.03620148, - "balance_loss_mlp": 1.02714109, - "epoch": 0.9046745828949346, - "flos": 17672646827520.0, - "grad_norm": 2.027120160637291, - "language_loss": 0.62039495, - "learning_rate": 9.450995512600379e-08, - "loss": 0.6415624, - "num_input_tokens_seen": 324467865, - "step": 15047, - "time_per_iteration": 2.731316089630127 - }, - { - "auxiliary_loss_clip": 0.01109878, - "auxiliary_loss_mlp": 0.00770221, - "balance_loss_clip": 1.03869438, - "balance_loss_mlp": 1.00023651, - "epoch": 0.9047347061476025, - "flos": 25702559953920.0, - "grad_norm": 1.5037316307134132, - "language_loss": 0.71319842, - "learning_rate": 9.439168402525032e-08, - "loss": 0.73199946, - "num_input_tokens_seen": 324490430, - "step": 15048, - "time_per_iteration": 5.092748403549194 - }, - { - "auxiliary_loss_clip": 0.01098767, - "auxiliary_loss_mlp": 0.0103572, - "balance_loss_clip": 1.03479016, - "balance_loss_mlp": 1.02233887, - "epoch": 0.9047948294002706, - "flos": 15158146118400.0, - "grad_norm": 2.1618818731676637, - "language_loss": 0.748658, - "learning_rate": 9.427348518535483e-08, - "loss": 0.7700029, - "num_input_tokens_seen": 324506620, - "step": 15049, - "time_per_iteration": 4.3224146366119385 - }, - { - "auxiliary_loss_clip": 0.01095393, - "auxiliary_loss_mlp": 0.0103323, - "balance_loss_clip": 1.0372622, - "balance_loss_mlp": 1.02072453, - "epoch": 0.9048549526529385, - "flos": 21872292145920.0, - "grad_norm": 2.5225470406592105, - "language_loss": 0.75863099, - "learning_rate": 9.415535861079993e-08, - "loss": 0.77991724, - "num_input_tokens_seen": 324525505, - "step": 15050, - "time_per_iteration": 4.230266094207764 - }, - { - "auxiliary_loss_clip": 0.01109636, - "auxiliary_loss_mlp": 0.00769663, - "balance_loss_clip": 1.03721118, - "balance_loss_mlp": 1.00019288, - "epoch": 0.9049150759056065, - "flos": 23546626761600.0, - "grad_norm": 1.8328342559703832, - "language_loss": 0.81820488, - "learning_rate": 9.403730430606472e-08, - "loss": 0.83699787, - "num_input_tokens_seen": 324544415, - "step": 15051, - "time_per_iteration": 4.13810133934021 - }, - { - "auxiliary_loss_clip": 0.0109796, - "auxiliary_loss_mlp": 0.01030797, - "balance_loss_clip": 1.03711987, - "balance_loss_mlp": 1.01926315, - "epoch": 0.9049751991582745, - "flos": 19645902426240.0, - "grad_norm": 2.063226238004681, - "language_loss": 0.89144683, - "learning_rate": 9.391932227562582e-08, - "loss": 0.91273439, - "num_input_tokens_seen": 324562555, - "step": 15052, - "time_per_iteration": 2.5994207859039307 - }, - { - "auxiliary_loss_clip": 0.01101275, - "auxiliary_loss_mlp": 0.01032827, - "balance_loss_clip": 1.03616786, - "balance_loss_mlp": 1.02020848, - "epoch": 0.9050353224109424, - "flos": 15596220389760.0, - "grad_norm": 3.6081086448903616, - "language_loss": 0.77183485, - "learning_rate": 9.380141252395724e-08, - "loss": 0.79317588, - "num_input_tokens_seen": 324580865, - "step": 15053, - "time_per_iteration": 2.546614170074463 - }, - { - "auxiliary_loss_clip": 0.01095283, - "auxiliary_loss_mlp": 0.01033259, - "balance_loss_clip": 1.03654027, - "balance_loss_mlp": 1.02096224, - "epoch": 0.9050954456636104, - "flos": 28183592165760.0, - "grad_norm": 2.4176866972554927, - "language_loss": 0.73160625, - "learning_rate": 9.368357505553049e-08, - "loss": 0.75289166, - "num_input_tokens_seen": 324600665, - "step": 15054, - "time_per_iteration": 2.658132553100586 - }, - { - "auxiliary_loss_clip": 0.01054009, - "auxiliary_loss_mlp": 0.01034334, - "balance_loss_clip": 1.03092122, - "balance_loss_mlp": 1.0217638, - "epoch": 0.9051555689162784, - "flos": 25731611078400.0, - "grad_norm": 1.6662566471194642, - "language_loss": 0.83386469, - "learning_rate": 9.356580987481333e-08, - "loss": 0.85474813, - "num_input_tokens_seen": 324618145, - "step": 15055, - "time_per_iteration": 2.7756059169769287 - }, - { - "auxiliary_loss_clip": 0.01094483, - "auxiliary_loss_mlp": 0.01034211, - "balance_loss_clip": 1.03571832, - "balance_loss_mlp": 1.02193809, - "epoch": 0.9052156921689464, - "flos": 23257258796160.0, - "grad_norm": 1.7590583279943084, - "language_loss": 0.85093272, - "learning_rate": 9.344811698627176e-08, - "loss": 0.87221962, - "num_input_tokens_seen": 324638165, - "step": 15056, - "time_per_iteration": 2.6432409286499023 - }, - { - "auxiliary_loss_clip": 0.01079366, - "auxiliary_loss_mlp": 0.01028685, - "balance_loss_clip": 1.03504348, - "balance_loss_mlp": 1.01706791, - "epoch": 0.9052758154216143, - "flos": 29564285097600.0, - "grad_norm": 2.874678812458683, - "language_loss": 0.72274697, - "learning_rate": 9.333049639436863e-08, - "loss": 0.74382746, - "num_input_tokens_seen": 324658560, - "step": 15057, - "time_per_iteration": 2.729560613632202 - }, - { - "auxiliary_loss_clip": 0.0109434, - "auxiliary_loss_mlp": 0.01032293, - "balance_loss_clip": 1.03419363, - "balance_loss_mlp": 1.02033675, - "epoch": 0.9053359386742823, - "flos": 22127688823680.0, - "grad_norm": 1.7504023555736803, - "language_loss": 0.80625844, - "learning_rate": 9.321294810356418e-08, - "loss": 0.82752472, - "num_input_tokens_seen": 324679185, - "step": 15058, - "time_per_iteration": 2.7866742610931396 - }, - { - "auxiliary_loss_clip": 0.01016738, - "auxiliary_loss_mlp": 0.01001155, - "balance_loss_clip": 1.00546241, - "balance_loss_mlp": 1.00027263, - "epoch": 0.9053960619269502, - "flos": 67090112760960.0, - "grad_norm": 0.6742645002897684, - "language_loss": 0.51343101, - "learning_rate": 9.309547211831592e-08, - "loss": 0.53360993, - "num_input_tokens_seen": 324744830, - "step": 15059, - "time_per_iteration": 3.2885544300079346 - }, - { - "auxiliary_loss_clip": 0.01072001, - "auxiliary_loss_mlp": 0.01030876, - "balance_loss_clip": 1.04110169, - "balance_loss_mlp": 1.0184902, - "epoch": 0.9054561851796182, - "flos": 15815419136640.0, - "grad_norm": 1.7140803408550112, - "language_loss": 0.67263991, - "learning_rate": 9.297806844307831e-08, - "loss": 0.69366872, - "num_input_tokens_seen": 324762905, - "step": 15060, - "time_per_iteration": 2.8112542629241943 - }, - { - "auxiliary_loss_clip": 0.01089234, - "auxiliary_loss_mlp": 0.01032459, - "balance_loss_clip": 1.03664804, - "balance_loss_mlp": 1.01979876, - "epoch": 0.9055163084322861, - "flos": 17566997950080.0, - "grad_norm": 2.3975546753010915, - "language_loss": 0.64229333, - "learning_rate": 9.286073708230357e-08, - "loss": 0.66351026, - "num_input_tokens_seen": 324781905, - "step": 15061, - "time_per_iteration": 2.6348559856414795 - }, - { - "auxiliary_loss_clip": 0.01083114, - "auxiliary_loss_mlp": 0.01038728, - "balance_loss_clip": 1.03490663, - "balance_loss_mlp": 1.02568662, - "epoch": 0.9055764316849542, - "flos": 17639573379840.0, - "grad_norm": 1.6952248050793448, - "language_loss": 0.71770173, - "learning_rate": 9.274347804044058e-08, - "loss": 0.73892021, - "num_input_tokens_seen": 324799260, - "step": 15062, - "time_per_iteration": 2.889420986175537 - }, - { - "auxiliary_loss_clip": 0.01106793, - "auxiliary_loss_mlp": 0.01031859, - "balance_loss_clip": 1.03594065, - "balance_loss_mlp": 1.01968181, - "epoch": 0.9056365549376221, - "flos": 20120856986880.0, - "grad_norm": 2.4465454482745534, - "language_loss": 0.71081591, - "learning_rate": 9.2626291321936e-08, - "loss": 0.73220247, - "num_input_tokens_seen": 324817800, - "step": 15063, - "time_per_iteration": 2.5845255851745605 - }, - { - "auxiliary_loss_clip": 0.01066505, - "auxiliary_loss_mlp": 0.01033382, - "balance_loss_clip": 1.03441405, - "balance_loss_mlp": 1.02137733, - "epoch": 0.9056966781902901, - "flos": 27598786836480.0, - "grad_norm": 1.6140764840552748, - "language_loss": 0.72192168, - "learning_rate": 9.250917693123406e-08, - "loss": 0.74292052, - "num_input_tokens_seen": 324838445, - "step": 15064, - "time_per_iteration": 2.711472511291504 - }, - { - "auxiliary_loss_clip": 0.01099676, - "auxiliary_loss_mlp": 0.01032131, - "balance_loss_clip": 1.0358665, - "balance_loss_mlp": 1.01976943, - "epoch": 0.9057568014429581, - "flos": 25920106675200.0, - "grad_norm": 1.9283380616790378, - "language_loss": 0.69733697, - "learning_rate": 9.23921348727752e-08, - "loss": 0.71865511, - "num_input_tokens_seen": 324859895, - "step": 15065, - "time_per_iteration": 2.6254019737243652 - }, - { - "auxiliary_loss_clip": 0.01076646, - "auxiliary_loss_mlp": 0.01034346, - "balance_loss_clip": 1.03431368, - "balance_loss_mlp": 1.02240729, - "epoch": 0.905816924695626, - "flos": 22930364096640.0, - "grad_norm": 1.5639103383265116, - "language_loss": 0.62895906, - "learning_rate": 9.227516515099743e-08, - "loss": 0.65006894, - "num_input_tokens_seen": 324879580, - "step": 15066, - "time_per_iteration": 2.7154438495635986 - }, - { - "auxiliary_loss_clip": 0.01035849, - "auxiliary_loss_mlp": 0.01033037, - "balance_loss_clip": 1.02947974, - "balance_loss_mlp": 1.01869655, - "epoch": 0.905877047948294, - "flos": 22157422306560.0, - "grad_norm": 1.934180125308043, - "language_loss": 0.80121052, - "learning_rate": 9.215826777033675e-08, - "loss": 0.82189941, - "num_input_tokens_seen": 324898950, - "step": 15067, - "time_per_iteration": 2.7812981605529785 - }, - { - "auxiliary_loss_clip": 0.0108924, - "auxiliary_loss_mlp": 0.01033309, - "balance_loss_clip": 1.0376116, - "balance_loss_mlp": 1.02020836, - "epoch": 0.905937171200962, - "flos": 15304805349120.0, - "grad_norm": 1.6228923811634084, - "language_loss": 0.70006502, - "learning_rate": 9.204144273522563e-08, - "loss": 0.72129059, - "num_input_tokens_seen": 324917455, - "step": 15068, - "time_per_iteration": 2.865957021713257 - }, - { - "auxiliary_loss_clip": 0.01104355, - "auxiliary_loss_mlp": 0.01028972, - "balance_loss_clip": 1.03523481, - "balance_loss_mlp": 1.01681864, - "epoch": 0.90599729445363, - "flos": 19462973437440.0, - "grad_norm": 2.0548899338022064, - "language_loss": 0.85366511, - "learning_rate": 9.19246900500943e-08, - "loss": 0.87499845, - "num_input_tokens_seen": 324934495, - "step": 15069, - "time_per_iteration": 2.5832648277282715 - }, - { - "auxiliary_loss_clip": 0.01100336, - "auxiliary_loss_mlp": 0.01032536, - "balance_loss_clip": 1.03515148, - "balance_loss_mlp": 1.01913118, - "epoch": 0.9060574177062979, - "flos": 23732967542400.0, - "grad_norm": 1.7734674553578826, - "language_loss": 0.59089136, - "learning_rate": 9.180800971936987e-08, - "loss": 0.61222005, - "num_input_tokens_seen": 324953230, - "step": 15070, - "time_per_iteration": 2.6578190326690674 - }, - { - "auxiliary_loss_clip": 0.01073063, - "auxiliary_loss_mlp": 0.01029432, - "balance_loss_clip": 1.03542089, - "balance_loss_mlp": 1.01644397, - "epoch": 0.9061175409589659, - "flos": 17311134395520.0, - "grad_norm": 2.114967180727135, - "language_loss": 0.81690538, - "learning_rate": 9.169140174747724e-08, - "loss": 0.83793026, - "num_input_tokens_seen": 324969880, - "step": 15071, - "time_per_iteration": 2.677042245864868 - }, - { - "auxiliary_loss_clip": 0.0111224, - "auxiliary_loss_mlp": 0.01041359, - "balance_loss_clip": 1.03753805, - "balance_loss_mlp": 1.02798986, - "epoch": 0.9061776642116338, - "flos": 17778439359360.0, - "grad_norm": 1.8991196777690924, - "language_loss": 0.61947775, - "learning_rate": 9.157486613883758e-08, - "loss": 0.64101374, - "num_input_tokens_seen": 324987005, - "step": 15072, - "time_per_iteration": 2.5581016540527344 - }, - { - "auxiliary_loss_clip": 0.01088368, - "auxiliary_loss_mlp": 0.01035581, - "balance_loss_clip": 1.03575015, - "balance_loss_mlp": 1.02321947, - "epoch": 0.9062377874643018, - "flos": 42777688037760.0, - "grad_norm": 1.883547115522317, - "language_loss": 0.73039377, - "learning_rate": 9.145840289787021e-08, - "loss": 0.75163323, - "num_input_tokens_seen": 325010700, - "step": 15073, - "time_per_iteration": 2.933929681777954 - }, - { - "auxiliary_loss_clip": 0.01094334, - "auxiliary_loss_mlp": 0.01027115, - "balance_loss_clip": 1.0359031, - "balance_loss_mlp": 1.01591563, - "epoch": 0.9062979107169697, - "flos": 16361620323840.0, - "grad_norm": 1.8214785876499617, - "language_loss": 0.8087334, - "learning_rate": 9.134201202899161e-08, - "loss": 0.82994789, - "num_input_tokens_seen": 325028760, - "step": 15074, - "time_per_iteration": 2.6201162338256836 - }, - { - "auxiliary_loss_clip": 0.00984336, - "auxiliary_loss_mlp": 0.00752175, - "balance_loss_clip": 1.00953913, - "balance_loss_mlp": 0.99961203, - "epoch": 0.9063580339696378, - "flos": 69313988528640.0, - "grad_norm": 0.7424455220001136, - "language_loss": 0.52306926, - "learning_rate": 9.122569353661513e-08, - "loss": 0.54043436, - "num_input_tokens_seen": 325093545, - "step": 15075, - "time_per_iteration": 3.318652391433716 - }, - { - "auxiliary_loss_clip": 0.00997512, - "auxiliary_loss_mlp": 0.00998485, - "balance_loss_clip": 1.0082109, - "balance_loss_mlp": 0.99731654, - "epoch": 0.9064181572223057, - "flos": 58794747148800.0, - "grad_norm": 0.7354115302623626, - "language_loss": 0.62038195, - "learning_rate": 9.11094474251517e-08, - "loss": 0.640342, - "num_input_tokens_seen": 325152295, - "step": 15076, - "time_per_iteration": 3.1302971839904785 - }, - { - "auxiliary_loss_clip": 0.01095732, - "auxiliary_loss_mlp": 0.01035747, - "balance_loss_clip": 1.0357511, - "balance_loss_mlp": 1.0237844, - "epoch": 0.9064782804749737, - "flos": 21762692772480.0, - "grad_norm": 1.7300331938520934, - "language_loss": 0.81917107, - "learning_rate": 9.09932736990091e-08, - "loss": 0.84048593, - "num_input_tokens_seen": 325169705, - "step": 15077, - "time_per_iteration": 2.6315958499908447 - }, - { - "auxiliary_loss_clip": 0.01081763, - "auxiliary_loss_mlp": 0.00769991, - "balance_loss_clip": 1.03210878, - "balance_loss_mlp": 1.00007868, - "epoch": 0.9065384037276417, - "flos": 21397373498880.0, - "grad_norm": 1.5468663255290942, - "language_loss": 0.83872044, - "learning_rate": 9.08771723625934e-08, - "loss": 0.85723794, - "num_input_tokens_seen": 325189175, - "step": 15078, - "time_per_iteration": 2.727109670639038 - }, - { - "auxiliary_loss_clip": 0.01093852, - "auxiliary_loss_mlp": 0.00770079, - "balance_loss_clip": 1.03619432, - "balance_loss_mlp": 1.00015736, - "epoch": 0.9065985269803096, - "flos": 38283646849920.0, - "grad_norm": 1.6827515544701097, - "language_loss": 0.65606648, - "learning_rate": 9.076114342030617e-08, - "loss": 0.67470574, - "num_input_tokens_seen": 325211020, - "step": 15079, - "time_per_iteration": 2.771944999694824 - }, - { - "auxiliary_loss_clip": 0.01028805, - "auxiliary_loss_mlp": 0.01027589, - "balance_loss_clip": 1.03047419, - "balance_loss_mlp": 1.0151794, - "epoch": 0.9066586502329776, - "flos": 44818562989440.0, - "grad_norm": 1.7893675619126004, - "language_loss": 0.70638371, - "learning_rate": 9.064518687654765e-08, - "loss": 0.72694761, - "num_input_tokens_seen": 325236970, - "step": 15080, - "time_per_iteration": 2.9839913845062256 - }, - { - "auxiliary_loss_clip": 0.01096514, - "auxiliary_loss_mlp": 0.01031379, - "balance_loss_clip": 1.03848863, - "balance_loss_mlp": 1.01837301, - "epoch": 0.9067187734856456, - "flos": 18623992492800.0, - "grad_norm": 2.4819155827069452, - "language_loss": 0.71019328, - "learning_rate": 9.052930273571547e-08, - "loss": 0.73147219, - "num_input_tokens_seen": 325252670, - "step": 15081, - "time_per_iteration": 2.5639331340789795 - }, - { - "auxiliary_loss_clip": 0.01082423, - "auxiliary_loss_mlp": 0.01034663, - "balance_loss_clip": 1.03733432, - "balance_loss_mlp": 1.02240872, - "epoch": 0.9067788967383136, - "flos": 22747578762240.0, - "grad_norm": 5.90815505153055, - "language_loss": 0.7437706, - "learning_rate": 9.04134910022032e-08, - "loss": 0.76494145, - "num_input_tokens_seen": 325273860, - "step": 15082, - "time_per_iteration": 2.6862359046936035 - }, - { - "auxiliary_loss_clip": 0.01073569, - "auxiliary_loss_mlp": 0.0103586, - "balance_loss_clip": 1.03576851, - "balance_loss_mlp": 1.02364099, - "epoch": 0.9068390199909815, - "flos": 27670787648640.0, - "grad_norm": 2.0228960329106904, - "language_loss": 0.78056735, - "learning_rate": 9.029775168040266e-08, - "loss": 0.80166161, - "num_input_tokens_seen": 325294140, - "step": 15083, - "time_per_iteration": 2.7631537914276123 - }, - { - "auxiliary_loss_clip": 0.01082943, - "auxiliary_loss_mlp": 0.0076928, - "balance_loss_clip": 1.03722239, - "balance_loss_mlp": 1.00023723, - "epoch": 0.9068991432436495, - "flos": 24244012293120.0, - "grad_norm": 1.5997317426680842, - "language_loss": 0.68783748, - "learning_rate": 9.01820847747028e-08, - "loss": 0.70635974, - "num_input_tokens_seen": 325313130, - "step": 15084, - "time_per_iteration": 2.720623731613159 - }, - { - "auxiliary_loss_clip": 0.01108826, - "auxiliary_loss_mlp": 0.01031366, - "balance_loss_clip": 1.03775597, - "balance_loss_mlp": 1.01930761, - "epoch": 0.9069592664963174, - "flos": 28033305661440.0, - "grad_norm": 7.23400764704548, - "language_loss": 0.67128915, - "learning_rate": 9.006649028948965e-08, - "loss": 0.69269109, - "num_input_tokens_seen": 325334880, - "step": 15085, - "time_per_iteration": 2.6862213611602783 - }, - { - "auxiliary_loss_clip": 0.00998184, - "auxiliary_loss_mlp": 0.01017743, - "balance_loss_clip": 1.00960755, - "balance_loss_mlp": 1.01620471, - "epoch": 0.9070193897489854, - "flos": 68778414789120.0, - "grad_norm": 0.7963063657697701, - "language_loss": 0.61316264, - "learning_rate": 8.995096822914638e-08, - "loss": 0.63332188, - "num_input_tokens_seen": 325394175, - "step": 15086, - "time_per_iteration": 3.2537643909454346 - }, - { - "auxiliary_loss_clip": 0.01093775, - "auxiliary_loss_mlp": 0.01038417, - "balance_loss_clip": 1.03427684, - "balance_loss_mlp": 1.02487493, - "epoch": 0.9070795130016533, - "flos": 23441624328960.0, - "grad_norm": 1.436388517862248, - "language_loss": 0.72142053, - "learning_rate": 8.983551859805416e-08, - "loss": 0.74274248, - "num_input_tokens_seen": 325415020, - "step": 15087, - "time_per_iteration": 4.312045335769653 - }, - { - "auxiliary_loss_clip": 0.01084735, - "auxiliary_loss_mlp": 0.01027139, - "balance_loss_clip": 1.03397894, - "balance_loss_mlp": 1.01522434, - "epoch": 0.9071396362543214, - "flos": 18916413114240.0, - "grad_norm": 1.949639239308053, - "language_loss": 0.76511991, - "learning_rate": 8.972014140059058e-08, - "loss": 0.78623861, - "num_input_tokens_seen": 325433595, - "step": 15088, - "time_per_iteration": 4.274383783340454 - }, - { - "auxiliary_loss_clip": 0.01073577, - "auxiliary_loss_mlp": 0.01032164, - "balance_loss_clip": 1.03376746, - "balance_loss_mlp": 1.02019525, - "epoch": 0.9071997595069893, - "flos": 25228646887680.0, - "grad_norm": 1.7650984011067665, - "language_loss": 0.73451883, - "learning_rate": 8.960483664113038e-08, - "loss": 0.75557625, - "num_input_tokens_seen": 325451605, - "step": 15089, - "time_per_iteration": 4.142383575439453 - }, - { - "auxiliary_loss_clip": 0.01103445, - "auxiliary_loss_mlp": 0.01034573, - "balance_loss_clip": 1.03631544, - "balance_loss_mlp": 1.02313471, - "epoch": 0.9072598827596573, - "flos": 24346608514560.0, - "grad_norm": 1.785554810845489, - "language_loss": 0.75460756, - "learning_rate": 8.948960432404628e-08, - "loss": 0.77598774, - "num_input_tokens_seen": 325470645, - "step": 15090, - "time_per_iteration": 4.125551462173462 - }, - { - "auxiliary_loss_clip": 0.01081669, - "auxiliary_loss_mlp": 0.01030269, - "balance_loss_clip": 1.03531027, - "balance_loss_mlp": 1.0168643, - "epoch": 0.9073200060123253, - "flos": 22674967418880.0, - "grad_norm": 2.644042732321969, - "language_loss": 0.7796579, - "learning_rate": 8.93744444537079e-08, - "loss": 0.8007772, - "num_input_tokens_seen": 325488070, - "step": 15091, - "time_per_iteration": 2.611660957336426 - }, - { - "auxiliary_loss_clip": 0.01080451, - "auxiliary_loss_mlp": 0.01025973, - "balance_loss_clip": 1.03320861, - "balance_loss_mlp": 1.01513076, - "epoch": 0.9073801292649932, - "flos": 23695476721920.0, - "grad_norm": 1.8611765559863347, - "language_loss": 0.85915703, - "learning_rate": 8.925935703448217e-08, - "loss": 0.88022125, - "num_input_tokens_seen": 325509285, - "step": 15092, - "time_per_iteration": 2.6740128993988037 - }, - { - "auxiliary_loss_clip": 0.01084789, - "auxiliary_loss_mlp": 0.0103167, - "balance_loss_clip": 1.03747833, - "balance_loss_mlp": 1.0196414, - "epoch": 0.9074402525176612, - "flos": 25375413859200.0, - "grad_norm": 1.5044941360603252, - "language_loss": 0.78849494, - "learning_rate": 8.914434207073296e-08, - "loss": 0.80965954, - "num_input_tokens_seen": 325529360, - "step": 15093, - "time_per_iteration": 2.680701494216919 - }, - { - "auxiliary_loss_clip": 0.01019381, - "auxiliary_loss_mlp": 0.01002565, - "balance_loss_clip": 1.00606823, - "balance_loss_mlp": 1.00151622, - "epoch": 0.9075003757703292, - "flos": 67649024384640.0, - "grad_norm": 0.7360353686888242, - "language_loss": 0.56958818, - "learning_rate": 8.902939956682188e-08, - "loss": 0.58980775, - "num_input_tokens_seen": 325583565, - "step": 15094, - "time_per_iteration": 3.086918592453003 - }, - { - "auxiliary_loss_clip": 0.01099075, - "auxiliary_loss_mlp": 0.01034812, - "balance_loss_clip": 1.03545427, - "balance_loss_mlp": 1.02190804, - "epoch": 0.9075604990229972, - "flos": 22453649769600.0, - "grad_norm": 1.9406797492354237, - "language_loss": 0.71160638, - "learning_rate": 8.891452952710742e-08, - "loss": 0.73294526, - "num_input_tokens_seen": 325603690, - "step": 15095, - "time_per_iteration": 2.6372621059417725 - }, - { - "auxiliary_loss_clip": 0.01066408, - "auxiliary_loss_mlp": 0.01034351, - "balance_loss_clip": 1.0342015, - "balance_loss_mlp": 1.02175641, - "epoch": 0.9076206222756651, - "flos": 19536662188800.0, - "grad_norm": 2.201890556865997, - "language_loss": 0.7416867, - "learning_rate": 8.879973195594526e-08, - "loss": 0.76269424, - "num_input_tokens_seen": 325622255, - "step": 15096, - "time_per_iteration": 2.7420341968536377 - }, - { - "auxiliary_loss_clip": 0.01109715, - "auxiliary_loss_mlp": 0.01038367, - "balance_loss_clip": 1.03712845, - "balance_loss_mlp": 1.02484858, - "epoch": 0.9076807455283331, - "flos": 30116914819200.0, - "grad_norm": 1.8892024302552053, - "language_loss": 0.56777847, - "learning_rate": 8.868500685768898e-08, - "loss": 0.58925933, - "num_input_tokens_seen": 325640165, - "step": 15097, - "time_per_iteration": 2.66786527633667 - }, - { - "auxiliary_loss_clip": 0.01085602, - "auxiliary_loss_mlp": 0.01024669, - "balance_loss_clip": 1.03317809, - "balance_loss_mlp": 1.01340389, - "epoch": 0.907740868781001, - "flos": 18697537589760.0, - "grad_norm": 1.7446964488150043, - "language_loss": 0.79539967, - "learning_rate": 8.857035423668935e-08, - "loss": 0.81650233, - "num_input_tokens_seen": 325659455, - "step": 15098, - "time_per_iteration": 2.6101489067077637 - }, - { - "auxiliary_loss_clip": 0.010671, - "auxiliary_loss_mlp": 0.00771611, - "balance_loss_clip": 1.03485239, - "balance_loss_mlp": 1.00026011, - "epoch": 0.907800992033669, - "flos": 22638805401600.0, - "grad_norm": 18.550819833404994, - "language_loss": 0.66001773, - "learning_rate": 8.845577409729266e-08, - "loss": 0.67840481, - "num_input_tokens_seen": 325678095, - "step": 15099, - "time_per_iteration": 2.782886266708374 - }, - { - "auxiliary_loss_clip": 0.01089093, - "auxiliary_loss_mlp": 0.0103633, - "balance_loss_clip": 1.03569531, - "balance_loss_mlp": 1.02341413, - "epoch": 0.907861115286337, - "flos": 21287666384640.0, - "grad_norm": 2.095035959000706, - "language_loss": 0.70761675, - "learning_rate": 8.834126644384477e-08, - "loss": 0.72887093, - "num_input_tokens_seen": 325695825, - "step": 15100, - "time_per_iteration": 2.718719482421875 - }, - { - "auxiliary_loss_clip": 0.01018547, - "auxiliary_loss_mlp": 0.01002357, - "balance_loss_clip": 1.00599432, - "balance_loss_mlp": 1.00136185, - "epoch": 0.907921238539005, - "flos": 69739493040000.0, - "grad_norm": 0.6221166311541254, - "language_loss": 0.5336588, - "learning_rate": 8.822683128068775e-08, - "loss": 0.55386788, - "num_input_tokens_seen": 325764515, - "step": 15101, - "time_per_iteration": 3.2601447105407715 - }, - { - "auxiliary_loss_clip": 0.01074173, - "auxiliary_loss_mlp": 0.0103007, - "balance_loss_clip": 1.03405142, - "balance_loss_mlp": 1.01715326, - "epoch": 0.9079813617916729, - "flos": 23477391296640.0, - "grad_norm": 1.6841110565912183, - "language_loss": 0.68209207, - "learning_rate": 8.811246861216081e-08, - "loss": 0.70313448, - "num_input_tokens_seen": 325783235, - "step": 15102, - "time_per_iteration": 2.6863279342651367 - }, - { - "auxiliary_loss_clip": 0.01094848, - "auxiliary_loss_mlp": 0.0103185, - "balance_loss_clip": 1.03587008, - "balance_loss_mlp": 1.01915479, - "epoch": 0.9080414850443409, - "flos": 22929933133440.0, - "grad_norm": 1.7674184353723423, - "language_loss": 0.79133558, - "learning_rate": 8.799817844260049e-08, - "loss": 0.81260264, - "num_input_tokens_seen": 325800195, - "step": 15103, - "time_per_iteration": 2.672898054122925 - }, - { - "auxiliary_loss_clip": 0.0108183, - "auxiliary_loss_mlp": 0.01033096, - "balance_loss_clip": 1.03430343, - "balance_loss_mlp": 1.02016127, - "epoch": 0.9081016082970089, - "flos": 26177083551360.0, - "grad_norm": 1.7434121737063208, - "language_loss": 0.71796912, - "learning_rate": 8.78839607763413e-08, - "loss": 0.73911834, - "num_input_tokens_seen": 325820215, - "step": 15104, - "time_per_iteration": 2.6979503631591797 - }, - { - "auxiliary_loss_clip": 0.01083633, - "auxiliary_loss_mlp": 0.01026392, - "balance_loss_clip": 1.03431463, - "balance_loss_mlp": 1.01508558, - "epoch": 0.9081617315496768, - "flos": 24462169545600.0, - "grad_norm": 1.7433195469593918, - "language_loss": 0.77697951, - "learning_rate": 8.77698156177138e-08, - "loss": 0.79807979, - "num_input_tokens_seen": 325838415, - "step": 15105, - "time_per_iteration": 2.693650722503662 - }, - { - "auxiliary_loss_clip": 0.01106144, - "auxiliary_loss_mlp": 0.00770719, - "balance_loss_clip": 1.03435302, - "balance_loss_mlp": 1.00018311, - "epoch": 0.9082218548023449, - "flos": 24746868743040.0, - "grad_norm": 2.4921159969268625, - "language_loss": 0.73882461, - "learning_rate": 8.765574297104628e-08, - "loss": 0.75759327, - "num_input_tokens_seen": 325855580, - "step": 15106, - "time_per_iteration": 2.6928508281707764 - }, - { - "auxiliary_loss_clip": 0.01059785, - "auxiliary_loss_mlp": 0.0103567, - "balance_loss_clip": 1.02983892, - "balance_loss_mlp": 1.02212226, - "epoch": 0.9082819780550128, - "flos": 24421302846720.0, - "grad_norm": 1.956694255658617, - "language_loss": 0.80682945, - "learning_rate": 8.754174284066462e-08, - "loss": 0.82778394, - "num_input_tokens_seen": 325874890, - "step": 15107, - "time_per_iteration": 2.8211913108825684 - }, - { - "auxiliary_loss_clip": 0.01003818, - "auxiliary_loss_mlp": 0.01000224, - "balance_loss_clip": 1.00530005, - "balance_loss_mlp": 0.99906158, - "epoch": 0.9083421013076808, - "flos": 59609704872960.0, - "grad_norm": 0.8163194562351376, - "language_loss": 0.59763622, - "learning_rate": 8.742781523089205e-08, - "loss": 0.61767673, - "num_input_tokens_seen": 325935835, - "step": 15108, - "time_per_iteration": 3.176673173904419 - }, - { - "auxiliary_loss_clip": 0.01085744, - "auxiliary_loss_mlp": 0.0102493, - "balance_loss_clip": 1.03396034, - "balance_loss_mlp": 1.01259756, - "epoch": 0.9084022245603487, - "flos": 33620216100480.0, - "grad_norm": 1.5754460951726812, - "language_loss": 0.73228884, - "learning_rate": 8.73139601460482e-08, - "loss": 0.75339556, - "num_input_tokens_seen": 325958035, - "step": 15109, - "time_per_iteration": 2.744368314743042 - }, - { - "auxiliary_loss_clip": 0.01072978, - "auxiliary_loss_mlp": 0.01028525, - "balance_loss_clip": 1.03370285, - "balance_loss_mlp": 1.01687264, - "epoch": 0.9084623478130167, - "flos": 24971705925120.0, - "grad_norm": 4.290837775967832, - "language_loss": 0.71557301, - "learning_rate": 8.720017759045073e-08, - "loss": 0.736588, - "num_input_tokens_seen": 325979870, - "step": 15110, - "time_per_iteration": 2.7875888347625732 - }, - { - "auxiliary_loss_clip": 0.0107739, - "auxiliary_loss_mlp": 0.01035324, - "balance_loss_clip": 1.03073955, - "balance_loss_mlp": 1.0219785, - "epoch": 0.9085224710656846, - "flos": 31461804869760.0, - "grad_norm": 1.8448389320189542, - "language_loss": 0.69122839, - "learning_rate": 8.708646756841421e-08, - "loss": 0.71235561, - "num_input_tokens_seen": 325998245, - "step": 15111, - "time_per_iteration": 2.7633275985717773 - }, - { - "auxiliary_loss_clip": 0.00998747, - "auxiliary_loss_mlp": 0.01004801, - "balance_loss_clip": 1.00629544, - "balance_loss_mlp": 1.00380516, - "epoch": 0.9085825943183526, - "flos": 64917012867840.0, - "grad_norm": 0.6888629438196041, - "language_loss": 0.51703209, - "learning_rate": 8.697283008425026e-08, - "loss": 0.53706759, - "num_input_tokens_seen": 326061770, - "step": 15112, - "time_per_iteration": 3.2464187145233154 - }, - { - "auxiliary_loss_clip": 0.0109824, - "auxiliary_loss_mlp": 0.01033068, - "balance_loss_clip": 1.03503668, - "balance_loss_mlp": 1.02022314, - "epoch": 0.9086427175710206, - "flos": 18953221576320.0, - "grad_norm": 1.723855201970508, - "language_loss": 0.7027775, - "learning_rate": 8.685926514226837e-08, - "loss": 0.72409058, - "num_input_tokens_seen": 326080945, - "step": 15113, - "time_per_iteration": 2.615265130996704 - }, - { - "auxiliary_loss_clip": 0.01098496, - "auxiliary_loss_mlp": 0.01030884, - "balance_loss_clip": 1.03785408, - "balance_loss_mlp": 1.0189271, - "epoch": 0.9087028408236886, - "flos": 34014873807360.0, - "grad_norm": 2.0973757596387004, - "language_loss": 0.78994763, - "learning_rate": 8.674577274677508e-08, - "loss": 0.81124145, - "num_input_tokens_seen": 326100630, - "step": 15114, - "time_per_iteration": 2.7337305545806885 - }, - { - "auxiliary_loss_clip": 0.01070616, - "auxiliary_loss_mlp": 0.01033895, - "balance_loss_clip": 1.03684914, - "balance_loss_mlp": 1.02053201, - "epoch": 0.9087629640763565, - "flos": 21944580266880.0, - "grad_norm": 3.929458307617432, - "language_loss": 0.70178634, - "learning_rate": 8.663235290207405e-08, - "loss": 0.72283143, - "num_input_tokens_seen": 326120145, - "step": 15115, - "time_per_iteration": 2.751361131668091 - }, - { - "auxiliary_loss_clip": 0.01086218, - "auxiliary_loss_mlp": 0.01032577, - "balance_loss_clip": 1.03923655, - "balance_loss_mlp": 1.01895118, - "epoch": 0.9088230873290245, - "flos": 21762908254080.0, - "grad_norm": 2.3483964506042603, - "language_loss": 0.65777099, - "learning_rate": 8.651900561246561e-08, - "loss": 0.67895895, - "num_input_tokens_seen": 326140715, - "step": 15116, - "time_per_iteration": 2.715759754180908 - }, - { - "auxiliary_loss_clip": 0.01106542, - "auxiliary_loss_mlp": 0.01034021, - "balance_loss_clip": 1.0372858, - "balance_loss_mlp": 1.02119398, - "epoch": 0.9088832105816925, - "flos": 21541267382400.0, - "grad_norm": 6.342744698991267, - "language_loss": 0.69591606, - "learning_rate": 8.640573088224812e-08, - "loss": 0.71732175, - "num_input_tokens_seen": 326159130, - "step": 15117, - "time_per_iteration": 2.582552433013916 - }, - { - "auxiliary_loss_clip": 0.01066284, - "auxiliary_loss_mlp": 0.01026525, - "balance_loss_clip": 1.03425217, - "balance_loss_mlp": 1.01489568, - "epoch": 0.9089433338343604, - "flos": 25996704428160.0, - "grad_norm": 3.6808698691711856, - "language_loss": 0.74660701, - "learning_rate": 8.629252871571745e-08, - "loss": 0.76753509, - "num_input_tokens_seen": 326181375, - "step": 15118, - "time_per_iteration": 2.751481056213379 - }, - { - "auxiliary_loss_clip": 0.01083211, - "auxiliary_loss_mlp": 0.01034962, - "balance_loss_clip": 1.03344107, - "balance_loss_mlp": 1.02128291, - "epoch": 0.9090034570870285, - "flos": 21178426147200.0, - "grad_norm": 2.13733826102676, - "language_loss": 0.73172134, - "learning_rate": 8.617939911716554e-08, - "loss": 0.75290304, - "num_input_tokens_seen": 326199740, - "step": 15119, - "time_per_iteration": 2.7050302028656006 - }, - { - "auxiliary_loss_clip": 0.01073499, - "auxiliary_loss_mlp": 0.0103193, - "balance_loss_clip": 1.03588152, - "balance_loss_mlp": 1.01727891, - "epoch": 0.9090635803396964, - "flos": 16141811045760.0, - "grad_norm": 2.3309233232368376, - "language_loss": 0.71525586, - "learning_rate": 8.60663420908827e-08, - "loss": 0.73631012, - "num_input_tokens_seen": 326214350, - "step": 15120, - "time_per_iteration": 2.748596429824829 - }, - { - "auxiliary_loss_clip": 0.01109717, - "auxiliary_loss_mlp": 0.00770513, - "balance_loss_clip": 1.03689528, - "balance_loss_mlp": 1.0002079, - "epoch": 0.9091237035923644, - "flos": 20591537829120.0, - "grad_norm": 2.1685106805534002, - "language_loss": 0.65576839, - "learning_rate": 8.595335764115596e-08, - "loss": 0.67457068, - "num_input_tokens_seen": 326234580, - "step": 15121, - "time_per_iteration": 2.6541824340820312 - }, - { - "auxiliary_loss_clip": 0.01098528, - "auxiliary_loss_mlp": 0.01035932, - "balance_loss_clip": 1.03610933, - "balance_loss_mlp": 1.02343321, - "epoch": 0.9091838268450323, - "flos": 52227760164480.0, - "grad_norm": 2.467654081114951, - "language_loss": 0.70642388, - "learning_rate": 8.58404457722699e-08, - "loss": 0.72776842, - "num_input_tokens_seen": 326259080, - "step": 15122, - "time_per_iteration": 2.925644636154175 - }, - { - "auxiliary_loss_clip": 0.01052109, - "auxiliary_loss_mlp": 0.01030913, - "balance_loss_clip": 1.03168774, - "balance_loss_mlp": 1.01879561, - "epoch": 0.9092439500977003, - "flos": 20559613616640.0, - "grad_norm": 1.4208742035415944, - "language_loss": 0.74525023, - "learning_rate": 8.572760648850575e-08, - "loss": 0.76608044, - "num_input_tokens_seen": 326280175, - "step": 15123, - "time_per_iteration": 2.734441041946411 - }, - { - "auxiliary_loss_clip": 0.0109521, - "auxiliary_loss_mlp": 0.01033097, - "balance_loss_clip": 1.03593588, - "balance_loss_mlp": 1.02159882, - "epoch": 0.9093040733503682, - "flos": 28617859595520.0, - "grad_norm": 1.8896970450570774, - "language_loss": 0.7576673, - "learning_rate": 8.561483979414253e-08, - "loss": 0.77895033, - "num_input_tokens_seen": 326297990, - "step": 15124, - "time_per_iteration": 2.6362528800964355 - }, - { - "auxiliary_loss_clip": 0.01090802, - "auxiliary_loss_mlp": 0.01032412, - "balance_loss_clip": 1.03465593, - "balance_loss_mlp": 1.02002668, - "epoch": 0.9093641966030362, - "flos": 23440187784960.0, - "grad_norm": 1.8805276614968602, - "language_loss": 0.71919298, - "learning_rate": 8.55021456934566e-08, - "loss": 0.74042511, - "num_input_tokens_seen": 326316735, - "step": 15125, - "time_per_iteration": 2.5915915966033936 - }, - { - "auxiliary_loss_clip": 0.01068085, - "auxiliary_loss_mlp": 0.01035917, - "balance_loss_clip": 1.03625441, - "balance_loss_mlp": 1.02334034, - "epoch": 0.9094243198557042, - "flos": 16800197385600.0, - "grad_norm": 1.6292066230001099, - "language_loss": 0.79466188, - "learning_rate": 8.538952419072143e-08, - "loss": 0.8157019, - "num_input_tokens_seen": 326334370, - "step": 15126, - "time_per_iteration": 4.219731569290161 - }, - { - "auxiliary_loss_clip": 0.01065083, - "auxiliary_loss_mlp": 0.01034805, - "balance_loss_clip": 1.03633046, - "balance_loss_mlp": 1.02255654, - "epoch": 0.9094844431083722, - "flos": 24273278899200.0, - "grad_norm": 1.707765126078796, - "language_loss": 0.75641441, - "learning_rate": 8.527697529020694e-08, - "loss": 0.77741325, - "num_input_tokens_seen": 326353435, - "step": 15127, - "time_per_iteration": 2.7128138542175293 - }, - { - "auxiliary_loss_clip": 0.01027678, - "auxiliary_loss_mlp": 0.01033923, - "balance_loss_clip": 1.03002882, - "balance_loss_mlp": 1.02145934, - "epoch": 0.9095445663610401, - "flos": 21944652094080.0, - "grad_norm": 1.8998405875281965, - "language_loss": 0.62571168, - "learning_rate": 8.516449899618173e-08, - "loss": 0.64632773, - "num_input_tokens_seen": 326371810, - "step": 15128, - "time_per_iteration": 4.432798385620117 - }, - { - "auxiliary_loss_clip": 0.01075251, - "auxiliary_loss_mlp": 0.01024187, - "balance_loss_clip": 1.03530467, - "balance_loss_mlp": 1.01223636, - "epoch": 0.9096046896137081, - "flos": 19792848965760.0, - "grad_norm": 1.7664774928724292, - "language_loss": 0.76836801, - "learning_rate": 8.505209531291013e-08, - "loss": 0.78936237, - "num_input_tokens_seen": 326391380, - "step": 15129, - "time_per_iteration": 4.206790447235107 - }, - { - "auxiliary_loss_clip": 0.01096669, - "auxiliary_loss_mlp": 0.01027809, - "balance_loss_clip": 1.03541172, - "balance_loss_mlp": 1.01559019, - "epoch": 0.909664812866376, - "flos": 22638087129600.0, - "grad_norm": 1.9024505356481058, - "language_loss": 0.83078182, - "learning_rate": 8.49397642446552e-08, - "loss": 0.85202664, - "num_input_tokens_seen": 326408800, - "step": 15130, - "time_per_iteration": 2.6001152992248535 - }, - { - "auxiliary_loss_clip": 0.0108696, - "auxiliary_loss_mlp": 0.01032978, - "balance_loss_clip": 1.0359422, - "balance_loss_mlp": 1.01988339, - "epoch": 0.909724936119044, - "flos": 39852153020160.0, - "grad_norm": 1.6192884326083825, - "language_loss": 0.75177467, - "learning_rate": 8.482750579567644e-08, - "loss": 0.77297407, - "num_input_tokens_seen": 326431565, - "step": 15131, - "time_per_iteration": 2.848465919494629 - }, - { - "auxiliary_loss_clip": 0.01083451, - "auxiliary_loss_mlp": 0.01035354, - "balance_loss_clip": 1.03611147, - "balance_loss_mlp": 1.02193737, - "epoch": 0.9097850593717121, - "flos": 35071616954880.0, - "grad_norm": 1.8781997333884533, - "language_loss": 0.599832, - "learning_rate": 8.471531997023085e-08, - "loss": 0.62102008, - "num_input_tokens_seen": 326451715, - "step": 15132, - "time_per_iteration": 2.7317306995391846 - }, - { - "auxiliary_loss_clip": 0.01068526, - "auxiliary_loss_mlp": 0.01031798, - "balance_loss_clip": 1.0371139, - "balance_loss_mlp": 1.02007413, - "epoch": 0.90984518262438, - "flos": 23367468700800.0, - "grad_norm": 1.7969799110161846, - "language_loss": 0.82646108, - "learning_rate": 8.460320677257193e-08, - "loss": 0.84746432, - "num_input_tokens_seen": 326470855, - "step": 15133, - "time_per_iteration": 2.666724920272827 - }, - { - "auxiliary_loss_clip": 0.01084851, - "auxiliary_loss_mlp": 0.01033684, - "balance_loss_clip": 1.03276467, - "balance_loss_mlp": 1.0209645, - "epoch": 0.909905305877048, - "flos": 27523302405120.0, - "grad_norm": 1.9696626904627623, - "language_loss": 0.74180704, - "learning_rate": 8.449116620695118e-08, - "loss": 0.76299238, - "num_input_tokens_seen": 326490480, - "step": 15134, - "time_per_iteration": 2.7024521827697754 - }, - { - "auxiliary_loss_clip": 0.01081442, - "auxiliary_loss_mlp": 0.01032002, - "balance_loss_clip": 1.03796458, - "balance_loss_mlp": 1.01934206, - "epoch": 0.9099654291297159, - "flos": 24347865490560.0, - "grad_norm": 1.5144614886506496, - "language_loss": 0.72592616, - "learning_rate": 8.437919827761786e-08, - "loss": 0.74706054, - "num_input_tokens_seen": 326509445, - "step": 15135, - "time_per_iteration": 2.7127246856689453 - }, - { - "auxiliary_loss_clip": 0.01096766, - "auxiliary_loss_mlp": 0.01030937, - "balance_loss_clip": 1.03764153, - "balance_loss_mlp": 1.01891482, - "epoch": 0.9100255523823839, - "flos": 21215234609280.0, - "grad_norm": 1.683349330463744, - "language_loss": 0.70137173, - "learning_rate": 8.426730298881702e-08, - "loss": 0.72264874, - "num_input_tokens_seen": 326528380, - "step": 15136, - "time_per_iteration": 2.6193113327026367 - }, - { - "auxiliary_loss_clip": 0.00990412, - "auxiliary_loss_mlp": 0.01005783, - "balance_loss_clip": 1.00657475, - "balance_loss_mlp": 1.00484753, - "epoch": 0.9100856756350518, - "flos": 46052276446080.0, - "grad_norm": 0.825688175716241, - "language_loss": 0.59235996, - "learning_rate": 8.415548034479214e-08, - "loss": 0.61232191, - "num_input_tokens_seen": 326576940, - "step": 15137, - "time_per_iteration": 3.083552837371826 - }, - { - "auxiliary_loss_clip": 0.01098465, - "auxiliary_loss_mlp": 0.01035683, - "balance_loss_clip": 1.03574395, - "balance_loss_mlp": 1.02372026, - "epoch": 0.9101457988877198, - "flos": 20229917656320.0, - "grad_norm": 2.33052803483979, - "language_loss": 0.82487237, - "learning_rate": 8.40437303497834e-08, - "loss": 0.84621382, - "num_input_tokens_seen": 326596100, - "step": 15138, - "time_per_iteration": 2.674602508544922 - }, - { - "auxiliary_loss_clip": 0.01094423, - "auxiliary_loss_mlp": 0.01026368, - "balance_loss_clip": 1.037696, - "balance_loss_mlp": 1.01526928, - "epoch": 0.9102059221403878, - "flos": 26615157822720.0, - "grad_norm": 1.5741555664538536, - "language_loss": 0.81272125, - "learning_rate": 8.39320530080283e-08, - "loss": 0.83392918, - "num_input_tokens_seen": 326615700, - "step": 15139, - "time_per_iteration": 2.694201946258545 - }, - { - "auxiliary_loss_clip": 0.01076496, - "auxiliary_loss_mlp": 0.01033648, - "balance_loss_clip": 1.03743947, - "balance_loss_mlp": 1.02160764, - "epoch": 0.9102660453930558, - "flos": 21908561904000.0, - "grad_norm": 2.050091798744291, - "language_loss": 0.77814442, - "learning_rate": 8.382044832376167e-08, - "loss": 0.79924583, - "num_input_tokens_seen": 326635905, - "step": 15140, - "time_per_iteration": 2.722778558731079 - }, - { - "auxiliary_loss_clip": 0.01106393, - "auxiliary_loss_mlp": 0.01031373, - "balance_loss_clip": 1.0352447, - "balance_loss_mlp": 1.01943445, - "epoch": 0.9103261686457237, - "flos": 36176660916480.0, - "grad_norm": 1.7205881923201032, - "language_loss": 0.66666603, - "learning_rate": 8.370891630121569e-08, - "loss": 0.68804365, - "num_input_tokens_seen": 326661855, - "step": 15141, - "time_per_iteration": 2.7130444049835205 - }, - { - "auxiliary_loss_clip": 0.01095941, - "auxiliary_loss_mlp": 0.01037339, - "balance_loss_clip": 1.03542638, - "balance_loss_mlp": 1.02527499, - "epoch": 0.9103862918983917, - "flos": 23878549365120.0, - "grad_norm": 1.8944850892633267, - "language_loss": 0.75325441, - "learning_rate": 8.359745694462005e-08, - "loss": 0.77458721, - "num_input_tokens_seen": 326679320, - "step": 15142, - "time_per_iteration": 2.6429429054260254 - }, - { - "auxiliary_loss_clip": 0.01069268, - "auxiliary_loss_mlp": 0.01042478, - "balance_loss_clip": 1.03122544, - "balance_loss_mlp": 1.02982378, - "epoch": 0.9104464151510596, - "flos": 14939521989120.0, - "grad_norm": 1.6543746947107703, - "language_loss": 0.64361405, - "learning_rate": 8.348607025820076e-08, - "loss": 0.6647315, - "num_input_tokens_seen": 326698110, - "step": 15143, - "time_per_iteration": 2.669706344604492 - }, - { - "auxiliary_loss_clip": 0.01110746, - "auxiliary_loss_mlp": 0.0103534, - "balance_loss_clip": 1.03664672, - "balance_loss_mlp": 1.02197671, - "epoch": 0.9105065384037276, - "flos": 33655803500160.0, - "grad_norm": 1.826138803106712, - "language_loss": 0.61111665, - "learning_rate": 8.337475624618152e-08, - "loss": 0.63257754, - "num_input_tokens_seen": 326718370, - "step": 15144, - "time_per_iteration": 2.659849166870117 - }, - { - "auxiliary_loss_clip": 0.01065641, - "auxiliary_loss_mlp": 0.01027587, - "balance_loss_clip": 1.0301441, - "balance_loss_mlp": 1.01508248, - "epoch": 0.9105666616563957, - "flos": 24316695463680.0, - "grad_norm": 1.5990370313133018, - "language_loss": 0.70864612, - "learning_rate": 8.326351491278382e-08, - "loss": 0.72957838, - "num_input_tokens_seen": 326738445, - "step": 15145, - "time_per_iteration": 2.685203790664673 - }, - { - "auxiliary_loss_clip": 0.01047743, - "auxiliary_loss_mlp": 0.01032721, - "balance_loss_clip": 1.03243327, - "balance_loss_mlp": 1.02036476, - "epoch": 0.9106267849090636, - "flos": 29971692132480.0, - "grad_norm": 1.5060644265455205, - "language_loss": 0.70642048, - "learning_rate": 8.315234626222545e-08, - "loss": 0.72722512, - "num_input_tokens_seen": 326758855, - "step": 15146, - "time_per_iteration": 2.7676496505737305 - }, - { - "auxiliary_loss_clip": 0.01085776, - "auxiliary_loss_mlp": 0.01032662, - "balance_loss_clip": 1.03418636, - "balance_loss_mlp": 1.02068782, - "epoch": 0.9106869081617316, - "flos": 25337743470720.0, - "grad_norm": 1.8260905410066164, - "language_loss": 0.72899806, - "learning_rate": 8.304125029872233e-08, - "loss": 0.75018245, - "num_input_tokens_seen": 326777140, - "step": 15147, - "time_per_iteration": 2.6421234607696533 - }, - { - "auxiliary_loss_clip": 0.01081187, - "auxiliary_loss_mlp": 0.01031004, - "balance_loss_clip": 1.0361135, - "balance_loss_mlp": 1.01835012, - "epoch": 0.9107470314143995, - "flos": 18187031543040.0, - "grad_norm": 1.914291203586608, - "language_loss": 0.80780458, - "learning_rate": 8.293022702648711e-08, - "loss": 0.82892644, - "num_input_tokens_seen": 326794070, - "step": 15148, - "time_per_iteration": 2.6653599739074707 - }, - { - "auxiliary_loss_clip": 0.01076044, - "auxiliary_loss_mlp": 0.01039047, - "balance_loss_clip": 1.03479314, - "balance_loss_mlp": 1.02636874, - "epoch": 0.9108071546670675, - "flos": 23550828652800.0, - "grad_norm": 2.087055328388918, - "language_loss": 0.67585528, - "learning_rate": 8.281927644972996e-08, - "loss": 0.69700611, - "num_input_tokens_seen": 326814695, - "step": 15149, - "time_per_iteration": 2.758857011795044 - }, - { - "auxiliary_loss_clip": 0.01108552, - "auxiliary_loss_mlp": 0.01030478, - "balance_loss_clip": 1.03687429, - "balance_loss_mlp": 1.01744866, - "epoch": 0.9108672779197354, - "flos": 25630307746560.0, - "grad_norm": 1.9181044295268432, - "language_loss": 0.63203096, - "learning_rate": 8.270839857265776e-08, - "loss": 0.65342128, - "num_input_tokens_seen": 326835295, - "step": 15150, - "time_per_iteration": 2.650240898132324 - }, - { - "auxiliary_loss_clip": 0.01066309, - "auxiliary_loss_mlp": 0.01031448, - "balance_loss_clip": 1.03402328, - "balance_loss_mlp": 1.01881194, - "epoch": 0.9109274011724035, - "flos": 22339094319360.0, - "grad_norm": 2.2733833539943333, - "language_loss": 0.72643161, - "learning_rate": 8.259759339947514e-08, - "loss": 0.74740922, - "num_input_tokens_seen": 326853350, - "step": 15151, - "time_per_iteration": 2.706934690475464 - }, - { - "auxiliary_loss_clip": 0.01095436, - "auxiliary_loss_mlp": 0.01029482, - "balance_loss_clip": 1.03496432, - "balance_loss_mlp": 1.01727509, - "epoch": 0.9109875244250714, - "flos": 26688200129280.0, - "grad_norm": 1.648582433866266, - "language_loss": 0.64558387, - "learning_rate": 8.248686093438429e-08, - "loss": 0.66683304, - "num_input_tokens_seen": 326873425, - "step": 15152, - "time_per_iteration": 2.699647903442383 - }, - { - "auxiliary_loss_clip": 0.0108822, - "auxiliary_loss_mlp": 0.00770055, - "balance_loss_clip": 1.03658628, - "balance_loss_mlp": 1.00032091, - "epoch": 0.9110476476777394, - "flos": 22930112701440.0, - "grad_norm": 1.8488661615298092, - "language_loss": 0.73683035, - "learning_rate": 8.23762011815834e-08, - "loss": 0.75541312, - "num_input_tokens_seen": 326893455, - "step": 15153, - "time_per_iteration": 2.6998884677886963 - }, - { - "auxiliary_loss_clip": 0.01067073, - "auxiliary_loss_mlp": 0.01050891, - "balance_loss_clip": 1.03213048, - "balance_loss_mlp": 1.03591788, - "epoch": 0.9111077709304073, - "flos": 13472857854720.0, - "grad_norm": 2.3413318457237775, - "language_loss": 0.72122753, - "learning_rate": 8.226561414526956e-08, - "loss": 0.74240714, - "num_input_tokens_seen": 326910210, - "step": 15154, - "time_per_iteration": 2.683474540710449 - }, - { - "auxiliary_loss_clip": 0.01088157, - "auxiliary_loss_mlp": 0.01032363, - "balance_loss_clip": 1.03857923, - "balance_loss_mlp": 1.02037024, - "epoch": 0.9111678941830753, - "flos": 20850561780480.0, - "grad_norm": 1.7345027920232028, - "language_loss": 0.82108957, - "learning_rate": 8.215509982963564e-08, - "loss": 0.84229481, - "num_input_tokens_seen": 326929350, - "step": 15155, - "time_per_iteration": 2.7106335163116455 - }, - { - "auxiliary_loss_clip": 0.01096529, - "auxiliary_loss_mlp": 0.01031415, - "balance_loss_clip": 1.03773642, - "balance_loss_mlp": 1.01885629, - "epoch": 0.9112280174357432, - "flos": 19682244011520.0, - "grad_norm": 1.8052926393059447, - "language_loss": 0.5958488, - "learning_rate": 8.204465823887252e-08, - "loss": 0.61712825, - "num_input_tokens_seen": 326949060, - "step": 15156, - "time_per_iteration": 2.6679844856262207 - }, - { - "auxiliary_loss_clip": 0.01099444, - "auxiliary_loss_mlp": 0.01028039, - "balance_loss_clip": 1.03477848, - "balance_loss_mlp": 1.01498008, - "epoch": 0.9112881406884112, - "flos": 25447163276160.0, - "grad_norm": 2.321869813201265, - "language_loss": 0.74290884, - "learning_rate": 8.193428937716796e-08, - "loss": 0.76418364, - "num_input_tokens_seen": 326968950, - "step": 15157, - "time_per_iteration": 2.6687350273132324 - }, - { - "auxiliary_loss_clip": 0.01063031, - "auxiliary_loss_mlp": 0.01033754, - "balance_loss_clip": 1.03153825, - "balance_loss_mlp": 1.02228022, - "epoch": 0.9113482639410793, - "flos": 33066975847680.0, - "grad_norm": 1.6132914581945528, - "language_loss": 0.59553444, - "learning_rate": 8.182399324870747e-08, - "loss": 0.61650229, - "num_input_tokens_seen": 326989455, - "step": 15158, - "time_per_iteration": 2.8011231422424316 - }, - { - "auxiliary_loss_clip": 0.01050049, - "auxiliary_loss_mlp": 0.01031146, - "balance_loss_clip": 1.03574824, - "balance_loss_mlp": 1.01942158, - "epoch": 0.9114083871937472, - "flos": 21835591424640.0, - "grad_norm": 2.2386737595671047, - "language_loss": 0.68004364, - "learning_rate": 8.171376985767375e-08, - "loss": 0.70085549, - "num_input_tokens_seen": 327009640, - "step": 15159, - "time_per_iteration": 2.772341251373291 - }, - { - "auxiliary_loss_clip": 0.01087373, - "auxiliary_loss_mlp": 0.01029488, - "balance_loss_clip": 1.03617239, - "balance_loss_mlp": 1.0176506, - "epoch": 0.9114685104464152, - "flos": 27088999061760.0, - "grad_norm": 2.7938055787234015, - "language_loss": 0.78473425, - "learning_rate": 8.160361920824588e-08, - "loss": 0.8059029, - "num_input_tokens_seen": 327027690, - "step": 15160, - "time_per_iteration": 2.7388458251953125 - }, - { - "auxiliary_loss_clip": 0.01111531, - "auxiliary_loss_mlp": 0.01029053, - "balance_loss_clip": 1.03913951, - "balance_loss_mlp": 1.01570201, - "epoch": 0.9115286336990831, - "flos": 17967042696960.0, - "grad_norm": 1.6224624723660812, - "language_loss": 0.69028407, - "learning_rate": 8.149354130460073e-08, - "loss": 0.71168995, - "num_input_tokens_seen": 327045915, - "step": 15161, - "time_per_iteration": 2.6148221492767334 - }, - { - "auxiliary_loss_clip": 0.01060884, - "auxiliary_loss_mlp": 0.01040252, - "balance_loss_clip": 1.03252292, - "balance_loss_mlp": 1.02619767, - "epoch": 0.9115887569517511, - "flos": 22929861306240.0, - "grad_norm": 1.7334002472530148, - "language_loss": 0.7660948, - "learning_rate": 8.138353615091321e-08, - "loss": 0.78710622, - "num_input_tokens_seen": 327066355, - "step": 15162, - "time_per_iteration": 2.938532590866089 - }, - { - "auxiliary_loss_clip": 0.01082027, - "auxiliary_loss_mlp": 0.01032242, - "balance_loss_clip": 1.03714919, - "balance_loss_mlp": 1.01954055, - "epoch": 0.911648880204419, - "flos": 23988436047360.0, - "grad_norm": 1.8353414047586432, - "language_loss": 0.66910523, - "learning_rate": 8.127360375135395e-08, - "loss": 0.69024795, - "num_input_tokens_seen": 327086735, - "step": 15163, - "time_per_iteration": 2.6603245735168457 - }, - { - "auxiliary_loss_clip": 0.01066197, - "auxiliary_loss_mlp": 0.01033842, - "balance_loss_clip": 1.03512335, - "balance_loss_mlp": 1.0209856, - "epoch": 0.911709003457087, - "flos": 17055306754560.0, - "grad_norm": 7.686069864980859, - "language_loss": 0.70642608, - "learning_rate": 8.116374411009186e-08, - "loss": 0.72742647, - "num_input_tokens_seen": 327104035, - "step": 15164, - "time_per_iteration": 2.7450454235076904 - }, - { - "auxiliary_loss_clip": 0.01108615, - "auxiliary_loss_mlp": 0.01031494, - "balance_loss_clip": 1.03994727, - "balance_loss_mlp": 1.01950121, - "epoch": 0.911769126709755, - "flos": 21653344794240.0, - "grad_norm": 1.5696959903057297, - "language_loss": 0.76052606, - "learning_rate": 8.105395723129315e-08, - "loss": 0.78192717, - "num_input_tokens_seen": 327124370, - "step": 15165, - "time_per_iteration": 2.588705062866211 - }, - { - "auxiliary_loss_clip": 0.01093363, - "auxiliary_loss_mlp": 0.01033622, - "balance_loss_clip": 1.03510165, - "balance_loss_mlp": 1.02148008, - "epoch": 0.911829249962423, - "flos": 24790321221120.0, - "grad_norm": 2.0749050237393423, - "language_loss": 0.72525322, - "learning_rate": 8.094424311912074e-08, - "loss": 0.74652308, - "num_input_tokens_seen": 327140915, - "step": 15166, - "time_per_iteration": 4.245245933532715 - }, - { - "auxiliary_loss_clip": 0.01060198, - "auxiliary_loss_mlp": 0.01038217, - "balance_loss_clip": 1.03464365, - "balance_loss_mlp": 1.02491355, - "epoch": 0.9118893732150909, - "flos": 20959406968320.0, - "grad_norm": 1.8141703562808917, - "language_loss": 0.73241115, - "learning_rate": 8.083460177773482e-08, - "loss": 0.75339532, - "num_input_tokens_seen": 327158940, - "step": 15167, - "time_per_iteration": 5.897623062133789 - }, - { - "auxiliary_loss_clip": 0.0101816, - "auxiliary_loss_mlp": 0.00998888, - "balance_loss_clip": 1.01390624, - "balance_loss_mlp": 0.99787515, - "epoch": 0.9119494964677589, - "flos": 67917385872000.0, - "grad_norm": 0.7753194150086553, - "language_loss": 0.65546739, - "learning_rate": 8.072503321129298e-08, - "loss": 0.67563796, - "num_input_tokens_seen": 327217450, - "step": 15168, - "time_per_iteration": 3.210770845413208 - }, - { - "auxiliary_loss_clip": 0.01078881, - "auxiliary_loss_mlp": 0.01031567, - "balance_loss_clip": 1.03501606, - "balance_loss_mlp": 1.01959848, - "epoch": 0.9120096197204268, - "flos": 18551524803840.0, - "grad_norm": 1.9364628157336585, - "language_loss": 0.78129464, - "learning_rate": 8.061553742395033e-08, - "loss": 0.80239916, - "num_input_tokens_seen": 327233905, - "step": 15169, - "time_per_iteration": 4.273360729217529 - }, - { - "auxiliary_loss_clip": 0.01097706, - "auxiliary_loss_mlp": 0.0103039, - "balance_loss_clip": 1.03744388, - "balance_loss_mlp": 1.01839125, - "epoch": 0.9120697429730948, - "flos": 19025725178880.0, - "grad_norm": 1.8060821353354455, - "language_loss": 0.81748688, - "learning_rate": 8.05061144198591e-08, - "loss": 0.83876789, - "num_input_tokens_seen": 327252430, - "step": 15170, - "time_per_iteration": 2.6498122215270996 - }, - { - "auxiliary_loss_clip": 0.01100439, - "auxiliary_loss_mlp": 0.01030541, - "balance_loss_clip": 1.03837538, - "balance_loss_mlp": 1.01746333, - "epoch": 0.9121298662257629, - "flos": 17163685065600.0, - "grad_norm": 2.097374036278885, - "language_loss": 0.76902175, - "learning_rate": 8.039676420316799e-08, - "loss": 0.79033154, - "num_input_tokens_seen": 327269215, - "step": 15171, - "time_per_iteration": 2.6777992248535156 - }, - { - "auxiliary_loss_clip": 0.01025503, - "auxiliary_loss_mlp": 0.01038378, - "balance_loss_clip": 1.03109252, - "balance_loss_mlp": 1.02510428, - "epoch": 0.9121899894784308, - "flos": 19682710888320.0, - "grad_norm": 1.2924384179927062, - "language_loss": 0.66694897, - "learning_rate": 8.02874867780241e-08, - "loss": 0.68758774, - "num_input_tokens_seen": 327290320, - "step": 15172, - "time_per_iteration": 2.851702928543091 - }, - { - "auxiliary_loss_clip": 0.01079756, - "auxiliary_loss_mlp": 0.01033459, - "balance_loss_clip": 1.03634048, - "balance_loss_mlp": 1.02087665, - "epoch": 0.9122501127310988, - "flos": 22235743912320.0, - "grad_norm": 1.6696295487638473, - "language_loss": 0.74975204, - "learning_rate": 8.017828214857103e-08, - "loss": 0.77088416, - "num_input_tokens_seen": 327310150, - "step": 15173, - "time_per_iteration": 2.6567437648773193 - }, - { - "auxiliary_loss_clip": 0.01093131, - "auxiliary_loss_mlp": 0.01034759, - "balance_loss_clip": 1.03830385, - "balance_loss_mlp": 1.02032316, - "epoch": 0.9123102359837667, - "flos": 15957122290560.0, - "grad_norm": 5.127558879454518, - "language_loss": 0.6578263, - "learning_rate": 8.00691503189499e-08, - "loss": 0.67910528, - "num_input_tokens_seen": 327326660, - "step": 15174, - "time_per_iteration": 2.6690120697021484 - }, - { - "auxiliary_loss_clip": 0.01096653, - "auxiliary_loss_mlp": 0.0103167, - "balance_loss_clip": 1.03507042, - "balance_loss_mlp": 1.01747251, - "epoch": 0.9123703592364347, - "flos": 25155784149120.0, - "grad_norm": 1.9591497521426535, - "language_loss": 0.74826527, - "learning_rate": 7.996009129329894e-08, - "loss": 0.76954854, - "num_input_tokens_seen": 327346700, - "step": 15175, - "time_per_iteration": 2.6358284950256348 - }, - { - "auxiliary_loss_clip": 0.01017603, - "auxiliary_loss_mlp": 0.01002357, - "balance_loss_clip": 1.00503564, - "balance_loss_mlp": 1.00146246, - "epoch": 0.9124304824891026, - "flos": 60801650812800.0, - "grad_norm": 0.9602139847905503, - "language_loss": 0.58486784, - "learning_rate": 7.985110507575421e-08, - "loss": 0.60506743, - "num_input_tokens_seen": 327403050, - "step": 15176, - "time_per_iteration": 3.1978743076324463 - }, - { - "auxiliary_loss_clip": 0.01083812, - "auxiliary_loss_mlp": 0.01036933, - "balance_loss_clip": 1.03273082, - "balance_loss_mlp": 1.02405846, - "epoch": 0.9124906057417707, - "flos": 18150941352960.0, - "grad_norm": 1.6481113085508423, - "language_loss": 0.65639609, - "learning_rate": 7.97421916704475e-08, - "loss": 0.67760354, - "num_input_tokens_seen": 327422225, - "step": 15177, - "time_per_iteration": 2.6916801929473877 - }, - { - "auxiliary_loss_clip": 0.0107591, - "auxiliary_loss_mlp": 0.01028967, - "balance_loss_clip": 1.03282464, - "balance_loss_mlp": 1.01652193, - "epoch": 0.9125507289944386, - "flos": 11686769049600.0, - "grad_norm": 2.237261929253729, - "language_loss": 0.81215572, - "learning_rate": 7.963335108150926e-08, - "loss": 0.83320451, - "num_input_tokens_seen": 327437025, - "step": 15178, - "time_per_iteration": 2.6279830932617188 - }, - { - "auxiliary_loss_clip": 0.01049012, - "auxiliary_loss_mlp": 0.01039158, - "balance_loss_clip": 1.03083158, - "balance_loss_mlp": 1.02516901, - "epoch": 0.9126108522471066, - "flos": 17748813617280.0, - "grad_norm": 2.000734331425356, - "language_loss": 0.79079652, - "learning_rate": 7.952458331306711e-08, - "loss": 0.81167829, - "num_input_tokens_seen": 327453915, - "step": 15179, - "time_per_iteration": 2.675297737121582 - }, - { - "auxiliary_loss_clip": 0.01084629, - "auxiliary_loss_mlp": 0.01031574, - "balance_loss_clip": 1.0357244, - "balance_loss_mlp": 1.02000451, - "epoch": 0.9126709754997745, - "flos": 27635738952960.0, - "grad_norm": 1.5039394152550116, - "language_loss": 0.67973173, - "learning_rate": 7.941588836924507e-08, - "loss": 0.70089382, - "num_input_tokens_seen": 327474415, - "step": 15180, - "time_per_iteration": 2.697028875350952 - }, - { - "auxiliary_loss_clip": 0.0109496, - "auxiliary_loss_mlp": 0.01027912, - "balance_loss_clip": 1.03393316, - "balance_loss_mlp": 1.01655757, - "epoch": 0.9127310987524425, - "flos": 15924982596480.0, - "grad_norm": 1.6922587349839364, - "language_loss": 0.75127202, - "learning_rate": 7.930726625416495e-08, - "loss": 0.77250075, - "num_input_tokens_seen": 327492750, - "step": 15181, - "time_per_iteration": 2.6039087772369385 - }, - { - "auxiliary_loss_clip": 0.01113895, - "auxiliary_loss_mlp": 0.01031043, - "balance_loss_clip": 1.03893065, - "balance_loss_mlp": 1.01871705, - "epoch": 0.9127912220051104, - "flos": 21536885923200.0, - "grad_norm": 4.263529248122138, - "language_loss": 0.74789053, - "learning_rate": 7.919871697194614e-08, - "loss": 0.76933992, - "num_input_tokens_seen": 327509470, - "step": 15182, - "time_per_iteration": 2.5808985233306885 - }, - { - "auxiliary_loss_clip": 0.01109967, - "auxiliary_loss_mlp": 0.01030307, - "balance_loss_clip": 1.036412, - "balance_loss_mlp": 1.01767075, - "epoch": 0.9128513452577784, - "flos": 24063561342720.0, - "grad_norm": 1.4783992665801426, - "language_loss": 0.7637254, - "learning_rate": 7.909024052670421e-08, - "loss": 0.78512818, - "num_input_tokens_seen": 327530520, - "step": 15183, - "time_per_iteration": 2.690436601638794 - }, - { - "auxiliary_loss_clip": 0.0109821, - "auxiliary_loss_mlp": 0.01031178, - "balance_loss_clip": 1.03847337, - "balance_loss_mlp": 1.01838112, - "epoch": 0.9129114685104465, - "flos": 16216469464320.0, - "grad_norm": 2.432279077679038, - "language_loss": 0.76472139, - "learning_rate": 7.898183692255256e-08, - "loss": 0.78601527, - "num_input_tokens_seen": 327546960, - "step": 15184, - "time_per_iteration": 2.643298864364624 - }, - { - "auxiliary_loss_clip": 0.01093284, - "auxiliary_loss_mlp": 0.01035355, - "balance_loss_clip": 1.03755832, - "balance_loss_mlp": 1.02360058, - "epoch": 0.9129715917631144, - "flos": 19384364522880.0, - "grad_norm": 1.6196695380751174, - "language_loss": 0.74525392, - "learning_rate": 7.887350616360233e-08, - "loss": 0.76654035, - "num_input_tokens_seen": 327564830, - "step": 15185, - "time_per_iteration": 2.5846035480499268 - }, - { - "auxiliary_loss_clip": 0.0108538, - "auxiliary_loss_mlp": 0.01031798, - "balance_loss_clip": 1.03683412, - "balance_loss_mlp": 1.01925135, - "epoch": 0.9130317150157824, - "flos": 20590460421120.0, - "grad_norm": 2.0406191594638257, - "language_loss": 0.68331826, - "learning_rate": 7.876524825396158e-08, - "loss": 0.70449007, - "num_input_tokens_seen": 327583675, - "step": 15186, - "time_per_iteration": 2.6857335567474365 - }, - { - "auxiliary_loss_clip": 0.01089556, - "auxiliary_loss_mlp": 0.01041285, - "balance_loss_clip": 1.03548872, - "balance_loss_mlp": 1.02558517, - "epoch": 0.9130918382684503, - "flos": 20189230525440.0, - "grad_norm": 2.094200267173926, - "language_loss": 0.77826124, - "learning_rate": 7.865706319773502e-08, - "loss": 0.79956973, - "num_input_tokens_seen": 327602280, - "step": 15187, - "time_per_iteration": 2.707458972930908 - }, - { - "auxiliary_loss_clip": 0.01108019, - "auxiliary_loss_mlp": 0.007702, - "balance_loss_clip": 1.03599858, - "balance_loss_mlp": 1.00022209, - "epoch": 0.9131519615211183, - "flos": 25556870390400.0, - "grad_norm": 6.79519157361436, - "language_loss": 0.65794706, - "learning_rate": 7.854895099902515e-08, - "loss": 0.6767292, - "num_input_tokens_seen": 327623515, - "step": 15188, - "time_per_iteration": 2.6106925010681152 - }, - { - "auxiliary_loss_clip": 0.0103354, - "auxiliary_loss_mlp": 0.01035627, - "balance_loss_clip": 1.02865291, - "balance_loss_mlp": 1.02201962, - "epoch": 0.9132120847737862, - "flos": 17931563038080.0, - "grad_norm": 1.7656682346209025, - "language_loss": 0.76258671, - "learning_rate": 7.844091166193157e-08, - "loss": 0.78327841, - "num_input_tokens_seen": 327642875, - "step": 15189, - "time_per_iteration": 2.8081729412078857 - }, - { - "auxiliary_loss_clip": 0.0109744, - "auxiliary_loss_mlp": 0.01029243, - "balance_loss_clip": 1.03559053, - "balance_loss_mlp": 1.0180254, - "epoch": 0.9132722080264543, - "flos": 20047635112320.0, - "grad_norm": 1.7520638649774822, - "language_loss": 0.75371557, - "learning_rate": 7.8332945190551e-08, - "loss": 0.77498239, - "num_input_tokens_seen": 327662450, - "step": 15190, - "time_per_iteration": 2.6704981327056885 - }, - { - "auxiliary_loss_clip": 0.01019225, - "auxiliary_loss_mlp": 0.01003714, - "balance_loss_clip": 1.00641704, - "balance_loss_mlp": 1.00264728, - "epoch": 0.9133323312791222, - "flos": 70439967141120.0, - "grad_norm": 0.7014520418780014, - "language_loss": 0.57308424, - "learning_rate": 7.822505158897797e-08, - "loss": 0.59331357, - "num_input_tokens_seen": 327723845, - "step": 15191, - "time_per_iteration": 3.21588134765625 - }, - { - "auxiliary_loss_clip": 0.01113051, - "auxiliary_loss_mlp": 0.01033224, - "balance_loss_clip": 1.03901196, - "balance_loss_mlp": 1.02014041, - "epoch": 0.9133924545317902, - "flos": 25483792170240.0, - "grad_norm": 1.7022640616397489, - "language_loss": 0.74351078, - "learning_rate": 7.81172308613034e-08, - "loss": 0.76497352, - "num_input_tokens_seen": 327742590, - "step": 15192, - "time_per_iteration": 2.615525245666504 - }, - { - "auxiliary_loss_clip": 0.01096745, - "auxiliary_loss_mlp": 0.01028743, - "balance_loss_clip": 1.03728342, - "balance_loss_mlp": 1.01645255, - "epoch": 0.9134525777844581, - "flos": 39930690107520.0, - "grad_norm": 1.536018225691407, - "language_loss": 0.69412756, - "learning_rate": 7.800948301161647e-08, - "loss": 0.71538246, - "num_input_tokens_seen": 327764350, - "step": 15193, - "time_per_iteration": 2.774912118911743 - }, - { - "auxiliary_loss_clip": 0.01095342, - "auxiliary_loss_mlp": 0.0103875, - "balance_loss_clip": 1.03767395, - "balance_loss_mlp": 1.02737117, - "epoch": 0.9135127010371261, - "flos": 20886723797760.0, - "grad_norm": 1.712567345292954, - "language_loss": 0.73434842, - "learning_rate": 7.790180804400215e-08, - "loss": 0.75568932, - "num_input_tokens_seen": 327783120, - "step": 15194, - "time_per_iteration": 2.581974983215332 - }, - { - "auxiliary_loss_clip": 0.01063051, - "auxiliary_loss_mlp": 0.01041182, - "balance_loss_clip": 1.03309762, - "balance_loss_mlp": 1.02517855, - "epoch": 0.913572824289794, - "flos": 20813250528000.0, - "grad_norm": 1.8550488642948777, - "language_loss": 0.61682135, - "learning_rate": 7.779420596254383e-08, - "loss": 0.63786364, - "num_input_tokens_seen": 327801960, - "step": 15195, - "time_per_iteration": 2.881197929382324 - }, - { - "auxiliary_loss_clip": 0.01098691, - "auxiliary_loss_mlp": 0.01034267, - "balance_loss_clip": 1.03617358, - "balance_loss_mlp": 1.02182126, - "epoch": 0.913632947542462, - "flos": 25703278225920.0, - "grad_norm": 1.4758121064048373, - "language_loss": 0.71160495, - "learning_rate": 7.768667677132201e-08, - "loss": 0.73293453, - "num_input_tokens_seen": 327823795, - "step": 15196, - "time_per_iteration": 2.6203744411468506 - }, - { - "auxiliary_loss_clip": 0.01084959, - "auxiliary_loss_mlp": 0.01035793, - "balance_loss_clip": 1.034657, - "balance_loss_mlp": 1.02372885, - "epoch": 0.9136930707951301, - "flos": 26286216048000.0, - "grad_norm": 1.471790705908436, - "language_loss": 0.71344984, - "learning_rate": 7.757922047441411e-08, - "loss": 0.73465735, - "num_input_tokens_seen": 327845175, - "step": 15197, - "time_per_iteration": 2.6849207878112793 - }, - { - "auxiliary_loss_clip": 0.01088436, - "auxiliary_loss_mlp": 0.01027135, - "balance_loss_clip": 1.03387213, - "balance_loss_mlp": 1.01404572, - "epoch": 0.913753194047798, - "flos": 22091885942400.0, - "grad_norm": 1.7806883440096042, - "language_loss": 0.7787807, - "learning_rate": 7.747183707589489e-08, - "loss": 0.79993641, - "num_input_tokens_seen": 327863150, - "step": 15198, - "time_per_iteration": 2.629854202270508 - }, - { - "auxiliary_loss_clip": 0.01089748, - "auxiliary_loss_mlp": 0.01030529, - "balance_loss_clip": 1.03545046, - "balance_loss_mlp": 1.01816726, - "epoch": 0.913813317300466, - "flos": 23587206151680.0, - "grad_norm": 1.509412256528269, - "language_loss": 0.67781103, - "learning_rate": 7.736452657983616e-08, - "loss": 0.69901383, - "num_input_tokens_seen": 327883445, - "step": 15199, - "time_per_iteration": 2.6181437969207764 - }, - { - "auxiliary_loss_clip": 0.01097631, - "auxiliary_loss_mlp": 0.00769993, - "balance_loss_clip": 1.03525543, - "balance_loss_mlp": 1.00025439, - "epoch": 0.9138734405531339, - "flos": 28876452583680.0, - "grad_norm": 1.5467213284534869, - "language_loss": 0.67587829, - "learning_rate": 7.725728899030714e-08, - "loss": 0.69455445, - "num_input_tokens_seen": 327905745, - "step": 15200, - "time_per_iteration": 2.768298387527466 - }, - { - "auxiliary_loss_clip": 0.0109491, - "auxiliary_loss_mlp": 0.01031874, - "balance_loss_clip": 1.03708506, - "balance_loss_mlp": 1.020787, - "epoch": 0.9139335638058019, - "flos": 22821087945600.0, - "grad_norm": 1.5631891180078048, - "language_loss": 0.71305549, - "learning_rate": 7.715012431137435e-08, - "loss": 0.73432332, - "num_input_tokens_seen": 327925435, - "step": 15201, - "time_per_iteration": 2.6898791790008545 - }, - { - "auxiliary_loss_clip": 0.01096112, - "auxiliary_loss_mlp": 0.01027534, - "balance_loss_clip": 1.03487992, - "balance_loss_mlp": 1.01640594, - "epoch": 0.9139936870584698, - "flos": 18004174381440.0, - "grad_norm": 1.9050793527303824, - "language_loss": 0.70880222, - "learning_rate": 7.704303254710165e-08, - "loss": 0.73003864, - "num_input_tokens_seen": 327944145, - "step": 15202, - "time_per_iteration": 2.645087718963623 - }, - { - "auxiliary_loss_clip": 0.01107696, - "auxiliary_loss_mlp": 0.01031158, - "balance_loss_clip": 1.03578544, - "balance_loss_mlp": 1.01858711, - "epoch": 0.9140538103111379, - "flos": 15813767111040.0, - "grad_norm": 5.183790549538575, - "language_loss": 0.66272342, - "learning_rate": 7.693601370155001e-08, - "loss": 0.68411195, - "num_input_tokens_seen": 327960565, - "step": 15203, - "time_per_iteration": 2.5569849014282227 - }, - { - "auxiliary_loss_clip": 0.01099433, - "auxiliary_loss_mlp": 0.0102949, - "balance_loss_clip": 1.03735852, - "balance_loss_mlp": 1.01664543, - "epoch": 0.9141139335638058, - "flos": 23987035416960.0, - "grad_norm": 1.5350852350505626, - "language_loss": 0.68632525, - "learning_rate": 7.682906777877751e-08, - "loss": 0.70761448, - "num_input_tokens_seen": 327981180, - "step": 15204, - "time_per_iteration": 2.609595537185669 - }, - { - "auxiliary_loss_clip": 0.01096665, - "auxiliary_loss_mlp": 0.01024903, - "balance_loss_clip": 1.03312159, - "balance_loss_mlp": 1.01215935, - "epoch": 0.9141740568164738, - "flos": 24024418496640.0, - "grad_norm": 1.940906740500505, - "language_loss": 0.59392846, - "learning_rate": 7.672219478283915e-08, - "loss": 0.61514413, - "num_input_tokens_seen": 328001500, - "step": 15205, - "time_per_iteration": 4.150220632553101 - }, - { - "auxiliary_loss_clip": 0.01065472, - "auxiliary_loss_mlp": 0.0103254, - "balance_loss_clip": 1.03354537, - "balance_loss_mlp": 1.01977837, - "epoch": 0.9142341800691417, - "flos": 27018291139200.0, - "grad_norm": 1.7151218871860374, - "language_loss": 0.81336343, - "learning_rate": 7.661539471778811e-08, - "loss": 0.83434355, - "num_input_tokens_seen": 328023025, - "step": 15206, - "time_per_iteration": 4.417832612991333 - }, - { - "auxiliary_loss_clip": 0.01062676, - "auxiliary_loss_mlp": 0.0102896, - "balance_loss_clip": 1.03224123, - "balance_loss_mlp": 1.01588321, - "epoch": 0.9142943033218097, - "flos": 20412487509120.0, - "grad_norm": 2.7859643949116695, - "language_loss": 0.73940361, - "learning_rate": 7.650866758767382e-08, - "loss": 0.76031995, - "num_input_tokens_seen": 328041410, - "step": 15207, - "time_per_iteration": 2.729606866836548 - }, - { - "auxiliary_loss_clip": 0.01068037, - "auxiliary_loss_mlp": 0.01037257, - "balance_loss_clip": 1.04014826, - "balance_loss_mlp": 1.02391171, - "epoch": 0.9143544265744776, - "flos": 19755322231680.0, - "grad_norm": 1.6574585771542836, - "language_loss": 0.7323935, - "learning_rate": 7.640201339654373e-08, - "loss": 0.75344646, - "num_input_tokens_seen": 328060495, - "step": 15208, - "time_per_iteration": 4.227857351303101 - }, - { - "auxiliary_loss_clip": 0.01091165, - "auxiliary_loss_mlp": 0.01027772, - "balance_loss_clip": 1.03750086, - "balance_loss_mlp": 1.01647067, - "epoch": 0.9144145498271457, - "flos": 17165444832000.0, - "grad_norm": 2.0923542291564545, - "language_loss": 0.8601079, - "learning_rate": 7.629543214844237e-08, - "loss": 0.88129735, - "num_input_tokens_seen": 328076905, - "step": 15209, - "time_per_iteration": 2.590949058532715 - }, - { - "auxiliary_loss_clip": 0.01091262, - "auxiliary_loss_mlp": 0.01034147, - "balance_loss_clip": 1.04051423, - "balance_loss_mlp": 1.0222261, - "epoch": 0.9144746730798137, - "flos": 23726072131200.0, - "grad_norm": 1.9387336499719838, - "language_loss": 0.75063741, - "learning_rate": 7.618892384741093e-08, - "loss": 0.77189153, - "num_input_tokens_seen": 328096960, - "step": 15210, - "time_per_iteration": 2.6469690799713135 - }, - { - "auxiliary_loss_clip": 0.01083487, - "auxiliary_loss_mlp": 0.01032807, - "balance_loss_clip": 1.03146422, - "balance_loss_mlp": 1.02025414, - "epoch": 0.9145347963324816, - "flos": 25847854467840.0, - "grad_norm": 2.0189583543818994, - "language_loss": 0.78215957, - "learning_rate": 7.6082488497488e-08, - "loss": 0.80332255, - "num_input_tokens_seen": 328115445, - "step": 15211, - "time_per_iteration": 2.6844332218170166 - }, - { - "auxiliary_loss_clip": 0.01100808, - "auxiliary_loss_mlp": 0.01026537, - "balance_loss_clip": 1.03790462, - "balance_loss_mlp": 1.01447928, - "epoch": 0.9145949195851496, - "flos": 19242769109760.0, - "grad_norm": 1.6970166038949297, - "language_loss": 0.82861638, - "learning_rate": 7.597612610270986e-08, - "loss": 0.84988987, - "num_input_tokens_seen": 328133965, - "step": 15212, - "time_per_iteration": 2.670666217803955 - }, - { - "auxiliary_loss_clip": 0.01095988, - "auxiliary_loss_mlp": 0.01029094, - "balance_loss_clip": 1.03628695, - "balance_loss_mlp": 1.01744699, - "epoch": 0.9146550428378175, - "flos": 18296379521280.0, - "grad_norm": 1.816708490158756, - "language_loss": 0.83801937, - "learning_rate": 7.586983666711022e-08, - "loss": 0.85927022, - "num_input_tokens_seen": 328151520, - "step": 15213, - "time_per_iteration": 2.5807952880859375 - }, - { - "auxiliary_loss_clip": 0.01092484, - "auxiliary_loss_mlp": 0.0102752, - "balance_loss_clip": 1.03717518, - "balance_loss_mlp": 1.01593268, - "epoch": 0.9147151660904855, - "flos": 20084264006400.0, - "grad_norm": 1.7762329213074084, - "language_loss": 0.70716697, - "learning_rate": 7.576362019471894e-08, - "loss": 0.72836697, - "num_input_tokens_seen": 328171275, - "step": 15214, - "time_per_iteration": 2.606302499771118 - }, - { - "auxiliary_loss_clip": 0.01100282, - "auxiliary_loss_mlp": 0.0103609, - "balance_loss_clip": 1.03756428, - "balance_loss_mlp": 1.02288795, - "epoch": 0.9147752893431534, - "flos": 24389127239040.0, - "grad_norm": 2.6763056235741876, - "language_loss": 0.62738419, - "learning_rate": 7.565747668956413e-08, - "loss": 0.64874792, - "num_input_tokens_seen": 328192115, - "step": 15215, - "time_per_iteration": 2.624128580093384 - }, - { - "auxiliary_loss_clip": 0.01083257, - "auxiliary_loss_mlp": 0.01031489, - "balance_loss_clip": 1.04120791, - "balance_loss_mlp": 1.0186621, - "epoch": 0.9148354125958215, - "flos": 18150402648960.0, - "grad_norm": 2.856196608513459, - "language_loss": 0.75838691, - "learning_rate": 7.555140615567058e-08, - "loss": 0.77953434, - "num_input_tokens_seen": 328208990, - "step": 15216, - "time_per_iteration": 2.683112144470215 - }, - { - "auxiliary_loss_clip": 0.01082061, - "auxiliary_loss_mlp": 0.0104043, - "balance_loss_clip": 1.0344038, - "balance_loss_mlp": 1.02597594, - "epoch": 0.9148955358484894, - "flos": 23367540528000.0, - "grad_norm": 2.1556116302861223, - "language_loss": 0.679968, - "learning_rate": 7.544540859706062e-08, - "loss": 0.70119286, - "num_input_tokens_seen": 328227840, - "step": 15217, - "time_per_iteration": 2.7583320140838623 - }, - { - "auxiliary_loss_clip": 0.01096251, - "auxiliary_loss_mlp": 0.01034398, - "balance_loss_clip": 1.03755021, - "balance_loss_mlp": 1.02222061, - "epoch": 0.9149556591011574, - "flos": 18076498416000.0, - "grad_norm": 1.7866598830816114, - "language_loss": 0.79880273, - "learning_rate": 7.533948401775347e-08, - "loss": 0.82010925, - "num_input_tokens_seen": 328246250, - "step": 15218, - "time_per_iteration": 2.5897185802459717 - }, - { - "auxiliary_loss_clip": 0.0099941, - "auxiliary_loss_mlp": 0.00999986, - "balance_loss_clip": 1.00879896, - "balance_loss_mlp": 0.99891329, - "epoch": 0.9150157823538253, - "flos": 54586374825600.0, - "grad_norm": 0.8465659506320653, - "language_loss": 0.59200621, - "learning_rate": 7.523363242176595e-08, - "loss": 0.61200017, - "num_input_tokens_seen": 328303625, - "step": 15219, - "time_per_iteration": 3.1801815032958984 - }, - { - "auxiliary_loss_clip": 0.01096152, - "auxiliary_loss_mlp": 0.01034464, - "balance_loss_clip": 1.03535295, - "balance_loss_mlp": 1.0223403, - "epoch": 0.9150759056064933, - "flos": 17893102550400.0, - "grad_norm": 2.4543943314261063, - "language_loss": 0.78340375, - "learning_rate": 7.512785381311216e-08, - "loss": 0.80470991, - "num_input_tokens_seen": 328322135, - "step": 15220, - "time_per_iteration": 2.595521926879883 - }, - { - "auxiliary_loss_clip": 0.01057387, - "auxiliary_loss_mlp": 0.01042337, - "balance_loss_clip": 1.03327441, - "balance_loss_mlp": 1.02777517, - "epoch": 0.9151360288591612, - "flos": 18073517587200.0, - "grad_norm": 2.0267534769754683, - "language_loss": 0.66091788, - "learning_rate": 7.50221481958031e-08, - "loss": 0.68191504, - "num_input_tokens_seen": 328340750, - "step": 15221, - "time_per_iteration": 2.7066280841827393 - }, - { - "auxiliary_loss_clip": 0.01086188, - "auxiliary_loss_mlp": 0.01031232, - "balance_loss_clip": 1.0361774, - "balance_loss_mlp": 1.01978827, - "epoch": 0.9151961521118293, - "flos": 19354523299200.0, - "grad_norm": 1.6413784171664523, - "language_loss": 0.84243524, - "learning_rate": 7.491651557384692e-08, - "loss": 0.86360949, - "num_input_tokens_seen": 328359995, - "step": 15222, - "time_per_iteration": 2.6501386165618896 - }, - { - "auxiliary_loss_clip": 0.01014171, - "auxiliary_loss_mlp": 0.0100656, - "balance_loss_clip": 1.01053584, - "balance_loss_mlp": 1.00542736, - "epoch": 0.9152562753644973, - "flos": 72146621018880.0, - "grad_norm": 0.7238738726338669, - "language_loss": 0.49580848, - "learning_rate": 7.481095595124953e-08, - "loss": 0.51601577, - "num_input_tokens_seen": 328426865, - "step": 15223, - "time_per_iteration": 3.214282751083374 - }, - { - "auxiliary_loss_clip": 0.01078844, - "auxiliary_loss_mlp": 0.01037006, - "balance_loss_clip": 1.03739119, - "balance_loss_mlp": 1.02367282, - "epoch": 0.9153163986171652, - "flos": 20777016683520.0, - "grad_norm": 2.2306023467876175, - "language_loss": 0.72199959, - "learning_rate": 7.470546933201349e-08, - "loss": 0.7431581, - "num_input_tokens_seen": 328445970, - "step": 15224, - "time_per_iteration": 2.673509359359741 - }, - { - "auxiliary_loss_clip": 0.01093298, - "auxiliary_loss_mlp": 0.01029185, - "balance_loss_clip": 1.03519857, - "balance_loss_mlp": 1.01645935, - "epoch": 0.9153765218698332, - "flos": 23040107124480.0, - "grad_norm": 1.873148880683522, - "language_loss": 0.81030774, - "learning_rate": 7.460005572013895e-08, - "loss": 0.83153254, - "num_input_tokens_seen": 328464585, - "step": 15225, - "time_per_iteration": 2.5755882263183594 - }, - { - "auxiliary_loss_clip": 0.01105808, - "auxiliary_loss_mlp": 0.01023692, - "balance_loss_clip": 1.03513598, - "balance_loss_mlp": 1.01225948, - "epoch": 0.9154366451225011, - "flos": 28990900293120.0, - "grad_norm": 1.4093561745696859, - "language_loss": 0.71350908, - "learning_rate": 7.44947151196238e-08, - "loss": 0.73480415, - "num_input_tokens_seen": 328490155, - "step": 15226, - "time_per_iteration": 2.658024787902832 - }, - { - "auxiliary_loss_clip": 0.01038791, - "auxiliary_loss_mlp": 0.01029628, - "balance_loss_clip": 1.03364909, - "balance_loss_mlp": 1.01687872, - "epoch": 0.9154967683751691, - "flos": 22309504490880.0, - "grad_norm": 2.8076014846483166, - "language_loss": 0.74480593, - "learning_rate": 7.43894475344613e-08, - "loss": 0.76549006, - "num_input_tokens_seen": 328508275, - "step": 15227, - "time_per_iteration": 2.8204689025878906 - }, - { - "auxiliary_loss_clip": 0.01084535, - "auxiliary_loss_mlp": 0.01031001, - "balance_loss_clip": 1.03527713, - "balance_loss_mlp": 1.01924694, - "epoch": 0.915556891627837, - "flos": 24571481610240.0, - "grad_norm": 1.973795210729136, - "language_loss": 0.74037504, - "learning_rate": 7.428425296864404e-08, - "loss": 0.7615304, - "num_input_tokens_seen": 328529425, - "step": 15228, - "time_per_iteration": 2.745267152786255 - }, - { - "auxiliary_loss_clip": 0.0106924, - "auxiliary_loss_mlp": 0.01029082, - "balance_loss_clip": 1.03406215, - "balance_loss_mlp": 1.01733994, - "epoch": 0.9156170148805051, - "flos": 22164676853760.0, - "grad_norm": 1.4512437253894719, - "language_loss": 0.71928173, - "learning_rate": 7.417913142616106e-08, - "loss": 0.74026489, - "num_input_tokens_seen": 328550200, - "step": 15229, - "time_per_iteration": 2.8107035160064697 - }, - { - "auxiliary_loss_clip": 0.01111837, - "auxiliary_loss_mlp": 0.01035356, - "balance_loss_clip": 1.03959012, - "balance_loss_mlp": 1.0219506, - "epoch": 0.915677138133173, - "flos": 20920659171840.0, - "grad_norm": 1.9803845760849772, - "language_loss": 0.83079779, - "learning_rate": 7.407408291099848e-08, - "loss": 0.85226971, - "num_input_tokens_seen": 328568540, - "step": 15230, - "time_per_iteration": 2.5778980255126953 - }, - { - "auxiliary_loss_clip": 0.01068692, - "auxiliary_loss_mlp": 0.0102912, - "balance_loss_clip": 1.03630972, - "balance_loss_mlp": 1.01733065, - "epoch": 0.915737261385841, - "flos": 24345136056960.0, - "grad_norm": 1.5638994000303916, - "language_loss": 0.83665484, - "learning_rate": 7.396910742713957e-08, - "loss": 0.85763288, - "num_input_tokens_seen": 328587300, - "step": 15231, - "time_per_iteration": 2.757667303085327 - }, - { - "auxiliary_loss_clip": 0.0109037, - "auxiliary_loss_mlp": 0.0102554, - "balance_loss_clip": 1.03120708, - "balance_loss_mlp": 1.01339293, - "epoch": 0.9157973846385089, - "flos": 26761386090240.0, - "grad_norm": 1.4862156687145838, - "language_loss": 0.72474539, - "learning_rate": 7.386420497856516e-08, - "loss": 0.74590445, - "num_input_tokens_seen": 328610055, - "step": 15232, - "time_per_iteration": 2.65309739112854 - }, - { - "auxiliary_loss_clip": 0.01110021, - "auxiliary_loss_mlp": 0.01035652, - "balance_loss_clip": 1.03648698, - "balance_loss_mlp": 1.02338552, - "epoch": 0.9158575078911769, - "flos": 18478733892480.0, - "grad_norm": 2.186963867327178, - "language_loss": 0.67672479, - "learning_rate": 7.375937556925338e-08, - "loss": 0.69818151, - "num_input_tokens_seen": 328626815, - "step": 15233, - "time_per_iteration": 2.5290985107421875 - }, - { - "auxiliary_loss_clip": 0.01084574, - "auxiliary_loss_mlp": 0.01037951, - "balance_loss_clip": 1.03832459, - "balance_loss_mlp": 1.02474308, - "epoch": 0.9159176311438448, - "flos": 21798926616960.0, - "grad_norm": 1.9371619126619564, - "language_loss": 0.69512558, - "learning_rate": 7.365461920317861e-08, - "loss": 0.71635091, - "num_input_tokens_seen": 328643995, - "step": 15234, - "time_per_iteration": 2.6468849182128906 - }, - { - "auxiliary_loss_clip": 0.01086822, - "auxiliary_loss_mlp": 0.01034886, - "balance_loss_clip": 1.0372566, - "balance_loss_mlp": 1.02233958, - "epoch": 0.9159777543965129, - "flos": 24783749032320.0, - "grad_norm": 1.9164787678121122, - "language_loss": 0.88101876, - "learning_rate": 7.354993588431391e-08, - "loss": 0.90223587, - "num_input_tokens_seen": 328659565, - "step": 15235, - "time_per_iteration": 2.681330919265747 - }, - { - "auxiliary_loss_clip": 0.0104198, - "auxiliary_loss_mlp": 0.01037221, - "balance_loss_clip": 1.03242683, - "balance_loss_mlp": 1.0227077, - "epoch": 0.9160378776491809, - "flos": 26868758820480.0, - "grad_norm": 1.7189420130737911, - "language_loss": 0.77287024, - "learning_rate": 7.344532561662853e-08, - "loss": 0.79366231, - "num_input_tokens_seen": 328679045, - "step": 15236, - "time_per_iteration": 2.7985198497772217 - }, - { - "auxiliary_loss_clip": 0.00988696, - "auxiliary_loss_mlp": 0.01006396, - "balance_loss_clip": 1.01333547, - "balance_loss_mlp": 1.00522804, - "epoch": 0.9160980009018488, - "flos": 70578222589440.0, - "grad_norm": 0.6745147326692066, - "language_loss": 0.62227875, - "learning_rate": 7.334078840409019e-08, - "loss": 0.64222974, - "num_input_tokens_seen": 328744565, - "step": 15237, - "time_per_iteration": 3.2159206867218018 - }, - { - "auxiliary_loss_clip": 0.0111032, - "auxiliary_loss_mlp": 0.00770462, - "balance_loss_clip": 1.03761566, - "balance_loss_mlp": 1.00039566, - "epoch": 0.9161581241545168, - "flos": 16289332202880.0, - "grad_norm": 2.2962314429529638, - "language_loss": 0.75145757, - "learning_rate": 7.323632425066151e-08, - "loss": 0.77026534, - "num_input_tokens_seen": 328762455, - "step": 15238, - "time_per_iteration": 2.5952906608581543 - }, - { - "auxiliary_loss_clip": 0.01108796, - "auxiliary_loss_mlp": 0.01025665, - "balance_loss_clip": 1.03680956, - "balance_loss_mlp": 1.01369047, - "epoch": 0.9162182474071847, - "flos": 18438154502400.0, - "grad_norm": 2.6834766833849693, - "language_loss": 0.7463975, - "learning_rate": 7.313193316030464e-08, - "loss": 0.76774204, - "num_input_tokens_seen": 328780320, - "step": 15239, - "time_per_iteration": 2.5366570949554443 - }, - { - "auxiliary_loss_clip": 0.01078699, - "auxiliary_loss_mlp": 0.01035056, - "balance_loss_clip": 1.03494883, - "balance_loss_mlp": 1.02270627, - "epoch": 0.9162783706598527, - "flos": 19167248764800.0, - "grad_norm": 3.1115685298181797, - "language_loss": 0.63496542, - "learning_rate": 7.302761513697819e-08, - "loss": 0.65610296, - "num_input_tokens_seen": 328797570, - "step": 15240, - "time_per_iteration": 2.654343366622925 - }, - { - "auxiliary_loss_clip": 0.01084597, - "auxiliary_loss_mlp": 0.00769911, - "balance_loss_clip": 1.0354557, - "balance_loss_mlp": 1.00024796, - "epoch": 0.9163384939125206, - "flos": 20412990299520.0, - "grad_norm": 1.818210522630089, - "language_loss": 0.7633701, - "learning_rate": 7.292337018463746e-08, - "loss": 0.78191519, - "num_input_tokens_seen": 328814075, - "step": 15241, - "time_per_iteration": 2.681783676147461 - }, - { - "auxiliary_loss_clip": 0.01103855, - "auxiliary_loss_mlp": 0.01030564, - "balance_loss_clip": 1.03727055, - "balance_loss_mlp": 1.01654494, - "epoch": 0.9163986171651887, - "flos": 19645902426240.0, - "grad_norm": 2.120916469746568, - "language_loss": 0.67877054, - "learning_rate": 7.281919830723549e-08, - "loss": 0.70011473, - "num_input_tokens_seen": 328831990, - "step": 15242, - "time_per_iteration": 2.695181131362915 - }, - { - "auxiliary_loss_clip": 0.01095195, - "auxiliary_loss_mlp": 0.01035014, - "balance_loss_clip": 1.03303313, - "balance_loss_mlp": 1.02215111, - "epoch": 0.9164587404178566, - "flos": 12823054865280.0, - "grad_norm": 2.0974216325015944, - "language_loss": 0.80733311, - "learning_rate": 7.271509950872334e-08, - "loss": 0.8286351, - "num_input_tokens_seen": 328849105, - "step": 15243, - "time_per_iteration": 2.634120464324951 - }, - { - "auxiliary_loss_clip": 0.01082905, - "auxiliary_loss_mlp": 0.0103102, - "balance_loss_clip": 1.03140903, - "balance_loss_mlp": 1.01816344, - "epoch": 0.9165188636705246, - "flos": 22309396750080.0, - "grad_norm": 1.8693748825507899, - "language_loss": 0.82145083, - "learning_rate": 7.261107379304721e-08, - "loss": 0.84259009, - "num_input_tokens_seen": 328866810, - "step": 15244, - "time_per_iteration": 4.170153617858887 - }, - { - "auxiliary_loss_clip": 0.01113607, - "auxiliary_loss_mlp": 0.01035382, - "balance_loss_clip": 1.0378207, - "balance_loss_mlp": 1.02204251, - "epoch": 0.9165789869231925, - "flos": 18223337214720.0, - "grad_norm": 3.40014047465237, - "language_loss": 0.71937442, - "learning_rate": 7.250712116415214e-08, - "loss": 0.74086428, - "num_input_tokens_seen": 328885325, - "step": 15245, - "time_per_iteration": 4.17969822883606 - }, - { - "auxiliary_loss_clip": 0.01083804, - "auxiliary_loss_mlp": 0.01029232, - "balance_loss_clip": 1.03430676, - "balance_loss_mlp": 1.01741219, - "epoch": 0.9166391101758605, - "flos": 13691553811200.0, - "grad_norm": 1.6435574883208541, - "language_loss": 0.74527669, - "learning_rate": 7.240324162598033e-08, - "loss": 0.76640707, - "num_input_tokens_seen": 328902655, - "step": 15246, - "time_per_iteration": 4.363448858261108 - }, - { - "auxiliary_loss_clip": 0.01080629, - "auxiliary_loss_mlp": 0.01034328, - "balance_loss_clip": 1.03388071, - "balance_loss_mlp": 1.02122653, - "epoch": 0.9166992334285284, - "flos": 17346793622400.0, - "grad_norm": 1.9577914656696735, - "language_loss": 0.75327551, - "learning_rate": 7.229943518247106e-08, - "loss": 0.77442503, - "num_input_tokens_seen": 328918440, - "step": 15247, - "time_per_iteration": 2.664409637451172 - }, - { - "auxiliary_loss_clip": 0.01101374, - "auxiliary_loss_mlp": 0.01027057, - "balance_loss_clip": 1.03908849, - "balance_loss_mlp": 1.01476669, - "epoch": 0.9167593566811965, - "flos": 23731135948800.0, - "grad_norm": 1.669323682742113, - "language_loss": 0.76257682, - "learning_rate": 7.219570183756052e-08, - "loss": 0.7838611, - "num_input_tokens_seen": 328938055, - "step": 15248, - "time_per_iteration": 4.128343820571899 - }, - { - "auxiliary_loss_clip": 0.01097593, - "auxiliary_loss_mlp": 0.01037777, - "balance_loss_clip": 1.03494728, - "balance_loss_mlp": 1.02446711, - "epoch": 0.9168194799338644, - "flos": 27818201064960.0, - "grad_norm": 2.2661509072382424, - "language_loss": 0.72809201, - "learning_rate": 7.209204159518178e-08, - "loss": 0.74944574, - "num_input_tokens_seen": 328957895, - "step": 15249, - "time_per_iteration": 2.67682147026062 - }, - { - "auxiliary_loss_clip": 0.01060539, - "auxiliary_loss_mlp": 0.01029442, - "balance_loss_clip": 1.03332496, - "balance_loss_mlp": 1.01615024, - "epoch": 0.9168796031865324, - "flos": 21717552355200.0, - "grad_norm": 3.1939184411772406, - "language_loss": 0.75809246, - "learning_rate": 7.198845445926616e-08, - "loss": 0.7789923, - "num_input_tokens_seen": 328971365, - "step": 15250, - "time_per_iteration": 2.738577365875244 - }, - { - "auxiliary_loss_clip": 0.01066866, - "auxiliary_loss_mlp": 0.01026181, - "balance_loss_clip": 1.03519356, - "balance_loss_mlp": 1.01423001, - "epoch": 0.9169397264392004, - "flos": 23404420817280.0, - "grad_norm": 1.6784135757036345, - "language_loss": 0.75771379, - "learning_rate": 7.188494043374138e-08, - "loss": 0.77864426, - "num_input_tokens_seen": 328990830, - "step": 15251, - "time_per_iteration": 2.7864675521850586 - }, - { - "auxiliary_loss_clip": 0.01084617, - "auxiliary_loss_mlp": 0.01033127, - "balance_loss_clip": 1.03682351, - "balance_loss_mlp": 1.01889396, - "epoch": 0.9169998496918683, - "flos": 23950981140480.0, - "grad_norm": 3.1809452254911896, - "language_loss": 0.79785126, - "learning_rate": 7.178149952253298e-08, - "loss": 0.81902874, - "num_input_tokens_seen": 329008345, - "step": 15252, - "time_per_iteration": 2.67496395111084 - }, - { - "auxiliary_loss_clip": 0.01108344, - "auxiliary_loss_mlp": 0.01034651, - "balance_loss_clip": 1.03633821, - "balance_loss_mlp": 1.02253342, - "epoch": 0.9170599729445363, - "flos": 18332469711360.0, - "grad_norm": 1.6979211858236058, - "language_loss": 0.77028179, - "learning_rate": 7.167813172956316e-08, - "loss": 0.79171169, - "num_input_tokens_seen": 329027440, - "step": 15253, - "time_per_iteration": 2.5820562839508057 - }, - { - "auxiliary_loss_clip": 0.01099567, - "auxiliary_loss_mlp": 0.01026712, - "balance_loss_clip": 1.03753924, - "balance_loss_mlp": 1.0148387, - "epoch": 0.9171200961972042, - "flos": 22674859678080.0, - "grad_norm": 1.9944636420338524, - "language_loss": 0.73225999, - "learning_rate": 7.157483705875256e-08, - "loss": 0.75352275, - "num_input_tokens_seen": 329046445, - "step": 15254, - "time_per_iteration": 2.66645884513855 - }, - { - "auxiliary_loss_clip": 0.01069043, - "auxiliary_loss_mlp": 0.01024866, - "balance_loss_clip": 1.03459096, - "balance_loss_mlp": 1.01324344, - "epoch": 0.9171802194498723, - "flos": 26719298328960.0, - "grad_norm": 1.757918865833482, - "language_loss": 0.79068267, - "learning_rate": 7.14716155140167e-08, - "loss": 0.81162179, - "num_input_tokens_seen": 329065555, - "step": 15255, - "time_per_iteration": 2.791233539581299 - }, - { - "auxiliary_loss_clip": 0.01099583, - "auxiliary_loss_mlp": 0.01032665, - "balance_loss_clip": 1.0360918, - "balance_loss_mlp": 1.01973057, - "epoch": 0.9172403427025402, - "flos": 37889240538240.0, - "grad_norm": 2.1163696122590228, - "language_loss": 0.68610239, - "learning_rate": 7.136846709927047e-08, - "loss": 0.70742488, - "num_input_tokens_seen": 329087515, - "step": 15256, - "time_per_iteration": 2.8768861293792725 - }, - { - "auxiliary_loss_clip": 0.0109198, - "auxiliary_loss_mlp": 0.01039298, - "balance_loss_clip": 1.03456831, - "balance_loss_mlp": 1.02614951, - "epoch": 0.9173004659552082, - "flos": 17055163100160.0, - "grad_norm": 1.585817342288342, - "language_loss": 0.83782554, - "learning_rate": 7.126539181842561e-08, - "loss": 0.85913831, - "num_input_tokens_seen": 329106820, - "step": 15257, - "time_per_iteration": 2.65502667427063 - }, - { - "auxiliary_loss_clip": 0.01082945, - "auxiliary_loss_mlp": 0.01033799, - "balance_loss_clip": 1.03255379, - "balance_loss_mlp": 1.0220809, - "epoch": 0.9173605892078761, - "flos": 22201593056640.0, - "grad_norm": 1.60833944396701, - "language_loss": 0.7756505, - "learning_rate": 7.116238967539012e-08, - "loss": 0.79681796, - "num_input_tokens_seen": 329126515, - "step": 15258, - "time_per_iteration": 2.6512203216552734 - }, - { - "auxiliary_loss_clip": 0.01093895, - "auxiliary_loss_mlp": 0.01030397, - "balance_loss_clip": 1.03959584, - "balance_loss_mlp": 1.01836896, - "epoch": 0.9174207124605441, - "flos": 16507776764160.0, - "grad_norm": 2.0334925748000794, - "language_loss": 0.78772163, - "learning_rate": 7.105946067406999e-08, - "loss": 0.80896461, - "num_input_tokens_seen": 329142660, - "step": 15259, - "time_per_iteration": 2.5838210582733154 - }, - { - "auxiliary_loss_clip": 0.01059246, - "auxiliary_loss_mlp": 0.01035054, - "balance_loss_clip": 1.03190184, - "balance_loss_mlp": 1.02319837, - "epoch": 0.917480835713212, - "flos": 24535606901760.0, - "grad_norm": 1.6551719766080486, - "language_loss": 0.76302671, - "learning_rate": 7.095660481836895e-08, - "loss": 0.7839697, - "num_input_tokens_seen": 329162575, - "step": 15260, - "time_per_iteration": 2.682069778442383 - }, - { - "auxiliary_loss_clip": 0.01066153, - "auxiliary_loss_mlp": 0.01028553, - "balance_loss_clip": 1.03227329, - "balance_loss_mlp": 1.0160774, - "epoch": 0.9175409589658801, - "flos": 20880726226560.0, - "grad_norm": 1.5511805000911754, - "language_loss": 0.61173445, - "learning_rate": 7.085382211218637e-08, - "loss": 0.63268149, - "num_input_tokens_seen": 329182090, - "step": 15261, - "time_per_iteration": 2.681443929672241 - }, - { - "auxiliary_loss_clip": 0.01080586, - "auxiliary_loss_mlp": 0.01029547, - "balance_loss_clip": 1.03192782, - "balance_loss_mlp": 1.01745868, - "epoch": 0.917601082218548, - "flos": 14276035918080.0, - "grad_norm": 1.8788230361145468, - "language_loss": 0.73716688, - "learning_rate": 7.075111255942002e-08, - "loss": 0.75826824, - "num_input_tokens_seen": 329196535, - "step": 15262, - "time_per_iteration": 2.6560230255126953 - }, - { - "auxiliary_loss_clip": 0.01110257, - "auxiliary_loss_mlp": 0.01038053, - "balance_loss_clip": 1.03490496, - "balance_loss_mlp": 1.0255841, - "epoch": 0.917661205471216, - "flos": 19099234362240.0, - "grad_norm": 1.8199175016949676, - "language_loss": 0.77784705, - "learning_rate": 7.064847616396496e-08, - "loss": 0.79933017, - "num_input_tokens_seen": 329215135, - "step": 15263, - "time_per_iteration": 2.5552515983581543 - }, - { - "auxiliary_loss_clip": 0.01110998, - "auxiliary_loss_mlp": 0.01029572, - "balance_loss_clip": 1.03634441, - "balance_loss_mlp": 1.017097, - "epoch": 0.917721328723884, - "flos": 21106568989440.0, - "grad_norm": 2.03433288811874, - "language_loss": 0.75501031, - "learning_rate": 7.054591292971324e-08, - "loss": 0.776416, - "num_input_tokens_seen": 329235150, - "step": 15264, - "time_per_iteration": 2.5273077487945557 - }, - { - "auxiliary_loss_clip": 0.01085288, - "auxiliary_loss_mlp": 0.0103495, - "balance_loss_clip": 1.03628254, - "balance_loss_mlp": 1.02340412, - "epoch": 0.9177814519765519, - "flos": 21943215550080.0, - "grad_norm": 1.6751862272881284, - "language_loss": 0.83633941, - "learning_rate": 7.044342286055394e-08, - "loss": 0.8575418, - "num_input_tokens_seen": 329254365, - "step": 15265, - "time_per_iteration": 2.6066534519195557 - }, - { - "auxiliary_loss_clip": 0.01114086, - "auxiliary_loss_mlp": 0.01040959, - "balance_loss_clip": 1.03847134, - "balance_loss_mlp": 1.02778673, - "epoch": 0.9178415752292199, - "flos": 24205982768640.0, - "grad_norm": 1.6645706894370145, - "language_loss": 0.7328164, - "learning_rate": 7.034100596037306e-08, - "loss": 0.75436687, - "num_input_tokens_seen": 329274385, - "step": 15266, - "time_per_iteration": 2.5833418369293213 - }, - { - "auxiliary_loss_clip": 0.01108539, - "auxiliary_loss_mlp": 0.01030083, - "balance_loss_clip": 1.03649783, - "balance_loss_mlp": 1.01844811, - "epoch": 0.9179016984818879, - "flos": 20042068504320.0, - "grad_norm": 1.59558924604592, - "language_loss": 0.77707624, - "learning_rate": 7.023866223305486e-08, - "loss": 0.79846251, - "num_input_tokens_seen": 329292160, - "step": 15267, - "time_per_iteration": 2.551771879196167 - }, - { - "auxiliary_loss_clip": 0.01017255, - "auxiliary_loss_mlp": 0.00751276, - "balance_loss_clip": 1.00686395, - "balance_loss_mlp": 0.99959415, - "epoch": 0.9179618217345559, - "flos": 65555901100800.0, - "grad_norm": 0.7374753112947235, - "language_loss": 0.56223977, - "learning_rate": 7.013639168247975e-08, - "loss": 0.57992506, - "num_input_tokens_seen": 329351870, - "step": 15268, - "time_per_iteration": 3.2256064414978027 - }, - { - "auxiliary_loss_clip": 0.01110226, - "auxiliary_loss_mlp": 0.00770103, - "balance_loss_clip": 1.03661978, - "balance_loss_mlp": 1.00023341, - "epoch": 0.9180219449872238, - "flos": 21324618501120.0, - "grad_norm": 1.9824828423996201, - "language_loss": 0.76052523, - "learning_rate": 7.0034194312526e-08, - "loss": 0.77932847, - "num_input_tokens_seen": 329370930, - "step": 15269, - "time_per_iteration": 2.571711540222168 - }, - { - "auxiliary_loss_clip": 0.01074295, - "auxiliary_loss_mlp": 0.0103616, - "balance_loss_clip": 1.03202271, - "balance_loss_mlp": 1.02265382, - "epoch": 0.9180820682398918, - "flos": 41060008684800.0, - "grad_norm": 1.7451151800168656, - "language_loss": 0.72839332, - "learning_rate": 6.993207012706936e-08, - "loss": 0.74949783, - "num_input_tokens_seen": 329391275, - "step": 15270, - "time_per_iteration": 2.877145290374756 - }, - { - "auxiliary_loss_clip": 0.01105632, - "auxiliary_loss_mlp": 0.01032058, - "balance_loss_clip": 1.03500867, - "balance_loss_mlp": 1.0196898, - "epoch": 0.9181421914925597, - "flos": 28072915384320.0, - "grad_norm": 1.5262987533233972, - "language_loss": 0.80171967, - "learning_rate": 6.98300191299821e-08, - "loss": 0.82309657, - "num_input_tokens_seen": 329412775, - "step": 15271, - "time_per_iteration": 2.696314573287964 - }, - { - "auxiliary_loss_clip": 0.0106623, - "auxiliary_loss_mlp": 0.01035218, - "balance_loss_clip": 1.03281534, - "balance_loss_mlp": 1.02193236, - "epoch": 0.9182023147452277, - "flos": 29169411909120.0, - "grad_norm": 2.0203873157492387, - "language_loss": 0.72958052, - "learning_rate": 6.972804132513355e-08, - "loss": 0.75059497, - "num_input_tokens_seen": 329432440, - "step": 15272, - "time_per_iteration": 2.7418758869171143 - }, - { - "auxiliary_loss_clip": 0.01080541, - "auxiliary_loss_mlp": 0.01034356, - "balance_loss_clip": 1.03587949, - "balance_loss_mlp": 1.0225302, - "epoch": 0.9182624379978956, - "flos": 24060831909120.0, - "grad_norm": 1.8761263587608576, - "language_loss": 0.72443533, - "learning_rate": 6.962613671639105e-08, - "loss": 0.74558425, - "num_input_tokens_seen": 329450605, - "step": 15273, - "time_per_iteration": 2.5915794372558594 - }, - { - "auxiliary_loss_clip": 0.01068999, - "auxiliary_loss_mlp": 0.01026814, - "balance_loss_clip": 1.033952, - "balance_loss_mlp": 1.01544738, - "epoch": 0.9183225612505637, - "flos": 23293528554240.0, - "grad_norm": 1.6815527096411953, - "language_loss": 0.74486136, - "learning_rate": 6.952430530761933e-08, - "loss": 0.76581949, - "num_input_tokens_seen": 329470550, - "step": 15274, - "time_per_iteration": 2.757570266723633 - }, - { - "auxiliary_loss_clip": 0.01095676, - "auxiliary_loss_mlp": 0.01038928, - "balance_loss_clip": 1.03320456, - "balance_loss_mlp": 1.02651846, - "epoch": 0.9183826845032316, - "flos": 19609237618560.0, - "grad_norm": 1.4749833825049345, - "language_loss": 0.68892634, - "learning_rate": 6.942254710267902e-08, - "loss": 0.71027237, - "num_input_tokens_seen": 329489765, - "step": 15275, - "time_per_iteration": 2.5961973667144775 - }, - { - "auxiliary_loss_clip": 0.01094254, - "auxiliary_loss_mlp": 0.0103149, - "balance_loss_clip": 1.03530109, - "balance_loss_mlp": 1.01921117, - "epoch": 0.9184428077558996, - "flos": 18479057114880.0, - "grad_norm": 1.9188925482656494, - "language_loss": 0.72735369, - "learning_rate": 6.932086210542953e-08, - "loss": 0.74861109, - "num_input_tokens_seen": 329507040, - "step": 15276, - "time_per_iteration": 2.557286024093628 - }, - { - "auxiliary_loss_clip": 0.01086791, - "auxiliary_loss_mlp": 0.01032139, - "balance_loss_clip": 1.03626883, - "balance_loss_mlp": 1.02049232, - "epoch": 0.9185029310085676, - "flos": 20741034234240.0, - "grad_norm": 1.5932066455164264, - "language_loss": 0.73415935, - "learning_rate": 6.921925031972642e-08, - "loss": 0.75534868, - "num_input_tokens_seen": 329525540, - "step": 15277, - "time_per_iteration": 2.6720054149627686 - }, - { - "auxiliary_loss_clip": 0.01000655, - "auxiliary_loss_mlp": 0.00999523, - "balance_loss_clip": 1.00764501, - "balance_loss_mlp": 0.99853915, - "epoch": 0.9185630542612355, - "flos": 68209231875840.0, - "grad_norm": 0.7136545127762665, - "language_loss": 0.59176219, - "learning_rate": 6.91177117494226e-08, - "loss": 0.61176395, - "num_input_tokens_seen": 329592905, - "step": 15278, - "time_per_iteration": 3.3310906887054443 - }, - { - "auxiliary_loss_clip": 0.01068097, - "auxiliary_loss_mlp": 0.01030448, - "balance_loss_clip": 1.03167319, - "balance_loss_mlp": 1.01953995, - "epoch": 0.9186231775139035, - "flos": 12239470598400.0, - "grad_norm": 1.6780534662903475, - "language_loss": 0.63930976, - "learning_rate": 6.901624639836879e-08, - "loss": 0.66029525, - "num_input_tokens_seen": 329610150, - "step": 15279, - "time_per_iteration": 2.6621286869049072 - }, - { - "auxiliary_loss_clip": 0.0102767, - "auxiliary_loss_mlp": 0.00751159, - "balance_loss_clip": 1.0052371, - "balance_loss_mlp": 0.99961108, - "epoch": 0.9186833007665715, - "flos": 63939237770880.0, - "grad_norm": 0.8547221489704414, - "language_loss": 0.60236037, - "learning_rate": 6.891485427041211e-08, - "loss": 0.62014866, - "num_input_tokens_seen": 329673650, - "step": 15280, - "time_per_iteration": 3.122877836227417 - }, - { - "auxiliary_loss_clip": 0.01090206, - "auxiliary_loss_mlp": 0.01032261, - "balance_loss_clip": 1.03693056, - "balance_loss_mlp": 1.01968455, - "epoch": 0.9187434240192395, - "flos": 19974700546560.0, - "grad_norm": 2.122988890708145, - "language_loss": 0.69674432, - "learning_rate": 6.881353536939815e-08, - "loss": 0.717969, - "num_input_tokens_seen": 329692520, - "step": 15281, - "time_per_iteration": 2.6311352252960205 - }, - { - "auxiliary_loss_clip": 0.01086175, - "auxiliary_loss_mlp": 0.01029211, - "balance_loss_clip": 1.03539133, - "balance_loss_mlp": 1.01567471, - "epoch": 0.9188035472719074, - "flos": 25227820874880.0, - "grad_norm": 1.7776435853136854, - "language_loss": 0.84506124, - "learning_rate": 6.871228969916831e-08, - "loss": 0.86621511, - "num_input_tokens_seen": 329713750, - "step": 15282, - "time_per_iteration": 2.6757116317749023 - }, - { - "auxiliary_loss_clip": 0.01082882, - "auxiliary_loss_mlp": 0.01031746, - "balance_loss_clip": 1.03398228, - "balance_loss_mlp": 1.01928234, - "epoch": 0.9188636705245754, - "flos": 18405547931520.0, - "grad_norm": 1.9199915461067039, - "language_loss": 0.60464978, - "learning_rate": 6.861111726356194e-08, - "loss": 0.62579608, - "num_input_tokens_seen": 329730960, - "step": 15283, - "time_per_iteration": 2.666703224182129 - }, - { - "auxiliary_loss_clip": 0.01100933, - "auxiliary_loss_mlp": 0.00770887, - "balance_loss_clip": 1.03721941, - "balance_loss_mlp": 1.00024927, - "epoch": 0.9189237937772433, - "flos": 23769129559680.0, - "grad_norm": 1.5468834987808995, - "language_loss": 0.65656137, - "learning_rate": 6.851001806641554e-08, - "loss": 0.67527962, - "num_input_tokens_seen": 329750975, - "step": 15284, - "time_per_iteration": 5.761394023895264 - }, - { - "auxiliary_loss_clip": 0.01106112, - "auxiliary_loss_mlp": 0.01032597, - "balance_loss_clip": 1.03494716, - "balance_loss_mlp": 1.02003229, - "epoch": 0.9189839170299113, - "flos": 21214624078080.0, - "grad_norm": 2.0145798278900164, - "language_loss": 0.73759109, - "learning_rate": 6.840899211156292e-08, - "loss": 0.75897819, - "num_input_tokens_seen": 329769645, - "step": 15285, - "time_per_iteration": 2.5861620903015137 - }, - { - "auxiliary_loss_clip": 0.0110641, - "auxiliary_loss_mlp": 0.01035061, - "balance_loss_clip": 1.03556252, - "balance_loss_mlp": 1.02236581, - "epoch": 0.9190440402825792, - "flos": 16727370560640.0, - "grad_norm": 1.8392842036391315, - "language_loss": 0.71751177, - "learning_rate": 6.830803940283458e-08, - "loss": 0.73892653, - "num_input_tokens_seen": 329788185, - "step": 15286, - "time_per_iteration": 4.326793193817139 - }, - { - "auxiliary_loss_clip": 0.01109819, - "auxiliary_loss_mlp": 0.01033357, - "balance_loss_clip": 1.03742743, - "balance_loss_mlp": 1.02026165, - "epoch": 0.9191041635352473, - "flos": 23441193365760.0, - "grad_norm": 1.8870763932590424, - "language_loss": 0.73988366, - "learning_rate": 6.820715994405945e-08, - "loss": 0.76131546, - "num_input_tokens_seen": 329806780, - "step": 15287, - "time_per_iteration": 2.582787275314331 - }, - { - "auxiliary_loss_clip": 0.01110857, - "auxiliary_loss_mlp": 0.01029796, - "balance_loss_clip": 1.03881836, - "balance_loss_mlp": 1.01651573, - "epoch": 0.9191642867879152, - "flos": 18807532012800.0, - "grad_norm": 2.0747808934421883, - "language_loss": 0.65521705, - "learning_rate": 6.810635373906226e-08, - "loss": 0.67662358, - "num_input_tokens_seen": 329826350, - "step": 15288, - "time_per_iteration": 4.104849338531494 - }, - { - "auxiliary_loss_clip": 0.01112827, - "auxiliary_loss_mlp": 0.010354, - "balance_loss_clip": 1.04050171, - "balance_loss_mlp": 1.02340722, - "epoch": 0.9192244100405832, - "flos": 32160950167680.0, - "grad_norm": 1.8366679912878503, - "language_loss": 0.71489662, - "learning_rate": 6.800562079166549e-08, - "loss": 0.73637891, - "num_input_tokens_seen": 329846160, - "step": 15289, - "time_per_iteration": 2.628432273864746 - }, - { - "auxiliary_loss_clip": 0.01067852, - "auxiliary_loss_mlp": 0.01037164, - "balance_loss_clip": 1.03277981, - "balance_loss_mlp": 1.02398539, - "epoch": 0.9192845332932512, - "flos": 16357669827840.0, - "grad_norm": 2.022083421674923, - "language_loss": 0.7447117, - "learning_rate": 6.790496110568921e-08, - "loss": 0.76576185, - "num_input_tokens_seen": 329862020, - "step": 15290, - "time_per_iteration": 2.6732118129730225 - }, - { - "auxiliary_loss_clip": 0.01067483, - "auxiliary_loss_mlp": 0.01027695, - "balance_loss_clip": 1.03620386, - "balance_loss_mlp": 1.01607156, - "epoch": 0.9193446565459191, - "flos": 26614475464320.0, - "grad_norm": 1.914747567902696, - "language_loss": 0.72083873, - "learning_rate": 6.78043746849506e-08, - "loss": 0.74179053, - "num_input_tokens_seen": 329880185, - "step": 15291, - "time_per_iteration": 2.72456431388855 - }, - { - "auxiliary_loss_clip": 0.01083225, - "auxiliary_loss_mlp": 0.01026967, - "balance_loss_clip": 1.03504729, - "balance_loss_mlp": 1.0149684, - "epoch": 0.9194047797985871, - "flos": 22492182084480.0, - "grad_norm": 1.6500392247637397, - "language_loss": 0.71124983, - "learning_rate": 6.770386153326346e-08, - "loss": 0.73235166, - "num_input_tokens_seen": 329900255, - "step": 15292, - "time_per_iteration": 2.6152868270874023 - }, - { - "auxiliary_loss_clip": 0.01087602, - "auxiliary_loss_mlp": 0.01029375, - "balance_loss_clip": 1.03518999, - "balance_loss_mlp": 1.01654267, - "epoch": 0.9194649030512551, - "flos": 25078791346560.0, - "grad_norm": 2.1012892949543454, - "language_loss": 0.72765195, - "learning_rate": 6.760342165443988e-08, - "loss": 0.74882174, - "num_input_tokens_seen": 329919095, - "step": 15293, - "time_per_iteration": 2.7014577388763428 - }, - { - "auxiliary_loss_clip": 0.01106702, - "auxiliary_loss_mlp": 0.01026876, - "balance_loss_clip": 1.03621578, - "balance_loss_mlp": 1.01458549, - "epoch": 0.9195250263039231, - "flos": 11911139354880.0, - "grad_norm": 1.8656934281191482, - "language_loss": 0.78315026, - "learning_rate": 6.750305505228837e-08, - "loss": 0.80448604, - "num_input_tokens_seen": 329936505, - "step": 15294, - "time_per_iteration": 2.547825813293457 - }, - { - "auxiliary_loss_clip": 0.01088089, - "auxiliary_loss_mlp": 0.01036683, - "balance_loss_clip": 1.0347265, - "balance_loss_mlp": 1.02261615, - "epoch": 0.919585149556591, - "flos": 21834154880640.0, - "grad_norm": 1.8102816220705245, - "language_loss": 0.77170849, - "learning_rate": 6.74027617306141e-08, - "loss": 0.79295617, - "num_input_tokens_seen": 329956795, - "step": 15295, - "time_per_iteration": 2.7039098739624023 - }, - { - "auxiliary_loss_clip": 0.01106989, - "auxiliary_loss_mlp": 0.01029958, - "balance_loss_clip": 1.03723979, - "balance_loss_mlp": 1.01890755, - "epoch": 0.919645272809259, - "flos": 28184059042560.0, - "grad_norm": 2.3118295307682066, - "language_loss": 0.7140969, - "learning_rate": 6.730254169322114e-08, - "loss": 0.73546642, - "num_input_tokens_seen": 329977195, - "step": 15296, - "time_per_iteration": 2.6299383640289307 - }, - { - "auxiliary_loss_clip": 0.01109705, - "auxiliary_loss_mlp": 0.01039854, - "balance_loss_clip": 1.03783691, - "balance_loss_mlp": 1.02766538, - "epoch": 0.9197053960619269, - "flos": 18332828847360.0, - "grad_norm": 2.0214476637003567, - "language_loss": 0.75176775, - "learning_rate": 6.720239494390912e-08, - "loss": 0.77326334, - "num_input_tokens_seen": 329992095, - "step": 15297, - "time_per_iteration": 2.5208096504211426 - }, - { - "auxiliary_loss_clip": 0.01093577, - "auxiliary_loss_mlp": 0.00770462, - "balance_loss_clip": 1.03651249, - "balance_loss_mlp": 1.00015736, - "epoch": 0.9197655193145949, - "flos": 28183448511360.0, - "grad_norm": 1.600708869843347, - "language_loss": 0.73453987, - "learning_rate": 6.710232148647676e-08, - "loss": 0.75318027, - "num_input_tokens_seen": 330011490, - "step": 15298, - "time_per_iteration": 2.5899410247802734 - }, - { - "auxiliary_loss_clip": 0.01084548, - "auxiliary_loss_mlp": 0.01035002, - "balance_loss_clip": 1.03919554, - "balance_loss_mlp": 1.02254462, - "epoch": 0.9198256425672628, - "flos": 17306321973120.0, - "grad_norm": 1.9381032663604973, - "language_loss": 0.79355192, - "learning_rate": 6.70023213247175e-08, - "loss": 0.81474739, - "num_input_tokens_seen": 330027885, - "step": 15299, - "time_per_iteration": 2.618654251098633 - }, - { - "auxiliary_loss_clip": 0.01078356, - "auxiliary_loss_mlp": 0.01023938, - "balance_loss_clip": 1.03582788, - "balance_loss_mlp": 1.01230943, - "epoch": 0.9198857658199309, - "flos": 17858520731520.0, - "grad_norm": 2.334922484548837, - "language_loss": 0.63701689, - "learning_rate": 6.690239446242385e-08, - "loss": 0.65803981, - "num_input_tokens_seen": 330046230, - "step": 15300, - "time_per_iteration": 2.6653809547424316 - }, - { - "auxiliary_loss_clip": 0.01079487, - "auxiliary_loss_mlp": 0.00768011, - "balance_loss_clip": 1.03474522, - "balance_loss_mlp": 1.00012684, - "epoch": 0.9199458890725988, - "flos": 22127545169280.0, - "grad_norm": 1.7470881044851607, - "language_loss": 0.69722879, - "learning_rate": 6.680254090338545e-08, - "loss": 0.71570385, - "num_input_tokens_seen": 330065535, - "step": 15301, - "time_per_iteration": 2.6812119483947754 - }, - { - "auxiliary_loss_clip": 0.01096515, - "auxiliary_loss_mlp": 0.01040071, - "balance_loss_clip": 1.0358305, - "balance_loss_mlp": 1.02490699, - "epoch": 0.9200060123252668, - "flos": 16034043265920.0, - "grad_norm": 1.711835493107777, - "language_loss": 0.71127915, - "learning_rate": 6.670276065138814e-08, - "loss": 0.73264498, - "num_input_tokens_seen": 330082920, - "step": 15302, - "time_per_iteration": 2.5945441722869873 - }, - { - "auxiliary_loss_clip": 0.01110029, - "auxiliary_loss_mlp": 0.01030513, - "balance_loss_clip": 1.03716493, - "balance_loss_mlp": 1.0183115, - "epoch": 0.9200661355779348, - "flos": 26864521015680.0, - "grad_norm": 2.8681928190187556, - "language_loss": 0.76527154, - "learning_rate": 6.660305371021579e-08, - "loss": 0.78667694, - "num_input_tokens_seen": 330101165, - "step": 15303, - "time_per_iteration": 2.641113519668579 - }, - { - "auxiliary_loss_clip": 0.01088214, - "auxiliary_loss_mlp": 0.01031045, - "balance_loss_clip": 1.03663945, - "balance_loss_mlp": 1.01886749, - "epoch": 0.9201262588306027, - "flos": 12786749193600.0, - "grad_norm": 3.150146783563773, - "language_loss": 0.88236862, - "learning_rate": 6.650342008365006e-08, - "loss": 0.90356123, - "num_input_tokens_seen": 330118775, - "step": 15304, - "time_per_iteration": 2.6956560611724854 - }, - { - "auxiliary_loss_clip": 0.0104635, - "auxiliary_loss_mlp": 0.01037753, - "balance_loss_clip": 1.03275561, - "balance_loss_mlp": 1.02204168, - "epoch": 0.9201863820832707, - "flos": 20631614428800.0, - "grad_norm": 2.036135949691738, - "language_loss": 0.77178156, - "learning_rate": 6.64038597754677e-08, - "loss": 0.79262257, - "num_input_tokens_seen": 330135570, - "step": 15305, - "time_per_iteration": 2.817863941192627 - }, - { - "auxiliary_loss_clip": 0.01091635, - "auxiliary_loss_mlp": 0.01036183, - "balance_loss_clip": 1.03597045, - "balance_loss_mlp": 1.02348161, - "epoch": 0.9202465053359387, - "flos": 26395815421440.0, - "grad_norm": 6.064835951868583, - "language_loss": 0.8149547, - "learning_rate": 6.630437278944501e-08, - "loss": 0.8362329, - "num_input_tokens_seen": 330152840, - "step": 15306, - "time_per_iteration": 2.6748034954071045 - }, - { - "auxiliary_loss_clip": 0.01067915, - "auxiliary_loss_mlp": 0.01030425, - "balance_loss_clip": 1.03378415, - "balance_loss_mlp": 1.01910639, - "epoch": 0.9203066285886067, - "flos": 10488179093760.0, - "grad_norm": 1.9090343566843708, - "language_loss": 0.72313774, - "learning_rate": 6.62049591293541e-08, - "loss": 0.74412113, - "num_input_tokens_seen": 330168605, - "step": 15307, - "time_per_iteration": 2.707096815109253 - }, - { - "auxiliary_loss_clip": 0.01100301, - "auxiliary_loss_mlp": 0.01030096, - "balance_loss_clip": 1.03705478, - "balance_loss_mlp": 1.01726282, - "epoch": 0.9203667518412746, - "flos": 19390721230080.0, - "grad_norm": 2.092849830568689, - "language_loss": 0.78399515, - "learning_rate": 6.610561879896526e-08, - "loss": 0.80529916, - "num_input_tokens_seen": 330186160, - "step": 15308, - "time_per_iteration": 2.606255531311035 - }, - { - "auxiliary_loss_clip": 0.01084659, - "auxiliary_loss_mlp": 0.01032923, - "balance_loss_clip": 1.03293347, - "balance_loss_mlp": 1.01967311, - "epoch": 0.9204268750939426, - "flos": 15924982596480.0, - "grad_norm": 2.276895603959481, - "language_loss": 0.77885747, - "learning_rate": 6.600635180204484e-08, - "loss": 0.80003333, - "num_input_tokens_seen": 330201780, - "step": 15309, - "time_per_iteration": 2.637420654296875 - }, - { - "auxiliary_loss_clip": 0.01054204, - "auxiliary_loss_mlp": 0.01030272, - "balance_loss_clip": 1.03081393, - "balance_loss_mlp": 1.01686096, - "epoch": 0.9204869983466105, - "flos": 16471758401280.0, - "grad_norm": 1.8296999045819267, - "language_loss": 0.66413641, - "learning_rate": 6.590715814235781e-08, - "loss": 0.68498123, - "num_input_tokens_seen": 330219165, - "step": 15310, - "time_per_iteration": 2.7335994243621826 - }, - { - "auxiliary_loss_clip": 0.01044089, - "auxiliary_loss_mlp": 0.01032237, - "balance_loss_clip": 1.03122044, - "balance_loss_mlp": 1.01953483, - "epoch": 0.9205471215992785, - "flos": 21539220307200.0, - "grad_norm": 1.6521926444868564, - "language_loss": 0.66375726, - "learning_rate": 6.580803782366495e-08, - "loss": 0.6845206, - "num_input_tokens_seen": 330238975, - "step": 15311, - "time_per_iteration": 2.8604588508605957 - }, - { - "auxiliary_loss_clip": 0.01097174, - "auxiliary_loss_mlp": 0.01034831, - "balance_loss_clip": 1.03502798, - "balance_loss_mlp": 1.02240396, - "epoch": 0.9206072448519464, - "flos": 25005892694400.0, - "grad_norm": 1.6158209988301302, - "language_loss": 0.7622931, - "learning_rate": 6.570899084972503e-08, - "loss": 0.78361315, - "num_input_tokens_seen": 330259755, - "step": 15312, - "time_per_iteration": 2.664778232574463 - }, - { - "auxiliary_loss_clip": 0.01095599, - "auxiliary_loss_mlp": 0.01038065, - "balance_loss_clip": 1.03726745, - "balance_loss_mlp": 1.02628684, - "epoch": 0.9206673681046145, - "flos": 20522661500160.0, - "grad_norm": 1.6943388606397072, - "language_loss": 0.79487884, - "learning_rate": 6.561001722429394e-08, - "loss": 0.81621552, - "num_input_tokens_seen": 330277660, - "step": 15313, - "time_per_iteration": 2.5808446407318115 - }, - { - "auxiliary_loss_clip": 0.01100191, - "auxiliary_loss_mlp": 0.01030615, - "balance_loss_clip": 1.03598011, - "balance_loss_mlp": 1.01823509, - "epoch": 0.9207274913572824, - "flos": 20883455660160.0, - "grad_norm": 2.6484489133321976, - "language_loss": 0.78395313, - "learning_rate": 6.55111169511251e-08, - "loss": 0.80526119, - "num_input_tokens_seen": 330295455, - "step": 15314, - "time_per_iteration": 2.6530680656433105 - }, - { - "auxiliary_loss_clip": 0.01093159, - "auxiliary_loss_mlp": 0.01034883, - "balance_loss_clip": 1.0372566, - "balance_loss_mlp": 1.02071548, - "epoch": 0.9207876146099504, - "flos": 22708256348160.0, - "grad_norm": 1.9276699965768014, - "language_loss": 0.79122138, - "learning_rate": 6.541229003396864e-08, - "loss": 0.81250179, - "num_input_tokens_seen": 330315310, - "step": 15315, - "time_per_iteration": 2.6690027713775635 - }, - { - "auxiliary_loss_clip": 0.01089675, - "auxiliary_loss_mlp": 0.01032551, - "balance_loss_clip": 1.03612041, - "balance_loss_mlp": 1.01993239, - "epoch": 0.9208477378626184, - "flos": 18507354053760.0, - "grad_norm": 1.761446107604308, - "language_loss": 0.75961876, - "learning_rate": 6.531353647657156e-08, - "loss": 0.78084099, - "num_input_tokens_seen": 330333260, - "step": 15316, - "time_per_iteration": 2.5912938117980957 - }, - { - "auxiliary_loss_clip": 0.01108895, - "auxiliary_loss_mlp": 0.01034715, - "balance_loss_clip": 1.03550375, - "balance_loss_mlp": 1.02175713, - "epoch": 0.9209078611152863, - "flos": 22999635475200.0, - "grad_norm": 1.6108706295980322, - "language_loss": 0.69277954, - "learning_rate": 6.521485628267931e-08, - "loss": 0.71421564, - "num_input_tokens_seen": 330352465, - "step": 15317, - "time_per_iteration": 2.5787100791931152 - }, - { - "auxiliary_loss_clip": 0.01098793, - "auxiliary_loss_mlp": 0.01031182, - "balance_loss_clip": 1.03747189, - "balance_loss_mlp": 1.01845622, - "epoch": 0.9209679843679544, - "flos": 24061514267520.0, - "grad_norm": 1.6422186031600345, - "language_loss": 0.8337481, - "learning_rate": 6.511624945603378e-08, - "loss": 0.85504782, - "num_input_tokens_seen": 330372685, - "step": 15318, - "time_per_iteration": 2.655625820159912 - }, - { - "auxiliary_loss_clip": 0.01087423, - "auxiliary_loss_mlp": 0.01030908, - "balance_loss_clip": 1.03772366, - "balance_loss_mlp": 1.01855183, - "epoch": 0.9210281076206223, - "flos": 13553370190080.0, - "grad_norm": 1.8520706427370603, - "language_loss": 0.85584986, - "learning_rate": 6.501771600037354e-08, - "loss": 0.87703317, - "num_input_tokens_seen": 330388860, - "step": 15319, - "time_per_iteration": 2.62506103515625 - }, - { - "auxiliary_loss_clip": 0.01027307, - "auxiliary_loss_mlp": 0.01001328, - "balance_loss_clip": 1.00478411, - "balance_loss_mlp": 1.0003742, - "epoch": 0.9210882308732903, - "flos": 71426289674880.0, - "grad_norm": 0.7696536988394306, - "language_loss": 0.56245381, - "learning_rate": 6.491925591943559e-08, - "loss": 0.58274013, - "num_input_tokens_seen": 330448735, - "step": 15320, - "time_per_iteration": 3.1641623973846436 - }, - { - "auxiliary_loss_clip": 0.01060714, - "auxiliary_loss_mlp": 0.01048122, - "balance_loss_clip": 1.03641021, - "balance_loss_mlp": 1.03252339, - "epoch": 0.9211483541259582, - "flos": 18509113820160.0, - "grad_norm": 3.2582738862572156, - "language_loss": 0.63959485, - "learning_rate": 6.482086921695384e-08, - "loss": 0.66068316, - "num_input_tokens_seen": 330465600, - "step": 15321, - "time_per_iteration": 2.677826404571533 - }, - { - "auxiliary_loss_clip": 0.01068249, - "auxiliary_loss_mlp": 0.01028212, - "balance_loss_clip": 1.03475666, - "balance_loss_mlp": 1.01626706, - "epoch": 0.9212084773786262, - "flos": 23258228463360.0, - "grad_norm": 1.6685688646331795, - "language_loss": 0.71651804, - "learning_rate": 6.47225558966582e-08, - "loss": 0.73748261, - "num_input_tokens_seen": 330485770, - "step": 15322, - "time_per_iteration": 2.740342855453491 - }, - { - "auxiliary_loss_clip": 0.01058964, - "auxiliary_loss_mlp": 0.01032827, - "balance_loss_clip": 1.03519404, - "balance_loss_mlp": 1.02108479, - "epoch": 0.9212686006312941, - "flos": 16289511770880.0, - "grad_norm": 1.8646866235028916, - "language_loss": 0.69607079, - "learning_rate": 6.462431596227725e-08, - "loss": 0.71698868, - "num_input_tokens_seen": 330504255, - "step": 15323, - "time_per_iteration": 4.281275987625122 - }, - { - "auxiliary_loss_clip": 0.010823, - "auxiliary_loss_mlp": 0.01039009, - "balance_loss_clip": 1.03287673, - "balance_loss_mlp": 1.02479923, - "epoch": 0.9213287238839621, - "flos": 19785773986560.0, - "grad_norm": 1.8454161764247499, - "language_loss": 0.74490941, - "learning_rate": 6.452614941753597e-08, - "loss": 0.76612252, - "num_input_tokens_seen": 330520705, - "step": 15324, - "time_per_iteration": 4.212737798690796 - }, - { - "auxiliary_loss_clip": 0.01099326, - "auxiliary_loss_mlp": 0.01041809, - "balance_loss_clip": 1.0375061, - "balance_loss_mlp": 1.02970934, - "epoch": 0.92138884713663, - "flos": 21030402199680.0, - "grad_norm": 1.8170423555389452, - "language_loss": 0.71340334, - "learning_rate": 6.442805626615744e-08, - "loss": 0.73481476, - "num_input_tokens_seen": 330539245, - "step": 15325, - "time_per_iteration": 4.3058435916900635 - }, - { - "auxiliary_loss_clip": 0.01081418, - "auxiliary_loss_mlp": 0.01031747, - "balance_loss_clip": 1.03530788, - "balance_loss_mlp": 1.0195992, - "epoch": 0.9214489703892981, - "flos": 28587264186240.0, - "grad_norm": 1.763186738417038, - "language_loss": 0.78558946, - "learning_rate": 6.433003651186109e-08, - "loss": 0.80672109, - "num_input_tokens_seen": 330561815, - "step": 15326, - "time_per_iteration": 2.703559160232544 - }, - { - "auxiliary_loss_clip": 0.01101844, - "auxiliary_loss_mlp": 0.01033367, - "balance_loss_clip": 1.03805542, - "balance_loss_mlp": 1.02046824, - "epoch": 0.921509093641966, - "flos": 16361476669440.0, - "grad_norm": 2.8495287751902754, - "language_loss": 0.71737856, - "learning_rate": 6.42320901583635e-08, - "loss": 0.73873067, - "num_input_tokens_seen": 330579760, - "step": 15327, - "time_per_iteration": 4.265162706375122 - }, - { - "auxiliary_loss_clip": 0.01101192, - "auxiliary_loss_mlp": 0.0104188, - "balance_loss_clip": 1.03807735, - "balance_loss_mlp": 1.02861834, - "epoch": 0.921569216894634, - "flos": 26830837036800.0, - "grad_norm": 1.806043843779226, - "language_loss": 0.77786517, - "learning_rate": 6.413421720937906e-08, - "loss": 0.79929584, - "num_input_tokens_seen": 330598545, - "step": 15328, - "time_per_iteration": 2.7142398357391357 - }, - { - "auxiliary_loss_clip": 0.01088664, - "auxiliary_loss_mlp": 0.01032833, - "balance_loss_clip": 1.03698349, - "balance_loss_mlp": 1.02065539, - "epoch": 0.921629340147302, - "flos": 24645134448000.0, - "grad_norm": 3.3382204523213455, - "language_loss": 0.71625078, - "learning_rate": 6.4036417668619e-08, - "loss": 0.73746574, - "num_input_tokens_seen": 330616700, - "step": 15329, - "time_per_iteration": 2.8545138835906982 - }, - { - "auxiliary_loss_clip": 0.01095503, - "auxiliary_loss_mlp": 0.01028111, - "balance_loss_clip": 1.03497839, - "balance_loss_mlp": 1.01688147, - "epoch": 0.9216894633999699, - "flos": 15086504442240.0, - "grad_norm": 2.2459771654219067, - "language_loss": 0.86542726, - "learning_rate": 6.393869153979192e-08, - "loss": 0.88666344, - "num_input_tokens_seen": 330633355, - "step": 15330, - "time_per_iteration": 2.5924322605133057 - }, - { - "auxiliary_loss_clip": 0.01074582, - "auxiliary_loss_mlp": 0.01031838, - "balance_loss_clip": 1.03277349, - "balance_loss_mlp": 1.0190115, - "epoch": 0.921749586652638, - "flos": 19204524103680.0, - "grad_norm": 2.041503026001501, - "language_loss": 0.75815696, - "learning_rate": 6.384103882660397e-08, - "loss": 0.77922112, - "num_input_tokens_seen": 330651470, - "step": 15331, - "time_per_iteration": 2.6607861518859863 - }, - { - "auxiliary_loss_clip": 0.01096924, - "auxiliary_loss_mlp": 0.01029082, - "balance_loss_clip": 1.03500032, - "balance_loss_mlp": 1.01668429, - "epoch": 0.9218097099053059, - "flos": 20522446018560.0, - "grad_norm": 1.901322595674086, - "language_loss": 0.75386262, - "learning_rate": 6.374345953275794e-08, - "loss": 0.7751227, - "num_input_tokens_seen": 330669170, - "step": 15332, - "time_per_iteration": 2.682168483734131 - }, - { - "auxiliary_loss_clip": 0.01055682, - "auxiliary_loss_mlp": 0.01030851, - "balance_loss_clip": 1.03246427, - "balance_loss_mlp": 1.0191865, - "epoch": 0.9218698331579739, - "flos": 17348625216000.0, - "grad_norm": 1.7775108010679808, - "language_loss": 0.74603796, - "learning_rate": 6.364595366195358e-08, - "loss": 0.76690328, - "num_input_tokens_seen": 330686635, - "step": 15333, - "time_per_iteration": 2.7291512489318848 - }, - { - "auxiliary_loss_clip": 0.01017268, - "auxiliary_loss_mlp": 0.01001133, - "balance_loss_clip": 1.00694776, - "balance_loss_mlp": 1.00006628, - "epoch": 0.9219299564106418, - "flos": 61958332575360.0, - "grad_norm": 0.8092949717587729, - "language_loss": 0.52865499, - "learning_rate": 6.354852121788879e-08, - "loss": 0.54883903, - "num_input_tokens_seen": 330749160, - "step": 15334, - "time_per_iteration": 3.11421275138855 - }, - { - "auxiliary_loss_clip": 0.01080248, - "auxiliary_loss_mlp": 0.01032803, - "balance_loss_clip": 1.03543484, - "balance_loss_mlp": 1.02087057, - "epoch": 0.9219900796633098, - "flos": 15701761526400.0, - "grad_norm": 1.9555030178553923, - "language_loss": 0.62425917, - "learning_rate": 6.345116220425839e-08, - "loss": 0.64538974, - "num_input_tokens_seen": 330766840, - "step": 15335, - "time_per_iteration": 2.64497971534729 - }, - { - "auxiliary_loss_clip": 0.01055617, - "auxiliary_loss_mlp": 0.0103028, - "balance_loss_clip": 1.03126609, - "balance_loss_mlp": 1.01756644, - "epoch": 0.9220502029159777, - "flos": 24932670819840.0, - "grad_norm": 1.6552756447627857, - "language_loss": 0.71621144, - "learning_rate": 6.335387662475366e-08, - "loss": 0.73707038, - "num_input_tokens_seen": 330785585, - "step": 15336, - "time_per_iteration": 2.7646801471710205 - }, - { - "auxiliary_loss_clip": 0.01083887, - "auxiliary_loss_mlp": 0.01032421, - "balance_loss_clip": 1.03509367, - "balance_loss_mlp": 1.02121532, - "epoch": 0.9221103261686457, - "flos": 15667215621120.0, - "grad_norm": 1.8250492219467316, - "language_loss": 0.71701425, - "learning_rate": 6.325666448306433e-08, - "loss": 0.7381773, - "num_input_tokens_seen": 330800750, - "step": 15337, - "time_per_iteration": 2.6583242416381836 - }, - { - "auxiliary_loss_clip": 0.01020516, - "auxiliary_loss_mlp": 0.01000329, - "balance_loss_clip": 1.00723362, - "balance_loss_mlp": 0.99938756, - "epoch": 0.9221704494213137, - "flos": 67516299630720.0, - "grad_norm": 0.8846440580369678, - "language_loss": 0.65341711, - "learning_rate": 6.31595257828763e-08, - "loss": 0.67362559, - "num_input_tokens_seen": 330863640, - "step": 15338, - "time_per_iteration": 3.1719980239868164 - }, - { - "auxiliary_loss_clip": 0.01101462, - "auxiliary_loss_mlp": 0.01033959, - "balance_loss_clip": 1.0384047, - "balance_loss_mlp": 1.02131093, - "epoch": 0.9222305726739817, - "flos": 30226945155840.0, - "grad_norm": 1.9711725803511775, - "language_loss": 0.66986012, - "learning_rate": 6.306246052787289e-08, - "loss": 0.69121432, - "num_input_tokens_seen": 330884675, - "step": 15339, - "time_per_iteration": 2.7261481285095215 - }, - { - "auxiliary_loss_clip": 0.01109081, - "auxiliary_loss_mlp": 0.01029578, - "balance_loss_clip": 1.03689742, - "balance_loss_mlp": 1.01729918, - "epoch": 0.9222906959266496, - "flos": 25337204766720.0, - "grad_norm": 2.2637051134502015, - "language_loss": 0.71722078, - "learning_rate": 6.296546872173513e-08, - "loss": 0.73860735, - "num_input_tokens_seen": 330904125, - "step": 15340, - "time_per_iteration": 2.571516275405884 - }, - { - "auxiliary_loss_clip": 0.01074794, - "auxiliary_loss_mlp": 0.01031494, - "balance_loss_clip": 1.03479934, - "balance_loss_mlp": 1.01920938, - "epoch": 0.9223508191793176, - "flos": 27599864244480.0, - "grad_norm": 1.6254811741615818, - "language_loss": 0.70379698, - "learning_rate": 6.286855036814098e-08, - "loss": 0.72485995, - "num_input_tokens_seen": 330925140, - "step": 15341, - "time_per_iteration": 2.8622758388519287 - }, - { - "auxiliary_loss_clip": 0.01056229, - "auxiliary_loss_mlp": 0.01028103, - "balance_loss_clip": 1.03552556, - "balance_loss_mlp": 1.01709414, - "epoch": 0.9224109424319856, - "flos": 27307587277440.0, - "grad_norm": 1.6316629656961243, - "language_loss": 0.67473853, - "learning_rate": 6.277170547076571e-08, - "loss": 0.69558185, - "num_input_tokens_seen": 330946625, - "step": 15342, - "time_per_iteration": 2.9130048751831055 - }, - { - "auxiliary_loss_clip": 0.01059826, - "auxiliary_loss_mlp": 0.01031139, - "balance_loss_clip": 1.03590834, - "balance_loss_mlp": 1.01951027, - "epoch": 0.9224710656846535, - "flos": 48208314401280.0, - "grad_norm": 2.4312862834547175, - "language_loss": 0.6953969, - "learning_rate": 6.26749340332815e-08, - "loss": 0.71630651, - "num_input_tokens_seen": 330967795, - "step": 15343, - "time_per_iteration": 3.0083987712860107 - }, - { - "auxiliary_loss_clip": 0.01011696, - "auxiliary_loss_mlp": 0.0100494, - "balance_loss_clip": 1.008178, - "balance_loss_mlp": 1.00394428, - "epoch": 0.9225311889373216, - "flos": 66722171794560.0, - "grad_norm": 0.7265525500100153, - "language_loss": 0.51983988, - "learning_rate": 6.257823605935786e-08, - "loss": 0.54000616, - "num_input_tokens_seen": 331040850, - "step": 15344, - "time_per_iteration": 3.4099650382995605 - }, - { - "auxiliary_loss_clip": 0.01104022, - "auxiliary_loss_mlp": 0.01032501, - "balance_loss_clip": 1.03630853, - "balance_loss_mlp": 1.0211345, - "epoch": 0.9225913121899895, - "flos": 22271295398400.0, - "grad_norm": 1.703377825859211, - "language_loss": 0.70327353, - "learning_rate": 6.248161155266162e-08, - "loss": 0.7246387, - "num_input_tokens_seen": 331060595, - "step": 15345, - "time_per_iteration": 2.576371431350708 - }, - { - "auxiliary_loss_clip": 0.01087623, - "auxiliary_loss_mlp": 0.01037657, - "balance_loss_clip": 1.03598809, - "balance_loss_mlp": 1.02505088, - "epoch": 0.9226514354426575, - "flos": 20082719721600.0, - "grad_norm": 2.157686246893833, - "language_loss": 0.77242136, - "learning_rate": 6.238506051685677e-08, - "loss": 0.79367411, - "num_input_tokens_seen": 331080195, - "step": 15346, - "time_per_iteration": 2.6608493328094482 - }, - { - "auxiliary_loss_clip": 0.01089778, - "auxiliary_loss_mlp": 0.01037854, - "balance_loss_clip": 1.03755546, - "balance_loss_mlp": 1.02469873, - "epoch": 0.9227115586953254, - "flos": 16070851728000.0, - "grad_norm": 1.7988632334787429, - "language_loss": 0.76320672, - "learning_rate": 6.228858295560457e-08, - "loss": 0.78448308, - "num_input_tokens_seen": 331097645, - "step": 15347, - "time_per_iteration": 2.6887784004211426 - }, - { - "auxiliary_loss_clip": 0.01095866, - "auxiliary_loss_mlp": 0.01030722, - "balance_loss_clip": 1.03849506, - "balance_loss_mlp": 1.01933718, - "epoch": 0.9227716819479934, - "flos": 20446027833600.0, - "grad_norm": 1.7976243281525446, - "language_loss": 0.76849055, - "learning_rate": 6.219217887256367e-08, - "loss": 0.78975642, - "num_input_tokens_seen": 331116830, - "step": 15348, - "time_per_iteration": 2.6028568744659424 - }, - { - "auxiliary_loss_clip": 0.01087325, - "auxiliary_loss_mlp": 0.01033594, - "balance_loss_clip": 1.03495049, - "balance_loss_mlp": 1.02063584, - "epoch": 0.9228318052006613, - "flos": 25007401065600.0, - "grad_norm": 1.9643980003377204, - "language_loss": 0.67811698, - "learning_rate": 6.209584827138959e-08, - "loss": 0.69932616, - "num_input_tokens_seen": 331137235, - "step": 15349, - "time_per_iteration": 2.6671433448791504 - }, - { - "auxiliary_loss_clip": 0.01067448, - "auxiliary_loss_mlp": 0.01030854, - "balance_loss_clip": 1.03284109, - "balance_loss_mlp": 1.01793194, - "epoch": 0.9228919284533293, - "flos": 12677257560960.0, - "grad_norm": 5.153703084259653, - "language_loss": 0.86846638, - "learning_rate": 6.199959115573495e-08, - "loss": 0.88944942, - "num_input_tokens_seen": 331153155, - "step": 15350, - "time_per_iteration": 2.703225612640381 - }, - { - "auxiliary_loss_clip": 0.01009812, - "auxiliary_loss_mlp": 0.01000808, - "balance_loss_clip": 1.00661051, - "balance_loss_mlp": 0.9998011, - "epoch": 0.9229520517059973, - "flos": 69986162712960.0, - "grad_norm": 0.7762360061430656, - "language_loss": 0.60365206, - "learning_rate": 6.190340752924994e-08, - "loss": 0.62375826, - "num_input_tokens_seen": 331214895, - "step": 15351, - "time_per_iteration": 3.158869504928589 - }, - { - "auxiliary_loss_clip": 0.01083781, - "auxiliary_loss_mlp": 0.01026683, - "balance_loss_clip": 1.03323722, - "balance_loss_mlp": 1.01475024, - "epoch": 0.9230121749586653, - "flos": 14793832425600.0, - "grad_norm": 1.832472265730707, - "language_loss": 0.77387846, - "learning_rate": 6.180729739558233e-08, - "loss": 0.79498303, - "num_input_tokens_seen": 331232185, - "step": 15352, - "time_per_iteration": 2.627760171890259 - }, - { - "auxiliary_loss_clip": 0.0107378, - "auxiliary_loss_mlp": 0.01042996, - "balance_loss_clip": 1.03285944, - "balance_loss_mlp": 1.0284164, - "epoch": 0.9230722982113332, - "flos": 22967208472320.0, - "grad_norm": 1.8679415758316251, - "language_loss": 0.59430194, - "learning_rate": 6.171126075837585e-08, - "loss": 0.61546969, - "num_input_tokens_seen": 331251065, - "step": 15353, - "time_per_iteration": 2.7041702270507812 - }, - { - "auxiliary_loss_clip": 0.01083679, - "auxiliary_loss_mlp": 0.01026903, - "balance_loss_clip": 1.034688, - "balance_loss_mlp": 1.01505327, - "epoch": 0.9231324214640012, - "flos": 18551452976640.0, - "grad_norm": 1.711390419205093, - "language_loss": 0.7429471, - "learning_rate": 6.161529762127293e-08, - "loss": 0.76405293, - "num_input_tokens_seen": 331269110, - "step": 15354, - "time_per_iteration": 2.6137607097625732 - }, - { - "auxiliary_loss_clip": 0.01112951, - "auxiliary_loss_mlp": 0.01036765, - "balance_loss_clip": 1.03797793, - "balance_loss_mlp": 1.02363443, - "epoch": 0.9231925447166691, - "flos": 22082727974400.0, - "grad_norm": 2.2506024709408345, - "language_loss": 0.64660299, - "learning_rate": 6.1519407987912e-08, - "loss": 0.66810012, - "num_input_tokens_seen": 331286555, - "step": 15355, - "time_per_iteration": 2.562422275543213 - }, - { - "auxiliary_loss_clip": 0.0108125, - "auxiliary_loss_mlp": 0.01041248, - "balance_loss_clip": 1.03451049, - "balance_loss_mlp": 1.02839768, - "epoch": 0.9232526679693371, - "flos": 26541145848960.0, - "grad_norm": 1.5394884585282018, - "language_loss": 0.7420373, - "learning_rate": 6.142359186192947e-08, - "loss": 0.76326227, - "num_input_tokens_seen": 331307660, - "step": 15356, - "time_per_iteration": 2.6385319232940674 - }, - { - "auxiliary_loss_clip": 0.01084284, - "auxiliary_loss_mlp": 0.01035835, - "balance_loss_clip": 1.03417146, - "balance_loss_mlp": 1.02270436, - "epoch": 0.9233127912220052, - "flos": 14756664827520.0, - "grad_norm": 1.6931372093662804, - "language_loss": 0.60944784, - "learning_rate": 6.132784924695844e-08, - "loss": 0.63064903, - "num_input_tokens_seen": 331324885, - "step": 15357, - "time_per_iteration": 2.6290340423583984 - }, - { - "auxiliary_loss_clip": 0.01082317, - "auxiliary_loss_mlp": 0.01033403, - "balance_loss_clip": 1.03603005, - "balance_loss_mlp": 1.01992083, - "epoch": 0.9233729144746731, - "flos": 25261792162560.0, - "grad_norm": 1.453584491070713, - "language_loss": 0.70108932, - "learning_rate": 6.123218014662956e-08, - "loss": 0.72224653, - "num_input_tokens_seen": 331345885, - "step": 15358, - "time_per_iteration": 2.752317190170288 - }, - { - "auxiliary_loss_clip": 0.01108354, - "auxiliary_loss_mlp": 0.01033314, - "balance_loss_clip": 1.03619421, - "balance_loss_mlp": 1.02111292, - "epoch": 0.9234330377273411, - "flos": 27849837968640.0, - "grad_norm": 2.186505896470512, - "language_loss": 0.73299533, - "learning_rate": 6.113658456457104e-08, - "loss": 0.754412, - "num_input_tokens_seen": 331364320, - "step": 15359, - "time_per_iteration": 2.597811460494995 - }, - { - "auxiliary_loss_clip": 0.01047199, - "auxiliary_loss_mlp": 0.01033574, - "balance_loss_clip": 1.03515124, - "balance_loss_mlp": 1.02113438, - "epoch": 0.923493160980009, - "flos": 24608361899520.0, - "grad_norm": 1.822606379106128, - "language_loss": 0.64573818, - "learning_rate": 6.104106250440732e-08, - "loss": 0.66654599, - "num_input_tokens_seen": 331384135, - "step": 15360, - "time_per_iteration": 2.8328487873077393 - }, - { - "auxiliary_loss_clip": 0.01017958, - "auxiliary_loss_mlp": 0.00751388, - "balance_loss_clip": 1.00556254, - "balance_loss_mlp": 0.99968225, - "epoch": 0.923553284232677, - "flos": 67700916558720.0, - "grad_norm": 0.7601562180978135, - "language_loss": 0.5516786, - "learning_rate": 6.094561396976083e-08, - "loss": 0.56937212, - "num_input_tokens_seen": 331440645, - "step": 15361, - "time_per_iteration": 3.0788414478302 - }, - { - "auxiliary_loss_clip": 0.01075936, - "auxiliary_loss_mlp": 0.01031297, - "balance_loss_clip": 1.03308797, - "balance_loss_mlp": 1.01755273, - "epoch": 0.9236134074853449, - "flos": 18807244704000.0, - "grad_norm": 1.8816264544050445, - "language_loss": 0.69994414, - "learning_rate": 6.085023896425112e-08, - "loss": 0.72101647, - "num_input_tokens_seen": 331459580, - "step": 15362, - "time_per_iteration": 4.193416118621826 - }, - { - "auxiliary_loss_clip": 0.01094932, - "auxiliary_loss_mlp": 0.0103223, - "balance_loss_clip": 1.0347358, - "balance_loss_mlp": 1.01748371, - "epoch": 0.923673530738013, - "flos": 27782362270080.0, - "grad_norm": 1.488556622948845, - "language_loss": 0.75529814, - "learning_rate": 6.075493749149463e-08, - "loss": 0.77656972, - "num_input_tokens_seen": 331481560, - "step": 15363, - "time_per_iteration": 2.6561429500579834 - }, - { - "auxiliary_loss_clip": 0.01109631, - "auxiliary_loss_mlp": 0.01029622, - "balance_loss_clip": 1.03737402, - "balance_loss_mlp": 1.01739717, - "epoch": 0.9237336539906809, - "flos": 26797117144320.0, - "grad_norm": 1.930031039204044, - "language_loss": 0.82993495, - "learning_rate": 6.065970955510514e-08, - "loss": 0.85132754, - "num_input_tokens_seen": 331499090, - "step": 15364, - "time_per_iteration": 5.842444181442261 - }, - { - "auxiliary_loss_clip": 0.01074668, - "auxiliary_loss_mlp": 0.0102538, - "balance_loss_clip": 1.03544402, - "balance_loss_mlp": 1.01388252, - "epoch": 0.9237937772433489, - "flos": 23587708942080.0, - "grad_norm": 1.4281985355444542, - "language_loss": 0.67964804, - "learning_rate": 6.056455515869419e-08, - "loss": 0.70064855, - "num_input_tokens_seen": 331519420, - "step": 15365, - "time_per_iteration": 2.7319743633270264 - }, - { - "auxiliary_loss_clip": 0.01109561, - "auxiliary_loss_mlp": 0.01030475, - "balance_loss_clip": 1.03753567, - "balance_loss_mlp": 1.01805329, - "epoch": 0.9238539004960168, - "flos": 26140562398080.0, - "grad_norm": 2.080228129033925, - "language_loss": 0.62466252, - "learning_rate": 6.046947430586913e-08, - "loss": 0.64606285, - "num_input_tokens_seen": 331538720, - "step": 15366, - "time_per_iteration": 4.141804456710815 - }, - { - "auxiliary_loss_clip": 0.01076799, - "auxiliary_loss_mlp": 0.01028677, - "balance_loss_clip": 1.03669524, - "balance_loss_mlp": 1.01587987, - "epoch": 0.9239140237486848, - "flos": 21068000760960.0, - "grad_norm": 1.4710054055259818, - "language_loss": 0.74650168, - "learning_rate": 6.037446700023619e-08, - "loss": 0.76755643, - "num_input_tokens_seen": 331558505, - "step": 15367, - "time_per_iteration": 2.6937508583068848 - }, - { - "auxiliary_loss_clip": 0.01083975, - "auxiliary_loss_mlp": 0.00768707, - "balance_loss_clip": 1.03666592, - "balance_loss_mlp": 1.0002172, - "epoch": 0.9239741470013527, - "flos": 24607930936320.0, - "grad_norm": 2.0965464238234857, - "language_loss": 0.65042406, - "learning_rate": 6.027953324539759e-08, - "loss": 0.66895086, - "num_input_tokens_seen": 331578440, - "step": 15368, - "time_per_iteration": 2.7437262535095215 - }, - { - "auxiliary_loss_clip": 0.01101382, - "auxiliary_loss_mlp": 0.01034709, - "balance_loss_clip": 1.03610659, - "balance_loss_mlp": 1.02171481, - "epoch": 0.9240342702540207, - "flos": 24718248581760.0, - "grad_norm": 1.7086931963123835, - "language_loss": 0.74773824, - "learning_rate": 6.018467304495401e-08, - "loss": 0.76909912, - "num_input_tokens_seen": 331598945, - "step": 15369, - "time_per_iteration": 2.6743035316467285 - }, - { - "auxiliary_loss_clip": 0.01104923, - "auxiliary_loss_mlp": 0.01037418, - "balance_loss_clip": 1.04013598, - "balance_loss_mlp": 1.02334499, - "epoch": 0.9240943935066888, - "flos": 20849987162880.0, - "grad_norm": 2.038986001331123, - "language_loss": 0.76338404, - "learning_rate": 6.008988640250145e-08, - "loss": 0.78480744, - "num_input_tokens_seen": 331616700, - "step": 15370, - "time_per_iteration": 2.607760429382324 - }, - { - "auxiliary_loss_clip": 0.01109143, - "auxiliary_loss_mlp": 0.01031875, - "balance_loss_clip": 1.03663468, - "balance_loss_mlp": 1.0196085, - "epoch": 0.9241545167593567, - "flos": 24462313200000.0, - "grad_norm": 2.4044373841458495, - "language_loss": 0.66958445, - "learning_rate": 5.999517332163528e-08, - "loss": 0.69099462, - "num_input_tokens_seen": 331635625, - "step": 15371, - "time_per_iteration": 2.6164920330047607 - }, - { - "auxiliary_loss_clip": 0.01011382, - "auxiliary_loss_mlp": 0.01002865, - "balance_loss_clip": 1.00752091, - "balance_loss_mlp": 1.00176203, - "epoch": 0.9242146400120247, - "flos": 61827259847040.0, - "grad_norm": 0.7346104185663305, - "language_loss": 0.57681966, - "learning_rate": 5.99005338059464e-08, - "loss": 0.59696221, - "num_input_tokens_seen": 331698595, - "step": 15372, - "time_per_iteration": 3.1782453060150146 - }, - { - "auxiliary_loss_clip": 0.01108932, - "auxiliary_loss_mlp": 0.01030295, - "balance_loss_clip": 1.03912938, - "balance_loss_mlp": 1.01884508, - "epoch": 0.9242747632646926, - "flos": 22048397550720.0, - "grad_norm": 2.204946411354503, - "language_loss": 0.70037317, - "learning_rate": 5.98059678590237e-08, - "loss": 0.72176552, - "num_input_tokens_seen": 331717975, - "step": 15373, - "time_per_iteration": 2.5716300010681152 - }, - { - "auxiliary_loss_clip": 0.01093668, - "auxiliary_loss_mlp": 0.01037655, - "balance_loss_clip": 1.03443408, - "balance_loss_mlp": 1.02547812, - "epoch": 0.9243348865173606, - "flos": 18478338842880.0, - "grad_norm": 2.311286050873559, - "language_loss": 0.75668836, - "learning_rate": 5.971147548445299e-08, - "loss": 0.77800161, - "num_input_tokens_seen": 331737220, - "step": 15374, - "time_per_iteration": 2.6773972511291504 - }, - { - "auxiliary_loss_clip": 0.01071113, - "auxiliary_loss_mlp": 0.01034847, - "balance_loss_clip": 1.03411102, - "balance_loss_mlp": 1.02240705, - "epoch": 0.9243950097700285, - "flos": 23258767167360.0, - "grad_norm": 1.61997040297718, - "language_loss": 0.64933169, - "learning_rate": 5.961705668581784e-08, - "loss": 0.67039132, - "num_input_tokens_seen": 331757300, - "step": 15375, - "time_per_iteration": 2.724712371826172 - }, - { - "auxiliary_loss_clip": 0.01080494, - "auxiliary_loss_mlp": 0.01034023, - "balance_loss_clip": 1.03776979, - "balance_loss_mlp": 1.02213168, - "epoch": 0.9244551330226966, - "flos": 29749081593600.0, - "grad_norm": 1.73726236759537, - "language_loss": 0.66592222, - "learning_rate": 5.952271146669829e-08, - "loss": 0.68706739, - "num_input_tokens_seen": 331776995, - "step": 15376, - "time_per_iteration": 2.7318432331085205 - }, - { - "auxiliary_loss_clip": 0.01027325, - "auxiliary_loss_mlp": 0.01000699, - "balance_loss_clip": 1.004807, - "balance_loss_mlp": 0.99974555, - "epoch": 0.9245152562753645, - "flos": 68864960609280.0, - "grad_norm": 0.6503040166791668, - "language_loss": 0.61148441, - "learning_rate": 5.94284398306717e-08, - "loss": 0.63176465, - "num_input_tokens_seen": 331845015, - "step": 15377, - "time_per_iteration": 3.3028318881988525 - }, - { - "auxiliary_loss_clip": 0.01066627, - "auxiliary_loss_mlp": 0.01036109, - "balance_loss_clip": 1.03230667, - "balance_loss_mlp": 1.02378881, - "epoch": 0.9245753795280325, - "flos": 21579260993280.0, - "grad_norm": 1.8441759941724984, - "language_loss": 0.74442959, - "learning_rate": 5.933424178131341e-08, - "loss": 0.76545691, - "num_input_tokens_seen": 331862795, - "step": 15378, - "time_per_iteration": 2.7058117389678955 - }, - { - "auxiliary_loss_clip": 0.01111214, - "auxiliary_loss_mlp": 0.0103357, - "balance_loss_clip": 1.03782296, - "balance_loss_mlp": 1.02046919, - "epoch": 0.9246355027807004, - "flos": 34496077334400.0, - "grad_norm": 2.4430826807179122, - "language_loss": 0.62603706, - "learning_rate": 5.924011732219503e-08, - "loss": 0.6474849, - "num_input_tokens_seen": 331882535, - "step": 15379, - "time_per_iteration": 2.672102928161621 - }, - { - "auxiliary_loss_clip": 0.01027241, - "auxiliary_loss_mlp": 0.01034407, - "balance_loss_clip": 1.03090858, - "balance_loss_mlp": 1.02008975, - "epoch": 0.9246956260333684, - "flos": 15953854152960.0, - "grad_norm": 2.0533190004869133, - "language_loss": 0.83975178, - "learning_rate": 5.914606645688591e-08, - "loss": 0.86036825, - "num_input_tokens_seen": 331899335, - "step": 15380, - "time_per_iteration": 2.8909592628479004 - }, - { - "auxiliary_loss_clip": 0.01110328, - "auxiliary_loss_mlp": 0.01034939, - "balance_loss_clip": 1.03606331, - "balance_loss_mlp": 1.02165866, - "epoch": 0.9247557492860363, - "flos": 23368366540800.0, - "grad_norm": 1.485445137788739, - "language_loss": 0.73372233, - "learning_rate": 5.905208918895233e-08, - "loss": 0.75517505, - "num_input_tokens_seen": 331919030, - "step": 15381, - "time_per_iteration": 2.6360280513763428 - }, - { - "auxiliary_loss_clip": 0.01093808, - "auxiliary_loss_mlp": 0.01032177, - "balance_loss_clip": 1.03822911, - "balance_loss_mlp": 1.01991057, - "epoch": 0.9248158725387043, - "flos": 23039855729280.0, - "grad_norm": 1.7916241982506211, - "language_loss": 0.78368294, - "learning_rate": 5.8958185521958524e-08, - "loss": 0.80494279, - "num_input_tokens_seen": 331936465, - "step": 15382, - "time_per_iteration": 2.6322009563446045 - }, - { - "auxiliary_loss_clip": 0.01085867, - "auxiliary_loss_mlp": 0.01035748, - "balance_loss_clip": 1.03582263, - "balance_loss_mlp": 1.02334988, - "epoch": 0.9248759957913724, - "flos": 22522418357760.0, - "grad_norm": 1.7508606986515263, - "language_loss": 0.75239515, - "learning_rate": 5.886435545946455e-08, - "loss": 0.77361131, - "num_input_tokens_seen": 331954625, - "step": 15383, - "time_per_iteration": 2.6861977577209473 - }, - { - "auxiliary_loss_clip": 0.01084507, - "auxiliary_loss_mlp": 0.01026683, - "balance_loss_clip": 1.03370142, - "balance_loss_mlp": 1.0149585, - "epoch": 0.9249361190440403, - "flos": 25447271016960.0, - "grad_norm": 1.6935679354612814, - "language_loss": 0.75976408, - "learning_rate": 5.8770599005028456e-08, - "loss": 0.78087592, - "num_input_tokens_seen": 331975865, - "step": 15384, - "time_per_iteration": 2.7150700092315674 - }, - { - "auxiliary_loss_clip": 0.01075864, - "auxiliary_loss_mlp": 0.01031764, - "balance_loss_clip": 1.03653836, - "balance_loss_mlp": 1.01944935, - "epoch": 0.9249962422967083, - "flos": 12378623886720.0, - "grad_norm": 3.7142425878175223, - "language_loss": 0.66128278, - "learning_rate": 5.8676916162206045e-08, - "loss": 0.68235904, - "num_input_tokens_seen": 331992760, - "step": 15385, - "time_per_iteration": 2.7027785778045654 - }, - { - "auxiliary_loss_clip": 0.01106721, - "auxiliary_loss_mlp": 0.01033952, - "balance_loss_clip": 1.03535783, - "balance_loss_mlp": 1.02156663, - "epoch": 0.9250563655493762, - "flos": 22929430343040.0, - "grad_norm": 1.9713792057532418, - "language_loss": 0.81076729, - "learning_rate": 5.85833069345496e-08, - "loss": 0.83217394, - "num_input_tokens_seen": 332011890, - "step": 15386, - "time_per_iteration": 2.5687849521636963 - }, - { - "auxiliary_loss_clip": 0.01094924, - "auxiliary_loss_mlp": 0.0103698, - "balance_loss_clip": 1.03508687, - "balance_loss_mlp": 1.02440369, - "epoch": 0.9251164888020442, - "flos": 18478662065280.0, - "grad_norm": 1.617640933108123, - "language_loss": 0.75817406, - "learning_rate": 5.8489771325608504e-08, - "loss": 0.77949309, - "num_input_tokens_seen": 332029485, - "step": 15387, - "time_per_iteration": 2.6368582248687744 - }, - { - "auxiliary_loss_clip": 0.0109213, - "auxiliary_loss_mlp": 0.01034821, - "balance_loss_clip": 1.03527617, - "balance_loss_mlp": 1.02329326, - "epoch": 0.9251766120547121, - "flos": 33037062796800.0, - "grad_norm": 1.4014991534530432, - "language_loss": 0.700683, - "learning_rate": 5.839630933893014e-08, - "loss": 0.72195256, - "num_input_tokens_seen": 332052970, - "step": 15388, - "time_per_iteration": 2.754608392715454 - }, - { - "auxiliary_loss_clip": 0.01097095, - "auxiliary_loss_mlp": 0.01030709, - "balance_loss_clip": 1.03522015, - "balance_loss_mlp": 1.01837683, - "epoch": 0.9252367353073802, - "flos": 24387906176640.0, - "grad_norm": 2.101057031092642, - "language_loss": 0.82379329, - "learning_rate": 5.8302920978058115e-08, - "loss": 0.84507132, - "num_input_tokens_seen": 332070395, - "step": 15389, - "time_per_iteration": 2.6602766513824463 - }, - { - "auxiliary_loss_clip": 0.01104924, - "auxiliary_loss_mlp": 0.01031779, - "balance_loss_clip": 1.03799844, - "balance_loss_mlp": 1.01822543, - "epoch": 0.9252968585600481, - "flos": 18916844077440.0, - "grad_norm": 1.6739329388921937, - "language_loss": 0.79294932, - "learning_rate": 5.820960624653381e-08, - "loss": 0.81431639, - "num_input_tokens_seen": 332090185, - "step": 15390, - "time_per_iteration": 2.623624563217163 - }, - { - "auxiliary_loss_clip": 0.01076005, - "auxiliary_loss_mlp": 0.01039788, - "balance_loss_clip": 1.03474212, - "balance_loss_mlp": 1.02668691, - "epoch": 0.9253569818127161, - "flos": 21725345606400.0, - "grad_norm": 1.640565709280766, - "language_loss": 0.75278354, - "learning_rate": 5.811636514789597e-08, - "loss": 0.77394152, - "num_input_tokens_seen": 332109050, - "step": 15391, - "time_per_iteration": 2.6627962589263916 - }, - { - "auxiliary_loss_clip": 0.0108717, - "auxiliary_loss_mlp": 0.01033982, - "balance_loss_clip": 1.03444612, - "balance_loss_mlp": 1.02011776, - "epoch": 0.925417105065384, - "flos": 34240357434240.0, - "grad_norm": 2.199534306893137, - "language_loss": 0.52898717, - "learning_rate": 5.80231976856802e-08, - "loss": 0.55019867, - "num_input_tokens_seen": 332131180, - "step": 15392, - "time_per_iteration": 2.780339479446411 - }, - { - "auxiliary_loss_clip": 0.01106895, - "auxiliary_loss_mlp": 0.01032462, - "balance_loss_clip": 1.03467309, - "balance_loss_mlp": 1.02046967, - "epoch": 0.925477228318052, - "flos": 25959536830080.0, - "grad_norm": 2.305545825428345, - "language_loss": 0.77058631, - "learning_rate": 5.7930103863419454e-08, - "loss": 0.79197991, - "num_input_tokens_seen": 332149555, - "step": 15393, - "time_per_iteration": 2.602755069732666 - }, - { - "auxiliary_loss_clip": 0.01078205, - "auxiliary_loss_mlp": 0.01032904, - "balance_loss_clip": 1.03438604, - "balance_loss_mlp": 1.02044106, - "epoch": 0.9255373515707199, - "flos": 11838240702720.0, - "grad_norm": 3.40082651052626, - "language_loss": 0.69679272, - "learning_rate": 5.783708368464357e-08, - "loss": 0.71790373, - "num_input_tokens_seen": 332165830, - "step": 15394, - "time_per_iteration": 2.6185452938079834 - }, - { - "auxiliary_loss_clip": 0.0110989, - "auxiliary_loss_mlp": 0.01030861, - "balance_loss_clip": 1.0379678, - "balance_loss_mlp": 1.01829052, - "epoch": 0.925597474823388, - "flos": 21434325615360.0, - "grad_norm": 1.7473180632451621, - "language_loss": 0.72795445, - "learning_rate": 5.7744137152879956e-08, - "loss": 0.74936193, - "num_input_tokens_seen": 332185130, - "step": 15395, - "time_per_iteration": 2.6088504791259766 - }, - { - "auxiliary_loss_clip": 0.01057286, - "auxiliary_loss_mlp": 0.01032502, - "balance_loss_clip": 1.03256583, - "balance_loss_mlp": 1.02103376, - "epoch": 0.925657598076056, - "flos": 22857573185280.0, - "grad_norm": 1.8404815888609334, - "language_loss": 0.71465933, - "learning_rate": 5.7651264271653785e-08, - "loss": 0.7355572, - "num_input_tokens_seen": 332203695, - "step": 15396, - "time_per_iteration": 2.7053260803222656 - }, - { - "auxiliary_loss_clip": 0.01106531, - "auxiliary_loss_mlp": 0.01030786, - "balance_loss_clip": 1.03571641, - "balance_loss_mlp": 1.01807809, - "epoch": 0.9257177213287239, - "flos": 25704032411520.0, - "grad_norm": 1.6857437416132761, - "language_loss": 0.87266874, - "learning_rate": 5.755846504448603e-08, - "loss": 0.8940419, - "num_input_tokens_seen": 332224850, - "step": 15397, - "time_per_iteration": 2.5987706184387207 - }, - { - "auxiliary_loss_clip": 0.01027242, - "auxiliary_loss_mlp": 0.00998861, - "balance_loss_clip": 1.00477362, - "balance_loss_mlp": 0.9978596, - "epoch": 0.9257778445813919, - "flos": 59592933221760.0, - "grad_norm": 0.80472899949222, - "language_loss": 0.55124933, - "learning_rate": 5.746573947489586e-08, - "loss": 0.57151037, - "num_input_tokens_seen": 332278085, - "step": 15398, - "time_per_iteration": 3.022796869277954 - }, - { - "auxiliary_loss_clip": 0.01088846, - "auxiliary_loss_mlp": 0.01032308, - "balance_loss_clip": 1.03451252, - "balance_loss_mlp": 1.01805639, - "epoch": 0.9258379678340598, - "flos": 27709427704320.0, - "grad_norm": 1.9788973951742164, - "language_loss": 0.76231545, - "learning_rate": 5.7373087566400025e-08, - "loss": 0.78352696, - "num_input_tokens_seen": 332297875, - "step": 15399, - "time_per_iteration": 2.6782071590423584 - }, - { - "auxiliary_loss_clip": 0.01077436, - "auxiliary_loss_mlp": 0.01029493, - "balance_loss_clip": 1.03120267, - "balance_loss_mlp": 1.01828766, - "epoch": 0.9258980910867278, - "flos": 24863543095680.0, - "grad_norm": 1.6019049009837816, - "language_loss": 0.78070617, - "learning_rate": 5.7280509322510826e-08, - "loss": 0.8017754, - "num_input_tokens_seen": 332318500, - "step": 15400, - "time_per_iteration": 2.6918084621429443 - }, - { - "auxiliary_loss_clip": 0.01019125, - "auxiliary_loss_mlp": 0.01002511, - "balance_loss_clip": 1.00581372, - "balance_loss_mlp": 1.00144386, - "epoch": 0.9259582143393957, - "flos": 63134587249920.0, - "grad_norm": 0.7229223052902047, - "language_loss": 0.51348114, - "learning_rate": 5.718800474673946e-08, - "loss": 0.53369749, - "num_input_tokens_seen": 332381980, - "step": 15401, - "time_per_iteration": 4.655211448669434 - }, - { - "auxiliary_loss_clip": 0.01095608, - "auxiliary_loss_mlp": 0.01034993, - "balance_loss_clip": 1.03721333, - "balance_loss_mlp": 1.02316141, - "epoch": 0.9260183375920638, - "flos": 24127122458880.0, - "grad_norm": 1.7293202030573633, - "language_loss": 0.8252185, - "learning_rate": 5.709557384259378e-08, - "loss": 0.84652448, - "num_input_tokens_seen": 332399510, - "step": 15402, - "time_per_iteration": 2.7125723361968994 - }, - { - "auxiliary_loss_clip": 0.01027546, - "auxiliary_loss_mlp": 0.01001395, - "balance_loss_clip": 1.00508666, - "balance_loss_mlp": 1.0004828, - "epoch": 0.9260784608447317, - "flos": 63042872849280.0, - "grad_norm": 0.7337152983858821, - "language_loss": 0.51039803, - "learning_rate": 5.700321661357876e-08, - "loss": 0.53068745, - "num_input_tokens_seen": 332459130, - "step": 15403, - "time_per_iteration": 4.670861005783081 - }, - { - "auxiliary_loss_clip": 0.01007604, - "auxiliary_loss_mlp": 0.01001265, - "balance_loss_clip": 1.00503612, - "balance_loss_mlp": 1.0004487, - "epoch": 0.9261385840973997, - "flos": 70585979927040.0, - "grad_norm": 0.6850090598665447, - "language_loss": 0.58746749, - "learning_rate": 5.69109330631965e-08, - "loss": 0.60755622, - "num_input_tokens_seen": 332526555, - "step": 15404, - "time_per_iteration": 4.747823238372803 - }, - { - "auxiliary_loss_clip": 0.01083395, - "auxiliary_loss_mlp": 0.01035934, - "balance_loss_clip": 1.03603053, - "balance_loss_mlp": 1.02242208, - "epoch": 0.9261987073500676, - "flos": 20229917656320.0, - "grad_norm": 2.2086830252227903, - "language_loss": 0.71290517, - "learning_rate": 5.681872319494596e-08, - "loss": 0.73409843, - "num_input_tokens_seen": 332544005, - "step": 15405, - "time_per_iteration": 2.6791491508483887 - }, - { - "auxiliary_loss_clip": 0.01061911, - "auxiliary_loss_mlp": 0.01037759, - "balance_loss_clip": 1.03471172, - "balance_loss_mlp": 1.02468121, - "epoch": 0.9262588306027356, - "flos": 20954163582720.0, - "grad_norm": 1.7027063262346462, - "language_loss": 0.68240035, - "learning_rate": 5.672658701232458e-08, - "loss": 0.70339704, - "num_input_tokens_seen": 332563070, - "step": 15406, - "time_per_iteration": 4.413249731063843 - }, - { - "auxiliary_loss_clip": 0.01056836, - "auxiliary_loss_mlp": 0.01046779, - "balance_loss_clip": 1.03164291, - "balance_loss_mlp": 1.03166914, - "epoch": 0.9263189538554035, - "flos": 22158679282560.0, - "grad_norm": 3.2449194361819234, - "language_loss": 0.76197219, - "learning_rate": 5.663452451882555e-08, - "loss": 0.78300834, - "num_input_tokens_seen": 332579620, - "step": 15407, - "time_per_iteration": 2.7800557613372803 - }, - { - "auxiliary_loss_clip": 0.01076765, - "auxiliary_loss_mlp": 0.01037766, - "balance_loss_clip": 1.03282285, - "balance_loss_mlp": 1.02446783, - "epoch": 0.9263790771080715, - "flos": 18187211111040.0, - "grad_norm": 1.9744947993410575, - "language_loss": 0.72376311, - "learning_rate": 5.6542535717940096e-08, - "loss": 0.74490839, - "num_input_tokens_seen": 332597795, - "step": 15408, - "time_per_iteration": 2.654872179031372 - }, - { - "auxiliary_loss_clip": 0.01077908, - "auxiliary_loss_mlp": 0.01028464, - "balance_loss_clip": 1.03418171, - "balance_loss_mlp": 1.01781821, - "epoch": 0.9264392003607396, - "flos": 48178545004800.0, - "grad_norm": 1.6454364766131493, - "language_loss": 0.68587399, - "learning_rate": 5.645062061315675e-08, - "loss": 0.70693767, - "num_input_tokens_seen": 332620375, - "step": 15409, - "time_per_iteration": 2.850269317626953 - }, - { - "auxiliary_loss_clip": 0.01074672, - "auxiliary_loss_mlp": 0.01030839, - "balance_loss_clip": 1.03626847, - "balance_loss_mlp": 1.01791096, - "epoch": 0.9264993236134075, - "flos": 26389458714240.0, - "grad_norm": 2.1360446991021416, - "language_loss": 0.75711519, - "learning_rate": 5.6358779207960506e-08, - "loss": 0.77817023, - "num_input_tokens_seen": 332639510, - "step": 15410, - "time_per_iteration": 2.7220871448516846 - }, - { - "auxiliary_loss_clip": 0.01057013, - "auxiliary_loss_mlp": 0.01030911, - "balance_loss_clip": 1.03571475, - "balance_loss_mlp": 1.01858473, - "epoch": 0.9265594468660755, - "flos": 20920084554240.0, - "grad_norm": 1.538032014217413, - "language_loss": 0.82166702, - "learning_rate": 5.6267011505833905e-08, - "loss": 0.84254622, - "num_input_tokens_seen": 332658350, - "step": 15411, - "time_per_iteration": 2.7539865970611572 - }, - { - "auxiliary_loss_clip": 0.01085605, - "auxiliary_loss_mlp": 0.01037973, - "balance_loss_clip": 1.03824568, - "balance_loss_mlp": 1.02592707, - "epoch": 0.9266195701187434, - "flos": 17525017929600.0, - "grad_norm": 1.9373145629003894, - "language_loss": 0.75171757, - "learning_rate": 5.617531751025728e-08, - "loss": 0.77295339, - "num_input_tokens_seen": 332676715, - "step": 15412, - "time_per_iteration": 2.6214661598205566 - }, - { - "auxiliary_loss_clip": 0.0110647, - "auxiliary_loss_mlp": 0.01029847, - "balance_loss_clip": 1.03467417, - "balance_loss_mlp": 1.01769996, - "epoch": 0.9266796933714114, - "flos": 33688733293440.0, - "grad_norm": 1.8589935579070962, - "language_loss": 0.66795665, - "learning_rate": 5.6083697224707406e-08, - "loss": 0.68931985, - "num_input_tokens_seen": 332701470, - "step": 15413, - "time_per_iteration": 2.690272808074951 - }, - { - "auxiliary_loss_clip": 0.0105034, - "auxiliary_loss_mlp": 0.0103768, - "balance_loss_clip": 1.0341413, - "balance_loss_mlp": 1.0243938, - "epoch": 0.9267398166240793, - "flos": 18916520855040.0, - "grad_norm": 2.0080072670412794, - "language_loss": 0.76213551, - "learning_rate": 5.5992150652658167e-08, - "loss": 0.78301573, - "num_input_tokens_seen": 332719060, - "step": 15414, - "time_per_iteration": 2.858206033706665 - }, - { - "auxiliary_loss_clip": 0.01094062, - "auxiliary_loss_mlp": 0.01028504, - "balance_loss_clip": 1.03724313, - "balance_loss_mlp": 1.01658273, - "epoch": 0.9267999398767474, - "flos": 20478957626880.0, - "grad_norm": 3.161190355469832, - "language_loss": 0.81600469, - "learning_rate": 5.59006777975819e-08, - "loss": 0.83723032, - "num_input_tokens_seen": 332736345, - "step": 15415, - "time_per_iteration": 2.6205687522888184 - }, - { - "auxiliary_loss_clip": 0.01086858, - "auxiliary_loss_mlp": 0.01034472, - "balance_loss_clip": 1.03274202, - "balance_loss_mlp": 1.02163887, - "epoch": 0.9268600631294153, - "flos": 24789351553920.0, - "grad_norm": 1.3966462489262188, - "language_loss": 0.54340827, - "learning_rate": 5.580927866294671e-08, - "loss": 0.56462157, - "num_input_tokens_seen": 332756270, - "step": 15416, - "time_per_iteration": 2.73563814163208 - }, - { - "auxiliary_loss_clip": 0.01067608, - "auxiliary_loss_mlp": 0.01035609, - "balance_loss_clip": 1.03344822, - "balance_loss_mlp": 1.02311552, - "epoch": 0.9269201863820833, - "flos": 18697178453760.0, - "grad_norm": 1.5074603678728897, - "language_loss": 0.72186983, - "learning_rate": 5.571795325221807e-08, - "loss": 0.74290192, - "num_input_tokens_seen": 332775185, - "step": 15417, - "time_per_iteration": 2.715012788772583 - }, - { - "auxiliary_loss_clip": 0.01094578, - "auxiliary_loss_mlp": 0.01033031, - "balance_loss_clip": 1.03809214, - "balance_loss_mlp": 1.02029991, - "epoch": 0.9269803096347512, - "flos": 20923999136640.0, - "grad_norm": 4.4376132167371365, - "language_loss": 0.7579149, - "learning_rate": 5.5626701568859624e-08, - "loss": 0.77919102, - "num_input_tokens_seen": 332794320, - "step": 15418, - "time_per_iteration": 2.6377668380737305 - }, - { - "auxiliary_loss_clip": 0.01095083, - "auxiliary_loss_mlp": 0.01030369, - "balance_loss_clip": 1.03478622, - "balance_loss_mlp": 1.01755381, - "epoch": 0.9270404328874192, - "flos": 28002710252160.0, - "grad_norm": 1.4381586641081634, - "language_loss": 0.76076263, - "learning_rate": 5.553552361633174e-08, - "loss": 0.78201711, - "num_input_tokens_seen": 332818095, - "step": 15419, - "time_per_iteration": 2.7292606830596924 - }, - { - "auxiliary_loss_clip": 0.01104418, - "auxiliary_loss_mlp": 0.0103305, - "balance_loss_clip": 1.03501427, - "balance_loss_mlp": 1.02151632, - "epoch": 0.9271005561400871, - "flos": 25889870401920.0, - "grad_norm": 1.6372901887386972, - "language_loss": 0.75610423, - "learning_rate": 5.5444419398091636e-08, - "loss": 0.77747887, - "num_input_tokens_seen": 332839860, - "step": 15420, - "time_per_iteration": 2.6438181400299072 - }, - { - "auxiliary_loss_clip": 0.01099967, - "auxiliary_loss_mlp": 0.01032703, - "balance_loss_clip": 1.03629184, - "balance_loss_mlp": 1.01973283, - "epoch": 0.9271606793927551, - "flos": 27053914452480.0, - "grad_norm": 1.6035461883801339, - "language_loss": 0.77056849, - "learning_rate": 5.535338891759389e-08, - "loss": 0.79189527, - "num_input_tokens_seen": 332861155, - "step": 15421, - "time_per_iteration": 2.6203770637512207 - }, - { - "auxiliary_loss_clip": 0.0108251, - "auxiliary_loss_mlp": 0.01032083, - "balance_loss_clip": 1.03615677, - "balance_loss_mlp": 1.0196619, - "epoch": 0.9272208026454232, - "flos": 26209869690240.0, - "grad_norm": 2.078534179324168, - "language_loss": 0.72883129, - "learning_rate": 5.526243217829041e-08, - "loss": 0.74997723, - "num_input_tokens_seen": 332881110, - "step": 15422, - "time_per_iteration": 2.700307607650757 - }, - { - "auxiliary_loss_clip": 0.01099345, - "auxiliary_loss_mlp": 0.01039891, - "balance_loss_clip": 1.03540778, - "balance_loss_mlp": 1.0265038, - "epoch": 0.9272809258980911, - "flos": 12458453863680.0, - "grad_norm": 1.8814873155718879, - "language_loss": 0.77395117, - "learning_rate": 5.517154918363065e-08, - "loss": 0.79534352, - "num_input_tokens_seen": 332899350, - "step": 15423, - "time_per_iteration": 2.7268893718719482 - }, - { - "auxiliary_loss_clip": 0.01099209, - "auxiliary_loss_mlp": 0.01033057, - "balance_loss_clip": 1.03573775, - "balance_loss_mlp": 1.01999736, - "epoch": 0.9273410491507591, - "flos": 22856890826880.0, - "grad_norm": 2.3478977381899364, - "language_loss": 0.75240654, - "learning_rate": 5.508073993706053e-08, - "loss": 0.77372921, - "num_input_tokens_seen": 332918105, - "step": 15424, - "time_per_iteration": 2.6554524898529053 - }, - { - "auxiliary_loss_clip": 0.01019493, - "auxiliary_loss_mlp": 0.01002831, - "balance_loss_clip": 1.006253, - "balance_loss_mlp": 1.00180626, - "epoch": 0.927401172403427, - "flos": 47665384329600.0, - "grad_norm": 0.7785886890233412, - "language_loss": 0.60644341, - "learning_rate": 5.499000444202351e-08, - "loss": 0.62666667, - "num_input_tokens_seen": 332969490, - "step": 15425, - "time_per_iteration": 2.9746127128601074 - }, - { - "auxiliary_loss_clip": 0.01086691, - "auxiliary_loss_mlp": 0.00770701, - "balance_loss_clip": 1.03668869, - "balance_loss_mlp": 1.00019503, - "epoch": 0.927461295656095, - "flos": 29972374490880.0, - "grad_norm": 1.4170561273174695, - "language_loss": 0.70912516, - "learning_rate": 5.489934270196106e-08, - "loss": 0.72769904, - "num_input_tokens_seen": 332988805, - "step": 15426, - "time_per_iteration": 2.7353477478027344 - }, - { - "auxiliary_loss_clip": 0.01083876, - "auxiliary_loss_mlp": 0.01027564, - "balance_loss_clip": 1.03567636, - "balance_loss_mlp": 1.01585722, - "epoch": 0.9275214189087629, - "flos": 20375427651840.0, - "grad_norm": 1.8095946188152212, - "language_loss": 0.82924026, - "learning_rate": 5.480875472030977e-08, - "loss": 0.85035467, - "num_input_tokens_seen": 333007960, - "step": 15427, - "time_per_iteration": 2.6290063858032227 - }, - { - "auxiliary_loss_clip": 0.01074923, - "auxiliary_loss_mlp": 0.01034079, - "balance_loss_clip": 1.03522468, - "balance_loss_mlp": 1.02114439, - "epoch": 0.927581542161431, - "flos": 22383193242240.0, - "grad_norm": 1.5641364814856114, - "language_loss": 0.77063322, - "learning_rate": 5.471824050050555e-08, - "loss": 0.79172319, - "num_input_tokens_seen": 333026035, - "step": 15428, - "time_per_iteration": 2.7724509239196777 - }, - { - "auxiliary_loss_clip": 0.01068711, - "auxiliary_loss_mlp": 0.01034095, - "balance_loss_clip": 1.03291845, - "balance_loss_mlp": 1.02194142, - "epoch": 0.9276416654140989, - "flos": 23952453598080.0, - "grad_norm": 1.8224763848392078, - "language_loss": 0.74805522, - "learning_rate": 5.4627800045980555e-08, - "loss": 0.76908326, - "num_input_tokens_seen": 333045590, - "step": 15429, - "time_per_iteration": 2.859591245651245 - }, - { - "auxiliary_loss_clip": 0.01070146, - "auxiliary_loss_mlp": 0.01033725, - "balance_loss_clip": 1.03224564, - "balance_loss_mlp": 1.02171516, - "epoch": 0.9277017886667669, - "flos": 13917719796480.0, - "grad_norm": 1.7936478622179974, - "language_loss": 0.74859536, - "learning_rate": 5.45374333601647e-08, - "loss": 0.76963401, - "num_input_tokens_seen": 333063355, - "step": 15430, - "time_per_iteration": 2.7022671699523926 - }, - { - "auxiliary_loss_clip": 0.01097528, - "auxiliary_loss_mlp": 0.0103505, - "balance_loss_clip": 1.03492427, - "balance_loss_mlp": 1.02224135, - "epoch": 0.9277619119194348, - "flos": 35666478092160.0, - "grad_norm": 1.3597069220305837, - "language_loss": 0.76239693, - "learning_rate": 5.444714044648391e-08, - "loss": 0.78372276, - "num_input_tokens_seen": 333088045, - "step": 15431, - "time_per_iteration": 2.7746591567993164 - }, - { - "auxiliary_loss_clip": 0.01095653, - "auxiliary_loss_mlp": 0.01030326, - "balance_loss_clip": 1.03667474, - "balance_loss_mlp": 1.01806593, - "epoch": 0.9278220351721028, - "flos": 23841238112640.0, - "grad_norm": 1.8615596661189457, - "language_loss": 0.70812196, - "learning_rate": 5.4356921308363e-08, - "loss": 0.72938174, - "num_input_tokens_seen": 333108005, - "step": 15432, - "time_per_iteration": 2.6617555618286133 - }, - { - "auxiliary_loss_clip": 0.01063577, - "auxiliary_loss_mlp": 0.01032538, - "balance_loss_clip": 1.03703666, - "balance_loss_mlp": 1.02040219, - "epoch": 0.9278821584247707, - "flos": 15228135768960.0, - "grad_norm": 2.1745322103620044, - "language_loss": 0.81965214, - "learning_rate": 5.4266775949222354e-08, - "loss": 0.84061331, - "num_input_tokens_seen": 333124335, - "step": 15433, - "time_per_iteration": 2.669423818588257 - }, - { - "auxiliary_loss_clip": 0.01104445, - "auxiliary_loss_mlp": 0.01028126, - "balance_loss_clip": 1.03630841, - "balance_loss_mlp": 1.01734364, - "epoch": 0.9279422816774388, - "flos": 24681404206080.0, - "grad_norm": 1.9129971221663375, - "language_loss": 0.66100991, - "learning_rate": 5.417670437248056e-08, - "loss": 0.68233562, - "num_input_tokens_seen": 333143995, - "step": 15434, - "time_per_iteration": 2.5970053672790527 - }, - { - "auxiliary_loss_clip": 0.01077405, - "auxiliary_loss_mlp": 0.01030104, - "balance_loss_clip": 1.03205669, - "balance_loss_mlp": 1.0184691, - "epoch": 0.9280024049301068, - "flos": 19169188099200.0, - "grad_norm": 1.6749341071635755, - "language_loss": 0.68276256, - "learning_rate": 5.40867065815529e-08, - "loss": 0.70383763, - "num_input_tokens_seen": 333162805, - "step": 15435, - "time_per_iteration": 2.6586270332336426 - }, - { - "auxiliary_loss_clip": 0.0110958, - "auxiliary_loss_mlp": 0.01031502, - "balance_loss_clip": 1.03709114, - "balance_loss_mlp": 1.01851392, - "epoch": 0.9280625281827747, - "flos": 11393701983360.0, - "grad_norm": 1.9176254237254773, - "language_loss": 0.72329485, - "learning_rate": 5.399678257985263e-08, - "loss": 0.74470568, - "num_input_tokens_seen": 333175770, - "step": 15436, - "time_per_iteration": 2.5913889408111572 - }, - { - "auxiliary_loss_clip": 0.01083394, - "auxiliary_loss_mlp": 0.01031424, - "balance_loss_clip": 1.03532553, - "balance_loss_mlp": 1.01925242, - "epoch": 0.9281226514354427, - "flos": 24785616539520.0, - "grad_norm": 1.984161101089214, - "language_loss": 0.66967964, - "learning_rate": 5.390693237078925e-08, - "loss": 0.69082779, - "num_input_tokens_seen": 333194775, - "step": 15437, - "time_per_iteration": 2.681321144104004 - }, - { - "auxiliary_loss_clip": 0.01098237, - "auxiliary_loss_mlp": 0.01033718, - "balance_loss_clip": 1.03667545, - "balance_loss_mlp": 1.02040851, - "epoch": 0.9281827746881106, - "flos": 15083128563840.0, - "grad_norm": 1.9610204855405788, - "language_loss": 0.71449178, - "learning_rate": 5.3817155957770254e-08, - "loss": 0.73581135, - "num_input_tokens_seen": 333208920, - "step": 15438, - "time_per_iteration": 2.6930477619171143 - }, - { - "auxiliary_loss_clip": 0.01108794, - "auxiliary_loss_mlp": 0.01029773, - "balance_loss_clip": 1.03654385, - "balance_loss_mlp": 1.01757753, - "epoch": 0.9282428979407786, - "flos": 24135059364480.0, - "grad_norm": 1.5819700006479365, - "language_loss": 0.64896679, - "learning_rate": 5.3727453344199366e-08, - "loss": 0.67035246, - "num_input_tokens_seen": 333229350, - "step": 15439, - "time_per_iteration": 2.6033389568328857 - }, - { - "auxiliary_loss_clip": 0.01085049, - "auxiliary_loss_mlp": 0.01030625, - "balance_loss_clip": 1.0345124, - "balance_loss_mlp": 1.01829338, - "epoch": 0.9283030211934465, - "flos": 24823215100800.0, - "grad_norm": 1.6494013213160663, - "language_loss": 0.70283854, - "learning_rate": 5.363782453347876e-08, - "loss": 0.72399533, - "num_input_tokens_seen": 333246125, - "step": 15440, - "time_per_iteration": 4.2000510692596436 - }, - { - "auxiliary_loss_clip": 0.01072935, - "auxiliary_loss_mlp": 0.00771755, - "balance_loss_clip": 1.03401041, - "balance_loss_mlp": 1.00015736, - "epoch": 0.9283631444461146, - "flos": 23981037845760.0, - "grad_norm": 1.6732819714869454, - "language_loss": 0.76933122, - "learning_rate": 5.354826952900682e-08, - "loss": 0.78777814, - "num_input_tokens_seen": 333263685, - "step": 15441, - "time_per_iteration": 2.6747872829437256 - }, - { - "auxiliary_loss_clip": 0.01091447, - "auxiliary_loss_mlp": 0.01029564, - "balance_loss_clip": 1.03517056, - "balance_loss_mlp": 1.01912725, - "epoch": 0.9284232676987825, - "flos": 22784530878720.0, - "grad_norm": 2.5725907020773073, - "language_loss": 0.64269817, - "learning_rate": 5.345878833417949e-08, - "loss": 0.6639083, - "num_input_tokens_seen": 333282435, - "step": 15442, - "time_per_iteration": 4.172094106674194 - }, - { - "auxiliary_loss_clip": 0.01064932, - "auxiliary_loss_mlp": 0.01047387, - "balance_loss_clip": 1.03302109, - "balance_loss_mlp": 1.03327239, - "epoch": 0.9284833909514505, - "flos": 19500500171520.0, - "grad_norm": 1.9244730176476685, - "language_loss": 0.80415273, - "learning_rate": 5.3369380952390295e-08, - "loss": 0.8252759, - "num_input_tokens_seen": 333300400, - "step": 15443, - "time_per_iteration": 4.384172201156616 - }, - { - "auxiliary_loss_clip": 0.01098927, - "auxiliary_loss_mlp": 0.00770098, - "balance_loss_clip": 1.03640699, - "balance_loss_mlp": 1.00019741, - "epoch": 0.9285435142041184, - "flos": 23185976256000.0, - "grad_norm": 2.3934730189580278, - "language_loss": 0.6569308, - "learning_rate": 5.328004738702896e-08, - "loss": 0.67562109, - "num_input_tokens_seen": 333318980, - "step": 15444, - "time_per_iteration": 2.6536576747894287 - }, - { - "auxiliary_loss_clip": 0.01066958, - "auxiliary_loss_mlp": 0.01030149, - "balance_loss_clip": 1.03394997, - "balance_loss_mlp": 1.01776958, - "epoch": 0.9286036374567864, - "flos": 17675519915520.0, - "grad_norm": 1.9812585053043275, - "language_loss": 0.73909259, - "learning_rate": 5.3190787641483215e-08, - "loss": 0.76006365, - "num_input_tokens_seen": 333334135, - "step": 15445, - "time_per_iteration": 4.150733947753906 - }, - { - "auxiliary_loss_clip": 0.0109372, - "auxiliary_loss_mlp": 0.01039239, - "balance_loss_clip": 1.03644156, - "balance_loss_mlp": 1.02563095, - "epoch": 0.9286637607094543, - "flos": 20886687884160.0, - "grad_norm": 1.737150069171567, - "language_loss": 0.71495092, - "learning_rate": 5.3101601719138135e-08, - "loss": 0.7362805, - "num_input_tokens_seen": 333353325, - "step": 15446, - "time_per_iteration": 2.6059980392456055 - }, - { - "auxiliary_loss_clip": 0.0105076, - "auxiliary_loss_mlp": 0.01033839, - "balance_loss_clip": 1.0349431, - "balance_loss_mlp": 1.02085745, - "epoch": 0.9287238839621224, - "flos": 19026012487680.0, - "grad_norm": 1.7396993406776888, - "language_loss": 0.69318455, - "learning_rate": 5.301248962337523e-08, - "loss": 0.71403056, - "num_input_tokens_seen": 333371110, - "step": 15447, - "time_per_iteration": 2.785006046295166 - }, - { - "auxiliary_loss_clip": 0.01101911, - "auxiliary_loss_mlp": 0.01030242, - "balance_loss_clip": 1.03475642, - "balance_loss_mlp": 1.01898217, - "epoch": 0.9287840072147904, - "flos": 20557027837440.0, - "grad_norm": 1.5415290454981314, - "language_loss": 0.72406214, - "learning_rate": 5.292345135757403e-08, - "loss": 0.74538368, - "num_input_tokens_seen": 333391420, - "step": 15448, - "time_per_iteration": 2.5804696083068848 - }, - { - "auxiliary_loss_clip": 0.01108235, - "auxiliary_loss_mlp": 0.01028889, - "balance_loss_clip": 1.03618634, - "balance_loss_mlp": 1.01485801, - "epoch": 0.9288441304674583, - "flos": 21250822008960.0, - "grad_norm": 1.5546568234640936, - "language_loss": 0.74195588, - "learning_rate": 5.283448692511072e-08, - "loss": 0.76332712, - "num_input_tokens_seen": 333410365, - "step": 15449, - "time_per_iteration": 2.5949409008026123 - }, - { - "auxiliary_loss_clip": 0.01108056, - "auxiliary_loss_mlp": 0.00770321, - "balance_loss_clip": 1.03558385, - "balance_loss_mlp": 1.00032389, - "epoch": 0.9289042537201263, - "flos": 27669853895040.0, - "grad_norm": 1.737092405076848, - "language_loss": 0.67934465, - "learning_rate": 5.27455963293586e-08, - "loss": 0.69812846, - "num_input_tokens_seen": 333430000, - "step": 15450, - "time_per_iteration": 2.666686773300171 - }, - { - "auxiliary_loss_clip": 0.01076756, - "auxiliary_loss_mlp": 0.01028132, - "balance_loss_clip": 1.03465009, - "balance_loss_mlp": 1.01593733, - "epoch": 0.9289643769727942, - "flos": 19317750750720.0, - "grad_norm": 2.158761151214465, - "language_loss": 0.71885049, - "learning_rate": 5.265677957368875e-08, - "loss": 0.73989934, - "num_input_tokens_seen": 333445800, - "step": 15451, - "time_per_iteration": 2.7205255031585693 - }, - { - "auxiliary_loss_clip": 0.01083407, - "auxiliary_loss_mlp": 0.010432, - "balance_loss_clip": 1.033409, - "balance_loss_mlp": 1.03013515, - "epoch": 0.9290245002254622, - "flos": 14058058233600.0, - "grad_norm": 1.931438285562406, - "language_loss": 0.732077, - "learning_rate": 5.25680366614687e-08, - "loss": 0.75334305, - "num_input_tokens_seen": 333461550, - "step": 15452, - "time_per_iteration": 2.7509524822235107 - }, - { - "auxiliary_loss_clip": 0.01089897, - "auxiliary_loss_mlp": 0.01029462, - "balance_loss_clip": 1.03840566, - "balance_loss_mlp": 1.01675391, - "epoch": 0.9290846234781301, - "flos": 20047132321920.0, - "grad_norm": 2.107575184826188, - "language_loss": 0.74144852, - "learning_rate": 5.2479367596064196e-08, - "loss": 0.76264215, - "num_input_tokens_seen": 333478835, - "step": 15453, - "time_per_iteration": 2.7099177837371826 - }, - { - "auxiliary_loss_clip": 0.00992131, - "auxiliary_loss_mlp": 0.0100121, - "balance_loss_clip": 1.00884318, - "balance_loss_mlp": 1.00022078, - "epoch": 0.9291447467307982, - "flos": 61227514460160.0, - "grad_norm": 0.8159194207787102, - "language_loss": 0.60648072, - "learning_rate": 5.2390772380837226e-08, - "loss": 0.62641418, - "num_input_tokens_seen": 333535250, - "step": 15454, - "time_per_iteration": 3.143502950668335 - }, - { - "auxiliary_loss_clip": 0.01082676, - "auxiliary_loss_mlp": 0.01042707, - "balance_loss_clip": 1.03245759, - "balance_loss_mlp": 1.02956378, - "epoch": 0.9292048699834661, - "flos": 20553328736640.0, - "grad_norm": 1.6995805540725026, - "language_loss": 0.68820882, - "learning_rate": 5.230225101914709e-08, - "loss": 0.70946264, - "num_input_tokens_seen": 333553805, - "step": 15455, - "time_per_iteration": 2.6724045276641846 - }, - { - "auxiliary_loss_clip": 0.01063528, - "auxiliary_loss_mlp": 0.01034362, - "balance_loss_clip": 1.03471339, - "balance_loss_mlp": 1.02136862, - "epoch": 0.9292649932361341, - "flos": 23623655477760.0, - "grad_norm": 1.7980946522964238, - "language_loss": 0.64908248, - "learning_rate": 5.22138035143509e-08, - "loss": 0.67006135, - "num_input_tokens_seen": 333572800, - "step": 15456, - "time_per_iteration": 2.6736927032470703 - }, - { - "auxiliary_loss_clip": 0.01061601, - "auxiliary_loss_mlp": 0.01031323, - "balance_loss_clip": 1.03250432, - "balance_loss_mlp": 1.01845431, - "epoch": 0.929325116488802, - "flos": 15009942602880.0, - "grad_norm": 2.326561847657641, - "language_loss": 0.68176067, - "learning_rate": 5.2125429869802615e-08, - "loss": 0.70268989, - "num_input_tokens_seen": 333588520, - "step": 15457, - "time_per_iteration": 2.722505807876587 - }, - { - "auxiliary_loss_clip": 0.01086966, - "auxiliary_loss_mlp": 0.01029179, - "balance_loss_clip": 1.03466225, - "balance_loss_mlp": 1.01685286, - "epoch": 0.92938523974147, - "flos": 17967365919360.0, - "grad_norm": 3.4374144419000388, - "language_loss": 0.80816442, - "learning_rate": 5.203713008885291e-08, - "loss": 0.82932585, - "num_input_tokens_seen": 333603435, - "step": 15458, - "time_per_iteration": 2.7122104167938232 - }, - { - "auxiliary_loss_clip": 0.01100699, - "auxiliary_loss_mlp": 0.01034056, - "balance_loss_clip": 1.03775263, - "balance_loss_mlp": 1.02139056, - "epoch": 0.9294453629941379, - "flos": 23003047267200.0, - "grad_norm": 1.5848358931339006, - "language_loss": 0.72326326, - "learning_rate": 5.194890417485065e-08, - "loss": 0.74461079, - "num_input_tokens_seen": 333623305, - "step": 15459, - "time_per_iteration": 2.6949429512023926 - }, - { - "auxiliary_loss_clip": 0.01070452, - "auxiliary_loss_mlp": 0.01035576, - "balance_loss_clip": 1.03244114, - "balance_loss_mlp": 1.02314806, - "epoch": 0.929505486246806, - "flos": 17055234927360.0, - "grad_norm": 2.206929984070201, - "language_loss": 0.58746719, - "learning_rate": 5.1860752131141384e-08, - "loss": 0.60852748, - "num_input_tokens_seen": 333641205, - "step": 15460, - "time_per_iteration": 2.7503440380096436 - }, - { - "auxiliary_loss_clip": 0.01057483, - "auxiliary_loss_mlp": 0.01033599, - "balance_loss_clip": 1.03144884, - "balance_loss_mlp": 1.02011657, - "epoch": 0.9295656094994739, - "flos": 27340409329920.0, - "grad_norm": 1.8867552786807409, - "language_loss": 0.80609381, - "learning_rate": 5.177267396106733e-08, - "loss": 0.82700461, - "num_input_tokens_seen": 333659615, - "step": 15461, - "time_per_iteration": 2.773244857788086 - }, - { - "auxiliary_loss_clip": 0.01083444, - "auxiliary_loss_mlp": 0.01025336, - "balance_loss_clip": 1.03467631, - "balance_loss_mlp": 1.01330769, - "epoch": 0.9296257327521419, - "flos": 21470954509440.0, - "grad_norm": 2.177903881361931, - "language_loss": 0.78115839, - "learning_rate": 5.168466966796869e-08, - "loss": 0.80224615, - "num_input_tokens_seen": 333678985, - "step": 15462, - "time_per_iteration": 2.6601200103759766 - }, - { - "auxiliary_loss_clip": 0.01065181, - "auxiliary_loss_mlp": 0.0102857, - "balance_loss_clip": 1.02944088, - "balance_loss_mlp": 1.01573753, - "epoch": 0.9296858560048099, - "flos": 16362661818240.0, - "grad_norm": 1.838020182070701, - "language_loss": 0.62704271, - "learning_rate": 5.159673925518282e-08, - "loss": 0.64798021, - "num_input_tokens_seen": 333696410, - "step": 15463, - "time_per_iteration": 2.65974760055542 - }, - { - "auxiliary_loss_clip": 0.01082053, - "auxiliary_loss_mlp": 0.0103221, - "balance_loss_clip": 1.03119493, - "balance_loss_mlp": 1.020522, - "epoch": 0.9297459792574778, - "flos": 29858609139840.0, - "grad_norm": 1.45056589452118, - "language_loss": 0.70977533, - "learning_rate": 5.15088827260437e-08, - "loss": 0.73091793, - "num_input_tokens_seen": 333716615, - "step": 15464, - "time_per_iteration": 2.7430808544158936 - }, - { - "auxiliary_loss_clip": 0.01082227, - "auxiliary_loss_mlp": 0.01031071, - "balance_loss_clip": 1.0332849, - "balance_loss_mlp": 1.01884627, - "epoch": 0.9298061025101458, - "flos": 15924838942080.0, - "grad_norm": 1.7994510129894437, - "language_loss": 0.77051908, - "learning_rate": 5.1421100083883115e-08, - "loss": 0.79165208, - "num_input_tokens_seen": 333732800, - "step": 15465, - "time_per_iteration": 2.645766019821167 - }, - { - "auxiliary_loss_clip": 0.0098002, - "auxiliary_loss_mlp": 0.01003378, - "balance_loss_clip": 1.01005721, - "balance_loss_mlp": 1.00234056, - "epoch": 0.9298662257628137, - "flos": 64096994304000.0, - "grad_norm": 0.6965140655325198, - "language_loss": 0.56410694, - "learning_rate": 5.133339133202952e-08, - "loss": 0.58394086, - "num_input_tokens_seen": 333799300, - "step": 15466, - "time_per_iteration": 3.565849781036377 - }, - { - "auxiliary_loss_clip": 0.01085072, - "auxiliary_loss_mlp": 0.01039957, - "balance_loss_clip": 1.03284919, - "balance_loss_mlp": 1.02618241, - "epoch": 0.9299263490154818, - "flos": 24280210224000.0, - "grad_norm": 1.7308849012204341, - "language_loss": 0.72874355, - "learning_rate": 5.1245756473809355e-08, - "loss": 0.7499938, - "num_input_tokens_seen": 333820360, - "step": 15467, - "time_per_iteration": 3.236931800842285 - }, - { - "auxiliary_loss_clip": 0.01080183, - "auxiliary_loss_mlp": 0.01034565, - "balance_loss_clip": 1.03504908, - "balance_loss_mlp": 1.02171993, - "epoch": 0.9299864722681497, - "flos": 23294354567040.0, - "grad_norm": 1.5735167762659585, - "language_loss": 0.7158711, - "learning_rate": 5.1158195512545076e-08, - "loss": 0.73701859, - "num_input_tokens_seen": 333840415, - "step": 15468, - "time_per_iteration": 2.7365386486053467 - }, - { - "auxiliary_loss_clip": 0.01094813, - "auxiliary_loss_mlp": 0.01036159, - "balance_loss_clip": 1.0341078, - "balance_loss_mlp": 1.02179384, - "epoch": 0.9300465955208177, - "flos": 21395972868480.0, - "grad_norm": 1.7470237335941396, - "language_loss": 0.75426078, - "learning_rate": 5.107070845155737e-08, - "loss": 0.77557051, - "num_input_tokens_seen": 333859910, - "step": 15469, - "time_per_iteration": 2.7027781009674072 - }, - { - "auxiliary_loss_clip": 0.01082382, - "auxiliary_loss_mlp": 0.01034783, - "balance_loss_clip": 1.03710604, - "balance_loss_mlp": 1.02252209, - "epoch": 0.9301067187734856, - "flos": 24571445696640.0, - "grad_norm": 2.784639154051725, - "language_loss": 0.75578332, - "learning_rate": 5.098329529416379e-08, - "loss": 0.77695501, - "num_input_tokens_seen": 333880495, - "step": 15470, - "time_per_iteration": 2.7347373962402344 - }, - { - "auxiliary_loss_clip": 0.01067813, - "auxiliary_loss_mlp": 0.01032447, - "balance_loss_clip": 1.03560543, - "balance_loss_mlp": 1.02088356, - "epoch": 0.9301668420261536, - "flos": 22196960202240.0, - "grad_norm": 2.387489989885237, - "language_loss": 0.74822462, - "learning_rate": 5.089595604367902e-08, - "loss": 0.76922727, - "num_input_tokens_seen": 333897640, - "step": 15471, - "time_per_iteration": 2.756758213043213 - }, - { - "auxiliary_loss_clip": 0.01093505, - "auxiliary_loss_mlp": 0.01030868, - "balance_loss_clip": 1.03590918, - "balance_loss_mlp": 1.01813686, - "epoch": 0.9302269652788215, - "flos": 17747628468480.0, - "grad_norm": 3.1400649028733345, - "language_loss": 0.68857515, - "learning_rate": 5.080869070341487e-08, - "loss": 0.70981896, - "num_input_tokens_seen": 333913670, - "step": 15472, - "time_per_iteration": 2.6190030574798584 - }, - { - "auxiliary_loss_clip": 0.01078893, - "auxiliary_loss_mlp": 0.01028134, - "balance_loss_clip": 1.03282297, - "balance_loss_mlp": 1.01614726, - "epoch": 0.9302870885314896, - "flos": 19390793057280.0, - "grad_norm": 1.7830315106807422, - "language_loss": 0.88541853, - "learning_rate": 5.0721499276680233e-08, - "loss": 0.90648878, - "num_input_tokens_seen": 333934105, - "step": 15473, - "time_per_iteration": 2.5981593132019043 - }, - { - "auxiliary_loss_clip": 0.01087498, - "auxiliary_loss_mlp": 0.01036678, - "balance_loss_clip": 1.03732419, - "balance_loss_mlp": 1.0225395, - "epoch": 0.9303472117841575, - "flos": 21760286561280.0, - "grad_norm": 1.8849274973342631, - "language_loss": 0.64160311, - "learning_rate": 5.063438176678203e-08, - "loss": 0.6628449, - "num_input_tokens_seen": 333953635, - "step": 15474, - "time_per_iteration": 2.6480371952056885 - }, - { - "auxiliary_loss_clip": 0.01109387, - "auxiliary_loss_mlp": 0.01034944, - "balance_loss_clip": 1.03766966, - "balance_loss_mlp": 1.0225817, - "epoch": 0.9304073350368255, - "flos": 19609740408960.0, - "grad_norm": 1.7431064439867472, - "language_loss": 0.74580079, - "learning_rate": 5.054733817702339e-08, - "loss": 0.7672441, - "num_input_tokens_seen": 333971825, - "step": 15475, - "time_per_iteration": 2.9433109760284424 - }, - { - "auxiliary_loss_clip": 0.01094883, - "auxiliary_loss_mlp": 0.01029771, - "balance_loss_clip": 1.03424644, - "balance_loss_mlp": 1.01804042, - "epoch": 0.9304674582894935, - "flos": 30441582875520.0, - "grad_norm": 1.8594741529837064, - "language_loss": 0.66352129, - "learning_rate": 5.0460368510704786e-08, - "loss": 0.68476784, - "num_input_tokens_seen": 333990120, - "step": 15476, - "time_per_iteration": 2.669774293899536 - }, - { - "auxiliary_loss_clip": 0.01066383, - "auxiliary_loss_mlp": 0.01033281, - "balance_loss_clip": 1.03680027, - "balance_loss_mlp": 1.02047777, - "epoch": 0.9305275815421614, - "flos": 17785693906560.0, - "grad_norm": 1.928617647812536, - "language_loss": 0.68966222, - "learning_rate": 5.0373472771124914e-08, - "loss": 0.71065891, - "num_input_tokens_seen": 334007970, - "step": 15477, - "time_per_iteration": 2.7191553115844727 - }, - { - "auxiliary_loss_clip": 0.01087769, - "auxiliary_loss_mlp": 0.01030039, - "balance_loss_clip": 1.03722644, - "balance_loss_mlp": 1.01820755, - "epoch": 0.9305877047948294, - "flos": 25298456970240.0, - "grad_norm": 3.581472725351638, - "language_loss": 0.58644545, - "learning_rate": 5.0286650961578027e-08, - "loss": 0.60762358, - "num_input_tokens_seen": 334027120, - "step": 15478, - "time_per_iteration": 2.6942026615142822 - }, - { - "auxiliary_loss_clip": 0.01089048, - "auxiliary_loss_mlp": 0.01030623, - "balance_loss_clip": 1.03869212, - "balance_loss_mlp": 1.0165447, - "epoch": 0.9306478280474973, - "flos": 16977236544000.0, - "grad_norm": 2.150126266839501, - "language_loss": 0.78858852, - "learning_rate": 5.01999030853566e-08, - "loss": 0.80978525, - "num_input_tokens_seen": 334042785, - "step": 15479, - "time_per_iteration": 4.2738165855407715 - }, - { - "auxiliary_loss_clip": 0.01109061, - "auxiliary_loss_mlp": 0.01031407, - "balance_loss_clip": 1.03678465, - "balance_loss_mlp": 1.0195874, - "epoch": 0.9307079513001654, - "flos": 35663353608960.0, - "grad_norm": 1.63393685516193, - "language_loss": 0.6846534, - "learning_rate": 5.0113229145750445e-08, - "loss": 0.70605814, - "num_input_tokens_seen": 334063480, - "step": 15480, - "time_per_iteration": 2.7149746417999268 - }, - { - "auxiliary_loss_clip": 0.01109905, - "auxiliary_loss_mlp": 0.01031927, - "balance_loss_clip": 1.0378406, - "balance_loss_mlp": 1.01929665, - "epoch": 0.9307680745528333, - "flos": 19208151377280.0, - "grad_norm": 1.6956958650454903, - "language_loss": 0.67578673, - "learning_rate": 5.002662914604583e-08, - "loss": 0.69720507, - "num_input_tokens_seen": 334082005, - "step": 15481, - "time_per_iteration": 4.039956331253052 - }, - { - "auxiliary_loss_clip": 0.01080993, - "auxiliary_loss_mlp": 0.01032175, - "balance_loss_clip": 1.033005, - "balance_loss_mlp": 1.0192585, - "epoch": 0.9308281978055013, - "flos": 19062641381760.0, - "grad_norm": 1.8467240460111785, - "language_loss": 0.74883473, - "learning_rate": 4.994010308952701e-08, - "loss": 0.76996636, - "num_input_tokens_seen": 334101375, - "step": 15482, - "time_per_iteration": 2.6518447399139404 - }, - { - "auxiliary_loss_clip": 0.01094658, - "auxiliary_loss_mlp": 0.01028267, - "balance_loss_clip": 1.03394866, - "balance_loss_mlp": 1.01649547, - "epoch": 0.9308883210581692, - "flos": 20521548178560.0, - "grad_norm": 1.9180378851164899, - "language_loss": 0.80164105, - "learning_rate": 4.985365097947469e-08, - "loss": 0.82287037, - "num_input_tokens_seen": 334119460, - "step": 15483, - "time_per_iteration": 4.348911285400391 - }, - { - "auxiliary_loss_clip": 0.01082688, - "auxiliary_loss_mlp": 0.01033349, - "balance_loss_clip": 1.03657448, - "balance_loss_mlp": 1.02083826, - "epoch": 0.9309484443108372, - "flos": 13001422826880.0, - "grad_norm": 1.8698210896088554, - "language_loss": 0.74462926, - "learning_rate": 4.976727281916782e-08, - "loss": 0.76578963, - "num_input_tokens_seen": 334136065, - "step": 15484, - "time_per_iteration": 2.664431095123291 - }, - { - "auxiliary_loss_clip": 0.01086381, - "auxiliary_loss_mlp": 0.01032674, - "balance_loss_clip": 1.03691006, - "balance_loss_mlp": 1.01994252, - "epoch": 0.9310085675635051, - "flos": 12567765928320.0, - "grad_norm": 2.155944343427052, - "language_loss": 0.76539695, - "learning_rate": 4.968096861188087e-08, - "loss": 0.78658748, - "num_input_tokens_seen": 334153690, - "step": 15485, - "time_per_iteration": 4.154724597930908 - }, - { - "auxiliary_loss_clip": 0.01063188, - "auxiliary_loss_mlp": 0.01035653, - "balance_loss_clip": 1.03246868, - "balance_loss_mlp": 1.02108002, - "epoch": 0.9310686908161732, - "flos": 23477570864640.0, - "grad_norm": 2.136554797063734, - "language_loss": 0.78668422, - "learning_rate": 4.959473836088723e-08, - "loss": 0.80767262, - "num_input_tokens_seen": 334171880, - "step": 15486, - "time_per_iteration": 2.7599616050720215 - }, - { - "auxiliary_loss_clip": 0.01079287, - "auxiliary_loss_mlp": 0.01030569, - "balance_loss_clip": 1.03827739, - "balance_loss_mlp": 1.01740253, - "epoch": 0.9311288140688411, - "flos": 24170287628160.0, - "grad_norm": 1.753625770007885, - "language_loss": 0.7688942, - "learning_rate": 4.950858206945674e-08, - "loss": 0.78999281, - "num_input_tokens_seen": 334190005, - "step": 15487, - "time_per_iteration": 2.6973049640655518 - }, - { - "auxiliary_loss_clip": 0.01080553, - "auxiliary_loss_mlp": 0.01029283, - "balance_loss_clip": 1.0376873, - "balance_loss_mlp": 1.01600909, - "epoch": 0.9311889373215091, - "flos": 35590203561600.0, - "grad_norm": 2.2700820281078213, - "language_loss": 0.67102247, - "learning_rate": 4.942249974085633e-08, - "loss": 0.69212085, - "num_input_tokens_seen": 334209545, - "step": 15488, - "time_per_iteration": 2.7322824001312256 - }, - { - "auxiliary_loss_clip": 0.01083742, - "auxiliary_loss_mlp": 0.0103061, - "balance_loss_clip": 1.03624427, - "balance_loss_mlp": 1.01787865, - "epoch": 0.9312490605741771, - "flos": 20230528187520.0, - "grad_norm": 1.926714614555793, - "language_loss": 0.75009143, - "learning_rate": 4.933649137834983e-08, - "loss": 0.77123499, - "num_input_tokens_seen": 334228900, - "step": 15489, - "time_per_iteration": 2.5786027908325195 - }, - { - "auxiliary_loss_clip": 0.01111206, - "auxiliary_loss_mlp": 0.01032778, - "balance_loss_clip": 1.03734827, - "balance_loss_mlp": 1.01991534, - "epoch": 0.931309183826845, - "flos": 13950577762560.0, - "grad_norm": 3.9683320091837406, - "language_loss": 0.80892265, - "learning_rate": 4.925055698519931e-08, - "loss": 0.83036256, - "num_input_tokens_seen": 334245500, - "step": 15490, - "time_per_iteration": 2.5186619758605957 - }, - { - "auxiliary_loss_clip": 0.01061842, - "auxiliary_loss_mlp": 0.0103459, - "balance_loss_clip": 1.03452861, - "balance_loss_mlp": 1.02170372, - "epoch": 0.931369307079513, - "flos": 20156731695360.0, - "grad_norm": 1.6571168923434456, - "language_loss": 0.72082543, - "learning_rate": 4.9164696564663264e-08, - "loss": 0.74178976, - "num_input_tokens_seen": 334264370, - "step": 15491, - "time_per_iteration": 2.6573195457458496 - }, - { - "auxiliary_loss_clip": 0.0108057, - "auxiliary_loss_mlp": 0.0076884, - "balance_loss_clip": 1.03255057, - "balance_loss_mlp": 1.00006044, - "epoch": 0.931429430332181, - "flos": 25338569483520.0, - "grad_norm": 1.8544048731654292, - "language_loss": 0.74552429, - "learning_rate": 4.9078910119997096e-08, - "loss": 0.7640183, - "num_input_tokens_seen": 334283905, - "step": 15492, - "time_per_iteration": 2.5493483543395996 - }, - { - "auxiliary_loss_clip": 0.01019334, - "auxiliary_loss_mlp": 0.01002201, - "balance_loss_clip": 1.0056994, - "balance_loss_mlp": 1.00118196, - "epoch": 0.931489553584849, - "flos": 71226193985280.0, - "grad_norm": 0.707578892585582, - "language_loss": 0.53487962, - "learning_rate": 4.899319765445442e-08, - "loss": 0.55509502, - "num_input_tokens_seen": 334339925, - "step": 15493, - "time_per_iteration": 2.947209358215332 - }, - { - "auxiliary_loss_clip": 0.01097309, - "auxiliary_loss_mlp": 0.01033315, - "balance_loss_clip": 1.03523576, - "balance_loss_mlp": 1.02147126, - "epoch": 0.9315496768375169, - "flos": 14643653662080.0, - "grad_norm": 1.8350932820938002, - "language_loss": 0.70629972, - "learning_rate": 4.890755917128531e-08, - "loss": 0.72760594, - "num_input_tokens_seen": 334357225, - "step": 15494, - "time_per_iteration": 2.721458673477173 - }, - { - "auxiliary_loss_clip": 0.01093067, - "auxiliary_loss_mlp": 0.01029285, - "balance_loss_clip": 1.03598893, - "balance_loss_mlp": 1.01683998, - "epoch": 0.9316098000901849, - "flos": 28329928174080.0, - "grad_norm": 2.829925163289834, - "language_loss": 0.68261808, - "learning_rate": 4.882199467373671e-08, - "loss": 0.70384157, - "num_input_tokens_seen": 334375945, - "step": 15495, - "time_per_iteration": 2.6126840114593506 - }, - { - "auxiliary_loss_clip": 0.01104588, - "auxiliary_loss_mlp": 0.01034305, - "balance_loss_clip": 1.03474832, - "balance_loss_mlp": 1.02263463, - "epoch": 0.9316699233428528, - "flos": 28512677594880.0, - "grad_norm": 2.116367071751547, - "language_loss": 0.61655867, - "learning_rate": 4.8736504165053815e-08, - "loss": 0.63794762, - "num_input_tokens_seen": 334395310, - "step": 15496, - "time_per_iteration": 2.5984606742858887 - }, - { - "auxiliary_loss_clip": 0.01099753, - "auxiliary_loss_mlp": 0.01032832, - "balance_loss_clip": 1.03712618, - "balance_loss_mlp": 1.0196712, - "epoch": 0.9317300465955208, - "flos": 33693402061440.0, - "grad_norm": 1.4743797033954789, - "language_loss": 0.76852232, - "learning_rate": 4.865108764847825e-08, - "loss": 0.78984821, - "num_input_tokens_seen": 334416965, - "step": 15497, - "time_per_iteration": 2.694024085998535 - }, - { - "auxiliary_loss_clip": 0.01102298, - "auxiliary_loss_mlp": 0.00771221, - "balance_loss_clip": 1.0387435, - "balance_loss_mlp": 1.00019717, - "epoch": 0.9317901698481887, - "flos": 23658237296640.0, - "grad_norm": 1.6171461718999425, - "language_loss": 0.66427922, - "learning_rate": 4.856574512724898e-08, - "loss": 0.68301445, - "num_input_tokens_seen": 334435620, - "step": 15498, - "time_per_iteration": 2.617232084274292 - }, - { - "auxiliary_loss_clip": 0.01087695, - "auxiliary_loss_mlp": 0.01035624, - "balance_loss_clip": 1.03662407, - "balance_loss_mlp": 1.02294576, - "epoch": 0.9318502931008568, - "flos": 20960017499520.0, - "grad_norm": 1.6376617462499037, - "language_loss": 0.79631472, - "learning_rate": 4.8480476604602305e-08, - "loss": 0.81754798, - "num_input_tokens_seen": 334456210, - "step": 15499, - "time_per_iteration": 2.663939952850342 - }, - { - "auxiliary_loss_clip": 0.01065124, - "auxiliary_loss_mlp": 0.01033992, - "balance_loss_clip": 1.03444028, - "balance_loss_mlp": 1.02104545, - "epoch": 0.9319104163535247, - "flos": 23441049711360.0, - "grad_norm": 1.9165014592612015, - "language_loss": 0.76588839, - "learning_rate": 4.8395282083771196e-08, - "loss": 0.78687954, - "num_input_tokens_seen": 334475485, - "step": 15500, - "time_per_iteration": 2.8950650691986084 - }, - { - "auxiliary_loss_clip": 0.01075294, - "auxiliary_loss_mlp": 0.01026196, - "balance_loss_clip": 1.03517914, - "balance_loss_mlp": 1.01429939, - "epoch": 0.9319705396061927, - "flos": 22347426274560.0, - "grad_norm": 1.8007520102301227, - "language_loss": 0.72160745, - "learning_rate": 4.8310161567987064e-08, - "loss": 0.74262238, - "num_input_tokens_seen": 334494740, - "step": 15501, - "time_per_iteration": 2.671736240386963 - }, - { - "auxiliary_loss_clip": 0.0111059, - "auxiliary_loss_mlp": 0.01035234, - "balance_loss_clip": 1.0367043, - "balance_loss_mlp": 1.02227604, - "epoch": 0.9320306628588607, - "flos": 20993557824000.0, - "grad_norm": 1.6678005947570964, - "language_loss": 0.66245615, - "learning_rate": 4.822511506047666e-08, - "loss": 0.68391442, - "num_input_tokens_seen": 334511910, - "step": 15502, - "time_per_iteration": 2.640803098678589 - }, - { - "auxiliary_loss_clip": 0.01100429, - "auxiliary_loss_mlp": 0.00770206, - "balance_loss_clip": 1.03718948, - "balance_loss_mlp": 1.00017929, - "epoch": 0.9320907861115286, - "flos": 24538300421760.0, - "grad_norm": 1.5006777821326498, - "language_loss": 0.65834957, - "learning_rate": 4.814014256446586e-08, - "loss": 0.67705584, - "num_input_tokens_seen": 334533150, - "step": 15503, - "time_per_iteration": 2.6871988773345947 - }, - { - "auxiliary_loss_clip": 0.01070897, - "auxiliary_loss_mlp": 0.01037527, - "balance_loss_clip": 1.03008294, - "balance_loss_mlp": 1.02359104, - "epoch": 0.9321509093641966, - "flos": 19785414850560.0, - "grad_norm": 1.5576125560522005, - "language_loss": 0.75215459, - "learning_rate": 4.805524408317652e-08, - "loss": 0.77323884, - "num_input_tokens_seen": 334550940, - "step": 15504, - "time_per_iteration": 2.7060647010803223 - }, - { - "auxiliary_loss_clip": 0.01099259, - "auxiliary_loss_mlp": 0.00770405, - "balance_loss_clip": 1.03809631, - "balance_loss_mlp": 1.00028038, - "epoch": 0.9322110326168646, - "flos": 24972675592320.0, - "grad_norm": 2.4285521975478472, - "language_loss": 0.70985043, - "learning_rate": 4.797041961982762e-08, - "loss": 0.7285471, - "num_input_tokens_seen": 334570935, - "step": 15505, - "time_per_iteration": 2.632615089416504 - }, - { - "auxiliary_loss_clip": 0.01089757, - "auxiliary_loss_mlp": 0.01032109, - "balance_loss_clip": 1.03672528, - "balance_loss_mlp": 1.01909173, - "epoch": 0.9322711558695326, - "flos": 16143642639360.0, - "grad_norm": 2.685326437023756, - "language_loss": 0.75406563, - "learning_rate": 4.788566917763614e-08, - "loss": 0.77528429, - "num_input_tokens_seen": 334589315, - "step": 15506, - "time_per_iteration": 2.6244046688079834 - }, - { - "auxiliary_loss_clip": 0.01069315, - "auxiliary_loss_mlp": 0.01031235, - "balance_loss_clip": 1.03417552, - "balance_loss_mlp": 1.01864064, - "epoch": 0.9323312791222005, - "flos": 23732428838400.0, - "grad_norm": 1.7967505184283636, - "language_loss": 0.830755, - "learning_rate": 4.780099275981597e-08, - "loss": 0.85176057, - "num_input_tokens_seen": 334608990, - "step": 15507, - "time_per_iteration": 2.7944211959838867 - }, - { - "auxiliary_loss_clip": 0.01110233, - "auxiliary_loss_mlp": 0.01033256, - "balance_loss_clip": 1.03728533, - "balance_loss_mlp": 1.02054238, - "epoch": 0.9323914023748685, - "flos": 20777914523520.0, - "grad_norm": 1.7255820052461415, - "language_loss": 0.68139851, - "learning_rate": 4.771639036957742e-08, - "loss": 0.70283341, - "num_input_tokens_seen": 334628655, - "step": 15508, - "time_per_iteration": 2.6024084091186523 - }, - { - "auxiliary_loss_clip": 0.01074834, - "auxiliary_loss_mlp": 0.01031451, - "balance_loss_clip": 1.03604794, - "balance_loss_mlp": 1.01885068, - "epoch": 0.9324515256275364, - "flos": 23915178259200.0, - "grad_norm": 1.6572638063256202, - "language_loss": 0.72395205, - "learning_rate": 4.7631862010129033e-08, - "loss": 0.74501491, - "num_input_tokens_seen": 334648295, - "step": 15509, - "time_per_iteration": 2.7021539211273193 - }, - { - "auxiliary_loss_clip": 0.01097551, - "auxiliary_loss_mlp": 0.01032405, - "balance_loss_clip": 1.03582215, - "balance_loss_mlp": 1.02004337, - "epoch": 0.9325116488802044, - "flos": 18005215875840.0, - "grad_norm": 1.9028125229589103, - "language_loss": 0.73969936, - "learning_rate": 4.754740768467624e-08, - "loss": 0.7609989, - "num_input_tokens_seen": 334666280, - "step": 15510, - "time_per_iteration": 2.5746052265167236 - }, - { - "auxiliary_loss_clip": 0.0109828, - "auxiliary_loss_mlp": 0.01029114, - "balance_loss_clip": 1.03353786, - "balance_loss_mlp": 1.01676393, - "epoch": 0.9325717721328723, - "flos": 29021603443200.0, - "grad_norm": 2.4238247125248304, - "language_loss": 0.70348555, - "learning_rate": 4.746302739642161e-08, - "loss": 0.72475946, - "num_input_tokens_seen": 334688830, - "step": 15511, - "time_per_iteration": 2.6687567234039307 - }, - { - "auxiliary_loss_clip": 0.01080656, - "auxiliary_loss_mlp": 0.01040972, - "balance_loss_clip": 1.03440976, - "balance_loss_mlp": 1.02819276, - "epoch": 0.9326318953855404, - "flos": 21646341642240.0, - "grad_norm": 1.8803327029828805, - "language_loss": 0.78425443, - "learning_rate": 4.737872114856412e-08, - "loss": 0.80547071, - "num_input_tokens_seen": 334705205, - "step": 15512, - "time_per_iteration": 2.614408016204834 - }, - { - "auxiliary_loss_clip": 0.01107297, - "auxiliary_loss_mlp": 0.01028882, - "balance_loss_clip": 1.03561831, - "balance_loss_mlp": 1.01595938, - "epoch": 0.9326920186382083, - "flos": 26065724411520.0, - "grad_norm": 2.094641259738662, - "language_loss": 0.80607069, - "learning_rate": 4.7294488944301436e-08, - "loss": 0.82743245, - "num_input_tokens_seen": 334723830, - "step": 15513, - "time_per_iteration": 2.6240689754486084 - }, - { - "auxiliary_loss_clip": 0.01086027, - "auxiliary_loss_mlp": 0.01032384, - "balance_loss_clip": 1.03791964, - "balance_loss_mlp": 1.01878834, - "epoch": 0.9327521418908763, - "flos": 12057116227200.0, - "grad_norm": 1.8470318509254438, - "language_loss": 0.80033004, - "learning_rate": 4.721033078682768e-08, - "loss": 0.82151413, - "num_input_tokens_seen": 334740825, - "step": 15514, - "time_per_iteration": 2.6301167011260986 - }, - { - "auxiliary_loss_clip": 0.01074823, - "auxiliary_loss_mlp": 0.01037395, - "balance_loss_clip": 1.0366869, - "balance_loss_mlp": 1.02556312, - "epoch": 0.9328122651435443, - "flos": 43834395271680.0, - "grad_norm": 1.7635259328932469, - "language_loss": 0.71414572, - "learning_rate": 4.7126246679333626e-08, - "loss": 0.73526788, - "num_input_tokens_seen": 334765825, - "step": 15515, - "time_per_iteration": 2.9563915729522705 - }, - { - "auxiliary_loss_clip": 0.01093417, - "auxiliary_loss_mlp": 0.01033132, - "balance_loss_clip": 1.03784752, - "balance_loss_mlp": 1.02009666, - "epoch": 0.9328723883962122, - "flos": 15194954580480.0, - "grad_norm": 3.7303999678347153, - "language_loss": 0.80836952, - "learning_rate": 4.704223662500806e-08, - "loss": 0.82963496, - "num_input_tokens_seen": 334782680, - "step": 15516, - "time_per_iteration": 2.6183342933654785 - }, - { - "auxiliary_loss_clip": 0.01070452, - "auxiliary_loss_mlp": 0.01039697, - "balance_loss_clip": 1.03149724, - "balance_loss_mlp": 1.02574384, - "epoch": 0.9329325116488802, - "flos": 20261770041600.0, - "grad_norm": 1.6405205567812482, - "language_loss": 0.80559999, - "learning_rate": 4.695830062703643e-08, - "loss": 0.82670152, - "num_input_tokens_seen": 334800160, - "step": 15517, - "time_per_iteration": 2.6957180500030518 - }, - { - "auxiliary_loss_clip": 0.0108811, - "auxiliary_loss_mlp": 0.01031178, - "balance_loss_clip": 1.03524601, - "balance_loss_mlp": 1.01821351, - "epoch": 0.9329926349015482, - "flos": 13115008609920.0, - "grad_norm": 2.5144620557591364, - "language_loss": 0.74485952, - "learning_rate": 4.687443868860219e-08, - "loss": 0.76605237, - "num_input_tokens_seen": 334815840, - "step": 15518, - "time_per_iteration": 2.6164944171905518 - }, - { - "auxiliary_loss_clip": 0.01083865, - "auxiliary_loss_mlp": 0.01042053, - "balance_loss_clip": 1.03354347, - "balance_loss_mlp": 1.02916634, - "epoch": 0.9330527581542162, - "flos": 23040250778880.0, - "grad_norm": 2.0399988917234904, - "language_loss": 0.76014853, - "learning_rate": 4.679065081288458e-08, - "loss": 0.78140771, - "num_input_tokens_seen": 334834735, - "step": 15519, - "time_per_iteration": 4.253001689910889 - }, - { - "auxiliary_loss_clip": 0.01053866, - "auxiliary_loss_mlp": 0.01036997, - "balance_loss_clip": 1.0320313, - "balance_loss_mlp": 1.02326381, - "epoch": 0.9331128814068841, - "flos": 15559627409280.0, - "grad_norm": 2.031373785693728, - "language_loss": 0.83167887, - "learning_rate": 4.6706937003061275e-08, - "loss": 0.85258746, - "num_input_tokens_seen": 334853490, - "step": 15520, - "time_per_iteration": 4.2622270584106445 - }, - { - "auxiliary_loss_clip": 0.01096231, - "auxiliary_loss_mlp": 0.01030529, - "balance_loss_clip": 1.03505969, - "balance_loss_mlp": 1.01851249, - "epoch": 0.9331730046595521, - "flos": 22271762275200.0, - "grad_norm": 1.641224021105838, - "language_loss": 0.76203525, - "learning_rate": 4.6623297262306846e-08, - "loss": 0.78330284, - "num_input_tokens_seen": 334873675, - "step": 15521, - "time_per_iteration": 2.6855099201202393 - }, - { - "auxiliary_loss_clip": 0.01098694, - "auxiliary_loss_mlp": 0.01030692, - "balance_loss_clip": 1.03746796, - "balance_loss_mlp": 1.01878297, - "epoch": 0.93323312791222, - "flos": 15777641007360.0, - "grad_norm": 1.8982902543298203, - "language_loss": 0.77620465, - "learning_rate": 4.6539731593792545e-08, - "loss": 0.79749846, - "num_input_tokens_seen": 334890970, - "step": 15522, - "time_per_iteration": 4.228564977645874 - }, - { - "auxiliary_loss_clip": 0.01075483, - "auxiliary_loss_mlp": 0.00770903, - "balance_loss_clip": 1.03559947, - "balance_loss_mlp": 1.00036263, - "epoch": 0.933293251164888, - "flos": 22010978557440.0, - "grad_norm": 2.1529045189336076, - "language_loss": 0.62858284, - "learning_rate": 4.6456240000687373e-08, - "loss": 0.64704674, - "num_input_tokens_seen": 334906635, - "step": 15523, - "time_per_iteration": 2.720323324203491 - }, - { - "auxiliary_loss_clip": 0.01085105, - "auxiliary_loss_mlp": 0.01031084, - "balance_loss_clip": 1.0346992, - "balance_loss_mlp": 1.0190022, - "epoch": 0.933353374417556, - "flos": 26031358074240.0, - "grad_norm": 2.0555270470512035, - "language_loss": 0.6804812, - "learning_rate": 4.63728224861577e-08, - "loss": 0.70164317, - "num_input_tokens_seen": 334926230, - "step": 15524, - "time_per_iteration": 4.186232805252075 - }, - { - "auxiliary_loss_clip": 0.01065918, - "auxiliary_loss_mlp": 0.01036023, - "balance_loss_clip": 1.03418577, - "balance_loss_mlp": 1.02345872, - "epoch": 0.933413497670224, - "flos": 24900100162560.0, - "grad_norm": 1.5473346576932632, - "language_loss": 0.73752666, - "learning_rate": 4.628947905336589e-08, - "loss": 0.75854605, - "num_input_tokens_seen": 334946680, - "step": 15525, - "time_per_iteration": 2.740737199783325 - }, - { - "auxiliary_loss_clip": 0.01054757, - "auxiliary_loss_mlp": 0.01041762, - "balance_loss_clip": 1.03350389, - "balance_loss_mlp": 1.02915573, - "epoch": 0.9334736209228919, - "flos": 23688689051520.0, - "grad_norm": 1.733935635799957, - "language_loss": 0.83531857, - "learning_rate": 4.6206209705473175e-08, - "loss": 0.85628378, - "num_input_tokens_seen": 334964785, - "step": 15526, - "time_per_iteration": 2.6978201866149902 - }, - { - "auxiliary_loss_clip": 0.01062457, - "auxiliary_loss_mlp": 0.01033558, - "balance_loss_clip": 1.03334141, - "balance_loss_mlp": 1.02088642, - "epoch": 0.9335337441755599, - "flos": 15377344865280.0, - "grad_norm": 1.8748787634103812, - "language_loss": 0.69386899, - "learning_rate": 4.61230144456366e-08, - "loss": 0.71482921, - "num_input_tokens_seen": 334982400, - "step": 15527, - "time_per_iteration": 2.7441039085388184 - }, - { - "auxiliary_loss_clip": 0.01110964, - "auxiliary_loss_mlp": 0.01030308, - "balance_loss_clip": 1.03706026, - "balance_loss_mlp": 1.01640248, - "epoch": 0.9335938674282279, - "flos": 16106726436480.0, - "grad_norm": 2.0901783734577535, - "language_loss": 0.65065324, - "learning_rate": 4.603989327701141e-08, - "loss": 0.67206597, - "num_input_tokens_seen": 334999685, - "step": 15528, - "time_per_iteration": 2.643949270248413 - }, - { - "auxiliary_loss_clip": 0.01110618, - "auxiliary_loss_mlp": 0.01029994, - "balance_loss_clip": 1.03654647, - "balance_loss_mlp": 1.01733375, - "epoch": 0.9336539906808958, - "flos": 18952898353920.0, - "grad_norm": 1.8767297797851592, - "language_loss": 0.74917662, - "learning_rate": 4.5956846202748867e-08, - "loss": 0.77058274, - "num_input_tokens_seen": 335019160, - "step": 15529, - "time_per_iteration": 2.634995698928833 - }, - { - "auxiliary_loss_clip": 0.01062705, - "auxiliary_loss_mlp": 0.01032394, - "balance_loss_clip": 1.03274131, - "balance_loss_mlp": 1.0203954, - "epoch": 0.9337141139335638, - "flos": 18109104986880.0, - "grad_norm": 1.748892845656801, - "language_loss": 0.62968796, - "learning_rate": 4.5873873225998674e-08, - "loss": 0.65063894, - "num_input_tokens_seen": 335037350, - "step": 15530, - "time_per_iteration": 2.7044005393981934 - }, - { - "auxiliary_loss_clip": 0.01088546, - "auxiliary_loss_mlp": 0.01028529, - "balance_loss_clip": 1.03820205, - "balance_loss_mlp": 1.01650035, - "epoch": 0.9337742371862318, - "flos": 17345716214400.0, - "grad_norm": 1.6832056860999263, - "language_loss": 0.72579157, - "learning_rate": 4.5790974349907194e-08, - "loss": 0.74696231, - "num_input_tokens_seen": 335056060, - "step": 15531, - "time_per_iteration": 2.6301660537719727 - }, - { - "auxiliary_loss_clip": 0.01085265, - "auxiliary_loss_mlp": 0.01030468, - "balance_loss_clip": 1.0341419, - "balance_loss_mlp": 1.01802897, - "epoch": 0.9338343604388998, - "flos": 29058986522880.0, - "grad_norm": 1.6070200509837302, - "language_loss": 0.7085079, - "learning_rate": 4.5708149577617925e-08, - "loss": 0.72966528, - "num_input_tokens_seen": 335075410, - "step": 15532, - "time_per_iteration": 2.6982882022857666 - }, - { - "auxiliary_loss_clip": 0.01110813, - "auxiliary_loss_mlp": 0.00770577, - "balance_loss_clip": 1.03735983, - "balance_loss_mlp": 1.00021112, - "epoch": 0.9338944836915677, - "flos": 18660908695680.0, - "grad_norm": 1.6762698781307237, - "language_loss": 0.73232746, - "learning_rate": 4.5625398912271016e-08, - "loss": 0.75114131, - "num_input_tokens_seen": 335095190, - "step": 15533, - "time_per_iteration": 2.570868730545044 - }, - { - "auxiliary_loss_clip": 0.0107118, - "auxiliary_loss_mlp": 0.01029018, - "balance_loss_clip": 1.03274035, - "balance_loss_mlp": 1.01729369, - "epoch": 0.9339546069442357, - "flos": 16617735273600.0, - "grad_norm": 1.7512837189908117, - "language_loss": 0.7965132, - "learning_rate": 4.554272235700507e-08, - "loss": 0.81751519, - "num_input_tokens_seen": 335113825, - "step": 15534, - "time_per_iteration": 2.659533977508545 - }, - { - "auxiliary_loss_clip": 0.01104268, - "auxiliary_loss_mlp": 0.0102812, - "balance_loss_clip": 1.03758454, - "balance_loss_mlp": 1.01707494, - "epoch": 0.9340147301969036, - "flos": 23693106424320.0, - "grad_norm": 1.785516509672164, - "language_loss": 0.74561787, - "learning_rate": 4.546011991495513e-08, - "loss": 0.76694173, - "num_input_tokens_seen": 335136425, - "step": 15535, - "time_per_iteration": 2.615487575531006 - }, - { - "auxiliary_loss_clip": 0.01094475, - "auxiliary_loss_mlp": 0.0102863, - "balance_loss_clip": 1.03818846, - "balance_loss_mlp": 1.0162499, - "epoch": 0.9340748534495716, - "flos": 28654452576000.0, - "grad_norm": 2.554180895365387, - "language_loss": 0.77858245, - "learning_rate": 4.537759158925292e-08, - "loss": 0.79981351, - "num_input_tokens_seen": 335157925, - "step": 15536, - "time_per_iteration": 2.6514716148376465 - }, - { - "auxiliary_loss_clip": 0.01078909, - "auxiliary_loss_mlp": 0.01027656, - "balance_loss_clip": 1.0358901, - "balance_loss_mlp": 1.01566911, - "epoch": 0.9341349767022396, - "flos": 24899633285760.0, - "grad_norm": 1.5285441088297342, - "language_loss": 0.80702901, - "learning_rate": 4.5295137383028593e-08, - "loss": 0.82809466, - "num_input_tokens_seen": 335177840, - "step": 15537, - "time_per_iteration": 2.71079683303833 - }, - { - "auxiliary_loss_clip": 0.01089177, - "auxiliary_loss_mlp": 0.01033529, - "balance_loss_clip": 1.03725171, - "balance_loss_mlp": 1.02132761, - "epoch": 0.9341950999549076, - "flos": 29059525226880.0, - "grad_norm": 2.04950932055524, - "language_loss": 0.77909076, - "learning_rate": 4.5212757299408764e-08, - "loss": 0.80031782, - "num_input_tokens_seen": 335199470, - "step": 15538, - "time_per_iteration": 2.7233970165252686 - }, - { - "auxiliary_loss_clip": 0.01080561, - "auxiliary_loss_mlp": 0.01028684, - "balance_loss_clip": 1.03509426, - "balance_loss_mlp": 1.01653659, - "epoch": 0.9342552232075755, - "flos": 23587062497280.0, - "grad_norm": 1.7693540282121059, - "language_loss": 0.73224825, - "learning_rate": 4.513045134151672e-08, - "loss": 0.75334066, - "num_input_tokens_seen": 335218885, - "step": 15539, - "time_per_iteration": 2.7510504722595215 - }, - { - "auxiliary_loss_clip": 0.01063064, - "auxiliary_loss_mlp": 0.01030385, - "balance_loss_clip": 1.03815532, - "balance_loss_mlp": 1.01935768, - "epoch": 0.9343153464602435, - "flos": 36721389646080.0, - "grad_norm": 1.501458356905732, - "language_loss": 0.64681369, - "learning_rate": 4.504821951247373e-08, - "loss": 0.66774815, - "num_input_tokens_seen": 335239485, - "step": 15540, - "time_per_iteration": 2.8745980262756348 - }, - { - "auxiliary_loss_clip": 0.01095708, - "auxiliary_loss_mlp": 0.0103328, - "balance_loss_clip": 1.03505111, - "balance_loss_mlp": 1.02149022, - "epoch": 0.9343754697129115, - "flos": 22236498097920.0, - "grad_norm": 1.8557589560760206, - "language_loss": 0.76556802, - "learning_rate": 4.496606181539864e-08, - "loss": 0.78685796, - "num_input_tokens_seen": 335258355, - "step": 15541, - "time_per_iteration": 2.651571035385132 - }, - { - "auxiliary_loss_clip": 0.01096825, - "auxiliary_loss_mlp": 0.01033897, - "balance_loss_clip": 1.03985929, - "balance_loss_mlp": 1.02145147, - "epoch": 0.9344355929655794, - "flos": 29710333797120.0, - "grad_norm": 1.9744538682632873, - "language_loss": 0.66810614, - "learning_rate": 4.4883978253406066e-08, - "loss": 0.68941331, - "num_input_tokens_seen": 335276835, - "step": 15542, - "time_per_iteration": 2.760667085647583 - }, - { - "auxiliary_loss_clip": 0.01065848, - "auxiliary_loss_mlp": 0.01029314, - "balance_loss_clip": 1.03444171, - "balance_loss_mlp": 1.01672554, - "epoch": 0.9344957162182475, - "flos": 18880394751360.0, - "grad_norm": 1.8654482805757453, - "language_loss": 0.69444913, - "learning_rate": 4.480196882960907e-08, - "loss": 0.71540076, - "num_input_tokens_seen": 335296220, - "step": 15543, - "time_per_iteration": 2.7620866298675537 - }, - { - "auxiliary_loss_clip": 0.0109899, - "auxiliary_loss_mlp": 0.01029553, - "balance_loss_clip": 1.03395653, - "balance_loss_mlp": 1.01592147, - "epoch": 0.9345558394709154, - "flos": 27417761268480.0, - "grad_norm": 2.0257017035114493, - "language_loss": 0.69519067, - "learning_rate": 4.4720033547117394e-08, - "loss": 0.71647608, - "num_input_tokens_seen": 335316335, - "step": 15544, - "time_per_iteration": 2.7088634967803955 - }, - { - "auxiliary_loss_clip": 0.01094451, - "auxiliary_loss_mlp": 0.01046236, - "balance_loss_clip": 1.03500128, - "balance_loss_mlp": 1.03233039, - "epoch": 0.9346159627235834, - "flos": 20741285629440.0, - "grad_norm": 1.7227662917872677, - "language_loss": 0.77327919, - "learning_rate": 4.463817240903789e-08, - "loss": 0.79468608, - "num_input_tokens_seen": 335335545, - "step": 15545, - "time_per_iteration": 2.630438804626465 - }, - { - "auxiliary_loss_clip": 0.0109898, - "auxiliary_loss_mlp": 0.01026698, - "balance_loss_clip": 1.03614378, - "balance_loss_mlp": 1.01519418, - "epoch": 0.9346760859762513, - "flos": 21069221823360.0, - "grad_norm": 1.7337615176350853, - "language_loss": 0.68626702, - "learning_rate": 4.455638541847495e-08, - "loss": 0.70752382, - "num_input_tokens_seen": 335355350, - "step": 15546, - "time_per_iteration": 2.5841619968414307 - }, - { - "auxiliary_loss_clip": 0.01066558, - "auxiliary_loss_mlp": 0.01029596, - "balance_loss_clip": 1.03200841, - "balance_loss_mlp": 1.01754951, - "epoch": 0.9347362092289193, - "flos": 29204927481600.0, - "grad_norm": 2.0917837457460466, - "language_loss": 0.82409191, - "learning_rate": 4.447467257852966e-08, - "loss": 0.84505343, - "num_input_tokens_seen": 335375160, - "step": 15547, - "time_per_iteration": 2.737194538116455 - }, - { - "auxiliary_loss_clip": 0.01089071, - "auxiliary_loss_mlp": 0.01038533, - "balance_loss_clip": 1.03189087, - "balance_loss_mlp": 1.02542627, - "epoch": 0.9347963324815872, - "flos": 19427350124160.0, - "grad_norm": 1.9497482945485352, - "language_loss": 0.83475363, - "learning_rate": 4.439303389230087e-08, - "loss": 0.85602963, - "num_input_tokens_seen": 335394080, - "step": 15548, - "time_per_iteration": 2.550107479095459 - }, - { - "auxiliary_loss_clip": 0.01101099, - "auxiliary_loss_mlp": 0.01037896, - "balance_loss_clip": 1.03632116, - "balance_loss_mlp": 1.0238775, - "epoch": 0.9348564557342552, - "flos": 36901840596480.0, - "grad_norm": 1.5421911327365105, - "language_loss": 0.65587002, - "learning_rate": 4.4311469362884326e-08, - "loss": 0.67725998, - "num_input_tokens_seen": 335414230, - "step": 15549, - "time_per_iteration": 2.7219295501708984 - }, - { - "auxiliary_loss_clip": 0.01101825, - "auxiliary_loss_mlp": 0.01037163, - "balance_loss_clip": 1.03933716, - "balance_loss_mlp": 1.02354288, - "epoch": 0.9349165789869232, - "flos": 21690117342720.0, - "grad_norm": 2.010328548001079, - "language_loss": 0.80039644, - "learning_rate": 4.4229978993372665e-08, - "loss": 0.82178628, - "num_input_tokens_seen": 335432890, - "step": 15550, - "time_per_iteration": 2.640012741088867 - }, - { - "auxiliary_loss_clip": 0.01096493, - "auxiliary_loss_mlp": 0.01032466, - "balance_loss_clip": 1.0383265, - "balance_loss_mlp": 1.02041388, - "epoch": 0.9349767022395912, - "flos": 18844053166080.0, - "grad_norm": 1.681452605729496, - "language_loss": 0.75687659, - "learning_rate": 4.4148562786856524e-08, - "loss": 0.77816617, - "num_input_tokens_seen": 335452085, - "step": 15551, - "time_per_iteration": 2.584329843521118 - }, - { - "auxiliary_loss_clip": 0.01051893, - "auxiliary_loss_mlp": 0.01030912, - "balance_loss_clip": 1.03308678, - "balance_loss_mlp": 1.02025425, - "epoch": 0.9350368254922591, - "flos": 24973429777920.0, - "grad_norm": 1.499035355144879, - "language_loss": 0.73651052, - "learning_rate": 4.406722074642255e-08, - "loss": 0.75733852, - "num_input_tokens_seen": 335472130, - "step": 15552, - "time_per_iteration": 2.7739923000335693 - }, - { - "auxiliary_loss_clip": 0.01059946, - "auxiliary_loss_mlp": 0.01040283, - "balance_loss_clip": 1.03191781, - "balance_loss_mlp": 1.02666366, - "epoch": 0.9350969487449271, - "flos": 23070594792960.0, - "grad_norm": 1.5949406765998282, - "language_loss": 0.77295089, - "learning_rate": 4.3985952875155386e-08, - "loss": 0.79395318, - "num_input_tokens_seen": 335489970, - "step": 15553, - "time_per_iteration": 2.7346534729003906 - }, - { - "auxiliary_loss_clip": 0.01074123, - "auxiliary_loss_mlp": 0.01033891, - "balance_loss_clip": 1.03367734, - "balance_loss_mlp": 1.02047396, - "epoch": 0.9351570719975951, - "flos": 18625177641600.0, - "grad_norm": 1.630847703889005, - "language_loss": 0.78214866, - "learning_rate": 4.390475917613723e-08, - "loss": 0.8032288, - "num_input_tokens_seen": 335509125, - "step": 15554, - "time_per_iteration": 2.6710941791534424 - }, - { - "auxiliary_loss_clip": 0.01077218, - "auxiliary_loss_mlp": 0.01037117, - "balance_loss_clip": 1.03197753, - "balance_loss_mlp": 1.02502322, - "epoch": 0.935217195250263, - "flos": 15888353702400.0, - "grad_norm": 2.49632757150129, - "language_loss": 0.69451249, - "learning_rate": 4.382363965244695e-08, - "loss": 0.7156558, - "num_input_tokens_seen": 335525620, - "step": 15555, - "time_per_iteration": 2.6385841369628906 - }, - { - "auxiliary_loss_clip": 0.01014929, - "auxiliary_loss_mlp": 0.01045967, - "balance_loss_clip": 1.02853274, - "balance_loss_mlp": 1.0316503, - "epoch": 0.935277318502931, - "flos": 24390312387840.0, - "grad_norm": 1.5017504373533854, - "language_loss": 0.75400025, - "learning_rate": 4.374259430715965e-08, - "loss": 0.77460921, - "num_input_tokens_seen": 335547565, - "step": 15556, - "time_per_iteration": 3.059551477432251 - }, - { - "auxiliary_loss_clip": 0.01085152, - "auxiliary_loss_mlp": 0.01031847, - "balance_loss_clip": 1.03423309, - "balance_loss_mlp": 1.02010441, - "epoch": 0.935337441755599, - "flos": 27600259294080.0, - "grad_norm": 1.4976349419869153, - "language_loss": 0.72337437, - "learning_rate": 4.366162314334953e-08, - "loss": 0.74454439, - "num_input_tokens_seen": 335570285, - "step": 15557, - "time_per_iteration": 4.6448400020599365 - }, - { - "auxiliary_loss_clip": 0.01108474, - "auxiliary_loss_mlp": 0.01033173, - "balance_loss_clip": 1.0365355, - "balance_loss_mlp": 1.01982188, - "epoch": 0.935397565008267, - "flos": 20482872209280.0, - "grad_norm": 1.660550178775489, - "language_loss": 0.63404226, - "learning_rate": 4.358072616408681e-08, - "loss": 0.65545875, - "num_input_tokens_seen": 335588600, - "step": 15558, - "time_per_iteration": 2.6054418087005615 - }, - { - "auxiliary_loss_clip": 0.01087055, - "auxiliary_loss_mlp": 0.01030987, - "balance_loss_clip": 1.03696275, - "balance_loss_mlp": 1.01723039, - "epoch": 0.9354576882609349, - "flos": 23654394541440.0, - "grad_norm": 1.8757208660532867, - "language_loss": 0.72988653, - "learning_rate": 4.34999033724388e-08, - "loss": 0.75106692, - "num_input_tokens_seen": 335606235, - "step": 15559, - "time_per_iteration": 2.6042425632476807 - }, - { - "auxiliary_loss_clip": 0.01053197, - "auxiliary_loss_mlp": 0.00769565, - "balance_loss_clip": 1.03214157, - "balance_loss_mlp": 1.00029421, - "epoch": 0.9355178115136029, - "flos": 36684904406400.0, - "grad_norm": 2.2075476861746526, - "language_loss": 0.63396823, - "learning_rate": 4.341915477147062e-08, - "loss": 0.65219581, - "num_input_tokens_seen": 335628240, - "step": 15560, - "time_per_iteration": 4.612861633300781 - }, - { - "auxiliary_loss_clip": 0.01049187, - "auxiliary_loss_mlp": 0.01034698, - "balance_loss_clip": 1.03704762, - "balance_loss_mlp": 1.02041054, - "epoch": 0.9355779347662708, - "flos": 14460401450880.0, - "grad_norm": 2.3052566052568193, - "language_loss": 0.64168519, - "learning_rate": 4.3338480364244034e-08, - "loss": 0.66252398, - "num_input_tokens_seen": 335643755, - "step": 15561, - "time_per_iteration": 4.437899827957153 - }, - { - "auxiliary_loss_clip": 0.0110932, - "auxiliary_loss_mlp": 0.01036594, - "balance_loss_clip": 1.03827786, - "balance_loss_mlp": 1.02375484, - "epoch": 0.9356380580189388, - "flos": 23185976256000.0, - "grad_norm": 1.6937389446463813, - "language_loss": 0.75591785, - "learning_rate": 4.325788015381859e-08, - "loss": 0.77737701, - "num_input_tokens_seen": 335665160, - "step": 15562, - "time_per_iteration": 2.7620413303375244 - }, - { - "auxiliary_loss_clip": 0.01016437, - "auxiliary_loss_mlp": 0.01002066, - "balance_loss_clip": 1.00517988, - "balance_loss_mlp": 1.0011481, - "epoch": 0.9356981812716068, - "flos": 67471626090240.0, - "grad_norm": 0.9484711819717426, - "language_loss": 0.62294793, - "learning_rate": 4.31773541432503e-08, - "loss": 0.64313298, - "num_input_tokens_seen": 335715240, - "step": 15563, - "time_per_iteration": 4.560046672821045 - }, - { - "auxiliary_loss_clip": 0.01059821, - "auxiliary_loss_mlp": 0.01032447, - "balance_loss_clip": 1.0360657, - "balance_loss_mlp": 1.02043045, - "epoch": 0.9357583045242748, - "flos": 24681619687680.0, - "grad_norm": 1.6275464297875282, - "language_loss": 0.78383303, - "learning_rate": 4.3096902335592714e-08, - "loss": 0.80475569, - "num_input_tokens_seen": 335734970, - "step": 15564, - "time_per_iteration": 2.7581684589385986 - }, - { - "auxiliary_loss_clip": 0.01111071, - "auxiliary_loss_mlp": 0.0103029, - "balance_loss_clip": 1.0369916, - "balance_loss_mlp": 1.0166707, - "epoch": 0.9358184277769427, - "flos": 19463727623040.0, - "grad_norm": 2.0197933120923164, - "language_loss": 0.78051835, - "learning_rate": 4.301652473389694e-08, - "loss": 0.80193192, - "num_input_tokens_seen": 335753435, - "step": 15565, - "time_per_iteration": 2.6100919246673584 - }, - { - "auxiliary_loss_clip": 0.01094214, - "auxiliary_loss_mlp": 0.01031155, - "balance_loss_clip": 1.03455317, - "balance_loss_mlp": 1.01927018, - "epoch": 0.9358785510296107, - "flos": 18916987731840.0, - "grad_norm": 3.310186857599268, - "language_loss": 0.72122169, - "learning_rate": 4.2936221341210774e-08, - "loss": 0.74247533, - "num_input_tokens_seen": 335772105, - "step": 15566, - "time_per_iteration": 2.5962870121002197 - }, - { - "auxiliary_loss_clip": 0.0106957, - "auxiliary_loss_mlp": 0.00771396, - "balance_loss_clip": 1.03290153, - "balance_loss_mlp": 1.00026023, - "epoch": 0.9359386742822787, - "flos": 23441265192960.0, - "grad_norm": 1.9305985811175064, - "language_loss": 0.67621976, - "learning_rate": 4.285599216057889e-08, - "loss": 0.69462943, - "num_input_tokens_seen": 335789125, - "step": 15567, - "time_per_iteration": 2.6770172119140625 - }, - { - "auxiliary_loss_clip": 0.01078108, - "auxiliary_loss_mlp": 0.0103394, - "balance_loss_clip": 1.03551221, - "balance_loss_mlp": 1.02124989, - "epoch": 0.9359987975349466, - "flos": 32744067557760.0, - "grad_norm": 3.1950642417815778, - "language_loss": 0.62192923, - "learning_rate": 4.277583719504418e-08, - "loss": 0.64304972, - "num_input_tokens_seen": 335810995, - "step": 15568, - "time_per_iteration": 2.7253639698028564 - }, - { - "auxiliary_loss_clip": 0.01082433, - "auxiliary_loss_mlp": 0.01037861, - "balance_loss_clip": 1.03121352, - "balance_loss_mlp": 1.02551699, - "epoch": 0.9360589207876147, - "flos": 22819651401600.0, - "grad_norm": 1.8305652991188055, - "language_loss": 0.7874766, - "learning_rate": 4.269575644764556e-08, - "loss": 0.80867952, - "num_input_tokens_seen": 335830580, - "step": 15569, - "time_per_iteration": 2.648876905441284 - }, - { - "auxiliary_loss_clip": 0.01090445, - "auxiliary_loss_mlp": 0.01033037, - "balance_loss_clip": 1.03811383, - "balance_loss_mlp": 1.02041864, - "epoch": 0.9361190440402826, - "flos": 20885251340160.0, - "grad_norm": 3.2418615597680263, - "language_loss": 0.697613, - "learning_rate": 4.261574992142014e-08, - "loss": 0.71884787, - "num_input_tokens_seen": 335846515, - "step": 15570, - "time_per_iteration": 2.695789337158203 - }, - { - "auxiliary_loss_clip": 0.0109347, - "auxiliary_loss_mlp": 0.01030501, - "balance_loss_clip": 1.03646827, - "balance_loss_mlp": 1.0180912, - "epoch": 0.9361791672929506, - "flos": 19317822577920.0, - "grad_norm": 3.942506001346151, - "language_loss": 0.78369403, - "learning_rate": 4.2535817619401726e-08, - "loss": 0.80493373, - "num_input_tokens_seen": 335863350, - "step": 15571, - "time_per_iteration": 2.613274335861206 - }, - { - "auxiliary_loss_clip": 0.01076748, - "auxiliary_loss_mlp": 0.01031253, - "balance_loss_clip": 1.03460646, - "balance_loss_mlp": 1.01874197, - "epoch": 0.9362392905456185, - "flos": 15158182032000.0, - "grad_norm": 2.841657798727435, - "language_loss": 0.77677691, - "learning_rate": 4.2455959544621224e-08, - "loss": 0.79785693, - "num_input_tokens_seen": 335880510, - "step": 15572, - "time_per_iteration": 2.803063154220581 - }, - { - "auxiliary_loss_clip": 0.01082647, - "auxiliary_loss_mlp": 0.01041561, - "balance_loss_clip": 1.03344643, - "balance_loss_mlp": 1.0294075, - "epoch": 0.9362994137982865, - "flos": 22085888371200.0, - "grad_norm": 1.8952922672820693, - "language_loss": 0.78173578, - "learning_rate": 4.237617570010688e-08, - "loss": 0.80297786, - "num_input_tokens_seen": 335899440, - "step": 15573, - "time_per_iteration": 2.64709734916687 - }, - { - "auxiliary_loss_clip": 0.01072731, - "auxiliary_loss_mlp": 0.01028381, - "balance_loss_clip": 1.03296494, - "balance_loss_mlp": 1.01635885, - "epoch": 0.9363595370509544, - "flos": 23512260424320.0, - "grad_norm": 2.4715293938233316, - "language_loss": 0.74473417, - "learning_rate": 4.2296466088884044e-08, - "loss": 0.76574528, - "num_input_tokens_seen": 335919540, - "step": 15574, - "time_per_iteration": 2.8169214725494385 - }, - { - "auxiliary_loss_clip": 0.01050172, - "auxiliary_loss_mlp": 0.01035605, - "balance_loss_clip": 1.03272486, - "balance_loss_mlp": 1.02266467, - "epoch": 0.9364196603036224, - "flos": 27123473139840.0, - "grad_norm": 1.920556373302248, - "language_loss": 0.68192244, - "learning_rate": 4.221683071397564e-08, - "loss": 0.70278013, - "num_input_tokens_seen": 335939665, - "step": 15575, - "time_per_iteration": 2.798386573791504 - }, - { - "auxiliary_loss_clip": 0.01078254, - "auxiliary_loss_mlp": 0.01033935, - "balance_loss_clip": 1.03272521, - "balance_loss_mlp": 1.02136481, - "epoch": 0.9364797835562904, - "flos": 18479057114880.0, - "grad_norm": 1.7600184524514564, - "language_loss": 0.65193367, - "learning_rate": 4.2137269578401026e-08, - "loss": 0.67305553, - "num_input_tokens_seen": 335958580, - "step": 15576, - "time_per_iteration": 2.6554365158081055 - }, - { - "auxiliary_loss_clip": 0.01093147, - "auxiliary_loss_mlp": 0.01030045, - "balance_loss_clip": 1.03174019, - "balance_loss_mlp": 1.0161159, - "epoch": 0.9365399068089584, - "flos": 13005552890880.0, - "grad_norm": 2.420160931511476, - "language_loss": 0.76176679, - "learning_rate": 4.2057782685177566e-08, - "loss": 0.78299868, - "num_input_tokens_seen": 335974965, - "step": 15577, - "time_per_iteration": 2.5658376216888428 - }, - { - "auxiliary_loss_clip": 0.01062399, - "auxiliary_loss_mlp": 0.01030216, - "balance_loss_clip": 1.03228045, - "balance_loss_mlp": 1.01722205, - "epoch": 0.9366000300616263, - "flos": 25666433850240.0, - "grad_norm": 3.270982347260187, - "language_loss": 0.52259952, - "learning_rate": 4.1978370037318855e-08, - "loss": 0.5435257, - "num_input_tokens_seen": 335996575, - "step": 15578, - "time_per_iteration": 2.753800392150879 - }, - { - "auxiliary_loss_clip": 0.01044474, - "auxiliary_loss_mlp": 0.01035654, - "balance_loss_clip": 1.03016138, - "balance_loss_mlp": 1.02336335, - "epoch": 0.9366601533142943, - "flos": 21433355948160.0, - "grad_norm": 1.5769540357516516, - "language_loss": 0.70730215, - "learning_rate": 4.189903163783692e-08, - "loss": 0.7281034, - "num_input_tokens_seen": 336017265, - "step": 15579, - "time_per_iteration": 2.776789903640747 - }, - { - "auxiliary_loss_clip": 0.01081419, - "auxiliary_loss_mlp": 0.01027227, - "balance_loss_clip": 1.03318858, - "balance_loss_mlp": 1.01544309, - "epoch": 0.9367202765669622, - "flos": 24093222998400.0, - "grad_norm": 1.8470459873947023, - "language_loss": 0.76132309, - "learning_rate": 4.181976748973959e-08, - "loss": 0.78240955, - "num_input_tokens_seen": 336035905, - "step": 15580, - "time_per_iteration": 2.685457468032837 - }, - { - "auxiliary_loss_clip": 0.01097941, - "auxiliary_loss_mlp": 0.01031636, - "balance_loss_clip": 1.03599906, - "balance_loss_mlp": 1.01848698, - "epoch": 0.9367803998196302, - "flos": 20888842700160.0, - "grad_norm": 1.6988000782542536, - "language_loss": 0.66216934, - "learning_rate": 4.1740577596033114e-08, - "loss": 0.68346512, - "num_input_tokens_seen": 336055585, - "step": 15581, - "time_per_iteration": 2.642705202102661 - }, - { - "auxiliary_loss_clip": 0.01099156, - "auxiliary_loss_mlp": 0.01028235, - "balance_loss_clip": 1.03769445, - "balance_loss_mlp": 1.01575327, - "epoch": 0.9368405230722983, - "flos": 22564362464640.0, - "grad_norm": 1.6283591696925621, - "language_loss": 0.76962942, - "learning_rate": 4.166146195972042e-08, - "loss": 0.79090333, - "num_input_tokens_seen": 336076695, - "step": 15582, - "time_per_iteration": 2.6836650371551514 - }, - { - "auxiliary_loss_clip": 0.01033952, - "auxiliary_loss_mlp": 0.01033194, - "balance_loss_clip": 1.03131258, - "balance_loss_mlp": 1.02007508, - "epoch": 0.9369006463249662, - "flos": 18880215183360.0, - "grad_norm": 1.9959612516768654, - "language_loss": 0.73610139, - "learning_rate": 4.1582420583800905e-08, - "loss": 0.75677288, - "num_input_tokens_seen": 336094740, - "step": 15583, - "time_per_iteration": 2.9247751235961914 - }, - { - "auxiliary_loss_clip": 0.01113025, - "auxiliary_loss_mlp": 0.01031893, - "balance_loss_clip": 1.03807962, - "balance_loss_mlp": 1.01861954, - "epoch": 0.9369607695776342, - "flos": 26432516142720.0, - "grad_norm": 2.0019759787362417, - "language_loss": 0.84050167, - "learning_rate": 4.1503453471272376e-08, - "loss": 0.86195087, - "num_input_tokens_seen": 336113985, - "step": 15584, - "time_per_iteration": 2.7832884788513184 - }, - { - "auxiliary_loss_clip": 0.01098693, - "auxiliary_loss_mlp": 0.00771025, - "balance_loss_clip": 1.03800797, - "balance_loss_mlp": 1.00032699, - "epoch": 0.9370208928303021, - "flos": 39567346081920.0, - "grad_norm": 1.4532418436154226, - "language_loss": 0.72163695, - "learning_rate": 4.1424560625129334e-08, - "loss": 0.74033409, - "num_input_tokens_seen": 336136395, - "step": 15585, - "time_per_iteration": 2.81025767326355 - }, - { - "auxiliary_loss_clip": 0.01073011, - "auxiliary_loss_mlp": 0.01021827, - "balance_loss_clip": 1.03393424, - "balance_loss_mlp": 1.01078236, - "epoch": 0.9370810160829701, - "flos": 22963114321920.0, - "grad_norm": 1.7172742978336988, - "language_loss": 0.8027873, - "learning_rate": 4.134574204836316e-08, - "loss": 0.82373559, - "num_input_tokens_seen": 336156345, - "step": 15586, - "time_per_iteration": 2.66705322265625 - }, - { - "auxiliary_loss_clip": 0.01068881, - "auxiliary_loss_mlp": 0.01036223, - "balance_loss_clip": 1.03491676, - "balance_loss_mlp": 1.0236938, - "epoch": 0.937141139335638, - "flos": 23075048079360.0, - "grad_norm": 1.5900972808595355, - "language_loss": 0.76568019, - "learning_rate": 4.126699774396258e-08, - "loss": 0.78673124, - "num_input_tokens_seen": 336176760, - "step": 15587, - "time_per_iteration": 2.696638345718384 - }, - { - "auxiliary_loss_clip": 0.01089529, - "auxiliary_loss_mlp": 0.01037342, - "balance_loss_clip": 1.03515196, - "balance_loss_mlp": 1.02427721, - "epoch": 0.937201262588306, - "flos": 16356664247040.0, - "grad_norm": 1.8569642874741914, - "language_loss": 0.87623429, - "learning_rate": 4.118832771491387e-08, - "loss": 0.89750302, - "num_input_tokens_seen": 336193285, - "step": 15588, - "time_per_iteration": 2.6571919918060303 - }, - { - "auxiliary_loss_clip": 0.01106178, - "auxiliary_loss_mlp": 0.00770286, - "balance_loss_clip": 1.03689957, - "balance_loss_mlp": 1.0001812, - "epoch": 0.937261385840974, - "flos": 20194078861440.0, - "grad_norm": 1.9442823727757126, - "language_loss": 0.78136659, - "learning_rate": 4.11097319642002e-08, - "loss": 0.80013114, - "num_input_tokens_seen": 336211425, - "step": 15589, - "time_per_iteration": 2.5364420413970947 - }, - { - "auxiliary_loss_clip": 0.01106688, - "auxiliary_loss_mlp": 0.0103402, - "balance_loss_clip": 1.03706598, - "balance_loss_mlp": 1.02196836, - "epoch": 0.937321509093642, - "flos": 18295948558080.0, - "grad_norm": 1.7833559240011974, - "language_loss": 0.77980852, - "learning_rate": 4.103121049480163e-08, - "loss": 0.80121559, - "num_input_tokens_seen": 336230205, - "step": 15590, - "time_per_iteration": 2.5236690044403076 - }, - { - "auxiliary_loss_clip": 0.01079152, - "auxiliary_loss_mlp": 0.01038917, - "balance_loss_clip": 1.03359151, - "balance_loss_mlp": 1.02445698, - "epoch": 0.9373816323463099, - "flos": 25884662929920.0, - "grad_norm": 1.8039863283037736, - "language_loss": 0.71324873, - "learning_rate": 4.095276330969577e-08, - "loss": 0.73442948, - "num_input_tokens_seen": 336252440, - "step": 15591, - "time_per_iteration": 2.675104856491089 - }, - { - "auxiliary_loss_clip": 0.01097841, - "auxiliary_loss_mlp": 0.00771749, - "balance_loss_clip": 1.03754783, - "balance_loss_mlp": 1.00026131, - "epoch": 0.9374417555989779, - "flos": 27198849830400.0, - "grad_norm": 2.2483638992357844, - "language_loss": 0.53910917, - "learning_rate": 4.0874390411857804e-08, - "loss": 0.55780506, - "num_input_tokens_seen": 336273845, - "step": 15592, - "time_per_iteration": 2.620513439178467 - }, - { - "auxiliary_loss_clip": 0.01092328, - "auxiliary_loss_mlp": 0.01027667, - "balance_loss_clip": 1.03775334, - "balance_loss_mlp": 1.01602066, - "epoch": 0.9375018788516458, - "flos": 23621249266560.0, - "grad_norm": 1.5680593734812756, - "language_loss": 0.67480534, - "learning_rate": 4.0796091804259136e-08, - "loss": 0.69600528, - "num_input_tokens_seen": 336292790, - "step": 15593, - "time_per_iteration": 2.606893301010132 - }, - { - "auxiliary_loss_clip": 0.01086764, - "auxiliary_loss_mlp": 0.01028451, - "balance_loss_clip": 1.03426361, - "balance_loss_mlp": 1.01641703, - "epoch": 0.9375620021043138, - "flos": 22678774260480.0, - "grad_norm": 1.5375149732930165, - "language_loss": 0.74182671, - "learning_rate": 4.0717867489868715e-08, - "loss": 0.76297885, - "num_input_tokens_seen": 336312600, - "step": 15594, - "time_per_iteration": 2.6576709747314453 - }, - { - "auxiliary_loss_clip": 0.01093114, - "auxiliary_loss_mlp": 0.01027231, - "balance_loss_clip": 1.03431714, - "balance_loss_mlp": 1.01590586, - "epoch": 0.9376221253569819, - "flos": 27560254521600.0, - "grad_norm": 1.6954995365401158, - "language_loss": 0.74231362, - "learning_rate": 4.063971747165351e-08, - "loss": 0.76351708, - "num_input_tokens_seen": 336332770, - "step": 15595, - "time_per_iteration": 2.6582190990448 - }, - { - "auxiliary_loss_clip": 0.01080536, - "auxiliary_loss_mlp": 0.01029991, - "balance_loss_clip": 1.03524542, - "balance_loss_mlp": 1.01823688, - "epoch": 0.9376822486096498, - "flos": 24129887806080.0, - "grad_norm": 1.8418600900837818, - "language_loss": 0.75974333, - "learning_rate": 4.056164175257626e-08, - "loss": 0.78084862, - "num_input_tokens_seen": 336351445, - "step": 15596, - "time_per_iteration": 2.6803321838378906 - }, - { - "auxiliary_loss_clip": 0.01079836, - "auxiliary_loss_mlp": 0.01030847, - "balance_loss_clip": 1.03544092, - "balance_loss_mlp": 1.01825309, - "epoch": 0.9377423718623178, - "flos": 22784028088320.0, - "grad_norm": 1.7137269862110038, - "language_loss": 0.78881788, - "learning_rate": 4.0483640335597926e-08, - "loss": 0.80992472, - "num_input_tokens_seen": 336368690, - "step": 15597, - "time_per_iteration": 4.308673143386841 - }, - { - "auxiliary_loss_clip": 0.01113389, - "auxiliary_loss_mlp": 0.01033675, - "balance_loss_clip": 1.03775406, - "balance_loss_mlp": 1.02094936, - "epoch": 0.9378024951149857, - "flos": 19168900790400.0, - "grad_norm": 1.564327070136616, - "language_loss": 0.81037343, - "learning_rate": 4.0405713223676363e-08, - "loss": 0.83184403, - "num_input_tokens_seen": 336388165, - "step": 15598, - "time_per_iteration": 2.5458343029022217 - }, - { - "auxiliary_loss_clip": 0.01077427, - "auxiliary_loss_mlp": 0.01031736, - "balance_loss_clip": 1.03376913, - "balance_loss_mlp": 1.01846755, - "epoch": 0.9378626183676537, - "flos": 23505508667520.0, - "grad_norm": 2.005294265343008, - "language_loss": 0.62860727, - "learning_rate": 4.0327860419766994e-08, - "loss": 0.64969885, - "num_input_tokens_seen": 336406475, - "step": 15599, - "time_per_iteration": 2.638820171356201 - }, - { - "auxiliary_loss_clip": 0.01068952, - "auxiliary_loss_mlp": 0.01033888, - "balance_loss_clip": 1.03511238, - "balance_loss_mlp": 1.0210557, - "epoch": 0.9379227416203216, - "flos": 18405655672320.0, - "grad_norm": 1.8480598201397724, - "language_loss": 0.73232383, - "learning_rate": 4.0250081926821e-08, - "loss": 0.75335222, - "num_input_tokens_seen": 336424690, - "step": 15600, - "time_per_iteration": 6.016250848770142 - }, - { - "auxiliary_loss_clip": 0.01083039, - "auxiliary_loss_mlp": 0.01031732, - "balance_loss_clip": 1.03592873, - "balance_loss_mlp": 1.02013838, - "epoch": 0.9379828648729897, - "flos": 17821855923840.0, - "grad_norm": 1.7892851032269996, - "language_loss": 0.69339931, - "learning_rate": 4.0172377747788474e-08, - "loss": 0.71454704, - "num_input_tokens_seen": 336443055, - "step": 15601, - "time_per_iteration": 2.6296818256378174 - }, - { - "auxiliary_loss_clip": 0.01019215, - "auxiliary_loss_mlp": 0.01003727, - "balance_loss_clip": 1.00596642, - "balance_loss_mlp": 1.00267816, - "epoch": 0.9380429881256576, - "flos": 68024399466240.0, - "grad_norm": 0.7524579876237703, - "language_loss": 0.58074123, - "learning_rate": 4.009474788561573e-08, - "loss": 0.60097063, - "num_input_tokens_seen": 336510190, - "step": 15602, - "time_per_iteration": 3.3650712966918945 - }, - { - "auxiliary_loss_clip": 0.01035142, - "auxiliary_loss_mlp": 0.01039086, - "balance_loss_clip": 1.03298295, - "balance_loss_mlp": 1.02651513, - "epoch": 0.9381031113783256, - "flos": 20776980769920.0, - "grad_norm": 2.016134606608171, - "language_loss": 0.71942216, - "learning_rate": 4.001719234324663e-08, - "loss": 0.7401644, - "num_input_tokens_seen": 336529250, - "step": 15603, - "time_per_iteration": 4.292678356170654 - }, - { - "auxiliary_loss_clip": 0.01100161, - "auxiliary_loss_mlp": 0.01029251, - "balance_loss_clip": 1.03342152, - "balance_loss_mlp": 1.01796222, - "epoch": 0.9381632346309935, - "flos": 19025078734080.0, - "grad_norm": 1.630988444834905, - "language_loss": 0.76084709, - "learning_rate": 3.993971112362171e-08, - "loss": 0.78214121, - "num_input_tokens_seen": 336548530, - "step": 15604, - "time_per_iteration": 2.5863354206085205 - }, - { - "auxiliary_loss_clip": 0.01083382, - "auxiliary_loss_mlp": 0.01039836, - "balance_loss_clip": 1.03308749, - "balance_loss_mlp": 1.02494097, - "epoch": 0.9382233578836615, - "flos": 23513840622720.0, - "grad_norm": 2.0761522756468005, - "language_loss": 0.65524292, - "learning_rate": 3.9862304229679734e-08, - "loss": 0.67647505, - "num_input_tokens_seen": 336568510, - "step": 15605, - "time_per_iteration": 2.7903220653533936 - }, - { - "auxiliary_loss_clip": 0.01075306, - "auxiliary_loss_mlp": 0.00770626, - "balance_loss_clip": 1.03514504, - "balance_loss_mlp": 1.00017333, - "epoch": 0.9382834811363294, - "flos": 43067882016000.0, - "grad_norm": 2.098655820983203, - "language_loss": 0.67783493, - "learning_rate": 3.9784971664355683e-08, - "loss": 0.69629425, - "num_input_tokens_seen": 336592020, - "step": 15606, - "time_per_iteration": 2.8691816329956055 - }, - { - "auxiliary_loss_clip": 0.01091361, - "auxiliary_loss_mlp": 0.01027503, - "balance_loss_clip": 1.03324687, - "balance_loss_mlp": 1.01593983, - "epoch": 0.9383436043889974, - "flos": 16436242828800.0, - "grad_norm": 1.7643369071420325, - "language_loss": 0.77210492, - "learning_rate": 3.970771343058166e-08, - "loss": 0.7932936, - "num_input_tokens_seen": 336610010, - "step": 15607, - "time_per_iteration": 2.670970916748047 - }, - { - "auxiliary_loss_clip": 0.01098186, - "auxiliary_loss_mlp": 0.01027702, - "balance_loss_clip": 1.03540564, - "balance_loss_mlp": 1.01609111, - "epoch": 0.9384037276416655, - "flos": 20740603271040.0, - "grad_norm": 2.3923436535832927, - "language_loss": 0.82524753, - "learning_rate": 3.963052953128776e-08, - "loss": 0.84650642, - "num_input_tokens_seen": 336628520, - "step": 15608, - "time_per_iteration": 2.6184029579162598 - }, - { - "auxiliary_loss_clip": 0.01099685, - "auxiliary_loss_mlp": 0.01035669, - "balance_loss_clip": 1.0386703, - "balance_loss_mlp": 1.02291393, - "epoch": 0.9384638508943334, - "flos": 19062677295360.0, - "grad_norm": 1.6462700950548765, - "language_loss": 0.68830276, - "learning_rate": 3.9553419969400536e-08, - "loss": 0.7096563, - "num_input_tokens_seen": 336647365, - "step": 15609, - "time_per_iteration": 2.5987517833709717 - }, - { - "auxiliary_loss_clip": 0.01080403, - "auxiliary_loss_mlp": 0.01031835, - "balance_loss_clip": 1.03563523, - "balance_loss_mlp": 1.01835871, - "epoch": 0.9385239741470014, - "flos": 23404887694080.0, - "grad_norm": 2.499558460038554, - "language_loss": 0.75453949, - "learning_rate": 3.9476384747844316e-08, - "loss": 0.77566183, - "num_input_tokens_seen": 336667165, - "step": 15610, - "time_per_iteration": 2.7642691135406494 - }, - { - "auxiliary_loss_clip": 0.01044401, - "auxiliary_loss_mlp": 0.01027529, - "balance_loss_clip": 1.03432107, - "balance_loss_mlp": 1.0161804, - "epoch": 0.9385840973996693, - "flos": 12824742804480.0, - "grad_norm": 2.318341323536946, - "language_loss": 0.75083077, - "learning_rate": 3.939942386953987e-08, - "loss": 0.77155006, - "num_input_tokens_seen": 336684130, - "step": 15611, - "time_per_iteration": 2.753612518310547 - }, - { - "auxiliary_loss_clip": 0.01069021, - "auxiliary_loss_mlp": 0.01029199, - "balance_loss_clip": 1.03686237, - "balance_loss_mlp": 1.01732564, - "epoch": 0.9386442206523373, - "flos": 15486980152320.0, - "grad_norm": 1.8818734447956798, - "language_loss": 0.6593554, - "learning_rate": 3.9322537337405756e-08, - "loss": 0.68033767, - "num_input_tokens_seen": 336701520, - "step": 15612, - "time_per_iteration": 2.637763738632202 - }, - { - "auxiliary_loss_clip": 0.01095795, - "auxiliary_loss_mlp": 0.01028771, - "balance_loss_clip": 1.03593373, - "balance_loss_mlp": 1.01703501, - "epoch": 0.9387043439050052, - "flos": 21178821196800.0, - "grad_norm": 2.000722743721445, - "language_loss": 0.57039118, - "learning_rate": 3.924572515435742e-08, - "loss": 0.59163684, - "num_input_tokens_seen": 336720675, - "step": 15613, - "time_per_iteration": 2.733313798904419 - }, - { - "auxiliary_loss_clip": 0.01084485, - "auxiliary_loss_mlp": 0.01036056, - "balance_loss_clip": 1.03319824, - "balance_loss_mlp": 1.02405143, - "epoch": 0.9387644671576733, - "flos": 27668273696640.0, - "grad_norm": 2.367003168945266, - "language_loss": 0.70944715, - "learning_rate": 3.916898732330764e-08, - "loss": 0.73065257, - "num_input_tokens_seen": 336741005, - "step": 15614, - "time_per_iteration": 2.706362009048462 - }, - { - "auxiliary_loss_clip": 0.01101068, - "auxiliary_loss_mlp": 0.01031027, - "balance_loss_clip": 1.03795266, - "balance_loss_mlp": 1.018224, - "epoch": 0.9388245904103412, - "flos": 18836331742080.0, - "grad_norm": 1.9586081993753126, - "language_loss": 0.81213439, - "learning_rate": 3.9092323847166544e-08, - "loss": 0.83345532, - "num_input_tokens_seen": 336757990, - "step": 15615, - "time_per_iteration": 2.5509698390960693 - }, - { - "auxiliary_loss_clip": 0.01078844, - "auxiliary_loss_mlp": 0.01030958, - "balance_loss_clip": 1.03181601, - "balance_loss_mlp": 1.01881695, - "epoch": 0.9388847136630092, - "flos": 25483828083840.0, - "grad_norm": 1.8203668140159897, - "language_loss": 0.71924144, - "learning_rate": 3.901573472884134e-08, - "loss": 0.7403394, - "num_input_tokens_seen": 336777705, - "step": 15616, - "time_per_iteration": 2.6393303871154785 - }, - { - "auxiliary_loss_clip": 0.01108573, - "auxiliary_loss_mlp": 0.01029358, - "balance_loss_clip": 1.03755164, - "balance_loss_mlp": 1.01691222, - "epoch": 0.9389448369156771, - "flos": 18734992496640.0, - "grad_norm": 2.3142633085226536, - "language_loss": 0.66507453, - "learning_rate": 3.89392199712355e-08, - "loss": 0.68645382, - "num_input_tokens_seen": 336798275, - "step": 15617, - "time_per_iteration": 2.5801546573638916 - }, - { - "auxiliary_loss_clip": 0.01100466, - "auxiliary_loss_mlp": 0.01036181, - "balance_loss_clip": 1.03689265, - "balance_loss_mlp": 1.02243066, - "epoch": 0.9390049601683451, - "flos": 21717839664000.0, - "grad_norm": 2.370004086672154, - "language_loss": 0.73481232, - "learning_rate": 3.886277957725092e-08, - "loss": 0.7561788, - "num_input_tokens_seen": 336813835, - "step": 15618, - "time_per_iteration": 2.6102712154388428 - }, - { - "auxiliary_loss_clip": 0.01114877, - "auxiliary_loss_mlp": 0.01031951, - "balance_loss_clip": 1.03841376, - "balance_loss_mlp": 1.01817656, - "epoch": 0.939065083421013, - "flos": 19391224020480.0, - "grad_norm": 1.8942748596777075, - "language_loss": 0.70133412, - "learning_rate": 3.878641354978662e-08, - "loss": 0.7228024, - "num_input_tokens_seen": 336832210, - "step": 15619, - "time_per_iteration": 2.5149004459381104 - }, - { - "auxiliary_loss_clip": 0.01083274, - "auxiliary_loss_mlp": 0.01031368, - "balance_loss_clip": 1.03280878, - "balance_loss_mlp": 1.01836836, - "epoch": 0.939125206673681, - "flos": 24681511946880.0, - "grad_norm": 1.6109808579498737, - "language_loss": 0.7760632, - "learning_rate": 3.8710121891737834e-08, - "loss": 0.79720962, - "num_input_tokens_seen": 336851380, - "step": 15620, - "time_per_iteration": 2.6531193256378174 - }, - { - "auxiliary_loss_clip": 0.01092968, - "auxiliary_loss_mlp": 0.01027438, - "balance_loss_clip": 1.03448093, - "balance_loss_mlp": 1.01568961, - "epoch": 0.9391853299263491, - "flos": 16325961096960.0, - "grad_norm": 3.857357976396781, - "language_loss": 0.73641354, - "learning_rate": 3.8633904605998025e-08, - "loss": 0.75761759, - "num_input_tokens_seen": 336868525, - "step": 15621, - "time_per_iteration": 2.5519356727600098 - }, - { - "auxiliary_loss_clip": 0.01077862, - "auxiliary_loss_mlp": 0.01033031, - "balance_loss_clip": 1.03406405, - "balance_loss_mlp": 1.01961446, - "epoch": 0.939245453179017, - "flos": 11655778590720.0, - "grad_norm": 2.005738336602588, - "language_loss": 0.66011965, - "learning_rate": 3.855776169545688e-08, - "loss": 0.68122858, - "num_input_tokens_seen": 336886200, - "step": 15622, - "time_per_iteration": 2.649592876434326 - }, - { - "auxiliary_loss_clip": 0.01080227, - "auxiliary_loss_mlp": 0.01039822, - "balance_loss_clip": 1.03199553, - "balance_loss_mlp": 1.02594018, - "epoch": 0.939305576431685, - "flos": 23148700917120.0, - "grad_norm": 1.5853407957277033, - "language_loss": 0.71721888, - "learning_rate": 3.848169316300209e-08, - "loss": 0.73841941, - "num_input_tokens_seen": 336905815, - "step": 15623, - "time_per_iteration": 2.6309492588043213 - }, - { - "auxiliary_loss_clip": 0.01101847, - "auxiliary_loss_mlp": 0.01031834, - "balance_loss_clip": 1.03930306, - "balance_loss_mlp": 1.01934707, - "epoch": 0.9393656996843529, - "flos": 33287790706560.0, - "grad_norm": 1.923924688159949, - "language_loss": 0.72363102, - "learning_rate": 3.84056990115178e-08, - "loss": 0.74496788, - "num_input_tokens_seen": 336928460, - "step": 15624, - "time_per_iteration": 2.7837047576904297 - }, - { - "auxiliary_loss_clip": 0.01071928, - "auxiliary_loss_mlp": 0.01033178, - "balance_loss_clip": 1.03422618, - "balance_loss_mlp": 1.02049983, - "epoch": 0.9394258229370209, - "flos": 21689434984320.0, - "grad_norm": 2.3403915430461333, - "language_loss": 0.89429879, - "learning_rate": 3.832977924388614e-08, - "loss": 0.91534984, - "num_input_tokens_seen": 336948320, - "step": 15625, - "time_per_iteration": 2.7144711017608643 - }, - { - "auxiliary_loss_clip": 0.01096935, - "auxiliary_loss_mlp": 0.01031047, - "balance_loss_clip": 1.03645694, - "balance_loss_mlp": 1.01787996, - "epoch": 0.9394859461896888, - "flos": 23874203819520.0, - "grad_norm": 2.0332287450304074, - "language_loss": 0.83621097, - "learning_rate": 3.825393386298592e-08, - "loss": 0.85749084, - "num_input_tokens_seen": 336967670, - "step": 15626, - "time_per_iteration": 2.71279239654541 - }, - { - "auxiliary_loss_clip": 0.01012548, - "auxiliary_loss_mlp": 0.01006796, - "balance_loss_clip": 1.00825083, - "balance_loss_mlp": 1.00575864, - "epoch": 0.9395460694423569, - "flos": 61566116993280.0, - "grad_norm": 0.7779274792928904, - "language_loss": 0.56076801, - "learning_rate": 3.8178162871693284e-08, - "loss": 0.58096135, - "num_input_tokens_seen": 337028395, - "step": 15627, - "time_per_iteration": 3.1956591606140137 - }, - { - "auxiliary_loss_clip": 0.01058297, - "auxiliary_loss_mlp": 0.01041812, - "balance_loss_clip": 1.0335449, - "balance_loss_mlp": 1.02838874, - "epoch": 0.9396061926950248, - "flos": 20995712640000.0, - "grad_norm": 1.8076515347951383, - "language_loss": 0.70110631, - "learning_rate": 3.810246627288105e-08, - "loss": 0.72210741, - "num_input_tokens_seen": 337048150, - "step": 15628, - "time_per_iteration": 2.6945135593414307 - }, - { - "auxiliary_loss_clip": 0.01096653, - "auxiliary_loss_mlp": 0.01028484, - "balance_loss_clip": 1.03629029, - "balance_loss_mlp": 1.01632452, - "epoch": 0.9396663159476928, - "flos": 27487786832640.0, - "grad_norm": 1.4683164605088868, - "language_loss": 0.75408161, - "learning_rate": 3.8026844069420025e-08, - "loss": 0.77533293, - "num_input_tokens_seen": 337069315, - "step": 15629, - "time_per_iteration": 2.697967052459717 - }, - { - "auxiliary_loss_clip": 0.01044306, - "auxiliary_loss_mlp": 0.01039724, - "balance_loss_clip": 1.030352, - "balance_loss_mlp": 1.02693844, - "epoch": 0.9397264392003607, - "flos": 19427457864960.0, - "grad_norm": 1.8515111751581672, - "language_loss": 0.74173099, - "learning_rate": 3.795129626417748e-08, - "loss": 0.76257128, - "num_input_tokens_seen": 337087765, - "step": 15630, - "time_per_iteration": 2.7693710327148438 - }, - { - "auxiliary_loss_clip": 0.01073693, - "auxiliary_loss_mlp": 0.01035386, - "balance_loss_clip": 1.03482318, - "balance_loss_mlp": 1.02306604, - "epoch": 0.9397865624530287, - "flos": 18004820826240.0, - "grad_norm": 2.3868141185330485, - "language_loss": 0.69397956, - "learning_rate": 3.787582286001845e-08, - "loss": 0.71507031, - "num_input_tokens_seen": 337106265, - "step": 15631, - "time_per_iteration": 2.7210657596588135 - }, - { - "auxiliary_loss_clip": 0.01057041, - "auxiliary_loss_mlp": 0.01038236, - "balance_loss_clip": 1.0333792, - "balance_loss_mlp": 1.02626777, - "epoch": 0.9398466857056966, - "flos": 22564613859840.0, - "grad_norm": 1.5129301375877884, - "language_loss": 0.75246739, - "learning_rate": 3.7800423859805086e-08, - "loss": 0.77342016, - "num_input_tokens_seen": 337126090, - "step": 15632, - "time_per_iteration": 2.7409205436706543 - }, - { - "auxiliary_loss_clip": 0.01103425, - "auxiliary_loss_mlp": 0.01036148, - "balance_loss_clip": 1.03828955, - "balance_loss_mlp": 1.02260005, - "epoch": 0.9399068089583646, - "flos": 24535678728960.0, - "grad_norm": 1.6016514710570828, - "language_loss": 0.74265265, - "learning_rate": 3.772509926639622e-08, - "loss": 0.76404846, - "num_input_tokens_seen": 337145655, - "step": 15633, - "time_per_iteration": 2.5950539112091064 - }, - { - "auxiliary_loss_clip": 0.01110088, - "auxiliary_loss_mlp": 0.01034464, - "balance_loss_clip": 1.03653955, - "balance_loss_mlp": 1.0211246, - "epoch": 0.9399669322110327, - "flos": 25630343660160.0, - "grad_norm": 1.9203445491908095, - "language_loss": 0.72707498, - "learning_rate": 3.764984908264823e-08, - "loss": 0.74852049, - "num_input_tokens_seen": 337164805, - "step": 15634, - "time_per_iteration": 2.5872409343719482 - }, - { - "auxiliary_loss_clip": 0.01098967, - "auxiliary_loss_mlp": 0.01031223, - "balance_loss_clip": 1.03486001, - "balance_loss_mlp": 1.01823497, - "epoch": 0.9400270554637006, - "flos": 17089385783040.0, - "grad_norm": 2.514594285895435, - "language_loss": 0.68870479, - "learning_rate": 3.75746733114144e-08, - "loss": 0.71000671, - "num_input_tokens_seen": 337182280, - "step": 15635, - "time_per_iteration": 2.600447654724121 - }, - { - "auxiliary_loss_clip": 0.01056848, - "auxiliary_loss_mlp": 0.01029105, - "balance_loss_clip": 1.03640127, - "balance_loss_mlp": 1.01715422, - "epoch": 0.9400871787163686, - "flos": 22055113393920.0, - "grad_norm": 1.5676691824914186, - "language_loss": 0.74045342, - "learning_rate": 3.7499571955545985e-08, - "loss": 0.76131296, - "num_input_tokens_seen": 337203495, - "step": 15636, - "time_per_iteration": 2.6919074058532715 - }, - { - "auxiliary_loss_clip": 0.01099321, - "auxiliary_loss_mlp": 0.01032818, - "balance_loss_clip": 1.03794205, - "balance_loss_mlp": 1.02044368, - "epoch": 0.9401473019690365, - "flos": 16982767238400.0, - "grad_norm": 2.177328379740788, - "language_loss": 0.82646513, - "learning_rate": 3.7424545017890054e-08, - "loss": 0.84778643, - "num_input_tokens_seen": 337220435, - "step": 15637, - "time_per_iteration": 4.119058132171631 - }, - { - "auxiliary_loss_clip": 0.01065361, - "auxiliary_loss_mlp": 0.01033669, - "balance_loss_clip": 1.03724432, - "balance_loss_mlp": 1.02082443, - "epoch": 0.9402074252217045, - "flos": 19681956702720.0, - "grad_norm": 2.151104061404543, - "language_loss": 0.6892854, - "learning_rate": 3.7349592501292325e-08, - "loss": 0.71027565, - "num_input_tokens_seen": 337238095, - "step": 15638, - "time_per_iteration": 2.720820426940918 - }, - { - "auxiliary_loss_clip": 0.01093316, - "auxiliary_loss_mlp": 0.01038281, - "balance_loss_clip": 1.03545761, - "balance_loss_mlp": 1.02702212, - "epoch": 0.9402675484743724, - "flos": 24754302858240.0, - "grad_norm": 1.6914013320453911, - "language_loss": 0.84974968, - "learning_rate": 3.727471440859498e-08, - "loss": 0.87106568, - "num_input_tokens_seen": 337256645, - "step": 15639, - "time_per_iteration": 5.851804733276367 - }, - { - "auxiliary_loss_clip": 0.01083189, - "auxiliary_loss_mlp": 0.00770067, - "balance_loss_clip": 1.03247952, - "balance_loss_mlp": 1.00016117, - "epoch": 0.9403276717270405, - "flos": 25558630156800.0, - "grad_norm": 1.7768265457850463, - "language_loss": 0.78339088, - "learning_rate": 3.719991074263662e-08, - "loss": 0.80192345, - "num_input_tokens_seen": 337278360, - "step": 15640, - "time_per_iteration": 2.7363038063049316 - }, - { - "auxiliary_loss_clip": 0.01100045, - "auxiliary_loss_mlp": 0.0103256, - "balance_loss_clip": 1.03647268, - "balance_loss_mlp": 1.0201323, - "epoch": 0.9403877949797084, - "flos": 26689852154880.0, - "grad_norm": 1.8593795246940288, - "language_loss": 0.74102533, - "learning_rate": 3.7125181506254544e-08, - "loss": 0.76235145, - "num_input_tokens_seen": 337302480, - "step": 15641, - "time_per_iteration": 2.7518787384033203 - }, - { - "auxiliary_loss_clip": 0.0110061, - "auxiliary_loss_mlp": 0.01034479, - "balance_loss_clip": 1.03686595, - "balance_loss_mlp": 1.01987553, - "epoch": 0.9404479182323764, - "flos": 15011666455680.0, - "grad_norm": 2.217224537042475, - "language_loss": 0.8267206, - "learning_rate": 3.7050526702282256e-08, - "loss": 0.84807152, - "num_input_tokens_seen": 337316600, - "step": 15642, - "time_per_iteration": 4.0844972133636475 - }, - { - "auxiliary_loss_clip": 0.01090346, - "auxiliary_loss_mlp": 0.01030611, - "balance_loss_clip": 1.03500628, - "balance_loss_mlp": 1.01894593, - "epoch": 0.9405080414850443, - "flos": 24973573432320.0, - "grad_norm": 1.8062008321344256, - "language_loss": 0.68693364, - "learning_rate": 3.697594633355084e-08, - "loss": 0.70814323, - "num_input_tokens_seen": 337336895, - "step": 15643, - "time_per_iteration": 2.57680344581604 - }, - { - "auxiliary_loss_clip": 0.01098869, - "auxiliary_loss_mlp": 0.01036038, - "balance_loss_clip": 1.03659177, - "balance_loss_mlp": 1.02258563, - "epoch": 0.9405681647377123, - "flos": 20844743777280.0, - "grad_norm": 1.933266647542814, - "language_loss": 0.76611924, - "learning_rate": 3.6901440402888226e-08, - "loss": 0.78746843, - "num_input_tokens_seen": 337355105, - "step": 15644, - "time_per_iteration": 2.573357343673706 - }, - { - "auxiliary_loss_clip": 0.01090012, - "auxiliary_loss_mlp": 0.0103297, - "balance_loss_clip": 1.03489494, - "balance_loss_mlp": 1.02147841, - "epoch": 0.9406282879903802, - "flos": 23805578885760.0, - "grad_norm": 1.6178233820471488, - "language_loss": 0.67622656, - "learning_rate": 3.682700891311974e-08, - "loss": 0.69745636, - "num_input_tokens_seen": 337374905, - "step": 15645, - "time_per_iteration": 2.615952730178833 - }, - { - "auxiliary_loss_clip": 0.01077394, - "auxiliary_loss_mlp": 0.00769887, - "balance_loss_clip": 1.03552616, - "balance_loss_mlp": 1.00019598, - "epoch": 0.9406884112430483, - "flos": 27674953626240.0, - "grad_norm": 1.3954115728125809, - "language_loss": 0.70446187, - "learning_rate": 3.6752651867067774e-08, - "loss": 0.72293472, - "num_input_tokens_seen": 337397130, - "step": 15646, - "time_per_iteration": 2.6904642581939697 - }, - { - "auxiliary_loss_clip": 0.01090467, - "auxiliary_loss_mlp": 0.01031509, - "balance_loss_clip": 1.03259134, - "balance_loss_mlp": 1.0194633, - "epoch": 0.9407485344957163, - "flos": 23075048079360.0, - "grad_norm": 1.4996623163528855, - "language_loss": 0.74028134, - "learning_rate": 3.667836926755208e-08, - "loss": 0.76150107, - "num_input_tokens_seen": 337418660, - "step": 15647, - "time_per_iteration": 2.6018729209899902 - }, - { - "auxiliary_loss_clip": 0.01010109, - "auxiliary_loss_mlp": 0.01000406, - "balance_loss_clip": 1.00723958, - "balance_loss_mlp": 0.99945861, - "epoch": 0.9408086577483842, - "flos": 71014034304000.0, - "grad_norm": 0.8881598455052471, - "language_loss": 0.63527632, - "learning_rate": 3.660416111738907e-08, - "loss": 0.65538144, - "num_input_tokens_seen": 337478055, - "step": 15648, - "time_per_iteration": 3.2934350967407227 - }, - { - "auxiliary_loss_clip": 0.01104482, - "auxiliary_loss_mlp": 0.01034208, - "balance_loss_clip": 1.03579104, - "balance_loss_mlp": 1.02340806, - "epoch": 0.9408687810010522, - "flos": 23730956380800.0, - "grad_norm": 1.5536213392749576, - "language_loss": 0.66520309, - "learning_rate": 3.653002741939337e-08, - "loss": 0.68659002, - "num_input_tokens_seen": 337499405, - "step": 15649, - "time_per_iteration": 2.553529739379883 - }, - { - "auxiliary_loss_clip": 0.01075375, - "auxiliary_loss_mlp": 0.01026505, - "balance_loss_clip": 1.0331924, - "balance_loss_mlp": 1.01497114, - "epoch": 0.9409289042537201, - "flos": 18369314087040.0, - "grad_norm": 4.521362372265656, - "language_loss": 0.77431417, - "learning_rate": 3.645596817637586e-08, - "loss": 0.79533303, - "num_input_tokens_seen": 337517195, - "step": 15650, - "time_per_iteration": 2.665523052215576 - }, - { - "auxiliary_loss_clip": 0.01064771, - "auxiliary_loss_mlp": 0.01032458, - "balance_loss_clip": 1.03697872, - "balance_loss_mlp": 1.0203402, - "epoch": 0.9409890275063881, - "flos": 23878333883520.0, - "grad_norm": 2.2066131550931942, - "language_loss": 0.74314982, - "learning_rate": 3.638198339114451e-08, - "loss": 0.76412213, - "num_input_tokens_seen": 337535245, - "step": 15651, - "time_per_iteration": 2.790637969970703 - }, - { - "auxiliary_loss_clip": 0.0110668, - "auxiliary_loss_mlp": 0.01032091, - "balance_loss_clip": 1.03554559, - "balance_loss_mlp": 1.01934731, - "epoch": 0.941049150759056, - "flos": 16545088016640.0, - "grad_norm": 1.7324074128675258, - "language_loss": 0.72721291, - "learning_rate": 3.630807306650507e-08, - "loss": 0.74860054, - "num_input_tokens_seen": 337553040, - "step": 15652, - "time_per_iteration": 2.5541346073150635 - }, - { - "auxiliary_loss_clip": 0.01073797, - "auxiliary_loss_mlp": 0.01037686, - "balance_loss_clip": 1.03517735, - "balance_loss_mlp": 1.02407205, - "epoch": 0.9411092740117241, - "flos": 25118401069440.0, - "grad_norm": 1.8143834266468624, - "language_loss": 0.66641271, - "learning_rate": 3.6234237205260645e-08, - "loss": 0.68752754, - "num_input_tokens_seen": 337574580, - "step": 15653, - "time_per_iteration": 2.7330899238586426 - }, - { - "auxiliary_loss_clip": 0.01109084, - "auxiliary_loss_mlp": 0.01035177, - "balance_loss_clip": 1.03644657, - "balance_loss_mlp": 1.02239227, - "epoch": 0.941169397264392, - "flos": 21142264129920.0, - "grad_norm": 1.9183010885495058, - "language_loss": 0.77979028, - "learning_rate": 3.6160475810210536e-08, - "loss": 0.80123287, - "num_input_tokens_seen": 337593010, - "step": 15654, - "time_per_iteration": 2.5508615970611572 - }, - { - "auxiliary_loss_clip": 0.01104499, - "auxiliary_loss_mlp": 0.01029373, - "balance_loss_clip": 1.03763437, - "balance_loss_mlp": 1.01693344, - "epoch": 0.94122952051706, - "flos": 38508914995200.0, - "grad_norm": 1.5713777366197268, - "language_loss": 0.69984704, - "learning_rate": 3.6086788884152065e-08, - "loss": 0.7211858, - "num_input_tokens_seen": 337616170, - "step": 15655, - "time_per_iteration": 2.7416152954101562 - }, - { - "auxiliary_loss_clip": 0.01107647, - "auxiliary_loss_mlp": 0.01036833, - "balance_loss_clip": 1.03607106, - "balance_loss_mlp": 1.02323759, - "epoch": 0.9412896437697279, - "flos": 18369206346240.0, - "grad_norm": 5.7482907771024045, - "language_loss": 0.72394556, - "learning_rate": 3.601317642987944e-08, - "loss": 0.74539036, - "num_input_tokens_seen": 337635215, - "step": 15656, - "time_per_iteration": 2.569613456726074 - }, - { - "auxiliary_loss_clip": 0.01074485, - "auxiliary_loss_mlp": 0.0102962, - "balance_loss_clip": 1.03419089, - "balance_loss_mlp": 1.01772296, - "epoch": 0.9413497670223959, - "flos": 25884950238720.0, - "grad_norm": 1.8612272314279366, - "language_loss": 0.78241754, - "learning_rate": 3.593963845018377e-08, - "loss": 0.80345851, - "num_input_tokens_seen": 337654195, - "step": 15657, - "time_per_iteration": 2.6432650089263916 - }, - { - "auxiliary_loss_clip": 0.01072209, - "auxiliary_loss_mlp": 0.01029018, - "balance_loss_clip": 1.03471482, - "balance_loss_mlp": 1.01653671, - "epoch": 0.9414098902750638, - "flos": 16618309891200.0, - "grad_norm": 2.5566622926725193, - "language_loss": 0.84468395, - "learning_rate": 3.586617494785371e-08, - "loss": 0.86569619, - "num_input_tokens_seen": 337671810, - "step": 15658, - "time_per_iteration": 2.6943564414978027 - }, - { - "auxiliary_loss_clip": 0.01112714, - "auxiliary_loss_mlp": 0.01033078, - "balance_loss_clip": 1.03760839, - "balance_loss_mlp": 1.01849866, - "epoch": 0.9414700135277319, - "flos": 18625033987200.0, - "grad_norm": 2.5090872722582627, - "language_loss": 0.70395422, - "learning_rate": 3.5792785925675254e-08, - "loss": 0.72541213, - "num_input_tokens_seen": 337689410, - "step": 15659, - "time_per_iteration": 2.537353038787842 - }, - { - "auxiliary_loss_clip": 0.01079214, - "auxiliary_loss_mlp": 0.01039877, - "balance_loss_clip": 1.03404224, - "balance_loss_mlp": 1.02849793, - "epoch": 0.9415301367803999, - "flos": 26280146649600.0, - "grad_norm": 1.7562040343891887, - "language_loss": 0.79511106, - "learning_rate": 3.571947138643172e-08, - "loss": 0.81630188, - "num_input_tokens_seen": 337709950, - "step": 15660, - "time_per_iteration": 2.7002146244049072 - }, - { - "auxiliary_loss_clip": 0.01071861, - "auxiliary_loss_mlp": 0.01028763, - "balance_loss_clip": 1.03252554, - "balance_loss_mlp": 1.0167948, - "epoch": 0.9415902600330678, - "flos": 23261388860160.0, - "grad_norm": 1.4022153462876712, - "language_loss": 0.67921788, - "learning_rate": 3.564623133290201e-08, - "loss": 0.70022404, - "num_input_tokens_seen": 337731320, - "step": 15661, - "time_per_iteration": 2.755877733230591 - }, - { - "auxiliary_loss_clip": 0.01092284, - "auxiliary_loss_mlp": 0.01031361, - "balance_loss_clip": 1.03276324, - "balance_loss_mlp": 1.01883173, - "epoch": 0.9416503832857358, - "flos": 14719138093440.0, - "grad_norm": 2.2599485934603725, - "language_loss": 0.66300029, - "learning_rate": 3.557306576786434e-08, - "loss": 0.68423676, - "num_input_tokens_seen": 337747720, - "step": 15662, - "time_per_iteration": 2.5741324424743652 - }, - { - "auxiliary_loss_clip": 0.01009662, - "auxiliary_loss_mlp": 0.01000042, - "balance_loss_clip": 1.00710607, - "balance_loss_mlp": 0.99910659, - "epoch": 0.9417105065384037, - "flos": 70312698276480.0, - "grad_norm": 0.7619674820211261, - "language_loss": 0.59235966, - "learning_rate": 3.5499974694092935e-08, - "loss": 0.61245668, - "num_input_tokens_seen": 337806930, - "step": 15663, - "time_per_iteration": 3.3059024810791016 - }, - { - "auxiliary_loss_clip": 0.01103713, - "auxiliary_loss_mlp": 0.01035518, - "balance_loss_clip": 1.03747571, - "balance_loss_mlp": 1.02217829, - "epoch": 0.9417706297910717, - "flos": 34057895322240.0, - "grad_norm": 4.1621354717950885, - "language_loss": 0.66886747, - "learning_rate": 3.542695811435914e-08, - "loss": 0.69025975, - "num_input_tokens_seen": 337828100, - "step": 15664, - "time_per_iteration": 2.7219324111938477 - }, - { - "auxiliary_loss_clip": 0.01083442, - "auxiliary_loss_mlp": 0.01030443, - "balance_loss_clip": 1.03674304, - "balance_loss_mlp": 1.01874244, - "epoch": 0.9418307530437396, - "flos": 16471614746880.0, - "grad_norm": 2.258809019140803, - "language_loss": 0.73858142, - "learning_rate": 3.535401603143207e-08, - "loss": 0.75972033, - "num_input_tokens_seen": 337844805, - "step": 15665, - "time_per_iteration": 2.6257636547088623 - }, - { - "auxiliary_loss_clip": 0.01105775, - "auxiliary_loss_mlp": 0.01032956, - "balance_loss_clip": 1.03694832, - "balance_loss_mlp": 1.02096939, - "epoch": 0.9418908762964077, - "flos": 11253543114240.0, - "grad_norm": 3.0049644569052907, - "language_loss": 0.63581872, - "learning_rate": 3.528114844807773e-08, - "loss": 0.65720612, - "num_input_tokens_seen": 337860490, - "step": 15666, - "time_per_iteration": 2.5537686347961426 - }, - { - "auxiliary_loss_clip": 0.01072039, - "auxiliary_loss_mlp": 0.01029378, - "balance_loss_clip": 1.0352211, - "balance_loss_mlp": 1.01712298, - "epoch": 0.9419509995490756, - "flos": 18438836860800.0, - "grad_norm": 1.6687010232077268, - "language_loss": 0.78841943, - "learning_rate": 3.520835536705902e-08, - "loss": 0.80943358, - "num_input_tokens_seen": 337878360, - "step": 15667, - "time_per_iteration": 2.66939377784729 - }, - { - "auxiliary_loss_clip": 0.01105116, - "auxiliary_loss_mlp": 0.01027413, - "balance_loss_clip": 1.03544164, - "balance_loss_mlp": 1.01629639, - "epoch": 0.9420111228017436, - "flos": 20737945664640.0, - "grad_norm": 1.8566898819332656, - "language_loss": 0.75282031, - "learning_rate": 3.5135636791136404e-08, - "loss": 0.7741456, - "num_input_tokens_seen": 337895635, - "step": 15668, - "time_per_iteration": 2.5508882999420166 - }, - { - "auxiliary_loss_clip": 0.0105425, - "auxiliary_loss_mlp": 0.01029631, - "balance_loss_clip": 1.03423977, - "balance_loss_mlp": 1.01744199, - "epoch": 0.9420712460544115, - "flos": 21141940907520.0, - "grad_norm": 2.159724886055292, - "language_loss": 0.59023595, - "learning_rate": 3.506299272306723e-08, - "loss": 0.61107475, - "num_input_tokens_seen": 337913940, - "step": 15669, - "time_per_iteration": 2.73180890083313 - }, - { - "auxiliary_loss_clip": 0.01067029, - "auxiliary_loss_mlp": 0.01027243, - "balance_loss_clip": 1.03234708, - "balance_loss_mlp": 1.01523852, - "epoch": 0.9421313693070795, - "flos": 15851760721920.0, - "grad_norm": 1.528198079062627, - "language_loss": 0.77025855, - "learning_rate": 3.4990423165606406e-08, - "loss": 0.79120123, - "num_input_tokens_seen": 337932015, - "step": 15670, - "time_per_iteration": 2.69807767868042 - }, - { - "auxiliary_loss_clip": 0.01109553, - "auxiliary_loss_mlp": 0.01036081, - "balance_loss_clip": 1.03725696, - "balance_loss_mlp": 1.02321219, - "epoch": 0.9421914925597474, - "flos": 32415915882240.0, - "grad_norm": 1.8154793470935222, - "language_loss": 0.65174937, - "learning_rate": 3.491792812150574e-08, - "loss": 0.67320567, - "num_input_tokens_seen": 337953345, - "step": 15671, - "time_per_iteration": 2.7444138526916504 - }, - { - "auxiliary_loss_clip": 0.01082811, - "auxiliary_loss_mlp": 0.01033924, - "balance_loss_clip": 1.03383374, - "balance_loss_mlp": 1.02096009, - "epoch": 0.9422516158124155, - "flos": 19718513769600.0, - "grad_norm": 1.558684648583432, - "language_loss": 0.79916745, - "learning_rate": 3.48455075935139e-08, - "loss": 0.82033479, - "num_input_tokens_seen": 337973685, - "step": 15672, - "time_per_iteration": 2.803809881210327 - }, - { - "auxiliary_loss_clip": 0.01075344, - "auxiliary_loss_mlp": 0.01036959, - "balance_loss_clip": 1.03470707, - "balance_loss_mlp": 1.02285063, - "epoch": 0.9423117390650835, - "flos": 16253277926400.0, - "grad_norm": 1.9824694157705243, - "language_loss": 0.73236197, - "learning_rate": 3.47731615843776e-08, - "loss": 0.75348502, - "num_input_tokens_seen": 337989175, - "step": 15673, - "time_per_iteration": 2.755509614944458 - }, - { - "auxiliary_loss_clip": 0.01091118, - "auxiliary_loss_mlp": 0.01030642, - "balance_loss_clip": 1.03414345, - "balance_loss_mlp": 1.01794672, - "epoch": 0.9423718623177514, - "flos": 31796564647680.0, - "grad_norm": 1.4558092155999423, - "language_loss": 0.70178533, - "learning_rate": 3.470089009683974e-08, - "loss": 0.72300291, - "num_input_tokens_seen": 338011800, - "step": 15674, - "time_per_iteration": 2.695003032684326 - }, - { - "auxiliary_loss_clip": 0.01107385, - "auxiliary_loss_mlp": 0.01025942, - "balance_loss_clip": 1.03574955, - "balance_loss_mlp": 1.01402664, - "epoch": 0.9424319855704194, - "flos": 23331809473920.0, - "grad_norm": 1.9582607226770616, - "language_loss": 0.81163412, - "learning_rate": 3.462869313364125e-08, - "loss": 0.8329674, - "num_input_tokens_seen": 338032120, - "step": 15675, - "time_per_iteration": 2.6292521953582764 - }, - { - "auxiliary_loss_clip": 0.01081718, - "auxiliary_loss_mlp": 0.01031642, - "balance_loss_clip": 1.03442502, - "balance_loss_mlp": 1.01966715, - "epoch": 0.9424921088230873, - "flos": 20777627214720.0, - "grad_norm": 1.7260765463945456, - "language_loss": 0.62643492, - "learning_rate": 3.4556570697519494e-08, - "loss": 0.64756858, - "num_input_tokens_seen": 338051880, - "step": 15676, - "time_per_iteration": 4.179499387741089 - }, - { - "auxiliary_loss_clip": 0.01092941, - "auxiliary_loss_mlp": 0.01038222, - "balance_loss_clip": 1.03998232, - "balance_loss_mlp": 1.02615166, - "epoch": 0.9425522320757553, - "flos": 19026658932480.0, - "grad_norm": 1.777162834544334, - "language_loss": 0.67122662, - "learning_rate": 3.448452279120984e-08, - "loss": 0.69253826, - "num_input_tokens_seen": 338069665, - "step": 15677, - "time_per_iteration": 2.6239006519317627 - }, - { - "auxiliary_loss_clip": 0.01072255, - "auxiliary_loss_mlp": 0.01035548, - "balance_loss_clip": 1.03186798, - "balance_loss_mlp": 1.02190459, - "epoch": 0.9426123553284232, - "flos": 25155353185920.0, - "grad_norm": 2.176683290780186, - "language_loss": 0.641137, - "learning_rate": 3.441254941744387e-08, - "loss": 0.66221505, - "num_input_tokens_seen": 338090490, - "step": 15678, - "time_per_iteration": 4.263075113296509 - }, - { - "auxiliary_loss_clip": 0.01082508, - "auxiliary_loss_mlp": 0.01028954, - "balance_loss_clip": 1.04040313, - "balance_loss_mlp": 1.01706934, - "epoch": 0.9426724785810913, - "flos": 21179359900800.0, - "grad_norm": 1.4630832933179898, - "language_loss": 0.74250793, - "learning_rate": 3.434065057895097e-08, - "loss": 0.76362252, - "num_input_tokens_seen": 338109825, - "step": 15679, - "time_per_iteration": 4.329301357269287 - }, - { - "auxiliary_loss_clip": 0.01089711, - "auxiliary_loss_mlp": 0.01034379, - "balance_loss_clip": 1.0365119, - "balance_loss_mlp": 1.02231526, - "epoch": 0.9427326018337592, - "flos": 14756916222720.0, - "grad_norm": 3.717209623940925, - "language_loss": 0.77565658, - "learning_rate": 3.426882627845762e-08, - "loss": 0.79689747, - "num_input_tokens_seen": 338125790, - "step": 15680, - "time_per_iteration": 2.6704599857330322 - }, - { - "auxiliary_loss_clip": 0.01097961, - "auxiliary_loss_mlp": 0.01033168, - "balance_loss_clip": 1.0371449, - "balance_loss_mlp": 1.02055609, - "epoch": 0.9427927250864272, - "flos": 20923640000640.0, - "grad_norm": 2.190384071517057, - "language_loss": 0.75626266, - "learning_rate": 3.419707651868742e-08, - "loss": 0.77757394, - "num_input_tokens_seen": 338145610, - "step": 15681, - "time_per_iteration": 2.6899359226226807 - }, - { - "auxiliary_loss_clip": 0.01082824, - "auxiliary_loss_mlp": 0.0103551, - "balance_loss_clip": 1.03667855, - "balance_loss_mlp": 1.02248073, - "epoch": 0.9428528483390951, - "flos": 19752520970880.0, - "grad_norm": 2.3199120236961144, - "language_loss": 0.65754902, - "learning_rate": 3.412540130236086e-08, - "loss": 0.6787324, - "num_input_tokens_seen": 338165960, - "step": 15682, - "time_per_iteration": 4.124305963516235 - }, - { - "auxiliary_loss_clip": 0.01071222, - "auxiliary_loss_mlp": 0.01028797, - "balance_loss_clip": 1.03226089, - "balance_loss_mlp": 1.01655436, - "epoch": 0.9429129715917631, - "flos": 24534996370560.0, - "grad_norm": 3.097159468574502, - "language_loss": 0.76566684, - "learning_rate": 3.405380063219665e-08, - "loss": 0.78666705, - "num_input_tokens_seen": 338187215, - "step": 15683, - "time_per_iteration": 2.71305775642395 - }, - { - "auxiliary_loss_clip": 0.01100547, - "auxiliary_loss_mlp": 0.01040046, - "balance_loss_clip": 1.03645873, - "balance_loss_mlp": 1.02684927, - "epoch": 0.942973094844431, - "flos": 17959824063360.0, - "grad_norm": 2.6265213886695826, - "language_loss": 0.75407404, - "learning_rate": 3.398227451090885e-08, - "loss": 0.77547991, - "num_input_tokens_seen": 338201825, - "step": 15684, - "time_per_iteration": 2.6331522464752197 - }, - { - "auxiliary_loss_clip": 0.01104685, - "auxiliary_loss_mlp": 0.01027432, - "balance_loss_clip": 1.03484631, - "balance_loss_mlp": 1.01599407, - "epoch": 0.9430332180970991, - "flos": 26137689310080.0, - "grad_norm": 1.6361176573488942, - "language_loss": 0.77129638, - "learning_rate": 3.391082294121017e-08, - "loss": 0.79261756, - "num_input_tokens_seen": 338220865, - "step": 15685, - "time_per_iteration": 2.7566094398498535 - }, - { - "auxiliary_loss_clip": 0.01092602, - "auxiliary_loss_mlp": 0.01030988, - "balance_loss_clip": 1.03414559, - "balance_loss_mlp": 1.01951969, - "epoch": 0.943093341349767, - "flos": 23951376190080.0, - "grad_norm": 2.1272798688132775, - "language_loss": 0.75766367, - "learning_rate": 3.383944592581023e-08, - "loss": 0.77889955, - "num_input_tokens_seen": 338240160, - "step": 15686, - "time_per_iteration": 2.6965436935424805 - }, - { - "auxiliary_loss_clip": 0.01097717, - "auxiliary_loss_mlp": 0.0103222, - "balance_loss_clip": 1.03482318, - "balance_loss_mlp": 1.01981652, - "epoch": 0.943153464602435, - "flos": 17968407413760.0, - "grad_norm": 1.7020578922272096, - "language_loss": 0.80628002, - "learning_rate": 3.376814346741575e-08, - "loss": 0.82757938, - "num_input_tokens_seen": 338259305, - "step": 15687, - "time_per_iteration": 2.5866737365722656 - }, - { - "auxiliary_loss_clip": 0.01089927, - "auxiliary_loss_mlp": 0.01034435, - "balance_loss_clip": 1.03616667, - "balance_loss_mlp": 1.02021337, - "epoch": 0.943213587855103, - "flos": 14501519544960.0, - "grad_norm": 2.167672264682041, - "language_loss": 0.75638962, - "learning_rate": 3.369691556873011e-08, - "loss": 0.77763325, - "num_input_tokens_seen": 338274950, - "step": 15688, - "time_per_iteration": 2.6230926513671875 - }, - { - "auxiliary_loss_clip": 0.01078255, - "auxiliary_loss_mlp": 0.01026704, - "balance_loss_clip": 1.03318596, - "balance_loss_mlp": 1.01392508, - "epoch": 0.9432737111077709, - "flos": 28986411093120.0, - "grad_norm": 1.671527451823547, - "language_loss": 0.68622327, - "learning_rate": 3.3625762232454504e-08, - "loss": 0.70727283, - "num_input_tokens_seen": 338295585, - "step": 15689, - "time_per_iteration": 2.7073707580566406 - }, - { - "auxiliary_loss_clip": 0.01094693, - "auxiliary_loss_mlp": 0.0103362, - "balance_loss_clip": 1.03498852, - "balance_loss_mlp": 1.0225811, - "epoch": 0.9433338343604389, - "flos": 21609066303360.0, - "grad_norm": 1.766005404007046, - "language_loss": 0.80373913, - "learning_rate": 3.35546834612872e-08, - "loss": 0.82502228, - "num_input_tokens_seen": 338314555, - "step": 15690, - "time_per_iteration": 2.5873029232025146 - }, - { - "auxiliary_loss_clip": 0.0109645, - "auxiliary_loss_mlp": 0.01031556, - "balance_loss_clip": 1.03644657, - "balance_loss_mlp": 1.0193367, - "epoch": 0.9433939576131068, - "flos": 33182285483520.0, - "grad_norm": 2.148376299443603, - "language_loss": 0.59993267, - "learning_rate": 3.348367925792317e-08, - "loss": 0.62121278, - "num_input_tokens_seen": 338336260, - "step": 15691, - "time_per_iteration": 2.7108116149902344 - }, - { - "auxiliary_loss_clip": 0.01070974, - "auxiliary_loss_mlp": 0.01032195, - "balance_loss_clip": 1.03521907, - "balance_loss_mlp": 1.01911151, - "epoch": 0.9434540808657749, - "flos": 20486391742080.0, - "grad_norm": 2.2447371927481545, - "language_loss": 0.66576785, - "learning_rate": 3.341274962505514e-08, - "loss": 0.68679953, - "num_input_tokens_seen": 338354680, - "step": 15692, - "time_per_iteration": 2.6925716400146484 - }, - { - "auxiliary_loss_clip": 0.01093305, - "auxiliary_loss_mlp": 0.01032018, - "balance_loss_clip": 1.03605986, - "balance_loss_mlp": 1.01980531, - "epoch": 0.9435142041184428, - "flos": 21542955321600.0, - "grad_norm": 2.467286667437946, - "language_loss": 0.74455351, - "learning_rate": 3.334189456537251e-08, - "loss": 0.76580673, - "num_input_tokens_seen": 338372490, - "step": 15693, - "time_per_iteration": 2.6023404598236084 - }, - { - "auxiliary_loss_clip": 0.01074066, - "auxiliary_loss_mlp": 0.01035078, - "balance_loss_clip": 1.03401875, - "balance_loss_mlp": 1.0216012, - "epoch": 0.9435743273711108, - "flos": 25009089004800.0, - "grad_norm": 3.213380675885908, - "language_loss": 0.73401213, - "learning_rate": 3.327111408156291e-08, - "loss": 0.75510359, - "num_input_tokens_seen": 338390870, - "step": 15694, - "time_per_iteration": 2.695995569229126 - }, - { - "auxiliary_loss_clip": 0.00992652, - "auxiliary_loss_mlp": 0.01001259, - "balance_loss_clip": 1.00752378, - "balance_loss_mlp": 1.00031126, - "epoch": 0.9436344506237787, - "flos": 60158707320960.0, - "grad_norm": 0.6938858298712827, - "language_loss": 0.50570488, - "learning_rate": 3.3200408176309316e-08, - "loss": 0.52564394, - "num_input_tokens_seen": 338453075, - "step": 15695, - "time_per_iteration": 3.2824831008911133 - }, - { - "auxiliary_loss_clip": 0.01078605, - "auxiliary_loss_mlp": 0.01034023, - "balance_loss_clip": 1.03206229, - "balance_loss_mlp": 1.02225113, - "epoch": 0.9436945738764467, - "flos": 22237252283520.0, - "grad_norm": 1.8024183486894638, - "language_loss": 0.65296769, - "learning_rate": 3.312977685229335e-08, - "loss": 0.67409396, - "num_input_tokens_seen": 338471770, - "step": 15696, - "time_per_iteration": 2.7027387619018555 - }, - { - "auxiliary_loss_clip": 0.01097587, - "auxiliary_loss_mlp": 0.01027656, - "balance_loss_clip": 1.03637338, - "balance_loss_mlp": 1.01574719, - "epoch": 0.9437546971291146, - "flos": 25045179194880.0, - "grad_norm": 1.5868519209040974, - "language_loss": 0.65894949, - "learning_rate": 3.305922011219353e-08, - "loss": 0.68020189, - "num_input_tokens_seen": 338492190, - "step": 15697, - "time_per_iteration": 2.696575880050659 - }, - { - "auxiliary_loss_clip": 0.00999497, - "auxiliary_loss_mlp": 0.01001147, - "balance_loss_clip": 1.00481725, - "balance_loss_mlp": 1.0002768, - "epoch": 0.9438148203817827, - "flos": 56790788400000.0, - "grad_norm": 0.8460520685296222, - "language_loss": 0.63194656, - "learning_rate": 3.298873795868506e-08, - "loss": 0.6519531, - "num_input_tokens_seen": 338552560, - "step": 15698, - "time_per_iteration": 3.1992437839508057 - }, - { - "auxiliary_loss_clip": 0.01088557, - "auxiliary_loss_mlp": 0.01040163, - "balance_loss_clip": 1.03655159, - "balance_loss_mlp": 1.02691269, - "epoch": 0.9438749436344506, - "flos": 22346384780160.0, - "grad_norm": 1.744031032402157, - "language_loss": 0.69575948, - "learning_rate": 3.291833039444092e-08, - "loss": 0.71704668, - "num_input_tokens_seen": 338571770, - "step": 15699, - "time_per_iteration": 2.71105694770813 - }, - { - "auxiliary_loss_clip": 0.01069184, - "auxiliary_loss_mlp": 0.01031098, - "balance_loss_clip": 1.03235722, - "balance_loss_mlp": 1.01913548, - "epoch": 0.9439350668871186, - "flos": 13370800337280.0, - "grad_norm": 2.0518256803371444, - "language_loss": 0.74715513, - "learning_rate": 3.2847997422130734e-08, - "loss": 0.76815796, - "num_input_tokens_seen": 338587310, - "step": 15700, - "time_per_iteration": 2.7928857803344727 - }, - { - "auxiliary_loss_clip": 0.01031212, - "auxiliary_loss_mlp": 0.01031676, - "balance_loss_clip": 1.02990246, - "balance_loss_mlp": 1.02022016, - "epoch": 0.9439951901397866, - "flos": 17785334770560.0, - "grad_norm": 1.7345550747234506, - "language_loss": 0.70444047, - "learning_rate": 3.2777739044421495e-08, - "loss": 0.7250694, - "num_input_tokens_seen": 338606235, - "step": 15701, - "time_per_iteration": 2.956749200820923 - }, - { - "auxiliary_loss_clip": 0.01067175, - "auxiliary_loss_mlp": 0.01028974, - "balance_loss_clip": 1.03305924, - "balance_loss_mlp": 1.01637959, - "epoch": 0.9440553133924545, - "flos": 18879568738560.0, - "grad_norm": 6.919178162697029, - "language_loss": 0.77767622, - "learning_rate": 3.2707555263977505e-08, - "loss": 0.79863775, - "num_input_tokens_seen": 338624090, - "step": 15702, - "time_per_iteration": 3.149764060974121 - }, - { - "auxiliary_loss_clip": 0.01093668, - "auxiliary_loss_mlp": 0.01043554, - "balance_loss_clip": 1.03391433, - "balance_loss_mlp": 1.03058994, - "epoch": 0.9441154366451225, - "flos": 19572967860480.0, - "grad_norm": 2.1610357777231397, - "language_loss": 0.66376126, - "learning_rate": 3.2637446083460194e-08, - "loss": 0.68513346, - "num_input_tokens_seen": 338643695, - "step": 15703, - "time_per_iteration": 2.5990066528320312 - }, - { - "auxiliary_loss_clip": 0.01099113, - "auxiliary_loss_mlp": 0.01029886, - "balance_loss_clip": 1.03849339, - "balance_loss_mlp": 1.01657009, - "epoch": 0.9441755598977905, - "flos": 30294995472000.0, - "grad_norm": 6.247002123537392, - "language_loss": 0.73099834, - "learning_rate": 3.256741150552833e-08, - "loss": 0.75228834, - "num_input_tokens_seen": 338664725, - "step": 15704, - "time_per_iteration": 2.649864673614502 - }, - { - "auxiliary_loss_clip": 0.01094284, - "auxiliary_loss_mlp": 0.0103236, - "balance_loss_clip": 1.03578568, - "balance_loss_mlp": 1.01978898, - "epoch": 0.9442356831504585, - "flos": 20667884186880.0, - "grad_norm": 1.839574518559296, - "language_loss": 0.74311668, - "learning_rate": 3.2497451532837336e-08, - "loss": 0.76438308, - "num_input_tokens_seen": 338683990, - "step": 15705, - "time_per_iteration": 2.611238956451416 - }, - { - "auxiliary_loss_clip": 0.01087617, - "auxiliary_loss_mlp": 0.01034408, - "balance_loss_clip": 1.03792405, - "balance_loss_mlp": 1.02303529, - "epoch": 0.9442958064031264, - "flos": 16107265140480.0, - "grad_norm": 1.8860922128318132, - "language_loss": 0.76915097, - "learning_rate": 3.2427566168039986e-08, - "loss": 0.79037118, - "num_input_tokens_seen": 338702025, - "step": 15706, - "time_per_iteration": 2.651951313018799 - }, - { - "auxiliary_loss_clip": 0.01091977, - "auxiliary_loss_mlp": 0.01029566, - "balance_loss_clip": 1.03399932, - "balance_loss_mlp": 1.01796126, - "epoch": 0.9443559296557944, - "flos": 20447392550400.0, - "grad_norm": 1.4620649428574009, - "language_loss": 0.69324106, - "learning_rate": 3.23577554137866e-08, - "loss": 0.7144565, - "num_input_tokens_seen": 338720920, - "step": 15707, - "time_per_iteration": 2.674379825592041 - }, - { - "auxiliary_loss_clip": 0.0110044, - "auxiliary_loss_mlp": 0.01027284, - "balance_loss_clip": 1.0323143, - "balance_loss_mlp": 1.01660287, - "epoch": 0.9444160529084623, - "flos": 21610897896960.0, - "grad_norm": 1.6031633884107506, - "language_loss": 0.69253683, - "learning_rate": 3.22880192727244e-08, - "loss": 0.71381414, - "num_input_tokens_seen": 338739590, - "step": 15708, - "time_per_iteration": 2.6171586513519287 - }, - { - "auxiliary_loss_clip": 0.01096213, - "auxiliary_loss_mlp": 0.01030391, - "balance_loss_clip": 1.03588486, - "balance_loss_mlp": 1.01868427, - "epoch": 0.9444761761611303, - "flos": 18441781776000.0, - "grad_norm": 2.3501834209242305, - "language_loss": 0.70614785, - "learning_rate": 3.221835774749748e-08, - "loss": 0.72741389, - "num_input_tokens_seen": 338757240, - "step": 15709, - "time_per_iteration": 2.5730903148651123 - }, - { - "auxiliary_loss_clip": 0.01067094, - "auxiliary_loss_mlp": 0.01031921, - "balance_loss_clip": 1.03753853, - "balance_loss_mlp": 1.01969028, - "epoch": 0.9445362994137982, - "flos": 20957144411520.0, - "grad_norm": 2.0328452779578208, - "language_loss": 0.84886342, - "learning_rate": 3.214877084074774e-08, - "loss": 0.86985362, - "num_input_tokens_seen": 338773750, - "step": 15710, - "time_per_iteration": 2.803764581680298 - }, - { - "auxiliary_loss_clip": 0.01086062, - "auxiliary_loss_mlp": 0.01033102, - "balance_loss_clip": 1.03906393, - "balance_loss_mlp": 1.02019763, - "epoch": 0.9445964226664663, - "flos": 20303283185280.0, - "grad_norm": 1.710819363130834, - "language_loss": 0.71519732, - "learning_rate": 3.2079258555113956e-08, - "loss": 0.73638898, - "num_input_tokens_seen": 338792115, - "step": 15711, - "time_per_iteration": 2.786268711090088 - }, - { - "auxiliary_loss_clip": 0.01097144, - "auxiliary_loss_mlp": 0.01032234, - "balance_loss_clip": 1.03653646, - "balance_loss_mlp": 1.01912701, - "epoch": 0.9446565459191342, - "flos": 26396030903040.0, - "grad_norm": 1.6870746080253851, - "language_loss": 0.69105422, - "learning_rate": 3.200982089323179e-08, - "loss": 0.71234798, - "num_input_tokens_seen": 338812480, - "step": 15712, - "time_per_iteration": 2.7278430461883545 - }, - { - "auxiliary_loss_clip": 0.01102036, - "auxiliary_loss_mlp": 0.01036765, - "balance_loss_clip": 1.03873301, - "balance_loss_mlp": 1.02347302, - "epoch": 0.9447166691718022, - "flos": 16544764794240.0, - "grad_norm": 2.4121732835994405, - "language_loss": 0.70365906, - "learning_rate": 3.1940457857734246e-08, - "loss": 0.72504705, - "num_input_tokens_seen": 338829105, - "step": 15713, - "time_per_iteration": 2.6644036769866943 - }, - { - "auxiliary_loss_clip": 0.01083151, - "auxiliary_loss_mlp": 0.0103271, - "balance_loss_clip": 1.03377235, - "balance_loss_mlp": 1.01964462, - "epoch": 0.9447767924244702, - "flos": 29164635400320.0, - "grad_norm": 1.5448448829872168, - "language_loss": 0.7672528, - "learning_rate": 3.187116945125212e-08, - "loss": 0.7884115, - "num_input_tokens_seen": 338850670, - "step": 15714, - "time_per_iteration": 2.713848114013672 - }, - { - "auxiliary_loss_clip": 0.01083406, - "auxiliary_loss_mlp": 0.01030794, - "balance_loss_clip": 1.03877974, - "balance_loss_mlp": 1.01808619, - "epoch": 0.9448369156771381, - "flos": 19274908803840.0, - "grad_norm": 3.6710113259545456, - "language_loss": 0.67744088, - "learning_rate": 3.1801955676412194e-08, - "loss": 0.69858289, - "num_input_tokens_seen": 338867795, - "step": 15715, - "time_per_iteration": 4.1955413818359375 - }, - { - "auxiliary_loss_clip": 0.01076435, - "auxiliary_loss_mlp": 0.01033314, - "balance_loss_clip": 1.03516388, - "balance_loss_mlp": 1.02042139, - "epoch": 0.9448970389298061, - "flos": 23841166285440.0, - "grad_norm": 1.6653982996808796, - "language_loss": 0.74771553, - "learning_rate": 3.173281653583948e-08, - "loss": 0.76881307, - "num_input_tokens_seen": 338887205, - "step": 15716, - "time_per_iteration": 2.7072696685791016 - }, - { - "auxiliary_loss_clip": 0.01092174, - "auxiliary_loss_mlp": 0.01031218, - "balance_loss_clip": 1.03965962, - "balance_loss_mlp": 1.01850486, - "epoch": 0.944957162182474, - "flos": 22382259488640.0, - "grad_norm": 2.4365311852184797, - "language_loss": 0.62516659, - "learning_rate": 3.166375203215565e-08, - "loss": 0.64640057, - "num_input_tokens_seen": 338906130, - "step": 15717, - "time_per_iteration": 4.276852369308472 - }, - { - "auxiliary_loss_clip": 0.01094123, - "auxiliary_loss_mlp": 0.0103479, - "balance_loss_clip": 1.03850865, - "balance_loss_mlp": 1.02269626, - "epoch": 0.9450172854351421, - "flos": 17383889393280.0, - "grad_norm": 1.77862512223437, - "language_loss": 0.79134482, - "learning_rate": 3.1594762167979514e-08, - "loss": 0.81263399, - "num_input_tokens_seen": 338923045, - "step": 15718, - "time_per_iteration": 4.204078674316406 - }, - { - "auxiliary_loss_clip": 0.01018497, - "auxiliary_loss_mlp": 0.00999465, - "balance_loss_clip": 1.00589895, - "balance_loss_mlp": 0.99857122, - "epoch": 0.94507740868781, - "flos": 68466352406400.0, - "grad_norm": 0.6985194200865079, - "language_loss": 0.57825208, - "learning_rate": 3.152584694592719e-08, - "loss": 0.59843159, - "num_input_tokens_seen": 338987545, - "step": 15719, - "time_per_iteration": 3.1477670669555664 - }, - { - "auxiliary_loss_clip": 0.0106753, - "auxiliary_loss_mlp": 0.00770827, - "balance_loss_clip": 1.03413999, - "balance_loss_mlp": 1.0002296, - "epoch": 0.945137531940478, - "flos": 21142479611520.0, - "grad_norm": 1.6417560155484736, - "language_loss": 0.75850344, - "learning_rate": 3.145700636861193e-08, - "loss": 0.77688694, - "num_input_tokens_seen": 339007830, - "step": 15720, - "time_per_iteration": 2.7489445209503174 - }, - { - "auxiliary_loss_clip": 0.01092778, - "auxiliary_loss_mlp": 0.01027293, - "balance_loss_clip": 1.0348984, - "balance_loss_mlp": 1.01603997, - "epoch": 0.9451976551931459, - "flos": 24533918962560.0, - "grad_norm": 1.6214864220397953, - "language_loss": 0.72730792, - "learning_rate": 3.138824043864452e-08, - "loss": 0.74850857, - "num_input_tokens_seen": 339028980, - "step": 15721, - "time_per_iteration": 4.25614595413208 - }, - { - "auxiliary_loss_clip": 0.01062633, - "auxiliary_loss_mlp": 0.0103276, - "balance_loss_clip": 1.03165364, - "balance_loss_mlp": 1.01968312, - "epoch": 0.9452577784458139, - "flos": 23440582834560.0, - "grad_norm": 2.1250289522384933, - "language_loss": 0.85435033, - "learning_rate": 3.131954915863244e-08, - "loss": 0.87530422, - "num_input_tokens_seen": 339047950, - "step": 15722, - "time_per_iteration": 2.7739651203155518 - }, - { - "auxiliary_loss_clip": 0.01008256, - "auxiliary_loss_mlp": 0.00999124, - "balance_loss_clip": 1.00595665, - "balance_loss_mlp": 0.99822962, - "epoch": 0.9453179016984818, - "flos": 52017686449920.0, - "grad_norm": 0.8877194495304748, - "language_loss": 0.64485419, - "learning_rate": 3.125093253118005e-08, - "loss": 0.66492796, - "num_input_tokens_seen": 339104535, - "step": 15723, - "time_per_iteration": 3.120633363723755 - }, - { - "auxiliary_loss_clip": 0.01069344, - "auxiliary_loss_mlp": 0.01031555, - "balance_loss_clip": 1.03639483, - "balance_loss_mlp": 1.01878786, - "epoch": 0.9453780249511499, - "flos": 13473001509120.0, - "grad_norm": 2.137858204283182, - "language_loss": 0.73015231, - "learning_rate": 3.1182390558889715e-08, - "loss": 0.75116134, - "num_input_tokens_seen": 339122050, - "step": 15724, - "time_per_iteration": 2.7390730381011963 - }, - { - "auxiliary_loss_clip": 0.01075665, - "auxiliary_loss_mlp": 0.01027448, - "balance_loss_clip": 1.03523171, - "balance_loss_mlp": 1.01556301, - "epoch": 0.9454381482038178, - "flos": 23258515772160.0, - "grad_norm": 2.0036441460727676, - "language_loss": 0.84524632, - "learning_rate": 3.111392324436024e-08, - "loss": 0.8662774, - "num_input_tokens_seen": 339138940, - "step": 15725, - "time_per_iteration": 2.7032201290130615 - }, - { - "auxiliary_loss_clip": 0.01092034, - "auxiliary_loss_mlp": 0.01027805, - "balance_loss_clip": 1.03934574, - "balance_loss_mlp": 1.01518655, - "epoch": 0.9454982714564858, - "flos": 19496621502720.0, - "grad_norm": 1.7112621255845237, - "language_loss": 0.71131301, - "learning_rate": 3.104553059018822e-08, - "loss": 0.7325114, - "num_input_tokens_seen": 339158245, - "step": 15726, - "time_per_iteration": 2.633211135864258 - }, - { - "auxiliary_loss_clip": 0.01083425, - "auxiliary_loss_mlp": 0.0103136, - "balance_loss_clip": 1.03504848, - "balance_loss_mlp": 1.01800275, - "epoch": 0.9455583947091538, - "flos": 23258120722560.0, - "grad_norm": 1.770426483467669, - "language_loss": 0.60957873, - "learning_rate": 3.097721259896735e-08, - "loss": 0.63072664, - "num_input_tokens_seen": 339178200, - "step": 15727, - "time_per_iteration": 2.66964054107666 - }, - { - "auxiliary_loss_clip": 0.01093477, - "auxiliary_loss_mlp": 0.01033773, - "balance_loss_clip": 1.03381276, - "balance_loss_mlp": 1.02250743, - "epoch": 0.9456185179618217, - "flos": 17673041877120.0, - "grad_norm": 1.6947492443167869, - "language_loss": 0.81717706, - "learning_rate": 3.0908969273287566e-08, - "loss": 0.83844954, - "num_input_tokens_seen": 339193950, - "step": 15728, - "time_per_iteration": 2.6493005752563477 - }, - { - "auxiliary_loss_clip": 0.00982318, - "auxiliary_loss_mlp": 0.01006045, - "balance_loss_clip": 1.00669122, - "balance_loss_mlp": 1.00475144, - "epoch": 0.9456786412144897, - "flos": 61415040389760.0, - "grad_norm": 0.7309632127975088, - "language_loss": 0.59005105, - "learning_rate": 3.08408006157368e-08, - "loss": 0.60993469, - "num_input_tokens_seen": 339252330, - "step": 15729, - "time_per_iteration": 3.3251638412475586 - }, - { - "auxiliary_loss_clip": 0.01106055, - "auxiliary_loss_mlp": 0.01026459, - "balance_loss_clip": 1.03561211, - "balance_loss_mlp": 1.01384068, - "epoch": 0.9457387644671577, - "flos": 18588369179520.0, - "grad_norm": 1.8376283241487172, - "language_loss": 0.76239592, - "learning_rate": 3.077270662890052e-08, - "loss": 0.78372103, - "num_input_tokens_seen": 339270325, - "step": 15730, - "time_per_iteration": 2.822908401489258 - }, - { - "auxiliary_loss_clip": 0.01086637, - "auxiliary_loss_mlp": 0.01031322, - "balance_loss_clip": 1.03977942, - "balance_loss_mlp": 1.01842904, - "epoch": 0.9457988877198257, - "flos": 21108544237440.0, - "grad_norm": 1.4741875108902043, - "language_loss": 0.6241951, - "learning_rate": 3.070468731536047e-08, - "loss": 0.64537472, - "num_input_tokens_seen": 339291980, - "step": 15731, - "time_per_iteration": 2.780259370803833 - }, - { - "auxiliary_loss_clip": 0.01098616, - "auxiliary_loss_mlp": 0.01027483, - "balance_loss_clip": 1.03492093, - "balance_loss_mlp": 1.01464427, - "epoch": 0.9458590109724936, - "flos": 26688379697280.0, - "grad_norm": 1.8389995497060174, - "language_loss": 0.63829595, - "learning_rate": 3.063674267769589e-08, - "loss": 0.65955698, - "num_input_tokens_seen": 339311795, - "step": 15732, - "time_per_iteration": 2.6928884983062744 - }, - { - "auxiliary_loss_clip": 0.01097602, - "auxiliary_loss_mlp": 0.01030285, - "balance_loss_clip": 1.03819144, - "balance_loss_mlp": 1.01691008, - "epoch": 0.9459191342251616, - "flos": 18661591054080.0, - "grad_norm": 2.312355275604837, - "language_loss": 0.83734918, - "learning_rate": 3.056887271848363e-08, - "loss": 0.85862809, - "num_input_tokens_seen": 339327745, - "step": 15733, - "time_per_iteration": 2.573761463165283 - }, - { - "auxiliary_loss_clip": 0.01093698, - "auxiliary_loss_mlp": 0.01029334, - "balance_loss_clip": 1.03431845, - "balance_loss_mlp": 1.01787257, - "epoch": 0.9459792574778295, - "flos": 23398459159680.0, - "grad_norm": 2.0402128352906135, - "language_loss": 0.7230435, - "learning_rate": 3.0501077440297173e-08, - "loss": 0.74427378, - "num_input_tokens_seen": 339346445, - "step": 15734, - "time_per_iteration": 2.6132256984710693 - }, - { - "auxiliary_loss_clip": 0.01092017, - "auxiliary_loss_mlp": 0.01030532, - "balance_loss_clip": 1.03411341, - "balance_loss_mlp": 1.01973712, - "epoch": 0.9460393807304975, - "flos": 24392969994240.0, - "grad_norm": 1.706482735493318, - "language_loss": 0.86788249, - "learning_rate": 3.043335684570692e-08, - "loss": 0.88910794, - "num_input_tokens_seen": 339367945, - "step": 15735, - "time_per_iteration": 2.6257829666137695 - }, - { - "auxiliary_loss_clip": 0.01088315, - "auxiliary_loss_mlp": 0.01028057, - "balance_loss_clip": 1.0354389, - "balance_loss_mlp": 1.0162971, - "epoch": 0.9460995039831654, - "flos": 21939408708480.0, - "grad_norm": 2.0029981426507026, - "language_loss": 0.6727972, - "learning_rate": 3.036571093728102e-08, - "loss": 0.69396096, - "num_input_tokens_seen": 339386060, - "step": 15736, - "time_per_iteration": 2.6414105892181396 - }, - { - "auxiliary_loss_clip": 0.00990794, - "auxiliary_loss_mlp": 0.01001581, - "balance_loss_clip": 1.01297307, - "balance_loss_mlp": 1.00051391, - "epoch": 0.9461596272358335, - "flos": 70322466775680.0, - "grad_norm": 0.8681687595231652, - "language_loss": 0.65302682, - "learning_rate": 3.029813971758499e-08, - "loss": 0.67295063, - "num_input_tokens_seen": 339446695, - "step": 15737, - "time_per_iteration": 3.2643556594848633 - }, - { - "auxiliary_loss_clip": 0.01016522, - "auxiliary_loss_mlp": 0.01001018, - "balance_loss_clip": 1.00645328, - "balance_loss_mlp": 0.99996263, - "epoch": 0.9462197504885014, - "flos": 58591242645120.0, - "grad_norm": 0.8027511027658571, - "language_loss": 0.58797008, - "learning_rate": 3.0230643189181225e-08, - "loss": 0.60814548, - "num_input_tokens_seen": 339510080, - "step": 15738, - "time_per_iteration": 3.1644718647003174 - }, - { - "auxiliary_loss_clip": 0.01093604, - "auxiliary_loss_mlp": 0.0103372, - "balance_loss_clip": 1.03396416, - "balance_loss_mlp": 1.02237749, - "epoch": 0.9462798737411694, - "flos": 23433759250560.0, - "grad_norm": 1.7606066776130818, - "language_loss": 0.71841681, - "learning_rate": 3.016322135462834e-08, - "loss": 0.73969007, - "num_input_tokens_seen": 339529335, - "step": 15739, - "time_per_iteration": 2.6679999828338623 - }, - { - "auxiliary_loss_clip": 0.01093944, - "auxiliary_loss_mlp": 0.01027852, - "balance_loss_clip": 1.03307033, - "balance_loss_mlp": 1.01536524, - "epoch": 0.9463399969938374, - "flos": 25046077034880.0, - "grad_norm": 2.442002689709471, - "language_loss": 0.65025353, - "learning_rate": 3.009587421648363e-08, - "loss": 0.67147148, - "num_input_tokens_seen": 339548820, - "step": 15740, - "time_per_iteration": 2.703686237335205 - }, - { - "auxiliary_loss_clip": 0.01082274, - "auxiliary_loss_mlp": 0.01029406, - "balance_loss_clip": 1.0356245, - "balance_loss_mlp": 1.01749706, - "epoch": 0.9464001202465053, - "flos": 24352606085760.0, - "grad_norm": 1.7330701520859766, - "language_loss": 0.66210133, - "learning_rate": 3.0028601777301045e-08, - "loss": 0.68321818, - "num_input_tokens_seen": 339566775, - "step": 15741, - "time_per_iteration": 2.664438009262085 - }, - { - "auxiliary_loss_clip": 0.01097651, - "auxiliary_loss_mlp": 0.01026578, - "balance_loss_clip": 1.03633511, - "balance_loss_mlp": 1.01461589, - "epoch": 0.9464602434991733, - "flos": 17165444832000.0, - "grad_norm": 1.8465493774504513, - "language_loss": 0.76130718, - "learning_rate": 2.9961404039630987e-08, - "loss": 0.7825495, - "num_input_tokens_seen": 339581905, - "step": 15742, - "time_per_iteration": 2.5938029289245605 - }, - { - "auxiliary_loss_clip": 0.01092873, - "auxiliary_loss_mlp": 0.01030901, - "balance_loss_clip": 1.0342133, - "balance_loss_mlp": 1.01929605, - "epoch": 0.9465203667518413, - "flos": 19938107566080.0, - "grad_norm": 1.7769383818229454, - "language_loss": 0.72399694, - "learning_rate": 2.989428100602187e-08, - "loss": 0.74523461, - "num_input_tokens_seen": 339599870, - "step": 15743, - "time_per_iteration": 2.678401470184326 - }, - { - "auxiliary_loss_clip": 0.01073999, - "auxiliary_loss_mlp": 0.01031411, - "balance_loss_clip": 1.03740954, - "balance_loss_mlp": 1.01843548, - "epoch": 0.9465804900045093, - "flos": 20120318282880.0, - "grad_norm": 1.6693013333008395, - "language_loss": 0.79701877, - "learning_rate": 2.982723267901943e-08, - "loss": 0.81807292, - "num_input_tokens_seen": 339620250, - "step": 15744, - "time_per_iteration": 2.7061126232147217 - }, - { - "auxiliary_loss_clip": 0.01086196, - "auxiliary_loss_mlp": 0.01038507, - "balance_loss_clip": 1.03539801, - "balance_loss_mlp": 1.02565646, - "epoch": 0.9466406132571772, - "flos": 23911622812800.0, - "grad_norm": 1.6306815093715024, - "language_loss": 0.77796626, - "learning_rate": 2.9760259061165417e-08, - "loss": 0.79921329, - "num_input_tokens_seen": 339639900, - "step": 15745, - "time_per_iteration": 2.667794704437256 - }, - { - "auxiliary_loss_clip": 0.01082416, - "auxiliary_loss_mlp": 0.01035804, - "balance_loss_clip": 1.03260911, - "balance_loss_mlp": 1.02258968, - "epoch": 0.9467007365098452, - "flos": 19933223316480.0, - "grad_norm": 1.9540472383542953, - "language_loss": 0.70444429, - "learning_rate": 2.9693360155000014e-08, - "loss": 0.72562647, - "num_input_tokens_seen": 339658970, - "step": 15746, - "time_per_iteration": 2.671787738800049 - }, - { - "auxiliary_loss_clip": 0.01083981, - "auxiliary_loss_mlp": 0.01028625, - "balance_loss_clip": 1.03787088, - "balance_loss_mlp": 1.01583958, - "epoch": 0.9467608597625131, - "flos": 19310496203520.0, - "grad_norm": 2.086321056520881, - "language_loss": 0.55439335, - "learning_rate": 2.962653596305964e-08, - "loss": 0.57551944, - "num_input_tokens_seen": 339675600, - "step": 15747, - "time_per_iteration": 2.6125731468200684 - }, - { - "auxiliary_loss_clip": 0.00971726, - "auxiliary_loss_mlp": 0.0100543, - "balance_loss_clip": 1.00657761, - "balance_loss_mlp": 1.00431538, - "epoch": 0.9468209830151811, - "flos": 69630252802560.0, - "grad_norm": 0.6607022799380584, - "language_loss": 0.53227079, - "learning_rate": 2.955978648787871e-08, - "loss": 0.55204231, - "num_input_tokens_seen": 339744505, - "step": 15748, - "time_per_iteration": 3.6713624000549316 - }, - { - "auxiliary_loss_clip": 0.01087901, - "auxiliary_loss_mlp": 0.01036855, - "balance_loss_clip": 1.03579473, - "balance_loss_mlp": 1.02451098, - "epoch": 0.946881106267849, - "flos": 27016639113600.0, - "grad_norm": 1.6437037457333494, - "language_loss": 0.6632542, - "learning_rate": 2.9493111731988096e-08, - "loss": 0.68450171, - "num_input_tokens_seen": 339765810, - "step": 15749, - "time_per_iteration": 4.178863286972046 - }, - { - "auxiliary_loss_clip": 0.0107672, - "auxiliary_loss_mlp": 0.01030414, - "balance_loss_clip": 1.03375602, - "balance_loss_mlp": 1.01670504, - "epoch": 0.9469412295205171, - "flos": 20190092451840.0, - "grad_norm": 3.088565110530085, - "language_loss": 0.75976688, - "learning_rate": 2.942651169791621e-08, - "loss": 0.78083825, - "num_input_tokens_seen": 339784125, - "step": 15750, - "time_per_iteration": 2.7121167182922363 - }, - { - "auxiliary_loss_clip": 0.01096615, - "auxiliary_loss_mlp": 0.01028166, - "balance_loss_clip": 1.0368098, - "balance_loss_mlp": 1.01631653, - "epoch": 0.947001352773185, - "flos": 21324905809920.0, - "grad_norm": 1.5838311482694045, - "language_loss": 0.67727458, - "learning_rate": 2.9359986388188372e-08, - "loss": 0.69852245, - "num_input_tokens_seen": 339803450, - "step": 15751, - "time_per_iteration": 2.709989070892334 - }, - { - "auxiliary_loss_clip": 0.01076359, - "auxiliary_loss_mlp": 0.01030459, - "balance_loss_clip": 1.03434587, - "balance_loss_mlp": 1.01857424, - "epoch": 0.947061476025853, - "flos": 21944041562880.0, - "grad_norm": 1.6112027169393213, - "language_loss": 0.65785074, - "learning_rate": 2.929353580532723e-08, - "loss": 0.6789189, - "num_input_tokens_seen": 339823215, - "step": 15752, - "time_per_iteration": 2.731290102005005 - }, - { - "auxiliary_loss_clip": 0.01092841, - "auxiliary_loss_mlp": 0.01035403, - "balance_loss_clip": 1.03419137, - "balance_loss_mlp": 1.0214498, - "epoch": 0.947121599278521, - "flos": 21394715892480.0, - "grad_norm": 1.9387183547098805, - "language_loss": 0.71516705, - "learning_rate": 2.9227159951852764e-08, - "loss": 0.73644954, - "num_input_tokens_seen": 339842230, - "step": 15753, - "time_per_iteration": 2.6081583499908447 - }, - { - "auxiliary_loss_clip": 0.01109554, - "auxiliary_loss_mlp": 0.01032964, - "balance_loss_clip": 1.03532398, - "balance_loss_mlp": 1.01883757, - "epoch": 0.9471817225311889, - "flos": 23075730437760.0, - "grad_norm": 1.76483043172341, - "language_loss": 0.70370275, - "learning_rate": 2.9160858830281855e-08, - "loss": 0.72512788, - "num_input_tokens_seen": 339861640, - "step": 15754, - "time_per_iteration": 2.580967426300049 - }, - { - "auxiliary_loss_clip": 0.0110967, - "auxiliary_loss_mlp": 0.01032839, - "balance_loss_clip": 1.03552377, - "balance_loss_mlp": 1.02043486, - "epoch": 0.947241845783857, - "flos": 11910744305280.0, - "grad_norm": 3.2827981258328207, - "language_loss": 0.78840715, - "learning_rate": 2.9094632443129153e-08, - "loss": 0.80983222, - "num_input_tokens_seen": 339878210, - "step": 15755, - "time_per_iteration": 4.16628360748291 - }, - { - "auxiliary_loss_clip": 0.01070124, - "auxiliary_loss_mlp": 0.0103368, - "balance_loss_clip": 1.03388035, - "balance_loss_mlp": 1.01844525, - "epoch": 0.9473019690365249, - "flos": 20740675098240.0, - "grad_norm": 2.275356608148397, - "language_loss": 0.75449395, - "learning_rate": 2.9028480792904876e-08, - "loss": 0.77553201, - "num_input_tokens_seen": 339894255, - "step": 15756, - "time_per_iteration": 4.229847431182861 - }, - { - "auxiliary_loss_clip": 0.01083084, - "auxiliary_loss_mlp": 0.01030263, - "balance_loss_clip": 1.03162217, - "balance_loss_mlp": 1.01807952, - "epoch": 0.9473620922891929, - "flos": 17639896602240.0, - "grad_norm": 2.263017805746966, - "language_loss": 0.74833083, - "learning_rate": 2.8962403882118347e-08, - "loss": 0.76946425, - "num_input_tokens_seen": 339912425, - "step": 15757, - "time_per_iteration": 2.64909291267395 - }, - { - "auxiliary_loss_clip": 0.01089898, - "auxiliary_loss_mlp": 0.0103155, - "balance_loss_clip": 1.03554904, - "balance_loss_mlp": 1.01819229, - "epoch": 0.9474222155418608, - "flos": 23550002640000.0, - "grad_norm": 1.9625522630071812, - "language_loss": 0.79462659, - "learning_rate": 2.889640171327512e-08, - "loss": 0.81584108, - "num_input_tokens_seen": 339929635, - "step": 15758, - "time_per_iteration": 4.308099031448364 - }, - { - "auxiliary_loss_clip": 0.01077085, - "auxiliary_loss_mlp": 0.00769854, - "balance_loss_clip": 1.03425276, - "balance_loss_mlp": 1.00017619, - "epoch": 0.9474823387945288, - "flos": 27089753247360.0, - "grad_norm": 1.3762409201655417, - "language_loss": 0.71830899, - "learning_rate": 2.8830474288877638e-08, - "loss": 0.73677838, - "num_input_tokens_seen": 339951200, - "step": 15759, - "time_per_iteration": 2.7510428428649902 - }, - { - "auxiliary_loss_clip": 0.01091647, - "auxiliary_loss_mlp": 0.01028463, - "balance_loss_clip": 1.0367434, - "balance_loss_mlp": 1.01805592, - "epoch": 0.9475424620471967, - "flos": 22966526113920.0, - "grad_norm": 1.5226325808492376, - "language_loss": 0.75499642, - "learning_rate": 2.8764621611426344e-08, - "loss": 0.77619755, - "num_input_tokens_seen": 339971820, - "step": 15760, - "time_per_iteration": 4.22639799118042 - }, - { - "auxiliary_loss_clip": 0.01107661, - "auxiliary_loss_mlp": 0.00769288, - "balance_loss_clip": 1.0366137, - "balance_loss_mlp": 1.00024486, - "epoch": 0.9476025852998647, - "flos": 20047671025920.0, - "grad_norm": 1.877401984510813, - "language_loss": 0.7275269, - "learning_rate": 2.8698843683418128e-08, - "loss": 0.74629641, - "num_input_tokens_seen": 339989420, - "step": 15761, - "time_per_iteration": 2.6196117401123047 - }, - { - "auxiliary_loss_clip": 0.01086146, - "auxiliary_loss_mlp": 0.01036281, - "balance_loss_clip": 1.03789508, - "balance_loss_mlp": 1.02441955, - "epoch": 0.9476627085525327, - "flos": 14975468524800.0, - "grad_norm": 2.729446835084705, - "language_loss": 0.71608138, - "learning_rate": 2.863314050734722e-08, - "loss": 0.73730564, - "num_input_tokens_seen": 340006690, - "step": 15762, - "time_per_iteration": 2.579223155975342 - }, - { - "auxiliary_loss_clip": 0.01111512, - "auxiliary_loss_mlp": 0.01036338, - "balance_loss_clip": 1.03547406, - "balance_loss_mlp": 1.02280796, - "epoch": 0.9477228318052007, - "flos": 18697788984960.0, - "grad_norm": 2.07051809457296, - "language_loss": 0.66850615, - "learning_rate": 2.856751208570518e-08, - "loss": 0.68998462, - "num_input_tokens_seen": 340025480, - "step": 15763, - "time_per_iteration": 2.5970752239227295 - }, - { - "auxiliary_loss_clip": 0.01107023, - "auxiliary_loss_mlp": 0.01036664, - "balance_loss_clip": 1.03498352, - "balance_loss_mlp": 1.02424252, - "epoch": 0.9477829550578686, - "flos": 23875065745920.0, - "grad_norm": 1.7866390242550823, - "language_loss": 0.69783157, - "learning_rate": 2.8501958420980466e-08, - "loss": 0.71926844, - "num_input_tokens_seen": 340043785, - "step": 15764, - "time_per_iteration": 2.5758285522460938 - }, - { - "auxiliary_loss_clip": 0.01095569, - "auxiliary_loss_mlp": 0.00768781, - "balance_loss_clip": 1.03836465, - "balance_loss_mlp": 1.00017273, - "epoch": 0.9478430783105366, - "flos": 22562890007040.0, - "grad_norm": 1.6268430916699592, - "language_loss": 0.71237898, - "learning_rate": 2.8436479515659306e-08, - "loss": 0.73102248, - "num_input_tokens_seen": 340064360, - "step": 15765, - "time_per_iteration": 2.7962822914123535 - }, - { - "auxiliary_loss_clip": 0.01008188, - "auxiliary_loss_mlp": 0.01003115, - "balance_loss_clip": 1.00526595, - "balance_loss_mlp": 1.00220859, - "epoch": 0.9479032015632046, - "flos": 60857885554560.0, - "grad_norm": 0.805215239265766, - "language_loss": 0.59051013, - "learning_rate": 2.8371075372224384e-08, - "loss": 0.61062312, - "num_input_tokens_seen": 340114425, - "step": 15766, - "time_per_iteration": 2.9193778038024902 - }, - { - "auxiliary_loss_clip": 0.01055212, - "auxiliary_loss_mlp": 0.0104033, - "balance_loss_clip": 1.03303111, - "balance_loss_mlp": 1.02758038, - "epoch": 0.9479633248158725, - "flos": 14683873916160.0, - "grad_norm": 1.7099233652082526, - "language_loss": 0.74133235, - "learning_rate": 2.8305745993155938e-08, - "loss": 0.7622878, - "num_input_tokens_seen": 340132200, - "step": 15767, - "time_per_iteration": 2.805891990661621 - }, - { - "auxiliary_loss_clip": 0.0108313, - "auxiliary_loss_mlp": 0.01032242, - "balance_loss_clip": 1.03779268, - "balance_loss_mlp": 1.0191226, - "epoch": 0.9480234480685406, - "flos": 20333878594560.0, - "grad_norm": 2.313278201082517, - "language_loss": 0.73025, - "learning_rate": 2.8240491380931096e-08, - "loss": 0.75140369, - "num_input_tokens_seen": 340149175, - "step": 15768, - "time_per_iteration": 2.6399149894714355 - }, - { - "auxiliary_loss_clip": 0.00990186, - "auxiliary_loss_mlp": 0.01003636, - "balance_loss_clip": 1.00721884, - "balance_loss_mlp": 1.0025754, - "epoch": 0.9480835713212085, - "flos": 70293092428800.0, - "grad_norm": 0.7345736556606803, - "language_loss": 0.55274725, - "learning_rate": 2.8175311538024326e-08, - "loss": 0.57268548, - "num_input_tokens_seen": 340208155, - "step": 15769, - "time_per_iteration": 3.346592664718628 - }, - { - "auxiliary_loss_clip": 0.01060494, - "auxiliary_loss_mlp": 0.01031034, - "balance_loss_clip": 1.03297341, - "balance_loss_mlp": 1.01898789, - "epoch": 0.9481436945738765, - "flos": 25449749055360.0, - "grad_norm": 1.3356689895855771, - "language_loss": 0.77657175, - "learning_rate": 2.8110206466907428e-08, - "loss": 0.79748702, - "num_input_tokens_seen": 340229275, - "step": 15770, - "time_per_iteration": 2.967400074005127 - }, - { - "auxiliary_loss_clip": 0.01090847, - "auxiliary_loss_mlp": 0.0103804, - "balance_loss_clip": 1.0389564, - "balance_loss_mlp": 1.02452743, - "epoch": 0.9482038178265444, - "flos": 26979902478720.0, - "grad_norm": 3.3844723552272304, - "language_loss": 0.79788053, - "learning_rate": 2.8045176170049313e-08, - "loss": 0.81916934, - "num_input_tokens_seen": 340248920, - "step": 15771, - "time_per_iteration": 2.6720709800720215 - }, - { - "auxiliary_loss_clip": 0.0107385, - "auxiliary_loss_mlp": 0.01029719, - "balance_loss_clip": 1.03290439, - "balance_loss_mlp": 1.01731515, - "epoch": 0.9482639410792124, - "flos": 17785442511360.0, - "grad_norm": 1.7305869022137186, - "language_loss": 0.69742543, - "learning_rate": 2.7980220649915566e-08, - "loss": 0.71846116, - "num_input_tokens_seen": 340266775, - "step": 15772, - "time_per_iteration": 2.7055277824401855 - }, - { - "auxiliary_loss_clip": 0.01091743, - "auxiliary_loss_mlp": 0.0103086, - "balance_loss_clip": 1.03463781, - "balance_loss_mlp": 1.01827109, - "epoch": 0.9483240643318803, - "flos": 20996682307200.0, - "grad_norm": 1.5174213608604383, - "language_loss": 0.73862821, - "learning_rate": 2.7915339908969327e-08, - "loss": 0.7598542, - "num_input_tokens_seen": 340285295, - "step": 15773, - "time_per_iteration": 2.594517469406128 - }, - { - "auxiliary_loss_clip": 0.01075154, - "auxiliary_loss_mlp": 0.01037032, - "balance_loss_clip": 1.03320599, - "balance_loss_mlp": 1.02424073, - "epoch": 0.9483841875845483, - "flos": 20083294339200.0, - "grad_norm": 2.198949028085397, - "language_loss": 0.62984806, - "learning_rate": 2.7850533949671072e-08, - "loss": 0.65096992, - "num_input_tokens_seen": 340304265, - "step": 15774, - "time_per_iteration": 2.6656346321105957 - }, - { - "auxiliary_loss_clip": 0.01108855, - "auxiliary_loss_mlp": 0.01032808, - "balance_loss_clip": 1.03615785, - "balance_loss_mlp": 1.01968336, - "epoch": 0.9484443108372163, - "flos": 20813645577600.0, - "grad_norm": 1.8448156686123751, - "language_loss": 0.59319341, - "learning_rate": 2.7785802774478396e-08, - "loss": 0.61461002, - "num_input_tokens_seen": 340323690, - "step": 15775, - "time_per_iteration": 2.6134490966796875 - }, - { - "auxiliary_loss_clip": 0.0108818, - "auxiliary_loss_mlp": 0.01028388, - "balance_loss_clip": 1.03665876, - "balance_loss_mlp": 1.01553738, - "epoch": 0.9485044340898843, - "flos": 36429184506240.0, - "grad_norm": 1.5672743954307715, - "language_loss": 0.61733031, - "learning_rate": 2.772114638584555e-08, - "loss": 0.63849604, - "num_input_tokens_seen": 340345830, - "step": 15776, - "time_per_iteration": 2.759727954864502 - }, - { - "auxiliary_loss_clip": 0.01079507, - "auxiliary_loss_mlp": 0.01031725, - "balance_loss_clip": 1.03297567, - "balance_loss_mlp": 1.01894581, - "epoch": 0.9485645573425522, - "flos": 22602535643520.0, - "grad_norm": 1.5939795755888917, - "language_loss": 0.73614502, - "learning_rate": 2.765656478622458e-08, - "loss": 0.75725728, - "num_input_tokens_seen": 340365910, - "step": 15777, - "time_per_iteration": 2.6045753955841064 - }, - { - "auxiliary_loss_clip": 0.01108311, - "auxiliary_loss_mlp": 0.01035184, - "balance_loss_clip": 1.03904653, - "balance_loss_mlp": 1.0216893, - "epoch": 0.9486246805952202, - "flos": 22017766227840.0, - "grad_norm": 2.932173295404769, - "language_loss": 0.7171486, - "learning_rate": 2.759205797806441e-08, - "loss": 0.73858356, - "num_input_tokens_seen": 340383935, - "step": 15778, - "time_per_iteration": 2.5818030834198 - }, - { - "auxiliary_loss_clip": 0.0109326, - "auxiliary_loss_mlp": 0.00769105, - "balance_loss_clip": 1.03678966, - "balance_loss_mlp": 1.00016212, - "epoch": 0.9486848038478882, - "flos": 16508674604160.0, - "grad_norm": 1.785656818158453, - "language_loss": 0.70001411, - "learning_rate": 2.7527625963810865e-08, - "loss": 0.7186377, - "num_input_tokens_seen": 340402760, - "step": 15779, - "time_per_iteration": 2.5735414028167725 - }, - { - "auxiliary_loss_clip": 0.01109892, - "auxiliary_loss_mlp": 0.01032242, - "balance_loss_clip": 1.03769064, - "balance_loss_mlp": 1.01942062, - "epoch": 0.9487449271005561, - "flos": 19244385221760.0, - "grad_norm": 2.1485694494900045, - "language_loss": 0.78390372, - "learning_rate": 2.7463268745907542e-08, - "loss": 0.80532503, - "num_input_tokens_seen": 340422105, - "step": 15780, - "time_per_iteration": 2.571122169494629 - }, - { - "auxiliary_loss_clip": 0.0108342, - "auxiliary_loss_mlp": 0.00770056, - "balance_loss_clip": 1.03853536, - "balance_loss_mlp": 1.00020254, - "epoch": 0.9488050503532242, - "flos": 21762692772480.0, - "grad_norm": 1.7364814662461427, - "language_loss": 0.66234344, - "learning_rate": 2.7398986326794494e-08, - "loss": 0.68087816, - "num_input_tokens_seen": 340441160, - "step": 15781, - "time_per_iteration": 2.6827192306518555 - }, - { - "auxiliary_loss_clip": 0.01107411, - "auxiliary_loss_mlp": 0.01034117, - "balance_loss_clip": 1.03690338, - "balance_loss_mlp": 1.02159953, - "epoch": 0.9488651736058921, - "flos": 18368919037440.0, - "grad_norm": 9.76335675616754, - "language_loss": 0.79928899, - "learning_rate": 2.733477870890999e-08, - "loss": 0.82070434, - "num_input_tokens_seen": 340458200, - "step": 15782, - "time_per_iteration": 2.567207098007202 - }, - { - "auxiliary_loss_clip": 0.010185, - "auxiliary_loss_mlp": 0.01003019, - "balance_loss_clip": 1.00588965, - "balance_loss_mlp": 1.00194001, - "epoch": 0.9489252968585601, - "flos": 70084057230720.0, - "grad_norm": 0.7221824593756564, - "language_loss": 0.59740299, - "learning_rate": 2.7270645894688082e-08, - "loss": 0.6176182, - "num_input_tokens_seen": 340526420, - "step": 15783, - "time_per_iteration": 3.296163558959961 - }, - { - "auxiliary_loss_clip": 0.01096688, - "auxiliary_loss_mlp": 0.01035623, - "balance_loss_clip": 1.0347774, - "balance_loss_mlp": 1.02289176, - "epoch": 0.948985420111228, - "flos": 27855440490240.0, - "grad_norm": 1.6602222433603364, - "language_loss": 0.73771024, - "learning_rate": 2.720658788656105e-08, - "loss": 0.75903332, - "num_input_tokens_seen": 340546325, - "step": 15784, - "time_per_iteration": 2.671168804168701 - }, - { - "auxiliary_loss_clip": 0.01060019, - "auxiliary_loss_mlp": 0.01031532, - "balance_loss_clip": 1.03550255, - "balance_loss_mlp": 1.01762056, - "epoch": 0.949045543363896, - "flos": 24316049018880.0, - "grad_norm": 1.7690180821758892, - "language_loss": 0.69829547, - "learning_rate": 2.714260468695806e-08, - "loss": 0.71921104, - "num_input_tokens_seen": 340565145, - "step": 15785, - "time_per_iteration": 2.718092203140259 - }, - { - "auxiliary_loss_clip": 0.01108856, - "auxiliary_loss_mlp": 0.01028867, - "balance_loss_clip": 1.03556883, - "balance_loss_mlp": 1.01650548, - "epoch": 0.9491056666165639, - "flos": 24241677909120.0, - "grad_norm": 1.499623149824644, - "language_loss": 0.75997609, - "learning_rate": 2.707869629830495e-08, - "loss": 0.78135335, - "num_input_tokens_seen": 340585465, - "step": 15786, - "time_per_iteration": 2.5866501331329346 - }, - { - "auxiliary_loss_clip": 0.01075928, - "auxiliary_loss_mlp": 0.01032218, - "balance_loss_clip": 1.03659058, - "balance_loss_mlp": 1.02088714, - "epoch": 0.949165789869232, - "flos": 24531261356160.0, - "grad_norm": 1.9121797564334724, - "language_loss": 0.78743112, - "learning_rate": 2.7014862723025335e-08, - "loss": 0.80851257, - "num_input_tokens_seen": 340606010, - "step": 15787, - "time_per_iteration": 2.6785271167755127 - }, - { - "auxiliary_loss_clip": 0.0109935, - "auxiliary_loss_mlp": 0.01029051, - "balance_loss_clip": 1.03999209, - "balance_loss_mlp": 1.01711869, - "epoch": 0.9492259131218999, - "flos": 22235348862720.0, - "grad_norm": 1.5253176882486765, - "language_loss": 0.76644206, - "learning_rate": 2.6951103963540388e-08, - "loss": 0.78772604, - "num_input_tokens_seen": 340626135, - "step": 15788, - "time_per_iteration": 2.7900092601776123 - }, - { - "auxiliary_loss_clip": 0.01098885, - "auxiliary_loss_mlp": 0.01032975, - "balance_loss_clip": 1.03593767, - "balance_loss_mlp": 1.019647, - "epoch": 0.9492860363745679, - "flos": 22966310632320.0, - "grad_norm": 1.7589364420140376, - "language_loss": 0.71141213, - "learning_rate": 2.6887420022266848e-08, - "loss": 0.73273069, - "num_input_tokens_seen": 340644870, - "step": 15789, - "time_per_iteration": 2.6160874366760254 - }, - { - "auxiliary_loss_clip": 0.01059097, - "auxiliary_loss_mlp": 0.01031485, - "balance_loss_clip": 1.03295982, - "balance_loss_mlp": 1.01794267, - "epoch": 0.9493461596272358, - "flos": 18370283754240.0, - "grad_norm": 2.83542221151725, - "language_loss": 0.73137754, - "learning_rate": 2.682381090161989e-08, - "loss": 0.75228333, - "num_input_tokens_seen": 340663695, - "step": 15790, - "time_per_iteration": 2.6108055114746094 - }, - { - "auxiliary_loss_clip": 0.01073497, - "auxiliary_loss_mlp": 0.01035956, - "balance_loss_clip": 1.03374732, - "balance_loss_mlp": 1.02253366, - "epoch": 0.9494062828799038, - "flos": 20011724490240.0, - "grad_norm": 1.9002383614444849, - "language_loss": 0.77333057, - "learning_rate": 2.6760276604012033e-08, - "loss": 0.79442513, - "num_input_tokens_seen": 340682970, - "step": 15791, - "time_per_iteration": 2.688148260116577 - }, - { - "auxiliary_loss_clip": 0.01102402, - "auxiliary_loss_mlp": 0.01034148, - "balance_loss_clip": 1.03735161, - "balance_loss_mlp": 1.02070129, - "epoch": 0.9494664061325718, - "flos": 27228583313280.0, - "grad_norm": 1.7874682888186109, - "language_loss": 0.73599547, - "learning_rate": 2.6696817131852234e-08, - "loss": 0.75736099, - "num_input_tokens_seen": 340702275, - "step": 15792, - "time_per_iteration": 2.643265962600708 - }, - { - "auxiliary_loss_clip": 0.01095336, - "auxiliary_loss_mlp": 0.01034889, - "balance_loss_clip": 1.03347254, - "balance_loss_mlp": 1.02266979, - "epoch": 0.9495265293852397, - "flos": 18369816877440.0, - "grad_norm": 1.8451462038230002, - "language_loss": 0.78138769, - "learning_rate": 2.663343248754679e-08, - "loss": 0.80268991, - "num_input_tokens_seen": 340719060, - "step": 15793, - "time_per_iteration": 2.5426347255706787 - }, - { - "auxiliary_loss_clip": 0.01081824, - "auxiliary_loss_mlp": 0.01030161, - "balance_loss_clip": 1.03453922, - "balance_loss_mlp": 1.0182879, - "epoch": 0.9495866526379078, - "flos": 23075766351360.0, - "grad_norm": 1.686462964828876, - "language_loss": 0.77439916, - "learning_rate": 2.6570122673499562e-08, - "loss": 0.79551899, - "num_input_tokens_seen": 340737815, - "step": 15794, - "time_per_iteration": 4.211062669754028 - }, - { - "auxiliary_loss_clip": 0.01078065, - "auxiliary_loss_mlp": 0.00770639, - "balance_loss_clip": 1.03476417, - "balance_loss_mlp": 1.00026226, - "epoch": 0.9496467758905757, - "flos": 17529902179200.0, - "grad_norm": 1.8326530487226782, - "language_loss": 0.61200684, - "learning_rate": 2.650688769211107e-08, - "loss": 0.63049388, - "num_input_tokens_seen": 340756035, - "step": 15795, - "time_per_iteration": 4.150991201400757 - }, - { - "auxiliary_loss_clip": 0.01096105, - "auxiliary_loss_mlp": 0.01034842, - "balance_loss_clip": 1.03731775, - "balance_loss_mlp": 1.02214646, - "epoch": 0.9497068991432437, - "flos": 24133910129280.0, - "grad_norm": 1.6119216372134806, - "language_loss": 0.79217291, - "learning_rate": 2.644372754577895e-08, - "loss": 0.81348234, - "num_input_tokens_seen": 340775620, - "step": 15796, - "time_per_iteration": 2.6128690242767334 - }, - { - "auxiliary_loss_clip": 0.01097993, - "auxiliary_loss_mlp": 0.01029872, - "balance_loss_clip": 1.03628421, - "balance_loss_mlp": 1.01681852, - "epoch": 0.9497670223959116, - "flos": 20303319098880.0, - "grad_norm": 1.8328846097658669, - "language_loss": 0.75668991, - "learning_rate": 2.6380642236898398e-08, - "loss": 0.77796859, - "num_input_tokens_seen": 340794510, - "step": 15797, - "time_per_iteration": 4.209908723831177 - }, - { - "auxiliary_loss_clip": 0.01076014, - "auxiliary_loss_mlp": 0.00770873, - "balance_loss_clip": 1.03560376, - "balance_loss_mlp": 1.00026107, - "epoch": 0.9498271456485796, - "flos": 13698916099200.0, - "grad_norm": 2.550624917313258, - "language_loss": 0.6578297, - "learning_rate": 2.6317631767861727e-08, - "loss": 0.67629862, - "num_input_tokens_seen": 340812955, - "step": 15798, - "time_per_iteration": 2.6348631381988525 - }, - { - "auxiliary_loss_clip": 0.011003, - "auxiliary_loss_mlp": 0.0103345, - "balance_loss_clip": 1.03773224, - "balance_loss_mlp": 1.02130818, - "epoch": 0.9498872689012475, - "flos": 20814004713600.0, - "grad_norm": 1.909884412198324, - "language_loss": 0.77439278, - "learning_rate": 2.6254696141058575e-08, - "loss": 0.79573023, - "num_input_tokens_seen": 340829200, - "step": 15799, - "time_per_iteration": 2.6085915565490723 - }, - { - "auxiliary_loss_clip": 0.01091765, - "auxiliary_loss_mlp": 0.01034579, - "balance_loss_clip": 1.03405094, - "balance_loss_mlp": 1.02236009, - "epoch": 0.9499473921539155, - "flos": 21032700670080.0, - "grad_norm": 1.7422846362169004, - "language_loss": 0.71096122, - "learning_rate": 2.6191835358874814e-08, - "loss": 0.7322247, - "num_input_tokens_seen": 340848035, - "step": 15800, - "time_per_iteration": 4.11196756362915 - }, - { - "auxiliary_loss_clip": 0.01081003, - "auxiliary_loss_mlp": 0.01027785, - "balance_loss_clip": 1.03178167, - "balance_loss_mlp": 1.01504791, - "epoch": 0.9500075154065835, - "flos": 20998693468800.0, - "grad_norm": 1.6797265038283544, - "language_loss": 0.7196418, - "learning_rate": 2.6129049423694315e-08, - "loss": 0.74072969, - "num_input_tokens_seen": 340870025, - "step": 15801, - "time_per_iteration": 2.7228105068206787 - }, - { - "auxiliary_loss_clip": 0.01098003, - "auxiliary_loss_mlp": 0.01032345, - "balance_loss_clip": 1.03618026, - "balance_loss_mlp": 1.02023363, - "epoch": 0.9500676386592515, - "flos": 25121956515840.0, - "grad_norm": 1.5618247598543729, - "language_loss": 0.80991805, - "learning_rate": 2.6066338337898508e-08, - "loss": 0.83122152, - "num_input_tokens_seen": 340892290, - "step": 15802, - "time_per_iteration": 2.6597704887390137 - }, - { - "auxiliary_loss_clip": 0.01111718, - "auxiliary_loss_mlp": 0.01031196, - "balance_loss_clip": 1.03881001, - "balance_loss_mlp": 1.01899517, - "epoch": 0.9501277619119194, - "flos": 27523625627520.0, - "grad_norm": 1.6749081287524619, - "language_loss": 0.67810452, - "learning_rate": 2.60037021038646e-08, - "loss": 0.69953358, - "num_input_tokens_seen": 340912260, - "step": 15803, - "time_per_iteration": 2.6744706630706787 - }, - { - "auxiliary_loss_clip": 0.01082837, - "auxiliary_loss_mlp": 0.01036561, - "balance_loss_clip": 1.03429604, - "balance_loss_mlp": 1.02377534, - "epoch": 0.9501878851645874, - "flos": 20813968800000.0, - "grad_norm": 6.246974750170738, - "language_loss": 0.76370931, - "learning_rate": 2.5941140723968247e-08, - "loss": 0.78490329, - "num_input_tokens_seen": 340928930, - "step": 15804, - "time_per_iteration": 2.721076726913452 - }, - { - "auxiliary_loss_clip": 0.01096211, - "auxiliary_loss_mlp": 0.01035081, - "balance_loss_clip": 1.03763199, - "balance_loss_mlp": 1.0223074, - "epoch": 0.9502480084172553, - "flos": 18369385914240.0, - "grad_norm": 1.716451063779602, - "language_loss": 0.73370028, - "learning_rate": 2.5878654200581775e-08, - "loss": 0.75501317, - "num_input_tokens_seen": 340946615, - "step": 15805, - "time_per_iteration": 2.573842763900757 - }, - { - "auxiliary_loss_clip": 0.01084759, - "auxiliary_loss_mlp": 0.01033637, - "balance_loss_clip": 1.03832221, - "balance_loss_mlp": 1.02066755, - "epoch": 0.9503081316699233, - "flos": 23549607590400.0, - "grad_norm": 1.446963145068923, - "language_loss": 0.80407286, - "learning_rate": 2.5816242536074618e-08, - "loss": 0.82525682, - "num_input_tokens_seen": 340967545, - "step": 15806, - "time_per_iteration": 2.7522966861724854 - }, - { - "auxiliary_loss_clip": 0.0107262, - "auxiliary_loss_mlp": 0.010333, - "balance_loss_clip": 1.03583097, - "balance_loss_mlp": 1.0209856, - "epoch": 0.9503682549225914, - "flos": 18040444139520.0, - "grad_norm": 2.3524275166414688, - "language_loss": 0.82226515, - "learning_rate": 2.5753905732813108e-08, - "loss": 0.8433243, - "num_input_tokens_seen": 340984955, - "step": 15807, - "time_per_iteration": 2.6490519046783447 - }, - { - "auxiliary_loss_clip": 0.01089448, - "auxiliary_loss_mlp": 0.01031628, - "balance_loss_clip": 1.03269625, - "balance_loss_mlp": 1.01936102, - "epoch": 0.9504283781752593, - "flos": 25886135387520.0, - "grad_norm": 9.284971191525596, - "language_loss": 0.71851462, - "learning_rate": 2.5691643793161355e-08, - "loss": 0.73972535, - "num_input_tokens_seen": 341007300, - "step": 15808, - "time_per_iteration": 2.6571197509765625 - }, - { - "auxiliary_loss_clip": 0.01097791, - "auxiliary_loss_mlp": 0.01030365, - "balance_loss_clip": 1.03632784, - "balance_loss_mlp": 1.01830709, - "epoch": 0.9504885014279273, - "flos": 22124025636480.0, - "grad_norm": 1.4241274902229573, - "language_loss": 0.69725883, - "learning_rate": 2.562945671948058e-08, - "loss": 0.71854043, - "num_input_tokens_seen": 341026695, - "step": 15809, - "time_per_iteration": 2.602086067199707 - }, - { - "auxiliary_loss_clip": 0.0108373, - "auxiliary_loss_mlp": 0.01027915, - "balance_loss_clip": 1.03374672, - "balance_loss_mlp": 1.01552939, - "epoch": 0.9505486246805952, - "flos": 21615961714560.0, - "grad_norm": 1.5381287986116137, - "language_loss": 0.75574476, - "learning_rate": 2.5567344514128452e-08, - "loss": 0.77686119, - "num_input_tokens_seen": 341047080, - "step": 15810, - "time_per_iteration": 2.7851271629333496 - }, - { - "auxiliary_loss_clip": 0.01074163, - "auxiliary_loss_mlp": 0.01043017, - "balance_loss_clip": 1.03387725, - "balance_loss_mlp": 1.03033352, - "epoch": 0.9506087479332632, - "flos": 22528236360960.0, - "grad_norm": 1.4680738031168652, - "language_loss": 0.79985034, - "learning_rate": 2.5505307179460643e-08, - "loss": 0.82102215, - "num_input_tokens_seen": 341067310, - "step": 15811, - "time_per_iteration": 2.716155767440796 - }, - { - "auxiliary_loss_clip": 0.01082329, - "auxiliary_loss_mlp": 0.01038784, - "balance_loss_clip": 1.03409791, - "balance_loss_mlp": 1.02606487, - "epoch": 0.9506688711859311, - "flos": 27527360641920.0, - "grad_norm": 2.1864110496701823, - "language_loss": 0.69794703, - "learning_rate": 2.5443344717829495e-08, - "loss": 0.71915817, - "num_input_tokens_seen": 341085110, - "step": 15812, - "time_per_iteration": 2.7080633640289307 - }, - { - "auxiliary_loss_clip": 0.01071236, - "auxiliary_loss_mlp": 0.01036435, - "balance_loss_clip": 1.03449655, - "balance_loss_mlp": 1.0230056, - "epoch": 0.9507289944385992, - "flos": 19865783531520.0, - "grad_norm": 1.621391502442825, - "language_loss": 0.65664506, - "learning_rate": 2.538145713158446e-08, - "loss": 0.67772174, - "num_input_tokens_seen": 341103190, - "step": 15813, - "time_per_iteration": 2.6422770023345947 - }, - { - "auxiliary_loss_clip": 0.01099547, - "auxiliary_loss_mlp": 0.01037026, - "balance_loss_clip": 1.03611267, - "balance_loss_mlp": 1.02409816, - "epoch": 0.9507891176912671, - "flos": 25193274969600.0, - "grad_norm": 1.4581482978793308, - "language_loss": 0.70320028, - "learning_rate": 2.5319644423072327e-08, - "loss": 0.72456604, - "num_input_tokens_seen": 341125695, - "step": 15814, - "time_per_iteration": 2.658942699432373 - }, - { - "auxiliary_loss_clip": 0.01097344, - "auxiliary_loss_mlp": 0.01028268, - "balance_loss_clip": 1.03695726, - "balance_loss_mlp": 1.01665115, - "epoch": 0.9508492409439351, - "flos": 24899561458560.0, - "grad_norm": 1.8950917263769373, - "language_loss": 0.63310945, - "learning_rate": 2.5257906594637445e-08, - "loss": 0.65436554, - "num_input_tokens_seen": 341143930, - "step": 15815, - "time_per_iteration": 2.633420944213867 - }, - { - "auxiliary_loss_clip": 0.01084007, - "auxiliary_loss_mlp": 0.01027739, - "balance_loss_clip": 1.03480506, - "balance_loss_mlp": 1.01581824, - "epoch": 0.950909364196603, - "flos": 29784094375680.0, - "grad_norm": 1.8730237235822342, - "language_loss": 0.58833039, - "learning_rate": 2.519624364862061e-08, - "loss": 0.60944784, - "num_input_tokens_seen": 341164280, - "step": 15816, - "time_per_iteration": 2.7500061988830566 - }, - { - "auxiliary_loss_clip": 0.0110715, - "auxiliary_loss_mlp": 0.01039761, - "balance_loss_clip": 1.03585255, - "balance_loss_mlp": 1.02707124, - "epoch": 0.950969487449271, - "flos": 24717781704960.0, - "grad_norm": 1.491116548169098, - "language_loss": 0.73515993, - "learning_rate": 2.513465558735994e-08, - "loss": 0.75662911, - "num_input_tokens_seen": 341183670, - "step": 15817, - "time_per_iteration": 2.6232523918151855 - }, - { - "auxiliary_loss_clip": 0.01089005, - "auxiliary_loss_mlp": 0.01034701, - "balance_loss_clip": 1.03726554, - "balance_loss_mlp": 1.02046156, - "epoch": 0.9510296107019389, - "flos": 13699167494400.0, - "grad_norm": 1.5602178845218895, - "language_loss": 0.60236609, - "learning_rate": 2.5073142413190918e-08, - "loss": 0.62360317, - "num_input_tokens_seen": 341201900, - "step": 15818, - "time_per_iteration": 2.6116764545440674 - }, - { - "auxiliary_loss_clip": 0.01109081, - "auxiliary_loss_mlp": 0.01034014, - "balance_loss_clip": 1.0376842, - "balance_loss_mlp": 1.02124643, - "epoch": 0.9510897339546069, - "flos": 17311852667520.0, - "grad_norm": 2.0566483218675438, - "language_loss": 0.6942215, - "learning_rate": 2.5011704128446552e-08, - "loss": 0.71565247, - "num_input_tokens_seen": 341218340, - "step": 15819, - "time_per_iteration": 2.560081958770752 - }, - { - "auxiliary_loss_clip": 0.0107016, - "auxiliary_loss_mlp": 0.01028125, - "balance_loss_clip": 1.0372014, - "balance_loss_mlp": 1.0156858, - "epoch": 0.951149857207275, - "flos": 14793940166400.0, - "grad_norm": 1.7393168966248527, - "language_loss": 0.73959541, - "learning_rate": 2.49503407354561e-08, - "loss": 0.76057822, - "num_input_tokens_seen": 341235885, - "step": 15820, - "time_per_iteration": 2.797940969467163 - }, - { - "auxiliary_loss_clip": 0.01089647, - "auxiliary_loss_mlp": 0.01033566, - "balance_loss_clip": 1.03681791, - "balance_loss_mlp": 1.02076864, - "epoch": 0.9512099804599429, - "flos": 19391152193280.0, - "grad_norm": 1.9037531735176971, - "language_loss": 0.78643155, - "learning_rate": 2.4889052236546804e-08, - "loss": 0.80766368, - "num_input_tokens_seen": 341255280, - "step": 15821, - "time_per_iteration": 2.6202476024627686 - }, - { - "auxiliary_loss_clip": 0.01068626, - "auxiliary_loss_mlp": 0.0102901, - "balance_loss_clip": 1.03432822, - "balance_loss_mlp": 1.01609325, - "epoch": 0.9512701037126109, - "flos": 36757874885760.0, - "grad_norm": 1.5233600677568924, - "language_loss": 0.71154618, - "learning_rate": 2.4827838634042586e-08, - "loss": 0.73252249, - "num_input_tokens_seen": 341279055, - "step": 15822, - "time_per_iteration": 2.8137216567993164 - }, - { - "auxiliary_loss_clip": 0.01094806, - "auxiliary_loss_mlp": 0.01037452, - "balance_loss_clip": 1.03667974, - "balance_loss_mlp": 1.02534676, - "epoch": 0.9513302269652788, - "flos": 22638266697600.0, - "grad_norm": 1.6180069901826792, - "language_loss": 0.65828168, - "learning_rate": 2.47666999302647e-08, - "loss": 0.67960423, - "num_input_tokens_seen": 341298560, - "step": 15823, - "time_per_iteration": 2.616811513900757 - }, - { - "auxiliary_loss_clip": 0.01090848, - "auxiliary_loss_mlp": 0.01031876, - "balance_loss_clip": 1.03517807, - "balance_loss_mlp": 1.01999104, - "epoch": 0.9513903502179468, - "flos": 22893232412160.0, - "grad_norm": 1.8863282557920107, - "language_loss": 0.77391565, - "learning_rate": 2.4705636127531292e-08, - "loss": 0.79514301, - "num_input_tokens_seen": 341316650, - "step": 15824, - "time_per_iteration": 2.5897138118743896 - }, - { - "auxiliary_loss_clip": 0.01110536, - "auxiliary_loss_mlp": 0.01031394, - "balance_loss_clip": 1.0360415, - "balance_loss_mlp": 1.01814985, - "epoch": 0.9514504734706147, - "flos": 27928626451200.0, - "grad_norm": 1.8804632984111238, - "language_loss": 0.73739725, - "learning_rate": 2.4644647228158065e-08, - "loss": 0.75881654, - "num_input_tokens_seen": 341336185, - "step": 15825, - "time_per_iteration": 2.59452223777771 - }, - { - "auxiliary_loss_clip": 0.0101482, - "auxiliary_loss_mlp": 0.00999967, - "balance_loss_clip": 1.00606704, - "balance_loss_mlp": 0.99895328, - "epoch": 0.9515105967232828, - "flos": 67366767312000.0, - "grad_norm": 0.8525119381835639, - "language_loss": 0.53459394, - "learning_rate": 2.458373323445806e-08, - "loss": 0.55474186, - "num_input_tokens_seen": 341395795, - "step": 15826, - "time_per_iteration": 3.0530049800872803 - }, - { - "auxiliary_loss_clip": 0.01084306, - "auxiliary_loss_mlp": 0.01035155, - "balance_loss_clip": 1.03550363, - "balance_loss_mlp": 1.02248907, - "epoch": 0.9515707199759507, - "flos": 25846525664640.0, - "grad_norm": 2.1311197223836458, - "language_loss": 0.72489649, - "learning_rate": 2.452289414874076e-08, - "loss": 0.74609113, - "num_input_tokens_seen": 341415675, - "step": 15827, - "time_per_iteration": 2.67301869392395 - }, - { - "auxiliary_loss_clip": 0.01086796, - "auxiliary_loss_mlp": 0.01030812, - "balance_loss_clip": 1.03601933, - "balance_loss_mlp": 1.01807427, - "epoch": 0.9516308432286187, - "flos": 21828983322240.0, - "grad_norm": 1.785994656352798, - "language_loss": 0.7409234, - "learning_rate": 2.4462129973313207e-08, - "loss": 0.7620995, - "num_input_tokens_seen": 341432990, - "step": 15828, - "time_per_iteration": 2.6235291957855225 - }, - { - "auxiliary_loss_clip": 0.0106734, - "auxiliary_loss_mlp": 0.01033444, - "balance_loss_clip": 1.03639388, - "balance_loss_mlp": 1.02239347, - "epoch": 0.9516909664812866, - "flos": 27269593666560.0, - "grad_norm": 1.6745966727407804, - "language_loss": 0.72937709, - "learning_rate": 2.440144071047978e-08, - "loss": 0.75038493, - "num_input_tokens_seen": 341454100, - "step": 15829, - "time_per_iteration": 2.831969738006592 - }, - { - "auxiliary_loss_clip": 0.01093583, - "auxiliary_loss_mlp": 0.01033735, - "balance_loss_clip": 1.03440034, - "balance_loss_mlp": 1.02001393, - "epoch": 0.9517510897339546, - "flos": 21215342350080.0, - "grad_norm": 2.2807166636074863, - "language_loss": 0.61247396, - "learning_rate": 2.4340826362541533e-08, - "loss": 0.6337471, - "num_input_tokens_seen": 341472955, - "step": 15830, - "time_per_iteration": 2.57916522026062 - }, - { - "auxiliary_loss_clip": 0.01095854, - "auxiliary_loss_mlp": 0.01031396, - "balance_loss_clip": 1.03783762, - "balance_loss_mlp": 1.01720452, - "epoch": 0.9518112129866225, - "flos": 18733986915840.0, - "grad_norm": 2.414229225065315, - "language_loss": 0.72597665, - "learning_rate": 2.428028693179729e-08, - "loss": 0.74724913, - "num_input_tokens_seen": 341490165, - "step": 15831, - "time_per_iteration": 2.590857982635498 - }, - { - "auxiliary_loss_clip": 0.01054785, - "auxiliary_loss_mlp": 0.01024066, - "balance_loss_clip": 1.03245831, - "balance_loss_mlp": 1.01274085, - "epoch": 0.9518713362392905, - "flos": 16763676232320.0, - "grad_norm": 1.6809065599907524, - "language_loss": 0.65303266, - "learning_rate": 2.4219822420542545e-08, - "loss": 0.67382115, - "num_input_tokens_seen": 341508055, - "step": 15832, - "time_per_iteration": 2.7475693225860596 - }, - { - "auxiliary_loss_clip": 0.01093001, - "auxiliary_loss_mlp": 0.01036123, - "balance_loss_clip": 1.03763044, - "balance_loss_mlp": 1.02308214, - "epoch": 0.9519314594919586, - "flos": 15230649720960.0, - "grad_norm": 1.7187781750552136, - "language_loss": 0.77851391, - "learning_rate": 2.4159432831070135e-08, - "loss": 0.79980505, - "num_input_tokens_seen": 341526155, - "step": 15833, - "time_per_iteration": 2.5683181285858154 - }, - { - "auxiliary_loss_clip": 0.01069974, - "auxiliary_loss_mlp": 0.01031366, - "balance_loss_clip": 1.03459656, - "balance_loss_mlp": 1.01919472, - "epoch": 0.9519915827446265, - "flos": 19352943100800.0, - "grad_norm": 1.9155025813330617, - "language_loss": 0.75245464, - "learning_rate": 2.4099118165670007e-08, - "loss": 0.77346802, - "num_input_tokens_seen": 341540450, - "step": 15834, - "time_per_iteration": 5.729520559310913 - }, - { - "auxiliary_loss_clip": 0.01098407, - "auxiliary_loss_mlp": 0.01035182, - "balance_loss_clip": 1.038692, - "balance_loss_mlp": 1.02169371, - "epoch": 0.9520517059972945, - "flos": 22266303408000.0, - "grad_norm": 2.297648558034633, - "language_loss": 0.7629987, - "learning_rate": 2.4038878426629216e-08, - "loss": 0.78433454, - "num_input_tokens_seen": 341557865, - "step": 15835, - "time_per_iteration": 2.570033073425293 - }, - { - "auxiliary_loss_clip": 0.01086302, - "auxiliary_loss_mlp": 0.01031867, - "balance_loss_clip": 1.03379786, - "balance_loss_mlp": 1.01873004, - "epoch": 0.9521118292499624, - "flos": 14862313704960.0, - "grad_norm": 1.9369044520517964, - "language_loss": 0.6651296, - "learning_rate": 2.397871361623238e-08, - "loss": 0.68631124, - "num_input_tokens_seen": 341573890, - "step": 15836, - "time_per_iteration": 4.36873197555542 - }, - { - "auxiliary_loss_clip": 0.01072203, - "auxiliary_loss_mlp": 0.01027392, - "balance_loss_clip": 1.03464746, - "balance_loss_mlp": 1.01531649, - "epoch": 0.9521719525026304, - "flos": 23508812718720.0, - "grad_norm": 1.945512889089952, - "language_loss": 0.70333862, - "learning_rate": 2.391862373676057e-08, - "loss": 0.72433454, - "num_input_tokens_seen": 341593770, - "step": 15837, - "time_per_iteration": 2.705793619155884 - }, - { - "auxiliary_loss_clip": 0.01110794, - "auxiliary_loss_mlp": 0.01033236, - "balance_loss_clip": 1.03770208, - "balance_loss_mlp": 1.01952064, - "epoch": 0.9522320757552983, - "flos": 19714922409600.0, - "grad_norm": 2.0871537703897673, - "language_loss": 0.73548734, - "learning_rate": 2.3858608790492617e-08, - "loss": 0.75692767, - "num_input_tokens_seen": 341612065, - "step": 15838, - "time_per_iteration": 2.626145362854004 - }, - { - "auxiliary_loss_clip": 0.01076517, - "auxiliary_loss_mlp": 0.01027932, - "balance_loss_clip": 1.03396976, - "balance_loss_mlp": 1.0152297, - "epoch": 0.9522921990079664, - "flos": 25921291824000.0, - "grad_norm": 4.0429942363631275, - "language_loss": 0.78156877, - "learning_rate": 2.379866877970449e-08, - "loss": 0.80261326, - "num_input_tokens_seen": 341631365, - "step": 15839, - "time_per_iteration": 4.274654865264893 - }, - { - "auxiliary_loss_clip": 0.01085718, - "auxiliary_loss_mlp": 0.01032381, - "balance_loss_clip": 1.04032528, - "balance_loss_mlp": 1.02013206, - "epoch": 0.9523523222606343, - "flos": 19208115463680.0, - "grad_norm": 1.5000947489157939, - "language_loss": 0.80272675, - "learning_rate": 2.3738803706668585e-08, - "loss": 0.82390767, - "num_input_tokens_seen": 341650300, - "step": 15840, - "time_per_iteration": 2.7204654216766357 - }, - { - "auxiliary_loss_clip": 0.01078473, - "auxiliary_loss_mlp": 0.01028715, - "balance_loss_clip": 1.0350554, - "balance_loss_mlp": 1.0179565, - "epoch": 0.9524124455133023, - "flos": 20921269703040.0, - "grad_norm": 2.113087759766638, - "language_loss": 0.73338723, - "learning_rate": 2.3679013573655314e-08, - "loss": 0.75445914, - "num_input_tokens_seen": 341667680, - "step": 15841, - "time_per_iteration": 2.6518993377685547 - }, - { - "auxiliary_loss_clip": 0.01080022, - "auxiliary_loss_mlp": 0.01026928, - "balance_loss_clip": 1.03612816, - "balance_loss_mlp": 1.01526952, - "epoch": 0.9524725687659702, - "flos": 18843550375680.0, - "grad_norm": 1.7318468009780055, - "language_loss": 0.79018557, - "learning_rate": 2.3619298382931972e-08, - "loss": 0.8112551, - "num_input_tokens_seen": 341685760, - "step": 15842, - "time_per_iteration": 2.620762825012207 - }, - { - "auxiliary_loss_clip": 0.01085992, - "auxiliary_loss_mlp": 0.01032273, - "balance_loss_clip": 1.03697205, - "balance_loss_mlp": 1.01970291, - "epoch": 0.9525326920186382, - "flos": 22674680110080.0, - "grad_norm": 2.120857377915384, - "language_loss": 0.72623742, - "learning_rate": 2.3559658136762973e-08, - "loss": 0.74742007, - "num_input_tokens_seen": 341705300, - "step": 15843, - "time_per_iteration": 2.643082618713379 - }, - { - "auxiliary_loss_clip": 0.01080268, - "auxiliary_loss_mlp": 0.00770279, - "balance_loss_clip": 1.03644204, - "balance_loss_mlp": 1.00023687, - "epoch": 0.9525928152713061, - "flos": 22086642556800.0, - "grad_norm": 1.7610421238919713, - "language_loss": 0.78494173, - "learning_rate": 2.3500092837409612e-08, - "loss": 0.80344719, - "num_input_tokens_seen": 341724565, - "step": 15844, - "time_per_iteration": 2.672140121459961 - }, - { - "auxiliary_loss_clip": 0.0107313, - "auxiliary_loss_mlp": 0.01034185, - "balance_loss_clip": 1.03377759, - "balance_loss_mlp": 1.0192126, - "epoch": 0.9526529385239741, - "flos": 20704728562560.0, - "grad_norm": 2.3272849000884133, - "language_loss": 0.70132804, - "learning_rate": 2.3440602487130977e-08, - "loss": 0.7224012, - "num_input_tokens_seen": 341743605, - "step": 15845, - "time_per_iteration": 2.6669421195983887 - }, - { - "auxiliary_loss_clip": 0.01073757, - "auxiliary_loss_mlp": 0.01035061, - "balance_loss_clip": 1.03685403, - "balance_loss_mlp": 1.02289605, - "epoch": 0.9527130617766422, - "flos": 23368043318400.0, - "grad_norm": 1.600943165785114, - "language_loss": 0.75702989, - "learning_rate": 2.338118708818282e-08, - "loss": 0.77811807, - "num_input_tokens_seen": 341763475, - "step": 15846, - "time_per_iteration": 2.7024073600769043 - }, - { - "auxiliary_loss_clip": 0.01078418, - "auxiliary_loss_mlp": 0.01025955, - "balance_loss_clip": 1.03588128, - "balance_loss_mlp": 1.01366425, - "epoch": 0.9527731850293101, - "flos": 18985935888000.0, - "grad_norm": 1.8711646240490332, - "language_loss": 0.78105325, - "learning_rate": 2.3321846642817998e-08, - "loss": 0.80209702, - "num_input_tokens_seen": 341781265, - "step": 15847, - "time_per_iteration": 2.780184507369995 - }, - { - "auxiliary_loss_clip": 0.01066366, - "auxiliary_loss_mlp": 0.01035518, - "balance_loss_clip": 1.03419328, - "balance_loss_mlp": 1.0241214, - "epoch": 0.9528333082819781, - "flos": 19318038059520.0, - "grad_norm": 1.9530188537907924, - "language_loss": 0.7798357, - "learning_rate": 2.326258115328672e-08, - "loss": 0.80085456, - "num_input_tokens_seen": 341798825, - "step": 15848, - "time_per_iteration": 2.7238736152648926 - }, - { - "auxiliary_loss_clip": 0.01089796, - "auxiliary_loss_mlp": 0.01042438, - "balance_loss_clip": 1.03580141, - "balance_loss_mlp": 1.02845478, - "epoch": 0.952893431534646, - "flos": 23951340276480.0, - "grad_norm": 1.8605077163556365, - "language_loss": 0.72040188, - "learning_rate": 2.320339062183674e-08, - "loss": 0.74172425, - "num_input_tokens_seen": 341819480, - "step": 15849, - "time_per_iteration": 2.682178258895874 - }, - { - "auxiliary_loss_clip": 0.01105363, - "auxiliary_loss_mlp": 0.01038455, - "balance_loss_clip": 1.04046464, - "balance_loss_mlp": 1.02527022, - "epoch": 0.952953554787314, - "flos": 21030545854080.0, - "grad_norm": 1.819487619719596, - "language_loss": 0.75498259, - "learning_rate": 2.314427505071226e-08, - "loss": 0.77642077, - "num_input_tokens_seen": 341838035, - "step": 15850, - "time_per_iteration": 2.6890413761138916 - }, - { - "auxiliary_loss_clip": 0.01080509, - "auxiliary_loss_mlp": 0.01034554, - "balance_loss_clip": 1.03441119, - "balance_loss_mlp": 1.02248454, - "epoch": 0.9530136780399819, - "flos": 22382870019840.0, - "grad_norm": 2.121651511514479, - "language_loss": 0.72852147, - "learning_rate": 2.308523444215482e-08, - "loss": 0.74967206, - "num_input_tokens_seen": 341855895, - "step": 15851, - "time_per_iteration": 2.681929111480713 - }, - { - "auxiliary_loss_clip": 0.01082039, - "auxiliary_loss_mlp": 0.01026961, - "balance_loss_clip": 1.03587413, - "balance_loss_mlp": 1.01521945, - "epoch": 0.95307380129265, - "flos": 22159613036160.0, - "grad_norm": 1.7583423782489798, - "language_loss": 0.79609531, - "learning_rate": 2.3026268798403525e-08, - "loss": 0.8171854, - "num_input_tokens_seen": 341875240, - "step": 15852, - "time_per_iteration": 2.6543726921081543 - }, - { - "auxiliary_loss_clip": 0.01097888, - "auxiliary_loss_mlp": 0.01036154, - "balance_loss_clip": 1.03511071, - "balance_loss_mlp": 1.02323198, - "epoch": 0.9531339245453179, - "flos": 44022747214080.0, - "grad_norm": 1.5582381447981437, - "language_loss": 0.59615147, - "learning_rate": 2.2967378121694138e-08, - "loss": 0.61749196, - "num_input_tokens_seen": 341901020, - "step": 15853, - "time_per_iteration": 2.7729127407073975 - }, - { - "auxiliary_loss_clip": 0.01084188, - "auxiliary_loss_mlp": 0.01032237, - "balance_loss_clip": 1.0343461, - "balance_loss_mlp": 1.02051926, - "epoch": 0.9531940477979859, - "flos": 20266690204800.0, - "grad_norm": 1.8458954546465922, - "language_loss": 0.72333086, - "learning_rate": 2.290856241425998e-08, - "loss": 0.74449503, - "num_input_tokens_seen": 341919365, - "step": 15854, - "time_per_iteration": 2.667217254638672 - }, - { - "auxiliary_loss_clip": 0.01081866, - "auxiliary_loss_mlp": 0.01031192, - "balance_loss_clip": 1.03433609, - "balance_loss_mlp": 1.01909232, - "epoch": 0.9532541710506538, - "flos": 25335732309120.0, - "grad_norm": 2.1969630613589057, - "language_loss": 0.67196018, - "learning_rate": 2.284982167833127e-08, - "loss": 0.69309074, - "num_input_tokens_seen": 341939985, - "step": 15855, - "time_per_iteration": 2.6534695625305176 - }, - { - "auxiliary_loss_clip": 0.01109271, - "auxiliary_loss_mlp": 0.01030983, - "balance_loss_clip": 1.0367763, - "balance_loss_mlp": 1.01885402, - "epoch": 0.9533142943033218, - "flos": 26469288691200.0, - "grad_norm": 1.5274275727026758, - "language_loss": 0.76655555, - "learning_rate": 2.279115591613556e-08, - "loss": 0.78795809, - "num_input_tokens_seen": 341959255, - "step": 15856, - "time_per_iteration": 2.6008455753326416 - }, - { - "auxiliary_loss_clip": 0.01080944, - "auxiliary_loss_mlp": 0.0103369, - "balance_loss_clip": 1.03132057, - "balance_loss_mlp": 1.02190578, - "epoch": 0.9533744175559897, - "flos": 23656944407040.0, - "grad_norm": 1.7148081146844736, - "language_loss": 0.77968013, - "learning_rate": 2.2732565129897075e-08, - "loss": 0.80082643, - "num_input_tokens_seen": 341977205, - "step": 15857, - "time_per_iteration": 2.6481335163116455 - }, - { - "auxiliary_loss_clip": 0.01019391, - "auxiliary_loss_mlp": 0.01003272, - "balance_loss_clip": 1.00663853, - "balance_loss_mlp": 1.00225866, - "epoch": 0.9534345408086577, - "flos": 61052055500160.0, - "grad_norm": 0.7153193040155459, - "language_loss": 0.6259079, - "learning_rate": 2.267404932183803e-08, - "loss": 0.6461345, - "num_input_tokens_seen": 342038545, - "step": 15858, - "time_per_iteration": 3.112011671066284 - }, - { - "auxiliary_loss_clip": 0.01057029, - "auxiliary_loss_mlp": 0.01028806, - "balance_loss_clip": 1.03275323, - "balance_loss_mlp": 1.01722491, - "epoch": 0.9534946640613258, - "flos": 18951677291520.0, - "grad_norm": 1.5293384678689539, - "language_loss": 0.56808496, - "learning_rate": 2.2615608494177097e-08, - "loss": 0.58894336, - "num_input_tokens_seen": 342058195, - "step": 15859, - "time_per_iteration": 2.699678897857666 - }, - { - "auxiliary_loss_clip": 0.01104207, - "auxiliary_loss_mlp": 0.01030176, - "balance_loss_clip": 1.03593767, - "balance_loss_mlp": 1.01922047, - "epoch": 0.9535547873139937, - "flos": 16654292340480.0, - "grad_norm": 2.076157934676356, - "language_loss": 0.81695747, - "learning_rate": 2.2557242649130504e-08, - "loss": 0.8383013, - "num_input_tokens_seen": 342075025, - "step": 15860, - "time_per_iteration": 2.5248684883117676 - }, - { - "auxiliary_loss_clip": 0.01057722, - "auxiliary_loss_mlp": 0.0076914, - "balance_loss_clip": 1.03329587, - "balance_loss_mlp": 1.00022173, - "epoch": 0.9536149105666617, - "flos": 20667776446080.0, - "grad_norm": 2.0339237108527195, - "language_loss": 0.66784334, - "learning_rate": 2.249895178891159e-08, - "loss": 0.68611193, - "num_input_tokens_seen": 342094595, - "step": 15861, - "time_per_iteration": 2.764711856842041 - }, - { - "auxiliary_loss_clip": 0.01097732, - "auxiliary_loss_mlp": 0.01036637, - "balance_loss_clip": 1.03534853, - "balance_loss_mlp": 1.02341676, - "epoch": 0.9536750338193296, - "flos": 30700499086080.0, - "grad_norm": 1.722759616430161, - "language_loss": 0.65783358, - "learning_rate": 2.244073591573037e-08, - "loss": 0.67917728, - "num_input_tokens_seen": 342115970, - "step": 15862, - "time_per_iteration": 2.8370909690856934 - }, - { - "auxiliary_loss_clip": 0.01067937, - "auxiliary_loss_mlp": 0.01033375, - "balance_loss_clip": 1.03313565, - "balance_loss_mlp": 1.02116823, - "epoch": 0.9537351570719976, - "flos": 20405484357120.0, - "grad_norm": 1.5180821577389316, - "language_loss": 0.67942423, - "learning_rate": 2.238259503179485e-08, - "loss": 0.70043731, - "num_input_tokens_seen": 342134080, - "step": 15863, - "time_per_iteration": 2.85260272026062 - }, - { - "auxiliary_loss_clip": 0.01087422, - "auxiliary_loss_mlp": 0.01028951, - "balance_loss_clip": 1.03365183, - "balance_loss_mlp": 1.01648188, - "epoch": 0.9537952803246655, - "flos": 29929245235200.0, - "grad_norm": 2.093402061794127, - "language_loss": 0.78434008, - "learning_rate": 2.2324529139309267e-08, - "loss": 0.80550379, - "num_input_tokens_seen": 342154725, - "step": 15864, - "time_per_iteration": 2.7751903533935547 - }, - { - "auxiliary_loss_clip": 0.01077785, - "auxiliary_loss_mlp": 0.01026692, - "balance_loss_clip": 1.03687298, - "balance_loss_mlp": 1.01424658, - "epoch": 0.9538554035773336, - "flos": 20521404524160.0, - "grad_norm": 2.5427902857740463, - "language_loss": 0.60073441, - "learning_rate": 2.226653824047586e-08, - "loss": 0.6217792, - "num_input_tokens_seen": 342172275, - "step": 15865, - "time_per_iteration": 2.668893337249756 - }, - { - "auxiliary_loss_clip": 0.01066094, - "auxiliary_loss_mlp": 0.01038068, - "balance_loss_clip": 1.03419495, - "balance_loss_mlp": 1.02391815, - "epoch": 0.9539155268300015, - "flos": 18406517598720.0, - "grad_norm": 1.825358390609407, - "language_loss": 0.70074368, - "learning_rate": 2.2208622337493765e-08, - "loss": 0.72178537, - "num_input_tokens_seen": 342190880, - "step": 15866, - "time_per_iteration": 2.6656248569488525 - }, - { - "auxiliary_loss_clip": 0.01083648, - "auxiliary_loss_mlp": 0.0103856, - "balance_loss_clip": 1.03381348, - "balance_loss_mlp": 1.02469635, - "epoch": 0.9539756500826695, - "flos": 26213281482240.0, - "grad_norm": 7.66097760902825, - "language_loss": 0.84885997, - "learning_rate": 2.215078143255855e-08, - "loss": 0.87008202, - "num_input_tokens_seen": 342208165, - "step": 15867, - "time_per_iteration": 2.7268359661102295 - }, - { - "auxiliary_loss_clip": 0.01016664, - "auxiliary_loss_mlp": 0.0100223, - "balance_loss_clip": 1.0065484, - "balance_loss_mlp": 1.00118721, - "epoch": 0.9540357733353374, - "flos": 68289097766400.0, - "grad_norm": 0.8394455572132413, - "language_loss": 0.61809933, - "learning_rate": 2.2093015527864024e-08, - "loss": 0.63828826, - "num_input_tokens_seen": 342277110, - "step": 15868, - "time_per_iteration": 3.1767897605895996 - }, - { - "auxiliary_loss_clip": 0.01070741, - "auxiliary_loss_mlp": 0.0102867, - "balance_loss_clip": 1.03546822, - "balance_loss_mlp": 1.01636136, - "epoch": 0.9540958965880054, - "flos": 21288276915840.0, - "grad_norm": 1.9166883744985537, - "language_loss": 0.60024238, - "learning_rate": 2.2035324625600425e-08, - "loss": 0.62123656, - "num_input_tokens_seen": 342294695, - "step": 15869, - "time_per_iteration": 2.825597047805786 - }, - { - "auxiliary_loss_clip": 0.01069204, - "auxiliary_loss_mlp": 0.00772179, - "balance_loss_clip": 1.032269, - "balance_loss_mlp": 1.00027442, - "epoch": 0.9541560198406733, - "flos": 19751407649280.0, - "grad_norm": 1.8610543982135193, - "language_loss": 0.71071583, - "learning_rate": 2.197770872795579e-08, - "loss": 0.72912961, - "num_input_tokens_seen": 342314970, - "step": 15870, - "time_per_iteration": 2.7531421184539795 - }, - { - "auxiliary_loss_clip": 0.01067012, - "auxiliary_loss_mlp": 0.01028174, - "balance_loss_clip": 1.03300095, - "balance_loss_mlp": 1.01587772, - "epoch": 0.9542161430933414, - "flos": 24715626888960.0, - "grad_norm": 1.7781564124647944, - "language_loss": 0.76756346, - "learning_rate": 2.1920167837114368e-08, - "loss": 0.78851533, - "num_input_tokens_seen": 342334255, - "step": 15871, - "time_per_iteration": 2.724163770675659 - }, - { - "auxiliary_loss_clip": 0.01096753, - "auxiliary_loss_mlp": 0.01035234, - "balance_loss_clip": 1.0351069, - "balance_loss_mlp": 1.02213871, - "epoch": 0.9542762663460094, - "flos": 31065818359680.0, - "grad_norm": 1.9388679259393415, - "language_loss": 0.58526534, - "learning_rate": 2.1862701955258634e-08, - "loss": 0.60658514, - "num_input_tokens_seen": 342354730, - "step": 15872, - "time_per_iteration": 2.7208635807037354 - }, - { - "auxiliary_loss_clip": 0.01085098, - "auxiliary_loss_mlp": 0.01034184, - "balance_loss_clip": 1.03341579, - "balance_loss_mlp": 1.02033806, - "epoch": 0.9543363895986773, - "flos": 20776729374720.0, - "grad_norm": 2.397894266058994, - "language_loss": 0.74827802, - "learning_rate": 2.1805311084567514e-08, - "loss": 0.76947081, - "num_input_tokens_seen": 342374565, - "step": 15873, - "time_per_iteration": 4.379558086395264 - }, - { - "auxiliary_loss_clip": 0.01111454, - "auxiliary_loss_mlp": 0.01032817, - "balance_loss_clip": 1.0387311, - "balance_loss_mlp": 1.01963258, - "epoch": 0.9543965128513453, - "flos": 24462744163200.0, - "grad_norm": 1.9355802772017296, - "language_loss": 0.62851435, - "learning_rate": 2.1747995227217265e-08, - "loss": 0.64995706, - "num_input_tokens_seen": 342394590, - "step": 15874, - "time_per_iteration": 2.5884764194488525 - }, - { - "auxiliary_loss_clip": 0.01084158, - "auxiliary_loss_mlp": 0.01034983, - "balance_loss_clip": 1.03476036, - "balance_loss_mlp": 1.02254963, - "epoch": 0.9544566361040132, - "flos": 15261532439040.0, - "grad_norm": 1.953622113172995, - "language_loss": 0.89690936, - "learning_rate": 2.169075438538104e-08, - "loss": 0.91810071, - "num_input_tokens_seen": 342410445, - "step": 15875, - "time_per_iteration": 4.317510604858398 - }, - { - "auxiliary_loss_clip": 0.01112734, - "auxiliary_loss_mlp": 0.01033431, - "balance_loss_clip": 1.03820455, - "balance_loss_mlp": 1.02027059, - "epoch": 0.9545167593566812, - "flos": 25918777872000.0, - "grad_norm": 1.5906794520251055, - "language_loss": 0.67873561, - "learning_rate": 2.1633588561229765e-08, - "loss": 0.70019734, - "num_input_tokens_seen": 342430970, - "step": 15876, - "time_per_iteration": 2.599390983581543 - }, - { - "auxiliary_loss_clip": 0.01097415, - "auxiliary_loss_mlp": 0.01036316, - "balance_loss_clip": 1.03623271, - "balance_loss_mlp": 1.02300572, - "epoch": 0.9545768826093491, - "flos": 25628188844160.0, - "grad_norm": 1.8099744313437123, - "language_loss": 0.69018167, - "learning_rate": 2.1576497756931267e-08, - "loss": 0.711519, - "num_input_tokens_seen": 342449505, - "step": 15877, - "time_per_iteration": 2.621135711669922 - }, - { - "auxiliary_loss_clip": 0.01068154, - "auxiliary_loss_mlp": 0.01036314, - "balance_loss_clip": 1.03443968, - "balance_loss_mlp": 1.02281344, - "epoch": 0.9546370058620172, - "flos": 22491499726080.0, - "grad_norm": 1.864175160647524, - "language_loss": 0.71021724, - "learning_rate": 2.1519481974650035e-08, - "loss": 0.73126197, - "num_input_tokens_seen": 342470390, - "step": 15878, - "time_per_iteration": 4.388718843460083 - }, - { - "auxiliary_loss_clip": 0.01104891, - "auxiliary_loss_mlp": 0.01030745, - "balance_loss_clip": 1.03498161, - "balance_loss_mlp": 1.01849008, - "epoch": 0.9546971291146851, - "flos": 24609582961920.0, - "grad_norm": 1.3169717238469103, - "language_loss": 0.67999732, - "learning_rate": 2.1462541216548335e-08, - "loss": 0.70135367, - "num_input_tokens_seen": 342492560, - "step": 15879, - "time_per_iteration": 2.634164571762085 - }, - { - "auxiliary_loss_clip": 0.01071861, - "auxiliary_loss_mlp": 0.0076975, - "balance_loss_clip": 1.03325868, - "balance_loss_mlp": 1.00017679, - "epoch": 0.9547572523673531, - "flos": 28657756627200.0, - "grad_norm": 1.892359769973216, - "language_loss": 0.84921825, - "learning_rate": 2.1405675484785334e-08, - "loss": 0.86763442, - "num_input_tokens_seen": 342512315, - "step": 15880, - "time_per_iteration": 2.7207343578338623 - }, - { - "auxiliary_loss_clip": 0.01043217, - "auxiliary_loss_mlp": 0.01034447, - "balance_loss_clip": 1.03212571, - "balance_loss_mlp": 1.02152514, - "epoch": 0.954817375620021, - "flos": 33802606385280.0, - "grad_norm": 1.832275853665566, - "language_loss": 0.7208662, - "learning_rate": 2.134888478151753e-08, - "loss": 0.74164283, - "num_input_tokens_seen": 342533060, - "step": 15881, - "time_per_iteration": 3.034219980239868 - }, - { - "auxiliary_loss_clip": 0.01097589, - "auxiliary_loss_mlp": 0.01035091, - "balance_loss_clip": 1.03802884, - "balance_loss_mlp": 1.02235389, - "epoch": 0.954877498872689, - "flos": 14428225843200.0, - "grad_norm": 1.8582437523117474, - "language_loss": 0.71399862, - "learning_rate": 2.1292169108898083e-08, - "loss": 0.73532546, - "num_input_tokens_seen": 342550830, - "step": 15882, - "time_per_iteration": 2.5682435035705566 - }, - { - "auxiliary_loss_clip": 0.0108781, - "auxiliary_loss_mlp": 0.0103464, - "balance_loss_clip": 1.0364579, - "balance_loss_mlp": 1.02317178, - "epoch": 0.9549376221253569, - "flos": 59269447336320.0, - "grad_norm": 1.5893740552045255, - "language_loss": 0.65766758, - "learning_rate": 2.1235528469078168e-08, - "loss": 0.67889214, - "num_input_tokens_seen": 342575070, - "step": 15883, - "time_per_iteration": 3.0329291820526123 - }, - { - "auxiliary_loss_clip": 0.01099334, - "auxiliary_loss_mlp": 0.0103149, - "balance_loss_clip": 1.03810024, - "balance_loss_mlp": 1.01847863, - "epoch": 0.954997745378025, - "flos": 17274397760640.0, - "grad_norm": 2.161620424639411, - "language_loss": 0.78009343, - "learning_rate": 2.1178962864205175e-08, - "loss": 0.80140173, - "num_input_tokens_seen": 342592215, - "step": 15884, - "time_per_iteration": 2.62176513671875 - }, - { - "auxiliary_loss_clip": 0.01109312, - "auxiliary_loss_mlp": 0.01029181, - "balance_loss_clip": 1.03558671, - "balance_loss_mlp": 1.01618683, - "epoch": 0.955057868630693, - "flos": 13006378903680.0, - "grad_norm": 1.803007960356649, - "language_loss": 0.77870518, - "learning_rate": 2.1122472296424054e-08, - "loss": 0.80009007, - "num_input_tokens_seen": 342610030, - "step": 15885, - "time_per_iteration": 2.5647974014282227 - }, - { - "auxiliary_loss_clip": 0.01108326, - "auxiliary_loss_mlp": 0.01033941, - "balance_loss_clip": 1.03567576, - "balance_loss_mlp": 1.02171004, - "epoch": 0.9551179918833609, - "flos": 22637692080000.0, - "grad_norm": 1.6846495820783678, - "language_loss": 0.69959128, - "learning_rate": 2.1066056767877317e-08, - "loss": 0.7210139, - "num_input_tokens_seen": 342626475, - "step": 15886, - "time_per_iteration": 2.6008517742156982 - }, - { - "auxiliary_loss_clip": 0.01074503, - "auxiliary_loss_mlp": 0.0103392, - "balance_loss_clip": 1.03510761, - "balance_loss_mlp": 1.02006149, - "epoch": 0.9551781151360289, - "flos": 21542811667200.0, - "grad_norm": 1.590980896681407, - "language_loss": 0.72832477, - "learning_rate": 2.1009716280703916e-08, - "loss": 0.74940896, - "num_input_tokens_seen": 342646645, - "step": 15887, - "time_per_iteration": 2.6831438541412354 - }, - { - "auxiliary_loss_clip": 0.01084236, - "auxiliary_loss_mlp": 0.01031385, - "balance_loss_clip": 1.03418398, - "balance_loss_mlp": 1.01973784, - "epoch": 0.9552382383886968, - "flos": 20702250524160.0, - "grad_norm": 1.933274372299018, - "language_loss": 0.5720163, - "learning_rate": 2.0953450837040364e-08, - "loss": 0.59317255, - "num_input_tokens_seen": 342663615, - "step": 15888, - "time_per_iteration": 2.630725860595703 - }, - { - "auxiliary_loss_clip": 0.01019029, - "auxiliary_loss_mlp": 0.01004801, - "balance_loss_clip": 1.00631261, - "balance_loss_mlp": 1.0038352, - "epoch": 0.9552983616413648, - "flos": 67769792887680.0, - "grad_norm": 0.7050649141864813, - "language_loss": 0.57804728, - "learning_rate": 2.0897260439020514e-08, - "loss": 0.59828568, - "num_input_tokens_seen": 342728275, - "step": 15889, - "time_per_iteration": 3.214216470718384 - }, - { - "auxiliary_loss_clip": 0.01108889, - "auxiliary_loss_mlp": 0.01030731, - "balance_loss_clip": 1.03501582, - "balance_loss_mlp": 1.01774263, - "epoch": 0.9553584848940327, - "flos": 21579979265280.0, - "grad_norm": 1.4933948635050138, - "language_loss": 0.6719625, - "learning_rate": 2.084114508877466e-08, - "loss": 0.69335872, - "num_input_tokens_seen": 342748860, - "step": 15890, - "time_per_iteration": 2.600853443145752 - }, - { - "auxiliary_loss_clip": 0.01108529, - "auxiliary_loss_mlp": 0.01026392, - "balance_loss_clip": 1.03781927, - "balance_loss_mlp": 1.01449537, - "epoch": 0.9554186081467008, - "flos": 24208173498240.0, - "grad_norm": 1.434726031550495, - "language_loss": 0.74308884, - "learning_rate": 2.0785104788430874e-08, - "loss": 0.76443803, - "num_input_tokens_seen": 342769705, - "step": 15891, - "time_per_iteration": 2.604349374771118 - }, - { - "auxiliary_loss_clip": 0.01069647, - "auxiliary_loss_mlp": 0.01028677, - "balance_loss_clip": 1.03273499, - "balance_loss_mlp": 1.01785886, - "epoch": 0.9554787313993687, - "flos": 16251554073600.0, - "grad_norm": 1.905721456172026, - "language_loss": 0.77943361, - "learning_rate": 2.072913954011435e-08, - "loss": 0.80041689, - "num_input_tokens_seen": 342787000, - "step": 15892, - "time_per_iteration": 2.6338727474212646 - }, - { - "auxiliary_loss_clip": 0.0110724, - "auxiliary_loss_mlp": 0.01029475, - "balance_loss_clip": 1.03596187, - "balance_loss_mlp": 1.01690435, - "epoch": 0.9555388546520367, - "flos": 23404133508480.0, - "grad_norm": 1.3333935754800896, - "language_loss": 0.6973961, - "learning_rate": 2.0673249345947386e-08, - "loss": 0.71876323, - "num_input_tokens_seen": 342807795, - "step": 15893, - "time_per_iteration": 2.64900803565979 - }, - { - "auxiliary_loss_clip": 0.01089703, - "auxiliary_loss_mlp": 0.00770181, - "balance_loss_clip": 1.03906655, - "balance_loss_mlp": 1.00022793, - "epoch": 0.9555989779047046, - "flos": 14794047907200.0, - "grad_norm": 1.8780898151887826, - "language_loss": 0.65497565, - "learning_rate": 2.0617434208048955e-08, - "loss": 0.67357445, - "num_input_tokens_seen": 342825490, - "step": 15894, - "time_per_iteration": 2.640239953994751 - }, - { - "auxiliary_loss_clip": 0.01098184, - "auxiliary_loss_mlp": 0.01032823, - "balance_loss_clip": 1.03628969, - "balance_loss_mlp": 1.01997232, - "epoch": 0.9556591011573726, - "flos": 22236749493120.0, - "grad_norm": 1.917235716935355, - "language_loss": 0.82155561, - "learning_rate": 2.056169412853581e-08, - "loss": 0.84286571, - "num_input_tokens_seen": 342844965, - "step": 15895, - "time_per_iteration": 2.605703592300415 - }, - { - "auxiliary_loss_clip": 0.01083186, - "auxiliary_loss_mlp": 0.01029801, - "balance_loss_clip": 1.0364809, - "balance_loss_mlp": 1.01701021, - "epoch": 0.9557192244100405, - "flos": 27855296835840.0, - "grad_norm": 1.5092096829888868, - "language_loss": 0.72777927, - "learning_rate": 2.0506029109521593e-08, - "loss": 0.74890918, - "num_input_tokens_seen": 342865915, - "step": 15896, - "time_per_iteration": 2.800420045852661 - }, - { - "auxiliary_loss_clip": 0.01105404, - "auxiliary_loss_mlp": 0.01031303, - "balance_loss_clip": 1.03542614, - "balance_loss_mlp": 1.01926875, - "epoch": 0.9557793476627086, - "flos": 17602800831360.0, - "grad_norm": 1.8673042529516892, - "language_loss": 0.79697645, - "learning_rate": 2.045043915311706e-08, - "loss": 0.81834352, - "num_input_tokens_seen": 342884000, - "step": 15897, - "time_per_iteration": 2.58010196685791 - }, - { - "auxiliary_loss_clip": 0.01081754, - "auxiliary_loss_mlp": 0.0103489, - "balance_loss_clip": 1.03217518, - "balance_loss_mlp": 1.02133036, - "epoch": 0.9558394709153766, - "flos": 23875496709120.0, - "grad_norm": 1.7392744855605553, - "language_loss": 0.7268827, - "learning_rate": 2.03949242614303e-08, - "loss": 0.74804914, - "num_input_tokens_seen": 342903095, - "step": 15898, - "time_per_iteration": 2.675769567489624 - }, - { - "auxiliary_loss_clip": 0.010026, - "auxiliary_loss_mlp": 0.01003805, - "balance_loss_clip": 1.0089612, - "balance_loss_mlp": 1.00289333, - "epoch": 0.9558995941680445, - "flos": 53682001171200.0, - "grad_norm": 0.8518438349506685, - "language_loss": 0.52328175, - "learning_rate": 2.033948443656652e-08, - "loss": 0.54334575, - "num_input_tokens_seen": 342958155, - "step": 15899, - "time_per_iteration": 3.1892175674438477 - }, - { - "auxiliary_loss_clip": 0.01101857, - "auxiliary_loss_mlp": 0.010327, - "balance_loss_clip": 1.03783405, - "balance_loss_mlp": 1.01899076, - "epoch": 0.9559597174207125, - "flos": 13764488376960.0, - "grad_norm": 2.0360903333402183, - "language_loss": 0.68228984, - "learning_rate": 2.028411968062782e-08, - "loss": 0.70363533, - "num_input_tokens_seen": 342972500, - "step": 15900, - "time_per_iteration": 2.5987586975097656 - }, - { - "auxiliary_loss_clip": 0.0109791, - "auxiliary_loss_mlp": 0.0077014, - "balance_loss_clip": 1.03479004, - "balance_loss_mlp": 1.00019574, - "epoch": 0.9560198406733804, - "flos": 19936347799680.0, - "grad_norm": 2.313544780745396, - "language_loss": 0.83186281, - "learning_rate": 2.0228829995713627e-08, - "loss": 0.85054326, - "num_input_tokens_seen": 342989035, - "step": 15901, - "time_per_iteration": 2.593118667602539 - }, - { - "auxiliary_loss_clip": 0.00997227, - "auxiliary_loss_mlp": 0.01005499, - "balance_loss_clip": 1.00669014, - "balance_loss_mlp": 1.00429535, - "epoch": 0.9560799639260484, - "flos": 57289550699520.0, - "grad_norm": 0.70780862037499, - "language_loss": 0.54323339, - "learning_rate": 2.0173615383920485e-08, - "loss": 0.56326067, - "num_input_tokens_seen": 343051675, - "step": 15902, - "time_per_iteration": 3.3085649013519287 - }, - { - "auxiliary_loss_clip": 0.01086623, - "auxiliary_loss_mlp": 0.01032307, - "balance_loss_clip": 1.03854203, - "balance_loss_mlp": 1.02167904, - "epoch": 0.9561400871787163, - "flos": 18917167299840.0, - "grad_norm": 1.7381730365709078, - "language_loss": 0.8538748, - "learning_rate": 2.01184758473425e-08, - "loss": 0.87506413, - "num_input_tokens_seen": 343068895, - "step": 15903, - "time_per_iteration": 2.6358137130737305 - }, - { - "auxiliary_loss_clip": 0.01082056, - "auxiliary_loss_mlp": 0.00772044, - "balance_loss_clip": 1.03525889, - "balance_loss_mlp": 1.00020206, - "epoch": 0.9562002104313844, - "flos": 18038576632320.0, - "grad_norm": 1.7984010377221487, - "language_loss": 0.80295885, - "learning_rate": 2.0063411388070217e-08, - "loss": 0.82149988, - "num_input_tokens_seen": 343087115, - "step": 15904, - "time_per_iteration": 2.7303686141967773 - }, - { - "auxiliary_loss_clip": 0.01098663, - "auxiliary_loss_mlp": 0.01031681, - "balance_loss_clip": 1.03495884, - "balance_loss_mlp": 1.01843715, - "epoch": 0.9562603336840523, - "flos": 24717673964160.0, - "grad_norm": 4.96352662326913, - "language_loss": 0.60007298, - "learning_rate": 2.0008422008191972e-08, - "loss": 0.6213764, - "num_input_tokens_seen": 343105575, - "step": 15905, - "time_per_iteration": 2.655217170715332 - }, - { - "auxiliary_loss_clip": 0.01096188, - "auxiliary_loss_mlp": 0.01028565, - "balance_loss_clip": 1.03525162, - "balance_loss_mlp": 1.01632822, - "epoch": 0.9563204569367203, - "flos": 21177205084800.0, - "grad_norm": 1.9515891856264378, - "language_loss": 0.70387208, - "learning_rate": 1.995350770979254e-08, - "loss": 0.72511959, - "num_input_tokens_seen": 343123025, - "step": 15906, - "time_per_iteration": 2.6145029067993164 - }, - { - "auxiliary_loss_clip": 0.01055579, - "auxiliary_loss_mlp": 0.01030633, - "balance_loss_clip": 1.03260493, - "balance_loss_mlp": 1.01775253, - "epoch": 0.9563805801893882, - "flos": 20229738088320.0, - "grad_norm": 1.7038523385332285, - "language_loss": 0.70973694, - "learning_rate": 1.9898668494954473e-08, - "loss": 0.73059911, - "num_input_tokens_seen": 343141625, - "step": 15907, - "time_per_iteration": 2.831192970275879 - }, - { - "auxiliary_loss_clip": 0.0106678, - "auxiliary_loss_mlp": 0.01031766, - "balance_loss_clip": 1.03346992, - "balance_loss_mlp": 1.01988053, - "epoch": 0.9564407034420562, - "flos": 25411001258880.0, - "grad_norm": 1.938524110619909, - "language_loss": 0.70548427, - "learning_rate": 1.9843904365757447e-08, - "loss": 0.72646976, - "num_input_tokens_seen": 343161300, - "step": 15908, - "time_per_iteration": 2.704686164855957 - }, - { - "auxiliary_loss_clip": 0.01085855, - "auxiliary_loss_mlp": 0.00770126, - "balance_loss_clip": 1.03650773, - "balance_loss_mlp": 1.00021219, - "epoch": 0.9565008266947241, - "flos": 18623884752000.0, - "grad_norm": 1.9260740881631984, - "language_loss": 0.83019876, - "learning_rate": 1.978921532427802e-08, - "loss": 0.84875852, - "num_input_tokens_seen": 343177815, - "step": 15909, - "time_per_iteration": 2.6200265884399414 - }, - { - "auxiliary_loss_clip": 0.01096482, - "auxiliary_loss_mlp": 0.01033356, - "balance_loss_clip": 1.03509748, - "balance_loss_mlp": 1.02116704, - "epoch": 0.9565609499473922, - "flos": 24862142465280.0, - "grad_norm": 2.1314572111323717, - "language_loss": 0.67602086, - "learning_rate": 1.9734601372590086e-08, - "loss": 0.69731927, - "num_input_tokens_seen": 343198140, - "step": 15910, - "time_per_iteration": 2.6983892917633057 - }, - { - "auxiliary_loss_clip": 0.01101245, - "auxiliary_loss_mlp": 0.01034074, - "balance_loss_clip": 1.03767276, - "balance_loss_mlp": 1.02156329, - "epoch": 0.9566210732000601, - "flos": 21798459740160.0, - "grad_norm": 1.6976880535824044, - "language_loss": 0.74343169, - "learning_rate": 1.968006251276444e-08, - "loss": 0.76478493, - "num_input_tokens_seen": 343218280, - "step": 15911, - "time_per_iteration": 2.6060009002685547 - }, - { - "auxiliary_loss_clip": 0.01096979, - "auxiliary_loss_mlp": 0.01030324, - "balance_loss_clip": 1.03550327, - "balance_loss_mlp": 1.01809359, - "epoch": 0.9566811964527281, - "flos": 18697609416960.0, - "grad_norm": 2.080677167597926, - "language_loss": 0.69605064, - "learning_rate": 1.9625598746869198e-08, - "loss": 0.71732366, - "num_input_tokens_seen": 343236850, - "step": 15912, - "time_per_iteration": 4.122835874557495 - }, - { - "auxiliary_loss_clip": 0.01086077, - "auxiliary_loss_mlp": 0.01036299, - "balance_loss_clip": 1.03359342, - "balance_loss_mlp": 1.02379942, - "epoch": 0.9567413197053961, - "flos": 13000632727680.0, - "grad_norm": 2.5288406213063466, - "language_loss": 0.72268087, - "learning_rate": 1.95712100769696e-08, - "loss": 0.74390459, - "num_input_tokens_seen": 343253065, - "step": 15913, - "time_per_iteration": 4.12858247756958 - }, - { - "auxiliary_loss_clip": 0.01026666, - "auxiliary_loss_mlp": 0.01032498, - "balance_loss_clip": 1.03391707, - "balance_loss_mlp": 1.02069664, - "epoch": 0.956801442958064, - "flos": 19719267955200.0, - "grad_norm": 20.323205931222148, - "language_loss": 0.73863947, - "learning_rate": 1.9516896505128444e-08, - "loss": 0.75923109, - "num_input_tokens_seen": 343270330, - "step": 15914, - "time_per_iteration": 2.7809512615203857 - }, - { - "auxiliary_loss_clip": 0.01107977, - "auxiliary_loss_mlp": 0.01030147, - "balance_loss_clip": 1.0365274, - "balance_loss_mlp": 1.01769543, - "epoch": 0.956861566210732, - "flos": 18222834424320.0, - "grad_norm": 1.3806320366510194, - "language_loss": 0.67305696, - "learning_rate": 1.9462658033404965e-08, - "loss": 0.69443822, - "num_input_tokens_seen": 343289625, - "step": 15915, - "time_per_iteration": 4.22941780090332 - }, - { - "auxiliary_loss_clip": 0.01092649, - "auxiliary_loss_mlp": 0.01028226, - "balance_loss_clip": 1.03482556, - "balance_loss_mlp": 1.01616824, - "epoch": 0.9569216894634, - "flos": 22196960202240.0, - "grad_norm": 1.7476602306443554, - "language_loss": 0.64463937, - "learning_rate": 1.9408494663855967e-08, - "loss": 0.66584814, - "num_input_tokens_seen": 343309200, - "step": 15916, - "time_per_iteration": 2.5847983360290527 - }, - { - "auxiliary_loss_clip": 0.0110232, - "auxiliary_loss_mlp": 0.01028805, - "balance_loss_clip": 1.03600883, - "balance_loss_mlp": 1.01722336, - "epoch": 0.956981812716068, - "flos": 21689291329920.0, - "grad_norm": 1.8359549722537702, - "language_loss": 0.80332065, - "learning_rate": 1.935440639853536e-08, - "loss": 0.82463187, - "num_input_tokens_seen": 343326270, - "step": 15917, - "time_per_iteration": 2.5821292400360107 - }, - { - "auxiliary_loss_clip": 0.01077457, - "auxiliary_loss_mlp": 0.01034662, - "balance_loss_clip": 1.0340178, - "balance_loss_mlp": 1.02204955, - "epoch": 0.9570419359687359, - "flos": 13990905757440.0, - "grad_norm": 1.923592863089018, - "language_loss": 0.73075807, - "learning_rate": 1.9300393239494172e-08, - "loss": 0.75187922, - "num_input_tokens_seen": 343344430, - "step": 15918, - "time_per_iteration": 4.2131946086883545 - }, - { - "auxiliary_loss_clip": 0.01002537, - "auxiliary_loss_mlp": 0.0100177, - "balance_loss_clip": 1.0084734, - "balance_loss_mlp": 1.00083399, - "epoch": 0.9571020592214039, - "flos": 65196938534400.0, - "grad_norm": 0.6358192020761055, - "language_loss": 0.53063756, - "learning_rate": 1.924645518878032e-08, - "loss": 0.5506807, - "num_input_tokens_seen": 343416155, - "step": 15919, - "time_per_iteration": 3.3149333000183105 - }, - { - "auxiliary_loss_clip": 0.01106277, - "auxiliary_loss_mlp": 0.01037331, - "balance_loss_clip": 1.04041994, - "balance_loss_mlp": 1.02374721, - "epoch": 0.9571621824740718, - "flos": 17384068961280.0, - "grad_norm": 50.750331888616735, - "language_loss": 0.74972582, - "learning_rate": 1.919259224843972e-08, - "loss": 0.77116191, - "num_input_tokens_seen": 343431715, - "step": 15920, - "time_per_iteration": 2.6216814517974854 - }, - { - "auxiliary_loss_clip": 0.01074302, - "auxiliary_loss_mlp": 0.01033346, - "balance_loss_clip": 1.03634095, - "balance_loss_mlp": 1.02012527, - "epoch": 0.9572223057267398, - "flos": 14538184352640.0, - "grad_norm": 1.6187560061674033, - "language_loss": 0.7876358, - "learning_rate": 1.9138804420514298e-08, - "loss": 0.80871224, - "num_input_tokens_seen": 343450425, - "step": 15921, - "time_per_iteration": 2.6776888370513916 - }, - { - "auxiliary_loss_clip": 0.01102004, - "auxiliary_loss_mlp": 0.01032194, - "balance_loss_clip": 1.03531837, - "balance_loss_mlp": 1.01865196, - "epoch": 0.9572824289794077, - "flos": 33947793158400.0, - "grad_norm": 2.2537365266474776, - "language_loss": 0.51078975, - "learning_rate": 1.9085091707044197e-08, - "loss": 0.53213173, - "num_input_tokens_seen": 343470445, - "step": 15922, - "time_per_iteration": 2.7087700366973877 - }, - { - "auxiliary_loss_clip": 0.01055425, - "auxiliary_loss_mlp": 0.01043646, - "balance_loss_clip": 1.0309701, - "balance_loss_mlp": 1.02935278, - "epoch": 0.9573425522320758, - "flos": 18694915896960.0, - "grad_norm": 1.958282285271952, - "language_loss": 0.84238583, - "learning_rate": 1.903145411006557e-08, - "loss": 0.86337662, - "num_input_tokens_seen": 343485200, - "step": 15923, - "time_per_iteration": 2.6815152168273926 - }, - { - "auxiliary_loss_clip": 0.010812, - "auxiliary_loss_mlp": 0.01036534, - "balance_loss_clip": 1.03293872, - "balance_loss_mlp": 1.02475667, - "epoch": 0.9574026754847437, - "flos": 28510307297280.0, - "grad_norm": 1.538843441694693, - "language_loss": 0.75172049, - "learning_rate": 1.8977891631613008e-08, - "loss": 0.77289784, - "num_input_tokens_seen": 343505080, - "step": 15924, - "time_per_iteration": 2.7213785648345947 - }, - { - "auxiliary_loss_clip": 0.01087824, - "auxiliary_loss_mlp": 0.01031969, - "balance_loss_clip": 1.03699768, - "balance_loss_mlp": 1.01958311, - "epoch": 0.9574627987374117, - "flos": 24352390604160.0, - "grad_norm": 2.227622693008034, - "language_loss": 0.86090326, - "learning_rate": 1.892440427371711e-08, - "loss": 0.88210118, - "num_input_tokens_seen": 343523995, - "step": 15925, - "time_per_iteration": 2.8542959690093994 - }, - { - "auxiliary_loss_clip": 0.01079041, - "auxiliary_loss_mlp": 0.01034192, - "balance_loss_clip": 1.03561556, - "balance_loss_mlp": 1.02103734, - "epoch": 0.9575229219900797, - "flos": 23510680225920.0, - "grad_norm": 2.011468601980382, - "language_loss": 0.75676179, - "learning_rate": 1.8870992038406474e-08, - "loss": 0.77789414, - "num_input_tokens_seen": 343542015, - "step": 15926, - "time_per_iteration": 2.7330782413482666 - }, - { - "auxiliary_loss_clip": 0.0108326, - "auxiliary_loss_mlp": 0.01031405, - "balance_loss_clip": 1.03742075, - "balance_loss_mlp": 1.0197587, - "epoch": 0.9575830452427476, - "flos": 22674823764480.0, - "grad_norm": 2.0888079382528333, - "language_loss": 0.77707171, - "learning_rate": 1.8817654927706373e-08, - "loss": 0.79821837, - "num_input_tokens_seen": 343561680, - "step": 15927, - "time_per_iteration": 2.704115390777588 - }, - { - "auxiliary_loss_clip": 0.01063185, - "auxiliary_loss_mlp": 0.01031462, - "balance_loss_clip": 1.03502405, - "balance_loss_mlp": 1.0171268, - "epoch": 0.9576431684954156, - "flos": 30485250835200.0, - "grad_norm": 1.8055478063953943, - "language_loss": 0.68572605, - "learning_rate": 1.8764392943639183e-08, - "loss": 0.70667255, - "num_input_tokens_seen": 343585290, - "step": 15928, - "time_per_iteration": 2.8810582160949707 - }, - { - "auxiliary_loss_clip": 0.01089186, - "auxiliary_loss_mlp": 0.01031202, - "balance_loss_clip": 1.03828859, - "balance_loss_mlp": 1.018381, - "epoch": 0.9577032917480836, - "flos": 21687387909120.0, - "grad_norm": 1.5717055472294992, - "language_loss": 0.822155, - "learning_rate": 1.871120608822485e-08, - "loss": 0.84335887, - "num_input_tokens_seen": 343604045, - "step": 15929, - "time_per_iteration": 2.6657960414886475 - }, - { - "auxiliary_loss_clip": 0.01077088, - "auxiliary_loss_mlp": 0.01046119, - "balance_loss_clip": 1.03563619, - "balance_loss_mlp": 1.03236794, - "epoch": 0.9577634150007516, - "flos": 29023147728000.0, - "grad_norm": 1.8215200487797032, - "language_loss": 0.72274351, - "learning_rate": 1.8658094363480202e-08, - "loss": 0.74397558, - "num_input_tokens_seen": 343626595, - "step": 15930, - "time_per_iteration": 2.795675277709961 - }, - { - "auxiliary_loss_clip": 0.0103609, - "auxiliary_loss_mlp": 0.01032785, - "balance_loss_clip": 1.03149176, - "balance_loss_mlp": 1.01960611, - "epoch": 0.9578235382534195, - "flos": 19282235178240.0, - "grad_norm": 1.4066251693487615, - "language_loss": 0.62494546, - "learning_rate": 1.8605057771419185e-08, - "loss": 0.64563417, - "num_input_tokens_seen": 343646195, - "step": 15931, - "time_per_iteration": 2.7418274879455566 - }, - { - "auxiliary_loss_clip": 0.01106716, - "auxiliary_loss_mlp": 0.01030993, - "balance_loss_clip": 1.03693795, - "balance_loss_mlp": 1.01945996, - "epoch": 0.9578836615060875, - "flos": 13699275235200.0, - "grad_norm": 2.1628834321357746, - "language_loss": 0.69288397, - "learning_rate": 1.8552096314052633e-08, - "loss": 0.71426117, - "num_input_tokens_seen": 343663665, - "step": 15932, - "time_per_iteration": 2.6367006301879883 - }, - { - "auxiliary_loss_clip": 0.01080267, - "auxiliary_loss_mlp": 0.01036892, - "balance_loss_clip": 1.0359807, - "balance_loss_mlp": 1.02269387, - "epoch": 0.9579437847587554, - "flos": 17054516655360.0, - "grad_norm": 1.883637531567824, - "language_loss": 0.75359249, - "learning_rate": 1.849920999338961e-08, - "loss": 0.77476406, - "num_input_tokens_seen": 343682145, - "step": 15933, - "time_per_iteration": 2.692196846008301 - }, - { - "auxiliary_loss_clip": 0.00998155, - "auxiliary_loss_mlp": 0.00999946, - "balance_loss_clip": 1.01311505, - "balance_loss_mlp": 0.99865836, - "epoch": 0.9580039080114234, - "flos": 60570887886720.0, - "grad_norm": 0.7032232478309851, - "language_loss": 0.57280135, - "learning_rate": 1.8446398811434948e-08, - "loss": 0.59278238, - "num_input_tokens_seen": 343744685, - "step": 15934, - "time_per_iteration": 3.389955997467041 - }, - { - "auxiliary_loss_clip": 0.01027383, - "auxiliary_loss_mlp": 0.00751072, - "balance_loss_clip": 1.00506508, - "balance_loss_mlp": 0.99959391, - "epoch": 0.9580640312640913, - "flos": 66235365745920.0, - "grad_norm": 0.9122482390110158, - "language_loss": 0.65885007, - "learning_rate": 1.8393662770191277e-08, - "loss": 0.67663455, - "num_input_tokens_seen": 343801835, - "step": 15935, - "time_per_iteration": 3.0524590015411377 - }, - { - "auxiliary_loss_clip": 0.01007227, - "auxiliary_loss_mlp": 0.01002986, - "balance_loss_clip": 1.00997615, - "balance_loss_mlp": 1.00185907, - "epoch": 0.9581241545167594, - "flos": 62218002971520.0, - "grad_norm": 0.7845197827637053, - "language_loss": 0.57026505, - "learning_rate": 1.8341001871658546e-08, - "loss": 0.5903672, - "num_input_tokens_seen": 343861515, - "step": 15936, - "time_per_iteration": 3.161888837814331 - }, - { - "auxiliary_loss_clip": 0.01048485, - "auxiliary_loss_mlp": 0.0103042, - "balance_loss_clip": 1.03310895, - "balance_loss_mlp": 1.01798666, - "epoch": 0.9581842777694273, - "flos": 23768088065280.0, - "grad_norm": 1.4930030330503186, - "language_loss": 0.78472948, - "learning_rate": 1.8288416117833825e-08, - "loss": 0.80551857, - "num_input_tokens_seen": 343881240, - "step": 15937, - "time_per_iteration": 2.777000665664673 - }, - { - "auxiliary_loss_clip": 0.01096104, - "auxiliary_loss_mlp": 0.01032818, - "balance_loss_clip": 1.03548694, - "balance_loss_mlp": 1.01956844, - "epoch": 0.9582444010220953, - "flos": 21213079793280.0, - "grad_norm": 1.6538903091453836, - "language_loss": 0.6840139, - "learning_rate": 1.8235905510710636e-08, - "loss": 0.70530319, - "num_input_tokens_seen": 343900885, - "step": 15938, - "time_per_iteration": 2.638640880584717 - }, - { - "auxiliary_loss_clip": 0.01076145, - "auxiliary_loss_mlp": 0.01029859, - "balance_loss_clip": 1.0352782, - "balance_loss_mlp": 1.01772964, - "epoch": 0.9583045242747633, - "flos": 23805147922560.0, - "grad_norm": 3.037057978485483, - "language_loss": 0.6558556, - "learning_rate": 1.8183470052280712e-08, - "loss": 0.67691565, - "num_input_tokens_seen": 343918460, - "step": 15939, - "time_per_iteration": 2.8998749256134033 - }, - { - "auxiliary_loss_clip": 0.01079284, - "auxiliary_loss_mlp": 0.01037887, - "balance_loss_clip": 1.03366089, - "balance_loss_mlp": 1.02551913, - "epoch": 0.9583646475274312, - "flos": 24131468004480.0, - "grad_norm": 1.828143592735246, - "language_loss": 0.73795086, - "learning_rate": 1.8131109744532025e-08, - "loss": 0.75912249, - "num_input_tokens_seen": 343938030, - "step": 15940, - "time_per_iteration": 2.8199172019958496 - }, - { - "auxiliary_loss_clip": 0.01109084, - "auxiliary_loss_mlp": 0.01033619, - "balance_loss_clip": 1.03673601, - "balance_loss_mlp": 1.02029765, - "epoch": 0.9584247707800992, - "flos": 20886651970560.0, - "grad_norm": 2.719526095639371, - "language_loss": 0.72758561, - "learning_rate": 1.8078824589450535e-08, - "loss": 0.74901259, - "num_input_tokens_seen": 343956635, - "step": 15941, - "time_per_iteration": 2.580655097961426 - }, - { - "auxiliary_loss_clip": 0.01087013, - "auxiliary_loss_mlp": 0.01036207, - "balance_loss_clip": 1.03728151, - "balance_loss_mlp": 1.02411318, - "epoch": 0.9584848940327672, - "flos": 26067591918720.0, - "grad_norm": 2.197358569248491, - "language_loss": 0.7112202, - "learning_rate": 1.8026614589018442e-08, - "loss": 0.73245239, - "num_input_tokens_seen": 343976625, - "step": 15942, - "time_per_iteration": 2.6756019592285156 - }, - { - "auxiliary_loss_clip": 0.0110919, - "auxiliary_loss_mlp": 0.01033733, - "balance_loss_clip": 1.03630304, - "balance_loss_mlp": 1.02057219, - "epoch": 0.9585450172854352, - "flos": 34492988764800.0, - "grad_norm": 1.5723180530156076, - "language_loss": 0.72362167, - "learning_rate": 1.797447974521571e-08, - "loss": 0.74505079, - "num_input_tokens_seen": 343997790, - "step": 15943, - "time_per_iteration": 2.6411077976226807 - }, - { - "auxiliary_loss_clip": 0.0110037, - "auxiliary_loss_mlp": 0.01037182, - "balance_loss_clip": 1.03692496, - "balance_loss_mlp": 1.02418852, - "epoch": 0.9586051405381031, - "flos": 23110743219840.0, - "grad_norm": 2.3152366176868036, - "language_loss": 0.68444526, - "learning_rate": 1.792242006001965e-08, - "loss": 0.7058208, - "num_input_tokens_seen": 344016935, - "step": 15944, - "time_per_iteration": 2.608394145965576 - }, - { - "auxiliary_loss_clip": 0.0110797, - "auxiliary_loss_mlp": 0.01034536, - "balance_loss_clip": 1.03546226, - "balance_loss_mlp": 1.02167356, - "epoch": 0.9586652637907711, - "flos": 19603994232960.0, - "grad_norm": 1.6133021726163184, - "language_loss": 0.66145849, - "learning_rate": 1.7870435535403795e-08, - "loss": 0.6828835, - "num_input_tokens_seen": 344035590, - "step": 15945, - "time_per_iteration": 2.590332508087158 - }, - { - "auxiliary_loss_clip": 0.00971603, - "auxiliary_loss_mlp": 0.01001306, - "balance_loss_clip": 1.01690745, - "balance_loss_mlp": 1.00031078, - "epoch": 0.958725387043439, - "flos": 72073327317120.0, - "grad_norm": 0.7794478770145054, - "language_loss": 0.61829185, - "learning_rate": 1.7818526173339678e-08, - "loss": 0.63802093, - "num_input_tokens_seen": 344100845, - "step": 15946, - "time_per_iteration": 3.602818489074707 - }, - { - "auxiliary_loss_clip": 0.0110601, - "auxiliary_loss_mlp": 0.01029414, - "balance_loss_clip": 1.03621078, - "balance_loss_mlp": 1.01780248, - "epoch": 0.958785510296107, - "flos": 28911932242560.0, - "grad_norm": 2.157451201161463, - "language_loss": 0.7515372, - "learning_rate": 1.7766691975795723e-08, - "loss": 0.7728914, - "num_input_tokens_seen": 344121780, - "step": 15947, - "time_per_iteration": 2.7516565322875977 - }, - { - "auxiliary_loss_clip": 0.01080438, - "auxiliary_loss_mlp": 0.01027644, - "balance_loss_clip": 1.03239012, - "balance_loss_mlp": 1.01584816, - "epoch": 0.958845633548775, - "flos": 18477189607680.0, - "grad_norm": 2.209516368331818, - "language_loss": 0.69477844, - "learning_rate": 1.771493294473747e-08, - "loss": 0.71585929, - "num_input_tokens_seen": 344140150, - "step": 15948, - "time_per_iteration": 2.6244988441467285 - }, - { - "auxiliary_loss_clip": 0.01057363, - "auxiliary_loss_mlp": 0.01032491, - "balance_loss_clip": 1.03523755, - "balance_loss_mlp": 1.02053475, - "epoch": 0.958905756801443, - "flos": 24206916522240.0, - "grad_norm": 2.7387902232592944, - "language_loss": 0.78748626, - "learning_rate": 1.7663249082127574e-08, - "loss": 0.80838478, - "num_input_tokens_seen": 344158200, - "step": 15949, - "time_per_iteration": 2.7260756492614746 - }, - { - "auxiliary_loss_clip": 0.01111297, - "auxiliary_loss_mlp": 0.01033396, - "balance_loss_clip": 1.03865027, - "balance_loss_mlp": 1.0205152, - "epoch": 0.9589658800541109, - "flos": 25007939769600.0, - "grad_norm": 2.2665397116809634, - "language_loss": 0.68637884, - "learning_rate": 1.761164038992602e-08, - "loss": 0.70782578, - "num_input_tokens_seen": 344174720, - "step": 15950, - "time_per_iteration": 2.5775585174560547 - }, - { - "auxiliary_loss_clip": 0.01089548, - "auxiliary_loss_mlp": 0.0103189, - "balance_loss_clip": 1.03689277, - "balance_loss_mlp": 1.02061236, - "epoch": 0.9590260033067789, - "flos": 23514558894720.0, - "grad_norm": 1.7626457055742824, - "language_loss": 0.8612389, - "learning_rate": 1.7560106870089687e-08, - "loss": 0.88245326, - "num_input_tokens_seen": 344192580, - "step": 15951, - "time_per_iteration": 2.691873550415039 - }, - { - "auxiliary_loss_clip": 0.01085942, - "auxiliary_loss_mlp": 0.01037676, - "balance_loss_clip": 1.03608108, - "balance_loss_mlp": 1.02455699, - "epoch": 0.9590861265594469, - "flos": 25520349237120.0, - "grad_norm": 2.7020454087453434, - "language_loss": 0.79673147, - "learning_rate": 1.7508648524572568e-08, - "loss": 0.81796771, - "num_input_tokens_seen": 344210345, - "step": 15952, - "time_per_iteration": 5.9034318923950195 - }, - { - "auxiliary_loss_clip": 0.01098084, - "auxiliary_loss_mlp": 0.01032288, - "balance_loss_clip": 1.03800857, - "balance_loss_mlp": 1.01903152, - "epoch": 0.9591462498121148, - "flos": 21179323987200.0, - "grad_norm": 1.6067487690898035, - "language_loss": 0.69543386, - "learning_rate": 1.7457265355326434e-08, - "loss": 0.71673763, - "num_input_tokens_seen": 344229540, - "step": 15953, - "time_per_iteration": 2.684041976928711 - }, - { - "auxiliary_loss_clip": 0.0104366, - "auxiliary_loss_mlp": 0.01036069, - "balance_loss_clip": 1.03161478, - "balance_loss_mlp": 1.02161503, - "epoch": 0.9592063730647828, - "flos": 21723047136000.0, - "grad_norm": 2.5164695030249096, - "language_loss": 0.58295131, - "learning_rate": 1.7405957364299285e-08, - "loss": 0.60374862, - "num_input_tokens_seen": 344247830, - "step": 15954, - "time_per_iteration": 4.495413064956665 - }, - { - "auxiliary_loss_clip": 0.01098901, - "auxiliary_loss_mlp": 0.01035688, - "balance_loss_clip": 1.03649294, - "balance_loss_mlp": 1.02196693, - "epoch": 0.9592664963174508, - "flos": 29891395278720.0, - "grad_norm": 2.3199338562306027, - "language_loss": 0.74007273, - "learning_rate": 1.7354724553437117e-08, - "loss": 0.76141858, - "num_input_tokens_seen": 344267760, - "step": 15955, - "time_per_iteration": 2.659421443939209 - }, - { - "auxiliary_loss_clip": 0.0108768, - "auxiliary_loss_mlp": 0.0103626, - "balance_loss_clip": 1.03656662, - "balance_loss_mlp": 1.02313542, - "epoch": 0.9593266195701188, - "flos": 17999613354240.0, - "grad_norm": 1.8573128231358735, - "language_loss": 0.62227011, - "learning_rate": 1.7303566924682378e-08, - "loss": 0.64350951, - "num_input_tokens_seen": 344284905, - "step": 15956, - "time_per_iteration": 2.6006531715393066 - }, - { - "auxiliary_loss_clip": 0.01071121, - "auxiliary_loss_mlp": 0.01030387, - "balance_loss_clip": 1.03647816, - "balance_loss_mlp": 1.01758385, - "epoch": 0.9593867428227867, - "flos": 18838271076480.0, - "grad_norm": 1.7774918774932997, - "language_loss": 0.59834391, - "learning_rate": 1.725248447997507e-08, - "loss": 0.61935902, - "num_input_tokens_seen": 344302025, - "step": 15957, - "time_per_iteration": 4.193215847015381 - }, - { - "auxiliary_loss_clip": 0.0107309, - "auxiliary_loss_mlp": 0.01038609, - "balance_loss_clip": 1.03605032, - "balance_loss_mlp": 1.02551365, - "epoch": 0.9594468660754547, - "flos": 29567050444800.0, - "grad_norm": 1.9567714575730066, - "language_loss": 0.74019581, - "learning_rate": 1.7201477221252314e-08, - "loss": 0.76131284, - "num_input_tokens_seen": 344321935, - "step": 15958, - "time_per_iteration": 2.7699391841888428 - }, - { - "auxiliary_loss_clip": 0.01084783, - "auxiliary_loss_mlp": 0.00770183, - "balance_loss_clip": 1.03334033, - "balance_loss_mlp": 1.00015187, - "epoch": 0.9595069893281226, - "flos": 20703256104960.0, - "grad_norm": 1.5869581385449567, - "language_loss": 0.74366057, - "learning_rate": 1.7150545150448116e-08, - "loss": 0.76221019, - "num_input_tokens_seen": 344340405, - "step": 15959, - "time_per_iteration": 2.6944100856781006 - }, - { - "auxiliary_loss_clip": 0.01095064, - "auxiliary_loss_mlp": 0.01030122, - "balance_loss_clip": 1.03618193, - "balance_loss_mlp": 1.01764679, - "epoch": 0.9595671125807906, - "flos": 22453613856000.0, - "grad_norm": 2.137039778819024, - "language_loss": 0.65102017, - "learning_rate": 1.7099688269493816e-08, - "loss": 0.67227197, - "num_input_tokens_seen": 344359925, - "step": 15960, - "time_per_iteration": 2.6418590545654297 - }, - { - "auxiliary_loss_clip": 0.01105547, - "auxiliary_loss_mlp": 0.01034627, - "balance_loss_clip": 1.03602636, - "balance_loss_mlp": 1.02172852, - "epoch": 0.9596272358334585, - "flos": 23915214172800.0, - "grad_norm": 1.6573536017589419, - "language_loss": 0.78154403, - "learning_rate": 1.7048906580318544e-08, - "loss": 0.80294573, - "num_input_tokens_seen": 344379100, - "step": 15961, - "time_per_iteration": 2.5798726081848145 - }, - { - "auxiliary_loss_clip": 0.01064092, - "auxiliary_loss_mlp": 0.01028572, - "balance_loss_clip": 1.03795755, - "balance_loss_mlp": 1.0165441, - "epoch": 0.9596873590861266, - "flos": 17672539086720.0, - "grad_norm": 1.9079571548212244, - "language_loss": 0.75957453, - "learning_rate": 1.699820008484698e-08, - "loss": 0.78050113, - "num_input_tokens_seen": 344396895, - "step": 15962, - "time_per_iteration": 2.6588690280914307 - }, - { - "auxiliary_loss_clip": 0.01089965, - "auxiliary_loss_mlp": 0.01033348, - "balance_loss_clip": 1.03721845, - "balance_loss_mlp": 1.02023554, - "epoch": 0.9597474823387945, - "flos": 25808532053760.0, - "grad_norm": 2.2256779220037965, - "language_loss": 0.71570283, - "learning_rate": 1.6947568785002698e-08, - "loss": 0.73693591, - "num_input_tokens_seen": 344415115, - "step": 15963, - "time_per_iteration": 2.6878271102905273 - }, - { - "auxiliary_loss_clip": 0.01079235, - "auxiliary_loss_mlp": 0.01033914, - "balance_loss_clip": 1.03706479, - "balance_loss_mlp": 1.02217138, - "epoch": 0.9598076055914625, - "flos": 23768519028480.0, - "grad_norm": 1.5220973192302523, - "language_loss": 0.74199623, - "learning_rate": 1.689701268270527e-08, - "loss": 0.76312768, - "num_input_tokens_seen": 344435185, - "step": 15964, - "time_per_iteration": 2.6809606552124023 - }, - { - "auxiliary_loss_clip": 0.00990624, - "auxiliary_loss_mlp": 0.01004877, - "balance_loss_clip": 1.00604916, - "balance_loss_mlp": 1.00392365, - "epoch": 0.9598677288441305, - "flos": 56515962464640.0, - "grad_norm": 0.8796440193210406, - "language_loss": 0.57517397, - "learning_rate": 1.684653177987161e-08, - "loss": 0.59512901, - "num_input_tokens_seen": 344488950, - "step": 15965, - "time_per_iteration": 3.202644109725952 - }, - { - "auxiliary_loss_clip": 0.0110834, - "auxiliary_loss_mlp": 0.01031239, - "balance_loss_clip": 1.03589642, - "balance_loss_mlp": 1.01969969, - "epoch": 0.9599278520967984, - "flos": 22997480659200.0, - "grad_norm": 1.6089991513926745, - "language_loss": 0.79091173, - "learning_rate": 1.6796126078416627e-08, - "loss": 0.81230754, - "num_input_tokens_seen": 344506740, - "step": 15966, - "time_per_iteration": 2.545722723007202 - }, - { - "auxiliary_loss_clip": 0.01080372, - "auxiliary_loss_mlp": 0.01031178, - "balance_loss_clip": 1.03162789, - "balance_loss_mlp": 1.01910806, - "epoch": 0.9599879753494664, - "flos": 23039676161280.0, - "grad_norm": 1.6133407382349438, - "language_loss": 0.79225981, - "learning_rate": 1.674579558025102e-08, - "loss": 0.81337535, - "num_input_tokens_seen": 344526670, - "step": 15967, - "time_per_iteration": 2.7037363052368164 - }, - { - "auxiliary_loss_clip": 0.01052446, - "auxiliary_loss_mlp": 0.01030727, - "balance_loss_clip": 1.03173876, - "balance_loss_mlp": 1.01654053, - "epoch": 0.9600480986021344, - "flos": 16392287560320.0, - "grad_norm": 4.094831568098365, - "language_loss": 0.80607283, - "learning_rate": 1.669554028728348e-08, - "loss": 0.82690465, - "num_input_tokens_seen": 344541995, - "step": 15968, - "time_per_iteration": 2.6827492713928223 - }, - { - "auxiliary_loss_clip": 0.01061685, - "auxiliary_loss_mlp": 0.01040969, - "balance_loss_clip": 1.0351243, - "balance_loss_mlp": 1.02595484, - "epoch": 0.9601082218548024, - "flos": 24276439296000.0, - "grad_norm": 2.307236682413655, - "language_loss": 0.6711151, - "learning_rate": 1.6645360201420044e-08, - "loss": 0.69214165, - "num_input_tokens_seen": 344559980, - "step": 15969, - "time_per_iteration": 2.709578037261963 - }, - { - "auxiliary_loss_clip": 0.01097154, - "auxiliary_loss_mlp": 0.01041579, - "balance_loss_clip": 1.03613186, - "balance_loss_mlp": 1.02947915, - "epoch": 0.9601683451074703, - "flos": 19609991804160.0, - "grad_norm": 4.544697653030829, - "language_loss": 0.79086411, - "learning_rate": 1.6595255324563186e-08, - "loss": 0.81225151, - "num_input_tokens_seen": 344577765, - "step": 15970, - "time_per_iteration": 2.7411954402923584 - }, - { - "auxiliary_loss_clip": 0.01094881, - "auxiliary_loss_mlp": 0.01030457, - "balance_loss_clip": 1.03728533, - "balance_loss_mlp": 1.01805329, - "epoch": 0.9602284683601383, - "flos": 26651104358400.0, - "grad_norm": 1.536259054605733, - "language_loss": 0.7747072, - "learning_rate": 1.654522565861316e-08, - "loss": 0.79596055, - "num_input_tokens_seen": 344597650, - "step": 15971, - "time_per_iteration": 2.7451272010803223 - }, - { - "auxiliary_loss_clip": 0.01091946, - "auxiliary_loss_mlp": 0.01027817, - "balance_loss_clip": 1.0364567, - "balance_loss_mlp": 1.01459122, - "epoch": 0.9602885916128062, - "flos": 15554096714880.0, - "grad_norm": 1.7996222753948563, - "language_loss": 0.67346907, - "learning_rate": 1.64952712054669e-08, - "loss": 0.69466674, - "num_input_tokens_seen": 344613580, - "step": 15972, - "time_per_iteration": 2.6623332500457764 - }, - { - "auxiliary_loss_clip": 0.01094982, - "auxiliary_loss_mlp": 0.00769511, - "balance_loss_clip": 1.03505695, - "balance_loss_mlp": 1.00020421, - "epoch": 0.9603487148654742, - "flos": 16502353810560.0, - "grad_norm": 2.1539804803600555, - "language_loss": 0.76114738, - "learning_rate": 1.644539196701844e-08, - "loss": 0.77979231, - "num_input_tokens_seen": 344626910, - "step": 15973, - "time_per_iteration": 2.6319777965545654 - }, - { - "auxiliary_loss_clip": 0.01068013, - "auxiliary_loss_mlp": 0.01045452, - "balance_loss_clip": 1.03971171, - "balance_loss_mlp": 1.03173113, - "epoch": 0.9604088381181421, - "flos": 20845354308480.0, - "grad_norm": 1.5935467286542793, - "language_loss": 0.68907356, - "learning_rate": 1.639558794515983e-08, - "loss": 0.71020818, - "num_input_tokens_seen": 344644330, - "step": 15974, - "time_per_iteration": 2.722294569015503 - }, - { - "auxiliary_loss_clip": 0.01097463, - "auxiliary_loss_mlp": 0.01029415, - "balance_loss_clip": 1.03470838, - "balance_loss_mlp": 1.01666546, - "epoch": 0.9604689613708102, - "flos": 19683105937920.0, - "grad_norm": 1.5822354896144846, - "language_loss": 0.67808646, - "learning_rate": 1.6345859141779105e-08, - "loss": 0.69935524, - "num_input_tokens_seen": 344663910, - "step": 15975, - "time_per_iteration": 2.5872485637664795 - }, - { - "auxiliary_loss_clip": 0.01105768, - "auxiliary_loss_mlp": 0.0102977, - "balance_loss_clip": 1.03735399, - "balance_loss_mlp": 1.01801634, - "epoch": 0.9605290846234781, - "flos": 24097568544000.0, - "grad_norm": 2.0352421643496554, - "language_loss": 0.55362296, - "learning_rate": 1.6296205558762322e-08, - "loss": 0.57497835, - "num_input_tokens_seen": 344682320, - "step": 15976, - "time_per_iteration": 2.5711615085601807 - }, - { - "auxiliary_loss_clip": 0.01079409, - "auxiliary_loss_mlp": 0.0102813, - "balance_loss_clip": 1.03170323, - "balance_loss_mlp": 1.01589346, - "epoch": 0.9605892078761461, - "flos": 27122575299840.0, - "grad_norm": 1.7385849927083583, - "language_loss": 0.68164247, - "learning_rate": 1.624662719799219e-08, - "loss": 0.7027179, - "num_input_tokens_seen": 344701355, - "step": 15977, - "time_per_iteration": 2.671110153198242 - }, - { - "auxiliary_loss_clip": 0.01096711, - "auxiliary_loss_mlp": 0.01039725, - "balance_loss_clip": 1.0339942, - "balance_loss_mlp": 1.02705932, - "epoch": 0.9606493311288141, - "flos": 14136918543360.0, - "grad_norm": 1.9552633904927965, - "language_loss": 0.81768823, - "learning_rate": 1.6197124061348766e-08, - "loss": 0.83905256, - "num_input_tokens_seen": 344717980, - "step": 15978, - "time_per_iteration": 2.555152177810669 - }, - { - "auxiliary_loss_clip": 0.01100379, - "auxiliary_loss_mlp": 0.010331, - "balance_loss_clip": 1.03526115, - "balance_loss_mlp": 1.02026129, - "epoch": 0.960709454381482, - "flos": 15813336147840.0, - "grad_norm": 2.5958973786310664, - "language_loss": 0.83387029, - "learning_rate": 1.614769615070921e-08, - "loss": 0.85520506, - "num_input_tokens_seen": 344733480, - "step": 15979, - "time_per_iteration": 2.5497281551361084 - }, - { - "auxiliary_loss_clip": 0.0110855, - "auxiliary_loss_mlp": 0.0103807, - "balance_loss_clip": 1.03590512, - "balance_loss_mlp": 1.02634561, - "epoch": 0.96076957763415, - "flos": 22565403959040.0, - "grad_norm": 1.5387344792405315, - "language_loss": 0.79981411, - "learning_rate": 1.6098343467947805e-08, - "loss": 0.82128036, - "num_input_tokens_seen": 344752130, - "step": 15980, - "time_per_iteration": 2.5905473232269287 - }, - { - "auxiliary_loss_clip": 0.0109877, - "auxiliary_loss_mlp": 0.01028852, - "balance_loss_clip": 1.03493583, - "balance_loss_mlp": 1.01669884, - "epoch": 0.960829700886818, - "flos": 24681260551680.0, - "grad_norm": 2.1133956076478664, - "language_loss": 0.68550336, - "learning_rate": 1.6049066014935942e-08, - "loss": 0.7067796, - "num_input_tokens_seen": 344771195, - "step": 15981, - "time_per_iteration": 2.612859010696411 - }, - { - "auxiliary_loss_clip": 0.01093593, - "auxiliary_loss_mlp": 0.00769185, - "balance_loss_clip": 1.03381348, - "balance_loss_mlp": 1.00022793, - "epoch": 0.960889824139486, - "flos": 26542223256960.0, - "grad_norm": 1.7427488082202907, - "language_loss": 0.69655585, - "learning_rate": 1.5999863793542344e-08, - "loss": 0.71518368, - "num_input_tokens_seen": 344793150, - "step": 15982, - "time_per_iteration": 2.5976712703704834 - }, - { - "auxiliary_loss_clip": 0.00999386, - "auxiliary_loss_mlp": 0.00999842, - "balance_loss_clip": 1.00873065, - "balance_loss_mlp": 0.99883503, - "epoch": 0.9609499473921539, - "flos": 71114942586240.0, - "grad_norm": 0.6662874466097782, - "language_loss": 0.53221011, - "learning_rate": 1.595073680563286e-08, - "loss": 0.5522024, - "num_input_tokens_seen": 344852855, - "step": 15983, - "time_per_iteration": 3.3874897956848145 - }, - { - "auxiliary_loss_clip": 0.01107834, - "auxiliary_loss_mlp": 0.01034719, - "balance_loss_clip": 1.03694439, - "balance_loss_mlp": 1.02233326, - "epoch": 0.9610100706448219, - "flos": 20552466810240.0, - "grad_norm": 2.121316600078336, - "language_loss": 0.67938662, - "learning_rate": 1.5901685053070212e-08, - "loss": 0.70081216, - "num_input_tokens_seen": 344869830, - "step": 15984, - "time_per_iteration": 2.5932650566101074 - }, - { - "auxiliary_loss_clip": 0.01074236, - "auxiliary_loss_mlp": 0.01033486, - "balance_loss_clip": 1.03595209, - "balance_loss_mlp": 1.02153563, - "epoch": 0.9610701938974898, - "flos": 14064199459200.0, - "grad_norm": 1.5705983940163633, - "language_loss": 0.67496943, - "learning_rate": 1.5852708537714477e-08, - "loss": 0.69604665, - "num_input_tokens_seen": 344888905, - "step": 15985, - "time_per_iteration": 2.726486921310425 - }, - { - "auxiliary_loss_clip": 0.01108849, - "auxiliary_loss_mlp": 0.01032566, - "balance_loss_clip": 1.03674269, - "balance_loss_mlp": 1.02043021, - "epoch": 0.9611303171501578, - "flos": 20229989483520.0, - "grad_norm": 2.8268896406333237, - "language_loss": 0.78626662, - "learning_rate": 1.580380726142283e-08, - "loss": 0.80768073, - "num_input_tokens_seen": 344907160, - "step": 15986, - "time_per_iteration": 2.585028886795044 - }, - { - "auxiliary_loss_clip": 0.01059902, - "auxiliary_loss_mlp": 0.01031304, - "balance_loss_clip": 1.03704977, - "balance_loss_mlp": 1.01792264, - "epoch": 0.9611904404028258, - "flos": 20951075013120.0, - "grad_norm": 3.983829786169989, - "language_loss": 0.64043385, - "learning_rate": 1.5754981226049792e-08, - "loss": 0.6613459, - "num_input_tokens_seen": 344922400, - "step": 15987, - "time_per_iteration": 2.6663405895233154 - }, - { - "auxiliary_loss_clip": 0.01105457, - "auxiliary_loss_mlp": 0.01030427, - "balance_loss_clip": 1.03672767, - "balance_loss_mlp": 1.01882839, - "epoch": 0.9612505636554938, - "flos": 24827740214400.0, - "grad_norm": 1.6472459038973077, - "language_loss": 0.66917932, - "learning_rate": 1.5706230433446544e-08, - "loss": 0.69053823, - "num_input_tokens_seen": 344941910, - "step": 15988, - "time_per_iteration": 2.6424877643585205 - }, - { - "auxiliary_loss_clip": 0.01096712, - "auxiliary_loss_mlp": 0.01043698, - "balance_loss_clip": 1.03608358, - "balance_loss_mlp": 1.03205132, - "epoch": 0.9613106869081617, - "flos": 17164977955200.0, - "grad_norm": 2.1053842108044876, - "language_loss": 0.74786007, - "learning_rate": 1.5657554885462055e-08, - "loss": 0.76926422, - "num_input_tokens_seen": 344960020, - "step": 15989, - "time_per_iteration": 2.5956602096557617 - }, - { - "auxiliary_loss_clip": 0.01009811, - "auxiliary_loss_mlp": 0.01009546, - "balance_loss_clip": 1.00601673, - "balance_loss_mlp": 1.00818145, - "epoch": 0.9613708101608297, - "flos": 61563818522880.0, - "grad_norm": 0.8478358014550572, - "language_loss": 0.63107759, - "learning_rate": 1.5608954583941737e-08, - "loss": 0.65127116, - "num_input_tokens_seen": 345018290, - "step": 15990, - "time_per_iteration": 3.152273178100586 - }, - { - "auxiliary_loss_clip": 0.01096035, - "auxiliary_loss_mlp": 0.01034605, - "balance_loss_clip": 1.03537118, - "balance_loss_mlp": 1.02268958, - "epoch": 0.9614309334134977, - "flos": 27417904922880.0, - "grad_norm": 1.9574002604644196, - "language_loss": 0.77676558, - "learning_rate": 1.5560429530729003e-08, - "loss": 0.79807198, - "num_input_tokens_seen": 345040235, - "step": 15991, - "time_per_iteration": 4.212686538696289 - }, - { - "auxiliary_loss_clip": 0.01114207, - "auxiliary_loss_mlp": 0.01034295, - "balance_loss_clip": 1.0376842, - "balance_loss_mlp": 1.02099133, - "epoch": 0.9614910566661656, - "flos": 22819148611200.0, - "grad_norm": 2.7719678193079247, - "language_loss": 0.84980291, - "learning_rate": 1.5511979727663493e-08, - "loss": 0.87128794, - "num_input_tokens_seen": 345054540, - "step": 15992, - "time_per_iteration": 2.5331294536590576 - }, - { - "auxiliary_loss_clip": 0.01084656, - "auxiliary_loss_mlp": 0.01030807, - "balance_loss_clip": 1.0333364, - "balance_loss_mlp": 1.01737833, - "epoch": 0.9615511799188337, - "flos": 20667812359680.0, - "grad_norm": 5.393855831063401, - "language_loss": 0.7277714, - "learning_rate": 1.5463605176582406e-08, - "loss": 0.74892598, - "num_input_tokens_seen": 345074035, - "step": 15993, - "time_per_iteration": 4.279495000839233 - }, - { - "auxiliary_loss_clip": 0.01071095, - "auxiliary_loss_mlp": 0.01033351, - "balance_loss_clip": 1.03617215, - "balance_loss_mlp": 1.02064323, - "epoch": 0.9616113031715016, - "flos": 33149212035840.0, - "grad_norm": 1.4893605821515894, - "language_loss": 0.68342292, - "learning_rate": 1.5415305879320716e-08, - "loss": 0.70446742, - "num_input_tokens_seen": 345099270, - "step": 15994, - "time_per_iteration": 2.7772884368896484 - }, - { - "auxiliary_loss_clip": 0.01072149, - "auxiliary_loss_mlp": 0.01034975, - "balance_loss_clip": 1.03700161, - "balance_loss_mlp": 1.02212477, - "epoch": 0.9616714264241696, - "flos": 25009807276800.0, - "grad_norm": 1.9283329127731719, - "language_loss": 0.84783322, - "learning_rate": 1.5367081837709183e-08, - "loss": 0.86890447, - "num_input_tokens_seen": 345116975, - "step": 15995, - "time_per_iteration": 2.7321677207946777 - }, - { - "auxiliary_loss_clip": 0.0110129, - "auxiliary_loss_mlp": 0.01035588, - "balance_loss_clip": 1.03743207, - "balance_loss_mlp": 1.02226043, - "epoch": 0.9617315496768375, - "flos": 13547480359680.0, - "grad_norm": 2.3727684194410865, - "language_loss": 0.75815755, - "learning_rate": 1.5318933053576788e-08, - "loss": 0.77952629, - "num_input_tokens_seen": 345133645, - "step": 15996, - "time_per_iteration": 2.5802130699157715 - }, - { - "auxiliary_loss_clip": 0.01082505, - "auxiliary_loss_mlp": 0.0103401, - "balance_loss_clip": 1.03420186, - "balance_loss_mlp": 1.02108765, - "epoch": 0.9617916729295055, - "flos": 11254512781440.0, - "grad_norm": 3.2502659425610156, - "language_loss": 0.76369971, - "learning_rate": 1.52708595287494e-08, - "loss": 0.7848649, - "num_input_tokens_seen": 345150740, - "step": 15997, - "time_per_iteration": 4.12961745262146 - }, - { - "auxiliary_loss_clip": 0.01103332, - "auxiliary_loss_mlp": 0.00769549, - "balance_loss_clip": 1.03523898, - "balance_loss_mlp": 1.0002147, - "epoch": 0.9618517961821734, - "flos": 22819723228800.0, - "grad_norm": 1.6933641489070883, - "language_loss": 0.67267382, - "learning_rate": 1.522286126505001e-08, - "loss": 0.69140267, - "num_input_tokens_seen": 345170365, - "step": 15998, - "time_per_iteration": 2.5632731914520264 - }, - { - "auxiliary_loss_clip": 0.01079044, - "auxiliary_loss_mlp": 0.01030668, - "balance_loss_clip": 1.03057599, - "balance_loss_mlp": 1.01695287, - "epoch": 0.9619119194348414, - "flos": 16617340224000.0, - "grad_norm": 1.6277881889337782, - "language_loss": 0.7250607, - "learning_rate": 1.5174938264298498e-08, - "loss": 0.74615777, - "num_input_tokens_seen": 345188930, - "step": 15999, - "time_per_iteration": 2.5826117992401123 - }, - { - "auxiliary_loss_clip": 0.01079594, - "auxiliary_loss_mlp": 0.01023964, - "balance_loss_clip": 1.03278232, - "balance_loss_mlp": 1.01237655, - "epoch": 0.9619720426875094, - "flos": 24535140024960.0, - "grad_norm": 1.9376682350372685, - "language_loss": 0.65341753, - "learning_rate": 1.5127090528312514e-08, - "loss": 0.67445314, - "num_input_tokens_seen": 345209615, - "step": 16000, - "time_per_iteration": 2.6649346351623535 - }, - { - "auxiliary_loss_clip": 0.01074444, - "auxiliary_loss_mlp": 0.01027888, - "balance_loss_clip": 1.034127, - "balance_loss_mlp": 1.0147984, - "epoch": 0.9620321659401774, - "flos": 20632224960000.0, - "grad_norm": 1.8942735733189127, - "language_loss": 0.75229144, - "learning_rate": 1.5079318058905723e-08, - "loss": 0.77331471, - "num_input_tokens_seen": 345229175, - "step": 16001, - "time_per_iteration": 2.690169095993042 - }, - { - "auxiliary_loss_clip": 0.01093786, - "auxiliary_loss_mlp": 0.0103193, - "balance_loss_clip": 1.03392005, - "balance_loss_mlp": 1.01907945, - "epoch": 0.9620922891928453, - "flos": 18515290959360.0, - "grad_norm": 1.9242649128413576, - "language_loss": 0.68372071, - "learning_rate": 1.5031620857890447e-08, - "loss": 0.70497787, - "num_input_tokens_seen": 345247815, - "step": 16002, - "time_per_iteration": 2.609285831451416 - }, - { - "auxiliary_loss_clip": 0.01096986, - "auxiliary_loss_mlp": 0.01032704, - "balance_loss_clip": 1.03659725, - "balance_loss_mlp": 1.0204016, - "epoch": 0.9621524124455133, - "flos": 28767391914240.0, - "grad_norm": 1.3322402005995133, - "language_loss": 0.64338034, - "learning_rate": 1.4983998927074804e-08, - "loss": 0.66467726, - "num_input_tokens_seen": 345269935, - "step": 16003, - "time_per_iteration": 2.64509654045105 - }, - { - "auxiliary_loss_clip": 0.01056283, - "auxiliary_loss_mlp": 0.0103757, - "balance_loss_clip": 1.03516269, - "balance_loss_mlp": 1.02617371, - "epoch": 0.9622125356981813, - "flos": 19098875226240.0, - "grad_norm": 1.8799726685356375, - "language_loss": 0.75980008, - "learning_rate": 1.493645226826512e-08, - "loss": 0.78073859, - "num_input_tokens_seen": 345288310, - "step": 16004, - "time_per_iteration": 2.746777057647705 - }, - { - "auxiliary_loss_clip": 0.01096501, - "auxiliary_loss_mlp": 0.01030981, - "balance_loss_clip": 1.03659928, - "balance_loss_mlp": 1.01776099, - "epoch": 0.9622726589508492, - "flos": 20302816308480.0, - "grad_norm": 1.8988665709450379, - "language_loss": 0.79441619, - "learning_rate": 1.4888980883263958e-08, - "loss": 0.81569099, - "num_input_tokens_seen": 345306615, - "step": 16005, - "time_per_iteration": 2.6173338890075684 - }, - { - "auxiliary_loss_clip": 0.01093237, - "auxiliary_loss_mlp": 0.01030093, - "balance_loss_clip": 1.0344584, - "balance_loss_mlp": 1.01876855, - "epoch": 0.9623327822035173, - "flos": 54929750889600.0, - "grad_norm": 30.35501867161595, - "language_loss": 0.67897928, - "learning_rate": 1.4841584773871652e-08, - "loss": 0.7002126, - "num_input_tokens_seen": 345331935, - "step": 16006, - "time_per_iteration": 2.912827730178833 - }, - { - "auxiliary_loss_clip": 0.0107661, - "auxiliary_loss_mlp": 0.01037957, - "balance_loss_clip": 1.03514838, - "balance_loss_mlp": 1.02623272, - "epoch": 0.9623929054561852, - "flos": 21759029585280.0, - "grad_norm": 1.8340702205023383, - "language_loss": 0.77994108, - "learning_rate": 1.479426394188521e-08, - "loss": 0.80108666, - "num_input_tokens_seen": 345351510, - "step": 16007, - "time_per_iteration": 2.6248257160186768 - }, - { - "auxiliary_loss_clip": 0.0111027, - "auxiliary_loss_mlp": 0.010323, - "balance_loss_clip": 1.03747129, - "balance_loss_mlp": 1.01994443, - "epoch": 0.9624530287088532, - "flos": 17931563038080.0, - "grad_norm": 2.1097968556783244, - "language_loss": 0.67964327, - "learning_rate": 1.4747018389099198e-08, - "loss": 0.701069, - "num_input_tokens_seen": 345367750, - "step": 16008, - "time_per_iteration": 2.537191867828369 - }, - { - "auxiliary_loss_clip": 0.01085992, - "auxiliary_loss_mlp": 0.01032638, - "balance_loss_clip": 1.03743672, - "balance_loss_mlp": 1.01911998, - "epoch": 0.9625131519615211, - "flos": 23253739263360.0, - "grad_norm": 2.1552022323644846, - "language_loss": 0.72934628, - "learning_rate": 1.469984811730529e-08, - "loss": 0.75053251, - "num_input_tokens_seen": 345384790, - "step": 16009, - "time_per_iteration": 2.6170432567596436 - }, - { - "auxiliary_loss_clip": 0.01094692, - "auxiliary_loss_mlp": 0.01032269, - "balance_loss_clip": 1.03521502, - "balance_loss_mlp": 1.02035999, - "epoch": 0.9625732752141891, - "flos": 18916628595840.0, - "grad_norm": 2.236210012080847, - "language_loss": 0.75740463, - "learning_rate": 1.4652753128292061e-08, - "loss": 0.77867424, - "num_input_tokens_seen": 345403390, - "step": 16010, - "time_per_iteration": 2.6094565391540527 - }, - { - "auxiliary_loss_clip": 0.0110126, - "auxiliary_loss_mlp": 0.01033383, - "balance_loss_clip": 1.03804505, - "balance_loss_mlp": 1.01812458, - "epoch": 0.962633398466857, - "flos": 16252918790400.0, - "grad_norm": 1.8499312675955801, - "language_loss": 0.69607782, - "learning_rate": 1.4605733423845635e-08, - "loss": 0.71742427, - "num_input_tokens_seen": 345418685, - "step": 16011, - "time_per_iteration": 2.5665814876556396 - }, - { - "auxiliary_loss_clip": 0.01096422, - "auxiliary_loss_mlp": 0.01034569, - "balance_loss_clip": 1.03724504, - "balance_loss_mlp": 1.02317858, - "epoch": 0.962693521719525, - "flos": 54197424403200.0, - "grad_norm": 2.090107239169434, - "language_loss": 0.68528754, - "learning_rate": 1.4558789005748585e-08, - "loss": 0.70659745, - "num_input_tokens_seen": 345442380, - "step": 16012, - "time_per_iteration": 2.8673768043518066 - }, - { - "auxiliary_loss_clip": 0.0109098, - "auxiliary_loss_mlp": 0.01034537, - "balance_loss_clip": 1.03467774, - "balance_loss_mlp": 1.02032721, - "epoch": 0.962753644972193, - "flos": 33105795471360.0, - "grad_norm": 1.7699818597155268, - "language_loss": 0.72427005, - "learning_rate": 1.4511919875781264e-08, - "loss": 0.74552524, - "num_input_tokens_seen": 345463815, - "step": 16013, - "time_per_iteration": 2.75661301612854 - }, - { - "auxiliary_loss_clip": 0.01075741, - "auxiliary_loss_mlp": 0.01033286, - "balance_loss_clip": 1.03560877, - "balance_loss_mlp": 1.02013755, - "epoch": 0.962813768224861, - "flos": 42230660837760.0, - "grad_norm": 2.2049191715413996, - "language_loss": 0.63640058, - "learning_rate": 1.4465126035720698e-08, - "loss": 0.65749085, - "num_input_tokens_seen": 345484525, - "step": 16014, - "time_per_iteration": 2.801541328430176 - }, - { - "auxiliary_loss_clip": 0.01084087, - "auxiliary_loss_mlp": 0.01031024, - "balance_loss_clip": 1.03718603, - "balance_loss_mlp": 1.02020597, - "epoch": 0.9628738914775289, - "flos": 43944677003520.0, - "grad_norm": 1.6444594137562585, - "language_loss": 0.71679461, - "learning_rate": 1.4418407487341688e-08, - "loss": 0.73794574, - "num_input_tokens_seen": 345508295, - "step": 16015, - "time_per_iteration": 2.8245065212249756 - }, - { - "auxiliary_loss_clip": 0.01070924, - "auxiliary_loss_mlp": 0.01031789, - "balance_loss_clip": 1.03087783, - "balance_loss_mlp": 1.01914668, - "epoch": 0.9629340147301969, - "flos": 15596184476160.0, - "grad_norm": 1.8324403843710784, - "language_loss": 0.77434921, - "learning_rate": 1.4371764232415707e-08, - "loss": 0.79537642, - "num_input_tokens_seen": 345525155, - "step": 16016, - "time_per_iteration": 2.7069830894470215 - }, - { - "auxiliary_loss_clip": 0.01027071, - "auxiliary_loss_mlp": 0.01000442, - "balance_loss_clip": 1.0047729, - "balance_loss_mlp": 0.99953043, - "epoch": 0.9629941379828649, - "flos": 62951011816320.0, - "grad_norm": 0.808956080436883, - "language_loss": 0.63018364, - "learning_rate": 1.4325196272711337e-08, - "loss": 0.65045875, - "num_input_tokens_seen": 345578905, - "step": 16017, - "time_per_iteration": 3.0989813804626465 - }, - { - "auxiliary_loss_clip": 0.01093389, - "auxiliary_loss_mlp": 0.0102702, - "balance_loss_clip": 1.03960085, - "balance_loss_mlp": 1.01511717, - "epoch": 0.9630542612355328, - "flos": 29899116702720.0, - "grad_norm": 1.8256798404845316, - "language_loss": 0.66259742, - "learning_rate": 1.4278703609994502e-08, - "loss": 0.68380153, - "num_input_tokens_seen": 345598965, - "step": 16018, - "time_per_iteration": 2.7493810653686523 - }, - { - "auxiliary_loss_clip": 0.01059951, - "auxiliary_loss_mlp": 0.01036182, - "balance_loss_clip": 1.03763199, - "balance_loss_mlp": 1.02381968, - "epoch": 0.9631143844882009, - "flos": 17894575008000.0, - "grad_norm": 1.944806621091631, - "language_loss": 0.79563761, - "learning_rate": 1.4232286246028457e-08, - "loss": 0.81659889, - "num_input_tokens_seen": 345617945, - "step": 16019, - "time_per_iteration": 2.6809628009796143 - }, - { - "auxiliary_loss_clip": 0.01070109, - "auxiliary_loss_mlp": 0.01029701, - "balance_loss_clip": 1.03343022, - "balance_loss_mlp": 1.01866817, - "epoch": 0.9631745077408688, - "flos": 26139161767680.0, - "grad_norm": 1.782612322393727, - "language_loss": 0.71960497, - "learning_rate": 1.4185944182572907e-08, - "loss": 0.74060309, - "num_input_tokens_seen": 345637920, - "step": 16020, - "time_per_iteration": 2.724942684173584 - }, - { - "auxiliary_loss_clip": 0.01084456, - "auxiliary_loss_mlp": 0.01026989, - "balance_loss_clip": 1.03556895, - "balance_loss_mlp": 1.01536036, - "epoch": 0.9632346309935368, - "flos": 24973645259520.0, - "grad_norm": 2.276745158926346, - "language_loss": 0.77092677, - "learning_rate": 1.4139677421385331e-08, - "loss": 0.79204124, - "num_input_tokens_seen": 345656195, - "step": 16021, - "time_per_iteration": 2.6800484657287598 - }, - { - "auxiliary_loss_clip": 0.01074317, - "auxiliary_loss_mlp": 0.01030211, - "balance_loss_clip": 1.03503883, - "balance_loss_mlp": 1.0156492, - "epoch": 0.9632947542462047, - "flos": 23617226943360.0, - "grad_norm": 2.2141346213360498, - "language_loss": 0.6477133, - "learning_rate": 1.4093485964220331e-08, - "loss": 0.66875851, - "num_input_tokens_seen": 345676700, - "step": 16022, - "time_per_iteration": 2.6913392543792725 - }, - { - "auxiliary_loss_clip": 0.01079957, - "auxiliary_loss_mlp": 0.01037399, - "balance_loss_clip": 1.03176844, - "balance_loss_mlp": 1.02543032, - "epoch": 0.9633548774988727, - "flos": 26395599939840.0, - "grad_norm": 1.8655459266575873, - "language_loss": 0.73232532, - "learning_rate": 1.4047369812829168e-08, - "loss": 0.75349891, - "num_input_tokens_seen": 345696725, - "step": 16023, - "time_per_iteration": 2.8063480854034424 - }, - { - "auxiliary_loss_clip": 0.01092328, - "auxiliary_loss_mlp": 0.01033534, - "balance_loss_clip": 1.03424549, - "balance_loss_mlp": 1.02042127, - "epoch": 0.9634150007515406, - "flos": 23767728929280.0, - "grad_norm": 1.4474458645948844, - "language_loss": 0.81416321, - "learning_rate": 1.4001328968960891e-08, - "loss": 0.8354218, - "num_input_tokens_seen": 345716245, - "step": 16024, - "time_per_iteration": 2.6448142528533936 - }, - { - "auxiliary_loss_clip": 0.01102103, - "auxiliary_loss_mlp": 0.01032463, - "balance_loss_clip": 1.03745365, - "balance_loss_mlp": 1.01969528, - "epoch": 0.9634751240042086, - "flos": 24135346673280.0, - "grad_norm": 1.5219305560935168, - "language_loss": 0.81457579, - "learning_rate": 1.3955363434361212e-08, - "loss": 0.83592141, - "num_input_tokens_seen": 345739060, - "step": 16025, - "time_per_iteration": 2.6108663082122803 - }, - { - "auxiliary_loss_clip": 0.0110069, - "auxiliary_loss_mlp": 0.01030354, - "balance_loss_clip": 1.03579926, - "balance_loss_mlp": 1.017694, - "epoch": 0.9635352472568766, - "flos": 24349086552960.0, - "grad_norm": 2.10441973449587, - "language_loss": 0.75937688, - "learning_rate": 1.3909473210773181e-08, - "loss": 0.78068733, - "num_input_tokens_seen": 345758325, - "step": 16026, - "time_per_iteration": 2.6266496181488037 - }, - { - "auxiliary_loss_clip": 0.01073067, - "auxiliary_loss_mlp": 0.00772375, - "balance_loss_clip": 1.03285146, - "balance_loss_mlp": 1.00015044, - "epoch": 0.9635953705095446, - "flos": 23984772860160.0, - "grad_norm": 1.7472949763500514, - "language_loss": 0.632388, - "learning_rate": 1.3863658299936965e-08, - "loss": 0.65084237, - "num_input_tokens_seen": 345778530, - "step": 16027, - "time_per_iteration": 2.7170257568359375 - }, - { - "auxiliary_loss_clip": 0.01099141, - "auxiliary_loss_mlp": 0.01031793, - "balance_loss_clip": 1.0367496, - "balance_loss_mlp": 1.01860225, - "epoch": 0.9636554937622125, - "flos": 19828436365440.0, - "grad_norm": 2.4648071032004997, - "language_loss": 0.87019849, - "learning_rate": 1.3817918703589837e-08, - "loss": 0.89150786, - "num_input_tokens_seen": 345796535, - "step": 16028, - "time_per_iteration": 2.6614620685577393 - }, - { - "auxiliary_loss_clip": 0.00989988, - "auxiliary_loss_mlp": 0.009984, - "balance_loss_clip": 1.01412296, - "balance_loss_mlp": 0.99733889, - "epoch": 0.9637156170148805, - "flos": 67435499986560.0, - "grad_norm": 0.7245646375690661, - "language_loss": 0.53189749, - "learning_rate": 1.3772254423466412e-08, - "loss": 0.55178136, - "num_input_tokens_seen": 345859700, - "step": 16029, - "time_per_iteration": 3.479651927947998 - }, - { - "auxiliary_loss_clip": 0.01110359, - "auxiliary_loss_mlp": 0.01030913, - "balance_loss_clip": 1.0374155, - "balance_loss_mlp": 1.01844335, - "epoch": 0.9637757402675484, - "flos": 20300912887680.0, - "grad_norm": 1.5210490896959066, - "language_loss": 0.7357558, - "learning_rate": 1.372666546129797e-08, - "loss": 0.75716853, - "num_input_tokens_seen": 345878760, - "step": 16030, - "time_per_iteration": 4.589270353317261 - }, - { - "auxiliary_loss_clip": 0.01082803, - "auxiliary_loss_mlp": 0.01030981, - "balance_loss_clip": 1.03516376, - "balance_loss_mlp": 1.01882792, - "epoch": 0.9638358635202164, - "flos": 27234544970880.0, - "grad_norm": 2.0480859370229485, - "language_loss": 0.66053402, - "learning_rate": 1.3681151818813575e-08, - "loss": 0.68167186, - "num_input_tokens_seen": 345900445, - "step": 16031, - "time_per_iteration": 4.3295276165008545 - }, - { - "auxiliary_loss_clip": 0.0101801, - "auxiliary_loss_mlp": 0.00751055, - "balance_loss_clip": 1.00562906, - "balance_loss_mlp": 0.99969733, - "epoch": 0.9638959867728845, - "flos": 70288998278400.0, - "grad_norm": 0.8510769072154526, - "language_loss": 0.60678655, - "learning_rate": 1.3635713497738955e-08, - "loss": 0.62447721, - "num_input_tokens_seen": 345961020, - "step": 16032, - "time_per_iteration": 4.807501554489136 - }, - { - "auxiliary_loss_clip": 0.01087947, - "auxiliary_loss_mlp": 0.01029285, - "balance_loss_clip": 1.03354275, - "balance_loss_mlp": 1.01818657, - "epoch": 0.9639561100255524, - "flos": 25407517639680.0, - "grad_norm": 1.683266113413322, - "language_loss": 0.66466224, - "learning_rate": 1.3590350499796954e-08, - "loss": 0.68583459, - "num_input_tokens_seen": 345980210, - "step": 16033, - "time_per_iteration": 2.6166305541992188 - }, - { - "auxiliary_loss_clip": 0.01049582, - "auxiliary_loss_mlp": 0.01033058, - "balance_loss_clip": 1.03215432, - "balance_loss_mlp": 1.02048707, - "epoch": 0.9640162332782204, - "flos": 18113881495680.0, - "grad_norm": 1.6681343659776384, - "language_loss": 0.65576452, - "learning_rate": 1.3545062826707976e-08, - "loss": 0.67659092, - "num_input_tokens_seen": 345998280, - "step": 16034, - "time_per_iteration": 2.727808713912964 - }, - { - "auxiliary_loss_clip": 0.01064646, - "auxiliary_loss_mlp": 0.01034078, - "balance_loss_clip": 1.03320181, - "balance_loss_mlp": 1.02124476, - "epoch": 0.9640763565308883, - "flos": 23440295525760.0, - "grad_norm": 4.072427623407378, - "language_loss": 0.74320328, - "learning_rate": 1.3499850480189313e-08, - "loss": 0.7641905, - "num_input_tokens_seen": 346015545, - "step": 16035, - "time_per_iteration": 2.690566301345825 - }, - { - "auxiliary_loss_clip": 0.01111339, - "auxiliary_loss_mlp": 0.0102927, - "balance_loss_clip": 1.03984404, - "balance_loss_mlp": 1.01689649, - "epoch": 0.9641364797835563, - "flos": 22419355259520.0, - "grad_norm": 1.9463375206505085, - "language_loss": 0.81678671, - "learning_rate": 1.3454713461955591e-08, - "loss": 0.83819282, - "num_input_tokens_seen": 346034055, - "step": 16036, - "time_per_iteration": 4.158876180648804 - }, - { - "auxiliary_loss_clip": 0.0107928, - "auxiliary_loss_mlp": 0.0103524, - "balance_loss_clip": 1.03454709, - "balance_loss_mlp": 1.0221982, - "epoch": 0.9641966030362242, - "flos": 30622357048320.0, - "grad_norm": 1.959482947327249, - "language_loss": 0.69556695, - "learning_rate": 1.340965177371789e-08, - "loss": 0.71671212, - "num_input_tokens_seen": 346054130, - "step": 16037, - "time_per_iteration": 2.7688260078430176 - }, - { - "auxiliary_loss_clip": 0.01107935, - "auxiliary_loss_mlp": 0.01027132, - "balance_loss_clip": 1.03539455, - "balance_loss_mlp": 1.014907, - "epoch": 0.9642567262888923, - "flos": 20953122088320.0, - "grad_norm": 1.9894338477603324, - "language_loss": 0.63357198, - "learning_rate": 1.3364665417185506e-08, - "loss": 0.65492266, - "num_input_tokens_seen": 346072990, - "step": 16038, - "time_per_iteration": 2.5850584506988525 - }, - { - "auxiliary_loss_clip": 0.01074768, - "auxiliary_loss_mlp": 0.00773215, - "balance_loss_clip": 1.03389633, - "balance_loss_mlp": 1.00020552, - "epoch": 0.9643168495415602, - "flos": 22639415932800.0, - "grad_norm": 1.7788742009808307, - "language_loss": 0.71187615, - "learning_rate": 1.3319754394064187e-08, - "loss": 0.73035598, - "num_input_tokens_seen": 346093745, - "step": 16039, - "time_per_iteration": 2.845629930496216 - }, - { - "auxiliary_loss_clip": 0.01065131, - "auxiliary_loss_mlp": 0.0103299, - "balance_loss_clip": 1.03418183, - "balance_loss_mlp": 1.02005541, - "epoch": 0.9643769727942282, - "flos": 20266259241600.0, - "grad_norm": 2.1186364424609376, - "language_loss": 0.73193431, - "learning_rate": 1.327491870605657e-08, - "loss": 0.7529155, - "num_input_tokens_seen": 346110115, - "step": 16040, - "time_per_iteration": 2.786925792694092 - }, - { - "auxiliary_loss_clip": 0.01098258, - "auxiliary_loss_mlp": 0.01030326, - "balance_loss_clip": 1.03442872, - "balance_loss_mlp": 1.0174036, - "epoch": 0.9644370960468961, - "flos": 13881845088000.0, - "grad_norm": 2.252747259214268, - "language_loss": 0.72871804, - "learning_rate": 1.3230158354863296e-08, - "loss": 0.75000393, - "num_input_tokens_seen": 346127165, - "step": 16041, - "time_per_iteration": 2.6087379455566406 - }, - { - "auxiliary_loss_clip": 0.01079942, - "auxiliary_loss_mlp": 0.01032192, - "balance_loss_clip": 1.03319049, - "balance_loss_mlp": 1.0204258, - "epoch": 0.9644972192995641, - "flos": 17238199829760.0, - "grad_norm": 2.3450259817434675, - "language_loss": 0.7170828, - "learning_rate": 1.3185473342181674e-08, - "loss": 0.73820412, - "num_input_tokens_seen": 346145950, - "step": 16042, - "time_per_iteration": 2.630866765975952 - }, - { - "auxiliary_loss_clip": 0.01071379, - "auxiliary_loss_mlp": 0.0103485, - "balance_loss_clip": 1.03418255, - "balance_loss_mlp": 1.02246428, - "epoch": 0.964557342552232, - "flos": 23840340272640.0, - "grad_norm": 2.7842681771990954, - "language_loss": 0.80969441, - "learning_rate": 1.3140863669705683e-08, - "loss": 0.83075678, - "num_input_tokens_seen": 346165005, - "step": 16043, - "time_per_iteration": 2.7390518188476562 - }, - { - "auxiliary_loss_clip": 0.01080445, - "auxiliary_loss_mlp": 0.01033983, - "balance_loss_clip": 1.03533363, - "balance_loss_mlp": 1.0219785, - "epoch": 0.9646174658049, - "flos": 21653129312640.0, - "grad_norm": 1.6601766857412785, - "language_loss": 0.71968645, - "learning_rate": 1.3096329339127522e-08, - "loss": 0.74083078, - "num_input_tokens_seen": 346185095, - "step": 16044, - "time_per_iteration": 2.7201802730560303 - }, - { - "auxiliary_loss_clip": 0.01082368, - "auxiliary_loss_mlp": 0.01030048, - "balance_loss_clip": 1.03337395, - "balance_loss_mlp": 1.01734579, - "epoch": 0.9646775890575681, - "flos": 17129570123520.0, - "grad_norm": 1.8680563800690775, - "language_loss": 0.70015121, - "learning_rate": 1.3051870352135397e-08, - "loss": 0.72127533, - "num_input_tokens_seen": 346202580, - "step": 16045, - "time_per_iteration": 2.612548589706421 - }, - { - "auxiliary_loss_clip": 0.01038509, - "auxiliary_loss_mlp": 0.01033861, - "balance_loss_clip": 1.03134286, - "balance_loss_mlp": 1.02050328, - "epoch": 0.964737712310236, - "flos": 13005732458880.0, - "grad_norm": 1.8349369977772942, - "language_loss": 0.74999833, - "learning_rate": 1.3007486710415737e-08, - "loss": 0.77072203, - "num_input_tokens_seen": 346219395, - "step": 16046, - "time_per_iteration": 2.7376601696014404 - }, - { - "auxiliary_loss_clip": 0.0110139, - "auxiliary_loss_mlp": 0.01036417, - "balance_loss_clip": 1.03724825, - "balance_loss_mlp": 1.0229938, - "epoch": 0.964797835562904, - "flos": 24279240556800.0, - "grad_norm": 2.6746075842728643, - "language_loss": 0.62799901, - "learning_rate": 1.2963178415651199e-08, - "loss": 0.64937705, - "num_input_tokens_seen": 346239715, - "step": 16047, - "time_per_iteration": 2.6332709789276123 - }, - { - "auxiliary_loss_clip": 0.01088739, - "auxiliary_loss_mlp": 0.01036737, - "balance_loss_clip": 1.0394733, - "balance_loss_mlp": 1.02437496, - "epoch": 0.9648579588155719, - "flos": 20522697413760.0, - "grad_norm": 3.1779887131346722, - "language_loss": 0.68779409, - "learning_rate": 1.2918945469521992e-08, - "loss": 0.70904881, - "num_input_tokens_seen": 346258500, - "step": 16048, - "time_per_iteration": 2.6534385681152344 - }, - { - "auxiliary_loss_clip": 0.01099634, - "auxiliary_loss_mlp": 0.01033136, - "balance_loss_clip": 1.03766835, - "balance_loss_mlp": 1.02040398, - "epoch": 0.9649180820682399, - "flos": 32154844855680.0, - "grad_norm": 1.6641327759979738, - "language_loss": 0.63842821, - "learning_rate": 1.2874787873705662e-08, - "loss": 0.65975595, - "num_input_tokens_seen": 346279110, - "step": 16049, - "time_per_iteration": 2.7865707874298096 - }, - { - "auxiliary_loss_clip": 0.0110081, - "auxiliary_loss_mlp": 0.01031714, - "balance_loss_clip": 1.03886986, - "balance_loss_mlp": 1.01909614, - "epoch": 0.9649782053209078, - "flos": 20522589672960.0, - "grad_norm": 1.7000541737371648, - "language_loss": 0.70881176, - "learning_rate": 1.2830705629876427e-08, - "loss": 0.73013705, - "num_input_tokens_seen": 346297860, - "step": 16050, - "time_per_iteration": 2.6416265964508057 - }, - { - "auxiliary_loss_clip": 0.01097319, - "auxiliary_loss_mlp": 0.01036578, - "balance_loss_clip": 1.03291678, - "balance_loss_mlp": 1.02254736, - "epoch": 0.9650383285735759, - "flos": 43067953843200.0, - "grad_norm": 1.8954759239301664, - "language_loss": 0.70080233, - "learning_rate": 1.278669873970606e-08, - "loss": 0.72214133, - "num_input_tokens_seen": 346319860, - "step": 16051, - "time_per_iteration": 2.8770833015441895 - }, - { - "auxiliary_loss_clip": 0.0101809, - "auxiliary_loss_mlp": 0.01006389, - "balance_loss_clip": 1.0055362, - "balance_loss_mlp": 1.00536346, - "epoch": 0.9650984518262438, - "flos": 61748255882880.0, - "grad_norm": 0.8414397743745523, - "language_loss": 0.59155834, - "learning_rate": 1.2742767204863004e-08, - "loss": 0.61180305, - "num_input_tokens_seen": 346379025, - "step": 16052, - "time_per_iteration": 3.190720796585083 - }, - { - "auxiliary_loss_clip": 0.01103599, - "auxiliary_loss_mlp": 0.01027391, - "balance_loss_clip": 1.03430974, - "balance_loss_mlp": 1.01511848, - "epoch": 0.9651585750789118, - "flos": 29789337761280.0, - "grad_norm": 1.6456088019089208, - "language_loss": 0.74250531, - "learning_rate": 1.2698911027013482e-08, - "loss": 0.76381516, - "num_input_tokens_seen": 346402250, - "step": 16053, - "time_per_iteration": 2.707024335861206 - }, - { - "auxiliary_loss_clip": 0.01083745, - "auxiliary_loss_mlp": 0.01030902, - "balance_loss_clip": 1.03504825, - "balance_loss_mlp": 1.01819479, - "epoch": 0.9652186983315797, - "flos": 16873060124160.0, - "grad_norm": 2.539273604923119, - "language_loss": 0.68519378, - "learning_rate": 1.2655130207820386e-08, - "loss": 0.70634031, - "num_input_tokens_seen": 346419555, - "step": 16054, - "time_per_iteration": 2.650216817855835 - }, - { - "auxiliary_loss_clip": 0.01091665, - "auxiliary_loss_mlp": 0.00769869, - "balance_loss_clip": 1.03783798, - "balance_loss_mlp": 1.00018322, - "epoch": 0.9652788215842477, - "flos": 31649761762560.0, - "grad_norm": 1.504057282029753, - "language_loss": 0.6170547, - "learning_rate": 1.2611424748943944e-08, - "loss": 0.63567007, - "num_input_tokens_seen": 346441245, - "step": 16055, - "time_per_iteration": 2.708653450012207 - }, - { - "auxiliary_loss_clip": 0.01069001, - "auxiliary_loss_mlp": 0.01032832, - "balance_loss_clip": 1.03551924, - "balance_loss_mlp": 1.02045822, - "epoch": 0.9653389448369156, - "flos": 24754266944640.0, - "grad_norm": 1.8401015391403723, - "language_loss": 0.77219534, - "learning_rate": 1.2567794652041719e-08, - "loss": 0.79321373, - "num_input_tokens_seen": 346460065, - "step": 16056, - "time_per_iteration": 2.860055446624756 - }, - { - "auxiliary_loss_clip": 0.01081129, - "auxiliary_loss_mlp": 0.01031879, - "balance_loss_clip": 1.03277361, - "balance_loss_mlp": 1.01884317, - "epoch": 0.9653990680895836, - "flos": 20297249700480.0, - "grad_norm": 1.5426450454186225, - "language_loss": 0.71504593, - "learning_rate": 1.2524239918767498e-08, - "loss": 0.73617601, - "num_input_tokens_seen": 346478005, - "step": 16057, - "time_per_iteration": 2.6402721405029297 - }, - { - "auxiliary_loss_clip": 0.01104126, - "auxiliary_loss_mlp": 0.01033201, - "balance_loss_clip": 1.03448784, - "balance_loss_mlp": 1.02154279, - "epoch": 0.9654591913422517, - "flos": 22528775064960.0, - "grad_norm": 2.1611646514701786, - "language_loss": 0.71808469, - "learning_rate": 1.2480760550773295e-08, - "loss": 0.73945796, - "num_input_tokens_seen": 346497575, - "step": 16058, - "time_per_iteration": 2.5751798152923584 - }, - { - "auxiliary_loss_clip": 0.01095378, - "auxiliary_loss_mlp": 0.01032935, - "balance_loss_clip": 1.0353595, - "balance_loss_mlp": 1.02075791, - "epoch": 0.9655193145949196, - "flos": 26763002202240.0, - "grad_norm": 1.6714085825517457, - "language_loss": 0.74098462, - "learning_rate": 1.2437356549708011e-08, - "loss": 0.76226771, - "num_input_tokens_seen": 346520000, - "step": 16059, - "time_per_iteration": 2.7003426551818848 - }, - { - "auxiliary_loss_clip": 0.01090004, - "auxiliary_loss_mlp": 0.01034435, - "balance_loss_clip": 1.03552127, - "balance_loss_mlp": 1.02234113, - "epoch": 0.9655794378475876, - "flos": 41970703132800.0, - "grad_norm": 1.9389350053805974, - "language_loss": 0.73612213, - "learning_rate": 1.239402791721722e-08, - "loss": 0.75736654, - "num_input_tokens_seen": 346541605, - "step": 16060, - "time_per_iteration": 2.784961462020874 - }, - { - "auxiliary_loss_clip": 0.01084764, - "auxiliary_loss_mlp": 0.01031011, - "balance_loss_clip": 1.03691041, - "balance_loss_mlp": 1.019889, - "epoch": 0.9656395611002555, - "flos": 27709427704320.0, - "grad_norm": 2.3988386788091502, - "language_loss": 0.76481092, - "learning_rate": 1.2350774654944273e-08, - "loss": 0.78596866, - "num_input_tokens_seen": 346560955, - "step": 16061, - "time_per_iteration": 2.7270572185516357 - }, - { - "auxiliary_loss_clip": 0.01012338, - "auxiliary_loss_mlp": 0.01000976, - "balance_loss_clip": 1.00929773, - "balance_loss_mlp": 1.00001049, - "epoch": 0.9656996843529235, - "flos": 68968562411520.0, - "grad_norm": 0.7235401443187384, - "language_loss": 0.64154565, - "learning_rate": 1.2307596764528749e-08, - "loss": 0.66167879, - "num_input_tokens_seen": 346621615, - "step": 16062, - "time_per_iteration": 3.263425827026367 - }, - { - "auxiliary_loss_clip": 0.01055166, - "auxiliary_loss_mlp": 0.01028261, - "balance_loss_clip": 1.02995956, - "balance_loss_mlp": 1.01672757, - "epoch": 0.9657598076055914, - "flos": 20631327120000.0, - "grad_norm": 1.9907973555494325, - "language_loss": 0.92924762, - "learning_rate": 1.226449424760867e-08, - "loss": 0.95008188, - "num_input_tokens_seen": 346637460, - "step": 16063, - "time_per_iteration": 2.728024959564209 - }, - { - "auxiliary_loss_clip": 0.01099068, - "auxiliary_loss_mlp": 0.01033776, - "balance_loss_clip": 1.03742814, - "balance_loss_mlp": 1.02153897, - "epoch": 0.9658199308582595, - "flos": 20448577699200.0, - "grad_norm": 1.7715018792062844, - "language_loss": 0.82029349, - "learning_rate": 1.2221467105818062e-08, - "loss": 0.84162194, - "num_input_tokens_seen": 346655625, - "step": 16064, - "time_per_iteration": 2.633328914642334 - }, - { - "auxiliary_loss_clip": 0.01095042, - "auxiliary_loss_mlp": 0.00770428, - "balance_loss_clip": 1.03698933, - "balance_loss_mlp": 1.0001657, - "epoch": 0.9658800541109274, - "flos": 24718033100160.0, - "grad_norm": 1.5465951740979789, - "language_loss": 0.84208536, - "learning_rate": 1.2178515340788731e-08, - "loss": 0.86074007, - "num_input_tokens_seen": 346675220, - "step": 16065, - "time_per_iteration": 2.6656553745269775 - }, - { - "auxiliary_loss_clip": 0.01083456, - "auxiliary_loss_mlp": 0.01029904, - "balance_loss_clip": 1.03509152, - "balance_loss_mlp": 1.01748872, - "epoch": 0.9659401773635954, - "flos": 21610035970560.0, - "grad_norm": 1.7587516083331964, - "language_loss": 0.67517728, - "learning_rate": 1.2135638954149151e-08, - "loss": 0.69631088, - "num_input_tokens_seen": 346694710, - "step": 16066, - "time_per_iteration": 2.6471195220947266 - }, - { - "auxiliary_loss_clip": 0.01107434, - "auxiliary_loss_mlp": 0.0102656, - "balance_loss_clip": 1.03636479, - "balance_loss_mlp": 1.01466918, - "epoch": 0.9660003006162633, - "flos": 20301200196480.0, - "grad_norm": 1.8111019231916714, - "language_loss": 0.82353568, - "learning_rate": 1.209283794752558e-08, - "loss": 0.84487563, - "num_input_tokens_seen": 346712645, - "step": 16067, - "time_per_iteration": 2.605968952178955 - }, - { - "auxiliary_loss_clip": 0.01087949, - "auxiliary_loss_mlp": 0.01029805, - "balance_loss_clip": 1.03769147, - "balance_loss_mlp": 1.01721048, - "epoch": 0.9660604238689313, - "flos": 24461954064000.0, - "grad_norm": 2.0465290050813496, - "language_loss": 0.69553685, - "learning_rate": 1.2050112322540496e-08, - "loss": 0.71671438, - "num_input_tokens_seen": 346732375, - "step": 16068, - "time_per_iteration": 2.7153985500335693 - }, - { - "auxiliary_loss_clip": 0.01085915, - "auxiliary_loss_mlp": 0.01031292, - "balance_loss_clip": 1.0330863, - "balance_loss_mlp": 1.02038455, - "epoch": 0.9661205471215992, - "flos": 19864023765120.0, - "grad_norm": 1.6826807111904172, - "language_loss": 0.68126762, - "learning_rate": 1.20074620808146e-08, - "loss": 0.70243973, - "num_input_tokens_seen": 346750430, - "step": 16069, - "time_per_iteration": 2.576427936553955 - }, - { - "auxiliary_loss_clip": 0.01089339, - "auxiliary_loss_mlp": 0.01028006, - "balance_loss_clip": 1.03860068, - "balance_loss_mlp": 1.01594257, - "epoch": 0.9661806703742672, - "flos": 20557889763840.0, - "grad_norm": 1.979804846920071, - "language_loss": 0.8906877, - "learning_rate": 1.1964887223964826e-08, - "loss": 0.91186118, - "num_input_tokens_seen": 346768455, - "step": 16070, - "time_per_iteration": 5.773402214050293 - }, - { - "auxiliary_loss_clip": 0.01111791, - "auxiliary_loss_mlp": 0.01038495, - "balance_loss_clip": 1.03955567, - "balance_loss_mlp": 1.02573359, - "epoch": 0.9662407936269353, - "flos": 21430949736960.0, - "grad_norm": 2.2069490271978327, - "language_loss": 0.77111554, - "learning_rate": 1.1922387753605878e-08, - "loss": 0.79261839, - "num_input_tokens_seen": 346786530, - "step": 16071, - "time_per_iteration": 4.432383060455322 - }, - { - "auxiliary_loss_clip": 0.01083604, - "auxiliary_loss_mlp": 0.01031428, - "balance_loss_clip": 1.03396428, - "balance_loss_mlp": 1.01729596, - "epoch": 0.9663009168796032, - "flos": 14902893095040.0, - "grad_norm": 1.7077316996855652, - "language_loss": 0.65930271, - "learning_rate": 1.1879963671349137e-08, - "loss": 0.680453, - "num_input_tokens_seen": 346804635, - "step": 16072, - "time_per_iteration": 2.6231675148010254 - }, - { - "auxiliary_loss_clip": 0.01101171, - "auxiliary_loss_mlp": 0.01031913, - "balance_loss_clip": 1.03714108, - "balance_loss_mlp": 1.02001643, - "epoch": 0.9663610401322712, - "flos": 24310877460480.0, - "grad_norm": 1.7479386785661417, - "language_loss": 0.77363575, - "learning_rate": 1.1837614978803534e-08, - "loss": 0.7949667, - "num_input_tokens_seen": 346823070, - "step": 16073, - "time_per_iteration": 2.6588406562805176 - }, - { - "auxiliary_loss_clip": 0.01113364, - "auxiliary_loss_mlp": 0.01036179, - "balance_loss_clip": 1.03833628, - "balance_loss_mlp": 1.02317297, - "epoch": 0.9664211633849391, - "flos": 17637849527040.0, - "grad_norm": 4.0139714248409515, - "language_loss": 0.7596699, - "learning_rate": 1.1795341677574677e-08, - "loss": 0.78116536, - "num_input_tokens_seen": 346841180, - "step": 16074, - "time_per_iteration": 2.5176475048065186 - }, - { - "auxiliary_loss_clip": 0.01085316, - "auxiliary_loss_mlp": 0.01031408, - "balance_loss_clip": 1.03595638, - "balance_loss_mlp": 1.01841474, - "epoch": 0.9664812866376071, - "flos": 29789409588480.0, - "grad_norm": 1.5863798083052476, - "language_loss": 0.75684714, - "learning_rate": 1.1753143769265728e-08, - "loss": 0.77801442, - "num_input_tokens_seen": 346864250, - "step": 16075, - "time_per_iteration": 2.740597724914551 - }, - { - "auxiliary_loss_clip": 0.01078205, - "auxiliary_loss_mlp": 0.01035752, - "balance_loss_clip": 1.03695774, - "balance_loss_mlp": 1.02323484, - "epoch": 0.966541409890275, - "flos": 14282320798080.0, - "grad_norm": 1.8962598568271254, - "language_loss": 0.78820133, - "learning_rate": 1.171102125547696e-08, - "loss": 0.80934089, - "num_input_tokens_seen": 346881955, - "step": 16076, - "time_per_iteration": 4.21985650062561 - }, - { - "auxiliary_loss_clip": 0.01089256, - "auxiliary_loss_mlp": 0.01043191, - "balance_loss_clip": 1.03779173, - "balance_loss_mlp": 1.02938676, - "epoch": 0.9666015331429431, - "flos": 19860432405120.0, - "grad_norm": 1.7349669135653192, - "language_loss": 0.7190969, - "learning_rate": 1.166897413780532e-08, - "loss": 0.74042135, - "num_input_tokens_seen": 346900445, - "step": 16077, - "time_per_iteration": 2.626159191131592 - }, - { - "auxiliary_loss_clip": 0.01093266, - "auxiliary_loss_mlp": 0.01032581, - "balance_loss_clip": 1.03399146, - "balance_loss_mlp": 1.01980758, - "epoch": 0.966661656395611, - "flos": 27125951178240.0, - "grad_norm": 1.9266659878552612, - "language_loss": 0.593472, - "learning_rate": 1.1627002417845533e-08, - "loss": 0.61473054, - "num_input_tokens_seen": 346920135, - "step": 16078, - "time_per_iteration": 2.6967270374298096 - }, - { - "auxiliary_loss_clip": 0.01101009, - "auxiliary_loss_mlp": 0.0103522, - "balance_loss_clip": 1.03683424, - "balance_loss_mlp": 1.02165985, - "epoch": 0.966721779648279, - "flos": 21508229848320.0, - "grad_norm": 1.883589824979691, - "language_loss": 0.72105432, - "learning_rate": 1.158510609718899e-08, - "loss": 0.74241656, - "num_input_tokens_seen": 346940450, - "step": 16079, - "time_per_iteration": 2.63110089302063 - }, - { - "auxiliary_loss_clip": 0.01094454, - "auxiliary_loss_mlp": 0.01027425, - "balance_loss_clip": 1.03552699, - "balance_loss_mlp": 1.01592135, - "epoch": 0.9667819029009469, - "flos": 23878118401920.0, - "grad_norm": 1.528864037931963, - "language_loss": 0.71972895, - "learning_rate": 1.1543285177424644e-08, - "loss": 0.74094772, - "num_input_tokens_seen": 346960935, - "step": 16080, - "time_per_iteration": 2.6290252208709717 - }, - { - "auxiliary_loss_clip": 0.01075746, - "auxiliary_loss_mlp": 0.0103416, - "balance_loss_clip": 1.03217447, - "balance_loss_mlp": 1.02045643, - "epoch": 0.9668420261536149, - "flos": 21507224267520.0, - "grad_norm": 1.6987481016197885, - "language_loss": 0.73362374, - "learning_rate": 1.1501539660138115e-08, - "loss": 0.75472283, - "num_input_tokens_seen": 346980100, - "step": 16081, - "time_per_iteration": 2.6839892864227295 - }, - { - "auxiliary_loss_clip": 0.01080983, - "auxiliary_loss_mlp": 0.0102965, - "balance_loss_clip": 1.03344238, - "balance_loss_mlp": 1.01646531, - "epoch": 0.9669021494062828, - "flos": 26687266375680.0, - "grad_norm": 1.771131179159937, - "language_loss": 0.67452699, - "learning_rate": 1.145986954691236e-08, - "loss": 0.69563329, - "num_input_tokens_seen": 347001250, - "step": 16082, - "time_per_iteration": 2.7003889083862305 - }, - { - "auxiliary_loss_clip": 0.01065498, - "auxiliary_loss_mlp": 0.01042485, - "balance_loss_clip": 1.0319593, - "balance_loss_mlp": 1.02886534, - "epoch": 0.9669622726589508, - "flos": 29825032901760.0, - "grad_norm": 1.870561288505191, - "language_loss": 0.76813722, - "learning_rate": 1.141827483932789e-08, - "loss": 0.78921711, - "num_input_tokens_seen": 347022975, - "step": 16083, - "time_per_iteration": 2.736612558364868 - }, - { - "auxiliary_loss_clip": 0.01061787, - "auxiliary_loss_mlp": 0.01033012, - "balance_loss_clip": 1.03434837, - "balance_loss_mlp": 1.0202508, - "epoch": 0.9670223959116189, - "flos": 22922499018240.0, - "grad_norm": 2.1852037151642203, - "language_loss": 0.79155672, - "learning_rate": 1.1376755538961669e-08, - "loss": 0.81250471, - "num_input_tokens_seen": 347038780, - "step": 16084, - "time_per_iteration": 2.7562255859375 - }, - { - "auxiliary_loss_clip": 0.01101094, - "auxiliary_loss_mlp": 0.01029867, - "balance_loss_clip": 1.03601408, - "balance_loss_mlp": 1.01673627, - "epoch": 0.9670825191642868, - "flos": 18624495283200.0, - "grad_norm": 2.1888422828676655, - "language_loss": 0.6779865, - "learning_rate": 1.1335311647387991e-08, - "loss": 0.69929618, - "num_input_tokens_seen": 347056705, - "step": 16085, - "time_per_iteration": 2.5915327072143555 - }, - { - "auxiliary_loss_clip": 0.01089717, - "auxiliary_loss_mlp": 0.01031896, - "balance_loss_clip": 1.03720474, - "balance_loss_mlp": 1.01825213, - "epoch": 0.9671426424169548, - "flos": 24497936513280.0, - "grad_norm": 2.1661579345086097, - "language_loss": 0.69027126, - "learning_rate": 1.1293943166178709e-08, - "loss": 0.71148735, - "num_input_tokens_seen": 347075710, - "step": 16086, - "time_per_iteration": 2.6948018074035645 - }, - { - "auxiliary_loss_clip": 0.01095229, - "auxiliary_loss_mlp": 0.01033969, - "balance_loss_clip": 1.03497195, - "balance_loss_mlp": 1.02086806, - "epoch": 0.9672027656696227, - "flos": 20371189847040.0, - "grad_norm": 1.6309967969700185, - "language_loss": 0.78254652, - "learning_rate": 1.125265009690235e-08, - "loss": 0.80383849, - "num_input_tokens_seen": 347092325, - "step": 16087, - "time_per_iteration": 2.638317346572876 - }, - { - "auxiliary_loss_clip": 0.0107816, - "auxiliary_loss_mlp": 0.01029378, - "balance_loss_clip": 1.0323391, - "balance_loss_mlp": 1.0173974, - "epoch": 0.9672628889222907, - "flos": 18880179269760.0, - "grad_norm": 1.8693958793677588, - "language_loss": 0.71220851, - "learning_rate": 1.1211432441124769e-08, - "loss": 0.73328388, - "num_input_tokens_seen": 347110595, - "step": 16088, - "time_per_iteration": 2.7119359970092773 - }, - { - "auxiliary_loss_clip": 0.01105883, - "auxiliary_loss_mlp": 0.00770003, - "balance_loss_clip": 1.03694296, - "balance_loss_mlp": 1.00009024, - "epoch": 0.9673230121749586, - "flos": 28695247447680.0, - "grad_norm": 1.6915080049875915, - "language_loss": 0.70655894, - "learning_rate": 1.117029020040916e-08, - "loss": 0.72531772, - "num_input_tokens_seen": 347131625, - "step": 16089, - "time_per_iteration": 2.5807154178619385 - }, - { - "auxiliary_loss_clip": 0.01110035, - "auxiliary_loss_mlp": 0.01032325, - "balance_loss_clip": 1.03722262, - "balance_loss_mlp": 1.02004623, - "epoch": 0.9673831354276267, - "flos": 20484452407680.0, - "grad_norm": 2.217046221899868, - "language_loss": 0.7484473, - "learning_rate": 1.1129223376315167e-08, - "loss": 0.76987088, - "num_input_tokens_seen": 347147910, - "step": 16090, - "time_per_iteration": 2.5390427112579346 - }, - { - "auxiliary_loss_clip": 0.01087487, - "auxiliary_loss_mlp": 0.0102933, - "balance_loss_clip": 1.03507531, - "balance_loss_mlp": 1.01677692, - "epoch": 0.9674432586802946, - "flos": 26797548107520.0, - "grad_norm": 2.2040190235185158, - "language_loss": 0.69111538, - "learning_rate": 1.1088231970400653e-08, - "loss": 0.71228355, - "num_input_tokens_seen": 347168805, - "step": 16091, - "time_per_iteration": 2.672116279602051 - }, - { - "auxiliary_loss_clip": 0.01106741, - "auxiliary_loss_mlp": 0.01031458, - "balance_loss_clip": 1.03664362, - "balance_loss_mlp": 1.01816666, - "epoch": 0.9675033819329626, - "flos": 22310941034880.0, - "grad_norm": 1.7246952798155581, - "language_loss": 0.76974177, - "learning_rate": 1.1047315984219484e-08, - "loss": 0.79112375, - "num_input_tokens_seen": 347189455, - "step": 16092, - "time_per_iteration": 2.562080144882202 - }, - { - "auxiliary_loss_clip": 0.01107911, - "auxiliary_loss_mlp": 0.01030137, - "balance_loss_clip": 1.0373435, - "balance_loss_mlp": 1.01862192, - "epoch": 0.9675635051856305, - "flos": 12675713276160.0, - "grad_norm": 1.9074028879734577, - "language_loss": 0.76118815, - "learning_rate": 1.1006475419323313e-08, - "loss": 0.78256863, - "num_input_tokens_seen": 347206030, - "step": 16093, - "time_per_iteration": 2.5782711505889893 - }, - { - "auxiliary_loss_clip": 0.01083204, - "auxiliary_loss_mlp": 0.01028136, - "balance_loss_clip": 1.03609204, - "balance_loss_mlp": 1.01477861, - "epoch": 0.9676236284382985, - "flos": 24608469640320.0, - "grad_norm": 1.8185482437095273, - "language_loss": 0.68996257, - "learning_rate": 1.096571027726112e-08, - "loss": 0.71107602, - "num_input_tokens_seen": 347226250, - "step": 16094, - "time_per_iteration": 2.642312526702881 - }, - { - "auxiliary_loss_clip": 0.01099843, - "auxiliary_loss_mlp": 0.01031519, - "balance_loss_clip": 1.03670728, - "balance_loss_mlp": 1.01940703, - "epoch": 0.9676837516909664, - "flos": 23367145478400.0, - "grad_norm": 3.1538628444307912, - "language_loss": 0.7587145, - "learning_rate": 1.0925020559578557e-08, - "loss": 0.7800281, - "num_input_tokens_seen": 347247350, - "step": 16095, - "time_per_iteration": 2.6397533416748047 - }, - { - "auxiliary_loss_clip": 0.01114159, - "auxiliary_loss_mlp": 0.01035616, - "balance_loss_clip": 1.03943849, - "balance_loss_mlp": 1.0225327, - "epoch": 0.9677438749436345, - "flos": 20486894532480.0, - "grad_norm": 3.4142773987990513, - "language_loss": 0.70483637, - "learning_rate": 1.0884406267818392e-08, - "loss": 0.7263341, - "num_input_tokens_seen": 347266870, - "step": 16096, - "time_per_iteration": 2.571568727493286 - }, - { - "auxiliary_loss_clip": 0.01086881, - "auxiliary_loss_mlp": 0.01026382, - "balance_loss_clip": 1.03495574, - "balance_loss_mlp": 1.01391292, - "epoch": 0.9678039981963025, - "flos": 47555889719040.0, - "grad_norm": 1.7358126863243992, - "language_loss": 0.7179426, - "learning_rate": 1.0843867403520946e-08, - "loss": 0.73907518, - "num_input_tokens_seen": 347290120, - "step": 16097, - "time_per_iteration": 2.8643288612365723 - }, - { - "auxiliary_loss_clip": 0.01107467, - "auxiliary_loss_mlp": 0.01035068, - "balance_loss_clip": 1.03668487, - "balance_loss_mlp": 1.02286124, - "epoch": 0.9678641214489704, - "flos": 25040474513280.0, - "grad_norm": 1.9803265483631816, - "language_loss": 0.78437316, - "learning_rate": 1.0803403968223434e-08, - "loss": 0.80579853, - "num_input_tokens_seen": 347308785, - "step": 16098, - "time_per_iteration": 2.5864875316619873 - }, - { - "auxiliary_loss_clip": 0.0107379, - "auxiliary_loss_mlp": 0.01027915, - "balance_loss_clip": 1.03619742, - "balance_loss_mlp": 1.01629841, - "epoch": 0.9679242447016384, - "flos": 19240937516160.0, - "grad_norm": 2.361712995723687, - "language_loss": 0.90639651, - "learning_rate": 1.0763015963459965e-08, - "loss": 0.92741358, - "num_input_tokens_seen": 347326375, - "step": 16099, - "time_per_iteration": 2.786999464035034 - }, - { - "auxiliary_loss_clip": 0.01100177, - "auxiliary_loss_mlp": 0.01034182, - "balance_loss_clip": 1.03700566, - "balance_loss_mlp": 1.02131963, - "epoch": 0.9679843679543063, - "flos": 33254681345280.0, - "grad_norm": 1.5241755755242299, - "language_loss": 0.66061008, - "learning_rate": 1.0722703390762643e-08, - "loss": 0.68195367, - "num_input_tokens_seen": 347348250, - "step": 16100, - "time_per_iteration": 2.6941099166870117 - }, - { - "auxiliary_loss_clip": 0.01069319, - "auxiliary_loss_mlp": 0.01035758, - "balance_loss_clip": 1.03754771, - "balance_loss_mlp": 1.02276969, - "epoch": 0.9680444912069743, - "flos": 22783633038720.0, - "grad_norm": 1.6628994278317477, - "language_loss": 0.73592603, - "learning_rate": 1.0682466251659584e-08, - "loss": 0.75697684, - "num_input_tokens_seen": 347367400, - "step": 16101, - "time_per_iteration": 2.6911606788635254 - }, - { - "auxiliary_loss_clip": 0.01085079, - "auxiliary_loss_mlp": 0.01031615, - "balance_loss_clip": 1.03593111, - "balance_loss_mlp": 1.01842427, - "epoch": 0.9681046144596422, - "flos": 24024095274240.0, - "grad_norm": 1.6107351715516067, - "language_loss": 0.73375201, - "learning_rate": 1.0642304547676672e-08, - "loss": 0.75491893, - "num_input_tokens_seen": 347387600, - "step": 16102, - "time_per_iteration": 2.6399521827697754 - }, - { - "auxiliary_loss_clip": 0.01076768, - "auxiliary_loss_mlp": 0.01035627, - "balance_loss_clip": 1.04000163, - "balance_loss_mlp": 1.02195942, - "epoch": 0.9681647377123103, - "flos": 23441013797760.0, - "grad_norm": 1.9528459851096875, - "language_loss": 0.77444363, - "learning_rate": 1.0602218280337139e-08, - "loss": 0.79556757, - "num_input_tokens_seen": 347406915, - "step": 16103, - "time_per_iteration": 2.7056915760040283 - }, - { - "auxiliary_loss_clip": 0.01086653, - "auxiliary_loss_mlp": 0.0103195, - "balance_loss_clip": 1.03456104, - "balance_loss_mlp": 1.0200057, - "epoch": 0.9682248609649782, - "flos": 22675075159680.0, - "grad_norm": 1.639893337105475, - "language_loss": 0.80586064, - "learning_rate": 1.0562207451160655e-08, - "loss": 0.82704663, - "num_input_tokens_seen": 347425140, - "step": 16104, - "time_per_iteration": 2.648461103439331 - }, - { - "auxiliary_loss_clip": 0.01088229, - "auxiliary_loss_mlp": 0.01035055, - "balance_loss_clip": 1.03242385, - "balance_loss_mlp": 1.02403986, - "epoch": 0.9682849842176462, - "flos": 24428413739520.0, - "grad_norm": 1.4906398802277745, - "language_loss": 0.77576089, - "learning_rate": 1.0522272061664672e-08, - "loss": 0.79699373, - "num_input_tokens_seen": 347446350, - "step": 16105, - "time_per_iteration": 2.6988043785095215 - }, - { - "auxiliary_loss_clip": 0.01000224, - "auxiliary_loss_mlp": 0.01003466, - "balance_loss_clip": 1.00602651, - "balance_loss_mlp": 1.00240505, - "epoch": 0.9683451074703141, - "flos": 59995132784640.0, - "grad_norm": 0.8146584091458852, - "language_loss": 0.56716478, - "learning_rate": 1.0482412113363536e-08, - "loss": 0.58720171, - "num_input_tokens_seen": 347510135, - "step": 16106, - "time_per_iteration": 3.2270421981811523 - }, - { - "auxiliary_loss_clip": 0.01008919, - "auxiliary_loss_mlp": 0.0100775, - "balance_loss_clip": 1.01353073, - "balance_loss_mlp": 1.00654626, - "epoch": 0.9684052307229821, - "flos": 52696145514240.0, - "grad_norm": 0.9121301732264848, - "language_loss": 0.61534059, - "learning_rate": 1.0442627607768707e-08, - "loss": 0.63550723, - "num_input_tokens_seen": 347562505, - "step": 16107, - "time_per_iteration": 3.1101765632629395 - }, - { - "auxiliary_loss_clip": 0.01098789, - "auxiliary_loss_mlp": 0.01035497, - "balance_loss_clip": 1.03623629, - "balance_loss_mlp": 1.02143073, - "epoch": 0.96846535397565, - "flos": 22783848520320.0, - "grad_norm": 2.462094722606181, - "language_loss": 0.74264908, - "learning_rate": 1.040291854638875e-08, - "loss": 0.76399195, - "num_input_tokens_seen": 347579150, - "step": 16108, - "time_per_iteration": 2.743326187133789 - }, - { - "auxiliary_loss_clip": 0.01093024, - "auxiliary_loss_mlp": 0.01027661, - "balance_loss_clip": 1.03480208, - "balance_loss_mlp": 1.01471508, - "epoch": 0.968525477228318, - "flos": 23323980309120.0, - "grad_norm": 2.296731755168933, - "language_loss": 0.56901729, - "learning_rate": 1.0363284930729576e-08, - "loss": 0.59022415, - "num_input_tokens_seen": 347596705, - "step": 16109, - "time_per_iteration": 5.880841255187988 - }, - { - "auxiliary_loss_clip": 0.01018006, - "auxiliary_loss_mlp": 0.01003432, - "balance_loss_clip": 1.0045774, - "balance_loss_mlp": 1.00251389, - "epoch": 0.9685856004809861, - "flos": 67882947707520.0, - "grad_norm": 0.6721206293032698, - "language_loss": 0.54183471, - "learning_rate": 1.0323726762294205e-08, - "loss": 0.56204915, - "num_input_tokens_seen": 347661870, - "step": 16110, - "time_per_iteration": 3.1392929553985596 - }, - { - "auxiliary_loss_clip": 0.01040803, - "auxiliary_loss_mlp": 0.01042377, - "balance_loss_clip": 1.03240716, - "balance_loss_mlp": 1.02792311, - "epoch": 0.968645723733654, - "flos": 33947900899200.0, - "grad_norm": 1.336975675669519, - "language_loss": 0.62198687, - "learning_rate": 1.0284244042582325e-08, - "loss": 0.64281869, - "num_input_tokens_seen": 347684295, - "step": 16111, - "time_per_iteration": 4.477367401123047 - }, - { - "auxiliary_loss_clip": 0.01084355, - "auxiliary_loss_mlp": 0.01029493, - "balance_loss_clip": 1.0346024, - "balance_loss_mlp": 1.01831102, - "epoch": 0.968705846986322, - "flos": 18551488890240.0, - "grad_norm": 1.8865919253091008, - "language_loss": 0.74626237, - "learning_rate": 1.024483677309118e-08, - "loss": 0.76740086, - "num_input_tokens_seen": 347702585, - "step": 16112, - "time_per_iteration": 2.6802995204925537 - }, - { - "auxiliary_loss_clip": 0.01096094, - "auxiliary_loss_mlp": 0.01029113, - "balance_loss_clip": 1.03605258, - "balance_loss_mlp": 1.01711464, - "epoch": 0.9687659702389899, - "flos": 17420913336960.0, - "grad_norm": 3.1829431624893, - "language_loss": 0.66342431, - "learning_rate": 1.020550495531558e-08, - "loss": 0.68467641, - "num_input_tokens_seen": 347721810, - "step": 16113, - "time_per_iteration": 2.6158058643341064 - }, - { - "auxiliary_loss_clip": 0.01016205, - "auxiliary_loss_mlp": 0.01002001, - "balance_loss_clip": 1.00489581, - "balance_loss_mlp": 1.00113058, - "epoch": 0.9688260934916579, - "flos": 62047176865920.0, - "grad_norm": 0.7821411345284778, - "language_loss": 0.56506634, - "learning_rate": 1.0166248590746329e-08, - "loss": 0.58524841, - "num_input_tokens_seen": 347782330, - "step": 16114, - "time_per_iteration": 3.127088785171509 - }, - { - "auxiliary_loss_clip": 0.01081645, - "auxiliary_loss_mlp": 0.01038103, - "balance_loss_clip": 1.03432035, - "balance_loss_mlp": 1.02492452, - "epoch": 0.9688862167443258, - "flos": 15076520461440.0, - "grad_norm": 1.9569215626202732, - "language_loss": 0.82965726, - "learning_rate": 1.0127067680872458e-08, - "loss": 0.85085475, - "num_input_tokens_seen": 347794835, - "step": 16115, - "time_per_iteration": 4.220075607299805 - }, - { - "auxiliary_loss_clip": 0.01092985, - "auxiliary_loss_mlp": 0.0102895, - "balance_loss_clip": 1.03631961, - "balance_loss_mlp": 1.01743448, - "epoch": 0.9689463399969939, - "flos": 19938215306880.0, - "grad_norm": 1.7648967305379544, - "language_loss": 0.72280598, - "learning_rate": 1.0087962227179448e-08, - "loss": 0.74402535, - "num_input_tokens_seen": 347814320, - "step": 16116, - "time_per_iteration": 2.603519916534424 - }, - { - "auxiliary_loss_clip": 0.01068294, - "auxiliary_loss_mlp": 0.01034519, - "balance_loss_clip": 1.03542447, - "balance_loss_mlp": 1.02141237, - "epoch": 0.9690064632496618, - "flos": 19573039687680.0, - "grad_norm": 2.086312122853078, - "language_loss": 0.75657129, - "learning_rate": 1.0048932231150553e-08, - "loss": 0.77759945, - "num_input_tokens_seen": 347832125, - "step": 16117, - "time_per_iteration": 2.6519157886505127 - }, - { - "auxiliary_loss_clip": 0.01109753, - "auxiliary_loss_mlp": 0.01030569, - "balance_loss_clip": 1.03619337, - "balance_loss_mlp": 1.01758695, - "epoch": 0.9690665865023298, - "flos": 21872292145920.0, - "grad_norm": 2.3448073541677275, - "language_loss": 0.77482766, - "learning_rate": 1.000997769426548e-08, - "loss": 0.79623091, - "num_input_tokens_seen": 347850765, - "step": 16118, - "time_per_iteration": 2.5268216133117676 - }, - { - "auxiliary_loss_clip": 0.0108528, - "auxiliary_loss_mlp": 0.00771043, - "balance_loss_clip": 1.03405607, - "balance_loss_mlp": 1.00030315, - "epoch": 0.9691267097549977, - "flos": 20994491577600.0, - "grad_norm": 1.8097325369712165, - "language_loss": 0.78219616, - "learning_rate": 9.971098618001272e-09, - "loss": 0.80075938, - "num_input_tokens_seen": 347870125, - "step": 16119, - "time_per_iteration": 2.629453659057617 - }, - { - "auxiliary_loss_clip": 0.01056904, - "auxiliary_loss_mlp": 0.01034502, - "balance_loss_clip": 1.03209758, - "balance_loss_mlp": 1.0223546, - "epoch": 0.9691868330076657, - "flos": 24279132816000.0, - "grad_norm": 1.885470946971698, - "language_loss": 0.75497305, - "learning_rate": 9.932295003832747e-09, - "loss": 0.77588713, - "num_input_tokens_seen": 347890615, - "step": 16120, - "time_per_iteration": 2.746344566345215 - }, - { - "auxiliary_loss_clip": 0.01097943, - "auxiliary_loss_mlp": 0.0103265, - "balance_loss_clip": 1.03581011, - "balance_loss_mlp": 1.02084804, - "epoch": 0.9692469562603336, - "flos": 17675699483520.0, - "grad_norm": 1.8693618447103497, - "language_loss": 0.70098805, - "learning_rate": 9.89356685323095e-09, - "loss": 0.72229403, - "num_input_tokens_seen": 347908685, - "step": 16121, - "time_per_iteration": 2.5618736743927 - }, - { - "auxiliary_loss_clip": 0.01094421, - "auxiliary_loss_mlp": 0.01032945, - "balance_loss_clip": 1.03476155, - "balance_loss_mlp": 1.02091098, - "epoch": 0.9693070795130017, - "flos": 26834392483200.0, - "grad_norm": 1.8372756092604514, - "language_loss": 0.69241065, - "learning_rate": 9.854914167664486e-09, - "loss": 0.71368432, - "num_input_tokens_seen": 347926385, - "step": 16122, - "time_per_iteration": 2.56386661529541 - }, - { - "auxiliary_loss_clip": 0.01066781, - "auxiliary_loss_mlp": 0.0103272, - "balance_loss_clip": 1.0308547, - "balance_loss_mlp": 1.02011967, - "epoch": 0.9693672027656697, - "flos": 18077288515200.0, - "grad_norm": 2.0146395561935058, - "language_loss": 0.7544961, - "learning_rate": 9.81633694859907e-09, - "loss": 0.77549112, - "num_input_tokens_seen": 347945290, - "step": 16123, - "time_per_iteration": 2.6407599449157715 - }, - { - "auxiliary_loss_clip": 0.01072153, - "auxiliary_loss_mlp": 0.01038605, - "balance_loss_clip": 1.03460908, - "balance_loss_mlp": 1.02459204, - "epoch": 0.9694273260183376, - "flos": 21763015994880.0, - "grad_norm": 1.5149029001764542, - "language_loss": 0.74644059, - "learning_rate": 9.777835197497753e-09, - "loss": 0.7675482, - "num_input_tokens_seen": 347966330, - "step": 16124, - "time_per_iteration": 2.671185255050659 - }, - { - "auxiliary_loss_clip": 0.01098188, - "auxiliary_loss_mlp": 0.01035267, - "balance_loss_clip": 1.0364728, - "balance_loss_mlp": 1.02335227, - "epoch": 0.9694874492710056, - "flos": 24426115269120.0, - "grad_norm": 2.520792760553443, - "language_loss": 0.74161977, - "learning_rate": 9.739408915820258e-09, - "loss": 0.76295435, - "num_input_tokens_seen": 347982590, - "step": 16125, - "time_per_iteration": 2.6353843212127686 - }, - { - "auxiliary_loss_clip": 0.01019443, - "auxiliary_loss_mlp": 0.01000194, - "balance_loss_clip": 1.00674295, - "balance_loss_mlp": 0.99920446, - "epoch": 0.9695475725236735, - "flos": 67650748237440.0, - "grad_norm": 0.8991349506597905, - "language_loss": 0.61446786, - "learning_rate": 9.70105810502364e-09, - "loss": 0.63466424, - "num_input_tokens_seen": 348043310, - "step": 16126, - "time_per_iteration": 3.190199851989746 - }, - { - "auxiliary_loss_clip": 0.01097272, - "auxiliary_loss_mlp": 0.01035813, - "balance_loss_clip": 1.03880358, - "balance_loss_mlp": 1.02390397, - "epoch": 0.9696076957763415, - "flos": 19129326981120.0, - "grad_norm": 1.964418438296789, - "language_loss": 0.75083786, - "learning_rate": 9.662782766562738e-09, - "loss": 0.77216876, - "num_input_tokens_seen": 348062200, - "step": 16127, - "time_per_iteration": 2.6186792850494385 - }, - { - "auxiliary_loss_clip": 0.01063108, - "auxiliary_loss_mlp": 0.01033166, - "balance_loss_clip": 1.03249013, - "balance_loss_mlp": 1.02036893, - "epoch": 0.9696678190290094, - "flos": 15486836497920.0, - "grad_norm": 1.6000021312142574, - "language_loss": 0.69262868, - "learning_rate": 9.62458290188839e-09, - "loss": 0.71359146, - "num_input_tokens_seen": 348080685, - "step": 16128, - "time_per_iteration": 2.6917450428009033 - }, - { - "auxiliary_loss_clip": 0.01076173, - "auxiliary_loss_mlp": 0.0103545, - "balance_loss_clip": 1.03701282, - "balance_loss_mlp": 1.02326083, - "epoch": 0.9697279422816775, - "flos": 36208692869760.0, - "grad_norm": 1.6481386717416904, - "language_loss": 0.65212297, - "learning_rate": 9.586458512449213e-09, - "loss": 0.67323917, - "num_input_tokens_seen": 348102500, - "step": 16129, - "time_per_iteration": 2.761218309402466 - }, - { - "auxiliary_loss_clip": 0.01076577, - "auxiliary_loss_mlp": 0.01032504, - "balance_loss_clip": 1.03635514, - "balance_loss_mlp": 1.01933169, - "epoch": 0.9697880655343454, - "flos": 25484007651840.0, - "grad_norm": 2.2154494130728852, - "language_loss": 0.6313777, - "learning_rate": 9.548409599691166e-09, - "loss": 0.6524685, - "num_input_tokens_seen": 348122515, - "step": 16130, - "time_per_iteration": 2.6841318607330322 - }, - { - "auxiliary_loss_clip": 0.01098965, - "auxiliary_loss_mlp": 0.01031057, - "balance_loss_clip": 1.0350318, - "balance_loss_mlp": 1.01859963, - "epoch": 0.9698481887870134, - "flos": 15333533251200.0, - "grad_norm": 2.2812543570754005, - "language_loss": 0.69271004, - "learning_rate": 9.510436165056867e-09, - "loss": 0.7140103, - "num_input_tokens_seen": 348138775, - "step": 16131, - "time_per_iteration": 2.5763492584228516 - }, - { - "auxiliary_loss_clip": 0.0110919, - "auxiliary_loss_mlp": 0.00770076, - "balance_loss_clip": 1.03628075, - "balance_loss_mlp": 1.00023901, - "epoch": 0.9699083120396813, - "flos": 21982250655360.0, - "grad_norm": 1.8419150080244562, - "language_loss": 0.76590043, - "learning_rate": 9.472538209986058e-09, - "loss": 0.78469312, - "num_input_tokens_seen": 348157115, - "step": 16132, - "time_per_iteration": 2.563215732574463 - }, - { - "auxiliary_loss_clip": 0.01075956, - "auxiliary_loss_mlp": 0.01038228, - "balance_loss_clip": 1.03480387, - "balance_loss_mlp": 1.02540684, - "epoch": 0.9699684352923493, - "flos": 15664055224320.0, - "grad_norm": 2.851724008499009, - "language_loss": 0.79010421, - "learning_rate": 9.434715735916477e-09, - "loss": 0.81124604, - "num_input_tokens_seen": 348173035, - "step": 16133, - "time_per_iteration": 2.623619794845581 - }, - { - "auxiliary_loss_clip": 0.01078402, - "auxiliary_loss_mlp": 0.01028089, - "balance_loss_clip": 1.03522897, - "balance_loss_mlp": 1.01685965, - "epoch": 0.9700285585450172, - "flos": 21908382336000.0, - "grad_norm": 2.3483627644840444, - "language_loss": 0.64470112, - "learning_rate": 9.396968744281863e-09, - "loss": 0.66576606, - "num_input_tokens_seen": 348192960, - "step": 16134, - "time_per_iteration": 2.6657004356384277 - }, - { - "auxiliary_loss_clip": 0.01083734, - "auxiliary_loss_mlp": 0.01032972, - "balance_loss_clip": 1.03266311, - "balance_loss_mlp": 1.01973999, - "epoch": 0.9700886817976853, - "flos": 23914890950400.0, - "grad_norm": 1.8798527935954052, - "language_loss": 0.80912268, - "learning_rate": 9.359297236513519e-09, - "loss": 0.83028972, - "num_input_tokens_seen": 348212805, - "step": 16135, - "time_per_iteration": 2.744619131088257 - }, - { - "auxiliary_loss_clip": 0.01099551, - "auxiliary_loss_mlp": 0.01032267, - "balance_loss_clip": 1.03586113, - "balance_loss_mlp": 1.01880264, - "epoch": 0.9701488050503532, - "flos": 25447845634560.0, - "grad_norm": 1.8400261223826226, - "language_loss": 0.7311669, - "learning_rate": 9.321701214040079e-09, - "loss": 0.7524851, - "num_input_tokens_seen": 348232900, - "step": 16136, - "time_per_iteration": 2.6270158290863037 - }, - { - "auxiliary_loss_clip": 0.01106517, - "auxiliary_loss_mlp": 0.0103259, - "balance_loss_clip": 1.03631723, - "balance_loss_mlp": 1.02158737, - "epoch": 0.9702089283030212, - "flos": 20590855470720.0, - "grad_norm": 1.723008357652219, - "language_loss": 0.7604568, - "learning_rate": 9.28418067828729e-09, - "loss": 0.78184789, - "num_input_tokens_seen": 348253065, - "step": 16137, - "time_per_iteration": 2.611590623855591 - }, - { - "auxiliary_loss_clip": 0.0099169, - "auxiliary_loss_mlp": 0.01002259, - "balance_loss_clip": 1.01290679, - "balance_loss_mlp": 1.00113201, - "epoch": 0.9702690515556892, - "flos": 70651516291200.0, - "grad_norm": 0.7712451352581947, - "language_loss": 0.54897171, - "learning_rate": 9.246735630678015e-09, - "loss": 0.56891119, - "num_input_tokens_seen": 348316075, - "step": 16138, - "time_per_iteration": 3.3687798976898193 - }, - { - "auxiliary_loss_clip": 0.01087536, - "auxiliary_loss_mlp": 0.01031679, - "balance_loss_clip": 1.03544235, - "balance_loss_mlp": 1.02002001, - "epoch": 0.9703291748083571, - "flos": 35881439034240.0, - "grad_norm": 1.941978950715942, - "language_loss": 0.7094661, - "learning_rate": 9.209366072632007e-09, - "loss": 0.73065829, - "num_input_tokens_seen": 348337605, - "step": 16139, - "time_per_iteration": 2.725593328475952 - }, - { - "auxiliary_loss_clip": 0.01100195, - "auxiliary_loss_mlp": 0.01032609, - "balance_loss_clip": 1.03781474, - "balance_loss_mlp": 1.01973999, - "epoch": 0.9703892980610251, - "flos": 24316479982080.0, - "grad_norm": 1.5269759850750149, - "language_loss": 0.72774076, - "learning_rate": 9.172072005566134e-09, - "loss": 0.7490688, - "num_input_tokens_seen": 348359430, - "step": 16140, - "time_per_iteration": 2.6335747241973877 - }, - { - "auxiliary_loss_clip": 0.01102225, - "auxiliary_loss_mlp": 0.00771179, - "balance_loss_clip": 1.03837323, - "balance_loss_mlp": 1.00030136, - "epoch": 0.970449421313693, - "flos": 18003743418240.0, - "grad_norm": 2.2771543266487586, - "language_loss": 0.67710316, - "learning_rate": 9.13485343089504e-09, - "loss": 0.6958372, - "num_input_tokens_seen": 348377890, - "step": 16141, - "time_per_iteration": 2.588693141937256 - }, - { - "auxiliary_loss_clip": 0.01093094, - "auxiliary_loss_mlp": 0.01033621, - "balance_loss_clip": 1.03493285, - "balance_loss_mlp": 1.02134275, - "epoch": 0.9705095445663611, - "flos": 25337994865920.0, - "grad_norm": 2.0049530138805856, - "language_loss": 0.69002879, - "learning_rate": 9.097710350029597e-09, - "loss": 0.71129596, - "num_input_tokens_seen": 348396550, - "step": 16142, - "time_per_iteration": 2.727897882461548 - }, - { - "auxiliary_loss_clip": 0.01052884, - "auxiliary_loss_mlp": 0.01032081, - "balance_loss_clip": 1.03308058, - "balance_loss_mlp": 1.01940298, - "epoch": 0.970569667819029, - "flos": 26833602384000.0, - "grad_norm": 1.764320667349442, - "language_loss": 0.55796802, - "learning_rate": 9.060642764378457e-09, - "loss": 0.57881761, - "num_input_tokens_seen": 348417120, - "step": 16143, - "time_per_iteration": 2.7790820598602295 - }, - { - "auxiliary_loss_clip": 0.01097025, - "auxiliary_loss_mlp": 0.01033348, - "balance_loss_clip": 1.0362649, - "balance_loss_mlp": 1.02201712, - "epoch": 0.970629791071697, - "flos": 25848644567040.0, - "grad_norm": 2.1311740509263157, - "language_loss": 0.67920631, - "learning_rate": 9.023650675347382e-09, - "loss": 0.70051003, - "num_input_tokens_seen": 348437750, - "step": 16144, - "time_per_iteration": 2.6120004653930664 - }, - { - "auxiliary_loss_clip": 0.01096108, - "auxiliary_loss_mlp": 0.0103709, - "balance_loss_clip": 1.03683603, - "balance_loss_mlp": 1.0254854, - "epoch": 0.9706899143243649, - "flos": 36540184510080.0, - "grad_norm": 1.6337482713348195, - "language_loss": 0.71880758, - "learning_rate": 8.986734084339253e-09, - "loss": 0.74013954, - "num_input_tokens_seen": 348460935, - "step": 16145, - "time_per_iteration": 2.7305266857147217 - }, - { - "auxiliary_loss_clip": 0.0108585, - "auxiliary_loss_mlp": 0.0102977, - "balance_loss_clip": 1.03421783, - "balance_loss_mlp": 1.01635957, - "epoch": 0.9707500375770329, - "flos": 12268234414080.0, - "grad_norm": 5.028438763995754, - "language_loss": 0.80458283, - "learning_rate": 8.949892992753395e-09, - "loss": 0.82573903, - "num_input_tokens_seen": 348474480, - "step": 16146, - "time_per_iteration": 2.6035280227661133 - }, - { - "auxiliary_loss_clip": 0.00997894, - "auxiliary_loss_mlp": 0.0100175, - "balance_loss_clip": 1.00757813, - "balance_loss_mlp": 1.00062394, - "epoch": 0.9708101608297008, - "flos": 60853040196480.0, - "grad_norm": 0.7531682380125572, - "language_loss": 0.54495502, - "learning_rate": 8.91312740198713e-09, - "loss": 0.56495154, - "num_input_tokens_seen": 348541220, - "step": 16147, - "time_per_iteration": 3.225588798522949 - }, - { - "auxiliary_loss_clip": 0.01073097, - "auxiliary_loss_mlp": 0.00771677, - "balance_loss_clip": 1.03335106, - "balance_loss_mlp": 1.00021338, - "epoch": 0.9708702840823689, - "flos": 27124766029440.0, - "grad_norm": 3.20403684561858, - "language_loss": 0.61148691, - "learning_rate": 8.876437313434682e-09, - "loss": 0.62993467, - "num_input_tokens_seen": 348559230, - "step": 16148, - "time_per_iteration": 4.195791482925415 - }, - { - "auxiliary_loss_clip": 0.01070921, - "auxiliary_loss_mlp": 0.01038429, - "balance_loss_clip": 1.03563893, - "balance_loss_mlp": 1.02597761, - "epoch": 0.9709304073350368, - "flos": 20777699041920.0, - "grad_norm": 1.6574498866467469, - "language_loss": 0.73563087, - "learning_rate": 8.839822728487155e-09, - "loss": 0.75672436, - "num_input_tokens_seen": 348577850, - "step": 16149, - "time_per_iteration": 4.327805519104004 - }, - { - "auxiliary_loss_clip": 0.01096097, - "auxiliary_loss_mlp": 0.01036533, - "balance_loss_clip": 1.03510022, - "balance_loss_mlp": 1.02391517, - "epoch": 0.9709905305877048, - "flos": 41934541115520.0, - "grad_norm": 2.151336781292665, - "language_loss": 0.75191128, - "learning_rate": 8.803283648533222e-09, - "loss": 0.77323759, - "num_input_tokens_seen": 348598345, - "step": 16150, - "time_per_iteration": 4.396034479141235 - }, - { - "auxiliary_loss_clip": 0.0109299, - "auxiliary_loss_mlp": 0.01030493, - "balance_loss_clip": 1.03820729, - "balance_loss_mlp": 1.01590753, - "epoch": 0.9710506538403728, - "flos": 17165588486400.0, - "grad_norm": 1.9672912428051808, - "language_loss": 0.73628724, - "learning_rate": 8.766820074958214e-09, - "loss": 0.75752205, - "num_input_tokens_seen": 348616300, - "step": 16151, - "time_per_iteration": 2.6692330837249756 - }, - { - "auxiliary_loss_clip": 0.0109559, - "auxiliary_loss_mlp": 0.01028623, - "balance_loss_clip": 1.03646886, - "balance_loss_mlp": 1.01655281, - "epoch": 0.9711107770930407, - "flos": 21173470070400.0, - "grad_norm": 2.2868567232439787, - "language_loss": 0.74524468, - "learning_rate": 8.730432009145027e-09, - "loss": 0.76648676, - "num_input_tokens_seen": 348633845, - "step": 16152, - "time_per_iteration": 2.639920473098755 - }, - { - "auxiliary_loss_clip": 0.0107224, - "auxiliary_loss_mlp": 0.01033418, - "balance_loss_clip": 1.03668654, - "balance_loss_mlp": 1.02151465, - "epoch": 0.9711709003457087, - "flos": 22237072715520.0, - "grad_norm": 1.850919590804903, - "language_loss": 0.67173874, - "learning_rate": 8.694119452473448e-09, - "loss": 0.69279528, - "num_input_tokens_seen": 348653070, - "step": 16153, - "time_per_iteration": 2.69380521774292 - }, - { - "auxiliary_loss_clip": 0.01048504, - "auxiliary_loss_mlp": 0.01029289, - "balance_loss_clip": 1.03318441, - "balance_loss_mlp": 1.01809549, - "epoch": 0.9712310235983767, - "flos": 26213856099840.0, - "grad_norm": 13.061634148388642, - "language_loss": 0.70930749, - "learning_rate": 8.65788240632037e-09, - "loss": 0.73008543, - "num_input_tokens_seen": 348672145, - "step": 16154, - "time_per_iteration": 4.310068607330322 - }, - { - "auxiliary_loss_clip": 0.01063978, - "auxiliary_loss_mlp": 0.01032173, - "balance_loss_clip": 1.04066324, - "balance_loss_mlp": 1.01833844, - "epoch": 0.9712911468510447, - "flos": 20668171495680.0, - "grad_norm": 1.6587681231692977, - "language_loss": 0.80700165, - "learning_rate": 8.621720872059812e-09, - "loss": 0.82796311, - "num_input_tokens_seen": 348690615, - "step": 16155, - "time_per_iteration": 2.7987523078918457 - }, - { - "auxiliary_loss_clip": 0.01098298, - "auxiliary_loss_mlp": 0.00771693, - "balance_loss_clip": 1.03783345, - "balance_loss_mlp": 1.00030363, - "epoch": 0.9713512701037126, - "flos": 13552903313280.0, - "grad_norm": 1.945476752267927, - "language_loss": 0.6769433, - "learning_rate": 8.58563485106334e-09, - "loss": 0.69564319, - "num_input_tokens_seen": 348708665, - "step": 16156, - "time_per_iteration": 2.679084062576294 - }, - { - "auxiliary_loss_clip": 0.01098233, - "auxiliary_loss_mlp": 0.01031321, - "balance_loss_clip": 1.03533268, - "balance_loss_mlp": 1.01955533, - "epoch": 0.9714113933563806, - "flos": 25848752307840.0, - "grad_norm": 2.586712346196416, - "language_loss": 0.9075287, - "learning_rate": 8.54962434469919e-09, - "loss": 0.92882419, - "num_input_tokens_seen": 348726105, - "step": 16157, - "time_per_iteration": 2.6537325382232666 - }, - { - "auxiliary_loss_clip": 0.01071902, - "auxiliary_loss_mlp": 0.00770052, - "balance_loss_clip": 1.03688991, - "balance_loss_mlp": 1.00026488, - "epoch": 0.9714715166090485, - "flos": 12743081233920.0, - "grad_norm": 1.749407686127521, - "language_loss": 0.72465503, - "learning_rate": 8.513689354332721e-09, - "loss": 0.74307454, - "num_input_tokens_seen": 348743360, - "step": 16158, - "time_per_iteration": 2.7036380767822266 - }, - { - "auxiliary_loss_clip": 0.01059022, - "auxiliary_loss_mlp": 0.01037853, - "balance_loss_clip": 1.03384304, - "balance_loss_mlp": 1.02509737, - "epoch": 0.9715316398617165, - "flos": 18405547931520.0, - "grad_norm": 2.013888583799996, - "language_loss": 0.60360491, - "learning_rate": 8.477829881326836e-09, - "loss": 0.62457371, - "num_input_tokens_seen": 348759045, - "step": 16159, - "time_per_iteration": 2.6209466457366943 - }, - { - "auxiliary_loss_clip": 0.01103648, - "auxiliary_loss_mlp": 0.01025108, - "balance_loss_clip": 1.03575277, - "balance_loss_mlp": 1.01424837, - "epoch": 0.9715917631143844, - "flos": 28913799749760.0, - "grad_norm": 1.651339792325088, - "language_loss": 0.78989285, - "learning_rate": 8.44204592704112e-09, - "loss": 0.81118041, - "num_input_tokens_seen": 348779910, - "step": 16160, - "time_per_iteration": 2.5234336853027344 - }, - { - "auxiliary_loss_clip": 0.01027371, - "auxiliary_loss_mlp": 0.01000477, - "balance_loss_clip": 1.00497746, - "balance_loss_mlp": 0.99951786, - "epoch": 0.9716518863670525, - "flos": 65939712900480.0, - "grad_norm": 0.7683763573739155, - "language_loss": 0.54203629, - "learning_rate": 8.406337492832704e-09, - "loss": 0.56231475, - "num_input_tokens_seen": 348838995, - "step": 16161, - "time_per_iteration": 3.0858347415924072 - }, - { - "auxiliary_loss_clip": 0.01094745, - "auxiliary_loss_mlp": 0.00769904, - "balance_loss_clip": 1.03734314, - "balance_loss_mlp": 1.00019956, - "epoch": 0.9717120096197204, - "flos": 17712759340800.0, - "grad_norm": 1.8388776753499438, - "language_loss": 0.72078347, - "learning_rate": 8.3707045800554e-09, - "loss": 0.73942995, - "num_input_tokens_seen": 348858090, - "step": 16162, - "time_per_iteration": 2.4713857173919678 - }, - { - "auxiliary_loss_clip": 0.01070522, - "auxiliary_loss_mlp": 0.010289, - "balance_loss_clip": 1.03172445, - "balance_loss_mlp": 1.01611447, - "epoch": 0.9717721328723884, - "flos": 24463426521600.0, - "grad_norm": 1.6638325085203318, - "language_loss": 0.78620613, - "learning_rate": 8.335147190060787e-09, - "loss": 0.80720031, - "num_input_tokens_seen": 348877885, - "step": 16163, - "time_per_iteration": 2.6069257259368896 - }, - { - "auxiliary_loss_clip": 0.01083213, - "auxiliary_loss_mlp": 0.01027578, - "balance_loss_clip": 1.03707957, - "balance_loss_mlp": 1.01624179, - "epoch": 0.9718322561250564, - "flos": 20776477979520.0, - "grad_norm": 2.364704456697354, - "language_loss": 0.72864258, - "learning_rate": 8.299665324196903e-09, - "loss": 0.74975049, - "num_input_tokens_seen": 348897720, - "step": 16164, - "time_per_iteration": 2.6400234699249268 - }, - { - "auxiliary_loss_clip": 0.01045604, - "auxiliary_loss_mlp": 0.01044632, - "balance_loss_clip": 1.03097391, - "balance_loss_mlp": 1.029773, - "epoch": 0.9718923793777243, - "flos": 19025904746880.0, - "grad_norm": 1.8541776614197814, - "language_loss": 0.83818543, - "learning_rate": 8.264258983809114e-09, - "loss": 0.85908771, - "num_input_tokens_seen": 348915410, - "step": 16165, - "time_per_iteration": 2.729191303253174 - }, - { - "auxiliary_loss_clip": 0.01071333, - "auxiliary_loss_mlp": 0.01027443, - "balance_loss_clip": 1.03399253, - "balance_loss_mlp": 1.01615393, - "epoch": 0.9719525026303923, - "flos": 21871717528320.0, - "grad_norm": 2.4684136710713664, - "language_loss": 0.79201269, - "learning_rate": 8.228928170240345e-09, - "loss": 0.81300044, - "num_input_tokens_seen": 348934335, - "step": 16166, - "time_per_iteration": 2.6733477115631104 - }, - { - "auxiliary_loss_clip": 0.01084172, - "auxiliary_loss_mlp": 0.01027293, - "balance_loss_clip": 1.03812957, - "balance_loss_mlp": 1.01548481, - "epoch": 0.9720126258830603, - "flos": 14429303251200.0, - "grad_norm": 1.7663595445124196, - "language_loss": 0.70758253, - "learning_rate": 8.193672884830195e-09, - "loss": 0.72869724, - "num_input_tokens_seen": 348952405, - "step": 16167, - "time_per_iteration": 2.7085564136505127 - }, - { - "auxiliary_loss_clip": 0.01079731, - "auxiliary_loss_mlp": 0.01035805, - "balance_loss_clip": 1.03778422, - "balance_loss_mlp": 1.02379441, - "epoch": 0.9720727491357283, - "flos": 26251167352320.0, - "grad_norm": 1.8138771680519867, - "language_loss": 0.75927782, - "learning_rate": 8.158493128915812e-09, - "loss": 0.78043312, - "num_input_tokens_seen": 348973580, - "step": 16168, - "time_per_iteration": 2.67354154586792 - }, - { - "auxiliary_loss_clip": 0.01049039, - "auxiliary_loss_mlp": 0.01050689, - "balance_loss_clip": 1.03055644, - "balance_loss_mlp": 1.03582323, - "epoch": 0.9721328723883962, - "flos": 22674105492480.0, - "grad_norm": 2.5093639466048896, - "language_loss": 0.72537249, - "learning_rate": 8.123388903830797e-09, - "loss": 0.74636978, - "num_input_tokens_seen": 348992035, - "step": 16169, - "time_per_iteration": 2.7542500495910645 - }, - { - "auxiliary_loss_clip": 0.01073449, - "auxiliary_loss_mlp": 0.01038056, - "balance_loss_clip": 1.03180361, - "balance_loss_mlp": 1.02368569, - "epoch": 0.9721929956410642, - "flos": 28074172360320.0, - "grad_norm": 1.7172146559968202, - "language_loss": 0.57560009, - "learning_rate": 8.088360210906309e-09, - "loss": 0.59671509, - "num_input_tokens_seen": 349013160, - "step": 16170, - "time_per_iteration": 2.784191370010376 - }, - { - "auxiliary_loss_clip": 0.01075999, - "auxiliary_loss_mlp": 0.01032463, - "balance_loss_clip": 1.03437006, - "balance_loss_mlp": 1.01930237, - "epoch": 0.9722531188937321, - "flos": 20996251344000.0, - "grad_norm": 1.991276787299532, - "language_loss": 0.71702683, - "learning_rate": 8.053407051471062e-09, - "loss": 0.7381115, - "num_input_tokens_seen": 349033485, - "step": 16171, - "time_per_iteration": 2.7290470600128174 - }, - { - "auxiliary_loss_clip": 0.01074193, - "auxiliary_loss_mlp": 0.01036428, - "balance_loss_clip": 1.03374755, - "balance_loss_mlp": 1.02373838, - "epoch": 0.9723132421464001, - "flos": 16070600332800.0, - "grad_norm": 3.7050893371500973, - "language_loss": 0.68799138, - "learning_rate": 8.018529426850218e-09, - "loss": 0.70909762, - "num_input_tokens_seen": 349051705, - "step": 16172, - "time_per_iteration": 2.7984087467193604 - }, - { - "auxiliary_loss_clip": 0.01092548, - "auxiliary_loss_mlp": 0.01030417, - "balance_loss_clip": 1.03369451, - "balance_loss_mlp": 1.01790619, - "epoch": 0.972373365399068, - "flos": 27745769289600.0, - "grad_norm": 2.273393003122684, - "language_loss": 0.85909021, - "learning_rate": 7.983727338366274e-09, - "loss": 0.88031983, - "num_input_tokens_seen": 349070825, - "step": 16173, - "time_per_iteration": 2.637646198272705 - }, - { - "auxiliary_loss_clip": 0.01058492, - "auxiliary_loss_mlp": 0.01037401, - "balance_loss_clip": 1.03226995, - "balance_loss_mlp": 1.02290511, - "epoch": 0.9724334886517361, - "flos": 23002939526400.0, - "grad_norm": 2.532344740288213, - "language_loss": 0.64345253, - "learning_rate": 7.949000787339289e-09, - "loss": 0.66441143, - "num_input_tokens_seen": 349089730, - "step": 16174, - "time_per_iteration": 2.6890182495117188 - }, - { - "auxiliary_loss_clip": 0.0109623, - "auxiliary_loss_mlp": 0.01029844, - "balance_loss_clip": 1.03573728, - "balance_loss_mlp": 1.01808977, - "epoch": 0.972493611904404, - "flos": 25447055535360.0, - "grad_norm": 1.5574440695217635, - "language_loss": 0.78149283, - "learning_rate": 7.914349775085538e-09, - "loss": 0.80275363, - "num_input_tokens_seen": 349111315, - "step": 16175, - "time_per_iteration": 2.65380597114563 - }, - { - "auxiliary_loss_clip": 0.01098527, - "auxiliary_loss_mlp": 0.0103633, - "balance_loss_clip": 1.03696692, - "balance_loss_mlp": 1.02305567, - "epoch": 0.972553735157072, - "flos": 16983054547200.0, - "grad_norm": 2.4406961253744637, - "language_loss": 0.56965649, - "learning_rate": 7.879774302919307e-09, - "loss": 0.59100509, - "num_input_tokens_seen": 349129495, - "step": 16176, - "time_per_iteration": 2.564636707305908 - }, - { - "auxiliary_loss_clip": 0.01088801, - "auxiliary_loss_mlp": 0.01032276, - "balance_loss_clip": 1.03812397, - "balance_loss_mlp": 1.02081394, - "epoch": 0.97261385840974, - "flos": 26104651776000.0, - "grad_norm": 2.4918025895156557, - "language_loss": 0.72519267, - "learning_rate": 7.845274372151545e-09, - "loss": 0.74640346, - "num_input_tokens_seen": 349148850, - "step": 16177, - "time_per_iteration": 2.677704334259033 - }, - { - "auxiliary_loss_clip": 0.01087782, - "auxiliary_loss_mlp": 0.01029248, - "balance_loss_clip": 1.03436661, - "balance_loss_mlp": 1.01660562, - "epoch": 0.9726739816624079, - "flos": 25447881548160.0, - "grad_norm": 1.6303663037965777, - "language_loss": 0.68360388, - "learning_rate": 7.810849984090984e-09, - "loss": 0.70477414, - "num_input_tokens_seen": 349167620, - "step": 16178, - "time_per_iteration": 2.6498606204986572 - }, - { - "auxiliary_loss_clip": 0.01054589, - "auxiliary_loss_mlp": 0.01032086, - "balance_loss_clip": 1.03061843, - "balance_loss_mlp": 1.01890159, - "epoch": 0.972734104915076, - "flos": 29014923513600.0, - "grad_norm": 1.7151954479923888, - "language_loss": 0.66904575, - "learning_rate": 7.776501140042358e-09, - "loss": 0.68991244, - "num_input_tokens_seen": 349185845, - "step": 16179, - "time_per_iteration": 2.9617762565612793 - }, - { - "auxiliary_loss_clip": 0.01083826, - "auxiliary_loss_mlp": 0.00768898, - "balance_loss_clip": 1.03630555, - "balance_loss_mlp": 1.0001514, - "epoch": 0.9727942281677439, - "flos": 23437637919360.0, - "grad_norm": 2.630214780518977, - "language_loss": 0.77113461, - "learning_rate": 7.742227841308624e-09, - "loss": 0.78966182, - "num_input_tokens_seen": 349204525, - "step": 16180, - "time_per_iteration": 2.6464152336120605 - }, - { - "auxiliary_loss_clip": 0.01098634, - "auxiliary_loss_mlp": 0.01030814, - "balance_loss_clip": 1.03539276, - "balance_loss_mlp": 1.01826119, - "epoch": 0.9728543514204119, - "flos": 31724599749120.0, - "grad_norm": 2.262434982216008, - "language_loss": 0.76220429, - "learning_rate": 7.708030089189188e-09, - "loss": 0.78349876, - "num_input_tokens_seen": 349228075, - "step": 16181, - "time_per_iteration": 2.677198648452759 - }, - { - "auxiliary_loss_clip": 0.01106677, - "auxiliary_loss_mlp": 0.01035405, - "balance_loss_clip": 1.03586745, - "balance_loss_mlp": 1.02323365, - "epoch": 0.9729144746730798, - "flos": 16289368116480.0, - "grad_norm": 1.5196924475010567, - "language_loss": 0.63252479, - "learning_rate": 7.67390788498079e-09, - "loss": 0.65394562, - "num_input_tokens_seen": 349246990, - "step": 16182, - "time_per_iteration": 2.554809093475342 - }, - { - "auxiliary_loss_clip": 0.01041817, - "auxiliary_loss_mlp": 0.01042152, - "balance_loss_clip": 1.04146159, - "balance_loss_mlp": 1.0289433, - "epoch": 0.9729745979257478, - "flos": 25041408266880.0, - "grad_norm": 1.789194263856678, - "language_loss": 0.62447584, - "learning_rate": 7.639861229977507e-09, - "loss": 0.64531553, - "num_input_tokens_seen": 349265890, - "step": 16183, - "time_per_iteration": 3.175109624862671 - }, - { - "auxiliary_loss_clip": 0.01085962, - "auxiliary_loss_mlp": 0.01037692, - "balance_loss_clip": 1.03510141, - "balance_loss_mlp": 1.02473438, - "epoch": 0.9730347211784157, - "flos": 22638733574400.0, - "grad_norm": 1.6456930738589919, - "language_loss": 0.78234679, - "learning_rate": 7.605890125470527e-09, - "loss": 0.80358338, - "num_input_tokens_seen": 349285275, - "step": 16184, - "time_per_iteration": 2.9018943309783936 - }, - { - "auxiliary_loss_clip": 0.01068538, - "auxiliary_loss_mlp": 0.01033694, - "balance_loss_clip": 1.03069115, - "balance_loss_mlp": 1.02024758, - "epoch": 0.9730948444310837, - "flos": 10998613313280.0, - "grad_norm": 2.161376757576218, - "language_loss": 0.79345584, - "learning_rate": 7.571994572747709e-09, - "loss": 0.8144781, - "num_input_tokens_seen": 349301515, - "step": 16185, - "time_per_iteration": 2.641317129135132 - }, - { - "auxiliary_loss_clip": 0.01077077, - "auxiliary_loss_mlp": 0.01028307, - "balance_loss_clip": 1.03456235, - "balance_loss_mlp": 1.01660085, - "epoch": 0.9731549676837516, - "flos": 16799479113600.0, - "grad_norm": 2.015706111725158, - "language_loss": 0.77789813, - "learning_rate": 7.538174573094469e-09, - "loss": 0.79895198, - "num_input_tokens_seen": 349319590, - "step": 16186, - "time_per_iteration": 2.698368787765503 - }, - { - "auxiliary_loss_clip": 0.01084734, - "auxiliary_loss_mlp": 0.01029196, - "balance_loss_clip": 1.0357089, - "balance_loss_mlp": 1.01675642, - "epoch": 0.9732150909364197, - "flos": 21141761339520.0, - "grad_norm": 1.7572799383983544, - "language_loss": 0.65494901, - "learning_rate": 7.504430127793337e-09, - "loss": 0.67608833, - "num_input_tokens_seen": 349339230, - "step": 16187, - "time_per_iteration": 4.130638122558594 - }, - { - "auxiliary_loss_clip": 0.01079645, - "auxiliary_loss_mlp": 0.01038619, - "balance_loss_clip": 1.03164029, - "balance_loss_mlp": 1.02523208, - "epoch": 0.9732752141890876, - "flos": 33727337435520.0, - "grad_norm": 1.8431543667714356, - "language_loss": 0.80137229, - "learning_rate": 7.47076123812418e-09, - "loss": 0.82255495, - "num_input_tokens_seen": 349361155, - "step": 16188, - "time_per_iteration": 4.257014989852905 - }, - { - "auxiliary_loss_clip": 0.01072207, - "auxiliary_loss_mlp": 0.0103015, - "balance_loss_clip": 1.03375018, - "balance_loss_mlp": 1.01883137, - "epoch": 0.9733353374417556, - "flos": 23404384903680.0, - "grad_norm": 1.7664281085479938, - "language_loss": 0.78316271, - "learning_rate": 7.437167905363084e-09, - "loss": 0.80418628, - "num_input_tokens_seen": 349379335, - "step": 16189, - "time_per_iteration": 2.675529718399048 - }, - { - "auxiliary_loss_clip": 0.01092046, - "auxiliary_loss_mlp": 0.0102785, - "balance_loss_clip": 1.03294408, - "balance_loss_mlp": 1.01514196, - "epoch": 0.9733954606944236, - "flos": 39165792963840.0, - "grad_norm": 1.7197781596757225, - "language_loss": 0.51230407, - "learning_rate": 7.403650130784367e-09, - "loss": 0.533503, - "num_input_tokens_seen": 349401575, - "step": 16190, - "time_per_iteration": 4.908695459365845 - }, - { - "auxiliary_loss_clip": 0.01098154, - "auxiliary_loss_mlp": 0.01030754, - "balance_loss_clip": 1.03689873, - "balance_loss_mlp": 1.01865995, - "epoch": 0.9734555839470915, - "flos": 21981819692160.0, - "grad_norm": 1.7152390443101855, - "language_loss": 0.80948341, - "learning_rate": 7.3702079156590105e-09, - "loss": 0.83077252, - "num_input_tokens_seen": 349420650, - "step": 16191, - "time_per_iteration": 2.6668500900268555 - }, - { - "auxiliary_loss_clip": 0.01091143, - "auxiliary_loss_mlp": 0.01031598, - "balance_loss_clip": 1.03202808, - "balance_loss_mlp": 1.01971912, - "epoch": 0.9735157071997596, - "flos": 16575539771520.0, - "grad_norm": 1.6910464048805458, - "language_loss": 0.8259176, - "learning_rate": 7.336841261255111e-09, - "loss": 0.84714502, - "num_input_tokens_seen": 349436830, - "step": 16192, - "time_per_iteration": 2.569251537322998 - }, - { - "auxiliary_loss_clip": 0.01046721, - "auxiliary_loss_mlp": 0.01039813, - "balance_loss_clip": 1.03544569, - "balance_loss_mlp": 1.02665234, - "epoch": 0.9735758304524275, - "flos": 20223237726720.0, - "grad_norm": 1.8106504266161225, - "language_loss": 0.74773109, - "learning_rate": 7.303550168837658e-09, - "loss": 0.76859641, - "num_input_tokens_seen": 349454325, - "step": 16193, - "time_per_iteration": 4.564434051513672 - }, - { - "auxiliary_loss_clip": 0.01079567, - "auxiliary_loss_mlp": 0.01031423, - "balance_loss_clip": 1.03505838, - "balance_loss_mlp": 1.020015, - "epoch": 0.9736359537050955, - "flos": 23653353047040.0, - "grad_norm": 1.8191654334710798, - "language_loss": 0.85254693, - "learning_rate": 7.270334639669417e-09, - "loss": 0.87365687, - "num_input_tokens_seen": 349470230, - "step": 16194, - "time_per_iteration": 2.687668561935425 - }, - { - "auxiliary_loss_clip": 0.01070428, - "auxiliary_loss_mlp": 0.01037259, - "balance_loss_clip": 1.03369021, - "balance_loss_mlp": 1.02468801, - "epoch": 0.9736960769577634, - "flos": 15560202026880.0, - "grad_norm": 1.6441349965800172, - "language_loss": 0.75818932, - "learning_rate": 7.237194675009828e-09, - "loss": 0.77926624, - "num_input_tokens_seen": 349486250, - "step": 16195, - "time_per_iteration": 2.6451404094696045 - }, - { - "auxiliary_loss_clip": 0.01004847, - "auxiliary_loss_mlp": 0.01000872, - "balance_loss_clip": 1.00990903, - "balance_loss_mlp": 0.99979365, - "epoch": 0.9737562002104314, - "flos": 65351783088000.0, - "grad_norm": 0.708245154030494, - "language_loss": 0.52467954, - "learning_rate": 7.204130276115439e-09, - "loss": 0.54473674, - "num_input_tokens_seen": 349545865, - "step": 16196, - "time_per_iteration": 3.186091184616089 - }, - { - "auxiliary_loss_clip": 0.01084909, - "auxiliary_loss_mlp": 0.0103142, - "balance_loss_clip": 1.03660226, - "balance_loss_mlp": 1.01945114, - "epoch": 0.9738163234630993, - "flos": 27196730928000.0, - "grad_norm": 2.030454883195646, - "language_loss": 0.7627387, - "learning_rate": 7.171141444240136e-09, - "loss": 0.78390199, - "num_input_tokens_seen": 349566080, - "step": 16197, - "time_per_iteration": 2.8780059814453125 - }, - { - "auxiliary_loss_clip": 0.0111131, - "auxiliary_loss_mlp": 0.01028167, - "balance_loss_clip": 1.03635693, - "balance_loss_mlp": 1.01535797, - "epoch": 0.9738764467157673, - "flos": 21069365477760.0, - "grad_norm": 1.7142052132721648, - "language_loss": 0.67503351, - "learning_rate": 7.13822818063492e-09, - "loss": 0.69642824, - "num_input_tokens_seen": 349585665, - "step": 16198, - "time_per_iteration": 2.689474582672119 - }, - { - "auxiliary_loss_clip": 0.01107297, - "auxiliary_loss_mlp": 0.01031978, - "balance_loss_clip": 1.03572273, - "balance_loss_mlp": 1.01887083, - "epoch": 0.9739365699684353, - "flos": 21361211481600.0, - "grad_norm": 1.916549844614904, - "language_loss": 0.78117663, - "learning_rate": 7.10539048654768e-09, - "loss": 0.80256933, - "num_input_tokens_seen": 349605125, - "step": 16199, - "time_per_iteration": 2.5536978244781494 - }, - { - "auxiliary_loss_clip": 0.0108445, - "auxiliary_loss_mlp": 0.01035036, - "balance_loss_clip": 1.03713942, - "balance_loss_mlp": 1.02260256, - "epoch": 0.9739966932211033, - "flos": 21902061542400.0, - "grad_norm": 1.9705409422067974, - "language_loss": 0.79409683, - "learning_rate": 7.072628363223865e-09, - "loss": 0.81529176, - "num_input_tokens_seen": 349623360, - "step": 16200, - "time_per_iteration": 2.6256768703460693 - }, - { - "auxiliary_loss_clip": 0.01058782, - "auxiliary_loss_mlp": 0.01035774, - "balance_loss_clip": 1.03694201, - "balance_loss_mlp": 1.02227926, - "epoch": 0.9740568164737712, - "flos": 24827345164800.0, - "grad_norm": 2.0331349042288878, - "language_loss": 0.68434143, - "learning_rate": 7.039941811905592e-09, - "loss": 0.70528698, - "num_input_tokens_seen": 349644390, - "step": 16201, - "time_per_iteration": 2.8037257194519043 - }, - { - "auxiliary_loss_clip": 0.01075577, - "auxiliary_loss_mlp": 0.01033677, - "balance_loss_clip": 1.03323948, - "balance_loss_mlp": 1.02163649, - "epoch": 0.9741169397264392, - "flos": 23623583650560.0, - "grad_norm": 1.5025292618741064, - "language_loss": 0.72862577, - "learning_rate": 7.0073308338325364e-09, - "loss": 0.74971825, - "num_input_tokens_seen": 349663200, - "step": 16202, - "time_per_iteration": 2.662804126739502 - }, - { - "auxiliary_loss_clip": 0.0108729, - "auxiliary_loss_mlp": 0.01034456, - "balance_loss_clip": 1.03576303, - "balance_loss_mlp": 1.02150416, - "epoch": 0.9741770629791072, - "flos": 18841144164480.0, - "grad_norm": 2.6824456959299052, - "language_loss": 0.72871369, - "learning_rate": 6.974795430241265e-09, - "loss": 0.74993122, - "num_input_tokens_seen": 349681975, - "step": 16203, - "time_per_iteration": 2.5910871028900146 - }, - { - "auxiliary_loss_clip": 0.01109424, - "auxiliary_loss_mlp": 0.01033432, - "balance_loss_clip": 1.03729725, - "balance_loss_mlp": 1.02117181, - "epoch": 0.9742371862317751, - "flos": 22346241125760.0, - "grad_norm": 1.9882435140281416, - "language_loss": 0.77292311, - "learning_rate": 6.942335602365235e-09, - "loss": 0.7943517, - "num_input_tokens_seen": 349701185, - "step": 16204, - "time_per_iteration": 2.599534273147583 - }, - { - "auxiliary_loss_clip": 0.01091233, - "auxiliary_loss_mlp": 0.01034218, - "balance_loss_clip": 1.03830349, - "balance_loss_mlp": 1.02127194, - "epoch": 0.9742973094844432, - "flos": 21762764599680.0, - "grad_norm": 2.04301514318933, - "language_loss": 0.79557073, - "learning_rate": 6.909951351435905e-09, - "loss": 0.81682527, - "num_input_tokens_seen": 349720360, - "step": 16205, - "time_per_iteration": 2.611509323120117 - }, - { - "auxiliary_loss_clip": 0.01106984, - "auxiliary_loss_mlp": 0.01033558, - "balance_loss_clip": 1.03618968, - "balance_loss_mlp": 1.02133942, - "epoch": 0.9743574327371111, - "flos": 26248725227520.0, - "grad_norm": 1.7129263404714312, - "language_loss": 0.74342418, - "learning_rate": 6.87764267868074e-09, - "loss": 0.76482964, - "num_input_tokens_seen": 349741040, - "step": 16206, - "time_per_iteration": 2.5808560848236084 - }, - { - "auxiliary_loss_clip": 0.01055158, - "auxiliary_loss_mlp": 0.01032161, - "balance_loss_clip": 1.03472948, - "balance_loss_mlp": 1.019382, - "epoch": 0.9744175559897791, - "flos": 12349321367040.0, - "grad_norm": 2.3020742472105375, - "language_loss": 0.83948338, - "learning_rate": 6.8454095853252015e-09, - "loss": 0.86035663, - "num_input_tokens_seen": 349758895, - "step": 16207, - "time_per_iteration": 3.118260622024536 - }, - { - "auxiliary_loss_clip": 0.01096985, - "auxiliary_loss_mlp": 0.0103392, - "balance_loss_clip": 1.03668702, - "balance_loss_mlp": 1.0217663, - "epoch": 0.974477679242447, - "flos": 28397834835840.0, - "grad_norm": 1.7142608213779496, - "language_loss": 0.71005446, - "learning_rate": 6.813252072591425e-09, - "loss": 0.73136348, - "num_input_tokens_seen": 349779740, - "step": 16208, - "time_per_iteration": 2.631779909133911 - }, - { - "auxiliary_loss_clip": 0.01068659, - "auxiliary_loss_mlp": 0.01025995, - "balance_loss_clip": 1.03373158, - "balance_loss_mlp": 1.01523638, - "epoch": 0.974537802495115, - "flos": 17785370684160.0, - "grad_norm": 1.6324180098602632, - "language_loss": 0.77270913, - "learning_rate": 6.781170141698878e-09, - "loss": 0.79365563, - "num_input_tokens_seen": 349796820, - "step": 16209, - "time_per_iteration": 2.648383617401123 - }, - { - "auxiliary_loss_clip": 0.01070166, - "auxiliary_loss_mlp": 0.0077274, - "balance_loss_clip": 1.03177297, - "balance_loss_mlp": 1.0000906, - "epoch": 0.9745979257477829, - "flos": 23842315520640.0, - "grad_norm": 1.7959688952091581, - "language_loss": 0.79134548, - "learning_rate": 6.749163793864144e-09, - "loss": 0.80977452, - "num_input_tokens_seen": 349816550, - "step": 16210, - "time_per_iteration": 2.693124294281006 - }, - { - "auxiliary_loss_clip": 0.01082394, - "auxiliary_loss_mlp": 0.01035247, - "balance_loss_clip": 1.03368235, - "balance_loss_mlp": 1.02293837, - "epoch": 0.9746580490004509, - "flos": 27016172236800.0, - "grad_norm": 2.111674380643122, - "language_loss": 0.7811175, - "learning_rate": 6.7172330303009176e-09, - "loss": 0.80229396, - "num_input_tokens_seen": 349834350, - "step": 16211, - "time_per_iteration": 2.7423813343048096 - }, - { - "auxiliary_loss_clip": 0.01074533, - "auxiliary_loss_mlp": 0.0103461, - "balance_loss_clip": 1.03468013, - "balance_loss_mlp": 1.02106786, - "epoch": 0.9747181722531189, - "flos": 19792022952960.0, - "grad_norm": 2.2346090535911345, - "language_loss": 0.78309953, - "learning_rate": 6.685377852219787e-09, - "loss": 0.80419093, - "num_input_tokens_seen": 349853460, - "step": 16212, - "time_per_iteration": 2.7550909519195557 - }, - { - "auxiliary_loss_clip": 0.01076477, - "auxiliary_loss_mlp": 0.01032217, - "balance_loss_clip": 1.03523839, - "balance_loss_mlp": 1.02030253, - "epoch": 0.9747782955057869, - "flos": 31430598929280.0, - "grad_norm": 1.4958012465208934, - "language_loss": 0.79993176, - "learning_rate": 6.653598260829118e-09, - "loss": 0.8210187, - "num_input_tokens_seen": 349874830, - "step": 16213, - "time_per_iteration": 2.8637707233428955 - }, - { - "auxiliary_loss_clip": 0.01062528, - "auxiliary_loss_mlp": 0.01026041, - "balance_loss_clip": 1.03252554, - "balance_loss_mlp": 1.01400709, - "epoch": 0.9748384187584548, - "flos": 15961288268160.0, - "grad_norm": 1.9405763574770338, - "language_loss": 0.66294038, - "learning_rate": 6.6218942573335044e-09, - "loss": 0.68382609, - "num_input_tokens_seen": 349893690, - "step": 16214, - "time_per_iteration": 2.699460029602051 - }, - { - "auxiliary_loss_clip": 0.01095715, - "auxiliary_loss_mlp": 0.01030057, - "balance_loss_clip": 1.04145873, - "balance_loss_mlp": 1.01690817, - "epoch": 0.9748985420111228, - "flos": 20558715776640.0, - "grad_norm": 1.7124616404956563, - "language_loss": 0.73894978, - "learning_rate": 6.5902658429355386e-09, - "loss": 0.76020747, - "num_input_tokens_seen": 349912480, - "step": 16215, - "time_per_iteration": 2.6812703609466553 - }, - { - "auxiliary_loss_clip": 0.01057347, - "auxiliary_loss_mlp": 0.01034106, - "balance_loss_clip": 1.03506923, - "balance_loss_mlp": 1.02194071, - "epoch": 0.9749586652637908, - "flos": 36721605127680.0, - "grad_norm": 2.1409618688352583, - "language_loss": 0.6697464, - "learning_rate": 6.558713018834483e-09, - "loss": 0.69066095, - "num_input_tokens_seen": 349932470, - "step": 16216, - "time_per_iteration": 2.8369500637054443 - }, - { - "auxiliary_loss_clip": 0.01053374, - "auxiliary_loss_mlp": 0.01031983, - "balance_loss_clip": 1.03023767, - "balance_loss_mlp": 1.01911426, - "epoch": 0.9750187885164587, - "flos": 10999223844480.0, - "grad_norm": 2.017970280416665, - "language_loss": 0.71706629, - "learning_rate": 6.527235786226937e-09, - "loss": 0.73791993, - "num_input_tokens_seen": 349949060, - "step": 16217, - "time_per_iteration": 2.7381694316864014 - }, - { - "auxiliary_loss_clip": 0.01074463, - "auxiliary_loss_mlp": 0.01028184, - "balance_loss_clip": 1.03641343, - "balance_loss_mlp": 1.01587594, - "epoch": 0.9750789117691268, - "flos": 25739512070400.0, - "grad_norm": 1.6438610736190364, - "language_loss": 0.78195894, - "learning_rate": 6.495834146306167e-09, - "loss": 0.80298543, - "num_input_tokens_seen": 349968010, - "step": 16218, - "time_per_iteration": 2.7634236812591553 - }, - { - "auxiliary_loss_clip": 0.01079204, - "auxiliary_loss_mlp": 0.01029255, - "balance_loss_clip": 1.03390265, - "balance_loss_mlp": 1.01689315, - "epoch": 0.9751390350217947, - "flos": 13333955961600.0, - "grad_norm": 2.591457126969395, - "language_loss": 0.77337241, - "learning_rate": 6.464508100263222e-09, - "loss": 0.79445708, - "num_input_tokens_seen": 349985270, - "step": 16219, - "time_per_iteration": 2.7380733489990234 - }, - { - "auxiliary_loss_clip": 0.01087952, - "auxiliary_loss_mlp": 0.01032908, - "balance_loss_clip": 1.03535342, - "balance_loss_mlp": 1.02096331, - "epoch": 0.9751991582744627, - "flos": 22820621068800.0, - "grad_norm": 1.7048563405563817, - "language_loss": 0.81480777, - "learning_rate": 6.433257649285817e-09, - "loss": 0.83601636, - "num_input_tokens_seen": 350003935, - "step": 16220, - "time_per_iteration": 2.6495344638824463 - }, - { - "auxiliary_loss_clip": 0.01106693, - "auxiliary_loss_mlp": 0.01032081, - "balance_loss_clip": 1.03613138, - "balance_loss_mlp": 1.02025533, - "epoch": 0.9752592815271306, - "flos": 19646189735040.0, - "grad_norm": 1.7107968412659516, - "language_loss": 0.75237, - "learning_rate": 6.402082794559227e-09, - "loss": 0.77375782, - "num_input_tokens_seen": 350023595, - "step": 16221, - "time_per_iteration": 2.5049870014190674 - }, - { - "auxiliary_loss_clip": 0.01072645, - "auxiliary_loss_mlp": 0.01031441, - "balance_loss_clip": 1.0333364, - "balance_loss_mlp": 1.01963329, - "epoch": 0.9753194047797986, - "flos": 26690462686080.0, - "grad_norm": 1.478633421454376, - "language_loss": 0.66371262, - "learning_rate": 6.370983537265395e-09, - "loss": 0.68475342, - "num_input_tokens_seen": 350045920, - "step": 16222, - "time_per_iteration": 2.7511966228485107 - }, - { - "auxiliary_loss_clip": 0.0109569, - "auxiliary_loss_mlp": 0.01029193, - "balance_loss_clip": 1.03627598, - "balance_loss_mlp": 1.01753998, - "epoch": 0.9753795280324665, - "flos": 23221779137280.0, - "grad_norm": 1.9485164555129428, - "language_loss": 0.8856619, - "learning_rate": 6.3399598785836004e-09, - "loss": 0.90691066, - "num_input_tokens_seen": 350063925, - "step": 16223, - "time_per_iteration": 2.864657163619995 - }, - { - "auxiliary_loss_clip": 0.01045431, - "auxiliary_loss_mlp": 0.01035052, - "balance_loss_clip": 1.03191626, - "balance_loss_mlp": 1.02308941, - "epoch": 0.9754396512851345, - "flos": 19463835363840.0, - "grad_norm": 1.8028212337000589, - "language_loss": 0.74985182, - "learning_rate": 6.309011819690457e-09, - "loss": 0.7706567, - "num_input_tokens_seen": 350080900, - "step": 16224, - "time_per_iteration": 2.7734134197235107 - }, - { - "auxiliary_loss_clip": 0.01010696, - "auxiliary_loss_mlp": 0.0100273, - "balance_loss_clip": 1.00753188, - "balance_loss_mlp": 1.00170505, - "epoch": 0.9754997745378025, - "flos": 68459313340800.0, - "grad_norm": 0.8348178291134782, - "language_loss": 0.5909391, - "learning_rate": 6.278139361759249e-09, - "loss": 0.61107337, - "num_input_tokens_seen": 350144550, - "step": 16225, - "time_per_iteration": 3.203700065612793 - }, - { - "auxiliary_loss_clip": 0.01075593, - "auxiliary_loss_mlp": 0.00770137, - "balance_loss_clip": 1.03655672, - "balance_loss_mlp": 1.00027668, - "epoch": 0.9755598977904705, - "flos": 26395168976640.0, - "grad_norm": 2.1280321736121364, - "language_loss": 0.68929291, - "learning_rate": 6.247342505960818e-09, - "loss": 0.7077502, - "num_input_tokens_seen": 350164050, - "step": 16226, - "time_per_iteration": 2.7182259559631348 - }, - { - "auxiliary_loss_clip": 0.01094266, - "auxiliary_loss_mlp": 0.01042676, - "balance_loss_clip": 1.03407538, - "balance_loss_mlp": 1.02954507, - "epoch": 0.9756200210431384, - "flos": 16617663446400.0, - "grad_norm": 1.92516234582211, - "language_loss": 0.82812244, - "learning_rate": 6.216621253462894e-09, - "loss": 0.84949183, - "num_input_tokens_seen": 350181350, - "step": 16227, - "time_per_iteration": 4.278809547424316 - }, - { - "auxiliary_loss_clip": 0.01106745, - "auxiliary_loss_mlp": 0.01029012, - "balance_loss_clip": 1.03587723, - "balance_loss_mlp": 1.01710916, - "epoch": 0.9756801442958064, - "flos": 23623044946560.0, - "grad_norm": 1.6986847521997988, - "language_loss": 0.77753866, - "learning_rate": 6.185975605430549e-09, - "loss": 0.79889619, - "num_input_tokens_seen": 350199765, - "step": 16228, - "time_per_iteration": 4.098712205886841 - }, - { - "auxiliary_loss_clip": 0.01018838, - "auxiliary_loss_mlp": 0.01000083, - "balance_loss_clip": 1.00571454, - "balance_loss_mlp": 0.99909353, - "epoch": 0.9757402675484744, - "flos": 61625799440640.0, - "grad_norm": 0.84298125837055, - "language_loss": 0.55775201, - "learning_rate": 6.155405563025962e-09, - "loss": 0.57794118, - "num_input_tokens_seen": 350256420, - "step": 16229, - "time_per_iteration": 4.671915292739868 - }, - { - "auxiliary_loss_clip": 0.01097026, - "auxiliary_loss_mlp": 0.01031698, - "balance_loss_clip": 1.03510642, - "balance_loss_mlp": 1.01906228, - "epoch": 0.9758003908011423, - "flos": 24058964401920.0, - "grad_norm": 1.6630448372353723, - "language_loss": 0.74857068, - "learning_rate": 6.124911127407984e-09, - "loss": 0.76985788, - "num_input_tokens_seen": 350276270, - "step": 16230, - "time_per_iteration": 2.637298822402954 - }, - { - "auxiliary_loss_clip": 0.01080882, - "auxiliary_loss_mlp": 0.01029958, - "balance_loss_clip": 1.03464866, - "balance_loss_mlp": 1.01841259, - "epoch": 0.9758605140538104, - "flos": 17493093717120.0, - "grad_norm": 2.3627285859767992, - "language_loss": 0.72050405, - "learning_rate": 6.094492299733245e-09, - "loss": 0.74161243, - "num_input_tokens_seen": 350295000, - "step": 16231, - "time_per_iteration": 2.606243133544922 - }, - { - "auxiliary_loss_clip": 0.01087789, - "auxiliary_loss_mlp": 0.01032046, - "balance_loss_clip": 1.03723931, - "balance_loss_mlp": 1.019225, - "epoch": 0.9759206373064783, - "flos": 24826950115200.0, - "grad_norm": 1.897185559618263, - "language_loss": 0.76273429, - "learning_rate": 6.064149081155267e-09, - "loss": 0.78393269, - "num_input_tokens_seen": 350314980, - "step": 16232, - "time_per_iteration": 4.806816816329956 - }, - { - "auxiliary_loss_clip": 0.01007054, - "auxiliary_loss_mlp": 0.00999094, - "balance_loss_clip": 1.0077014, - "balance_loss_mlp": 0.99789584, - "epoch": 0.9759807605591463, - "flos": 68161182456960.0, - "grad_norm": 0.7408233152349849, - "language_loss": 0.53817546, - "learning_rate": 6.033881472824465e-09, - "loss": 0.55823696, - "num_input_tokens_seen": 350371985, - "step": 16233, - "time_per_iteration": 3.143988847732544 - }, - { - "auxiliary_loss_clip": 0.01108543, - "auxiliary_loss_mlp": 0.0103539, - "balance_loss_clip": 1.03642726, - "balance_loss_mlp": 1.02313495, - "epoch": 0.9760408838118142, - "flos": 18989239939200.0, - "grad_norm": 1.8846866025891749, - "language_loss": 0.71843183, - "learning_rate": 6.003689475888807e-09, - "loss": 0.7398712, - "num_input_tokens_seen": 350390590, - "step": 16234, - "time_per_iteration": 2.5556411743164062 - }, - { - "auxiliary_loss_clip": 0.01098185, - "auxiliary_loss_mlp": 0.0103115, - "balance_loss_clip": 1.03558266, - "balance_loss_mlp": 1.01847827, - "epoch": 0.9761010070644822, - "flos": 17125978763520.0, - "grad_norm": 2.9772233129130825, - "language_loss": 0.79668027, - "learning_rate": 5.973573091493156e-09, - "loss": 0.81797361, - "num_input_tokens_seen": 350403770, - "step": 16235, - "time_per_iteration": 2.5155222415924072 - }, - { - "auxiliary_loss_clip": 0.0109002, - "auxiliary_loss_mlp": 0.01034289, - "balance_loss_clip": 1.03545177, - "balance_loss_mlp": 1.02070475, - "epoch": 0.9761611303171501, - "flos": 22052599441920.0, - "grad_norm": 2.420578950017542, - "language_loss": 0.7674818, - "learning_rate": 5.943532320779265e-09, - "loss": 0.7887249, - "num_input_tokens_seen": 350421870, - "step": 16236, - "time_per_iteration": 2.641690731048584 - }, - { - "auxiliary_loss_clip": 0.01096794, - "auxiliary_loss_mlp": 0.01027707, - "balance_loss_clip": 1.03507769, - "balance_loss_mlp": 1.01571465, - "epoch": 0.9762212535698181, - "flos": 21757521214080.0, - "grad_norm": 3.537228150180641, - "language_loss": 0.75424302, - "learning_rate": 5.913567164886446e-09, - "loss": 0.77548802, - "num_input_tokens_seen": 350440025, - "step": 16237, - "time_per_iteration": 2.5821526050567627 - }, - { - "auxiliary_loss_clip": 0.01061626, - "auxiliary_loss_mlp": 0.01037494, - "balance_loss_clip": 1.03076112, - "balance_loss_mlp": 1.02306354, - "epoch": 0.9762813768224861, - "flos": 25921615046400.0, - "grad_norm": 1.5766064307721592, - "language_loss": 0.72649348, - "learning_rate": 5.8836776249509e-09, - "loss": 0.74748468, - "num_input_tokens_seen": 350459435, - "step": 16238, - "time_per_iteration": 2.716170072555542 - }, - { - "auxiliary_loss_clip": 0.01090292, - "auxiliary_loss_mlp": 0.00771217, - "balance_loss_clip": 1.03843439, - "balance_loss_mlp": 1.0002383, - "epoch": 0.9763415000751541, - "flos": 24051853509120.0, - "grad_norm": 2.1577792438101646, - "language_loss": 0.83911026, - "learning_rate": 5.8538637021063875e-09, - "loss": 0.85772538, - "num_input_tokens_seen": 350472655, - "step": 16239, - "time_per_iteration": 2.7628393173217773 - }, - { - "auxiliary_loss_clip": 0.01067831, - "auxiliary_loss_mlp": 0.01043243, - "balance_loss_clip": 1.03342855, - "balance_loss_mlp": 1.02861595, - "epoch": 0.976401623327822, - "flos": 17018677860480.0, - "grad_norm": 3.0421443760450266, - "language_loss": 0.60336649, - "learning_rate": 5.824125397483115e-09, - "loss": 0.62447721, - "num_input_tokens_seen": 350488160, - "step": 16240, - "time_per_iteration": 2.6417906284332275 - }, - { - "auxiliary_loss_clip": 0.01069406, - "auxiliary_loss_mlp": 0.01029004, - "balance_loss_clip": 1.0350244, - "balance_loss_mlp": 1.01704097, - "epoch": 0.97646174658049, - "flos": 16106941918080.0, - "grad_norm": 1.952892588808636, - "language_loss": 0.82362419, - "learning_rate": 5.7944627122088474e-09, - "loss": 0.84460825, - "num_input_tokens_seen": 350506065, - "step": 16241, - "time_per_iteration": 2.6529223918914795 - }, - { - "auxiliary_loss_clip": 0.01069965, - "auxiliary_loss_mlp": 0.01037931, - "balance_loss_clip": 1.03480554, - "balance_loss_mlp": 1.02566481, - "epoch": 0.9765218698331579, - "flos": 21252725429760.0, - "grad_norm": 1.9374011472646437, - "language_loss": 0.83271652, - "learning_rate": 5.764875647408463e-09, - "loss": 0.85379553, - "num_input_tokens_seen": 350524495, - "step": 16242, - "time_per_iteration": 2.7075135707855225 - }, - { - "auxiliary_loss_clip": 0.01097999, - "auxiliary_loss_mlp": 0.01027861, - "balance_loss_clip": 1.03740764, - "balance_loss_mlp": 1.01545691, - "epoch": 0.9765819930858259, - "flos": 18588045957120.0, - "grad_norm": 1.5885372986539104, - "language_loss": 0.75616562, - "learning_rate": 5.7353642042037294e-09, - "loss": 0.77742422, - "num_input_tokens_seen": 350544185, - "step": 16243, - "time_per_iteration": 2.8476173877716064 - }, - { - "auxiliary_loss_clip": 0.01096151, - "auxiliary_loss_mlp": 0.0103737, - "balance_loss_clip": 1.03450549, - "balance_loss_mlp": 1.02472222, - "epoch": 0.976642116338494, - "flos": 20266833859200.0, - "grad_norm": 1.629420195076715, - "language_loss": 0.70183492, - "learning_rate": 5.705928383713754e-09, - "loss": 0.72317016, - "num_input_tokens_seen": 350562675, - "step": 16244, - "time_per_iteration": 2.648705244064331 - }, - { - "auxiliary_loss_clip": 0.01090661, - "auxiliary_loss_mlp": 0.01030766, - "balance_loss_clip": 1.03870106, - "balance_loss_mlp": 1.01780796, - "epoch": 0.9767022395911619, - "flos": 25550477769600.0, - "grad_norm": 1.816908720128117, - "language_loss": 0.83598977, - "learning_rate": 5.676568187055197e-09, - "loss": 0.85720408, - "num_input_tokens_seen": 350581535, - "step": 16245, - "time_per_iteration": 2.7069408893585205 - }, - { - "auxiliary_loss_clip": 0.01056812, - "auxiliary_loss_mlp": 0.01028217, - "balance_loss_clip": 1.03245211, - "balance_loss_mlp": 1.0164988, - "epoch": 0.9767623628438299, - "flos": 21762656858880.0, - "grad_norm": 1.6507047411461764, - "language_loss": 0.78559917, - "learning_rate": 5.647283615340726e-09, - "loss": 0.80644941, - "num_input_tokens_seen": 350601615, - "step": 16246, - "time_per_iteration": 2.766493558883667 - }, - { - "auxiliary_loss_clip": 0.01101377, - "auxiliary_loss_mlp": 0.01033211, - "balance_loss_clip": 1.03545284, - "balance_loss_mlp": 1.02206457, - "epoch": 0.9768224860964978, - "flos": 15851114277120.0, - "grad_norm": 1.4053965502785082, - "language_loss": 0.74026012, - "learning_rate": 5.6180746696812275e-09, - "loss": 0.76160598, - "num_input_tokens_seen": 350619580, - "step": 16247, - "time_per_iteration": 2.56381893157959 - }, - { - "auxiliary_loss_clip": 0.01053333, - "auxiliary_loss_mlp": 0.01033648, - "balance_loss_clip": 1.03346825, - "balance_loss_mlp": 1.02078581, - "epoch": 0.9768826093491658, - "flos": 25151151294720.0, - "grad_norm": 4.399397738721356, - "language_loss": 0.79704082, - "learning_rate": 5.58894135118404e-09, - "loss": 0.81791055, - "num_input_tokens_seen": 350640015, - "step": 16248, - "time_per_iteration": 2.8011584281921387 - }, - { - "auxiliary_loss_clip": 0.01049095, - "auxiliary_loss_mlp": 0.01046497, - "balance_loss_clip": 1.03563344, - "balance_loss_mlp": 1.03080893, - "epoch": 0.9769427326018337, - "flos": 22967028904320.0, - "grad_norm": 1.8090514517823602, - "language_loss": 0.79385042, - "learning_rate": 5.559883660954278e-09, - "loss": 0.81480634, - "num_input_tokens_seen": 350659155, - "step": 16249, - "time_per_iteration": 2.7398455142974854 - }, - { - "auxiliary_loss_clip": 0.01092723, - "auxiliary_loss_mlp": 0.01035301, - "balance_loss_clip": 1.03559923, - "balance_loss_mlp": 1.02318323, - "epoch": 0.9770028558545018, - "flos": 15264297786240.0, - "grad_norm": 1.9233029914398667, - "language_loss": 0.66280472, - "learning_rate": 5.530901600093507e-09, - "loss": 0.68408501, - "num_input_tokens_seen": 350676615, - "step": 16250, - "time_per_iteration": 2.556757688522339 - }, - { - "auxiliary_loss_clip": 0.01027067, - "auxiliary_loss_mlp": 0.01001957, - "balance_loss_clip": 1.00477159, - "balance_loss_mlp": 1.0009917, - "epoch": 0.9770629791071697, - "flos": 71450348808960.0, - "grad_norm": 0.7726336256949028, - "language_loss": 0.59797513, - "learning_rate": 5.501995169700846e-09, - "loss": 0.61826539, - "num_input_tokens_seen": 350736805, - "step": 16251, - "time_per_iteration": 3.1876869201660156 - }, - { - "auxiliary_loss_clip": 0.01093869, - "auxiliary_loss_mlp": 0.01031057, - "balance_loss_clip": 1.03425992, - "balance_loss_mlp": 1.01817012, - "epoch": 0.9771231023598377, - "flos": 22412854897920.0, - "grad_norm": 1.7259349246741458, - "language_loss": 0.78470027, - "learning_rate": 5.473164370872307e-09, - "loss": 0.80594945, - "num_input_tokens_seen": 350753600, - "step": 16252, - "time_per_iteration": 2.606030225753784 - }, - { - "auxiliary_loss_clip": 0.01090281, - "auxiliary_loss_mlp": 0.01034389, - "balance_loss_clip": 1.0339278, - "balance_loss_mlp": 1.02142549, - "epoch": 0.9771832256125056, - "flos": 19025940660480.0, - "grad_norm": 2.220084536547545, - "language_loss": 0.64542538, - "learning_rate": 5.444409204701461e-09, - "loss": 0.66667211, - "num_input_tokens_seen": 350771225, - "step": 16253, - "time_per_iteration": 2.5694305896759033 - }, - { - "auxiliary_loss_clip": 0.01101639, - "auxiliary_loss_mlp": 0.0103353, - "balance_loss_clip": 1.03819561, - "balance_loss_mlp": 1.01936197, - "epoch": 0.9772433488651736, - "flos": 17822143232640.0, - "grad_norm": 2.129791137286544, - "language_loss": 0.7626065, - "learning_rate": 5.415729672278324e-09, - "loss": 0.7839582, - "num_input_tokens_seen": 350789100, - "step": 16254, - "time_per_iteration": 2.6212127208709717 - }, - { - "auxiliary_loss_clip": 0.0110148, - "auxiliary_loss_mlp": 0.01031853, - "balance_loss_clip": 1.03698289, - "balance_loss_mlp": 1.0193603, - "epoch": 0.9773034721178415, - "flos": 37629785623680.0, - "grad_norm": 1.8907694352431208, - "language_loss": 0.63917691, - "learning_rate": 5.387125774690471e-09, - "loss": 0.66051024, - "num_input_tokens_seen": 350811085, - "step": 16255, - "time_per_iteration": 2.7545289993286133 - }, - { - "auxiliary_loss_clip": 0.01080709, - "auxiliary_loss_mlp": 0.00771506, - "balance_loss_clip": 1.03611016, - "balance_loss_mlp": 1.0002296, - "epoch": 0.9773635953705095, - "flos": 20302457172480.0, - "grad_norm": 1.5349410458335684, - "language_loss": 0.75715804, - "learning_rate": 5.358597513023033e-09, - "loss": 0.77568018, - "num_input_tokens_seen": 350831065, - "step": 16256, - "time_per_iteration": 2.718520164489746 - }, - { - "auxiliary_loss_clip": 0.01107482, - "auxiliary_loss_mlp": 0.010355, - "balance_loss_clip": 1.0382638, - "balance_loss_mlp": 1.02249467, - "epoch": 0.9774237186231776, - "flos": 22309253095680.0, - "grad_norm": 4.923302241947984, - "language_loss": 0.77929807, - "learning_rate": 5.330144888357369e-09, - "loss": 0.80072796, - "num_input_tokens_seen": 350849675, - "step": 16257, - "time_per_iteration": 2.578667163848877 - }, - { - "auxiliary_loss_clip": 0.01092876, - "auxiliary_loss_mlp": 0.01032433, - "balance_loss_clip": 1.03653014, - "balance_loss_mlp": 1.01965332, - "epoch": 0.9774838418758455, - "flos": 24204905360640.0, - "grad_norm": 1.5736578002879344, - "language_loss": 0.75143224, - "learning_rate": 5.301767901772391e-09, - "loss": 0.77268535, - "num_input_tokens_seen": 350868955, - "step": 16258, - "time_per_iteration": 2.679143190383911 - }, - { - "auxiliary_loss_clip": 0.01019519, - "auxiliary_loss_mlp": 0.01001235, - "balance_loss_clip": 1.00671029, - "balance_loss_mlp": 1.00025165, - "epoch": 0.9775439651285135, - "flos": 66357139829760.0, - "grad_norm": 0.6768337597392673, - "language_loss": 0.59785736, - "learning_rate": 5.273466554344353e-09, - "loss": 0.61806488, - "num_input_tokens_seen": 350935110, - "step": 16259, - "time_per_iteration": 3.1992921829223633 - }, - { - "auxiliary_loss_clip": 0.01093161, - "auxiliary_loss_mlp": 0.0103161, - "balance_loss_clip": 1.03717732, - "balance_loss_mlp": 1.01865828, - "epoch": 0.9776040883811814, - "flos": 22601565976320.0, - "grad_norm": 1.55851171401808, - "language_loss": 0.73553669, - "learning_rate": 5.2452408471461705e-09, - "loss": 0.75678444, - "num_input_tokens_seen": 350953220, - "step": 16260, - "time_per_iteration": 2.639127731323242 - }, - { - "auxiliary_loss_clip": 0.01098909, - "auxiliary_loss_mlp": 0.01032118, - "balance_loss_clip": 1.03642654, - "balance_loss_mlp": 1.01898706, - "epoch": 0.9776642116338494, - "flos": 18442176825600.0, - "grad_norm": 1.9328284113468908, - "language_loss": 0.79923123, - "learning_rate": 5.2170907812485456e-09, - "loss": 0.8205415, - "num_input_tokens_seen": 350971915, - "step": 16261, - "time_per_iteration": 2.5895767211914062 - }, - { - "auxiliary_loss_clip": 0.01099762, - "auxiliary_loss_mlp": 0.01026055, - "balance_loss_clip": 1.03615069, - "balance_loss_mlp": 1.01340127, - "epoch": 0.9777243348865173, - "flos": 22638446265600.0, - "grad_norm": 2.216385126324637, - "language_loss": 0.74283129, - "learning_rate": 5.189016357718845e-09, - "loss": 0.76408947, - "num_input_tokens_seen": 350990470, - "step": 16262, - "time_per_iteration": 2.5935211181640625 - }, - { - "auxiliary_loss_clip": 0.01098991, - "auxiliary_loss_mlp": 0.01033569, - "balance_loss_clip": 1.03628135, - "balance_loss_mlp": 1.01945508, - "epoch": 0.9777844581391854, - "flos": 31321394605440.0, - "grad_norm": 2.37269608442218, - "language_loss": 0.70012951, - "learning_rate": 5.16101757762133e-09, - "loss": 0.7214551, - "num_input_tokens_seen": 351010755, - "step": 16263, - "time_per_iteration": 2.8126862049102783 - }, - { - "auxiliary_loss_clip": 0.01098892, - "auxiliary_loss_mlp": 0.01029577, - "balance_loss_clip": 1.03766048, - "balance_loss_mlp": 1.01819265, - "epoch": 0.9778445813918533, - "flos": 23039101543680.0, - "grad_norm": 2.465472735999823, - "language_loss": 0.66363978, - "learning_rate": 5.133094442018038e-09, - "loss": 0.68492448, - "num_input_tokens_seen": 351029965, - "step": 16264, - "time_per_iteration": 2.6721160411834717 - }, - { - "auxiliary_loss_clip": 0.01063654, - "auxiliary_loss_mlp": 0.01031661, - "balance_loss_clip": 1.03675711, - "balance_loss_mlp": 1.01770782, - "epoch": 0.9779047046445213, - "flos": 17566351505280.0, - "grad_norm": 1.8950171968116294, - "language_loss": 0.73092592, - "learning_rate": 5.105246951967679e-09, - "loss": 0.7518791, - "num_input_tokens_seen": 351046205, - "step": 16265, - "time_per_iteration": 2.7303049564361572 - }, - { - "auxiliary_loss_clip": 0.01095694, - "auxiliary_loss_mlp": 0.01031211, - "balance_loss_clip": 1.03564298, - "balance_loss_mlp": 1.01908779, - "epoch": 0.9779648278971892, - "flos": 20741141975040.0, - "grad_norm": 1.771422811009788, - "language_loss": 0.68976378, - "learning_rate": 5.077475108526297e-09, - "loss": 0.71103287, - "num_input_tokens_seen": 351065390, - "step": 16266, - "time_per_iteration": 4.168168306350708 - }, - { - "auxiliary_loss_clip": 0.01058776, - "auxiliary_loss_mlp": 0.01027743, - "balance_loss_clip": 1.03172088, - "balance_loss_mlp": 1.01640654, - "epoch": 0.9780249511498572, - "flos": 21026954494080.0, - "grad_norm": 1.5799613960571957, - "language_loss": 0.86905551, - "learning_rate": 5.049778912747049e-09, - "loss": 0.88992071, - "num_input_tokens_seen": 351084355, - "step": 16267, - "time_per_iteration": 4.231276512145996 - }, - { - "auxiliary_loss_clip": 0.01043069, - "auxiliary_loss_mlp": 0.01029392, - "balance_loss_clip": 1.03164184, - "balance_loss_mlp": 1.01611769, - "epoch": 0.9780850744025251, - "flos": 30774223751040.0, - "grad_norm": 1.9539809119147387, - "language_loss": 0.70374393, - "learning_rate": 5.022158365679985e-09, - "loss": 0.72446853, - "num_input_tokens_seen": 351105870, - "step": 16268, - "time_per_iteration": 4.722951412200928 - }, - { - "auxiliary_loss_clip": 0.01087833, - "auxiliary_loss_mlp": 0.01024552, - "balance_loss_clip": 1.0350287, - "balance_loss_mlp": 1.01256526, - "epoch": 0.9781451976551931, - "flos": 20302995876480.0, - "grad_norm": 1.5865109872612446, - "language_loss": 0.7393145, - "learning_rate": 4.994613468372711e-09, - "loss": 0.76043838, - "num_input_tokens_seen": 351124760, - "step": 16269, - "time_per_iteration": 3.1291208267211914 - }, - { - "auxiliary_loss_clip": 0.01085029, - "auxiliary_loss_mlp": 0.01034789, - "balance_loss_clip": 1.03650665, - "balance_loss_mlp": 1.02071679, - "epoch": 0.9782053209078612, - "flos": 24316479982080.0, - "grad_norm": 2.0431692702838613, - "language_loss": 0.70405006, - "learning_rate": 4.967144221869501e-09, - "loss": 0.72524822, - "num_input_tokens_seen": 351142820, - "step": 16270, - "time_per_iteration": 2.6683366298675537 - }, - { - "auxiliary_loss_clip": 0.01110841, - "auxiliary_loss_mlp": 0.01034856, - "balance_loss_clip": 1.03801334, - "balance_loss_mlp": 1.02240467, - "epoch": 0.9782654441605291, - "flos": 32489425065600.0, - "grad_norm": 1.7714650926331987, - "language_loss": 0.63896102, - "learning_rate": 4.939750627212191e-09, - "loss": 0.66041803, - "num_input_tokens_seen": 351164805, - "step": 16271, - "time_per_iteration": 4.501726388931274 - }, - { - "auxiliary_loss_clip": 0.01082074, - "auxiliary_loss_mlp": 0.01033993, - "balance_loss_clip": 1.03665876, - "balance_loss_mlp": 1.02143383, - "epoch": 0.9783255674131971, - "flos": 26979076465920.0, - "grad_norm": 1.43784561644357, - "language_loss": 0.70358956, - "learning_rate": 4.912432685439505e-09, - "loss": 0.72475022, - "num_input_tokens_seen": 351187005, - "step": 16272, - "time_per_iteration": 2.727437734603882 - }, - { - "auxiliary_loss_clip": 0.0105355, - "auxiliary_loss_mlp": 0.01034254, - "balance_loss_clip": 1.03778529, - "balance_loss_mlp": 1.02120066, - "epoch": 0.978385690665865, - "flos": 23112251591040.0, - "grad_norm": 1.7381276500973775, - "language_loss": 0.66595173, - "learning_rate": 4.88519039758728e-09, - "loss": 0.68682981, - "num_input_tokens_seen": 351208450, - "step": 16273, - "time_per_iteration": 2.929959774017334 - }, - { - "auxiliary_loss_clip": 0.01075306, - "auxiliary_loss_mlp": 0.01023021, - "balance_loss_clip": 1.03366828, - "balance_loss_mlp": 1.01021206, - "epoch": 0.978445813918533, - "flos": 25409672455680.0, - "grad_norm": 1.7200499959996831, - "language_loss": 0.7406745, - "learning_rate": 4.85802376468869e-09, - "loss": 0.76165771, - "num_input_tokens_seen": 351229585, - "step": 16274, - "time_per_iteration": 2.6932880878448486 - }, - { - "auxiliary_loss_clip": 0.01084441, - "auxiliary_loss_mlp": 0.01029809, - "balance_loss_clip": 1.03532362, - "balance_loss_mlp": 1.01775074, - "epoch": 0.9785059371712009, - "flos": 23550218121600.0, - "grad_norm": 1.6821946772712648, - "language_loss": 0.77833498, - "learning_rate": 4.830932787773579e-09, - "loss": 0.79947746, - "num_input_tokens_seen": 351249525, - "step": 16275, - "time_per_iteration": 2.6410744190216064 - }, - { - "auxiliary_loss_clip": 0.01037951, - "auxiliary_loss_mlp": 0.0103008, - "balance_loss_clip": 1.03559256, - "balance_loss_mlp": 1.01765788, - "epoch": 0.978566060423869, - "flos": 34351177870080.0, - "grad_norm": 2.314426015292287, - "language_loss": 0.71095657, - "learning_rate": 4.803917467869567e-09, - "loss": 0.73163688, - "num_input_tokens_seen": 351272530, - "step": 16276, - "time_per_iteration": 2.91654109954834 - }, - { - "auxiliary_loss_clip": 0.01077494, - "auxiliary_loss_mlp": 0.01032563, - "balance_loss_clip": 1.033566, - "balance_loss_mlp": 1.02052927, - "epoch": 0.9786261836765369, - "flos": 11618862387840.0, - "grad_norm": 2.1039915765233674, - "language_loss": 0.85744458, - "learning_rate": 4.776977806000726e-09, - "loss": 0.87854517, - "num_input_tokens_seen": 351288530, - "step": 16277, - "time_per_iteration": 2.6657748222351074 - }, - { - "auxiliary_loss_clip": 0.01090092, - "auxiliary_loss_mlp": 0.01031629, - "balance_loss_clip": 1.03429365, - "balance_loss_mlp": 1.01809239, - "epoch": 0.9786863069292049, - "flos": 17420949250560.0, - "grad_norm": 1.719720636117993, - "language_loss": 0.70917892, - "learning_rate": 4.7501138031891264e-09, - "loss": 0.73039615, - "num_input_tokens_seen": 351305890, - "step": 16278, - "time_per_iteration": 2.5898592472076416 - }, - { - "auxiliary_loss_clip": 0.01087893, - "auxiliary_loss_mlp": 0.01034892, - "balance_loss_clip": 1.03455925, - "balance_loss_mlp": 1.02192235, - "epoch": 0.9787464301818728, - "flos": 20844923345280.0, - "grad_norm": 1.8454549463354188, - "language_loss": 0.84530413, - "learning_rate": 4.723325460453065e-09, - "loss": 0.86653197, - "num_input_tokens_seen": 351325010, - "step": 16279, - "time_per_iteration": 2.6659061908721924 - }, - { - "auxiliary_loss_clip": 0.01096633, - "auxiliary_loss_mlp": 0.01032409, - "balance_loss_clip": 1.0337534, - "balance_loss_mlp": 1.01924253, - "epoch": 0.9788065534345408, - "flos": 18222942165120.0, - "grad_norm": 2.8453433920494753, - "language_loss": 0.79117471, - "learning_rate": 4.696612778808395e-09, - "loss": 0.81246513, - "num_input_tokens_seen": 351343060, - "step": 16280, - "time_per_iteration": 2.636876106262207 - }, - { - "auxiliary_loss_clip": 0.01064547, - "auxiliary_loss_mlp": 0.01034092, - "balance_loss_clip": 1.03490829, - "balance_loss_mlp": 1.02280319, - "epoch": 0.9788666766872087, - "flos": 21578219498880.0, - "grad_norm": 2.843907217465553, - "language_loss": 0.79550928, - "learning_rate": 4.669975759268085e-09, - "loss": 0.81649566, - "num_input_tokens_seen": 351363260, - "step": 16281, - "time_per_iteration": 2.710759162902832 - }, - { - "auxiliary_loss_clip": 0.01096946, - "auxiliary_loss_mlp": 0.01032797, - "balance_loss_clip": 1.03685427, - "balance_loss_mlp": 1.01961863, - "epoch": 0.9789267999398767, - "flos": 24900495212160.0, - "grad_norm": 1.5976604846892302, - "language_loss": 0.80062044, - "learning_rate": 4.643414402842216e-09, - "loss": 0.82191795, - "num_input_tokens_seen": 351382610, - "step": 16282, - "time_per_iteration": 2.6593406200408936 - }, - { - "auxiliary_loss_clip": 0.0108946, - "auxiliary_loss_mlp": 0.01043417, - "balance_loss_clip": 1.03729296, - "balance_loss_mlp": 1.03109717, - "epoch": 0.9789869231925448, - "flos": 19573111514880.0, - "grad_norm": 1.9818178483973194, - "language_loss": 0.82860035, - "learning_rate": 4.616928710538204e-09, - "loss": 0.84992909, - "num_input_tokens_seen": 351401075, - "step": 16283, - "time_per_iteration": 2.696199655532837 - }, - { - "auxiliary_loss_clip": 0.01092588, - "auxiliary_loss_mlp": 0.01034978, - "balance_loss_clip": 1.0365355, - "balance_loss_mlp": 1.02216268, - "epoch": 0.9790470464452127, - "flos": 16796641939200.0, - "grad_norm": 1.8242453499893954, - "language_loss": 0.71959805, - "learning_rate": 4.590518683360134e-09, - "loss": 0.74087369, - "num_input_tokens_seen": 351419275, - "step": 16284, - "time_per_iteration": 2.651407241821289 - }, - { - "auxiliary_loss_clip": 0.01094663, - "auxiliary_loss_mlp": 0.01033179, - "balance_loss_clip": 1.03582513, - "balance_loss_mlp": 1.02136493, - "epoch": 0.9791071696978807, - "flos": 18369350000640.0, - "grad_norm": 1.8718782161226852, - "language_loss": 0.64339489, - "learning_rate": 4.56418432230965e-09, - "loss": 0.66467333, - "num_input_tokens_seen": 351437375, - "step": 16285, - "time_per_iteration": 2.651705026626587 - }, - { - "auxiliary_loss_clip": 0.01084456, - "auxiliary_loss_mlp": 0.01031691, - "balance_loss_clip": 1.0361805, - "balance_loss_mlp": 1.01931071, - "epoch": 0.9791672929505486, - "flos": 24170323541760.0, - "grad_norm": 1.7019695394425336, - "language_loss": 0.70606256, - "learning_rate": 4.537925628385286e-09, - "loss": 0.72722405, - "num_input_tokens_seen": 351457810, - "step": 16286, - "time_per_iteration": 2.6652472019195557 - }, - { - "auxiliary_loss_clip": 0.01091075, - "auxiliary_loss_mlp": 0.01032716, - "balance_loss_clip": 1.03533125, - "balance_loss_mlp": 1.02069998, - "epoch": 0.9792274162032166, - "flos": 24354114456960.0, - "grad_norm": 2.0320936196191526, - "language_loss": 0.58058381, - "learning_rate": 4.511742602582691e-09, - "loss": 0.60182172, - "num_input_tokens_seen": 351478825, - "step": 16287, - "time_per_iteration": 2.617825746536255 - }, - { - "auxiliary_loss_clip": 0.01096267, - "auxiliary_loss_mlp": 0.01035937, - "balance_loss_clip": 1.0363822, - "balance_loss_mlp": 1.02341986, - "epoch": 0.9792875394558845, - "flos": 26395779507840.0, - "grad_norm": 1.7210061810048285, - "language_loss": 0.81500298, - "learning_rate": 4.485635245894626e-09, - "loss": 0.83632499, - "num_input_tokens_seen": 351498785, - "step": 16288, - "time_per_iteration": 2.657498359680176 - }, - { - "auxiliary_loss_clip": 0.01082554, - "auxiliary_loss_mlp": 0.00771248, - "balance_loss_clip": 1.03415895, - "balance_loss_mlp": 1.00014818, - "epoch": 0.9793476627085526, - "flos": 28148004766080.0, - "grad_norm": 1.396084239179073, - "language_loss": 0.71853596, - "learning_rate": 4.459603559311631e-09, - "loss": 0.73707396, - "num_input_tokens_seen": 351520235, - "step": 16289, - "time_per_iteration": 2.8403937816619873 - }, - { - "auxiliary_loss_clip": 0.01073083, - "auxiliary_loss_mlp": 0.01036624, - "balance_loss_clip": 1.03831482, - "balance_loss_mlp": 1.02417815, - "epoch": 0.9794077859612205, - "flos": 16763927627520.0, - "grad_norm": 2.8328336335773523, - "language_loss": 0.75429696, - "learning_rate": 4.43364754382003e-09, - "loss": 0.77539402, - "num_input_tokens_seen": 351538900, - "step": 16290, - "time_per_iteration": 2.6202123165130615 - }, - { - "auxiliary_loss_clip": 0.01099176, - "auxiliary_loss_mlp": 0.01032346, - "balance_loss_clip": 1.03652453, - "balance_loss_mlp": 1.01942921, - "epoch": 0.9794679092138885, - "flos": 19280834547840.0, - "grad_norm": 1.5711405452036733, - "language_loss": 0.6725769, - "learning_rate": 4.4077672004048105e-09, - "loss": 0.69389218, - "num_input_tokens_seen": 351558715, - "step": 16291, - "time_per_iteration": 2.5787715911865234 - }, - { - "auxiliary_loss_clip": 0.01111756, - "auxiliary_loss_mlp": 0.00770961, - "balance_loss_clip": 1.03711116, - "balance_loss_mlp": 1.00023437, - "epoch": 0.9795280324665564, - "flos": 32156640535680.0, - "grad_norm": 1.7868335154862072, - "language_loss": 0.63048244, - "learning_rate": 4.3819625300467456e-09, - "loss": 0.64930964, - "num_input_tokens_seen": 351578450, - "step": 16292, - "time_per_iteration": 2.6509621143341064 - }, - { - "auxiliary_loss_clip": 0.01072425, - "auxiliary_loss_mlp": 0.01030776, - "balance_loss_clip": 1.03524005, - "balance_loss_mlp": 1.01897478, - "epoch": 0.9795881557192244, - "flos": 19060953442560.0, - "grad_norm": 1.8984825692794804, - "language_loss": 0.73462898, - "learning_rate": 4.356233533724829e-09, - "loss": 0.75566101, - "num_input_tokens_seen": 351597195, - "step": 16293, - "time_per_iteration": 2.64638614654541 - }, - { - "auxiliary_loss_clip": 0.01100837, - "auxiliary_loss_mlp": 0.01030263, - "balance_loss_clip": 1.03659904, - "balance_loss_mlp": 1.01774597, - "epoch": 0.9796482789718923, - "flos": 28329928174080.0, - "grad_norm": 1.6810560431936798, - "language_loss": 0.84062809, - "learning_rate": 4.330580212414503e-09, - "loss": 0.86193907, - "num_input_tokens_seen": 351617460, - "step": 16294, - "time_per_iteration": 2.614396095275879 - }, - { - "auxiliary_loss_clip": 0.01071284, - "auxiliary_loss_mlp": 0.01032864, - "balance_loss_clip": 1.03327644, - "balance_loss_mlp": 1.02134895, - "epoch": 0.9797084022245603, - "flos": 17967976450560.0, - "grad_norm": 2.3230891850787168, - "language_loss": 0.71972656, - "learning_rate": 4.305002567088767e-09, - "loss": 0.74076802, - "num_input_tokens_seen": 351635900, - "step": 16295, - "time_per_iteration": 2.6593565940856934 - }, - { - "auxiliary_loss_clip": 0.01103524, - "auxiliary_loss_mlp": 0.01035987, - "balance_loss_clip": 1.0375762, - "balance_loss_mlp": 1.02305257, - "epoch": 0.9797685254772284, - "flos": 20266726118400.0, - "grad_norm": 1.590993337993389, - "language_loss": 0.80806482, - "learning_rate": 4.2795005987170674e-09, - "loss": 0.82946002, - "num_input_tokens_seen": 351655400, - "step": 16296, - "time_per_iteration": 2.571876287460327 - }, - { - "auxiliary_loss_clip": 0.01079454, - "auxiliary_loss_mlp": 0.01033969, - "balance_loss_clip": 1.03264189, - "balance_loss_mlp": 1.02170789, - "epoch": 0.9798286487298963, - "flos": 26907147480960.0, - "grad_norm": 2.4272229698670986, - "language_loss": 0.75518107, - "learning_rate": 4.254074308266853e-09, - "loss": 0.77631521, - "num_input_tokens_seen": 351675505, - "step": 16297, - "time_per_iteration": 2.737135410308838 - }, - { - "auxiliary_loss_clip": 0.01097573, - "auxiliary_loss_mlp": 0.01036214, - "balance_loss_clip": 1.03408372, - "balance_loss_mlp": 1.02367926, - "epoch": 0.9798887719825643, - "flos": 27161071701120.0, - "grad_norm": 2.8449878357962, - "language_loss": 0.78244084, - "learning_rate": 4.228723696702019e-09, - "loss": 0.80377865, - "num_input_tokens_seen": 351697920, - "step": 16298, - "time_per_iteration": 2.662205457687378 - }, - { - "auxiliary_loss_clip": 0.01092637, - "auxiliary_loss_mlp": 0.01026617, - "balance_loss_clip": 1.03448844, - "balance_loss_mlp": 1.01479197, - "epoch": 0.9799488952352322, - "flos": 20668422890880.0, - "grad_norm": 1.5107423407180305, - "language_loss": 0.72837794, - "learning_rate": 4.203448764984019e-09, - "loss": 0.74957049, - "num_input_tokens_seen": 351717615, - "step": 16299, - "time_per_iteration": 2.6172263622283936 - }, - { - "auxiliary_loss_clip": 0.01084816, - "auxiliary_loss_mlp": 0.01028705, - "balance_loss_clip": 1.0337168, - "balance_loss_mlp": 1.01527619, - "epoch": 0.9800090184879002, - "flos": 21981209160960.0, - "grad_norm": 2.196732565554413, - "language_loss": 0.89433563, - "learning_rate": 4.178249514071419e-09, - "loss": 0.91547084, - "num_input_tokens_seen": 351735260, - "step": 16300, - "time_per_iteration": 2.665531873703003 - }, - { - "auxiliary_loss_clip": 0.01099488, - "auxiliary_loss_mlp": 0.01029393, - "balance_loss_clip": 1.03554893, - "balance_loss_mlp": 1.01669717, - "epoch": 0.9800691417405681, - "flos": 21288420570240.0, - "grad_norm": 3.318186041205299, - "language_loss": 0.7811656, - "learning_rate": 4.1531259449194555e-09, - "loss": 0.80245435, - "num_input_tokens_seen": 351755800, - "step": 16301, - "time_per_iteration": 2.6590991020202637 - }, - { - "auxiliary_loss_clip": 0.01085984, - "auxiliary_loss_mlp": 0.01035207, - "balance_loss_clip": 1.03470898, - "balance_loss_mlp": 1.022452, - "epoch": 0.9801292649932362, - "flos": 18439878355200.0, - "grad_norm": 2.4753221911438525, - "language_loss": 0.75696325, - "learning_rate": 4.128078058480921e-09, - "loss": 0.77817523, - "num_input_tokens_seen": 351774790, - "step": 16302, - "time_per_iteration": 2.5974133014678955 - }, - { - "auxiliary_loss_clip": 0.01080371, - "auxiliary_loss_mlp": 0.01032591, - "balance_loss_clip": 1.03640592, - "balance_loss_mlp": 1.01979423, - "epoch": 0.9801893882459041, - "flos": 25046364343680.0, - "grad_norm": 1.7046850739781914, - "language_loss": 0.79628474, - "learning_rate": 4.103105855705724e-09, - "loss": 0.8174144, - "num_input_tokens_seen": 351792855, - "step": 16303, - "time_per_iteration": 2.6679980754852295 - }, - { - "auxiliary_loss_clip": 0.01066992, - "auxiliary_loss_mlp": 0.0103858, - "balance_loss_clip": 1.03263092, - "balance_loss_mlp": 1.02466226, - "epoch": 0.9802495114985721, - "flos": 18511484117760.0, - "grad_norm": 2.0636991947077696, - "language_loss": 0.83625126, - "learning_rate": 4.078209337540883e-09, - "loss": 0.85730696, - "num_input_tokens_seen": 351811450, - "step": 16304, - "time_per_iteration": 2.6905360221862793 - }, - { - "auxiliary_loss_clip": 0.01070996, - "auxiliary_loss_mlp": 0.01026297, - "balance_loss_clip": 1.03549314, - "balance_loss_mlp": 1.01519823, - "epoch": 0.98030963475124, - "flos": 21469841187840.0, - "grad_norm": 1.8378321137403202, - "language_loss": 0.70343494, - "learning_rate": 4.053388504930089e-09, - "loss": 0.72440791, - "num_input_tokens_seen": 351831960, - "step": 16305, - "time_per_iteration": 2.745544910430908 - }, - { - "auxiliary_loss_clip": 0.0107968, - "auxiliary_loss_mlp": 0.01040728, - "balance_loss_clip": 1.03601217, - "balance_loss_mlp": 1.0259459, - "epoch": 0.980369758003908, - "flos": 20412272027520.0, - "grad_norm": 2.7789217747629182, - "language_loss": 0.71784663, - "learning_rate": 4.028643358815032e-09, - "loss": 0.73905075, - "num_input_tokens_seen": 351851585, - "step": 16306, - "time_per_iteration": 4.391748905181885 - }, - { - "auxiliary_loss_clip": 0.01080084, - "auxiliary_loss_mlp": 0.01032092, - "balance_loss_clip": 1.03247881, - "balance_loss_mlp": 1.02032626, - "epoch": 0.9804298812565759, - "flos": 23399177431680.0, - "grad_norm": 1.5354293339216485, - "language_loss": 0.73557943, - "learning_rate": 4.00397390013385e-09, - "loss": 0.75670117, - "num_input_tokens_seen": 351871085, - "step": 16307, - "time_per_iteration": 4.338375091552734 - }, - { - "auxiliary_loss_clip": 0.01076228, - "auxiliary_loss_mlp": 0.01030733, - "balance_loss_clip": 1.03920865, - "balance_loss_mlp": 1.01993847, - "epoch": 0.980490004509244, - "flos": 23292666627840.0, - "grad_norm": 1.5555541089180664, - "language_loss": 0.74765921, - "learning_rate": 3.979380129822018e-09, - "loss": 0.76872879, - "num_input_tokens_seen": 351891775, - "step": 16308, - "time_per_iteration": 2.79581618309021 - }, - { - "auxiliary_loss_clip": 0.01007996, - "auxiliary_loss_mlp": 0.0100217, - "balance_loss_clip": 1.0048188, - "balance_loss_mlp": 1.00120437, - "epoch": 0.980550127761912, - "flos": 56051027798400.0, - "grad_norm": 0.7557884098405707, - "language_loss": 0.57835835, - "learning_rate": 3.954862048811902e-09, - "loss": 0.59845996, - "num_input_tokens_seen": 351946770, - "step": 16309, - "time_per_iteration": 3.0556421279907227 - }, - { - "auxiliary_loss_clip": 0.01065215, - "auxiliary_loss_mlp": 0.0103267, - "balance_loss_clip": 1.03367877, - "balance_loss_mlp": 1.02015948, - "epoch": 0.9806102510145799, - "flos": 25333290184320.0, - "grad_norm": 1.8451853001469216, - "language_loss": 0.66008574, - "learning_rate": 3.930419658033646e-09, - "loss": 0.68106461, - "num_input_tokens_seen": 351966155, - "step": 16310, - "time_per_iteration": 2.729114055633545 - }, - { - "auxiliary_loss_clip": 0.01008303, - "auxiliary_loss_mlp": 0.01000216, - "balance_loss_clip": 1.00770998, - "balance_loss_mlp": 0.99920315, - "epoch": 0.9806703742672479, - "flos": 67274837429760.0, - "grad_norm": 1.0882913527970195, - "language_loss": 0.54503131, - "learning_rate": 3.906052958413841e-09, - "loss": 0.56511647, - "num_input_tokens_seen": 352031655, - "step": 16311, - "time_per_iteration": 4.943628311157227 - }, - { - "auxiliary_loss_clip": 0.01095664, - "auxiliary_loss_mlp": 0.01027322, - "balance_loss_clip": 1.0345304, - "balance_loss_mlp": 1.01559234, - "epoch": 0.9807304975199158, - "flos": 25228970110080.0, - "grad_norm": 2.5422868238543836, - "language_loss": 0.79856956, - "learning_rate": 3.881761950876638e-09, - "loss": 0.81979948, - "num_input_tokens_seen": 352051920, - "step": 16312, - "time_per_iteration": 2.635751247406006 - }, - { - "auxiliary_loss_clip": 0.0108546, - "auxiliary_loss_mlp": 0.01029669, - "balance_loss_clip": 1.03607917, - "balance_loss_mlp": 1.01784945, - "epoch": 0.9807906207725838, - "flos": 17456392995840.0, - "grad_norm": 1.855062658283189, - "language_loss": 0.6311661, - "learning_rate": 3.8575466363430785e-09, - "loss": 0.65231735, - "num_input_tokens_seen": 352069315, - "step": 16313, - "time_per_iteration": 2.71441650390625 - }, - { - "auxiliary_loss_clip": 0.01098236, - "auxiliary_loss_mlp": 0.01030326, - "balance_loss_clip": 1.03771138, - "balance_loss_mlp": 1.01765394, - "epoch": 0.9808507440252517, - "flos": 21032413361280.0, - "grad_norm": 2.0090087344647496, - "language_loss": 0.72602594, - "learning_rate": 3.833407015731316e-09, - "loss": 0.74731159, - "num_input_tokens_seen": 352089480, - "step": 16314, - "time_per_iteration": 2.789362907409668 - }, - { - "auxiliary_loss_clip": 0.01003668, - "auxiliary_loss_mlp": 0.01002, - "balance_loss_clip": 1.01054919, - "balance_loss_mlp": 1.00098097, - "epoch": 0.9809108672779198, - "flos": 64044491598720.0, - "grad_norm": 0.6894102027306396, - "language_loss": 0.51673484, - "learning_rate": 3.80934308995684e-09, - "loss": 0.53679156, - "num_input_tokens_seen": 352150000, - "step": 16315, - "time_per_iteration": 3.215070962905884 - }, - { - "auxiliary_loss_clip": 0.01097221, - "auxiliary_loss_mlp": 0.01032423, - "balance_loss_clip": 1.03522766, - "balance_loss_mlp": 1.02035928, - "epoch": 0.9809709905305877, - "flos": 22780616296320.0, - "grad_norm": 1.4165501611522262, - "language_loss": 0.69878519, - "learning_rate": 3.785354859932033e-09, - "loss": 0.72008169, - "num_input_tokens_seen": 352170990, - "step": 16316, - "time_per_iteration": 2.677259683609009 - }, - { - "auxiliary_loss_clip": 0.0111046, - "auxiliary_loss_mlp": 0.01032289, - "balance_loss_clip": 1.03727913, - "balance_loss_mlp": 1.02019501, - "epoch": 0.9810311137832557, - "flos": 37013415217920.0, - "grad_norm": 2.664112062764968, - "language_loss": 0.55067998, - "learning_rate": 3.76144232656661e-09, - "loss": 0.57210749, - "num_input_tokens_seen": 352195335, - "step": 16317, - "time_per_iteration": 2.7027530670166016 - }, - { - "auxiliary_loss_clip": 0.01052915, - "auxiliary_loss_mlp": 0.01035996, - "balance_loss_clip": 1.02941895, - "balance_loss_mlp": 1.02321708, - "epoch": 0.9810912370359236, - "flos": 18916305373440.0, - "grad_norm": 1.7269155229815298, - "language_loss": 0.73437709, - "learning_rate": 3.737605490767404e-09, - "loss": 0.75526619, - "num_input_tokens_seen": 352214170, - "step": 16318, - "time_per_iteration": 2.7383875846862793 - }, - { - "auxiliary_loss_clip": 0.01082811, - "auxiliary_loss_mlp": 0.01027164, - "balance_loss_clip": 1.0344367, - "balance_loss_mlp": 1.01589835, - "epoch": 0.9811513602885916, - "flos": 18441602208000.0, - "grad_norm": 2.1831479646107597, - "language_loss": 0.82135093, - "learning_rate": 3.7138443534383555e-09, - "loss": 0.84245068, - "num_input_tokens_seen": 352231470, - "step": 16319, - "time_per_iteration": 2.6357314586639404 - }, - { - "auxiliary_loss_clip": 0.01018205, - "auxiliary_loss_mlp": 0.01008734, - "balance_loss_clip": 1.00481987, - "balance_loss_mlp": 1.00751829, - "epoch": 0.9812114835412595, - "flos": 68058945371520.0, - "grad_norm": 0.723170548219491, - "language_loss": 0.5353533, - "learning_rate": 3.6901589154803014e-09, - "loss": 0.55562276, - "num_input_tokens_seen": 352291770, - "step": 16320, - "time_per_iteration": 3.0364413261413574 - }, - { - "auxiliary_loss_clip": 0.01057502, - "auxiliary_loss_mlp": 0.01036848, - "balance_loss_clip": 1.03194261, - "balance_loss_mlp": 1.02422416, - "epoch": 0.9812716067939276, - "flos": 25373007648000.0, - "grad_norm": 2.190666128056564, - "language_loss": 0.73492098, - "learning_rate": 3.6665491777914116e-09, - "loss": 0.7558645, - "num_input_tokens_seen": 352310735, - "step": 16321, - "time_per_iteration": 2.7965734004974365 - }, - { - "auxiliary_loss_clip": 0.0108786, - "auxiliary_loss_mlp": 0.0103188, - "balance_loss_clip": 1.03798234, - "balance_loss_mlp": 1.01972055, - "epoch": 0.9813317300465956, - "flos": 22856818999680.0, - "grad_norm": 1.5299966395206919, - "language_loss": 0.78483856, - "learning_rate": 3.6430151412669698e-09, - "loss": 0.806036, - "num_input_tokens_seen": 352329545, - "step": 16322, - "time_per_iteration": 2.762363910675049 - }, - { - "auxiliary_loss_clip": 0.0109714, - "auxiliary_loss_mlp": 0.01034816, - "balance_loss_clip": 1.03686166, - "balance_loss_mlp": 1.02228689, - "epoch": 0.9813918532992635, - "flos": 23586954756480.0, - "grad_norm": 1.7335029741380326, - "language_loss": 0.81064153, - "learning_rate": 3.619556806799595e-09, - "loss": 0.8319611, - "num_input_tokens_seen": 352352080, - "step": 16323, - "time_per_iteration": 2.674591541290283 - }, - { - "auxiliary_loss_clip": 0.01110489, - "auxiliary_loss_mlp": 0.01030491, - "balance_loss_clip": 1.03752804, - "balance_loss_mlp": 1.01852298, - "epoch": 0.9814519765519315, - "flos": 19606328616960.0, - "grad_norm": 2.350364849870627, - "language_loss": 0.84632325, - "learning_rate": 3.596174175278799e-09, - "loss": 0.86773306, - "num_input_tokens_seen": 352366455, - "step": 16324, - "time_per_iteration": 2.5407159328460693 - }, - { - "auxiliary_loss_clip": 0.01086747, - "auxiliary_loss_mlp": 0.01033784, - "balance_loss_clip": 1.03741539, - "balance_loss_mlp": 1.02086782, - "epoch": 0.9815120998045994, - "flos": 33946284787200.0, - "grad_norm": 1.4316902818633324, - "language_loss": 0.74605346, - "learning_rate": 3.5728672475909827e-09, - "loss": 0.76725876, - "num_input_tokens_seen": 352386090, - "step": 16325, - "time_per_iteration": 2.817761182785034 - }, - { - "auxiliary_loss_clip": 0.01056448, - "auxiliary_loss_mlp": 0.01032925, - "balance_loss_clip": 1.03592491, - "balance_loss_mlp": 1.02158785, - "epoch": 0.9815722230572674, - "flos": 20850023076480.0, - "grad_norm": 1.5890667781038148, - "language_loss": 0.7638427, - "learning_rate": 3.5496360246201063e-09, - "loss": 0.78473639, - "num_input_tokens_seen": 352404000, - "step": 16326, - "time_per_iteration": 2.804213523864746 - }, - { - "auxiliary_loss_clip": 0.01075422, - "auxiliary_loss_mlp": 0.01032152, - "balance_loss_clip": 1.03580999, - "balance_loss_mlp": 1.01923585, - "epoch": 0.9816323463099353, - "flos": 22894525301760.0, - "grad_norm": 2.465136585192098, - "language_loss": 0.67442954, - "learning_rate": 3.5264805072470205e-09, - "loss": 0.69550526, - "num_input_tokens_seen": 352423540, - "step": 16327, - "time_per_iteration": 2.725055694580078 - }, - { - "auxiliary_loss_clip": 0.01102074, - "auxiliary_loss_mlp": 0.01036968, - "balance_loss_clip": 1.03595459, - "balance_loss_mlp": 1.0239681, - "epoch": 0.9816924695626034, - "flos": 31539444117120.0, - "grad_norm": 1.5972210745113198, - "language_loss": 0.73710746, - "learning_rate": 3.5034006963501337e-09, - "loss": 0.75849789, - "num_input_tokens_seen": 352445530, - "step": 16328, - "time_per_iteration": 2.739084243774414 - }, - { - "auxiliary_loss_clip": 0.01091132, - "auxiliary_loss_mlp": 0.01035924, - "balance_loss_clip": 1.03452396, - "balance_loss_mlp": 1.0219171, - "epoch": 0.9817525928152713, - "flos": 21506901045120.0, - "grad_norm": 1.7593287132982667, - "language_loss": 0.8105092, - "learning_rate": 3.4803965928040802e-09, - "loss": 0.83177972, - "num_input_tokens_seen": 352466325, - "step": 16329, - "time_per_iteration": 2.6751110553741455 - }, - { - "auxiliary_loss_clip": 0.0111119, - "auxiliary_loss_mlp": 0.01031516, - "balance_loss_clip": 1.03624225, - "balance_loss_mlp": 1.01837909, - "epoch": 0.9818127160679393, - "flos": 25550513683200.0, - "grad_norm": 3.221253947949931, - "language_loss": 0.75986403, - "learning_rate": 3.4574681974817168e-09, - "loss": 0.78129113, - "num_input_tokens_seen": 352485505, - "step": 16330, - "time_per_iteration": 2.6681814193725586 - }, - { - "auxiliary_loss_clip": 0.01117551, - "auxiliary_loss_mlp": 0.01032164, - "balance_loss_clip": 1.03826082, - "balance_loss_mlp": 1.01716757, - "epoch": 0.9818728393206072, - "flos": 28803661672320.0, - "grad_norm": 2.4256142149996562, - "language_loss": 0.66364849, - "learning_rate": 3.434615511252126e-09, - "loss": 0.68514568, - "num_input_tokens_seen": 352505360, - "step": 16331, - "time_per_iteration": 2.703917980194092 - }, - { - "auxiliary_loss_clip": 0.01095043, - "auxiliary_loss_mlp": 0.0102903, - "balance_loss_clip": 1.03584874, - "balance_loss_mlp": 1.01704907, - "epoch": 0.9819329625732752, - "flos": 23222246014080.0, - "grad_norm": 1.857287483122114, - "language_loss": 0.73337162, - "learning_rate": 3.411838534981948e-09, - "loss": 0.75461233, - "num_input_tokens_seen": 352524035, - "step": 16332, - "time_per_iteration": 2.650766611099243 - }, - { - "auxiliary_loss_clip": 0.01097564, - "auxiliary_loss_mlp": 0.01029848, - "balance_loss_clip": 1.03841019, - "balance_loss_mlp": 1.01876128, - "epoch": 0.9819930858259431, - "flos": 17530440883200.0, - "grad_norm": 1.7088986460158127, - "language_loss": 0.76663387, - "learning_rate": 3.389137269534936e-09, - "loss": 0.78790796, - "num_input_tokens_seen": 352543210, - "step": 16333, - "time_per_iteration": 2.6083765029907227 - }, - { - "auxiliary_loss_clip": 0.01091914, - "auxiliary_loss_mlp": 0.00769838, - "balance_loss_clip": 1.03712809, - "balance_loss_mlp": 1.00018179, - "epoch": 0.9820532090786112, - "flos": 12529915971840.0, - "grad_norm": 2.124926384042051, - "language_loss": 0.72888857, - "learning_rate": 3.366511715771958e-09, - "loss": 0.74750608, - "num_input_tokens_seen": 352559770, - "step": 16334, - "time_per_iteration": 2.641460657119751 - }, - { - "auxiliary_loss_clip": 0.01059033, - "auxiliary_loss_mlp": 0.01035338, - "balance_loss_clip": 1.03467429, - "balance_loss_mlp": 1.02285099, - "epoch": 0.9821133323312792, - "flos": 18840174497280.0, - "grad_norm": 2.150602428971571, - "language_loss": 0.78196549, - "learning_rate": 3.3439618745509934e-09, - "loss": 0.8029092, - "num_input_tokens_seen": 352577690, - "step": 16335, - "time_per_iteration": 2.813981056213379 - }, - { - "auxiliary_loss_clip": 0.01084888, - "auxiliary_loss_mlp": 0.01042166, - "balance_loss_clip": 1.03453565, - "balance_loss_mlp": 1.02693129, - "epoch": 0.9821734555839471, - "flos": 34824013528320.0, - "grad_norm": 1.9795504478924333, - "language_loss": 0.63792658, - "learning_rate": 3.3214877467271362e-09, - "loss": 0.65919709, - "num_input_tokens_seen": 352598850, - "step": 16336, - "time_per_iteration": 2.8951098918914795 - }, - { - "auxiliary_loss_clip": 0.01077961, - "auxiliary_loss_mlp": 0.0103778, - "balance_loss_clip": 1.03655946, - "balance_loss_mlp": 1.02337968, - "epoch": 0.9822335788366151, - "flos": 17128169493120.0, - "grad_norm": 2.0134726146913517, - "language_loss": 0.73876464, - "learning_rate": 3.299089333152372e-09, - "loss": 0.75992203, - "num_input_tokens_seen": 352616130, - "step": 16337, - "time_per_iteration": 2.7202372550964355 - }, - { - "auxiliary_loss_clip": 0.0109231, - "auxiliary_loss_mlp": 0.01031669, - "balance_loss_clip": 1.03548503, - "balance_loss_mlp": 1.01803732, - "epoch": 0.982293702089283, - "flos": 20813250528000.0, - "grad_norm": 1.6907700121861502, - "language_loss": 0.72918296, - "learning_rate": 3.2767666346764645e-09, - "loss": 0.75042278, - "num_input_tokens_seen": 352636885, - "step": 16338, - "time_per_iteration": 2.5943961143493652 - }, - { - "auxiliary_loss_clip": 0.0104046, - "auxiliary_loss_mlp": 0.01032977, - "balance_loss_clip": 1.03025174, - "balance_loss_mlp": 1.02005458, - "epoch": 0.982353825341951, - "flos": 24680829588480.0, - "grad_norm": 1.7984966178479147, - "language_loss": 0.81313229, - "learning_rate": 3.2545196521454045e-09, - "loss": 0.83386666, - "num_input_tokens_seen": 352657905, - "step": 16339, - "time_per_iteration": 2.8950557708740234 - }, - { - "auxiliary_loss_clip": 0.01054842, - "auxiliary_loss_mlp": 0.01039766, - "balance_loss_clip": 1.02929127, - "balance_loss_mlp": 1.02653337, - "epoch": 0.982413948594619, - "flos": 20850489953280.0, - "grad_norm": 1.809625302780829, - "language_loss": 0.62418073, - "learning_rate": 3.232348386403405e-09, - "loss": 0.64512682, - "num_input_tokens_seen": 352676320, - "step": 16340, - "time_per_iteration": 2.8046703338623047 - }, - { - "auxiliary_loss_clip": 0.01112791, - "auxiliary_loss_mlp": 0.01031705, - "balance_loss_clip": 1.03891397, - "balance_loss_mlp": 1.01859832, - "epoch": 0.982474071847287, - "flos": 15377380778880.0, - "grad_norm": 2.356487189204491, - "language_loss": 0.86053795, - "learning_rate": 3.2102528382904613e-09, - "loss": 0.88198292, - "num_input_tokens_seen": 352692665, - "step": 16341, - "time_per_iteration": 2.60111403465271 - }, - { - "auxiliary_loss_clip": 0.0108126, - "auxiliary_loss_mlp": 0.01031684, - "balance_loss_clip": 1.03337705, - "balance_loss_mlp": 1.01934528, - "epoch": 0.9825341950999549, - "flos": 23774732081280.0, - "grad_norm": 1.4139542605019915, - "language_loss": 0.66917169, - "learning_rate": 3.188233008645014e-09, - "loss": 0.69030112, - "num_input_tokens_seen": 352716130, - "step": 16342, - "time_per_iteration": 3.006946325302124 - }, - { - "auxiliary_loss_clip": 0.01109167, - "auxiliary_loss_mlp": 0.01027299, - "balance_loss_clip": 1.03658962, - "balance_loss_mlp": 1.0151639, - "epoch": 0.9825943183526229, - "flos": 22746285872640.0, - "grad_norm": 1.5649008890047298, - "language_loss": 0.77261454, - "learning_rate": 3.16628889830195e-09, - "loss": 0.79397917, - "num_input_tokens_seen": 352734705, - "step": 16343, - "time_per_iteration": 2.623782157897949 - }, - { - "auxiliary_loss_clip": 0.01074162, - "auxiliary_loss_mlp": 0.01030309, - "balance_loss_clip": 1.03596878, - "balance_loss_mlp": 1.01949716, - "epoch": 0.9826544416052908, - "flos": 27709966408320.0, - "grad_norm": 1.540067239801228, - "language_loss": 0.75307328, - "learning_rate": 3.1444205080932707e-09, - "loss": 0.77411795, - "num_input_tokens_seen": 352756225, - "step": 16344, - "time_per_iteration": 2.747864007949829 - }, - { - "auxiliary_loss_clip": 0.0108221, - "auxiliary_loss_mlp": 0.01029576, - "balance_loss_clip": 1.03211427, - "balance_loss_mlp": 1.01698792, - "epoch": 0.9827145648579588, - "flos": 26941657472640.0, - "grad_norm": 3.214033329820644, - "language_loss": 0.66152173, - "learning_rate": 3.122627838848313e-09, - "loss": 0.6826396, - "num_input_tokens_seen": 352776210, - "step": 16345, - "time_per_iteration": 4.445494651794434 - }, - { - "auxiliary_loss_clip": 0.01092474, - "auxiliary_loss_mlp": 0.01026144, - "balance_loss_clip": 1.03578293, - "balance_loss_mlp": 1.0152061, - "epoch": 0.9827746881106267, - "flos": 21866545969920.0, - "grad_norm": 1.4391085603801235, - "language_loss": 0.79666579, - "learning_rate": 3.1009108913933045e-09, - "loss": 0.81785202, - "num_input_tokens_seen": 352795455, - "step": 16346, - "time_per_iteration": 4.2288713455200195 - }, - { - "auxiliary_loss_clip": 0.01098997, - "auxiliary_loss_mlp": 0.01037578, - "balance_loss_clip": 1.03740525, - "balance_loss_mlp": 1.02411294, - "epoch": 0.9828348113632948, - "flos": 20850777262080.0, - "grad_norm": 2.0938671424216024, - "language_loss": 0.75089842, - "learning_rate": 3.079269666552031e-09, - "loss": 0.77226412, - "num_input_tokens_seen": 352812895, - "step": 16347, - "time_per_iteration": 2.571201801300049 - }, - { - "auxiliary_loss_clip": 0.01033873, - "auxiliary_loss_mlp": 0.01036396, - "balance_loss_clip": 1.02937937, - "balance_loss_mlp": 1.02430809, - "epoch": 0.9828949346159628, - "flos": 34569227381760.0, - "grad_norm": 1.7026010770508515, - "language_loss": 0.66808671, - "learning_rate": 3.0577041651449474e-09, - "loss": 0.68878937, - "num_input_tokens_seen": 352835470, - "step": 16348, - "time_per_iteration": 2.9019980430603027 - }, - { - "auxiliary_loss_clip": 0.01087559, - "auxiliary_loss_mlp": 0.01032268, - "balance_loss_clip": 1.03562045, - "balance_loss_mlp": 1.0198462, - "epoch": 0.9829550578686307, - "flos": 24457464864000.0, - "grad_norm": 1.7338187903548066, - "language_loss": 0.69069308, - "learning_rate": 3.0362143879898437e-09, - "loss": 0.71189135, - "num_input_tokens_seen": 352854295, - "step": 16349, - "time_per_iteration": 2.680927038192749 - }, - { - "auxiliary_loss_clip": 0.01075988, - "auxiliary_loss_mlp": 0.01029319, - "balance_loss_clip": 1.03591371, - "balance_loss_mlp": 1.01804733, - "epoch": 0.9830151811212987, - "flos": 16910084067840.0, - "grad_norm": 2.350613893884081, - "language_loss": 0.75915736, - "learning_rate": 3.0148003359014018e-09, - "loss": 0.78021044, - "num_input_tokens_seen": 352869695, - "step": 16350, - "time_per_iteration": 4.1306681632995605 - }, - { - "auxiliary_loss_clip": 0.01078562, - "auxiliary_loss_mlp": 0.01032593, - "balance_loss_clip": 1.03499365, - "balance_loss_mlp": 1.01986754, - "epoch": 0.9830753043739666, - "flos": 21288312829440.0, - "grad_norm": 2.112400068998379, - "language_loss": 0.84269607, - "learning_rate": 2.9934620096920826e-09, - "loss": 0.86380762, - "num_input_tokens_seen": 352887430, - "step": 16351, - "time_per_iteration": 2.6960017681121826 - }, - { - "auxiliary_loss_clip": 0.0107955, - "auxiliary_loss_mlp": 0.0102559, - "balance_loss_clip": 1.03737783, - "balance_loss_mlp": 1.0136925, - "epoch": 0.9831354276266346, - "flos": 31723522341120.0, - "grad_norm": 1.6146338638201096, - "language_loss": 0.68907672, - "learning_rate": 2.972199410170795e-09, - "loss": 0.71012813, - "num_input_tokens_seen": 352907555, - "step": 16352, - "time_per_iteration": 2.7532811164855957 - }, - { - "auxiliary_loss_clip": 0.01088475, - "auxiliary_loss_mlp": 0.00769371, - "balance_loss_clip": 1.03576922, - "balance_loss_mlp": 1.00027871, - "epoch": 0.9831955508793025, - "flos": 21619050284160.0, - "grad_norm": 1.4138760656880254, - "language_loss": 0.66266984, - "learning_rate": 2.951012538143782e-09, - "loss": 0.68124831, - "num_input_tokens_seen": 352928670, - "step": 16353, - "time_per_iteration": 2.6439483165740967 - }, - { - "auxiliary_loss_clip": 0.01082262, - "auxiliary_loss_mlp": 0.01030151, - "balance_loss_clip": 1.03453684, - "balance_loss_mlp": 1.01872444, - "epoch": 0.9832556741319706, - "flos": 22968214053120.0, - "grad_norm": 1.5813502969627034, - "language_loss": 0.74711162, - "learning_rate": 2.9299013944144025e-09, - "loss": 0.76823574, - "num_input_tokens_seen": 352948345, - "step": 16354, - "time_per_iteration": 2.6886255741119385 - }, - { - "auxiliary_loss_clip": 0.01098034, - "auxiliary_loss_mlp": 0.010272, - "balance_loss_clip": 1.03713632, - "balance_loss_mlp": 1.01496959, - "epoch": 0.9833157973846385, - "flos": 21323900229120.0, - "grad_norm": 2.034749936082402, - "language_loss": 0.77509081, - "learning_rate": 2.9088659797835702e-09, - "loss": 0.79634321, - "num_input_tokens_seen": 352967250, - "step": 16355, - "time_per_iteration": 2.655395269393921 - }, - { - "auxiliary_loss_clip": 0.01094864, - "auxiliary_loss_mlp": 0.01028209, - "balance_loss_clip": 1.03562486, - "balance_loss_mlp": 1.01627064, - "epoch": 0.9833759206373065, - "flos": 21068719032960.0, - "grad_norm": 2.2520856858074594, - "language_loss": 0.73119497, - "learning_rate": 2.8879062950484256e-09, - "loss": 0.75242567, - "num_input_tokens_seen": 352984725, - "step": 16356, - "time_per_iteration": 2.604156017303467 - }, - { - "auxiliary_loss_clip": 0.01082002, - "auxiliary_loss_mlp": 0.01032823, - "balance_loss_clip": 1.03355122, - "balance_loss_mlp": 1.02010965, - "epoch": 0.9834360438899744, - "flos": 18697322108160.0, - "grad_norm": 1.536085672752046, - "language_loss": 0.75979388, - "learning_rate": 2.8670223410041104e-09, - "loss": 0.7809422, - "num_input_tokens_seen": 353003480, - "step": 16357, - "time_per_iteration": 2.685453176498413 - }, - { - "auxiliary_loss_clip": 0.01086973, - "auxiliary_loss_mlp": 0.01028039, - "balance_loss_clip": 1.03633261, - "balance_loss_mlp": 1.01561737, - "epoch": 0.9834961671426424, - "flos": 21105240186240.0, - "grad_norm": 2.377018060234898, - "language_loss": 0.80362308, - "learning_rate": 2.846214118442436e-09, - "loss": 0.82477319, - "num_input_tokens_seen": 353021425, - "step": 16358, - "time_per_iteration": 2.672687292098999 - }, - { - "auxiliary_loss_clip": 0.01095168, - "auxiliary_loss_mlp": 0.01025846, - "balance_loss_clip": 1.03404856, - "balance_loss_mlp": 1.01396728, - "epoch": 0.9835562903953103, - "flos": 26687625511680.0, - "grad_norm": 2.5788189442251848, - "language_loss": 0.67699122, - "learning_rate": 2.8254816281523263e-09, - "loss": 0.69820142, - "num_input_tokens_seen": 353039870, - "step": 16359, - "time_per_iteration": 2.603217601776123 - }, - { - "auxiliary_loss_clip": 0.0110407, - "auxiliary_loss_mlp": 0.01030251, - "balance_loss_clip": 1.03442788, - "balance_loss_mlp": 1.01891446, - "epoch": 0.9836164136479784, - "flos": 22090162089600.0, - "grad_norm": 1.6643082336304196, - "language_loss": 0.69579446, - "learning_rate": 2.804824870920264e-09, - "loss": 0.71713769, - "num_input_tokens_seen": 353059750, - "step": 16360, - "time_per_iteration": 2.590282440185547 - }, - { - "auxiliary_loss_clip": 0.01097129, - "auxiliary_loss_mlp": 0.01035655, - "balance_loss_clip": 1.03531575, - "balance_loss_mlp": 1.02293587, - "epoch": 0.9836765369006463, - "flos": 23878405710720.0, - "grad_norm": 1.804692326953609, - "language_loss": 0.8430177, - "learning_rate": 2.7842438475293996e-09, - "loss": 0.86434555, - "num_input_tokens_seen": 353079940, - "step": 16361, - "time_per_iteration": 2.667570114135742 - }, - { - "auxiliary_loss_clip": 0.01107883, - "auxiliary_loss_mlp": 0.01027313, - "balance_loss_clip": 1.03631568, - "balance_loss_mlp": 1.01540446, - "epoch": 0.9837366601533143, - "flos": 25845017293440.0, - "grad_norm": 1.7879750486860067, - "language_loss": 0.75830048, - "learning_rate": 2.76373855876022e-09, - "loss": 0.77965236, - "num_input_tokens_seen": 353099990, - "step": 16362, - "time_per_iteration": 2.5723037719726562 - }, - { - "auxiliary_loss_clip": 0.01109574, - "auxiliary_loss_mlp": 0.01035763, - "balance_loss_clip": 1.03702784, - "balance_loss_mlp": 1.023103, - "epoch": 0.9837967834059823, - "flos": 21358015171200.0, - "grad_norm": 1.8659793314659867, - "language_loss": 0.71063733, - "learning_rate": 2.7433090053901043e-09, - "loss": 0.73209071, - "num_input_tokens_seen": 353118710, - "step": 16363, - "time_per_iteration": 2.580556631088257 - }, - { - "auxiliary_loss_clip": 0.01083391, - "auxiliary_loss_mlp": 0.01030702, - "balance_loss_clip": 1.03492188, - "balance_loss_mlp": 1.01919198, - "epoch": 0.9838569066586502, - "flos": 18515793749760.0, - "grad_norm": 2.149367136223202, - "language_loss": 0.63062841, - "learning_rate": 2.7229551881937653e-09, - "loss": 0.65176934, - "num_input_tokens_seen": 353136415, - "step": 16364, - "time_per_iteration": 2.6873748302459717 - }, - { - "auxiliary_loss_clip": 0.01071986, - "auxiliary_loss_mlp": 0.01031043, - "balance_loss_clip": 1.04158378, - "balance_loss_mlp": 1.01967084, - "epoch": 0.9839170299113182, - "flos": 22452392793600.0, - "grad_norm": 1.5415718965225467, - "language_loss": 0.75180268, - "learning_rate": 2.702677107943252e-09, - "loss": 0.77283293, - "num_input_tokens_seen": 353154650, - "step": 16365, - "time_per_iteration": 2.7838945388793945 - }, - { - "auxiliary_loss_clip": 0.0106364, - "auxiliary_loss_mlp": 0.01028118, - "balance_loss_clip": 1.03559554, - "balance_loss_mlp": 1.01572597, - "epoch": 0.9839771531639862, - "flos": 27892320779520.0, - "grad_norm": 2.0418627891356365, - "language_loss": 0.76325071, - "learning_rate": 2.6824747654072832e-09, - "loss": 0.78416824, - "num_input_tokens_seen": 353174065, - "step": 16366, - "time_per_iteration": 2.723862886428833 - }, - { - "auxiliary_loss_clip": 0.01105139, - "auxiliary_loss_mlp": 0.01026883, - "balance_loss_clip": 1.03549993, - "balance_loss_mlp": 1.01568365, - "epoch": 0.9840372764166542, - "flos": 28214510797440.0, - "grad_norm": 1.5805895259506346, - "language_loss": 0.77362347, - "learning_rate": 2.662348161352357e-09, - "loss": 0.79494369, - "num_input_tokens_seen": 353193560, - "step": 16367, - "time_per_iteration": 2.6186344623565674 - }, - { - "auxiliary_loss_clip": 0.01085162, - "auxiliary_loss_mlp": 0.01036358, - "balance_loss_clip": 1.0372721, - "balance_loss_mlp": 1.02363229, - "epoch": 0.9840973996693221, - "flos": 23403989854080.0, - "grad_norm": 1.6315530107439746, - "language_loss": 0.6176089, - "learning_rate": 2.642297296540974e-09, - "loss": 0.63882411, - "num_input_tokens_seen": 353213525, - "step": 16368, - "time_per_iteration": 2.7051217555999756 - }, - { - "auxiliary_loss_clip": 0.01093129, - "auxiliary_loss_mlp": 0.0103668, - "balance_loss_clip": 1.03431225, - "balance_loss_mlp": 1.02538538, - "epoch": 0.9841575229219901, - "flos": 21395865127680.0, - "grad_norm": 1.4838055886631645, - "language_loss": 0.65539753, - "learning_rate": 2.6223221717340816e-09, - "loss": 0.67669564, - "num_input_tokens_seen": 353234000, - "step": 16369, - "time_per_iteration": 2.684190273284912 - }, - { - "auxiliary_loss_clip": 0.01098619, - "auxiliary_loss_mlp": 0.00771023, - "balance_loss_clip": 1.03682351, - "balance_loss_mlp": 1.00028467, - "epoch": 0.984217646174658, - "flos": 24464072966400.0, - "grad_norm": 2.1848510788789053, - "language_loss": 0.68529809, - "learning_rate": 2.6024227876886295e-09, - "loss": 0.70399457, - "num_input_tokens_seen": 353254940, - "step": 16370, - "time_per_iteration": 2.690066337585449 - }, - { - "auxiliary_loss_clip": 0.01109517, - "auxiliary_loss_mlp": 0.01035832, - "balance_loss_clip": 1.03602791, - "balance_loss_mlp": 1.02231407, - "epoch": 0.984277769427326, - "flos": 16435057680000.0, - "grad_norm": 1.7624959425131688, - "language_loss": 0.73149407, - "learning_rate": 2.582599145159792e-09, - "loss": 0.75294757, - "num_input_tokens_seen": 353272590, - "step": 16371, - "time_per_iteration": 2.647754669189453 - }, - { - "auxiliary_loss_clip": 0.01019499, - "auxiliary_loss_mlp": 0.01000721, - "balance_loss_clip": 1.00614071, - "balance_loss_mlp": 0.99977916, - "epoch": 0.9843378926799939, - "flos": 64530615288960.0, - "grad_norm": 0.8676443160581451, - "language_loss": 0.65173286, - "learning_rate": 2.562851244898745e-09, - "loss": 0.67193508, - "num_input_tokens_seen": 353334380, - "step": 16372, - "time_per_iteration": 3.1656858921051025 - }, - { - "auxiliary_loss_clip": 0.01095097, - "auxiliary_loss_mlp": 0.01034077, - "balance_loss_clip": 1.03569186, - "balance_loss_mlp": 1.02207279, - "epoch": 0.984398015932662, - "flos": 17382811985280.0, - "grad_norm": 1.8275470136153955, - "language_loss": 0.70694923, - "learning_rate": 2.5431790876544456e-09, - "loss": 0.72824109, - "num_input_tokens_seen": 353351640, - "step": 16373, - "time_per_iteration": 2.658825635910034 - }, - { - "auxiliary_loss_clip": 0.0110683, - "auxiliary_loss_mlp": 0.01030818, - "balance_loss_clip": 1.03669751, - "balance_loss_mlp": 1.01893306, - "epoch": 0.9844581391853299, - "flos": 23879088069120.0, - "grad_norm": 1.8485344334805096, - "language_loss": 0.81536216, - "learning_rate": 2.523582674173186e-09, - "loss": 0.83673871, - "num_input_tokens_seen": 353372555, - "step": 16374, - "time_per_iteration": 2.6585822105407715 - }, - { - "auxiliary_loss_clip": 0.01064423, - "auxiliary_loss_mlp": 0.01034079, - "balance_loss_clip": 1.03823948, - "balance_loss_mlp": 1.02220547, - "epoch": 0.9845182624379979, - "flos": 19865352568320.0, - "grad_norm": 1.693148116934704, - "language_loss": 0.69581914, - "learning_rate": 2.504062005197927e-09, - "loss": 0.71680415, - "num_input_tokens_seen": 353391385, - "step": 16375, - "time_per_iteration": 2.7366557121276855 - }, - { - "auxiliary_loss_clip": 0.01083548, - "auxiliary_loss_mlp": 0.01043522, - "balance_loss_clip": 1.03258562, - "balance_loss_mlp": 1.02908564, - "epoch": 0.9845783856906659, - "flos": 28254659224320.0, - "grad_norm": 2.704312105533632, - "language_loss": 0.81189835, - "learning_rate": 2.484617081468521e-09, - "loss": 0.83316898, - "num_input_tokens_seen": 353411630, - "step": 16376, - "time_per_iteration": 2.695854663848877 - }, - { - "auxiliary_loss_clip": 0.01105113, - "auxiliary_loss_mlp": 0.01036471, - "balance_loss_clip": 1.03517056, - "balance_loss_mlp": 1.02441287, - "epoch": 0.9846385089433338, - "flos": 28328383889280.0, - "grad_norm": 1.6882577755341805, - "language_loss": 0.62188119, - "learning_rate": 2.4652479037228224e-09, - "loss": 0.64329708, - "num_input_tokens_seen": 353432895, - "step": 16377, - "time_per_iteration": 2.655351161956787 - }, - { - "auxiliary_loss_clip": 0.01079528, - "auxiliary_loss_mlp": 0.01034135, - "balance_loss_clip": 1.03616428, - "balance_loss_mlp": 1.02145696, - "epoch": 0.9846986321960018, - "flos": 24316767290880.0, - "grad_norm": 1.743655266311487, - "language_loss": 0.72909814, - "learning_rate": 2.445954472695133e-09, - "loss": 0.75023472, - "num_input_tokens_seen": 353454195, - "step": 16378, - "time_per_iteration": 2.7620902061462402 - }, - { - "auxiliary_loss_clip": 0.01107968, - "auxiliary_loss_mlp": 0.0103452, - "balance_loss_clip": 1.0362848, - "balance_loss_mlp": 1.02246761, - "epoch": 0.9847587554486698, - "flos": 27271999877760.0, - "grad_norm": 1.9591429215255713, - "language_loss": 0.71231186, - "learning_rate": 2.426736789116868e-09, - "loss": 0.73373675, - "num_input_tokens_seen": 353475125, - "step": 16379, - "time_per_iteration": 2.6217269897460938 - }, - { - "auxiliary_loss_clip": 0.01076838, - "auxiliary_loss_mlp": 0.01033584, - "balance_loss_clip": 1.03647435, - "balance_loss_mlp": 1.02120376, - "epoch": 0.9848188787013378, - "flos": 16542717719040.0, - "grad_norm": 1.9414180351359185, - "language_loss": 0.68380785, - "learning_rate": 2.407594853716999e-09, - "loss": 0.70491207, - "num_input_tokens_seen": 353493265, - "step": 16380, - "time_per_iteration": 2.6951489448547363 - }, - { - "auxiliary_loss_clip": 0.01078007, - "auxiliary_loss_mlp": 0.01037173, - "balance_loss_clip": 1.0345068, - "balance_loss_mlp": 1.02463818, - "epoch": 0.9848790019540057, - "flos": 20193647898240.0, - "grad_norm": 2.8812935007679146, - "language_loss": 0.78948879, - "learning_rate": 2.38852866722139e-09, - "loss": 0.81064057, - "num_input_tokens_seen": 353511650, - "step": 16381, - "time_per_iteration": 2.733790159225464 - }, - { - "auxiliary_loss_clip": 0.01095102, - "auxiliary_loss_mlp": 0.01029949, - "balance_loss_clip": 1.03512669, - "balance_loss_mlp": 1.01729441, - "epoch": 0.9849391252066737, - "flos": 28259723041920.0, - "grad_norm": 1.4147052567064669, - "language_loss": 0.82457852, - "learning_rate": 2.3695382303527965e-09, - "loss": 0.84582901, - "num_input_tokens_seen": 353534035, - "step": 16382, - "time_per_iteration": 2.738605499267578 - }, - { - "auxiliary_loss_clip": 0.01081484, - "auxiliary_loss_mlp": 0.01033799, - "balance_loss_clip": 1.03230476, - "balance_loss_mlp": 1.02016735, - "epoch": 0.9849992484593416, - "flos": 22454942659200.0, - "grad_norm": 1.7473709928633554, - "language_loss": 0.74585968, - "learning_rate": 2.3506235438315316e-09, - "loss": 0.76701248, - "num_input_tokens_seen": 353549950, - "step": 16383, - "time_per_iteration": 2.754387378692627 - }, - { - "auxiliary_loss_clip": 0.01064953, - "auxiliary_loss_mlp": 0.01031021, - "balance_loss_clip": 1.03860319, - "balance_loss_mlp": 1.01868236, - "epoch": 0.9850593717120096, - "flos": 34497190656000.0, - "grad_norm": 1.8075355031260896, - "language_loss": 0.66479164, - "learning_rate": 2.3317846083750203e-09, - "loss": 0.68575138, - "num_input_tokens_seen": 353573745, - "step": 16384, - "time_per_iteration": 2.9240455627441406 - }, - { - "auxiliary_loss_clip": 0.01090885, - "auxiliary_loss_mlp": 0.01035183, - "balance_loss_clip": 1.03831267, - "balance_loss_mlp": 1.02080083, - "epoch": 0.9851194949646775, - "flos": 38837282152320.0, - "grad_norm": 1.832467391931212, - "language_loss": 0.70495671, - "learning_rate": 2.313021424697359e-09, - "loss": 0.72621739, - "num_input_tokens_seen": 353595335, - "step": 16385, - "time_per_iteration": 6.049696922302246 - }, - { - "auxiliary_loss_clip": 0.01090368, - "auxiliary_loss_mlp": 0.01031869, - "balance_loss_clip": 1.03864157, - "balance_loss_mlp": 1.01980531, - "epoch": 0.9851796182173456, - "flos": 17712436118400.0, - "grad_norm": 2.4314123145549336, - "language_loss": 0.81251216, - "learning_rate": 2.294333993509978e-09, - "loss": 0.83373451, - "num_input_tokens_seen": 353614270, - "step": 16386, - "time_per_iteration": 2.663780689239502 - }, - { - "auxiliary_loss_clip": 0.01079909, - "auxiliary_loss_mlp": 0.01031709, - "balance_loss_clip": 1.03440416, - "balance_loss_mlp": 1.01883996, - "epoch": 0.9852397414700135, - "flos": 27454318335360.0, - "grad_norm": 1.9863892677340445, - "language_loss": 0.67923307, - "learning_rate": 2.2757223155216442e-09, - "loss": 0.70034921, - "num_input_tokens_seen": 353634900, - "step": 16387, - "time_per_iteration": 2.7573816776275635 - }, - { - "auxiliary_loss_clip": 0.01089839, - "auxiliary_loss_mlp": 0.00769242, - "balance_loss_clip": 1.03422558, - "balance_loss_mlp": 1.00012159, - "epoch": 0.9852998647226815, - "flos": 18296702743680.0, - "grad_norm": 1.7527242127962226, - "language_loss": 0.74020231, - "learning_rate": 2.257186391438237e-09, - "loss": 0.75879306, - "num_input_tokens_seen": 353652890, - "step": 16388, - "time_per_iteration": 2.6196138858795166 - }, - { - "auxiliary_loss_clip": 0.01089517, - "auxiliary_loss_mlp": 0.01032434, - "balance_loss_clip": 1.03372729, - "balance_loss_mlp": 1.02051258, - "epoch": 0.9853599879753495, - "flos": 19642562461440.0, - "grad_norm": 1.885399673475778, - "language_loss": 0.82288045, - "learning_rate": 2.238726221962528e-09, - "loss": 0.8441, - "num_input_tokens_seen": 353671295, - "step": 16389, - "time_per_iteration": 4.203902959823608 - }, - { - "auxiliary_loss_clip": 0.01086383, - "auxiliary_loss_mlp": 0.00770398, - "balance_loss_clip": 1.03422093, - "balance_loss_mlp": 1.00023174, - "epoch": 0.9854201112280174, - "flos": 23841956384640.0, - "grad_norm": 2.246145821478881, - "language_loss": 0.67169315, - "learning_rate": 2.2203418077946234e-09, - "loss": 0.69026095, - "num_input_tokens_seen": 353690560, - "step": 16390, - "time_per_iteration": 2.7021732330322266 - }, - { - "auxiliary_loss_clip": 0.01070253, - "auxiliary_loss_mlp": 0.01034626, - "balance_loss_clip": 1.03694236, - "balance_loss_mlp": 1.02117944, - "epoch": 0.9854802344806854, - "flos": 30080573233920.0, - "grad_norm": 1.5706472092274895, - "language_loss": 0.77193004, - "learning_rate": 2.2020331496312994e-09, - "loss": 0.79297888, - "num_input_tokens_seen": 353710660, - "step": 16391, - "time_per_iteration": 2.763343572616577 - }, - { - "auxiliary_loss_clip": 0.01066236, - "auxiliary_loss_mlp": 0.00769461, - "balance_loss_clip": 1.03303838, - "balance_loss_mlp": 1.00014699, - "epoch": 0.9855403577333534, - "flos": 21907412668800.0, - "grad_norm": 2.034127349616756, - "language_loss": 0.6821295, - "learning_rate": 2.1838002481673333e-09, - "loss": 0.70048642, - "num_input_tokens_seen": 353730440, - "step": 16392, - "time_per_iteration": 2.741312026977539 - }, - { - "auxiliary_loss_clip": 0.01076854, - "auxiliary_loss_mlp": 0.01030267, - "balance_loss_clip": 1.0342617, - "balance_loss_mlp": 1.0166111, - "epoch": 0.9856004809860214, - "flos": 15413794191360.0, - "grad_norm": 2.041115164847186, - "language_loss": 0.55706286, - "learning_rate": 2.1656431040937286e-09, - "loss": 0.57813406, - "num_input_tokens_seen": 353748360, - "step": 16393, - "time_per_iteration": 2.6840660572052 - }, - { - "auxiliary_loss_clip": 0.01074406, - "auxiliary_loss_mlp": 0.01031603, - "balance_loss_clip": 1.0325073, - "balance_loss_mlp": 1.01706505, - "epoch": 0.9856606042386893, - "flos": 13653201064320.0, - "grad_norm": 2.576410490354787, - "language_loss": 0.79111844, - "learning_rate": 2.1475617180990444e-09, - "loss": 0.81217849, - "num_input_tokens_seen": 353760880, - "step": 16394, - "time_per_iteration": 2.683983087539673 - }, - { - "auxiliary_loss_clip": 0.0109509, - "auxiliary_loss_mlp": 0.01033335, - "balance_loss_clip": 1.03339911, - "balance_loss_mlp": 1.02005494, - "epoch": 0.9857207274913573, - "flos": 23479151063040.0, - "grad_norm": 1.5070932402692028, - "language_loss": 0.76305884, - "learning_rate": 2.129556090869178e-09, - "loss": 0.78434312, - "num_input_tokens_seen": 353782255, - "step": 16395, - "time_per_iteration": 2.694324254989624 - }, - { - "auxiliary_loss_clip": 0.01094719, - "auxiliary_loss_mlp": 0.01031611, - "balance_loss_clip": 1.03447509, - "balance_loss_mlp": 1.01911831, - "epoch": 0.9857808507440252, - "flos": 21065486808960.0, - "grad_norm": 1.9132501064588425, - "language_loss": 0.7550149, - "learning_rate": 2.1116262230866933e-09, - "loss": 0.77627826, - "num_input_tokens_seen": 353803580, - "step": 16396, - "time_per_iteration": 2.6768436431884766 - }, - { - "auxiliary_loss_clip": 0.01070405, - "auxiliary_loss_mlp": 0.01026163, - "balance_loss_clip": 1.03497076, - "balance_loss_mlp": 1.01392639, - "epoch": 0.9858409739966932, - "flos": 25301365971840.0, - "grad_norm": 1.5225711689164605, - "language_loss": 0.7122134, - "learning_rate": 2.0937721154317133e-09, - "loss": 0.73317909, - "num_input_tokens_seen": 353824200, - "step": 16397, - "time_per_iteration": 2.7246475219726562 - }, - { - "auxiliary_loss_clip": 0.01081841, - "auxiliary_loss_mlp": 0.01028049, - "balance_loss_clip": 1.0351944, - "balance_loss_mlp": 1.01624179, - "epoch": 0.9859010972493611, - "flos": 20558751690240.0, - "grad_norm": 1.7750069543692049, - "language_loss": 0.7137388, - "learning_rate": 2.0759937685810304e-09, - "loss": 0.73483771, - "num_input_tokens_seen": 353843350, - "step": 16398, - "time_per_iteration": 2.6708950996398926 - }, - { - "auxiliary_loss_clip": 0.0106975, - "auxiliary_loss_mlp": 0.01026745, - "balance_loss_clip": 1.03256011, - "balance_loss_mlp": 1.01534224, - "epoch": 0.9859612205020292, - "flos": 24754985216640.0, - "grad_norm": 2.8400971198256215, - "language_loss": 0.73956269, - "learning_rate": 2.058291183208771e-09, - "loss": 0.76052767, - "num_input_tokens_seen": 353864520, - "step": 16399, - "time_per_iteration": 2.7505059242248535 - }, - { - "auxiliary_loss_clip": 0.01107815, - "auxiliary_loss_mlp": 0.01030039, - "balance_loss_clip": 1.03546059, - "balance_loss_mlp": 1.01738548, - "epoch": 0.9860213437546971, - "flos": 21105850717440.0, - "grad_norm": 2.280806532195227, - "language_loss": 0.57755244, - "learning_rate": 2.0406643599863993e-09, - "loss": 0.59893095, - "num_input_tokens_seen": 353882240, - "step": 16400, - "time_per_iteration": 2.5837459564208984 - }, - { - "auxiliary_loss_clip": 0.01087543, - "auxiliary_loss_mlp": 0.01029777, - "balance_loss_clip": 1.03587925, - "balance_loss_mlp": 1.01624036, - "epoch": 0.9860814670073651, - "flos": 19136078737920.0, - "grad_norm": 7.9161501077476775, - "language_loss": 0.80533803, - "learning_rate": 2.023113299582491e-09, - "loss": 0.82651126, - "num_input_tokens_seen": 353901590, - "step": 16401, - "time_per_iteration": 2.676846742630005 - }, - { - "auxiliary_loss_clip": 0.01095929, - "auxiliary_loss_mlp": 0.01033178, - "balance_loss_clip": 1.03656411, - "balance_loss_mlp": 1.02002931, - "epoch": 0.9861415902600331, - "flos": 17237050594560.0, - "grad_norm": 1.9620055100104796, - "language_loss": 0.77909809, - "learning_rate": 2.005638002662069e-09, - "loss": 0.80038917, - "num_input_tokens_seen": 353918785, - "step": 16402, - "time_per_iteration": 2.580324172973633 - }, - { - "auxiliary_loss_clip": 0.01099134, - "auxiliary_loss_mlp": 0.01035077, - "balance_loss_clip": 1.03702092, - "balance_loss_mlp": 1.02319241, - "epoch": 0.986201713512701, - "flos": 27782577751680.0, - "grad_norm": 1.8385726831624305, - "language_loss": 0.69819796, - "learning_rate": 1.9882384698881596e-09, - "loss": 0.71954, - "num_input_tokens_seen": 353940390, - "step": 16403, - "time_per_iteration": 2.6051719188690186 - }, - { - "auxiliary_loss_clip": 0.01092549, - "auxiliary_loss_mlp": 0.0102806, - "balance_loss_clip": 1.03301835, - "balance_loss_mlp": 1.01602554, - "epoch": 0.986261836765369, - "flos": 28730403884160.0, - "grad_norm": 2.0540712691142, - "language_loss": 0.74826646, - "learning_rate": 1.9709147019204566e-09, - "loss": 0.76947248, - "num_input_tokens_seen": 353962180, - "step": 16404, - "time_per_iteration": 2.6757051944732666 - }, - { - "auxiliary_loss_clip": 0.01096235, - "auxiliary_loss_mlp": 0.00769718, - "balance_loss_clip": 1.03480124, - "balance_loss_mlp": 1.00010228, - "epoch": 0.986321960018037, - "flos": 34313471568000.0, - "grad_norm": 1.7889327818353045, - "language_loss": 0.69631529, - "learning_rate": 1.953666699415768e-09, - "loss": 0.71497488, - "num_input_tokens_seen": 353984305, - "step": 16405, - "time_per_iteration": 2.7109172344207764 - }, - { - "auxiliary_loss_clip": 0.01085878, - "auxiliary_loss_mlp": 0.01034898, - "balance_loss_clip": 1.03745413, - "balance_loss_mlp": 1.02344775, - "epoch": 0.986382083270705, - "flos": 25189755436800.0, - "grad_norm": 1.6951529246514412, - "language_loss": 0.69703031, - "learning_rate": 1.93649446302846e-09, - "loss": 0.718238, - "num_input_tokens_seen": 354004495, - "step": 16406, - "time_per_iteration": 2.725384473800659 - }, - { - "auxiliary_loss_clip": 0.01049811, - "auxiliary_loss_mlp": 0.01032731, - "balance_loss_clip": 1.03564012, - "balance_loss_mlp": 1.02081645, - "epoch": 0.9864422065233729, - "flos": 11025904671360.0, - "grad_norm": 3.370275127153346, - "language_loss": 0.74895245, - "learning_rate": 1.9193979934095663e-09, - "loss": 0.76977789, - "num_input_tokens_seen": 354015985, - "step": 16407, - "time_per_iteration": 2.711702585220337 - }, - { - "auxiliary_loss_clip": 0.01083953, - "auxiliary_loss_mlp": 0.01030883, - "balance_loss_clip": 1.03477526, - "balance_loss_mlp": 1.01853251, - "epoch": 0.9865023297760409, - "flos": 16545590807040.0, - "grad_norm": 2.111055087475785, - "language_loss": 0.77460712, - "learning_rate": 1.9023772912072357e-09, - "loss": 0.79575551, - "num_input_tokens_seen": 354033260, - "step": 16408, - "time_per_iteration": 2.593550443649292 - }, - { - "auxiliary_loss_clip": 0.01101693, - "auxiliary_loss_mlp": 0.01032331, - "balance_loss_clip": 1.0380075, - "balance_loss_mlp": 1.01906323, - "epoch": 0.9865624530287088, - "flos": 18880179269760.0, - "grad_norm": 1.9451476197970003, - "language_loss": 0.68269604, - "learning_rate": 1.8854323570669515e-09, - "loss": 0.70403636, - "num_input_tokens_seen": 354052825, - "step": 16409, - "time_per_iteration": 2.587090492248535 - }, - { - "auxiliary_loss_clip": 0.01011915, - "auxiliary_loss_mlp": 0.01002193, - "balance_loss_clip": 1.00871253, - "balance_loss_mlp": 1.00125718, - "epoch": 0.9866225762813768, - "flos": 68887798680960.0, - "grad_norm": 0.806349802754998, - "language_loss": 0.61002564, - "learning_rate": 1.8685631916313118e-09, - "loss": 0.63016677, - "num_input_tokens_seen": 354113920, - "step": 16410, - "time_per_iteration": 3.278089761734009 - }, - { - "auxiliary_loss_clip": 0.0109769, - "auxiliary_loss_mlp": 0.01032935, - "balance_loss_clip": 1.03702283, - "balance_loss_mlp": 1.02077615, - "epoch": 0.9866826995340447, - "flos": 29023111814400.0, - "grad_norm": 3.0120361411647005, - "language_loss": 0.65963012, - "learning_rate": 1.8517697955400258e-09, - "loss": 0.68093634, - "num_input_tokens_seen": 354134210, - "step": 16411, - "time_per_iteration": 2.632351875305176 - }, - { - "auxiliary_loss_clip": 0.01027186, - "auxiliary_loss_mlp": 0.0100133, - "balance_loss_clip": 1.00486875, - "balance_loss_mlp": 1.00040567, - "epoch": 0.9867428227867128, - "flos": 65376814867200.0, - "grad_norm": 0.7224745052479478, - "language_loss": 0.56279814, - "learning_rate": 1.8350521694299182e-09, - "loss": 0.58308327, - "num_input_tokens_seen": 354198010, - "step": 16412, - "time_per_iteration": 3.1897354125976562 - }, - { - "auxiliary_loss_clip": 0.01079312, - "auxiliary_loss_mlp": 0.01032576, - "balance_loss_clip": 1.0352391, - "balance_loss_mlp": 1.01961815, - "epoch": 0.9868029460393807, - "flos": 26506312634880.0, - "grad_norm": 2.0935942074241685, - "language_loss": 0.72890359, - "learning_rate": 1.818410313934926e-09, - "loss": 0.75002241, - "num_input_tokens_seen": 354220000, - "step": 16413, - "time_per_iteration": 2.710663080215454 - }, - { - "auxiliary_loss_clip": 0.01060652, - "auxiliary_loss_mlp": 0.01030308, - "balance_loss_clip": 1.03323412, - "balance_loss_mlp": 1.01750505, - "epoch": 0.9868630692920487, - "flos": 22967280299520.0, - "grad_norm": 2.0312404595944664, - "language_loss": 0.71431053, - "learning_rate": 1.8018442296858782e-09, - "loss": 0.73522013, - "num_input_tokens_seen": 354240910, - "step": 16414, - "time_per_iteration": 2.7031588554382324 - }, - { - "auxiliary_loss_clip": 0.01089485, - "auxiliary_loss_mlp": 0.01036435, - "balance_loss_clip": 1.03525519, - "balance_loss_mlp": 1.02446055, - "epoch": 0.9869231925447167, - "flos": 19828687760640.0, - "grad_norm": 1.5435516461575216, - "language_loss": 0.7039904, - "learning_rate": 1.7853539173111608e-09, - "loss": 0.72524959, - "num_input_tokens_seen": 354259430, - "step": 16415, - "time_per_iteration": 2.702089309692383 - }, - { - "auxiliary_loss_clip": 0.01066346, - "auxiliary_loss_mlp": 0.01032014, - "balance_loss_clip": 1.0336014, - "balance_loss_mlp": 1.02089763, - "epoch": 0.9869833157973846, - "flos": 20195228096640.0, - "grad_norm": 2.9079123838637604, - "language_loss": 0.75465488, - "learning_rate": 1.7689393774362737e-09, - "loss": 0.77563846, - "num_input_tokens_seen": 354279490, - "step": 16416, - "time_per_iteration": 2.703504800796509 - }, - { - "auxiliary_loss_clip": 0.0108217, - "auxiliary_loss_mlp": 0.01030102, - "balance_loss_clip": 1.03591037, - "balance_loss_mlp": 1.01787734, - "epoch": 0.9870434390500527, - "flos": 16099507802880.0, - "grad_norm": 2.1800846259576216, - "language_loss": 0.70927489, - "learning_rate": 1.7526006106833858e-09, - "loss": 0.7303977, - "num_input_tokens_seen": 354295080, - "step": 16417, - "time_per_iteration": 2.694063663482666 - }, - { - "auxiliary_loss_clip": 0.01087544, - "auxiliary_loss_mlp": 0.0103492, - "balance_loss_clip": 1.03795171, - "balance_loss_mlp": 1.02209926, - "epoch": 0.9871035623027206, - "flos": 21760753438080.0, - "grad_norm": 1.6696226113868622, - "language_loss": 0.70512295, - "learning_rate": 1.7363376176720013e-09, - "loss": 0.72634757, - "num_input_tokens_seen": 354314610, - "step": 16418, - "time_per_iteration": 2.7078118324279785 - }, - { - "auxiliary_loss_clip": 0.01027164, - "auxiliary_loss_mlp": 0.01000807, - "balance_loss_clip": 1.00479984, - "balance_loss_mlp": 0.99982989, - "epoch": 0.9871636855553886, - "flos": 70219583245440.0, - "grad_norm": 0.658515497705567, - "language_loss": 0.53645599, - "learning_rate": 1.7201503990189603e-09, - "loss": 0.55673575, - "num_input_tokens_seen": 354383115, - "step": 16419, - "time_per_iteration": 3.2428295612335205 - }, - { - "auxiliary_loss_clip": 0.01087155, - "auxiliary_loss_mlp": 0.0104011, - "balance_loss_clip": 1.03351521, - "balance_loss_mlp": 1.0263052, - "epoch": 0.9872238088080565, - "flos": 25045825639680.0, - "grad_norm": 2.1175189547094457, - "language_loss": 0.77917069, - "learning_rate": 1.7040389553382162e-09, - "loss": 0.80044335, - "num_input_tokens_seen": 354403115, - "step": 16420, - "time_per_iteration": 2.6854612827301025 - }, - { - "auxiliary_loss_clip": 0.01071773, - "auxiliary_loss_mlp": 0.01029772, - "balance_loss_clip": 1.03993893, - "balance_loss_mlp": 1.01702881, - "epoch": 0.9872839320607245, - "flos": 19465846525440.0, - "grad_norm": 2.3612194130787505, - "language_loss": 0.70805871, - "learning_rate": 1.6880032872403916e-09, - "loss": 0.72907424, - "num_input_tokens_seen": 354424520, - "step": 16421, - "time_per_iteration": 2.7082440853118896 - }, - { - "auxiliary_loss_clip": 0.01100703, - "auxiliary_loss_mlp": 0.01035927, - "balance_loss_clip": 1.03684855, - "balance_loss_mlp": 1.02248001, - "epoch": 0.9873440553133924, - "flos": 26942914448640.0, - "grad_norm": 2.428011594135223, - "language_loss": 0.82735991, - "learning_rate": 1.6720433953338886e-09, - "loss": 0.84872615, - "num_input_tokens_seen": 354444800, - "step": 16422, - "time_per_iteration": 2.6437931060791016 - }, - { - "auxiliary_loss_clip": 0.0107317, - "auxiliary_loss_mlp": 0.01028669, - "balance_loss_clip": 1.03409743, - "balance_loss_mlp": 1.0163486, - "epoch": 0.9874041785660604, - "flos": 19062210418560.0, - "grad_norm": 1.6808127811152613, - "language_loss": 0.86108935, - "learning_rate": 1.656159280223779e-09, - "loss": 0.88210779, - "num_input_tokens_seen": 354464590, - "step": 16423, - "time_per_iteration": 2.7554445266723633 - }, - { - "auxiliary_loss_clip": 0.01100362, - "auxiliary_loss_mlp": 0.01026203, - "balance_loss_clip": 1.03655839, - "balance_loss_mlp": 1.01384747, - "epoch": 0.9874643018187284, - "flos": 21105814803840.0, - "grad_norm": 2.086841049232087, - "language_loss": 0.70854056, - "learning_rate": 1.6403509425122475e-09, - "loss": 0.72980618, - "num_input_tokens_seen": 354484145, - "step": 16424, - "time_per_iteration": 7.414201736450195 - }, - { - "auxiliary_loss_clip": 0.01097827, - "auxiliary_loss_mlp": 0.00769696, - "balance_loss_clip": 1.03443944, - "balance_loss_mlp": 1.00012803, - "epoch": 0.9875244250713964, - "flos": 24426043441920.0, - "grad_norm": 2.386486368838744, - "language_loss": 0.80787611, - "learning_rate": 1.6246183827990366e-09, - "loss": 0.82655132, - "num_input_tokens_seen": 354502475, - "step": 16425, - "time_per_iteration": 2.6806702613830566 - }, - { - "auxiliary_loss_clip": 0.0105599, - "auxiliary_loss_mlp": 0.01033218, - "balance_loss_clip": 1.03098166, - "balance_loss_mlp": 1.01937222, - "epoch": 0.9875845483240643, - "flos": 25117610970240.0, - "grad_norm": 1.8226222464901614, - "language_loss": 0.79747486, - "learning_rate": 1.6089616016803364e-09, - "loss": 0.81836694, - "num_input_tokens_seen": 354521855, - "step": 16426, - "time_per_iteration": 2.931814432144165 - }, - { - "auxiliary_loss_clip": 0.01099233, - "auxiliary_loss_mlp": 0.01035808, - "balance_loss_clip": 1.03837609, - "balance_loss_mlp": 1.02355909, - "epoch": 0.9876446715767323, - "flos": 16581788737920.0, - "grad_norm": 1.762658511590331, - "language_loss": 0.84837222, - "learning_rate": 1.593380599750338e-09, - "loss": 0.8697226, - "num_input_tokens_seen": 354539535, - "step": 16427, - "time_per_iteration": 2.615586042404175 - }, - { - "auxiliary_loss_clip": 0.01107577, - "auxiliary_loss_mlp": 0.01031141, - "balance_loss_clip": 1.03742325, - "balance_loss_mlp": 1.01907742, - "epoch": 0.9877047948294003, - "flos": 21616141282560.0, - "grad_norm": 1.7238113053204014, - "language_loss": 0.70466417, - "learning_rate": 1.577875377599458e-09, - "loss": 0.72605133, - "num_input_tokens_seen": 354557430, - "step": 16428, - "time_per_iteration": 2.5831527709960938 - }, - { - "auxiliary_loss_clip": 0.01068786, - "auxiliary_loss_mlp": 0.01032334, - "balance_loss_clip": 1.03269923, - "balance_loss_mlp": 1.02058625, - "epoch": 0.9877649180820682, - "flos": 21178497974400.0, - "grad_norm": 3.804683550860071, - "language_loss": 0.79990548, - "learning_rate": 1.5624459358158926e-09, - "loss": 0.82091671, - "num_input_tokens_seen": 354574735, - "step": 16429, - "time_per_iteration": 4.215754270553589 - }, - { - "auxiliary_loss_clip": 0.01106379, - "auxiliary_loss_mlp": 0.01030944, - "balance_loss_clip": 1.03527224, - "balance_loss_mlp": 1.01933873, - "epoch": 0.9878250413347363, - "flos": 39749233576320.0, - "grad_norm": 1.5905003981287011, - "language_loss": 0.6204477, - "learning_rate": 1.5470922749845073e-09, - "loss": 0.64182091, - "num_input_tokens_seen": 354597050, - "step": 16430, - "time_per_iteration": 2.7417891025543213 - }, - { - "auxiliary_loss_clip": 0.01109651, - "auxiliary_loss_mlp": 0.01032938, - "balance_loss_clip": 1.03770876, - "balance_loss_mlp": 1.02093995, - "epoch": 0.9878851645874042, - "flos": 29425634599680.0, - "grad_norm": 2.4386034001427848, - "language_loss": 0.73058724, - "learning_rate": 1.531814395687725e-09, - "loss": 0.75201309, - "num_input_tokens_seen": 354619095, - "step": 16431, - "time_per_iteration": 2.6387763023376465 - }, - { - "auxiliary_loss_clip": 0.01109492, - "auxiliary_loss_mlp": 0.01033474, - "balance_loss_clip": 1.03863847, - "balance_loss_mlp": 1.02136791, - "epoch": 0.9879452878400722, - "flos": 15806261168640.0, - "grad_norm": 2.1754704610847115, - "language_loss": 0.804088, - "learning_rate": 1.5166122985048602e-09, - "loss": 0.82551765, - "num_input_tokens_seen": 354633790, - "step": 16432, - "time_per_iteration": 2.59206485748291 - }, - { - "auxiliary_loss_clip": 0.01092115, - "auxiliary_loss_mlp": 0.01029277, - "balance_loss_clip": 1.0342344, - "balance_loss_mlp": 1.01833928, - "epoch": 0.9880054110927401, - "flos": 22233912318720.0, - "grad_norm": 1.600766850259046, - "language_loss": 0.80293298, - "learning_rate": 1.5014859840123405e-09, - "loss": 0.82414687, - "num_input_tokens_seen": 354653180, - "step": 16433, - "time_per_iteration": 2.705249071121216 - }, - { - "auxiliary_loss_clip": 0.01105179, - "auxiliary_loss_mlp": 0.01033969, - "balance_loss_clip": 1.03657746, - "balance_loss_mlp": 1.02180386, - "epoch": 0.9880655343454081, - "flos": 28763836467840.0, - "grad_norm": 2.2504734279341543, - "language_loss": 0.6503619, - "learning_rate": 1.4864354527837075e-09, - "loss": 0.67175341, - "num_input_tokens_seen": 354669900, - "step": 16434, - "time_per_iteration": 2.5459141731262207 - }, - { - "auxiliary_loss_clip": 0.01097534, - "auxiliary_loss_mlp": 0.01033458, - "balance_loss_clip": 1.03373504, - "balance_loss_mlp": 1.0204587, - "epoch": 0.988125657598076, - "flos": 32853379622400.0, - "grad_norm": 1.6032981258064614, - "language_loss": 0.69355786, - "learning_rate": 1.4714607053896154e-09, - "loss": 0.71486771, - "num_input_tokens_seen": 354693165, - "step": 16435, - "time_per_iteration": 2.691948652267456 - }, - { - "auxiliary_loss_clip": 0.01051732, - "auxiliary_loss_mlp": 0.01037505, - "balance_loss_clip": 1.03534651, - "balance_loss_mlp": 1.02496445, - "epoch": 0.988185780850744, - "flos": 19390685316480.0, - "grad_norm": 1.6101555042177864, - "language_loss": 0.75285351, - "learning_rate": 1.4565617423980548e-09, - "loss": 0.77374589, - "num_input_tokens_seen": 354711915, - "step": 16436, - "time_per_iteration": 2.687253475189209 - }, - { - "auxiliary_loss_clip": 0.01078926, - "auxiliary_loss_mlp": 0.01034051, - "balance_loss_clip": 1.03449368, - "balance_loss_mlp": 1.02073479, - "epoch": 0.988245904103412, - "flos": 22528415928960.0, - "grad_norm": 2.1049247557685486, - "language_loss": 0.7397666, - "learning_rate": 1.4417385643741286e-09, - "loss": 0.76089633, - "num_input_tokens_seen": 354729135, - "step": 16437, - "time_per_iteration": 2.6133415699005127 - }, - { - "auxiliary_loss_clip": 0.01070653, - "auxiliary_loss_mlp": 0.01032545, - "balance_loss_clip": 1.03327727, - "balance_loss_mlp": 1.02036154, - "epoch": 0.98830602735608, - "flos": 28659193171200.0, - "grad_norm": 1.7371031624510076, - "language_loss": 0.60138786, - "learning_rate": 1.4269911718796103e-09, - "loss": 0.62241983, - "num_input_tokens_seen": 354752530, - "step": 16438, - "time_per_iteration": 2.747478485107422 - }, - { - "auxiliary_loss_clip": 0.01082521, - "auxiliary_loss_mlp": 0.01033624, - "balance_loss_clip": 1.03334987, - "balance_loss_mlp": 1.02030826, - "epoch": 0.9883661506087479, - "flos": 20996035862400.0, - "grad_norm": 1.7884546303638278, - "language_loss": 0.71630102, - "learning_rate": 1.4123195654738295e-09, - "loss": 0.7374624, - "num_input_tokens_seen": 354771135, - "step": 16439, - "time_per_iteration": 2.64829158782959 - }, - { - "auxiliary_loss_clip": 0.01094806, - "auxiliary_loss_mlp": 0.01032761, - "balance_loss_clip": 1.03649998, - "balance_loss_mlp": 1.02029228, - "epoch": 0.9884262738614159, - "flos": 32706109860480.0, - "grad_norm": 1.9552284330659928, - "language_loss": 0.60129845, - "learning_rate": 1.3977237457134528e-09, - "loss": 0.62257409, - "num_input_tokens_seen": 354791800, - "step": 16440, - "time_per_iteration": 2.709625482559204 - }, - { - "auxiliary_loss_clip": 0.01109217, - "auxiliary_loss_mlp": 0.0103132, - "balance_loss_clip": 1.03572154, - "balance_loss_mlp": 1.01920807, - "epoch": 0.9884863971140839, - "flos": 17564699479680.0, - "grad_norm": 2.3996756667882346, - "language_loss": 0.76234657, - "learning_rate": 1.3832037131513707e-09, - "loss": 0.78375196, - "num_input_tokens_seen": 354809200, - "step": 16441, - "time_per_iteration": 2.5174717903137207 - }, - { - "auxiliary_loss_clip": 0.01084665, - "auxiliary_loss_mlp": 0.0102841, - "balance_loss_clip": 1.03476977, - "balance_loss_mlp": 1.0158329, - "epoch": 0.9885465203667518, - "flos": 40552519380480.0, - "grad_norm": 1.8936887176516917, - "language_loss": 0.67978179, - "learning_rate": 1.3687594683386982e-09, - "loss": 0.70091248, - "num_input_tokens_seen": 354829945, - "step": 16442, - "time_per_iteration": 2.780667781829834 - }, - { - "auxiliary_loss_clip": 0.01094262, - "auxiliary_loss_mlp": 0.01030255, - "balance_loss_clip": 1.03508973, - "balance_loss_mlp": 1.01828051, - "epoch": 0.9886066436194199, - "flos": 13807976768640.0, - "grad_norm": 2.546287070023655, - "language_loss": 0.74541289, - "learning_rate": 1.3543910118227753e-09, - "loss": 0.76665807, - "num_input_tokens_seen": 354845055, - "step": 16443, - "time_per_iteration": 2.5256857872009277 - }, - { - "auxiliary_loss_clip": 0.01085844, - "auxiliary_loss_mlp": 0.01029451, - "balance_loss_clip": 1.03409505, - "balance_loss_mlp": 1.01652241, - "epoch": 0.9886667668720878, - "flos": 23325129544320.0, - "grad_norm": 6.700934436882059, - "language_loss": 0.73816478, - "learning_rate": 1.3400983441487213e-09, - "loss": 0.75931776, - "num_input_tokens_seen": 354864680, - "step": 16444, - "time_per_iteration": 2.6347739696502686 - }, - { - "auxiliary_loss_clip": 0.01058824, - "auxiliary_loss_mlp": 0.0103504, - "balance_loss_clip": 1.03483725, - "balance_loss_mlp": 1.02182567, - "epoch": 0.9887268901247558, - "flos": 22706029704960.0, - "grad_norm": 2.0399337200236483, - "language_loss": 0.69289607, - "learning_rate": 1.325881465858547e-09, - "loss": 0.7138347, - "num_input_tokens_seen": 354885685, - "step": 16445, - "time_per_iteration": 2.7339391708374023 - }, - { - "auxiliary_loss_clip": 0.01101302, - "auxiliary_loss_mlp": 0.01026048, - "balance_loss_clip": 1.03817463, - "balance_loss_mlp": 1.01369166, - "epoch": 0.9887870133774237, - "flos": 13041283944960.0, - "grad_norm": 2.484106533550889, - "language_loss": 0.60372651, - "learning_rate": 1.311740377491155e-09, - "loss": 0.625, - "num_input_tokens_seen": 354901505, - "step": 16446, - "time_per_iteration": 2.571403980255127 - }, - { - "auxiliary_loss_clip": 0.01080619, - "auxiliary_loss_mlp": 0.01032334, - "balance_loss_clip": 1.03539968, - "balance_loss_mlp": 1.02072275, - "epoch": 0.9888471366300917, - "flos": 15158864390400.0, - "grad_norm": 2.54961121966749, - "language_loss": 0.71147966, - "learning_rate": 1.297675079582783e-09, - "loss": 0.73260915, - "num_input_tokens_seen": 354920060, - "step": 16447, - "time_per_iteration": 2.6204898357391357 - }, - { - "auxiliary_loss_clip": 0.01106743, - "auxiliary_loss_mlp": 0.00769349, - "balance_loss_clip": 1.03625035, - "balance_loss_mlp": 1.00023174, - "epoch": 0.9889072598827596, - "flos": 25118796119040.0, - "grad_norm": 2.22311895255621, - "language_loss": 0.83816832, - "learning_rate": 1.2836855726667818e-09, - "loss": 0.85692918, - "num_input_tokens_seen": 354938690, - "step": 16448, - "time_per_iteration": 2.615037679672241 - }, - { - "auxiliary_loss_clip": 0.01093774, - "auxiliary_loss_mlp": 0.01028295, - "balance_loss_clip": 1.03621387, - "balance_loss_mlp": 1.0171665, - "epoch": 0.9889673831354276, - "flos": 16728663450240.0, - "grad_norm": 1.5811514661156387, - "language_loss": 0.69698024, - "learning_rate": 1.26977185727406e-09, - "loss": 0.71820092, - "num_input_tokens_seen": 354956955, - "step": 16449, - "time_per_iteration": 2.5541889667510986 - }, - { - "auxiliary_loss_clip": 0.0109972, - "auxiliary_loss_mlp": 0.0102696, - "balance_loss_clip": 1.03743207, - "balance_loss_mlp": 1.01456869, - "epoch": 0.9890275063880956, - "flos": 35585175657600.0, - "grad_norm": 2.2330985869575106, - "language_loss": 0.7364139, - "learning_rate": 1.25593393393153e-09, - "loss": 0.75768065, - "num_input_tokens_seen": 354976800, - "step": 16450, - "time_per_iteration": 2.722463846206665 - }, - { - "auxiliary_loss_clip": 0.01108427, - "auxiliary_loss_mlp": 0.01032028, - "balance_loss_clip": 1.0343194, - "balance_loss_mlp": 1.01945782, - "epoch": 0.9890876296407636, - "flos": 18952359649920.0, - "grad_norm": 1.7405814084688636, - "language_loss": 0.79538721, - "learning_rate": 1.242171803164549e-09, - "loss": 0.81679177, - "num_input_tokens_seen": 354996625, - "step": 16451, - "time_per_iteration": 2.5799307823181152 - }, - { - "auxiliary_loss_clip": 0.01072025, - "auxiliary_loss_mlp": 0.01037826, - "balance_loss_clip": 1.03292084, - "balance_loss_mlp": 1.02433717, - "epoch": 0.9891477528934315, - "flos": 23769309127680.0, - "grad_norm": 2.076913559177625, - "language_loss": 0.70177102, - "learning_rate": 1.2284854654946996e-09, - "loss": 0.72286958, - "num_input_tokens_seen": 355014535, - "step": 16452, - "time_per_iteration": 2.6568350791931152 - }, - { - "auxiliary_loss_clip": 0.01106285, - "auxiliary_loss_mlp": 0.010265, - "balance_loss_clip": 1.03735638, - "balance_loss_mlp": 1.01531219, - "epoch": 0.9892078761460995, - "flos": 20772922533120.0, - "grad_norm": 1.7039408259240933, - "language_loss": 0.73759556, - "learning_rate": 1.2148749214409004e-09, - "loss": 0.75892341, - "num_input_tokens_seen": 355033280, - "step": 16453, - "time_per_iteration": 2.526846170425415 - }, - { - "auxiliary_loss_clip": 0.01068886, - "auxiliary_loss_mlp": 0.0103825, - "balance_loss_clip": 1.03598034, - "balance_loss_mlp": 1.02607906, - "epoch": 0.9892679993987675, - "flos": 23367827836800.0, - "grad_norm": 2.0358391498117765, - "language_loss": 0.69925886, - "learning_rate": 1.2013401715191828e-09, - "loss": 0.72033024, - "num_input_tokens_seen": 355053320, - "step": 16454, - "time_per_iteration": 2.7736165523529053 - }, - { - "auxiliary_loss_clip": 0.01077684, - "auxiliary_loss_mlp": 0.01031622, - "balance_loss_clip": 1.03315997, - "balance_loss_mlp": 1.01950455, - "epoch": 0.9893281226514354, - "flos": 22705419173760.0, - "grad_norm": 1.754815441534383, - "language_loss": 0.75814426, - "learning_rate": 1.1878812162433583e-09, - "loss": 0.77923727, - "num_input_tokens_seen": 355070230, - "step": 16455, - "time_per_iteration": 2.626431941986084 - }, - { - "auxiliary_loss_clip": 0.0107961, - "auxiliary_loss_mlp": 0.01026151, - "balance_loss_clip": 1.03627825, - "balance_loss_mlp": 1.01435518, - "epoch": 0.9893882459041035, - "flos": 21796664060160.0, - "grad_norm": 2.3755436774026037, - "language_loss": 0.6552164, - "learning_rate": 1.1744980561230188e-09, - "loss": 0.676274, - "num_input_tokens_seen": 355090125, - "step": 16456, - "time_per_iteration": 2.6569387912750244 - }, - { - "auxiliary_loss_clip": 0.01100413, - "auxiliary_loss_mlp": 0.01031079, - "balance_loss_clip": 1.03849792, - "balance_loss_mlp": 1.01922965, - "epoch": 0.9894483691567714, - "flos": 18113773754880.0, - "grad_norm": 2.3001936476450484, - "language_loss": 0.73839563, - "learning_rate": 1.161190691666203e-09, - "loss": 0.75971055, - "num_input_tokens_seen": 355107890, - "step": 16457, - "time_per_iteration": 2.674736738204956 - }, - { - "auxiliary_loss_clip": 0.01108737, - "auxiliary_loss_mlp": 0.01029092, - "balance_loss_clip": 1.03762496, - "balance_loss_mlp": 1.01680112, - "epoch": 0.9895084924094394, - "flos": 31211615664000.0, - "grad_norm": 2.2264264445995474, - "language_loss": 0.6879859, - "learning_rate": 1.1479591233773954e-09, - "loss": 0.70936424, - "num_input_tokens_seen": 355126340, - "step": 16458, - "time_per_iteration": 2.615215301513672 - }, - { - "auxiliary_loss_clip": 0.01093615, - "auxiliary_loss_mlp": 0.01030607, - "balance_loss_clip": 1.0354172, - "balance_loss_mlp": 1.01881158, - "epoch": 0.9895686156621073, - "flos": 19678042120320.0, - "grad_norm": 1.6836703680245058, - "language_loss": 0.79359543, - "learning_rate": 1.1348033517581956e-09, - "loss": 0.81483769, - "num_input_tokens_seen": 355144025, - "step": 16459, - "time_per_iteration": 2.5571677684783936 - }, - { - "auxiliary_loss_clip": 0.01083172, - "auxiliary_loss_mlp": 0.01034843, - "balance_loss_clip": 1.03401232, - "balance_loss_mlp": 1.02273118, - "epoch": 0.9896287389147753, - "flos": 23581675457280.0, - "grad_norm": 1.883745911652252, - "language_loss": 0.7132234, - "learning_rate": 1.1217233773075373e-09, - "loss": 0.73440349, - "num_input_tokens_seen": 355163125, - "step": 16460, - "time_per_iteration": 2.626668691635132 - }, - { - "auxiliary_loss_clip": 0.01086508, - "auxiliary_loss_mlp": 0.01026002, - "balance_loss_clip": 1.03445435, - "balance_loss_mlp": 1.01346099, - "epoch": 0.9896888621674432, - "flos": 29605331364480.0, - "grad_norm": 1.5613662208047323, - "language_loss": 0.87661237, - "learning_rate": 1.1087192005214685e-09, - "loss": 0.8977375, - "num_input_tokens_seen": 355184060, - "step": 16461, - "time_per_iteration": 2.7060861587524414 - }, - { - "auxiliary_loss_clip": 0.01095459, - "auxiliary_loss_mlp": 0.01032824, - "balance_loss_clip": 1.03561902, - "balance_loss_mlp": 1.01949632, - "epoch": 0.9897489854201112, - "flos": 23695045758720.0, - "grad_norm": 1.7346501147556415, - "language_loss": 0.62446827, - "learning_rate": 1.09579082189315e-09, - "loss": 0.64575106, - "num_input_tokens_seen": 355204505, - "step": 16462, - "time_per_iteration": 2.64906907081604 - }, - { - "auxiliary_loss_clip": 0.01100978, - "auxiliary_loss_mlp": 0.01031988, - "balance_loss_clip": 1.03905725, - "balance_loss_mlp": 1.02028179, - "epoch": 0.9898091086727792, - "flos": 13225146687360.0, - "grad_norm": 1.8196712786211515, - "language_loss": 0.72961009, - "learning_rate": 1.0829382419126343e-09, - "loss": 0.75093973, - "num_input_tokens_seen": 355223055, - "step": 16463, - "time_per_iteration": 5.664719343185425 - }, - { - "auxiliary_loss_clip": 0.01097369, - "auxiliary_loss_mlp": 0.01031261, - "balance_loss_clip": 1.03589058, - "balance_loss_mlp": 1.01759946, - "epoch": 0.9898692319254472, - "flos": 22930400010240.0, - "grad_norm": 1.8790074246381347, - "language_loss": 0.69955069, - "learning_rate": 1.0701614610675314e-09, - "loss": 0.720837, - "num_input_tokens_seen": 355242000, - "step": 16464, - "time_per_iteration": 4.500953197479248 - }, - { - "auxiliary_loss_clip": 0.01079876, - "auxiliary_loss_mlp": 0.0102935, - "balance_loss_clip": 1.03554177, - "balance_loss_mlp": 1.01688099, - "epoch": 0.9899293551781151, - "flos": 12458346122880.0, - "grad_norm": 2.0237880256001635, - "language_loss": 0.73348618, - "learning_rate": 1.0574604798421204e-09, - "loss": 0.75457835, - "num_input_tokens_seen": 355260175, - "step": 16465, - "time_per_iteration": 2.6900930404663086 - }, - { - "auxiliary_loss_clip": 0.01104028, - "auxiliary_loss_mlp": 0.010347, - "balance_loss_clip": 1.03416681, - "balance_loss_mlp": 1.02323794, - "epoch": 0.9899894784307831, - "flos": 26871129118080.0, - "grad_norm": 1.754568063294171, - "language_loss": 0.86540592, - "learning_rate": 1.0448352987182386e-09, - "loss": 0.88679326, - "num_input_tokens_seen": 355281930, - "step": 16466, - "time_per_iteration": 2.5950276851654053 - }, - { - "auxiliary_loss_clip": 0.01071496, - "auxiliary_loss_mlp": 0.01024722, - "balance_loss_clip": 1.03584099, - "balance_loss_mlp": 1.01242614, - "epoch": 0.990049601683451, - "flos": 21542093395200.0, - "grad_norm": 1.7230422201542275, - "language_loss": 0.71486777, - "learning_rate": 1.0322859181743915e-09, - "loss": 0.73583001, - "num_input_tokens_seen": 355301555, - "step": 16467, - "time_per_iteration": 2.7708022594451904 - }, - { - "auxiliary_loss_clip": 0.0108033, - "auxiliary_loss_mlp": 0.0104001, - "balance_loss_clip": 1.03324151, - "balance_loss_mlp": 1.02659893, - "epoch": 0.990109724936119, - "flos": 28771809287040.0, - "grad_norm": 1.3753584839252895, - "language_loss": 0.65033233, - "learning_rate": 1.019812338686643e-09, - "loss": 0.67153573, - "num_input_tokens_seen": 355324925, - "step": 16468, - "time_per_iteration": 4.24141263961792 - }, - { - "auxiliary_loss_clip": 0.01079098, - "auxiliary_loss_mlp": 0.0103126, - "balance_loss_clip": 1.03625393, - "balance_loss_mlp": 1.01935673, - "epoch": 0.9901698481887871, - "flos": 29274270687360.0, - "grad_norm": 2.0452120517655943, - "language_loss": 0.62340331, - "learning_rate": 1.0074145607281704e-09, - "loss": 0.64450687, - "num_input_tokens_seen": 355343875, - "step": 16469, - "time_per_iteration": 2.7885043621063232 - }, - { - "auxiliary_loss_clip": 0.01073759, - "auxiliary_loss_mlp": 0.01031927, - "balance_loss_clip": 1.03337479, - "balance_loss_mlp": 1.01906407, - "epoch": 0.990229971441455, - "flos": 15959025711360.0, - "grad_norm": 2.562370039861896, - "language_loss": 0.70241368, - "learning_rate": 9.950925847685976e-10, - "loss": 0.72347051, - "num_input_tokens_seen": 355358835, - "step": 16470, - "time_per_iteration": 2.6540679931640625 - }, - { - "auxiliary_loss_clip": 0.01019159, - "auxiliary_loss_mlp": 0.01000231, - "balance_loss_clip": 1.00684953, - "balance_loss_mlp": 0.99926519, - "epoch": 0.990290094694123, - "flos": 69780287911680.0, - "grad_norm": 0.6776686516780072, - "language_loss": 0.55451435, - "learning_rate": 9.828464112755509e-10, - "loss": 0.57470822, - "num_input_tokens_seen": 355431225, - "step": 16471, - "time_per_iteration": 3.345576047897339 - }, - { - "auxiliary_loss_clip": 0.01088522, - "auxiliary_loss_mlp": 0.01034754, - "balance_loss_clip": 1.03816175, - "balance_loss_mlp": 1.02205849, - "epoch": 0.9903502179467909, - "flos": 16252451913600.0, - "grad_norm": 2.029976016877621, - "language_loss": 0.83828497, - "learning_rate": 9.706760407131032e-10, - "loss": 0.85951781, - "num_input_tokens_seen": 355448250, - "step": 16472, - "time_per_iteration": 2.7130064964294434 - }, - { - "auxiliary_loss_clip": 0.01095822, - "auxiliary_loss_mlp": 0.01026203, - "balance_loss_clip": 1.03632557, - "balance_loss_mlp": 1.01452053, - "epoch": 0.9904103411994589, - "flos": 21688393489920.0, - "grad_norm": 1.9314835507092933, - "language_loss": 0.8592447, - "learning_rate": 9.585814735431075e-10, - "loss": 0.88046497, - "num_input_tokens_seen": 355467040, - "step": 16473, - "time_per_iteration": 2.6023082733154297 - }, - { - "auxiliary_loss_clip": 0.01105804, - "auxiliary_loss_mlp": 0.01029675, - "balance_loss_clip": 1.03511405, - "balance_loss_mlp": 1.01830196, - "epoch": 0.9904704644521268, - "flos": 25739440243200.0, - "grad_norm": 1.8812560615029836, - "language_loss": 0.84657192, - "learning_rate": 9.465627102240859e-10, - "loss": 0.86792672, - "num_input_tokens_seen": 355487825, - "step": 16474, - "time_per_iteration": 2.6265671253204346 - }, - { - "auxiliary_loss_clip": 0.01079812, - "auxiliary_loss_mlp": 0.01035461, - "balance_loss_clip": 1.03155684, - "balance_loss_mlp": 1.0240823, - "epoch": 0.9905305877047949, - "flos": 21908346422400.0, - "grad_norm": 1.8096895142828726, - "language_loss": 0.76610988, - "learning_rate": 9.346197512116738e-10, - "loss": 0.78726262, - "num_input_tokens_seen": 355507445, - "step": 16475, - "time_per_iteration": 2.642179012298584 - }, - { - "auxiliary_loss_clip": 0.0106673, - "auxiliary_loss_mlp": 0.01035079, - "balance_loss_clip": 1.03210354, - "balance_loss_mlp": 1.02151895, - "epoch": 0.9905907109574628, - "flos": 21392417422080.0, - "grad_norm": 1.7909726122896426, - "language_loss": 0.76034641, - "learning_rate": 9.227525969588423e-10, - "loss": 0.78136444, - "num_input_tokens_seen": 355527205, - "step": 16476, - "time_per_iteration": 2.6616551876068115 - }, - { - "auxiliary_loss_clip": 0.01101675, - "auxiliary_loss_mlp": 0.00771329, - "balance_loss_clip": 1.03651643, - "balance_loss_mlp": 1.00030255, - "epoch": 0.9906508342101308, - "flos": 20521620005760.0, - "grad_norm": 2.1261117563309884, - "language_loss": 0.6759547, - "learning_rate": 9.109612479154538e-10, - "loss": 0.69468474, - "num_input_tokens_seen": 355544740, - "step": 16477, - "time_per_iteration": 2.5836856365203857 - }, - { - "auxiliary_loss_clip": 0.0109369, - "auxiliary_loss_mlp": 0.01034434, - "balance_loss_clip": 1.03875303, - "balance_loss_mlp": 1.02113652, - "epoch": 0.9907109574627987, - "flos": 21361211481600.0, - "grad_norm": 2.3791528799950283, - "language_loss": 0.71740925, - "learning_rate": 8.992457045289282e-10, - "loss": 0.7386905, - "num_input_tokens_seen": 355564385, - "step": 16478, - "time_per_iteration": 2.6684231758117676 - }, - { - "auxiliary_loss_clip": 0.0110905, - "auxiliary_loss_mlp": 0.01040049, - "balance_loss_clip": 1.03671718, - "balance_loss_mlp": 1.02660859, - "epoch": 0.9907710807154667, - "flos": 17338605321600.0, - "grad_norm": 2.44296615516407, - "language_loss": 0.80982149, - "learning_rate": 8.876059672433545e-10, - "loss": 0.83131254, - "num_input_tokens_seen": 355579260, - "step": 16479, - "time_per_iteration": 2.536628484725952 - }, - { - "auxiliary_loss_clip": 0.01099491, - "auxiliary_loss_mlp": 0.01033699, - "balance_loss_clip": 1.03593194, - "balance_loss_mlp": 1.02183723, - "epoch": 0.9908312039681346, - "flos": 28621881918720.0, - "grad_norm": 1.8095138235680064, - "language_loss": 0.66404873, - "learning_rate": 8.760420364999355e-10, - "loss": 0.68538064, - "num_input_tokens_seen": 355599790, - "step": 16480, - "time_per_iteration": 2.675546884536743 - }, - { - "auxiliary_loss_clip": 0.0109416, - "auxiliary_loss_mlp": 0.01032967, - "balance_loss_clip": 1.03466868, - "balance_loss_mlp": 1.02073646, - "epoch": 0.9908913272208026, - "flos": 35770654512000.0, - "grad_norm": 1.7378127875185636, - "language_loss": 0.72355247, - "learning_rate": 8.645539127374313e-10, - "loss": 0.74482375, - "num_input_tokens_seen": 355620925, - "step": 16481, - "time_per_iteration": 2.702287197113037 - }, - { - "auxiliary_loss_clip": 0.01095367, - "auxiliary_loss_mlp": 0.01023626, - "balance_loss_clip": 1.03528941, - "balance_loss_mlp": 1.01157379, - "epoch": 0.9909514504734707, - "flos": 19902196944000.0, - "grad_norm": 1.99195789913312, - "language_loss": 0.77529383, - "learning_rate": 8.531415963912713e-10, - "loss": 0.79648376, - "num_input_tokens_seen": 355639165, - "step": 16482, - "time_per_iteration": 2.623577117919922 - }, - { - "auxiliary_loss_clip": 0.01099605, - "auxiliary_loss_mlp": 0.01030033, - "balance_loss_clip": 1.03522539, - "balance_loss_mlp": 1.01804006, - "epoch": 0.9910115737261386, - "flos": 20004793165440.0, - "grad_norm": 1.7513452456570024, - "language_loss": 0.75167656, - "learning_rate": 8.418050878944427e-10, - "loss": 0.772973, - "num_input_tokens_seen": 355657320, - "step": 16483, - "time_per_iteration": 2.6707489490509033 - }, - { - "auxiliary_loss_clip": 0.01018817, - "auxiliary_loss_mlp": 0.01002356, - "balance_loss_clip": 1.00542712, - "balance_loss_mlp": 1.0013783, - "epoch": 0.9910716969788066, - "flos": 70688432494080.0, - "grad_norm": 0.6739717924016945, - "language_loss": 0.53652573, - "learning_rate": 8.305443876768237e-10, - "loss": 0.55673742, - "num_input_tokens_seen": 355726370, - "step": 16484, - "time_per_iteration": 3.2860820293426514 - }, - { - "auxiliary_loss_clip": 0.01103552, - "auxiliary_loss_mlp": 0.0103248, - "balance_loss_clip": 1.03575015, - "balance_loss_mlp": 1.0208987, - "epoch": 0.9911318202314745, - "flos": 21434038306560.0, - "grad_norm": 1.844066586555626, - "language_loss": 0.82151747, - "learning_rate": 8.19359496165184e-10, - "loss": 0.84287775, - "num_input_tokens_seen": 355745840, - "step": 16485, - "time_per_iteration": 2.572507619857788 - }, - { - "auxiliary_loss_clip": 0.0106644, - "auxiliary_loss_mlp": 0.01039998, - "balance_loss_clip": 1.03260577, - "balance_loss_mlp": 1.02652156, - "epoch": 0.9911919434841425, - "flos": 19826820253440.0, - "grad_norm": 1.5753889689051752, - "language_loss": 0.81565136, - "learning_rate": 8.082504137836288e-10, - "loss": 0.83671576, - "num_input_tokens_seen": 355763385, - "step": 16486, - "time_per_iteration": 2.6582581996917725 - }, - { - "auxiliary_loss_clip": 0.01099209, - "auxiliary_loss_mlp": 0.01034412, - "balance_loss_clip": 1.03707922, - "balance_loss_mlp": 1.02275944, - "epoch": 0.9912520667368104, - "flos": 41719364691840.0, - "grad_norm": 1.4744278341882329, - "language_loss": 0.66241539, - "learning_rate": 7.972171409538209e-10, - "loss": 0.68375158, - "num_input_tokens_seen": 355786075, - "step": 16487, - "time_per_iteration": 2.90215802192688 - }, - { - "auxiliary_loss_clip": 0.01094686, - "auxiliary_loss_mlp": 0.00769572, - "balance_loss_clip": 1.03547466, - "balance_loss_mlp": 1.00026965, - "epoch": 0.9913121899894785, - "flos": 23769668263680.0, - "grad_norm": 1.5978817773669494, - "language_loss": 0.76796007, - "learning_rate": 7.862596780936481e-10, - "loss": 0.78660262, - "num_input_tokens_seen": 355806295, - "step": 16488, - "time_per_iteration": 2.771479368209839 - }, - { - "auxiliary_loss_clip": 0.01080089, - "auxiliary_loss_mlp": 0.01030538, - "balance_loss_clip": 1.03599024, - "balance_loss_mlp": 1.01780689, - "epoch": 0.9913723132421464, - "flos": 23769668263680.0, - "grad_norm": 9.931415679730078, - "language_loss": 0.68562698, - "learning_rate": 7.753780256190001e-10, - "loss": 0.70673329, - "num_input_tokens_seen": 355825730, - "step": 16489, - "time_per_iteration": 2.8262085914611816 - }, - { - "auxiliary_loss_clip": 0.00990057, - "auxiliary_loss_mlp": 0.01006045, - "balance_loss_clip": 1.00620961, - "balance_loss_mlp": 1.00509155, - "epoch": 0.9914324364948144, - "flos": 71267419820160.0, - "grad_norm": 0.6117813667004609, - "language_loss": 0.52562964, - "learning_rate": 7.645721839424357e-10, - "loss": 0.54559064, - "num_input_tokens_seen": 355891545, - "step": 16490, - "time_per_iteration": 3.339395523071289 - }, - { - "auxiliary_loss_clip": 0.01081829, - "auxiliary_loss_mlp": 0.01038351, - "balance_loss_clip": 1.03499651, - "balance_loss_mlp": 1.02465963, - "epoch": 0.9914925597474823, - "flos": 23695440808320.0, - "grad_norm": 1.555410578963239, - "language_loss": 0.75512695, - "learning_rate": 7.538421534734052e-10, - "loss": 0.7763288, - "num_input_tokens_seen": 355909920, - "step": 16491, - "time_per_iteration": 2.7577908039093018 - }, - { - "auxiliary_loss_clip": 0.01068183, - "auxiliary_loss_mlp": 0.01033404, - "balance_loss_clip": 1.03941417, - "balance_loss_mlp": 1.02027285, - "epoch": 0.9915526830001503, - "flos": 13433822749440.0, - "grad_norm": 2.4260837732983656, - "language_loss": 0.70534217, - "learning_rate": 7.431879346191383e-10, - "loss": 0.72635806, - "num_input_tokens_seen": 355923130, - "step": 16492, - "time_per_iteration": 2.717663288116455 - }, - { - "auxiliary_loss_clip": 0.01072141, - "auxiliary_loss_mlp": 0.01033142, - "balance_loss_clip": 1.03324449, - "balance_loss_mlp": 1.02005327, - "epoch": 0.9916128062528182, - "flos": 20740962407040.0, - "grad_norm": 1.9482484238506383, - "language_loss": 0.6859107, - "learning_rate": 7.326095277837563e-10, - "loss": 0.7069636, - "num_input_tokens_seen": 355941960, - "step": 16493, - "time_per_iteration": 2.626917839050293 - }, - { - "auxiliary_loss_clip": 0.01084989, - "auxiliary_loss_mlp": 0.01034596, - "balance_loss_clip": 1.03742933, - "balance_loss_mlp": 1.02203131, - "epoch": 0.9916729295054862, - "flos": 22487082353280.0, - "grad_norm": 1.8545300883783822, - "language_loss": 0.7110008, - "learning_rate": 7.221069333678276e-10, - "loss": 0.73219669, - "num_input_tokens_seen": 355961640, - "step": 16494, - "time_per_iteration": 2.6934683322906494 - }, - { - "auxiliary_loss_clip": 0.01098932, - "auxiliary_loss_mlp": 0.0103153, - "balance_loss_clip": 1.0362227, - "balance_loss_mlp": 1.01829231, - "epoch": 0.9917330527581543, - "flos": 14792467708800.0, - "grad_norm": 3.5378309901622007, - "language_loss": 0.68413657, - "learning_rate": 7.116801517701443e-10, - "loss": 0.70544124, - "num_input_tokens_seen": 355977980, - "step": 16495, - "time_per_iteration": 2.6093251705169678 - }, - { - "auxiliary_loss_clip": 0.0101026, - "auxiliary_loss_mlp": 0.01002792, - "balance_loss_clip": 1.00642037, - "balance_loss_mlp": 1.00182664, - "epoch": 0.9917931760108222, - "flos": 59191595585280.0, - "grad_norm": 0.7170668056568608, - "language_loss": 0.53470147, - "learning_rate": 7.013291833859458e-10, - "loss": 0.55483198, - "num_input_tokens_seen": 356042900, - "step": 16496, - "time_per_iteration": 3.3146417140960693 - }, - { - "auxiliary_loss_clip": 0.01085309, - "auxiliary_loss_mlp": 0.00774025, - "balance_loss_clip": 1.03571773, - "balance_loss_mlp": 1.0002538, - "epoch": 0.9918532992634902, - "flos": 26761637485440.0, - "grad_norm": 1.4656425983930446, - "language_loss": 0.71419513, - "learning_rate": 6.91054028607585e-10, - "loss": 0.73278844, - "num_input_tokens_seen": 356063000, - "step": 16497, - "time_per_iteration": 2.7043471336364746 - }, - { - "auxiliary_loss_clip": 0.01081862, - "auxiliary_loss_mlp": 0.01033667, - "balance_loss_clip": 1.03701067, - "balance_loss_mlp": 1.02047634, - "epoch": 0.9919134225161581, - "flos": 14975719920000.0, - "grad_norm": 2.009806913835038, - "language_loss": 0.82173979, - "learning_rate": 6.808546878249721e-10, - "loss": 0.84289509, - "num_input_tokens_seen": 356078130, - "step": 16498, - "time_per_iteration": 2.7074508666992188 - }, - { - "auxiliary_loss_clip": 0.01075485, - "auxiliary_loss_mlp": 0.01035966, - "balance_loss_clip": 1.03759611, - "balance_loss_mlp": 1.02313316, - "epoch": 0.9919735457688261, - "flos": 27818201064960.0, - "grad_norm": 1.7332426459484291, - "language_loss": 0.68403494, - "learning_rate": 6.707311614246869e-10, - "loss": 0.70514941, - "num_input_tokens_seen": 356101655, - "step": 16499, - "time_per_iteration": 2.7545838356018066 - }, - { - "auxiliary_loss_clip": 0.01111074, - "auxiliary_loss_mlp": 0.01029883, - "balance_loss_clip": 1.03827095, - "balance_loss_mlp": 1.01769972, - "epoch": 0.992033669021494, - "flos": 22562782266240.0, - "grad_norm": 1.7667057026294906, - "language_loss": 0.8223446, - "learning_rate": 6.606834497904223e-10, - "loss": 0.84375417, - "num_input_tokens_seen": 356121425, - "step": 16500, - "time_per_iteration": 2.587153911590576 - }, - { - "auxiliary_loss_clip": 0.01080633, - "auxiliary_loss_mlp": 0.01032066, - "balance_loss_clip": 1.03477311, - "balance_loss_mlp": 1.01933479, - "epoch": 0.9920937922741621, - "flos": 25374587846400.0, - "grad_norm": 1.774651095771433, - "language_loss": 0.81949353, - "learning_rate": 6.507115533036511e-10, - "loss": 0.84062058, - "num_input_tokens_seen": 356140710, - "step": 16501, - "time_per_iteration": 2.769408702850342 - }, - { - "auxiliary_loss_clip": 0.01098639, - "auxiliary_loss_mlp": 0.01029969, - "balance_loss_clip": 1.03593433, - "balance_loss_mlp": 1.01757097, - "epoch": 0.99215391552683, - "flos": 22054466949120.0, - "grad_norm": 2.025781816358009, - "language_loss": 0.76823413, - "learning_rate": 6.408154723420711e-10, - "loss": 0.78952026, - "num_input_tokens_seen": 356159835, - "step": 16502, - "time_per_iteration": 4.115024566650391 - }, - { - "auxiliary_loss_clip": 0.01083856, - "auxiliary_loss_mlp": 0.01031996, - "balance_loss_clip": 1.03553808, - "balance_loss_mlp": 1.01820326, - "epoch": 0.992214038779498, - "flos": 15413937845760.0, - "grad_norm": 3.0841815262581127, - "language_loss": 0.71393132, - "learning_rate": 6.309952072811597e-10, - "loss": 0.7350899, - "num_input_tokens_seen": 356177555, - "step": 16503, - "time_per_iteration": 4.208997964859009 - }, - { - "auxiliary_loss_clip": 0.0101848, - "auxiliary_loss_mlp": 0.01003931, - "balance_loss_clip": 1.00507569, - "balance_loss_mlp": 1.00273323, - "epoch": 0.9922741620321659, - "flos": 62014498467840.0, - "grad_norm": 0.631144225573371, - "language_loss": 0.55076844, - "learning_rate": 6.212507584932858e-10, - "loss": 0.57099259, - "num_input_tokens_seen": 356244975, - "step": 16504, - "time_per_iteration": 4.945772647857666 - }, - { - "auxiliary_loss_clip": 0.01075926, - "auxiliary_loss_mlp": 0.01024279, - "balance_loss_clip": 1.0352273, - "balance_loss_mlp": 1.01286459, - "epoch": 0.9923342852848339, - "flos": 17165480745600.0, - "grad_norm": 1.970652781818717, - "language_loss": 0.6979568, - "learning_rate": 6.115821263481536e-10, - "loss": 0.71895891, - "num_input_tokens_seen": 356262605, - "step": 16505, - "time_per_iteration": 2.655355453491211 - }, - { - "auxiliary_loss_clip": 0.01074237, - "auxiliary_loss_mlp": 0.01032467, - "balance_loss_clip": 1.0348562, - "balance_loss_mlp": 1.01904368, - "epoch": 0.9923944085375018, - "flos": 23183210908800.0, - "grad_norm": 1.9797286096997044, - "language_loss": 0.65255636, - "learning_rate": 6.019893112119146e-10, - "loss": 0.67362338, - "num_input_tokens_seen": 356278935, - "step": 16506, - "time_per_iteration": 2.8993325233459473 - }, - { - "auxiliary_loss_clip": 0.01044661, - "auxiliary_loss_mlp": 0.01028638, - "balance_loss_clip": 1.03355384, - "balance_loss_mlp": 1.01587033, - "epoch": 0.9924545317901698, - "flos": 20813861059200.0, - "grad_norm": 3.222870025511436, - "language_loss": 0.62715226, - "learning_rate": 5.924723134487219e-10, - "loss": 0.64788526, - "num_input_tokens_seen": 356295675, - "step": 16507, - "time_per_iteration": 4.278958559036255 - }, - { - "auxiliary_loss_clip": 0.01108709, - "auxiliary_loss_mlp": 0.01033793, - "balance_loss_clip": 1.03649449, - "balance_loss_mlp": 1.02098358, - "epoch": 0.9925146550428379, - "flos": 20083437993600.0, - "grad_norm": 2.2915394393265567, - "language_loss": 0.73150027, - "learning_rate": 5.830311334193983e-10, - "loss": 0.75292528, - "num_input_tokens_seen": 356312885, - "step": 16508, - "time_per_iteration": 2.5459229946136475 - }, - { - "auxiliary_loss_clip": 0.01107576, - "auxiliary_loss_mlp": 0.0102854, - "balance_loss_clip": 1.03538799, - "balance_loss_mlp": 1.01548636, - "epoch": 0.9925747782955058, - "flos": 24973717086720.0, - "grad_norm": 1.713644660775738, - "language_loss": 0.70212501, - "learning_rate": 5.736657714818793e-10, - "loss": 0.72348613, - "num_input_tokens_seen": 356334070, - "step": 16509, - "time_per_iteration": 2.731260299682617 - }, - { - "auxiliary_loss_clip": 0.01096747, - "auxiliary_loss_mlp": 0.01036856, - "balance_loss_clip": 1.03462338, - "balance_loss_mlp": 1.02401757, - "epoch": 0.9926349015481738, - "flos": 60472526492160.0, - "grad_norm": 1.6611558247345184, - "language_loss": 0.68540096, - "learning_rate": 5.643762279912146e-10, - "loss": 0.70673692, - "num_input_tokens_seen": 356359410, - "step": 16510, - "time_per_iteration": 3.000253438949585 - }, - { - "auxiliary_loss_clip": 0.01074893, - "auxiliary_loss_mlp": 0.01037014, - "balance_loss_clip": 1.03464723, - "balance_loss_mlp": 1.02426445, - "epoch": 0.9926950248008417, - "flos": 20741716592640.0, - "grad_norm": 2.1741536524362544, - "language_loss": 0.81332445, - "learning_rate": 5.551625032997886e-10, - "loss": 0.83444357, - "num_input_tokens_seen": 356378345, - "step": 16511, - "time_per_iteration": 2.708442211151123 - }, - { - "auxiliary_loss_clip": 0.01064556, - "auxiliary_loss_mlp": 0.01032872, - "balance_loss_clip": 1.03382301, - "balance_loss_mlp": 1.02089787, - "epoch": 0.9927551480535097, - "flos": 24352965221760.0, - "grad_norm": 1.9230315347792497, - "language_loss": 0.91452694, - "learning_rate": 5.460245977570998e-10, - "loss": 0.93550122, - "num_input_tokens_seen": 356397345, - "step": 16512, - "time_per_iteration": 2.6810131072998047 - }, - { - "auxiliary_loss_clip": 0.00999495, - "auxiliary_loss_mlp": 0.01002603, - "balance_loss_clip": 1.00600088, - "balance_loss_mlp": 1.00150681, - "epoch": 0.9928152713061776, - "flos": 71275572207360.0, - "grad_norm": 0.7168790027045711, - "language_loss": 0.55182147, - "learning_rate": 5.369625117095378e-10, - "loss": 0.57184243, - "num_input_tokens_seen": 356459160, - "step": 16513, - "time_per_iteration": 3.3187079429626465 - }, - { - "auxiliary_loss_clip": 0.01081239, - "auxiliary_loss_mlp": 0.01031377, - "balance_loss_clip": 1.03556442, - "balance_loss_mlp": 1.01862729, - "epoch": 0.9928753945588457, - "flos": 57809499045120.0, - "grad_norm": 1.3394977995740782, - "language_loss": 0.65011883, - "learning_rate": 5.279762455006054e-10, - "loss": 0.67124498, - "num_input_tokens_seen": 356486405, - "step": 16514, - "time_per_iteration": 2.9586453437805176 - }, - { - "auxiliary_loss_clip": 0.01077404, - "auxiliary_loss_mlp": 0.01029066, - "balance_loss_clip": 1.03275108, - "balance_loss_mlp": 1.01589894, - "epoch": 0.9929355178115136, - "flos": 19568981450880.0, - "grad_norm": 1.9082841618912534, - "language_loss": 0.73075938, - "learning_rate": 5.190657994713632e-10, - "loss": 0.75182408, - "num_input_tokens_seen": 356502905, - "step": 16515, - "time_per_iteration": 2.7386841773986816 - }, - { - "auxiliary_loss_clip": 0.01065642, - "auxiliary_loss_mlp": 0.0104261, - "balance_loss_clip": 1.03322613, - "balance_loss_mlp": 1.02893686, - "epoch": 0.9929956410641816, - "flos": 22964658606720.0, - "grad_norm": 1.635364336808878, - "language_loss": 0.77238375, - "learning_rate": 5.102311739593191e-10, - "loss": 0.79346621, - "num_input_tokens_seen": 356523830, - "step": 16516, - "time_per_iteration": 2.7601654529571533 - }, - { - "auxiliary_loss_clip": 0.01077729, - "auxiliary_loss_mlp": 0.01026442, - "balance_loss_clip": 1.0354166, - "balance_loss_mlp": 1.01530802, - "epoch": 0.9930557643168495, - "flos": 22566409539840.0, - "grad_norm": 1.7024793755229197, - "language_loss": 0.78187561, - "learning_rate": 5.014723692997602e-10, - "loss": 0.8029173, - "num_input_tokens_seen": 356543965, - "step": 16517, - "time_per_iteration": 2.891570568084717 - }, - { - "auxiliary_loss_clip": 0.01097555, - "auxiliary_loss_mlp": 0.01037224, - "balance_loss_clip": 1.03813481, - "balance_loss_mlp": 1.02333033, - "epoch": 0.9931158875695175, - "flos": 17201032231680.0, - "grad_norm": 2.4652288610488604, - "language_loss": 0.67716908, - "learning_rate": 4.927893858248655e-10, - "loss": 0.69851696, - "num_input_tokens_seen": 356561530, - "step": 16518, - "time_per_iteration": 2.646632432937622 - }, - { - "auxiliary_loss_clip": 0.01008101, - "auxiliary_loss_mlp": 0.01002102, - "balance_loss_clip": 1.00879121, - "balance_loss_mlp": 1.00086808, - "epoch": 0.9931760108221854, - "flos": 63711204278400.0, - "grad_norm": 0.7468001018305941, - "language_loss": 0.53340489, - "learning_rate": 4.84182223863483e-10, - "loss": 0.55350691, - "num_input_tokens_seen": 356616845, - "step": 16519, - "time_per_iteration": 3.0809152126312256 - }, - { - "auxiliary_loss_clip": 0.01065697, - "auxiliary_loss_mlp": 0.01041583, - "balance_loss_clip": 1.03317142, - "balance_loss_mlp": 1.02780831, - "epoch": 0.9932361340748534, - "flos": 15304805349120.0, - "grad_norm": 1.6932132720943704, - "language_loss": 0.6033656, - "learning_rate": 4.756508837426842e-10, - "loss": 0.62443841, - "num_input_tokens_seen": 356633560, - "step": 16520, - "time_per_iteration": 2.7310233116149902 - }, - { - "auxiliary_loss_clip": 0.01078536, - "auxiliary_loss_mlp": 0.01034414, - "balance_loss_clip": 1.03465533, - "balance_loss_mlp": 1.021927, - "epoch": 0.9932962573275215, - "flos": 36064906727040.0, - "grad_norm": 1.7244534802172446, - "language_loss": 0.62099916, - "learning_rate": 4.671953657853223e-10, - "loss": 0.64212871, - "num_input_tokens_seen": 356657600, - "step": 16521, - "time_per_iteration": 2.883345603942871 - }, - { - "auxiliary_loss_clip": 0.01087845, - "auxiliary_loss_mlp": 0.01036694, - "balance_loss_clip": 1.03922451, - "balance_loss_mlp": 1.02330661, - "epoch": 0.9933563805801894, - "flos": 21470523546240.0, - "grad_norm": 4.353373904102065, - "language_loss": 0.74153936, - "learning_rate": 4.5881567031225145e-10, - "loss": 0.76278472, - "num_input_tokens_seen": 356675880, - "step": 16522, - "time_per_iteration": 2.718522071838379 - }, - { - "auxiliary_loss_clip": 0.0107243, - "auxiliary_loss_mlp": 0.01030252, - "balance_loss_clip": 1.03389478, - "balance_loss_mlp": 1.01866508, - "epoch": 0.9934165038328574, - "flos": 23986532626560.0, - "grad_norm": 1.5310659871247791, - "language_loss": 0.73152745, - "learning_rate": 4.5051179764143964e-10, - "loss": 0.75255424, - "num_input_tokens_seen": 356696000, - "step": 16523, - "time_per_iteration": 2.7667906284332275 - }, - { - "auxiliary_loss_clip": 0.0108301, - "auxiliary_loss_mlp": 0.00769257, - "balance_loss_clip": 1.03243899, - "balance_loss_mlp": 1.00031519, - "epoch": 0.9934766270855253, - "flos": 21907807718400.0, - "grad_norm": 1.6911603974446854, - "language_loss": 0.71271038, - "learning_rate": 4.422837480875241e-10, - "loss": 0.73123306, - "num_input_tokens_seen": 356716845, - "step": 16524, - "time_per_iteration": 2.716357707977295 - }, - { - "auxiliary_loss_clip": 0.0107654, - "auxiliary_loss_mlp": 0.01030893, - "balance_loss_clip": 1.0359261, - "balance_loss_mlp": 1.01835251, - "epoch": 0.9935367503381933, - "flos": 17129139160320.0, - "grad_norm": 2.2457086362374863, - "language_loss": 0.79875743, - "learning_rate": 4.341315219624775e-10, - "loss": 0.81983173, - "num_input_tokens_seen": 356732100, - "step": 16525, - "time_per_iteration": 2.7416329383850098 - }, - { - "auxiliary_loss_clip": 0.0106301, - "auxiliary_loss_mlp": 0.01026465, - "balance_loss_clip": 1.03451014, - "balance_loss_mlp": 1.01388836, - "epoch": 0.9935968735908612, - "flos": 22346241125760.0, - "grad_norm": 2.0057410904081165, - "language_loss": 0.75025058, - "learning_rate": 4.2605511957582995e-10, - "loss": 0.77114534, - "num_input_tokens_seen": 356751480, - "step": 16526, - "time_per_iteration": 2.772752046585083 - }, - { - "auxiliary_loss_clip": 0.01103657, - "auxiliary_loss_mlp": 0.00769996, - "balance_loss_clip": 1.03466129, - "balance_loss_mlp": 1.0002234, - "epoch": 0.9936569968435293, - "flos": 29460539640960.0, - "grad_norm": 2.362336998601464, - "language_loss": 0.72371536, - "learning_rate": 4.180545412333369e-10, - "loss": 0.74245191, - "num_input_tokens_seen": 356772650, - "step": 16527, - "time_per_iteration": 2.622760057449341 - }, - { - "auxiliary_loss_clip": 0.01088795, - "auxiliary_loss_mlp": 0.01031074, - "balance_loss_clip": 1.03722143, - "balance_loss_mlp": 1.01860452, - "epoch": 0.9937171200961972, - "flos": 16544046522240.0, - "grad_norm": 2.2246858984054185, - "language_loss": 0.75991976, - "learning_rate": 4.1012978723875547e-10, - "loss": 0.78111851, - "num_input_tokens_seen": 356788510, - "step": 16528, - "time_per_iteration": 2.6447527408599854 - }, - { - "auxiliary_loss_clip": 0.01089717, - "auxiliary_loss_mlp": 0.01029048, - "balance_loss_clip": 1.03432751, - "balance_loss_mlp": 1.01511216, - "epoch": 0.9937772433488652, - "flos": 24390276474240.0, - "grad_norm": 2.1876724581944504, - "language_loss": 0.6753338, - "learning_rate": 4.022808578922898e-10, - "loss": 0.6965214, - "num_input_tokens_seen": 356809115, - "step": 16529, - "time_per_iteration": 2.7714054584503174 - }, - { - "auxiliary_loss_clip": 0.01103653, - "auxiliary_loss_mlp": 0.01036146, - "balance_loss_clip": 1.03892541, - "balance_loss_mlp": 1.02186477, - "epoch": 0.9938373666015331, - "flos": 15669909141120.0, - "grad_norm": 2.3036099926169116, - "language_loss": 0.65350854, - "learning_rate": 3.9450775349170186e-10, - "loss": 0.67490655, - "num_input_tokens_seen": 356826410, - "step": 16530, - "time_per_iteration": 2.6250078678131104 - }, - { - "auxiliary_loss_clip": 0.01093807, - "auxiliary_loss_mlp": 0.01032892, - "balance_loss_clip": 1.03568208, - "balance_loss_mlp": 1.02088773, - "epoch": 0.9938974898542011, - "flos": 19496190539520.0, - "grad_norm": 3.0743920406722274, - "language_loss": 0.71364164, - "learning_rate": 3.8681047433186676e-10, - "loss": 0.7349087, - "num_input_tokens_seen": 356844990, - "step": 16531, - "time_per_iteration": 2.574047803878784 - }, - { - "auxiliary_loss_clip": 0.01094022, - "auxiliary_loss_mlp": 0.01034706, - "balance_loss_clip": 1.03513575, - "balance_loss_mlp": 1.02152801, - "epoch": 0.993957613106869, - "flos": 26906896085760.0, - "grad_norm": 1.3526587285658505, - "language_loss": 0.74083483, - "learning_rate": 3.791890207045512e-10, - "loss": 0.76212215, - "num_input_tokens_seen": 356866530, - "step": 16532, - "time_per_iteration": 2.6634178161621094 - }, - { - "auxiliary_loss_clip": 0.01051179, - "auxiliary_loss_mlp": 0.01032159, - "balance_loss_clip": 1.03274739, - "balance_loss_mlp": 1.02109611, - "epoch": 0.994017736359537, - "flos": 14939593816320.0, - "grad_norm": 1.6154702582280394, - "language_loss": 0.70493329, - "learning_rate": 3.7164339289885717e-10, - "loss": 0.7257666, - "num_input_tokens_seen": 356884660, - "step": 16533, - "time_per_iteration": 2.721863031387329 - }, - { - "auxiliary_loss_clip": 0.01097407, - "auxiliary_loss_mlp": 0.01029682, - "balance_loss_clip": 1.03669178, - "balance_loss_mlp": 1.01622939, - "epoch": 0.9940778596122051, - "flos": 15377883569280.0, - "grad_norm": 3.6789979959756676, - "language_loss": 0.84027219, - "learning_rate": 3.641735912007782e-10, - "loss": 0.86154306, - "num_input_tokens_seen": 356900895, - "step": 16534, - "time_per_iteration": 2.619920492172241 - }, - { - "auxiliary_loss_clip": 0.01067064, - "auxiliary_loss_mlp": 0.01026895, - "balance_loss_clip": 1.03194451, - "balance_loss_mlp": 1.01531422, - "epoch": 0.994137982864873, - "flos": 25228108183680.0, - "grad_norm": 1.8397563877980199, - "language_loss": 0.65771168, - "learning_rate": 3.567796158934211e-10, - "loss": 0.67865133, - "num_input_tokens_seen": 356920985, - "step": 16535, - "time_per_iteration": 2.744962692260742 - }, - { - "auxiliary_loss_clip": 0.01070974, - "auxiliary_loss_mlp": 0.01028223, - "balance_loss_clip": 1.03729725, - "balance_loss_mlp": 1.01723814, - "epoch": 0.994198106117541, - "flos": 18442140912000.0, - "grad_norm": 2.0070166211015517, - "language_loss": 0.64754289, - "learning_rate": 3.4946146725767235e-10, - "loss": 0.66853487, - "num_input_tokens_seen": 356939800, - "step": 16536, - "time_per_iteration": 2.706944465637207 - }, - { - "auxiliary_loss_clip": 0.01060285, - "auxiliary_loss_mlp": 0.01035363, - "balance_loss_clip": 1.03116417, - "balance_loss_mlp": 1.02181518, - "epoch": 0.9942582293702089, - "flos": 16654112772480.0, - "grad_norm": 1.840427715417003, - "language_loss": 0.78430796, - "learning_rate": 3.4221914557064357e-10, - "loss": 0.80526441, - "num_input_tokens_seen": 356957780, - "step": 16537, - "time_per_iteration": 2.7647006511688232 - }, - { - "auxiliary_loss_clip": 0.01105131, - "auxiliary_loss_mlp": 0.01034471, - "balance_loss_clip": 1.03843594, - "balance_loss_mlp": 1.02052915, - "epoch": 0.9943183526228769, - "flos": 21944580266880.0, - "grad_norm": 1.5935823863373109, - "language_loss": 0.68781149, - "learning_rate": 3.35052651107004e-10, - "loss": 0.70920742, - "num_input_tokens_seen": 356979185, - "step": 16538, - "time_per_iteration": 2.7235569953918457 - }, - { - "auxiliary_loss_clip": 0.01063974, - "auxiliary_loss_mlp": 0.01035944, - "balance_loss_clip": 1.02961493, - "balance_loss_mlp": 1.02304578, - "epoch": 0.9943784758755448, - "flos": 23842566915840.0, - "grad_norm": 1.8915805101103145, - "language_loss": 0.75187773, - "learning_rate": 3.2796198413853614e-10, - "loss": 0.77287686, - "num_input_tokens_seen": 356997735, - "step": 16539, - "time_per_iteration": 2.8062071800231934 - }, - { - "auxiliary_loss_clip": 0.01060765, - "auxiliary_loss_mlp": 0.01033233, - "balance_loss_clip": 1.03562832, - "balance_loss_mlp": 1.02050102, - "epoch": 0.9944385991282129, - "flos": 21469984842240.0, - "grad_norm": 2.030639619740989, - "language_loss": 0.70239884, - "learning_rate": 3.209471449341361e-10, - "loss": 0.72333884, - "num_input_tokens_seen": 357015660, - "step": 16540, - "time_per_iteration": 2.8070261478424072 - }, - { - "auxiliary_loss_clip": 0.01093159, - "auxiliary_loss_mlp": 0.01027717, - "balance_loss_clip": 1.03431797, - "balance_loss_mlp": 1.01676154, - "epoch": 0.9944987223808808, - "flos": 22927024131840.0, - "grad_norm": 1.9807950756120538, - "language_loss": 0.75202429, - "learning_rate": 3.140081337600353e-10, - "loss": 0.77323306, - "num_input_tokens_seen": 357034800, - "step": 16541, - "time_per_iteration": 5.754985570907593 - }, - { - "auxiliary_loss_clip": 0.01080974, - "auxiliary_loss_mlp": 0.01036156, - "balance_loss_clip": 1.0349412, - "balance_loss_mlp": 1.02397323, - "epoch": 0.9945588456335488, - "flos": 22383013674240.0, - "grad_norm": 1.7422746830873381, - "language_loss": 0.76555264, - "learning_rate": 3.0714495087891255e-10, - "loss": 0.78672391, - "num_input_tokens_seen": 357053785, - "step": 16542, - "time_per_iteration": 2.708519458770752 - }, - { - "auxiliary_loss_clip": 0.01099205, - "auxiliary_loss_mlp": 0.01030947, - "balance_loss_clip": 1.03627014, - "balance_loss_mlp": 1.01776206, - "epoch": 0.9946189688862167, - "flos": 21397517153280.0, - "grad_norm": 2.4054715264061435, - "language_loss": 0.74274677, - "learning_rate": 3.0035759655122615e-10, - "loss": 0.76404828, - "num_input_tokens_seen": 357072025, - "step": 16543, - "time_per_iteration": 5.371897459030151 - }, - { - "auxiliary_loss_clip": 0.01094886, - "auxiliary_loss_mlp": 0.01036182, - "balance_loss_clip": 1.03557873, - "balance_loss_mlp": 1.02270508, - "epoch": 0.9946790921388847, - "flos": 12416545670400.0, - "grad_norm": 6.4447794910959235, - "language_loss": 0.82093954, - "learning_rate": 2.9364607103454785e-10, - "loss": 0.84225017, - "num_input_tokens_seen": 357086960, - "step": 16544, - "time_per_iteration": 2.648569107055664 - }, - { - "auxiliary_loss_clip": 0.0110726, - "auxiliary_loss_mlp": 0.01027758, - "balance_loss_clip": 1.0360719, - "balance_loss_mlp": 1.01605737, - "epoch": 0.9947392153915526, - "flos": 19058295836160.0, - "grad_norm": 1.8960221821622298, - "language_loss": 0.78761363, - "learning_rate": 2.870103745831187e-10, - "loss": 0.80896378, - "num_input_tokens_seen": 357105095, - "step": 16545, - "time_per_iteration": 2.6322624683380127 - }, - { - "auxiliary_loss_clip": 0.01078686, - "auxiliary_loss_mlp": 0.01030665, - "balance_loss_clip": 1.03411245, - "balance_loss_mlp": 1.01840401, - "epoch": 0.9947993386442207, - "flos": 27308808339840.0, - "grad_norm": 1.8256767545594197, - "language_loss": 0.72650462, - "learning_rate": 2.8045050744873733e-10, - "loss": 0.74759817, - "num_input_tokens_seen": 357125065, - "step": 16546, - "time_per_iteration": 4.327521562576294 - }, - { - "auxiliary_loss_clip": 0.0109393, - "auxiliary_loss_mlp": 0.01034353, - "balance_loss_clip": 1.03408468, - "balance_loss_mlp": 1.02212179, - "epoch": 0.9948594618968887, - "flos": 20806498771200.0, - "grad_norm": 2.038631565735454, - "language_loss": 0.77378041, - "learning_rate": 2.739664698798716e-10, - "loss": 0.79506326, - "num_input_tokens_seen": 357141600, - "step": 16547, - "time_per_iteration": 2.6839520931243896 - }, - { - "auxiliary_loss_clip": 0.01085655, - "auxiliary_loss_mlp": 0.01030142, - "balance_loss_clip": 1.03360105, - "balance_loss_mlp": 1.01823926, - "epoch": 0.9949195851495566, - "flos": 23292953936640.0, - "grad_norm": 2.7868363629317097, - "language_loss": 0.70053595, - "learning_rate": 2.67558262122769e-10, - "loss": 0.72169393, - "num_input_tokens_seen": 357157880, - "step": 16548, - "time_per_iteration": 2.6629064083099365 - }, - { - "auxiliary_loss_clip": 0.0109367, - "auxiliary_loss_mlp": 0.01034955, - "balance_loss_clip": 1.03438258, - "balance_loss_mlp": 1.02264059, - "epoch": 0.9949797084022246, - "flos": 18515470527360.0, - "grad_norm": 1.8066834079511649, - "language_loss": 0.75463164, - "learning_rate": 2.6122588442012427e-10, - "loss": 0.77591789, - "num_input_tokens_seen": 357176705, - "step": 16549, - "time_per_iteration": 2.6749610900878906 - }, - { - "auxiliary_loss_clip": 0.01080946, - "auxiliary_loss_mlp": 0.01034852, - "balance_loss_clip": 1.03683913, - "balance_loss_mlp": 1.02162528, - "epoch": 0.9950398316548925, - "flos": 30407719328640.0, - "grad_norm": 1.591136058001085, - "language_loss": 0.74426466, - "learning_rate": 2.5496933701241177e-10, - "loss": 0.7654227, - "num_input_tokens_seen": 357197630, - "step": 16550, - "time_per_iteration": 2.8009731769561768 - }, - { - "auxiliary_loss_clip": 0.01058637, - "auxiliary_loss_mlp": 0.00770717, - "balance_loss_clip": 1.03213239, - "balance_loss_mlp": 1.00024307, - "epoch": 0.9950999549075605, - "flos": 19900868140800.0, - "grad_norm": 1.804349453887292, - "language_loss": 0.78024846, - "learning_rate": 2.4878862013655297e-10, - "loss": 0.79854202, - "num_input_tokens_seen": 357215445, - "step": 16551, - "time_per_iteration": 2.7871713638305664 - }, - { - "auxiliary_loss_clip": 0.01090386, - "auxiliary_loss_mlp": 0.01032903, - "balance_loss_clip": 1.03510332, - "balance_loss_mlp": 1.02215683, - "epoch": 0.9951600781602284, - "flos": 17603555016960.0, - "grad_norm": 1.3788671688466283, - "language_loss": 0.66577691, - "learning_rate": 2.426837340270271e-10, - "loss": 0.68700981, - "num_input_tokens_seen": 357234285, - "step": 16552, - "time_per_iteration": 2.7981386184692383 - }, - { - "auxiliary_loss_clip": 0.01108432, - "auxiliary_loss_mlp": 0.010277, - "balance_loss_clip": 1.03545749, - "balance_loss_mlp": 1.01527882, - "epoch": 0.9952202014128965, - "flos": 28950715952640.0, - "grad_norm": 1.414763440540152, - "language_loss": 0.81504261, - "learning_rate": 2.3665467891520465e-10, - "loss": 0.83640391, - "num_input_tokens_seen": 357257565, - "step": 16553, - "time_per_iteration": 2.7514050006866455 - }, - { - "auxiliary_loss_clip": 0.01016193, - "auxiliary_loss_mlp": 0.01001561, - "balance_loss_clip": 1.00488806, - "balance_loss_mlp": 1.00064945, - "epoch": 0.9952803246655644, - "flos": 70810386145920.0, - "grad_norm": 0.7163424076202736, - "language_loss": 0.57331538, - "learning_rate": 2.3070145503001348e-10, - "loss": 0.59349293, - "num_input_tokens_seen": 357320205, - "step": 16554, - "time_per_iteration": 3.343486785888672 - }, - { - "auxiliary_loss_clip": 0.01092483, - "auxiliary_loss_mlp": 0.01037848, - "balance_loss_clip": 1.03661346, - "balance_loss_mlp": 1.02556348, - "epoch": 0.9953404479182324, - "flos": 21799070271360.0, - "grad_norm": 1.572030875373758, - "language_loss": 0.77075458, - "learning_rate": 2.24824062597051e-10, - "loss": 0.79205793, - "num_input_tokens_seen": 357340695, - "step": 16555, - "time_per_iteration": 2.6856164932250977 - }, - { - "auxiliary_loss_clip": 0.01077447, - "auxiliary_loss_mlp": 0.01031566, - "balance_loss_clip": 1.03370774, - "balance_loss_mlp": 1.01910233, - "epoch": 0.9954005711709003, - "flos": 21937397546880.0, - "grad_norm": 2.812435043549039, - "language_loss": 0.86056131, - "learning_rate": 2.1902250183902793e-10, - "loss": 0.8816514, - "num_input_tokens_seen": 357357505, - "step": 16556, - "time_per_iteration": 2.8494834899902344 - }, - { - "auxiliary_loss_clip": 0.01062031, - "auxiliary_loss_mlp": 0.01032977, - "balance_loss_clip": 1.03475928, - "balance_loss_mlp": 1.02018583, - "epoch": 0.9954606944235683, - "flos": 19354559212800.0, - "grad_norm": 1.7341010844964493, - "language_loss": 0.73350233, - "learning_rate": 2.132967729762125e-10, - "loss": 0.75445241, - "num_input_tokens_seen": 357375395, - "step": 16557, - "time_per_iteration": 2.776954412460327 - }, - { - "auxiliary_loss_clip": 0.01096785, - "auxiliary_loss_mlp": 0.01035737, - "balance_loss_clip": 1.03676844, - "balance_loss_mlp": 1.02380407, - "epoch": 0.9955208176762362, - "flos": 30518611591680.0, - "grad_norm": 1.8889126824734808, - "language_loss": 0.76071554, - "learning_rate": 2.0764687622554233e-10, - "loss": 0.78204083, - "num_input_tokens_seen": 357397375, - "step": 16558, - "time_per_iteration": 2.725471258163452 - }, - { - "auxiliary_loss_clip": 0.01082875, - "auxiliary_loss_mlp": 0.0103135, - "balance_loss_clip": 1.0333569, - "balance_loss_mlp": 1.01868999, - "epoch": 0.9955809409289043, - "flos": 30008249199360.0, - "grad_norm": 1.8788857895854807, - "language_loss": 0.6342541, - "learning_rate": 2.0207281180129044e-10, - "loss": 0.65539634, - "num_input_tokens_seen": 357418880, - "step": 16559, - "time_per_iteration": 2.754697322845459 - }, - { - "auxiliary_loss_clip": 0.01094664, - "auxiliary_loss_mlp": 0.01027311, - "balance_loss_clip": 1.03535438, - "balance_loss_mlp": 1.01506233, - "epoch": 0.9956410641815723, - "flos": 21543278544000.0, - "grad_norm": 2.2357683381447044, - "language_loss": 0.74527764, - "learning_rate": 1.965745799148433e-10, - "loss": 0.76649737, - "num_input_tokens_seen": 357438310, - "step": 16560, - "time_per_iteration": 2.675863265991211 - }, - { - "auxiliary_loss_clip": 0.01050704, - "auxiliary_loss_mlp": 0.01026972, - "balance_loss_clip": 1.03353262, - "balance_loss_mlp": 1.0149498, - "epoch": 0.9957011874342402, - "flos": 21689470897920.0, - "grad_norm": 1.7541279105695071, - "language_loss": 0.7902168, - "learning_rate": 1.9115218077470073e-10, - "loss": 0.81099355, - "num_input_tokens_seen": 357457155, - "step": 16561, - "time_per_iteration": 2.800518751144409 - }, - { - "auxiliary_loss_clip": 0.01105362, - "auxiliary_loss_mlp": 0.01030097, - "balance_loss_clip": 1.03662086, - "balance_loss_mlp": 1.01839638, - "epoch": 0.9957613106869082, - "flos": 17702667619200.0, - "grad_norm": 2.712205364252532, - "language_loss": 0.65797567, - "learning_rate": 1.8580561458647614e-10, - "loss": 0.67933023, - "num_input_tokens_seen": 357468060, - "step": 16562, - "time_per_iteration": 2.6822054386138916 - }, - { - "auxiliary_loss_clip": 0.01086196, - "auxiliary_loss_mlp": 0.00770624, - "balance_loss_clip": 1.03828645, - "balance_loss_mlp": 1.00018549, - "epoch": 0.9958214339395761, - "flos": 30555994671360.0, - "grad_norm": 3.146501927176202, - "language_loss": 0.6437341, - "learning_rate": 1.805348815528962e-10, - "loss": 0.66230226, - "num_input_tokens_seen": 357489665, - "step": 16563, - "time_per_iteration": 2.7867605686187744 - }, - { - "auxiliary_loss_clip": 0.01085894, - "auxiliary_loss_mlp": 0.01032654, - "balance_loss_clip": 1.03683317, - "balance_loss_mlp": 1.01987505, - "epoch": 0.9958815571922441, - "flos": 24169174306560.0, - "grad_norm": 2.104507608134006, - "language_loss": 0.64749634, - "learning_rate": 1.7533998187380105e-10, - "loss": 0.66868186, - "num_input_tokens_seen": 357511975, - "step": 16564, - "time_per_iteration": 2.7374000549316406 - }, - { - "auxiliary_loss_clip": 0.010846, - "auxiliary_loss_mlp": 0.00769579, - "balance_loss_clip": 1.03644037, - "balance_loss_mlp": 1.00024891, - "epoch": 0.995941680444912, - "flos": 15487016065920.0, - "grad_norm": 2.0341967172049857, - "language_loss": 0.7462337, - "learning_rate": 1.7022091574636633e-10, - "loss": 0.76477551, - "num_input_tokens_seen": 357529345, - "step": 16565, - "time_per_iteration": 2.6312754154205322 - }, - { - "auxiliary_loss_clip": 0.01087362, - "auxiliary_loss_mlp": 0.01027615, - "balance_loss_clip": 1.03376865, - "balance_loss_mlp": 1.0157181, - "epoch": 0.9960018036975801, - "flos": 18621227145600.0, - "grad_norm": 1.7027522514634321, - "language_loss": 0.79018843, - "learning_rate": 1.6517768336443694e-10, - "loss": 0.81133819, - "num_input_tokens_seen": 357547615, - "step": 16566, - "time_per_iteration": 2.6870059967041016 - }, - { - "auxiliary_loss_clip": 0.01056958, - "auxiliary_loss_mlp": 0.00769517, - "balance_loss_clip": 1.03390598, - "balance_loss_mlp": 1.0001384, - "epoch": 0.996061926950248, - "flos": 20084120352000.0, - "grad_norm": 1.7390992367091276, - "language_loss": 0.70729011, - "learning_rate": 1.6021028491941535e-10, - "loss": 0.72555488, - "num_input_tokens_seen": 357567380, - "step": 16567, - "time_per_iteration": 2.7366580963134766 - }, - { - "auxiliary_loss_clip": 0.01097619, - "auxiliary_loss_mlp": 0.01032972, - "balance_loss_clip": 1.03566027, - "balance_loss_mlp": 1.01965046, - "epoch": 0.996122050202916, - "flos": 24347829576960.0, - "grad_norm": 2.9373159802346076, - "language_loss": 0.79025483, - "learning_rate": 1.5531872059959538e-10, - "loss": 0.81156075, - "num_input_tokens_seen": 357586435, - "step": 16568, - "time_per_iteration": 2.6557395458221436 - }, - { - "auxiliary_loss_clip": 0.01093714, - "auxiliary_loss_mlp": 0.01028241, - "balance_loss_clip": 1.03664947, - "balance_loss_mlp": 1.0173099, - "epoch": 0.9961821734555839, - "flos": 24199302839040.0, - "grad_norm": 1.7081910845577881, - "language_loss": 0.81825495, - "learning_rate": 1.5050299059060634e-10, - "loss": 0.83947456, - "num_input_tokens_seen": 357604720, - "step": 16569, - "time_per_iteration": 2.750368118286133 - }, - { - "auxiliary_loss_clip": 0.0106979, - "auxiliary_loss_mlp": 0.00770531, - "balance_loss_clip": 1.03594494, - "balance_loss_mlp": 1.00018477, - "epoch": 0.9962422967082519, - "flos": 22633741584000.0, - "grad_norm": 1.812159782234162, - "language_loss": 0.7033971, - "learning_rate": 1.457630950747468e-10, - "loss": 0.72180027, - "num_input_tokens_seen": 357622345, - "step": 16570, - "time_per_iteration": 2.6845390796661377 - }, - { - "auxiliary_loss_clip": 0.01079783, - "auxiliary_loss_mlp": 0.01026272, - "balance_loss_clip": 1.03678036, - "balance_loss_mlp": 1.01413023, - "epoch": 0.9963024199609198, - "flos": 26396030903040.0, - "grad_norm": 1.5778392939659474, - "language_loss": 0.75031984, - "learning_rate": 1.4109903423209502e-10, - "loss": 0.77138042, - "num_input_tokens_seen": 357642710, - "step": 16571, - "time_per_iteration": 2.6998531818389893 - }, - { - "auxiliary_loss_clip": 0.01085876, - "auxiliary_loss_mlp": 0.01032303, - "balance_loss_clip": 1.03418159, - "balance_loss_mlp": 1.01976252, - "epoch": 0.9963625432135879, - "flos": 16581537342720.0, - "grad_norm": 2.056267788643989, - "language_loss": 0.79312503, - "learning_rate": 1.3651080823939843e-10, - "loss": 0.81430686, - "num_input_tokens_seen": 357659870, - "step": 16572, - "time_per_iteration": 2.6602699756622314 - }, - { - "auxiliary_loss_clip": 0.01083413, - "auxiliary_loss_mlp": 0.01031239, - "balance_loss_clip": 1.0354054, - "balance_loss_mlp": 1.01907969, - "epoch": 0.9964226664662559, - "flos": 26468534505600.0, - "grad_norm": 1.9072175246303182, - "language_loss": 0.7072866, - "learning_rate": 1.3199841727074e-10, - "loss": 0.72843313, - "num_input_tokens_seen": 357677075, - "step": 16573, - "time_per_iteration": 2.7399983406066895 - }, - { - "auxiliary_loss_clip": 0.01085736, - "auxiliary_loss_mlp": 0.01031859, - "balance_loss_clip": 1.03562653, - "balance_loss_mlp": 1.01902056, - "epoch": 0.9964827897189238, - "flos": 27448320764160.0, - "grad_norm": 17.706098733972073, - "language_loss": 0.63426065, - "learning_rate": 1.275618614968721e-10, - "loss": 0.65543658, - "num_input_tokens_seen": 357696715, - "step": 16574, - "time_per_iteration": 2.7760708332061768 - }, - { - "auxiliary_loss_clip": 0.01079151, - "auxiliary_loss_mlp": 0.01032112, - "balance_loss_clip": 1.03830504, - "balance_loss_mlp": 1.01859987, - "epoch": 0.9965429129715918, - "flos": 11721566350080.0, - "grad_norm": 2.269343954820431, - "language_loss": 0.76514804, - "learning_rate": 1.2320114108654856e-10, - "loss": 0.78626072, - "num_input_tokens_seen": 357712345, - "step": 16575, - "time_per_iteration": 2.670433759689331 - }, - { - "auxiliary_loss_clip": 0.01086412, - "auxiliary_loss_mlp": 0.01030377, - "balance_loss_clip": 1.03638375, - "balance_loss_mlp": 1.01757431, - "epoch": 0.9966030362242597, - "flos": 19756004590080.0, - "grad_norm": 1.890239065955032, - "language_loss": 0.70341682, - "learning_rate": 1.1891625620474855e-10, - "loss": 0.72458476, - "num_input_tokens_seen": 357731815, - "step": 16576, - "time_per_iteration": 2.7393879890441895 - }, - { - "auxiliary_loss_clip": 0.0109524, - "auxiliary_loss_mlp": 0.01024289, - "balance_loss_clip": 1.03612185, - "balance_loss_mlp": 1.01186752, - "epoch": 0.9966631594769277, - "flos": 23915178259200.0, - "grad_norm": 1.5127574576312723, - "language_loss": 0.71783984, - "learning_rate": 1.1470720701400871e-10, - "loss": 0.73903513, - "num_input_tokens_seen": 357751640, - "step": 16577, - "time_per_iteration": 2.6747822761535645 - }, - { - "auxiliary_loss_clip": 0.0108308, - "auxiliary_loss_mlp": 0.01034487, - "balance_loss_clip": 1.03563082, - "balance_loss_mlp": 1.02241135, - "epoch": 0.9967232827295956, - "flos": 15559591495680.0, - "grad_norm": 1.8839463168037793, - "language_loss": 0.78829128, - "learning_rate": 1.1057399367397912e-10, - "loss": 0.80946696, - "num_input_tokens_seen": 357769850, - "step": 16578, - "time_per_iteration": 2.6458945274353027 - }, - { - "auxiliary_loss_clip": 0.01069592, - "auxiliary_loss_mlp": 0.0076966, - "balance_loss_clip": 1.0383426, - "balance_loss_mlp": 1.00028622, - "epoch": 0.9967834059822637, - "flos": 20813035046400.0, - "grad_norm": 1.6210978789721697, - "language_loss": 0.76015878, - "learning_rate": 1.0651661634142328e-10, - "loss": 0.77855128, - "num_input_tokens_seen": 357789550, - "step": 16579, - "time_per_iteration": 2.7179081439971924 - }, - { - "auxiliary_loss_clip": 0.01085459, - "auxiliary_loss_mlp": 0.01037581, - "balance_loss_clip": 1.03625321, - "balance_loss_mlp": 1.02271509, - "epoch": 0.9968435292349316, - "flos": 36719234830080.0, - "grad_norm": 2.1621427705186513, - "language_loss": 0.69284117, - "learning_rate": 1.0253507516999604e-10, - "loss": 0.71407157, - "num_input_tokens_seen": 357809525, - "step": 16580, - "time_per_iteration": 4.3343565464019775 - }, - { - "auxiliary_loss_clip": 0.0105428, - "auxiliary_loss_mlp": 0.01032231, - "balance_loss_clip": 1.03120196, - "balance_loss_mlp": 1.02024424, - "epoch": 0.9969036524875996, - "flos": 26760919213440.0, - "grad_norm": 1.9697975439158977, - "language_loss": 0.79967076, - "learning_rate": 9.862937031113184e-11, - "loss": 0.8205359, - "num_input_tokens_seen": 357829795, - "step": 16581, - "time_per_iteration": 4.272336483001709 - }, - { - "auxiliary_loss_clip": 0.01078953, - "auxiliary_loss_mlp": 0.01027322, - "balance_loss_clip": 1.03649044, - "balance_loss_mlp": 1.01607418, - "epoch": 0.9969637757402675, - "flos": 24827237424000.0, - "grad_norm": 1.8090343516567968, - "language_loss": 0.80200183, - "learning_rate": 9.479950191249031e-11, - "loss": 0.82306457, - "num_input_tokens_seen": 357851655, - "step": 16582, - "time_per_iteration": 4.770942449569702 - }, - { - "auxiliary_loss_clip": 0.01092857, - "auxiliary_loss_mlp": 0.01033363, - "balance_loss_clip": 1.03387117, - "balance_loss_mlp": 1.02106702, - "epoch": 0.9970238989929355, - "flos": 23038742407680.0, - "grad_norm": 1.6176460264436903, - "language_loss": 0.60509884, - "learning_rate": 9.104547011951069e-11, - "loss": 0.62636101, - "num_input_tokens_seen": 357871205, - "step": 16583, - "time_per_iteration": 2.670657157897949 - }, - { - "auxiliary_loss_clip": 0.01088101, - "auxiliary_loss_mlp": 0.01037237, - "balance_loss_clip": 1.03633022, - "balance_loss_mlp": 1.0250237, - "epoch": 0.9970840222456034, - "flos": 25298816106240.0, - "grad_norm": 1.6127986377425965, - "language_loss": 0.77779889, - "learning_rate": 8.736727507452357e-11, - "loss": 0.79905224, - "num_input_tokens_seen": 357892145, - "step": 16584, - "time_per_iteration": 2.6968681812286377 - }, - { - "auxiliary_loss_clip": 0.01081813, - "auxiliary_loss_mlp": 0.01030965, - "balance_loss_clip": 1.03400755, - "balance_loss_mlp": 1.01991463, - "epoch": 0.9971441454982715, - "flos": 21615602578560.0, - "grad_norm": 1.5491233705844139, - "language_loss": 0.69406962, - "learning_rate": 8.376491691697297e-11, - "loss": 0.71519732, - "num_input_tokens_seen": 357911205, - "step": 16585, - "time_per_iteration": 4.212535381317139 - }, - { - "auxiliary_loss_clip": 0.0110602, - "auxiliary_loss_mlp": 0.01033386, - "balance_loss_clip": 1.03605747, - "balance_loss_mlp": 1.02094698, - "epoch": 0.9972042687509394, - "flos": 14975612179200.0, - "grad_norm": 2.688083222566017, - "language_loss": 0.82222629, - "learning_rate": 8.023839578363834e-11, - "loss": 0.84362036, - "num_input_tokens_seen": 357928190, - "step": 16586, - "time_per_iteration": 2.5343804359436035 - }, - { - "auxiliary_loss_clip": 0.01084137, - "auxiliary_loss_mlp": 0.01038457, - "balance_loss_clip": 1.03290653, - "balance_loss_mlp": 1.02660799, - "epoch": 0.9972643920036074, - "flos": 25806664546560.0, - "grad_norm": 2.102200677561442, - "language_loss": 0.7796334, - "learning_rate": 7.678771180796851e-11, - "loss": 0.80085933, - "num_input_tokens_seen": 357946985, - "step": 16587, - "time_per_iteration": 2.653956174850464 - }, - { - "auxiliary_loss_clip": 0.01083114, - "auxiliary_loss_mlp": 0.01036983, - "balance_loss_clip": 1.03732991, - "balance_loss_mlp": 1.02448964, - "epoch": 0.9973245152562754, - "flos": 23326242865920.0, - "grad_norm": 5.715123647273254, - "language_loss": 0.73174369, - "learning_rate": 7.341286512074773e-11, - "loss": 0.75294471, - "num_input_tokens_seen": 357966720, - "step": 16588, - "time_per_iteration": 2.5937352180480957 - }, - { - "auxiliary_loss_clip": 0.01112154, - "auxiliary_loss_mlp": 0.01029109, - "balance_loss_clip": 1.03663898, - "balance_loss_mlp": 1.01646113, - "epoch": 0.9973846385089433, - "flos": 12166212810240.0, - "grad_norm": 5.411177211250548, - "language_loss": 0.82386965, - "learning_rate": 7.011385585031781e-11, - "loss": 0.84528232, - "num_input_tokens_seen": 357981375, - "step": 16589, - "time_per_iteration": 2.5262768268585205 - }, - { - "auxiliary_loss_clip": 0.01100757, - "auxiliary_loss_mlp": 0.0103796, - "balance_loss_clip": 1.03564775, - "balance_loss_mlp": 1.02382755, - "epoch": 0.9974447617616113, - "flos": 20045157073920.0, - "grad_norm": 4.308142641596885, - "language_loss": 0.70464408, - "learning_rate": 6.689068412168986e-11, - "loss": 0.72603118, - "num_input_tokens_seen": 358000290, - "step": 16590, - "time_per_iteration": 2.5830941200256348 - }, - { - "auxiliary_loss_clip": 0.01086738, - "auxiliary_loss_mlp": 0.01032028, - "balance_loss_clip": 1.03551257, - "balance_loss_mlp": 1.01895654, - "epoch": 0.9975048850142793, - "flos": 32014614159360.0, - "grad_norm": 4.864987201646496, - "language_loss": 0.63802195, - "learning_rate": 6.374335005676634e-11, - "loss": 0.65920961, - "num_input_tokens_seen": 358022075, - "step": 16591, - "time_per_iteration": 2.68571400642395 - }, - { - "auxiliary_loss_clip": 0.0108584, - "auxiliary_loss_mlp": 0.01030286, - "balance_loss_clip": 1.03423333, - "balance_loss_mlp": 1.01809728, - "epoch": 0.9975650082669473, - "flos": 36933728895360.0, - "grad_norm": 2.6236190500257726, - "language_loss": 0.73096275, - "learning_rate": 6.067185377522933e-11, - "loss": 0.75212401, - "num_input_tokens_seen": 358043940, - "step": 16592, - "time_per_iteration": 2.7373883724212646 - }, - { - "auxiliary_loss_clip": 0.01087724, - "auxiliary_loss_mlp": 0.01032375, - "balance_loss_clip": 1.03737628, - "balance_loss_mlp": 1.01964951, - "epoch": 0.9976251315196152, - "flos": 16472117537280.0, - "grad_norm": 1.4821546433362938, - "language_loss": 0.85078406, - "learning_rate": 5.767619539343016e-11, - "loss": 0.87198508, - "num_input_tokens_seen": 358062720, - "step": 16593, - "time_per_iteration": 2.662369966506958 - }, - { - "auxiliary_loss_clip": 0.01104576, - "auxiliary_loss_mlp": 0.0076981, - "balance_loss_clip": 1.03564858, - "balance_loss_mlp": 1.00020099, - "epoch": 0.9976852547722832, - "flos": 19646836179840.0, - "grad_norm": 1.9769219864730705, - "language_loss": 0.6983223, - "learning_rate": 5.4756375024833656e-11, - "loss": 0.71706617, - "num_input_tokens_seen": 358081560, - "step": 16594, - "time_per_iteration": 2.5857043266296387 - }, - { - "auxiliary_loss_clip": 0.01069224, - "auxiliary_loss_mlp": 0.01026926, - "balance_loss_clip": 1.03892672, - "balance_loss_mlp": 1.01451635, - "epoch": 0.9977453780249511, - "flos": 20448434044800.0, - "grad_norm": 2.022789522013575, - "language_loss": 0.72606945, - "learning_rate": 5.1912392780462113e-11, - "loss": 0.74703097, - "num_input_tokens_seen": 358099065, - "step": 16595, - "time_per_iteration": 2.7689433097839355 - }, - { - "auxiliary_loss_clip": 0.01007096, - "auxiliary_loss_mlp": 0.01003373, - "balance_loss_clip": 1.00481117, - "balance_loss_mlp": 1.00250244, - "epoch": 0.9978055012776191, - "flos": 65455097581440.0, - "grad_norm": 0.7875629365454856, - "language_loss": 0.60383916, - "learning_rate": 4.9144248768007156e-11, - "loss": 0.62394392, - "num_input_tokens_seen": 358156095, - "step": 16596, - "time_per_iteration": 3.08450984954834 - }, - { - "auxiliary_loss_clip": 0.01096892, - "auxiliary_loss_mlp": 0.01029762, - "balance_loss_clip": 1.03594232, - "balance_loss_mlp": 1.01738787, - "epoch": 0.997865624530287, - "flos": 20631506688000.0, - "grad_norm": 2.0986961825985087, - "language_loss": 0.77297747, - "learning_rate": 4.645194309227385e-11, - "loss": 0.79424405, - "num_input_tokens_seen": 358175230, - "step": 16597, - "time_per_iteration": 2.6868622303009033 - }, - { - "auxiliary_loss_clip": 0.01097035, - "auxiliary_loss_mlp": 0.01030189, - "balance_loss_clip": 1.03486156, - "balance_loss_mlp": 1.01730847, - "epoch": 0.9979257477829551, - "flos": 29387102284800.0, - "grad_norm": 1.756861728101755, - "language_loss": 0.82217014, - "learning_rate": 4.383547585562475e-11, - "loss": 0.84344238, - "num_input_tokens_seen": 358197075, - "step": 16598, - "time_per_iteration": 2.7054567337036133 - }, - { - "auxiliary_loss_clip": 0.01081519, - "auxiliary_loss_mlp": 0.01044245, - "balance_loss_clip": 1.03558803, - "balance_loss_mlp": 1.03068531, - "epoch": 0.997985871035623, - "flos": 22635070387200.0, - "grad_norm": 2.4545106380847335, - "language_loss": 0.64762008, - "learning_rate": 4.129484715709175e-11, - "loss": 0.66887772, - "num_input_tokens_seen": 358215925, - "step": 16599, - "time_per_iteration": 2.6614456176757812 - }, - { - "auxiliary_loss_clip": 0.01010593, - "auxiliary_loss_mlp": 0.01000422, - "balance_loss_clip": 1.00784099, - "balance_loss_mlp": 0.9994688, - "epoch": 0.998045994288291, - "flos": 61806968663040.0, - "grad_norm": 0.9370474706148707, - "language_loss": 0.62274885, - "learning_rate": 3.8830057093264256e-11, - "loss": 0.64285898, - "num_input_tokens_seen": 358269035, - "step": 16600, - "time_per_iteration": 3.1614274978637695 - }, - { - "auxiliary_loss_clip": 0.01085288, - "auxiliary_loss_mlp": 0.01032015, - "balance_loss_clip": 1.03679216, - "balance_loss_mlp": 1.02095842, - "epoch": 0.998106117540959, - "flos": 19245534456960.0, - "grad_norm": 1.511083911813729, - "language_loss": 0.78393221, - "learning_rate": 3.644110575717896e-11, - "loss": 0.80510521, - "num_input_tokens_seen": 358287680, - "step": 16601, - "time_per_iteration": 2.772331953048706 - }, - { - "auxiliary_loss_clip": 0.01077732, - "auxiliary_loss_mlp": 0.01031133, - "balance_loss_clip": 1.03514004, - "balance_loss_mlp": 1.01892638, - "epoch": 0.9981662407936269, - "flos": 21106209853440.0, - "grad_norm": 2.513777021712519, - "language_loss": 0.82537293, - "learning_rate": 3.412799323987414e-11, - "loss": 0.84646153, - "num_input_tokens_seen": 358304080, - "step": 16602, - "time_per_iteration": 2.6796252727508545 - }, - { - "auxiliary_loss_clip": 0.01068281, - "auxiliary_loss_mlp": 0.01034598, - "balance_loss_clip": 1.03651309, - "balance_loss_mlp": 1.02249801, - "epoch": 0.998226364046295, - "flos": 24316839118080.0, - "grad_norm": 2.030453284598539, - "language_loss": 0.62777632, - "learning_rate": 3.189071962883538e-11, - "loss": 0.64880514, - "num_input_tokens_seen": 358323670, - "step": 16603, - "time_per_iteration": 2.693939447402954 - }, - { - "auxiliary_loss_clip": 0.01084524, - "auxiliary_loss_mlp": 0.01027967, - "balance_loss_clip": 1.03433418, - "balance_loss_mlp": 1.01537895, - "epoch": 0.9982864872989629, - "flos": 23836389776640.0, - "grad_norm": 1.7079397475017406, - "language_loss": 0.70913982, - "learning_rate": 2.972928500866168e-11, - "loss": 0.73026478, - "num_input_tokens_seen": 358341980, - "step": 16604, - "time_per_iteration": 2.8074941635131836 - }, - { - "auxiliary_loss_clip": 0.0110762, - "auxiliary_loss_mlp": 0.01027575, - "balance_loss_clip": 1.03609681, - "balance_loss_mlp": 1.01511717, - "epoch": 0.9983466105516309, - "flos": 18333116156160.0, - "grad_norm": 1.8321992225796084, - "language_loss": 0.64592469, - "learning_rate": 2.7643689461953613e-11, - "loss": 0.66727662, - "num_input_tokens_seen": 358360400, - "step": 16605, - "time_per_iteration": 2.559711456298828 - }, - { - "auxiliary_loss_clip": 0.01072745, - "auxiliary_loss_mlp": 0.0103157, - "balance_loss_clip": 1.03378582, - "balance_loss_mlp": 1.01944005, - "epoch": 0.9984067338042988, - "flos": 17236763285760.0, - "grad_norm": 1.7112583965701615, - "language_loss": 0.7144081, - "learning_rate": 2.5633933067092938e-11, - "loss": 0.73545122, - "num_input_tokens_seen": 358378990, - "step": 16606, - "time_per_iteration": 2.6522889137268066 - }, - { - "auxiliary_loss_clip": 0.0109612, - "auxiliary_loss_mlp": 0.00770001, - "balance_loss_clip": 1.03534591, - "balance_loss_mlp": 1.00014746, - "epoch": 0.9984668570569668, - "flos": 20667884186880.0, - "grad_norm": 1.989921171025738, - "language_loss": 0.82035434, - "learning_rate": 2.370001590090709e-11, - "loss": 0.8390156, - "num_input_tokens_seen": 358395970, - "step": 16607, - "time_per_iteration": 2.6804637908935547 - }, - { - "auxiliary_loss_clip": 0.0107541, - "auxiliary_loss_mlp": 0.01033615, - "balance_loss_clip": 1.03306758, - "balance_loss_mlp": 1.02051961, - "epoch": 0.9985269803096347, - "flos": 30262532555520.0, - "grad_norm": 1.6639542456977479, - "language_loss": 0.67119384, - "learning_rate": 2.184193803622669e-11, - "loss": 0.69228399, - "num_input_tokens_seen": 358417355, - "step": 16608, - "time_per_iteration": 2.906008005142212 - }, - { - "auxiliary_loss_clip": 0.01063208, - "auxiliary_loss_mlp": 0.0103334, - "balance_loss_clip": 1.03657353, - "balance_loss_mlp": 1.02062011, - "epoch": 0.9985871035623027, - "flos": 10560970005120.0, - "grad_norm": 1.8164676216631945, - "language_loss": 0.80704165, - "learning_rate": 2.0059699543883978e-11, - "loss": 0.82800716, - "num_input_tokens_seen": 358434345, - "step": 16609, - "time_per_iteration": 2.7321889400482178 - }, - { - "auxiliary_loss_clip": 0.01087746, - "auxiliary_loss_mlp": 0.01034825, - "balance_loss_clip": 1.03453326, - "balance_loss_mlp": 1.02246904, - "epoch": 0.9986472268149706, - "flos": 16873455173760.0, - "grad_norm": 1.4927424952025787, - "language_loss": 0.62772417, - "learning_rate": 1.8353300491158462e-11, - "loss": 0.64894992, - "num_input_tokens_seen": 358452870, - "step": 16610, - "time_per_iteration": 2.6517322063446045 - }, - { - "auxiliary_loss_clip": 0.01089605, - "auxiliary_loss_mlp": 0.01032987, - "balance_loss_clip": 1.03502405, - "balance_loss_mlp": 1.02091718, - "epoch": 0.9987073500676387, - "flos": 22054538776320.0, - "grad_norm": 2.237128509248557, - "language_loss": 0.67805243, - "learning_rate": 1.672274094288717e-11, - "loss": 0.69927835, - "num_input_tokens_seen": 358472210, - "step": 16611, - "time_per_iteration": 2.634993553161621 - }, - { - "auxiliary_loss_clip": 0.01066706, - "auxiliary_loss_mlp": 0.01038076, - "balance_loss_clip": 1.03627813, - "balance_loss_mlp": 1.02527332, - "epoch": 0.9987674733203066, - "flos": 30482880537600.0, - "grad_norm": 1.4582335875749615, - "language_loss": 0.69769359, - "learning_rate": 1.5168020961020544e-11, - "loss": 0.71874142, - "num_input_tokens_seen": 358493840, - "step": 16612, - "time_per_iteration": 2.8596408367156982 - }, - { - "auxiliary_loss_clip": 0.01083064, - "auxiliary_loss_mlp": 0.0103316, - "balance_loss_clip": 1.03643417, - "balance_loss_mlp": 1.02126336, - "epoch": 0.9988275965729746, - "flos": 27745230585600.0, - "grad_norm": 1.6439272991561156, - "language_loss": 0.73709273, - "learning_rate": 1.3689140604400407e-11, - "loss": 0.75825495, - "num_input_tokens_seen": 358515060, - "step": 16613, - "time_per_iteration": 2.7902584075927734 - }, - { - "auxiliary_loss_clip": 0.01071712, - "auxiliary_loss_mlp": 0.0077277, - "balance_loss_clip": 1.0345372, - "balance_loss_mlp": 1.00019884, - "epoch": 0.9988877198256426, - "flos": 17524191916800.0, - "grad_norm": 1.920035389313773, - "language_loss": 0.73619223, - "learning_rate": 1.2286099928981996e-11, - "loss": 0.754637, - "num_input_tokens_seen": 358528200, - "step": 16614, - "time_per_iteration": 2.6406190395355225 - }, - { - "auxiliary_loss_clip": 0.01094466, - "auxiliary_loss_mlp": 0.01032872, - "balance_loss_clip": 1.03571415, - "balance_loss_mlp": 1.02093267, - "epoch": 0.9989478430783105, - "flos": 20996502739200.0, - "grad_norm": 1.5665679331066227, - "language_loss": 0.722013, - "learning_rate": 1.0958898988278065e-11, - "loss": 0.74328637, - "num_input_tokens_seen": 358548360, - "step": 16615, - "time_per_iteration": 2.639946222305298 - }, - { - "auxiliary_loss_clip": 0.01112886, - "auxiliary_loss_mlp": 0.00770149, - "balance_loss_clip": 1.03912163, - "balance_loss_mlp": 1.00027168, - "epoch": 0.9990079663309785, - "flos": 13370620769280.0, - "grad_norm": 2.06456218997016, - "language_loss": 0.77498305, - "learning_rate": 9.70753783247069e-12, - "loss": 0.79381335, - "num_input_tokens_seen": 358566270, - "step": 16616, - "time_per_iteration": 2.703230619430542 - }, - { - "auxiliary_loss_clip": 0.01081698, - "auxiliary_loss_mlp": 0.01029148, - "balance_loss_clip": 1.03569651, - "balance_loss_mlp": 1.01701295, - "epoch": 0.9990680895836465, - "flos": 17310236555520.0, - "grad_norm": 2.2671180152095696, - "language_loss": 0.82647479, - "learning_rate": 8.532016508855378e-12, - "loss": 0.84758323, - "num_input_tokens_seen": 358584710, - "step": 16617, - "time_per_iteration": 2.6513431072235107 - }, - { - "auxiliary_loss_clip": 0.01086051, - "auxiliary_loss_mlp": 0.01027312, - "balance_loss_clip": 1.03479171, - "balance_loss_mlp": 1.01599932, - "epoch": 0.9991282128363145, - "flos": 24207993930240.0, - "grad_norm": 1.6210542380677302, - "language_loss": 0.78575474, - "learning_rate": 7.43233506206309e-12, - "loss": 0.8068884, - "num_input_tokens_seen": 358606750, - "step": 16618, - "time_per_iteration": 2.6931798458099365 - }, - { - "auxiliary_loss_clip": 0.01105935, - "auxiliary_loss_mlp": 0.01031572, - "balance_loss_clip": 1.03507876, - "balance_loss_mlp": 1.01963282, - "epoch": 0.9991883360889824, - "flos": 21175301664000.0, - "grad_norm": 1.7008832792892883, - "language_loss": 0.74742877, - "learning_rate": 6.408493534060255e-12, - "loss": 0.76880378, - "num_input_tokens_seen": 358624675, - "step": 16619, - "time_per_iteration": 4.155118942260742 - }, - { - "auxiliary_loss_clip": 0.01093229, - "auxiliary_loss_mlp": 0.01028121, - "balance_loss_clip": 1.03345323, - "balance_loss_mlp": 1.01702261, - "epoch": 0.9992484593416504, - "flos": 19901155449600.0, - "grad_norm": 2.899887344454389, - "language_loss": 0.8699075, - "learning_rate": 5.460491963260594e-12, - "loss": 0.89112103, - "num_input_tokens_seen": 358640715, - "step": 16620, - "time_per_iteration": 4.1041669845581055 - }, - { - "auxiliary_loss_clip": 0.01066897, - "auxiliary_loss_mlp": 0.01026513, - "balance_loss_clip": 1.02997065, - "balance_loss_mlp": 1.01463938, - "epoch": 0.9993085825943183, - "flos": 24857832833280.0, - "grad_norm": 2.059624292912411, - "language_loss": 0.72426653, - "learning_rate": 4.58833038607942e-12, - "loss": 0.74520063, - "num_input_tokens_seen": 358659630, - "step": 16621, - "time_per_iteration": 4.831484794616699 - }, - { - "auxiliary_loss_clip": 0.01000795, - "auxiliary_loss_mlp": 0.00999466, - "balance_loss_clip": 1.00817204, - "balance_loss_mlp": 0.99854225, - "epoch": 0.9993687058469863, - "flos": 71284478780160.0, - "grad_norm": 0.7355150485503724, - "language_loss": 0.56584859, - "learning_rate": 3.79200883515729e-12, - "loss": 0.58585119, - "num_input_tokens_seen": 358727840, - "step": 16622, - "time_per_iteration": 3.3878767490386963 - }, - { - "auxiliary_loss_clip": 0.0106847, - "auxiliary_loss_mlp": 0.01031465, - "balance_loss_clip": 1.0328269, - "balance_loss_mlp": 1.01847744, - "epoch": 0.9994288290996542, - "flos": 12199573566720.0, - "grad_norm": 2.3925047005917244, - "language_loss": 0.71642292, - "learning_rate": 3.071527340914315e-12, - "loss": 0.73742235, - "num_input_tokens_seen": 358744125, - "step": 16623, - "time_per_iteration": 2.725473642349243 - }, - { - "auxiliary_loss_clip": 0.01064784, - "auxiliary_loss_mlp": 0.0103218, - "balance_loss_clip": 1.0360446, - "balance_loss_mlp": 1.01946068, - "epoch": 0.9994889523523223, - "flos": 17889942153600.0, - "grad_norm": 1.8384385141171624, - "language_loss": 0.7497015, - "learning_rate": 2.4268859304399368e-12, - "loss": 0.77067113, - "num_input_tokens_seen": 358761420, - "step": 16624, - "time_per_iteration": 4.1755170822143555 - }, - { - "auxiliary_loss_clip": 0.010734, - "auxiliary_loss_mlp": 0.01030652, - "balance_loss_clip": 1.03599191, - "balance_loss_mlp": 1.01818919, - "epoch": 0.9995490756049902, - "flos": 26578888064640.0, - "grad_norm": 1.666585360491219, - "language_loss": 0.73861277, - "learning_rate": 1.8580846286031514e-12, - "loss": 0.75965327, - "num_input_tokens_seen": 358782600, - "step": 16625, - "time_per_iteration": 2.77114200592041 - }, - { - "auxiliary_loss_clip": 0.01094699, - "auxiliary_loss_mlp": 0.01032931, - "balance_loss_clip": 1.03575385, - "balance_loss_mlp": 1.02107549, - "epoch": 0.9996091988576582, - "flos": 22200048771840.0, - "grad_norm": 2.255734866069882, - "language_loss": 0.76902807, - "learning_rate": 1.3651234567202408e-12, - "loss": 0.7903043, - "num_input_tokens_seen": 358801220, - "step": 16626, - "time_per_iteration": 2.687950611114502 - }, - { - "auxiliary_loss_clip": 0.01107588, - "auxiliary_loss_mlp": 0.010339, - "balance_loss_clip": 1.03792691, - "balance_loss_mlp": 1.02201486, - "epoch": 0.9996693221103262, - "flos": 27373195468800.0, - "grad_norm": 1.6905180098527337, - "language_loss": 0.82313097, - "learning_rate": 9.480024334429515e-13, - "loss": 0.84454584, - "num_input_tokens_seen": 358819190, - "step": 16627, - "time_per_iteration": 2.609881639480591 - }, - { - "auxiliary_loss_clip": 0.01095764, - "auxiliary_loss_mlp": 0.01035323, - "balance_loss_clip": 1.0381639, - "balance_loss_mlp": 1.02206075, - "epoch": 0.9997294453629941, - "flos": 26870410846080.0, - "grad_norm": 1.8853432771890695, - "language_loss": 0.70178521, - "learning_rate": 6.067215747584952e-13, - "loss": 0.72309601, - "num_input_tokens_seen": 358839850, - "step": 16628, - "time_per_iteration": 2.7530713081359863 - }, - { - "auxiliary_loss_clip": 0.01097289, - "auxiliary_loss_mlp": 0.01026328, - "balance_loss_clip": 1.0342133, - "balance_loss_mlp": 1.01419258, - "epoch": 0.9997895686156621, - "flos": 23476996247040.0, - "grad_norm": 1.3278590412352376, - "language_loss": 0.75475144, - "learning_rate": 3.4128089332341456e-13, - "loss": 0.77598757, - "num_input_tokens_seen": 358859805, - "step": 16629, - "time_per_iteration": 2.7801589965820312 - }, - { - "auxiliary_loss_clip": 0.01089302, - "auxiliary_loss_mlp": 0.0103599, - "balance_loss_clip": 1.03652728, - "balance_loss_mlp": 1.02316952, - "epoch": 0.9998496918683301, - "flos": 20224961579520.0, - "grad_norm": 1.6419420095870436, - "language_loss": 0.60239536, - "learning_rate": 1.5168039935176126e-13, - "loss": 0.62364829, - "num_input_tokens_seen": 358877900, - "step": 16630, - "time_per_iteration": 2.6396772861480713 - }, - { - "auxiliary_loss_clip": 0.0106418, - "auxiliary_loss_mlp": 0.01027947, - "balance_loss_clip": 1.03312218, - "balance_loss_mlp": 1.01544785, - "epoch": 0.9999098151209981, - "flos": 21652913831040.0, - "grad_norm": 3.1563978222436964, - "language_loss": 0.6076231, - "learning_rate": 3.792010017100722e-14, - "loss": 0.62854433, - "num_input_tokens_seen": 358897285, - "step": 16631, - "time_per_iteration": 2.699958086013794 - }, - { - "auxiliary_loss_clip": 0.01046835, - "auxiliary_loss_mlp": 0.00770368, - "balance_loss_clip": 1.03351796, - "balance_loss_mlp": 1.00018144, - "epoch": 0.999969938373666, - "flos": 11544599018880.0, - "grad_norm": 13.911116352216522, - "language_loss": 0.7268914, - "learning_rate": 0.0, - "loss": 0.74506336, - "num_input_tokens_seen": 358911570, - "step": 16632, - "time_per_iteration": 2.6853044033050537 - } - ], - "logging_steps": 1.0, - "max_steps": 16632, - "num_input_tokens_seen": 358911570, - "num_train_epochs": 1, - "save_steps": 3328, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 1.3992169073237033e+18, - "train_batch_size": 5, - "trial_name": null, - "trial_params": null -}